@@ -25,9 +25,6 @@ struct whisper_params {
2525 int32_t audio_ctx = 0 ;
2626 int32_t beam_size = -1 ;
2727
28- float vad_thold = 0 .6f ;
29- float freq_thold = 100 .0f ;
30-
3128 bool translate = false ;
3229 bool no_fallback = false ;
3330 bool print_special = false ;
@@ -37,10 +34,21 @@ struct whisper_params {
3734 bool save_audio = false ; // save audio to wav file
3835 bool use_gpu = true ;
3936 bool flash_attn = false ;
37+ bool no_prints = false ;
4038
4139 std::string language = " en" ;
4240 std::string model = " models/ggml-base.en.bin" ;
4341 std::string fname_out;
42+
43+ // Voice Activity Detection (VAD) parameters
44+ bool vad = false ;
45+ std::string vad_model = " models/for-tests-silero-v5.1.2-ggml.bin" ;
46+ float vad_threshold = 0 .5f ;
47+ int vad_min_speech_duration_ms = 250 ;
48+ int vad_min_silence_duration_ms = 100 ;
49+ float vad_max_speech_duration_s = FLT_MAX;
50+ int vad_speech_pad_ms = 30 ;
51+ float vad_samples_overlap = 0 .1f ;
4452};
4553
4654void whisper_print_usage (int argc, char ** argv, const whisper_params & params);
@@ -61,8 +69,6 @@ static bool whisper_params_parse(int argc, char ** argv, whisper_params & params
6169 else if (arg == " -mt" || arg == " --max-tokens" ) { params.max_tokens = std::stoi (argv[++i]); }
6270 else if (arg == " -ac" || arg == " --audio-ctx" ) { params.audio_ctx = std::stoi (argv[++i]); }
6371 else if (arg == " -bs" || arg == " --beam-size" ) { params.beam_size = std::stoi (argv[++i]); }
64- else if (arg == " -vth" || arg == " --vad-thold" ) { params.vad_thold = std::stof (argv[++i]); }
65- else if (arg == " -fth" || arg == " --freq-thold" ) { params.freq_thold = std::stof (argv[++i]); }
6672 else if (arg == " -tr" || arg == " --translate" ) { params.translate = true ; }
6773 else if (arg == " -nf" || arg == " --no-fallback" ) { params.no_fallback = true ; }
6874 else if (arg == " -ps" || arg == " --print-special" ) { params.print_special = true ; }
@@ -74,7 +80,16 @@ static bool whisper_params_parse(int argc, char ** argv, whisper_params & params
7480 else if (arg == " -sa" || arg == " --save-audio" ) { params.save_audio = true ; }
7581 else if (arg == " -ng" || arg == " --no-gpu" ) { params.use_gpu = false ; }
7682 else if (arg == " -fa" || arg == " --flash-attn" ) { params.flash_attn = true ; }
77-
83+ else if (arg == " -np" || arg == " --no-prints" ) { params.no_prints = true ; }
84+ // Voice Activity Detection (VAD)
85+ else if ( arg == " --vad" ) { params.vad = true ; }
86+ else if (arg == " -vm" || arg == " --vad-model" ) { params.vad_model = argv[++i]; }
87+ else if (arg == " -vt" || arg == " --vad-threshold" ) { params.vad_threshold = std::stof (argv[++i]); }
88+ else if (arg == " -vsd" || arg == " --vad-min-speech-duration-ms" ) { params.vad_min_speech_duration_ms = std::stoi (argv[++i]); }
89+ else if (arg == " -vsd" || arg == " --vad-min-silence-duration-ms" ) { params.vad_min_speech_duration_ms = std::stoi (argv[++i]); }
90+ else if (arg == " -vmsd" || arg == " --vad-max-speech-duration-s" ) { params.vad_max_speech_duration_s = std::stof (argv[++i]); }
91+ else if (arg == " -vp" || arg == " --vad-speech-pad-ms" ) { params.vad_speech_pad_ms = std::stoi (argv[++i]); }
92+ else if (arg == " -vo" || arg == " --vad-samples-overlap" ) { params.vad_samples_overlap = std::stof (argv[++i]); }
7893 else {
7994 fprintf (stderr, " error: unknown argument: %s\n " , arg.c_str ());
8095 whisper_print_usage (argc, argv, params);
@@ -99,8 +114,6 @@ void whisper_print_usage(int /*argc*/, char ** argv, const whisper_params & para
99114 fprintf (stderr, " -mt N, --max-tokens N [%-7d] maximum number of tokens per audio chunk\n " , params.max_tokens );
100115 fprintf (stderr, " -ac N, --audio-ctx N [%-7d] audio context size (0 - all)\n " , params.audio_ctx );
101116 fprintf (stderr, " -bs N, --beam-size N [%-7d] beam size for beam search\n " , params.beam_size );
102- fprintf (stderr, " -vth N, --vad-thold N [%-7.2f] voice activity detection threshold\n " , params.vad_thold );
103- fprintf (stderr, " -fth N, --freq-thold N [%-7.2f] high-pass frequency cutoff\n " , params.freq_thold );
104117 fprintf (stderr, " -tr, --translate [%-7s] translate from source language to english\n " , params.translate ? " true" : " false" );
105118 fprintf (stderr, " -nf, --no-fallback [%-7s] do not use temperature fallback while decoding\n " , params.no_fallback ? " true" : " false" );
106119 fprintf (stderr, " -ps, --print-special [%-7s] print special tokens\n " , params.print_special ? " true" : " false" );
@@ -112,6 +125,19 @@ void whisper_print_usage(int /*argc*/, char ** argv, const whisper_params & para
112125 fprintf (stderr, " -sa, --save-audio [%-7s] save the recorded audio to a file\n " , params.save_audio ? " true" : " false" );
113126 fprintf (stderr, " -ng, --no-gpu [%-7s] disable GPU inference\n " , params.use_gpu ? " false" : " true" );
114127 fprintf (stderr, " -fa, --flash-attn [%-7s] flash attention during inference\n " , params.flash_attn ? " true" : " false" );
128+ fprintf (stderr, " -np, --no-prints [%-7s] do not print anything other than the results\n " , params.no_prints ? " true" : " false" );
129+ // Voice Activity Detection (VAD) parameters
130+ fprintf (stderr, " \n Voice Activity Detection (VAD) options:\n " );
131+ fprintf (stderr, " --vad [%-7s] enable Voice Activity Detection (VAD)\n " , params.vad ? " true" : " false" );
132+ fprintf (stderr, " -vm FNAME, --vad-model FNAME [%-7s] VAD model path\n " , params.vad_model .c_str ());
133+ fprintf (stderr, " -vt N, --vad-threshold N [%-7.2f] VAD threshold for speech recognition\n " , params.vad_threshold );
134+ fprintf (stderr, " -vspd N, --vad-min-speech-duration-ms N [%-7d] VAD min speech duration (0.0-1.0)\n " , params.vad_min_speech_duration_ms );
135+ fprintf (stderr, " -vsd N, --vad-min-silence-duration-ms N [%-7d] VAD min silence duration (to split segments)\n " , params.vad_min_silence_duration_ms );
136+ fprintf (stderr, " -vmsd N, --vad-max-speech-duration-s N [%-7s] VAD max speech duration (auto-split longer)\n " , params.vad_max_speech_duration_s == FLT_MAX ?
137+ std::string (" FLT_MAX" ).c_str () :
138+ std::to_string (params.vad_max_speech_duration_s ).c_str ());
139+ fprintf (stderr, " -vp N, --vad-speech-pad-ms N [%-7d] VAD speech padding (extend segments)\n " , params.vad_speech_pad_ms );
140+ fprintf (stderr, " -vo N, --vad-samples-overlap N [%-7.2f] VAD samples overlap (seconds between segments)\n " , params.vad_samples_overlap );
115141 fprintf (stderr, " \n " );
116142}
117143
@@ -122,20 +148,22 @@ int main(int argc, char ** argv) {
122148 return 1 ;
123149 }
124150
151+ if (params.no_prints ) {
152+ whisper_log_set ([](enum ggml_log_level, const char *, void *) { }, NULL );
153+ }
154+
125155 params.keep_ms = std::min (params.keep_ms , params.step_ms );
126156 params.length_ms = std::max (params.length_ms , params.step_ms );
127157
158+
128159 const int n_samples_step = (1e-3 *params.step_ms )*WHISPER_SAMPLE_RATE;
129160 const int n_samples_len = (1e-3 *params.length_ms )*WHISPER_SAMPLE_RATE;
130161 const int n_samples_keep = (1e-3 *params.keep_ms )*WHISPER_SAMPLE_RATE;
131162 const int n_samples_30s = (1e-3 *30000.0 )*WHISPER_SAMPLE_RATE;
132163
133- const bool use_vad = n_samples_step <= 0 ; // sliding window mode uses VAD
134-
135- const int n_new_line = !use_vad ? std::max (1 , params.length_ms / params.step_ms - 1 ) : 1 ; // number of steps to print new line
136-
137- params.no_timestamps = !use_vad;
138- params.no_context |= use_vad;
164+ const int n_new_line = !params.vad ? std::max (1 , params.length_ms / params.step_ms - 1 ) : 1 ; // number of steps to print new line
165+ params.no_timestamps = !params.vad ;
166+ params.no_context |= params.vad ;
139167 params.max_tokens = 0 ;
140168
141169 // init audio
@@ -189,7 +217,7 @@ int main(int argc, char ** argv) {
189217 params.translate ? " translate" : " transcribe" ,
190218 params.no_timestamps ? 0 : 1 );
191219
192- if (!use_vad ) {
220+ if (!params. vad ) {
193221 fprintf (stderr, " %s: n_new_line = %d, no_context = %d\n " , __func__, n_new_line, params.no_context );
194222 } else {
195223 fprintf (stderr, " %s: using VAD, will transcribe on speech activity\n " , __func__);
@@ -242,7 +270,7 @@ int main(int argc, char ** argv) {
242270
243271 // process new audio
244272
245- if (!use_vad ) {
273+ if (!params. vad ) {
246274 while (true ) {
247275 // handle Ctrl + C
248276 is_running = sdl_poll_events ();
@@ -270,7 +298,7 @@ int main(int argc, char ** argv) {
270298 // take up to params.length_ms audio from previous iteration
271299 const int n_samples_take = std::min ((int ) pcmf32_old.size (), std::max (0 , n_samples_keep + n_samples_len - n_samples_new));
272300
273- // printf( "processing: take = %d, new = %d, old = %d\n", n_samples_take, n_samples_new, (int) pcmf32_old.size());
301+ // fprintf(stdout, "processing: take = %d, new = %d, old = %d\n", n_samples_take, n_samples_new, (int) pcmf32_old.size());
274302
275303 pcmf32.resize (n_samples_new + n_samples_take);
276304
@@ -285,22 +313,29 @@ int main(int argc, char ** argv) {
285313 const auto t_now = std::chrono::high_resolution_clock::now ();
286314 const auto t_diff = std::chrono::duration_cast<std::chrono::milliseconds>(t_now - t_last).count ();
287315
288- if (t_diff < 2000 ) {
289- std::this_thread::sleep_for (std::chrono::milliseconds (100 ));
290-
316+ if (t_diff < params.step_ms ) {
317+ std::this_thread::sleep_for (std::chrono::milliseconds (params.step_ms ));
291318 continue ;
292319 }
293320
294- audio.get (2000 , pcmf32_new);
321+ // Get new audio for this step
322+ audio.get (params.step_ms , pcmf32_new);
295323
296- if (::vad_simple (pcmf32_new, WHISPER_SAMPLE_RATE, 1000 , params.vad_thold , params.freq_thold , false )) {
297- audio.get (params.length_ms , pcmf32);
298- } else {
299- std::this_thread::sleep_for (std::chrono::milliseconds (100 ));
324+ // Calculate how much old audio to keep
325+ const int n_samples_new = pcmf32_new.size ();
326+ const int n_samples_take = std::min ((int ) pcmf32_old.size (), std::max (0 , n_samples_keep + n_samples_len - n_samples_new));
300327
301- continue ;
328+ // Combine old + new audio with overlap
329+ pcmf32.resize (n_samples_new + n_samples_take);
330+
331+ // Copy kept portion from previous iteration
332+ for (int i = 0 ; i < n_samples_take; i++) {
333+ pcmf32[i] = pcmf32_old[pcmf32_old.size () - n_samples_take + i];
302334 }
303335
336+ // Append new audio
337+ memcpy (pcmf32.data () + n_samples_take, pcmf32_new.data (), n_samples_new * sizeof (float ));
338+
304339 t_last = t_now;
305340 }
306341
@@ -313,7 +348,6 @@ int main(int argc, char ** argv) {
313348 wparams.print_realtime = false ;
314349 wparams.print_timestamps = !params.no_timestamps ;
315350 wparams.translate = params.translate ;
316- wparams.single_segment = !use_vad;
317351 wparams.max_tokens = params.max_tokens ;
318352 wparams.language = params.language .c_str ();
319353 wparams.n_threads = params.n_threads ;
@@ -330,27 +364,30 @@ int main(int argc, char ** argv) {
330364 wparams.prompt_tokens = params.no_context ? nullptr : prompt_tokens.data ();
331365 wparams.prompt_n_tokens = params.no_context ? 0 : prompt_tokens.size ();
332366
367+ wparams.vad = params.vad ;
368+ wparams.vad_model_path = params.vad_model .c_str ();
369+
370+ wparams.vad_params .threshold = params.vad_threshold ;
371+ wparams.vad_params .min_speech_duration_ms = params.vad_min_speech_duration_ms ;
372+ wparams.vad_params .min_silence_duration_ms = params.vad_min_silence_duration_ms ;
373+ wparams.vad_params .max_speech_duration_s = params.vad_max_speech_duration_s ;
374+ wparams.vad_params .speech_pad_ms = params.vad_speech_pad_ms ;
375+ wparams.vad_params .samples_overlap = params.vad_samples_overlap ;
376+
333377 if (whisper_full (ctx, wparams, pcmf32.data (), pcmf32.size ()) != 0 ) {
334378 fprintf (stderr, " %s: failed to process audio\n " , argv[0 ]);
335379 return 6 ;
336380 }
337381
338382 // print result;
339383 {
340- if (!use_vad ) {
384+ if (!params. vad ) {
341385 printf (" \33 [2K\r " );
342386
343387 // print long empty line to clear the previous line
344388 printf (" %s" , std::string (100 , ' ' ).c_str ());
345389
346390 printf (" \33 [2K\r " );
347- } else {
348- const int64_t t1 = (t_last - t_start).count ()/1000000 ;
349- const int64_t t0 = std::max (0.0 , t1 - pcmf32.size ()*1000.0 /WHISPER_SAMPLE_RATE);
350-
351- printf (" \n " );
352- printf (" ### Transcription %d START | t0 = %d ms | t1 = %d ms\n " , n_iter, (int ) t0, (int ) t1);
353- printf (" \n " );
354391 }
355392
356393 const int n_segments = whisper_full_n_segments (ctx);
@@ -389,15 +426,11 @@ int main(int argc, char ** argv) {
389426 fout << std::endl;
390427 }
391428
392- if (use_vad) {
393- printf (" \n " );
394- printf (" ### Transcription %d END\n " , n_iter);
395- }
396429 }
397430
398431 ++n_iter;
399432
400- if (!use_vad && (n_iter % n_new_line) == 0 ) {
433+ if (!params. vad && (n_iter % n_new_line) == 0 ) {
401434 printf (" \n " );
402435
403436 // keep part of the audio for next iteration to try to mitigate word boundary issues
0 commit comments