@@ -36,6 +36,7 @@ std::string to_timestamp(int64_t t) {
36
36
struct whisper_params {
37
37
int32_t seed = -1 ; // RNG seed, not used currently
38
38
int32_t n_threads = std::min(4 , (int32_t ) std::thread::hardware_concurrency());
39
+ int32_t step_ms = 3000 ;
39
40
40
41
bool verbose = false ;
41
42
bool translate = false ;
@@ -57,6 +58,8 @@ bool whisper_params_parse(int argc, char ** argv, whisper_params & params) {
57
58
params.seed = std::stoi (argv[++i]);
58
59
} else if (arg == " -t" || arg == " --threads" ) {
59
60
params.n_threads = std::stoi (argv[++i]);
61
+ } else if (arg == " --step" ) {
62
+ params.step_ms = std::stoi (argv[++i]);
60
63
} else if (arg == " -v" || arg == " --verbose" ) {
61
64
params.verbose = true ;
62
65
} else if (arg == " --translate" ) {
@@ -97,6 +100,7 @@ void whisper_print_usage(int argc, char ** argv, const whisper_params & params)
97
100
fprintf (stderr, " -h, --help show this help message and exit\n " );
98
101
fprintf (stderr, " -s SEED, --seed SEED RNG seed (default: -1)\n " );
99
102
fprintf (stderr, " -t N, --threads N number of threads to use during computation (default: %d)\n " , params.n_threads );
103
+ fprintf (stderr, " --step N audio step size in milliseconds (default: %d)\n " , params.step_ms );
100
104
fprintf (stderr, " -v, --verbose verbose output\n " );
101
105
fprintf (stderr, " --translate translate from source language to english\n " );
102
106
fprintf (stderr, " -ps, --print_special print special tokens\n " );
@@ -197,6 +201,7 @@ int main(int argc, char ** argv) {
197
201
198
202
struct whisper_context * ctx = whisper_init (params.model .c_str ());
199
203
204
+ const int n_samples = (params.step_ms /1000.0 )*WHISPER_SAMPLE_RATE;
200
205
const int n_samples_30s = 30 *WHISPER_SAMPLE_RATE;
201
206
std::vector<float > pcmf32 (n_samples_30s, 0 .0f );
202
207
std::vector<float > pcmf32_old;
@@ -212,7 +217,7 @@ int main(int argc, char ** argv) {
212
217
}
213
218
}
214
219
printf (" %s: processing %d samples (%.1f sec), %d threads, lang = %s, task = %s, timestamps = %d ...\n " ,
215
- __func__, int (pcmf32. size ()) , float (pcmf32. size () )/WHISPER_SAMPLE_RATE, params.n_threads ,
220
+ __func__, n_samples , float (n_samples )/WHISPER_SAMPLE_RATE, params.n_threads ,
216
221
params.language .c_str (),
217
222
params.translate ? " translate" : " transcribe" ,
218
223
params.no_timestamps ? 0 : 1 );
@@ -238,7 +243,7 @@ int main(int argc, char ** argv) {
238
243
}
239
244
240
245
// wait until at least params.step_ms milliseconds of new audio are queued (default: 3000 ms), then process it
241
- while (SDL_GetQueuedAudioSize (g_dev_id_in) < 3 *WHISPER_SAMPLE_RATE *sizeof (float )) {
246
+ while (SDL_GetQueuedAudioSize (g_dev_id_in) < n_samples *sizeof (float )) {
242
247
SDL_Delay (1 );
243
248
}
244
249
const int n_samples_new = SDL_GetQueuedAudioSize (g_dev_id_in)/sizeof (float );
0 commit comments