-
Notifications
You must be signed in to change notification settings - Fork 25
/
Copy pathmain_get_vad.py
214 lines (182 loc) · 8.13 KB
/
main_get_vad.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
#!/usr/bin/env python
"""Perform voice activity detection (VAD) using WebRTC's implementation.
To perform VAD for all WAV files under the directory ``wav_dir/`` and write
the output to the directory ``vad_dir/`` as HTK label files:
python main_get_vad.py --wav_dir wav_dir/ --output_dir vad_dir/
For each file with the ``.wav`` extension under ``wav_dir/``, there will now be
a corresponding label file with the extension ``.sad`` under ``vad_dir/``. Each
label file will contain one speech segment per line, each consisting of three
space-delimited fields:
- onset -- the onset of the segment in seconds
- offset -- the offset of the segment in seconds
- label -- the label for the segment; controlled by the ``--speech_label`` flag
If ``--output_dir`` is not specified, these files will be output to ``wav_dir/``.
Alternately, you may specify the files to process via a script file of paths to
WAV files with one path per line:
/path/to/file1.wav
/path/to/file2.wav
/path/to/file3.wav
...
This functionality is enabled via the ``-S`` flag, as in the following:
python main_get_vad.py -S some.scp --output_dir vad_dir/
which will perform VAD for those files listed in ``some.scp`` and output label files
to ``vad_dir/``. Note that if you use a script file, you *MUST* specify an output
directory.
WebRTC exposes several parameters for tuning its output, which may be adjusted via
the following flags:
- ``--fs_vad`` -- controls the sample rate the audio is resampled to prior to
performing VAD; possible values are 8 kHz, 16 kHz, 32 kHz, and 48 kHz
- ``--hoplength`` -- the duration in milliseconds of the frames for VAD; possible
values are 10 ms, 20 ms, and 30 ms
- ``--mode`` -- the WebRTC aggressiveness mode, which controls how aggressive
WebRTC is about filtering out non-speech; 0 is least aggressive and 3 most aggressive
Optionally, label smoothing may be applied to the output of WebRTC to eliminate short,
irregular silences and speech segments. Label smoothing is done using a median filter
applied to the frame-level labeling produced by WebRTC and is controlled by the
``--med_filt_width`` parameter.
When processing large batches of audio, it may be desirable to parallelize the
computation, which may be done by specifying the number of parallel processes to
employ via the ``--n_jobs`` flag:
python main_get_vad.py --n_jobs 40 -S some.scp --output_dir vad_dir/
References
----------
- https://github.com/wiseman/py-webrtcvad.git
- https://webrtc.org/
"""
from __future__ import print_function
from __future__ import unicode_literals
import argparse
import numbers
import os
import sys
import traceback
from joblib import delayed, Parallel
import librosa
import utils
from utils import VALID_VAD_SRS, VALID_VAD_FRAME_LENGTHS, VALID_VAD_MODES
def perform_vad(wav_file, segs_file, speech_label, **kwargs):
    """Run VAD on a single WAV file and write the resulting segments file.

    Failures are reported through the return value instead of being raised, so
    that the caller can collect per-file results and decide how to report them.

    Parameters
    ----------
    wav_file : str
        Path to WAV file to perform VAD for.

    segs_file : str
        Path to output segments file.

    speech_label : str
        Label for speech segments.

    kwargs
        Keyword arguments to pass to ``utils.vad``.

    Returns
    -------
    None or tuple
        ``None`` on success. If an exception was raised during processing, the
        tuple ``(exception, formatted_traceback)``.
    """
    try:
        # ``sr=None`` is passed so the returned sample rate is reported by
        # librosa rather than fixed here.
        samples, sample_rate = librosa.load(wav_file, sr=None)
        frame_labels = utils.vad(samples, sample_rate, **kwargs)
        utils.write_segments(
            segs_file, utils.get_segments(frame_labels, sample_rate),
            label=speech_label)
    except Exception as err:
        # Deliberately broad: any failure for this file is handed back to the
        # caller together with the full traceback text.
        return err, traceback.format_exc()
    return None
def main():
    """Run the VAD command-line interface.

    Parses and validates the command-line arguments, determines the WAV files
    to process (from ``--wav_dir`` or the ``-S`` script file), runs
    ``perform_vad`` on each of them using ``--n_jobs`` parallel jobs, and
    reports every file that could not be processed via ``utils.error``.

    Exits with status 1 (after printing the help text) when invoked without
    arguments; invalid arguments are reported through ``parser.error``.
    """
    parser = _build_arg_parser()
    if len(sys.argv) == 1:
        parser.print_help()
        sys.exit(1)
    args = parser.parse_args()

    # ``parser.error`` prints the message and terminates the process itself,
    # so no explicit ``sys.exit`` is needed after any of the checks below.
    if not utils.xor(args.wav_dir, args.scpf):
        parser.error('Exactly one of --wav_dir and -S must be set.')
    if not (args.wav_dir or args.output_dir):
        parser.error(
            'At least one of --wav_dir or --output_dir must be set.')
    # The allowed values are wrapped in one-element tuples so that the
    # %-formatting stays valid even if a VALID_* constant is itself a tuple.
    if args.fs_vad not in VALID_VAD_SRS:
        parser.error('--fs_vad must be one of %s' % (VALID_VAD_SRS,))
    if args.hoplength not in VALID_VAD_FRAME_LENGTHS:
        parser.error(
            '--hoplength must be one of %s' % (VALID_VAD_FRAME_LENGTHS,))
    if args.mode not in VALID_VAD_MODES:
        parser.error('--mode must be one of %s' % (VALID_VAD_MODES,))
    if (not isinstance(args.med_filt_width, numbers.Integral) or
            args.med_filt_width % 2 == 0):
        parser.error('--med_filt_width must be an odd integer')
    args.frame_length = args.hoplength  # Retain hoplength argument for compatibility.

    # Determine files to perform VAD on. The paths are materialized because
    # they are iterated twice: once to build the jobs and once more to pair
    # each result with its file when reporting errors.
    if args.scpf is not None:
        wav_files = list(utils.load_script_file(args.scpf, '.wav'))
    else:
        wav_files = list(utils.listdir(args.wav_dir, ext='.wav'))

    # Determine output directory for VAD.
    if args.output_dir is None and args.wav_dir is not None:
        utils.warn('Output directory not specified. Defaulting to "%s"' %
                   args.wav_dir)
        args.output_dir = args.wav_dir
    if not os.path.exists(args.output_dir):
        os.makedirs(args.output_dir)

    # Perform VAD.
    def kwargs_gen():
        for wav_file in wav_files:
            yield dict(
                wav_file=wav_file,
                segs_file=_segs_path(
                    wav_file, args.output_dir, args.output_ext),
                speech_label=args.speech_label, fs_vad=args.fs_vad,
                frame_length=args.frame_length, vad_mode=args.mode,
                med_filt_width=args.med_filt_width)
    f = delayed(perform_vad)
    res = Parallel(n_jobs=args.n_jobs)(f(**kwargs) for kwargs in kwargs_gen())

    # Report failures; ``perform_vad`` returns ``None`` for files that were
    # processed successfully and ``(exception, traceback)`` otherwise.
    for res_, wav_file in zip(res, wav_files):
        if res_ is None:
            continue
        e, tb = res_
        msg = 'Problem encountered while processing file "%s". Skipping.' % wav_file
        if args.verbose:
            msg = '%s Full error output:\n%s' % (msg, tb)
        utils.error(msg)


def _build_arg_parser():
    """Return the ``argparse`` parser describing this script's options."""
    parser = argparse.ArgumentParser(
        description='Perform VAD using webrtcvad.', add_help=True)
    parser.add_argument(
        '--wav_dir', type=str, metavar='STR',
        help='directory containing WAV files to perform VAD for '
             '(default: %(default)s)')
    parser.add_argument(
        '-S', dest='scpf', type=str, metavar='STR',
        help='script file of paths to WAV files to perform VAD for (default: %(default)s)')
    parser.add_argument(
        '--output_dir', type=str, metavar='STR',
        help='output directory for label files (default: None)')
    parser.add_argument(
        '--output_ext', default='.sad', type=str, metavar='STR',
        help='extension for output label files (default: %(default)s)')
    parser.add_argument(
        '--speech_label', default='', type=str, metavar='STR',
        help='label for speech segments (default: %(default)s)')
    parser.add_argument(
        '--fs_vad', default=16000, type=int, metavar='INT',
        help='target sample rate in Hz for VAD (default: %(default)s)')
    parser.add_argument(
        '--hoplength', default=30, type=int, metavar='INT',
        help='duration between frames in ms (default: %(default)s)')
    parser.add_argument(
        '--mode', default=3, type=int, metavar='INT',
        help='WebRTC VAD aggressiveness (default: %(default)s)')
    parser.add_argument(
        '--med_filt_width', default=1, type=int, metavar='INT',
        help='window size in frames for median smoothing of VAD output; '
             '<=1 disables (default: %(default)s)')
    parser.add_argument(
        '--verbose', default=False, action='store_true',
        help='print full stacktrace for files with errors')
    parser.add_argument(
        '--n_jobs', default=1, type=int, metavar='INT',
        help='number of parallel jobs (default: %(default)s)')
    return parser


def _segs_path(wav_file, output_dir, output_ext):
    """Return the label-file path under ``output_dir`` for ``wav_file``.

    Only the final extension of the basename is swapped for ``output_ext``.
    This way the label file never shares a name with the audio file, whatever
    the case of the extension and even when ``.wav`` also occurs elsewhere in
    the file name.
    """
    stem = os.path.splitext(os.path.basename(wav_file))[0]
    return os.path.join(output_dir, stem + output_ext)
if __name__ == '__main__':
    # Only run the command-line interface when executed as a script, so that
    # importing this module (e.g. to reuse ``perform_vad``) has no side effects.
    main()