-
Notifications
You must be signed in to change notification settings - Fork 11
/
Copy pathvoxforge.py
98 lines (82 loc) · 4.21 KB
/
voxforge.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
import os
from six.moves import urllib
import argparse
import re
import tempfile
import shutil
import subprocess
import tarfile
import io
from tqdm import tqdm
from utils import create_manifest
# Base URL of the VoxForge 16 kHz / 16-bit archive listing used by this script.
VOXFORGE_URL_16kHz = 'http://www.repository.voxforge1.org/downloads/SpeechCorpus/Trunk/Audio/Main/16kHz_16bit/'

# Command-line options. They are parsed at module level, so ``args`` is a
# module global (``prepare_sample`` reads ``args.sample_rate`` from it).
parser = argparse.ArgumentParser(description='Processes and downloads VoxForge dataset.')
parser.add_argument(
    '--target-dir',
    type=str,
    default='voxforge_dataset/',
    help='Directory to store the dataset.'
)
parser.add_argument(
    '--sample-rate',
    type=int,
    default=16000,
    help='Sample rate'
)
parser.add_argument(
    '--min-duration',
    type=int,
    default=1,
    help='Prunes training samples shorter than the min duration (given in seconds, default 1)'
)
parser.add_argument(
    '--max-duration',
    type=int,
    default=15,
    help='Prunes training samples longer than the max duration (given in seconds, default 15)'
)
args = parser.parse_args()
def _get_recordings_dir(sample_dir, recording_name):
    """
    Finds the audio directory of an extracted recording.

    Probes ``<sample_dir>/<recording_name>/wav`` first and then the ``flac``
    sibling, and returns a ``(kind, path)`` tuple for the first one that
    exists, where ``kind`` is ``"wav"`` or ``"flac"``.

    Raises ``Exception`` when neither directory exists.
    """
    for kind in ("wav", "flac"):
        candidate = os.path.join(sample_dir, recording_name, kind)
        if os.path.exists(candidate):
            return kind, candidate
    raise Exception("wav or flac directory was not found for recording name: {}".format(recording_name))
def prepare_sample(recording_name, url, target_folder):
    """
    Downloads and extracts a sample from VoxForge and puts the wav and txt files into :target_folder.

    Each audio file of the recording that has an entry in the archive's
    ``etc/PROMPTS`` file is converted with ``sox`` to a 16-bit mono wav at
    ``args.sample_rate`` and written to ``<target_folder>/wav``; its
    transcription is written UTF-8 encoded to ``<target_folder>/txt``.
    Audio files without a matching prompt are skipped, and nothing is
    written when the archive has no PROMPTS file.

    :param recording_name: archive name without the ``.tgz`` suffix; also the
        top-level directory expected inside the archive.
    :param url: full URL of the ``.tgz`` archive to download.
    :param target_folder: output directory; the ``wav`` and ``txt``
        sub-directories are created when missing.
    :raises Exception: propagated from ``_get_recordings_dir`` when the
        archive contains neither a ``wav`` nor a ``flac`` directory.
    :raises OSError: when the ``sox`` executable cannot be started.
    """
    wav_dir = os.path.join(target_folder, "wav")
    if not os.path.exists(wav_dir):
        os.makedirs(wav_dir)
    txt_dir = os.path.join(target_folder, "txt")
    if not os.path.exists(txt_dir):
        os.makedirs(txt_dir)

    response = urllib.request.urlopen(urllib.request.Request(url))
    try:
        content = response.read()
    finally:
        # Release the connection even if the read fails part-way.
        response.close()

    dirpath = tempfile.mkdtemp()
    try:
        with tempfile.NamedTemporaryFile(suffix=".tgz", mode='wb') as target_tgz:
            target_tgz.write(content)
            # Flush so tarfile sees the full archive when reopening by name.
            target_tgz.flush()
            # NOTE(review): extractall() trusts the member paths of a
            # downloaded archive; consider validating members (or using an
            # extraction filter where the running Python provides one).
            with tarfile.open(target_tgz.name) as tar:
                tar.extractall(dirpath)

        recordings_type, recordings_dir = _get_recordings_dir(dirpath, recording_name)
        tgz_prompt_file = os.path.join(dirpath, recording_name, "etc", "PROMPTS")
        if not os.path.exists(tgz_prompt_file):
            return

        # Each PROMPTS line is "<key> <word> <word> ..."; blank lines are
        # skipped instead of raising IndexError.
        transcriptions = {}
        with open(tgz_prompt_file) as prompts:
            for line in prompts:
                fields = line.split()
                if fields:
                    transcriptions[fields[0]] = " ".join(fields[1:])

        extension = '.{}'.format(recordings_type)
        for wav_file in os.listdir(recordings_dir):
            recording_id = wav_file.split(extension)[0]
            transcription_key = recording_name + "/mfc/" + recording_id
            if transcription_key not in transcriptions:
                continue
            utterance = transcriptions[transcription_key]

            target_wav_file = os.path.join(wav_dir, "{}_{}.wav".format(recording_name, recording_id))
            target_txt_file = os.path.join(txt_dir, "{}_{}.txt".format(recording_name, recording_id))
            with io.FileIO(target_txt_file, "w") as file:
                file.write(utterance.encode('utf-8'))

            original_wav_file = os.path.join(recordings_dir, wav_file)
            # Argument list without a shell: file names come from a downloaded
            # archive, so they must never be interpolated into a shell string.
            subprocess.call(["sox", original_wav_file,
                             "-r", str(args.sample_rate), "-b", "16", "-c", "1",
                             target_wav_file])
    finally:
        # Always remove the extraction directory, including on errors.
        shutil.rmtree(dirpath)
if __name__ == '__main__':
    # Script entry point: scrape the archive index for .tgz links, download
    # and convert every recording into the target directory, then build the
    # training manifest.
    target_dir = args.target_dir
    if not os.path.isdir(target_dir):
        os.makedirs(target_dir)

    response = urllib.request.urlopen(urllib.request.Request(VOXFORGE_URL_16kHz))
    try:
        content = response.read()
    finally:
        response.close()

    # Raw string avoids invalid escape sequences; [^"]* keeps each match
    # inside a single href attribute.
    all_files = re.findall(r'href="([^"]*\.tgz)"', content.decode("utf-8"))

    # The base URL already ends with '/', so strip it before joining to avoid
    # a doubled slash in the request path.
    base_url = VOXFORGE_URL_16kHz.rstrip('/')
    suffix = ".tgz"
    for f in tqdm(all_files, total=len(all_files)):
        # Drop only the trailing suffix to obtain the recording name.
        prepare_sample(f[:-len(suffix)], base_url + '/' + f, target_dir)

    print('Creating manifests...')
    create_manifest(target_dir, 'voxforge_train_manifest.csv', args.min_duration, args.max_duration)