Skip to content

Commit 3deb871

Browse files
author
Kevin Obst
committed
Initial commit
1 parent 2b89b91 commit 3deb871

File tree

2 files changed

+173
-0
lines changed

2 files changed

+173
-0
lines changed

downloader.py

Lines changed: 94 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,94 @@
1+
import argparse
2+
import os
3+
import tarfile
4+
import urllib3
5+
6+
from concurrent.futures import ThreadPoolExecutor, wait
7+
8+
from urllib.request import urlopen
9+
10+
from bs4 import BeautifulSoup
11+
12+
13+
def download_and_extract(url: str, target_directory: str, current_item=0, items_count=0):
    """
    Downloads the speaker file and extracts it into the target directory

    The archive is streamed: it is decompressed while it is being downloaded and is
    never stored on disk as a whole. Archive members whose path would end up outside
    of the target directory are reported on stdout and skipped.

    :param url: url to the speaker archive (gzip compressed tar file)
    :param target_directory: directory the archive content is extracted into
    :param current_item: this items index for logging
    :param items_count: total amount of items to download. Displayed for logging
    """
    print('Downloading %s / %s' % (current_item, items_count))

    http = urllib3.PoolManager()

    # The archive comes from the network and must not be trusted. Interpreters which
    # ship tarfile extraction filters additionally get the restrictive 'data' filter.
    extract_options = {'filter': 'data'} if hasattr(tarfile, 'data_filter') else {}
    root = os.path.realpath(target_directory)

    with http.request('GET', url, preload_content=False) as r, tarfile.open(fileobj=r, mode="r|gz") as tar:
        # stream mode ("r|gz") only allows a single forward pass over the members
        for item in tar:
            destination = os.path.realpath(os.path.join(root, item.name))
            try:
                inside_root = os.path.commonpath([root, destination]) == root
            except ValueError:
                # raised e.g. for paths on different drives, which is outside by definition
                inside_root = False

            if not inside_root:
                print('Skipping unsafe archive member "%s"' % item.name)
                continue

            tar.extract(item, target_directory, **extract_options)
29+
30+
31+
def ensure_directory(path: str):
    """
    Creates a directory (including missing parent directories) if it does not exist

    Nothing happens when the path already exists.

    :param path: Path of the directory
    """
    # Attempt the creation right away instead of checking for existence first:
    # a separate check would leave a gap in which another thread or process
    # could create the path and make os.makedirs fail.
    try:
        os.makedirs(path)
    except FileExistsError:
        pass
38+
39+
40+
def download_corpus(target_directory: str,
                    max_workers: int,
                    amount: int,
                    voxforge_url="http://www.repository.voxforge1.org/"
                                 "downloads/SpeechCorpus/Trunk/Audio/Main/16kHz_16bit"):
    """
    Initiates download of the voxforge speech corpus

    The index page at voxforge_url is scanned for links containing '.tgz'. The first
    `amount` of them are downloaded and extracted in parallel. Downloads which fail
    are reported on stdout once all workers are done.

    :param target_directory: target directory for the files, "voxforge-corpus" if empty
    :param max_workers: amount of threads to be used for the download and extraction, 10 if empty
    :param amount: amount of speaker files to be downloaded, all available files if empty
    :param voxforge_url: url to the voxforge corpus
    """
    # imported locally so the function does not depend on further module level imports
    from urllib.parse import urljoin

    # input validation
    if not max_workers:
        max_workers = 10
    if not target_directory:
        target_directory = "voxforge-corpus"

    ensure_directory(target_directory)

    # collect links from voxforge
    with urlopen(voxforge_url) as html_page:
        soup = BeautifulSoup(html_page, "html5lib")
    # href=True skips anchors without a href attribute instead of failing on them
    speaker_refs = [link['href'] for link in soup.find_all('a', href=True) if '.tgz' in link['href']]

    if not amount:
        amount = len(speaker_refs)
    # max() keeps a negative amount from selecting files counted from the end of the list
    selected_refs = speaker_refs[:max(amount, 0)]

    # urljoin replaces the last path segment of a base url without a trailing slash
    base_url = voxforge_url.rstrip('/') + '/'

    # run multiple threads which download and extract the speaker files
    with ThreadPoolExecutor(max_workers) as executor:
        futures = [
            executor.submit(
                download_and_extract,
                urljoin(base_url, ref),
                target_directory,
                position,
                len(selected_refs))
            for position, ref in enumerate(selected_refs, start=1)
        ]
        wait(futures)

    # exceptions raised in a worker are stored in its future, surface them here
    for ref, future in zip(selected_refs, futures):
        error = future.exception()
        if error is not None:
            print('Failed to download "%s": %s' % (ref, error))
85+
86+
87+
if __name__ == '__main__':
    # Command line entry point: read the target directory and the optional limits
    # from the arguments and start the download.
    cli = argparse.ArgumentParser(description="Downloader for the voxforge speech corpus")

    argument_specs = (
        (('directory',), {'help': 'directory where to store the downloaded corpus'}),
        (('-n', '--number'), {'type': int, 'help': "amount of files to download"}),
        (('-w', '--workers'), {'type': int, 'help': "amount of parallel downloads"}),
    )
    for flags, settings in argument_specs:
        cli.add_argument(*flags, **settings)

    options = cli.parse_args()

    download_corpus(
        target_directory=options.directory,
        max_workers=options.workers,
        amount=options.number)

generator.py

Lines changed: 79 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,79 @@
1+
import os
2+
import json
3+
import argparse
4+
5+
from typing import List
6+
7+
8+
def read_prompt_file(speaker_directory) -> List[str]:
    """
    Reads the file etc/PROMPTS of a speaker directory

    :param speaker_directory: a directory containing the transcriptions for the audio files
    :return: a list containing the transcription for each audio file (one line per entry,
             line breaks included)
    :raises FileNotFoundError: if the speaker directory has no etc/PROMPTS file
    """
    prompts_path = os.path.join(speaker_directory, 'etc', 'PROMPTS')

    try:
        with open(prompts_path) as file:
            return file.readlines()
    except FileNotFoundError as ex:
        # re-raise with a message naming the speaker directory, keep the original as cause
        raise FileNotFoundError('"%s" has no PROMPTS file' % os.path.abspath(speaker_directory)) from ex
18+
19+
20+
def generate_json_file(source: str, destination: str):
    """
    Collects the training data of an extracted corpus and saves it as a json file

    Every entry of the source directory is treated as a speaker directory. For each
    row of its etc/PROMPTS file one record is created with the keys 'path' (absolute
    path of the wav file), 'text' (lower-cased transcription without hyphens) and
    'size' (size of the wav file in bytes). Entries without a readable PROMPTS file
    and rows whose audio file cannot be accessed are printed and skipped.

    :param source: directory of the corpus
    :param destination: path of the json file which receives the list of records
    :raises FileNotFoundError: if the source directory does not exist
    """
    if not os.path.isdir(source):
        raise FileNotFoundError('The corpus directory "%s" does not exist' % os.path.abspath(source))

    speaker_directories = os.listdir(source)
    data = []

    for i, speaker_directory in enumerate(speaker_directories, start=1):
        print('Processing folder %s / %s' % (i, len(speaker_directories)))

        # get the prompt file from the speaker directory
        try:
            prompt_file = read_prompt_file(os.path.join(source, speaker_directory))
        except OSError as ex:
            # OSError also covers regular files inside the corpus directory
            # (NotADirectoryError), which must not abort the whole run
            print(ex)
            continue

        for row in prompt_file:
            if not row.strip():
                # blank rows carry neither a path nor a transcription
                continue

            # first token is the path of the audio file, the remainder is the transcription
            prompt_id, _, transcription = row.partition(' ')

            # recreate the path to the audio file
            path = prompt_id.replace('/mfc/', '/wav/') + '.wav'
            path = os.path.abspath(os.path.join(source, path))

            # get transcription from prompt file
            transcription = transcription.replace('\n', '').lower()
            transcription = transcription.replace('-', '')

            # determine the size of audio file, skip rows without an accessible file
            try:
                size = os.path.getsize(path)
            except (OSError, ValueError) as ex:
                print(ex)
                continue

            data.append({
                'path': path,
                'text': transcription,
                'size': size
            })

    # save training data to file
    with open(destination, 'w') as outfile:
        json.dump(data, outfile)
71+
72+
73+
if __name__ == '__main__':
    # Command line entry point: takes the corpus directory and the path of the
    # json file to create, then builds the training data file.
    cli = argparse.ArgumentParser(description="Tool for preparing training data from the Voxforge corpus")

    positional_arguments = (
        ('source', 'directory of the corpus'),
        ('destination', 'path of the new (json) file containing the training data'),
    )
    for argument_name, help_text in positional_arguments:
        cli.add_argument(argument_name, help=help_text)

    options = cli.parse_args()

    generate_json_file(source=options.source, destination=options.destination)

0 commit comments

Comments
 (0)