Initial commit

Kevin Obst · Kevin Obst · commit 3deb87187bb0 · 2018-05-08T21:49:28.000+02:00
diff --git a/downloader.py b/downloader.py
@@ -0,0 +1,94 @@
+import argparse
+import os
+import tarfile
+import urllib3
+
+from concurrent.futures import ThreadPoolExecutor, wait
+
+from urllib.request import urlopen
+
+from bs4 import BeautifulSoup
+
+
+def download_and_extract(url: str, target_directory: str, current_item=0, items_count=0):
+    """
+    Downloads a speaker archive and stream-extracts it into the target directory
+    :param url: url of the gzipped tar archive of one speaker
+    :param target_directory: extraction root; members resolving outside of it raise ValueError
+    :param current_item: index of this item, only used for logging
+    :param items_count: total amount of items to download, only used for logging
+    """
+    print('Downloading %s / %s' % (current_item, items_count))
+    root = os.path.realpath(target_directory)  # resolved once so member paths can be compared
+    http = urllib3.PoolManager()
+    with http.request('GET', url, preload_content=False) as r, tarfile.open(fileobj=r, mode="r|gz") as tar:
+        for item in tar:
+            if os.path.commonpath([root, os.path.realpath(os.path.join(root, item.name))]) != root:
+                raise ValueError('"%s" would be extracted outside of "%s"' % (item.name, root))
+            tar.extract(item, target_directory)
+
+
+def ensure_directory(path: str):
+    """
+    Creates a directory (including missing parents) if it does not exist
+    :param path: Path of the directory
+    :raises FileExistsError: if path exists but is not a directory
+    """
+    # exist_ok avoids the check-then-create race of os.path.exists + os.makedirs
+    os.makedirs(path, exist_ok=True)
+
+def download_corpus(target_directory: str,
+                    max_workers: int,
+                    amount: int,
+                    voxforge_url="http://www.repository.voxforge1.org/"
+                                 "downloads/SpeechCorpus/Trunk/Audio/Main/16kHz_16bit"):
+    """
+    Initiates download of the voxforge speech corpus
+    :param target_directory: target directory for the files, "voxforge-corpus" if empty
+    :param max_workers: amount of threads to be used for the download and extraction, 10 if empty
+    :param amount: amount of speaker files to be downloaded, all listed files if empty
+    :param voxforge_url: url to the voxforge corpus
+    """
+    # input validation
+    if not max_workers:
+        max_workers = 10
+    if not target_directory:
+        target_directory = "voxforge-corpus"
+
+    ensure_directory(target_directory)
+
+    # collect links from voxforge; href=True skips anchors that have no href attribute
+    with urlopen(voxforge_url) as html_page:
+        soup = BeautifulSoup(html_page, "html5lib")
+    speaker_refs = [link['href'] for link in soup.find_all('a', href=True) if '.tgz' in link['href']]
+    if amount:
+        speaker_refs = speaker_refs[:max(amount, 0)]
+
+    # urls always use forward slashes, so they are joined by hand instead of os.path.join
+    base_url = voxforge_url.rstrip('/')
+
+    # run multiple threads which download and extract the speaker files;
+    # leaving the with block waits for all of them and releases the threads
+    with ThreadPoolExecutor(max_workers) as executor:
+        futures = [executor.submit(download_and_extract,
+                                   '%s/%s' % (base_url, ref),
+                                   target_directory,
+                                   i + 1,
+                                   len(speaker_refs))
+                   for i, ref in enumerate(speaker_refs)]
+
+    # a failed worker only stores its exception in the future, so report it here
+    for ref, future in zip(speaker_refs, futures):
+        error = future.exception()
+        if error is not None:
+            print('Download of "%s" failed: %s' % (ref, error))
+
+
+if __name__ == '__main__':
+    # command line entry point: positional target directory plus optional limits
+    cli = argparse.ArgumentParser(description="Downloader for the voxforge speech corpus")
+    cli.add_argument('directory', help='directory where to store the downloaded corpus')
+    cli.add_argument('-n', '--number', type=int, help="amount of files to download")
+    cli.add_argument('-w', '--workers', type=int, help="amount of parallel downloads")
+    options = cli.parse_args()
+    download_corpus(target_directory=options.directory, max_workers=options.workers, amount=options.number)
diff --git a/generator.py b/generator.py
@@ -0,0 +1,79 @@
+import os
+import json
+import argparse
+
+from typing import List
+
+
+def read_prompt_file(speaker_directory) -> List[str]:
+    """
+    :param speaker_directory: a directory containing the transcriptions for the audio files
+    :return: the lines of its etc/PROMPTS file, one transcription line per audio file
+    """
+    try:
+        with open(os.path.join(speaker_directory, 'etc', 'PROMPTS')) as file:
+            return file.readlines()
+    except FileNotFoundError as ex:  # re-raised with a message naming the speaker directory
+        raise FileNotFoundError('"%s" has no PROMPTS file' % os.path.abspath(speaker_directory)) from ex
+
+
+def generate_json_file(source: str, destination: str):
+    """
+    Collects path, transcription and size of every audio file into one json file
+    :param source: directory of the corpus, its sub directories are read as speaker folders
+    :param destination: path of the json file to write (overwritten if it exists)
+    :raises FileNotFoundError: if the source directory does not exist
+    """
+    if not os.path.isdir(source):
+        raise FileNotFoundError('The corpus directory "%s" does not exist' % os.path.abspath(source))
+
+    # only directories can be speaker folders; a stray file would make reading its prompt
+    # file fail with an uncaught NotADirectoryError. sorted() keeps the output reproducible
+    speaker_directories = [entry for entry in sorted(os.listdir(source))
+                           if os.path.isdir(os.path.join(source, entry))]
+    data = []
+
+    for i, speaker_directory in enumerate(speaker_directories):
+        print('Processing folder %s / %s' % (i + 1, len(speaker_directories)))
+
+        # get the prompt file from the speaker directory
+        try:
+            prompt_file = read_prompt_file(os.path.join(source, speaker_directory))
+        except FileNotFoundError as ex:
+            print(ex)
+            continue
+
+        for row in prompt_file:
+            # the first field locates the audio file, the rest of the row is the transcription
+            audio_ref, _, transcription = row.partition(' ')
+            if not audio_ref.strip():
+                continue  # blank row, nothing to index
+
+            # recreate the path to the audio file
+            path = audio_ref.replace('/mfc/', '/wav/') + '.wav'
+            path = os.path.abspath(os.path.join(source, path))
+
+            # get transcription from prompt file: single line, lower case, no hyphens
+            transcription = transcription.replace('\n', '').lower().replace('-', '')
+
+            # determine the size of audio file; rows without a readable file are skipped
+            try:
+                size = os.path.getsize(path)
+            except (OSError, ValueError) as ex:
+                print(ex)
+                continue
+
+            data.append({'path': path, 'text': transcription, 'size': size})
+
+    # save training data to file
+    with open(destination, 'w') as outfile:
+        json.dump(data, outfile)
+
+
+if __name__ == '__main__':
+    # command line entry point: both arguments are required positionals
+    cli = argparse.ArgumentParser(description="Tool for preparing training data from the Voxforge corpus")
+    cli.add_argument('source', help='directory of the corpus')
+    cli.add_argument('destination', help='path of the new (json) file containing the training data')
+    options = cli.parse_args()
+    generate_json_file(source=options.source, destination=options.destination)