diff --git a/MITADS-Speech/assets/corpora_collector/mitads-speech-full.yaml b/MITADS-Speech/assets/corpora_collector/mitads-speech-full.yaml
new file mode 100644
index 00000000..8771575f
--- /dev/null
+++ b/MITADS-Speech/assets/corpora_collector/mitads-speech-full.yaml
@@ -0,0 +1,43 @@
+##
+name: 'mitads-speech-full'
+version: '0.1'
+description: 'MITADS-Speech Dataset, filters out audio longer than 20 seconds'
+
+corpus2collect:
+  voxforge:
+    filter:
+      max_duration: 20
+  evalita2009:
+    filter:
+      max_duration: 20
+  mspka:
+    filter:
+      max_duration: 20
+  siwis:
+    filter:
+      max_duration: 20
+  #common_voice:
+  #  filter:
+  #    max_duration: 20
+
+  m-ailabs:
+    filter:
+      max_duration: 20
+
+  mls:
+    filter:
+      max_duration: 20
+      comments_contains:
+        ## filter ancient works by author
+        - Dante Alighieri
+        - Giovanni Francesco Straparola
+        - Niccolò Machiavelli
+        ## filter book titles already present in m-ailabs
+        - Novelle per un anno
+        - Galatea
+        - Il fu Mattia Pascal
+        - Ritratto del Diavolo
+        - Contessa di Karolystria
+        - meraviglie del Duemila
+        - Malavoglia
+
\ No newline at end of file
diff --git a/MITADS-Speech/corpora_collector.py b/MITADS-Speech/corpora_collector.py
index a3ba9357..640f58be 100644
--- a/MITADS-Speech/corpora_collector.py
+++ b/MITADS-Speech/corpora_collector.py
@@ -39,6 +39,10 @@
                     help='root folder of csv dataset to collect, also is root of output csv'
                     'default is root_project/MITADS-Speech-output')
 
+collector_parser.add_argument('-d', '--dataset_output', type=str, default='',
+                    help='root folder of the output dataset; '
+                    'default is csv_folder')
+
 collector_parser.add_argument('-z', '--zip_output', type=str, default='true',
                     help='if true collect files into .zip. If false files are copyed to a folder in csv_folder')
 
@@ -164,6 +168,9 @@ def collect_datasets(config,args):
 
     zip_output = True if args.zip_output.lower()=='true' else False
     csv_corpus_rootdir = args.csv_folder
+
+    final_dataset_root = csv_corpus_rootdir if args.dataset_output=='' else args.dataset_output
+
 
     corpus2collect = config['corpus2collect']
 
@@ -183,7 +190,7 @@
     final_corpora_name = config['name']
     final_corpora_version = config['version']
     output_corpora_foldername = final_corpora_name + '_' + 'v' + final_corpora_version
-    corpora_output_dir = os.path.join(csv_corpus_rootdir, output_corpora_foldername)
+    corpora_output_dir = os.path.join(final_dataset_root, output_corpora_foldername)
 
     if not path.exists(corpora_output_dir):
         print('No path "%s" - creating ...' % corpora_output_dir)
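Note (illustrative, not part of this patch): assuming the collector reads the YAML above with PyYAML (added to requirements.txt in this change) and that comments_contains sits next to max_duration under a corpus filter, the per-corpus filters could be evaluated for one CSV row roughly as in the sketch below. load_collect_config and row_passes_filter are hypothetical names, not the project's actual API.

import yaml

def load_collect_config(path):
    # parse the collector configuration (PyYAML is listed in requirements.txt)
    with open(path, encoding='utf-8') as f:
        return yaml.safe_load(f)

def row_passes_filter(corpus_filter, duration, comments):
    # drop rows whose audio is longer than max_duration (seconds)
    max_duration = corpus_filter.get('max_duration')
    if max_duration is not None and duration > max_duration:
        return False
    # drop rows whose comments mention one of the listed authors or titles
    for needle in corpus_filter.get('comments_contains', []):
        if needle in comments:
            return False
    return True

# usage sketch, path relative to the MITADS-Speech folder
config = load_collect_config('assets/corpora_collector/mitads-speech-full.yaml')
mls_filter = config['corpus2collect']['mls']['filter']
print(row_passes_filter(mls_filter, 12.5, 'LibriVox, Dante Alighieri'))  # False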
diff --git a/MITADS-Speech/corpora_importer.py b/MITADS-Speech/corpora_importer.py
index 8b15aaac..f64ea990 100644
--- a/MITADS-Speech/corpora_importer.py
+++ b/MITADS-Speech/corpora_importer.py
@@ -294,7 +294,7 @@ def _maybe_convert_sets(self,corpus:Corpus):
 
         ## all examples are processed, even if the resample is not necessary, the duration or other filters should be evaluated
         samples = [ [a,corpus.make_wav_resample, corpus.utterences[a]] for a in corpus.audios ]
-        ##self.one_sample(samples[0])
+        #self.one_sample(samples[23])
         # Mutable counters for the concurrent embedded routine
         counter = get_counter()
         print(f"Converting audio files to wav {SAMPLE_RATE}hz Mono")
@@ -331,6 +331,8 @@ def row_validation(self,filename,duration,comments):
     def one_sample(self,sample):
 
         delete_original_if_resampled = True
+        ## set to False if you want to run the importer multiple times (e.g. a local test)
+        #delete_original_if_resampled = False
 
         orig_filename = sample[0]
         make_wav_resample = sample[1]
diff --git a/MITADS-Speech/requirements.txt b/MITADS-Speech/requirements.txt
index 43729267..f1da140c 100644
--- a/MITADS-Speech/requirements.txt
+++ b/MITADS-Speech/requirements.txt
@@ -4,4 +4,5 @@ sox
 progressbar2==3.47.0
 ## pycopy-shutil problem istall on colab
 charset_normalizer
-ds-ctcdecoder==0.9.3
\ No newline at end of file
+ds-ctcdecoder==0.9.3
+PyYAML
\ No newline at end of file
diff --git a/MITADS-Speech/siwis_importer.py b/MITADS-Speech/siwis_importer.py
index 04597404..02904687 100644
--- a/MITADS-Speech/siwis_importer.py
+++ b/MITADS-Speech/siwis_importer.py
@@ -20,8 +20,8 @@ def get_corpus(self):
         text_dir = os.path.join(self.origin_data_path, self.extract_dir, "txt","IT")
         ##read transcript in prompts.txt
         transcripts = {}
-        ##cp1252 if windows os
-        encoding = 'cp1252' if os.name == 'nt' else 'utf-8'
+        ## the prompts file is encoded as cp1252
+        encoding = 'cp1252'
         ###read transcript from prompts file
         with open(os.path.join(self.origin_data_path,self.extract_dir, "prompts","ALL_IT_prompts_iso.txt"), "r",encoding=encoding) as f:
             line = f.readline()
@@ -103,6 +103,13 @@ def get_speaker_id(self,audio_file_path):
 
     # Validate and normalize transcriptions. Returns a cleaned version of the label
    # or None if it's invalid.
     def validate_label(self,label):
+        ##import unicodedata
+        ## NFKD normalization would strip the accented chars è ò à, so it stays disabled
+        #label = (
+        #    unicodedata.normalize("NFKD", label.strip())
+        #    .encode("ascii", "ignore")
+        #    .decode("ascii", "ignore")
+        #    )
         label = label.replace("-", " ")
         label = label.replace("_", " ")
@@ -154,20 +161,30 @@
         label = label.replace("741", "settecentoquarantuno")
         label = label.replace("103", "settecentoquarantuno")
         ########################
+        ## other characters to clean
+        label = label.replace("\ufeff", "")
+        ##
 
         if re.search(r"[0-9]|[\[\]&*{]", label) is not None:
             return None
-        label = label.strip()
         label = label.lower()
 
+        ## DEBUG - uncomment to check the normalization char by char
+        #DEBUG_ALPHABET = ' ,\',a,b,c,d,e,f,g,h,i,j,k,l,m,n,o,p,q,r,s,t,u,v,w,x,y,z,à,è,é,ì,í,ò,ó,ô,ù,ú'.split(',')
+        #for c in label:
+        #    if(c not in DEBUG_ALPHABET):
+        #        print('CHECK char:'+ c)
+
         return label if label else None
 
 
 if __name__ == "__main__":
 
     from corpora_importer import importer_parser
     args = importer_parser.parse_args()
+    #args.download_directory = "F:\\DATASET-MODELS\\speech_dataset\\CORPORA-IT-AUDIO\\SIWIS"
+    #args.csv_output_folder = "F:\\DATASET-MODELS\\speech_dataset\\new-speech-corpora-it"
 
     corpus_name=CORPUS_NAME
     archive_url = 'https://phonogenres.unige.ch/downloads/siwis_latest.zip'
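Note (illustrative, not part of this patch): the commented DEBUG block added to validate_label above checks a cleaned label character by character against an allowed alphabet. A standalone version of that check could look like the sketch below; ALPHABET and check_label are hypothetical names.

ALPHABET = set(" ,'abcdefghijklmnopqrstuvwxyzàèéìíòóôùú")

def check_label(label):
    # strip the BOM the same way validate_label now does, then lowercase
    label = label.replace("\ufeff", "").strip().lower()
    # report every character the target alphabet does not cover
    unknown = sorted({c for c in label if c not in ALPHABET})
    if unknown:
        print('CHECK chars: ' + ' '.join(unknown))
    return not unknown

print(check_label("\ufeffperché no?"))  # False: '?' is not in the alphabet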
diff --git a/MITADS-Speech/voxforge_importer.py b/MITADS-Speech/voxforge_importer.py
new file mode 100644
index 00000000..ccf4a760
--- /dev/null
+++ b/MITADS-Speech/voxforge_importer.py
@@ -0,0 +1,117 @@
+#!/usr/bin/env python3
+import time
+import os
+import re
+from corpora_importer import ArchiveImporter,Corpus,string_escape
+
+
+import urllib.request
+from bs4 import BeautifulSoup
+import time
+CORPUS_NAME = 'voxforge'
+
+class VoxforgeImporter(ArchiveImporter):
+
+
+    def get_corpus(self):
+        ##extract training and development datasets
+        ##do data merge, ArchiveImporter makes the final train/test/dev datasets
+        utterances = {}
+        audios = []
+        wav_dir = os.path.join(self.origin_data_path, self.archive_name, "wav")
+        text_file = os.path.join(self.origin_data_path, self.archive_name, "etc","PROMPTS")
+
+        wav_files = [f for f in os.listdir(wav_dir) if os.path.isfile(os.path.join(wav_dir, f))]
+        count=0
+
+        with open(text_file,encoding='utf-8') as f:
+            for line in f:
+                temp_2 = line.split(" ", 1)
+                ref_url = temp_2[0]
+                transcript = temp_2[1].lower()
+                transcript = transcript.replace('\n','')
+
+                temp = ref_url.split('/')
+                speaker_id = temp[0]
+                file_n = temp[-1]
+                for wav_file in wav_files:
+                    if(file_n in wav_file):
+                        ##found the matching wav file
+                        wav_file_path = os.path.join(wav_dir,wav_file)
+                        utterances[ wav_file_path] = transcript
+                        audios.append(wav_file_path)
+                        count +=1
+                        break
+
+
+        ##collect the corpus
+        corpus = Corpus(utterances,audios)
+        #################
+        ## VoxForge needs wav resampling
+        ##
+        corpus.make_wav_resample = True
+        return corpus
+
+    def get_speaker_id(self,audio_file_path):
+
+        return self.archive_name
+
+
+def get_voxforge_bad_speaker():
+
+    l = []
+    l.append("anonymous-20080504-qvg")
+    l.append("anonymous-20080723-ouv")
+    l.append("anonymous-20080725-dey")
+    l.append("Vistaus-20080718-mrm")
+    #l.append("")
+    #l.append("")
+
+
+    return l
+
+
+
+if __name__ == "__main__":
+
+    from corpora_importer import importer_parser
+    args = importer_parser.parse_args()
+
+    corpus_name=CORPUS_NAME
+    archive_urls = []
+
+    #voxforge_url = "http://www.repository.voxforge1.org/downloads/SpeechCorpus/Trunk/Audio/Main/16kHz_16bit"
+    voxforge_url = "http://www.repository.voxforge1.org/downloads/it/Trunk/Audio/Main/16kHz_16bit/"
+
+
+    html_page = urllib.request.urlopen(voxforge_url)
+    soup = BeautifulSoup(html_page, "html.parser")
+
+    # list all links
+    archives = [l["href"] for l in soup.find_all("a") if ".tgz" in l["href"]]
+
+    bad_speakers = get_voxforge_bad_speaker()
+    for i in range(len(archives)):
+        archive_url = voxforge_url + '' + archives[i]
+
+        speaker_id = archives[i].split('.')[0]
+
+        if(speaker_id in bad_speakers):
+            ##filter out bad speakers
+            print("filter speaker {}".format(speaker_id))
+            continue
+
+        csv_append_mode = not i==0
+
+        _importer = VoxforgeImporter(corpus_name,archive_url,data_dir=args.download_directory,output_path=args.csv_output_folder,csv_append_mode=csv_append_mode)
+
+        try:
+            _importer.run()
+        except Exception as e:
+            print(str(e))
+            print('ARCHIVE CORRUPTED {}'.format(_importer.archive_name))
+            ##some archives are corrupted, skip them
+            continue
+
+        ##sleep to avoid the host interrupting the connection
+        time.sleep(2)
\ No newline at end of file
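Note (illustrative, not part of this patch): the loop above gives each VoxForge archive a single attempt and skips it on any exception, treating it as corrupted. If most failures turn out to be dropped connections rather than corrupted archives, a small retry helper along these lines could be wrapped around _importer.run(); run_with_retries is a hypothetical name.

import time

def run_with_retries(importer, attempts=3, pause_seconds=5):
    # try a few times, pausing between attempts so the host is not hammered
    for attempt in range(1, attempts + 1):
        try:
            importer.run()
            return True
        except Exception as e:
            print('attempt {} failed for {}: {}'.format(attempt, importer.archive_name, e))
            time.sleep(pause_seconds)
    # still failing: treat the archive as corrupted and let the caller skip it
    return False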