def mozilla(root_path, meta_files):
    """Normalizes Mozilla meta data files to TTS format.

    Recursively searches ``root_path`` for metadata files named
    ``batch*.txt``. Each line of such a file is expected to look like
    ``<text>|<wav file name>``; the wav file is looked up in the
    ``wavs_no_processing`` folder located next to the metadata file.

    Args:
        root_path: Directory that is searched recursively. A trailing path
            separator is optional.
        meta_files: Ignored. The metadata files are always discovered by
            globbing ``root_path``; the parameter is kept so the signature
            matches the other preprocessors.

    Returns:
        A randomly shuffled list of ``[text, wav_file]`` pairs, containing
        only entries whose wav file exists on disk.
    """
    import glob

    # Build the pattern with os.path.join so that "**" is always its own path
    # component (required for recursive matching), with or without a trailing
    # separator on root_path.
    pattern = os.path.join(root_path, "**", "batch*.txt")
    items = []
    for meta_file in glob.glob(pattern, recursive=True):
        print(" | > {}".format(meta_file))
        # glob results already include root_path, so they are used as-is;
        # joining root_path again would duplicate it for relative roots.
        folder = os.path.dirname(meta_file.strip())
        with open(meta_file, 'r') as ttf:
            for line in ttf:
                cols = line.split('|')
                if len(cols) < 2:
                    # Blank or malformed line: report it and keep going
                    # instead of failing on the missing column.
                    print(" > Error: {}".format(line))
                    continue
                wav_file = os.path.join(folder, 'wavs_no_processing',
                                        cols[1].strip())
                if os.path.isfile(wav_file):
                    items.append([cols[0].strip(), wav_file])
                else:
                    print(" > Error: {}".format(line))
    random.shuffle(items)
    return items