Update Mozilla preprocessor to read a single meta file

2019-04-28 14:05:06 +02:00 · 2019-04-28 14:05:06 +02:00 · 278c7a91b7
parent 70eabaf4d8
commit 278c7a91b7
1 changed file with 10 additions and 21 deletions
--- a/datasets/preprocess.py
+++ b/datasets/preprocess.py
@@ -41,29 +41,18 @@ def tweb(root_path, meta_file):
 #     return  {'text': texts, 'wavs': wavs}


-def mozilla(root_path, meta_files):
+def mozilla(root_path, meta_file):
    """Normalizes Mozilla meta data files to TTS format"""
-    import glob
-    meta_files = glob.glob(root_path + "/**/batch*.txt", recursive=True)
-    folders = [os.path.dirname(f.strip()) for f in meta_files]
+    txt_file = os.path.join(root_path, meta_file)
    items = []
-    for idx, meta_file in enumerate(meta_files):
-        folder = folders[idx]
-        # txt_file = os.path.join(root_path, meta_file)
-        txt_file = meta_file
-        with open(txt_file, 'r') as ttf:
-            for line in ttf:
-                cols = line.split('|')
-                # wav_file = os.path.join(root_path, folder,
-                # 'wavs_no_processing', cols[1].strip())
-                wav_file = os.path.join(folder, 'wavs_no_processing',
-                                        cols[1].strip())
-                if os.path.isfile(wav_file):
-                    text = cols[0].strip()
-                    items.append([text, wav_file])
-                else:
-                    print(" > Error: {}".format(wav_file))
-                    continue
+    with open(txt_file, 'r') as ttf:
+        for line in ttf:
+            cols = line.split('|')
+            batch_no = int(cols[1].strip().split("_")[0])
+            wav_folder = "batch{}".format(batch_no)
+            wav_file = os.path.join(root_path, wav_folder, "wavs_no_processing", cols[1].strip())
+            text = cols[0].strip()
+            items.append([text, wav_file])
    return items