Add preprocessor for TWEB dataset

2018-12-17 16:33:29 +01:00 · 2018-12-17 16:33:29 +01:00 · 4587c72a03
parent d38872a4d0
commit 4587c72a03
1 changed files with 14 additions and 4 deletions
--- a/datasets/preprocess.py
+++ b/datasets/preprocess.py
@ -13,10 +13,20 @@ def tts_cache(root_path, meta_file):
    return items            


-# def tweb(root_path, meta_file):
-#     # TODO
-#     pass
-#     return 
+def tweb(root_path, meta_file):
+    """Normalize TWEB dataset.
+    https://www.kaggle.com/bryanpark/the-world-english-bible-speech-dataset
+
+    Reads the tab-separated metadata file: the first column is the audio
+    file stem (``.wav`` is appended) and the second column is the text.
+
+    Args:
+        root_path: dataset directory; wav paths are joined onto it.
+        meta_file: metadata file name, joined onto ``root_path``.
+
+    Returns:
+        A randomly shuffled list of ``[text, wav_file]`` pairs.
+    """
+    txt_file = os.path.join(root_path, meta_file)
+    items = []
+    # Explicit encoding so parsing does not depend on the platform locale.
+    with open(txt_file, 'r', encoding='utf-8') as ttf:
+        for line in ttf:
+            # Tolerate blank lines (e.g. an empty last line) instead of
+            # failing on the missing text column.
+            if not line.strip():
+                continue
+            cols = line.split('\t')
+            wav_file = os.path.join(root_path, cols[0] + '.wav')
+            # Drop the line terminator that would otherwise stay in the text.
+            text = cols[1].strip()
+            items.append([text, wav_file])
+    random.shuffle(items)
+    return items
    

 # def kusal(root_path, meta_file):