From 2ca74b8ab3e78b92e4f0d319f5e8b211a2ad121e Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Eren=20G=C3=B6lge?= <erogol@hotmail.com>
Date: Wed, 17 Feb 2021 13:35:23 +0000
Subject: [PATCH] add RUSLAN dataset preprocessor

---
 TTS/tts/datasets/preprocess.py | 18 +++++++++++++++++-
 1 file changed, 17 insertions(+), 1 deletion(-)

diff --git a/TTS/tts/datasets/preprocess.py b/TTS/tts/datasets/preprocess.py
index 7815d87d..bed76c86 100644
--- a/TTS/tts/datasets/preprocess.py
+++ b/TTS/tts/datasets/preprocess.py
@@ -153,7 +153,8 @@ def mailabs(root_path, meta_files=None):
 
 
 def ljspeech(root_path, meta_file):
-    """Normalizes the Nancy meta data file to TTS format"""
+    """Normalizes the LJSpeech meta data file to TTS format
+    https://keithito.com/LJ-Speech-Dataset/"""
     txt_file = os.path.join(root_path, meta_file)
     items = []
     speaker_name = "ljspeech"
@@ -166,6 +167,21 @@ def ljspeech(root_path, meta_file):
     return items
 
 
+def ruslan(root_path, meta_file):
+    """Normalizes the RUSLAN meta data file ('|'-separated rows: wav id, then text) to TTS [text, wav_file, speaker_name] items
+    https://ruslan-corpus.github.io/"""
+    txt_file = os.path.join(root_path, meta_file)
+    items = []
+    speaker_name = "ruslan"  # previously "ljspeech", a copy-paste leftover from the ljspeech preprocessor
+    with open(txt_file, 'r', encoding='utf-8') as ttf:  # Russian transcripts: decode explicitly, not via the locale default
+        for line in ttf:
+            cols = line.split('|')
+            wav_file = os.path.join(root_path, 'RUSLAN', cols[0] + '.wav')  # wavs are looked up under <root_path>/RUSLAN/
+            text = cols[1]
+            items.append([text, wav_file, speaker_name])
+    return items
+
+
 def css10(root_path, meta_file):
     """Normalizes the CSS10 dataset file to TTS format"""
     txt_file = os.path.join(root_path, meta_file)