diff --git a/config.json b/config.json
index 3f3c2abd..af935658 100644
--- a/config.json
+++ b/config.json
@@ -51,5 +51,8 @@
     "max_seq_len": 300,        // DATASET-RELATED: maximum text length
     "output_path": "/media/erogol/data_ssd/Data/models/en_UK/",      // DATASET-RELATED: output path for all training outputs.
     "num_loader_workers": 8,        // number of training data loader processes. Don't set it too big. 4-8 are good values.
-    "num_val_loader_workers": 4        // number of evaluation data loader processes.
+    "num_val_loader_workers": 4,       // number of evaluation data loader processes.
+    "phoneme_cache_path": "tmp_en_uk",  // phoneme computation is slow, therefore, it caches results in the given folder.
+    "use_phonemes": true,           // use phonemes instead of raw characters. It is suggested for better pronunciation.
+    "phoneme_language": "en-us"     // depending on your target language, pick one from https://github.com/bootphon/phonemizer#languages
 }
diff --git a/datasets/TTSDataset.py b/datasets/TTSDataset.py
index 36e918d1..258d00cd 100644
--- a/datasets/TTSDataset.py
+++ b/datasets/TTSDataset.py
@@ -23,7 +23,9 @@ class MyDataset(Dataset):
                  min_seq_len=0,
                  max_seq_len=float("inf"),
                  cached=False,
-                 phoneme_cache_path=None):
+                 use_phonemes=True,
+                 phoneme_cache_path=None,
+                 phoneme_language="en-us"):
         """
         Args:
             root_path (str): root path for the data folder.
@@ -41,7 +43,10 @@
             max_seq_len (int): (float("inf")) maximum sequence length.
             cached (bool): (false) true if the given data path is created
                 by extract_features.py.
+            use_phonemes (bool): (true) if true, text converted to phonemes.
             phoneme_cache_path (str): path to cache phoneme features.
+            phoneme_language (str): one of the languages from
+                https://github.com/bootphon/phonemizer#languages
         """
         self.root_path = root_path
         self.batch_group_size = batch_group_size
@@ -53,9 +58,16 @@
         self.max_seq_len = max_seq_len
         self.ap = ap
         self.cached = cached
+        self.use_phonemes = use_phonemes
         self.phoneme_cache_path = phoneme_cache_path
+        self.phoneme_language = phoneme_language
+        if not os.path.isdir(phoneme_cache_path):
+            os.makedirs(phoneme_cache_path)
         print(" > DataLoader initialization")
         print(" | > Data path: {}".format(root_path))
+        print(" | > Use phonemes: {}".format(self.use_phonemes))
+        if use_phonemes:
+            print(" | > phoneme language: {}".format(phoneme_language))
         print(" | > Cached dataset: {}".format(self.cached))
         print(" | > Number of instances : {}".format(len(self.items)))
@@ -72,33 +84,42 @@ class MyDataset(Dataset):
         data = np.load(filename).astype('float32')
         return data
 
+    def load_phoneme_sequence(self, wav_file, text):
+        file_name = os.path.basename(wav_file).split('.')[0]
+        tmp_path = os.path.join(self.phoneme_cache_path, file_name+'_phoneme.npy')
+        if os.path.isfile(tmp_path):
+            text = np.load(tmp_path)
+        else:
+            text = np.asarray(
+                phoneme_to_sequence(text, [self.cleaners], language=self.phoneme_language), dtype=np.int32)
+            np.save(tmp_path, text)
+        return text
+
     def load_data(self, idx):
         if self.cached:
             wav_name = self.items[idx][1]
             mel_name = self.items[idx][2]
             linear_name = self.items[idx][3]
             text = self.items[idx][0]
-            text = np.asarray(
-                text_to_sequence(text, [self.cleaners]), dtype=np.int32)
+
             if wav_name.split('.')[-1] == 'npy':
                 wav = self.load_np(wav_name)
             else:
                 wav = np.asarray(self.load_wav(wav_name), dtype=np.float32)
             mel = self.load_np(mel_name)
             linear = self.load_np(linear_name)
-            sample = {'text': text, 'wav': wav, 'item_idx': self.items[idx][1], 'mel':mel, 'linear': linear}
         else:
             text, wav_file = self.items[idx]
-            file_name = os.path.basename(wav_file).split('.')[0]
-            tmp_path = os.path.join(self.phoneme_cache_path, file_name+'_phoneme.npy')
-            if os.path.isfile(tmp_path):
-                text = np.load(tmp_path)
-            else:
-                text = np.asarray(
-                    phoneme_to_sequence(text, [self.cleaners]), dtype=np.int32)
-                np.save(tmp_path, text)
             wav = np.asarray(self.load_wav(wav_file), dtype=np.float32)
-            sample = {'text': text, 'wav': wav, 'item_idx': self.items[idx][1]}
+            mel = None
+            linear = None
+
+        if self.use_phonemes:
+            text = self.load_phoneme_sequence(wav_file, text)
+        else:
+            text = np.asarray(
+                text_to_sequence(text, [self.cleaners]), dtype=np.int32)
+        sample = {'text': text, 'wav': wav, 'item_idx': self.items[idx][1], 'mel':mel, 'linear': linear}
         return sample
 
     def sort_items(self):
@@ -157,12 +178,13 @@ class MyDataset(Dataset):
         text_lenghts = np.array([len(x) for x in text])
         max_text_len = np.max(text_lenghts)
 
-        if self.cached:
-            mel = [d['mel'] for d in batch]
-            linear = [d['linear'] for d in batch]
-        else:
+        # if specs are not computed, compute them.
+        if batch[0]['mel'] is None and batch[0]['linear'] is None:
             mel = [self.ap.melspectrogram(w).astype('float32') for w in wav]
             linear = [self.ap.spectrogram(w).astype('float32') for w in wav]
+        else:
+            mel = [d['mel'] for d in batch]
+            linear = [d['linear'] for d in batch]
         mel_lengths = [m.shape[1] + 1 for m in mel]  # +1 for zero-frame
 
         # compute 'stop token' targets
diff --git a/utils/text/__init__.py b/utils/text/__init__.py
index eb3086f2..3008418b 100644
--- a/utils/text/__init__.py
+++ b/utils/text/__init__.py
@@ -68,14 +68,14 @@ def sequence_to_phoneme(sequence):
     return result.replace('}{', ' ')
 
 
-def text2phone(text):
+def text2phone(text, language):
     '''
     Convert graphemes to phonemes.
     '''
     seperator = phonemizer.separator.Separator(' ', '', '|')
     #try:
     punctuations = re.findall(pat, text)
-    ph = phonemizer.phonemize(text, separator=seperator, strip=False, njobs=1, backend='espeak', language='en-us')
+    ph = phonemize(text, separator=seperator, strip=False, njobs=1, backend='espeak', language=language)
     # Replace \n with matching punctuations.
     if len(punctuations) > 0:
         for punct in punctuations[:-1]:
@@ -87,13 +87,13 @@ def text2phone(text):
     return ph
 
 
-def phoneme_to_sequence(text, cleaner_names):
+def phoneme_to_sequence(text, cleaner_names, language):
     '''
     TODO: This ignores punctuations
     '''
     sequence = []
     clean_text = _clean_text(text, cleaner_names)
-    phonemes = text2phone(clean_text)
+    phonemes = text2phone(clean_text, language)
     # print(phonemes.replace('|', ''))
     if phonemes is None:
         print("!! After phoneme conversion the result is None. -- {} ".format(clean_text))
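
The caching pattern added to MyDataset.load_phoneme_sequence above can be tried on its own. The following is a minimal sketch of the same compute-once, cache-to-.npy scheme, assuming the repo root is on PYTHONPATH and phonemizer with an espeak backend is installed; the helper name cached_phoneme_sequence, the "english_cleaners" choice, and the example paths are illustrative only, not part of this change.

    import os
    import numpy as np
    from utils.text import phoneme_to_sequence  # module patched in this diff

    def cached_phoneme_sequence(wav_file, text, cache_dir,
                                cleaners="english_cleaners", language="en-us"):
        """Return the int32 phoneme id sequence for `text`, reusing a cached .npy
        keyed by the wav file's basename (same scheme as load_phoneme_sequence)."""
        os.makedirs(cache_dir, exist_ok=True)
        file_name = os.path.basename(wav_file).split('.')[0]
        cache_path = os.path.join(cache_dir, file_name + '_phoneme.npy')
        if os.path.isfile(cache_path):
            return np.load(cache_path)
        seq = np.asarray(phoneme_to_sequence(text, [cleaners], language), dtype=np.int32)
        np.save(cache_path, seq)
        return seq

    # Hypothetical usage: wav path and sentence are made up for illustration.
    ids = cached_phoneme_sequence("wavs/utt_0001.wav", "Hello world.", "tmp_en_uk")
    print(ids.shape, ids.dtype)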
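For the utils/text changes, the new language argument is simply forwarded to phonemizer's espeak backend. A hypothetical call with a non-default language could look like the sketch below; it assumes the repo's basic_cleaners cleaner exists and that espeak supports the chosen code (the sample sentence is illustrative).

    from utils.text import text2phone, phoneme_to_sequence

    # Any language code accepted by the espeak backend should work here,
    # e.g. "de" for German (see the phonemizer language list linked above).
    print(text2phone("Guten Morgen.", "de"))
    print(phoneme_to_sequence("Guten Morgen.", ["basic_cleaners"], "de"))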