From f59543d1273f6cc5a19947fce98c1604de92f2e8 Mon Sep 17 00:00:00 2001 From: Thomas Werkmeister Date: Fri, 19 Jul 2019 15:17:35 +0200 Subject: [PATCH 1/2] fixed usage of bos&eos char with caching --- datasets/TTSDataset.py | 54 +++++++++++++++++++++++++----------------- utils/text/__init__.py | 14 ++++++----- 2 files changed, 40 insertions(+), 28 deletions(-) diff --git a/datasets/TTSDataset.py b/datasets/TTSDataset.py index 85a67001..66cbcb7f 100644 --- a/datasets/TTSDataset.py +++ b/datasets/TTSDataset.py @@ -5,7 +5,7 @@ import torch import random from torch.utils.data import Dataset -from utils.text import text_to_sequence, phoneme_to_sequence +from utils.text import text_to_sequence, phoneme_to_sequence, pad_with_eos_bos from utils.data import prepare_data, prepare_tensor, prepare_stop_target @@ -73,34 +73,44 @@ class MyDataset(Dataset): data = np.load(filename).astype('float32') return data - def load_phoneme_sequence(self, wav_file, text): + def _generate_and_cache_phoneme_sequence(self, text, cache_path): + """generate a phoneme sequence from text. + + since the usage is for subsequent caching, we never add bos and + eos chars here. Instead we add those dynamically later; based on the + config option.""" + phonemes = phoneme_to_sequence(text, [self.cleaners], + language=self.phoneme_language, + enable_eos_bos=False) + phonemes = np.asarray(phonemes, dtype=np.int32) + np.save(cache_path, phonemes) + return phonemes + + def _load_or_generate_phoneme_sequence(self, wav_file, text): file_name = os.path.basename(wav_file).split('.')[0] - tmp_path = os.path.join(self.phoneme_cache_path, - file_name + '_phoneme.npy') - if os.path.isfile(tmp_path): - try: - text = np.load(tmp_path) - except (IOError, ValueError): - print(" > ERROR: phoneme connot be loaded for {}. 
Recomputing.".format(wav_file)) - text = np.asarray( - phoneme_to_sequence( - text, [self.cleaners], language=self.phoneme_language, enable_eos_bos=self.enable_eos_bos), - dtype=np.int32) - np.save(tmp_path, text) - else: - text = np.asarray( - phoneme_to_sequence( - text, [self.cleaners], language=self.phoneme_language, enable_eos_bos=self.enable_eos_bos), - dtype=np.int32) - np.save(tmp_path, text) - return text + cache_path = os.path.join(self.phoneme_cache_path, + file_name + '_phoneme.npy') + try: + phonemes = np.load(cache_path) + except FileNotFoundError: + phonemes = self._generate_and_cache_phoneme_sequence(text, + cache_path) + except (ValueError, IOError): + print(" > ERROR: failed loading phonemes for {}. " + "Recomputing.".format(wav_file)) + phonemes = self._generate_and_cache_phoneme_sequence(text, + cache_path) + if self.enable_eos_bos: + phonemes = pad_with_eos_bos(phonemes) + + return phonemes def load_data(self, idx): text, wav_file, speaker_name = self.items[idx] wav = np.asarray(self.load_wav(wav_file), dtype=np.float32) if self.use_phonemes: - text = self.load_phoneme_sequence(wav_file, text) + text = self._load_or_generate_phoneme_sequence(wav_file, text) else: text = np.asarray( text_to_sequence(text, [self.cleaners]), dtype=np.int32) diff --git a/utils/text/__init__.py b/utils/text/__init__.py index 5431e46e..332163d2 100644 --- a/utils/text/__init__.py +++ b/utils/text/__init__.py @@ -4,7 +4,8 @@ import re import phonemizer from phonemizer.phonemize import phonemize from utils.text import cleaners -from utils.text.symbols import symbols, phonemes, _phoneme_punctuations +from utils.text.symbols import symbols, phonemes, _phoneme_punctuations, _bos, \ + _eos # Mappings from symbol to numeric ID and vice versa: _SYMBOL_TO_ID = {s: i for i, s in enumerate(symbols)} @@ -45,11 +46,12 @@ def text2phone(text, language): return ph +def pad_with_eos_bos(phoneme_sequence): + return [_PHONEMES_TO_ID[_bos]] + list(phoneme_sequence) + [_PHONEMES_TO_ID[_eos]] 
+ + def phoneme_to_sequence(text, cleaner_names, language, enable_eos_bos=False): - if enable_eos_bos: - sequence = [_PHONEMES_TO_ID['^']] - else: - sequence = [] + sequence = [] text = text.replace(":", "") clean_text = _clean_text(text, cleaner_names) to_phonemes = text2phone(clean_text, language) @@ -60,7 +62,7 @@ def phoneme_to_sequence(text, cleaner_names, language, enable_eos_bos=False): sequence += _phoneme_to_sequence(phoneme) # Append EOS char if enable_eos_bos: - sequence.append(_PHONEMES_TO_ID['~']) + sequence = pad_with_eos_bos(sequence) return sequence From 116a21b659f9b560990bd46b38f5d6a81754489e Mon Sep 17 00:00:00 2001 From: Thomas Werkmeister Date: Fri, 19 Jul 2019 16:26:36 +0200 Subject: [PATCH 2/2] lint indentation --- datasets/TTSDataset.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/datasets/TTSDataset.py b/datasets/TTSDataset.py index 66cbcb7f..ecf8e9ea 100644 --- a/datasets/TTSDataset.py +++ b/datasets/TTSDataset.py @@ -99,7 +99,7 @@ class MyDataset(Dataset): print(" > ERROR: failed loading phonemes for {}. " "Recomputing.".format(wav_file)) phonemes = self._generate_and_cache_phoneme_sequence(text, - cache_path) + cache_path) if self.enable_eos_bos: phonemes = pad_with_eos_bos(phonemes)