add blank token in sequence to increase GlowTTS quality

pull/10/head
Edresson 2020-10-25 15:08:28 -03:00
parent fbea058c59
commit d9540a5857
7 changed files with 26 additions and 7 deletions

View File

@ -47,6 +47,7 @@ def setup_loader(ap, r, is_val=False, verbose=False, speaker_mapping=None):
meta_data=meta_data_eval if is_val else meta_data_train,
ap=ap,
tp=c.characters if 'characters' in c.keys() else None,
add_blank=c['add_blank'] if 'add_blank' in c.keys() else False,
batch_group_size=0 if is_val else c.batch_group_size *
c.batch_size,
min_seq_len=c.min_seq_len,

View File

@ -51,6 +51,7 @@ def setup_loader(ap, r, is_val=False, verbose=False, speaker_mapping=None):
meta_data=meta_data_eval if is_val else meta_data_train,
ap=ap,
tp=c.characters if 'characters' in c.keys() else None,
add_blank=c['add_blank'] if 'add_blank' in c.keys() else False,
batch_group_size=0 if is_val else c.batch_group_size *
c.batch_size,
min_seq_len=c.min_seq_len,

View File

@ -51,6 +51,8 @@
// "phonemes":"iyɨʉɯuɪʏʊeøɘəɵɤoɛœɜɞʌɔæɐaɶɑɒᵻʘɓǀɗǃʄǂɠǁʛpbtdʈɖcɟkɡʔɴŋɲɳnɱmʙrʀⱱɾɽɸβfvθðszʃʒʂʐçʝxɣχʁħʕhɦɬɮʋɹɻjɰlɭʎʟˈˌːˑʍwɥʜʢʡɕʑɺɧɚ˞ɫ"
// },
"add_blank": false, // if true add a new token after each token of the sentence. This increases the size of the input sequence, but has considerably improved the prosody of the GlowTTS model.
// DISTRIBUTED TRAINING
"distributed":{
"backend": "nccl",

View File

@ -51,6 +51,8 @@
// "phonemes":"iyɨʉɯuɪʏʊeøɘəɵɤoɛœɜɞʌɔæɐaɶɑɒᵻʘɓǀɗǃʄǂɠǁʛpbtdʈɖcɟkɡʔɴŋɲɳnɱmʙrʀⱱɾɽɸβfvθðszʃʒʂʐçʝxɣχʁħʕhɦɬɮʋɹɻjɰlɭʎʟˈˌːˑʍwɥʜʢʡɕʑɺɧɚ˞ɫ"
// },
"add_blank": false, // if true add a new token after each token of the sentence. This increases the size of the input sequence, but has considerably improved the prosody of the GlowTTS model.
// DISTRIBUTED TRAINING
"distributed":{
"backend": "nccl",

View File

@ -17,6 +17,7 @@ class MyDataset(Dataset):
ap,
meta_data,
tp=None,
add_blank=False,
batch_group_size=0,
min_seq_len=0,
max_seq_len=float("inf"),
@ -55,6 +56,7 @@ class MyDataset(Dataset):
self.max_seq_len = max_seq_len
self.ap = ap
self.tp = tp
self.add_blank = add_blank
self.use_phonemes = use_phonemes
self.phoneme_cache_path = phoneme_cache_path
self.phoneme_language = phoneme_language
@ -88,7 +90,7 @@ class MyDataset(Dataset):
phonemes = phoneme_to_sequence(text, [self.cleaners],
language=self.phoneme_language,
enable_eos_bos=False,
tp=self.tp)
tp=self.tp, add_blank=self.add_blank)
phonemes = np.asarray(phonemes, dtype=np.int32)
np.save(cache_path, phonemes)
return phonemes
@ -127,7 +129,7 @@ class MyDataset(Dataset):
text = self._load_or_generate_phoneme_sequence(wav_file, text)
else:
text = np.asarray(text_to_sequence(text, [self.cleaners],
tp=self.tp),
tp=self.tp, add_blank=self.add_blank),
dtype=np.int32)
assert text.size > 0, self.items[idx][1]

View File

@ -14,10 +14,13 @@ def text_to_seqvec(text, CONFIG):
seq = np.asarray(
phoneme_to_sequence(text, text_cleaner, CONFIG.phoneme_language,
CONFIG.enable_eos_bos_chars,
tp=CONFIG.characters if 'characters' in CONFIG.keys() else None),
tp=CONFIG.characters if 'characters' in CONFIG.keys() else None,
add_blank=CONFIG['add_blank'] if 'add_blank' in CONFIG.keys() else False),
dtype=np.int32)
else:
seq = np.asarray(text_to_sequence(text, text_cleaner, tp=CONFIG.characters if 'characters' in CONFIG.keys() else None), dtype=np.int32)
seq = np.asarray(
text_to_sequence(text, text_cleaner, tp=CONFIG.characters if 'characters' in CONFIG.keys() else None,
add_blank=CONFIG['add_blank'] if 'add_blank' in CONFIG.keys() else False), dtype=np.int32)
return seq

View File

@ -57,6 +57,10 @@ def text2phone(text, language):
return ph
def intersperse(sequence, token):
    """Return a new list with *token* placed between, before, and after
    every element of *sequence*.

    For an input of length n the output has length 2*n + 1: the token
    occupies every even index, the original elements every odd index.
    """
    interleaved = [token] * (2 * len(sequence) + 1)
    # Drop the original elements into the odd slots via extended slicing.
    interleaved[1::2] = sequence
    return interleaved
def pad_with_eos_bos(phoneme_sequence, tp=None):
# pylint: disable=global-statement
@ -69,8 +73,7 @@ def pad_with_eos_bos(phoneme_sequence, tp=None):
return [_phonemes_to_id[_bos]] + list(phoneme_sequence) + [_phonemes_to_id[_eos]]
def phoneme_to_sequence(text, cleaner_names, language, enable_eos_bos=False, tp=None):
def phoneme_to_sequence(text, cleaner_names, language, enable_eos_bos=False, tp=None, add_blank=False):
# pylint: disable=global-statement
global _phonemes_to_id
if tp:
@ -88,6 +91,8 @@ def phoneme_to_sequence(text, cleaner_names, language, enable_eos_bos=False, tp=
# Append EOS char
if enable_eos_bos:
sequence = pad_with_eos_bos(sequence, tp=tp)
if add_blank:
sequence = intersperse(sequence, len(_phonemes)) # add a blank token (new), whose id number is len(_phonemes)
return sequence
@ -107,7 +112,7 @@ def sequence_to_phoneme(sequence, tp=None):
return result.replace('}{', ' ')
def text_to_sequence(text, cleaner_names, tp=None):
def text_to_sequence(text, cleaner_names, tp=None, add_blank=False):
'''Converts a string of text to a sequence of IDs corresponding to the symbols in the text.
The text can optionally have ARPAbet sequences enclosed in curly braces embedded
@ -137,6 +142,9 @@ def text_to_sequence(text, cleaner_names, tp=None):
_clean_text(m.group(1), cleaner_names))
sequence += _arpabet_to_sequence(m.group(2))
text = m.group(3)
if add_blank:
sequence = intersperse(sequence, len(_symbols)) # add a blank token (new), whose id number is len(_symbols)
return sequence