mirror of https://github.com/coqui-ai/TTS.git
Merge branch 'dev' into dev
commit
eb84bb2bc8
|
@ -1 +1 @@
|
|||
0.0.15
|
||||
0.0.16
|
||||
|
|
|
@ -1,6 +1,5 @@
|
|||
import os
|
||||
|
||||
|
||||
with open(os.path.join(os.path.dirname(__file__), "VERSION")) as f:
|
||||
version = f.read().strip()
|
||||
|
||||
|
|
|
@ -299,5 +299,6 @@ if __name__ == "__main__":
|
|||
args = parser.parse_args()
|
||||
|
||||
c = load_config(args.config_path)
|
||||
c.audio['do_trim_silence'] = False # IMPORTANT!!!!!!!!!!!!!!! disable to align mel
|
||||
c.audio["do_trim_silence"] = False # IMPORTANT!!!!!!!!!!!!!!! disable to align mel
|
||||
|
||||
main(args)
|
||||
|
|
|
@ -97,6 +97,8 @@ class BaseTTSConfig(BaseTrainingConfig):
|
|||
Audio processor config object instance.
|
||||
use_phonemes (bool):
|
||||
enable / disable phoneme use.
|
||||
use_espeak_phonemes (bool):
|
||||
enable / disable eSpeak-compatible phonemes (only if use_phonemes = `True`).
|
||||
compute_input_seq_cache (bool):
|
||||
enable / disable precomputation of the phoneme sequences. At the expense of some delay at the beginning of
|
||||
the training, It allows faster data loader time and precise limitation with `max_seq_len` and
|
||||
|
@ -136,6 +138,7 @@ class BaseTTSConfig(BaseTrainingConfig):
|
|||
audio: BaseAudioConfig = field(default_factory=BaseAudioConfig)
|
||||
# phoneme settings
|
||||
use_phonemes: bool = False
|
||||
use_espeak_phonemes: bool = True
|
||||
phoneme_language: str = None
|
||||
compute_input_seq_cache: bool = False
|
||||
text_cleaner: str = MISSING
|
||||
|
|
|
@ -25,6 +25,7 @@ def text_to_seqvec(text, CONFIG):
|
|||
CONFIG.enable_eos_bos_chars,
|
||||
tp=CONFIG.characters,
|
||||
add_blank=CONFIG.add_blank,
|
||||
use_espeak_phonemes=CONFIG.use_espeak_phonemes,
|
||||
),
|
||||
dtype=np.int32,
|
||||
)
|
||||
|
|
|
@ -1,7 +1,9 @@
|
|||
# -*- coding: utf-8 -*-
|
||||
|
||||
import re
|
||||
import unicodedata
|
||||
|
||||
import gruut
|
||||
from packaging import version
|
||||
|
||||
from TTS.tts.utils.text import cleaners
|
||||
|
@ -25,8 +27,11 @@ _CURLY_RE = re.compile(r"(.*?)\{(.+?)\}(.*)")
|
|||
# Regular expression matching punctuations, ignoring empty space
|
||||
PHONEME_PUNCTUATION_PATTERN = r"[" + _punctuations.replace(" ", "") + "]+"
|
||||
|
||||
# Table for str.translate to fix gruut/TTS phoneme mismatch
|
||||
GRUUT_TRANS_TABLE = str.maketrans("g", "ɡ")
|
||||
|
||||
def text2phone(text, language):
|
||||
|
||||
def text2phone(text, language, use_espeak_phonemes=False):
|
||||
"""Convert graphemes to phonemes.
|
||||
Parameters:
|
||||
text (str): text to phonemize
|
||||
|
@ -39,10 +44,43 @@ def text2phone(text, language):
|
|||
# TO REVIEW : How to have a good implementation for this?
|
||||
if language == "zh-CN":
|
||||
ph = chinese_text_to_phonemes(text)
|
||||
print(" > Phonemes: {}".format(ph))
|
||||
return ph
|
||||
|
||||
if language == "ja-jp":
|
||||
ph = japanese_text_to_phonemes(text)
|
||||
print(" > Phonemes: {}".format(ph))
|
||||
return ph
|
||||
|
||||
if gruut.is_language_supported(language):
|
||||
# Use gruut for phonemization
|
||||
phonemizer_args = {
|
||||
"remove_stress": True,
|
||||
"ipa_minor_breaks": False, # don't replace commas/semi-colons with IPA |
|
||||
"ipa_major_breaks": False, # don't replace periods with IPA ‖
|
||||
}
|
||||
|
||||
if use_espeak_phonemes:
|
||||
# Use a lexicon/g2p model trained on eSpeak IPA instead of gruut IPA.
|
||||
# This is intended for backwards compatibility with TTS<=v0.0.13
|
||||
# pre-trained models.
|
||||
phonemizer_args["model_prefix"] = "espeak"
|
||||
|
||||
ph_list = gruut.text_to_phonemes(
|
||||
text,
|
||||
lang=language,
|
||||
return_format="word_phonemes",
|
||||
phonemizer_args=phonemizer_args,
|
||||
)
|
||||
|
||||
# Join and re-split to break apart diphthongs, suprasegmentals, etc.
|
||||
ph_words = ["|".join(word_phonemes) for word_phonemes in ph_list]
|
||||
ph = "| ".join(ph_words)
|
||||
|
||||
# Fix a few phonemes
|
||||
ph = ph.translate(GRUUT_TRANS_TABLE)
|
||||
|
||||
print(" > Phonemes: {}".format(ph))
|
||||
return ph
|
||||
|
||||
raise ValueError(f" [!] Language {language} is not supported for phonemization.")
|
||||
|
@ -66,7 +104,9 @@ def pad_with_eos_bos(phoneme_sequence, tp=None):
|
|||
return [_phonemes_to_id[_bos]] + list(phoneme_sequence) + [_phonemes_to_id[_eos]]
|
||||
|
||||
|
||||
def phoneme_to_sequence(text, cleaner_names, language, enable_eos_bos=False, tp=None, add_blank=False):
|
||||
def phoneme_to_sequence(
|
||||
text, cleaner_names, language, enable_eos_bos=False, tp=None, add_blank=False, use_espeak_phonemes=False
|
||||
):
|
||||
# pylint: disable=global-statement
|
||||
global _phonemes_to_id, _phonemes
|
||||
if tp:
|
||||
|
@ -75,7 +115,7 @@ def phoneme_to_sequence(text, cleaner_names, language, enable_eos_bos=False, tp=
|
|||
|
||||
sequence = []
|
||||
clean_text = _clean_text(text, cleaner_names)
|
||||
to_phonemes = text2phone(clean_text, language)
|
||||
to_phonemes = text2phone(clean_text, language, use_espeak_phonemes=use_espeak_phonemes)
|
||||
if to_phonemes is None:
|
||||
print("!! After phoneme conversion the result is None. -- {} ".format(clean_text))
|
||||
# iterate by skipping empty strings - NOTE: might be useful to keep it to have a better intonation.
|
||||
|
@ -86,6 +126,7 @@ def phoneme_to_sequence(text, cleaner_names, language, enable_eos_bos=False, tp=
|
|||
sequence = pad_with_eos_bos(sequence, tp=tp)
|
||||
if add_blank:
|
||||
sequence = intersperse(sequence, len(_phonemes)) # add a blank token (new), whose id number is len(_phonemes)
|
||||
|
||||
return sequence
|
||||
|
||||
|
||||
|
|
|
@ -102,10 +102,10 @@ class ModelManager(object):
|
|||
output_model_path = os.path.join(output_path, "model_file.pth.tar")
|
||||
output_config_path = os.path.join(output_path, "config.json")
|
||||
# NOTE : band-aid for removing phoneme support
|
||||
if "needs_phonemizer" in model_item and model_item["needs_phonemizer"]:
|
||||
raise RuntimeError(
|
||||
" [!] Use 🐸TTS <= v0.0.13 for this model. Current version does not support phoneme based models."
|
||||
)
|
||||
# if "needs_phonemizer" in model_item and model_item["needs_phonemizer"]:
|
||||
# raise RuntimeError(
|
||||
# " [!] Use 🐸TTS <= v0.0.13 for this model. Current version does not support phoneme based models."
|
||||
# )
|
||||
if os.path.exists(output_path):
|
||||
print(f" > {model_name} is already downloaded.")
|
||||
else:
|
||||
|
|
|
@ -22,3 +22,5 @@ coqpit
|
|||
# japanese g2p deps
|
||||
mecab-python3==1.0.3
|
||||
unidic-lite==1.0.8
|
||||
# gruut+supported langs
|
||||
gruut[cs,de,es,fr,it,nl,pt,ru,sv]~=1.2.0
|
||||
|
|
|
@ -0,0 +1,104 @@
|
|||
"""Tests for text to phoneme converstion"""
|
||||
import unittest
|
||||
|
||||
from TTS.tts.utils.text import phoneme_to_sequence, sequence_to_phoneme, text2phone
|
||||
|
||||
# -----------------------------------------------------------------------------
|
||||
|
||||
LANG = "en-us"
|
||||
|
||||
EXAMPLE_TEXT = "Recent research at Harvard has shown meditating for as little as 8 weeks can actually increase, the grey matter in the parts of the brain responsible for emotional regulation and learning!"
|
||||
|
||||
EXPECTED_PHONEMES = "ɹ|iː|s|ə|n|t| ɹ|ᵻ|s|ɜː|tʃ| æ|t| h|ɑːɹ|v|ɚ|d| h|æ|z| ʃ|oʊ|n| m|ɛ|d|ᵻ|t|eɪ|ɾ|ɪ|ŋ| f|ɔːɹ| æ|z| l|ɪ|ɾ|əl| æ|z| eɪ|t| w|iː|k|s| k|æ|n| æ|k|tʃ|uː|əl|i| ɪ|ŋ|k|ɹ|iː|s| ,| ð|ə| ɡ|ɹ|eɪ| m|æ|ɾ|ɚ| ɪ|n| ð|ə| p|ɑːɹ|t|s| ʌ|v| ð|ə| b|ɹ|eɪ|n| ɹ|ᵻ|s|p|ɑː|n|s|ᵻ|b|əl| f|ɔːɹ| ɪ|m|oʊ|ʃ|ə|n|əl| ɹ|ɛ|ɡ|j|ʊ|l|eɪ|ʃ|ə|n| æ|n|d| l|ɜː|n|ɪ|ŋ| !"
|
||||
|
||||
# -----------------------------------------------------------------------------
|
||||
|
||||
|
||||
class TextProcessingTextCase(unittest.TestCase):
    """Round-trip checks for the en-us text -> phoneme id -> phoneme string path."""

    def test_phoneme_to_sequence(self):
        """Run the shared sentence checks with blank tokens disabled."""
        self._test_phoneme_to_sequence(add_blank=False)

    def test_phoneme_to_sequence_with_blank_token(self):
        """Run the shared sentence checks with blank tokens enabled."""
        self._test_phoneme_to_sequence(add_blank=True)

    def _assert_round_trip(self, text, expected, cleaner_names, add_blank, enable_eos_bos=False, verbose=True):
        """Encode `text`, decode it twice, and compare the result with `expected`.

        Args:
            text (str): sentence handed to `phoneme_to_sequence`.
            expected (str): phoneme string the decoded sequence must equal.
            cleaner_names (list): cleaner names forwarded to `phoneme_to_sequence`.
            add_blank (bool): forwarded to `phoneme_to_sequence`.
            enable_eos_bos (bool): forwarded to `phoneme_to_sequence`.
            verbose (bool): when True, print the decoded string and the sequence length.
        """
        ids = phoneme_to_sequence(
            text,
            cleaner_names,
            LANG,
            enable_eos_bos=enable_eos_bos,
            add_blank=add_blank,
            use_espeak_phonemes=True,
        )
        decoded = sequence_to_phoneme(ids)
        decoded_again = sequence_to_phoneme(ids)
        if verbose:
            print(decoded)
            print(len(ids))
        # Decoding the same ids twice must agree before comparing with the reference.
        self.assertEqual(decoded, decoded_again)
        self.assertEqual(decoded, expected)

    def _test_phoneme_to_sequence(self, add_blank):
        """Check the long example sentence and the short punctuation variants."""
        cleaner_names = ["phoneme_cleaners"]

        # Long reference sentence: the expected string is the reference phonemes
        # with the "|" separators removed. Nothing is printed for this case.
        self._assert_round_trip(
            EXAMPLE_TEXT,
            EXPECTED_PHONEMES.replace("|", ""),
            cleaner_names,
            add_blank,
            verbose=False,
        )

        # (input text, expected phonemes, enable_eos_bos)
        punctuation_cases = (
            # multiple punctuations
            ("Be a voice, not an! echo?", "biː ɐ vɔɪs , nɑːt ɐn ! ɛkoʊ ?", False),
            # not ending with punctuation
            ("Be a voice, not an! echo", "biː ɐ vɔɪs , nɑːt ɐn ! ɛkoʊ", False),
            # original
            ("Be a voice, not an echo!", "biː ɐ vɔɪs , nɑːt ɐn ɛkoʊ !", False),
            # extra space after the sentence
            ("Be a voice, not an! echo. ", "biː ɐ vɔɪs , nɑːt ɐn ! ɛkoʊ .", False),
            # extra space after the sentence, with BOS/EOS markers enabled
            ("Be a voice, not an! echo. ", "^biː ɐ vɔɪs , nɑːt ɐn ! ɛkoʊ .~", True),
        )
        for sentence, expected, with_eos_bos in punctuation_cases:
            self._assert_round_trip(
                sentence,
                expected,
                cleaner_names,
                add_blank,
                enable_eos_bos=with_eos_bos,
            )

    def test_text2phone(self):
        """Compare the raw `text2phone` output (with "|" separators) to the reference."""
        phonemes = text2phone(EXAMPLE_TEXT, LANG, use_espeak_phonemes=True)
        self.assertEqual(phonemes, EXPECTED_PHONEMES)
|
||||
|
||||
|
||||
# -----------------------------------------------------------------------------
|
||||
|
||||
if __name__ == "__main__":
|
||||
unittest.main()
|
|
@ -16,7 +16,8 @@ config = GlowTTSConfig(
|
|||
num_val_loader_workers=0,
|
||||
text_cleaner="english_cleaners",
|
||||
use_phonemes=True,
|
||||
phoneme_language="zh-CN",
|
||||
use_espeak_phonemes=True,
|
||||
phoneme_language="en-us",
|
||||
phoneme_cache_path="tests/data/ljspeech/phoneme_cache/",
|
||||
run_eval=True,
|
||||
test_delay_epochs=-1,
|
||||
|
|
|
@ -16,7 +16,7 @@ config = SpeedySpeechConfig(
|
|||
num_val_loader_workers=0,
|
||||
text_cleaner="english_cleaners",
|
||||
use_phonemes=True,
|
||||
phoneme_language="zh-CN",
|
||||
phoneme_language="en-us",
|
||||
phoneme_cache_path="tests/data/ljspeech/phoneme_cache/",
|
||||
run_eval=True,
|
||||
test_delay_epochs=-1,
|
||||
|
|
|
@ -20,6 +20,7 @@ config = MultibandMelganConfig(
|
|||
eval_split_size=1,
|
||||
print_step=1,
|
||||
print_eval=True,
|
||||
discriminator_model_params={"base_channels": 16, "max_channels": 128, "downsample_factors": [4, 4, 4]},
|
||||
data_path="tests/data/ljspeech",
|
||||
output_path=output_path,
|
||||
)
|
||||
|
|
Loading…
Reference in New Issue