Merge branch 'dev' into dev

2021-06-26 15:32:19 -03:00 · 2021-06-26 15:32:19 -03:00 · eb84bb2bc8
parent 99d40e98d9 6c7bbcaef0
commit eb84bb2bc8
12 changed files with 165 additions and 12 deletions
--- a/TTS/VERSION
+++ b/TTS/VERSION
@ -1 +1 @@
-0.0.15
+0.0.16
--- a/TTS/__init__.py
+++ b/TTS/__init__.py
@ -1,6 +1,5 @@
 import os

-
 with open(os.path.join(os.path.dirname(__file__), "VERSION")) as f:
    version = f.read().strip()

--- a/TTS/bin/extract_tts_spectrograms.py
+++ b/TTS/bin/extract_tts_spectrograms.py
@ -299,5 +299,6 @@ if __name__ == "__main__":
    args = parser.parse_args()

    c = load_config(args.config_path)
-    c.audio['do_trim_silence'] = False  # IMPORTANT!!!!!!!!!!!!!!! disable to align mel
+    c.audio["do_trim_silence"] = False  # IMPORTANT!!!!!!!!!!!!!!! disable to align mel
+
    main(args)
--- a/TTS/tts/configs/shared_configs.py
+++ b/TTS/tts/configs/shared_configs.py
@ -97,6 +97,8 @@ class BaseTTSConfig(BaseTrainingConfig):
            Audio processor config object instance.
        use_phonemes (bool):
            enable / disable phoneme use.
+        use_espeak_phonemes (bool):
+            enable / disable eSpeak-compatible phonemes (only if use_phonemes = `True`).
        compute_input_seq_cache (bool):
            enable / disable precomputation of the phoneme sequences. At the expense of some delay at the beginning of
            the training, It allows faster data loader time and precise limitation with `max_seq_len` and
@ -136,6 +138,7 @@ class BaseTTSConfig(BaseTrainingConfig):
    audio: BaseAudioConfig = field(default_factory=BaseAudioConfig)
    # phoneme settings
    use_phonemes: bool = False
+    use_espeak_phonemes: bool = True
    phoneme_language: str = None
    compute_input_seq_cache: bool = False
    text_cleaner: str = MISSING
--- a/TTS/tts/utils/synthesis.py
+++ b/TTS/tts/utils/synthesis.py
@ -25,6 +25,7 @@ def text_to_seqvec(text, CONFIG):
                CONFIG.enable_eos_bos_chars,
                tp=CONFIG.characters,
                add_blank=CONFIG.add_blank,
+                use_espeak_phonemes=CONFIG.use_espeak_phonemes,
            ),
            dtype=np.int32,
        )
--- a/TTS/tts/utils/text/__init__.py
+++ b/TTS/tts/utils/text/__init__.py
@ -1,7 +1,9 @@
 # -*- coding: utf-8 -*-

 import re
+import unicodedata

+import gruut
 from packaging import version

 from TTS.tts.utils.text import cleaners
@ -25,8 +27,11 @@ _CURLY_RE = re.compile(r"(.*?)\{(.+?)\}(.*)")
 # Regular expression matching punctuations, ignoring empty space
 PHONEME_PUNCTUATION_PATTERN = r"[" + _punctuations.replace(" ", "") + "]+"

+# Table for str.translate to fix gruut/TTS phoneme mismatch
+GRUUT_TRANS_TABLE = str.maketrans("g", "ɡ")

-def text2phone(text, language):
+
+def text2phone(text, language, use_espeak_phonemes=False):
    """Convert graphemes to phonemes.
    Parameters:
            text (str): text to phonemize
@ -39,10 +44,43 @@ def text2phone(text, language):
    # TO REVIEW : How to have a good implementation for this?
    if language == "zh-CN":
        ph = chinese_text_to_phonemes(text)
+        print(" > Phonemes: {}".format(ph))
        return ph

    if language == "ja-jp":
        ph = japanese_text_to_phonemes(text)
+        print(" > Phonemes: {}".format(ph))
+        return ph
+
+    if gruut.is_language_supported(language):
+        # Use gruut for phonemization
+        phonemizer_args = {
+            "remove_stress": True,
+            "ipa_minor_breaks": False,  # don't replace commas/semi-colons with IPA |
+            "ipa_major_breaks": False,  # don't replace periods with IPA ‖
+        }
+
+        if use_espeak_phonemes:
+            # Use a lexicon/g2p model trained on eSpeak IPA instead of gruut IPA.
+            # This is intended for backwards compatibility with TTS<=v0.0.13
+            # pre-trained models.
+            phonemizer_args["model_prefix"] = "espeak"
+
+        ph_list = gruut.text_to_phonemes(
+            text,
+            lang=language,
+            return_format="word_phonemes",
+            phonemizer_args=phonemizer_args,
+        )
+
+        # Join and re-split to break apart diphthongs, suprasegmentals, etc.
+        ph_words = ["|".join(word_phonemes) for word_phonemes in ph_list]
+        ph = "| ".join(ph_words)
+
+        # Fix a few phonemes
+        ph = ph.translate(GRUUT_TRANS_TABLE)
+
+        print(" > Phonemes: {}".format(ph))
        return ph

    raise ValueError(f" [!] Language {language} is not supported for phonemization.")
@ -66,7 +104,9 @@ def pad_with_eos_bos(phoneme_sequence, tp=None):
    return [_phonemes_to_id[_bos]] + list(phoneme_sequence) + [_phonemes_to_id[_eos]]


-def phoneme_to_sequence(text, cleaner_names, language, enable_eos_bos=False, tp=None, add_blank=False):
+def phoneme_to_sequence(
+    text, cleaner_names, language, enable_eos_bos=False, tp=None, add_blank=False, use_espeak_phonemes=False
+):
    # pylint: disable=global-statement
    global _phonemes_to_id, _phonemes
    if tp:
@ -75,7 +115,7 @@ def phoneme_to_sequence(text, cleaner_names, language, enable_eos_bos=False, tp=

    sequence = []
    clean_text = _clean_text(text, cleaner_names)
-    to_phonemes = text2phone(clean_text, language)
+    to_phonemes = text2phone(clean_text, language, use_espeak_phonemes=use_espeak_phonemes)
    if to_phonemes is None:
        print("!! After phoneme conversion the result is None. -- {} ".format(clean_text))
    # iterate by skipping empty strings - NOTE: might be useful to keep it to have a better intonation.
@ -86,6 +126,7 @@ def phoneme_to_sequence(text, cleaner_names, language, enable_eos_bos=False, tp=
        sequence = pad_with_eos_bos(sequence, tp=tp)
    if add_blank:
        sequence = intersperse(sequence, len(_phonemes))  # add a blank token (new), whose id number is len(_phonemes)
+
    return sequence


--- a/TTS/utils/manage.py
+++ b/TTS/utils/manage.py
@ -102,10 +102,10 @@ class ModelManager(object):
        output_model_path = os.path.join(output_path, "model_file.pth.tar")
        output_config_path = os.path.join(output_path, "config.json")
        # NOTE : band-aid for removing phoneme support
-        if "needs_phonemizer" in model_item and model_item["needs_phonemizer"]:
-            raise RuntimeError(
-                " [!] Use 🐸TTS <= v0.0.13 for this model. Current version does not support phoneme based models."
-            )
+        # if "needs_phonemizer" in model_item and model_item["needs_phonemizer"]:
+        #     raise RuntimeError(
+        #         " [!] Use 🐸TTS <= v0.0.13 for this model. Current version does not support phoneme based models."
+        #     )
        if os.path.exists(output_path):
            print(f" > {model_name} is already downloaded.")
        else:
--- a/requirements.txt
+++ b/requirements.txt
@ -22,3 +22,5 @@ coqpit
 # japanese g2p deps
 mecab-python3==1.0.3
 unidic-lite==1.0.8
+# gruut+supported langs
+gruut[cs,de,es,fr,it,nl,pt,ru,sv]~=1.2.0
--- a/tests/test_text_processing.py
+++ b/tests/test_text_processing.py
@ -0,0 +1,104 @@
+"""Tests for text to phoneme conversion"""
+import unittest
+
+from TTS.tts.utils.text import phoneme_to_sequence, sequence_to_phoneme, text2phone
+
+# -----------------------------------------------------------------------------
+
+LANG = "en-us"
+
+EXAMPLE_TEXT = "Recent research at Harvard has shown meditating for as little as 8 weeks can actually increase, the grey matter in the parts of the brain responsible for emotional regulation and learning!"
+
+EXPECTED_PHONEMES = "ɹ|iː|s|ə|n|t| ɹ|ᵻ|s|ɜː|tʃ| æ|t| h|ɑːɹ|v|ɚ|d| h|æ|z| ʃ|oʊ|n| m|ɛ|d|ᵻ|t|eɪ|ɾ|ɪ|ŋ| f|ɔːɹ| æ|z| l|ɪ|ɾ|əl| æ|z| eɪ|t| w|iː|k|s| k|æ|n| æ|k|tʃ|uː|əl|i| ɪ|ŋ|k|ɹ|iː|s| ,| ð|ə| ɡ|ɹ|eɪ| m|æ|ɾ|ɚ| ɪ|n| ð|ə| p|ɑːɹ|t|s| ʌ|v| ð|ə| b|ɹ|eɪ|n| ɹ|ᵻ|s|p|ɑː|n|s|ᵻ|b|əl| f|ɔːɹ| ɪ|m|oʊ|ʃ|ə|n|əl| ɹ|ɛ|ɡ|j|ʊ|l|eɪ|ʃ|ə|n| æ|n|d| l|ɜː|n|ɪ|ŋ| !"
+
+# -----------------------------------------------------------------------------
+
+
+class TextProcessingTextCase(unittest.TestCase):
+    """Tests for text to phoneme conversion"""
+
+    def test_phoneme_to_sequence(self):
+        """Verify en-us sentence phonemes without blank token"""
+        self._test_phoneme_to_sequence(add_blank=False)
+
+    def test_phoneme_to_sequence_with_blank_token(self):
+        """Verify en-us sentence phonemes with blank token"""
+        self._test_phoneme_to_sequence(add_blank=True)
+
+    def _test_phoneme_to_sequence(self, add_blank):
+        """Verify en-us sentence phonemes"""
+        text_cleaner = ["phoneme_cleaners"]
+        sequence = phoneme_to_sequence(EXAMPLE_TEXT, text_cleaner, LANG, add_blank=add_blank, use_espeak_phonemes=True)
+        text_hat = sequence_to_phoneme(sequence)
+        text_hat_with_params = sequence_to_phoneme(sequence)
+        gt = EXPECTED_PHONEMES.replace("|", "")
+        self.assertEqual(text_hat, text_hat_with_params)
+        self.assertEqual(text_hat, gt)
+
+        # multiple punctuations
+        text = "Be a voice, not an! echo?"
+        sequence = phoneme_to_sequence(text, text_cleaner, LANG, add_blank=add_blank, use_espeak_phonemes=True)
+        text_hat = sequence_to_phoneme(sequence)
+        text_hat_with_params = sequence_to_phoneme(sequence)
+        gt = "biː ɐ vɔɪs , nɑːt ɐn ! ɛkoʊ ?"
+        print(text_hat)
+        print(len(sequence))
+        self.assertEqual(text_hat, text_hat_with_params)
+        self.assertEqual(text_hat, gt)
+
+        # not ending with punctuation
+        text = "Be a voice, not an! echo"
+        sequence = phoneme_to_sequence(text, text_cleaner, LANG, add_blank=add_blank, use_espeak_phonemes=True)
+        text_hat = sequence_to_phoneme(sequence)
+        text_hat_with_params = sequence_to_phoneme(sequence)
+        gt = "biː ɐ vɔɪs , nɑːt ɐn ! ɛkoʊ"
+        print(text_hat)
+        print(len(sequence))
+        self.assertEqual(text_hat, text_hat_with_params)
+        self.assertEqual(text_hat, gt)
+
+        # original
+        text = "Be a voice, not an echo!"
+        sequence = phoneme_to_sequence(text, text_cleaner, LANG, add_blank=add_blank, use_espeak_phonemes=True)
+        text_hat = sequence_to_phoneme(sequence)
+        text_hat_with_params = sequence_to_phoneme(sequence)
+        gt = "biː ɐ vɔɪs , nɑːt ɐn ɛkoʊ !"
+        print(text_hat)
+        print(len(sequence))
+        self.assertEqual(text_hat, text_hat_with_params)
+        self.assertEqual(text_hat, gt)
+
+        # extra space after the sentence
+        text = "Be a voice, not an! echo.  "
+        sequence = phoneme_to_sequence(text, text_cleaner, LANG, add_blank=add_blank, use_espeak_phonemes=True)
+        text_hat = sequence_to_phoneme(sequence)
+        text_hat_with_params = sequence_to_phoneme(sequence)
+        gt = "biː ɐ vɔɪs , nɑːt ɐn ! ɛkoʊ ."
+        print(text_hat)
+        print(len(sequence))
+        self.assertEqual(text_hat, text_hat_with_params)
+        self.assertEqual(text_hat, gt)
+
+        # extra space after the sentence, with BOS/EOS characters enabled
+        text = "Be a voice, not an! echo.  "
+        sequence = phoneme_to_sequence(
+            text, text_cleaner, LANG, enable_eos_bos=True, add_blank=add_blank, use_espeak_phonemes=True
+        )
+        text_hat = sequence_to_phoneme(sequence)
+        text_hat_with_params = sequence_to_phoneme(sequence)
+        gt = "^biː ɐ vɔɪs , nɑːt ɐn ! ɛkoʊ .~"
+        print(text_hat)
+        print(len(sequence))
+        self.assertEqual(text_hat, text_hat_with_params)
+        self.assertEqual(text_hat, gt)
+
+    def test_text2phone(self):
+        """Verify phones directly (with |)"""
+        ph = text2phone(EXAMPLE_TEXT, LANG, use_espeak_phonemes=True)
+        self.assertEqual(ph, EXPECTED_PHONEMES)
+
+
+# -----------------------------------------------------------------------------
+
+if __name__ == "__main__":
+    unittest.main()
--- a/tests/tts_tests/test_glow_tts_train.py
+++ b/tests/tts_tests/test_glow_tts_train.py
@ -16,7 +16,8 @@ config = GlowTTSConfig(
    num_val_loader_workers=0,
    text_cleaner="english_cleaners",
    use_phonemes=True,
-    phoneme_language="zh-CN",
+    use_espeak_phonemes=True,
+    phoneme_language="en-us",
    phoneme_cache_path="tests/data/ljspeech/phoneme_cache/",
    run_eval=True,
    test_delay_epochs=-1,
--- a/tests/tts_tests/test_speedy_speech_train.py
+++ b/tests/tts_tests/test_speedy_speech_train.py
@ -16,7 +16,7 @@ config = SpeedySpeechConfig(
    num_val_loader_workers=0,
    text_cleaner="english_cleaners",
    use_phonemes=True,
-    phoneme_language="zh-CN",
+    phoneme_language="en-us",
    phoneme_cache_path="tests/data/ljspeech/phoneme_cache/",
    run_eval=True,
    test_delay_epochs=-1,
--- a/tests/vocoder_tests/test_multiband_melgan_train.py
+++ b/tests/vocoder_tests/test_multiband_melgan_train.py
@ -20,6 +20,7 @@ config = MultibandMelganConfig(
    eval_split_size=1,
    print_step=1,
    print_eval=True,
+    discriminator_model_params={"base_channels": 16, "max_channels": 128, "downsample_factors": [4, 4, 4]},
    data_path="tests/data/ljspeech",
    output_path=output_path,
 )
@ -1 +1 @@
-0.0.15
+0.0.16