update for phonemizer 2.1

2020-02-12 12:21:53 +01:00 · 2020-02-12 12:21:53 +01:00 · 4130674e46
parent 78464f1ead
commit 4130674e46
2 changed files with 30 additions and 17 deletions
--- a/tests/test_text_processing.py
+++ b/tests/test_text_processing.py
@@ -69,7 +69,7 @@ def test_phoneme_to_sequence():
 def test_text2phone():
    text = "Recent research at Harvard has shown meditating for as little as 8 weeks can actually increase, the grey matter in the parts of the brain responsible for emotional regulation and learning!"
-    gt = "ɹ|iː|s|ə|n|t| |ɹ|ɪ|s|ɜː|tʃ| |æ|t| |h|ɑːɹ|v|ɚ|d| |h|ɐ|z| |ʃ|oʊ|n| |m|ɛ|d|ᵻ|t|eɪ|ɾ|ɪ|ŋ| |f|ɔː|ɹ| |æ|z| |l|ɪ|ɾ|əl| |æ|z| |eɪ|t| |w|iː|k|s| |k|æ|n| |æ|k|tʃ|uː|əl|i|| |ɪ|n|k|ɹ|iː|s|,| |ð|ə| |ɡ|ɹ|eɪ| |m|æ|ɾ|ɚ|ɹ| |ɪ|n|ð|ə| |p|ɑːɹ|t|s| |ʌ|v|ð|ə| |b|ɹ|eɪ|n| |ɹ|ɪ|s|p|ɑː|n|s|ə|b|əl| |f|ɔː|ɹ| |ɪ|m|oʊ|ʃ|ə|n|əl| |ɹ|ɛ|ɡ|j|uː|l|eɪ|ʃ|ə|n||| |æ|n|d| |l|ɜː|n|ɪ|ŋ|!"
+    gt = "ɹ|iː|s|ə|n|t| |ɹ|ɪ|s|ɜː|tʃ| |æ|t| |h|ɑːɹ|v|ɚ|d| |h|ɐ|z| |ʃ|oʊ|n| |m|ɛ|d|ᵻ|t|eɪ|ɾ|ɪ|ŋ| |f|ɔː|ɹ| |æ|z| |l|ɪ|ɾ|əl| |æ|z| |eɪ|t| |w|iː|k|s| |k|æ|n| |æ|k|tʃ|uː|əl|i| |ɪ|n|k|ɹ|iː|s|,| |ð|ə| |ɡ|ɹ|eɪ| |m|æ|ɾ|ɚ|ɹ| |ɪ|n|ð|ə| |p|ɑːɹ|t|s| |ʌ|v|ð|ə| |b|ɹ|eɪ|n| |ɹ|ɪ|s|p|ɑː|n|s|ə|b|əl| |f|ɔː|ɹ| |ɪ|m|oʊ|ʃ|ə|n|əl| |ɹ|ɛ|ɡ|j|uː|l|eɪ|ʃ|ə|n| |æ|n|d| |l|ɜː|n|ɪ|ŋ|!"
    lang = "en-us"
    phonemes = text2phone(text, lang)
-    assert gt == phonemes
+    assert gt == phonemes, f"\n{phonemes} \n vs \n{gt}"
--- a/utils/text/__init__.py
+++ b/utils/text/__init__.py
@@ -28,8 +28,10 @@ def text2phone(text, language):
    seperator = phonemizer.separator.Separator(' |', '', '|')
    #try:
    punctuations = re.findall(PHONEME_PUNCTUATION_PATTERN, text)
    if float(phonemizer.__version__) < 2.1:
        ph = phonemize(text, separator=seperator, strip=False, njobs=1, backend='espeak', language=language)
        ph = ph[:-1].strip() # skip the last empty character
        # phonemizer does not tackle punctuations. Here we do.
        # Replace \n with matching punctuations.
        if punctuations:
            # if text ends with a punctuation.
@@ -43,6 +45,17 @@ def text2phone(text, language):
            else:
                for punct in punctuations:
                    ph = ph.replace('| |\n', '|'+punct+'| |', 1)
    elif float(phonemizer.__version__) == 2.1:
        ph = phonemize(text, separator=seperator, strip=False, njobs=1, backend='espeak', language=language, preserve_punctuation=True)
        # this is a simple fix for phonemizer.
        # https://github.com/bootphon/phonemizer/issues/32
        if punctuations:
            for punctuation in punctuations:
                ph = ph.replace(f"| |{punctuation} ", f"|{punctuation}| |").replace(f"| |{punctuation}", f"|{punctuation}| |")
            ph = ph[:-3]
    else:
        raise RuntimeError(" [!] Use 'phonemizer' version 2.1 or older.")
    return ph