text processing updates with tests

2019-03-29 15:42:46 +01:00 · 2019-03-29 15:42:46 +01:00 · 103971c893
parent e1573440ba
commit 103971c893
3 changed files with 50 additions and 12 deletions
--- a/tests/text_processing_tests.py
+++ b/tests/text_processing_tests.py
@@ -12,6 +12,42 @@ def test_phoneme_to_sequence():
    gt = "^ɹiːsənt ɹɪsɜːtʃ æt hɑːɹvɚd hɐz ʃoʊn mɛdᵻteɪɾɪŋ fɔːɹ æz lɪɾəl æz eɪt wiːks kæn æktʃuːəli ɪnkɹiːs, ðə ɡɹeɪ mæɾɚɹ ɪnðə pɑːɹts ʌvðə bɹeɪn ɹɪspɑːnsəbəl fɔːɹ ɪmoʊʃənəl ɹɛɡjuːleɪʃən ænd lɜːnɪŋ!"
    assert text_hat == gt
    # multiple punctuations
    text = "Be a voice, not an! echo?"
    sequence = phoneme_to_sequence(text, text_cleaner, lang)
    text_hat = sequence_to_phoneme(sequence)
    gt = "^biː ɐ vɔɪs, nɑːt ɐn! ɛkoʊ?"
    print(text_hat)
    print(len(sequence))
    assert text_hat == gt
    # not ending with punctuation
    text = "Be a voice, not an! echo"
    sequence = phoneme_to_sequence(text, text_cleaner, lang)
    text_hat = sequence_to_phoneme(sequence)
    gt = "^biː ɐ vɔɪs, nɑːt ɐn! ɛkoʊ"
    print(text_hat)
    print(len(sequence))
    assert text_hat == gt
    # original
    text = "Be a voice, not an echo!"
    sequence = phoneme_to_sequence(text, text_cleaner, lang)
    text_hat = sequence_to_phoneme(sequence)
    gt = "^biː ɐ vɔɪs, nɑːt ɐn ɛkoʊ!"
    print(text_hat)
    print(len(sequence))
    assert text_hat == gt
    # extra space after the sentence
    text = "Be a voice, not an! echo.  "
    sequence = phoneme_to_sequence(text, text_cleaner, lang)
    text_hat = sequence_to_phoneme(sequence)
    gt = "^biː ɐ vɔɪs, nɑːt ɐn! ɛkoʊ."
    print(text_hat)
    print(len(sequence))
    assert text_hat == gt
 def test_text2phone():
    text = "Recent research at Harvard has shown meditating for as little as 8 weeks can actually increase, the grey matter in the parts of the brain responsible for emotional regulation and learning!"
--- a/utils/text/__init__.py
+++ b/utils/text/__init__.py
@@ -31,19 +31,21 @@ def text2phone(text, language):
    ph = ph[:-1] # skip the last empty character
    # Replace \n with matching punctuations.
    if len(punctuations) > 0:
        # if text ends with a punctuation.
        if text[-1] == punctuations[-1]:
            for punct in punctuations[:-1]:
                ph = ph.replace('| |\n', '|'+punct+'| |', 1)
            try:
-             ph = ph[:-1] + punctuations[-1]
+                ph = ph + punctuations[-1]
            except:
                print(text)
        else:
            for punct in punctuations:
                ph = ph.replace('| |\n', '|'+punct+'| |', 1)
    return ph
 def phoneme_to_sequence(text, cleaner_names, language):
    '''
    TODO: This ignores punctuations
    '''
    sequence = [_phonemes_to_id['^']]
    clean_text = _clean_text(text, cleaner_names)
    phonemes = text2phone(clean_text, language)
@@ -80,7 +82,7 @@ def text_to_sequence(text, cleaner_names):
      Returns:
        List of integers corresponding to the symbols in the text
    '''
-    sequence = []
+    # sequence = []
    sequence = [_phonemes_to_id['^']]
    # Check for curly braces and treat their contents as ARPAbet:
    while len(text):
--- a/utils/text/cleaners.py
+++ b/utils/text/cleaners.py
@@ -56,7 +56,7 @@ def lowercase(text):
 def collapse_whitespace(text):
-    return re.sub(_whitespace_re, ' ', text)
+    return re.sub(_whitespace_re, ' ', text).strip()
 def convert_to_ascii(text):