mirror of https://github.com/coqui-ai/TTS.git
text processing updates with tests
parent
e1573440ba
commit
103971c893
|
@ -12,6 +12,42 @@ def test_phoneme_to_sequence():
|
||||||
gt = "^ɹiːsənt ɹɪsɜːtʃ æt hɑːɹvɚd hɐz ʃoʊn mɛdᵻteɪɾɪŋ fɔːɹ æz lɪɾəl æz eɪt wiːks kæn æktʃuːəli ɪnkɹiːs, ðə ɡɹeɪ mæɾɚɹ ɪnðə pɑːɹts ʌvðə bɹeɪn ɹɪspɑːnsəbəl fɔːɹ ɪmoʊʃənəl ɹɛɡjuːleɪʃən ænd lɜːnɪŋ!"
|
gt = "^ɹiːsənt ɹɪsɜːtʃ æt hɑːɹvɚd hɐz ʃoʊn mɛdᵻteɪɾɪŋ fɔːɹ æz lɪɾəl æz eɪt wiːks kæn æktʃuːəli ɪnkɹiːs, ðə ɡɹeɪ mæɾɚɹ ɪnðə pɑːɹts ʌvðə bɹeɪn ɹɪspɑːnsəbəl fɔːɹ ɪmoʊʃənəl ɹɛɡjuːleɪʃən ænd lɜːnɪŋ!"
|
||||||
assert text_hat == gt
|
assert text_hat == gt
|
||||||
|
|
||||||
|
# multiple punctuations
|
||||||
|
text = "Be a voice, not an! echo?"
|
||||||
|
sequence = phoneme_to_sequence(text, text_cleaner, lang)
|
||||||
|
text_hat = sequence_to_phoneme(sequence)
|
||||||
|
gt = "^biː ɐ vɔɪs, nɑːt ɐn! ɛkoʊ?"
|
||||||
|
print(text_hat)
|
||||||
|
print(len(sequence))
|
||||||
|
assert text_hat == gt
|
||||||
|
|
||||||
|
# not ending with punctuation
|
||||||
|
text = "Be a voice, not an! echo"
|
||||||
|
sequence = phoneme_to_sequence(text, text_cleaner, lang)
|
||||||
|
text_hat = sequence_to_phoneme(sequence)
|
||||||
|
gt = "^biː ɐ vɔɪs, nɑːt ɐn! ɛkoʊ"
|
||||||
|
print(text_hat)
|
||||||
|
print(len(sequence))
|
||||||
|
assert text_hat == gt
|
||||||
|
|
||||||
|
# original
|
||||||
|
text = "Be a voice, not an echo!"
|
||||||
|
sequence = phoneme_to_sequence(text, text_cleaner, lang)
|
||||||
|
text_hat = sequence_to_phoneme(sequence)
|
||||||
|
gt = "^biː ɐ vɔɪs, nɑːt ɐn ɛkoʊ!"
|
||||||
|
print(text_hat)
|
||||||
|
print(len(sequence))
|
||||||
|
assert text_hat == gt
|
||||||
|
|
||||||
|
# extra space after the sentence
|
||||||
|
text = "Be a voice, not an! echo. "
|
||||||
|
sequence = phoneme_to_sequence(text, text_cleaner, lang)
|
||||||
|
text_hat = sequence_to_phoneme(sequence)
|
||||||
|
gt = "^biː ɐ vɔɪs, nɑːt ɐn! ɛkoʊ."
|
||||||
|
print(text_hat)
|
||||||
|
print(len(sequence))
|
||||||
|
assert text_hat == gt
|
||||||
|
|
||||||
|
|
||||||
def test_text2phone():
|
def test_text2phone():
|
||||||
text = "Recent research at Harvard has shown meditating for as little as 8 weeks can actually increase, the grey matter in the parts of the brain responsible for emotional regulation and learning!"
|
text = "Recent research at Harvard has shown meditating for as little as 8 weeks can actually increase, the grey matter in the parts of the brain responsible for emotional regulation and learning!"
|
||||||
|
|
|
@ -31,19 +31,21 @@ def text2phone(text, language):
|
||||||
ph = ph[:-1] # skip the last empty character
|
ph = ph[:-1] # skip the last empty character
|
||||||
# Replace \n with matching punctuations.
|
# Replace \n with matching punctuations.
|
||||||
if len(punctuations) > 0:
|
if len(punctuations) > 0:
|
||||||
|
# if text ends with a punctuation.
|
||||||
|
if text[-1] == punctuations[-1]:
|
||||||
for punct in punctuations[:-1]:
|
for punct in punctuations[:-1]:
|
||||||
ph = ph.replace('| |\n', '|'+punct+'| |', 1)
|
ph = ph.replace('| |\n', '|'+punct+'| |', 1)
|
||||||
try:
|
try:
|
||||||
ph = ph[:-1] + punctuations[-1]
|
ph = ph + punctuations[-1]
|
||||||
except:
|
except:
|
||||||
print(text)
|
print(text)
|
||||||
|
else:
|
||||||
|
for punct in punctuations:
|
||||||
|
ph = ph.replace('| |\n', '|'+punct+'| |', 1)
|
||||||
return ph
|
return ph
|
||||||
|
|
||||||
|
|
||||||
def phoneme_to_sequence(text, cleaner_names, language):
|
def phoneme_to_sequence(text, cleaner_names, language):
|
||||||
'''
|
|
||||||
TODO: This ignores punctuations
|
|
||||||
'''
|
|
||||||
sequence = [_phonemes_to_id['^']]
|
sequence = [_phonemes_to_id['^']]
|
||||||
clean_text = _clean_text(text, cleaner_names)
|
clean_text = _clean_text(text, cleaner_names)
|
||||||
phonemes = text2phone(clean_text, language)
|
phonemes = text2phone(clean_text, language)
|
||||||
|
@ -80,7 +82,7 @@ def text_to_sequence(text, cleaner_names):
|
||||||
Returns:
|
Returns:
|
||||||
List of integers corresponding to the symbols in the text
|
List of integers corresponding to the symbols in the text
|
||||||
'''
|
'''
|
||||||
sequence = []
|
# sequence = []
|
||||||
sequence = [_phonemes_to_id['^']]
|
sequence = [_phonemes_to_id['^']]
|
||||||
# Check for curly braces and treat their contents as ARPAbet:
|
# Check for curly braces and treat their contents as ARPAbet:
|
||||||
while len(text):
|
while len(text):
|
||||||
|
|
|
@ -56,7 +56,7 @@ def lowercase(text):
|
||||||
|
|
||||||
|
|
||||||
def collapse_whitespace(text):
|
def collapse_whitespace(text):
|
||||||
return re.sub(_whitespace_re, ' ', text)
|
return re.sub(_whitespace_re, ' ', text).strip()
|
||||||
|
|
||||||
|
|
||||||
def convert_to_ascii(text):
|
def convert_to_ascii(text):
|
||||||
|
|
Loading…
Reference in New Issue