Fix mimic 2 long sentences (#2061)

* Fix mimic 2 long sentences

Fixes a bug where the second and third chunking passes incorrectly
concatenated strings with lists, resulting in chunks of single
characters.

* Handle mimic2 chunking correctly

- Move preprocessing from get_tts() to a method called from TTS.execute();
    this allows all parts to be spoken and the caching to work correctly
- Remove duplicate of phonetic spelling in mimic2_tts
pull/2063/head
Åke 2019-03-22 16:20:06 +01:00 committed by Ruthvicp
parent dfa714c56d
commit 9eeb8cefc3
2 changed files with 53 additions and 41 deletions

View File

@ -275,6 +275,20 @@ class TTS:
# return text with supported ssml tags only # return text with supported ssml tags only
return utterance.replace(" ", " ") return utterance.replace(" ", " ")
def _preprocess_sentence(self, sentence):
""" Default preprocessing is no preprocessing.
This method can be overridden to create chunks suitable to the
TTS engine in question.
Arguments:
sentence (str): sentence to preprocess
Returns:
list: list of sentence parts
"""
return [sentence]
def execute(self, sentence, ident=None): def execute(self, sentence, ident=None):
""" """
Convert sentence to speech, preprocessing out unsupported ssml Convert sentence to speech, preprocessing out unsupported ssml
@ -295,20 +309,23 @@ class TTS:
sentence = sentence.replace(word, sentence = sentence.replace(word,
self.spellings[word.lower()]) self.spellings[word.lower()])
key = str(hashlib.md5(sentence.encode('utf-8', 'ignore')).hexdigest()) chunks = self._preprocess_sentence(sentence)
wav_file = os.path.join(mycroft.util.get_cache_directory("tts"), for sentence in chunks:
key + '.' + self.audio_ext) key = str(hashlib.md5(
sentence.encode('utf-8', 'ignore')).hexdigest())
wav_file = os.path.join(mycroft.util.get_cache_directory("tts"),
key + '.' + self.audio_ext)
if os.path.exists(wav_file): if os.path.exists(wav_file):
LOG.debug("TTS cache hit") LOG.debug("TTS cache hit")
phonemes = self.load_phonemes(key) phonemes = self.load_phonemes(key)
else: else:
wav_file, phonemes = self.get_tts(sentence, wav_file) wav_file, phonemes = self.get_tts(sentence, wav_file)
if phonemes: if phonemes:
self.save_phonemes(key, phonemes) self.save_phonemes(key, phonemes)
vis = self.viseme(phonemes) vis = self.viseme(phonemes)
self.queue.put((self.audio_ext, wav_file, vis, ident)) self.queue.put((self.audio_ext, wav_file, vis, ident))
def viseme(self, phonemes): def viseme(self, phonemes):
""" """

View File

@ -89,13 +89,17 @@ def _split_by_punctuation(chunks, puncs):
e.g. hello, world => [hello, world] e.g. hello, world => [hello, world]
Args: Args:
chunks (list): text (str) to split chunks (list or str): text (str) to split
puncs (list): list of punctuations used to split text puncs (list): list of punctuations used to split text
Returns: Returns:
list: list with split text list: list with split text
""" """
out = chunks if isinstance(chunks, str):
out = [chunks]
else:
out = chunks
for punc in puncs: for punc in puncs:
splits = [] splits = []
for t in out: for t in out:
@ -138,7 +142,7 @@ def _sentence_chunker(text):
# first split by punctuations that are major pauses # first split by punctuations that are major pauses
first_splits = _split_by_punctuation( first_splits = _split_by_punctuation(
[text], text,
puncs=[r'\.', r'\!', r'\?', r'\:', r'\;'] puncs=[r'\.', r'\!', r'\?', r'\:', r'\;']
) )
@ -149,7 +153,7 @@ def _sentence_chunker(text):
second_splits += _split_by_punctuation(chunk, second_splits += _split_by_punctuation(chunk,
puncs=[r'\,', '--', '-']) puncs=[r'\,', '--', '-'])
else: else:
second_splits += chunk second_splits.append(chunk)
# if chunks are still too big, chop into pieces of at most 20 words # if chunks are still too big, chop into pieces of at most 20 words
third_splits = [] third_splits = []
@ -157,7 +161,7 @@ def _sentence_chunker(text):
if len(chunk) > _max_sentence_size: if len(chunk) > _max_sentence_size:
third_splits += _split_by_chunk_size(chunk, 20) third_splits += _split_by_chunk_size(chunk, 20)
else: else:
third_splits += chunk third_splits.append(chunk)
return [_add_punctuation(chunk) for chunk in third_splits] return [_add_punctuation(chunk) for chunk in third_splits]
@ -194,7 +198,7 @@ class Mimic2(TTS):
'%s Http Error: %s for url: %s' % '%s Http Error: %s for url: %s' %
(req.status_code, req.reason, req.url)) (req.status_code, req.reason, req.url))
def _requests(self, chunks): def _requests(self, sentence):
"""create asynchronous request list """create asynchronous request list
Args: Args:
@ -203,13 +207,9 @@ class Mimic2(TTS):
Returns: Returns:
list: list of FutureSession objects list: list of FutureSession objects
""" """
reqs = [] url = self.url + parse.quote(sentence)
for chunk in chunks: req_route = url + "&visimes=True"
if len(chunk) > 0: return self.session.get(req_route, timeout=5)
url = self.url + parse.quote(chunk)
req_route = url + "&visimes=True"
reqs.append(self.session.get(req_route, timeout=5))
return reqs
def viseme(self, phonemes): def viseme(self, phonemes):
""" Maps phonemes to appropriate viseme encoding """ Maps phonemes to appropriate viseme encoding
@ -234,6 +234,10 @@ class Mimic2(TTS):
visemes.append((vis, vis_dur)) visemes.append((vis, vis_dur))
return visemes return visemes
def _prepocess_sentence(sentence):
""" Split sentence in chunks better suited for mimic2. """
return _sentence_chunker(sentence)
def get_tts(self, sentence, wav_file): def get_tts(self, sentence, wav_file):
""" Generate (remotely) and play mimic2 WAV audio """ Generate (remotely) and play mimic2 WAV audio
@ -241,23 +245,14 @@ class Mimic2(TTS):
sentence (str): Phrase to synthesize to audio with mimic2 sentence (str): Phrase to synthesize to audio with mimic2
wav_file (str): Location to write audio output wav_file (str): Location to write audio output
""" """
LOG.debug("Generating Mimic2 TSS for: " + str(sentence))
# Use the phonetic_spelling mechanism from the TTS base class
if self.phonetic_spelling:
for word in re.findall(r"[\w']+", sentence):
if word.lower() in self.spellings:
sentence = sentence.replace(word,
self.spellings[word.lower()])
chunks = _sentence_chunker(sentence)
LOG.debug("Generating Mimic2 TSS for: "+str(chunks))
try: try:
for _, req in enumerate(self._requests(chunks)): req = self._requests(sentence)
results = req.result().json() results = req.result().json()
audio = base64.b64decode(results['audio_base64']) audio = base64.b64decode(results['audio_base64'])
vis = results['visimes'] vis = results['visimes']
with open(wav_file, 'wb') as f: with open(wav_file, 'wb') as f:
f.write(audio) f.write(audio)
except (ReadTimeout, ConnectionError, ConnectTimeout, HTTPError): except (ReadTimeout, ConnectionError, ConnectTimeout, HTTPError):
raise RemoteTTSTimeoutException( raise RemoteTTSTimeoutException(
"Mimic 2 server request timed out. Falling back to mimic") "Mimic 2 server request timed out. Falling back to mimic")