Fix mimic 2 long sentences (#2061)

* Fix mimic 2 long sentences: fixes a bug in the second and third chunking passes, which incorrectly concatenated strings with lists, resulting in chunks of single characters. * Handle mimic2 chunking correctly - Move preprocessing from get_tts() to a method called from TTS execute(); this allows all parts to be spoken and the caching to work correctly - Remove the duplicated phonetic spelling handling in mimic2_tts
2019-03-22 16:20:06 +01:00 · 2019-03-22 16:20:06 +01:00 · 9eeb8cefc3
parent dfa714c56d
commit 9eeb8cefc3
2 changed files with 53 additions and 41 deletions
--- a/mycroft/tts/__init__.py
+++ b/mycroft/tts/__init__.py
@ -275,6 +275,20 @@ class TTS:
        # return text with supported ssml tags only
        return utterance.replace("  ", " ")

+    def _preprocess_sentence(self, sentence):
+        """ Default preprocessing is no preprocessing.
+
+        This method can be overridden to create chunks suitable to the
+        TTS engine in question; execute() synthesizes each part in turn.
+
+        Arguments:
+            sentence (str): sentence to preprocess
+
+        Returns:
+            list: sentence parts (here a single-item list with sentence)
+        """
+        return [sentence]
+
    def execute(self, sentence, ident=None):
        """
            Convert sentence to speech, preprocessing out unsupported ssml
@ -295,20 +309,23 @@ class TTS:
                    sentence = sentence.replace(word,
                                                self.spellings[word.lower()])

-        key = str(hashlib.md5(sentence.encode('utf-8', 'ignore')).hexdigest())
-        wav_file = os.path.join(mycroft.util.get_cache_directory("tts"),
-                                key + '.' + self.audio_ext)
+        chunks = self._preprocess_sentence(sentence)
+        for sentence in chunks:
+            key = str(hashlib.md5(
+                sentence.encode('utf-8', 'ignore')).hexdigest())
+            wav_file = os.path.join(mycroft.util.get_cache_directory("tts"),
+                                    key + '.' + self.audio_ext)

-        if os.path.exists(wav_file):
-            LOG.debug("TTS cache hit")
-            phonemes = self.load_phonemes(key)
-        else:
-            wav_file, phonemes = self.get_tts(sentence, wav_file)
-            if phonemes:
-                self.save_phonemes(key, phonemes)
+            if os.path.exists(wav_file):
+                LOG.debug("TTS cache hit")
+                phonemes = self.load_phonemes(key)
+            else:
+                wav_file, phonemes = self.get_tts(sentence, wav_file)
+                if phonemes:
+                    self.save_phonemes(key, phonemes)

-        vis = self.viseme(phonemes)
-        self.queue.put((self.audio_ext, wav_file, vis, ident))
+            vis = self.viseme(phonemes)
+            self.queue.put((self.audio_ext, wav_file, vis, ident))

    def viseme(self, phonemes):
        """
--- a/mycroft/tts/mimic2_tts.py
+++ b/mycroft/tts/mimic2_tts.py
@ -89,13 +89,17 @@ def _split_by_punctuation(chunks, puncs):
    e.g. hello, world => [hello, world]

    Args:
-        chunks (list): text (str) to split
+        chunks (list or str): text (str) to split
        puncs (list): list of punctuations used to split text

    Returns:
        list: list with split text
    """
-    out = chunks
+    if isinstance(chunks, str):
+        out = [chunks]
+    else:
+        out = chunks
+
    for punc in puncs:
        splits = []
        for t in out:
@ -138,7 +142,7 @@ def _sentence_chunker(text):

    # first split by punctuations that are major pauses
    first_splits = _split_by_punctuation(
-        [text],
+        text,
        puncs=[r'\.', r'\!', r'\?', r'\:', r'\;']
    )

@ -149,7 +153,7 @@ def _sentence_chunker(text):
            second_splits += _split_by_punctuation(chunk,
                                                   puncs=[r'\,', '--', '-'])
        else:
-            second_splits += chunk
+            second_splits.append(chunk)

    # if chunks are still too big, chop into pieces of at most 20 words
    third_splits = []
@ -157,7 +161,7 @@ def _sentence_chunker(text):
        if len(chunk) > _max_sentence_size:
            third_splits += _split_by_chunk_size(chunk, 20)
        else:
-            third_splits += chunk
+            third_splits.append(chunk)

    return [_add_punctuation(chunk) for chunk in third_splits]

@ -194,7 +198,7 @@ class Mimic2(TTS):
                '%s Http Error: %s for url: %s' %
                (req.status_code, req.reason, req.url))

-    def _requests(self, chunks):
+    def _requests(self, sentence):
        """create asynchronous request list

        Args:
@ -203,13 +207,9 @@ class Mimic2(TTS):
        Returns:
            list: list of FutureSession objects
        """
-        reqs = []
-        for chunk in chunks:
-            if len(chunk) > 0:
-                url = self.url + parse.quote(chunk)
-                req_route = url + "&visimes=True"
-                reqs.append(self.session.get(req_route, timeout=5))
-        return reqs
+        url = self.url + parse.quote(sentence)
+        req_route = url + "&visimes=True"
+        return self.session.get(req_route, timeout=5)

    def viseme(self, phonemes):
        """ Maps phonemes to appropriate viseme encoding
@ -234,6 +234,10 @@ class Mimic2(TTS):
            visemes.append((vis, vis_dur))
        return visemes

+    def _prepocess_sentence(sentence):
+        """ Split sentence in chunks better suited for mimic2. """
+        return _sentence_chunker(sentence)
+
    def get_tts(self, sentence, wav_file):
        """ Generate (remotely) and play mimic2 WAV audio

@ -241,23 +245,14 @@ class Mimic2(TTS):
            sentence (str): Phrase to synthesize to audio with mimic2
            wav_file (str): Location to write audio output
        """
-
-        # Use the phonetic_spelling mechanism from the TTS base class
-        if self.phonetic_spelling:
-            for word in re.findall(r"[\w']+", sentence):
-                if word.lower() in self.spellings:
-                    sentence = sentence.replace(word,
-                                                self.spellings[word.lower()])
-
-        chunks = _sentence_chunker(sentence)
-        LOG.debug("Generating Mimic2 TSS for: "+str(chunks))
+        LOG.debug("Generating Mimic2 TSS for: " + str(sentence))
        try:
-            for _, req in enumerate(self._requests(chunks)):
-                results = req.result().json()
-                audio = base64.b64decode(results['audio_base64'])
-                vis = results['visimes']
-                with open(wav_file, 'wb') as f:
-                    f.write(audio)
+            req = self._requests(sentence)
+            results = req.result().json()
+            audio = base64.b64decode(results['audio_base64'])
+            vis = results['visimes']
+            with open(wav_file, 'wb') as f:
+                f.write(audio)
        except (ReadTimeout, ConnectionError, ConnectTimeout, HTTPError):
            raise RemoteTTSTimeoutException(
                "Mimic 2 server request timed out. Falling back to mimic")