Move listen trigger to last chunk of sentence

If rendering a chunk of a sentence takes too long, the audio queue may run empty and trigger listening prematurely. This moves the listen trigger to after the last chunk of the sentence.
2019-10-04 08:00:06 +02:00 · 2019-10-04 08:00:06 +02:00 · 29db163a78
parent 15233f8929
commit 29db163a78
2 changed files with 25 additions and 22 deletions
--- a/mycroft/audio/speech.py
+++ b/mycroft/audio/speech.py
@ -35,11 +35,6 @@ mimic_fallback_obj = None
 _last_stop_signal = 0


-def _start_listener(_):
-    """Force Mycroft to start listening (as if 'Hey Mycroft' was spoken)."""
-    bus.emit(Message('mycroft.mic.listen'))
-
-
 def handle_speak(event):
    """Handle "speak" message

@ -60,11 +55,7 @@ def handle_speak(event):
        stopwatch = Stopwatch()
        stopwatch.start()
        utterance = event.data['utterance']
-        if event.data.get('expect_response', False):
-            # When expect_response is requested, the listener will be restarted
-            # at the end of the next bit of spoken audio.
-            bus.once('recognizer_loop:audio_output_end', _start_listener)
-
+        listen = event.data.get('expect_response', False)
        # This is a bit of a hack for Picroft.  The analog audio on a Pi blocks
        # for 30 seconds fairly often, so we don't want to break on periods
        # (decreasing the chance of encountering the block).  But we will
@ -82,7 +73,10 @@ def handle_speak(event):
            utterance = re.sub(r'\b([A-za-z][\.])(\s+)', r'\g<1>', utterance)
            chunks = re.split(r'(?<!\w\.\w.)(?<![A-Z][a-z]\.)(?<=\.|\;|\?)\s',
                              utterance)
-            for chunk in chunks:
+            # Apply the listen flag to the last chunk, set the rest to False
+            chunks = [(chunks[i], listen if i == len(chunks) - 1 else False)
+                      for i in range(len(chunks))]
+            for chunk, listen in chunks:
                # Check if somthing has aborted the speech
                if (_last_stop_signal > start or
                        check_for_signal('buttonPress')):
@ -90,7 +84,7 @@ def handle_speak(event):
                    tts.playback.clear()
                    break
                try:
-                    mute_and_speak(chunk, ident)
+                    mute_and_speak(chunk, ident, listen)
                except KeyboardInterrupt:
                    raise
                except Exception:
@ -103,7 +97,7 @@ def handle_speak(event):
                                               'tts': tts.__class__.__name__})


-def mute_and_speak(utterance, ident):
+def mute_and_speak(utterance, ident, listen=False):
    """Mute mic and start speaking the utterance using selected tts backend.

    Arguments:
@ -125,7 +119,7 @@ def mute_and_speak(utterance, ident):

    LOG.info("Speak: " + utterance)
    try:
-        tts.execute(utterance, ident)
+        tts.execute(utterance, ident, listen)
    except RemoteTTSTimeoutException as e:
        LOG.error(e)
        mimic_fallback_tts(utterance, ident)
--- a/mycroft/tts/__init__.py
+++ b/mycroft/tts/__init__.py
@ -19,7 +19,7 @@ import random
 import re
 from abc import ABCMeta, abstractmethod
 from threading import Thread
-from time import time
+from time import time, sleep

 import os.path
 from os.path import dirname, exists, isdir, join
@ -83,7 +83,8 @@ class PlaybackThread(Thread):
        """Thread main loop. get audio and viseme data from queue and play."""
        while not self._terminated:
            try:
-                snd_type, data, visemes, ident = self.queue.get(timeout=2)
+                snd_type, data, visemes, ident, listen = \
+                    self.queue.get(timeout=2)
                self.blink(0.5)
                if not self._processing_queue:
                    self._processing_queue = True
@ -111,7 +112,7 @@ class PlaybackThread(Thread):
            except Exception as e:
                LOG.exception(e)
                if self._processing_queue:
-                    self.tts.end_audio()
+                    self.tts.end_audio(listen)
                    self._processing_queue = False

    def show_visemes(self, pairs):
@ -196,7 +197,7 @@ class TTS(metaclass=ABCMeta):
        # Create signals informing start of speech
        self.bus.emit(Message("recognizer_loop:audio_output_start"))

-    def end_audio(self):
+    def end_audio(self, listen):
        """Helper function for child classes to call in execute().

        Sends the recognizer_loop:audio_output_end message, indicating
@ -205,6 +206,8 @@ class TTS(metaclass=ABCMeta):
        """

        self.bus.emit(Message("recognizer_loop:audio_output_end"))
+        if listen:
+            self.bus.emit(Message('mycroft.mic.listen'))
        # Clean the cache as needed
        cache_dir = mycroft.util.get_cache_directory("tts/" + self.tts_name)
        mycroft.util.curate_cache(cache_dir, min_free_percent=100)
@ -287,15 +290,17 @@ class TTS(metaclass=ABCMeta):
        """
        return [sentence]

-    def execute(self, sentence, ident=None):
+    def execute(self, sentence, ident=None, listen=False):
        """Convert sentence to speech, preprocessing out unsupported ssml

            The method caches results if possible using the hash of the
            sentence.

-            Args:
+            Arguments:
                sentence:   Sentence to be spoken
                ident:      Id reference to current interaction
+                listen:     True if listen should be triggered at the end
+                            of the utterance.
        """
        sentence = self.validate_ssml(sentence)

@ -307,7 +312,11 @@ class TTS(metaclass=ABCMeta):
                                                self.spellings[word.lower()])

        chunks = self._preprocess_sentence(sentence)
-        for sentence in chunks:
+        # Apply the listen flag to the last chunk, set the rest to False
+        chunks = [(chunks[i], listen if i == len(chunks) - 1 else False)
+                  for i in range(len(chunks))]
+
+        for sentence, l in chunks:
            key = str(hashlib.md5(
                sentence.encode('utf-8', 'ignore')).hexdigest())
            wav_file = os.path.join(
@ -323,7 +332,7 @@ class TTS(metaclass=ABCMeta):
                    self.save_phonemes(key, phonemes)

            vis = self.viseme(phonemes) if phonemes else None
-            self.queue.put((self.audio_ext, wav_file, vis, ident))
+            self.queue.put((self.audio_ext, wav_file, vis, ident, l))

    def viseme(self, phonemes):
        """Create visemes from phonemes. Needs to be implemented for all