TTS playback queue singleton (#3055)
* Fix issues when the remote TTS raises an exception
* Remove explicit clear_cache from MimicTTS
* Updates for using a singleton TTS playback thread
  - cache curation is called on all TTS's registered as using the thread
  - begin audio and end audio are handled by the playback thread
  - further changes from self.playback to TTS.playback for consistency
* Remove redundant try/except
* Consolidate general and TTS-specific sentence splitting

  This performs all sentence splitting at the same stage. It fixes a subtle
  issue where a TTS splits a sentence into chunks and throws an error on only
  one of those chunks: the fallback would then generate speech for the whole
  un-chunked sentence, possibly saying the same parts twice. This also
  pre-compiles the regexes used, to speed things up a bit.

Co-authored-by: Ken <ken.smith@mycroft.ai>
parent 36620af703
commit e7ddd51256
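The core of the change is visible in the tts.py hunks below: the playback queue and PlaybackThread stop being per-instance attributes and become class-level members of TTS, shared by every engine instance. A minimal sketch of the pattern, heavily simplified from mycroft/tts/tts.py (audio playback, visemes and error handling are omitted, and daemon=True exists only in this sketch):

    from queue import Queue
    from threading import Thread


    class PlaybackThread(Thread):
        """Plays queued audio and notifies every attached TTS engine."""

        def __init__(self, queue):
            super().__init__(daemon=True)
            self.queue = queue
            self.tts = []        # every TTS instance sharing this thread
            self.bus = None

        def set_bus(self, bus):
            self.bus = bus

        def attach_tts(self, tts):
            """Register an engine so its cache is curated after playback."""
            self.tts.append(tts)

        def detach_tts(self, tts):
            """Unregister an engine, e.g. when the TTS configuration changes."""
            self.tts.remove(tts)

        def run(self):
            # The real thread pops (audio type, audio file, viseme, ident,
            # listen) tuples, plays them and emits the begin/end audio
            # messages on the bus; here we only drain the queue.
            while True:
                self.queue.get()


    class TTS:
        # Class-level singletons, shared by Mimic, Mimic2, plugins, ...
        queue = None
        playback = None

        def __init__(self):
            if TTS.queue is None:
                TTS.queue = Queue()
                TTS.playback = PlaybackThread(TTS.queue)
                TTS.playback.start()

        def init(self, bus):
            self.bus = bus
            TTS.playback.set_bus(bus)
            TTS.playback.attach_tts(self)

A second engine created later finds TTS.queue already set and simply attaches itself, so swapping engines never spawns a second playback thread.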
@@ -74,13 +74,7 @@ def handle_speak(event):
    # so we likely will want to get rid of this when not running on Mimic
    if (config.get('enclosure', {}).get('platform') != "picroft" and
            len(re.findall('<[^>]*>', utterance)) == 0):
        # Remove any whitespace present after the period,
        # if a character (only alpha) ends with a period
        # ex: A. Lincoln -> A.Lincoln
        # so that we don't split at the period
        utterance = re.sub(r'\b([A-za-z][\.])(\s+)', r'\g<1>', utterance)
        chunks = re.split(r'(?<!\w\.\w.)(?<![A-Z][a-z]\.)(?<=\.|\;|\?)\s',
                          utterance)
        chunks = tts.preprocess_utterance(utterance)
        # Apply the listen flag to the last chunk, set the rest to False
        chunks = [(chunks[i], listen if i == len(chunks) - 1 else False)
                  for i in range(len(chunks))]
@@ -116,10 +110,9 @@ def mute_and_speak(utterance, ident, listen=False):
    # update TTS object if configuration has changed
    if tts_hash != hash(str(config.get('tts', ''))):
        global tts
        # Stop tts playback thread
        tts.playback.stop()
        tts.playback.join()
        # Create new tts instance
        if tts:
            tts.playback.detach_tts(tts)
        tts = TTSFactory.create()
        tts.init(bus)
        tts_hash = hash(str(config.get('tts', '')))
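The mute_and_speak hunk above no longer stops and joins the playback thread when the TTS configuration changes; it only detaches the old engine and attaches the newly created one. A hedged sketch of that behaviour as a standalone helper (swap_tts_engine and create_engine are illustrative names only; the real code in mycroft/audio/speech.py works on module globals and TTSFactory.create()):

    def swap_tts_engine(old_tts, bus, create_engine):
        # The shared TTS.playback thread keeps running throughout the swap.
        if old_tts:
            # Stop post-playback cache checks for the old engine.
            old_tts.playback.detach_tts(old_tts)
        new_tts = create_engine()   # e.g. TTSFactory.create()
        new_tts.init(bus)           # attaches the new engine to TTS.playback
        return new_tts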
@@ -130,8 +130,6 @@ class Mimic(TTS):
        )
        self.default_binary = get_mimic_binary()

        self.clear_cache()

        # Download subscriber voices if needed
        self.subscriber_voices = get_subscriber_voices()
        self.is_subscriber = DeviceApi().is_subscriber
@@ -39,22 +39,43 @@ from mycroft.util.plugins import load_plugin
from queue import Queue, Empty
from .cache import hash_sentence, TextToSpeechCache


_TTS_ENV = deepcopy(os.environ)
_TTS_ENV['PULSE_PROP'] = 'media.role=phone'


EMPTY_PLAYBACK_QUEUE_TUPLE = (None, None, None, None, None)

SSML_TAGS = re.compile(r'<[^>]*>')
WHITESPACE_AFTER_PERIOD = re.compile(r'\b([A-za-z][\.])(\s+)')
SENTENCE_DELIMITERS = re.compile(
    r'(?<!\w\.\w.)(?<![A-Z][a-z]\.)(?<=\.|\;|\?)\s'
)


def default_preprocess_utterance(utterance):
    """Default method for preprocessing Mycroft utterances for TTS.

    Args:
        utteance (str): Input utterance

    Returns:
        [str]: list of preprocessed sentences
    """

    utterance = WHITESPACE_AFTER_PERIOD.sub(r'\g<1>', utterance)
    chunks = SENTENCE_DELIMITERS.split(utterance)
    return chunks


class PlaybackThread(Thread):
    """Thread class for playing back tts audio and sending
    viseme data to enclosure.
    """

    def __init__(self, queue):
        super(PlaybackThread, self).__init__()
        self.queue = queue
        self.tts = []
        self.bus = None

        self._terminated = False
        self._processing_queue = False
        self.enclosure = None
@@ -66,7 +87,28 @@ class PlaybackThread(Thread):
        self.pulse_env = None

    def init(self, tts):
        self.tts = tts
        """DEPRECATED! Init the TTS Playback thread.

        TODO: 22.02 Remove this
        """
        self.attach_tts(tts)
        self.set_bus(tts.bus)

    def set_bus(self, bus):
        """Provide bus instance to the TTS Playback thread.

        Args:
            bus (MycroftBusClient): bus client
        """
        self.bus = bus

    def attach_tts(self, tts):
        """Add TTS to be cache checked."""
        self.tts.append(tts)

    def detach_tts(self, tts):
        """Remove TTS from cache check."""
        self.tts.remove(tts)

    def clear_queue(self):
        """Remove all pending playbacks."""
@@ -90,7 +132,7 @@
        the loop then wait for the playback process to finish before starting
        checking the next position in queue.

        If the queue is empty the tts.end_audio() is called possibly triggering
        If the queue is empty the end_audio() is called possibly triggering
        listening.
        """
        while not self._terminated:
@@ -100,7 +142,7 @@
                self.blink(0.5)
                if not self._processing_queue:
                    self._processing_queue = True
                    self.tts.begin_audio()
                    self.begin_audio()

                stopwatch = Stopwatch()
                with stopwatch:
@@ -116,7 +158,7 @@
                report_timing(ident, 'speech_playback', stopwatch)

                if self.queue.empty():
                    self.tts.end_audio(listen)
                    self.end_audio(listen)
                    self._processing_queue = False
                    self.blink(0.2)
            except Empty:
@@ -124,9 +166,42 @@
            except Exception as e:
                LOG.exception(e)
                if self._processing_queue:
                    self.tts.end_audio(listen)
                    self.end_audio(listen)
                    self._processing_queue = False

    def begin_audio(self):
        """Perform befining of speech actions."""
        # Create signals informing start of speech
        if self.bus:
            self.bus.emit(Message("recognizer_loop:audio_output_start"))
        else:
            LOG.warning("Speech started before bus was attached.")

    def end_audio(self, listen):
        """Perform end of speech output actions.

        Will inform the system that speech has ended and trigger the TTS's
        cache checks. Listening will be triggered if requested.

        Args:
            listen (bool): True if listening event should be emitted
        """
        if self.bus:
            # Send end of speech signals to the system
            self.bus.emit(Message("recognizer_loop:audio_output_end"))
            if listen:
                self.bus.emit(Message('mycroft.mic.listen'))

            # Clear cache for all attached tts objects
            # This is basically the only safe time
            for tts in self.tts:
                tts.cache.curate()

            # This check will clear the filesystem IPC "signal"
            check_for_signal("isSpeaking")
        else:
            LOG.warning("Speech started before bus was attached.")

    def show_visemes(self, pairs):
        """Send viseme data to enclosure
@@ -167,6 +242,8 @@ class TTS(metaclass=ABCMeta):
        phonetic_spelling (bool): Whether to spell certain words phonetically
        ssml_tags (list): Supported ssml properties. Ex. ['speak', 'prosody']
    """
    queue = None
    playback = None

    def __init__(self, lang, config, validator, audio_ext='wav',
                 phonetic_spelling=True, ssml_tags=None):
@@ -183,9 +260,12 @@
        self.filename = get_temp_path('tts.wav')
        self.enclosure = None
        random.seed()
        self.queue = Queue()
        self.playback = PlaybackThread(self.queue)
        self.playback.start()

        if TTS.queue is None:
            TTS.queue = Queue()
            TTS.playback = PlaybackThread(TTS.queue)
            TTS.playback.start()

        self.spellings = self.load_spellings()
        self.tts_name = type(self).__name__
        self.cache = TextToSpeechCache(
@@ -252,9 +332,10 @@
            bus: Mycroft messagebus connection
        """
        self.bus = bus
        self.playback.init(self)
        TTS.playback.set_bus(bus)
        TTS.playback.attach_tts(self)
        self.enclosure = EnclosureAPI(self.bus)
        self.playback.enclosure = self.enclosure
        TTS.playback.enclosure = self.enclosure

    def get_tts(self, sentence, wav_file):
        """Abstract method that a tts implementation needs to implement.
@@ -306,7 +387,7 @@
            return self.remove_ssml(utterance)

        # find ssml tags in string
        tags = re.findall('<[^>]*>', utterance)
        tags = SSML_TAGS.findall(utterance)

        for tag in tags:
            if any(supported in tag for supported in self.ssml_tags):
@@ -318,6 +399,21 @@
        # return text with supported ssml tags only
        return utterance.replace("  ", " ")

    def preprocess_utterance(self, utterance):
        """Preprocess utterance into list of chunks suitable for the TTS.

        Perform general chunking and TTS specific chunking.
        """
        # Remove any whitespace present after the period,
        # if a character (only alpha) ends with a period
        # ex: A. Lincoln -> A.Lincoln
        # so that we don't split at the period
        chunks = default_preprocess_utterance(utterance)
        result = []
        for chunk in chunks:
            result += self._preprocess_sentence(chunk)
        return result

    def _preprocess_sentence(self, sentence):
        """Default preprocessing is no preprocessing.
@@ -347,13 +443,7 @@
        sentence = self.validate_ssml(sentence)

        create_signal("isSpeaking")
        try:
            self._execute(sentence, ident, listen)
        except Exception:
            # If an error occurs end the audio sequence through an empty entry
            self.queue.put(EMPTY_PLAYBACK_QUEUE_TUPLE)
            # Re-raise to allow the Exception to be handled externally as well.
            raise
        self._execute(sentence, ident, listen)

    def _execute(self, sentence, ident, listen):
        if self.phonetic_spelling:
@@ -362,6 +452,8 @@
                    sentence = sentence.replace(word,
                                                self.spellings[word.lower()])

        # TODO: 22.02 This is no longer needed and can be removed
        # Just kept for compatibility for now
        chunks = self._preprocess_sentence(sentence)
        # Apply the listen flag to the last chunk, set the rest to False
        chunks = [(chunks[i], listen if i == len(chunks) - 1 else False)
@@ -409,7 +501,7 @@
                audio_file, phoneme_file
            )
            viseme = self.viseme(phonemes) if phonemes else None
            self.queue.put(
            TTS.queue.put(
                (self.audio_ext, str(audio_file.path), viseme, ident, l)
            )
@@ -489,10 +581,6 @@
            LOG.debug("Failed to read .PHO from cache")
            return None

    def __del__(self):
        self.playback.stop()
        self.playback.join()


class TTSValidator(metaclass=ABCMeta):
    """TTS Validator abstract class to be implemented by all TTS engines.
@@ -500,7 +588,6 @@ class TTSValidator(metaclass=ABCMeta):
    It exposes and implements ``validate(tts)`` function as a template to
    validate the TTS engines.
    """

    def __init__(self, tts):
        self.tts = tts
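The tts.py hunks above consolidate sentence splitting: default_preprocess_utterance performs the general pass with pre-compiled regexes, and TTS.preprocess_utterance then applies any engine-specific splitting from _preprocess_sentence to each chunk, so an error in one chunk no longer causes the fallback to re-speak the whole un-chunked utterance. A rough, self-contained sketch of how the two stages compose (regexes copied from the diff; BaseTTS and MyTTS are illustrative stand-ins, not the real classes):

    import re

    WHITESPACE_AFTER_PERIOD = re.compile(r'\b([A-za-z][\.])(\s+)')
    SENTENCE_DELIMITERS = re.compile(
        r'(?<!\w\.\w.)(?<![A-Z][a-z]\.)(?<=\.|\;|\?)\s'
    )


    def default_preprocess_utterance(utterance):
        # Keep abbreviations such as "A. Lincoln" together, then split on
        # sentence boundaries (., ; and ? followed by whitespace).
        utterance = WHITESPACE_AFTER_PERIOD.sub(r'\g<1>', utterance)
        return SENTENCE_DELIMITERS.split(utterance)


    class BaseTTS:
        def _preprocess_sentence(self, sentence):
            # Default: no engine-specific splitting.
            return [sentence]

        def preprocess_utterance(self, utterance):
            result = []
            for chunk in default_preprocess_utterance(utterance):
                result += self._preprocess_sentence(chunk)
            return result


    class MyTTS(BaseTTS):
        def _preprocess_sentence(self, sentence):
            # Hypothetical engine-specific pass, e.g. splitting on commas.
            return [part.strip() for part in sentence.split(',') if part.strip()]


    print(MyTTS().preprocess_utterance('A. Lincoln spoke well. Hello, world; bye.'))
    # -> ['A.Lincoln spoke well.', 'Hello', 'world;', 'bye.']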
@@ -23,6 +23,7 @@ from os.path import exists

import mycroft.audio.speech as speech
from mycroft.messagebus import Message
from mycroft.tts.tts import default_preprocess_utterance
from mycroft.tts.remote_tts import RemoteTTSTimeoutException

"""Tests for speech dispatch service."""
@@ -36,6 +37,8 @@ def setup_mocks(config_mock, tts_factory_mock):
    config_mock.get.return_value = {}

    tts_factory_mock.create.return_value = tts_mock

    tts_mock.preprocess_utterance.side_effect = default_preprocess_utterance
    config_mock.reset_mock()
    tts_factory_mock.reset_mock()
    tts_mock.reset_mock()
@@ -12,6 +12,14 @@ mock_audio = "/tmp/mock_path"
mock_viseme = mock.Mock(name='viseme')


class MsgTypeCheck:
    def __init__(self, msg_type):
        self.msg_type = msg_type

    def __eq__(self, other):
        return self.msg_type == other.msg_type


class MockTTS(mycroft.tts.TTS):
    def __init__(self, lang, config, validator, audio_ext='wav',
                 phonetic_spelling=True, ssml_tags=None):
@@ -58,17 +66,20 @@ class TestPlaybackThread(unittest.TestCase):
        # Test wav data
        wav_mock = mock.Mock(name='wav_data')
        queue.put(('wav', wav_mock, None, 0, False))
        time.sleep(0.2)
        mock_tts.begin_audio.called_with()
        time.sleep(0.3)
        mock_play_wav.assert_called_with(wav_mock, environment=None)
        mock_tts.end_audio.assert_called_with(False)
        mock_tts.bus.emit.assert_called_with(
            MsgTypeCheck('recognizer_loop:audio_output_end')
        )

        # Test mp3 data and trigger listening True
        mp3_mock = mock.Mock(name='mp3_data')
        queue.put(('mp3', mp3_mock, None, 0, True))
        time.sleep(0.2)
        mock_play_mp3.assert_called_with(mp3_mock, environment=None)
        mock_tts.end_audio.assert_called_with(True)
        mock_tts.bus.emit.assert_called_with(
            MsgTypeCheck('mycroft.mic.listen')
        )
        self.assertFalse(playback.enclosure.get.called)

        # Test sending visemes
@@ -92,7 +103,7 @@ class TestTTS(unittest.TestCase):
        tts.init(bus_mock)
        self.assertTrue(tts.bus is bus_mock)

        tts.queue = mock.Mock()
        mycroft.tts.TTS.queue = mock.Mock()
        with mock.patch('mycroft.tts.tts.open') as mock_open:
            tts.cache.temporary_cache_dir = Path('/tmp/dummy')
            tts.execute('Oh no, not again', 42)
@@ -100,7 +111,7 @@
            'Oh no, not again',
            '/tmp/dummy/8da7f22aeb16bc3846ad07b644d59359.wav'
        )
        tts.queue.put.assert_called_with(
        mycroft.tts.TTS.queue.put.assert_called_with(
            (
                'wav',
                mock_audio,
@@ -117,7 +128,7 @@ class TestTTS(unittest.TestCase):
        tts.init(bus_mock)
        self.assertTrue(tts.bus is bus_mock)

        tts.queue = mock.Mock()
        mycroft.tts.TTS.queue = mock.Mock()
        with mock.patch('mycroft.tts.tts.open') as mock_open:
            tts.cache.temporary_cache_dir = Path('/tmp/dummy')
            tts.execute('Oh no, not again', 42)
@@ -125,7 +136,7 @@
            'Oh no, not again',
            '/tmp/dummy/8da7f22aeb16bc3846ad07b644d59359.wav'
        )
        tts.queue.put.assert_called_with(
        mycroft.tts.TTS.queue.put.assert_called_with(
            (
                'wav',
                mock_audio,
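The MsgTypeCheck helper added to the tests above relies on how mock compares call arguments: assert_called_with uses ==, so a matcher that implements __eq__ can check only the message type and ignore the payload. A small self-contained illustration (the Message class here is a stand-in for mycroft.messagebus.Message):

    from unittest import mock


    class Message:
        # Stand-in for mycroft.messagebus.Message; only msg_type matters here.
        def __init__(self, msg_type, data=None):
            self.msg_type = msg_type
            self.data = data or {}


    class MsgTypeCheck:
        def __init__(self, msg_type):
            self.msg_type = msg_type

        def __eq__(self, other):
            # Compare only the message type, ignoring any data payload.
            return self.msg_type == other.msg_type


    bus = mock.Mock()
    bus.emit(Message('mycroft.mic.listen', {'utterances': []}))
    # Passes even though the payload differs, because equality only checks
    # the message type.
    bus.emit.assert_called_with(MsgTypeCheck('mycroft.mic.listen'))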