# Copyright 2017 Mycroft AI Inc. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. # import os import re import time import typing from collections import defaultdict from dataclasses import dataclass, field from datetime import datetime from pathlib import Path from threading import Lock from uuid import uuid4 from mycroft.configuration import Configuration from mycroft.metrics import report_timing, Stopwatch from mycroft.tts import TTSFactory from mycroft.util import check_for_signal, create_signal, resolve_resource_file from mycroft.util.log import LOG from mycroft.messagebus.message import Message from mycroft.tts.remote_tts import RemoteTTSException from mycroft.tts.mimic_tts import Mimic @dataclass class TTSSession: id: str cache_paths: typing.List[typing.Union[str, Path]] = field(default_factory=list) expire_after: typing.Optional[datetime] = None bus = None # Mycroft messagebus connection config = None tts = None tts_hash = None lock = Lock() mimic_fallback_obj = None tts_session_cache: typing.Dict[str, TTSSession] = dict() _last_stop_signal = 0 def handle_speak(event): """Handle "speak" message Parse sentences and invoke text to speech service. """ config = Configuration.get() Configuration.set_config_update_handlers(bus) global _last_stop_signal # if the message is targeted and audio is not the target don't # don't synthezise speech event.context = event.context or {} if event.context.get("destination") and not ( "debug_cli" in event.context["destination"] or "audio" in event.context["destination"] ): return # Get conversation ID if event.context and "ident" in event.context: ident = event.context["ident"] else: ident = "unknown" start = time.time() # Time of speech request with lock: stopwatch = Stopwatch() stopwatch.start() utterance = event.data["utterance"] cache_only = event.data.get("cache_only", False) speak = not cache_only listen = event.data.get("expect_response", False) cache_key = event.data.get("cache_key") if cache_key and speak: cache_keep = event.data.get("cache_keep", False) was_in_cache = _speak_from_cache(cache_key, keep=cache_keep, listen=listen) if was_in_cache: # Successfully spoken from cache return tts_session_id = cache_key or str(uuid4()) if cache_only: # Create new TTS session expire_after: typing.Optional[datetime] = None expire_after_str = event.data.get("cache_expire") if expire_after_str: expire_after = datetime.fromisoformat(expire_after_str) tts_session_cache[tts_session_id] = TTSSession( id=tts_session_id, expire_after=expire_after ) create_signal("isSpeaking") # This is a bit of a hack for Picroft. The analog audio on a Pi blocks # for 30 seconds fairly often, so we don't want to break on periods # (decreasing the chance of encountering the block). But we will # keep the split for non-Picroft installs since it give user feedback # faster on longer phrases. # # TODO: Remove or make an option? This is really a hack, anyway, # so we likely will want to get rid of this when not running on Mimic if ( config.get("enclosure", {}).get("platform") != "picroft" and len(re.findall("<[^>]*>", utterance)) == 0 ): # Remove any whitespace present after the period, # if a character (only alpha) ends with a period # ex: A. Lincoln -> A.Lincoln # so that we don't split at the period # NOTE: This does not work because things like "a.m." and "I.P." # will have their whitespace removed too. # # utterance = re.sub(r'\b([A-za-z][\.])(\s+)', r'\g<1>', utterance) chunks = re.split( r"(? start or check_for_signal("buttonPress"): # Clear any newly queued speech tts.playback.clear() break try: mute_and_speak( chunk, ident, listen, session_id=tts_session_id, chunk_idx=chunk_idx, num_chunks=num_chunks, speak=speak, ) except KeyboardInterrupt: raise except Exception: LOG.error("Error in mute_and_speak", exc_info=True) if cache_only: bus.emit(event.reply("speak.cache.reply", {"key": tts_session_id})) else: mute_and_speak(utterance, ident, listen) stopwatch.stop() report_timing( ident, "speech", stopwatch, {"utterance": utterance, "tts": tts.__class__.__name__}, ) def mute_and_speak( utterance, ident, listen=False, session_id=None, chunk_idx=0, num_chunks=1, speak=True, ): """Mute mic and start speaking the utterance using selected tts backend. Args: utterance: The sentence to be spoken ident: Ident tying the utterance to the source query """ global tts_hash # update TTS object if configuration has changed if tts_hash != hash(str(config.get("tts", ""))): global tts # Stop tts playback thread tts.playback.stop() tts.playback.join() # Create new tts instance tts = TTSFactory.create() tts.init(bus) tts_hash = hash(str(config.get("tts", ""))) LOG.debug("Listen=%s, Speak:%s" % (listen, utterance)) try: cached_path = tts.execute( utterance, ident, listen, session_id=session_id, chunk_idx=chunk_idx, num_chunks=num_chunks, speak=speak, ) if not speak: session = tts_session_cache[session_id] session.cache_paths.append(cached_path) if session.expire_after: LOG.info( "Cached utterance for session %s until %s", session_id, session.expire_after, ) else: LOG.info("Cached utterance for session %s", session_id) except RemoteTTSException as e: LOG.error(e) mimic_fallback_tts(utterance, ident, listen) except Exception: LOG.exception("TTS execution failed.") def _get_mimic_fallback(): """Lazily initializes the fallback TTS if needed.""" global mimic_fallback_obj if not mimic_fallback_obj: config = Configuration.get() tts_config = config.get("tts", {}).get("mimic", {}) lang = config.get("lang", "en-us") tts = Mimic(lang, tts_config) tts.validator.validate() tts.init(bus) mimic_fallback_obj = tts return mimic_fallback_obj def mimic_fallback_tts(utterance, ident, listen): """Speak utterance using fallback TTS if connection is lost. Args: utterance (str): sentence to speak ident (str): interaction id for metrics listen (bool): True if interaction should end with mycroft listening """ tts = _get_mimic_fallback() LOG.debug("Mimic fallback, utterance : " + str(utterance)) tts.execute(utterance, ident, listen) def handle_stop(event): """Handle stop message. Shutdown any speech. """ global _last_stop_signal if check_for_signal("isSpeaking", -1): _last_stop_signal = time.time() tts.playback.clear() # Clear here to get instant stop bus.emit(Message("mycroft.stop.handled", {"by": "TTS"})) def handle_pause(event): tts.playback.pause() def handle_resume(event): tts.playback.resume() def _speak_from_cache(key: str, keep: bool = False, listen: bool = False) -> bool: if keep: session = tts_session_cache.get(key) else: session = tts_session_cache.pop(key, None) if session is None: LOG.warning("No TTS session cache for %s", key) return False if (session.expire_after is not None) and (datetime.now() > session.expire_after): LOG.debug("TTS session expired for %s", key) # Ensure session is gone tts_session_cache.pop(key, None) return False # Verify that all paths exist for cache_path in session.cache_paths: if not os.path.exists(cache_path): return False create_signal("isSpeaking") session_id = str(uuid4()) num_chunks = len(session.cache_paths) for chunk_idx, cache_path in enumerate(session.cache_paths): audio_uri = "file://" + str(cache_path) bus.emit( Message( "mycroft.tts.speak-chunk", data={ "uri": audio_uri, "session_id": session_id, "chunk_index": chunk_idx, "num_chunks": num_chunks, "listen": listen if chunk_idx == (num_chunks - 1) else False, }, ) ) return True def init(messagebus): """Start speech related handlers. Args: messagebus: Connection to the Mycroft messagebus """ global bus global tts global tts_hash global config bus = messagebus Configuration.set_config_update_handlers(bus) config = Configuration.get() tts = TTSFactory.create() tts.init(bus) tts_hash = hash(str(config.get("tts", ""))) bus.on("mycroft.stop", handle_stop) bus.on("mycroft.audio.speech.stop", handle_stop) bus.on("mycroft.audio.speech.pause", handle_pause) bus.on("mycroft.audio.speech.resume", handle_resume) bus.on("speak", handle_speak) bus.on("speak.cache", handle_speak) def shutdown(): """Shutdown the audio service cleanly. Stop any playing audio and make sure threads are joined correctly. """ if tts: tts.playback.stop() tts.playback.join() if mimic_fallback_obj: mimic_fallback_obj.playback.stop() mimic_fallback_obj.playback.join()