Merge pull request #2450 from forslund/refactor/tts

Add TTS tests
2020-01-13 15:24:50 +01:00 · 2020-01-13 15:24:50 +01:00 · 80b3ffb587
parent 4ab39bf0fd 8f6822278f
commit 80b3ffb587
18 changed files with 959 additions and 600 deletions
--- a/mycroft/tts/init.py
+++ b/mycroft/tts/init.py
@ -12,512 +12,9 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 #
-from copy import deepcopy
-import hashlib
-import os
-import random
-import re
-from abc import ABCMeta, abstractmethod
-from threading import Thread
-from time import time, sleep
+"""The TTS module contains TTS classes for interfacing with various TTS
+services. This includes both local and remote services. The module also
+declares a "factory" for spawning a TTS service based on configuration.
+"""

-import os.path
-from os.path import dirname, exists, isdir, join
-
-import mycroft.util
-from mycroft.enclosure.api import EnclosureAPI
-from mycroft.configuration import Configuration
-from mycroft.messagebus.message import Message
-from mycroft.metrics import report_timing, Stopwatch
-from mycroft.util import (
-    play_wav, play_mp3, check_for_signal, create_signal, resolve_resource_file
-)
-from mycroft.util.log import LOG
-from queue import Queue, Empty
-
-
-_TTS_ENV = deepcopy(os.environ)
-_TTS_ENV['PULSE_PROP'] = 'media.role=phone'
-
-
-def send_playback_metric(stopwatch, ident):
-    """Send playback metrics in a background thread."""
-
-    def do_send(stopwatch, ident):
-        report_timing(ident, 'speech_playback', stopwatch)
-
-    t = Thread(target=do_send, args=(stopwatch, ident))
-    t.daemon = True
-    t.start()
-
-
-class PlaybackThread(Thread):
-    """Thread class for playing back tts audio and sending
-    viseme data to enclosure.
-    """
-
-    def __init__(self, queue):
-        super(PlaybackThread, self).__init__()
-        self.queue = queue
-        self._terminated = False
-        self._processing_queue = False
-        # Check if the tts shall have a ducking role set
-        if Configuration.get().get('tts', {}).get('pulse_duck'):
-            self.pulse_env = _TTS_ENV
-        else:
-            self.pulse_env = None
-
-    def init(self, tts):
-        self.tts = tts
-
-    def clear_queue(self):
-        """Remove all pending playbacks."""
-        while not self.queue.empty():
-            self.queue.get()
-        try:
-            self.p.terminate()
-        except Exception:
-            pass
-
-    def run(self):
-        """Thread main loop. Get audio and extra data from queue and play.
-
-        The queue messages is a tuple containing
-        snd_type: 'mp3' or 'wav' telling the loop what format the data is in
-        data: path to temporary audio data
-        videmes: list of visemes to display while playing
-        listen: if listening should be triggered at the end of the sentence.
-
-        Playback of audio is started and the visemes are sent over the bus
-        the loop then wait for the playback process to finish before starting
-        checking the next position in queue.
-
-        If the queue is empty the tts.end_audio() is called possibly triggering
-        listening.
-        """
-        while not self._terminated:
-            try:
-                (snd_type, data,
-                 visemes, ident, listen) = self.queue.get(timeout=2)
-                self.blink(0.5)
-                if not self._processing_queue:
-                    self._processing_queue = True
-                    self.tts.begin_audio()
-
-                stopwatch = Stopwatch()
-                with stopwatch:
-                    if snd_type == 'wav':
-                        self.p = play_wav(data, environment=self.pulse_env)
-                    elif snd_type == 'mp3':
-                        self.p = play_mp3(data, environment=self.pulse_env)
-
-                    if visemes:
-                        self.show_visemes(visemes)
-                    self.p.communicate()
-                    self.p.wait()
-                send_playback_metric(stopwatch, ident)
-
-                if self.queue.empty():
-                    self.tts.end_audio(listen)
-                    self._processing_queue = False
-                self.blink(0.2)
-            except Empty:
-                pass
-            except Exception as e:
-                LOG.exception(e)
-                if self._processing_queue:
-                    self.tts.end_audio(listen)
-                    self._processing_queue = False
-
-    def show_visemes(self, pairs):
-        """Send viseme data to enclosure
-
-        Arguments:
-            pairs(list): Visime and timing pair
-
-        Returns:
-            True if button has been pressed.
-        """
-        if self.enclosure:
-            self.enclosure.mouth_viseme(time(), pairs)
-
-    def clear(self):
-        """Clear all pending actions for the TTS playback thread."""
-        self.clear_queue()
-
-    def blink(self, rate=1.0):
-        """Blink mycroft's eyes"""
-        if self.enclosure and random.random() < rate:
-            self.enclosure.eyes_blink("b")
-
-    def stop(self):
-        """Stop thread"""
-        self._terminated = True
-        self.clear_queue()
-
-
-class TTS(metaclass=ABCMeta):
-    """TTS abstract class to be implemented by all TTS engines.
-
-    It aggregates the minimum required parameters and exposes
-    ``execute(sentence)`` and ``validate_ssml(sentence)`` functions.
-
-    Arguments:
-        lang (str):
-        config (dict): Configuration for this specific tts engine
-        validator (TTSValidator): Used to verify proper installation
-        phonetic_spelling (bool): Whether to spell certain words phonetically
-        ssml_tags (list): Supported ssml properties. Ex. ['speak', 'prosody']
-    """
-    def __init__(self, lang, config, validator, audio_ext='wav',
-                 phonetic_spelling=True, ssml_tags=None):
-        super(TTS, self).__init__()
-        self.bus = None  # initalized in "init" step
-        self.lang = lang or 'en-us'
-        self.config = config
-        self.validator = validator
-        self.phonetic_spelling = phonetic_spelling
-        self.audio_ext = audio_ext
-        self.ssml_tags = ssml_tags or []
-
-        self.voice = config.get("voice")
-        self.filename = '/tmp/tts.wav'
-        self.enclosure = None
-        random.seed()
-        self.queue = Queue()
-        self.playback = PlaybackThread(self.queue)
-        self.playback.start()
-        self.clear_cache()
-        self.spellings = self.load_spellings()
-        self.tts_name = type(self).__name__
-
-    def load_spellings(self):
-        """Load phonetic spellings of words as dictionary"""
-        path = join('text', self.lang, 'phonetic_spellings.txt')
-        spellings_file = resolve_resource_file(path)
-        if not spellings_file:
-            return {}
-        try:
-            with open(spellings_file) as f:
-                lines = filter(bool, f.read().split('\n'))
-            lines = [i.split(':') for i in lines]
-            return {key.strip(): value.strip() for key, value in lines}
-        except ValueError:
-            LOG.exception('Failed to load phonetic spellings.')
-            return {}
-
-    def begin_audio(self):
-        """Helper function for child classes to call in execute()"""
-        # Create signals informing start of speech
-        self.bus.emit(Message("recognizer_loop:audio_output_start"))
-
-    def end_audio(self, listen=False):
-        """Helper function for child classes to call in execute().
-
-        Sends the recognizer_loop:audio_output_end message (indicating
-        that speaking is done for the moment) as well as trigger listening
-        if it has been requested. It also checks if cache directory needs
-        cleaning to free up disk space.
-
-        Arguments:
-            listen (bool): indication if listening trigger should be sent.
-        """
-
-        self.bus.emit(Message("recognizer_loop:audio_output_end"))
-        if listen:
-            self.bus.emit(Message('mycroft.mic.listen'))
-        # Clean the cache as needed
-        cache_dir = mycroft.util.get_cache_directory("tts/" + self.tts_name)
-        mycroft.util.curate_cache(cache_dir, min_free_percent=100)
-
-        # This check will clear the "signal"
-        check_for_signal("isSpeaking")
-
-    def init(self, bus):
-        """Performs intial setup of TTS object.
-
-        Arguments:
-            bus:    Mycroft messagebus connection
-        """
-        self.bus = bus
-        self.playback.init(self)
-        self.enclosure = EnclosureAPI(self.bus)
-        self.playback.enclosure = self.enclosure
-
-    def get_tts(self, sentence, wav_file):
-        """Abstract method that a tts implementation needs to implement.
-
-        Should get data from tts.
-
-        Arguments:
-            sentence(str): Sentence to synthesize
-            wav_file(str): output file
-
-        Returns:
-            tuple: (wav_file, phoneme)
-        """
-        pass
-
-    def modify_tag(self, tag):
-        """Override to modify each supported ssml tag"""
-        return tag
-
-    @staticmethod
-    def remove_ssml(text):
-        return re.sub('<[^>]*>', '', text).replace('  ', ' ')
-
-    def validate_ssml(self, utterance):
-        """Check if engine supports ssml, if not remove all tags.
-
-        Remove unsupported / invalid tags
-
-        Arguments:
-            utterance(str): Sentence to validate
-
-        Returns:
-            validated_sentence (str)
-        """
-        # if ssml is not supported by TTS engine remove all tags
-        if not self.ssml_tags:
-            return self.remove_ssml(utterance)
-
-        # find ssml tags in string
-        tags = re.findall('<[^>]*>', utterance)
-
-        for tag in tags:
-            if any(supported in tag for supported in self.ssml_tags):
-                utterance = utterance.replace(tag, self.modify_tag(tag))
-            else:
-                # remove unsupported tag
-                utterance = utterance.replace(tag, "")
-
-        # return text with supported ssml tags only
-        return utterance.replace("  ", " ")
-
-    def _preprocess_sentence(self, sentence):
-        """Default preprocessing is no preprocessing.
-
-        This method can be overridden to create chunks suitable to the
-        TTS engine in question.
-
-        Arguments:
-            sentence (str): sentence to preprocess
-
-        Returns:
-            list: list of sentence parts
-        """
-        return [sentence]
-
-    def execute(self, sentence, ident=None, listen=False):
-        """Convert sentence to speech, preprocessing out unsupported ssml
-
-            The method caches results if possible using the hash of the
-            sentence.
-
-            Arguments:
-                sentence:   Sentence to be spoken
-                ident:      Id reference to current interaction
-                listen:     True if listen should be triggered at the end
-                            of the utterance.
-        """
-        sentence = self.validate_ssml(sentence)
-
-        create_signal("isSpeaking")
-        if self.phonetic_spelling:
-            for word in re.findall(r"[\w']+", sentence):
-                if word.lower() in self.spellings:
-                    sentence = sentence.replace(word,
-                                                self.spellings[word.lower()])
-
-        chunks = self._preprocess_sentence(sentence)
-        # Apply the listen flag to the last chunk, set the rest to False
-        chunks = [(chunks[i], listen if i == len(chunks) - 1 else False)
-                  for i in range(len(chunks))]
-
-        for sentence, l in chunks:
-            key = str(hashlib.md5(
-                sentence.encode('utf-8', 'ignore')).hexdigest())
-            wav_file = os.path.join(
-                mycroft.util.get_cache_directory("tts/" + self.tts_name),
-                key + '.' + self.audio_ext)
-
-            if os.path.exists(wav_file):
-                LOG.debug("TTS cache hit")
-                phonemes = self.load_phonemes(key)
-            else:
-                wav_file, phonemes = self.get_tts(sentence, wav_file)
-                if phonemes:
-                    self.save_phonemes(key, phonemes)
-
-            vis = self.viseme(phonemes) if phonemes else None
-            self.queue.put((self.audio_ext, wav_file, vis, ident, l))
-
-    def viseme(self, phonemes):
-        """Create visemes from phonemes. Needs to be implemented for all
-            tts backends.
-
-            Arguments:
-                phonemes(str): String with phoneme data
-        """
-        return None
-
-    def clear_cache(self):
-        """Remove all cached files."""
-        if not os.path.exists(mycroft.util.get_cache_directory('tts')):
-            return
-        for d in os.listdir(mycroft.util.get_cache_directory("tts")):
-            dir_path = os.path.join(mycroft.util.get_cache_directory("tts"), d)
-            if os.path.isdir(dir_path):
-                for f in os.listdir(dir_path):
-                    file_path = os.path.join(dir_path, f)
-                    if os.path.isfile(file_path):
-                        os.unlink(file_path)
-            # If no sub-folders are present, check if it is a file & clear it
-            elif os.path.isfile(dir_path):
-                os.unlink(dir_path)
-
-    def save_phonemes(self, key, phonemes):
-        """Cache phonemes
-
-        Arguments:
-            key:        Hash key for the sentence
-            phonemes:   phoneme string to save
-        """
-        cache_dir = mycroft.util.get_cache_directory("tts/" + self.tts_name)
-        pho_file = os.path.join(cache_dir, key + ".pho")
-        try:
-            with open(pho_file, "w") as cachefile:
-                cachefile.write(phonemes)
-        except Exception:
-            LOG.exception("Failed to write {} to cache".format(pho_file))
-            pass
-
-    def load_phonemes(self, key):
-        """Load phonemes from cache file.
-
-        Arguments:
-            Key:    Key identifying phoneme cache
-        """
-        pho_file = os.path.join(
-            mycroft.util.get_cache_directory("tts/" + self.tts_name),
-            key + ".pho")
-        if os.path.exists(pho_file):
-            try:
-                with open(pho_file, "r") as cachefile:
-                    phonemes = cachefile.read().strip()
-                return phonemes
-            except Exception:
-                LOG.debug("Failed to read .PHO from cache")
-        return None
-
-    def __del__(self):
-        self.playback.stop()
-        self.playback.join()
-
-
-class TTSValidator(metaclass=ABCMeta):
-    """TTS Validator abstract class to be implemented by all TTS engines.
-
-    It exposes and implements ``validate(tts)`` function as a template to
-    validate the TTS engines.
-    """
-    def __init__(self, tts):
-        self.tts = tts
-
-    def validate(self):
-        self.validate_dependencies()
-        self.validate_instance()
-        self.validate_filename()
-        self.validate_lang()
-        self.validate_connection()
-
-    def validate_dependencies(self):
-        pass
-
-    def validate_instance(self):
-        clazz = self.get_tts_class()
-        if not isinstance(self.tts, clazz):
-            raise AttributeError('tts must be instance of ' + clazz.__name__)
-
-    def validate_filename(self):
-        filename = self.tts.filename
-        if not (filename and filename.endswith('.wav')):
-            raise AttributeError('file: %s must be in .wav format!' % filename)
-
-        dir_path = dirname(filename)
-        if not (exists(dir_path) and isdir(dir_path)):
-            raise AttributeError('filename: %s is not valid!' % filename)
-
-    @abstractmethod
-    def validate_lang(self):
-        pass
-
-    @abstractmethod
-    def validate_connection(self):
-        pass
-
-    @abstractmethod
-    def get_tts_class(self):
-        pass
-
-
-class TTSFactory:
-    from mycroft.tts.espeak_tts import ESpeak
-    from mycroft.tts.fa_tts import FATTS
-    from mycroft.tts.google_tts import GoogleTTS
-    from mycroft.tts.mary_tts import MaryTTS
-    from mycroft.tts.mimic_tts import Mimic
-    from mycroft.tts.spdsay_tts import SpdSay
-    from mycroft.tts.bing_tts import BingTTS
-    from mycroft.tts.ibm_tts import WatsonTTS
-    from mycroft.tts.responsive_voice_tts import ResponsiveVoice
-    from mycroft.tts.mimic2_tts import Mimic2
-    from mycroft.tts.yandex_tts import YandexTTS
-
-    CLASSES = {
-        "mimic": Mimic,
-        "mimic2": Mimic2,
-        "google": GoogleTTS,
-        "marytts": MaryTTS,
-        "fatts": FATTS,
-        "espeak": ESpeak,
-        "spdsay": SpdSay,
-        "watson": WatsonTTS,
-        "bing": BingTTS,
-        "responsive_voice": ResponsiveVoice,
-        "yandex": YandexTTS
-    }
-
-    @staticmethod
-    def create():
-        """Factory method to create a TTS engine based on configuration.
-
-        The configuration file ``mycroft.conf`` contains a ``tts`` section with
-        the name of a TTS module to be read by this method.
-
-        "tts": {
-            "module": <engine_name>
-        }
-        """
-        config = Configuration.get()
-        lang = config.get("lang", "en-us")
-        tts_module = config.get('tts', {}).get('module', 'mimic')
-        tts_config = config.get('tts', {}).get(tts_module, {})
-        tts_lang = tts_config.get('lang', lang)
-        try:
-            clazz = TTSFactory.CLASSES.get(tts_module)
-            tts = clazz(tts_lang, tts_config)
-            tts.validator.validate()
-        except Exception as e:
-            # Fallback to mimic if an error occurs while loading.
-            if tts_module != 'mimic':
-                LOG.exception('The selected TTS backend couldn\'t be loaded. '
-                              'Falling back to Mimic')
-                from mycroft.tts.mimic_tts import Mimic
-                tts = Mimic(tts_lang, tts_config)
-                tts.validator.validate()
-            else:
-                LOG.exception('The TTS could not be loaded.')
-                raise
-
-        return tts
+from .tts import TTSFactory, TTS, TTSValidator, PlaybackThread
--- a/mycroft/tts/bing_tts.py
+++ b/mycroft/tts/bing_tts.py
@ -13,7 +13,7 @@
 # limitations under the License.
 #

-from mycroft.tts import TTS, TTSValidator
+from .tts import TTS, TTSValidator
 from mycroft.configuration import Configuration


--- a/mycroft/tts/espeak_tts.py
+++ b/mycroft/tts/espeak_tts.py
@ -14,7 +14,7 @@
 #
 import subprocess

-from mycroft.tts import TTS, TTSValidator
+from .tts import TTS, TTSValidator


 class ESpeak(TTS):
--- a/mycroft/tts/fa_tts.py
+++ b/mycroft/tts/fa_tts.py
@ -14,8 +14,8 @@
 #
 import requests

-from mycroft.tts import TTSValidator
-from mycroft.tts.remote_tts import RemoteTTS
+from .tts import TTSValidator
+from .remote_tts import RemoteTTS


 class FATTS(RemoteTTS):
--- a/mycroft/tts/google_tts.py
+++ b/mycroft/tts/google_tts.py
@ -14,15 +14,24 @@
 #
 from gtts import gTTS

-from mycroft.tts import TTS, TTSValidator
+from .tts import TTS, TTSValidator


 class GoogleTTS(TTS):
+    """Interface to google TTS."""
    def __init__(self, lang, config):
        super(GoogleTTS, self).__init__(lang, config, GoogleTTSValidator(
            self), 'mp3')

    def get_tts(self, sentence, wav_file):
+        """Fetch tts audio using gTTS.
+
+        Arguments:
+            sentence (str): Sentence to generate audio for
+            wav_file (str): output file path
+        Returns:
+            Tuple ((str) written file, None)
+        """
        tts = gTTS(text=sentence, lang=self.lang)
        tts.save(wav_file)
        return (wav_file, None)  # No phonemes
--- a/mycroft/tts/ibm_tts.py
+++ b/mycroft/tts/ibm_tts.py
@ -13,8 +13,8 @@
 # limitations under the License.
 #

-from mycroft.tts import TTSValidator
-from mycroft.tts.remote_tts import RemoteTTS
+from .tts import TTSValidator
+from .remote_tts import RemoteTTS
 from mycroft.configuration import Configuration
 from requests.auth import HTTPBasicAuth

--- a/mycroft/tts/mary_tts.py
+++ b/mycroft/tts/mary_tts.py
@ -14,8 +14,8 @@
 #
 import requests

-from mycroft.tts import TTSValidator
-from mycroft.tts.remote_tts import RemoteTTS
+from .tts import TTSValidator
+from .remote_tts import RemoteTTS


 class MaryTTS(RemoteTTS):
--- a/mycroft/tts/mimic2_tts.py
+++ b/mycroft/tts/mimic2_tts.py
@ -13,12 +13,11 @@
 # limitations under the License.
 #

-from mycroft.tts import TTS, TTSValidator
-from mycroft.tts.remote_tts import RemoteTTSTimeoutException
+from .tts import TTS, TTSValidator
+from .remote_tts import RemoteTTSTimeoutException
 from mycroft.util.log import LOG
-from mycroft.util.format import pronounce_number
 from mycroft.tts import cache_handler
-from mycroft.util import play_wav, get_cache_directory
+from mycroft.util import get_cache_directory
 from requests_futures.sessions import FuturesSession
 from requests.exceptions import (
    ReadTimeout, ConnectionError, ConnectTimeout, HTTPError
@ -38,9 +37,9 @@ _max_sentence_size = 170


 def _break_chunks(l, n):
-    """ Yield successive n-sized chunks
+    """Yield successive n-sized chunks

-    Args:
+    Arguments:
        l (list): text (str) to split
        chunk_size (int): chunk size
    """
@ -49,9 +48,9 @@ def _break_chunks(l, n):


 def _split_by_chunk_size(text, chunk_size):
-    """ Split text into word chunks by chunk_size size
+    """Split text into word chunks by chunk_size size

-    Args:
+    Arguments:
        text (str): text to split
        chunk_size (int): chunk size

@ -86,10 +85,10 @@ def _split_by_chunk_size(text, chunk_size):


 def _split_by_punctuation(chunks, puncs):
-    """splits text by various punctionations
+    """Splits text by various punctuations
    e.g. hello, world => [hello, world]

-    Args:
+    Arguments:
        chunks (list or str): text (str) to split
        puncs (list): list of punctuations used to split text

@ -113,7 +112,7 @@ def _split_by_punctuation(chunks, puncs):


 def _add_punctuation(text):
-    """ Add punctuation at the end of each chunk.
+    """Add punctuation at the end of each chunk.

    Mimic2 expects some form of punctuation at the end of a sentence.
    """
@ -125,12 +124,12 @@ def _add_punctuation(text):


 def _sentence_chunker(text):
-    """ Split text into smaller chunks for TTS generation.
+    """Split text into smaller chunks for TTS generation.

    NOTE: The smaller chunks are needed due to current Mimic2 TTS limitations.
    This stage can be removed once Mimic2 can generate longer sentences.

-    Args:
+    Arguments:
        text (str): text to split
        chunk_size (int): size of each chunk
        split_by_punc (bool, optional): Defaults to True.
@ -168,7 +167,7 @@ def _sentence_chunker(text):


 class Mimic2(TTS):
-
+    """Interface to the Mimic2 TTS."""
    def __init__(self, lang, config):
        super(Mimic2, self).__init__(
            lang, config, Mimic2Validator(self)
@ -183,33 +182,10 @@ class Mimic2(TTS):
        self.url = config['url']
        self.session = FuturesSession()

-    def _save(self, data):
-        """ Save WAV files in tmp
-
-        Args:
-            data (byes): WAV data
-        """
-        with open(self.filename, 'wb') as f:
-            f.write(data)
-
-    def _play(self, req):
-        """ Play WAV file after saving to tmp
-
-        Args:
-            req (object): requests object
-        """
-        if req.status_code == 200:
-            self._save(req.content)
-            play_wav(self.filename).communicate()
-        else:
-            LOG.error(
-                '%s Http Error: %s for url: %s' %
-                (req.status_code, req.reason, req.url))
-
    def _requests(self, sentence):
-        """create asynchronous request list
+        """Create asynchronous request list

-        Args:
+        Arguments:
            chunks (list): list of text to synthesize

        Returns:
@ -220,9 +196,9 @@ class Mimic2(TTS):
        return self.session.get(req_route, timeout=5)

    def viseme(self, phonemes):
-        """ Maps phonemes to appropriate viseme encoding
+        """Maps phonemes to appropriate viseme encoding

-        Args:
+        Arguments:
            phonemes (list): list of tuples (phoneme, time_start)

        Returns:
@ -242,14 +218,14 @@ class Mimic2(TTS):
            visemes.append((vis, vis_dur))
        return visemes

-    def _prepocess_sentence(sentence):
-        """ Split sentence in chunks better suited for mimic2. """
+    def _preprocess_sentence(self, sentence):
+        """Split sentence in chunks better suited for mimic2. """
        return _sentence_chunker(sentence)

    def get_tts(self, sentence, wav_file):
-        """ Generate (remotely) and play mimic2 WAV audio
+        """Generate (remotely) and play mimic2 WAV audio

-        Args:
+        Arguments:
            sentence (str): Phrase to synthesize to audio with mimic2
            wav_file (str): Location to write audio output
        """
@ -267,12 +243,11 @@ class Mimic2(TTS):
        return (wav_file, vis)

    def save_phonemes(self, key, phonemes):
-        """
-            Cache phonemes
+        """Cache phonemes

-            Args:
-                key:        Hash key for the sentence
-                phonemes:   phoneme string to save
+        Arguments:
+            key:        Hash key for the sentence
+            phonemes:   phoneme string to save
        """
        cache_dir = get_cache_directory("tts/" + self.tts_name)
        pho_file = os.path.join(cache_dir, key + ".pho")
@ -283,11 +258,10 @@ class Mimic2(TTS):
            LOG.exception("Failed to write {} to cache".format(pho_file))

    def load_phonemes(self, key):
-        """
-            Load phonemes from cache file.
+        """Load phonemes from cache file.

-            Args:
-                Key:    Key identifying phoneme cache
+        Arguments:
+            Key:    Key identifying phoneme cache
        """
        pho_file = os.path.join(get_cache_directory("tts/" + self.tts_name),
                                key + ".pho")
--- a/mycroft/tts/mimic_tts.py
+++ b/mycroft/tts/mimic_tts.py
@ -16,7 +16,7 @@ import os
 import stat
 import subprocess
 from threading import Thread
-from time import time, sleep
+from time import sleep

 import os.path
 from os.path import exists, join, expanduser
@ -24,10 +24,11 @@ from os.path import exists, join, expanduser
 from mycroft import MYCROFT_ROOT_PATH
 from mycroft.api import DeviceApi
 from mycroft.configuration import Configuration
-from mycroft.tts import TTS, TTSValidator
 from mycroft.util.download import download
 from mycroft.util.log import LOG

+from .tts import TTS, TTSValidator
+
 config = Configuration.get().get("tts").get("mimic")
 data_dir = expanduser(Configuration.get()['data_dir'])

@ -44,14 +45,14 @@ SUBSCRIBER_VOICES = {'trinity': join(data_dir, 'voices/mimic_tn')}


 def download_subscriber_voices(selected_voice):
-    """
-        Function to download all premium voices, starting with
-        the currently selected if applicable
+    """Function to download all premium voices.
+
+    The function starts with the currently selected voice, if applicable.
    """

    def make_executable(dest):
-        """ Call back function to make the downloaded file executable. """
-        LOG.info('Make executable')
+        """Call back function to make the downloaded file executable."""
+        LOG.info('Make executable new voice binary executable')
        # make executable
        st = os.stat(dest)
        os.chmod(dest, st.st_mode | stat.S_IEXEC)
@ -59,7 +60,7 @@ def download_subscriber_voices(selected_voice):
    # First download the selected voice if needed
    voice_file = SUBSCRIBER_VOICES.get(selected_voice)
    if voice_file is not None and not exists(voice_file):
-        LOG.info('voice doesn\'t exist, downloading')
+        LOG.info('Voice doesn\'t exist, downloading')
        url = DeviceApi().get_subscriber_voice_url(selected_voice)
        # Check we got an url
        if url:
@ -88,6 +89,7 @@ def download_subscriber_voices(selected_voice):


 class Mimic(TTS):
+    """TTS interface for local mimic v1."""
    def __init__(self, lang, config):
        super(Mimic, self).__init__(
            lang, config, MimicValidator(self), 'wav',
@ -117,7 +119,7 @@ class Mimic(TTS):

    @property
    def args(self):
-        """ Build mimic arguments. """
+        """Build mimic arguments."""
        if (self.voice in SUBSCRIBER_VOICES and
                exists(SUBSCRIBER_VOICES[self.voice]) and self.is_subscriber):
            # Use subscriber voice
@ -140,14 +142,29 @@ class Mimic(TTS):
        return args

    def get_tts(self, sentence, wav_file):
-        #  Generate WAV and phonemes
+        """Generate WAV and phonemes.
+
+        Arguments:
+            sentence (str): sentence to generate audio for
+            wav_file (str): output file
+
+        Returns:
+            tuple ((str) file location, (str) generated phonemes)
+        """
        phonemes = subprocess.check_output(self.args + ['-o', wav_file,
                                                        '-t', sentence])
        return wav_file, phonemes.decode()

    def viseme(self, output):
+        """Convert phoneme string to visemes.
+
+        Arguments:
+            output (str): Phoneme output from mimic
+
+        Returns:
+            (list) list of tuples of viseme and duration
+        """
        visemes = []
-        start = time()
        pairs = str(output).split(" ")
        for pair in pairs:
            pho_dur = pair.split(":")  # phoneme:duration
--- a/mycroft/tts/remote_tts.py
+++ b/mycroft/tts/remote_tts.py
@ -16,7 +16,7 @@ import abc
 import re
 from requests_futures.sessions import FuturesSession

-from mycroft.tts import TTS
+from .tts import TTS
 from mycroft.util import remove_last_slash, play_wav
 from mycroft.util.log import LOG

--- a/mycroft/tts/responsive_voice_tts.py
+++ b/mycroft/tts/responsive_voice_tts.py
@ -13,7 +13,7 @@
 # limitations under the License.
 #
 import requests
-from mycroft.tts import TTS, TTSValidator
+from .tts import TTS, TTSValidator


 class ResponsiveVoice(TTS):
--- a/mycroft/tts/spdsay_tts.py
+++ b/mycroft/tts/spdsay_tts.py
@ -14,7 +14,7 @@
 #
 import subprocess

-from mycroft.tts import TTS, TTSValidator
+from .tts import TTS, TTSValidator


 class SpdSay(TTS):
--- a/mycroft/tts/tts.py
+++ b/mycroft/tts/tts.py
@ -0,0 +1,523 @@
+# Copyright 2017 Mycroft AI Inc.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+from copy import deepcopy
+import hashlib
+import os
+import random
+import re
+from abc import ABCMeta, abstractmethod
+from threading import Thread
+from time import time
+
+import os.path
+from os.path import dirname, exists, isdir, join
+
+import mycroft.util
+from mycroft.enclosure.api import EnclosureAPI
+from mycroft.configuration import Configuration
+from mycroft.messagebus.message import Message
+from mycroft.metrics import report_timing, Stopwatch
+from mycroft.util import (
+    play_wav, play_mp3, check_for_signal, create_signal, resolve_resource_file
+)
+from mycroft.util.log import LOG
+from queue import Queue, Empty
+
+
+# Private copy of the environment, used for the audio players when the
+# 'pulse_duck' option is set. It has to be a plain dict: deepcopy(os.environ)
+# yields another os._Environ and item assignment on that calls putenv(),
+# leaking PULSE_PROP into this process and every child it spawns.
+_TTS_ENV = dict(os.environ)
+_TTS_ENV['PULSE_PROP'] = 'media.role=phone'
+
+
+def send_playback_metric(stopwatch, ident):
+    """Report the 'speech_playback' timing from a daemon thread.
+
+    Arguments:
+        stopwatch: timing object handed on to report_timing
+        ident: identifier handed on to report_timing
+    """
+    reporter = Thread(target=report_timing,
+                      args=(ident, 'speech_playback', stopwatch),
+                      daemon=True)
+    reporter.start()
+
+
+class PlaybackThread(Thread):
+    """Thread class for playing back tts audio and sending
+    viseme data to enclosure.
+
+    Arguments:
+        queue (Queue): queue of (snd_type, data, visemes, ident, listen)
+                       tuples to play back
+    """
+
+    def __init__(self, queue):
+        super(PlaybackThread, self).__init__()
+        self.queue = queue
+        self._terminated = False
+        self._processing_queue = False
+        self.enclosure = None
+        self.tts = None  # set by init()
+        self.p = None  # handle of the most recently started player
+        # Check if the tts shall have a ducking role set
+        if Configuration.get().get('tts', {}).get('pulse_duck'):
+            self.pulse_env = _TTS_ENV
+        else:
+            self.pulse_env = None
+
+    def init(self, tts):
+        """Attach the object notified about start and end of playback.
+
+        Arguments:
+            tts: object providing begin_audio() and end_audio(listen)
+        """
+        self.tts = tts
+
+    def clear_queue(self):
+        """Remove all pending playbacks and stop the current one."""
+        # Drain without blocking: pairing empty() with a blocking get() can
+        # hang forever if run() takes the last item between the two calls.
+        while True:
+            try:
+                self.queue.get_nowait()
+            except Empty:
+                break
+        if self.p is not None:
+            try:
+                self.p.terminate()
+            except Exception:
+                # Best effort only, the player may already be gone.
+                LOG.debug('Could not terminate the playback process')
+
+    def run(self):
+        """Thread main loop. Get audio and extra data from queue and play.
+
+        The queue messages is a tuple containing
+        snd_type: 'mp3' or 'wav' telling the loop what format the data is in
+        data: path to temporary audio data
+        visemes: list of visemes to display while playing
+        ident: identifier sent along with the playback metric
+        listen: if listening should be triggered at the end of the sentence.
+
+        Playback of audio is started and the visemes are sent over the bus
+        the loop then wait for the playback process to finish before starting
+        checking the next position in queue.
+
+        If the queue is empty the tts.end_audio() is called possibly triggering
+        listening.
+        """
+        while not self._terminated:
+            # Reset every round so the error handler below never acts on
+            # the flag belonging to a previously played item.
+            listen = False
+            try:
+                (snd_type, data,
+                 visemes, ident, listen) = self.queue.get(timeout=2)
+                self.blink(0.5)
+                if not self._processing_queue:
+                    self._processing_queue = True
+                    self.tts.begin_audio()
+
+                stopwatch = Stopwatch()
+                with stopwatch:
+                    if snd_type == 'wav':
+                        self.p = play_wav(data, environment=self.pulse_env)
+                    elif snd_type == 'mp3':
+                        self.p = play_mp3(data, environment=self.pulse_env)
+                    else:
+                        # Otherwise the loop would wait on the process left
+                        # over from the previous item.
+                        raise ValueError('Unsupported audio type: '
+                                         '{}'.format(snd_type))
+                    if visemes:
+                        self.show_visemes(visemes)
+                    self.p.communicate()
+                    self.p.wait()
+                send_playback_metric(stopwatch, ident)
+
+                if self.queue.empty():
+                    self.tts.end_audio(listen)
+                    self._processing_queue = False
+                self.blink(0.2)
+            except Empty:
+                pass
+            except Exception as e:
+                LOG.exception(e)
+                if self._processing_queue:
+                    self.tts.end_audio(listen)
+                    self._processing_queue = False
+
+    def show_visemes(self, pairs):
+        """Send viseme data to enclosure, does nothing without enclosure.
+
+        Arguments:
+            pairs(list): Viseme and timing pairs
+        """
+        if self.enclosure:
+            self.enclosure.mouth_viseme(time(), pairs)
+
+    def clear(self):
+        """Clear all pending actions for the TTS playback thread."""
+        self.clear_queue()
+
+    def blink(self, rate=1.0):
+        """Blink mycroft's eyes
+
+        Arguments:
+            rate (float): probability (0.0 - 1.0) that the blink is sent
+        """
+        if self.enclosure and random.random() < rate:
+            self.enclosure.eyes_blink("b")
+
+    def stop(self):
+        """Stop thread"""
+        self._terminated = True
+        self.clear_queue()
+
+
+class TTS(metaclass=ABCMeta):
+    """TTS abstract class to be implemented by all TTS engines.
+
+    It aggregates the minimum required parameters and exposes
+    ``execute(sentence)`` and ``validate_ssml(sentence)`` functions.
+
+    Arguments:
+        lang (str): language code, 'en-us' is used if empty
+        config (dict): Configuration for this specific tts engine
+        validator (TTSValidator): Used to verify proper installation
+        audio_ext (str): extension of the generated audio files, also
+                         queued as sound type for the playback thread
+        phonetic_spelling (bool): Whether to spell certain words phonetically
+        ssml_tags (list): Supported ssml properties. Ex. ['speak', 'prosody']
+    """
+    def __init__(self, lang, config, validator, audio_ext='wav',
+                 phonetic_spelling=True, ssml_tags=None):
+        super(TTS, self).__init__()
+        self.bus = None  # initialized in "init" step
+        self.lang = lang or 'en-us'
+        self.config = config
+        self.validator = validator
+        self.phonetic_spelling = phonetic_spelling
+        self.audio_ext = audio_ext
+        self.ssml_tags = ssml_tags or []
+
+        self.voice = config.get("voice")
+        self.filename = '/tmp/tts.wav'
+        self.enclosure = None
+        random.seed()
+        self.queue = Queue()
+        self.playback = PlaybackThread(self.queue)
+        self.playback.start()
+        self.clear_cache()
+        self.spellings = self.load_spellings()
+        self.tts_name = type(self).__name__
+
+    def load_spellings(self):
+        """Load phonetic spellings of words as dictionary.
+
+        Returns:
+            dict: word -> phonetic spelling, empty if no resource file is
+                  found or if it contains a line without ':'
+        """
+        path = join('text', self.lang, 'phonetic_spellings.txt')
+        spellings_file = resolve_resource_file(path)
+        if not spellings_file:
+            return {}
+        try:
+            with open(spellings_file) as f:
+                lines = filter(bool, f.read().split('\n'))
+            # Split on the first ':' only, a spelling may contain colons
+            lines = [i.split(':', 1) for i in lines]
+            return {key.strip(): value.strip() for key, value in lines}
+        except ValueError:
+            LOG.exception('Failed to load phonetic spellings.')
+            return {}
+
+    def begin_audio(self):
+        """Helper function for child classes to call in execute()"""
+        # Create signals informing start of speech
+        self.bus.emit(Message("recognizer_loop:audio_output_start"))
+
+    def end_audio(self, listen=False):
+        """Helper function for child classes to call in execute().
+
+        Sends the recognizer_loop:audio_output_end message (indicating
+        that speaking is done for the moment) as well as trigger listening
+        if it has been requested. It also checks if cache directory needs
+        cleaning to free up disk space.
+
+        Arguments:
+            listen (bool): indication if listening trigger should be sent.
+        """
+
+        self.bus.emit(Message("recognizer_loop:audio_output_end"))
+        if listen:
+            self.bus.emit(Message('mycroft.mic.listen'))
+        # Clean the cache as needed
+        cache_dir = mycroft.util.get_cache_directory("tts/" + self.tts_name)
+        mycroft.util.curate_cache(cache_dir, min_free_percent=100)
+
+        # This check will clear the "signal"
+        check_for_signal("isSpeaking")
+
+    def init(self, bus):
+        """Performs initial setup of TTS object.
+
+        Arguments:
+            bus:    Mycroft messagebus connection
+        """
+        self.bus = bus
+        self.playback.init(self)
+        self.enclosure = EnclosureAPI(self.bus)
+        self.playback.enclosure = self.enclosure
+
+    def get_tts(self, sentence, wav_file):
+        """Abstract method that a tts implementation needs to implement.
+
+        Should get data from tts.
+
+        Arguments:
+            sentence(str): Sentence to synthesize
+            wav_file(str): output file
+
+        Returns:
+            tuple: (wav_file, phoneme)
+        """
+        pass
+
+    def modify_tag(self, tag):
+        """Override to modify each supported ssml tag"""
+        return tag
+
+    @staticmethod
+    def remove_ssml(text):
+        """Strip everything looking like a tag and collapse double spaces.
+
+        Arguments:
+            text (str): sentence possibly containing ssml tags
+
+        Returns:
+            str: sentence without tags
+        """
+        return re.sub('<[^>]*>', '', text).replace('  ', ' ')
+
+    def validate_ssml(self, utterance):
+        """Check if engine supports ssml, if not remove all tags.
+
+        Remove unsupported / invalid tags
+
+        Arguments:
+            utterance(str): Sentence to validate
+
+        Returns:
+            validated_sentence (str)
+        """
+        # if ssml is not supported by TTS engine remove all tags
+        if not self.ssml_tags:
+            return self.remove_ssml(utterance)
+
+        # find ssml tags in string
+        tags = re.findall('<[^>]*>', utterance)
+
+        for tag in tags:
+            if any(supported in tag for supported in self.ssml_tags):
+                utterance = utterance.replace(tag, self.modify_tag(tag))
+            else:
+                # remove unsupported tag
+                utterance = utterance.replace(tag, "")
+
+        # return text with supported ssml tags only
+        return utterance.replace("  ", " ")
+
+    def _preprocess_sentence(self, sentence):
+        """Default preprocessing is no preprocessing.
+
+        This method can be overridden to create chunks suitable to the
+        TTS engine in question.
+
+        Arguments:
+            sentence (str): sentence to preprocess
+
+        Returns:
+            list: list of sentence parts
+        """
+        return [sentence]
+
+    def _apply_phonetic_spellings(self, sentence):
+        """Replace the words that have a phonetic spelling.
+
+        Only whole words are replaced and every word is looked up once,
+        so a replacement is never applied inside another word or inside
+        the text of an earlier replacement.
+
+        Arguments:
+            sentence (str): sentence to process
+
+        Returns:
+            str: sentence with the known words replaced
+        """
+        def spell(match):
+            word = match.group(0)
+            return self.spellings.get(word.lower(), word)
+
+        return re.sub(r"[\w']+", spell, sentence)
+
+    def execute(self, sentence, ident=None, listen=False):
+        """Convert sentence to speech, preprocessing out unsupported ssml
+
+            The method caches results if possible using the hash of the
+            sentence.
+
+            Arguments:
+                sentence:   Sentence to be spoken
+                ident:      Id reference to current interaction
+                listen:     True if listen should be triggered at the end
+                            of the utterance.
+        """
+        sentence = self.validate_ssml(sentence)
+
+        create_signal("isSpeaking")
+        if self.phonetic_spelling:
+            sentence = self._apply_phonetic_spellings(sentence)
+
+        chunks = self._preprocess_sentence(sentence)
+        last = len(chunks) - 1
+
+        for index, chunk in enumerate(chunks):
+            # Apply the listen flag to the last chunk, set the rest to False
+            listen_after = listen if index == last else False
+            key = str(hashlib.md5(
+                chunk.encode('utf-8', 'ignore')).hexdigest())
+            wav_file = os.path.join(
+                mycroft.util.get_cache_directory("tts/" + self.tts_name),
+                key + '.' + self.audio_ext)
+
+            if os.path.exists(wav_file):
+                LOG.debug("TTS cache hit")
+                phonemes = self.load_phonemes(key)
+            else:
+                wav_file, phonemes = self.get_tts(chunk, wav_file)
+                if phonemes:
+                    self.save_phonemes(key, phonemes)
+
+            vis = self.viseme(phonemes) if phonemes else None
+            self.queue.put((self.audio_ext, wav_file, vis, ident,
+                            listen_after))
+
+    def viseme(self, phonemes):
+        """Create visemes from phonemes. Needs to be implemented for all
+            tts backends.
+
+            Arguments:
+                phonemes(str): String with phoneme data
+        """
+        return None
+
+    def clear_cache(self):
+        """Remove all cached files.
+
+        Files directly in the tts cache directory and files one level
+        down are removed, the directories themselves are kept.
+        """
+        cache_root = mycroft.util.get_cache_directory('tts')
+        if not os.path.exists(cache_root):
+            return
+        for d in os.listdir(cache_root):
+            dir_path = os.path.join(cache_root, d)
+            if os.path.isdir(dir_path):
+                for f in os.listdir(dir_path):
+                    file_path = os.path.join(dir_path, f)
+                    if os.path.isfile(file_path):
+                        os.unlink(file_path)
+            # If no sub-folders are present, check if it is a file & clear it
+            elif os.path.isfile(dir_path):
+                os.unlink(dir_path)
+
+    def save_phonemes(self, key, phonemes):
+        """Cache phonemes, a failure to write is logged and ignored.
+
+        Arguments:
+            key:        Hash key for the sentence
+            phonemes:   phoneme string to save
+        """
+        cache_dir = mycroft.util.get_cache_directory("tts/" + self.tts_name)
+        pho_file = os.path.join(cache_dir, key + ".pho")
+        try:
+            with open(pho_file, "w") as cachefile:
+                cachefile.write(phonemes)
+        except Exception:
+            LOG.exception("Failed to write {} to cache".format(pho_file))
+
+    def load_phonemes(self, key):
+        """Load phonemes from cache file.
+
+        Arguments:
+            Key:    Key identifying phoneme cache
+
+        Returns:
+            str: the stripped file content or None if the file is missing
+                 or can't be read
+        """
+        pho_file = os.path.join(
+            mycroft.util.get_cache_directory("tts/" + self.tts_name),
+            key + ".pho")
+        if os.path.exists(pho_file):
+            try:
+                with open(pho_file, "r") as cachefile:
+                    phonemes = cachefile.read().strip()
+                return phonemes
+            except Exception:
+                LOG.debug("Failed to read .PHO from cache")
+        return None
+
+    def __del__(self):
+        # __init__ may have failed before the playback thread was created
+        playback = getattr(self, 'playback', None)
+        if playback is not None:
+            playback.stop()
+            playback.join()
+
+
+class TTSValidator(metaclass=ABCMeta):
+    """Base class for the validators of the TTS engines.
+
+    ``validate()`` runs the individual checks in a fixed order, the engine
+    specific checks have to be provided by the subclass.
+
+    Arguments:
+        tts: the tts instance to validate
+    """
+    def __init__(self, tts):
+        self.tts = tts
+
+    def validate(self):
+        """Run all checks in order, the first failing check raises."""
+        checks = (self.validate_dependencies,
+                  self.validate_instance,
+                  self.validate_filename,
+                  self.validate_lang,
+                  self.validate_connection)
+        for check in checks:
+            check()
+
+    def validate_dependencies(self):
+        """Hook for checking dependencies, accepts everything by default."""
+        pass
+
+    def validate_instance(self):
+        """Raise AttributeError if tts isn't of the class get_tts_class()."""
+        expected = self.get_tts_class()
+        if isinstance(self.tts, expected):
+            return
+        raise AttributeError('tts must be instance of ' + expected.__name__)
+
+    def validate_filename(self):
+        """Raise AttributeError unless tts.filename is a .wav file located
+        in an existing directory.
+        """
+        path = self.tts.filename
+        if not path or not path.endswith('.wav'):
+            raise AttributeError('file: %s must be in .wav format!' % path)
+
+        folder = dirname(path)
+        if not exists(folder) or not isdir(folder):
+            raise AttributeError('filename: %s is not valid!' % path)
+
+    @abstractmethod
+    def validate_lang(self):
+        """Engine specific check of the language."""
+        pass
+
+    @abstractmethod
+    def validate_connection(self):
+        """Engine specific check of the connection."""
+        pass
+
+    @abstractmethod
+    def get_tts_class(self):
+        """Return the class the validated tts has to be an instance of."""
+        pass
+
+
+class TTSFactory:
+    """Factory creating the TTS engine selected in the configuration."""
+    from mycroft.tts.espeak_tts import ESpeak
+    from mycroft.tts.fa_tts import FATTS
+    from mycroft.tts.google_tts import GoogleTTS
+    from mycroft.tts.mary_tts import MaryTTS
+    from mycroft.tts.mimic_tts import Mimic
+    from mycroft.tts.spdsay_tts import SpdSay
+    from mycroft.tts.bing_tts import BingTTS
+    from mycroft.tts.ibm_tts import WatsonTTS
+    from mycroft.tts.responsive_voice_tts import ResponsiveVoice
+    from mycroft.tts.mimic2_tts import Mimic2
+    from mycroft.tts.yandex_tts import YandexTTS
+
+    # Maps the module name used in the configuration to the engine class
+    CLASSES = {
+        "mimic": Mimic,
+        "mimic2": Mimic2,
+        "google": GoogleTTS,
+        "marytts": MaryTTS,
+        "fatts": FATTS,
+        "espeak": ESpeak,
+        "spdsay": SpdSay,
+        "watson": WatsonTTS,
+        "bing": BingTTS,
+        "responsive_voice": ResponsiveVoice,
+        "yandex": YandexTTS
+    }
+
+    @staticmethod
+    def create():
+        """Factory method to create a TTS engine based on configuration.
+
+        The configuration file ``mycroft.conf`` contains a ``tts`` section with
+        the name of a TTS module to be read by this method.
+
+        "tts": {
+            "module": <engine_name>
+        }
+
+        If the selected engine can't be created or validated, mimic is
+        created from its own configuration section instead.
+
+        Returns:
+            the created and validated TTS instance
+
+        Raises:
+            Exception: whatever creation or validation of mimic raised
+        """
+        config = Configuration.get()
+        lang = config.get("lang", "en-us")
+        all_tts_config = config.get('tts', {})
+        tts_module = all_tts_config.get('module', 'mimic')
+        tts_config = all_tts_config.get(tts_module, {})
+        tts_lang = tts_config.get('lang', lang)
+        try:
+            clazz = TTSFactory.CLASSES.get(tts_module)
+            if clazz is None:
+                # Gives a readable log entry instead of "NoneType is not
+                # callable", the fallback below still applies.
+                raise ValueError(
+                    'Unknown TTS module: {}'.format(tts_module))
+            tts = clazz(tts_lang, tts_config)
+            tts.validator.validate()
+        except Exception:
+            # Fallback to mimic if an error occurs while loading.
+            if tts_module != 'mimic':
+                LOG.exception('The selected TTS backend couldn\'t be loaded. '
+                              'Falling back to Mimic')
+                # Mimic gets its own settings, not those of the engine
+                # that just failed to load.
+                mimic_config = all_tts_config.get('mimic', {})
+                clazz = TTSFactory.CLASSES.get('mimic')
+                tts = clazz(mimic_config.get('lang', lang), mimic_config)
+                tts.validator.validate()
+            else:
+                LOG.exception('The TTS could not be loaded.')
+                raise
+
+        return tts
--- a/mycroft/tts/yandex_tts.py
+++ b/mycroft/tts/yandex_tts.py
@ -13,7 +13,7 @@
 # limitations under the License.
 #

-from mycroft.tts import TTS, TTSValidator
+from .tts import TTS, TTSValidator
 from mycroft.configuration import Configuration

 import requests
--- a/test/unittests/tts/test_google_tts.py
+++ b/test/unittests/tts/test_google_tts.py
@ -0,0 +1,26 @@
+import unittest
+from unittest import mock
+
+from mycroft.tts.google_tts import GoogleTTS, GoogleTTSValidator
+
+
+@mock.patch('mycroft.tts.google_tts.gTTS')
+@mock.patch('mycroft.tts.tts.PlaybackThread')
+class TestGoogleTTS(unittest.TestCase):
+    """Tests of the Google backend with gTTS and the playback mocked."""
+    def test_get_tts(self, _, gtts_mock):
+        """get_tts() creates a gTTS object and saves it to the file."""
+        synthesized = mock.Mock()
+        gtts_mock.return_value = synthesized
+        phrase = 'help me Obi-Wan Kenobi, you are my only hope'
+
+        engine = GoogleTTS('en-US', {})
+        # The unpacking verifies that a pair is returned
+        mp3_file, vis = engine.get_tts(phrase, 'output.mp3')
+
+        gtts_mock.assert_called_with(text=phrase, lang='en-US')
+        synthesized.save.assert_called_with('output.mp3')
+
+    def test_validator(self, _, gtts_mock):
+        """The connection check passes with a working gTTS only."""
+        checker = GoogleTTSValidator(GoogleTTS('en-US', {}))
+        checker.validate_connection()
+
+        def failing_gtts(**kwargs):
+            raise Exception
+
+        with self.assertRaises(Exception):
+            gtts_mock.side_effect = failing_gtts
+            checker.validate_connection()
--- a/test/unittests/tts/test_mimic2_tts.py
+++ b/test/unittests/tts/test_mimic2_tts.py
@ -0,0 +1,83 @@
+import unittest
+from unittest import mock
+
+from mycroft.tts.mimic2_tts import Mimic2
+
+
+@mock.patch('mycroft.tts.mimic2_tts.FuturesSession')
+@mock.patch('mycroft.tts.tts.PlaybackThread')
+class TestMimic2(unittest.TestCase):
+    """Tests of the Mimic2 backend with session and playback mocked."""
+    def test_get_tts(self, _, mock_session):
+        """get_tts() shall fetch the audio through the session."""
+        mock_session_instance = mock.Mock(name='SessionMock')
+        mock_session.return_value = mock_session_instance
+
+        get_mock = mock.Mock(name='getMock')
+        mock_session_instance.get.return_value = get_mock
+
+        result_mock = mock.Mock(name='resultMock')
+        get_mock.result.return_value = result_mock
+        result_mock.json.return_value = {'audio_base64': '', 'visimes': ''}
+        m2 = Mimic2('en-US', {'url': 'https://just.testing.nu'})
+
+        with mock.patch('mycroft.tts.mimic2_tts.open') as mock_open:
+            wav_file, vis = m2.get_tts("Hello old friend", 'test.wav')
+        self.assertTrue(mock_session_instance.get.called)
+
+    def test_visemes(self, _, __):
+        """Phonemes shall be translated to viseme codes, keeping timing."""
+        m2 = Mimic2('en-US', {'url': 'https://just.testing.nu'})
+        phonemes = [('pau', 0.137), ('hh', 0.236), ('ax', 0.286), ('l', 0.387),
+                    ('ow', 0.542), ('f', 0.642), ('r', 0.728), ('eh', 0.807),
+                    ('n', 0.899), ('d', 1.033), ('pau', 1.187)]
+        vis = m2.viseme(phonemes)
+        self.assertEqual(vis, [('4', 0.137), ('0', 0.236), ('0', 0.286),
+                               ('3', 0.387), ('2', 0.542), ('5', 0.642),
+                               ('2', 0.728), ('0', 0.807), ('3', 0.899),
+                               ('3', 1.033), ('4', 1.187)])
+
+    def test_preprocess(self, _, __):
+        """Test mimic2 specific preprocessing.
+
+        The Mimic-2 backend has some specifics regarding how the sentence
+        must look to render correctly.
+        """
+        m2 = Mimic2('en-US', {'url': 'https://just.testing.nu'})
+        # Test that a short sentence gets a '.' at the end.
+        self.assertEqual(m2._preprocess_sentence('Hello old friend'),
+                         ['Hello old friend.'])
+        # Ensure that a very long sentence gets separated into chunks.
+        self.assertEqual(m2._preprocess_sentence('Harris said he felt such '
+                                                 'extraordinary fits of '
+                                                 'giddiness come over him at '
+                                                 'times, that he hardly knew '
+                                                 'what he was doing; and then '
+                                                 'George said that he had '
+                                                 'fits of giddiness too, and '
+                                                 'hardly knew what he was '
+                                                 'doing.'),
+                         ['Harris said he felt such extraordinary fits of '
+                          'giddiness come over him at times, that he hardly '
+                          'knew what he was doing.',
+                          'and then George said that he had fits of giddiness '
+                          'too, and hardly knew what he was doing.'])
+
+    @mock.patch('mycroft.tts.mimic2_tts.open')
+    def test_phoneme_cache(self, mock_open, _, __):
+        """Phonemes shall be written to and read back from the cache."""
+        m2 = Mimic2('en-US', {'url': 'https://just.testing.nu'})
+        phonemes = [['pau', 0.137], ['hh', 0.236], ['ax', 0.286], ['l', 0.387],
+                    ['ow', 0.542], ['f', 0.642], ['r', 0.728], ['eh', 0.807],
+                    ['n', 0.899], ['d', 1.033], ['pau', 1.187]]
+
+        mock_context = mock.Mock(name='context')
+        mock_file = mock.MagicMock(name='file')
+        mock_open.return_value = mock_file
+        mock_file.__enter__.return_value = mock_context
+        m2.save_phonemes('abc', phonemes)
+        # "called_with" is no Mock API, as attribute it is always truthy;
+        # check that something really was written to the file.
+        self.assertTrue(mock_context.write.called)
+        with mock.patch('mycroft.tts.mimic2_tts.json.load') as mock_load:
+            # Without an existing cache file nothing is loaded
+            read_phonemes = m2.load_phonemes('abc')
+            self.assertEqual(read_phonemes, None)
+            mock_load.reset_mock()
+            with mock.patch('mycroft.tts.mimic2_tts.os.path.exists') as _:
+                mock_load.return_value = phonemes
+                read_phonemes = m2.load_phonemes('abc')
+                self.assertEqual(read_phonemes, phonemes)
--- a/test/unittests/tts/test_mimic_tts.py
+++ b/test/unittests/tts/test_mimic_tts.py
@ -0,0 +1,91 @@
+import stat
+
+import unittest
+from unittest import mock
+
+from mycroft.tts.mimic_tts import (Mimic, download_subscriber_voices, BIN,
+                                   SUBSCRIBER_VOICES)
+
+
+# Device API stand-in for a device without subscription
+device_instance_mock = mock.Mock(name='device_api_instance')
+device_instance_mock.is_subscriber = False
+
+# Device API stand-in for a subscriber. The attribute has to be spelled
+# "is_subscriber" as above, any other name leaves an auto-created (and
+# therefore always truthy) Mock in its place.
+subscribed_device = mock.Mock(name='subscriber_device')
+subscribed_device.is_subscriber = True
+subscribed_device.get_subscriber_voice_url.return_value = 'https://trinity'
+
+
+@mock.patch('mycroft.tts.mimic_tts.DeviceApi')
+@mock.patch('mycroft.tts.tts.PlaybackThread')
+class TestMimic(unittest.TestCase):
+    """Tests of the Mimic backend with device api and playback mocked."""
+    @mock.patch('mycroft.tts.mimic_tts.subprocess')
+    def test_get_tts(self, mock_subprocess, _, mock_device_api):
+        """get_tts() shall run mimic and return the decoded output."""
+        mock_device_api.return_value = device_instance_mock
+        m = Mimic('en-US', {})
+        wav, phonemes = m.get_tts('hello', 'abc.wav')
+        # Checked first and through return_value: calling check_output()
+        # here would replace the call that is under test.
+        mock_subprocess.check_output.assert_called_with(
+            m.args + ['-o', 'abc.wav', '-t', 'hello'])
+        expected = mock_subprocess.check_output.return_value.decode
+        self.assertEqual(phonemes, expected.return_value)
+
+    def test_viseme(self, _, mock_device_api):
+        """Phoneme:duration pairs shall be translated to visemes."""
+        mock_device_api.return_value = device_instance_mock
+        m = Mimic('en-US', {})
+        viseme_string = ('pau:0.206 m:0.287 ah:0.401 ch:0.513 dh:0.578 '
+                         'iy:0.699 s:0.835 ey:1.013 m:1.118 w:1.213 ey:1.345 '
+                         'dh:1.415 ae:1.491 t:1.539 b:1.616 r:1.671 ih:1.744 '
+                         'k:1.819 s:1.923 d:1.978 ow:2.118 n:2.206 t:2.301 '
+                         'pau:2.408')
+
+        vis = m.viseme(viseme_string)
+        self.assertEqual(vis,
+                         [('4', 0.206), ('4', 0.287), ('0', 0.401),
+                          ('3', 0.513), ('3', 0.578), ('0', 0.699),
+                          ('3', 0.835), ('0', 1.013), ('4', 1.118),
+                          ('2', 1.213), ('0', 1.345), ('3', 1.415),
+                          ('0', 1.491), ('3', 1.539), ('4', 1.616),
+                          ('2', 1.671), ('0', 1.744), ('3', 1.819),
+                          ('3', 1.923), ('3', 1.978), ('2', 2.118),
+                          ('3', 2.206), ('3', 2.301), ('4', 2.408)])
+
+    @mock.patch('mycroft.tts.mimic_tts.Thread')
+    def test_subscriber(self, mock_thread, _, mock_device_api):
+        """A subscriber shall get the voice downloaded and used."""
+        mock_device_api.return_value = subscribed_device
+
+        m = Mimic('en-US', {'voice': 'trinity'})
+        mock_thread.assert_called_with(target=download_subscriber_voices,
+                                       args=['trinity'])
+        self.assertTrue(m.is_subscriber)
+        self.assertEqual(m.args, [BIN, '-voice', 'ap', '-psdur', '-ssml'])
+        with mock.patch('mycroft.tts.mimic_tts.exists') as mock_exists:
+            mock_exists.return_value = True
+            self.assertEqual(m.args, [SUBSCRIBER_VOICES['trinity'], '-voice',
+                                      'trinity', '-psdur', '-ssml'])
+
+    @mock.patch('mycroft.tts.mimic_tts.sleep')
+    @mock.patch('mycroft.tts.mimic_tts.download')
+    def test_download(self, mock_download, mock_sleep, _, mock_device_api):
+        """The voice shall be downloaded and made executable."""
+        mock_device_api.return_value = subscribed_device
+        dl = mock.Mock()
+        dl.done = False
+
+        def sleep_sideeffect(_):
+            """After one sleep call the download should be considered done."""
+            nonlocal dl
+            dl.done = True
+
+        mock_sleep.side_effect = sleep_sideeffect
+        mock_download.return_value = dl
+
+        download_subscriber_voices('trinity')
+        self.assertEqual(mock_download.call_args[0][:2],
+                         ('https://trinity', '/opt/mycroft/voices/mimic_tn'))
+        make_executable = mock_download.call_args[0][2]
+
+        # Check that the executable flag is set on the downloaded file
+        with mock.patch('mycroft.tts.mimic_tts.os.chmod') as mock_chmod:
+            with mock.patch('mycroft.tts.mimic_tts.os.stat') as mock_stat:
+                st_mock = mock.Mock()
+                mock_stat.return_value = st_mock
+                st_mock.st_mode = 0
+                make_executable('/test')
+                mock_chmod.assert_called_with('/test', stat.S_IEXEC)
--- a/test/unittests/tts/test_tts.py
+++ b/test/unittests/tts/test_tts.py
@ -1,26 +1,125 @@
+from queue import Queue
+import time
+
 import unittest
+from unittest import mock

 import mycroft.tts

+mock_phoneme = mock.Mock(name='phoneme')
+mock_audio = mock.Mock(name='audio')
+mock_viseme = mock.Mock(name='viseme')

+
+class MockTTS(mycroft.tts.TTS):
+    """TTS with the synthesis and the viseme generation mocked."""
+    def __init__(self, lang, config, validator, audio_ext='wav',
+                 phonetic_spelling=True, ssml_tags=None):
+        # Forward all options, otherwise phonetic_spelling and ssml_tags
+        # are silently replaced by the defaults of the base class.
+        super().__init__(lang, config, validator, audio_ext,
+                         phonetic_spelling, ssml_tags)
+        self.get_tts = mock.Mock()
+        self.get_tts.return_value = (mock_audio, mock_phoneme)
+        self.viseme = mock.Mock()
+        self.viseme.return_value = mock_viseme
+
+
+class MockTTSValidator(mycroft.tts.TTSValidator):
+    """Validator accepting everything."""
+    def validate(self):
+        pass
+
+    def validate_lang(self):
+        pass
+
+    def validate_connection(self):
+        pass
+
+    def get_tts_class(self):
+        # The class of the validated tts, not the unittest.TestCase
+        # named TestTTS further down in this module.
+        return MockTTS
+
+
+class TestPlaybackThread(unittest.TestCase):
+    """Tests of the playback thread running against a real queue."""
+    def test_lifecycle(self):
+        """The thread shall be possible to start and stop."""
+        playback = mycroft.tts.PlaybackThread(Queue())
+        playback.init(mock.Mock())
+        playback.start()
+        playback.stop()
+        playback.join()
+
+    @mock.patch('mycroft.tts.tts.time')
+    @mock.patch('mycroft.tts.tts.play_wav')
+    @mock.patch('mycroft.tts.tts.play_mp3')
+    def test_process_queue(self, mock_play_mp3, mock_play_wav, mock_time):
+        """Queued items shall be played with the matching player."""
+        queue = Queue()
+        playback = mycroft.tts.PlaybackThread(queue)
+        mock_tts = mock.Mock()
+        playback.init(mock_tts)
+        playback.enclosure = mock.Mock()
+        playback.start()
+        try:
+            # Test wav data
+            wav_mock = mock.Mock(name='wav_data')
+            queue.put(('wav', wav_mock, None, 0, False))
+            time.sleep(0.2)
+            # "called_with()" only creates a child mock and checks nothing
+            mock_tts.begin_audio.assert_called_with()
+            mock_play_wav.assert_called_with(wav_mock, environment=None)
+            mock_tts.end_audio.assert_called_with(False)
+
+            # Test mp3 data and trigger listening True
+            mp3_mock = mock.Mock(name='mp3_data')
+            queue.put(('mp3', mp3_mock, None, 0, True))
+            time.sleep(0.2)
+            mock_play_mp3.assert_called_with(mp3_mock, environment=None)
+            mock_tts.end_audio.assert_called_with(True)
+            self.assertFalse(playback.enclosure.get.called)
+
+            # Test sending visemes
+            mock_time.return_value = 1234
+            visemes = mock.Mock(name='visemes')
+            queue.put(('mp3', mp3_mock, visemes, 0, True))
+            time.sleep(0.2)
+            playback.enclosure.mouth_viseme.assert_called_with(1234, visemes)
+
+        finally:
+            # Terminate the thread
+            playback.stop()
+            playback.join()
+
+
+@mock.patch('mycroft.tts.tts.PlaybackThread')
 class TestTTS(unittest.TestCase):
-    def test_ssml_support(self):
-        class TestTTS(mycroft.tts.TTS):
-            def execute(self, sentence, ident=None):
-                pass
+    def test_execute(self, mock_playback_thread):
+        tts = MockTTS("en-US", {}, MockTTSValidator(None))
+        bus_mock = mock.Mock()
+        tts.init(bus_mock)
+        self.assertTrue(tts.bus is bus_mock)

-        class TestTTSValidator(mycroft.tts.TTSValidator):
-            def validate(self):
-                pass
+        tts.queue = mock.Mock()
+        with mock.patch('mycroft.tts.tts.open') as mock_open:
+            tts.execute('Oh no, not again', 42)
+        self.assertTrue(tts.get_tts.called)
+        tts.queue.put.assert_called_with(('wav', mock_audio, mock_viseme,
+                                         42, False))

-            def validate_lang(self):
-                pass
+    @mock.patch('mycroft.tts.tts.open')
+    def test_phoneme_cache(self, mock_open, _):
+        tts = MockTTS("en-US", {}, MockTTSValidator(None))
+        mock_context = mock.Mock(name='context')
+        mock_file = mock.MagicMock(name='file')
+        mock_open.return_value = mock_file
+        mock_file.__enter__.return_value = mock_context

-            def validate_connection(self):
-                pass
+        phonemes = mock.Mock()
+        # Test save phonemes
+        tts.save_phonemes('abc', phonemes)
+        mock_context.write.assert_called_with(phonemes)

-            def get_tts_class(self):
-                return TestTTS
+        # Test load phonemes
+        mock_context.read.return_value = 'phonemes '
+        read_phonemes = tts.load_phonemes('abc')
+        self.assertEqual(read_phonemes, None)
+        with mock.patch('mycroft.tts.tts.os.path.exists') as _:
+            read_phonemes = tts.load_phonemes('abc')
+            self.assertEqual(read_phonemes, 'phonemes')  # assert stripped
+
+    def test_ssml_support(self, _):

        sentence = "<speak>Prosody can be used to change the way words " \
                   "sound. The following words are " \
@ -46,7 +145,7 @@ class TestTTS(unittest.TestCase):
                            "</foo_invalid end=whatever>"
        sentence_extra_ssml = "<whispered>whisper tts<\\whispered>"

-        tts = TestTTS("en-US", {}, TestTTSValidator(None))
+        tts = MockTTS("en-US", {}, MockTTSValidator(None))

        # test valid ssml
        tts.ssml_tags = ['speak', 'prosody']
@ -75,3 +174,43 @@ class TestTTS(unittest.TestCase):

        self.assertEqual(mycroft.tts.TTS.remove_ssml(sentence),
                         sentence_no_ssml)
+
+
+class TestTTSFactory(unittest.TestCase):
+    """Tests of the engine selection done by the TTSFactory."""
+    @mock.patch('mycroft.tts.tts.Configuration')
+    def test_create(self, mock_config):
+        """The configured engine is created, with mimic as fallback."""
+        config = {
+            'tts': {
+                'module': 'mock'
+            }
+        }
+
+        mock_config.get.return_value = config
+        mock_mimic = mock.Mock(name='Mimic')
+        mock_mimic_instance = mock.Mock(name='mimic')
+        mock_mimic.return_value = mock_mimic_instance
+
+        mock_tts_class = mock.Mock()
+        mock_tts_instance = mock.Mock()
+        mock_tts_class.return_value = mock_tts_instance
+
+        # patch.dict restores CLASSES when the block is left so the mocks
+        # don't leak into tests running later in the same process.
+        test_classes = {'mimic': mock_mimic, 'mock': mock_tts_class}
+        with mock.patch.dict(mycroft.tts.TTSFactory.CLASSES, test_classes):
+            # Check that correct module is selected
+            tts_instance = mycroft.tts.TTSFactory.create()
+            self.assertEqual(tts_instance, mock_tts_instance)
+
+            # Assert falling back to mimic if load fails
+            def side_effect(*args):
+                raise Exception
+
+            mock_tts_class.side_effect = side_effect
+            tts_instance = mycroft.tts.TTSFactory.create()
+            self.assertEqual(tts_instance, mock_mimic_instance)
+
+            # Make sure exception is raised when mimic fails
+            mock_mimic.side_effect = side_effect
+            config['tts']['module'] = 'mimic'
+            with self.assertRaises(Exception):
+                tts_instance = mycroft.tts.TTSFactory.create()