mycroft-core/mycroft/tts/mimic_tts.py

# Copyright 2017 Mycroft AI Inc.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#    http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
"""Mimic TTS, a local TTS backend.

This Backend uses the mimic executable to render text into speech.
"""
import os
import os.path
from os.path import exists, join, expanduser
import stat
import subprocess
from threading import Thread
from time import sleep

from mycroft import MYCROFT_ROOT_PATH
from mycroft.api import DeviceApi
from mycroft.configuration import Configuration
from mycroft.util.download import download
from mycroft.util.log import LOG

from .tts import TTS, TTSValidator


def get_mimic_binary():
    """Find the mimic binary, either from config or from PATH.

    The "path" entry of the "tts" -> "mimic" configuration section is used
    when present; otherwise the binary under MYCROFT_ROOT_PATH/mimic/bin is
    tried. If the chosen location is not an existing file, PATH is searched
    for an executable named "mimic".

    Returns:
        (str) path of mimic executable, or None if no executable was found
    """
    # Either section may be absent or explicitly null in the configuration;
    # fall back to empty dicts so the default location is still tried
    # instead of failing with AttributeError on None.
    tts_config = Configuration.get().get("tts") or {}
    config = tts_config.get("mimic") or {}

    bin_ = config.get("path",
                      os.path.join(MYCROFT_ROOT_PATH, 'mimic', 'bin', 'mimic'))

    if not bin_ or not os.path.isfile(bin_):
        # Search for mimic on the path. shutil.which replaces the
        # deprecated distutils.spawn.find_executable (distutils was removed
        # from the standard library in Python 3.12).
        import shutil

        bin_ = shutil.which("mimic")

    return bin_


def get_subscriber_voices():
    """Return the mimic voices that are exclusive to subscribers.

    The executables are located below the configured 'data_dir' (with a
    leading '~' expanded).

    Returns:
        (dict) voice name mapped to the path of its custom Mimic executable.
    """
    voices_root = expanduser(Configuration.get()['data_dir'])
    trinity_bin = join(voices_root, 'voices/mimic_tn')
    return dict(trinity=trinity_bin)


def download_subscriber_voices(selected_voice):
    """Download all premium voices that are not yet present on disk.

    The currently selected voice is handled first (if it is a subscriber
    voice), then every subscriber voice is checked. The function blocks
    until each started download reports completion.

    Args:
        selected_voice (str): name of the voice currently in use
    """
    subscriber_voices = get_subscriber_voices()

    def make_executable(dest):
        """Call back function to make the downloaded file executable."""
        LOG.info('Make executable new voice binary executable')
        # Add the owner-execute bit on top of the existing permissions
        file_stat = os.stat(dest)
        os.chmod(dest, file_stat.st_mode | stat.S_IEXEC)

    def fetch_if_missing(voice, voice_file, announce=False):
        """Download a single voice binary unless the file already exists."""
        if exists(voice_file):
            return
        if announce:
            LOG.info('Voice doesn\'t exist, downloading')
        url = DeviceApi().get_subscriber_voice_url(voice)
        # A falsy url means the backend has nothing to offer for this voice
        if not url:
            LOG.debug('{} is not available for this architecture'
                      .format(voice))
            return
        # make_executable is handed to download() as its callback
        dl_status = download(url, voice_file, make_executable)
        # Wait for completion
        while not dl_status.done:
            sleep(1)

    # First download the selected voice if needed
    selected_file = subscriber_voices.get(selected_voice)
    if selected_file is not None:
        fetch_if_missing(selected_voice, selected_file, announce=True)

    # Download the rest of the subscriber voices as needed. The selected
    # voice is visited again here; it is skipped if its file now exists.
    for voice, voice_file in subscriber_voices.items():
        fetch_if_missing(voice, voice_file)


def parse_phonemes(phonemes):
    """Split mimic's phoneme output into phone/duration pairs.

    The decoded output is cut on single spaces; every piece containing a
    colon is split on ':' and kept, all other pieces are dropped.

    Args:
        phonemes (bytes): phoneme output from mimic

    Returns:
        (list) list of [phone, duration] lists (both items are strings)
    """
    decoded = phonemes.decode()
    result = []
    for token in decoded.split(' '):
        if ':' not in token:
            # Not a phone:duration entry (e.g. empty piece), ignore it
            continue
        result.append(token.split(':'))
    return result


class Mimic(TTS):
    """TTS interface for local mimic v1."""

    def __init__(self, lang, config):
        """Set up the backend and start fetching subscriber voices.

        Args:
            lang (str): language passed on to the TTS base class
            config (dict): backend configuration passed on to the base class
        """
        super(Mimic, self).__init__(
            lang, config, MimicValidator(self), 'wav',
            ssml_tags=["speak", "ssml", "phoneme", "voice", "audio", "prosody"]
        )
        self.default_binary = get_mimic_binary()

        self.clear_cache()

        # Download subscriber voices if needed
        self.subscriber_voices = get_subscriber_voices()
        self.is_subscriber = DeviceApi().is_subscriber
        if self.is_subscriber:
            # Run the downloads in a daemon thread so that construction
            # does not block on them.
            trd = Thread(target=download_subscriber_voices, args=[self.voice])
            trd.daemon = True
            trd.start()

    def modify_tag(self, tag):
        """Modify the SSML to suit Mimic.

        Named prosody values are replaced by numeric factors and the
        'speed' attribute name is replaced by 'rate'.

        Args:
            tag (str): SSML tag text

        Returns:
            (str) the tag with all conversions applied
        """
        # The replacements are applied one after another, so a key must be
        # handled before any key that is a substring of it ('x-slow' before
        # 'slow', 'x-high' before 'high'). Otherwise 'x-high' would first
        # be turned into 'x-1.3' and its own mapping would never apply.
        ssml_conversions = (
            ('x-slow', '0.4'),
            ('slow', '0.7'),
            ('medium', '1.0'),
            ('x-high', '1.6'),
            ('high', '1.3'),
            ('speed', 'rate'),
        )
        for key, value in ssml_conversions:
            tag = tag.replace(key, value)
        return tag

    @property
    def args(self):
        """Build mimic arguments.

        Returns:
            (list) command line (executable followed by its options) without
                   the output file and text arguments
        """
        subscriber_voices = self.subscriber_voices
        if (self.voice in subscriber_voices and
                exists(subscriber_voices[self.voice]) and self.is_subscriber):
            # Use subscriber voice
            mimic_bin = subscriber_voices[self.voice]
            voice = self.voice
        elif self.voice in subscriber_voices:
            # Premium voice but its binary is unavailable (or the user is
            # not a subscriber): fall back to the 'ap' voice.
            mimic_bin = self.default_binary
            voice = 'ap'
        else:
            # Normal case use normal binary and selected voice
            mimic_bin = self.default_binary
            voice = self.voice

        args = [mimic_bin, '-voice', voice, '-psdur', '-ssml']

        # Optional setting; skipped when missing or falsy
        stretch = self.config.get('duration_stretch', None)
        if stretch:
            args += ['--setf', 'duration_stretch={}'.format(stretch)]
        return args

    def get_tts(self, sentence, wav_file):
        """Generate WAV and phonemes.

        Args:
            sentence (str): sentence to generate audio for
            wav_file (str): output file

        Returns:
            tuple ((str) file location,
                   (list) phoneme/duration pairs parsed from mimic's output)

        Raises:
            subprocess.CalledProcessError: if mimic exits with a non-zero
                                           status
        """
        command = self.args + ['-o', wav_file, '-t', sentence]
        phonemes = subprocess.check_output(command)
        return wav_file, parse_phonemes(phonemes)

    def viseme(self, phoneme_pairs):
        """Convert phoneme/duration pairs to visemes.

        Args:
            phoneme_pairs (list): phoneme/duration pairs as produced by
                                  parse_phonemes

        Returns:
            (list) list of tuples of viseme and duration (float)
        """
        # Phonemes without an entry in VISIMES map to '4', the same code
        # used for the 'pau' (blank mouth) entry.
        return [(VISIMES.get(phon, '4'), float(dur))
                for phon, dur in phoneme_pairs]


class MimicValidator(TTSValidator):
    """Validator making sure the Mimic backend is usable."""

    def validate_lang(self):
        """Verify that the language is supported (currently no check)."""
        # TODO: Verify version of mimic can handle the requested language

    def validate_connection(self):
        """Check that the Mimic executable can be launched.

        Raises:
            Exception: if running the executable with '--version' fails
        """
        executable = get_mimic_binary()
        version_cmd = [executable, '--version']
        try:
            subprocess.call(version_cmd)
        except Exception as err:
            # A missing binary (None) and a broken path get different logs
            if not executable:
                LOG.error('Mimic executable not found')
            else:
                LOG.error('Failed to find mimic at: {}'.format(executable))
            install_hint = ('Mimic was not found. '
                            'Run install-mimic.sh to install it.')
            raise Exception(install_hint) from err

    def get_tts_class(self):
        """Return the TTS class this validator belongs to."""
        return Mimic


# Mapping based on Jeffers phoneme to viseme map, seen in table 1 from:
# http://citeseerx.ist.psu.edu/viewdoc/download?doi=10.1.1.221.6377&rep=rep1&type=pdf
#
# Mycroft unit visemes based on images found at:
# http://www.web3.lu/wp-content/uploads/2014/09/visemes.jpg
#
# Mapping was created partially based on the "12 mouth shapes" visuals seen at:
# https://wolfpaulus.com/journal/software/lipsynchronization/

# Phoneme -> viseme code lookup, written as (viseme code, phonemes) groups
# and flattened into a single dict.
VISIMES = {
    phoneme: viseme_code
    for viseme_code, phoneme_group in (
        ('5', ('v', 'f')),                                  # /A group
        ('2', ('uh', 'w', 'uw', 'er', 'r', 'ow')),          # /B group
        ('4', ('b', 'p', 'm')),                             # /C group
        ('1', ('aw',)),                                     # /D group
        ('3', ('th', 'dh')),                                # /E group
        ('3', ('zh', 'ch', 'sh', 'jh')),                    # /F group
        ('6', ('oy', 'ao')),                                # /G group
        ('3', ('z', 's')),                                  # /H group
        ('0', ('ae', 'eh', 'ey', 'ah', 'ih', 'y',
               'iy', 'aa', 'ay', 'ax', 'hh')),              # /I group
        ('3', ('n', 't', 'd', 'l')),                        # /J group
        ('3', ('g', 'ng', 'k')),                            # /K group
        ('4', ('pau',)),                                    # blank mouth
    )
    for phoneme in phoneme_group
}