mycroft-core/mycroft/tts/mimic_tts.py

300 lines
8.8 KiB
Python

# Copyright 2017 Mycroft AI Inc.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
"""Mimic TTS, a local TTS backend.
This Backend uses the mimic executable to render text into speech.
"""
import os
import os.path
from os.path import exists, join, expanduser
import stat
import subprocess
from threading import Thread
from time import sleep
from mycroft import MYCROFT_ROOT_PATH
from mycroft.api import DeviceApi
from mycroft.configuration import Configuration
from mycroft.util.download import download
from mycroft.util.log import LOG
from .tts import TTS, TTSValidator
def get_mimic_binary():
"""Find the mimic binary, either from config or from PATH.
Returns:
(str) path of mimic executable
"""
config = Configuration.get().get("tts", {}).get("mimic")
bin_ = config.get("path",
os.path.join(MYCROFT_ROOT_PATH, 'mimic', 'bin', 'mimic'))
if not os.path.isfile(bin_):
# Search for mimic on the path
import distutils.spawn
bin_ = distutils.spawn.find_executable("mimic")
return bin_
def get_subscriber_voices():
"""Get dict of mimic voices exclusive to subscribers.
Returns:
(dict) map of voices to custom Mimic executables.
"""
data_dir = expanduser(Configuration.get()['data_dir'])
return {'trinity': join(data_dir, 'voices/mimic_tn')}
def download_subscriber_voices(selected_voice):
"""Function to download all premium voices.
The function starts with the currently selected if applicable
"""
subscriber_voices = get_subscriber_voices()
def make_executable(dest):
"""Call back function to make the downloaded file executable."""
LOG.info('Make executable new voice binary executable')
# make executable
file_stat = os.stat(dest)
os.chmod(dest, file_stat.st_mode | stat.S_IEXEC)
# First download the selected voice if needed
voice_file = subscriber_voices.get(selected_voice)
if voice_file is not None and not exists(voice_file):
LOG.info('Voice doesn\'t exist, downloading')
url = DeviceApi().get_subscriber_voice_url(selected_voice)
# Check we got an url
if url:
dl_status = download(url, voice_file, make_executable)
# Wait for completion
while not dl_status.done:
sleep(1)
else:
LOG.debug('{} is not available for this architecture'
.format(selected_voice))
# Download the rest of the subsciber voices as needed
for voice in subscriber_voices:
voice_file = subscriber_voices[voice]
if not exists(voice_file):
url = DeviceApi().get_subscriber_voice_url(voice)
# Check we got an url
if url:
dl_status = download(url, voice_file, make_executable)
# Wait for completion
while not dl_status.done:
sleep(1)
else:
LOG.debug('{} is not available for this architecture'
.format(voice))
def parse_phonemes(phonemes):
"""Parse mimic phoneme string into a list of phone, duration pairs.
Arguments
phonemes (bytes): phoneme output from mimic
Returns:
(list) list of phoneme duration pairs
"""
phon_str = phonemes.decode()
pairs = phon_str.split(' ')
return [pair.split(':') for pair in pairs if ':' in pair]
class Mimic(TTS):
"""TTS interface for local mimic v1."""
def __init__(self, lang, config):
super(Mimic, self).__init__(
lang, config, MimicValidator(self), 'wav',
ssml_tags=["speak", "ssml", "phoneme", "voice", "audio", "prosody"]
)
self.default_binary = get_mimic_binary()
self.clear_cache()
# Download subscriber voices if needed
self.subscriber_voices = get_subscriber_voices()
self.is_subscriber = DeviceApi().is_subscriber
if self.is_subscriber:
trd = Thread(target=download_subscriber_voices, args=[self.voice])
trd.daemon = True
trd.start()
def modify_tag(self, tag):
"""Modify the SSML to suite Mimic."""
ssml_conversions = {
'x-slow': '0.4',
'slow': '0.7',
'medium': '1.0',
'high': '1.3',
'x-high': '1.6',
'speed': 'rate'
}
for key, value in ssml_conversions.items():
tag = tag.replace(key, value)
return tag
@property
def args(self):
"""Build mimic arguments."""
subscriber_voices = self.subscriber_voices
if (self.voice in subscriber_voices and
exists(subscriber_voices[self.voice]) and self.is_subscriber):
# Use subscriber voice
mimic_bin = subscriber_voices[self.voice]
voice = self.voice
elif self.voice in subscriber_voices:
# Premium voice but bin doesn't exist, use ap while downloading
mimic_bin = self.default_binary
voice = 'ap'
else:
# Normal case use normal binary and selected voice
mimic_bin = self.default_binary
voice = self.voice
args = [mimic_bin, '-voice', voice, '-psdur', '-ssml']
stretch = self.config.get('duration_stretch', None)
if stretch:
args += ['--setf', 'duration_stretch={}'.format(stretch)]
return args
def get_tts(self, sentence, wav_file):
"""Generate WAV and phonemes.
Args:
sentence (str): sentence to generate audio for
wav_file (str): output file
Returns:
tuple ((str) file location, (str) generated phonemes)
"""
phonemes = subprocess.check_output(self.args + ['-o', wav_file,
'-t', sentence])
return wav_file, parse_phonemes(phonemes)
def viseme(self, phoneme_pairs):
"""Convert phoneme string to visemes.
Args:
phoneme_pairs (list): Phoneme output from mimic
Returns:
(list) list of tuples of viseme and duration
"""
visemes = []
for phon, dur in phoneme_pairs:
visemes.append((VISIMES.get(phon, '4'), float(dur)))
return visemes
class MimicValidator(TTSValidator):
"""Validator class checking that Mimic can be used."""
def validate_lang(self):
"""Verify that the language is supported."""
# TODO: Verify version of mimic can handle the requested language
def validate_connection(self):
"""Check that Mimic executable is found and works."""
mimic_bin = get_mimic_binary()
try:
subprocess.call([mimic_bin, '--version'])
except Exception as err:
if mimic_bin:
LOG.error('Failed to find mimic at: {}'.format(mimic_bin))
else:
LOG.error('Mimic executable not found')
raise Exception(
'Mimic was not found. Run install-mimic.sh to install it.') \
from err
def get_tts_class(self):
"""Return the TTS class associated with the validator."""
return Mimic
# Mapping based on Jeffers phoneme to viseme map, seen in table 1 from:
# http://citeseerx.ist.psu.edu/viewdoc/download?doi=10.1.1.221.6377&rep=rep1&type=pdf
#
# Mycroft unit visemes based on images found at:
# http://www.web3.lu/wp-content/uploads/2014/09/visemes.jpg
#
# Mapping was created partially based on the "12 mouth shapes visuals seen at:
# https://wolfpaulus.com/journal/software/lipsynchronization/
VISIMES = {
# /A group
'v': '5',
'f': '5',
# /B group
'uh': '2',
'w': '2',
'uw': '2',
'er': '2',
'r': '2',
'ow': '2',
# /C group
'b': '4',
'p': '4',
'm': '4',
# /D group
'aw': '1',
# /E group
'th': '3',
'dh': '3',
# /F group
'zh': '3',
'ch': '3',
'sh': '3',
'jh': '3',
# /G group
'oy': '6',
'ao': '6',
# /Hgroup
'z': '3',
's': '3',
# /I group
'ae': '0',
'eh': '0',
'ey': '0',
'ah': '0',
'ih': '0',
'y': '0',
'iy': '0',
'aa': '0',
'ay': '0',
'ax': '0',
'hh': '0',
# /J group
'n': '3',
't': '3',
'd': '3',
'l': '3',
# /K group
'g': '3',
'ng': '3',
'k': '3',
# blank mouth
'pau': '4',
}