Merge remote-tracking branch 'refs/remotes/origin/master'
Conflicts: mycroft/client/speech/listener.py, mycroft/client/speech/wakeword_recognizer.py
pull/60/head
commit
fac834cf4a
|
@ -20,21 +20,19 @@ import threading
|
|||
import time
|
||||
from Queue import Queue
|
||||
|
||||
import os
|
||||
import pyee
|
||||
import speech_recognition as sr
|
||||
from speech_recognition import AudioData
|
||||
|
||||
from mycroft.client.speech import wakeword_recognizer
|
||||
from mycroft.client.speech.local_recognizer import LocalRecognizer
|
||||
from mycroft.client.speech.mic import MutableMicrophone, Recognizer
|
||||
from mycroft.client.speech.recognizer_wrapper import (
|
||||
from mycroft.client.speech.recognizer_wrapper import \
|
||||
RemoteRecognizerWrapperFactory
|
||||
)
|
||||
from mycroft.client.speech.word_extractor import WordExtractor
|
||||
from mycroft.configuration.config import ConfigurationManager
|
||||
from mycroft.messagebus.message import Message
|
||||
from mycroft.metrics import MetricsAggregator, Stopwatch
|
||||
from mycroft.session import SessionManager
|
||||
from mycroft.util import read_stripped_lines, CerberusAccessDenied
|
||||
from mycroft.util import CerberusAccessDenied
|
||||
from mycroft.util.log import getLogger
|
||||
|
||||
logger = getLogger(__name__)
|
||||
|
@ -49,6 +47,7 @@ class AudioProducer(threading.Thread):
|
|||
given a mic and a recognizer implementation, continuously listens to the
|
||||
mic for potential speech chunks and pushes them onto the queue.
|
||||
"""
|
||||
|
||||
def __init__(self, state, queue, mic, recognizer, emitter):
|
||||
threading.Thread.__init__(self)
|
||||
self.daemon = True
|
||||
|
@ -75,110 +74,6 @@ class AudioProducer(threading.Thread):
|
|||
self.emitter.emit("recognizer_loop:ioerror", ex)
|
||||
|
||||
|
||||
class WakewordExtractor:
    """
    Locates the wake word inside a recorded utterance and splits the audio
    around it.

    The end and begin byte offsets of the wake word are searched for by
    bisection: a candidate segment is transcribed with the supplied
    recognizer and the marker is moved depending on whether the wake word is
    still reported in the transcription.
    """

    # The bisection keeps halving its step while twice the step still covers
    # at least this many seconds worth of bytes
    MAX_ERROR_SECONDS = 0.02
    # The seconds removed from each side of the detected range by ``narrow``
    TRIM_SECONDS = 0.1
    # The seconds the safe end position is pushed back to ensure pocketsphinx
    # is consistent
    PUSH_BACK_SECONDS = 0.2
    # The seconds of silence padded where the wakeword was removed
    SILENCE_SECONDS = 0.2

    def __init__(self, audio_data, recognizer, metrics):
        """
        :param audio_data: object exposing ``sample_rate``, ``sample_width``
            and ``get_wav_data()``
        :param recognizer: object whose ``transcribe(data, metrics=...)``
            returns a hypothesis with a ``hypstr`` attribute (or a falsy
            value when nothing was recognized)
        :param metrics: passed through untouched to ``recognizer.transcribe``
        """
        self.audio_data = audio_data
        self.recognizer = recognizer
        self.silence_data = self.__generate_silence(
            self.SILENCE_SECONDS, self.audio_data.sample_rate,
            self.audio_data.sample_width)
        self.wav_data = self.audio_data.get_wav_data()
        self.AUDIO_SIZE = float(len(self.wav_data))
        # Initial guess: the wake word lies in the first half of the audio
        self.range = self.Range(0, self.AUDIO_SIZE / 2)
        self.metrics = metrics

    class Range:
        """A mutable ``[begin, end]`` pair of byte offsets."""

        def __init__(self, begin, end):
            self.begin = begin
            self.end = end

        def get_marker(self, get_begin):
            """Return ``begin`` when ``get_begin`` is true, else ``end``."""
            return self.begin if get_begin else self.end

        def add_to_marker(self, add_begin, value):
            """Add ``value`` to ``begin`` when ``add_begin`` is true, else
            to ``end``."""
            if add_begin:
                self.begin += value
            else:
                self.end += value

        def narrow(self, value):
            """Shrink the range by ``value`` on both sides."""
            self.begin += value
            self.end -= value

    @staticmethod
    def __found_in_segment(name, byte_data, recognizer, metrics):
        """
        Transcribe ``byte_data`` and report whether ``name`` occurs in the
        lower-cased transcription.

        :return: ``True`` when the word was heard, ``False`` otherwise
        """
        hypothesis = recognizer.transcribe(byte_data, metrics=metrics)
        # ``str.find`` returns -1 (truthy) on a miss and 0 (falsy) on a hit
        # at the very start, so its truthiness is the wrong test; use a
        # membership check instead.
        return bool(hypothesis) and name in hypothesis.hypstr.lower()

    def audio_pos(self, raw_pos):
        """Round ``raw_pos`` to the nearest multiple of the sample width so
        that slices never cut a sample in half."""
        return int(self.audio_data.sample_width *
                   round(float(raw_pos) / self.audio_data.sample_width))

    def get_audio_segment(self, begin, end):
        """Return the bytes of ``wav_data`` between the sample-aligned
        ``begin`` and ``end`` positions."""
        return self.wav_data[self.audio_pos(begin): self.audio_pos(end)]

    def __calculate_marker(self, use_begin, sign_if_found, marker_range,
                           delta):
        """
        Bisect one marker of ``marker_range`` until the step is small enough.

        :param use_begin: move ``begin`` when true, ``end`` otherwise
        :param sign_if_found: direction (+1 / -1) the marker moves when the
            wake word is found in the current segment; the opposite
            direction is used when it is not
        :param marker_range: the ``Range`` being adjusted in place
        :param delta: initial step in bytes, halved after every probe
        :return: the final value of the adjusted marker
        """
        min_step = (self.MAX_ERROR_SECONDS * self.audio_data.sample_rate *
                    self.audio_data.sample_width)
        while 2 * delta >= min_step:
            byte_data = self.get_audio_segment(
                marker_range.begin, marker_range.end)
            found = self.__found_in_segment(
                "mycroft", byte_data, self.recognizer, self.metrics)
            sign = sign_if_found if found else -sign_if_found
            marker_range.add_to_marker(use_begin, delta * sign)
            delta /= 2
        return marker_range.get_marker(use_begin)

    def calculate_range(self):
        """Compute ``self.range`` (begin and end of the wake word) and trim
        it by ``TRIM_SECONDS`` on both sides."""
        bytes_per_second = (self.audio_data.sample_rate *
                            self.audio_data.sample_width)

        delta = self.AUDIO_SIZE / 4
        self.range.end = self.__calculate_marker(
            False, -1, self.Range(0, self.AUDIO_SIZE / 2), delta)

        # Ensures the end position is well past the wakeword part of the audio
        pos_end_safe = min(
            self.AUDIO_SIZE,
            self.range.end + self.PUSH_BACK_SECONDS * bytes_per_second)
        delta = pos_end_safe / 4
        begin = pos_end_safe / 2
        self.range.begin = self.__calculate_marker(
            True, 1, self.Range(begin, pos_end_safe), delta)
        self.range.narrow(self.TRIM_SECONDS * bytes_per_second)

    @staticmethod
    def __generate_silence(seconds, sample_rate, sample_width):
        """Return ``seconds`` worth of zero bytes for the given format."""
        # A bytes literal is identical to the previous str under Python 2 and
        # stays concatenable with audio bytes under Python 3.
        return b'\0' * int(seconds * sample_rate * sample_width)

    def get_audio_data_before(self):
        """Return the audio preceding the wake word, followed by silence."""
        byte_data = self.get_audio_segment(
            0, self.range.begin) + self.silence_data
        return AudioData(
            byte_data, self.audio_data.sample_rate,
            self.audio_data.sample_width)

    def get_audio_data_after(self):
        """Return silence followed by the audio after the wake word."""
        byte_data = self.silence_data + self.get_audio_segment(
            self.range.end, self.AUDIO_SIZE)
        return AudioData(
            byte_data, self.audio_data.sample_rate,
            self.audio_data.sample_width)
|
||||
|
||||
|
||||
class AudioConsumer(threading.Thread):
|
||||
"""
|
||||
AudioConsumer
|
||||
|
@ -188,76 +83,76 @@ class AudioConsumer(threading.Thread):
|
|||
# In seconds, the minimum audio size to be sent to remote STT
|
||||
MIN_AUDIO_SIZE = 1.0
|
||||
|
||||
def __init__(
|
||||
self, state, queue, emitter, wakeup_recognizer,
|
||||
wakeword_recognizer, wrapped_remote_recognizer, wakeup_prefixes,
|
||||
wakeup_words):
|
||||
def __init__(self, state, queue, emitter, wakeup_recognizer,
|
||||
mycroft_recognizer, remote_recognizer):
|
||||
threading.Thread.__init__(self)
|
||||
self.daemon = True
|
||||
self.queue = queue
|
||||
self.state = state
|
||||
self.emitter = emitter
|
||||
self.wakeup_recognizer = wakeup_recognizer
|
||||
self.ww_recognizer = wakeword_recognizer
|
||||
self.wrapped_remote_recognizer = wrapped_remote_recognizer
|
||||
self.wakeup_prefixes = wakeup_prefixes
|
||||
self.wakeup_words = wakeup_words
|
||||
self.mycroft_recognizer = mycroft_recognizer
|
||||
self.remote_recognizer = remote_recognizer
|
||||
self.metrics = MetricsAggregator()
|
||||
|
||||
def run(self):
|
||||
while self.state.running:
|
||||
self.try_consume_audio()
|
||||
self.read_audio()
|
||||
|
||||
@staticmethod
|
||||
def _audio_length(audio):
|
||||
return float(
|
||||
len(audio.frame_data))/(audio.sample_rate*audio.sample_width)
|
||||
return float(len(audio.frame_data)) / (
|
||||
audio.sample_rate * audio.sample_width)
|
||||
|
||||
def try_consume_audio(self):
|
||||
def read_audio(self):
|
||||
timer = Stopwatch()
|
||||
hyp = None
|
||||
audio = self.queue.get()
|
||||
self.metrics.timer(
|
||||
"mycroft.recognizer.audio.length_s", self._audio_length(audio))
|
||||
self.metrics.timer("mycroft.recognizer.audio.length_s",
|
||||
self._audio_length(audio))
|
||||
self.queue.task_done()
|
||||
timer.start()
|
||||
|
||||
if self.state.sleeping:
|
||||
hyp = self.wakeup_recognizer.transcribe(
|
||||
audio.get_wav_data(), metrics=self.metrics)
|
||||
if hyp and hyp.hypstr:
|
||||
logger.debug("sleeping recognition: " + hyp.hypstr)
|
||||
if hyp and hyp.hypstr.lower().find("wake up") >= 0:
|
||||
self.process_wake_up(audio)
|
||||
elif self.state.skip_wakeword:
|
||||
self.process_skip_wake_word(audio)
|
||||
else:
|
||||
self.process_wake_word(audio, timer)
|
||||
|
||||
self.metrics.flush()
|
||||
|
||||
def process_wake_up(self, audio):
|
||||
if self.wakeup_recognizer.is_recognized(audio.frame_data,
|
||||
self.metrics):
|
||||
SessionManager.touch()
|
||||
self.state.sleeping = False
|
||||
self.__speak("I'm awake.") # TODO: Localization
|
||||
self.metrics.increment("mycroft.wakeup")
|
||||
else:
|
||||
if not self.state.skip_wakeword:
|
||||
hyp = self.ww_recognizer.transcribe(
|
||||
audio.get_wav_data(), metrics=self.metrics)
|
||||
|
||||
if hyp and hyp.hypstr.lower().find("mycroft") >= 0:
|
||||
extractor = WakewordExtractor(
|
||||
audio, self.ww_recognizer, self.metrics)
|
||||
def process_wake_word(self, audio, timer):
|
||||
hyp = self.mycroft_recognizer.transcribe(audio.frame_data,
|
||||
self.metrics)
|
||||
|
||||
if self.mycroft_recognizer.contains(hyp):
|
||||
extractor = WordExtractor(audio, self.mycroft_recognizer,
|
||||
self.metrics)
|
||||
timer.lap()
|
||||
extractor.calculate_range()
|
||||
self.metrics.timer(
|
||||
"mycroft.recognizer.extractor.time_s", timer.lap())
|
||||
self.metrics.timer("mycroft.recognizer.extractor.time_s",
|
||||
timer.lap())
|
||||
audio_before = extractor.get_audio_data_before()
|
||||
self.metrics.timer(
|
||||
"mycroft.recognizer.audio_extracted.length_s",
|
||||
self.metrics.timer("mycroft.recognizer.audio_extracted.length_s",
|
||||
self._audio_length(audio_before))
|
||||
audio_after = extractor.get_audio_data_after()
|
||||
self.metrics.timer(
|
||||
"mycroft.recognizer.audio_extracted.length_s",
|
||||
self.metrics.timer("mycroft.recognizer.audio_extracted.length_s",
|
||||
self._audio_length(audio_after))
|
||||
|
||||
SessionManager.touch()
|
||||
payload = {
|
||||
'utterance': hyp.hypstr,
|
||||
'session': SessionManager.get().session_id,
|
||||
'pos_begin': int(extractor.range.begin),
|
||||
'pos_end': int(extractor.range.end)
|
||||
'pos_begin': extractor.begin,
|
||||
'pos_end': extractor.end
|
||||
}
|
||||
self.emitter.emit("recognizer_loop:wakeword", payload)
|
||||
|
||||
|
@ -268,40 +163,37 @@ class AudioConsumer(threading.Thread):
|
|||
self.state.skip_wakeword = True
|
||||
self.metrics.increment("mycroft.wakeword")
|
||||
|
||||
elif self.state.skip_wakeword:
|
||||
def process_skip_wake_word(self, audio):
|
||||
SessionManager.touch()
|
||||
try:
|
||||
self.transcribe([audio])
|
||||
except sr.UnknownValueError:
|
||||
logger.warn(
|
||||
"Speech Recognition could not understand audio")
|
||||
logger.warn("Speech Recognition could not understand audio")
|
||||
self.__speak("Sorry, I didn't catch that.")
|
||||
self.metrics.increment("mycroft.recognizer.error")
|
||||
self.state.skip_wakeword = False
|
||||
else:
|
||||
self.metrics.clear()
|
||||
self.metrics.flush()
|
||||
|
||||
def __speak(self, utterance):
|
||||
"""
|
||||
Speak commands should be asynchronous to avoid filling up the
|
||||
portaudio buffer.
|
||||
Speak commands should be asynchronous to avoid filling up the portaudio
|
||||
buffer.
|
||||
:param utterance:
|
||||
:return:
|
||||
"""
|
||||
|
||||
def target():
|
||||
self.emitter.emit(
|
||||
"speak",
|
||||
Message("speak",
|
||||
metadata={'utterance': utterance,
|
||||
'session': SessionManager.get().session_id}))
|
||||
payload = {
|
||||
'utterance': utterance,
|
||||
'session': SessionManager.get().session_id
|
||||
}
|
||||
self.emitter.emit("speak", Message("speak", metadata=payload))
|
||||
|
||||
threading.Thread(target=target).start()
|
||||
|
||||
def _create_remote_stt_runnable(self, audio, utterances):
|
||||
def runnable():
|
||||
try:
|
||||
text = self.wrapped_remote_recognizer.transcribe(
|
||||
text = self.remote_recognizer.transcribe(
|
||||
audio, metrics=self.metrics).lower()
|
||||
except sr.UnknownValueError:
|
||||
pass
|
||||
|
@ -319,6 +211,7 @@ class AudioConsumer(threading.Thread):
|
|||
logger.debug("STT: " + text)
|
||||
if text.strip() != '':
|
||||
utterances.append(text)
|
||||
|
||||
return runnable
|
||||
|
||||
def transcribe(self, audio_segments):
|
||||
|
@ -360,20 +253,15 @@ class RecognizerLoop(pyee.EventEmitter):
|
|||
device_index=None,
|
||||
lang=core_config.get('lang')):
|
||||
pyee.EventEmitter.__init__(self)
|
||||
self.microphone = MutableMicrophone(
|
||||
sample_rate=sample_rate, device_index=device_index)
|
||||
self.microphone = MutableMicrophone(sample_rate=sample_rate,
|
||||
device_index=device_index)
|
||||
|
||||
# FIXME - channels are not being used
|
||||
self.microphone.CHANNELS = channels
|
||||
self.ww_recognizer = wakeword_recognizer.create_recognizer(
|
||||
samprate=sample_rate, lang=lang)
|
||||
self.wakeup_recognizer = wakeword_recognizer.create_recognizer(
|
||||
samprate=sample_rate, lang=lang,
|
||||
keyphrase="wake up mycroft") # TODO - localization
|
||||
self.mycroft_recognizer = LocalRecognizer(sample_rate, lang)
|
||||
# TODO - localization
|
||||
self.wakeup_recognizer = LocalRecognizer(sample_rate, lang, "wake up")
|
||||
self.remote_recognizer = Recognizer()
|
||||
basedir = os.path.dirname(__file__)
|
||||
self.wakeup_words = read_stripped_lines(os.path.join(
|
||||
basedir, 'model', lang, 'WakeUpWord.voc'))
|
||||
self.wakeup_prefixes = read_stripped_lines(
|
||||
os.path.join(basedir, 'model', lang, 'PrefixWakeUp.voc'))
|
||||
self.state = RecognizerLoopState()
|
||||
|
||||
def start_async(self):
|
||||
|
@ -388,11 +276,9 @@ class RecognizerLoop(pyee.EventEmitter):
|
|||
queue,
|
||||
self,
|
||||
self.wakeup_recognizer,
|
||||
self.ww_recognizer,
|
||||
self.mycroft_recognizer,
|
||||
RemoteRecognizerWrapperFactory.wrap_recognizer(
|
||||
self.remote_recognizer),
|
||||
self.wakeup_prefixes,
|
||||
self.wakeup_words).start()
|
||||
self.remote_recognizer)).start()
|
||||
|
||||
def stop(self):
|
||||
self.state.running = False
|
||||
|
|
|
@ -0,0 +1,63 @@
|
|||
# Copyright 2016 Mycroft AI, Inc.
|
||||
#
|
||||
# This file is part of Mycroft Core.
|
||||
#
|
||||
# Mycroft Core is free software: you can redistribute it and/or modify
|
||||
# it under the terms of the GNU General Public License as published by
|
||||
# the Free Software Foundation, either version 3 of the License, or
|
||||
# (at your option) any later version.
|
||||
#
|
||||
# Mycroft Core is distributed in the hope that it will be useful,
|
||||
# but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
# GNU General Public License for more details.
|
||||
#
|
||||
# You should have received a copy of the GNU General Public License
|
||||
# along with Mycroft Core. If not, see <http://www.gnu.org/licenses/>.
|
||||
|
||||
|
||||
import time
|
||||
|
||||
import os
|
||||
from pocketsphinx.pocketsphinx import *
|
||||
|
||||
__author__ = 'seanfitz, jdorleans'
|
||||
|
||||
BASEDIR = os.path.dirname(os.path.abspath(__file__))
|
||||
|
||||
|
||||
class LocalRecognizer(object):
    """
    Key-phrase spotter built on a pocketsphinx ``Decoder``.

    The decoder is configured once in ``configure`` with the acoustic model
    and dictionary found under ``model/<lang>`` next to this module.
    """

    def __init__(self, sample_rate=16000, lang="en-us", key_phrase="mycroft"):
        """
        :param sample_rate: value given to the decoder's ``-samprate`` option
        :param lang: name of the sub-directory of ``model`` to load from
        :param key_phrase: phrase given to the decoder's ``-keyphrase``
            option and looked for in transcriptions
        """
        self.lang = lang
        self.key_phrase = key_phrase
        self.sample_rate = sample_rate
        self.configure()

    def configure(self):
        """Build ``self.decoder`` from the current instance settings."""
        model_dir = os.path.join(BASEDIR, 'model', self.lang)
        config = Decoder.default_config()
        config.set_string('-hmm', os.path.join(model_dir, 'hmm'))
        # NOTE(review): the dictionary file name is fixed to the en-us one
        # whatever ``lang`` is - confirm this is intended.
        config.set_string('-dict',
                          os.path.join(model_dir, 'mycroft-en-us.dict'))
        config.set_string('-keyphrase', self.key_phrase)
        config.set_float('-kws_threshold', float('1e-45'))
        config.set_float('-samprate', self.sample_rate)
        config.set_int('-nfft', 2048)
        # os.devnull is '/dev/null' on POSIX and stays valid elsewhere
        config.set_string('-logfn', os.devnull)
        self.decoder = Decoder(config)

    def transcribe(self, byte_data, metrics=None):
        """
        Run the decoder over ``byte_data`` as a single utterance.

        :param byte_data: audio bytes handed to ``decoder.process_raw``
        :param metrics: optional object; when truthy its ``timer`` method
            receives the elapsed wall-clock seconds
        :return: whatever ``decoder.hyp()`` returns for the utterance
        """
        start = time.time()
        self.decoder.start_utt()
        self.decoder.process_raw(byte_data, False, False)
        self.decoder.end_utt()
        if metrics:
            metrics.timer("mycroft.stt.local.time_s", time.time() - start)
        return self.decoder.hyp()

    def is_recognized(self, byte_data, metrics):
        """Transcribe ``byte_data`` and return ``True`` when the key phrase
        occurs in the result, ``False`` otherwise."""
        return self.contains(self.transcribe(byte_data, metrics))

    def contains(self, hypothesis):
        """
        Return ``True`` when ``hypothesis`` is truthy and its lower-cased
        ``hypstr`` contains the key phrase, ``False`` otherwise.
        """
        # Always a real boolean: callers get False rather than None (or the
        # hypothesis object itself) when nothing was recognized.
        return bool(hypothesis) and self.key_phrase in hypothesis.hypstr.lower()
|
|
@ -1,4 +0,0 @@
|
|||
hey
|
||||
hay
|
||||
okay
|
||||
ok
|
|
@ -1,7 +0,0 @@
|
|||
minecraft
|
||||
microsoft
|
||||
mycroft
|
||||
micro
|
||||
my friend
|
||||
my brother
|
||||
mike ross
|
|
@ -1,4 +1,8 @@
|
|||
hey HH EY
|
||||
ok OW K EY
|
||||
okay OW K EY
|
||||
alright AO L R AY T
|
||||
allright AA L R AY T
|
||||
mycroft M AY K R AO F T
|
||||
up AH P
|
||||
wake W EY K
|
||||
|
|
|
@ -1,78 +0,0 @@
|
|||
# Copyright 2016 Mycroft AI, Inc.
|
||||
#
|
||||
# This file is part of Mycroft Core.
|
||||
#
|
||||
# Mycroft Core is free software: you can redistribute it and/or modify
|
||||
# it under the terms of the GNU General Public License as published by
|
||||
# the Free Software Foundation, either version 3 of the License, or
|
||||
# (at your option) any later version.
|
||||
#
|
||||
# Mycroft Core is distributed in the hope that it will be useful,
|
||||
# but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
# GNU General Public License for more details.
|
||||
#
|
||||
# You should have received a copy of the GNU General Public License
|
||||
# along with Mycroft Core. If not, see <http://www.gnu.org/licenses/>.
|
||||
|
||||
|
||||
from mycroft.metrics import Stopwatch
|
||||
|
||||
import os
|
||||
from pocketsphinx import Decoder
|
||||
|
||||
from cmath import exp, pi
|
||||
|
||||
__author__ = 'seanfitz'
|
||||
|
||||
BASEDIR = os.path.dirname(os.path.abspath(__file__))
|
||||
|
||||
|
||||
def fft(x):
    """
    Recursive radix-2 fast Fourier transform.

    Intended to clean audio data, which must be converted to an array of
    IEEE floats first.

    :param x: sequence of numbers whose length is 0, 1 or a power of two
    :return: ``x`` itself when it has at most one element, otherwise a list
        of ``len(x)`` complex coefficients
    :raises ValueError: if ``len(x)`` is greater than one and not a power of
        two (the even/odd split would otherwise silently drop samples or
        fail with an IndexError deeper in the recursion)
    """
    N = len(x)
    if N <= 1:
        return x
    if N & (N - 1):
        raise ValueError(
            "fft requires a power-of-two length, got %d" % N)
    half = N // 2
    even = fft(x[0::2])
    odd = fft(x[1::2])
    # Twiddle factors applied to the odd-indexed half
    T = [exp(-2j * pi * k / N) * odd[k] for k in range(half)]
    return [even[k] + T[k] for k in range(half)] + \
           [even[k] - T[k] for k in range(half)]
|
||||
|
||||
|
||||
class Recognizer(object):
    """Thin wrapper that times a single-utterance pass of a local decoder."""

    def __init__(self, local_recognizer):
        """
        :param local_recognizer: decoder object providing ``start_utt``,
            ``process_raw``, ``end_utt`` and ``hyp``
        """
        self.local_recognizer = local_recognizer

    def transcribe(self, wav_data, metrics=None):
        """
        Feed ``wav_data`` to the decoder as one utterance.

        :param wav_data: audio bytes handed to ``process_raw``
        :param metrics: optional object; when truthy its ``timer`` method
            receives the value returned by the stopwatch's ``stop()``
        :return: the decoder's ``hyp()`` result
        """
        stopwatch = Stopwatch()
        stopwatch.start()

        decoder = self.local_recognizer
        decoder.start_utt()
        decoder.process_raw(wav_data, False, False)
        decoder.end_utt()

        # The stopwatch is only stopped when there is somewhere to report to
        if metrics:
            elapsed = stopwatch.stop()
            metrics.timer("mycroft.stt.local.time_s", elapsed)
        return decoder.hyp()
|
||||
|
||||
|
||||
def create_recognizer(samprate=16000, lang="en-us", keyphrase="hey mycroft"):
    """
    Build a ``Recognizer`` around a freshly configured pocketsphinx decoder.

    :param samprate: value for the decoder's ``-samprate`` option
    :param lang: sub-directory of ``model`` holding the acoustic model
    :param keyphrase: value for the decoder's ``-keyphrase`` option
    :return: a ``Recognizer`` wrapping the new ``Decoder``
    """
    settings = Decoder.default_config()

    # Model files live under <this module>/model/<lang>
    # NOTE(review): the dictionary name is fixed to the en-us file
    # regardless of ``lang`` - confirm against the model directory layout.
    model_dir = os.path.join(BASEDIR, 'model', lang)
    settings.set_string('-hmm', os.path.join(model_dir, 'hmm'))
    settings.set_string('-dict',
                        os.path.join(model_dir, 'mycroft-en-us.dict'))

    settings.set_string('-keyphrase', keyphrase)
    settings.set_float('-kws_threshold', float('1e-45'))
    settings.set_float('-samprate', samprate)
    settings.set_int('-nfft', 2048)
    settings.set_string('-logfn', '/dev/null')

    return Recognizer(Decoder(settings))
|
|
@ -0,0 +1,76 @@
|
|||
# Copyright 2016 Mycroft AI, Inc.
|
||||
#
|
||||
# This file is part of Mycroft Core.
|
||||
#
|
||||
# Mycroft Core is free software: you can redistribute it and/or modify
|
||||
# it under the terms of the GNU General Public License as published by
|
||||
# the Free Software Foundation, either version 3 of the License, or
|
||||
# (at your option) any later version.
|
||||
#
|
||||
# Mycroft Core is distributed in the hope that it will be useful,
|
||||
# but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
# GNU General Public License for more details.
|
||||
#
|
||||
# You should have received a copy of the GNU General Public License
|
||||
# along with Mycroft Core. If not, see <http://www.gnu.org/licenses/>.
|
||||
|
||||
|
||||
from speech_recognition import AudioData
|
||||
|
||||
__author__ = 'jdorleans'
|
||||
|
||||
|
||||
class WordExtractor(object):
    """
    Finds the byte range of the recognizer's key phrase inside an audio clip
    by bisection and exposes the audio before and after it.

    ``begin`` and ``end`` start at the clip boundaries and are moved inwards
    for as long as ``recognizer.is_recognized`` still accepts the segment
    between them.
    """

    # Seconds of silence padded where the word was cut out
    SILENCE_SECS = 0.1
    # The search stops once the step is no larger than this fraction of the
    # clip size
    PRECISION_RATE = 0.01

    def __init__(self, audio, recognizer, metrics):
        """
        :param audio: object exposing ``frame_data``, ``sample_rate`` and
            ``sample_width``
        :param recognizer: object providing ``is_recognized(data, metrics)``
        :param metrics: passed through untouched to the recognizer
        """
        self.audio = audio
        self.recognizer = recognizer
        self.audio_size = len(self.audio.frame_data)
        self.delta = self.audio_size // 2
        self.begin = 0
        self.end = self.audio_size
        self.precision = int(self.audio_size * self.PRECISION_RATE)
        self.silence_data = self.create_silence(self.SILENCE_SECS,
                                                self.audio.sample_rate,
                                                self.audio.sample_width)
        self.metrics = metrics

    def __add(self, is_begin, value):
        """Add ``value`` to ``begin`` when ``is_begin`` is true, else to
        ``end``."""
        if is_begin:
            self.begin += value
        else:
            self.end += value

    def __calculate_marker(self, is_begin):
        """
        Bisect one marker: tentatively move it inwards by the current step
        and undo the move when the key phrase is no longer recognized.

        :param is_begin: adjust ``begin`` (moving forward) when true,
            ``end`` (moving backward) otherwise
        """
        width = self.audio.sample_width
        # Keep every step a multiple of the sample width; otherwise a slice
        # of frame_data can start in the middle of a multi-byte sample and
        # the recognizer is fed misaligned audio.
        dt = self.delta - self.delta % width
        sign = 1 if is_begin else -1

        while dt > self.precision:
            self.__add(is_begin, dt * sign)
            segment = self.audio.frame_data[self.begin:self.end]
            found = self.recognizer.is_recognized(segment, self.metrics)
            if not found:
                self.__add(is_begin, dt * -sign)
            dt //= 2
            dt -= dt % width

    def calculate_range(self):
        """Locate ``end`` first, then ``begin``, of the key phrase."""
        self.__calculate_marker(False)
        self.__calculate_marker(True)

    @staticmethod
    def create_silence(seconds, sample_rate, sample_width):
        """Return ``seconds`` worth of zero bytes for the given format."""
        # A bytes literal is identical to the previous str under Python 2 and
        # stays concatenable with frame_data bytes under Python 3.
        return b'\0' * int(seconds * sample_rate * sample_width)

    def get_audio_data_before(self):
        """Return the audio preceding ``begin``, followed by silence."""
        byte_data = self.audio.frame_data[0:self.begin] + self.silence_data
        return AudioData(byte_data, self.audio.sample_rate,
                         self.audio.sample_width)

    def get_audio_data_after(self):
        """Return silence followed by the audio from ``end`` onwards."""
        byte_data = self.silence_data + self.audio.frame_data[
            self.end:self.audio_size]
        return AudioData(byte_data, self.audio.sample_rate,
                         self.audio.sample_width)
|
|
@ -1,35 +1,51 @@
|
|||
from Queue import Queue
|
||||
from os.path import dirname, join
|
||||
# Copyright 2016 Mycroft AI, Inc.
|
||||
#
|
||||
# This file is part of Mycroft Core.
|
||||
#
|
||||
# Mycroft Core is free software: you can redistribute it and/or modify
|
||||
# it under the terms of the GNU General Public License as published by
|
||||
# the Free Software Foundation, either version 3 of the License, or
|
||||
# (at your option) any later version.
|
||||
#
|
||||
# Mycroft Core is distributed in the hope that it will be useful,
|
||||
# but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
# GNU General Public License for more details.
|
||||
#
|
||||
# You should have received a copy of the GNU General Public License
|
||||
# along with Mycroft Core. If not, see <http://www.gnu.org/licenses/>.
|
||||
|
||||
|
||||
import unittest
|
||||
from Queue import Queue
|
||||
|
||||
from os.path import dirname, join
|
||||
from speech_recognition import WavFile, AudioData
|
||||
from mycroft.client.speech.listener import (
|
||||
WakewordExtractor,
|
||||
AudioConsumer,
|
||||
RecognizerLoop
|
||||
)
|
||||
|
||||
from mycroft.client.speech.listener import AudioConsumer, RecognizerLoop
|
||||
from mycroft.client.speech.recognizer_wrapper import (
|
||||
RemoteRecognizerWrapperFactory
|
||||
)
|
||||
|
||||
|
||||
__author__ = 'seanfitz'
|
||||
|
||||
|
||||
class MockRecognizer(object):
|
||||
def __init__(self, transcription=None):
|
||||
def __init__(self):
|
||||
self.transcriptions = []
|
||||
|
||||
def recognize_google(self, audio, key=None, language=None, show_all=False):
|
||||
return self.tanscriptions.pop(0)
|
||||
return self.transcriptions.pop(0)
|
||||
|
||||
def set_transcriptions(self, transcriptions):
|
||||
self.tanscriptions = transcriptions
|
||||
self.transcriptions = transcriptions
|
||||
|
||||
|
||||
class AudioConsumerTest(unittest.TestCase):
|
||||
"""
|
||||
AudioConsumerTest
|
||||
"""
|
||||
|
||||
def setUp(self):
|
||||
self.loop = RecognizerLoop()
|
||||
self.queue = Queue()
|
||||
|
@ -40,11 +56,9 @@ class AudioConsumerTest(unittest.TestCase):
|
|||
self.queue,
|
||||
self.loop,
|
||||
self.loop.wakeup_recognizer,
|
||||
self.loop.ww_recognizer,
|
||||
self.loop.mycroft_recognizer,
|
||||
RemoteRecognizerWrapperFactory.wrap_recognizer(
|
||||
self.recognizer, 'google'),
|
||||
self.loop.wakeup_prefixes,
|
||||
self.loop.wakeup_words)
|
||||
self.recognizer, 'google'))
|
||||
|
||||
def __create_sample_from_test_file(self, sample_name):
|
||||
root_dir = dirname(dirname(dirname(__file__)))
|
||||
|
@ -56,116 +70,119 @@ class AudioConsumerTest(unittest.TestCase):
|
|||
source.stream.read(), wavfile.SAMPLE_RATE,
|
||||
wavfile.SAMPLE_WIDTH)
|
||||
|
||||
def test_audio_pos_front_back(self):
|
||||
audio = self.__create_sample_from_test_file('mycroft_in_utterance')
|
||||
self.queue.put(audio)
|
||||
TRUE_POS_BEGIN = 69857 + int(
|
||||
WakewordExtractor.TRIM_SECONDS * audio.sample_rate *
|
||||
audio.sample_width)
|
||||
TRUE_POS_END = 89138 - int(
|
||||
WakewordExtractor.TRIM_SECONDS * audio.sample_rate *
|
||||
audio.sample_width)
|
||||
def test_word_extraction(self):
|
||||
"""
|
||||
This is intended to test the extraction of the word: ``mycroft``.
|
||||
The values for ``ideal_begin`` and ``ideal_end`` were found using an
|
||||
audio tool like Audacity and they represent a sample value position of
|
||||
the audio. ``tolerance`` is an acceptable margin error for the distance
|
||||
between the ideal and actual values found by the ``WordExtractor``
|
||||
"""
|
||||
|
||||
TOLERANCE_RANGE_FRAMES = (
|
||||
WakewordExtractor.MAX_ERROR_SECONDS * audio.sample_rate *
|
||||
audio.sample_width)
|
||||
audio = self.__create_sample_from_test_file('weather_mycroft')
|
||||
self.queue.put(audio)
|
||||
tolerance = 4000
|
||||
ideal_begin = 70000
|
||||
ideal_end = 92000
|
||||
|
||||
monitor = {}
|
||||
self.recognizer.set_transcriptions(
|
||||
["what's the weather next week", ""])
|
||||
self.recognizer.set_transcriptions(["what's the weather next week"])
|
||||
|
||||
def wakeword_callback(message):
|
||||
monitor['pos_begin'] = message.get('pos_begin')
|
||||
monitor['pos_end'] = message.get('pos_end')
|
||||
|
||||
self.loop.once('recognizer_loop:wakeword', wakeword_callback)
|
||||
self.consumer.try_consume_audio()
|
||||
self.consumer.read_audio()
|
||||
|
||||
pos_begin = monitor.get('pos_begin')
|
||||
self.assertIsNotNone(pos_begin)
|
||||
diff = abs(pos_begin - TRUE_POS_BEGIN)
|
||||
actual_begin = monitor.get('pos_begin')
|
||||
self.assertIsNotNone(actual_begin)
|
||||
diff = abs(actual_begin - ideal_begin)
|
||||
self.assertTrue(
|
||||
diff <= TOLERANCE_RANGE_FRAMES,
|
||||
str(diff) + " is not less than " + str(TOLERANCE_RANGE_FRAMES))
|
||||
diff <= tolerance,
|
||||
str(diff) + " is not less than " + str(tolerance))
|
||||
|
||||
pos_end = monitor.get('pos_end')
|
||||
self.assertIsNotNone(pos_end)
|
||||
diff = abs(pos_end - TRUE_POS_END)
|
||||
actual_end = monitor.get('pos_end')
|
||||
self.assertIsNotNone(actual_end)
|
||||
diff = abs(actual_end - ideal_end)
|
||||
self.assertTrue(
|
||||
diff <= TOLERANCE_RANGE_FRAMES,
|
||||
str(diff) + " is not less than " + str(TOLERANCE_RANGE_FRAMES))
|
||||
diff <= tolerance,
|
||||
str(diff) + " is not less than " + str(tolerance))
|
||||
|
||||
def test_wakeword_in_beginning(self):
|
||||
self.queue.put(self.__create_sample_from_test_file('mycroft'))
|
||||
self.queue.put(self.__create_sample_from_test_file('weather_mycroft'))
|
||||
self.recognizer.set_transcriptions(["what's the weather next week"])
|
||||
monitor = {}
|
||||
self.recognizer.set_transcriptions([
|
||||
"what's the weather next week", ""])
|
||||
|
||||
def callback(message):
|
||||
monitor['utterances'] = message.get('utterances')
|
||||
|
||||
self.loop.once('recognizer_loop:utterance', callback)
|
||||
self.consumer.try_consume_audio()
|
||||
self.consumer.read_audio()
|
||||
|
||||
utterances = monitor.get('utterances')
|
||||
self.assertIsNotNone(utterances)
|
||||
self.assertTrue(len(utterances) == 1)
|
||||
self.assertEquals("what's the weather next week", utterances[0])
|
||||
|
||||
def test_wakeword_in_phrase(self):
|
||||
def test_wakeword(self):
|
||||
self.queue.put(self.__create_sample_from_test_file('mycroft'))
|
||||
self.recognizer.set_transcriptions(["silence"])
|
||||
monitor = {}
|
||||
self.recognizer.set_transcriptions([
|
||||
"he can do other stuff too", "what's the weather in cincinnati"])
|
||||
|
||||
def callback(message):
|
||||
monitor['utterances'] = message.get('utterances')
|
||||
|
||||
self.loop.once('recognizer_loop:utterance', callback)
|
||||
self.consumer.try_consume_audio()
|
||||
self.consumer.read_audio()
|
||||
|
||||
utterances = monitor.get('utterances')
|
||||
self.assertIsNotNone(utterances)
|
||||
self.assertTrue(len(utterances) == 2)
|
||||
self.assertEquals("he can do other stuff too", utterances[0])
|
||||
self.assertEquals("what's the weather in cincinnati", utterances[1])
|
||||
self.assertTrue(len(utterances) == 1)
|
||||
self.assertEquals("silence", utterances[0])
|
||||
|
||||
def test_call_and_response(self):
|
||||
def test_ignore_wakeword_when_sleeping(self):
|
||||
self.queue.put(self.__create_sample_from_test_file('mycroft'))
|
||||
self.recognizer.set_transcriptions(["not detected"])
|
||||
self.loop.sleep()
|
||||
monitor = {}
|
||||
self.recognizer.set_transcriptions(["mycroft", ""])
|
||||
|
||||
def wakeword_callback(message):
|
||||
monitor['wakeword'] = message.get('utterance')
|
||||
|
||||
self.loop.once('recognizer_loop:wakeword', wakeword_callback)
|
||||
self.consumer.read_audio()
|
||||
self.assertIsNone(monitor.get('wakeword'))
|
||||
self.assertTrue(self.loop.state.sleeping)
|
||||
|
||||
def test_wakeup(self):
|
||||
self.queue.put(self.__create_sample_from_test_file('mycroft_wakeup'))
|
||||
self.loop.sleep()
|
||||
self.consumer.read_audio()
|
||||
self.assertFalse(self.loop.state.sleeping)
|
||||
|
||||
def test_call_and_response(self):
|
||||
self.queue.put(self.__create_sample_from_test_file('mycroft'))
|
||||
self.recognizer.set_transcriptions(["silence"])
|
||||
monitor = {}
|
||||
|
||||
def wakeword_callback(message):
|
||||
monitor['wakeword'] = message.get('utterance')
|
||||
|
||||
self.loop.once('recognizer_loop:wakeword', wakeword_callback)
|
||||
self.consumer.read_audio()
|
||||
self.assertIsNotNone(monitor.get('wakeword'))
|
||||
|
||||
self.queue.put(self.__create_sample_from_test_file('weather_mycroft'))
|
||||
self.recognizer.set_transcriptions(["what's the weather next week"])
|
||||
|
||||
def utterance_callback(message):
|
||||
monitor['utterances'] = message.get('utterances')
|
||||
|
||||
self.loop.once('recognizer_loop:wakeword', wakeword_callback)
|
||||
self.consumer.try_consume_audio()
|
||||
|
||||
self.assertIsNotNone(monitor.get('wakeword'))
|
||||
|
||||
self.queue.put(self.__create_sample_from_test_file('mycroft'))
|
||||
self.recognizer.set_transcriptions(
|
||||
["what's the weather next week", ""])
|
||||
self.loop.once('recognizer_loop:utterance', utterance_callback)
|
||||
self.consumer.try_consume_audio()
|
||||
self.consumer.read_audio()
|
||||
|
||||
utterances = monitor.get('utterances')
|
||||
self.assertIsNotNone(utterances)
|
||||
self.assertTrue(len(utterances) == 1)
|
||||
self.assertEquals("what's the weather next week", utterances[0])
|
||||
|
||||
def test_ignore_wakeword_when_sleeping(self):
|
||||
self.queue.put(self.__create_sample_from_test_file('mycroft'))
|
||||
self.loop.sleep()
|
||||
monitor = {}
|
||||
self.recognizer.set_transcriptions(["", ""])
|
||||
|
||||
def wakeword_callback(message):
|
||||
monitor['wakeword'] = message.get('utterance')
|
||||
|
||||
self.loop.once('recognizer_loop:wakeword', wakeword_callback)
|
||||
self.consumer.try_consume_audio()
|
||||
|
||||
self.assertIsNone(monitor.get('wakeword'))
|
||||
self.assertTrue(self.loop.state.sleeping)
|
||||
|
|
Binary file not shown.
|
@ -0,0 +1,31 @@
|
|||
import unittest
|
||||
|
||||
import os
|
||||
from speech_recognition import WavFile
|
||||
|
||||
from mycroft.client.speech.local_recognizer import LocalRecognizer
|
||||
|
||||
__author__ = 'seanfitz'
|
||||
|
||||
DATA_DIR = os.path.join(os.path.abspath(os.path.dirname(__file__)), "data")
|
||||
|
||||
|
||||
class LocalRecognizerTest(unittest.TestCase):
    """Checks that ``LocalRecognizer`` hears "mycroft" in the sample files."""

    def setUp(self):
        self.recognizer = LocalRecognizer()

    def _assert_mycroft_heard(self, wav_name):
        """Transcribe ``wav_name`` from ``DATA_DIR`` and assert that the
        transcription contains "mycroft"."""
        source = WavFile(os.path.join(DATA_DIR, wav_name))
        with source as audio:
            hyp = self.recognizer.transcribe(audio.stream.read())
        # unittest assertions are not stripped under ``python -O`` and report
        # the offending values, unlike a bare ``assert``.
        self.assertIsNotNone(hyp, "no hypothesis for " + wav_name)
        self.assertIn("mycroft", hyp.hypstr.lower())

    def testRecognizerWrapper(self):
        self._assert_mycroft_heard("hey_mycroft.wav")
        self._assert_mycroft_heard("mycroft.wav")

    def testRecognitionInLongerUtterance(self):
        self._assert_mycroft_heard("weather_mycroft.wav")
|
|
@ -1,32 +0,0 @@
|
|||
from speech_recognition import WavFile
|
||||
import os
|
||||
|
||||
from mycroft.client.speech import wakeword_recognizer
|
||||
|
||||
import unittest
|
||||
|
||||
|
||||
__author__ = 'seanfitz'
|
||||
|
||||
DATA_DIR = os.path.join(os.path.abspath(os.path.dirname(__file__)), "data")
|
||||
|
||||
|
||||
class WakewordRecognizerTest(unittest.TestCase):
    """Checks that the wakeword recognizer transcribes the sample files as
    "hey mycroft"."""

    def setUp(self):
        self.ww_recognizer = wakeword_recognizer.create_recognizer()

    def _assert_hey_mycroft(self, wav_name):
        """Transcribe ``wav_name`` from ``DATA_DIR`` and assert that the
        lower-cased transcription is exactly "hey mycroft"."""
        source = WavFile(os.path.join(DATA_DIR, wav_name))
        with source as audio:
            hyp = self.ww_recognizer.transcribe(audio.stream.read())
        # unittest assertions are not stripped under ``python -O`` and report
        # the offending values, unlike a bare ``assert``.
        self.assertIsNotNone(hyp, "no hypothesis for " + wav_name)
        self.assertEqual(hyp.hypstr.lower(), "hey mycroft")

    def testRecognizerWrapper(self):
        self._assert_hey_mycroft("hey_mycroft.wav")
        self._assert_hey_mycroft("mycroft.wav")

    def testRecognitionInLongerUtterance(self):
        self._assert_hey_mycroft("mycroft_in_utterance.wav")
|
Loading…
Reference in New Issue