Rewrote listener

pull/172/head
Wolfgange3311999 2016-06-12 18:06:07 -05:00 committed by Matthew Scholefield
parent 3870e3c9c7
commit b1900c3d81
7 changed files with 185 additions and 186 deletions

View File

@@ -171,16 +171,17 @@ class Enclosure:
def __register_events(self):
self.client.on('mycroft.paired', self.__update_events)
self.client.on('recognizer_loop:wakeword', self.eyes.blink)
self.__register_mouth_events()
def __register_mouth_events(self):
self.client.on('recognizer_loop:listening', self.mouth.listen)
self.client.on('recognizer_loop:record_begin', self.mouth.listen)
self.client.on('recognizer_loop:record_end', self.mouth.reset)
self.client.on('recognizer_loop:audio_output_start', self.mouth.talk)
self.client.on('recognizer_loop:audio_output_end', self.mouth.reset)
def __remove_mouth_events(self):
self.client.remove('recognizer_loop:listening', self.mouth.listen)
self.client.remove('recognizer_loop:record_begin', self.mouth.listen)
self.client.remove('recognizer_loop:record_end', self.mouth.reset)
self.client.remove('recognizer_loop:audio_output_start',
self.mouth.talk)
self.client.remove('recognizer_loop:audio_output_end',

View File

@@ -24,13 +24,12 @@ import pyee
import speech_recognition as sr
from mycroft.client.speech.local_recognizer import LocalRecognizer
from mycroft.client.speech.mic import MutableMicrophone, Recognizer
from mycroft.client.speech.mic import MutableMicrophone, ResponsiveRecognizer
from mycroft.client.speech.recognizer_wrapper import \
RemoteRecognizerWrapperFactory
from mycroft.client.speech.word_extractor import WordExtractor
from mycroft.configuration import ConfigurationManager
from mycroft.messagebus.message import Message
from mycroft.metrics import MetricsAggregator, Stopwatch
from mycroft.metrics import MetricsAggregator
from mycroft.session import SessionManager
from mycroft.util import CerberusAccessDenied
from mycroft.util.log import getLogger
@@ -62,8 +61,7 @@ class AudioProducer(threading.Thread):
self.recognizer.adjust_for_ambient_noise(source)
while self.state.running:
try:
self.emitter.emit("recognizer_loop:listening")
audio = self.recognizer.listen(source)
audio = self.recognizer.listen(source, self.emitter)
self.queue.put(audio)
except IOError, ex:
# NOTE: Audio stack on raspi is slightly different, throws
@@ -105,73 +103,28 @@ class AudioConsumer(threading.Thread):
audio.sample_rate * audio.sample_width)
def read_audio(self):
timer = Stopwatch()
audio = self.queue.get()
self.metrics.timer("mycroft.recognizer.audio.length_s",
self._audio_length(audio))
self.queue.task_done()
timer.start()
audio_data = self.queue.get()
if self.state.sleeping:
self.process_wake_up(audio)
elif self.state.skip_wakeword:
self.process_skip_wake_word(audio)
self.try_wake_up(audio_data)
else:
self.process_wake_word(audio, timer)
self.process_audio(audio_data)
self.metrics.flush()
def try_wake_up(self, audio_segments):
for segment in audio_segments:
if self.wakeup_recognizer.is_recognized(segment.frame_data,
self.metrics):
SessionManager.touch()
self.state.sleeping = False
self.__speak("I'm awake.") # TODO: Localization
self.metrics.increment("mycroft.wakeup")
def process_wake_up(self, audio):
if self.wakeup_recognizer.is_recognized(audio.frame_data,
self.metrics):
SessionManager.touch()
self.state.sleeping = False
self.__speak("I'm awake.") # TODO: Localization
self.metrics.increment("mycroft.wakeup")
def process_wake_word(self, audio, timer):
hyp = self.mycroft_recognizer.transcribe(audio.frame_data,
self.metrics)
if self.mycroft_recognizer.contains(hyp):
extractor = WordExtractor(audio, self.mycroft_recognizer,
self.metrics)
timer.lap()
extractor.calculate_range()
self.metrics.timer("mycroft.recognizer.extractor.time_s",
timer.lap())
audio_before = extractor.get_audio_data_before()
self.metrics.timer("mycroft.recognizer.audio_extracted.length_s",
self._audio_length(audio_before))
audio_after = extractor.get_audio_data_after()
self.metrics.timer("mycroft.recognizer.audio_extracted.length_s",
self._audio_length(audio_after))
SessionManager.touch()
payload = {
'utterance': hyp.hypstr,
'session': SessionManager.get().session_id,
'pos_begin': extractor.begin,
'pos_end': extractor.end
}
self.emitter.emit("recognizer_loop:wakeword", payload)
try:
self.transcribe([audio_before, audio_after])
except sr.UnknownValueError:
self.__speak("Go ahead")
self.state.skip_wakeword = True
self.metrics.increment("mycroft.wakeword")
def process_skip_wake_word(self, audio):
SessionManager.touch()
def process_audio(self, audio):
try:
self.transcribe([audio])
except sr.UnknownValueError:
except sr.UnknownValueError: # TODO: Localization
logger.warn("Speech Recognition could not understand audio")
self.__speak("Sorry, I didn't catch that.")
self.metrics.increment("mycroft.recognizer.error")
self.state.skip_wakeword = False
def __speak(self, utterance):
payload = {
@@ -184,18 +137,18 @@ class AudioConsumer(threading.Thread):
def runnable():
try:
text = self.remote_recognizer.transcribe(
audio, metrics=self.metrics).lower()
audio, self.metrics).lower()
except sr.UnknownValueError:
pass
except sr.RequestError as e:
logger.error(
"Could not request results from Speech Recognition "
"service; {0}".format(e))
"Could not request results from Speech Recognition "
"service; {0}".format(e))
except CerberusAccessDenied as e:
logger.error("AccessDenied from Cerberus proxy.")
self.__speak(
"Your device is not registered yet. To start pairing, "
"login at cerberus dot mycroft dot A.I")
"Your device is not registered yet. To start pairing, "
"login at cerberus dot mycroft dot A.I")
utterances.append("pair my device")
except Exception as e:
logger.error("Unexpected exception: {0}".format(e))
@@ -253,7 +206,7 @@ class RecognizerLoop(pyee.EventEmitter):
self.mycroft_recognizer = LocalRecognizer(sample_rate, lang)
# TODO - localization
self.wakeup_recognizer = LocalRecognizer(sample_rate, lang, "wake up")
self.remote_recognizer = Recognizer()
self.remote_recognizer = ResponsiveRecognizer(self.mycroft_recognizer)
self.state = RecognizerLoopState()
def start_async(self):
@@ -270,7 +223,7 @@ class RecognizerLoop(pyee.EventEmitter):
self.wakeup_recognizer,
self.mycroft_recognizer,
RemoteRecognizerWrapperFactory.wrap_recognizer(
self.remote_recognizer)).start()
self.remote_recognizer)).start()
def stop(self):
self.state.running = False

View File

@@ -59,5 +59,5 @@ class LocalRecognizer(object):
hyp = self.transcribe(byte_data, metrics)
return hyp and self.key_phrase in hyp.hypstr.lower()
def contains(self, hypothesis):
def found_wake_word(self, hypothesis):
return hypothesis and self.key_phrase in hypothesis.hypstr.lower()

View File

@@ -35,14 +35,14 @@ loop = None
config = ConfigurationManager.get()
def handle_listening():
logger.info("Listening...")
client.emit(Message('recognizer_loop:listening'))
def handle_record_begin():
logger.info("Begin Recording...")
client.emit(Message('recognizer_loop:record_begin'))
def handle_wakeword(event):
logger.info("Wakeword Detected: " + event['utterance'])
client.emit(Message('recognizer_loop:wakeword', event))
def handle_record_end():
logger.info("End Recording...")
client.emit(Message('recognizer_loop:record_end'))
def handle_utterance(event):
@@ -93,9 +93,9 @@ def main():
if device_index:
device_index = int(device_index)
loop = RecognizerLoop(device_index=device_index)
loop.on('recognizer_loop:listening', handle_listening)
loop.on('recognizer_loop:wakeword', handle_wakeword)
loop.on('recognizer_loop:utterance', handle_utterance)
loop.on('recognizer_loop:record_begin', handle_record_begin)
loop.on('recognizer_loop:record_end', handle_record_end)
loop.on('speak', handle_speak)
client.on('speak', handle_speak)
client.on(

View File

@@ -17,7 +17,6 @@
import collections
import math
import audioop
from time import sleep
@@ -30,6 +29,7 @@ from speech_recognition import (
)
import speech_recognition
from mycroft.util.log import getLogger
logger = getLogger(__name__)
__author__ = 'seanfitz'
@@ -116,123 +116,168 @@ class MutableMicrophone(Microphone):
self.stream.unmute()
class Recognizer(speech_recognition.Recognizer):
def __init__(self):
class ResponsiveRecognizer(speech_recognition.Recognizer):
# The maximum audio in seconds to keep for transcribing a phrase
# The wake word must fit in this time
SAVED_WW_SEC = 1.0
# Padding of silence when feeding to pocketsphinx
SILENCE_SEC = 0.01
# The minimum seconds of noise before a
# phrase can be considered complete
MIN_LOUD_SEC_PER_PHRASE = 0.2
# The maximum length a phrase can be recorded,
# provided there is noise the entire time
RECORDING_TIMEOUT = 30.0
# Time between pocketsphinx checks for the wake word
SEC_BETWEEN_WW_CHECKS = 0.2
def __init__(self, wake_word_recognizer):
speech_recognition.Recognizer.__init__(self)
self.daemon = True
self.max_audio_length_sec = 30
def listen(self, source, timeout=None):
self.wake_word_recognizer = wake_word_recognizer
self.audio = pyaudio.PyAudio()
@staticmethod
def record_sound_chunk(source):
return source.stream.read(source.CHUNK)
@staticmethod
def calc_energy(sound_chunk, sample_width):
return audioop.rms(sound_chunk, sample_width)
def wake_word_in_audio(self, frame_data):
hyp = self.wake_word_recognizer.transcribe(frame_data)
return self.wake_word_recognizer.found_wake_word(hyp)
def record_phrase(self, source, sec_per_buffer):
"""
Records a single phrase from ``source`` (an ``AudioSource`` instance)
into an ``AudioData`` instance, which it returns.
This attempts to record an entire spoken phrase. Essentially,
this waits for a period of silence and then returns the audio
This is done by waiting until the audio has an energy above
``recognizer_instance.energy_threshold`` (the user has started
speaking), and then recording until it encounters
``recognizer_instance.pause_threshold`` seconds of non-speaking or
there is no more audio input. The ending silence is not included.
The ``timeout`` parameter is the maximum number of seconds that it
will wait for a phrase to start before giving up and throwing an
``speech_recognition.WaitTimeoutError`` exception. If ``timeout`` is
``None``, it will wait indefinitely.
:rtype: bytearray
:param source: AudioSource
:param sec_per_buffer: Based on source.SAMPLE_RATE
:return: bytearray representing the frame_data of the recorded phrase
"""
assert isinstance(source, AudioSource), \
"Source must be an audio source"
assert self.pause_threshold >= self.non_speaking_duration >= 0
num_loud_chunks = 0
noise = 0
seconds_per_buffer = (source.CHUNK + 0.0) / source.SAMPLE_RATE
# number of buffers of non-speaking audio before the phrase is
# complete
pause_buffer_count = int(
math.ceil(self.pause_threshold / seconds_per_buffer))
# minimum number of buffers of speaking audio before we consider the
# speaking audio a phrase
phrase_buffer_count = int(math.ceil(self.phrase_threshold /
seconds_per_buffer))
# maximum number of buffers of non-speaking audio to retain before and
# after
non_speaking_buffer_count = int(math.ceil(self.non_speaking_duration /
seconds_per_buffer))
max_noise = 20
min_noise = 0
# read audio input for phrases until there is a phrase that is long
# enough
elapsed_time = 0 # number of seconds of audio read
while True:
frames = collections.deque()
def increase_noise(level):
if level < max_noise:
return level + 2
return level
# store audio input until the phrase starts
while True:
elapsed_time += seconds_per_buffer
# handle timeout if specified
if timeout and elapsed_time > timeout:
raise WaitTimeoutError("listening timed out")
def decrease_noise(level):
if level > min_noise:
return level - 1
return level
buffer = source.stream.read(source.CHUNK)
if len(buffer) == 0:
break # reached end of the stream
frames.append(buffer)
# ensure we only keep the needed amount of non-speaking buffers
if len(frames) > non_speaking_buffer_count:
frames.popleft()
# Smallest number of loud chunks required to return
min_loud_chunks = int(self.MIN_LOUD_SEC_PER_PHRASE / sec_per_buffer)
# detect whether speaking has started on audio input
# energy of the audio signal
energy = audioop.rms(buffer, source.SAMPLE_WIDTH)
if energy > self.energy_threshold:
break
# bytearray to store audio in
byte_data = '\0' * source.SAMPLE_WIDTH
# dynamically adjust the energy threshold using asymmetric
# weighted average
# do not adjust dynamic energy level for this sample if it is
# muted audio (energy == 0)
self.adjust_energy_threshold(energy, seconds_per_buffer)
# read audio input until the phrase ends
pause_count, phrase_count = 0, 0
while True:
elapsed_time += seconds_per_buffer
phrase_complete = False
while not phrase_complete:
chunk = self.record_sound_chunk(source)
byte_data = byte_data + chunk
buffer = source.stream.read(source.CHUNK)
if len(buffer) == 0:
break # reached end of the stream
frames.append(buffer)
phrase_count += 1
energy = self.calc_energy(chunk, source.SAMPLE_WIDTH)
is_loud = energy > self.energy_threshold
if is_loud:
noise = increase_noise(noise)
num_loud_chunks += 1
else:
noise = decrease_noise(noise)
self.adjust_threshold(energy, sec_per_buffer)
# check if speaking has stopped for longer than the pause
# threshold on the audio input
# energy of the audio signal
energy = audioop.rms(buffer, source.SAMPLE_WIDTH)
if energy > self.energy_threshold:
pause_count = 0
else:
pause_count += 1
if pause_count > pause_buffer_count: # end of the phrase
break
if noise <= min_noise and num_loud_chunks > min_loud_chunks:
phrase_complete = True
if (len(frames) * seconds_per_buffer >=
self.max_audio_length_sec):
# if we hit the end of the audio length, readjust
# energy_threshold
for frame in frames:
energy = audioop.rms(frame, source.SAMPLE_WIDTH)
self.adjust_energy_threshold(
energy, seconds_per_buffer)
break
return byte_data
# check how long the detected phrase is, and retry listening if
# the phrase is too short
phrase_count -= pause_count
if phrase_count >= phrase_buffer_count:
break # phrase is long enough, stop listening
@staticmethod
def sec_to_bytes(sec, source):
return sec * source.SAMPLE_RATE * source.SAMPLE_WIDTH
# obtain frame data
for i in range(pause_count - non_speaking_buffer_count):
frames.pop() # remove extra non-speaking frames at the end
frame_data = b"".join(list(frames))
def wait_until_wake_word(self, source, sec_per_buffer):
num_silent_bytes = int(self.SILENCE_SEC * source.SAMPLE_RATE *
source.SAMPLE_WIDTH)
return AudioData(frame_data, source.SAMPLE_RATE, source.SAMPLE_WIDTH)
silence = '\0' * num_silent_bytes
def adjust_energy_threshold(self, energy, seconds_per_buffer):
# bytearray to store audio in
byte_data = silence
buffers_per_check = self.SEC_BETWEEN_WW_CHECKS / sec_per_buffer
buffers_since_check = 0.0
# Max bytes for byte_data before audio is removed from the front
max_size = self.sec_to_bytes(self.SAVED_WW_SEC, source)
said_wake_word = False
while not said_wake_word:
chunk = self.record_sound_chunk(source)
energy = self.calc_energy(chunk, source.SAMPLE_WIDTH)
if energy < self.energy_threshold:
self.adjust_threshold(energy, sec_per_buffer)
needs_to_grow = len(byte_data) < max_size
if needs_to_grow:
byte_data = byte_data + chunk
else: # Remove beginning of audio and add new chunk to end
byte_data = byte_data[len(chunk):] + chunk
buffers_since_check += 1.0
if buffers_since_check < buffers_per_check:
buffers_since_check -= buffers_per_check
said_wake_word = self.wake_word_in_audio(byte_data + silence)
@staticmethod
def create_audio_data(raw_data, source):
"""
Constructs an AudioData instance with the same parameters
as the source and the specified frame_data
"""
return AudioData(raw_data, source.SAMPLE_RATE, source.SAMPLE_WIDTH)
def listen(self, source, emitter):
"""
Listens for audio that Mycroft should respond to
:param source: an ``AudioSource`` instance for reading from
:param emitter: a pyee EventEmitter for sending when the wakeword
has been found
"""
assert isinstance(source, AudioSource), "Source must be an AudioSource"
bytes_per_sec = source.SAMPLE_RATE * source.SAMPLE_WIDTH
sec_per_buffer = float(source.CHUNK) / bytes_per_sec
logger.debug("Waiting for wake word...")
self.wait_until_wake_word(source, sec_per_buffer)
logger.debug("Recording...")
emitter.emit("recognizer_loop:record_begin")
frame_data = self.record_phrase(source, sec_per_buffer)
audio_data = self.create_audio_data(frame_data, source)
emitter.emit("recognizer_loop:record_end")
logger.debug("Thinking...")
return audio_data
def adjust_threshold(self, energy, seconds_per_buffer):
if self.dynamic_energy_threshold and energy > 0:
# account for different chunk sizes and rates
damping = (

View File

@@ -70,7 +70,7 @@ class WordExtractor:
self.audio.sample_width)
def get_audio_data_after(self):
byte_data = self.silence_data + self.audio.frame_data[
self.end:self.audio_size]
byte_data = self.silence_data + self.audio.frame_data[self.end:
self.audio_size]
return AudioData(byte_data, self.audio.sample_rate,
self.audio.sample_width)

View File

@@ -33,7 +33,7 @@ class PairingSkill(MycroftSkill):
def initialize(self):
intent = IntentBuilder("PairingIntent").require(
"DevicePairingPhrase").build()
"DevicePairingPhrase").build()
self.load_data_files(dirname(__file__))
self.register_intent(intent, handler=self.handle_pairing_request)
@@ -46,8 +46,8 @@ class PairingSkill(MycroftSkill):
self.emitter.on("recognizer_loop:audio_output_start",
self.__display_pairing_code)
self.speak_dialog(
"pairing.instructions",
data={"pairing_code": ', ,'.join(self.client.pairing_code)})
"pairing.instructions",
data={"pairing_code": ', ,'.join(self.client.pairing_code)})
def __display_pairing_code(self, event=None):
if self.client.paired: