This fixes several issues related to wake-up words and audio recording, including #595.

* The wake-up sound is now played synchronously, so it is no longer captured in the recorded audio
* A recorded phrase must now contain at least 0.5 seconds of noise (up from 0.1), and must end with at least 0.25 seconds of continuous silence before it is considered complete (see the first sketch below)
* The silence threshold is reset every time we begin listening for a wake word
* While waiting for the wake word, the silence threshold adjusts upward to sit just above the ambient sound level (see the second sketch below)
* Reformatted some comments as Google-style docstrings, and added more comments
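
Two short sketches illustrate the new behavior in isolation. Both are simplified stand-ins written for this summary, not code from the commit: the names phrase_is_complete, is_loud, and RollingEnergy are hypothetical, while the constants and the 1.5x/1.2x factors mirror the diff below.

First, the end-of-phrase rule in _record_phrase(): a phrase is complete once enough noise has been heard and the audio then stays quiet for an unbroken stretch at the end. A minimal sketch, assuming a chunk iterator and a hypothetical is_loud() helper:

    # Constants mirror the diff below; everything else is illustrative.
    MIN_LOUD_SEC_PER_PHRASE = 0.5   # minimum seconds of noise in a phrase
    MIN_SILENCE_AT_END = 0.25       # minimum unbroken trailing silence
    RECORDING_TIMEOUT = 10.0        # hard cap on recording length

    def phrase_is_complete(chunks, sec_per_buffer, is_loud):
        """Return True once a spoken phrase has plausibly ended."""
        loud_sec = silence_sec = elapsed = 0.0
        for chunk in chunks:
            elapsed += sec_per_buffer
            if is_loud(chunk):
                loud_sec += sec_per_buffer
                silence_sec = 0.0     # any noise restarts the silence run
            else:
                silence_sec += sec_per_buffer
            enough_noise = loud_sec >= MIN_LOUD_SEC_PER_PHRASE
            quiet_enough = silence_sec >= MIN_SILENCE_AT_END
            if (enough_noise and quiet_enough) or elapsed >= RECORDING_TIMEOUT:
                return True
        return False

Note that a single loud chunk resets the trailing-silence counter, so the recording only ends on a genuinely unbroken quiet stretch.

Second, the upward threshold adjustment in _wait_until_wake_word(): a rolling window of roughly five seconds of energy readings maintains a running ambient average, and a reading that is near that average but above the current threshold bumps the threshold to just above it. A sketch under the same caveats, with the RollingEnergy class purely illustrative:

    class RollingEnergy(object):
        def __init__(self, sec_per_buffer, window_sec=5.0):
            self.size = int(window_sec / sec_per_buffer)
            self.energies = []
            self.idx = 0
            self.avg = 0.0

        def adjust(self, energy, threshold):
            if len(self.energies) < self.size:
                # still filling the window; just build the average
                self.energies.append(energy)
                self.avg += float(energy) / self.size
                return threshold
            # replace the oldest sample, keeping a running average
            self.avg -= float(self.energies[self.idx]) / self.size
            self.avg += float(energy) / self.size
            self.energies[self.idx] = energy
            self.idx = (self.idx + 1) % self.size
            # not a transient spike (>1.5x ambient) but above the current
            # threshold: raise the threshold to just above the ambient level
            if energy < self.avg * 1.5 and energy > threshold:
                threshold = energy * 1.2
            return threshold

In the commit itself this logic runs inline in the wake-word loop, alongside the existing downward adjustment performed by _adjust_threshold().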
pull/660/head
penrods 2017-04-09 16:34:48 -07:00 committed by Steve Penrod
parent 0036f404b5
commit 23c445a904
2 changed files with 111 additions and 39 deletions

mycroft/client/speech/main.py

@@ -27,7 +27,7 @@ from mycroft.identity import IdentityManager
 from mycroft.messagebus.client.ws import WebsocketClient
 from mycroft.messagebus.message import Message
 from mycroft.tts import TTSFactory
-from mycroft.util import kill, play_wav, resolve_resource_file, create_signal
+from mycroft.util import kill, create_signal
 from mycroft.util.log import getLogger
 from mycroft.lock import Lock as PIDLock  # Create/Support PID locking file
@@ -42,15 +42,6 @@ config = ConfigurationManager.get()

 def handle_record_begin():
     logger.info("Begin Recording...")
-    # If enabled, play a wave file with a short sound to audibly
-    # indicate recording has begun.
-    if config.get('confirm_listening'):
-        file = resolve_resource_file(
-            config.get('sounds').get('start_listening'))
-        if file:
-            play_wav(file)
     ws.emit(Message('recognizer_loop:record_begin'))

mycroft/client/speech/mic.py

@@ -30,10 +30,16 @@ from speech_recognition import (
 )
 from mycroft.configuration import ConfigurationManager
-from mycroft.util import check_for_signal, get_ipc_directory
+from mycroft.util import (
+    check_for_signal,
+    get_ipc_directory,
+    resolve_resource_file,
+    play_wav
+)
 from mycroft.util.log import getLogger

-listener_config = ConfigurationManager.get().get('listener')
+config = ConfigurationManager.get()
+listener_config = config.get('listener')
 logger = getLogger(__name__)

 __author__ = 'seanfitz'
@@ -130,9 +136,13 @@ class ResponsiveRecognizer(speech_recognition.Recognizer):
     # The minimum seconds of noise before a
     # phrase can be considered complete
-    MIN_LOUD_SEC_PER_PHRASE = 0.1
+    MIN_LOUD_SEC_PER_PHRASE = 0.5

-    # The maximum length a phrase can be recorded,
+    # The minimum seconds of silence required at the end
+    # before a phrase will be considered complete
+    MIN_SILENCE_AT_END = 0.25
+
+    # The maximum seconds a phrase can be recorded,
     # provided there is noise the entire time
     RECORDING_TIMEOUT = 10.0
@@ -163,22 +173,30 @@ class ResponsiveRecognizer(speech_recognition.Recognizer):
         hyp = self.wake_word_recognizer.transcribe(frame_data)
         return self.wake_word_recognizer.found_wake_word(hyp)

-    def record_phrase(self, source, sec_per_buffer):
-        """
-        This attempts to record an entire spoken phrase. Essentially,
-        this waits for a period of silence and then returns the audio
+    def _record_phrase(self, source, sec_per_buffer):
+        """Record an entire spoken phrase.

-        :rtype: bytearray
-        :param source: AudioSource
-        :param sec_per_buffer: Based on source.SAMPLE_RATE
-        :return: bytearray representing the frame_data of the recorded phrase
+        Essentially, this code waits for a period of silence and then returns
+        the audio.  If silence isn't detected, it will terminate and return
+        a buffer of RECORDING_TIMEOUT duration.
+
+        Args:
+            source (AudioSource): Source producing the audio chunks
+            sec_per_buffer (float): Fractional number of seconds in each chunk
+
+        Returns:
+            bytearray: complete audio buffer recorded, including any
+                       silence at the end of the user's utterance
         """
         num_loud_chunks = 0
         noise = 0

         max_noise = 25
         min_noise = 0

+        silence_duration = 0
+
         def increase_noise(level):
             if level < max_noise:
                 return level + 200 * sec_per_buffer
@@ -217,7 +235,7 @@ class ResponsiveRecognizer(speech_recognition.Recognizer):
                 num_loud_chunks += 1
             else:
                 noise = decrease_noise(noise)
-                self.adjust_threshold(energy, sec_per_buffer)
+                self._adjust_threshold(energy, sec_per_buffer)

             if num_chunks % 10 == 0:
                 with open(self.mic_level_file, 'w') as f:
@@ -226,7 +244,14 @@ class ResponsiveRecognizer(speech_recognition.Recognizer):
                 f.close()

             was_loud_enough = num_loud_chunks > min_loud_chunks
+
             quiet_enough = noise <= min_noise
+            if quiet_enough:
+                silence_duration += sec_per_buffer
+                # require silence to last at least MIN_SILENCE_AT_END sec
+                if silence_duration < self.MIN_SILENCE_AT_END:
+                    quiet_enough = False
+            else:
+                silence_duration = 0
+
             recorded_too_much_silence = num_chunks > max_chunks_of_silence
             if quiet_enough and (was_loud_enough or recorded_too_much_silence):
                 phrase_complete = True
@@ -239,7 +264,13 @@ class ResponsiveRecognizer(speech_recognition.Recognizer):
     def sec_to_bytes(sec, source):
         return sec * source.SAMPLE_RATE * source.SAMPLE_WIDTH

-    def wait_until_wake_word(self, source, sec_per_buffer):
+    def _wait_until_wake_word(self, source, sec_per_buffer):
+        """Listen continuously on source until a wake word is spoken
+
+        Args:
+            source (AudioSource): Source producing the audio chunks
+            sec_per_buffer (float): Fractional number of seconds in each chunk
+        """
         num_silent_bytes = int(self.SILENCE_SEC * source.SAMPLE_RATE *
                                source.SAMPLE_WIDTH)
@@ -255,7 +286,17 @@ class ResponsiveRecognizer(speech_recognition.Recognizer):
         max_size = self.sec_to_bytes(self.SAVED_WW_SEC, source)

         said_wake_word = False

+        # Rolling buffer to track the audio energy (loudness) heard on
+        # the source recently.  An average audio energy is maintained
+        # based on these levels.
+        energies = []
+        idx_energy = 0
+        avg_energy = 0.0
+        energy_avg_samples = int(5 / sec_per_buffer)  # avg over last 5 secs
+
         counter = 0

         while not said_wake_word:
             if check_for_signal('buttonPress'):
                 said_wake_word = True
@@ -265,16 +306,33 @@ class ResponsiveRecognizer(speech_recognition.Recognizer):
             energy = self.calc_energy(chunk, source.SAMPLE_WIDTH)
             if energy < self.energy_threshold * self.multiplier:
-                self.adjust_threshold(energy, sec_per_buffer)
+                self._adjust_threshold(energy, sec_per_buffer)

-            if counter > 2:
+            if len(energies) < energy_avg_samples:
+                # build the average
+                energies.append(energy)
+                avg_energy += float(energy)/energy_avg_samples
+            else:
+                # maintain the running average and rolling buffer
+                avg_energy -= float(energies[idx_energy])/energy_avg_samples
+                avg_energy += float(energy)/energy_avg_samples
+                energies[idx_energy] = energy
+                idx_energy = (idx_energy+1) % energy_avg_samples
+
+                # maintain the threshold using average
+                if energy < avg_energy*1.5:
+                    if energy > self.energy_threshold:
+                        # bump the threshold to just above this value
+                        self.energy_threshold = energy*1.2
+
+            # Periodically output energy level stats.  This can be used to
+            # visualize the microphone input, e.g. a needle on a meter.
+            if counter % 3:
                 with open(self.mic_level_file, 'w') as f:
                     f.write("Energy: cur=" + str(energy) + " thresh=" +
                             str(self.energy_threshold))
                 f.close()
-                counter = 0
-            else:
-                counter += 1
+            counter += 1

             # At first, the buffer is empty and must fill up.  After that
             # just drop the first chunk bytes to keep it the same size.
@@ -290,7 +348,7 @@ class ResponsiveRecognizer(speech_recognition.Recognizer):
             said_wake_word = self.wake_word_in_audio(byte_data + silence)

     @staticmethod
-    def create_audio_data(raw_data, source):
+    def _create_audio_data(raw_data, source):
         """
         Constructs an AudioData instance with the same parameters
         as the source and the specified frame_data
@@ -298,31 +356,54 @@ class ResponsiveRecognizer(speech_recognition.Recognizer):
         return AudioData(raw_data, source.SAMPLE_RATE, source.SAMPLE_WIDTH)

     def listen(self, source, emitter):
-        """
-        Listens for audio that Mycroft should respond to
+        """Listens for chunks of audio that Mycroft should perform STT on.

-        :param source: an ``AudioSource`` instance for reading from
-        :param emitter: a pyee EventEmitter for sending when the wakeword
-                        has been found
+        This will listen continuously for a wake-up-word, then return the
+        audio chunk containing the spoken phrase that comes immediately
+        afterwards.
+
+        Args:
+            source (AudioSource): Source producing the audio chunks
+            emitter (EventEmitter): Emitter for notifications of when recording
+                                    begins and ends.
+
+        Returns:
+            AudioData: audio with the user's utterance, minus the wake-up-word
         """
         assert isinstance(source, AudioSource), "Source must be an AudioSource"

-        bytes_per_sec = source.SAMPLE_RATE * source.SAMPLE_WIDTH
+        # bytes_per_sec = source.SAMPLE_RATE * source.SAMPLE_WIDTH
         sec_per_buffer = float(source.CHUNK) / source.SAMPLE_RATE

+        # Every time a new 'listen()' request begins, reset the threshold
+        # used for silence detection.  This is as good of a reset point as
+        # any, as we expect the user and Mycroft to not be talking.
+        # NOTE: adjust_for_ambient_noise() doc claims it will stop early if
+        #       speech is detected, but there is no code to actually do that.
+        self.adjust_for_ambient_noise(source, 1.0)
+
         logger.debug("Waiting for wake word...")
-        self.wait_until_wake_word(source, sec_per_buffer)
+        self._wait_until_wake_word(source, sec_per_buffer)

         logger.debug("Recording...")
         emitter.emit("recognizer_loop:record_begin")

+        # If enabled, play a wave file with a short sound to audibly
+        # indicate recording has begun.
+        if config.get('confirm_listening'):
+            file = resolve_resource_file(
+                config.get('sounds').get('start_listening'))
+            if file:
+                play_wav(file)
+
-        frame_data = self.record_phrase(source, sec_per_buffer)
-        audio_data = self.create_audio_data(frame_data, source)
+        frame_data = self._record_phrase(source, sec_per_buffer)
+        audio_data = self._create_audio_data(frame_data, source)
         emitter.emit("recognizer_loop:record_end")

         logger.debug("Thinking...")
         return audio_data

-    def adjust_threshold(self, energy, seconds_per_buffer):
+    def _adjust_threshold(self, energy, seconds_per_buffer):
         if self.dynamic_energy_threshold and energy > 0:
             # account for different chunk sizes and rates
             damping = (