This fixes several issues related to wake-up words and audio recording, including #595.
* The wake-up sound is now played synchronously, and is thus not included in the recorded audio
* The minimum recorded phrase must be at least 0.5 seconds (instead of 0.1), and must be continuously quiet for that duration
* The silence threshold is reset every time we begin listening for a wake-word
* The silence threshold adjusts upward to just above the ambient sound while waiting for the wake-word
* Reformatted some comments to use Google-style docstrings, and added more comments
parent
0036f404b5
commit
23c445a904
|
@ -27,7 +27,7 @@ from mycroft.identity import IdentityManager
|
|||
from mycroft.messagebus.client.ws import WebsocketClient
|
||||
from mycroft.messagebus.message import Message
|
||||
from mycroft.tts import TTSFactory
|
||||
from mycroft.util import kill, play_wav, resolve_resource_file, create_signal
|
||||
from mycroft.util import kill, create_signal
|
||||
from mycroft.util.log import getLogger
|
||||
from mycroft.lock import Lock as PIDLock # Create/Support PID locking file
|
||||
|
||||
|
@ -42,15 +42,6 @@ config = ConfigurationManager.get()
|
|||
|
||||
def handle_record_begin():
|
||||
logger.info("Begin Recording...")
|
||||
|
||||
# If enabled, play a wave file with a short sound to audibly
|
||||
# indicate recording has begun.
|
||||
if config.get('confirm_listening'):
|
||||
file = resolve_resource_file(
|
||||
config.get('sounds').get('start_listening'))
|
||||
if file:
|
||||
play_wav(file)
|
||||
|
||||
ws.emit(Message('recognizer_loop:record_begin'))
|
||||
|
||||
|
||||
|
|
|
@ -30,10 +30,16 @@ from speech_recognition import (
|
|||
)
|
||||
|
||||
from mycroft.configuration import ConfigurationManager
|
||||
from mycroft.util import check_for_signal, get_ipc_directory
|
||||
from mycroft.util import (
|
||||
check_for_signal,
|
||||
get_ipc_directory,
|
||||
resolve_resource_file,
|
||||
play_wav
|
||||
)
|
||||
from mycroft.util.log import getLogger
|
||||
|
||||
listener_config = ConfigurationManager.get().get('listener')
|
||||
config = ConfigurationManager.get()
|
||||
listener_config = config.get('listener')
|
||||
logger = getLogger(__name__)
|
||||
__author__ = 'seanfitz'
|
||||
|
||||
|
@ -130,9 +136,13 @@ class ResponsiveRecognizer(speech_recognition.Recognizer):
|
|||
|
||||
# The minimum seconds of noise before a
|
||||
# phrase can be considered complete
|
||||
MIN_LOUD_SEC_PER_PHRASE = 0.1
|
||||
MIN_LOUD_SEC_PER_PHRASE = 0.5
|
||||
|
||||
# The maximum length a phrase can be recorded,
|
||||
# The minimum seconds of silence required at the end
|
||||
# before a phrase will be considered complete
|
||||
MIN_SILENCE_AT_END = 0.25
|
||||
|
||||
# The maximum seconds a phrase can be recorded,
|
||||
# provided there is noise the entire time
|
||||
RECORDING_TIMEOUT = 10.0
|
||||
|
||||
|
@ -163,22 +173,30 @@ class ResponsiveRecognizer(speech_recognition.Recognizer):
|
|||
hyp = self.wake_word_recognizer.transcribe(frame_data)
|
||||
return self.wake_word_recognizer.found_wake_word(hyp)
|
||||
|
||||
def record_phrase(self, source, sec_per_buffer):
|
||||
"""
|
||||
This attempts to record an entire spoken phrase. Essentially,
|
||||
this waits for a period of silence and then returns the audio
|
||||
def _record_phrase(self, source, sec_per_buffer):
|
||||
"""Record an entire spoken phrase.
|
||||
|
||||
:rtype: bytearray
|
||||
:param source: AudioSource
|
||||
:param sec_per_buffer: Based on source.SAMPLE_RATE
|
||||
:return: bytearray representing the frame_data of the recorded phrase
|
||||
Essentially, this code waits for a period of silence and then returns
|
||||
the audio. If silence isn't detected, it will terminate and return
|
||||
a buffer of RECORDING_TIMEOUT duration.
|
||||
|
||||
Args:
|
||||
source (AudioSource): Source producing the audio chunks
|
||||
sec_per_buffer (float): Fractional number of seconds in each chunk
|
||||
|
||||
Returns:
|
||||
bytearray: complete audio buffer recorded, including any
|
||||
silence at the end of the user's utterance
|
||||
"""
|
||||
|
||||
num_loud_chunks = 0
|
||||
noise = 0
|
||||
|
||||
max_noise = 25
|
||||
min_noise = 0
|
||||
|
||||
silence_duration = 0
|
||||
|
||||
def increase_noise(level):
|
||||
if level < max_noise:
|
||||
return level + 200 * sec_per_buffer
|
||||
|
@ -217,7 +235,7 @@ class ResponsiveRecognizer(speech_recognition.Recognizer):
|
|||
num_loud_chunks += 1
|
||||
else:
|
||||
noise = decrease_noise(noise)
|
||||
self.adjust_threshold(energy, sec_per_buffer)
|
||||
self._adjust_threshold(energy, sec_per_buffer)
|
||||
|
||||
if num_chunks % 10 == 0:
|
||||
with open(self.mic_level_file, 'w') as f:
|
||||
|
@ -226,7 +244,14 @@ class ResponsiveRecognizer(speech_recognition.Recognizer):
|
|||
f.close()
|
||||
|
||||
was_loud_enough = num_loud_chunks > min_loud_chunks
|
||||
|
||||
quiet_enough = noise <= min_noise
|
||||
if quiet_enough:
|
||||
silence_duration += sec_per_buffer
|
||||
if silence_duration < self.MIN_SILENCE_AT_END:
|
||||
quiet_enough = False # gotta be silent for min of 1/4 sec
|
||||
else:
|
||||
silence_duration = 0
|
||||
recorded_too_much_silence = num_chunks > max_chunks_of_silence
|
||||
if quiet_enough and (was_loud_enough or recorded_too_much_silence):
|
||||
phrase_complete = True
|
||||
|
@ -239,7 +264,13 @@ class ResponsiveRecognizer(speech_recognition.Recognizer):
|
|||
def sec_to_bytes(sec, source):
|
||||
return sec * source.SAMPLE_RATE * source.SAMPLE_WIDTH
|
||||
|
||||
def wait_until_wake_word(self, source, sec_per_buffer):
|
||||
def _wait_until_wake_word(self, source, sec_per_buffer):
|
||||
"""Listen continuously on source until a wake word is spoken
|
||||
|
||||
Args:
|
||||
source (AudioSource): Source producing the audio chunks
|
||||
sec_per_buffer (float): Fractional number of seconds in each chunk
|
||||
"""
|
||||
num_silent_bytes = int(self.SILENCE_SEC * source.SAMPLE_RATE *
|
||||
source.SAMPLE_WIDTH)
|
||||
|
||||
|
@ -255,7 +286,17 @@ class ResponsiveRecognizer(speech_recognition.Recognizer):
|
|||
max_size = self.sec_to_bytes(self.SAVED_WW_SEC, source)
|
||||
|
||||
said_wake_word = False
|
||||
|
||||
# Rolling buffer to track the audio energy (loudness) heard on
|
||||
# the source recently. An average audio energy is maintained
|
||||
# based on these levels.
|
||||
energies = []
|
||||
idx_energy = 0
|
||||
avg_energy = 0.0
|
||||
energy_avg_samples = int(5 / sec_per_buffer) # avg over last 5 secs
|
||||
|
||||
counter = 0
|
||||
|
||||
while not said_wake_word:
|
||||
if check_for_signal('buttonPress'):
|
||||
said_wake_word = True
|
||||
|
@ -265,16 +306,33 @@ class ResponsiveRecognizer(speech_recognition.Recognizer):
|
|||
|
||||
energy = self.calc_energy(chunk, source.SAMPLE_WIDTH)
|
||||
if energy < self.energy_threshold * self.multiplier:
|
||||
self.adjust_threshold(energy, sec_per_buffer)
|
||||
self._adjust_threshold(energy, sec_per_buffer)
|
||||
|
||||
if counter > 2:
|
||||
if len(energies) < energy_avg_samples:
|
||||
# build the average
|
||||
energies.append(energy)
|
||||
avg_energy += float(energy)/energy_avg_samples
|
||||
else:
|
||||
# maintain the running average and rolling buffer
|
||||
avg_energy -= float(energies[idx_energy])/energy_avg_samples
|
||||
avg_energy += float(energy)/energy_avg_samples
|
||||
energies[idx_energy] = energy
|
||||
idx_energy = (idx_energy+1) % energy_avg_samples
|
||||
|
||||
# maintain the threshold using average
|
||||
if energy < avg_energy*1.5:
|
||||
if energy > self.energy_threshold:
|
||||
# bump the threshold to just above this value
|
||||
self.energy_threshold = energy*1.2
|
||||
|
||||
# Periodically output energy level stats. This can be used to
|
||||
# visualize the microphone input, e.g. a needle on a meter.
|
||||
if counter % 3:
|
||||
with open(self.mic_level_file, 'w') as f:
|
||||
f.write("Energy: cur=" + str(energy) + " thresh=" +
|
||||
str(self.energy_threshold))
|
||||
f.close()
|
||||
counter = 0
|
||||
else:
|
||||
counter += 1
|
||||
counter += 1
|
||||
|
||||
# At first, the buffer is empty and must fill up. After that
|
||||
# just drop the first chunk bytes to keep it the same size.
|
||||
|
@ -290,7 +348,7 @@ class ResponsiveRecognizer(speech_recognition.Recognizer):
|
|||
said_wake_word = self.wake_word_in_audio(byte_data + silence)
|
||||
|
||||
@staticmethod
|
||||
def create_audio_data(raw_data, source):
|
||||
def _create_audio_data(raw_data, source):
|
||||
"""
|
||||
Constructs an AudioData instance with the same parameters
|
||||
as the source and the specified frame_data
|
||||
|
@ -298,31 +356,54 @@ class ResponsiveRecognizer(speech_recognition.Recognizer):
|
|||
return AudioData(raw_data, source.SAMPLE_RATE, source.SAMPLE_WIDTH)
|
||||
|
||||
def listen(self, source, emitter):
|
||||
"""
|
||||
Listens for audio that Mycroft should respond to
|
||||
"""Listens for chunks of audio that Mycroft should perform STT on.
|
||||
|
||||
:param source: an ``AudioSource`` instance for reading from
|
||||
:param emitter: a pyee EventEmitter for sending when the wakeword
|
||||
has been found
|
||||
This will listen continuously for a wake-up-word, then return the
|
||||
audio chunk containing the spoken phrase that comes immediately
|
||||
afterwards.
|
||||
|
||||
Args:
|
||||
source (AudioSource): Source producing the audio chunks
|
||||
emitter (EventEmitter): Emitter for notifications of when recording
|
||||
begins and ends.
|
||||
|
||||
Returns:
|
||||
AudioData: audio with the user's utterance, minus the wake-up-word
|
||||
"""
|
||||
assert isinstance(source, AudioSource), "Source must be an AudioSource"
|
||||
|
||||
# bytes_per_sec = source.SAMPLE_RATE * source.SAMPLE_WIDTH
|
||||
sec_per_buffer = float(source.CHUNK) / source.SAMPLE_RATE
|
||||
|
||||
# Every time a new 'listen()' request begins, reset the threshold
|
||||
# used for silence detection. This is as good of a reset point as
|
||||
# any, as we expect the user and Mycroft to not be talking.
|
||||
# NOTE: adjust_for_ambient_noise() doc claims it will stop early if
|
||||
# speech is detected, but there is no code to actually do that.
|
||||
self.adjust_for_ambient_noise(source, 1.0)
|
||||
|
||||
logger.debug("Waiting for wake word...")
|
||||
self.wait_until_wake_word(source, sec_per_buffer)
|
||||
self._wait_until_wake_word(source, sec_per_buffer)
|
||||
|
||||
logger.debug("Recording...")
|
||||
emitter.emit("recognizer_loop:record_begin")
|
||||
frame_data = self.record_phrase(source, sec_per_buffer)
|
||||
audio_data = self.create_audio_data(frame_data, source)
|
||||
|
||||
# If enabled, play a wave file with a short sound to audibly
|
||||
# indicate recording has begun.
|
||||
if config.get('confirm_listening'):
|
||||
file = resolve_resource_file(
|
||||
config.get('sounds').get('start_listening'))
|
||||
if file:
|
||||
play_wav(file)
|
||||
|
||||
frame_data = self._record_phrase(source, sec_per_buffer)
|
||||
audio_data = self._create_audio_data(frame_data, source)
|
||||
emitter.emit("recognizer_loop:record_end")
|
||||
logger.debug("Thinking...")
|
||||
|
||||
return audio_data
|
||||
|
||||
def adjust_threshold(self, energy, seconds_per_buffer):
|
||||
def _adjust_threshold(self, energy, seconds_per_buffer):
|
||||
if self.dynamic_energy_threshold and energy > 0:
|
||||
# account for different chunk sizes and rates
|
||||
damping = (
|
||||
|
|
Loading…
Reference in New Issue