# Copyright 2017 Mycroft AI Inc.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#    http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
import audioop
from time import sleep, time as get_time

from collections import deque, namedtuple
import datetime
import json
import os
from os.path import isdir, join
import pyaudio
import requests
import speech_recognition
from hashlib import md5
from io import BytesIO, StringIO
from speech_recognition import (
    Microphone,
    AudioSource,
    AudioData
)
from tempfile import gettempdir
from threading import Thread, Lock

from mycroft.api import DeviceApi
from mycroft.configuration import Configuration
from mycroft.session import SessionManager
from mycroft.util import (
    check_for_signal,
    get_ipc_directory,
    resolve_resource_file,
    play_wav
)
from mycroft.util.log import LOG

from .data_structures import RollingMean, CyclicAudioBuffer


WakeWordData = namedtuple('WakeWordData',
                          ['audio', 'found', 'stopped', 'end_audio'])


class MutableStream:
    def __init__(self, wrapped_stream, format, muted=False):
        assert wrapped_stream is not None
        self.wrapped_stream = wrapped_stream

        self.SAMPLE_WIDTH = pyaudio.get_sample_size(format)
        self.muted_buffer = b'\x00' * self.SAMPLE_WIDTH  # one silent sample
        self.read_lock = Lock()

        self.muted = muted
        if muted:
            self.mute()

    def mute(self):
        """Stop the stream and set the muted flag."""
        with self.read_lock:
            self.muted = True
            self.wrapped_stream.stop_stream()

    def unmute(self):
        """Start the stream and clear the muted flag."""
        with self.read_lock:
            self.muted = False
            self.wrapped_stream.start_stream()

    def read(self, size, of_exc=False):
        """Read data from stream.

        Args:
            size (int): Number of frames to read from the stream
            of_exc (bool): flag determining if the audio producer thread
                           should throw IOError at overflows.

        Returns:
            (bytes) Data read from device
        """
        frames = deque()
        remaining = size
        with self.read_lock:
            while remaining > 0:
                # If muted during read, return a buffer of silence. This
                # ensures no reads occur while the stream is stopped
                if self.muted:
                    return self.muted_buffer

                to_read = min(self.wrapped_stream.get_read_available(),
                              remaining)
                if to_read <= 0:
                    sleep(.01)
                    continue
                result = self.wrapped_stream.read(to_read,
                                                  exception_on_overflow=of_exc)
                frames.append(result)
                remaining -= to_read

        input_latency = self.wrapped_stream.get_input_latency()
        if input_latency > 0.2:
            LOG.warning("High input latency: %f" % input_latency)
        audio = b"".join(frames)
        return audio

    def close(self):
        self.wrapped_stream.close()
        self.wrapped_stream = None

    def is_stopped(self):
        try:
            return self.wrapped_stream.is_stopped()
        except Exception as e:
            LOG.error(repr(e))
            # Assume the stream has been closed and is therefore stopped
            return True

    def stop_stream(self):
        return self.wrapped_stream.stop_stream()


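# A minimal usage sketch for MutableStream (illustrative only, not executed
# here; assumes a working PyAudio input device). While muted, read() returns
# silence instead of touching the stopped stream:
#
#     pa = pyaudio.PyAudio()
#     raw = pa.open(format=pyaudio.paInt16, channels=1, rate=16000,
#                   frames_per_buffer=1024, input=True)
#     stream = MutableStream(raw, pyaudio.paInt16)
#     chunk = stream.read(1024)   # one chunk of audio bytes
#     stream.mute()               # stops the wrapped stream
#     stream.read(1024)           # -> silence while muted
#     stream.close()
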
class MutableMicrophone(Microphone):
    def __init__(self, device_index=None, sample_rate=16000, chunk_size=1024,
                 mute=False):
        Microphone.__init__(self, device_index=device_index,
                            sample_rate=sample_rate, chunk_size=chunk_size)
        self.muted = False
        if mute:
            self.mute()

    def __enter__(self):
        return self._start()

    def _start(self):
        """Open the selected device and setup the stream."""
        assert self.stream is None, \
            "This audio source is already inside a context manager"
        self.audio = pyaudio.PyAudio()
        self.stream = MutableStream(self.audio.open(
            input_device_index=self.device_index, channels=1,
            format=self.format, rate=self.SAMPLE_RATE,
            frames_per_buffer=self.CHUNK,
            input=True,  # stream is an input stream
        ), self.format, self.muted)
        return self

    def __exit__(self, exc_type, exc_value, traceback):
        return self._stop()

    def _stop(self):
        """Stop and close an open stream."""
        try:
            if not self.stream.is_stopped():
                self.stream.stop_stream()
            self.stream.close()
        except Exception:
            LOG.exception('Failed to stop mic input stream')
            # Let's pretend nothing is wrong...

        self.stream = None
        self.audio.terminate()

    def restart(self):
        """Shut down the input device and restart it."""
        self._stop()
        self._start()

    def mute(self):
        self.muted = True
        if self.stream:
            self.stream.mute()

    def unmute(self):
        self.muted = False
        if self.stream:
            self.stream.unmute()

    def is_muted(self):
        return self.muted

    def duration_to_bytes(self, sec):
        """Convert a duration in seconds to the number of recorded bytes.

        Args:
            sec: number of seconds

        Returns:
            (int) equivalent number of bytes recorded by this Mic
        """
        return int(sec * self.SAMPLE_RATE) * self.SAMPLE_WIDTH


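# Usage sketch (illustrative; assumes a default input device is available):
# the microphone is used as a context manager, and duration_to_bytes() gives
# the buffer arithmetic, e.g. at 16 kHz with 2-byte samples,
# 0.5 s -> int(0.5 * 16000) * 2 = 16000 bytes.
#
#     with MutableMicrophone(sample_rate=16000, chunk_size=1024) as mic:
#         chunk = mic.stream.read(mic.CHUNK)  # one ~64 ms chunk
#         mic.mute()                          # reads now return silence
#         mic.unmute()
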
def get_silence(num_bytes):
    return b'\0' * num_bytes


class NoiseTracker:
    """Noise tracker, used to determine if an audio utterance is complete.

    The current implementation expects a number of loud chunks (not
    necessarily in one continuous sequence) followed by a short period of
    continuous quiet audio data to be considered complete.

    Args:
        minimum (int): noise level floor, the threshold for the "quiet" level
        maximum (int): ceiling of noise level
        sec_per_buffer (float): the length of each buffer used when updating
                                the tracker
        loud_time_limit (float): time in seconds of loud audio required for
                                 a sentence to be considered complete
        silence_time_limit (float): time limit for silence to abort sentence
        silence_after_loud_time (float): time of silence to finalize the
                                         sentence. default 0.25 seconds.
    """
    def __init__(self, minimum, maximum, sec_per_buffer, loud_time_limit,
                 silence_time_limit, silence_after_loud_time=0.25):
        self.min_level = minimum
        self.max_level = maximum
        self.sec_per_buffer = sec_per_buffer

        self.num_loud_chunks = 0
        self.level = 0

        # Smallest number of loud chunks required before the audio is
        # considered loud enough
        self.min_loud_chunks = int(loud_time_limit / sec_per_buffer)

        self.max_silence_duration = silence_time_limit
        self.silence_duration = 0

        # Duration of the quiet period, after long enough loud data, needed
        # to consider the sentence complete
        self.silence_after_loud = silence_after_loud_time

        # Constants
        self.increase_multiplier = 200
        self.decrease_multiplier = 100

    def _increase_noise(self):
        """Bump the current level.

        Modifies the noise level with a factor depending on the buffer length.
        """
        if self.level < self.max_level:
            self.level += self.increase_multiplier * self.sec_per_buffer

    def _decrease_noise(self):
        """Decrease the current level.

        Modifies the noise level with a factor depending on the buffer length.
        """
        if self.level > self.min_level:
            self.level -= self.decrease_multiplier * self.sec_per_buffer

    def update(self, is_loud):
        """Update the tracker with either a loud chunk or a quiet chunk.

        Args:
            is_loud: True if a loud chunk should be registered
                     False if a quiet chunk should be registered
        """
        if is_loud:
            self._increase_noise()
            self.num_loud_chunks += 1
        else:
            self._decrease_noise()
            # Update duration of energy under the threshold level
            if self._quiet_enough():
                self.silence_duration += self.sec_per_buffer
            else:  # Reset silence duration
                self.silence_duration = 0

    def _loud_enough(self):
        """Check if the noise loudness criterion is fulfilled.

        The noise is considered loud enough if it's been over the threshold
        for a certain number of chunks (accumulated, not in a row).
        """
        return self.num_loud_chunks > self.min_loud_chunks

    def _quiet_enough(self):
        """Check if the noise quietness criterion is fulfilled.

        The quiet check is instantaneous and returns True if the level is
        lower than or equal to the minimum noise level.
        """
        return self.level <= self.min_level

    def recording_complete(self):
        """Check whether the end criteria for the recording have been met.

        If the noise level has decreased from a loud level to a low level,
        the user has stopped speaking.

        Alternatively, the recording is considered complete if a lot of
        silence was recorded without detecting a loud enough phrase.
        """
        too_much_silence = (self.silence_duration > self.max_silence_duration)
        if too_much_silence:
            LOG.debug('Too much silence recorded without start of sentence '
                      'detected')
        return ((self._quiet_enough() and
                 self.silence_duration > self.silence_after_loud) and
                (self._loud_enough() or too_much_silence))


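# Sketch of how NoiseTracker is typically driven (mirrors _record_phrase
# below; `next_chunk` and `threshold` are illustrative stand-ins):
#
#     tracker = NoiseTracker(minimum=0, maximum=25, sec_per_buffer=0.064,
#                            loud_time_limit=0.5, silence_time_limit=3.0)
#     while not tracker.recording_complete():
#         chunk = next_chunk()
#         tracker.update(audioop.rms(chunk, 2) > threshold)
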
class ResponsiveRecognizer(speech_recognition.Recognizer):
    # Padding of silence when feeding to pocketsphinx
    SILENCE_SEC = 0.01

    # The minimum seconds of noise before a
    # phrase can be considered complete
    MIN_LOUD_SEC_PER_PHRASE = 0.5

    # The minimum seconds of silence required at the end
    # before a phrase will be considered complete
    MIN_SILENCE_AT_END = 0.25

    # Time between pocketsphinx checks for the wake word
    SEC_BETWEEN_WW_CHECKS = 0.2

    def __init__(self, wake_word_recognizer, watchdog=None):
        self._watchdog = watchdog or (lambda: None)  # Default to dummy func
        self.config = Configuration.get()
        listener_config = self.config.get('listener')
        self.upload_url = listener_config['wake_word_upload']['url']
        self.upload_disabled = listener_config['wake_word_upload']['disable']
        self.wake_word_name = wake_word_recognizer.key_phrase

        self.overflow_exc = listener_config.get('overflow_exception', False)

        super().__init__()
        self.wake_word_recognizer = wake_word_recognizer
        self.audio = pyaudio.PyAudio()
        self.multiplier = listener_config.get('multiplier')
        self.energy_ratio = listener_config.get('energy_ratio')

        # Check the config for the flag to save wake words, utterances
        # and for a path under which to save them
        self.save_utterances = listener_config.get('save_utterances', False)
        self.save_wake_words = listener_config.get('record_wake_words', False)
        self.save_path = listener_config.get('save_path', gettempdir())
        self.saved_wake_words_dir = join(self.save_path, 'mycroft_wake_words')
        if self.save_wake_words and not isdir(self.saved_wake_words_dir):
            os.mkdir(self.saved_wake_words_dir)
        self.saved_utterances_dir = join(self.save_path, 'mycroft_utterances')
        if self.save_utterances and not isdir(self.saved_utterances_dir):
            os.mkdir(self.saved_utterances_dir)

        self.mic_level_file = os.path.join(get_ipc_directory(), "mic_level")

        # Signal statuses
        self._stop_signaled = False
        self._listen_triggered = False

        self._account_id = None

        # The maximum seconds a phrase can be recorded,
        # provided there is noise the entire time
        self.recording_timeout = listener_config.get('recording_timeout',
                                                     10.0)

        # The maximum time it will continue to record silence
        # when not enough noise has been detected
        self.recording_timeout_with_silence = listener_config.get(
            'recording_timeout_with_silence', 3.0)

    @property
    def account_id(self):
        """Fetch account from backend when needed.

        If an error occurs it's handled and a temporary value is returned.
        When a value is received it will be cached until next start.
        """
        if not self._account_id:
            try:
                self._account_id = DeviceApi().get()['user']['uuid']
            except (requests.RequestException, AttributeError):
                pass  # These are expected and won't be reported
            except Exception as e:
                LOG.debug('Unhandled exception while determining device_id, '
                          'Error: {}'.format(repr(e)))

        return self._account_id or '0'

    def record_sound_chunk(self, source):
        return source.stream.read(source.CHUNK, self.overflow_exc)

    @staticmethod
    def calc_energy(sound_chunk, sample_width):
        return audioop.rms(sound_chunk, sample_width)

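    # Note: audioop.rms() is the root-mean-square amplitude of the fragment;
    # for example, a silent 16-bit chunk yields zero energy:
    #
    #     audioop.rms(b'\x00\x00' * 512, 2)  # -> 0
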
    def _record_phrase(
        self,
        source,
        sec_per_buffer,
        stream=None,
        ww_frames=None
    ):
        """Record an entire spoken phrase.

        Essentially, this code waits for a period of silence and then returns
        the audio. If silence isn't detected, it will terminate and return
        a buffer of self.recording_timeout duration.

        Args:
            source (AudioSource): Source producing the audio chunks
            sec_per_buffer (float): Fractional number of seconds in each chunk
            stream (AudioStreamHandler): Stream target that will receive
                                         chunks of the utterance audio while
                                         it is being recorded.
            ww_frames (deque): Frames of audio data from the last part of wake
                               word detection.

        Returns:
            bytearray: complete audio buffer recorded, including any
                       silence at the end of the user's utterance
        """
        noise_tracker = NoiseTracker(0, 25, sec_per_buffer,
                                     self.MIN_LOUD_SEC_PER_PHRASE,
                                     self.recording_timeout_with_silence)

        # Maximum number of chunks to record before timing out
        max_chunks = int(self.recording_timeout / sec_per_buffer)
        num_chunks = 0

        # bytearray to store audio in, initialized with a single sample of
        # silence.
        byte_data = get_silence(source.SAMPLE_WIDTH)

        if stream:
            stream.stream_start()

        phrase_complete = False
        while num_chunks < max_chunks and not phrase_complete:
            if ww_frames:
                chunk = ww_frames.popleft()
            else:
                chunk = self.record_sound_chunk(source)
            byte_data += chunk
            num_chunks += 1

            if stream:
                stream.stream_chunk(chunk)

            energy = self.calc_energy(chunk, source.SAMPLE_WIDTH)
            test_threshold = self.energy_threshold * self.multiplier
            is_loud = energy > test_threshold
            noise_tracker.update(is_loud)
            if not is_loud:
                self._adjust_threshold(energy, sec_per_buffer)

            # The phrase is complete if the noise_tracker end of sentence
            # criteria is met or if the top-button is pressed
            phrase_complete = (noise_tracker.recording_complete() or
                               check_for_signal('buttonPress'))

            # Periodically write the energy level to the mic level file.
            if num_chunks % 10 == 0:
                self._watchdog()
                self.write_mic_level(energy, source)

        return byte_data

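    # Worked numbers for the loop above (illustrative, assuming the defaults
    # used elsewhere in this file): CHUNK=1024 at 16 kHz gives
    # sec_per_buffer = 1024 / 16000 = 0.064 s, so recording_timeout=10.0 s
    # yields max_chunks = int(10.0 / 0.064) = 156 buffers before the phrase
    # is force-terminated.
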
    def write_mic_level(self, energy, source):
        with open(self.mic_level_file, 'w') as f:
            f.write('Energy: cur={} thresh={:.3f} muted={}'.format(
                energy,
                self.energy_threshold,
                int(source.muted)
            ))

    def _skip_wake_word(self):
        """Check if told programmatically to skip the wake word.

        For example when we are in a dialog with the user.
        """
        if self._listen_triggered:
            return True

        # Pressing the Mark 1 button can start recording (unless
        # it is being used to mean 'stop' instead)
        if check_for_signal('buttonPress', 1):
            # give other processes time to consume this signal if
            # it was meant to be a 'stop'
            sleep(0.25)
            if check_for_signal('buttonPress'):
                # Signal is still here, assume it was intended to
                # begin recording
                LOG.debug("Button Pressed, wakeword not needed")
                return True

        return False

    def stop(self):
        """Signal stop and exit waiting state."""
        self._stop_signaled = True

    def _compile_metadata(self):
        ww_module = self.wake_word_recognizer.__class__.__name__
        if ww_module == 'PreciseHotword':
            model_path = self.wake_word_recognizer.precise_model
            with open(model_path, 'rb') as f:
                model_hash = md5(f.read()).hexdigest()
        else:
            model_hash = '0'

        return {
            'name': self.wake_word_name.replace(' ', '-'),
            'engine': md5(ww_module.encode('utf-8')).hexdigest(),
            'time': str(int(1000 * get_time())),
            'sessionId': SessionManager.get().session_id,
            'accountId': self.account_id,
            'model': str(model_hash)
        }

    def trigger_listen(self):
        """Externally trigger listening."""
        LOG.debug('Listen triggered from external source.')
        self._listen_triggered = True

    def _upload_wakeword(self, audio, metadata):
        """Upload the wakeword in a background thread."""
        LOG.debug(
            "Wakeword uploading has been disabled. The API endpoint used in "
            "Mycroft-core v20.2 and below has been deprecated. To contribute "
            "new wakeword samples please upgrade to v20.8 or above."
        )
        # def upload(audio, metadata):
        #     requests.post(self.upload_url,
        #                   files={'audio': BytesIO(audio.get_wav_data()),
        #                          'metadata': StringIO(json.dumps(metadata))})
        # Thread(target=upload, daemon=True, args=(audio, metadata)).start()

    def _send_wakeword_info(self, emitter):
        """Send messagebus message indicating that a wakeword was received.

        Args:
            emitter: bus emitter to send information on.
        """
        SessionManager.touch()
        payload = {'utterance': self.wake_word_name,
                   'session': SessionManager.get().session_id}
        emitter.emit("recognizer_loop:wakeword", payload)

    def _write_wakeword_to_disk(self, audio, metadata):
        """Write wakeword to disk.

        Args:
            audio: Audio data to write
            metadata: Dict of metadata about the captured wakeword
        """
        filename = join(self.saved_wake_words_dir,
                        '_'.join(str(metadata[k]) for k in sorted(metadata)) +
                        '.wav')
        with open(filename, 'wb') as f:
            f.write(audio.get_wav_data())

    def _handle_wakeword_found(self, audio_data, source):
        """Perform actions to be triggered after a wakeword is found.

        This includes: emitting an event on the messagebus that a wakeword
        was heard, storing the wakeword to disk if configured, and sending
        the wakeword data to the cloud if the user has opted into data
        sharing.
        """
        # Save and upload positive wake words as appropriate
        upload_allowed = (self.config['opt_in'] and not self.upload_disabled)
        if (self.save_wake_words or upload_allowed):
            audio = self._create_audio_data(audio_data, source)
            metadata = self._compile_metadata()
            if self.save_wake_words:
                # Save wake word locally
                self._write_wakeword_to_disk(audio, metadata)
            # Upload wake word for opt_in people
            if upload_allowed:
                self._upload_wakeword(audio, metadata)

    def _wait_until_wake_word(self, source, sec_per_buffer):
        """Listen continuously on source until a wake word is spoken.

        Args:
            source (AudioSource): Source producing the audio chunks
            sec_per_buffer (float): Fractional number of seconds in each chunk
        """

        # The maximum audio in seconds to keep for transcribing a phrase
        # The wake word must fit in this time
        ww_duration = self.wake_word_recognizer.expected_duration
        ww_test_duration = max(3, ww_duration)

        mic_write_counter = 0
        num_silent_bytes = int(self.SILENCE_SEC * source.SAMPLE_RATE *
                               source.SAMPLE_WIDTH)

        silence = get_silence(num_silent_bytes)

        # Max bytes for byte_data before audio is removed from the front
        max_size = source.duration_to_bytes(ww_duration)
        test_size = source.duration_to_bytes(ww_test_duration)
        audio_buffer = CyclicAudioBuffer(max_size, silence)

        buffers_per_check = self.SEC_BETWEEN_WW_CHECKS / sec_per_buffer
        buffers_since_check = 0.0

        # Rolling buffer to track the audio energy (loudness) heard on
        # the source recently. An average audio energy is maintained
        # based on these levels.
        average_samples = int(5 / sec_per_buffer)  # average over last 5 secs
        audio_mean = RollingMean(average_samples)

        # These are frames immediately after wake word is detected
        # that we want to keep to send to STT
        ww_frames = deque(maxlen=7)

        said_wake_word = False
        audio_data = None
        while (not said_wake_word and not self._stop_signaled and
               not self._skip_wake_word()):
            chunk = self.record_sound_chunk(source)
            audio_buffer.append(chunk)
            ww_frames.append(chunk)

            energy = self.calc_energy(chunk, source.SAMPLE_WIDTH)
            audio_mean.append_sample(energy)

            if energy < self.energy_threshold * self.multiplier:
                self._adjust_threshold(energy, sec_per_buffer)
            # maintain the threshold using average
            if self.energy_threshold < energy < audio_mean.value * 1.5:
                # bump the threshold to just above this value
                self.energy_threshold = energy * 1.2

            # Periodically output energy level stats. This can be used to
            # visualize the microphone input, e.g. a needle on a meter.
            if mic_write_counter % 3:
                self._watchdog()
                self.write_mic_level(energy, source)
            mic_write_counter += 1

            buffers_since_check += 1.0
            # Send chunk to wake_word_recognizer
            self.wake_word_recognizer.update(chunk)

            if buffers_since_check > buffers_per_check:
                buffers_since_check -= buffers_per_check
                audio_data = audio_buffer.get_last(test_size) + silence
                said_wake_word = \
                    self.wake_word_recognizer.found_wake_word(audio_data)

        self._listen_triggered = False
        return WakeWordData(audio_data, said_wake_word,
                            self._stop_signaled, ww_frames)

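    # With the same illustrative defaults (0.064 s buffers),
    # SEC_BETWEEN_WW_CHECKS = 0.2 gives buffers_per_check = 0.2 / 0.064,
    # roughly 3.1, i.e. the wake word engine is polled about every third
    # chunk, while average_samples = int(5 / 0.064) = 78 buffers back the
    # five-second rolling energy mean.
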
    @staticmethod
    def _create_audio_data(raw_data, source):
        """
        Constructs an AudioData instance with the same parameters
        as the source and the specified frame_data
        """
        return AudioData(raw_data, source.SAMPLE_RATE, source.SAMPLE_WIDTH)

    def mute_and_confirm_listening(self, source):
        audio_file = resolve_resource_file(
            self.config.get('sounds').get('start_listening'))
        if audio_file:
            source.mute()
            play_wav(audio_file).wait()
            source.unmute()
            return True
        else:
            return False

    def listen(self, source, emitter, stream=None):
        """Listen for chunks of audio that Mycroft should perform STT on.

        This will listen continuously for a wake-up-word, then return the
        audio chunk containing the spoken phrase that comes immediately
        afterwards.

        Args:
            source (AudioSource): Source producing the audio chunks
            emitter (EventEmitter): Emitter for notifications of when
                                    recording begins and ends.
            stream (AudioStreamHandler): Stream target that will receive
                                         chunks of the utterance audio while
                                         it is being recorded

        Returns:
            AudioData: audio with the user's utterance, minus the wake-up-word
        """
        assert isinstance(source, AudioSource), "Source must be an AudioSource"

        # bytes_per_sec = source.SAMPLE_RATE * source.SAMPLE_WIDTH
        sec_per_buffer = float(source.CHUNK) / source.SAMPLE_RATE

        # Every time a new 'listen()' request begins, reset the threshold
        # used for silence detection. This is as good of a reset point as
        # any, as we expect the user and Mycroft to not be talking.
        # NOTE: adjust_for_ambient_noise() doc claims it will stop early if
        # speech is detected, but there is no code to actually do that.
        self.adjust_for_ambient_noise(source, 1.0)

        LOG.debug("Waiting for wake word...")
        ww_data = self._wait_until_wake_word(source, sec_per_buffer)

        ww_frames = None
        if ww_data.found:
            # If the wakeword was heard send it
            self._send_wakeword_info(emitter)
            self._handle_wakeword_found(ww_data.audio, source)
            ww_frames = ww_data.end_audio
        if ww_data.stopped:
            # If the waiting returned from a stop signal
            return

        LOG.debug("Recording...")
        # If enabled, play a wave file with a short sound to audibly
        # indicate recording has begun.
        if self.config.get('confirm_listening'):
            if self.mute_and_confirm_listening(source):
                # Clear frames from wakeword detections since they're
                # irrelevant after mute - play wav - unmute sequence
                ww_frames = None

        # Notify system of recording start
        emitter.emit("recognizer_loop:record_begin")

        frame_data = self._record_phrase(
            source,
            sec_per_buffer,
            stream,
            ww_frames
        )
        audio_data = self._create_audio_data(frame_data, source)
        emitter.emit("recognizer_loop:record_end")
        if self.save_utterances:
            LOG.info("Recording utterance")
            stamp = str(datetime.datetime.now())
            filename = join(self.saved_utterances_dir, stamp + '.wav')
            with open(filename, 'wb') as filea:
                filea.write(audio_data.get_wav_data())
        LOG.debug("Thinking...")

        return audio_data

    def _adjust_threshold(self, energy, seconds_per_buffer):
        if self.dynamic_energy_threshold and energy > 0:
            # account for different chunk sizes and rates
            damping = (
                self.dynamic_energy_adjustment_damping ** seconds_per_buffer)
            target_energy = energy * self.energy_ratio
            self.energy_threshold = (
                self.energy_threshold * damping +
                target_energy * (1 - damping))
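
# Worked example of the damping above (illustrative values): the
# speech_recognition library defaults dynamic_energy_adjustment_damping to
# 0.15, so with seconds_per_buffer = 0.064 the damping factor is
# 0.15 ** 0.064, roughly 0.886, and each quiet buffer moves energy_threshold
# about 11% of the way toward energy * energy_ratio.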