mycroft-core/mycroft/client/speech/mic.py

# Copyright 2017 Mycroft AI Inc.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
import audioop
from time import sleep, time as get_time
from collections import deque, namedtuple
import datetime
import json
import os
from os.path import isdir, join
import pyaudio
import requests
import speech_recognition
from hashlib import md5
from io import BytesIO, StringIO
from speech_recognition import (
Microphone,
AudioSource,
AudioData
)
from tempfile import gettempdir
from threading import Thread, Lock
from mycroft.api import DeviceApi
from mycroft.configuration import Configuration
from mycroft.session import SessionManager
from mycroft.util import (
check_for_signal,
get_ipc_directory,
resolve_resource_file,
play_wav
)
from mycroft.util.log import LOG
from .data_structures import RollingMean, CyclicAudioBuffer
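
# Result bundle from the wake word monitoring loop:
#   audio:     bytes of the audio window last tested for the wake word
#   found:     True if the wake word was detected in that window
#   stopped:   True if the loop exited because stop() was signaled
#   end_audio: frames captured right after detection, kept for the STT step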
WakeWordData = namedtuple('WakeWordData',
['audio', 'found', 'stopped', 'end_audio'])
class MutableStream:
    """Wrapper around a PyAudio stream that can be muted on demand."""
def __init__(self, wrapped_stream, format, muted=False):
assert wrapped_stream is not None
self.wrapped_stream = wrapped_stream
self.SAMPLE_WIDTH = pyaudio.get_sample_size(format)
        self.muted_buffer = b'\x00' * self.SAMPLE_WIDTH  # one silent sample
self.read_lock = Lock()
self.muted = muted
if muted:
self.mute()
def mute(self):
"""Stop the stream and set the muted flag."""
with self.read_lock:
self.muted = True
self.wrapped_stream.stop_stream()
def unmute(self):
"""Start the stream and clear the muted flag."""
with self.read_lock:
self.muted = False
self.wrapped_stream.start_stream()
def read(self, size, of_exc=False):
"""Read data from stream.
Args:
size (int): Number of bytes to read
of_exc (bool): flag determining if the audio producer thread
should throw IOError at overflows.
Returns:
(bytes) Data read from device
"""
frames = deque()
remaining = size
with self.read_lock:
while remaining > 0:
                # If muted during read return a buffer of silence. This
                # ensures no reads occur while the stream is stopped
if self.muted:
return self.muted_buffer
to_read = min(self.wrapped_stream.get_read_available(),
remaining)
if to_read <= 0:
sleep(.01)
continue
result = self.wrapped_stream.read(to_read,
exception_on_overflow=of_exc)
frames.append(result)
remaining -= to_read
input_latency = self.wrapped_stream.get_input_latency()
if input_latency > 0.2:
LOG.warning("High input latency: %f" % input_latency)
audio = b"".join(list(frames))
return audio
def close(self):
self.wrapped_stream.close()
self.wrapped_stream = None
def is_stopped(self):
try:
return self.wrapped_stream.is_stopped()
except Exception as e:
LOG.error(repr(e))
            return True  # Assume the stream has been closed and thus stopped
def stop_stream(self):
return self.wrapped_stream.stop_stream()
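
# Usage sketch (illustrative only, not executed by this module). The device
# parameters below are assumptions for the example.
#
#     pa = pyaudio.PyAudio()
#     raw = pa.open(format=pyaudio.paInt16, channels=1, rate=16000,
#                   frames_per_buffer=1024, input=True)
#     stream = MutableStream(raw, pyaudio.paInt16)
#     chunk = stream.read(2048)   # blocks until 2048 bytes are collected
#     stream.mute()               # reads now return a silent buffer
#     stream.unmute()
#     stream.close()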
class MutableMicrophone(Microphone):
    """Microphone that can be muted, unmuted and restarted."""
def __init__(self, device_index=None, sample_rate=16000, chunk_size=1024,
mute=False):
Microphone.__init__(self, device_index=device_index,
sample_rate=sample_rate, chunk_size=chunk_size)
self.muted = False
if mute:
self.mute()
def __enter__(self):
return self._start()
def _start(self):
"""Open the selected device and setup the stream."""
assert self.stream is None, \
"This audio source is already inside a context manager"
self.audio = pyaudio.PyAudio()
self.stream = MutableStream(self.audio.open(
input_device_index=self.device_index, channels=1,
format=self.format, rate=self.SAMPLE_RATE,
frames_per_buffer=self.CHUNK,
input=True, # stream is an input stream
), self.format, self.muted)
return self
def __exit__(self, exc_type, exc_value, traceback):
return self._stop()
def _stop(self):
"""Stop and close an open stream."""
try:
if not self.stream.is_stopped():
self.stream.stop_stream()
self.stream.close()
except Exception:
LOG.exception('Failed to stop mic input stream')
# Let's pretend nothing is wrong...
self.stream = None
self.audio.terminate()
def restart(self):
"""Shutdown input device and restart."""
self._stop()
self._start()
def mute(self):
self.muted = True
if self.stream:
self.stream.mute()
def unmute(self):
self.muted = False
if self.stream:
self.stream.unmute()
def is_muted(self):
return self.muted
def duration_to_bytes(self, sec):
"""Converts a duration in seconds to number of recorded bytes.
Args:
sec: number of seconds
Returns:
(int) equivalent number of bytes recorded by this Mic
"""
return int(sec * self.SAMPLE_RATE) * self.SAMPLE_WIDTH
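
# Usage sketch (illustrative only): MutableMicrophone is a context manager,
# like speech_recognition.Microphone. Entering it opens the PyAudio stream.
#
#     mic = MutableMicrophone(sample_rate=16000, chunk_size=1024)
#     with mic as source:
#         data = source.stream.read(source.CHUNK)
#         mic.mute()     # subsequent reads return silence
#         mic.unmute()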
def get_silence(num_bytes):
return b'\0' * num_bytes
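
# Worked example: 0.25 s of audio at 16 kHz with 2-byte (16-bit) samples is
# int(0.25 * 16000) * 2 = 8000 bytes, so the matching silent buffer is
# get_silence(8000), i.e. 8000 zero bytes.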
class NoiseTracker:
"""Noise tracker, used to deterimine if an audio utterance is complete.
The current implementation expects a number of loud chunks (not necessary
in one continous sequence) followed by a short period of continous quiet
audio data to be considered complete.
Args:
minimum (int): lower noise level will be threshold for "quiet" level
maximum (int): ceiling of noise level
sec_per_buffer (float): the length of each buffer used when updating
the tracker
loud_time_limit (float): time in seconds of low noise to be considered
a complete sentence
silence_time_limit (float): time limit for silence to abort sentence
silence_after_loud (float): time of silence to finalize the sentence.
default 0.25 seconds.
"""
def __init__(self, minimum, maximum, sec_per_buffer, loud_time_limit,
silence_time_limit, silence_after_loud_time=0.25):
self.min_level = minimum
self.max_level = maximum
self.sec_per_buffer = sec_per_buffer
self.num_loud_chunks = 0
self.level = 0
        # Smallest number of loud chunks required to consider the audio
        # loud enough
self.min_loud_chunks = int(loud_time_limit / sec_per_buffer)
self.max_silence_duration = silence_time_limit
self.silence_duration = 0
        # time of quiet period after enough loud data to consider the
        # sentence complete
self.silence_after_loud = silence_after_loud_time
# Constants
self.increase_multiplier = 200
self.decrease_multiplier = 100
def _increase_noise(self):
"""Bumps the current level.
        Modifies the noise level by a factor depending on the buffer length.
"""
if self.level < self.max_level:
self.level += self.increase_multiplier * self.sec_per_buffer
def _decrease_noise(self):
"""Decrease the current level.
        Modifies the noise level by a factor depending on the buffer length.
"""
if self.level > self.min_level:
self.level -= self.decrease_multiplier * self.sec_per_buffer
def update(self, is_loud):
"""Update the tracking. with either a loud chunk or a quiet chunk.
Args:
is_loud: True if a loud chunk should be registered
False if a quiet chunk should be registered
"""
if is_loud:
self._increase_noise()
self.num_loud_chunks += 1
else:
self._decrease_noise()
# Update duration of energy under the threshold level
if self._quiet_enough():
self.silence_duration += self.sec_per_buffer
else: # Reset silence duration
self.silence_duration = 0
def _loud_enough(self):
"""Check if the noise loudness criteria is fulfilled.
The noise is considered loud enough if it's been over the threshold
for a certain number of chunks (accumulated, not in a row).
"""
return self.num_loud_chunks > self.min_loud_chunks
def _quiet_enough(self):
"""Check if the noise quietness criteria is fulfilled.
The quiet level is instant and will return True if the level is lower
or equal to the minimum noise level.
"""
return self.level <= self.min_level
def recording_complete(self):
"""Has the end creteria for the recording been met.
If the noise level has decresed from a loud level to a low level
the user has stopped speaking.
Alternatively if a lot of silence was recorded without detecting
a loud enough phrase.
"""
too_much_silence = (self.silence_duration > self.max_silence_duration)
if too_much_silence:
LOG.debug('Too much silence recorded without start of sentence '
'detected')
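        # Complete when the audio has been quiet for at least
        # silence_after_loud seconds AND either enough loud chunks were
        # heard (a phrase was spoken) or the silence limit was exceeded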
return ((self._quiet_enough() and
self.silence_duration > self.silence_after_loud) and
(self._loud_enough() or too_much_silence))
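
# Usage sketch (illustrative only): one update per recorded audio chunk.
# The limits below are example values; `next_chunk_energy` and `threshold`
# are hypothetical stand-ins for the caller's own energy measurements.
#
#     tracker = NoiseTracker(minimum=0, maximum=25, sec_per_buffer=0.064,
#                            loud_time_limit=0.5, silence_time_limit=3.0)
#     while not tracker.recording_complete():
#         tracker.update(next_chunk_energy() > threshold)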
class ResponsiveRecognizer(speech_recognition.Recognizer):
    """Recognizer that listens for a wake word before capturing a phrase."""
# Padding of silence when feeding to pocketsphinx
SILENCE_SEC = 0.01
# The minimum seconds of noise before a
# phrase can be considered complete
MIN_LOUD_SEC_PER_PHRASE = 0.5
# The minimum seconds of silence required at the end
# before a phrase will be considered complete
MIN_SILENCE_AT_END = 0.25
# Time between pocketsphinx checks for the wake word
SEC_BETWEEN_WW_CHECKS = 0.2
def __init__(self, wake_word_recognizer, watchdog=None):
self._watchdog = watchdog or (lambda: None) # Default to dummy func
self.config = Configuration.get()
listener_config = self.config.get('listener')
self.upload_url = listener_config['wake_word_upload']['url']
self.upload_disabled = listener_config['wake_word_upload']['disable']
self.wake_word_name = wake_word_recognizer.key_phrase
self.overflow_exc = listener_config.get('overflow_exception', False)
super().__init__()
self.wake_word_recognizer = wake_word_recognizer
self.audio = pyaudio.PyAudio()
self.multiplier = listener_config.get('multiplier')
self.energy_ratio = listener_config.get('energy_ratio')
# Check the config for the flag to save wake words, utterances
# and for a path under which to save them
self.save_utterances = listener_config.get('save_utterances', False)
self.save_wake_words = listener_config.get('record_wake_words', False)
self.save_path = listener_config.get('save_path', gettempdir())
self.saved_wake_words_dir = join(self.save_path, 'mycroft_wake_words')
if self.save_wake_words and not isdir(self.saved_wake_words_dir):
os.mkdir(self.saved_wake_words_dir)
self.saved_utterances_dir = join(self.save_path, 'mycroft_utterances')
if self.save_utterances and not isdir(self.saved_utterances_dir):
os.mkdir(self.saved_utterances_dir)
self.mic_level_file = os.path.join(get_ipc_directory(), "mic_level")
# Signal statuses
self._stop_signaled = False
self._listen_triggered = False
self._account_id = None
# The maximum seconds a phrase can be recorded,
# provided there is noise the entire time
self.recording_timeout = listener_config.get('recording_timeout',
10.0)
# The maximum time it will continue to record silence
# when not enough noise has been detected
self.recording_timeout_with_silence = listener_config.get(
'recording_timeout_with_silence', 3.0)
@property
def account_id(self):
"""Fetch account from backend when needed.
If an error occurs it's handled and a temporary value is returned.
When a value is received it will be cached until next start.
"""
if not self._account_id:
try:
self._account_id = DeviceApi().get()['user']['uuid']
except (requests.RequestException, AttributeError):
pass # These are expected and won't be reported
except Exception as e:
LOG.debug('Unhandled exception while determining device_id, '
'Error: {}'.format(repr(e)))
return self._account_id or '0'
    def record_sound_chunk(self, source):
        """Read a single chunk of audio from the source's stream."""
return source.stream.read(source.CHUNK, self.overflow_exc)
    @staticmethod
    def calc_energy(sound_chunk, sample_width):
        """Return the RMS energy of an audio chunk."""
return audioop.rms(sound_chunk, sample_width)
def _record_phrase(
self,
source,
sec_per_buffer,
stream=None,
ww_frames=None
):
"""Record an entire spoken phrase.
Essentially, this code waits for a period of silence and then returns
the audio. If silence isn't detected, it will terminate and return
a buffer of self.recording_timeout duration.
Args:
source (AudioSource): Source producing the audio chunks
sec_per_buffer (float): Fractional number of seconds in each chunk
stream (AudioStreamHandler): Stream target that will receive chunks
of the utterance audio while it is
being recorded.
ww_frames (deque): Frames of audio data from the last part of wake
word detection.
Returns:
            bytes: complete audio buffer recorded, including any
                   silence at the end of the user's utterance
"""
noise_tracker = NoiseTracker(0, 25, sec_per_buffer,
self.MIN_LOUD_SEC_PER_PHRASE,
self.recording_timeout_with_silence)
# Maximum number of chunks to record before timing out
max_chunks = int(self.recording_timeout / sec_per_buffer)
num_chunks = 0
        # bytes buffer to store audio in, initialized with a single sample
        # of silence.
byte_data = get_silence(source.SAMPLE_WIDTH)
if stream:
stream.stream_start()
phrase_complete = False
while num_chunks < max_chunks and not phrase_complete:
if ww_frames:
chunk = ww_frames.popleft()
else:
chunk = self.record_sound_chunk(source)
byte_data += chunk
num_chunks += 1
if stream:
stream.stream_chunk(chunk)
energy = self.calc_energy(chunk, source.SAMPLE_WIDTH)
test_threshold = self.energy_threshold * self.multiplier
is_loud = energy > test_threshold
noise_tracker.update(is_loud)
if not is_loud:
self._adjust_threshold(energy, sec_per_buffer)
            # The phrase is complete if the noise_tracker end-of-sentence
            # criteria are met or if the top-button is pressed
phrase_complete = (noise_tracker.recording_complete() or
check_for_signal('buttonPress'))
# Periodically write the energy level to the mic level file.
if num_chunks % 10 == 0:
self._watchdog()
self.write_mic_level(energy, source)
return byte_data
    def write_mic_level(self, energy, source):
        """Write the current energy level to the IPC mic_level file.

        Other processes (e.g. the CLI client) can read this to display a
        microphone level meter.
        """
with open(self.mic_level_file, 'w') as f:
f.write('Energy: cur={} thresh={:.3f} muted={}'.format(
energy,
self.energy_threshold,
int(source.muted)
)
)
def _skip_wake_word(self):
"""Check if told programatically to skip the wake word
For example when we are in a dialog with the user.
"""
if self._listen_triggered:
return True
# Pressing the Mark 1 button can start recording (unless
# it is being used to mean 'stop' instead)
if check_for_signal('buttonPress', 1):
# give other processes time to consume this signal if
# it was meant to be a 'stop'
sleep(0.25)
if check_for_signal('buttonPress'):
# Signal is still here, assume it was intended to
# begin recording
LOG.debug("Button Pressed, wakeword not needed")
return True
return False
def stop(self):
"""Signal stop and exit waiting state."""
self._stop_signaled = True
    def _compile_metadata(self):
        """Compile metadata describing the captured wake word sample."""
ww_module = self.wake_word_recognizer.__class__.__name__
if ww_module == 'PreciseHotword':
model_path = self.wake_word_recognizer.precise_model
with open(model_path, 'rb') as f:
model_hash = md5(f.read()).hexdigest()
else:
model_hash = '0'
return {
'name': self.wake_word_name.replace(' ', '-'),
'engine': md5(ww_module.encode('utf-8')).hexdigest(),
'time': str(int(1000 * get_time())),
'sessionId': SessionManager.get().session_id,
'accountId': self.account_id,
'model': str(model_hash)
}
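    # Example of the compiled metadata (values are illustrative):
    #     {'name': 'hey-mycroft', 'engine': '<md5 of engine class name>',
    #      'time': '1589299692000', 'sessionId': '<session uuid>',
    #      'accountId': '<account uuid>', 'model': '<md5 of model file>'}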
def trigger_listen(self):
"""Externally trigger listening."""
LOG.debug('Listen triggered from external source.')
self._listen_triggered = True
def _upload_wakeword(self, audio, metadata):
"""Upload the wakeword in a background thread."""
LOG.debug(
"Wakeword uploading has been disabled. The API endpoint used in "
"Mycroft-core v20.2 and below has been deprecated. To contribute "
"new wakeword samples please upgrade to v20.8 or above."
)
# def upload(audio, metadata):
# requests.post(self.upload_url,
# files={'audio': BytesIO(audio.get_wav_data()),
# 'metadata': StringIO(json.dumps(metadata))})
# Thread(target=upload, daemon=True, args=(audio, metadata)).start()
def _send_wakeword_info(self, emitter):
"""Send messagebus message indicating that a wakeword was received.
Args:
emitter: bus emitter to send information on.
"""
SessionManager.touch()
payload = {'utterance': self.wake_word_name,
'session': SessionManager.get().session_id}
emitter.emit("recognizer_loop:wakeword", payload)
def _write_wakeword_to_disk(self, audio, metadata):
"""Write wakeword to disk.
Args:
audio: Audio data to write
            metadata: Dict of metadata about the captured wakeword
"""
filename = join(self.saved_wake_words_dir,
'_'.join(str(metadata[k]) for k in sorted(metadata)) +
'.wav')
with open(filename, 'wb') as f:
f.write(audio.get_wav_data())
def _handle_wakeword_found(self, audio_data, source):
"""Perform actions to be triggered after a wakeword is found.
        This includes: emitting an event on the messagebus that a wakeword
        was heard, storing the wakeword to disk if configured, and uploading
        the wakeword data to the cloud if the user has opted in to sharing.
"""
# Save and upload positive wake words as appropriate
upload_allowed = (self.config['opt_in'] and not self.upload_disabled)
if (self.save_wake_words or upload_allowed):
audio = self._create_audio_data(audio_data, source)
metadata = self._compile_metadata()
if self.save_wake_words:
# Save wake word locally
self._write_wakeword_to_disk(audio, metadata)
# Upload wake word for opt_in people
if upload_allowed:
self._upload_wakeword(audio, metadata)
def _wait_until_wake_word(self, source, sec_per_buffer):
"""Listen continuously on source until a wake word is spoken
Args:
source (AudioSource): Source producing the audio chunks
sec_per_buffer (float): Fractional number of seconds in each chunk
"""
# The maximum audio in seconds to keep for transcribing a phrase
# The wake word must fit in this time
ww_duration = self.wake_word_recognizer.expected_duration
ww_test_duration = max(3, ww_duration)
mic_write_counter = 0
num_silent_bytes = int(self.SILENCE_SEC * source.SAMPLE_RATE *
source.SAMPLE_WIDTH)
silence = get_silence(num_silent_bytes)
# Max bytes for byte_data before audio is removed from the front
max_size = source.duration_to_bytes(ww_duration)
test_size = source.duration_to_bytes(ww_test_duration)
audio_buffer = CyclicAudioBuffer(max_size, silence)
buffers_per_check = self.SEC_BETWEEN_WW_CHECKS / sec_per_buffer
buffers_since_check = 0.0
# Rolling buffer to track the audio energy (loudness) heard on
# the source recently. An average audio energy is maintained
# based on these levels.
average_samples = int(5 / sec_per_buffer) # average over last 5 secs
audio_mean = RollingMean(average_samples)
# These are frames immediately after wake word is detected
# that we want to keep to send to STT
ww_frames = deque(maxlen=7)
said_wake_word = False
audio_data = None
while (not said_wake_word and not self._stop_signaled and
not self._skip_wake_word()):
chunk = self.record_sound_chunk(source)
audio_buffer.append(chunk)
ww_frames.append(chunk)
energy = self.calc_energy(chunk, source.SAMPLE_WIDTH)
audio_mean.append_sample(energy)
if energy < self.energy_threshold * self.multiplier:
self._adjust_threshold(energy, sec_per_buffer)
# maintain the threshold using average
if self.energy_threshold < energy < audio_mean.value * 1.5:
# bump the threshold to just above this value
self.energy_threshold = energy * 1.2
# Periodically output energy level stats. This can be used to
# visualize the microphone input, e.g. a needle on a meter.
if mic_write_counter % 3:
self._watchdog()
self.write_mic_level(energy, source)
mic_write_counter += 1
buffers_since_check += 1.0
# Send chunk to wake_word_recognizer
self.wake_word_recognizer.update(chunk)
if buffers_since_check > buffers_per_check:
buffers_since_check -= buffers_per_check
audio_data = audio_buffer.get_last(test_size) + silence
said_wake_word = \
self.wake_word_recognizer.found_wake_word(audio_data)
self._listen_triggered = False
return WakeWordData(audio_data, said_wake_word,
self._stop_signaled, ww_frames)
@staticmethod
def _create_audio_data(raw_data, source):
"""
Constructs an AudioData instance with the same parameters
as the source and the specified frame_data
"""
return AudioData(raw_data, source.SAMPLE_RATE, source.SAMPLE_WIDTH)
    def mute_and_confirm_listening(self, source):
        """Mute the mic, play the listening sound, then unmute.

        Returns:
            bool: True if a confirmation sound was played, else False
        """
audio_file = resolve_resource_file(
self.config.get('sounds').get('start_listening'))
if audio_file:
source.mute()
play_wav(audio_file).wait()
source.unmute()
return True
else:
return False
def listen(self, source, emitter, stream=None):
"""Listens for chunks of audio that Mycroft should perform STT on.
This will listen continuously for a wake-up-word, then return the
audio chunk containing the spoken phrase that comes immediately
afterwards.
Args:
source (AudioSource): Source producing the audio chunks
emitter (EventEmitter): Emitter for notifications of when recording
begins and ends.
stream (AudioStreamHandler): Stream target that will receive chunks
of the utterance audio while it is
being recorded
Returns:
AudioData: audio with the user's utterance, minus the wake-up-word
"""
assert isinstance(source, AudioSource), "Source must be an AudioSource"
# bytes_per_sec = source.SAMPLE_RATE * source.SAMPLE_WIDTH
sec_per_buffer = float(source.CHUNK) / source.SAMPLE_RATE
# Every time a new 'listen()' request begins, reset the threshold
# used for silence detection. This is as good of a reset point as
# any, as we expect the user and Mycroft to not be talking.
# NOTE: adjust_for_ambient_noise() doc claims it will stop early if
# speech is detected, but there is no code to actually do that.
self.adjust_for_ambient_noise(source, 1.0)
LOG.debug("Waiting for wake word...")
ww_data = self._wait_until_wake_word(source, sec_per_buffer)
ww_frames = None
if ww_data.found:
# If the wakeword was heard send it
self._send_wakeword_info(emitter)
self._handle_wakeword_found(ww_data.audio, source)
ww_frames = ww_data.end_audio
if ww_data.stopped:
# If the waiting returned from a stop signal
return
LOG.debug("Recording...")
# If enabled, play a wave file with a short sound to audibly
# indicate recording has begun.
if self.config.get('confirm_listening'):
if self.mute_and_confirm_listening(source):
                # Clear frames from wake word detection since they're
                # irrelevant after the mute - play wav - unmute sequence
ww_frames = None
# Notify system of recording start
emitter.emit("recognizer_loop:record_begin")
frame_data = self._record_phrase(
source,
sec_per_buffer,
stream,
ww_frames
)
audio_data = self._create_audio_data(frame_data, source)
emitter.emit("recognizer_loop:record_end")
if self.save_utterances:
LOG.info("Recording utterance")
            stamp = str(datetime.datetime.now())
            filename = join(self.saved_utterances_dir, stamp + '.wav')
            with open(filename, 'wb') as f:
                f.write(audio_data.get_wav_data())
LOG.debug("Thinking...")
return audio_data
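
    # Usage sketch (illustrative only): how a caller such as the speech
    # client's listener loop might drive this method. `recognizer`, `mic`
    # and `bus` are assumed to be constructed elsewhere.
    #
    #     with mic as source:
    #         audio = recognizer.listen(source, bus)
    #         if audio:
    #             pass  # hand `audio` to the STT stage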
    def _adjust_threshold(self, energy, seconds_per_buffer):
        """Move the energy threshold toward the measured ambient energy."""
if self.dynamic_energy_threshold and energy > 0:
# account for different chunk sizes and rates
damping = (
self.dynamic_energy_adjustment_damping ** seconds_per_buffer)
target_energy = energy * self.energy_ratio
self.energy_threshold = (
self.energy_threshold * damping +
target_energy * (1 - damping))
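
    # Worked example (assuming energy_ratio=1.5, the mycroft.conf default,
    # and the speech_recognition default damping of 0.15): with
    # seconds_per_buffer=0.064, damping = 0.15 ** 0.064 ~= 0.886, so a quiet
    # chunk with energy=100 against a threshold of 300 gives
    # 300 * 0.886 + 100 * 1.5 * (1 - 0.886) ~= 283, slowly decaying the
    # threshold toward 1.5x the ambient energy.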