# mycroft-core/mycroft/client/speech/mic.py
#
# 226 lines
# 8.4 KiB
# Python

import collections
import math
import audioop
from time import sleep
import pyaudio
from speech_recognition import (
Microphone,
AudioSource,
WaitTimeoutError,
AudioData
)
import speech_recognition
from mycroft.util.log import getLogger
# Module-level logger, named after this module.
logger = getLogger(__name__)
# Module author metadata.
__author__ = 'seanfitz'
class MutableStream(object):
    """Wrap an audio input stream so that its output can be silenced.

    While muted, the wrapped stream is still read (so captured audio does
    not accumulate in it), but ``read`` hands back silence instead of the
    captured samples.
    """

    def __init__(self, wrapped_stream, format, muted=False):
        """
        Args:
            wrapped_stream: stream to read from. It must provide ``read``,
                ``get_read_available``, ``get_input_latency``,
                ``is_stopped``, ``stop_stream`` and ``close``.
            format: PyAudio sample format constant; used to look up the
                sample width in bytes.
            muted: initial mute state.
        """
        assert wrapped_stream is not None
        self.wrapped_stream = wrapped_stream
        self.muted = muted
        self.SAMPLE_WIDTH = pyaudio.get_sample_size(format)
        # a single sample of silence
        self.muted_buffer = b'\x00' * self.SAMPLE_WIDTH

    def mute(self):
        """Make subsequent reads return silence."""
        self.muted = True

    def unmute(self):
        """Make subsequent reads return captured audio again."""
        self.muted = False

    def read(self, size):
        """Read ``size`` frames from the wrapped stream.

        Waits, polling every 10 ms, until ``size`` frames have been read
        from the wrapped stream.

        Args:
            size: number of frames to read.

        Returns:
            The captured bytes, or, while muted, ``size`` samples of
            silence so that the buffer covers the same duration as the
            audio that was discarded.
        """
        frames = collections.deque()
        remaining = size
        while remaining > 0:
            to_read = min(self.wrapped_stream.get_read_available(), remaining)
            if to_read == 0:
                # nothing captured yet; wait briefly rather than spin
                sleep(.01)
                continue
            frames.append(self.wrapped_stream.read(to_read))
            remaining -= to_read

        if self.muted:
            # The captured audio is dropped on purpose. Hand back a silent
            # buffer of the requested length (not just one sample) so
            # callers that count buffers as fixed slices of time stay in
            # sync.
            # NOTE(review): one sample per frame assumes a mono stream,
            # which is how MutableMicrophone opens it.
            return self.muted_buffer * size

        input_latency = self.wrapped_stream.get_input_latency()
        if input_latency > 0.2:
            logger.warning("High input latency: %f", input_latency)
        return b"".join(frames)

    def close(self):
        """Close the wrapped stream and drop the reference to it."""
        self.wrapped_stream.close()
        self.wrapped_stream = None

    def is_stopped(self):
        """Return whether the wrapped stream reports itself as stopped."""
        return self.wrapped_stream.is_stopped()

    def stop_stream(self):
        """Stop the wrapped stream and return whatever it returns."""
        return self.wrapped_stream.stop_stream()
class MutableMicrophone(Microphone):
    """Microphone audio source whose input can be muted and unmuted.

    The mute state is remembered on the microphone itself, so it can be set
    before the stream exists and is applied when the stream is opened.
    """

    def __init__(self, device_index=None, sample_rate=16000, chunk_size=1024):
        """
        Args:
            device_index: input device index passed on to ``Microphone``.
            sample_rate: sample rate passed on to ``Microphone``.
            chunk_size: chunk size passed on to ``Microphone``.
        """
        Microphone.__init__(
            self, device_index=device_index, sample_rate=sample_rate,
            chunk_size=chunk_size)
        self.muted = False

    def __enter__(self):
        """Open the mono input stream and return this microphone.

        If opening fails, the PyAudio instance created here is terminated
        before the exception propagates, so nothing is left allocated.
        """
        assert self.stream is None, \
            "This audio source is already inside a context manager"
        self.audio = pyaudio.PyAudio()
        try:
            self.stream = MutableStream(self.audio.open(
                input_device_index=self.device_index, channels=1,
                format=self.format, rate=self.SAMPLE_RATE,
                frames_per_buffer=self.CHUNK,
                input=True,  # stream is an input stream
            ), self.format, self.muted)
        except Exception:
            # do not leak the PyAudio instance when the stream cannot be
            # opened or wrapped
            self.audio.terminate()
            raise
        return self

    def __exit__(self, exc_type, exc_value, traceback):
        """Stop and close the stream, then terminate PyAudio.

        Each cleanup step runs even if an earlier one raises; the first
        exception raised still propagates to the caller.
        """
        try:
            if not self.stream.is_stopped():
                self.stream.stop_stream()
        finally:
            try:
                self.stream.close()
            finally:
                self.stream = None
                self.audio.terminate()

    def mute(self):
        """Mute the microphone, including the open stream if there is one."""
        self.muted = True
        if self.stream:
            self.stream.mute()

    def unmute(self):
        """Unmute the microphone, including the open stream if there is one."""
        self.muted = False
        if self.stream:
            self.stream.unmute()
class Recognizer(speech_recognition.Recognizer):
    """Recognizer that caps the recorded phrase length and ignores
    zero-energy (muted) audio when adapting its energy threshold."""

    def __init__(self):
        speech_recognition.Recognizer.__init__(self)
        # longest phrase, in seconds of buffered audio, that listen() records
        self.max_audio_length_sec = 30

    def listen(self, source, timeout=None):
        """
        Records a single phrase from ``source`` (an ``AudioSource`` instance)
        into an ``AudioData`` instance, which it returns.

        Audio is read until a buffer's energy exceeds
        ``self.energy_threshold`` (the user has started speaking). Recording
        then continues until more than ``self.pause_threshold`` seconds of
        non-speaking audio have been seen, the buffered audio reaches
        ``self.max_audio_length_sec`` seconds, or the source returns no more
        audio. Phrases shorter than ``self.phrase_threshold`` are discarded
        and listening starts over, unless the source has run out of audio.
        Trailing non-speaking audio beyond ``self.non_speaking_duration``
        seconds is dropped from the result.

        The ``timeout`` parameter is the maximum number of seconds of audio
        read while waiting for a phrase to start before giving up and
        raising a ``speech_recognition.WaitTimeoutError`` exception. A falsy
        ``timeout`` (``None`` or ``0``) waits indefinitely.
        """
        assert isinstance(source, AudioSource), \
            "Source must be an audio source"
        assert self.pause_threshold >= self.non_speaking_duration >= 0

        seconds_per_buffer = (source.CHUNK + 0.0) / source.SAMPLE_RATE
        # number of buffers of non-speaking audio before the phrase is
        # complete
        pause_buffer_count = int(
            math.ceil(self.pause_threshold / seconds_per_buffer))
        # minimum number of buffers of speaking audio before we consider the
        # speaking audio a phrase
        phrase_buffer_count = int(
            math.ceil(self.phrase_threshold / seconds_per_buffer))
        # maximum number of buffers of non-speaking audio to retain before
        # and after the phrase
        non_speaking_buffer_count = int(
            math.ceil(self.non_speaking_duration / seconds_per_buffer))

        # read audio input for phrases until there is a phrase that is long
        # enough
        elapsed_time = 0  # number of seconds of audio read
        while True:
            frames = collections.deque()

            # store audio input until the phrase starts
            while True:
                elapsed_time += seconds_per_buffer
                # handle timeout if specified
                if timeout and elapsed_time > timeout:
                    raise WaitTimeoutError("listening timed out")

                chunk = source.stream.read(source.CHUNK)
                if len(chunk) == 0:
                    break  # reached end of the stream
                frames.append(chunk)
                # ensure we only keep the needed amount of non-speaking
                # buffers
                if len(frames) > non_speaking_buffer_count:
                    frames.popleft()

                # detect whether speaking has started on audio input
                energy = audioop.rms(chunk, source.SAMPLE_WIDTH)
                if energy > self.energy_threshold:
                    break

                # dynamically adjust the energy threshold using an
                # asymmetric weighted average; adjust_energy_threshold
                # skips muted audio (energy == 0)
                self.adjust_energy_threshold(energy, seconds_per_buffer)

            # read audio input until the phrase ends
            pause_count, phrase_count = 0, 0
            while True:
                elapsed_time += seconds_per_buffer

                chunk = source.stream.read(source.CHUNK)
                if len(chunk) == 0:
                    break  # reached end of the stream
                frames.append(chunk)
                phrase_count += 1

                # check if speaking has stopped for longer than the pause
                # threshold on the audio input
                energy = audioop.rms(chunk, source.SAMPLE_WIDTH)
                if energy > self.energy_threshold:
                    pause_count = 0
                else:
                    pause_count += 1
                if pause_count > pause_buffer_count:  # end of the phrase
                    break

                if (len(frames) * seconds_per_buffer >=
                        self.max_audio_length_sec):
                    # the phrase never ended within the allowed length, so
                    # the threshold is probably too low for the current
                    # noise level: readjust it over everything recorded
                    for frame in frames:
                        self.adjust_energy_threshold(
                            audioop.rms(frame, source.SAMPLE_WIDTH),
                            seconds_per_buffer)
                    break

            # check how long the detected phrase is, and retry listening if
            # the phrase is too short
            phrase_count -= pause_count
            # Also stop when the source has run out of audio: retrying on an
            # exhausted stream would only read empty buffers again and never
            # terminate.
            if phrase_count >= phrase_buffer_count or len(chunk) == 0:
                break

        # remove extra non-speaking frames at the end
        for _ in range(pause_count - non_speaking_buffer_count):
            frames.pop()

        frame_data = b"".join(frames)
        return AudioData(frame_data, source.SAMPLE_RATE, source.SAMPLE_WIDTH)

    def adjust_energy_threshold(self, energy, seconds_per_buffer):
        """Move ``self.energy_threshold`` toward a target derived from
        ``energy``.

        Does nothing unless ``self.dynamic_energy_threshold`` is set, and
        skips samples with zero energy (such as muted audio).

        Args:
            energy: RMS energy of the buffer just read.
            seconds_per_buffer: duration of one buffer, used to scale the
                damping so chunk size and sample rate do not change the
                adaptation speed.
        """
        if self.dynamic_energy_threshold and energy > 0:
            # account for different chunk sizes and rates
            damping = (
                self.dynamic_energy_adjustment_damping ** seconds_per_buffer)
            target_energy = energy * self.dynamic_energy_ratio
            self.energy_threshold = (
                self.energy_threshold * damping +
                target_energy * (1 - damping))