Merge pull request #2267 from MycroftAI/feature/better_no_confirm_listening

Improve operation when confirm listening is disabled by adding the last frames sent to wake word detection to the transcription queue.
Åke 2019-08-22 21:01:14 +02:00 committed by GitHub
commit e6dc2d145e
1 changed file with 36 additions and 12 deletions
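In essence, the listener keeps the last few audio chunks that were fed to the wake word detector in a bounded deque and replays them at the start of phrase recording, so the beginning of the utterance is not lost when no confirmation sound is played. A rough sketch of the pattern (names like read_chunk and the loop bounds are placeholders, not the actual mic.py API):

from collections import deque

def read_chunk():
    # Placeholder for the real microphone read (record_sound_chunk in mic.py);
    # returns one buffer of raw audio bytes.
    return b"\x00" * 1024

# Keep only the most recent 7 chunks heard while listening for the wake word.
ww_frames = deque(maxlen=7)

# Wake word loop: every chunk fed to the detector is also remembered.
for _ in range(20):          # placeholder for "until the wake word is detected"
    ww_frames.append(read_chunk())

# Phrase recording: drain the remembered chunks first, then read live audio,
# so the start of the utterance reaches STT even without a confirmation beep.
byte_data = b""
for _ in range(50):          # placeholder for "until the phrase is complete"
    byte_data += ww_frames.popleft() if ww_frames else read_chunk()

Because the deque is bounded, older chunks fall off automatically while waiting for the wake word. As the diff below shows, when confirm listening is enabled the buffered frames are discarded (ww_frames is set to None) after the confirmation sound plays, since they are stale by then.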


@@ -15,7 +15,7 @@
import audioop
from time import sleep, time as get_time
import collections
from collections import deque
import datetime
import json
import os
@@ -79,7 +79,7 @@ class MutableStream:
Returns:
Data read from device
"""
frames = collections.deque()
frames = deque()
remaining = size
while remaining > 0:
# If muted during read return empty buffer. This ensures no
@@ -234,7 +234,13 @@ class ResponsiveRecognizer(speech_recognition.Recognizer):
def calc_energy(sound_chunk, sample_width):
return audioop.rms(sound_chunk, sample_width)
def _record_phrase(self, source, sec_per_buffer, stream=None):
def _record_phrase(
self,
source,
sec_per_buffer,
stream=None,
ww_frames=None
):
"""Record an entire spoken phrase.
Essentially, this code waits for a period of silence and then returns
@@ -246,7 +252,9 @@ class ResponsiveRecognizer(speech_recognition.Recognizer):
sec_per_buffer (float): Fractional number of seconds in each chunk
stream (AudioStreamHandler): Stream target that will receive chunks
of the utterance audio while it is
being recorded
being recorded.
ww_frames (deque): Frames of audio data from the last part of wake
word detection.
Returns:
bytearray: complete audio buffer recorded, including any
@@ -290,7 +298,10 @@ class ResponsiveRecognizer(speech_recognition.Recognizer):
phrase_complete = False
while num_chunks < max_chunks and not phrase_complete:
chunk = self.record_sound_chunk(source)
if ww_frames:
chunk = ww_frames.popleft()
else:
chunk = self.record_sound_chunk(source)
byte_data += chunk
num_chunks += 1
@@ -311,7 +322,6 @@ class ResponsiveRecognizer(speech_recognition.Recognizer):
with open(self.mic_level_file, 'w') as f:
f.write("Energy: cur=" + str(energy) + " thresh=" +
str(self.energy_threshold))
f.close()
was_loud_enough = num_loud_chunks > min_loud_chunks
@@ -421,10 +431,15 @@ class ResponsiveRecognizer(speech_recognition.Recognizer):
energy_avg_samples = int(5 / sec_per_buffer) # avg over last 5 secs
counter = 0
# These are frames immediately after wake word is detected
# that we want to keep to send to STT
ww_frames = deque(maxlen=7)
while not said_wake_word and not self._stop_signaled:
if self._skip_wake_word():
break
chunk = self.record_sound_chunk(source)
ww_frames.append(chunk)
energy = self.calc_energy(chunk, source.SAMPLE_WIDTH)
if energy < self.energy_threshold * self.multiplier:
@@ -453,7 +468,6 @@ class ResponsiveRecognizer(speech_recognition.Recognizer):
with open(self.mic_level_file, 'w') as f:
f.write("Energy: cur=" + str(energy) + " thresh=" +
str(self.energy_threshold))
f.close()
counter += 1
# At first, the buffer is empty and must fill up. After that
@@ -486,9 +500,10 @@ class ResponsiveRecognizer(speech_recognition.Recognizer):
os.mkdir(self.saved_wake_words_dir)
module = self.wake_word_recognizer.__class__.__name__
fn = join(self.saved_wake_words_dir,
'_'.join([str(mtd[k]) for k in sorted(mtd)])
+ '.wav')
fn = join(
self.saved_wake_words_dir,
'_'.join(str(mtd[k]) for k in sorted(mtd)) + '.wav'
)
with open(fn, 'wb') as f:
f.write(audio.get_wav_data())
@@ -500,6 +515,7 @@ class ResponsiveRecognizer(speech_recognition.Recognizer):
self._create_audio_data(byte_data, source),
mtd or self._compile_metadata()]
).start()
return ww_frames
@staticmethod
def _create_audio_data(raw_data, source):
@@ -540,7 +556,7 @@ class ResponsiveRecognizer(speech_recognition.Recognizer):
self.adjust_for_ambient_noise(source, 1.0)
LOG.debug("Waiting for wake word...")
self._wait_until_wake_word(source, sec_per_buffer)
ww_frames = self._wait_until_wake_word(source, sec_per_buffer)
if self._stop_signaled:
return
@@ -556,8 +572,16 @@ class ResponsiveRecognizer(speech_recognition.Recognizer):
source.mute()
play_wav(audio_file).wait()
source.unmute()
# Clear frames from wake word detection since they're
# irrelevant after mute - play wav - unmute sequence
ww_frames = None
frame_data = self._record_phrase(source, sec_per_buffer, stream)
frame_data = self._record_phrase(
source,
sec_per_buffer,
stream,
ww_frames
)
audio_data = self._create_audio_data(frame_data, source)
emitter.emit("recognizer_loop:record_end")
if self.save_utterances: