Merge pull request #2267 from MycroftAI/feature/better_no_confirm_listening

Improve operation when confirm listening is disabled by adding the last frames sent to wake word detection to the transcription queue.
Åke 2019-08-22 21:01:14 +02:00 committed by GitHub
commit e6dc2d145e
1 changed file with 36 additions and 12 deletions
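In essence, the listener keeps the last few audio chunks that were fed to the wake word detector in a bounded deque and replays them at the start of phrase recording, so the beginning of the utterance is not lost when no confirmation sound is played. A rough sketch of the pattern (names like read_chunk and the loop bounds are placeholders, not the actual mic.py API):

from collections import deque

def read_chunk():
    # Placeholder for the real microphone read (record_sound_chunk in mic.py);
    # returns one buffer of raw audio bytes.
    return b"\x00" * 1024

# Keep only the most recent 7 chunks heard while listening for the wake word.
ww_frames = deque(maxlen=7)

# Wake word loop: every chunk fed to the detector is also remembered.
for _ in range(20):          # placeholder for "until the wake word is detected"
    ww_frames.append(read_chunk())

# Phrase recording: drain the remembered chunks first, then read live audio,
# so the start of the utterance reaches STT even without a confirmation beep.
byte_data = b""
for _ in range(50):          # placeholder for "until the phrase is complete"
    byte_data += ww_frames.popleft() if ww_frames else read_chunk()

Because the deque is bounded, older chunks fall off automatically while waiting for the wake word. As the diff below shows, when confirm listening is enabled the buffered frames are discarded (ww_frames is set to None) after the confirmation sound plays, since they are stale by then.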


@@ -15,7 +15,7 @@
import audioop
from time import sleep, time as get_time
import collections
from collections import deque
import datetime
import json
import os
@@ -79,7 +79,7 @@ class MutableStream:
Returns:
Data read from device
"""
frames = collections.deque()
frames = deque()
remaining = size
while remaining > 0:
# If muted during read return empty buffer. This ensures no
@@ -234,7 +234,13 @@ class ResponsiveRecognizer(speech_recognition.Recognizer):
def calc_energy(sound_chunk, sample_width):
return audioop.rms(sound_chunk, sample_width)
def _record_phrase(self, source, sec_per_buffer, stream=None):
def _record_phrase(
self,
source,
sec_per_buffer,
stream=None,
ww_frames=None
):
"""Record an entire spoken phrase.
Essentially, this code waits for a period of silence and then returns
@@ -246,7 +252,9 @@ class ResponsiveRecognizer(speech_recognition.Recognizer):
sec_per_buffer (float): Fractional number of seconds in each chunk
stream (AudioStreamHandler): Stream target that will receive chunks
of the utterance audio while it is
being recorded
being recorded.
ww_frames (deque): Frames of audio data from the last part of wake
word detection.
Returns:
bytearray: complete audio buffer recorded, including any
@@ -290,7 +298,10 @@ class ResponsiveRecognizer(speech_recognition.Recognizer):
phrase_complete = False
while num_chunks < max_chunks and not phrase_complete:
chunk = self.record_sound_chunk(source)
if ww_frames:
chunk = ww_frames.popleft()
else:
chunk = self.record_sound_chunk(source)
byte_data += chunk
num_chunks += 1
@@ -311,7 +322,6 @@ class ResponsiveRecognizer(speech_recognition.Recognizer):
with open(self.mic_level_file, 'w') as f:
f.write("Energy: cur=" + str(energy) + " thresh=" +
str(self.energy_threshold))
f.close()
was_loud_enough = num_loud_chunks > min_loud_chunks
@@ -421,10 +431,15 @@ class ResponsiveRecognizer(speech_recognition.Recognizer):
energy_avg_samples = int(5 / sec_per_buffer) # avg over last 5 secs
counter = 0
# These are frames immediately after wake word is detected
# that we want to keep to send to STT
ww_frames = deque(maxlen=7)
while not said_wake_word and not self._stop_signaled:
if self._skip_wake_word():
break
chunk = self.record_sound_chunk(source)
ww_frames.append(chunk)
energy = self.calc_energy(chunk, source.SAMPLE_WIDTH)
if energy < self.energy_threshold * self.multiplier:
@@ -453,7 +468,6 @@ class ResponsiveRecognizer(speech_recognition.Recognizer):
with open(self.mic_level_file, 'w') as f:
f.write("Energy: cur=" + str(energy) + " thresh=" +
str(self.energy_threshold))
f.close()
counter += 1
# At first, the buffer is empty and must fill up. After that
@@ -486,9 +500,10 @@ class ResponsiveRecognizer(speech_recognition.Recognizer):
os.mkdir(self.saved_wake_words_dir)
module = self.wake_word_recognizer.__class__.__name__
fn = join(self.saved_wake_words_dir,
'_'.join([str(mtd[k]) for k in sorted(mtd)])
+ '.wav')
fn = join(
self.saved_wake_words_dir,
'_'.join(str(mtd[k]) for k in sorted(mtd)) + '.wav'
)
with open(fn, 'wb') as f:
f.write(audio.get_wav_data())
@@ -500,6 +515,7 @@ class ResponsiveRecognizer(speech_recognition.Recognizer):
self._create_audio_data(byte_data, source),
mtd or self._compile_metadata()]
).start()
return ww_frames
@staticmethod
def _create_audio_data(raw_data, source):
@@ -540,7 +556,7 @@ class ResponsiveRecognizer(speech_recognition.Recognizer):
self.adjust_for_ambient_noise(source, 1.0)
LOG.debug("Waiting for wake word...")
self._wait_until_wake_word(source, sec_per_buffer)
ww_frames = self._wait_until_wake_word(source, sec_per_buffer)
if self._stop_signaled:
return
@@ -556,8 +572,16 @@ class ResponsiveRecognizer(speech_recognition.Recognizer):
source.mute()
play_wav(audio_file).wait()
source.unmute()
# Clear frames from wake word detection since they're
# irrelevant after mute - play wav - unmute sequence
ww_frames = None
frame_data = self._record_phrase(source, sec_per_buffer, stream)
frame_data = self._record_phrase(
source,
sec_per_buffer,
stream,
ww_frames
)
audio_data = self._create_audio_data(frame_data, source)
emitter.emit("recognizer_loop:record_end")
if self.save_utterances: