This fixes several issues related to wake-up words and audio recording, including #595.

* The wake-up sound is now played synchronously, so it is no longer captured in the recorded audio
* A recorded phrase must now contain at least 0.5 seconds of noise (up from 0.1), and must end with at least 0.25 seconds of continuous silence before it is considered complete (see the first sketch below)
* The silence threshold is reset every time we begin listening for a wake word
* While waiting for the wake word, the silence threshold adjusts upward to sit just above the ambient sound level (see the second sketch below)
* Reformatted some comments as Google-style docstrings, and added more comments
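
Two short sketches illustrate the new behavior in isolation. Both are simplified stand-ins written for this summary, not code from the commit: the names phrase_is_complete, is_loud, and RollingEnergy are hypothetical, while the constants and the 1.5x/1.2x factors mirror the diff below.

First, the end-of-phrase rule in _record_phrase(): a phrase is complete once enough noise has been heard and the audio then stays quiet for an unbroken stretch at the end. A minimal sketch, assuming a chunk iterator and a hypothetical is_loud() helper:

    # Constants mirror the diff below; everything else is illustrative.
    MIN_LOUD_SEC_PER_PHRASE = 0.5   # minimum seconds of noise in a phrase
    MIN_SILENCE_AT_END = 0.25       # minimum unbroken trailing silence
    RECORDING_TIMEOUT = 10.0        # hard cap on recording length

    def phrase_is_complete(chunks, sec_per_buffer, is_loud):
        """Return True once a spoken phrase has plausibly ended."""
        loud_sec = silence_sec = elapsed = 0.0
        for chunk in chunks:
            elapsed += sec_per_buffer
            if is_loud(chunk):
                loud_sec += sec_per_buffer
                silence_sec = 0.0     # any noise restarts the silence run
            else:
                silence_sec += sec_per_buffer
            enough_noise = loud_sec >= MIN_LOUD_SEC_PER_PHRASE
            quiet_enough = silence_sec >= MIN_SILENCE_AT_END
            if (enough_noise and quiet_enough) or elapsed >= RECORDING_TIMEOUT:
                return True
        return False

Note that a single loud chunk resets the trailing-silence counter, so the recording only ends on a genuinely unbroken quiet stretch.

Second, the upward threshold adjustment in _wait_until_wake_word(): a rolling window of roughly five seconds of energy readings maintains a running ambient average, and a reading that is near that average but above the current threshold bumps the threshold to just above it. A sketch under the same caveats, with the RollingEnergy class purely illustrative:

    class RollingEnergy(object):
        def __init__(self, sec_per_buffer, window_sec=5.0):
            self.size = int(window_sec / sec_per_buffer)
            self.energies = []
            self.idx = 0
            self.avg = 0.0

        def adjust(self, energy, threshold):
            if len(self.energies) < self.size:
                # still filling the window; just build the average
                self.energies.append(energy)
                self.avg += float(energy) / self.size
                return threshold
            # replace the oldest sample, keeping a running average
            self.avg -= float(self.energies[self.idx]) / self.size
            self.avg += float(energy) / self.size
            self.energies[self.idx] = energy
            self.idx = (self.idx + 1) % self.size
            # not a transient spike (>1.5x ambient) but above the current
            # threshold: raise the threshold to just above the ambient level
            if energy < self.avg * 1.5 and energy > threshold:
                threshold = energy * 1.2
            return threshold

In the commit itself this logic runs inline in the wake-word loop, alongside the existing downward adjustment performed by _adjust_threshold().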
pull/660/head
penrods 2017-04-09 16:34:48 -07:00 committed by Steve Penrod
parent 0036f404b5
commit 23c445a904
2 changed files with 111 additions and 39 deletions

mycroft/client/speech/main.py

@@ -27,7 +27,7 @@ from mycroft.identity import IdentityManager
 from mycroft.messagebus.client.ws import WebsocketClient
 from mycroft.messagebus.message import Message
 from mycroft.tts import TTSFactory
-from mycroft.util import kill, play_wav, resolve_resource_file, create_signal
+from mycroft.util import kill, create_signal
 from mycroft.util.log import getLogger
 from mycroft.lock import Lock as PIDLock  # Create/Support PID locking file
@@ -42,15 +42,6 @@ config = ConfigurationManager.get()

 def handle_record_begin():
     logger.info("Begin Recording...")
-    # If enabled, play a wave file with a short sound to audibly
-    # indicate recording has begun.
-    if config.get('confirm_listening'):
-        file = resolve_resource_file(
-            config.get('sounds').get('start_listening'))
-        if file:
-            play_wav(file)
     ws.emit(Message('recognizer_loop:record_begin'))

mycroft/client/speech/mic.py

@@ -30,10 +30,16 @@ from speech_recognition import (
 )
 from mycroft.configuration import ConfigurationManager
-from mycroft.util import check_for_signal, get_ipc_directory
+from mycroft.util import (
+    check_for_signal,
+    get_ipc_directory,
+    resolve_resource_file,
+    play_wav
+)
 from mycroft.util.log import getLogger

-listener_config = ConfigurationManager.get().get('listener')
+config = ConfigurationManager.get()
+listener_config = config.get('listener')
 logger = getLogger(__name__)

 __author__ = 'seanfitz'
@@ -130,9 +136,13 @@ class ResponsiveRecognizer(speech_recognition.Recognizer):
     # The minimum seconds of noise before a
     # phrase can be considered complete
-    MIN_LOUD_SEC_PER_PHRASE = 0.1
+    MIN_LOUD_SEC_PER_PHRASE = 0.5

-    # The maximum length a phrase can be recorded,
+    # The minimum seconds of silence required at the end
+    # before a phrase will be considered complete
+    MIN_SILENCE_AT_END = 0.25
+
+    # The maximum seconds a phrase can be recorded,
     # provided there is noise the entire time
     RECORDING_TIMEOUT = 10.0
@@ -163,22 +173,30 @@ class ResponsiveRecognizer(speech_recognition.Recognizer):
         hyp = self.wake_word_recognizer.transcribe(frame_data)
         return self.wake_word_recognizer.found_wake_word(hyp)

-    def record_phrase(self, source, sec_per_buffer):
-        """
-        This attempts to record an entire spoken phrase. Essentially,
-        this waits for a period of silence and then returns the audio
+    def _record_phrase(self, source, sec_per_buffer):
+        """Record an entire spoken phrase.

-        :rtype: bytearray
-        :param source: AudioSource
-        :param sec_per_buffer: Based on source.SAMPLE_RATE
-        :return: bytearray representing the frame_data of the recorded phrase
+        Essentially, this code waits for a period of silence and then returns
+        the audio.  If silence isn't detected, it will terminate and return
+        a buffer of RECORDING_TIMEOUT duration.
+
+        Args:
+            source (AudioSource): Source producing the audio chunks
+            sec_per_buffer (float): Fractional number of seconds in each chunk
+
+        Returns:
+            bytearray: complete audio buffer recorded, including any
+                       silence at the end of the user's utterance
         """
         num_loud_chunks = 0
         noise = 0

         max_noise = 25
         min_noise = 0

+        silence_duration = 0
+
         def increase_noise(level):
             if level < max_noise:
                 return level + 200 * sec_per_buffer
@@ -217,7 +235,7 @@ class ResponsiveRecognizer(speech_recognition.Recognizer):
                 num_loud_chunks += 1
             else:
                 noise = decrease_noise(noise)
-                self.adjust_threshold(energy, sec_per_buffer)
+                self._adjust_threshold(energy, sec_per_buffer)

             if num_chunks % 10 == 0:
                 with open(self.mic_level_file, 'w') as f:
@@ -226,7 +244,14 @@ class ResponsiveRecognizer(speech_recognition.Recognizer):
                 f.close()

             was_loud_enough = num_loud_chunks > min_loud_chunks
+
             quiet_enough = noise <= min_noise
+            if quiet_enough:
+                silence_duration += sec_per_buffer
+                # require silence to last at least MIN_SILENCE_AT_END sec
+                if silence_duration < self.MIN_SILENCE_AT_END:
+                    quiet_enough = False
+            else:
+                silence_duration = 0
+
             recorded_too_much_silence = num_chunks > max_chunks_of_silence
             if quiet_enough and (was_loud_enough or recorded_too_much_silence):
                 phrase_complete = True
@@ -239,7 +264,13 @@ class ResponsiveRecognizer(speech_recognition.Recognizer):
     def sec_to_bytes(sec, source):
         return sec * source.SAMPLE_RATE * source.SAMPLE_WIDTH

-    def wait_until_wake_word(self, source, sec_per_buffer):
+    def _wait_until_wake_word(self, source, sec_per_buffer):
+        """Listen continuously on source until a wake word is spoken
+
+        Args:
+            source (AudioSource): Source producing the audio chunks
+            sec_per_buffer (float): Fractional number of seconds in each chunk
+        """
         num_silent_bytes = int(self.SILENCE_SEC * source.SAMPLE_RATE *
                                source.SAMPLE_WIDTH)
@@ -255,7 +286,17 @@ class ResponsiveRecognizer(speech_recognition.Recognizer):
         max_size = self.sec_to_bytes(self.SAVED_WW_SEC, source)

         said_wake_word = False

+        # Rolling buffer to track the audio energy (loudness) heard on
+        # the source recently.  An average audio energy is maintained
+        # based on these levels.
+        energies = []
+        idx_energy = 0
+        avg_energy = 0.0
+        energy_avg_samples = int(5 / sec_per_buffer)  # avg over last 5 secs
+
         counter = 0

         while not said_wake_word:
             if check_for_signal('buttonPress'):
                 said_wake_word = True
@@ -265,16 +306,33 @@ class ResponsiveRecognizer(speech_recognition.Recognizer):
             energy = self.calc_energy(chunk, source.SAMPLE_WIDTH)
             if energy < self.energy_threshold * self.multiplier:
-                self.adjust_threshold(energy, sec_per_buffer)
+                self._adjust_threshold(energy, sec_per_buffer)

-            if counter > 2:
+            if len(energies) < energy_avg_samples:
+                # build the average
+                energies.append(energy)
+                avg_energy += float(energy)/energy_avg_samples
+            else:
+                # maintain the running average and rolling buffer
+                avg_energy -= float(energies[idx_energy])/energy_avg_samples
+                avg_energy += float(energy)/energy_avg_samples
+                energies[idx_energy] = energy
+                idx_energy = (idx_energy+1) % energy_avg_samples
+
+                # maintain the threshold using average
+                if energy < avg_energy*1.5:
+                    if energy > self.energy_threshold:
+                        # bump the threshold to just above this value
+                        self.energy_threshold = energy*1.2
+
+            # Periodically output energy level stats.  This can be used to
+            # visualize the microphone input, e.g. a needle on a meter.
+            if counter % 3:
                 with open(self.mic_level_file, 'w') as f:
                     f.write("Energy: cur=" + str(energy) + " thresh=" +
                             str(self.energy_threshold))
                 f.close()
-                counter = 0
-            else:
-                counter += 1
+            counter += 1

             # At first, the buffer is empty and must fill up.  After that
             # just drop the first chunk bytes to keep it the same size.
@@ -290,7 +348,7 @@ class ResponsiveRecognizer(speech_recognition.Recognizer):
             said_wake_word = self.wake_word_in_audio(byte_data + silence)

     @staticmethod
-    def create_audio_data(raw_data, source):
+    def _create_audio_data(raw_data, source):
         """
         Constructs an AudioData instance with the same parameters
         as the source and the specified frame_data
@@ -298,31 +356,54 @@ class ResponsiveRecognizer(speech_recognition.Recognizer):
         return AudioData(raw_data, source.SAMPLE_RATE, source.SAMPLE_WIDTH)

     def listen(self, source, emitter):
-        """
-        Listens for audio that Mycroft should respond to
+        """Listens for chunks of audio that Mycroft should perform STT on.

-        :param source: an ``AudioSource`` instance for reading from
-        :param emitter: a pyee EventEmitter for sending when the wakeword
-                        has been found
+        This will listen continuously for a wake-up-word, then return the
+        audio chunk containing the spoken phrase that comes immediately
+        afterwards.
+
+        Args:
+            source (AudioSource): Source producing the audio chunks
+            emitter (EventEmitter): Emitter for notifications of when recording
+                                    begins and ends.
+
+        Returns:
+            AudioData: audio with the user's utterance, minus the wake-up-word
         """
         assert isinstance(source, AudioSource), "Source must be an AudioSource"

-        bytes_per_sec = source.SAMPLE_RATE * source.SAMPLE_WIDTH
+        # bytes_per_sec = source.SAMPLE_RATE * source.SAMPLE_WIDTH
         sec_per_buffer = float(source.CHUNK) / source.SAMPLE_RATE

+        # Every time a new 'listen()' request begins, reset the threshold
+        # used for silence detection.  This is as good of a reset point as
+        # any, as we expect the user and Mycroft to not be talking.
+        # NOTE: adjust_for_ambient_noise() doc claims it will stop early if
+        #       speech is detected, but there is no code to actually do that.
+        self.adjust_for_ambient_noise(source, 1.0)
+
         logger.debug("Waiting for wake word...")
-        self.wait_until_wake_word(source, sec_per_buffer)
+        self._wait_until_wake_word(source, sec_per_buffer)

         logger.debug("Recording...")
         emitter.emit("recognizer_loop:record_begin")

+        # If enabled, play a wave file with a short sound to audibly
+        # indicate recording has begun.
+        if config.get('confirm_listening'):
+            file = resolve_resource_file(
+                config.get('sounds').get('start_listening'))
+            if file:
+                play_wav(file)
+
-        frame_data = self.record_phrase(source, sec_per_buffer)
-        audio_data = self.create_audio_data(frame_data, source)
+        frame_data = self._record_phrase(source, sec_per_buffer)
+        audio_data = self._create_audio_data(frame_data, source)
         emitter.emit("recognizer_loop:record_end")

         logger.debug("Thinking...")
         return audio_data

-    def adjust_threshold(self, energy, seconds_per_buffer):
+    def _adjust_threshold(self, energy, seconds_per_buffer):
         if self.dynamic_energy_threshold and energy > 0:
             # account for different chunk sizes and rates
             damping = (