Initial Google Streaming STT implementation off of PR 2149 with dev logs scattered about

2019-06-10 17:32:38 -05:00 · 2019-06-10 17:32:38 -05:00 · 16248ee313
parent 7bbfe0c9fb
commit 16248ee313
2 changed files with 102 additions and 1 deletions
--- a/mycroft/client/speech/listener.py
+++ b/mycroft/client/speech/listener.py
@ -74,6 +74,7 @@ class AudioProducer(Thread):
            self.recognizer.adjust_for_ambient_noise(source)
            while self.state.running:
                try:
+                    LOG.info('WAGNER listen')
                    audio = self.recognizer.listen(source, self.emitter,
                                                   self.stream_handler)
                    self.queue.put((AUDIO_DATA, audio))
@ -132,6 +133,8 @@ class AudioConsumer(Thread):

        tag, data = message

+        LOG.info('WAGNER MESSAGE TAG {}'.format(tag))
+
        if tag == AUDIO_DATA:
            if self.state.sleeping:
                self.wake_up(data)
@ -144,7 +147,7 @@ class AudioConsumer(Thread):
        elif tag == STREAM_STOP:
            self.stt.stream_stop()
        else:
-            LOG.error("Unknown audio queue type %r" % audio)
+            LOG.error("Unknown audio queue type %r" % tag)

    # TODO: Localization
    def wake_up(self, audio):
--- a/mycroft/stt/init.py
+++ b/mycroft/stt/init.py
@ -20,6 +20,9 @@ from speech_recognition import Recognizer
 from queue import Queue
 from threading import Thread

+from google.cloud import speech
+from google.oauth2 import service_account
+
 from mycroft.api import STTApi
 from mycroft.configuration import Configuration
 from mycroft.util.log import LOG
@ -240,6 +243,100 @@ class DeepSpeechStreamServerSTT(DeepSpeechServerSTT):
        self.stream.start()


+class GoogleStreamThread(Thread):
+    def __init__(self, queue, lang, client, streaming_config):
+        super().__init__()
+        LOG.info('WAGNER Thread init')
+        self.lang = lang
+        self.client = client
+        self.streaming_config = streaming_config
+        self.queue = queue
+        self.response = None
+        self.text = ''
+
+    def _get_data(self):
+        LOG.info('WAGNER Thread get_data init')
+        while True:
+            d = self.queue.get()
+            if d is None:
+                break
+            LOG.info('WAGNER Thread get_data yield d')
+            yield d
+            self.queue.task_done()
+
+    def run(self):
+        LOG.info('WAGNER RUN!')
+        audio = self._get_data()
+        req = (speech.types.StreamingRecognizeRequest(audio_content=x) for x in audio)
+        responses = self.client.streaming_recognize(self.streaming_config, req)
+        for res in responses:
+            LOG.info('WAGNER -=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=')
+            LOG.info(res)
+            LOG.info('WAGNER -=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=')
+            if res.results and res.results[0].is_final:
+                self.text = res.results[0].alternatives[0].transcript
+
+
+class GoogleCloudStreamingSTT(GoogleJsonSTT):
+    def __init__(self):
+        super(GoogleCloudStreamingSTT, self).__init__()
+        # override language with module specific language selection
+        self.lang = self.config.get('lang') or self.lang
+
+        self.stream = None
+        self.can_stream = True
+
+        credentials = service_account.Credentials.from_service_account_info(
+            self.credential.get('json')
+        )
+
+        self.client = speech.SpeechClient(credentials=credentials)
+        recognition_config = speech.types.RecognitionConfig(
+            encoding=speech.enums.RecognitionConfig.AudioEncoding.LINEAR16,
+            sample_rate_hertz=16000,
+            language_code=self.lang,
+            model='command_and_search',
+            max_alternatives=1,
+        )
+        self.streaming_config = speech.types.StreamingRecognitionConfig(
+            config=recognition_config,
+            interim_results=True,
+            single_utterance=True,
+        )
+
+    def execute(self, audio, language=None):
+        #if self.stream is None:
+        #    return super().execute(audio, language)
+        return self.stream_stop()
+
+    def stream_stop(self):
+        LOG.info('WAGNER STOP')
+        if self.stream is not None:
+            self.queue.put(None)
+            self.stream.join()
+
+            text = self.stream.text
+
+            self.stream = None
+            self.queue = None
+            if not text:
+                return None
+            return text
+        return None
+
+    def stream_data(self, data):
+        LOG.info('WAGNER data! {} {}'.format(type(data), len(data)))
+        self.queue.put(data)
+
+    def stream_start(self, language=None):
+        LOG.info('WAGNER START')
+        self.lang = language or self.lang
+        self.stream_stop()
+        self.queue = Queue()
+        self.stream = GoogleStreamThread(self.queue, self.lang, self.client, self.streaming_config)
+        self.stream.start()
+
+
 class KaldiSTT(STT):
    def __init__(self):
        super(KaldiSTT, self).__init__()
@ -301,6 +398,7 @@ class STTFactory:
        "mycroft": MycroftSTT,
        "google": GoogleSTT,
        "google_cloud": GoogleCloudSTT,
+        "google_cloud_streaming": GoogleCloudStreamingSTT,
        "wit": WITSTT,
        "ibm": IBMSTT,
        "kaldi": KaldiSTT,