Fixes issue #539

The utterance is now placed on the bus along with its language code. If not specified, it uses "en-us". Added a new mycroft.util.parse module. It contains the normalize() function. Normalization currently does three things: * Expands contractions ("they're" -> "they are", etc.) * Optionally removes articles ("a", "an", "the"). Removing is the default. * Textual numbers become digits, up to 20. E.g. "What is the weather in four days" becomes "What is weather in 4 days". NOTE: This is potentially a breaking change! Remove "the", "a" and "an" from your .voc files! Skill changes: * I cleaned up the .voc files for the default Skills. * Split the date_time keyword into an extra entity. Now a "QueryKeyword.voc" exists, with "what|tell" instead of combining that into "what is time" in the TimeKeyword.voc. * Volume skill now accepts 1-11, e.g. "turn volume to 11"
2017-02-23 04:40:46 -08:00 · 2017-02-23 04:40:46 -08:00 · cfa79e03a2
parent 621faef118
commit cfa79e03a2
20 changed files with 349 additions and 48 deletions
--- a/mycroft/client/speech/listener.py
+++ b/mycroft/client/speech/listener.py
@ -130,11 +130,13 @@ class AudioConsumer(Thread):
        elif connected():
            self.transcribe(audio)
        else:
+            # TODO: Localization
            self.__speak("Mycroft seems not to be connected to the Internet")

    def transcribe(self, audio):
        text = None
        try:
+            # Invoke the STT engine on the audio clip
            text = self.stt.execute(audio).lower().strip()
            LOG.debug("STT: " + text)
        except sr.RequestError as e:
@ -148,8 +150,10 @@ class AudioConsumer(Thread):
            LOG.error("Speech Recognition could not understand audio")
            self.__speak("Sorry, I didn't catch that")
        if text:
+            # STT succeeded, send the transcribed speech on for processing
            payload = {
                'utterances': [text],
+                'lang': self.stt.lang,
                'session': SessionManager.get().session_id
            }
            self.emitter.emit("recognizer_loop:utterance", payload)
@ -188,6 +192,7 @@ class RecognizerLoop(EventEmitter):
        self.state = RecognizerLoopState()

    def create_mycroft_recognizer(self, rate, lang):
+        # Create a local recognizer to hear the wakeup word, e.g. 'Hey Mycroft'
        wake_word = self.config.get('wake_word')
        phonemes = self.config.get('phonemes')
        threshold = self.config.get('threshold')
@ -195,6 +200,8 @@ class RecognizerLoop(EventEmitter):

    @staticmethod
    def create_wakeup_recognizer(rate, lang):
+        # Create a local recognizer to come out of sleep with 'wake up'
+        # TODO - localization
        return LocalRecognizer("wake up", "W EY K . AH P", 1e-10, rate, lang)

    def start_async(self):
--- a/mycroft/skills/date_time/init.py
+++ b/mycroft/skills/date_time/init.py
@ -43,8 +43,8 @@ class TimeSkill(MycroftSkill):
            self.format = "%I:%M, %p"

    def initialize(self):
-        intent = IntentBuilder("TimeIntent").require("TimeKeyword") \
-            .optionally("Location").build()
+        intent = IntentBuilder("TimeIntent").require("QueryKeyword") \
+            .require("TimeKeyword").optionally("Location").build()
        self.register_intent(intent, self.handle_intent)

    def get_timezone(self, locale):
--- a/mycroft/skills/date_time/vocab/en-us/DateKeyword.voc
+++ b/mycroft/skills/date_time/vocab/en-us/DateKeyword.voc
@ -1,4 +1,2 @@
-what's the date
-whats the date
-what day is it
-what is the date
+date
+day
--- a/mycroft/skills/date_time/vocab/en-us/QueryKeyword.voc
+++ b/mycroft/skills/date_time/vocab/en-us/QueryKeyword.voc
@ -0,0 +1,2 @@
+what
+tell
--- a/mycroft/skills/date_time/vocab/en-us/TimeKeyword.voc
+++ b/mycroft/skills/date_time/vocab/en-us/TimeKeyword.voc
@ -1,6 +1 @@
-what time is it
-what is the time
-what's the time
-whats the time
-what time is
-time is it
+time
--- a/mycroft/skills/intent/init.py
+++ b/mycroft/skills/intent/init.py
@ -21,6 +21,7 @@ from adapt.engine import IntentDeterminationEngine
 from mycroft.messagebus.message import Message
 from mycroft.skills.core import open_intent_envelope, MycroftSkill
 from mycroft.util.log import getLogger
+from mycroft.util.parse import normalize

 __author__ = 'seanfitz'

@ -40,13 +41,20 @@ class IntentSkill(MycroftSkill):
        self.emitter.on('detach_intent', self.handle_detach_intent)

    def handle_utterance(self, message):
+        # Get language of the utterance
+        lang = message.data.get('lang', None)
+        if not lang:
+            lang = "en-us"
+
        utterances = message.data.get('utterances', '')

        best_intent = None
        for utterance in utterances:
            try:
+                # normalize() changes "it's a boy" to "it is boy", etc.
                best_intent = next(self.engine.determine_intent(
-                    utterance, 100))
+                    normalize(utterance, lang), 100))
+
                # TODO - Should Adapt handle this?
                best_intent['utterance'] = utterance
            except StopIteration, e:
--- a/mycroft/skills/joke/vocab/en-us/JokingKeyword.voc
+++ b/mycroft/skills/joke/vocab/en-us/JokingKeyword.voc
@ -1,4 +1,4 @@
 joke
 make me laugh
 brighten my day
-tell me a joke
+tell me joke
--- a/mycroft/skills/media/vocab/en-us/CurrentlyPlayingKeyword.voc
+++ b/mycroft/skills/media/vocab/en-us/CurrentlyPlayingKeyword.voc
@ -1,6 +1,4 @@
-what's currently playing
 what is currently playing
 what are you playing
-What are we listening to
-what's playing
+what are we listening to
 what is playing
--- a/mycroft/skills/npr_news/vocab/en-us/NPRNewsKeyword.voc
+++ b/mycroft/skills/npr_news/vocab/en-us/NPRNewsKeyword.voc
@ -1,2 +1,2 @@
 news
-tell me the news
+tell me news
--- a/mycroft/skills/personal/vocab/en-us/WhoAreYouKeyword.voc
+++ b/mycroft/skills/personal/vocab/en-us/WhoAreYouKeyword.voc
@ -1,2 +1 @@
 who are you
-who're you
--- a/mycroft/skills/personal/vocab/en-us/WhoMadeYouKeyword.voc
+++ b/mycroft/skills/personal/vocab/en-us/WhoMadeYouKeyword.voc
@ -1,2 +1,4 @@
 who made you
 who were you made by
+who created you
+who built you
--- a/mycroft/skills/reminder/vocab/en-us/ReminderSkillAmount.voc
+++ b/mycroft/skills/reminder/vocab/en-us/ReminderSkillAmount.voc
@ -1,5 +1,5 @@
 all|all my
-1|one
-2|two
-the next
-the following
+1
+2
+next
+following
--- a/mycroft/skills/reminder/vocab/en-us/ReminderSkillCreateVerb.voc
+++ b/mycroft/skills/reminder/vocab/en-us/ReminderSkillCreateVerb.voc
@ -3,4 +3,4 @@ notify
 notify me
 remind me
 reminder
-set a reminder
+set reminder
--- a/mycroft/skills/spelling/vocab/en-us/SpellingKeyword.voc
+++ b/mycroft/skills/spelling/vocab/en-us/SpellingKeyword.voc
@ -1,4 +1,4 @@
 spell
-spell the word
+spell word
 spelling of
-spelling of the word
+spelling of word
--- a/mycroft/skills/volume/vocab/en-us/VolumeAmount.voc
+++ b/mycroft/skills/volume/vocab/en-us/VolumeAmount.voc
@ -1,6 +1,15 @@
-0|zero
-1|one
-2|two
+0
+1
+2
+3
+4
+5
+6
+7
+8
+9
+10
+11
 quiet
 normal
 loud
--- a/mycroft/skills/weather/vocab/en-us/NextDay.voc
+++ b/mycroft/skills/weather/vocab/en-us/NextDay.voc
@ -1,9 +1,5 @@
 tomorrow
 1 day
 in 1 day
-one day
-in one day
 next day
-the next day
 following day
-the following day
--- a/mycroft/skills/weather/vocab/en-us/NextHours.voc
+++ b/mycroft/skills/weather/vocab/en-us/NextHours.voc
@ -1,14 +1,10 @@
 next hour
-the next hour
-in the next hour
+in next hour
 next hours
-the next hours
-in the next hours
+in next hours
 few hours
 next few hours
-the next few hours
-in the next few hours
+in next few hours
 couple of hours
 next couple of hours
-the next couple of hours
-in the next couple of hours
+in next couple of hours
--- a/mycroft/stt/init.py
+++ b/mycroft/stt/init.py
@ -70,8 +70,8 @@ class GoogleSTT(TokenSTT):
        super(GoogleSTT, self).__init__()

    def execute(self, audio, language=None):
-        language = language or self.lang
-        return self.recognizer.recognize_google(audio, self.token, language)
+        # Remember the language in use so callers can read it from self.lang
+        self.lang = language or self.lang
+        return self.recognizer.recognize_google(audio, self.token, self.lang)


 class WITSTT(TokenSTT):
@ -88,9 +88,9 @@ class IBMSTT(BasicSTT):
        super(IBMSTT, self).__init__()

    def execute(self, audio, language=None):
-        language = language or self.lang
+        self.lang = language or self.lang
        return self.recognizer.recognize_ibm(audio, self.username,
-                                             self.password, language)
+                                             self.password, self.lang)


 class MycroftSTT(STT):
@ -99,8 +99,8 @@ class MycroftSTT(STT):
        self.api = STTApi()

    def execute(self, audio, language=None):
-        language = language or self.lang
-        return self.api.stt(audio.get_flac_data(), language, 1)[0]
+        self.lang = language or self.lang
+        return self.api.stt(audio.get_flac_data(), self.lang, 1)[0]


 class STTFactory(object):
--- a/mycroft/util/parse.py
+++ b/mycroft/util/parse.py
@ -0,0 +1,96 @@
+# Copyright 2017 Mycroft AI, Inc.
+#
+# This file is part of Mycroft Core.
+#
+# Mycroft Core is free software: you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation, either version 3 of the License, or
+# (at your option) any later version.
+#
+# Mycroft Core is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+# GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with Mycroft Core.  If not, see <http://www.gnu.org/licenses/>.
+
+
+def normalize(text, lang="en-us", remove_articles=True):
+    """Normalize an utterance so it is easier to parse.
+
+    Text whose language code starts with "en" (case-insensitive) is handed
+    to normalize_en(); text in any other language is returned unchanged.
+    """
+    language_code = str(lang).lower()
+    if not language_code.startswith("en"):
+        # TODO: Normalization for other languages
+        return text
+    return normalize_en(text, remove_articles)
+
+
+# English lookup tables: built once at import time, not once per word.
+_EN_ARTICLES = ("the", "a", "an")
+_EN_NUMBERS = dict((word, str(value)) for value, word in enumerate([
+    "zero", "one", "two", "three", "four", "five", "six", "seven", "eight",
+    "nine", "ten", "eleven", "twelve", "thirteen", "fourteen", "fifteen",
+    "sixteen", "seventeen", "eighteen", "nineteen", "twenty"]))
+_EN_CONTRACTIONS = {
+    "ain't": "is not", "aren't": "are not", "can't": "can not",
+    "could've": "could have", "couldn't": "could not",
+    "didn't": "did not", "doesn't": "does not", "don't": "do not",
+    "gonna": "going to", "gotta": "got to", "hadn't": "had not",
+    "hasn't": "has not", "haven't": "have not", "he'd": "he would",
+    "he'll": "he will", "he's": "he is", "how'd": "how did",
+    "how'll": "how will", "how's": "how is", "I'd": "I would",
+    "I'll": "I will", "I'm": "I am", "I've": "I have", "isn't": "is not",
+    "it'd": "it would", "it'll": "it will", "it's": "it is",
+    "mightn't": "might not", "might've": "might have",
+    "mustn't": "must not", "must've": "must have",
+    "needn't": "need not", "oughtn't": "ought not", "shan't": "shall not",
+    "she'd": "she would", "she'll": "she will", "she's": "she is",
+    "shouldn't": "should not", "should've": "should have",
+    "somebody's": "somebody is", "someone'd": "someone would",
+    "someone'll": "someone will", "someone's": "someone is",
+    "that'll": "that will", "that's": "that is", "that'd": "that would",
+    "there'd": "there would", "there're": "there are",
+    "there's": "there is", "they'd": "they would", "they'll": "they will",
+    "they're": "they are", "they've": "they have", "wasn't": "was not",
+    "we'd": "we would", "we'll": "we will", "we're": "we are",
+    "we've": "we have", "weren't": "were not", "what'd": "what did",
+    "what'll": "what will", "what're": "what are", "what's": "what is",
+    "what've": "what have", "when's": "when is", "when'd": "when did",
+    "where'd": "where did", "where's": "where is",
+    "where've": "where have", "who'd": "who would",
+    "who'd've": "who would have", "who'll": "who will",
+    "who're": "who are", "who's": "who is", "who've": "who have",
+    "why'd": "why did", "why're": "why are", "why's": "why is",
+    "won't": "will not", "won't've": "will not have",
+    "would've": "would have", "wouldn't": "would not",
+    "wouldn't've": "would not have", "y'all": "you all",
+    "ya'll": "you all", "you'd": "you would",
+    "you'd've": "you would have", "you'll": "you will",
+    "y'aint": "you are not", "y'ain't": "you are not",
+    "you're": "you are", "you've": "you have",
+}
+
+
+def normalize_en(text, remove_articles):
+    """Normalize an English utterance word by word.
+
+    The text is split on whitespace and each word is handled in turn:
+      * "the", "a" and "an" are dropped when remove_articles is true
+      * known contractions are expanded, e.g. "isn't" -> "is not"
+      * number words "zero" to "twenty" become digits, e.g. "two" -> "2"
+    Matching is exact and case-sensitive.
+
+    Returns the processed words joined by single spaces ("" for blank text).
+    """
+    # NOTE(review): the speech listener lower-cases STT output before it is
+    # emitted, so the capitalised "I'd"/"I'll"/"I'm"/"I've" entries may never
+    # match real utterances - confirm whether lower-case keys are wanted.
+    result = []
+    for word in text.split():  # split() also collapses extra whitespace
+        if remove_articles and word in _EN_ARTICLES:
+            continue
+        word = _EN_CONTRACTIONS.get(word, word)  # "it's" -> "it is"
+        result.append(_EN_NUMBERS.get(word, word))  # "two" -> "2"
+    return " ".join(result)
--- a/test/util/test_parse.py
+++ b/test/util/test_parse.py
@ -0,0 +1,195 @@
+import unittest
+from mycroft.util.parse import normalize
+
+
+class TestNormalize(unittest.TestCase):
+    """Unit tests for mycroft.util.parse.normalize() with English text.
+
+    normalize() removes articles by default, so expected strings only keep
+    "a"/"an"/"the" when remove_articles=False is passed explicitly.
+    """
+
+    def test_articles(self):
+        """Articles are removed only as whole words, and only on request."""
+        self.assertEqual(normalize("this is a test", remove_articles=True),
+                         "this is test")
+        self.assertEqual(normalize("this is the test", remove_articles=True),
+                         "this is test")
+        # "an" embedded in "and"/"another" must be left alone
+        self.assertEqual(normalize("and another test", remove_articles=True),
+                         "and another test")
+        self.assertEqual(normalize("this is an extra test",
+                                   remove_articles=False),
+                         "this is an extra test")
+
+    def test_spaces(self):
+        """Leading, trailing and repeated whitespace is collapsed."""
+        self.assertEqual(normalize("  this   is  a    test"),
+                         "this is test")
+        self.assertEqual(normalize("  this   is  a    test  "),
+                         "this is test")
+        self.assertEqual(normalize("  this   is  one    test"),
+                         "this is 1 test")
+
+    def test_numbers(self):
+        """Number words from one to twenty become digits."""
+        self.assertEqual(normalize("this is a one two three  test"),
+                         "this is 1 2 3 test")
+        self.assertEqual(normalize("  it's  a four five six  test"),
+                         "it is 4 5 6 test")
+        self.assertEqual(normalize("it's  a seven eight nine test"),
+                         "it is 7 8 9 test")
+        self.assertEqual(normalize("it's a seven eight nine  test"),
+                         "it is 7 8 9 test")
+        self.assertEqual(normalize("that's a ten eleven twelve test"),
+                         "that is 10 11 12 test")
+        self.assertEqual(normalize("that's a thirteen fourteen test"),
+                         "that is 13 14 test")
+        self.assertEqual(normalize("that's fifteen sixteen seventeen"),
+                         "that is 15 16 17")
+        self.assertEqual(normalize("that's eighteen nineteen twenty"),
+                         "that is 18 19 20")
+
+    def test_contractions(self):
+        """Each supported contraction expands to its long form."""
+        self.assertEqual(normalize("ain't"), "is not")
+        self.assertEqual(normalize("aren't"), "are not")
+        self.assertEqual(normalize("can't"), "can not")
+        self.assertEqual(normalize("could've"), "could have")
+        self.assertEqual(normalize("couldn't"), "could not")
+        self.assertEqual(normalize("didn't"), "did not")
+        self.assertEqual(normalize("doesn't"), "does not")
+        self.assertEqual(normalize("don't"), "do not")
+        self.assertEqual(normalize("gonna"), "going to")
+        self.assertEqual(normalize("gotta"), "got to")
+        self.assertEqual(normalize("hadn't"), "had not")
+        self.assertEqual(normalize("hadn't have"), "had not have")
+        self.assertEqual(normalize("hasn't"), "has not")
+        self.assertEqual(normalize("haven't"), "have not")
+        # TODO: Ambiguous with "he had"
+        self.assertEqual(normalize("he'd"), "he would")
+        self.assertEqual(normalize("he'll"), "he will")
+        # TODO: Ambiguous with "he has"
+        self.assertEqual(normalize("he's"), "he is")
+        # TODO: Ambiguous with "how would"
+        self.assertEqual(normalize("how'd"), "how did")
+        self.assertEqual(normalize("how'll"), "how will")
+        # TODO: Ambiguous with "how has" and "how does"
+        self.assertEqual(normalize("how's"), "how is")
+        # TODO: Ambiguous with "I had"
+        self.assertEqual(normalize("I'd"), "I would")
+        self.assertEqual(normalize("I'll"), "I will")
+        self.assertEqual(normalize("I'm"), "I am")
+        self.assertEqual(normalize("I've"), "I have")
+        self.assertEqual(normalize("I haven't"), "I have not")
+        self.assertEqual(normalize("isn't"), "is not")
+        self.assertEqual(normalize("it'd"), "it would")
+        self.assertEqual(normalize("it'll"), "it will")
+        # TODO: Ambiguous with "it has"
+        self.assertEqual(normalize("it's"), "it is")
+        self.assertEqual(normalize("it isn't"), "it is not")
+        self.assertEqual(normalize("mightn't"), "might not")
+        self.assertEqual(normalize("might've"), "might have")
+        self.assertEqual(normalize("mustn't"), "must not")
+        self.assertEqual(normalize("mustn't have"), "must not have")
+        self.assertEqual(normalize("must've"), "must have")
+        self.assertEqual(normalize("needn't"), "need not")
+        self.assertEqual(normalize("oughtn't"), "ought not")
+        self.assertEqual(normalize("shan't"), "shall not")
+        # TODO: Ambiguous with "she had"
+        self.assertEqual(normalize("she'd"), "she would")
+        self.assertEqual(normalize("she hadn't"), "she had not")
+        self.assertEqual(normalize("she'll"), "she will")
+        self.assertEqual(normalize("she's"), "she is")
+        self.assertEqual(normalize("she isn't"), "she is not")
+        self.assertEqual(normalize("should've"), "should have")
+        self.assertEqual(normalize("shouldn't"), "should not")
+        self.assertEqual(normalize("shouldn't have"), "should not have")
+        self.assertEqual(normalize("somebody's"), "somebody is")
+        # TODO: Ambiguous with "someone had"
+        self.assertEqual(normalize("someone'd"), "someone would")
+        self.assertEqual(normalize("someone hadn't"), "someone had not")
+        self.assertEqual(normalize("someone'll"), "someone will")
+        # TODO: Ambiguous with "someone has"
+        self.assertEqual(normalize("someone's"), "someone is")
+        self.assertEqual(normalize("that'll"), "that will")
+        # TODO: Ambiguous with "that has"
+        self.assertEqual(normalize("that's"), "that is")
+        # TODO: Ambiguous with "that had"
+        self.assertEqual(normalize("that'd"), "that would")
+        # TODO: Ambiguous with "there had"
+        self.assertEqual(normalize("there'd"), "there would")
+        self.assertEqual(normalize("there're"), "there are")
+        # TODO: Ambiguous with "there has"
+        self.assertEqual(normalize("there's"), "there is")
+        # TODO: Ambiguous with "they had"
+        self.assertEqual(normalize("they'd"), "they would")
+        self.assertEqual(normalize("they'll"), "they will")
+        self.assertEqual(normalize("they won't have"), "they will not have")
+        self.assertEqual(normalize("they're"), "they are")
+        self.assertEqual(normalize("they've"), "they have")
+        self.assertEqual(normalize("they haven't"), "they have not")
+        self.assertEqual(normalize("wasn't"), "was not")
+        # TODO: Ambiguous with "we had"
+        self.assertEqual(normalize("we'd"), "we would")
+        self.assertEqual(normalize("we would've"), "we would have")
+        self.assertEqual(normalize("we wouldn't"), "we would not")
+        self.assertEqual(normalize("we wouldn't have"), "we would not have")
+        self.assertEqual(normalize("we'll"), "we will")
+        self.assertEqual(normalize("we won't have"), "we will not have")
+        self.assertEqual(normalize("we're"), "we are")
+        self.assertEqual(normalize("we've"), "we have")
+        self.assertEqual(normalize("weren't"), "were not")
+        self.assertEqual(normalize("what'd"), "what did")
+        self.assertEqual(normalize("what'll"), "what will")
+        self.assertEqual(normalize("what're"), "what are")
+        # TODO: Ambiguous with "what has" / "what does"
+        self.assertEqual(normalize("what's"), "what is")
+        self.assertEqual(normalize("what've"), "what have")
+        # TODO: Ambiguous with "when has"
+        self.assertEqual(normalize("when's"), "when is")
+        self.assertEqual(normalize("where'd"), "where did")
+        # TODO: Ambiguous with "where has" / "where does"
+        self.assertEqual(normalize("where's"), "where is")
+        self.assertEqual(normalize("where've"), "where have")
+        # TODO: Ambiguous with "who had" / "who did"
+        self.assertEqual(normalize("who'd"), "who would")
+        self.assertEqual(normalize("who'd've"), "who would have")
+        self.assertEqual(normalize("who'll"), "who will")
+        self.assertEqual(normalize("who're"), "who are")
+        # TODO: Ambiguous with "who has" / "who does"
+        self.assertEqual(normalize("who's"), "who is")
+        self.assertEqual(normalize("who've"), "who have")
+        self.assertEqual(normalize("why'd"), "why did")
+        self.assertEqual(normalize("why're"), "why are")
+        # TODO: Ambiguous with "why has" / "why does"
+        self.assertEqual(normalize("why's"), "why is")
+        self.assertEqual(normalize("won't"), "will not")
+        self.assertEqual(normalize("won't've"), "will not have")
+        self.assertEqual(normalize("would've"), "would have")
+        self.assertEqual(normalize("wouldn't"), "would not")
+        self.assertEqual(normalize("wouldn't've"), "would not have")
+        self.assertEqual(normalize("ya'll"), "you all")
+        self.assertEqual(normalize("y'all"), "you all")
+        self.assertEqual(normalize("y'ain't"), "you are not")
+        # TODO: Ambiguous with "you had"
+        self.assertEqual(normalize("you'd"), "you would")
+        self.assertEqual(normalize("you'd've"), "you would have")
+        self.assertEqual(normalize("you'll"), "you will")
+        self.assertEqual(normalize("you're"), "you are")
+        self.assertEqual(normalize("you aren't"), "you are not")
+        self.assertEqual(normalize("you've"), "you have")
+        self.assertEqual(normalize("you haven't"), "you have not")
+
+    def test_combinations(self):
+        """Contractions, numbers and article removal work together."""
+        self.assertEqual(normalize("I couldn't have guessed there'd be two"),
+                         "I could not have guessed there would be 2")
+        self.assertEqual(normalize("I wouldn't have"), "I would not have")
+        self.assertEqual(normalize("I hadn't been there"),
+                         "I had not been there")
+        self.assertEqual(normalize("I would've"), "I would have")
+        self.assertEqual(normalize("it hadn't"), "it had not")
+        self.assertEqual(normalize("it hadn't have"), "it had not have")
+        self.assertEqual(normalize("it would've"), "it would have")
+        self.assertEqual(normalize("she wouldn't have"), "she would not have")
+        self.assertEqual(normalize("she would've"), "she would have")
+        self.assertEqual(normalize("someone wouldn't have"),
+                         "someone would not have")
+        self.assertEqual(normalize("someone would've"), "someone would have")
+        self.assertEqual(normalize("what's the weather like"),
+                         "what is weather like")
+        self.assertEqual(normalize("that's what I told you"),
+                         "that is what I told you")
+
+
+if __name__ == "__main__":
+    unittest.main()