mycroft-core/mycroft/util/parse.py


# -*- coding: iso-8859-15 -*-

# Copyright 2017 Mycroft AI, Inc.
#
# This file is part of Mycroft Core.
#
# Mycroft Core is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# Mycroft Core is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with Mycroft Core.  If not, see <http://www.gnu.org/licenses/>.


def normalize(text, lang="en-us", remove_articles=True):
    """Normalize an utterance using the rules of its language.

    The language code is converted to a lowercase string and matched by
    prefix: codes starting with "en" are handled by normalize_en(), codes
    starting with "es" by normalize_es(). Any other code leaves the text
    untouched.

    Args:
        text (str): the string to normalize
        lang (str): the code for the language text is in
        remove_articles (bool): whether to remove articles (like 'a', or 'the')
    Returns:
        (str): The normalized string, or text unchanged when the language
               has no normalizer.
    """
    lang_code = str(lang).lower()

    if lang_code.startswith("en"):
        return normalize_en(text, remove_articles)

    if lang_code.startswith("es"):
        return normalize_es(text, remove_articles)

    # TODO: Normalization for other languages
    return text


# English articles dropped when remove_articles is set (exact, case-sensitive
# match on whole words).
_EN_ARTICLES = frozenset(["the", "a", "an"])

# Contraction -> expansion. Kept as one mapping so each contraction sits next
# to its expansion instead of relying on two parallel lists staying aligned.
_EN_CONTRACTIONS = {
    "ain't": "is not",
    "aren't": "are not",
    "can't": "can not",
    "could've": "could have",
    "couldn't": "could not",
    "didn't": "did not",
    "doesn't": "does not",
    "don't": "do not",
    "gonna": "going to",
    "gotta": "got to",
    "hadn't": "had not",
    "hasn't": "has not",
    "haven't": "have not",
    "he'd": "he would",
    "he'll": "he will",
    "he's": "he is",
    "how'd": "how did",
    "how'll": "how will",
    "how's": "how is",
    "I'd": "I would",
    "I'll": "I will",
    "I'm": "I am",
    "I've": "I have",
    "isn't": "is not",
    "it'd": "it would",
    "it'll": "it will",
    "it's": "it is",
    "mightn't": "might not",
    "might've": "might have",
    "mustn't": "must not",
    "must've": "must have",
    "needn't": "need not",
    "oughtn't": "ought not",
    "shan't": "shall not",
    "she'd": "she would",
    "she'll": "she will",
    "she's": "she is",
    "shouldn't": "should not",
    "should've": "should have",
    "somebody's": "somebody is",
    "someone'd": "someone would",
    "someone'll": "someone will",
    "someone's": "someone is",
    "that'll": "that will",
    "that's": "that is",
    "that'd": "that would",
    "there'd": "there would",
    "there're": "there are",
    "there's": "there is",
    "they'd": "they would",
    "they'll": "they will",
    "they're": "they are",
    "they've": "they have",
    "wasn't": "was not",
    "we'd": "we would",
    "we'll": "we will",
    "we're": "we are",
    "we've": "we have",
    "weren't": "were not",
    "what'd": "what did",
    "what'll": "what will",
    "what're": "what are",
    "what's": "what is",
    "whats": "what is",  # technically incorrect but some STT does this
    "what've": "what have",
    "when's": "when is",
    "when'd": "when did",
    "where'd": "where did",
    "where's": "where is",
    "where've": "where have",
    "who'd": "who would",
    "who'd've": "who would have",
    "who'll": "who will",
    "who're": "who are",
    "who's": "who is",
    "who've": "who have",
    "why'd": "why did",
    "why're": "why are",
    "why's": "why is",
    "won't": "will not",
    "won't've": "will not have",
    "would've": "would have",
    "wouldn't": "would not",
    "wouldn't've": "would not have",
    "y'all": "you all",
    "ya'll": "you all",
    "you'd": "you would",
    "you'd've": "you would have",
    "you'll": "you will",
    "y'aint": "you are not",
    "y'ain't": "you are not",
    "you're": "you are",
    "you've": "you have",
}

# Number word -> digit string for "zero" through "twenty"; the value is the
# word's position in the list.
_EN_NUMBERS = {
    name: str(value)
    for value, name in enumerate([
        "zero", "one", "two", "three", "four", "five", "six",
        "seven", "eight", "nine", "ten", "eleven", "twelve",
        "thirteen", "fourteen", "fifteen", "sixteen",
        "seventeen", "eighteen", "nineteen", "twenty",
    ])
}


def normalize_en(text, remove_articles):
    """English string normalization.

    Splits text on whitespace and, word by word:
      * drops the articles "the", "a" and "an" when remove_articles is set,
      * expands known contractions, e.g. "isn't" -> "is not",
      * turns the number words "zero".."twenty" into digits, e.g. "two" -> "2".

    All matching is exact and case-sensitive on whole words. Words are
    re-joined with single spaces, so runs of whitespace and leading/trailing
    whitespace are collapsed.

    Args:
        text (str): the string to normalize
        remove_articles (bool): whether to remove articles (like 'a', or 'the')
    Returns:
        (str): The normalized string.
    """
    normalized = []
    for word in text.split():  # split() also discards extra spaces
        if remove_articles and word in _EN_ARTICLES:
            continue

        # Expand common contractions, e.g. "isn't" -> "is not"
        word = _EN_CONTRACTIONS.get(word, word)

        # Convert numbers into digits, e.g. "two" -> "2". Expansions are
        # always multi-word, so this only ever affects unexpanded words.
        word = _EN_NUMBERS.get(word, word)

        normalized.append(word)

    return " ".join(normalized)


# TODO: this should be split into independent per-language files
# TODO: support numbers of 100 and above

# Spanish articles that normalize_es() drops when article removal is enabled.
es_articles = [
    "el", "la", "los", "las",
    "un", "una", "unos", "unas",
]

# Units 0-9: a word's position in the list is its numeric value.
es_numbers_0_9 = [
    "cero", "uno", "dos", "tres", "cuatro",
    "cinco", "seis", "siete", "ocho", "nueve",
]

# Single-word numbers 10-29: numeric value is the list position plus 10.
es_numbers_10_29 = [
    u"diez", u"once", u"doce", u"trece", u"catorce",
    u"quince", u"dieciséis", u"diecisiete", u"dieciocho", u"diecinueve",
    u"veinte", u"veintiuno", u"veintidós", u"veintitrés", u"veinticuatro",
    u"veinticinco", u"veintiséis", u"veintisiete", u"veintiocho",
    u"veintinueve",
]

# Tens 30-90: numeric value is the list position times 10, plus 30.
es_numbers_10n = [
    "treinta", "cuarenta", "cincuenta", "sesenta",
    "setenta", "ochenta", "noventa",
]


def normalize_es(text, remove_articles):
    """Spanish string normalization.

    Splits text on whitespace, optionally drops the articles listed in
    es_articles, and replaces number words for 0 to 99 with digits. A tens
    word followed by "y" and a units word (e.g. "treinta y uno") is folded
    into a single number. Words are re-joined with single spaces.

    Args:
        text (str): the string to normalize
        remove_articles (bool): whether to remove Spanish articles
    Returns:
        (str): The normalized string.
    """
    tokens = text.split()  # split() also discards extra spaces
    total = len(tokens)
    pieces = []
    pos = 0

    while pos < total:
        token = tokens[pos]
        pos += 1  # from here on, pos points at the token after the current one

        if remove_articles and token in es_articles:
            continue

        # Convert numbers into digits: from 0 to 99
        if token in es_numbers_0_9:
            token = str(es_numbers_0_9.index(token))
        elif token in es_numbers_10_29:
            token = str(10 + es_numbers_10_29.index(token))
        elif token in es_numbers_10n:
            value = 30 + 10 * es_numbers_10n.index(token)
            # Look ahead for the "<tens> y <unit>" pattern, e.g. "treinta y uno"
            has_unit = (pos + 1 < total and
                        tokens[pos] == "y" and
                        tokens[pos + 1] in es_numbers_0_9)
            if has_unit:
                value += es_numbers_0_9.index(tokens[pos + 1])
                pos += 2  # consume both "y" and the unit word
            token = str(value)

        pieces.append(token)

    return " ".join(pieces)
normalize_es 2017-05-03 09:37:00 +00:00
			`# -- coding: iso-8859-15 --`

Fixes issue #539 The utterance is now placed on the bus along with its language code. If not specified, it uses "en-us". Added a new mycroft.util.parse module. It contains the normalize() function. Normalization currently does two things: * Expands contractions ("they're" -> "they are", etc) * Optionally removes articles ("a", "an", "the"). Removing is the default. * Textual numbers become digits, up to 20. E.g. "What is the weather in four days" becomes "What is weather in 4 days". NOTE: This is potentially a breaking change! Remove "the", "a" and "an" from your .voc files! Skill changes: * I cleaned up the .voc files for the default Skills. * Split the date_time keyword into an extra entity. Now a "QueryKeyword.voc" exists, with "what\|tell" instead of combing that into "what is time" in the TimeKeyword.voc. * Volume skill now accepts 1-11, e.g. "turn volume to 11" 2017-02-23 12:40:46 +00:00			`# Copyright 2017 Mycroft AI, Inc.`
			`#`
			`# This file is part of Mycroft Core.`
			`#`
			`# Mycroft Core is free software: you can redistribute it and/or modify`
			`# it under the terms of the GNU General Public License as published by`
			`# the Free Software Foundation, either version 3 of the License, or`
			`# (at your option) any later version.`
			`#`
			`# Mycroft Core is distributed in the hope that it will be useful,`
			`# but WITHOUT ANY WARRANTY; without even the implied warranty of`
			`# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the`
			`# GNU General Public License for more details.`
			`#`
			`# You should have received a copy of the GNU General Public License`
			`# along with Mycroft Core. If not, see <http://www.gnu.org/licenses/>.`


			`def normalize(text, lang="en-us", remove_articles=True):`
			`"""Prepare a string for parsing`

			`This function prepares the given text for parsing by making`
			`numbers consistent, getting rid of contractions, etc.`
Several extensions to text normalization: * intent_failure message now carries along the utterance's lang code * normalizing query for Wolfram Alpha * added normalization of "whats" to "what is". This is technically incorrect ("whats" means more than one instance of "what", as in "the whats and whys of open source"), but that is a rare phrase. Unfortunately, several STT engines incorrectly output things like "whats 8 + 4", which is grammatically incorrect. So we'll handle the common and potentially screw up the uncommon. * more parsing test cases, including a few corrections 2017-02-25 05:59:00 +00:00			`Args:`
			`text (str): the string to normalize`
			`lang (str): the code for the language text is in`
			`remove_articles (bool): whether to remove articles (like 'a', or 'the')`
			`Returns:`
			`(str): The normalized string.`
Fixes issue #539 The utterance is now placed on the bus along with its language code. If not specified, it uses "en-us". Added a new mycroft.util.parse module. It contains the normalize() function. Normalization currently does two things: * Expands contractions ("they're" -> "they are", etc) * Optionally removes articles ("a", "an", "the"). Removing is the default. * Textual numbers become digits, up to 20. E.g. "What is the weather in four days" becomes "What is weather in 4 days". NOTE: This is potentially a breaking change! Remove "the", "a" and "an" from your .voc files! Skill changes: * I cleaned up the .voc files for the default Skills. * Split the date_time keyword into an extra entity. Now a "QueryKeyword.voc" exists, with "what\|tell" instead of combing that into "what is time" in the TimeKeyword.voc. * Volume skill now accepts 1-11, e.g. "turn volume to 11" 2017-02-23 12:40:46 +00:00			`"""`
			`if str(lang).lower().startswith("en"):`
			`return normalize_en(text, remove_articles)`
normalize_es 2017-05-03 09:37:00 +00:00			`elif str(lang).lower().startswith("es"):`
			`return normalize_es(text, remove_articles)`
Fixes issue #539 The utterance is now placed on the bus along with its language code. If not specified, it uses "en-us". Added a new mycroft.util.parse module. It contains the normalize() function. Normalization currently does two things: * Expands contractions ("they're" -> "they are", etc) * Optionally removes articles ("a", "an", "the"). Removing is the default. * Textual numbers become digits, up to 20. E.g. "What is the weather in four days" becomes "What is weather in 4 days". NOTE: This is potentially a breaking change! Remove "the", "a" and "an" from your .voc files! Skill changes: * I cleaned up the .voc files for the default Skills. * Split the date_time keyword into an extra entity. Now a "QueryKeyword.voc" exists, with "what\|tell" instead of combing that into "what is time" in the TimeKeyword.voc. * Volume skill now accepts 1-11, e.g. "turn volume to 11" 2017-02-23 12:40:46 +00:00
			`# TODO: Normalization for other languages`
			`return text`


			`def normalize_en(text, remove_articles):`
			`""" English string normalization """`

			`words = text.split() # this also removed extra spaces`
			`normalized = ""`
			`for word in words:`
			`if remove_articles and word in ["the", "a", "an"]:`
			`continue`

			`# Expand common contractions, e.g. "isn't" -> "is not"`
			`contraction = ["ain't", "aren't", "can't", "could've", "couldn't",`
			`"didn't", "doesn't", "don't", "gonna", "gotta",`
			`"hadn't", "hasn't", "haven't", "he'd", "he'll", "he's",`
			`"how'd", "how'll", "how's", "I'd", "I'll", "I'm",`
			`"I've", "isn't", "it'd", "it'll", "it's", "mightn't",`
			`"might've", "mustn't", "must've", "needn't", "oughtn't",`
			`"shan't", "she'd", "she'll", "she's", "shouldn't",`
			`"should've", "somebody's", "someone'd", "someone'll",`
			`"someone's", "that'll", "that's", "that'd", "there'd",`
			`"there're", "there's", "they'd", "they'll", "they're",`
			`"they've", "wasn't", "we'd", "we'll", "we're", "we've",`
			`"weren't", "what'd", "what'll", "what're", "what's",`
Several extensions to text normalization: * intent_failure message now carries along the utterance's lang code * normalizing query for Wolfram Alpha * added normalization of "whats" to "what is". This is technically incorrect ("whats" means more than one instance of "what", as in "the whats and whys of open source"), but that is a rare phrase. Unfortunately, several STT engines incorrectly output things like "whats 8 + 4", which is grammatically incorrect. So we'll handle the common and potentially screw up the uncommon. * more parsing test cases, including a few corrections 2017-02-25 05:59:00 +00:00			`"whats", # technically incorrect but some STT does this`
Fixes issue #539 The utterance is now placed on the bus along with its language code. If not specified, it uses "en-us". Added a new mycroft.util.parse module. It contains the normalize() function. Normalization currently does two things: * Expands contractions ("they're" -> "they are", etc) * Optionally removes articles ("a", "an", "the"). Removing is the default. * Textual numbers become digits, up to 20. E.g. "What is the weather in four days" becomes "What is weather in 4 days". NOTE: This is potentially a breaking change! Remove "the", "a" and "an" from your .voc files! Skill changes: * I cleaned up the .voc files for the default Skills. * Split the date_time keyword into an extra entity. Now a "QueryKeyword.voc" exists, with "what\|tell" instead of combing that into "what is time" in the TimeKeyword.voc. * Volume skill now accepts 1-11, e.g. "turn volume to 11" 2017-02-23 12:40:46 +00:00			`"what've", "when's", "when'd", "where'd", "where's",`
			`"where've", "who'd", "who'd've", "who'll", "who're",`
			`"who's", "who've", "why'd", "why're", "why's", "won't",`
			`"won't've", "would've", "wouldn't", "wouldn't've",`
			`"y'all", "ya'll", "you'd", "you'd've", "you'll",`
			`"y'aint", "y'ain't", "you're", "you've"]`
			`if word in contraction:`
			`expansion = ["is not", "are not", "can not", "could have",`
			`"could not", "did not", "does not", "do not",`
			`"going to", "got to", "had not", "has not",`
			`"have not", "he would", "he will", "he is", "how did",`
			`"how will", "how is", "I would", "I will", "I am",`
			`"I have", "is not", "it would", "it will", "it is",`
			`"might not", "might have", "must not", "must have",`
			`"need not", "ought not", "shall not", "she would",`
			`"she will", "she is", "should not", "should have",`
			`"somebody is", "someone would", "someone will",`
			`"someone is", "that will", "that is", "that would",`
			`"there would", "there are", "there is", "they would",`
			`"they will", "they are", "they have", "was not",`
			`"we would", "we will", "we are", "we have",`
			`"were not", "what did", "what will", "what are",`
#539 - fixing pep8 2017-03-14 18:43:41 +00:00			`"what is",`
Fixes issue #539 The utterance is now placed on the bus along with its language code. If not specified, it uses "en-us". Added a new mycroft.util.parse module. It contains the normalize() function. Normalization currently does two things: * Expands contractions ("they're" -> "they are", etc) * Optionally removes articles ("a", "an", "the"). Removing is the default. * Textual numbers become digits, up to 20. E.g. "What is the weather in four days" becomes "What is weather in 4 days". NOTE: This is potentially a breaking change! Remove "the", "a" and "an" from your .voc files! Skill changes: * I cleaned up the .voc files for the default Skills. * Split the date_time keyword into an extra entity. Now a "QueryKeyword.voc" exists, with "what\|tell" instead of combing that into "what is time" in the TimeKeyword.voc. * Volume skill now accepts 1-11, e.g. "turn volume to 11" 2017-02-23 12:40:46 +00:00			`"what is", "what have", "when is", "when did",`
			`"where did", "where is", "where have", "who would",`
			`"who would have", "who will", "who are", "who is",`
			`"who have", "why did", "why are", "why is",`
			`"will not", "will not have", "would have",`
			`"would not", "would not have", "you all", "you all",`
			`"you would", "you would have", "you will",`
			`"you are not", "you are not", "you are", "you have"]`
			`word = expansion[contraction.index(word)]`

			`# Convert numbers into digits, e.g. "two" -> "2"`
			`textNumbers = ["zero", "one", "two", "three", "four", "five", "six",`
			`"seven", "eight", "nine", "ten", "eleven", "twelve",`
			`"thirteen", "fourteen", "fifteen", "sixteen",`
			`"seventeen", "eighteen", "nineteen", "twenty"]`
			`if word in textNumbers:`
			`word = str(textNumbers.index(word))`

#539 - fixing pep8 2017-03-14 18:43:41 +00:00			`normalized += " " + word`
Fixes issue #539 The utterance is now placed on the bus along with its language code. If not specified, it uses "en-us". Added a new mycroft.util.parse module. It contains the normalize() function. Normalization currently does two things: * Expands contractions ("they're" -> "they are", etc) * Optionally removes articles ("a", "an", "the"). Removing is the default. * Textual numbers become digits, up to 20. E.g. "What is the weather in four days" becomes "What is weather in 4 days". NOTE: This is potentially a breaking change! Remove "the", "a" and "an" from your .voc files! Skill changes: * I cleaned up the .voc files for the default Skills. * Split the date_time keyword into an extra entity. Now a "QueryKeyword.voc" exists, with "what\|tell" instead of combing that into "what is time" in the TimeKeyword.voc. * Volume skill now accepts 1-11, e.g. "turn volume to 11" 2017-02-23 12:40:46 +00:00
			`return normalized[1:] # strip the initial space`
normalize_es 2017-05-03 09:37:00 +00:00

+ 2017-05-03 17:48:08 +00:00			`# TODO: it should be modular in indepent files`
			`# TODO: numbers greaters than 100`

			`es_articles = ["el", "la", "los", "las", "un", "una", "unos", "unas"]`
			`es_numbers_0_9 = [`
			`"cero", "uno", "dos", "tres", "cuatro",`
			`"cinco", "seis", "siete", "ocho", "nueve"]`
			`es_numbers_10_29 = [`
normalize_es 2017-05-03 09:37:00 +00:00			`u"diez", u"once", u"doce", u"trece", u"catorce",`
			`u"quince", u"dieciséis", u"diecisiete",`
			`u"dieciocho", u"diecinueve",`
+ 2017-05-03 17:48:08 +00:00			`u"veinte", u"veintiuno", u"veintidós",`
normalize_es 2017-05-03 09:37:00 +00:00			`u"veintitrés", u"veinticuatro",`
			`u"veinticinco", u"veintiséis", u"veintisiete",`
			`u"veintiocho", u"veintinueve"]`
+ 2017-05-03 17:48:08 +00:00			`es_numbers_10n = ["treinta", "cuarenta", "cincuenta", "sesenta",`
			`"setenta", "ochenta", "noventa"]`
normalize_es 2017-05-03 09:37:00 +00:00

			`def normalize_es(text, remove_articles):`
			`""" Spanish string normalization """`

			`words = text.split() # this also removed extra spaces`
			`normalized = ""`
+ 2017-05-03 17:48:08 +00:00			`i = 0`

			`while i < len(words):`
			`word = words[i]`
			`i += 1`

			`if remove_articles and word in es_articles:`
normalize_es 2017-05-03 09:37:00 +00:00			`continue`

+ 2017-05-03 17:48:08 +00:00			`# Convert numbers into digits: from 0 to 99`
			`elif word in es_numbers_0_9:`
			`word = str(es_numbers_0_9.index(word))`

			`elif word in es_numbers_10_29:`
			`word = str(es_numbers_10_29.index(word)+10)`
normalize_es 2017-05-03 09:37:00 +00:00
+ 2017-05-03 17:48:08 +00:00			`elif word in es_numbers_10n:`
			`n = es_numbers_10n.index(word)*10+30`
			`if i+1 < len(words) and words[i] == "y" and \`
			`words[i+1] in es_numbers_0_9:`
			`n += es_numbers_0_9.index(words[i+1])`
			`i += 2`
			`word = str(n)`
normalize_es 2017-05-03 09:37:00 +00:00
			`normalized += " " + word`

			`return normalized[1:] # strip the initial space`