Split format.py and parse.py per language

Move the language-specific functions and constants into separate files. This will avoid many unnecessary merge conflicts caused by involuntary encoding changes.
2017-12-30 01:14:28 +01:00 · 2017-12-30 01:14:28 +01:00 · 0114ce473e
parent 5d842fd369
commit 0114ce473e
10 changed files with 2448 additions and 2327 deletions
--- a/mycroft/util/format.py
+++ b/mycroft/util/format.py
@ -1,5 +1,3 @@
-# -*- coding: iso-8859-15 -*-
-#
 # Copyright 2017 Mycroft AI Inc.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
@ -14,52 +12,10 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 #
-FRACTION_STRING_EN = {
-    2: 'half',
-    3: 'third',
-    4: 'forth',
-    5: 'fifth',
-    6: 'sixth',
-    7: 'seventh',
-    8: 'eigth',
-    9: 'ninth',
-    10: 'tenth',
-    11: 'eleventh',
-    12: 'twelveth',
-    13: 'thirteenth',
-    14: 'fourteenth',
-    15: 'fifteenth',
-    16: 'sixteenth',
-    17: 'seventeenth',
-    18: 'eighteenth',
-    19: 'nineteenth',
-    20: 'twentyith'
-}

-FRACTION_STRING_PT = {
-    2: 'meio',
-    3: u'terço',
-    4: 'quarto',
-    5: 'quinto',
-    6: 'sexto',
-    7: u'sétimo',
-    8: 'oitavo',
-    9: 'nono',
-    10: u'décimo',
-    11: 'onze avos',
-    12: 'doze avos',
-    13: 'treze avos',
-    14: 'catorze avos',
-    15: 'quinze avos',
-    16: 'dezasseis avos',
-    17: 'dezassete avos',
-    18: 'dezoito avos',
-    19: 'dezanove avos',
-    20: u'vigésimo',
-    30: u'trigésimo',
-    100: u'centésimo',
-    1000: u'milésimo'
-}
+from mycroft.util.lang.format_en import *
+from mycroft.util.lang.format_es import *
+from mycroft.util.lang.format_pt import *


 def nice_number(number, lang="en-us", speech=True, denominators=None):
@ -95,55 +51,6 @@ def nice_number(number, lang="en-us", speech=True, denominators=None):
    return str(number)


-def nice_number_en(result):
-    """ English conversion for nice_number """
-    whole, num, den = result
-    if num == 0:
-        return str(whole)
-    den_str = FRACTION_STRING_EN[den]
-    if whole == 0:
-        if num == 1:
-            return_string = 'a {}'.format(den_str)
-        else:
-            return_string = '{} {}'.format(num, den_str)
-    elif num == 1:
-        return_string = '{} and a {}'.format(whole, den_str)
-    else:
-        return_string = '{} and {} {}'.format(whole, num, den_str)
-    if num > 1:
-        return_string += 's'
-    return return_string
-
-
-def nice_number_pt(result):
-    """ Portuguese conversion for nice_number """
-    whole, num, den = result
-    if num == 0:
-        return str(whole)
-    # denominador
-    den_str = FRACTION_STRING_PT[den]
-    # fracções
-    if whole == 0:
-        if num == 1:
-            # um décimo
-            return_string = 'um {}'.format(den_str)
-        else:
-            # três meio
-            return_string = '{} {}'.format(num, den_str)
-    # inteiros >10
-    elif num == 1:
-        # trinta e um
-        return_string = '{} e {}'.format(whole, den_str)
-    # inteiros >10 com fracções
-    else:
-        # vinte e 3 décimo
-        return_string = '{} e {} {}'.format(whole, num, den_str)
-    # plural
-    if num > 1:
-        return_string += 's'
-    return return_string
-
-
 def convert_number(number, denominators):
    """ Convert floats to mixed fractions """
    int_number = int(number)
--- a/mycroft/util/lang/__init__.py
+++ b/mycroft/util/lang/__init__.py
@ -0,0 +1,14 @@
+# Copyright 2017 Mycroft AI Inc.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
--- a/mycroft/util/lang/format_en.py
+++ b/mycroft/util/lang/format_en.py
@ -0,0 +1,59 @@
+# -*- coding: utf-8 -*-
+#
+# Copyright 2017 Mycroft AI Inc.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+
+
+FRACTION_STRING_EN = {
+    2: 'half',
+    3: 'third',
+    4: 'forth',
+    5: 'fifth',
+    6: 'sixth',
+    7: 'seventh',
+    8: 'eigth',
+    9: 'ninth',
+    10: 'tenth',
+    11: 'eleventh',
+    12: 'twelveth',
+    13: 'thirteenth',
+    14: 'fourteenth',
+    15: 'fifteenth',
+    16: 'sixteenth',
+    17: 'seventeenth',
+    18: 'eighteenth',
+    19: 'nineteenth',
+    20: 'twentyith'
+}
+
+
+def nice_number_en(result):
+    """ English conversion for nice_number """
+    whole, num, den = result
+    if num == 0:
+        return str(whole)
+    den_str = FRACTION_STRING_EN[den]
+    if whole == 0:
+        if num == 1:
+            return_string = 'a {}'.format(den_str)
+        else:
+            return_string = '{} {}'.format(num, den_str)
+    elif num == 1:
+        return_string = '{} and a {}'.format(whole, den_str)
+    else:
+        return_string = '{} and {} {}'.format(whole, num, den_str)
+    if num > 1:
+        return_string += 's'
+    return return_string
--- a/mycroft/util/lang/format_pt.py
+++ b/mycroft/util/lang/format_pt.py
@ -0,0 +1,71 @@
+# -*- coding: utf-8 -*-
+#
+# Copyright 2017 Mycroft AI Inc.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+
+
+FRACTION_STRING_PT = {
+    2: 'meio',
+    3: u'terço',
+    4: 'quarto',
+    5: 'quinto',
+    6: 'sexto',
+    7: u'sétimo',
+    8: 'oitavo',
+    9: 'nono',
+    10: u'décimo',
+    11: 'onze avos',
+    12: 'doze avos',
+    13: 'treze avos',
+    14: 'catorze avos',
+    15: 'quinze avos',
+    16: 'dezasseis avos',
+    17: 'dezassete avos',
+    18: 'dezoito avos',
+    19: 'dezanove avos',
+    20: u'vigésimo',
+    30: u'trigésimo',
+    100: u'centésimo',
+    1000: u'milésimo'
+}
+
+
+def nice_number_pt(result):
+    """ Portuguese conversion for nice_number """
+    whole, num, den = result
+    if num == 0:
+        return str(whole)
+    # denominador
+    den_str = FRACTION_STRING_PT[den]
+    # fracções
+    if whole == 0:
+        if num == 1:
+            # um décimo
+            return_string = 'um {}'.format(den_str)
+        else:
+            # três meio
+            return_string = '{} {}'.format(num, den_str)
+    # inteiros >10
+    elif num == 1:
+        # trinta e um
+        return_string = '{} e {}'.format(whole, den_str)
+    # inteiros >10 com fracções
+    else:
+        # vinte e 3 décimo
+        return_string = '{} e {} {}'.format(whole, num, den_str)
+    # plural
+    if num > 1:
+        return_string += 's'
+    return return_string
--- a/mycroft/util/lang/parse_common.py
+++ b/mycroft/util/lang/parse_common.py
@ -0,0 +1,51 @@
+# -*- coding: utf-8 -*-
+#
+# Copyright 2017 Mycroft AI Inc.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+
+
+def is_numeric(input_str):
+    """
+    Takes in a string and tests to see if it is a number.
+    Args:
+        input_str (str): string to test if a number
+    Returns:
+        (bool): True if a number, else False
+
+    """
+
+    try:
+        float(input_str)
+        return True
+    except ValueError:
+        return False
+
+
+def look_for_fractions(split_list):
+    """
+    Determines whether a list made by splitting on '/' is a fraction.
+
+    Args:
+        split_list (list): list created by splitting on '/'
+    Returns:
+        (bool): False if not a fraction, otherwise True
+
+    """
+
+    if len(split_list) == 2:
+        if is_numeric(split_list[0]) and is_numeric(split_list[1]):
+            return True
+
+    return False
--- a/mycroft/util/lang/parse_en.py
+++ b/mycroft/util/lang/parse_en.py
@ -0,0 +1,835 @@
+# -*- coding: utf-8 -*-
+#
+# Copyright 2017 Mycroft AI Inc.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+from datetime import datetime, timedelta
+from dateutil.relativedelta import relativedelta
+from mycroft.util.lang.parse_common import *
+
+
+def extractnumber_en(text):
+    """
+    This function extracts a number from the given text, handling
+    digits, the words "one" to "ten", and fractions such as "2/3".
+    Args:
+        text (str): the string to extract a number from
+    Returns:
+        (int) or (float): The value of extracted number, False if none
+
+    """
+    aWords = text.split()
+    aWords = [word for word in aWords if word not in ["the", "a", "an"]]
+    andPass = False
+    valPreAnd = False
+    val = False
+    count = 0
+    while count < len(aWords):
+        word = aWords[count]
+        if is_numeric(word):
+            # if word.isdigit():            # doesn't work with decimals
+            val = float(word)
+        elif word == "first":
+            val = 1
+        elif word == "second":
+            val = 2
+        elif isFractional_en(word):
+            val = isFractional_en(word)
+        else:
+            if word == "one":
+                val = 1
+            elif word == "two":
+                val = 2
+            elif word == "three":
+                val = 3
+            elif word == "four":
+                val = 4
+            elif word == "five":
+                val = 5
+            elif word == "six":
+                val = 6
+            elif word == "seven":
+                val = 7
+            elif word == "eight":
+                val = 8
+            elif word == "nine":
+                val = 9
+            elif word == "ten":
+                val = 10
+            if val:
+                if count < (len(aWords) - 1):
+                    wordNext = aWords[count + 1]
+                else:
+                    wordNext = ""
+                valNext = isFractional_en(wordNext)
+
+                if valNext:
+                    val = val * valNext
+                    aWords[count + 1] = ""
+
+        # if val == False:
+        if not val:
+            # look for fractions like "2/3"
+            aPieces = word.split('/')
+            # if (len(aPieces) == 2 and is_numeric(aPieces[0])
+            #   and is_numeric(aPieces[1])):
+            if look_for_fractions(aPieces):
+                val = float(aPieces[0]) / float(aPieces[1])
+            elif andPass:
+                # added to value, quit here
+                val = valPreAnd
+                break
+            else:
+                count += 1
+                continue
+
+        aWords[count] = ""
+
+        if (andPass):
+            aWords[count - 1] = ''  # remove "and"
+            val += valPreAnd
+        elif count + 1 < len(aWords) and aWords[count + 1] == 'and':
+            andPass = True
+            valPreAnd = val
+            val = False
+            count += 2
+            continue
+        elif count + 2 < len(aWords) and aWords[count + 2] == 'and':
+            andPass = True
+            valPreAnd = val
+            val = False
+            count += 3
+            continue
+
+        break
+
+    # if val == False:
+    if not val:
+        return False
+
+    # Rebuild the text with the number related words removed
+    # (now empty strings, so strlen == 0); only val is returned
+    aWords = [word for word in aWords if len(word) > 0]
+    text = ' '.join(aWords)
+
+    return val
+
+
+def extract_datetime_en(str, currentDate=None):
+    def clean_string(str):
+        # cleans the input string of unneeded punctuation and capitalization
+        # among other things
+        str = str.lower().replace('?', '').replace('.', '').replace(',', '') \
+            .replace(' the ', ' ').replace(' a ', ' ').replace(' an ', ' ')
+        wordList = str.split()
+        for idx, word in enumerate(wordList):
+            word = word.replace("'s", "")
+
+            ordinals = ["rd", "st", "nd", "th"]
+            if word[0].isdigit():
+                for ord in ordinals:
+                    if ord in word:
+                        word = word.replace(ord, "")
+            wordList[idx] = word
+
+        return wordList
+
+    def date_found():
+        return found or \
+            (
+                datestr != "" or timeStr != "" or
+                yearOffset != 0 or monthOffset != 0 or
+                dayOffset is True or hrOffset != 0 or
+                hrAbs != 0 or minOffset != 0 or
+                minAbs != 0 or secOffset != 0
+            )
+
+    if str == "":
+        return None
+    if currentDate is None:
+        currentDate = datetime.now()
+
+    found = False
+    daySpecified = False
+    dayOffset = False
+    monthOffset = 0
+    yearOffset = 0
+    dateNow = currentDate
+    today = dateNow.strftime("%w")
+    currentYear = dateNow.strftime("%Y")
+    fromFlag = False
+    datestr = ""
+    hasYear = False
+    timeQualifier = ""
+
+    timeQualifiersList = ['morning', 'afternoon', 'evening']
+    markers = ['at', 'in', 'on', 'by', 'this', 'around', 'for', 'of']
+    days = ['monday', 'tuesday', 'wednesday',
+            'thursday', 'friday', 'saturday', 'sunday']
+    months = ['january', 'february', 'march', 'april', 'may', 'june',
+              'july', 'august', 'september', 'october', 'november',
+              'december']
+    monthsShort = ['jan', 'feb', 'mar', 'apr', 'may', 'june', 'july', 'aug',
+                   'sept', 'oct', 'nov', 'dec']
+
+    words = clean_string(str)
+
+    for idx, word in enumerate(words):
+        if word == "":
+            continue
+        wordPrevPrev = words[idx - 2] if idx > 1 else ""
+        wordPrev = words[idx - 1] if idx > 0 else ""
+        wordNext = words[idx + 1] if idx + 1 < len(words) else ""
+        wordNextNext = words[idx + 2] if idx + 2 < len(words) else ""
+
+        # this isn't in clean string because I don't want to save back to words
+        word = word.rstrip('s')
+        start = idx
+        used = 0
+        # save timequalifier for later
+        if word in timeQualifiersList:
+            timeQualifier = word
+            # parse today, tomorrow, day after tomorrow
+        elif word == "today" and not fromFlag:
+            dayOffset = 0
+            used += 1
+        elif word == "tomorrow" and not fromFlag:
+            dayOffset = 1
+            used += 1
+        elif (word == "day" and
+                wordNext == "after" and
+                wordNextNext == "tomorrow" and
+                not fromFlag and
+                not wordPrev[0].isdigit()):
+            dayOffset = 2
+            used = 3
+            if wordPrev == "the":
+                start -= 1
+                used += 1
+                # parse 5 days, 10 weeks, last week, next week
+        elif word == "day":
+            if wordPrev[0].isdigit():
+                dayOffset += int(wordPrev)
+                start -= 1
+                used = 2
+        elif word == "week" and not fromFlag:
+            if wordPrev[0].isdigit():
+                dayOffset += int(wordPrev) * 7
+                start -= 1
+                used = 2
+            elif wordPrev == "next":
+                dayOffset = 7
+                start -= 1
+                used = 2
+            elif wordPrev == "last":
+                dayOffset = -7
+                start -= 1
+                used = 2
+                # parse 10 months, next month, last month
+        elif word == "month" and not fromFlag:
+            if wordPrev[0].isdigit():
+                monthOffset = int(wordPrev)
+                start -= 1
+                used = 2
+            elif wordPrev == "next":
+                monthOffset = 1
+                start -= 1
+                used = 2
+            elif wordPrev == "last":
+                monthOffset = -1
+                start -= 1
+                used = 2
+                # parse 5 years, next year, last year
+        elif word == "year" and not fromFlag:
+            if wordPrev[0].isdigit():
+                yearOffset = int(wordPrev)
+                start -= 1
+                used = 2
+            elif wordPrev == "next":
+                yearOffset = 1
+                start -= 1
+                used = 2
+            elif wordPrev == "last":
+                yearOffset = -1
+                start -= 1
+                used = 2
+                # parse Monday, Tuesday, etc., and next Monday,
+                # last Tuesday, etc.
+        elif word in days and not fromFlag:
+            d = days.index(word)
+            dayOffset = (d + 1) - int(today)
+            used = 1
+            if dayOffset < 0:
+                dayOffset += 7
+            if wordPrev == "next":
+                dayOffset += 7
+                used += 1
+                start -= 1
+            elif wordPrev == "last":
+                dayOffset -= 7
+                used += 1
+                start -= 1
+                # parse 15 of July, June 20th, Feb 18, 19 of February
+        elif word in months or word in monthsShort and not fromFlag:
+            try:
+                m = months.index(word)
+            except ValueError:
+                m = monthsShort.index(word)
+            used += 1
+            datestr = months[m]
+            if wordPrev and (wordPrev[0].isdigit() or
+                             (wordPrev == "of" and wordPrevPrev[0].isdigit())):
+                if wordPrev == "of" and wordPrevPrev[0].isdigit():
+                    datestr += " " + words[idx - 2]
+                    used += 1
+                    start -= 1
+                else:
+                    datestr += " " + wordPrev
+                start -= 1
+                used += 1
+                if wordNext and wordNext[0].isdigit():
+                    datestr += " " + wordNext
+                    used += 1
+                    hasYear = True
+                else:
+                    hasYear = False
+
+            elif wordNext and wordNext[0].isdigit():
+                datestr += " " + wordNext
+                used += 1
+                if wordNextNext and wordNextNext[0].isdigit():
+                    datestr += " " + wordNextNext
+                    used += 1
+                    hasYear = True
+                else:
+                    hasYear = False
+        # parse 5 days from tomorrow, 10 weeks from next thursday,
+        # 2 months from July
+        validFollowups = days + months + monthsShort
+        validFollowups.append("today")
+        validFollowups.append("tomorrow")
+        validFollowups.append("next")
+        validFollowups.append("last")
+        validFollowups.append("now")
+        if (word == "from" or word == "after") and wordNext in validFollowups:
+            used = 2
+            fromFlag = True
+            if wordNext == "tomorrow":
+                dayOffset += 1
+            elif wordNext in days:
+                d = days.index(wordNext)
+                tmpOffset = (d + 1) - int(today)
+                used = 2
+                if tmpOffset < 0:
+                    tmpOffset += 7
+                dayOffset += tmpOffset
+            elif wordNextNext and wordNextNext in days:
+                d = days.index(wordNextNext)
+                tmpOffset = (d + 1) - int(today)
+                used = 3
+                if wordNext == "next":
+                    tmpOffset += 7
+                    used += 1
+                    start -= 1
+                elif wordNext == "last":
+                    tmpOffset -= 7
+                    used += 1
+                    start -= 1
+                dayOffset += tmpOffset
+        if used > 0:
+            if start - 1 > 0 and words[start - 1] == "this":
+                start -= 1
+                used += 1
+
+            for i in range(0, used):
+                words[i + start] = ""
+
+            if (start - 1 >= 0 and words[start - 1] in markers):
+                words[start - 1] = ""
+            found = True
+            daySpecified = True
+
+    # parse time
+    timeStr = ""
+    hrOffset = 0
+    minOffset = 0
+    secOffset = 0
+    hrAbs = 0
+    minAbs = 0
+    military = False
+
+    for idx, word in enumerate(words):
+        if word == "":
+            continue
+
+        wordPrevPrev = words[idx - 2] if idx > 1 else ""
+        wordPrev = words[idx - 1] if idx > 0 else ""
+        wordNext = words[idx + 1] if idx + 1 < len(words) else ""
+        wordNextNext = words[idx + 2] if idx + 2 < len(words) else ""
+        # parse noon, midnight, morning, afternoon, evening
+        used = 0
+        if word == "noon":
+            hrAbs = 12
+            used += 1
+        elif word == "midnight":
+            hrAbs = 0
+            used += 1
+        elif word == "morning":
+            if hrAbs == 0:
+                hrAbs = 8
+            used += 1
+        elif word == "afternoon":
+            if hrAbs == 0:
+                hrAbs = 15
+            used += 1
+        elif word == "evening":
+            if hrAbs == 0:
+                hrAbs = 19
+            used += 1
+            # parse half an hour, quarter hour
+        elif word == "hour" and \
+                (wordPrev in markers or wordPrevPrev in markers):
+            if wordPrev == "half":
+                minOffset = 30
+            elif wordPrev == "quarter":
+                minOffset = 15
+            elif wordPrevPrev == "quarter":
+                minOffset = 15
+                if idx > 2 and words[idx - 3] in markers:
+                    words[idx - 3] = ""
+                words[idx - 2] = ""
+            else:
+                hrOffset = 1
+            if wordPrevPrev in markers:
+                words[idx - 2] = ""
+            words[idx - 1] = ""
+            used += 1
+            hrAbs = -1
+            minAbs = -1
+            # parse 5:00 am, 12:00 p.m., etc
+        elif word[0].isdigit():
+            isTime = True
+            strHH = ""
+            strMM = ""
+            remainder = ""
+            if ':' in word:
+                # parse colons
+                # "3:00 in the morning"
+                stage = 0
+                length = len(word)
+                for i in range(length):
+                    if stage == 0:
+                        if word[i].isdigit():
+                            strHH += word[i]
+                        elif word[i] == ":":
+                            stage = 1
+                        else:
+                            stage = 2
+                            i -= 1
+                    elif stage == 1:
+                        if word[i].isdigit():
+                            strMM += word[i]
+                        else:
+                            stage = 2
+                            i -= 1
+                    elif stage == 2:
+                        remainder = word[i:].replace(".", "")
+                        break
+                if remainder == "":
+                    nextWord = wordNext.replace(".", "")
+                    if nextWord == "am" or nextWord == "pm":
+                        remainder = nextWord
+                        used += 1
+                    elif nextWord == "tonight":
+                        remainder = "pm"
+                        used += 1
+                    elif wordNext == "in" and wordNextNext == "the" and \
+                            words[idx + 3] == "morning":
+                        reaminder = "am"
+                        used += 3
+                    elif wordNext == "in" and wordNextNext == "the" and \
+                            words[idx + 3] == "afternoon":
+                        remainder = "pm"
+                        used += 3
+                    elif wordNext == "in" and wordNextNext == "the" and \
+                            words[idx + 3] == "evening":
+                        remainder = "pm"
+                        used += 3
+                    elif wordNext == "in" and wordNextNext == "morning":
+                        remainder = "am"
+                        used += 2
+                    elif wordNext == "in" and wordNextNext == "afternoon":
+                        remainder = "pm"
+                        used += 2
+                    elif wordNext == "in" and wordNextNext == "evening":
+                        remainder = "pm"
+                        used += 2
+                    elif wordNext == "this" and wordNextNext == "morning":
+                        remainder = "am"
+                        used = 2
+                    elif wordNext == "this" and wordNextNext == "afternoon":
+                        remainder = "pm"
+                        used = 2
+                    elif wordNext == "this" and wordNextNext == "evening":
+                        remainder = "pm"
+                        used = 2
+                    elif wordNext == "at" and wordNextNext == "night":
+                        if strHH > 5:
+                            remainder = "pm"
+                        else:
+                            remainder = "am"
+                        used += 2
+                    else:
+                        if timeQualifier != "":
+                            military = True
+                            if strHH <= 12 and \
+                                    (timeQualifier == "evening" or
+                                     timeQualifier == "afternoon"):
+                                strHH += 12
+            else:
+                # try to parse # s without colons
+                # 5 hours, 10 minutes etc.
+                length = len(word)
+                strNum = ""
+                remainder = ""
+                for i in range(length):
+                    if word[i].isdigit():
+                        strNum += word[i]
+                    else:
+                        remainder += word[i]
+
+                if remainder == "":
+                    remainder = wordNext.replace(".", "").lstrip().rstrip()
+
+                if (
+                        remainder == "pm" or
+                        wordNext == "pm" or
+                        remainder == "p.m." or
+                        wordNext == "p.m."):
+                    strHH = strNum
+                    remainder = "pm"
+                    used = 1
+                elif (
+                        remainder == "am" or
+                        wordNext == "am" or
+                        remainder == "a.m." or
+                        wordNext == "a.m."):
+                    strHH = strNum
+                    remainder = "am"
+                    used = 1
+                else:
+                    if wordNext == "pm" or wordNext == "p.m.":
+                        strHH = strNum
+                        reaminder = "pm"
+                        used = 1
+                    elif wordNext == "am" or wordNext == "a.m.":
+                        strHH = strNum
+                        remainder = "am"
+                        used = 1
+                    elif (
+                            int(word) > 100 and
+                            (
+                                wordPrev == "o" or
+                                wordPrev == "oh"
+                            )):
+                        # 0800 hours (pronounced oh-eight-hundred)
+                        strHH = int(word) / 100
+                        strMM = int(word) - strHH * 100
+                        military = True
+                        if wordNext == "hours":
+                            used += 1
+                    elif (
+                            wordNext == "hours" and
+                            word[0] != '0' and
+                            (
+                                int(word) < 100 and
+                                int(word) > 2400
+                            )):
+                        # ignores military time
+                        # "in 3 hours"
+                        hrOffset = int(word)
+                        used = 2
+                        isTime = False
+                        hrAbs = -1
+                        minAbs = -1
+
+                    elif wordNext == "minutes":
+                        # "in 10 minutes"
+                        minOffset = int(word)
+                        used = 2
+                        isTime = False
+                        hrAbs = -1
+                        minAbs = -1
+                    elif wordNext == "seconds":
+                        # in 5 seconds
+                        secOffset = int(word)
+                        used = 2
+                        isTime = False
+                        hrAbs = -1
+                        minAbs = -1
+                    elif int(word) > 100:
+                        strHH = int(word) / 100
+                        strMM = int(word) - strHH * 100
+                        military = True
+                        if wordNext == "hours":
+                            used += 1
+                    elif wordNext[0].isdigit():
+                        strHH = word
+                        strMM = wordNext
+                        military = True
+                        used += 1
+                        if wordNextNext == "hours":
+                            used += 1
+                    elif (
+                            wordNext == "" or wordNext == "o'clock" or
+                            (
+                                        wordNext == "in" and
+                                        (
+                                            wordNextNext == "the" or
+                                            wordNextNext == timeQualifier
+                                        )
+                            )):
+                        strHH = word
+                        strMM = 00
+                        if wordNext == "o'clock":
+                            used += 1
+                        if wordNext == "in" or wordNextNext == "in":
+                            used += (1 if wordNext == "in" else 2)
+                            if (wordNextNext and
+                                wordNextNext in timeQualifier or
+                                (words[words.index(wordNextNext) + 1] and
+                                 words[words.index(wordNextNext) + 1] in
+                                 timeQualifier)):
+                                if (wordNextNext == "afternoon" or
+                                    (len(words) >
+                                     words.index(wordNextNext) + 1 and
+                                     words[words.index(
+                                         wordNextNext) + 1] == "afternoon")):
+                                    remainder = "pm"
+                                if (wordNextNext == "evening" or
+                                    (len(words) >
+                                     (words.index(wordNextNext) + 1) and
+                                     words[words.index(
+                                         wordNextNext) + 1] == "evening")):
+                                    remainder = "pm"
+                                if (wordNextNext == "morning" or
+                                    (len(words) >
+                                     words.index(wordNextNext) + 1 and
+                                     words[words.index(
+                                         wordNextNext) + 1] == "morning")):
+                                    remainder = "am"
+                        if timeQualifier != "":
+                            military = True
+                    else:
+                        isTime = False
+
+            strHH = int(strHH) if strHH else 0
+            strMM = int(strMM) if strMM else 0
+            strHH = strHH + 12 if remainder == "pm" and strHH < 12 else strHH
+            strHH = strHH - 12 if remainder == "am" and strHH >= 12 else strHH
+            if strHH > 24 or strMM > 59:
+                isTime = False
+                used = 0
+            if isTime:
+                hrAbs = strHH * 1
+                minAbs = strMM * 1
+                used += 1
+        if used > 0:
+            # removed parsed words from the sentence
+            for i in range(used):
+                words[idx + i] = ""
+
+            if wordPrev == "o" or wordPrev == "oh":
+                words[words.index(wordPrev)] = ""
+
+            if wordPrev == "early":
+                hrOffset = -1
+                words[idx - 1] = ""
+                idx -= 1
+            elif wordPrev == "late":
+                hrOffset = 1
+                words[idx - 1] = ""
+                idx -= 1
+            if idx > 0 and wordPrev in markers:
+                words[idx - 1] = ""
+            if idx > 1 and wordPrevPrev in markers:
+                words[idx - 2] = ""
+
+            idx += used - 1
+            found = True
+
+    # check that we found a date
+    if not date_found:
+        return None
+
+    if dayOffset is False:
+        dayOffset = 0
+
+    # perform date manipulation
+
+    extractedDate = dateNow
+    extractedDate = extractedDate.replace(microsecond=0,
+                                          second=0,
+                                          minute=0,
+                                          hour=0)
+    if datestr != "":
+        temp = datetime.strptime(datestr, "%B %d")
+        if not hasYear:
+            temp = temp.replace(year=extractedDate.year)
+            if extractedDate < temp:
+                extractedDate = extractedDate.replace(year=int(currentYear),
+                                                      month=int(
+                                                          temp.strftime(
+                                                              "%m")),
+                                                      day=int(temp.strftime(
+                                                          "%d")))
+            else:
+                extractedDate = extractedDate.replace(
+                    year=int(currentYear) + 1,
+                    month=int(temp.strftime("%m")),
+                    day=int(temp.strftime("%d")))
+        else:
+            extractedDate = extractedDate.replace(
+                year=int(temp.strftime("%Y")),
+                month=int(temp.strftime("%m")),
+                day=int(temp.strftime("%d")))
+
+    if timeStr != "":
+        temp = datetime(timeStr)
+        extractedDate = extractedDate.replace(hour=temp.strftime("%H"),
+                                              minute=temp.strftime("%M"),
+                                              second=temp.strftime("%S"))
+
+    if yearOffset != 0:
+        extractedDate = extractedDate + relativedelta(years=yearOffset)
+    if monthOffset != 0:
+        extractedDate = extractedDate + relativedelta(months=monthOffset)
+    if dayOffset != 0:
+        extractedDate = extractedDate + relativedelta(days=dayOffset)
+    if hrAbs != -1 and minAbs != -1:
+
+        extractedDate = extractedDate + relativedelta(hours=hrAbs,
+                                                      minutes=minAbs)
+        if (hrAbs != 0 or minAbs != 0) and datestr == "":
+            if not daySpecified and dateNow > extractedDate:
+                extractedDate = extractedDate + relativedelta(days=1)
+    if hrOffset != 0:
+        extractedDate = extractedDate + relativedelta(hours=hrOffset)
+    if minOffset != 0:
+        extractedDate = extractedDate + relativedelta(minutes=minOffset)
+    if secOffset != 0:
+        extractedDate = extractedDate + relativedelta(seconds=secOffset)
+    for idx, word in enumerate(words):
+        if words[idx] == "and" and words[idx - 1] == "" and words[
+                idx + 1] == "":
+            words[idx] = ""
+
+    resultStr = " ".join(words)
+    resultStr = ' '.join(resultStr.split())
+    return [extractedDate, resultStr]
+
+
+def isFractional_en(input_str):
+    """
+    Check whether the given word names a fraction and return its value.
+
+    The comparison is case-insensitive and a single trailing "s" is
+    dropped first, so plurals such as "fifths" are recognised as well.
+
+    Args:
+        input_str (str): the word to check, e.g. "third" or "Quarters"
+    Returns:
+        (bool) or (float): False if not a fraction, otherwise the fraction
+
+    """
+    # Lower-case once, up front, so that the membership test and the
+    # index lookup below always operate on the same string.
+    input_str = input_str.lower()
+    if input_str.endswith('s'):
+        input_str = input_str[:-1]  # e.g. "fifths"
+
+    aFrac = ["whole", "half", "third", "fourth", "fifth", "sixth",
+             "seventh", "eighth", "ninth", "tenth", "eleventh", "twelfth"]
+
+    if input_str in aFrac:
+        # the word at (0-based) position N names the fraction 1 / (N + 1)
+        return 1.0 / (aFrac.index(input_str) + 1)
+    if input_str == "quarter":
+        return 1.0 / 4
+
+    return False
+
+
+def normalize_en(text, remove_articles):
+    """
+    English string normalization.
+
+    Collapses runs of whitespace, optionally drops the articles "the",
+    "a" and "an", expands common contractions and replaces the number
+    words "zero" through "twenty" with digits.
+
+    Args:
+        text (str): the sentence to normalize
+        remove_articles (bool): whether "the", "a" and "an" are dropped
+    Returns:
+        (str): the normalized sentence, tokens separated by single spaces
+
+    """
+    articles = ("the", "a", "an")
+
+    # Common contractions and their expansions, e.g. "isn't" -> "is not"
+    expansions = {
+        "ain't": "is not",
+        "aren't": "are not",
+        "can't": "can not",
+        "could've": "could have",
+        "couldn't": "could not",
+        "didn't": "did not",
+        "doesn't": "does not",
+        "don't": "do not",
+        "gonna": "going to",
+        "gotta": "got to",
+        "hadn't": "had not",
+        "hasn't": "has not",
+        "haven't": "have not",
+        "he'd": "he would",
+        "he'll": "he will",
+        "he's": "he is",
+        "how'd": "how did",
+        "how'll": "how will",
+        "how's": "how is",
+        "I'd": "I would",
+        "I'll": "I will",
+        "I'm": "I am",
+        "I've": "I have",
+        "isn't": "is not",
+        "it'd": "it would",
+        "it'll": "it will",
+        "it's": "it is",
+        "mightn't": "might not",
+        "might've": "might have",
+        "mustn't": "must not",
+        "must've": "must have",
+        "needn't": "need not",
+        "oughtn't": "ought not",
+        "shan't": "shall not",
+        "she'd": "she would",
+        "she'll": "she will",
+        "she's": "she is",
+        "shouldn't": "should not",
+        "should've": "should have",
+        "somebody's": "somebody is",
+        "someone'd": "someone would",
+        "someone'll": "someone will",
+        "someone's": "someone is",
+        "that'll": "that will",
+        "that's": "that is",
+        "that'd": "that would",
+        "there'd": "there would",
+        "there're": "there are",
+        "there's": "there is",
+        "they'd": "they would",
+        "they'll": "they will",
+        "they're": "they are",
+        "they've": "they have",
+        "wasn't": "was not",
+        "we'd": "we would",
+        "we'll": "we will",
+        "we're": "we are",
+        "we've": "we have",
+        "weren't": "were not",
+        "what'd": "what did",
+        "what'll": "what will",
+        "what're": "what are",
+        "what's": "what is",
+        "whats": "what is",  # technically incorrect but some STT outputs
+        "what've": "what have",
+        "when's": "when is",
+        "when'd": "when did",
+        "where'd": "where did",
+        "where's": "where is",
+        "where've": "where have",
+        "who'd": "who would",
+        "who'd've": "who would have",
+        "who'll": "who will",
+        "who're": "who are",
+        "who's": "who is",
+        "who've": "who have",
+        "why'd": "why did",
+        "why're": "why are",
+        "why's": "why is",
+        "won't": "will not",
+        "won't've": "will not have",
+        "would've": "would have",
+        "wouldn't": "would not",
+        "wouldn't've": "would not have",
+        "y'all": "you all",
+        "ya'll": "you all",
+        "you'd": "you would",
+        "you'd've": "you would have",
+        "you'll": "you will",
+        "y'aint": "you are not",
+        "y'ain't": "you are not",
+        "you're": "you are",
+        "you've": "you have",
+    }
+
+    # Number words mapped to their digit strings, e.g. "two" -> "2"
+    number_words = ["zero", "one", "two", "three", "four", "five", "six",
+                    "seven", "eight", "nine", "ten", "eleven", "twelve",
+                    "thirteen", "fourteen", "fifteen", "sixteen",
+                    "seventeen", "eighteen", "nineteen", "twenty"]
+    digits = {name: str(value) for value, name in enumerate(number_words)}
+
+    pieces = []
+    for token in text.split():  # split() also discards extra spaces
+        # articles are tested on the raw token, before any expansion
+        if remove_articles and token in articles:
+            continue
+        token = expansions.get(token, token)
+        token = digits.get(token, token)
+        pieces.append(token)
+
+    return " ".join(pieces)
--- a/mycroft/util/lang/parse_es.py
+++ b/mycroft/util/lang/parse_es.py
@ -0,0 +1,194 @@
+# -*- coding: utf-8 -*-
+#
+# Copyright 2017 Mycroft AI Inc.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+from datetime import datetime, timedelta
+from dateutil.relativedelta import relativedelta
+from mycroft.util.lang.parse_common import *
+"""
+    Parse functions for spanish (es)
+    TODO: numbers greater than 999999
+"""
+
+# Indefinite articles ["un", "una", "unos", "unas"] cannot be suppressed:
+# in Spanish, "un caballo" means either "a horse" or "one horse".
+es_articles = ["el", "la", "los", "las"]
+
+# Spanish number words and their values. Unaccented and accented
+# spellings are both listed because either may appear in the input.
+es_numbers_xlat = {
+    "cero": 0,
+    "un": 1,
+    "uno": 1,
+    "una": 1,
+    "dos": 2,
+    "tres": 3,
+    u"trés": 3,
+    "cuatro": 4,
+    "cinco": 5,
+    "seis": 6,
+    "siete": 7,
+    "ocho": 8,
+    "nueve": 9,
+    "diez": 10,
+    "once": 11,
+    "doce": 12,
+    "trece": 13,
+    "catorce": 14,
+    "quince": 15,
+    "dieciseis": 16,
+    u"dieciséis": 16,
+    "diecisiete": 17,
+    "dieciocho": 18,
+    "diecinueve": 19,
+    "veinte": 20,
+    "veintiuno": 21,
+    "veintidos": 22,
+    u"veintidós": 22,
+    "veintitres": 23,
+    u"veintitrés": 23,
+    "veinticuatro": 24,
+    "veinticinco": 25,
+    u"veintiséis": 26,
+    "veintiseis": 26,
+    "veintisiete": 27,
+    "veintiocho": 28,
+    "veintinueve": 29,
+    "treinta": 30,
+    "cuarenta": 40,
+    "cincuenta": 50,
+    "sesenta": 60,
+    "setenta": 70,
+    "ochenta": 80,
+    "noventa": 90,
+    "cien": 100,
+    "ciento": 100,
+    "doscientos": 200,
+    "doscientas": 200,
+    "trescientos": 300,
+    "trescientas": 300,
+    "cuatrocientos": 400,
+    "cuatrocientas": 400,
+    "quinientos": 500,
+    "quinientas": 500,
+    "seiscientos": 600,
+    "seiscientas": 600,
+    "setecientos": 700,
+    "setecientas": 700,
+    "ochocientos": 800,
+    "ochocientas": 800,
+    "novecientos": 900,
+    "novecientas": 900}
+
+
+def es_parse(words, i):
+    """
+    Parse a Spanish number (0 to 999999) written out in words.
+
+    The accepted grammar is "cero" or [1-999] ("mil" [1-999]?)?; a bare
+    "mil" without a leading multiplier is not recognised.
+
+    Args:
+        words (list): the tokens of the sentence
+        i (int): index in words at which the number is expected to start
+    Returns:
+        (tuple) or None: (value, next_index), where next_index is the
+        index of the first token after the number, or None when no
+        number starts at position i
+
+    """
+    def es_cte(i, s):
+        # match the literal token s at position i
+        if i < len(words) and s == words[i]:
+            return s, i + 1
+        return None
+
+    def es_number_word(i, mi, ma):
+        # match a single number word whose value lies within [mi, ma]
+        if i < len(words):
+            v = es_numbers_xlat.get(words[i])
+            # test against None, not truthiness, so that 0 is accepted
+            if v is not None and mi <= v <= ma:
+                return v, i + 1
+        return None
+
+    def es_number_1_99(i):
+        # [1-29] are single words
+        r1 = es_number_word(i, 1, 29)
+        if r1 is not None:
+            return r1
+
+        # tens, optionally followed by "y" [1-9]
+        r1 = es_number_word(i, 30, 90)
+        if r1 is not None:
+            v1, i1 = r1
+            r2 = es_cte(i1, "y")
+            if r2 is not None:
+                r3 = es_number_word(r2[1], 1, 9)
+                if r3 is not None:
+                    v3, i3 = r3
+                    return v1 + v3, i3
+            # a dangling "y" is not consumed
+            return r1
+        return None
+
+    def es_number_1_999(i):
+        # [1-9]cientos [1-99]?
+        r1 = es_number_word(i, 100, 900)
+        if r1 is not None:
+            v1, i1 = r1
+            r2 = es_number_1_99(i1)
+            if r2 is not None:
+                v2, i2 = r2
+                return v1 + v2, i2
+            return r1
+
+        # [1-99]
+        return es_number_1_99(i)
+
+    def es_number(i):
+        # check for cero
+        r1 = es_number_word(i, 0, 0)
+        if r1 is not None:
+            return r1
+
+        # check for [1-999] (mil [1-999]?)?
+        r1 = es_number_1_999(i)
+        if r1 is None:
+            return None
+        v1, i1 = r1
+        r2 = es_cte(i1, "mil")
+        if r2 is None:
+            return r1
+        i2 = r2[1]
+        r3 = es_number_1_999(i2)
+        if r3 is not None:
+            v3, i3 = r3
+            return v1 * 1000 + v3, i3
+        return v1 * 1000, i2
+
+    return es_number(i)
+
+
+def normalize_es(text, remove_articles):
+    """
+    Spanish string normalization.
+
+    Collapses runs of whitespace, optionally drops the articles listed
+    in es_articles and replaces spelled-out numbers with digits.
+
+    Args:
+        text (str): the sentence to normalize
+        remove_articles (bool): whether the es_articles words are dropped
+    Returns:
+        (str): the normalized sentence, tokens separated by single spaces
+
+    """
+    tokens = text.split()  # split() also discards extra spaces
+    total = len(tokens)
+
+    pieces = []
+    pos = 0
+    while pos < total:
+        current = tokens[pos]
+
+        # articles are skipped before any attempt to read a number
+        if remove_articles and current in es_articles:
+            pos += 1
+            continue
+
+        # A number may span several tokens; es_parse reports where it ends
+        parsed = es_parse(tokens, pos)
+        if parsed:
+            value, pos = parsed
+            pieces.append(str(value))
+        else:
+            pieces.append(current)
+            pos += 1
+
+    return " ".join(pieces)
--- a/mycroft/util/lang/parse_pt.py
+++ b/mycroft/util/lang/parse_pt.py
--- a/mycroft/util/parse.py
+++ b/mycroft/util/parse.py
--- a/test/unittests/util/test_parse.py
+++ b/test/unittests/util/test_parse.py
@ -375,7 +375,7 @@ class TestNormalize(unittest.TestCase):
        self.assertEqual(
            extractnumber("tres quartos de chocolate", lang="pt"),
            3.0 / 4.0)
-        self.assertEqual(extractnumber(u"tr<EFBFBD>s quarto de chocolate",
+        self.assertEqual(extractnumber(u"três quarto de chocolate",
                                       lang="pt"), 3.0 / 4.0)
        self.assertEqual(extractnumber("sete ponto cinco", lang="pt"), 7.5)
        self.assertEqual(extractnumber("sete ponto 5", lang="pt"), 7.5)
@ -420,9 +420,9 @@ class TestNormalize(unittest.TestCase):
                         "isto e 1 teste")

    def test_numbers_pt(self):
-        self.assertEqual(normalize(u"isto e o um dois tr<EFBFBD>s teste", lang="pt"),
+        self.assertEqual(normalize(u"isto e o um dois três teste", lang="pt"),
                         u"isto 1 2 3 teste")
-        self.assertEqual(normalize(u"<EFBFBD> a sete oito nove  test", lang="pt"),
+        self.assertEqual(normalize(u"ê a sete oito nove  test", lang="pt"),
                         u"7 8 9 test")
        self.assertEqual(
            normalize("teste zero dez onze doze treze", lang="pt"),
@ -459,9 +459,9 @@ class TestNormalize(unittest.TestCase):
            self.assertEqual(res[0], expected_date)
            self.assertEqual(res[1], expected_leftover)

-        testExtract(u"que dia <EFBFBD> hoje",
+        testExtract(u"que dia é hoje",
                    "2017-06-27 00:00:00", u"dia")
-        testExtract(u"que dia <EFBFBD> amanha",
+        testExtract(u"que dia é amanha",
                    "2017-06-28 00:00:00", u"dia")
        testExtract(u"que dia foi ontem",
                    "2017-06-26 00:00:00", u"dia")
@ -513,7 +513,7 @@ class TestNormalize(unittest.TestCase):
        testExtract("lembra me para ligar a mae no dia 3 de agosto",
                    "2017-08-03 00:00:00", "lembra ligar mae")

-        testExtract(u"compra facas no 13<EFBFBD> dia de maio",
+        testExtract(u"compra facas no 13º dia de maio",
                    "2018-05-13 00:00:00", "compra facas")
        testExtract(u"gasta dinheiro no maio dia 13",
                    "2018-05-13 00:00:00", "gasta dinheiro")
@ -588,7 +588,7 @@ class TestNormalize(unittest.TestCase):
        self.assertEqual(normalize("diez once doce trece catorce quince",
                                   lang="es"),
                         "10 11 12 13 14 15")
-        self.assertEqual(normalize(u"diecis<EFBFBD>is diecisiete", lang="es"),
+        self.assertEqual(normalize(u"dieciséis diecisiete", lang="es"),
                         "16 17")
        self.assertEqual(normalize(u"dieciocho diecinueve", lang="es"),
                         "18 19")
@ -609,7 +609,7 @@ class TestNormalize(unittest.TestCase):
                                   lang="es"),
                         "2345")
        self.assertEqual(normalize(
-            u"ciento veintitr<EFBFBD>s mil cuatrocientas cincuenta y seis",
+            u"ciento veintitrés mil cuatrocientas cincuenta y seis",
            lang="es"),
            "123456")
        self.assertEqual(normalize(