mycroft-core/mycroft/util/parse.py

# -*- coding: iso-8859-15 -*-
#
# Copyright 2017 Mycroft AI Inc.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#    http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
from datetime import datetime, timedelta
from dateutil.relativedelta import relativedelta
from difflib import SequenceMatcher


def fuzzy_match(x, against):
    """Score how similar two strings are.

    Args:
        x (str): first string
        against (str): string to compare it with
    Returns:
        float: similarity ratio as computed by difflib.SequenceMatcher,
               1.0 for identical strings down to 0.0 for nothing in common.
    """
    matcher = SequenceMatcher(None, x, against)
    return matcher.ratio()


def extractnumber(text, lang="en-us"):
    """Extract a number from a string using the parser for `lang`.

    Args:
        text (str): the string to extract a number from
        lang (str): the code for the language text is in
    Returns:
        The result of the language-specific extractor (English and
        Portuguese are dispatched), or `text` unchanged when the language
        has no extractor.
    """
    # Only the two-letter language prefix decides which parser is used.
    prefix = str(lang).lower()[:2]
    if prefix == "en":
        return extractnumber_en(text)
    if prefix == "pt":
        return extractnumber_pt(text)

    # TODO: Normalization for other languages
    return text


def extract_datetime(text, anchorDate=None, lang="en-us"):
    """
    Extract date and time information from a sentence.

    Dispatches to the parser for the requested language, which handles
    many of the common ways that humans express dates and times,
    including relative dates like "5 days from today".

    Vague terminology is given arbitrary values, like:
        - morning = 8 AM
        - afternoon = 3 PM
        - evening = 7 PM

    If a time isn't supplied, the function defaults to 12 AM

    Args:
        text (str): the sentence to parse
        anchorDate (:obj:`datetime`, optional): the date to be used for
            relative dating (for example, what does "tomorrow" mean?).
            Defaults to the current date
            (acquired with datetime.datetime.now())
        lang (str): the language of the sentence(s)

    Returns:
        [:obj:`datetime`, :obj:`str`]: 'datetime' is the extracted date
            as a datetime object. Times are represented in 24 hour notation.
            'leftover_string' is the original phrase with all date and time
            related keywords stripped out. See examples for further
            clarification

            For a language without a parser (anything other than English
            or Portuguese) `text` is returned unchanged.

    Examples:

        >>> extract_datetime(
        ... "What is the weather like the day after tomorrow?",
        ... datetime(2017, 6, 30, 0, 0)
        ... )
        [datetime.datetime(2017, 7, 2, 0, 0), 'what is weather like']

        >>> extract_datetime(
        ... "Set up an appointment 2 weeks from Sunday at 5 pm",
        ... datetime(2016, 2, 19, 0, 0)
        ... )
        [datetime.datetime(2016, 3, 6, 17, 0), 'set up appointment']
    """
    # Only the two-letter language prefix decides which parser is used.
    prefix = str(lang).lower()[:2]

    if prefix == "en":
        return extract_datetime_en(text, anchorDate)
    if prefix == "pt":
        return extract_datetime_pt(text, anchorDate)

    return text


def is_numeric(input_str):
    """
    Test whether a string can be read as a number.

    Args:
        input_str (str): string to test
    Returns:
        (bool): True if float() accepts the string, else False

    """
    try:
        float(input_str)
    except ValueError:
        return False
    else:
        return True


def extractnumber_en(text):
    """
    Extract the first number expressed in an English phrase.

    Understands digits ("3", "2.5"), the words "one" to "ten", "first" and
    "second", named fractions ("half", "two thirds"), written fractions
    ("2/3") and a whole number joined to a fraction by "and"
    ("one and a half").  The articles "the", "a" and "an" are ignored.

    Args:
        text (str): the phrase to search
    Returns:
        (int) or (float): the value of the extracted number, or False when
                          no number was found (a value of zero is also
                          reported as False)

    """
    numberWords = {"one": 1, "two": 2, "three": 3, "four": 4, "five": 5,
                   "six": 6, "seven": 7, "eight": 8, "nine": 9, "ten": 10}

    aWords = [word for word in text.split()
              if word not in ("the", "a", "an")]
    andPass = False      # True once "<number> and" has been seen
    valPreAnd = False    # the number that preceded "and"
    val = False
    count = 0
    while count < len(aWords):
        word = aWords[count]
        if is_numeric(word):
            val = float(word)
        elif word == "first":
            val = 1
        elif word == "second":
            val = 2
        else:
            fraction = isFractional_en(word)
            if fraction:
                val = fraction
            else:
                # val is still falsy here, so an unknown word leaves it so
                val = numberWords.get(word, val)
                if val:
                    # "two thirds": a number word scaled by a named fraction
                    if count + 1 < len(aWords):
                        wordNext = aWords[count + 1]
                    else:
                        wordNext = ""
                    valNext = isFractional_en(wordNext)
                    if valNext:
                        val = val * valNext

        if not val:
            # look for fractions like "2/3"
            aPieces = word.split('/')
            if look_for_fractions(aPieces):
                val = float(aPieces[0]) / float(aPieces[1])
            elif andPass:
                # nothing usable follows "and": keep the number before it
                val = valPreAnd
                break
            else:
                count += 1
                continue

        if andPass:
            # "<number> and <number>": add both parts
            val += valPreAnd
        elif count + 1 < len(aWords) and aWords[count + 1] == 'and':
            andPass = True
            valPreAnd = val
            val = False
            count += 2
            continue
        elif count + 2 < len(aWords) and aWords[count + 2] == 'and':
            andPass = True
            valPreAnd = val
            val = False
            count += 3
            continue

        break

    if not val:
        return False

    return val


def extract_datetime_en(str, currentDate=None):
    """Extract a date and time from an English sentence.

    The sentence is lower-cased and cleaned, then scanned twice: the first
    pass collects date expressions ("tomorrow", "next friday", "june 5th",
    "2 weeks from sunday"), the second collects times ("5pm", "10:45 pm",
    "noon", "in 3 hours", "0800 hours").  Every word that was understood is
    blanked, so the words that remain form the leftover sentence.

    Args:
        str (str): the sentence to parse
        currentDate (datetime, optional): anchor for relative expressions
            such as "tomorrow"; defaults to datetime.now()

    Returns:
        list or None: [extracted datetime, leftover sentence], or None when
            the sentence is empty or holds no date/time expression.

    Raises:
        ValueError: when the words name an impossible calendar date
            (e.g. "february 30").
    """
    def clean_string(sentence):
        # lower-case, drop punctuation and articles, strip possessives and
        # ordinal suffixes ("3rd" -> "3")
        sentence = sentence.lower().replace('?', '').replace('.', '') \
            .replace(',', '').replace(' the ', ' ').replace(' a ', ' ') \
            .replace(' an ', ' ')
        wordList = sentence.split()
        for position, token in enumerate(wordList):
            token = token.replace("'s", "")
            # a bare "'s" leaves an empty token, hence the emptiness check
            if token and token[0].isdigit():
                for suffix in ("rd", "st", "nd", "th"):
                    token = token.replace(suffix, "")
            wordList[position] = token
        return wordList

    def word_at(position):
        # neighbouring word, or "" when outside the sentence
        return words[position] if 0 <= position < len(words) else ""

    def blank(first, count):
        # mark `count` words starting at `first` as consumed
        for position in range(max(first, 0),
                              min(first + count, len(words))):
            words[position] = ""

    def is_number(token):
        # safe for "" and guarantees that int(token) succeeds
        return token.isdigit()

    def is_day(token):
        return token.isdigit() and len(token) <= 2

    def is_year(token):
        return token.isdigit() and len(token) == 4

    def date_found():
        return found or (
            datestr != "" or
            yearOffset != 0 or monthOffset != 0 or
            dayOffset is not False or hrOffset != 0 or
            hrAbs != 0 or minOffset != 0 or
            minAbs != 0 or secOffset != 0
        )

    if str == "":
        return None
    if currentDate is None:
        currentDate = datetime.now()

    found = False
    daySpecified = False
    dayOffset = False  # False = no day mentioned; 0 = "today"
    monthOffset = 0
    yearOffset = 0
    dateNow = currentDate
    today = int(dateNow.strftime("%w"))  # 0 = Sunday ... 6 = Saturday
    fromFlag = False
    datestr = ""
    dateFormat = ""
    hasYear = False
    timeQualifier = ""

    timeQualifiersList = ['morning', 'afternoon', 'evening']
    markers = ['at', 'in', 'on', 'by', 'this', 'around', 'for', 'of']
    days = ['monday', 'tuesday', 'wednesday',
            'thursday', 'friday', 'saturday', 'sunday']
    months = ['january', 'february', 'march', 'april', 'may', 'june',
              'july', 'august', 'september', 'october', 'november',
              'december']
    monthsShort = ['jan', 'feb', 'mar', 'apr', 'may', 'june', 'july', 'aug',
                   'sept', 'oct', 'nov', 'dec']
    validFollowups = days + months + monthsShort + [
        "today", "tomorrow", "next", "last", "now"]

    words = clean_string(str)

    # ---- pass 1: dates -------------------------------------------------
    for idx, word in enumerate(words):
        if word == "":
            continue
        wordPrevPrev = word_at(idx - 2)
        wordPrev = word_at(idx - 1)
        wordNext = word_at(idx + 1)
        wordNextNext = word_at(idx + 2)

        # this isn't in clean string because I don't want to save back to words
        word = word.rstrip('s')
        start = idx
        used = 0
        if word in timeQualifiersList:
            # save timequalifier for later
            timeQualifier = word
        # parse today, tomorrow, day after tomorrow
        elif word == "today" and not fromFlag:
            dayOffset = 0
            used += 1
        elif word == "tomorrow" and not fromFlag:
            dayOffset = 1
            used += 1
        elif (word == "day" and
                wordNext == "after" and
                wordNextNext == "tomorrow" and
                not fromFlag and
                not is_number(wordPrev)):
            dayOffset = 2
            used = 3
            if wordPrev == "the":
                start -= 1
                used += 1
        # parse 5 days, 10 weeks, last week, next week
        elif word == "day":
            if is_number(wordPrev):
                # dayOffset may still be False, which adds as 0
                dayOffset += int(wordPrev)
                start -= 1
                used = 2
        elif word == "week" and not fromFlag:
            if is_number(wordPrev):
                dayOffset += int(wordPrev) * 7
                start -= 1
                used = 2
            elif wordPrev == "next":
                dayOffset = 7
                start -= 1
                used = 2
            elif wordPrev == "last":
                dayOffset = -7
                start -= 1
                used = 2
        # parse 10 months, next month, last month
        elif word == "month" and not fromFlag:
            if is_number(wordPrev):
                monthOffset = int(wordPrev)
                start -= 1
                used = 2
            elif wordPrev == "next":
                monthOffset = 1
                start -= 1
                used = 2
            elif wordPrev == "last":
                monthOffset = -1
                start -= 1
                used = 2
        # parse 5 years, next year, last year
        elif word == "year" and not fromFlag:
            if is_number(wordPrev):
                yearOffset = int(wordPrev)
                start -= 1
                used = 2
            elif wordPrev == "next":
                yearOffset = 1
                start -= 1
                used = 2
            elif wordPrev == "last":
                yearOffset = -1
                start -= 1
                used = 2
        # parse Monday, Tuesday, etc., and next Monday, last Tuesday, etc.
        elif word in days and not fromFlag:
            dayOffset = (days.index(word) + 1) - today
            used = 1
            if dayOffset < 0:
                dayOffset += 7
            if wordPrev == "next":
                dayOffset += 7
                used += 1
                start -= 1
            elif wordPrev == "last":
                dayOffset -= 7
                used += 1
                start -= 1
        # parse 15 of July, June 20th, Feb 18, 19 of February
        # NOTE(review): as originally written, `not fromFlag` only guards
        # the short month names; kept as is - confirm whether full names
        # should be guarded too.
        elif word in months or (word in monthsShort and not fromFlag):
            if word in months:
                m = months.index(word)
            else:
                m = monthsShort.index(word)
            used += 1
            datestr = months[m]
            dateFormat = "%B"  # a month without a day means the 1st
            hasYear = False
            if is_day(wordPrev) or \
                    (wordPrev == "of" and is_day(wordPrevPrev)):
                if wordPrev == "of":
                    datestr += " " + wordPrevPrev
                    used += 1
                    start -= 1
                else:
                    datestr += " " + wordPrev
                dateFormat += " %d"
                start -= 1
                used += 1
                if is_year(wordNext):
                    datestr += " " + wordNext
                    dateFormat += " %Y"
                    used += 1
                    hasYear = True
            elif is_day(wordNext):
                datestr += " " + wordNext
                dateFormat += " %d"
                used += 1
                if is_year(wordNextNext):
                    datestr += " " + wordNextNext
                    dateFormat += " %Y"
                    used += 1
                    hasYear = True

        # parse 5 days from tomorrow, 10 weeks from next thursday,
        # 2 months from July
        if (word == "from" or word == "after") and wordNext in validFollowups:
            used = 2
            fromFlag = True
            if wordNext == "tomorrow":
                dayOffset += 1
            elif wordNext in days:
                tmpOffset = (days.index(wordNext) + 1) - today
                if tmpOffset < 0:
                    tmpOffset += 7
                dayOffset += tmpOffset
            elif wordNextNext in days:
                tmpOffset = (days.index(wordNextNext) + 1) - today
                # "from", the qualifier and the day name: three words.
                # The word *before* "from" is not part of the phrase.
                used = 3
                if wordNext == "next":
                    tmpOffset += 7
                elif wordNext == "last":
                    tmpOffset -= 7
                dayOffset += tmpOffset

        if used > 0:
            if start - 1 >= 0 and words[start - 1] == "this":
                start -= 1
                used += 1

            blank(start, used)

            if start - 1 >= 0 and words[start - 1] in markers:
                words[start - 1] = ""
            found = True
            daySpecified = True

    # ---- pass 2: times -------------------------------------------------
    hrOffset = 0
    minOffset = 0
    secOffset = 0
    hrAbs = 0   # -1 marks "relative time only" (e.g. "in 10 minutes")
    minAbs = 0
    defaultHour = {"morning": 8, "afternoon": 15, "evening": 19}
    dayPartSuffix = {"morning": "am", "afternoon": "pm", "evening": "pm"}

    for idx, word in enumerate(words):
        if word == "":
            continue

        wordPrevPrev = word_at(idx - 2)
        wordPrev = word_at(idx - 1)
        wordNext = word_at(idx + 1)
        wordNextNext = word_at(idx + 2)
        wordNextNextNext = word_at(idx + 3)
        used = 0
        # parse noon, midnight, morning, afternoon, evening
        if word == "noon":
            hrAbs = 12
            used += 1
        elif word == "midnight":
            hrAbs = 0
            used += 1
        elif word in defaultHour:
            # a vague day part only applies when no hour is known yet
            if hrAbs == 0:
                hrAbs = defaultHour[word]
            used += 1
        # parse half an hour, quarter hour
        elif word == "hour" and \
                (wordPrev in markers or wordPrevPrev in markers):
            if wordPrev == "half":
                minOffset = 30
            elif wordPrev == "quarter":
                minOffset = 15
            elif wordPrevPrev == "quarter":
                minOffset = 15
                if idx > 2 and words[idx - 3] in markers:
                    words[idx - 3] = ""
                words[idx - 2] = ""
            else:
                hrOffset = 1
            if wordPrevPrev in markers:
                words[idx - 2] = ""
            words[idx - 1] = ""
            used += 1
            hrAbs = -1
            minAbs = -1
        # parse 5:00 am, 12:00 p.m., etc
        elif word[0].isdigit():
            isTime = True
            strHH = ""
            strMM = ""
            remainder = ""
            if ':' in word:
                # split "HH:MMsuffix" into hour digits, minute digits and
                # whatever trails them ("3:00pm" -> "3", "00", "pm")
                pos = 0
                while pos < len(word) and word[pos].isdigit():
                    strHH += word[pos]
                    pos += 1
                if pos < len(word) and word[pos] == ":":
                    pos += 1
                    while pos < len(word) and word[pos].isdigit():
                        strMM += word[pos]
                        pos += 1
                remainder = word[pos:].replace(".", "")

                if remainder == "":
                    # no suffix attached: look at the following words
                    nextWord = wordNext.replace(".", "")
                    if nextWord == "am" or nextWord == "pm":
                        remainder = nextWord
                        used += 1
                    elif nextWord == "tonight":
                        remainder = "pm"
                        used += 1
                    elif (wordNext == "in" and wordNextNext == "the" and
                            wordNextNextNext in dayPartSuffix):
                        # "3:00 in the morning"
                        remainder = dayPartSuffix[wordNextNextNext]
                        used += 3
                    elif (wordNext in ("in", "this") and
                            wordNextNext in dayPartSuffix):
                        # "3:00 in morning", "3:00 this afternoon"
                        remainder = dayPartSuffix[wordNextNext]
                        used += 2
                    elif wordNext == "at" and wordNextNext == "night":
                        remainder = "pm" if int(strHH) > 5 else "am"
                        used += 2
                    elif (timeQualifier in ("afternoon", "evening") and
                            int(strHH) < 12):
                        # a day part elsewhere in the sentence decides
                        strHH = int(strHH) + 12
            else:
                # try to parse # s without colons
                # 5 hours, 10 minutes etc.
                strNum = ""
                for char in word:
                    if char.isdigit():
                        strNum += char
                    else:
                        remainder += char
                inWord = remainder  # letters glued to the number ("5pm")

                if remainder == "":
                    remainder = wordNext.replace(".", "").strip()

                candidates = (remainder, wordNext)
                meridiem = ""
                if "pm" in candidates or "p.m." in candidates:
                    meridiem = "pm"
                elif "am" in candidates or "a.m." in candidates:
                    meridiem = "am"

                if meridiem:
                    strHH = strNum
                    remainder = meridiem
                    # consume the next word only when it carried the am/pm
                    if inWord not in ("am", "pm"):
                        used = 1
                elif not word.isdigit():
                    # something like "4k": not a clock time
                    isTime = False
                else:
                    number = int(word)
                    if number > 100 and wordPrev in ("o", "oh"):
                        # 0800 hours (pronounced oh-eight-hundred)
                        strHH = number // 100
                        strMM = number - strHH * 100
                        if wordNext == "hours":
                            used += 1
                    elif (wordNext == "hours" and word[0] != '0' and
                            (number < 100 or number > 2400)):
                        # ignores military time
                        # "in 3 hours"
                        hrOffset = number
                        used = 2
                        isTime = False
                        hrAbs = -1
                        minAbs = -1
                    elif wordNext == "minutes":
                        # "in 10 minutes"
                        minOffset = number
                        used = 2
                        isTime = False
                        hrAbs = -1
                        minAbs = -1
                    elif wordNext == "seconds":
                        # in 5 seconds
                        secOffset = number
                        used = 2
                        isTime = False
                        hrAbs = -1
                        minAbs = -1
                    elif number > 100:
                        # military time without the leading "oh": "1730"
                        strHH = number // 100
                        strMM = number - strHH * 100
                        if wordNext == "hours":
                            used += 1
                    elif is_number(wordNext):
                        # "03 45"
                        strHH = word
                        strMM = wordNext
                        used += 1
                        if wordNextNext == "hours":
                            used += 1
                    elif (wordNext == "" or wordNext == "o'clock" or
                            (wordNext == "in" and
                             (wordNextNext == "the" or
                              wordNextNext == timeQualifier))):
                        strHH = word
                        if wordNext == "o'clock":
                            used += 1
                        if wordNext == "in" or wordNextNext == "in":
                            used += (1 if wordNext == "in" else 2)
                            nearby = (wordNextNext, wordNextNextNext)
                            if "afternoon" in nearby or "evening" in nearby:
                                remainder = "pm"
                            if "morning" in nearby:
                                remainder = "am"
                    else:
                        isTime = False

            strHH = int(strHH) if strHH else 0
            strMM = int(strMM) if strMM else 0
            strHH = strHH + 12 if remainder == "pm" and strHH < 12 else strHH
            strHH = strHH - 12 if remainder == "am" and strHH >= 12 else strHH
            if strHH > 24 or strMM > 59:
                isTime = False
                used = 0
            if isTime:
                hrAbs = strHH
                minAbs = strMM
                used += 1

        if used > 0:
            # removed parsed words from the sentence
            blank(idx, used)

            if wordPrev == "o" or wordPrev == "oh":
                words[idx - 1] = ""

            if wordPrev == "early":
                hrOffset = -1
                words[idx - 1] = ""
            elif wordPrev == "late":
                hrOffset = 1
                words[idx - 1] = ""
            if idx > 0 and wordPrev in markers:
                words[idx - 1] = ""
            if idx > 1 and wordPrevPrev in markers:
                words[idx - 2] = ""

            found = True

    # check that we found a date
    if not date_found():
        return None

    if dayOffset is False:
        dayOffset = 0

    # perform date manipulation

    # "in 10 minutes" counts from the anchor time; everything else counts
    # from midnight of the anchor day
    isRelative = hrAbs == -1 or minAbs == -1
    extractedDate = dateNow.replace(microsecond=0)
    if not isRelative:
        extractedDate = extractedDate.replace(second=0, minute=0, hour=0)

    if datestr != "":
        temp = datetime.strptime(datestr, dateFormat)
        if hasYear:
            extractedDate = extractedDate.replace(year=temp.year,
                                                  month=temp.month,
                                                  day=temp.day)
        else:
            # no year given: take the next occurrence of that date
            candidate = extractedDate.replace(month=temp.month,
                                              day=temp.day)
            if extractedDate < candidate:
                extractedDate = candidate
            else:
                extractedDate = candidate.replace(year=candidate.year + 1)

    # months and years need calendar-aware arithmetic (relativedelta);
    # fixed-length units are plain timedeltas
    if yearOffset != 0:
        extractedDate = extractedDate + relativedelta(years=yearOffset)
    if monthOffset != 0:
        extractedDate = extractedDate + relativedelta(months=monthOffset)
    if dayOffset != 0:
        extractedDate = extractedDate + timedelta(days=dayOffset)
    if not isRelative:
        extractedDate = extractedDate + timedelta(hours=hrAbs,
                                                  minutes=minAbs)
        if (hrAbs != 0 or minAbs != 0) and datestr == "":
            # a bare time that already passed today means tomorrow
            if not daySpecified and dateNow > extractedDate:
                extractedDate = extractedDate + timedelta(days=1)
    if hrOffset != 0:
        extractedDate = extractedDate + timedelta(hours=hrOffset)
    if minOffset != 0:
        extractedDate = extractedDate + timedelta(minutes=minOffset)
    if secOffset != 0:
        extractedDate = extractedDate + timedelta(seconds=secOffset)

    # drop an "and" that only joined two consumed expressions
    for idx, word in enumerate(words):
        if word == "and" and word_at(idx - 1) == "" and \
                word_at(idx + 1) == "":
            words[idx] = ""

    resultStr = " ".join(words)
    resultStr = ' '.join(resultStr.split())
    return [extractedDate, resultStr]


def look_for_fractions(split_list):
    """
    Decide whether the pieces of a string split on '/' form a fraction.

    Args:
        split_list (list): list created by splitting on '/'
    Returns:
        (bool): True when there are exactly two pieces and both are
                numeric, otherwise False

    """
    if len(split_list) != 2:
        return False

    numerator, denominator = split_list
    if is_numeric(numerator) and is_numeric(denominator):
        return True
    return False


def isFractional_en(input_str):
    """
    Check whether a word names a fraction and return its value.

    The comparison ignores case and one trailing "s", so "Half", "thirds"
    and "Quarters" are all recognised.

    Args:
        input_str (str): the word to check
    Returns:
        (bool) or (float): False if not a fraction, otherwise the fraction
                           ("whole" is 1.0, "half" 0.5, "third" 1/3, ...)

    """
    # Normalise once so the membership test and the index lookup agree;
    # looking up the raw string raised ValueError for e.g. "Half".
    word = input_str.lower()
    if word.endswith('s'):
        word = word[:-1]  # e.g. "fifths"

    # position in the list + 1 is the denominator
    aFrac = ["whole", "half", "third", "fourth", "fifth", "sixth",
             "seventh", "eighth", "ninth", "tenth", "eleventh", "twelfth"]

    if word in aFrac:
        return 1.0 / (aFrac.index(word) + 1)
    if word == "quarter":
        return 1.0 / 4

    return False


def get_gender(word, input_string="", lang="en-us"):
    """
    Guess the gender of a word, optionally using the raw input text as
    context.

    Returns "m" if the word is male, "f" if female, False if unknown.
    Only language codes containing "pt" or "es" are handled; every other
    language yields False.
    """
    for code in ("pt", "es"):
        if code in lang:
            # spanish follows the same rules as portuguese
            return get_gender_pt(word, input_string)
    return False


# ==============================================================


def normalize(text, lang="en-us", remove_articles=True):
    """Prepare a string for parsing

    Dispatches to the language specific normalizer, which makes numbers
    consistent, gets rid of contractions, etc.
    Args:
        text (str): the string to normalize
        lang (str): the code for the language text is in
        remove_articles (bool): whether to remove articles (like 'a', or 'the')
    Returns:
        (str): The normalized string, or the text unchanged when the
               language has no normalizer.
    """
    # The first two characters of the lower-cased code select the language.
    # Lambdas keep the lookup lazy: only the chosen normalizer is touched.
    dispatch = {
        "en": lambda: normalize_en(text, remove_articles),
        "es": lambda: normalize_es(text, remove_articles),
        "pt": lambda: normalize_pt(text, remove_articles),
    }
    handler = dispatch.get(str(lang).lower()[:2])
    if handler is None:
        # TODO: Normalization for other languages
        return text
    return handler()


# Lookup tables for normalize_en().  They are built once at import time
# rather than for every word of every utterance.
_EN_ARTICLES = ("the", "a", "an")

# Common contractions and their expansions, e.g. "isn't" -> "is not"
_EN_CONTRACTIONS = {
    "ain't": "is not",
    "aren't": "are not",
    "can't": "can not",
    "could've": "could have",
    "couldn't": "could not",
    "didn't": "did not",
    "doesn't": "does not",
    "don't": "do not",
    "gonna": "going to",
    "gotta": "got to",
    "hadn't": "had not",
    "hasn't": "has not",
    "haven't": "have not",
    "he'd": "he would",
    "he'll": "he will",
    "he's": "he is",
    "how'd": "how did",
    "how'll": "how will",
    "how's": "how is",
    "I'd": "I would",
    "I'll": "I will",
    "I'm": "I am",
    "I've": "I have",
    "isn't": "is not",
    "it'd": "it would",
    "it'll": "it will",
    "it's": "it is",
    "mightn't": "might not",
    "might've": "might have",
    "mustn't": "must not",
    "must've": "must have",
    "needn't": "need not",
    "oughtn't": "ought not",
    "shan't": "shall not",
    "she'd": "she would",
    "she'll": "she will",
    "she's": "she is",
    "shouldn't": "should not",
    "should've": "should have",
    "somebody's": "somebody is",
    "someone'd": "someone would",
    "someone'll": "someone will",
    "someone's": "someone is",
    "that'll": "that will",
    "that's": "that is",
    "that'd": "that would",
    "there'd": "there would",
    "there're": "there are",
    "there's": "there is",
    "they'd": "they would",
    "they'll": "they will",
    "they're": "they are",
    "they've": "they have",
    "wasn't": "was not",
    "we'd": "we would",
    "we'll": "we will",
    "we're": "we are",
    "we've": "we have",
    "weren't": "were not",
    "what'd": "what did",
    "what'll": "what will",
    "what're": "what are",
    "what's": "what is",
    "whats": "what is",  # technically incorrect but some STT outputs
    "what've": "what have",
    "when's": "when is",
    "when'd": "when did",
    "where'd": "where did",
    "where's": "where is",
    "where've": "where have",
    "who'd": "who would",
    "who'd've": "who would have",
    "who'll": "who will",
    "who're": "who are",
    "who's": "who is",
    "who've": "who have",
    "why'd": "why did",
    "why're": "why are",
    "why's": "why is",
    "won't": "will not",
    "won't've": "will not have",
    "would've": "would have",
    "wouldn't": "would not",
    "wouldn't've": "would not have",
    "y'all": "you all",
    "ya'll": "you all",
    "you'd": "you would",
    "you'd've": "you would have",
    "you'll": "you will",
    "y'aint": "you are not",
    "y'ain't": "you are not",
    "you're": "you are",
    "you've": "you have",
}

# Spelled-out numbers up to twenty and their digit form, e.g. "two" -> "2"
_EN_NUMBER_WORDS = ["zero", "one", "two", "three", "four", "five", "six",
                    "seven", "eight", "nine", "ten", "eleven", "twelve",
                    "thirteen", "fourteen", "fifteen", "sixteen",
                    "seventeen", "eighteen", "nineteen", "twenty"]
_EN_NUMBER_DIGITS = dict((name, str(value))
                         for value, name in enumerate(_EN_NUMBER_WORDS))


def normalize_en(text, remove_articles):
    """ English string normalization

    Splits the text on whitespace (which also collapses extra spaces),
    optionally drops the articles "the", "a" and "an", expands common
    contractions and converts the number words "zero" .. "twenty" into
    digits.  All matching is case-sensitive.

    Args:
        text (str): the string to normalize
        remove_articles (bool): whether to remove articles
    Returns:
        (str): the normalized words joined by single spaces
    """
    normalized = []
    for word in text.split():
        if remove_articles and word in _EN_ARTICLES:
            continue

        # Expand common contractions, e.g. "isn't" -> "is not"
        word = _EN_CONTRACTIONS.get(word, word)

        # Convert numbers into digits, e.g. "two" -> "2"
        word = _EN_NUMBER_DIGITS.get(word, word)

        normalized.append(word)

    return " ".join(normalized)


####################################################################
# PT-PT
#
# TODO: numbers greater than 999999
# TODO: date time pt
####################################################################

# Indefinite articles ["um", "uma", "uns", "umas"] cannot be suppressed:
# in PT, "um cavalo" means both "a horse" and "one horse".
# Definite articles that normalize_pt() may drop.
pt_articles = ["o", "a", "os", "as"]

# Portuguese number words and their numeric value.
# Accented keys are written with \u escapes so that they do not depend on
# the encoding this source file is stored or read in.
pt_numbers = {
    "zero": 0,
    "um": 1,
    "uma": 1,
    "uns": 1,
    "umas": 1,
    "primeiro": 1,
    "segundo": 2,
    "terceiro": 3,
    "dois": 2,
    "duas": 2,
    "tres": 3,
    u"tr\u00eas": 3,  # "tres" with circumflex e
    "quatro": 4,
    "cinco": 5,
    "seis": 6,
    "sete": 7,
    "oito": 8,
    "nove": 9,
    "dez": 10,
    "onze": 11,
    "doze": 12,
    "treze": 13,
    "catorze": 14,
    "quinze": 15,
    "dezasseis": 16,
    "dezassete": 17,
    "dezoito": 18,
    "dezanove": 19,
    "vinte": 20,
    "trinta": 30,
    "quarenta": 40,
    "cinquenta": 50,
    "sessenta": 60,
    "setenta": 70,
    "oitenta": 80,
    "noventa": 90,
    "cem": 100,
    "cento": 100,
    "duzentos": 200,
    "duzentas": 200,
    "trezentos": 300,
    "trezentas": 300,
    "quatrocentos": 400,
    "quatrocentas": 400,
    "quinhentos": 500,
    "quinhentas": 500,
    "seiscentos": 600,
    "seiscentas": 600,
    "setecentos": 700,
    "setecentas": 700,
    "oitocentos": 800,
    "oitocentas": 800,
    "novecentos": 900,
    "novecentas": 900,
    "mil": 1000,
    u"milh\u00e3o": 1000000}  # "milhao" with tilde a


def isFractional_pt(input_str):
    """
    This function takes the given text and checks if it is a fraction.

    The comparison is case-insensitive and a single trailing "s" is
    ignored, so "Meio", "quinto" and "quintos" are all recognised.
    Accented words are written with \\u escapes so that matching does not
    depend on the encoding of this source file.

    Args:
        input_str (str): the word to check, e.g. "meio" or "quintos"
    Returns:
        (bool) or (float): False if not a fraction, otherwise the fraction

    """
    # Normalise the case once so that the membership test and the index
    # lookup below always look at the same string.
    word = input_str.lower()
    if word.endswith('s'):
        word = word[:-1]  # e.g. "quintos" -> "quinto"

    # Position in the list gives the denominator, starting at 2 for "meio"
    aFrac = ["meio", u"ter\u00e7o", "quarto", "quinto", "sexto",
             "setimo", "oitavo", "nono", u"d\u00e9cimo"]

    if word in aFrac:
        return 1.0 / (aFrac.index(word) + 2)

    # Fractions outside the 1/2 .. 1/10 run, plus spelling variants of 1/7
    if word == u"vig\u00e9simo":
        return 1.0 / 20
    if word == u"trig\u00e9simo":
        return 1.0 / 30
    if word == u"cent\u00e9simo":
        return 1.0 / 100
    if word == u"mil\u00e9simo":
        return 1.0 / 1000
    if word in (u"s\u00e9timo", "septimo", u"s\u00e9ptimo"):
        return 1.0 / 7

    return False


def extractnumber_pt(text):
    """
    Extract a number from a Portuguese text.

    The words are scanned from left to right.  Number words found in
    pt_numbers, digit strings, numeric strings, fraction words (see
    isFractional_pt) and "a/b" tokens all contribute to the result.  A
    number followed by "avos" divides the running result, and the text
    after "e" or after a decimal separator is parsed recursively.

    Args:
        text (str): the string to extract a number from
    Returns:
        (int) or (float): The value of the extracted number, or False
                          when no number was found

    """
    ands = ["e"]
    # u"v\u00edrgula" is "virgula" with an acute accent on the first i
    decimals = ["ponto", "virgula", u"v\u00edrgula", ".", ","]

    aWords = text.split()
    count = 0
    result = None
    while count < len(aWords):
        val = 0
        word = aWords[count]
        next_next_word = None
        if count + 1 < len(aWords):
            next_word = aWords[count + 1]
            if count + 2 < len(aWords):
                next_next_word = aWords[count + 2]
        else:
            next_word = None

        # is current word a number?
        if word in pt_numbers:
            val = pt_numbers[word]
        elif word.isdigit():  # doesn't work with decimals
            val = int(word)
        elif is_numeric(word):
            val = float(word)
        elif isFractional_pt(word):
            # a fraction word scales whatever was parsed so far
            if not result:
                result = 1
            result = result * isFractional_pt(word)
            count += 1
            continue

        if not val:
            # look for fractions like "2/3"
            aPieces = word.split('/')
            if look_for_fractions(aPieces):
                val = float(aPieces[0]) / float(aPieces[1])

        if val:
            if result is None:
                result = 0
            # handle fractions: "<n> avos" divides instead of adding
            if next_word != "avos":
                result += val
            else:
                result = float(result) / float(val)

        if next_word is None:
            break

        # number word and fraction
        if next_word in ands:
            zeros = 0
            if result is None:
                count += 1
                continue
            newWords = aWords[count + 2:]
            afterAndVal = extractnumber_pt(" ".join(newWords))
            if afterAndVal:
                if result < afterAndVal or result < 20:
                    # scale the value after "e" down below 1 ...
                    while afterAndVal > 1:
                        afterAndVal = afterAndVal / 10.0
                    # ... and once more per explicit leading zero
                    for w in newWords:
                        if w == "zero" or w == "0":
                            zeros += 1
                        else:
                            break
                for _ in range(0, zeros):
                    afterAndVal = afterAndVal / 10.0
                result += afterAndVal
                break
        elif next_next_word is not None:
            if next_next_word in ands:
                newWords = aWords[count + 3:]
                afterAndVal = extractnumber_pt(" ".join(newWords))
                if afterAndVal:
                    if result is None:
                        result = 0
                    result += afterAndVal
                    break

        if next_word in decimals:
            newWords = aWords[count + 2:]
            # explicit leading zeros after the separator are significant
            zeros = 0
            for w in newWords:
                if w == "zero" or w == "0":
                    zeros += 1
                else:
                    break
            afterDotVal = str(extractnumber_pt(" ".join(newWords)))
            afterDotVal = zeros * "0" + afterDotVal
            try:
                result = float(str(result) + "." + afterDotVal)
            except ValueError:
                # The two sides do not combine into a decimal literal, e.g.
                # nothing parseable follows the separator ("dois ponto").
                # Keep what was parsed so far instead of raising.
                pass
            break
        count += 1

    if result is None:
        return False

    if "." in str(result):
        integer, dec = str(result).split(".")
        # cast float to int when there is no fractional part
        if dec == "0":
            result = int(integer)

    return result


def pt_number_parse(words, i):
    """
    Parse a spelled-out Portuguese number that starts at words[i].

    Recognises "zero", the single words for 1-19, tens optionally
    followed by "e" and a unit ("vinte e um", "trinta e dois"), hundreds
    optionally followed directly by a 1-99 number, and any such 1-999
    number followed by "mil" and an optional 1-999 number.

    Args:
        words (list): the words of the sentence
        i (int): index of the word where parsing starts
    Returns:
        (tuple) or None: (value, index of the first word after the
                         number), or None if no number starts at words[i]
    """
    def pt_cte(i, s):
        # match the literal word s at position i
        if i < len(words) and s == words[i]:
            return s, i + 1
        return None

    def pt_number_word(i, mi, ma):
        # match a single number word whose value lies in [mi, ma]
        if i < len(words):
            v = pt_numbers.get(words[i])
            # compare against None: 0 ("zero") is a valid value
            if v is not None and mi <= v <= ma:
                return v, i + 1
        return None

    def pt_number_1_99(i):
        # 1-19 are single words
        r1 = pt_number_word(i, 1, 19)
        if r1:
            return r1

        # tens ("vinte" .. "noventa"), optionally followed by "e" + unit
        r1 = pt_number_word(i, 20, 90)
        if r1:
            v1, i1 = r1
            r2 = pt_cte(i1, "e")
            if r2:
                v2, i2 = r2
                r3 = pt_number_word(i2, 1, 9)
                if r3:
                    v3, i3 = r3
                    return v1 + v3, i3
            return r1
        return None

    def pt_number_1_999(i):
        # hundreds ("cem" .. "novecentos") [1-99]?
        r1 = pt_number_word(i, 100, 900)
        if r1:
            v1, i1 = r1
            r2 = pt_number_1_99(i1)
            if r2:
                v2, i2 = r2
                return v1 + v2, i2
            else:
                return r1

        # [1-99]
        r1 = pt_number_1_99(i)
        if r1:
            return r1

        return None

    def pt_number(i):
        # check for zero
        r1 = pt_number_word(i, 0, 0)
        if r1:
            return r1

        # check for [1-999] (mil [1-999]?)?
        r1 = pt_number_1_999(i)
        if r1:
            v1, i1 = r1
            r2 = pt_cte(i1, "mil")
            if r2:
                v2, i2 = r2
                r3 = pt_number_1_999(i2)
                if r3:
                    v3, i3 = r3
                    return v1 * 1000 + v3, i3
                else:
                    return v1 * 1000, i2
            else:
                return r1
        return None

    return pt_number(i)


def normalize_pt(text, remove_articles):
    """ PT string normalization

    Splits the text on whitespace, optionally drops the articles listed
    in pt_articles, converts spelled-out numbers into digits and hands
    the result to pt_pruning().
    """
    # Contractions are not common in PT, so none are expanded here.
    tokens = text.split()  # this also removes extra spaces
    pieces = []
    pos = 0
    while pos < len(tokens):
        token = tokens[pos]

        # remove articles
        if remove_articles and token in pt_articles:
            pos += 1
            continue

        # Convert numbers into digits, e.g. "dois" -> "2"; the parser
        # reports the position of the first word it did not consume
        parsed = pt_number_parse(tokens, pos)
        if parsed:
            value, pos = parsed
            pieces.append(str(value))
            continue

        # NOTE temporary , handle some numbers above >999
        if token in pt_numbers:
            token = str(pt_numbers[token])
        # end temporary

        pieces.append(token)
        pos += 1

    # some articles in pt-pt can not be removed, but many words can
    # this is experimental and some meaning may be lost
    # maybe agressive should default to False
    # only usage will tell, as a native speaker this seems reasonable
    return pt_pruning(" ".join(pieces), agressive=remove_articles)


def extract_datetime_pt(input_str, currentDate=None):
    def clean_string(str):
        # cleans the input string of unneeded punctuation and capitalization
        # among other things
        symbols = [".", ",", ";", "?", "!", u"<EFBFBD>", u"<EFBFBD>"]
        noise_words = ["o", "os", "a", "as", "do", "da", "dos", "das", "de",
                       "ao", "aos"]

        for word in symbols:
            str = str.replace(word, "")
        for word in noise_words:
            str = str.replace(" " + word + " ", " ")
        str = str.lower().replace(
            u"<EFBFBD>",
            "a").replace(
            u"<EFBFBD>",
            "c").replace(
            u"<EFBFBD>",
            "a").replace(
            u"<EFBFBD>",
            "a").replace(
            u"<EFBFBD>",
            "e").replace(
            u"<EFBFBD>",
            "e").replace(
            u"<EFBFBD>",
            "e").replace(
            u"<EFBFBD>",
            "o").replace(
            u"<EFBFBD>",
            "o").replace(
            "-",
            " ").replace(
            "_",
            "")
        # handle synonims and equivalents, "tomorrow early = tomorrow morning
        synonims = {"manha": ["manhazinha", "cedo", "cedinho"],
                    "tarde": ["tardinha", "tarde"],
                    "noite": ["noitinha", "anoitecer"],
                    "todos": ["ao", "aos"],
                    "em": ["do", "da", "dos", "das", "de"]}
        for syn in synonims:
            for word in synonims[syn]:
                str = str.replace(" " + word + " ", " " + syn + " ")
        # relevant plurals, cant just extract all s in pt
        wordlist = ["manhas", "noites", "tardes", "dias", "semanas", "anos",
                    "minutos", "segundos", "nas", "nos", "proximas",
                    "seguintes", "horas"]
        for idx, word in enumerate(wordlist):
            str = str.replace(word, word.rstrip('s'))
        str = str.replace("meses", "mes").replace("anteriores", "anterior")
        return str

    def date_found():
        return found or \
            (
                datestr != "" or timeStr != "" or
                yearOffset != 0 or monthOffset != 0 or
                dayOffset is True or hrOffset != 0 or
                hrAbs != 0 or minOffset != 0 or
                minAbs != 0 or secOffset != 0
            )

    if input_str == "":
        return None
    if currentDate is None:
        currentDate = datetime.now()

    found = False
    daySpecified = False
    dayOffset = False
    monthOffset = 0
    yearOffset = 0
    dateNow = currentDate
    today = dateNow.strftime("%w")
    currentYear = dateNow.strftime("%Y")
    fromFlag = False
    datestr = ""
    hasYear = False
    timeQualifier = ""

    words = clean_string(input_str).split(" ")
    timeQualifiersList = ['manha', 'tarde', 'noite']
    time_indicators = ["em", "as", "nas", "pelas", "volta", "depois", "estas",
                       "no", "dia", "hora"]
    days = ['segunda', 'terca', 'quarta',
            'quinta', 'sexta', 'sabado', 'domingo']
    months = ['janeiro', 'febreiro', 'marco', 'abril', 'maio', 'junho',
              'julho', 'agosto', 'setembro', 'outubro', 'novembro',
              'dezembro']
    monthsShort = ['jan', 'feb', 'mar', 'abr', 'mai', 'jun', 'jul', 'ag',
                   'set', 'out', 'nov', 'dec']
    nexts = ["proximo", "proxima"]
    suffix_nexts = ["seguinte", "subsequente", "seguir"]
    lasts = ["ultimo", "ultima"]
    suffix_lasts = ["passada", "passado", "anterior", "antes"]
    nxts = ["depois", "seguir", "seguida", "seguinte", "proxima", "proximo"]
    prevs = ["antes", "ante", "previa", "previamente", "anterior"]
    froms = ["partir", "em", "para", "na", "no", "daqui", "seguir",
             "depois", "por", "proxima", "proximo", "da", "do", "de"]
    thises = ["este", "esta", "deste", "desta", "neste", "nesta", "nesse",
              "nessa"]
    froms += thises
    lists = nxts + prevs + froms + time_indicators
    for idx, word in enumerate(words):
        if word == "":
            continue
        wordPrevPrevPrev = words[idx - 3] if idx > 2 else ""
        wordPrevPrev = words[idx - 2] if idx > 1 else ""
        wordPrev = words[idx - 1] if idx > 0 else ""
        wordNext = words[idx + 1] if idx + 1 < len(words) else ""
        wordNextNext = words[idx + 2] if idx + 2 < len(words) else ""
        wordNextNextNext = words[idx + 3] if idx + 3 < len(words) else ""

        start = idx
        used = 0
        # save timequalifier for later
        if word in timeQualifiersList:
            timeQualifier = word

        # parse today, tomorrow, yesterday
        elif word == "hoje" and not fromFlag:
            dayOffset = 0
            used += 1
        elif word == "amanha" and not fromFlag:
            dayOffset = 1
            used += 1
        elif word == "ontem" and not fromFlag:
            dayOffset -= 1
            used += 1
        # "before yesterday" and "before before yesterday"
        elif (word == "anteontem" or
              (word == "ante" and wordNext == "ontem")) and not fromFlag:
            dayOffset -= 2
            used += 1
            if wordNext == "ontem":
                used += 1
        elif word == "ante" and wordNext == "ante" and wordNextNext == \
                "ontem" and not fromFlag:
            dayOffset -= 3
            used += 3
        elif word == "anteanteontem" and not fromFlag:
            dayOffset -= 3
            used += 1
        # day after tomorrow
        elif word == "depois" and wordNext == "amanha" and not fromFlag:
            dayOffset += 2
            used = 2
        # day before yesterday
        elif word == "antes" and wordNext == "ontem" and not fromFlag:
            dayOffset -= 2
            used = 2
        # parse 5 days, 10 weeks, last week, next week, week after
        elif word == "dia":
            if wordNext == "depois" or wordNext == "antes":
                used += 1
                if wordPrev and wordPrev[0].isdigit():
                    dayOffset += int(wordPrev)
                    start -= 1
                    used += 1
            elif (wordPrev and wordPrev[0].isdigit() and
                    wordNext not in months and
                    wordNext not in monthsShort):
                dayOffset += int(wordPrev)
                start -= 1
                used += 2
            elif wordNext and wordNext[0].isdigit() and wordNextNext not in \
                    months and wordNextNext not in monthsShort:
                dayOffset += int(wordNext)
                start -= 1
                used += 2

        elif word == "semana" and not fromFlag:
            if wordPrev[0].isdigit():
                dayOffset += int(wordPrev) * 7
                start -= 1
                used = 2
            for w in nexts:
                if wordPrev == w:
                    dayOffset = 7
                    start -= 1
                    used = 2
            for w in lasts:
                if wordPrev == w:
                    dayOffset = -7
                    start -= 1
                    used = 2
            for w in suffix_nexts:
                if wordNext == w:
                    dayOffset = 7
                    start -= 1
                    used = 2
            for w in suffix_lasts:
                if wordNext == w:
                    dayOffset = -7
                    start -= 1
                    used = 2
        # parse 10 months, next month, last month
        elif word == "mes" and not fromFlag:
            if wordPrev[0].isdigit():
                monthOffset = int(wordPrev)
                start -= 1
                used = 2
            for w in nexts:
                if wordPrev == w:
                    monthOffset = 7
                    start -= 1
                    used = 2
            for w in lasts:
                if wordPrev == w:
                    monthOffset = -7
                    start -= 1
                    used = 2
            for w in suffix_nexts:
                if wordNext == w:
                    monthOffset = 7
                    start -= 1
                    used = 2
            for w in suffix_lasts:
                if wordNext == w:
                    monthOffset = -7
                    start -= 1
                    used = 2
        # parse 5 years, next year, last year
        elif word == "ano" and not fromFlag:
            if wordPrev[0].isdigit():
                yearOffset = int(wordPrev)
                start -= 1
                used = 2
            for w in nexts:
                if wordPrev == w:
                    yearOffset = 7
                    start -= 1
                    used = 2
            for w in lasts:
                if wordPrev == w:
                    yearOffset = -7
                    start -= 1
                    used = 2
            for w in suffix_nexts:
                if wordNext == w:
                    yearOffset = 7
                    start -= 1
                    used = 2
            for w in suffix_lasts:
                if wordNext == w:
                    yearOffset = -7
                    start -= 1
                    used = 2
        # parse Monday, Tuesday, etc., and next Monday,
        # last Tuesday, etc.
        elif word in days and not fromFlag:

            d = days.index(word)
            dayOffset = (d + 1) - int(today)
            used = 1
            if dayOffset < 0:
                dayOffset += 7
            for w in nexts:
                if wordPrev == w:
                    dayOffset += 7
                    used += 1
                    start -= 1
            for w in lasts:
                if wordPrev == w:
                    dayOffset -= 7
                    used += 1
                    start -= 1
            for w in suffix_nexts:
                if wordNext == w:
                    dayOffset += 7
                    used += 1
                    start -= 1
            for w in suffix_lasts:
                if wordNext == w:
                    dayOffset -= 7
                    used += 1
                    start -= 1
            if wordNext == "feira":
                used += 1
        # parse 15 of July, June 20th, Feb 18, 19 of February
        elif word in months or word in monthsShort:
            try:
                m = months.index(word)
            except ValueError:
                m = monthsShort.index(word)
            used += 1
            datestr = months[m]
            if wordPrev and wordPrev[0].isdigit():
                # 13 maio
                datestr += " " + wordPrev
                start -= 1
                used += 1
                if wordNext and wordNext[0].isdigit():
                    datestr += " " + wordNext
                    used += 1
                    hasYear = True
                else:
                    hasYear = False

            elif wordNext and wordNext[0].isdigit():
                # maio 13
                datestr += " " + wordNext
                used += 1
                if wordNextNext and wordNextNext[0].isdigit():
                    datestr += " " + wordNextNext
                    used += 1
                    hasYear = True
                else:
                    hasYear = False

            elif wordPrevPrev and wordPrevPrev[0].isdigit():
                # 13 dia maio
                datestr += " " + wordPrevPrev

                start -= 2
                used += 2
                if wordNext and word[0].isdigit():
                    datestr += " " + wordNext
                    used += 1
                    hasYear = True
                else:
                    hasYear = False

            elif wordNextNext and wordNextNext[0].isdigit():
                # maio dia 13
                datestr += " " + wordNextNext
                used += 2
                if wordNextNextNext and wordNextNextNext[0].isdigit():
                    datestr += " " + wordNextNextNext
                    used += 1
                    hasYear = True
                else:
                    hasYear = False

            if datestr in months:
                datestr = ""

        # parse 5 days from tomorrow, 10 weeks from next thursday,
        # 2 months from July
        validFollowups = days + months + monthsShort
        validFollowups.append("hoje")
        validFollowups.append("amanha")
        validFollowups.append("ontem")
        validFollowups.append("anteontem")
        validFollowups.append("agora")
        validFollowups.append("ja")
        validFollowups.append("ante")

        # TODO debug word "depois" that one is failing for some reason
        if word in froms and wordNext in validFollowups:

            if not (wordNext == "amanha" and wordNext == "ontem") and not (
                    word == "depois" or word == "antes" or word == "em"):
                used = 2
                fromFlag = True
            if wordNext == "amanha" and word != "depois":
                dayOffset += 1
            elif wordNext == "ontem":
                dayOffset -= 1
            elif wordNext == "anteontem":
                dayOffset -= 2
            elif wordNext == "ante" and wordNextNext == "ontem":
                dayOffset -= 2
            elif (wordNext == "ante" and wordNext == "ante" and
                  wordNextNextNext == "ontem"):
                dayOffset -= 3
            elif wordNext in days:
                d = days.index(wordNext)
                tmpOffset = (d + 1) - int(today)
                used = 2
                if wordNextNext == "feira":
                    used += 1
                if tmpOffset < 0:
                    tmpOffset += 7
                if wordNextNext:
                    if wordNextNext in nxts:
                        tmpOffset += 7
                        used += 1
                    elif wordNextNext in prevs:
                        tmpOffset -= 7
                        used += 1
                dayOffset += tmpOffset
            elif wordNextNext and wordNextNext in days:
                d = days.index(wordNextNext)
                tmpOffset = (d + 1) - int(today)
                used = 3
                if wordNextNextNext:
                    if wordNextNextNext in nxts:
                        tmpOffset += 7
                        used += 1
                    elif wordNextNextNext in prevs:
                        tmpOffset -= 7
                        used += 1
                dayOffset += tmpOffset
                if wordNextNextNext == "feira":
                    used += 1
        if wordNext in months:
            used -= 1
        if used > 0:

            if start - 1 > 0 and words[start - 1] in lists:
                start -= 1
                used += 1

            for i in range(0, used):
                words[i + start] = ""

            if (start - 1 >= 0 and words[start - 1] in lists):
                words[start - 1] = ""
            found = True
            daySpecified = True

    # parse time
    timeStr = ""
    hrOffset = 0
    minOffset = 0
    secOffset = 0
    hrAbs = 0
    minAbs = 0
    military = False

    for idx, word in enumerate(words):
        if word == "":
            continue

        wordPrevPrev = words[idx - 2] if idx > 1 else ""
        wordPrev = words[idx - 1] if idx > 0 else ""
        wordNext = words[idx + 1] if idx + 1 < len(words) else ""
        wordNextNext = words[idx + 2] if idx + 2 < len(words) else ""
        wordNextNextNext = words[idx + 3] if idx + 3 < len(words) else ""
        # parse noon, midnight, morning, afternoon, evening
        used = 0
        if word == "meio" and wordNext == "dia":
            hrAbs = 12
            used += 2
        elif word == "meia" and wordNext == "noite":
            hrAbs = 0
            used += 2
        elif word == "manha":
            if hrAbs == 0:
                hrAbs = 8
            used += 1
        elif word == "tarde":
            if hrAbs == 0:
                hrAbs = 15
            used += 1
        elif word == "meio" and wordNext == "tarde":
            if hrAbs == 0:
                hrAbs = 17
            used += 2
        elif word == "meio" and wordNext == "manha":
            if hrAbs == 0:
                hrAbs = 10
            used += 2
        elif word == "fim" and wordNext == "tarde":
            if hrAbs == 0:
                hrAbs = 19
            used += 2
        elif word == "fim" and wordNext == "manha":
            if hrAbs == 0:
                hrAbs = 11
            used += 2
        elif word == "tantas" and wordNext == "manha":
            if hrAbs == 0:
                hrAbs = 4
            used += 2
        elif word == "noite":
            if hrAbs == 0:
                hrAbs = 22
            used += 1
        # parse half an hour, quarter hour
        elif word == "hora" and \
                (wordPrev in time_indicators or wordPrevPrev in
                    time_indicators):
            if wordPrev == "meia":
                minOffset = 30
            elif wordPrev == "quarto":
                minOffset = 15
            elif wordPrevPrev == "quarto":
                minOffset = 15
                if idx > 2 and words[idx - 3] in time_indicators:
                    words[idx - 3] = ""
                words[idx - 2] = ""
            else:
                hrOffset = 1
            if wordPrevPrev in time_indicators:
                words[idx - 2] = ""
            words[idx - 1] = ""
            used += 1
            hrAbs = -1
            minAbs = -1
        # parse 5:00 am, 12:00 p.m., etc
        elif word[0].isdigit():
            isTime = True
            strHH = ""
            strMM = ""
            remainder = ""
            if ':' in word:
                # parse colons
                # "3:00 in the morning"
                stage = 0
                length = len(word)
                for i in range(length):
                    if stage == 0:
                        if word[i].isdigit():
                            strHH += word[i]
                        elif word[i] == ":":
                            stage = 1
                        else:
                            stage = 2
                            i -= 1
                    elif stage == 1:
                        if word[i].isdigit():
                            strMM += word[i]
                        else:
                            stage = 2
                            i -= 1
                    elif stage == 2:
                        remainder = word[i:].replace(".", "")
                        break
                if remainder == "":
                    nextWord = wordNext.replace(".", "")
                    if nextWord == "am" or nextWord == "pm":
                        remainder = nextWord
                        used += 1
                    elif wordNext == "manha":
                        remainder = "am"
                        used += 1
                    elif wordNext == "tarde":
                        remainder = "pm"
                        used += 1
                    elif wordNext == "noite":
                        if 0 < int(word[0]) < 6:
                            remainder = "am"
                        else:
                            remainder = "pm"
                        used += 1
                    elif wordNext in thises and wordNextNext == "manha":
                        remainder = "am"
                        used = 2
                    elif wordNext in thises and wordNextNext == "tarde":
                        remainder = "pm"
                        used = 2
                    elif wordNext in thises and wordNextNext == "noite":
                        remainder = "pm"
                        used = 2
                    else:
                        if timeQualifier != "":
                            military = True
                            if strHH <= 12 and \
                                    (timeQualifier == "manha" or
                                     timeQualifier == "tarde"):
                                strHH += 12

            else:
                # try to parse # s without colons
                # 5 hours, 10 minutes etc.
                length = len(word)
                strNum = ""
                remainder = ""
                for i in range(length):
                    if word[i].isdigit():
                        strNum += word[i]
                    else:
                        remainder += word[i]

                if remainder == "":
                    remainder = wordNext.replace(".", "").lstrip().rstrip()

                if (
                        remainder == "pm" or
                        wordNext == "pm" or
                        remainder == "p.m." or
                        wordNext == "p.m."):
                    strHH = strNum
                    remainder = "pm"
                    used = 1
                elif (
                        remainder == "am" or
                        wordNext == "am" or
                        remainder == "a.m." or
                        wordNext == "a.m."):
                    strHH = strNum
                    remainder = "am"
                    used = 1
                else:
                    if (wordNext == "pm" or
                            wordNext == "p.m." or
                            wordNext == "tarde"):
                        strHH = strNum
                        remainder = "pm"
                        used = 1
                    elif (wordNext == "am" or
                          wordNext == "a.m." or
                          wordNext == "manha"):
                        strHH = strNum
                        remainder = "am"
                        used = 1
                    elif (int(word) > 100 and
                            (
                                wordPrev == "o" or
                                wordPrev == "oh" or
                                wordPrev == "zero"
                            )):
                        # 0800 hours (pronounced oh-eight-hundred)
                        strHH = int(word) / 100
                        strMM = int(word) - strHH * 100
                        military = True
                        if wordNext == "hora":
                            used += 1
                    elif (
                            wordNext == "hora" and
                            word[0] != '0' and
                            (
                                int(word) < 100 and
                                int(word) > 2400
                            )):
                        # ignores military time
                        # "in 3 hours"
                        hrOffset = int(word)
                        used = 2
                        isTime = False
                        hrAbs = -1
                        minAbs = -1

                    elif wordNext == "minuto":
                        # "in 10 minutes"
                        minOffset = int(word)
                        used = 2
                        isTime = False
                        hrAbs = -1
                        minAbs = -1
                    elif wordNext == "segundo":
                        # in 5 seconds
                        secOffset = int(word)
                        used = 2
                        isTime = False
                        hrAbs = -1
                        minAbs = -1
                    elif int(word) > 100:
                        strHH = int(word) / 100
                        strMM = int(word) - strHH * 100
                        military = True
                        if wordNext == "hora":
                            used += 1

                    elif wordNext == "" or (
                            wordNext == "em" and wordNextNext == "ponto"):
                        strHH = word
                        strMM = 00
                        if wordNext == "em" and wordNextNext == "ponto":
                            used += 2
                            if wordNextNextNext == "tarde":
                                remainder = "pm"
                                used += 1
                            elif wordNextNextNext == "manha":
                                remainder = "am"
                                used += 1
                            elif wordNextNextNext == "noite":
                                if 0 > strHH > 6:
                                    remainder = "am"
                                else:
                                    remainder = "pm"
                                used += 1

                    elif wordNext[0].isdigit():
                        strHH = word
                        strMM = wordNext
                        military = True
                        used += 1
                        if wordNextNext == "hora":
                            used += 1
                    else:
                        isTime = False

            strHH = int(strHH) if strHH else 0
            strMM = int(strMM) if strMM else 0
            strHH = strHH + 12 if (remainder == "pm" and
                                   0 < strHH < 12) else strHH
            strHH = strHH - 12 if (remainder == "am" and
                                   0 < strHH >= 12) else strHH
            if strHH > 24 or strMM > 59:
                isTime = False
                used = 0
            if isTime:
                hrAbs = strHH * 1
                minAbs = strMM * 1
                used += 1

        if used > 0:
            # removed parsed words from the sentence
            for i in range(used):
                words[idx + i] = ""

            if wordPrev == "em" or wordPrev == "ponto":
                words[words.index(wordPrev)] = ""

            if idx > 0 and wordPrev in time_indicators:
                words[idx - 1] = ""
            if idx > 1 and wordPrevPrev in time_indicators:
                words[idx - 2] = ""

            idx += used - 1
            found = True

    # check that we found a date
    if not date_found:
        return None

    if dayOffset is False:
        dayOffset = 0

    # perform date manipulation

    extractedDate = dateNow
    extractedDate = extractedDate.replace(microsecond=0,
                                          second=0,
                                          minute=0,
                                          hour=0)
    if datestr != "":
        en_months = ['january', 'february', 'march', 'april', 'may', 'june',
                     'july', 'august', 'september', 'october', 'november',
                     'december']
        en_monthsShort = ['jan', 'feb', 'mar', 'apr', 'may', 'june', 'july',
                          'aug',
                          'sept', 'oct', 'nov', 'dec']
        for idx, en_month in enumerate(en_months):
            datestr = datestr.replace(months[idx], en_month)
        for idx, en_month in enumerate(en_monthsShort):
            datestr = datestr.replace(monthsShort[idx], en_month)

        temp = datetime.strptime(datestr, "%B %d")
        if not hasYear:
            temp = temp.replace(year=extractedDate.year)
            if extractedDate < temp:
                extractedDate = extractedDate.replace(year=int(currentYear),
                                                      month=int(
                                                          temp.strftime(
                                                              "%m")),
                                                      day=int(temp.strftime(
                                                          "%d")))
            else:
                extractedDate = extractedDate.replace(
                    year=int(currentYear) + 1,
                    month=int(temp.strftime("%m")),
                    day=int(temp.strftime("%d")))
        else:
            extractedDate = extractedDate.replace(
                year=int(temp.strftime("%Y")),
                month=int(temp.strftime("%m")),
                day=int(temp.strftime("%d")))

    if timeStr != "":
        temp = datetime(timeStr)
        extractedDate = extractedDate.replace(hour=temp.strftime("%H"),
                                              minute=temp.strftime("%M"),
                                              second=temp.strftime("%S"))

    if yearOffset != 0:
        extractedDate = extractedDate + relativedelta(years=yearOffset)
    if monthOffset != 0:
        extractedDate = extractedDate + relativedelta(months=monthOffset)
    if dayOffset != 0:
        extractedDate = extractedDate + relativedelta(days=dayOffset)
    if hrAbs != -1 and minAbs != -1:

        extractedDate = extractedDate + relativedelta(hours=hrAbs,
                                                      minutes=minAbs)
        if (hrAbs != 0 or minAbs != 0) and datestr == "":
            if not daySpecified and dateNow > extractedDate:
                extractedDate = extractedDate + relativedelta(days=1)
    if hrOffset != 0:
        extractedDate = extractedDate + relativedelta(hours=hrOffset)
    if minOffset != 0:
        extractedDate = extractedDate + relativedelta(minutes=minOffset)
    if secOffset != 0:
        extractedDate = extractedDate + relativedelta(seconds=secOffset)

    resultStr = " ".join(words)
    resultStr = ' '.join(resultStr.split())
    resultStr = pt_pruning(resultStr)
    return [extractedDate, resultStr]


def pt_pruning(text, symbols=True, accents=True, agressive=True):
    """Strip punctuation, accents and filler words from Portuguese text.

    Args:
        text (str): the sentence to clean up
        symbols (bool): remove punctuation and the ordinal indicators,
                        and turn "-" and "_" into spaces
        accents (bool): replace accented letters and the cedilla with
                        the plain ASCII letter
        agressive (bool): drop articles, prepositions and the other short
                          filler words listed below
    Returns:
        (str): the pruned text. Runs of whitespace are collapsed to a
               single space only when agressive is set.
    """
    # agressive pt word pruning
    filler_words = {"a", "o", "os", "as", "de", "dos", "das",
                    "lhe", "lhes", "me", "e", "no", "nas", "na", "nos",
                    "em", "para", "este",
                    "esta", "deste", "desta", "neste", "nesta", "nesse",
                    "nessa", "foi", "que"}

    # Non-ASCII characters are spelled as \u escapes so the literals do not
    # depend on the encoding the file is saved or decoded with.
    # NOTE(review): these were reconstructed from mis-decoded bytes; confirm
    # against the original iso-8859-15 file.
    if symbols:
        # \u00ba / \u00aa are the masculine / feminine ordinal indicators
        for symbol in (".", ",", ";", ":", "!", "?", u"\u00ba", u"\u00aa"):
            text = text.replace(symbol, "")
        text = text.replace("-", " ").replace("_", " ")
    if accents:
        plain_letters = {
            "a": [u"\u00e1", u"\u00e0", u"\u00e3", u"\u00e2"],
            "e": [u"\u00ea", u"\u00e8", u"\u00e9"],
            "i": [u"\u00ed", u"\u00ec"],
            "o": [u"\u00f2", u"\u00f3"],
            "u": [u"\u00fa", u"\u00f9"],
            "c": [u"\u00e7"]}
        for char in plain_letters:
            for acc in plain_letters[char]:
                text = text.replace(acc, char)
    if agressive:
        kept = [word for word in text.split(" ")
                if word not in filler_words]
        # split()/join collapses the gaps left by removed words
        text = ' '.join(" ".join(kept).split())
    return text


def get_gender_pt(word, raw_string=""):
    """Guess the grammatical gender of a Portuguese word.

    Trailing "s" characters are stripped from the word first. If the
    stripped word appears in raw_string after another token, the gender
    of that preceding token is used. Otherwise (or if that yields nothing)
    the last letter decides: "a" is feminine, "o" or "e" is masculine.

    Args:
        word (str): the word to classify
        raw_string (str): optional sentence, split on single spaces, that
                          is searched for the word
    Returns:
        "f", "m", or False when the gender could not be determined
    """
    word = word.rstrip("s")
    if not word:
        # Empty input, or nothing left after stripping "s": there is no
        # last letter to inspect, so report "unknown" instead of crashing.
        return False
    gender = False
    words = raw_string.split(" ")
    for idx, w in enumerate(words):
        if w == word and idx != 0:
            # the preceding token (e.g. an article) carries the gender
            gender = get_gender_pt(words[idx - 1])
            break
    if not gender:
        if word.endswith("a"):
            gender = "f"
        elif word.endswith(("o", "e")):
            gender = "m"
    return gender


####################################################################
# Spanish normalization
#
# TODO: numbers greater than 999999
####################################################################

# Indefinite articles ["un", "una", "unos", "unas"] cannot be suppressed:
# in Spanish, "un caballo" means either "a horse" or "one horse".
# Definite articles that normalization may drop.
es_articles = ["el", "la", "los", "las"]

# Spanish number words mapped to their integer value. Both the accented
# and unaccented spellings are listed. Accented letters are written as \u
# escapes so the keys do not depend on the encoding the file is decoded
# with.
# NOTE(review): the accented keys were reconstructed from mis-decoded
# bytes; confirm against the original iso-8859-15 file.
es_numbers_xlat = {
    "un": 1,
    "uno": 1,
    "una": 1,
    "dos": 2,
    "tres": 3,
    u"tr\u00e9s": 3,
    "cuatro": 4,
    "cinco": 5,
    "seis": 6,
    "siete": 7,
    "ocho": 8,
    "nueve": 9,
    "diez": 10,
    "once": 11,
    "doce": 12,
    "trece": 13,
    "catorce": 14,
    "quince": 15,
    "dieciseis": 16,
    u"diecis\u00e9is": 16,
    "diecisiete": 17,
    "dieciocho": 18,
    "diecinueve": 19,
    "veinte": 20,
    "veintiuno": 21,
    u"veintid\u00f3s": 22,
    u"veintitr\u00e9s": 23,
    "veintidos": 22,
    "veintitres": 23,
    "veinticuatro": 24,
    "veinticinco": 25,
    u"veintis\u00e9is": 26,
    "veintiseis": 26,
    "veintisiete": 27,
    "veintiocho": 28,
    "veintinueve": 29,
    "treinta": 30,
    "cuarenta": 40,
    "cincuenta": 50,
    "sesenta": 60,
    "setenta": 70,
    "ochenta": 80,
    "noventa": 90,
    "cien": 100,
    "ciento": 100,
    "doscientos": 200,
    "doscientas": 200,
    "trescientos": 300,
    "trescientas": 300,
    "cuatrocientos": 400,
    "cuatrocientas": 400,
    "quinientos": 500,
    "quinientas": 500,
    "seiscientos": 600,
    "seiscientas": 600,
    "setecientos": 700,
    "setecientas": 700,
    "ochocientos": 800,
    "ochocientas": 800,
    "novecientos": 900,
    "novecientas": 900}


def es_parse(words, i):
    """Parse a spelled-out Spanish number starting at words[i].

    Recognises "cero" and numbers built from the tokens in
    es_numbers_xlat, optionally joined with "y" (tens and units) and
    "mil" (thousands).

    Args:
        words (list): the tokens of the sentence
        i (int): index of the first token to examine
    Returns:
        (value, next_index) where next_index is the position just past the
        consumed tokens, or None if no number starts at words[i]
    """
    def es_cte(i, s):
        # match the literal token s at position i
        if i < len(words) and s == words[i]:
            return s, i + 1
        return None

    def es_number_word(i, mi, ma):
        # match one table token whose value lies within [mi, ma]
        if i < len(words):
            v = es_numbers_xlat.get(words[i])
            if v is not None and mi <= v <= ma:
                return v, i + 1
        return None

    def es_number_1_99(i):
        # 1-29 are single tokens
        r1 = es_number_word(i, 1, 29)
        if r1:
            return r1

        # [3-9]0 (y [1-9])?
        r1 = es_number_word(i, 30, 90)
        if r1:
            v1, i1 = r1
            r2 = es_cte(i1, "y")
            if r2:
                v2, i2 = r2
                r3 = es_number_word(i2, 1, 9)
                if r3:
                    v3, i3 = r3
                    return v1 + v3, i3
            # a dangling "y" is left unconsumed
            return r1
        return None

    def es_number_1_999(i):
        # [1-9]cientos [1-99]?
        r1 = es_number_word(i, 100, 900)
        if r1:
            v1, i1 = r1
            r2 = es_number_1_99(i1)
            if r2:
                v2, i2 = r2
                return v1 + v2, i2
            else:
                return r1

        # [1-99]
        r1 = es_number_1_99(i)
        if r1:
            return r1

        return None

    def es_number(i):
        # check for cero: matched as a literal token because the lookup
        # table has no zero entry
        r1 = es_cte(i, "cero")
        if r1:
            return 0, r1[1]

        # check for [1-999] (mil [0-999])?
        r1 = es_number_1_999(i)
        if r1:
            v1, i1 = r1
            r2 = es_cte(i1, "mil")
            if r2:
                v2, i2 = r2
                r3 = es_number_1_999(i2)
                if r3:
                    v3, i3 = r3
                    return v1 * 1000 + v3, i3
                else:
                    return v1 * 1000, i2
            else:
                return r1
        return None

    return es_number(i)


def normalize_es(text, remove_articles):
    """Spanish string normalization.

    Splits the text on whitespace, optionally drops the articles listed in
    es_articles, and replaces every number that es_parse recognises with
    its digits.

    Args:
        text (str): the sentence to normalize
        remove_articles (bool): whether to drop the articles
    Returns:
        (str): the normalized tokens joined by single spaces
    """
    tokens = text.split()  # splitting also discards extra whitespace
    total = len(tokens)

    pieces = []
    pos = 0
    while pos < total:
        token = tokens[pos]

        if remove_articles and token in es_articles:
            pos += 1
            continue

        # Convert numbers into digits; es_parse reports where to resume
        parsed = es_parse(tokens, pos)
        if parsed:
            value, pos = parsed
            pieces.append(str(value))
        else:
            pieces.append(token)
            pos += 1

    return " ".join(pieces)
-												normalize_es

											
										
										
											2017-05-03 09:37:00 +00:00
+								# -*- coding: iso-8859-15 -*-
-												Fixes issue #539

The utterance is now placed on the bus along with its language code.  If not specified, it uses "en-us".

Added a new mycroft.util.parse module.  It contains the normalize() function.  Normalization currently does three things:
  * Expands contractions ("they're" -> "they are", etc)
  * Optionally removes articles ("a", "an", "the").  Removing is the default.
  * Textual numbers become digits, up to 20.  E.g. "What is the weather in four days" becomes "What is weather in 4 days".

NOTE:  This is potentially a breaking change!  Remove "the", "a" and "an" from your .voc files!

Skill changes:
  * I cleaned up the .voc files for the default Skills.
  * Split the date_time keyword into an extra entity.  Now a "QueryKeyword.voc" exists, with "what|tell" instead of combining that into "what is time" in the TimeKeyword.voc.
  * Volume skill now accepts 1-11, e.g. "turn volume to 11"

											
										
										
											2017-02-23 12:40:46 +00:00
+								#
-												Change to Apache 2.0 license from GPLv3.0

This commit officially switches the mycroft-core repository from
GPLv3.0 licensing to Apache 2.0.  All dependencies on GPL'ed code
have been removed and we have contacted all previous contributors
with still-existing code in the repository to agree to this change.

Going forward, all contributors will sign a Contributor License
Agreement (CLA) by visiting https://mycroft.ai/cla, then they will
be included in the Mycroft Project's overall Contributor list,
found at: https://github.com/MycroftAI/contributors.  This cleanly
protects the project, the contributor and all who use the technology
to build upon.

Further discussion can be found at this blog post:
https://mycroft.ai/blog/right-license/

This commit also removes all __author__="" from the code.  These
lines are painful to maintain and the etiquette surrounding their
maintenance is unclear.  Do you remove a name from the list if the
last line of code they wrote gets replaced?  Etc.  Now all
contributors are publicly acknowledged in the aforementioned repo,
and actual authorship is maintained by Github in a much more
effective and elegant way!

Finally, a few references to "Mycroft AI" were changed to the correct
legal entity name "Mycroft AI Inc."

==== Fixed Issues ====
#403 Update License.md and file headers to Apache 2.0
#400 Update LICENSE.md

====  Documentation Notes ====
Deprecated the ScheduledSkill and ScheduledCRUDSkill classes.
These capabilities have been superseded by the more flexible MycroftSkill
class methods schedule_event(), schedule_repeating_event(), update_event(),
and cancel_event().

											
										
										
											2017-10-04 06:28:44 +00:00
+								# Copyright 2017 Mycroft AI Inc.
-												Fixes issue #539

The utterance is now placed on the bus along with its language code.  If not specified, it uses "en-us".

Added a new mycroft.util.parse module.  It contains the normalize() function.  Normalization currently does three things:
  * Expands contractions ("they're" -> "they are", etc)
  * Optionally removes articles ("a", "an", "the").  Removing is the default.
  * Textual numbers become digits, up to 20.  E.g. "What is the weather in four days" becomes "What is weather in 4 days".

NOTE:  This is potentially a breaking change!  Remove "the", "a" and "an" from your .voc files!

Skill changes:
  * I cleaned up the .voc files for the default Skills.
  * Split the date_time keyword into an extra entity.  Now a "QueryKeyword.voc" exists, with "what|tell" instead of combining that into "what is time" in the TimeKeyword.voc.
  * Volume skill now accepts 1-11, e.g. "turn volume to 11"

											
										
										
											2017-02-23 12:40:46 +00:00
+								#
-												Change to Apache 2.0 license from GPLv3.0

This commit officially switches the mycroft-core repository from
GPLv3.0 licensing to Apache 2.0.  All dependencies on GPL'ed code
have been removed and we have contacted all previous contributors
with still-existing code in the repository to agree to this change.

Going forward, all contributors will sign a Contributor License
Agreement (CLA) by visiting https://mycroft.ai/cla, then they will
be included in the Mycroft Project's overall Contributor list,
found at: https://github.com/MycroftAI/contributors.  This cleanly
protects the project, the contributor and all who use the technology
to build upon.

Further discussion can be found at this blog post:
https://mycroft.ai/blog/right-license/

This commit also removes all __author__="" from the code.  These
lines are painful to maintain and the etiquette surrounding their
maintenance is unclear.  Do you remove a name from the list if the
last line of code they wrote gets replaced?  Etc.  Now all
contributors are publicly acknowledged in the aforementioned repo,
and actual authorship is maintained by Github in a much more
effective and elegant way!

Finally, a few references to "Mycroft AI" were changed to the correct
legal entity name "Mycroft AI Inc."

==== Fixed Issues ====
#403 Update License.md and file headers to Apache 2.0
#400 Update LICENSE.md

====  Documentation Notes ====
Deprecated the ScheduledSkill and ScheduledCRUDSkill classes.
These capabilities have been superseded by the more flexible MycroftSkill
class methods schedule_event(), schedule_repeating_event(), update_event(),
and cancel_event().

											
										
										
											2017-10-04 06:28:44 +00:00
+								# Licensed under the Apache License, Version 2.0 (the "License");
 								# you may not use this file except in compliance with the License.
 								# You may obtain a copy of the License at
-												Fixes issue #539

The utterance is now placed on the bus along with its language code.  If not specified, it uses "en-us".

Added a new mycroft.util.parse module.  It contains the normalize() function.  Normalization currently does three things:
  * Expands contractions ("they're" -> "they are", etc)
  * Optionally removes articles ("a", "an", "the").  Removing is the default.
  * Textual numbers become digits, up to 20.  E.g. "What is the weather in four days" becomes "What is weather in 4 days".

NOTE:  This is potentially a breaking change!  Remove "the", "a" and "an" from your .voc files!

Skill changes:
  * I cleaned up the .voc files for the default Skills.
  * Split the date_time keyword into an extra entity.  Now a "QueryKeyword.voc" exists, with "what|tell" instead of combining that into "what is time" in the TimeKeyword.voc.
  * Volume skill now accepts 1-11, e.g. "turn volume to 11"

											
										
										
											2017-02-23 12:40:46 +00:00
+								#
-												Change to Apache 2.0 license from GPLv3.0

This commit officially switches the mycroft-core repository from
GPLv3.0 licensing to Apache 2.0.  All dependencies on GPL'ed code
have been removed and we have contacted all previous contributors
with still-existing code in the repository to agree to this change.

Going forward, all contributors will sign a Contributor License
Agreement (CLA) by visiting https://mycroft.ai/cla, then they will
be included in the Mycroft Project's overall Contributor list,
found at: https://github.com/MycroftAI/contributors.  This cleanly
protects the project, the contributor and all who use the technology
to build upon.

Further discussion can be found at this blog post:
https://mycroft.ai/blog/right-license/

This commit also removes all __author__="" from the code.  These
lines are painful to maintain and the etiquette surrounding their
maintenance is unclear.  Do you remove a name from the list if the
last line of code they wrote gets replaced?  Etc.  Now all
contributors are publicly acknowledged in the aforementioned repo,
and actual authorship is maintained by Github in a much more
effective and elegant way!

Finally, a few references to "Mycroft AI" were changed to the correct
legal entity name "Mycroft AI Inc."

==== Fixed Issues ====
#403 Update License.md and file headers to Apache 2.0
#400 Update LICENSE.md

====  Documentation Notes ====
Deprecated the ScheduledSkill and ScheduledCRUDSkill classes.
These capabilities have been superseded by the more flexible MycroftSkill
class methods schedule_event(), schedule_repeating_event(), update_event(),
and cancel_event().

											
										
										
											2017-10-04 06:28:44 +00:00
+								#    http://www.apache.org/licenses/LICENSE-2.0
-												Fixes issue #539

The utterance is now placed on the bus along with its language code.  If not specified, it uses "en-us".

Added a new mycroft.util.parse module.  It contains the normalize() function.  Normalization currently does three things:
  * Expands contractions ("they're" -> "they are", etc)
  * Optionally removes articles ("a", "an", "the").  Removing is the default.
  * Textual numbers become digits, up to 20.  E.g. "What is the weather in four days" becomes "What is weather in 4 days".

NOTE:  This is potentially a breaking change!  Remove "the", "a" and "an" from your .voc files!

Skill changes:
  * I cleaned up the .voc files for the default Skills.
  * Split the date_time keyword into an extra entity.  Now a "QueryKeyword.voc" exists, with "what|tell" instead of combining that into "what is time" in the TimeKeyword.voc.
  * Volume skill now accepts 1-11, e.g. "turn volume to 11"

											
										
										
											2017-02-23 12:40:46 +00:00
+								#
-												Change to Apache 2.0 license from GPLv3.0

This commit officially switches the mycroft-core repository from
GPLv3.0 licensing to Apache 2.0.  All dependencies on GPL'ed code
have been removed and we have contacted all previous contributors
with still-existing code in the repository to agree to this change.

Going forward, all contributors will sign a Contributor License
Agreement (CLA) by visiting https://mycroft.ai/cla, then they will
be included in the Mycroft Project's overall Contributor list,
found at: https://github.com/MycroftAI/contributors.  This cleanly
protects the project, the contributor and all who use the technology
to build upon.

Further discussion can be found at this blog post:
https://mycroft.ai/blog/right-license/

This commit also removes all __author__="" from the code.  These
lines are painful to maintain and the etiquette surrounding their
maintenance is unclear.  Do you remove a name from the list if the
last line of code they wrote gets replaced?  Etc.  Now all
contributors are publicly acknowledged in the aforementioned repo,
and actual authorship is maintained by Github in a much more
effective and elegant way!

Finally, a few references to "Mycroft AI" were changed to the correct
legal entity name "Mycroft AI Inc."

==== Fixed Issues ====
#403 Update License.md and file headers to Apache 2.0
#400 Update LICENSE.md

====  Documentation Notes ====
Deprecated the ScheduledSkill and ScheduledCRUDSkill classes.
These capabilities have been superseded by the more flexible MycroftSkill
class methods schedule_event(), schedule_repeating_event(), update_event(),
and cancel_event().

											
										
										
											2017-10-04 06:28:44 +00:00
+								# Unless required by applicable law or agreed to in writing, software
 								# distributed under the License is distributed on an "AS IS" BASIS,
 								# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 								# See the License for the specific language governing permissions and
 								# limitations under the License.
 								#
 								from datetime import datetime, timedelta
 								from dateutil.relativedelta import relativedelta
-												Add new parse function fuzzy_match (take 2)

A fuzzy_match() produces a basic ratio match of two strings.
The result is between 0.0 and 1.0, where 1.0 is a perfect
match.

											
										
										
											2017-12-15 11:54:14 +00:00
+								from difflib import SequenceMatcher
 								def fuzzy_match(x, against):
 								    """Score how closely two strings resemble each other.
 								    Args:
 								        x (str): the first string to compare
 								        against (str): the string to compare x with
 								    Returns:
 								        float: similarity ratio from difflib.SequenceMatcher --
 								               1.0 for identical strings, down to 0.0 when
 								               nothing matches.
 								    """
 								    matcher = SequenceMatcher(None, x, against)
 								    return matcher.ratio()
-												Adds the ExtractDateTime parse function from Christopher (plus relevant tests) (#866)

Adds the ExtractDateTime parse function from Christopher.  When imported from mycroft/util/parse.py, it'll take a sentence like "What's the weather like 5 weeks from next Wednesday?" and will extract a python datetime object for that date.

* Added requirements.txt change for importing dateutil

											
										
										
											2017-07-10 20:33:21 +00:00
-												Adding Extraction of Numbers

											
										
										
											2017-05-25 23:31:43 +00:00
+								def extractnumber(text, lang="en-us"):
 								    """Takes in a string and extracts a number.
 								    Args:
 								        text (str): the string to extract a number from
 								        lang (str): the code for the language text is in
 								    Returns:
 								        (str): The number extracted or the original text.
 								    """
 								    lang_lower = str(lang).lower()
 								    if lang_lower.startswith("en"):
 								        # return extractnumber_en(text, remove_articles)
 								        return extractnumber_en(text)
-												Pt-PT translation (#1049)

Portuguese translation helpers!  
											
										
										
											2017-10-26 23:18:00 +00:00
+								    elif lang_lower.startswith("pt"):
 								        return extractnumber_pt(text)
-												Adding Extraction of Numbers

											
										
										
											2017-05-25 23:31:43 +00:00
 								    # TODO: Normalization for other languages
 								    return text
-												Adds the ExtractDateTime parse function from Christopher (plus relevant tests) (#866)

Adds the ExtractDateTime parse function from Christopher.  When imported from mycroft/util/parse.py, it'll take a sentence like "What's the weather like 5 weeks from next Wednesday?" and will extract a python datetime object for that date.

* Added requirements.txt change for importing dateutil

											
										
										
											2017-07-10 20:33:21 +00:00
+								def extract_datetime(text, anchorDate=None, lang="en-us"):
 								    """
 								    Parsing function that extracts date and time information
 								    from sentences. Parses many of the common ways that humans
 								    express dates and times. Includes relative dates like "5 days from today".
 								    Vague terminology are given arbitrary values, like:
 								        - morning = 8 AM
 								        - afternoon = 3 PM
 								        - evening = 7 PM
 								    If a time isn't supplied, the function defaults to 12 AM
 								    Args:
 								        text (string): the sentence to extract the date and time from
 								        anchorDate (:obj:`datetime`, optional): the date to be used for
 								            relative dating (for example, what does "tomorrow" mean?).
 								            Defaults to the current date
 								            (acquired with datetime.datetime.now())
 								        lang (string): the language of the sentence(s)
 								    Returns:
 								        [:obj:`datetime`, :obj:`str`]: 'datetime' is the extracted date
 								            as a datetime object. Times are represented in 24 hour notation.
 								            'leftover_string' is the original phrase with all date and time
 								            related keywords stripped out. See examples for further
 								            clarification
 								            Returns 'None' if no date was extracted.
 								    Examples:
 								        >>> extract_datetime(
 								        ... "What is the weather like the day after tomorrow?",
 								        ... datetime(2017, 6, 30, 0, 0)
 								        ... )
 								        [datetime.datetime(2017, 7, 2, 0, 0), 'what is weather like']
 								        >>> extract_datetime(
 								        ... "Set up an appointment 2 weeks from Sunday at 5 pm",
 								        ... datetime(2016, 2, 19, 0, 0)
 								        ... )
 								        [datetime.datetime(2016, 3, 6, 17, 0), 'set up appointment']
 								    """
 								    lang_lower = str(lang).lower()
 								    if lang_lower.startswith("en"):
 								        return extract_datetime_en(text, anchorDate)
-												Pt-PT translation (#1049)

Portuguese translation helpers!  
											
										
										
											2017-10-26 23:18:00 +00:00
+								    elif lang_lower.startswith("pt"):
 								        return extract_datetime_pt(text, anchorDate)
-												Adds the ExtractDateTime parse function from Christopher (plus relevant tests) (#866)

Adds the ExtractDateTime parse function from Christopher.  When imported from mycroft/util/parse.py, it'll take a sentence like "What's the weather like 5 weeks from next Wednesday?" and will extract a python datetime object for that date.

* Added requirements.txt change for importing dateutil

											
										
										
											2017-07-10 20:33:21 +00:00
 								    return text
-												Adding Extraction of Numbers

											
										
										
											2017-05-25 23:31:43 +00:00
+								def is_numeric(input_str):
 								    """
 								    Takes in a string and tests to see if it is a number.
 								    Args:
 								        input_str (str): string to test if a number
 								    Returns:
 								        (bool): True if a number, else False
 								    """
 								    try:
 								        float(input_str)
 								        return True
 								    except ValueError:
 								        return False
 								def extractnumber_en(text):
 								    """
 								    This function prepares the given text for parsing by making
 								    numbers consistent, getting rid of contractions, etc.
 								    Args:
 								        text (str): the string to normalize
 								    Returns:
 								        (int) or (float): The value of extracted number
 								    """
 								    aWords = text.split()
 								    aWords = [word for word in aWords if word not in ["the", "a", "an"]]
 								    andPass = False
 								    valPreAnd = False
 								    val = False
 								    count = 0
 								    while count < len(aWords):
 								        word = aWords[count]
 								        if is_numeric(word):
 								            # if word.isdigit():            # doesn't work with decimals
 								            val = float(word)
 								        elif word == "first":
 								            val = 1
 								        elif word == "second":
 								            val = 2
-												Pt-PT translation (#1049)

Portuguese translation helpers!  
											
										
										
											2017-10-26 23:18:00 +00:00
+								        elif isFractional_en(word):
 								            val = isFractional_en(word)
-												Adding Extraction of Numbers

											
										
										
											2017-05-25 23:31:43 +00:00
+								        else:
 								            if word == "one":
 								                val = 1
 								            elif word == "two":
 								                val = 2
 								            elif word == "three":
 								                val = 3
 								            elif word == "four":
 								                val = 4
 								            elif word == "five":
 								                val = 5
 								            elif word == "six":
 								                val = 6
 								            elif word == "seven":
 								                val = 7
 								            elif word == "eight":
 								                val = 8
 								            elif word == "nine":
 								                val = 9
 								            elif word == "ten":
 								                val = 10
-												Fixed PEP8 for number extraction addition

											
										
										
											2017-05-30 19:30:41 +00:00
+								            if val:
-												Adding Extraction of Numbers

											
										
										
											2017-05-25 23:31:43 +00:00
+								                if count < (len(aWords) - 1):
-												Pt-PT translation (#1049)

Portuguese translation helpers!  
											
										
										
											2017-10-26 23:18:00 +00:00
+								                    wordNext = aWords[count + 1]
-												Adding Extraction of Numbers

											
										
										
											2017-05-25 23:31:43 +00:00
+								                else:
 								                    wordNext = ""
-												Pt-PT translation (#1049)

Portuguese translation helpers!  
											
										
										
											2017-10-26 23:18:00 +00:00
+								                valNext = isFractional_en(wordNext)
-												Adding Extraction of Numbers

											
										
										
											2017-05-25 23:31:43 +00:00
-												Fixed PEP8 for number extraction addition

											
										
										
											2017-05-30 19:30:41 +00:00
+								                if valNext:
-												Adding Extraction of Numbers

											
										
										
											2017-05-25 23:31:43 +00:00
+								                    val = val * valNext
-												Pt-PT translation (#1049)

Portuguese translation helpers!  
											
										
										
											2017-10-26 23:18:00 +00:00
+								                    aWords[count + 1] = ""
-												Adding Extraction of Numbers

											
										
										
											2017-05-25 23:31:43 +00:00
-												Fixed PEP8 for number extraction addition

											
										
										
											2017-05-30 19:30:41 +00:00
+								        # if val == False:
 								        if not val:
-												Adding Extraction of Numbers

											
										
										
											2017-05-25 23:31:43 +00:00
+								            # look for fractions like "2/3"
 								            aPieces = word.split('/')
-												Fixed PEP8 for number extraction addition

											
										
										
											2017-05-30 19:30:41 +00:00
+								            # if (len(aPieces) == 2 and is_numeric(aPieces[0])
 								            #   and is_numeric(aPieces[1])):
 								            if look_for_fractions(aPieces):
-												Adding Extraction of Numbers

											
										
										
											2017-05-25 23:31:43 +00:00
+								                val = float(aPieces[0]) / float(aPieces[1])
 								            elif andPass:
-												Fixed PEP8 for number extraction addition

											
										
										
											2017-05-30 19:30:41 +00:00
+								                # added to value, quit here
-												Adding Extraction of Numbers

											
										
										
											2017-05-25 23:31:43 +00:00
+								                val = valPreAnd
 								                break
 								            else:
-												Fixed PEP8 for number extraction addition

											
										
										
											2017-05-30 19:30:41 +00:00
+								                count += 1
-												Adding Extraction of Numbers

											
										
										
											2017-05-25 23:31:43 +00:00
+								                continue
 								        aWords[count] = ""
 								        if (andPass):
-												Pt-PT translation (#1049)

Portuguese translation helpers!  
											
										
										
											2017-10-26 23:18:00 +00:00
+								            aWords[count - 1] = ''  # remove "and"
-												Adding Extraction of Numbers

											
										
										
											2017-05-25 23:31:43 +00:00
+								            val += valPreAnd
-												Pt-PT translation (#1049)

Portuguese translation helpers!  
											
										
										
											2017-10-26 23:18:00 +00:00
+								        elif count + 1 < len(aWords) and aWords[count + 1] == 'and':
-												Adding Extraction of Numbers

											
										
										
											2017-05-25 23:31:43 +00:00
+								            andPass = True
 								            valPreAnd = val
 								            val = False
-												Fixed PEP8 for number extraction addition

											
										
										
											2017-05-30 19:30:41 +00:00
+								            count += 2
-												Adding Extraction of Numbers

											
										
										
											2017-05-25 23:31:43 +00:00
+								            continue
-												Pt-PT translation (#1049)

Portuguese translation helpers!  
											
										
										
											2017-10-26 23:18:00 +00:00
+								        elif count + 2 < len(aWords) and aWords[count + 2] == 'and':
-												Adding Extraction of Numbers

											
										
										
											2017-05-25 23:31:43 +00:00
+								            andPass = True
 								            valPreAnd = val
 								            val = False
-												Fixed PEP8 for number extraction addition

											
										
										
											2017-05-30 19:30:41 +00:00
+								            count += 3
-												Adding Extraction of Numbers

											
										
										
											2017-05-25 23:31:43 +00:00
+								            continue
 								        break
-												Fixed PEP8 for number extraction addition

											
										
										
											2017-05-30 19:30:41 +00:00
+								    # if val == False:
 								    if not val:
-												Adding Extraction of Numbers

											
										
										
											2017-05-25 23:31:43 +00:00
+								        return False
-												Fixed PEP8 for number extraction addition

											
										
										
											2017-05-30 19:30:41 +00:00
+								    # Return the $str with the number related words removed
 								    # (now empty strings, so strlen == 0)
-												Adding Extraction of Numbers

											
										
										
											2017-05-25 23:31:43 +00:00
+								    aWords = [word for word in aWords if len(word) > 0]
 								    text = ' '.join(aWords)
 								    return val
-												Adds the ExtractDateTime parse function from Christopher (plus relevant tests) (#866)

Adds the ExtractDateTime parse function from Christopher.  When imported from mycroft/util/parse.py, it'll take a sentence like "What's the weather like 5 weeks from next Wednesday?" and will extract a python datetime object for that date.

* Added requirements.txt change for importing dateutil

											
										
										
											2017-07-10 20:33:21 +00:00
+								def extract_datetime_en(str, currentDate=None):
 								    def clean_string(str):
 								        # cleans the input string of unneeded punctuation and capitalization
 								        # among other things
-												Pt-PT translation (#1049)

Portuguese translation helpers!  
											
										
										
											2017-10-26 23:18:00 +00:00
+								        str = str.lower().replace('?', '').replace('.', '').replace(',', '') \
-												Adds the ExtractDateTime parse function from Christopher (plus relevant tests) (#866)

Adds the ExtractDateTime parse function from Christopher.  When imported from mycroft/util/parse.py, it'll take a sentence like "What's the weather like 5 weeks from next Wednesday?" and will extract a python datetime object for that date.

* Added requirements.txt change for importing dateutil

											
										
										
											2017-07-10 20:33:21 +00:00
+								            .replace(' the ', ' ').replace(' a ', ' ').replace(' an ', ' ')
 								        wordList = str.split()
 								        for idx, word in enumerate(wordList):
 								            word = word.replace("'s", "")
 								            ordinals = ["rd", "st", "nd", "th"]
 								            if word[0].isdigit():
 								                for ord in ordinals:
 								                    if ord in word:
 								                        word = word.replace(ord, "")
 								            wordList[idx] = word
 								        return wordList
 								    def date_found():
 								        return found or \
-												Fix PEP8 and unittest errors from PR 1049

PR 1049 introduced several cosmetic PEP8 errors that were easily fixed.
Additionally there are unittests that include non-ASCII characters which are
failing.  As Pt-PT support is a work-in-progress, I just commented them out
with TODOs next to them.

											
										
										
											2017-10-27 22:39:05 +00:00
+								            (
 								                datestr != "" or timeStr != "" or
 								                yearOffset != 0 or monthOffset != 0 or
 								                dayOffset is True or hrOffset != 0 or
 								                hrAbs != 0 or minOffset != 0 or
 								                minAbs != 0 or secOffset != 0
 								            )
-												Adds the ExtractDateTime parse function from Christopher (plus relevant tests) (#866)

Adds the ExtractDateTime parse function from Christopher.  When imported from mycroft/util/parse.py, it'll take a sentence like "What's the weather like 5 weeks from next Wednesday?" and will extract a python datetime object for that date.

* Added requirements.txt change for importing dateutil

											
										
										
											2017-07-10 20:33:21 +00:00
 								    if str == "":
 								        return None
 								    if currentDate is None:
 								        currentDate = datetime.now()
 								    found = False
 								    daySpecified = False
 								    dayOffset = False
 								    monthOffset = 0
 								    yearOffset = 0
 								    dateNow = currentDate
 								    today = dateNow.strftime("%w")
 								    currentYear = dateNow.strftime("%Y")
 								    fromFlag = False
 								    datestr = ""
 								    hasYear = False
 								    timeQualifier = ""
 								    timeQualifiersList = ['morning', 'afternoon', 'evening']
 								    markers = ['at', 'in', 'on', 'by', 'this', 'around', 'for', 'of']
 								    days = ['monday', 'tuesday', 'wednesday',
 								            'thursday', 'friday', 'saturday', 'sunday']
 								    months = ['january', 'february', 'march', 'april', 'may', 'june',
-												Pt-PT translation (#1049)

Portuguese translation helpers!  
											
										
										
											2017-10-26 23:18:00 +00:00
+								              'july', 'august', 'september', 'october', 'november',
 								              'december']
-												Adds the ExtractDateTime parse function from Christopher (plus relevant tests) (#866)

Adds the ExtractDateTime parse function from Christopher.  When imported from mycroft/util/parse.py, it'll take a sentence like "What's the weather like 5 weeks from next Wednesday?" and will extract a python datetime object for that date.

* Added requirements.txt change for importing dateutil

											
										
										
											2017-07-10 20:33:21 +00:00
+								    monthsShort = ['jan', 'feb', 'mar', 'apr', 'may', 'june', 'july', 'aug',
 								                   'sept', 'oct', 'nov', 'dec']
 								    words = clean_string(str)
 								    for idx, word in enumerate(words):
 								        if word == "":
 								            continue
-												Pt-PT translation (#1049)

Portuguese translation helpers!  
											
										
										
											2017-10-26 23:18:00 +00:00
+								        wordPrevPrev = words[idx - 2] if idx > 1 else ""
 								        wordPrev = words[idx - 1] if idx > 0 else ""
 								        wordNext = words[idx + 1] if idx + 1 < len(words) else ""
 								        wordNextNext = words[idx + 2] if idx + 2 < len(words) else ""
-												Adds the ExtractDateTime parse function from Christopher (plus relevant tests) (#866)

Adds the ExtractDateTime parse function from Christopher.  When imported from mycroft/util/parse.py, it'll take a sentence like "What's the weather like 5 weeks from next Wednesday?" and will extract a python datetime object for that date.

* Added requirements.txt change for importing dateutil

											
										
										
											2017-07-10 20:33:21 +00:00
 								        # this isn't in clean string because I don't want to save back to words
 								        word = word.rstrip('s')
 								        start = idx
 								        used = 0
 								        # save timequalifier for later
 								        if word in timeQualifiersList:
 								            timeQualifier = word
 								            # parse today, tomorrow, day after tomorrow
 								        elif word == "today" and not fromFlag:
 								            dayOffset = 0
 								            used += 1
 								        elif word == "tomorrow" and not fromFlag:
 								            dayOffset = 1
 								            used += 1
-												Fix PEP8 and unittest errors from PR 1049

PR 1049 introduced several cosmetic PEP8 errors that were easily fixed.
Additionally there are unittests that include non-ASCII characters which are
failing.  As Pt-PT support is a work-in-progress, I just commented them out
with TODOs next to them.

											
										
										
											2017-10-27 22:39:05 +00:00
+								        elif (word == "day" and
 								                wordNext == "after" and
 								                wordNextNext == "tomorrow" and
 								                not fromFlag and
 								                not wordPrev[0].isdigit()):
-												Adds the ExtractDateTime parse function from Christopher (plus relevant tests) (#866)

Adds the ExtractDateTime parse function from Christopher.  When imported from mycroft/util/parse.py, it'll take a sentence like "What's the weather like 5 weeks from next Wednesday?" and will extract a python datetime object for that date.

* Added requirements.txt change for importing dateutil

											
										
										
											2017-07-10 20:33:21 +00:00
+								            dayOffset = 2
 								            used = 3
 								            if wordPrev == "the":
 								                start -= 1
 								                used += 1
-												Pt-PT translation (#1049)

Portuguese translation helpers!  
											
										
										
											2017-10-26 23:18:00 +00:00
+								                # parse 5 days, 10 weeks, last week, next week
-												Adds the ExtractDateTime parse function from Christopher (plus relevant tests) (#866)

Adds the ExtractDateTime parse function from Christopher.  When imported from mycroft/util/parse.py, it'll take a sentence like "What's the weather like 5 weeks from next Wednesday?" and will extract a python datetime object for that date.

* Added requirements.txt change for importing dateutil

											
										
										
											2017-07-10 20:33:21 +00:00
+								        elif word == "day":
 								            if wordPrev[0].isdigit():
 								                dayOffset += int(wordPrev)
 								                start -= 1
 								                used = 2
 								        elif word == "week" and not fromFlag:
 								            if wordPrev[0].isdigit():
 								                dayOffset += int(wordPrev) * 7
 								                start -= 1
 								                used = 2
 								            elif wordPrev == "next":
 								                dayOffset = 7
 								                start -= 1
 								                used = 2
 								            elif wordPrev == "last":
 								                dayOffset = -7
 								                start -= 1
 								                used = 2
-												Pt-PT translation (#1049)

Portuguese translation helpers!  
											
										
										
											2017-10-26 23:18:00 +00:00
+								                # parse 10 months, next month, last month
-												Adds the ExtractDateTime parse function from Christopher (plus relevant tests) (#866)

Adds the ExtractDateTime parse function from Christopher.  When imported from mycroft/util/parse.py, it'll take a sentence like "What's the weather like 5 weeks from next Wednesday?" and will extract a python datetime object for that date.

* Added requirements.txt change for importing dateutil

											
										
										
											2017-07-10 20:33:21 +00:00
+								        elif word == "month" and not fromFlag:
 								            if wordPrev[0].isdigit():
 								                monthOffset = int(wordPrev)
 								                start -= 1
 								                used = 2
 								            elif wordPrev == "next":
 								                monthOffset = 1
 								                start -= 1
 								                used = 2
 								            elif wordPrev == "last":
 								                monthOffset = -1
 								                start -= 1
 								                used = 2
-												Pt-PT translation (#1049)

Portuguese translation helpers!  
											
										
										
											2017-10-26 23:18:00 +00:00
+								                # parse 5 years, next year, last year
-												Adds the ExtractDateTime parse function from Christopher (plus relevant tests) (#866)

Adds the ExtractDateTime parse function from Christopher.  When imported from mycroft/util/parse.py, it'll take a sentence like "What's the weather like 5 weeks from next Wednesday?" and will extract a python datetime object for that date.

* Added requirements.txt change for importing dateutil

											
										
										
											2017-07-10 20:33:21 +00:00
+								        elif word == "year" and not fromFlag:
 								            if wordPrev[0].isdigit():
 								                yearOffset = int(wordPrev)
 								                start -= 1
 								                used = 2
 								            elif wordPrev == "next":
 								                yearOffset = 1
 								                start -= 1
 								                used = 2
 								            elif wordPrev == "last":
 								                yearOffset = -1
 								                start -= 1
 								                used = 2
-												Pt-PT translation (#1049)

Portuguese translation helpers!  
											
										
										
											2017-10-26 23:18:00 +00:00
+								                # parse Monday, Tuesday, etc., and next Monday,
 								                # last Tuesday, etc.
-												Adds the ExtractDateTime parse function from Christopher (plus relevant tests) (#866)

Adds the ExtractDateTime parse function from Christopher.  When imported from mycroft/util/parse.py, it'll take a sentence like "What's the weather like 5 weeks from next Wednesday?" and will extract a python datetime object for that date.

* Added requirements.txt change for importing dateutil

											
										
										
											2017-07-10 20:33:21 +00:00
+								        elif word in days and not fromFlag:
 								            d = days.index(word)
-												Pt-PT translation (#1049)

Portuguese translation helpers!  
											
										
										
											2017-10-26 23:18:00 +00:00
+								            dayOffset = (d + 1) - int(today)
-												Adds the ExtractDateTime parse function from Christopher (plus relevant tests) (#866)

Adds the ExtractDateTime parse function from Christopher.  When imported from mycroft/util/parse.py, it'll take a sentence like "What's the weather like 5 weeks from next Wednesday?" and will extract a python datetime object for that date.

* Added requirements.txt change for importing dateutil

											
										
										
											2017-07-10 20:33:21 +00:00
+								            used = 1
 								            if dayOffset < 0:
 								                dayOffset += 7
 								            if wordPrev == "next":
 								                dayOffset += 7
 								                used += 1
 								                start -= 1
 								            elif wordPrev == "last":
 								                dayOffset -= 7
 								                used += 1
 								                start -= 1
-												Pt-PT translation (#1049)

Portuguese translation helpers!  
											
										
										
											2017-10-26 23:18:00 +00:00
+								                # parse 15 of July, June 20th, Feb 18, 19 of February
-												Adds the ExtractDateTime parse function from Christopher (plus relevant tests) (#866)

Adds the ExtractDateTime parse function from Christopher.  When imported from mycroft/util/parse.py, it'll take a sentence like "What's the weather like 5 weeks from next Wednesday?" and will extract a python datetime object for that date.

* Added requirements.txt change for importing dateutil

											
										
										
											2017-07-10 20:33:21 +00:00
+								        elif word in months or word in monthsShort and not fromFlag:
 								            try:
 								                m = months.index(word)
 								            except ValueError:
 								                m = monthsShort.index(word)
 								            used += 1
 								            datestr = months[m]
-												Handle "tonight" and text containing only a date

- "tonight" is re-interpreted as PM
- a check verifies that the previous word exists before accessing it
to handle sentences containing only a simple date

											
										
										
											2017-12-12 07:35:13 +00:00
+								            if wordPrev and (wordPrev[0].isdigit() or
 								                             (wordPrev == "of" and wordPrevPrev[0].isdigit())):
-												Adds the ExtractDateTime parse function from Christopher (plus relevant tests) (#866)

Adds the ExtractDateTime parse function from Christopher.  When imported from mycroft/util/parse.py, it'll take a sentence like "What's the weather like 5 weeks from next Wednesday?" and will extract a python datetime object for that date.

* Added requirements.txt change for importing dateutil

											
										
										
											2017-07-10 20:33:21 +00:00
+								                if wordPrev == "of" and wordPrevPrev[0].isdigit():
-												Pt-PT translation (#1049)

Portuguese translation helpers!  
											
										
										
											2017-10-26 23:18:00 +00:00
+								                    datestr += " " + words[idx - 2]
-												Adds the ExtractDateTime parse function from Christopher (plus relevant tests) (#866)

Adds the ExtractDateTime parse function from Christopher.  When imported from mycroft/util/parse.py, it'll take a sentence like "What's the weather like 5 weeks from next Wednesday?" and will extract a python datetime object for that date.

* Added requirements.txt change for importing dateutil

											
										
										
											2017-07-10 20:33:21 +00:00
+								                    used += 1
 								                    start -= 1
 								                else:
 								                    datestr += " " + wordPrev
 								                start -= 1
 								                used += 1
 								                if wordNext and wordNext[0].isdigit():
 								                    datestr += " " + wordNext
 								                    used += 1
 								                    hasYear = True
 								                else:
 								                    hasYear = False
 								            elif wordNext and wordNext[0].isdigit():
 								                datestr += " " + wordNext
 								                used += 1
 								                if wordNextNext and wordNextNext[0].isdigit():
 								                    datestr += " " + wordNextNext
 								                    used += 1
 								                    hasYear = True
 								                else:
 								                    hasYear = False
 								        # parse 5 days from tomorrow, 10 weeks from next thursday,
 								        # 2 months from July
 								        validFollowups = days + months + monthsShort
 								        validFollowups.append("today")
 								        validFollowups.append("tomorrow")
 								        validFollowups.append("next")
 								        validFollowups.append("last")
 								        validFollowups.append("now")
 								        if (word == "from" or word == "after") and wordNext in validFollowups:
 								            used = 2
 								            fromFlag = True
 								            if wordNext == "tomorrow":
 								                dayOffset += 1
 								            elif wordNext in days:
 								                d = days.index(wordNext)
-												Pt-PT translation (#1049)

Portuguese translation helpers!  
											
										
										
											2017-10-26 23:18:00 +00:00
+								                tmpOffset = (d + 1) - int(today)
-												Adds the ExtractDateTime parse function from Christopher (plus relevant tests) (#866)

Adds the ExtractDateTime parse function from Christopher.  When imported from mycroft/util/parse.py, it'll take a sentence like "What's the weather like 5 weeks from next Wednesday?" and will extract a python datetime object for that date.

* Added requirements.txt change for importing dateutil

											
										
										
											2017-07-10 20:33:21 +00:00
+								                used = 2
 								                if tmpOffset < 0:
 								                    tmpOffset += 7
 								                dayOffset += tmpOffset
 								            elif wordNextNext and wordNextNext in days:
 								                d = days.index(wordNextNext)
-												Pt-PT translation (#1049)

Portuguese translation helpers!  
											
										
										
											2017-10-26 23:18:00 +00:00
+								                tmpOffset = (d + 1) - int(today)
-												Adds the ExtractDateTime parse function from Christopher (plus relevant tests) (#866)

Adds the ExtractDateTime parse function from Christopher.  When imported from mycroft/util/parse.py, it'll take a sentence like "What's the weather like 5 weeks from next Wednesday?" and will extract a python datetime object for that date.

* Added requirements.txt change for importing dateutil

											
										
										
											2017-07-10 20:33:21 +00:00
+								                used = 3
 								                if wordNext == "next":
 								                    tmpOffset += 7
 								                    used += 1
 								                    start -= 1
 								                elif wordNext == "last":
 								                    tmpOffset -= 7
 								                    used += 1
 								                    start -= 1
 								                dayOffset += tmpOffset
 								        if used > 0:
-												Pt-PT translation (#1049)

Portuguese translation helpers!  
											
										
										
											2017-10-26 23:18:00 +00:00
+								            if start - 1 > 0 and words[start - 1] == "this":
-												Adds the ExtractDateTime parse function from Christopher (plus relevant tests) (#866)

Adds the ExtractDateTime parse function from Christopher.  When imported from mycroft/util/parse.py, it'll take a sentence like "What's the weather like 5 weeks from next Wednesday?" and will extract a python datetime object for that date.

* Added requirements.txt change for importing dateutil

											
										
										
											2017-07-10 20:33:21 +00:00
+								                start -= 1
 								                used += 1
 								            for i in range(0, used):
-												Pt-PT translation (#1049)

Portuguese translation helpers!  
											
										
										
											2017-10-26 23:18:00 +00:00
+								                words[i + start] = ""
-												Adds the ExtractDateTime parse function from Christopher (plus relevant tests) (#866)

Adds the ExtractDateTime parse function from Christopher.  When imported from mycroft/util/parse.py, it'll take a sentence like "What's the weather like 5 weeks from next Wednesday?" and will extract a python datetime object for that date.

* Added requirements.txt change for importing dateutil

											
										
										
											2017-07-10 20:33:21 +00:00
-												Pt-PT translation (#1049)

Portuguese translation helpers!  
											
										
										
											2017-10-26 23:18:00 +00:00
+								            if (start - 1 >= 0 and words[start - 1] in markers):
 								                words[start - 1] = ""
-												Adds the ExtractDateTime parse function from Christopher (plus relevant tests) (#866)

Adds the ExtractDateTime parse function from Christopher.  When imported from mycroft/util/parse.py, it'll take a sentence like "What's the weather like 5 weeks from next Wednesday?" and will extract a python datetime object for that date.

* Added requirements.txt change for importing dateutil

											
										
										
											2017-07-10 20:33:21 +00:00
+								            found = True
 								            daySpecified = True
 								    # parse time
 								    timeStr = ""
 								    hrOffset = 0
 								    minOffset = 0
 								    secOffset = 0
 								    hrAbs = 0
 								    minAbs = 0
 								    military = False
 								    for idx, word in enumerate(words):
 								        if word == "":
 								            continue
-												Pt-PT translation (#1049)

Portuguese translation helpers!  
											
										
										
											2017-10-26 23:18:00 +00:00
+								        wordPrevPrev = words[idx - 2] if idx > 1 else ""
 								        wordPrev = words[idx - 1] if idx > 0 else ""
 								        wordNext = words[idx + 1] if idx + 1 < len(words) else ""
 								        wordNextNext = words[idx + 2] if idx + 2 < len(words) else ""
-												Adds the ExtractDateTime parse function from Christopher (plus relevant tests) (#866)

Adds the ExtractDateTime parse function from Christopher.  When imported from mycroft/util/parse.py, it'll take a sentence like "What's the weather like 5 weeks from next Wednesday?" and will extract a python datetime object for that date.

* Added requirements.txt change for importing dateutil

											
										
										
											2017-07-10 20:33:21 +00:00
+								        # parse noon, midnight, morning, afternoon, evening
 								        used = 0
 								        if word == "noon":
 								            hrAbs = 12
 								            used += 1
 								        elif word == "midnight":
 								            hrAbs = 0
 								            used += 1
 								        elif word == "morning":
 								            if hrAbs == 0:
 								                hrAbs = 8
 								            used += 1
 								        elif word == "afternoon":
 								            if hrAbs == 0:
 								                hrAbs = 15
 								            used += 1
 								        elif word == "evening":
 								            if hrAbs == 0:
 								                hrAbs = 19
 								            used += 1
 								            # parse half an hour, quarter hour
 								        elif word == "hour" and \
 								                (wordPrev in markers or wordPrevPrev in markers):
 								            if wordPrev == "half":
 								                minOffset = 30
 								            elif wordPrev == "quarter":
 								                minOffset = 15
 								            elif wordPrevPrev == "quarter":
 								                minOffset = 15
-												Pt-PT translation (#1049)

Portuguese translation helpers!  
											
										
										
											2017-10-26 23:18:00 +00:00
+								                if idx > 2 and words[idx - 3] in markers:
 								                    words[idx - 3] = ""
 								                words[idx - 2] = ""
-												Adds the ExtractDateTime parse function from Christopher (plus relevant tests) (#866)

Adds the ExtractDateTime parse function from Christopher.  When imported from mycroft/util/parse.py, it'll take a sentence like "What's the weather like 5 weeks from next Wednesday?" and will extract a python datetime object for that date.

* Added requirements.txt change for importing dateutil

											
										
										
											2017-07-10 20:33:21 +00:00
+								            else:
 								                hrOffset = 1
 								            if wordPrevPrev in markers:
-												Pt-PT translation (#1049)

Portuguese translation helpers!  
											
										
										
											2017-10-26 23:18:00 +00:00
+								                words[idx - 2] = ""
 								            words[idx - 1] = ""
-												Adds the ExtractDateTime parse function from Christopher (plus relevant tests) (#866)

Adds the ExtractDateTime parse function from Christopher.  When imported from mycroft/util/parse.py, it'll take a sentence like "What's the weather like 5 weeks from next Wednesday?" and will extract a python datetime object for that date.

* Added requirements.txt change for importing dateutil

											
										
										
											2017-07-10 20:33:21 +00:00
+								            used += 1
 								            hrAbs = -1
 								            minAbs = -1
 								            # parse 5:00 am, 12:00 p.m., etc
 								        elif word[0].isdigit():
 								            isTime = True
 								            strHH = ""
 								            strMM = ""
 								            remainder = ""
 								            if ':' in word:
 								                # parse colons
 								                # "3:00 in the morning"
 								                stage = 0
 								                length = len(word)
 								                for i in range(length):
 								                    if stage == 0:
 								                        if word[i].isdigit():
 								                            strHH += word[i]
 								                        elif word[i] == ":":
 								                            stage = 1
 								                        else:
 								                            stage = 2
 								                            i -= 1
 								                    elif stage == 1:
 								                        if word[i].isdigit():
 								                            strMM += word[i]
 								                        else:
 								                            stage = 2
 								                            i -= 1
 								                    elif stage == 2:
 								                        remainder = word[i:].replace(".", "")
 								                        break
 								                if remainder == "":
 								                    nextWord = wordNext.replace(".", "")
 								                    if nextWord == "am" or nextWord == "pm":
 								                        remainder = nextWord
 								                        used += 1
-												Handle "tonight" and text containing only a date

- "tonight" is re-interpreted as PM
- a check is performed to verify that the previous word exists before accessing it,
to handle sentences containing only a simple date

											
										
										
											2017-12-12 07:35:13 +00:00
+								                    elif nextWord == "tonight":
 								                        remainder = "pm"
 								                        used += 1
-												Adds the ExtractDateTime parse function from Christopher (plus relevant tests) (#866)

Adds the ExtractDateTime parse function from Christopher.  When imported from mycroft/util/parse.py, it'll take a sentence like "What's the weather like 5 weeks from next Wednesday?" and will extract a python datetime object for that date.

* Added requirements.txt change for importing dateutil

											
										
										
											2017-07-10 20:33:21 +00:00
+								                    elif wordNext == "in" and wordNextNext == "the" and \
-												Fix PEP8 and unittest errors from PR 1049

PR 1049 introduced several cosmetic PEP8 errors that were easily fixed.
Additionally there are unittests that include non-ASCII characters which are
failing.  As Pt-PT support is a work-in-progress, I just commented them out
with TODOs next to them.

											
										
										
											2017-10-27 22:39:05 +00:00
+								                            words[idx + 3] == "morning":
-												Adds the ExtractDateTime parse function from Christopher (plus relevant tests) (#866)

Adds the ExtractDateTime parse function from Christopher.  When imported from mycroft/util/parse.py, it'll take a sentence like "What's the weather like 5 weeks from next Wednesday?" and will extract a python datetime object for that date.

* Added requirements.txt change for importing dateutil

											
										
										
											2017-07-10 20:33:21 +00:00
+								                        reaminder = "am"
 								                        used += 3
 								                    elif wordNext == "in" and wordNextNext == "the" and \
-												Fix PEP8 and unittest errors from PR 1049

PR 1049 introduced several cosmetic PEP8 errors that were easily fixed.
Additionally there are unittests that include non-ASCII characters which are
failing.  As Pt-PT support is a work-in-progress, I just commented them out
with TODOs next to them.

											
										
										
											2017-10-27 22:39:05 +00:00
+								                            words[idx + 3] == "afternoon":
-												Adds the ExtractDateTime parse function from Christopher (plus relevant tests) (#866)

Adds the ExtractDateTime parse function from Christopher.  When imported from mycroft/util/parse.py, it'll take a sentence like "What's the weather like 5 weeks from next Wednesday?" and will extract a python datetime object for that date.

* Added requirements.txt change for importing dateutil

											
										
										
											2017-07-10 20:33:21 +00:00
+								                        remainder = "pm"
 								                        used += 3
 								                    elif wordNext == "in" and wordNextNext == "the" and \
-												Fix PEP8 and unittest errors from PR 1049

PR 1049 introduced several cosmetic PEP8 errors that were easily fixed.
Additionally there are unittests that include non-ASCII characters which are
failing.  As Pt-PT support is a work-in-progress, I just commented them out
with TODOs next to them.

											
										
										
											2017-10-27 22:39:05 +00:00
+								                            words[idx + 3] == "evening":
-												Adds the ExtractDateTime parse function from Christopher (plus relevant tests) (#866)

Adds the ExtractDateTime parse function from Christopher.  When imported from mycroft/util/parse.py, it'll take a sentence like "What's the weather like 5 weeks from next Wednesday?" and will extract a python datetime object for that date.

* Added requirements.txt change for importing dateutil

											
										
										
											2017-07-10 20:33:21 +00:00
+								                        remainder = "pm"
 								                        used += 3
 								                    elif wordNext == "in" and wordNextNext == "morning":
 								                        remainder = "am"
 								                        used += 2
 								                    elif wordNext == "in" and wordNextNext == "afternoon":
 								                        remainder = "pm"
 								                        used += 2
 								                    elif wordNext == "in" and wordNextNext == "evening":
 								                        remainder = "pm"
 								                        used += 2
 								                    elif wordNext == "this" and wordNextNext == "morning":
 								                        remainder = "am"
 								                        used = 2
 								                    elif wordNext == "this" and wordNextNext == "afternoon":
 								                        remainder = "pm"
 								                        used = 2
 								                    elif wordNext == "this" and wordNextNext == "evening":
 								                        remainder = "pm"
 								                        used = 2
 								                    elif wordNext == "at" and wordNextNext == "night":
 								                        if strHH > 5:
 								                            remainder = "pm"
 								                        else:
 								                            remainder = "am"
 								                        used += 2
 								                    else:
 								                        if timeQualifier != "":
 								                            military = True
 								                            if strHH <= 12 and \
 								                                    (timeQualifier == "evening" or
-												Fix PEP8 and unittest errors from PR 1049

PR 1049 introduced several cosmetic PEP8 errors that were easily fixed.
Additionally there are unittests that include non-ASCII characters which are
failing.  As Pt-PT support is a work-in-progress, I just commented them out
with TODOs next to them.

											
										
										
											2017-10-27 22:39:05 +00:00
+								                                     timeQualifier == "afternoon"):
-												Adds the ExtractDateTime parse function from Christopher (plus relevant tests) (#866)

Adds the ExtractDateTime parse function from Christopher.  When imported from mycroft/util/parse.py, it'll take a sentence like "What's the weather like 5 weeks from next Wednesday?" and will extract a python datetime object for that date.

* Added requirements.txt change for importing dateutil

											
										
										
											2017-07-10 20:33:21 +00:00
+								                                strHH += 12
 								            else:
 								                # try to parse # s without colons
 								                # 5 hours, 10 minutes etc.
 								                length = len(word)
 								                strNum = ""
 								                remainder = ""
 								                for i in range(length):
 								                    if word[i].isdigit():
 								                        strNum += word[i]
 								                    else:
 								                        remainder += word[i]
 								                if remainder == "":
 								                    remainder = wordNext.replace(".", "").lstrip().rstrip()
 								                if (
-												Fix PEP8 and unittest errors from PR 1049

PR 1049 introduced several cosmetic PEP8 errors that were easily fixed.
Additionally there are unittests that include non-ASCII characters which are
failing.  As Pt-PT support is a work-in-progress, I just commented them out
with TODOs next to them.

											
										
										
											2017-10-27 22:39:05 +00:00
+								                        remainder == "pm" or
 								                        wordNext == "pm" or
 								                        remainder == "p.m." or
 								                        wordNext == "p.m."):
-												Adds the ExtractDateTime parse function from Christopher (plus relevant tests) (#866)

Adds the ExtractDateTime parse function from Christopher.  When imported from mycroft/util/parse.py, it'll take a sentence like "What's the weather like 5 weeks from next Wednesday?" and will extract a python datetime object for that date.

* Added requirements.txt change for importing dateutil

											
										
										
											2017-07-10 20:33:21 +00:00
+								                    strHH = strNum
 								                    remainder = "pm"
 								                    used = 1
-												Pt-PT translation (#1049)

Portuguese translation helpers!  
											
										
										
											2017-10-26 23:18:00 +00:00
+								                elif (
-												Fix PEP8 and unittest errors from PR 1049

PR 1049 introduced several cosmetic PEP8 errors that were easily fixed.
Additionally there are unittests that include non-ASCII characters which are
failing.  As Pt-PT support is a work-in-progress, I just commented them out
with TODOs next to them.

											
										
										
											2017-10-27 22:39:05 +00:00
+								                        remainder == "am" or
 								                        wordNext == "am" or
 								                        remainder == "a.m." or
 								                        wordNext == "a.m."):
-												Adds the ExtractDateTime parse function from Christopher (plus relevant tests) (#866)

Adds the ExtractDateTime parse function from Christopher.  When imported from mycroft/util/parse.py, it'll take a sentence like "What's the weather like 5 weeks from next Wednesday?" and will extract a python datetime object for that date.

* Added requirements.txt change for importing dateutil

											
										
										
											2017-07-10 20:33:21 +00:00
+								                    strHH = strNum
 								                    remainder = "am"
 								                    used = 1
 								                else:
 								                    if wordNext == "pm" or wordNext == "p.m.":
 								                        strHH = strNum
 								                        reaminder = "pm"
 								                        used = 1
 								                    elif wordNext == "am" or wordNext == "a.m.":
 								                        strHH = strNum
 								                        remainder = "am"
 								                        used = 1
-												Pt-PT translation (#1049)

Portuguese translation helpers!  
											
										
										
											2017-10-26 23:18:00 +00:00
+								                    elif (
-												Fix PEP8 and unittest errors from PR 1049

PR 1049 introduced several cosmetic PEP8 errors that were easily fixed.
Additionally there are unittests that include non-ASCII characters which are
failing.  As Pt-PT support is a work-in-progress, I just commented them out
with TODOs next to them.

											
										
										
											2017-10-27 22:39:05 +00:00
+								                            int(word) > 100 and
 								                            (
 								                                wordPrev == "o" or
 								                                wordPrev == "oh"
 								                            )):
-												Adds the ExtractDateTime parse function from Christopher (plus relevant tests) (#866)

Adds the ExtractDateTime parse function from Christopher.  When imported from mycroft/util/parse.py, it'll take a sentence like "What's the weather like 5 weeks from next Wednesday?" and will extract a python datetime object for that date.

* Added requirements.txt change for importing dateutil

											
										
										
											2017-07-10 20:33:21 +00:00
+								                        # 0800 hours (pronounced oh-eight-hundred)
-												Pt-PT translation (#1049)

Portuguese translation helpers!  
											
										
										
											2017-10-26 23:18:00 +00:00
+								                        strHH = int(word) / 100
 								                        strMM = int(word) - strHH * 100
-												Adds the ExtractDateTime parse function from Christopher (plus relevant tests) (#866)

Adds the ExtractDateTime parse function from Christopher.  When imported from mycroft/util/parse.py, it'll take a sentence like "What's the weather like 5 weeks from next Wednesday?" and will extract a python datetime object for that date.

* Added requirements.txt change for importing dateutil

											
										
										
											2017-07-10 20:33:21 +00:00
+								                        military = True
 								                        if wordNext == "hours":
 								                            used += 1
-												Pt-PT translation (#1049)

Portuguese translation helpers!  
											
										
										
											2017-10-26 23:18:00 +00:00
+								                    elif (
-												Fix PEP8 and unittest errors from PR 1049

PR 1049 introduced several cosmetic PEP8 errors that were easily fixed.
Additionally there are unittests that include non-ASCII characters which are
failing.  As Pt-PT support is a work-in-progress, I just commented them out
with TODOs next to them.

											
										
										
											2017-10-27 22:39:05 +00:00
+								                            wordNext == "hours" and
 								                            word[0] != '0' and
 								                            (
 								                                int(word) < 100 and
 								                                int(word) > 2400
 								                            )):
-												Adds the ExtractDateTime parse function from Christopher (plus relevant tests) (#866)

Adds the ExtractDateTime parse function from Christopher.  When imported from mycroft/util/parse.py, it'll take a sentence like "What's the weather like 5 weeks from next Wednesday?" and will extract a python datetime object for that date.

* Added requirements.txt change for importing dateutil

											
										
										
											2017-07-10 20:33:21 +00:00
+								                        # ignores military time
 								                        # "in 3 hours"
 								                        hrOffset = int(word)
 								                        used = 2
 								                        isTime = False
 								                        hrAbs = -1
 								                        minAbs = -1
 								                    elif wordNext == "minutes":
 								                        # "in 10 minutes"
 								                        minOffset = int(word)
 								                        used = 2
 								                        isTime = False
 								                        hrAbs = -1
 								                        minAbs = -1
 								                    elif wordNext == "seconds":
 								                        # in 5 seconds
 								                        secOffset = int(word)
 								                        used = 2
 								                        isTime = False
 								                        hrAbs = -1
 								                        minAbs = -1
 								                    elif int(word) > 100:
-												Pt-PT translation (#1049)

Portuguese translation helpers!  
											
										
										
											2017-10-26 23:18:00 +00:00
+								                        strHH = int(word) / 100
 								                        strMM = int(word) - strHH * 100
-												Adds the ExtractDateTime parse function from Christopher (plus relevant tests) (#866)

Adds the ExtractDateTime parse function from Christopher.  When imported from mycroft/util/parse.py, it'll take a sentence like "What's the weather like 5 weeks from next Wednesday?" and will extract a python datetime object for that date.

* Added requirements.txt change for importing dateutil

											
										
										
											2017-07-10 20:33:21 +00:00
+								                        military = True
 								                        if wordNext == "hours":
 								                            used += 1
 								                    elif wordNext[0].isdigit():
 								                        strHH = word
 								                        strMM = wordNext
 								                        military = True
 								                        used += 1
 								                        if wordNextNext == "hours":
 								                            used += 1
-												Pt-PT translation (#1049)

Portuguese translation helpers!  
											
										
										
											2017-10-26 23:18:00 +00:00
+								                    elif (
-												Fix PEP8 and unittest errors from PR 1049

PR 1049 introduced several cosmetic PEP8 errors that were easily fixed.
Additionally there are unittests that include non-ASCII characters which are
failing.  As Pt-PT support is a work-in-progress, I just commented them out
with TODOs next to them.

											
										
										
											2017-10-27 22:39:05 +00:00
+								                            wordNext == "" or wordNext == "o'clock" or
 								                            (
 								                                        wordNext == "in" and
 								                                        (
 								                                            wordNextNext == "the" or
 								                                            wordNextNext == timeQualifier
 								                                        )
 								                            )):
-												Adds the ExtractDateTime parse function from Christopher (plus relevant tests) (#866)

Adds the ExtractDateTime parse function from Christopher.  When imported from mycroft/util/parse.py, it'll take a sentence like "What's the weather like 5 weeks from next Wednesday?" and will extract a python datetime object for that date.

* Added requirements.txt change for importing dateutil

											
										
										
											2017-07-10 20:33:21 +00:00
+								                        strHH = word
 								                        strMM = 00
 								                        if wordNext == "o'clock":
 								                            used += 1
 								                        if wordNext == "in" or wordNextNext == "in":
 								                            used += (1 if wordNext == "in" else 2)
-												Fix PEP8 and unittest errors from PR 1049

PR 1049 introduced several cosmetic PEP8 errors that were easily fixed.
Additionally there are unittests that include non-ASCII characters which are
failing.  As Pt-PT support is a work-in-progress, I just commented them out
with TODOs next to them.

											
										
										
											2017-10-27 22:39:05 +00:00
+								                            if (wordNextNext and
 								                                wordNextNext in timeQualifier or
 								                                (words[words.index(wordNextNext) + 1] and
 								                                 words[words.index(wordNextNext) + 1] in
 								                                 timeQualifier)):
 								                                if (wordNextNext == "afternoon" or
 								                                    (len(words) >
 								                                     words.index(wordNextNext) + 1 and
 								                                     words[words.index(
 								                                         wordNextNext) + 1] == "afternoon")):
-												Adds the ExtractDateTime parse function from Christopher (plus relevant tests) (#866)

Adds the ExtractDateTime parse function from Christopher.  When imported from mycroft/util/parse.py, it'll take a sentence like "What's the weather like 5 weeks from next Wednesday?" and will extract a python datetime object for that date.

* Added requirements.txt change for importing dateutil

											
										
										
											2017-07-10 20:33:21 +00:00
+								                                    remainder = "pm"
-												Fix PEP8 and unittest errors from PR 1049

PR 1049 introduced several cosmetic PEP8 errors that were easily fixed.
Additionally there are unittests that include non-ASCII characters which are
failing.  As Pt-PT support is a work-in-progress, I just commented them out
with TODOs next to them.

											
										
										
											2017-10-27 22:39:05 +00:00
+								                                if (wordNextNext == "evening" or
 								                                    (len(words) >
 								                                     (words.index(wordNextNext) + 1) and
 								                                     words[words.index(
 								                                         wordNextNext) + 1] == "evening")):
-												Adds the ExtractDateTime parse function from Christopher (plus relevant tests) (#866)

Adds the ExtractDateTime parse function from Christopher.  When imported from mycroft/util/parse.py, it'll take a sentence like "What's the weather like 5 weeks from next Wednesday?" and will extract a python datetime object for that date.

* Added requirements.txt change for importing dateutil

											
										
										
											2017-07-10 20:33:21 +00:00
+								                                    remainder = "pm"
-												Fix PEP8 and unittest errors from PR 1049

PR 1049 introduced several cosmetic PEP8 errors that were easily fixed.
Additionally there are unittests that include non-ASCII characters which are
failing.  As Pt-PT support is a work-in-progress, I just commented them out
with TODOs next to them.

											
										
										
											2017-10-27 22:39:05 +00:00
+								                                if (wordNextNext == "morning" or
 								                                    (len(words) >
 								                                     words.index(wordNextNext) + 1 and
 								                                     words[words.index(
 								                                         wordNextNext) + 1] == "morning")):
-												Adds the ExtractDateTime parse function from Christopher (plus relevant tests) (#866)

Adds the ExtractDateTime parse function from Christopher.  When imported from mycroft/util/parse.py, it'll take a sentence like "What's the weather like 5 weeks from next Wednesday?" and will extract a python datetime object for that date.

* Added requirements.txt change for importing dateutil

											
										
										
											2017-07-10 20:33:21 +00:00
+								                                    remainder = "am"
 								                        if timeQualifier != "":
 								                            military = True
 								                    else:
 								                        isTime = False
 								            strHH = int(strHH) if strHH else 0
 								            strMM = int(strMM) if strMM else 0
-												Pt-PT translation (#1049)

Portuguese translation helpers!  
											
										
										
											2017-10-26 23:18:00 +00:00
+								            strHH = strHH + 12 if remainder == "pm" and strHH < 12 else strHH
 								            strHH = strHH - 12 if remainder == "am" and strHH >= 12 else strHH
-												Adds the ExtractDateTime parse function from Christopher (plus relevant tests) (#866)

Adds the ExtractDateTime parse function from Christopher.  When imported from mycroft/util/parse.py, it'll take a sentence like "What's the weather like 5 weeks from next Wednesday?" and will extract a python datetime object for that date.

* Added requirements.txt change for importing dateutil

											
										
										
											2017-07-10 20:33:21 +00:00
+								            if strHH > 24 or strMM > 59:
 								                isTime = False
 								                used = 0
 								            if isTime:
 								                hrAbs = strHH * 1
 								                minAbs = strMM * 1
 								                used += 1
 								        if used > 0:
 								            # removed parsed words from the sentence
 								            for i in range(used):
 								                words[idx + i] = ""
 								            if wordPrev == "o" or wordPrev == "oh":
 								                words[words.index(wordPrev)] = ""
 								            if wordPrev == "early":
 								                hrOffset = -1
-												Pt-PT translation (#1049)

Portuguese translation helpers!  
											
										
										
											2017-10-26 23:18:00 +00:00
+								                words[idx - 1] = ""
-												Adds the ExtractDateTime parse function from Christopher (plus relevant tests) (#866)

Adds the ExtractDateTime parse function from Christopher.  When imported from mycroft/util/parse.py, it'll take a sentence like "What's the weather like 5 weeks from next Wednesday?" and will extract a python datetime object for that date.

* Added requirements.txt change for importing dateutil

											
										
										
											2017-07-10 20:33:21 +00:00
+								                idx -= 1
 								            elif wordPrev == "late":
 								                hrOffset = 1
-												Pt-PT translation (#1049)

Portuguese translation helpers!  
											
										
										
											2017-10-26 23:18:00 +00:00
+								                words[idx - 1] = ""
-												Adds the ExtractDateTime parse function from Christopher (plus relevant tests) (#866)

Adds the ExtractDateTime parse function from Christopher.  When imported from mycroft/util/parse.py, it'll take a sentence like "What's the weather like 5 weeks from next Wednesday?" and will extract a python datetime object for that date.

* Added requirements.txt change for importing dateutil

											
										
										
											2017-07-10 20:33:21 +00:00
+								                idx -= 1
 								            if idx > 0 and wordPrev in markers:
-												Pt-PT translation (#1049)

Portuguese translation helpers!  
											
										
										
											2017-10-26 23:18:00 +00:00
+								                words[idx - 1] = ""
-												Adds the ExtractDateTime parse function from Christopher (plus relevant tests) (#866)

Adds the ExtractDateTime parse function from Christopher.  When imported from mycroft/util/parse.py, it'll take a sentence like "What's the weather like 5 weeks from next Wednesday?" and will extract a python datetime object for that date.

* Added requirements.txt change for importing dateutil

											
										
										
											2017-07-10 20:33:21 +00:00
+								            if idx > 1 and wordPrevPrev in markers:
-												Pt-PT translation (#1049)

Portuguese translation helpers!  
											
										
										
											2017-10-26 23:18:00 +00:00
+								                words[idx - 2] = ""
-												Adds the ExtractDateTime parse function from Christopher (plus relevant tests) (#866)

Adds the ExtractDateTime parse function from Christopher.  When imported from mycroft/util/parse.py, it'll take a sentence like "What's the weather like 5 weeks from next Wednesday?" and will extract a python datetime object for that date.

* Added requirements.txt change for importing dateutil

											
										
										
											2017-07-10 20:33:21 +00:00
-												Pt-PT translation (#1049)

Portuguese translation helpers!  
											
										
										
											2017-10-26 23:18:00 +00:00
+								            idx += used - 1
-												Adds the ExtractDateTime parse function from Christopher (plus relevant tests) (#866)

Adds the ExtractDateTime parse function from Christopher.  When imported from mycroft/util/parse.py, it'll take a sentence like "What's the weather like 5 weeks from next Wednesday?" and will extract a python datetime object for that date.

* Added requirements.txt change for importing dateutil

											
										
										
											2017-07-10 20:33:21 +00:00
+								            found = True
 								    # check that we found a date
 								    if not date_found:
 								        return None
 								    if dayOffset is False:
 								        dayOffset = 0
 								    # perform date manipulation
 								    extractedDate = dateNow
 								    extractedDate = extractedDate.replace(microsecond=0,
 								                                          second=0,
 								                                          minute=0,
 								                                          hour=0)
 								    if datestr != "":
 								        temp = datetime.strptime(datestr, "%B %d")
 								        if not hasYear:
 								            temp = temp.replace(year=extractedDate.year)
 								            if extractedDate < temp:
 								                extractedDate = extractedDate.replace(year=int(currentYear),
-												Pt-PT translation (#1049)

Portuguese translation helpers!  
											
										
										
											2017-10-26 23:18:00 +00:00
+								                                                      month=int(
 								                                                          temp.strftime(
 								                                                              "%m")),
 								                                                      day=int(temp.strftime(
-												Fix PEP8 and unittest errors from PR 1049

PR 1049 introduced several cosmetic PEP8 errors that were easily fixed.
Additionally there are unittests that include non-ASCII characters which are
failing.  As Pt-PT support is a work-in-progress, I just commented them out
with TODOs next to them.

											
										
										
											2017-10-27 22:39:05 +00:00
+								                                                          "%d")))
-												Adds the ExtractDateTime parse function from Christopher (plus relevant tests) (#866)

Adds the ExtractDateTime parse function from Christopher.  When imported from mycroft/util/parse.py, it'll take a sentence like "What's the weather like 5 weeks from next Wednesday?" and will extract a python datetime object for that date.

* Added requirements.txt change for importing dateutil

											
										
										
											2017-07-10 20:33:21 +00:00
+								            else:
-												Pt-PT translation (#1049)

Portuguese translation helpers!  
											
										
										
											2017-10-26 23:18:00 +00:00
+								                extractedDate = extractedDate.replace(
 								                    year=int(currentYear) + 1,
-												Fix PEP8 and unittest errors from PR 1049

PR 1049 introduced several cosmetic PEP8 errors that were easily fixed.
Additionally there are unittests that include non-ASCII characters which are
failing.  As Pt-PT support is a work-in-progress, I just commented them out
with TODOs next to them.

											
										
										
											2017-10-27 22:39:05 +00:00
+								                    month=int(temp.strftime("%m")),
 								                    day=int(temp.strftime("%d")))
-												Adds the ExtractDateTime parse function from Christopher (plus relevant tests) (#866)

Adds the ExtractDateTime parse function from Christopher.  When imported from mycroft/util/parse.py, it'll take a sentence like "What's the weather like 5 weeks from next Wednesday?" and will extract a python datetime object for that date.

* Added requirements.txt change for importing dateutil

											
										
										
											2017-07-10 20:33:21 +00:00
+								        else:
-												Pt-PT translation (#1049)

Portuguese translation helpers!  
											
										
										
											2017-10-26 23:18:00 +00:00
+								            extractedDate = extractedDate.replace(
 								                year=int(temp.strftime("%Y")),
-												Fix PEP8 and unittest errors from PR 1049

PR 1049 introduced several cosmetic PEP8 errors that were easily fixed.
Additionally there are unittests that include non-ASCII characters which are
failing.  As Pt-PT support is a work-in-progress, I just commented them out
with TODOs next to them.

											
										
										
											2017-10-27 22:39:05 +00:00
+								                month=int(temp.strftime("%m")),
-												Pt-PT translation (#1049)

Portuguese translation helpers!  
											
										
										
											2017-10-26 23:18:00 +00:00
+								                day=int(temp.strftime("%d")))
-												Adds the ExtractDateTime parse function from Christopher (plus relevant tests) (#866)

Adds the ExtractDateTime parse function from Christopher.  When imported from mycroft/util/parse.py, it'll take a sentence like "What's the weather like 5 weeks from next Wednesday?" and will extract a python datetime object for that date.

* Added requirements.txt change for importing dateutil

											
										
										
											2017-07-10 20:33:21 +00:00
 								    if timeStr != "":
 								        temp = datetime(timeStr)
 								        extractedDate = extractedDate.replace(hour=temp.strftime("%H"),
 								                                              minute=temp.strftime("%M"),
 								                                              second=temp.strftime("%S"))
 								    if yearOffset != 0:
 								        extractedDate = extractedDate + relativedelta(years=yearOffset)
 								    if monthOffset != 0:
 								        extractedDate = extractedDate + relativedelta(months=monthOffset)
 								    if dayOffset != 0:
 								        extractedDate = extractedDate + relativedelta(days=dayOffset)
 								    if hrAbs != -1 and minAbs != -1:
 								        extractedDate = extractedDate + relativedelta(hours=hrAbs,
 								                                                      minutes=minAbs)
 								        if (hrAbs != 0 or minAbs != 0) and datestr == "":
 								            if not daySpecified and dateNow > extractedDate:
 								                extractedDate = extractedDate + relativedelta(days=1)
 								    if hrOffset != 0:
 								        extractedDate = extractedDate + relativedelta(hours=hrOffset)
 								    if minOffset != 0:
 								        extractedDate = extractedDate + relativedelta(minutes=minOffset)
 								    if secOffset != 0:
 								        extractedDate = extractedDate + relativedelta(seconds=secOffset)
 								    for idx, word in enumerate(words):
-												Pt-PT translation (#1049)

Portuguese translation helpers!  
											
										
										
											2017-10-26 23:18:00 +00:00
+								        if words[idx] == "and" and words[idx - 1] == "" and words[
-												Fix PEP8 and unittest errors from PR 1049

PR 1049 introduced several cosmetic PEP8 errors that were easily fixed.
Additionally there are unittests that include non-ASCII characters which are
failing.  As Pt-PT support is a work-in-progress, I just commented them out
with TODOs next to them.

											
										
										
											2017-10-27 22:39:05 +00:00
+								                idx + 1] == "":
-												Adds the ExtractDateTime parse function from Christopher (plus relevant tests) (#866)

Adds the ExtractDateTime parse function from Christopher.  When imported from mycroft/util/parse.py, it'll take a sentence like "What's the weather like 5 weeks from next Wednesday?" and will extract a python datetime object for that date.

* Added requirements.txt change for importing dateutil

											
										
										
											2017-07-10 20:33:21 +00:00
+								            words[idx] = ""
 								    resultStr = " ".join(words)
 								    resultStr = ' '.join(resultStr.split())
 								    return [extractedDate, resultStr]
-												Fixed PEP8 for number extraction addition

											
										
										
											2017-05-30 19:30:41 +00:00
def look_for_fractions(split_list):
    """Decide whether the pieces of a '/'-split string form a fraction.

    Args:
        split_list (list): list created by splitting on '/'
    Returns:
        (bool): True when there are exactly two pieces and is_numeric()
                accepts both of them, otherwise False
    """
    # Anything other than "numerator/denominator" cannot be a fraction.
    if len(split_list) != 2:
        return False
    numerator_ok = is_numeric(split_list[0])
    # Only inspect the denominator once the numerator has been accepted.
    return bool(numerator_ok and is_numeric(split_list[1]))
-												Pt-PT translation (#1049)

Portuguese translation helpers!  
											
										
										
											2017-10-26 23:18:00 +00:00
def isFractional_en(input_str):
    """Check whether an English word names a fraction.

    The comparison is case-insensitive and a single trailing "s" is
    ignored, so "fifth", "Fifth" and "fifths" are all recognised.

    Args:
        input_str (str): the word to check
    Returns:
        (bool) or (float): False if not a fraction, otherwise the fraction
                           as a float (e.g. 0.5 for "half")
    """
    # Lower-case once so the membership test and the index lookup agree;
    # looking up the un-lowered word raised ValueError for e.g. "Half".
    word = input_str.lower()
    if word.endswith('s'):
        word = word[:-1]  # e.g. "fifths"

    # Position in this list + 1 is the denominator ("whole" -> 1/1).
    aFrac = ["whole", "half", "third", "fourth", "fifth", "sixth",
             "seventh", "eighth", "ninth", "tenth", "eleventh", "twelfth"]

    if word in aFrac:
        return 1.0 / (aFrac.index(word) + 1)
    # "quarter" is a synonym for "fourth" that does not fit the list order.
    if word == "quarter":
        return 1.0 / 4

    return False
-												Adding Extraction of Numbers

											
										
										
											2017-05-25 23:31:43 +00:00
-												Pt-PT translation (#1049)

Portuguese translation helpers!  
											
										
										
											2017-10-26 23:18:00 +00:00
def get_gender(word, input_string="", lang="en-us"):
    """Guess the gender of a word, optionally using the raw input text.

    Args:
        word (str): the word whose gender is wanted
        input_string (str): raw input text, passed along for context
        lang (str): the code for the language the word is in
    Returns:
        "m" if the word is male, "f" if female, False if unknown
        or if the language is not supported
    """
    # Normalise the language code the same way normalize() does, so that
    # codes such as "PT-BR" are accepted and a non-string lang (e.g. None)
    # falls through to "unknown" instead of raising TypeError.
    lang_lower = str(lang).lower()
    if lang_lower.startswith(("pt", "es")):
        # spanish follows same rules
        return get_gender_pt(word, input_string)

    return False
-												Pt-PT translation (#1049)

Portuguese translation helpers!  
											
										
										
											2017-10-26 23:18:00 +00:00
-												Adding Extraction of Numbers

											
										
										
											2017-05-25 23:31:43 +00:00
+								# ==============================================================
-												Fixes issue #539

The utterance is now placed on the bus along with its language code.  If not specified, it uses "en-us".

Added a new mycroft.util.parse module.  It contains the normalize() function.  Normalization currently does two things:
  * Expands contractions ("they're" -> "they are", etc)
  * Optionally removes articles ("a", "an", "the").  Removing is the default.
  * Textual numbers become digits, up to 20.  E.g. "What is the weather in four days" becomes "What is weather in 4 days".

NOTE:  This is potentially a breaking change!  Remove "the", "a" and "an" from your .voc files!

Skill changes:
  * I cleaned up the .voc files for the default Skills.
  * Split the date_time keyword into an extra entity.  Now a "QueryKeyword.voc" exists, with "what|tell" instead of combining that into "what is time" in the TimeKeyword.voc.
  * Volume skill now accepts 1-11, e.g. "turn volume to 11"

											
										
										
											2017-02-23 12:40:46 +00:00
-												Fixed PEP8 for number extraction addition

											
										
										
											2017-05-30 19:30:41 +00:00
-												Fixes issue #539

The utterance is now placed on the bus along with its language code.  If not specified, it uses "en-us".

Added a new mycroft.util.parse module.  It contains the normalize() function.  Normalization currently does two things:
  * Expands contractions ("they're" -> "they are", etc)
  * Optionally removes articles ("a", "an", "the").  Removing is the default.
  * Textual numbers become digits, up to 20.  E.g. "What is the weather in four days" becomes "What is weather in 4 days".

NOTE:  This is potentially a breaking change!  Remove "the", "a" and "an" from your .voc files!

Skill changes:
  * I cleaned up the .voc files for the default Skills.
  * Split the date_time keyword into an extra entity.  Now a "QueryKeyword.voc" exists, with "what|tell" instead of combining that into "what is time" in the TimeKeyword.voc.
  * Volume skill now accepts 1-11, e.g. "turn volume to 11"

											
										
										
											2017-02-23 12:40:46 +00:00
def normalize(text, lang="en-us", remove_articles=True):
    """Prepare a string for parsing.

    Dispatches to the language-specific normalizer selected by the start
    of the language code. Languages without a normalizer get the text
    back untouched.

    Args:
        text (str): the string to normalize
        lang (str): the code for the language text is in
        remove_articles (bool): whether to remove articles (like 'a', or 'the')
    Returns:
        (str): The normalized string.
    """
    language = str(lang).lower()

    if language.startswith("en"):
        return normalize_en(text, remove_articles)
    if language.startswith("es"):
        return normalize_es(text, remove_articles)
    if language.startswith("pt"):
        return normalize_pt(text, remove_articles)

    # TODO: Normalization for other languages
    return text
 								def normalize_en(text, remove_articles):
 								    """ English string normalization """
 								    words = text.split()  # this also removed extra spaces
 								    normalized = ""
 								    for word in words:
 								        if remove_articles and word in ["the", "a", "an"]:
 								            continue
 								        # Expand common contractions, e.g. "isn't" -> "is not"
 								        contraction = ["ain't", "aren't", "can't", "could've", "couldn't",
 								                       "didn't", "doesn't", "don't", "gonna", "gotta",
 								                       "hadn't", "hasn't", "haven't", "he'd", "he'll", "he's",
 								                       "how'd", "how'll", "how's", "I'd", "I'll", "I'm",
 								                       "I've", "isn't", "it'd", "it'll", "it's", "mightn't",
-												Pt-PT translation (#1049)

Portuguese translation helpers!  
											
										
										
											2017-10-26 23:18:00 +00:00
+								                       "might've", "mustn't", "must've", "needn't",
 								                       "oughtn't",
-												Fixes issue #539

The utterance is now placed on the bus along with its language code.  If not specified, it uses "en-us".

Added a new mycroft.util.parse module.  It contains the normalize() function.  Normalization currently does two things:
  * Expands contractions ("they're" -> "they are", etc)
  * Optionally removes articles ("a", "an", "the").  Removing is the default.
  * Textual numbers become digits, up to 20.  E.g. "What is the weather in four days" becomes "What is weather in 4 days".

NOTE:  This is potentially a breaking change!  Remove "the", "a" and "an" from your .voc files!

Skill changes:
  * I cleaned up the .voc files for the default Skills.
  * Split the date_time keyword into an extra entity.  Now a "QueryKeyword.voc" exists, with "what|tell" instead of combining that into "what is time" in the TimeKeyword.voc.
  * Volume skill now accepts 1-11, e.g. "turn volume to 11"

											
										
										
											2017-02-23 12:40:46 +00:00
+								                       "shan't", "she'd", "she'll", "she's", "shouldn't",
 								                       "should've", "somebody's", "someone'd", "someone'll",
 								                       "someone's", "that'll", "that's", "that'd", "there'd",
 								                       "there're", "there's", "they'd", "they'll", "they're",
 								                       "they've", "wasn't", "we'd", "we'll", "we're", "we've",
 								                       "weren't", "what'd", "what'll", "what're", "what's",
-												Increase min wait time in wait_while_speaking()

In code like this:
   self.speak_dialog("something")
   mycroft.audio.wait_while_speaking()
It was possible that the speaking of "something" would take longer to
start than the 0.1 seconds that was built into the wait_while_speaking().
The definition of this behavior is slightly fuzzy, but this is definitely
a case where the expectation is that previous request for speech would
start and complete.  For now, I have just bumped the minimum wait to
0.3 seconds.

In the long run we might consider tracking specific speak requests and
generating a notification when that request has been serviced.  Then the
skill could automatically hold off until that request has been serviced.
But the basic skill code won't have to change to make this happen, so
this additional sleep is adequate for today.

Also snuck in a minor change to a comment.

											
										
										
											2017-11-22 19:21:23 +00:00
+								                       "whats",  # technically incorrect but some STT outputs
-												Fixes issue #539

The utterance is now placed on the bus along with its language code.  If not specified, it uses "en-us".

Added a new mycroft.util.parse module.  It contains the normalize() function.  Normalization currently does two things:
  * Expands contractions ("they're" -> "they are", etc)
  * Optionally removes articles ("a", "an", "the").  Removing is the default.
  * Textual numbers become digits, up to 20.  E.g. "What is the weather in four days" becomes "What is weather in 4 days".

NOTE:  This is potentially a breaking change!  Remove "the", "a" and "an" from your .voc files!

Skill changes:
  * I cleaned up the .voc files for the default Skills.
  * Split the date_time keyword into an extra entity.  Now a "QueryKeyword.voc" exists, with "what|tell" instead of combining that into "what is time" in the TimeKeyword.voc.
  * Volume skill now accepts 1-11, e.g. "turn volume to 11"

											
										
										
											2017-02-23 12:40:46 +00:00
+								                       "what've", "when's", "when'd", "where'd", "where's",
 								                       "where've", "who'd", "who'd've", "who'll", "who're",
 								                       "who's", "who've", "why'd", "why're", "why's", "won't",
 								                       "won't've", "would've", "wouldn't", "wouldn't've",
 								                       "y'all", "ya'll", "you'd", "you'd've", "you'll",
 								                       "y'aint", "y'ain't", "you're", "you've"]
 								        if word in contraction:
 								            expansion = ["is not", "are not", "can not", "could have",
 								                         "could not", "did not", "does not", "do not",
 								                         "going to", "got to", "had not", "has not",
-												Pt-PT translation (#1049)

Portuguese translation helpers!  
											
										
										
											2017-10-26 23:18:00 +00:00
+								                         "have not", "he would", "he will", "he is",
 								                         "how did",
-												Fixes issue #539

The utterance is now placed on the bus along with its language code.  If not specified, it uses "en-us".

Added a new mycroft.util.parse module.  It contains the normalize() function.  Normalization currently does two things:
  * Expands contractions ("they're" -> "they are", etc)
  * Optionally removes articles ("a", "an", "the").  Removing is the default.
  * Textual numbers become digits, up to 20.  E.g. "What is the weather in four days" becomes "What is weather in 4 days".

NOTE:  This is potentially a breaking change!  Remove "the", "a" and "an" from your .voc files!

Skill changes:
  * I cleaned up the .voc files for the default Skills.
  * Split the date_time keyword into an extra entity.  Now a "QueryKeyword.voc" exists, with "what|tell" instead of combining that into "what is time" in the TimeKeyword.voc.
  * Volume skill now accepts 1-11, e.g. "turn volume to 11"

											
										
										
											2017-02-23 12:40:46 +00:00
+								                         "how will", "how is", "I would", "I will", "I am",
 								                         "I have", "is not", "it would", "it will", "it is",
 								                         "might not", "might have", "must not", "must have",
 								                         "need not", "ought not", "shall not", "she would",
 								                         "she will", "she is", "should not", "should have",
 								                         "somebody is", "someone would", "someone will",
 								                         "someone is", "that will", "that is", "that would",
 								                         "there would", "there are", "there is", "they would",
 								                         "they will", "they are", "they have", "was not",
 								                         "we would", "we will", "we are", "we have",
 								                         "were not", "what did", "what will", "what are",
-												#539 - fixing pep8

											
										
										
											2017-03-14 18:43:41 +00:00
+								                         "what is",
-												Fixes issue #539

The utterance is now placed on the bus along with its language code.  If not specified, it uses "en-us".

Added a new mycroft.util.parse module.  It contains the normalize() function.  Normalization currently does three things:
  * Expands contractions ("they're" -> "they are", etc)
  * Optionally removes articles ("a", "an", "the").  Removing is the default.
  * Textual numbers become digits, up to 20.  E.g. "What is the weather in four days" becomes "What is weather in 4 days".

NOTE:  This is potentially a breaking change!  Remove "the", "a" and "an" from your .voc files!

Skill changes:
  * I cleaned up the .voc files for the default Skills.
  * Split the date_time keyword into an extra entity.  Now a "QueryKeyword.voc" exists, with "what|tell" instead of combining that into "what is time" in the TimeKeyword.voc.
  * Volume skill now accepts 1-11, e.g. "turn volume to 11"

											
										
										
											2017-02-23 12:40:46 +00:00
+								                         "what is", "what have", "when is", "when did",
 								                         "where did", "where is", "where have", "who would",
 								                         "who would have", "who will", "who are", "who is",
 								                         "who have", "why did", "why are", "why is",
 								                         "will not", "will not have", "would have",
 								                         "would not", "would not have", "you all", "you all",
 								                         "you would", "you would have", "you will",
 								                         "you are not", "you are not", "you are", "you have"]
 								            word = expansion[contraction.index(word)]
 								        # Convert numbers into digits, e.g. "two" -> "2"
 								        textNumbers = ["zero", "one", "two", "three", "four", "five", "six",
 								                       "seven", "eight", "nine", "ten", "eleven", "twelve",
 								                       "thirteen", "fourteen", "fifteen", "sixteen",
 								                       "seventeen", "eighteen", "nineteen", "twenty"]
 								        if word in textNumbers:
 								            word = str(textNumbers.index(word))
-												#539 - fixing pep8

											
										
										
											2017-03-14 18:43:41 +00:00
+								        normalized += " " + word
-												Fixes issue #539

The utterance is now placed on the bus along with its language code.  If not specified, it uses "en-us".

Added a new mycroft.util.parse module.  It contains the normalize() function.  Normalization currently does three things:
  * Expands contractions ("they're" -> "they are", etc)
  * Optionally removes articles ("a", "an", "the").  Removing is the default.
  * Textual numbers become digits, up to 20.  E.g. "What is the weather in four days" becomes "What is weather in 4 days".

NOTE:  This is potentially a breaking change!  Remove "the", "a" and "an" from your .voc files!

Skill changes:
  * I cleaned up the .voc files for the default Skills.
  * Split the date_time keyword into an extra entity.  Now a "QueryKeyword.voc" exists, with "what|tell" instead of combining that into "what is time" in the TimeKeyword.voc.
  * Volume skill now accepts 1-11, e.g. "turn volume to 11"

											
										
										
											2017-02-23 12:40:46 +00:00
 								    return normalized[1:]  # strip the initial space
-												normalize_es

											
										
										
											2017-05-03 09:37:00 +00:00
-												Pt-PT translation (#1049)

Portuguese translation helpers!  
											
										
										
											2017-10-26 23:18:00 +00:00
+								####################################################################
 								# PT-PT
 								#
 								# TODO: numbers greater than 999999
 								# TODO: date time pt
 								####################################################################
 								# Indefinite articles ["um", "uma", "uns", "umas"] cannot be suppressed:
 								# in PT, "um cavalo" means both "a horse" and "one horse".
 								# Definite articles that normalization is allowed to drop.
 								pt_articles = ["o", "a", "os", "as"]
 								# Number words -> value.  Accented spellings are written with \u escapes
 								# so that they survive whatever encoding the file is saved in; the
 								# unaccented spelling is kept next to each one because speech-to-text
 								# output frequently lacks the accents.
 								pt_numbers = {
 								    "zero": 0,
 								    "um": 1,
 								    "uma": 1,
 								    "uns": 1,
 								    "umas": 1,
 								    "primeiro": 1,
 								    "segundo": 2,
 								    "terceiro": 3,
 								    "dois": 2,
 								    "duas": 2,
 								    "tres": 3,
 								    u"tr\u00eas": 3,
 								    "quatro": 4,
 								    "cinco": 5,
 								    "seis": 6,
 								    "sete": 7,
 								    "oito": 8,
 								    "nove": 9,
 								    "dez": 10,
 								    "onze": 11,
 								    "doze": 12,
 								    "treze": 13,
 								    "catorze": 14,
 								    "quatorze": 14,  # pt-br spelling
 								    "quinze": 15,
 								    "dezasseis": 16,
 								    "dezesseis": 16,  # pt-br spelling
 								    "dezassete": 17,
 								    "dezessete": 17,  # pt-br spelling
 								    "dezoito": 18,
 								    "dezanove": 19,
 								    "dezenove": 19,  # pt-br spelling
 								    "vinte": 20,
 								    "trinta": 30,
 								    "quarenta": 40,
 								    "cinquenta": 50,
 								    "sessenta": 60,
 								    "setenta": 70,
 								    "oitenta": 80,
 								    "noventa": 90,
 								    "cem": 100,
 								    "cento": 100,
 								    "duzentos": 200,
 								    "duzentas": 200,
 								    "trezentos": 300,
 								    "trezentas": 300,
 								    "quatrocentos": 400,
 								    "quatrocentas": 400,
 								    "quinhentos": 500,
 								    "quinhentas": 500,
 								    "seiscentos": 600,
 								    "seiscentas": 600,
 								    "setecentos": 700,
 								    "setecentas": 700,
 								    "oitocentos": 800,
 								    "oitocentas": 800,
 								    "novecentos": 900,
 								    "novecentas": 900,
 								    "mil": 1000,
 								    "milhao": 1000000,
 								    u"milh\u00e3o": 1000000}
 								def isFractional_pt(input_str):
 								    """
 								    Check whether a single Portuguese word names a fraction.

 								    The comparison ignores case and a trailing plural "s", so "Meio",
 								    "quinto" and "quintos" are all recognised.

 								    Args:
 								        input_str (str): the word to check
 								    Returns:
 								        (bool) or (float): False if not a fraction, otherwise the fraction
 								    """
 								    # Normalise once and use the same value for the membership test and
 								    # for the index lookup (looking up the raw string used to raise
 								    # ValueError for capitalised input).
 								    word = input_str.lower()
 								    if word.endswith('s'):
 								        word = word[:-1]  # e.g. "quintos" -> "quinto"
 								    # The denominator is the position in this list plus 2:
 								    # "meio" -> 1/2, "terco" -> 1/3, ... "decimo" -> 1/10
 								    aFrac = ["meio", u"ter\u00e7o", "quarto", "quinto", "sexto",
 								             "setimo", "oitavo", "nono", u"d\u00e9cimo"]
 								    if word in aFrac:
 								        return 1.0 / (aFrac.index(word) + 2)
 								    # Fractions that do not follow the positional rule above, plus the
 								    # alternative spellings of "seventh".
 								    denominators = {
 								        u"vig\u00e9simo": 20,
 								        u"trig\u00e9simo": 30,
 								        u"cent\u00e9simo": 100,
 								        u"mil\u00e9simo": 1000,
 								        u"s\u00e9timo": 7,
 								        "septimo": 7,
 								        u"s\u00e9ptimo": 7}
 								    if word in denominators:
 								        return 1.0 / denominators[word]
 								    return False
 								def extractnumber_pt(text):
 								    """
 								    Extract a number from a Portuguese text.

 								    Understands digits ("3", "1.5", "2/3"), number words found in
 								    pt_numbers, fraction words ("meio"), "<n> avos", "<a> e <b>"
 								    combinations and spoken decimals ("dois ponto cinco").

 								    Args:
 								        text (str): the string to extract a number from
 								    Returns:
 								        (int) or (float): The value of extracted number, or False when
 								                          no number was found
 								    """
 								    def leading_zeros(words):
 								        # number of "zero"/"0" tokens at the start of words
 								        found = 0
 								        for item in words:
 								            if item == "zero" or item == "0":
 								                found += 1
 								            else:
 								                break
 								        return found
 								    aWords = text.split()
 								    ands = ["e"]
 								    decimals = ["ponto", "virgula", u"v\u00edrgula", ".", ","]
 								    count = 0
 								    result = None
 								    while count < len(aWords):
 								        val = 0
 								        word = aWords[count]
 								        next_word = None
 								        next_next_word = None
 								        if count + 1 < len(aWords):
 								            next_word = aWords[count + 1]
 								            if count + 2 < len(aWords):
 								                next_next_word = aWords[count + 2]
 								        # is current word a number?
 								        if word in pt_numbers:
 								            val = pt_numbers[word]
 								        elif word.isdigit():  # doesn't work with decimals
 								            val = int(word)
 								        elif is_numeric(word):
 								            val = float(word)
 								        elif isFractional_pt(word):
 								            # a fraction word scales what was read so far ("um quarto")
 								            if not result:
 								                result = 1
 								            result = result * isFractional_pt(word)
 								            count += 1
 								            continue
 								        if not val:
 								            # look for fractions like "2/3"
 								            aPieces = word.split('/')
 								            if look_for_fractions(aPieces):
 								                val = float(aPieces[0]) / float(aPieces[1])
 								        if val:
 								            if result is None:
 								                result = 0
 								            # handle fractions: "<n> avos" divides instead of adding
 								            if next_word != "avos":
 								                result += val
 								            else:
 								                result = float(result) / float(val)
 								        if next_word is None:
 								            break
 								        # number word and fraction
 								        if next_word in ands:
 								            if result is None:
 								                count += 1
 								                continue
 								            newWords = aWords[count + 2:]
 								            afterAndVal = extractnumber_pt(" ".join(newWords))
 								            if afterAndVal:
 								                zeros = 0
 								                if result < afterAndVal or result < 20:
 								                    # the tail is read as decimal digits: shift it
 								                    # below 1, one more place per leading zero word
 								                    while afterAndVal > 1:
 								                        afterAndVal = afterAndVal / 10.0
 								                    zeros = leading_zeros(newWords)
 								                for _ in range(zeros):
 								                    afterAndVal = afterAndVal / 10.0
 								                result += afterAndVal
 								                break
 								        elif next_next_word is not None:
 								            if next_next_word in ands:
 								                newWords = aWords[count + 3:]
 								                afterAndVal = extractnumber_pt(" ".join(newWords))
 								                if afterAndVal:
 								                    if result is None:
 								                        result = 0
 								                    result += afterAndVal
 								                    break
 								        # A decimal separator only makes sense after an integer part;
 								        # without one ("cavalo , dois") keep scanning instead of
 								        # building an unparsable literal such as "None.2".
 								        if next_word in decimals and result is not None:
 								            newWords = aWords[count + 2:]
 								            afterDotVal = extractnumber_pt(" ".join(newWords))
 								            digits = leading_zeros(newWords) * "0"
 								            if afterDotVal is not False:
 								                digits += str(afterDotVal)
 								            if digits:
 								                try:
 								                    result = float(str(result) + "." + digits)
 								                except ValueError:
 								                    # either side already held a decimal point
 								                    # ("1 ponto 2.5"); keep the part parsed so far
 								                    pass
 								            break
 								        count += 1
 								    if result is None:
 								        return False
 								    # cast whole-valued floats ("2.0") back to int
 								    if "." in str(result):
 								        integer, dec = str(result).split(".")
 								        if dec == "0":
 								            result = int(integer)
 								    return result
 								def pt_number_parse(words, i):
 								    """
 								    Parse a Portuguese number written in words, starting at words[i].

 								    Handles 0 to 999999, e.g. "zero", "vinte e um", "cento e vinte",
 								    "dois mil e quinhentos".  The "e" connector after hundreds and
 								    thousands is optional.

 								    Args:
 								        words (list): the tokens of the sentence
 								        i (int): index of the token where the number may start
 								    Returns:
 								        (tuple) or None: (value, index of the first token after the
 								                         number), or None if no number starts at i
 								    """
 								    def pt_cte(i, s):
 								        # match the literal token s at position i
 								        if i < len(words) and s == words[i]:
 								            return s, i + 1
 								        return None
 								    def pt_skip_and(i):
 								        # index just past an optional "e" connector at position i
 								        r = pt_cte(i, "e")
 								        return r[1] if r else i
 								    def pt_number_word(i, mi, ma):
 								        # match one number word whose value lies in [mi, ma]
 								        if i < len(words):
 								            v = pt_numbers.get(words[i])
 								            # compare against None: 0 is a valid value ("zero")
 								            if v is not None and mi <= v <= ma:
 								                return v, i + 1
 								        return None
 								    def pt_number_1_99(i):
 								        # 1-19 are single words
 								        r1 = pt_number_word(i, 1, 19)
 								        if r1:
 								            return r1
 								        # tens, optionally followed by "e" and a unit: "vinte e um"
 								        r1 = pt_number_word(i, 20, 90)
 								        if r1:
 								            v1, i1 = r1
 								            r2 = pt_cte(i1, "e")
 								            if r2:
 								                r3 = pt_number_word(r2[1], 1, 9)
 								                if r3:
 								                    v3, i3 = r3
 								                    return v1 + v3, i3
 								            return r1
 								        return None
 								    def pt_number_1_999(i):
 								        # hundreds, optionally followed by ["e"] [1-99]
 								        r1 = pt_number_word(i, 100, 900)
 								        if r1:
 								            v1, i1 = r1
 								            r2 = pt_number_1_99(pt_skip_and(i1))
 								            if r2:
 								                v2, i2 = r2
 								                return v1 + v2, i2
 								            # nothing usable follows: leave any "e" unconsumed
 								            return r1
 								        # [1-99]
 								        return pt_number_1_99(i)
 								    def pt_number(i):
 								        # check for zero
 								        r1 = pt_number_word(i, 0, 0)
 								        if r1:
 								            return r1
 								        # check for [1-999] (mil ["e"] [1-999])?
 								        r1 = pt_number_1_999(i)
 								        if r1:
 								            v1, i1 = r1
 								            r2 = pt_cte(i1, "mil")
 								            if r2:
 								                i2 = r2[1]
 								                r3 = pt_number_1_999(pt_skip_and(i2))
 								                if r3:
 								                    v3, i3 = r3
 								                    return v1 * 1000 + v3, i3
 								                return v1 * 1000, i2
 								            return r1
 								        return None
 								    return pt_number(i)
 								def normalize_pt(text, remove_articles):
 								    """ PT string normalization: optional article removal, numbers
 								    written as words turned into digits, then pruning """
 								    tokens = text.split()  # split() also collapses extra spaces
 								    total = len(tokens)
 								    pieces = []
 								    pos = 0
 								    # Contractions are not common in PT, so none are expanded here
 								    while pos < total:
 								        token = tokens[pos]
 								        # skip articles when asked to
 								        if remove_articles and token in pt_articles:
 								            pos += 1
 								            continue
 								        # a (possibly multi-word) number starting at this token
 								        parsed = pt_number_parse(tokens, pos)
 								        if parsed:
 								            value, pos = parsed
 								            pieces.append(str(value))
 								            continue
 								        # NOTE temporary , handle some numbers above >999
 								        if token in pt_numbers:
 								            token = str(pt_numbers[token])
 								        # end temporary
 								        pieces.append(token)
 								        pos += 1
 								    # some articles in pt-pt can not be removed, but many words can
 								    # this is experimental and some meaning may be lost
 								    # maybe agressive should default to False
 								    # only usage will tell, as a native speaker this seems reasonable
 								    return pt_pruning(" ".join(pieces), agressive=remove_articles)
 								def extract_datetime_pt(input_str, currentDate=None):
 								    def clean_string(str):
 								        """
 								        Clean a sentence before date/time parsing: strip punctuation,
 								        drop noise words, lower-case, remove accents, map synonyms to a
 								        canonical word and singularize the relevant plurals.

 								        Args:
 								            str (str): the raw sentence (the parameter name shadows the
 								                       builtin, it is kept for compatibility)
 								        Returns:
 								            (str): the cleaned sentence, still separated by single spaces
 								        """
 								        text = str
 								        # punctuation plus the ordinal indicators (masculine, feminine)
 								        symbols = [".", ",", ";", "?", "!", u"\u00ba", u"\u00aa"]
 								        noise_words = ["o", "os", "a", "as", "do", "da", "dos", "das", "de",
 								                       "ao", "aos"]
 								        for symbol in symbols:
 								            text = text.replace(symbol, "")
 								        for noise in noise_words:
 								            text = text.replace(" " + noise + " ", " ")
 								        text = text.lower()
 								        # The word lists used by the parser are unaccented, so fold every
 								        # accented letter to its base letter.  Escapes keep the table
 								        # independent of the encoding the file is saved in.
 								        accents = [(u"\u00e1", "a"), (u"\u00e0", "a"), (u"\u00e2", "a"),
 								                   (u"\u00e3", "a"), (u"\u00e7", "c"), (u"\u00e9", "e"),
 								                   (u"\u00e8", "e"), (u"\u00ea", "e"), (u"\u00ed", "i"),
 								                   (u"\u00f3", "o"), (u"\u00f2", "o"), (u"\u00f4", "o"),
 								                   (u"\u00f5", "o"), (u"\u00fa", "u"), (u"\u00fc", "u")]
 								        for accented, plain in accents:
 								            text = text.replace(accented, plain)
 								        text = text.replace("-", " ").replace("_", "")
 								        # handle synonyms and equivalents, "tomorrow early" = "tomorrow
 								        # morning"
 								        synonims = {"manha": ["manhazinha", "cedo", "cedinho"],
 								                    "tarde": ["tardinha", "tarde"],
 								                    "noite": ["noitinha", "anoitecer"],
 								                    "todos": ["ao", "aos"],
 								                    "em": ["do", "da", "dos", "das", "de"]}
 								        for syn in synonims:
 								            for word in synonims[syn]:
 								                text = text.replace(" " + word + " ", " " + syn + " ")
 								        # relevant plurals, cant just strip every final s in pt
 								        singulars = {"manhas": "manha", "noites": "noite",
 								                     "tardes": "tarde", "dias": "dia",
 								                     "semanas": "semana", "anos": "ano",
 								                     "minutos": "minuto", "segundos": "segundo",
 								                     "nas": "na", "nos": "no", "proximas": "proxima",
 								                     "seguintes": "seguinte", "horas": "hora",
 								                     "meses": "mes", "anteriores": "anterior"}
 								        # Replace whole words only: a substring replace would also
 								        # damage unrelated words ("nosso" -> "noso").  Splitting on a
 								        # single space keeps the spacing exactly as it was.
 								        return " ".join([singulars.get(word, word)
 								                         for word in text.split(" ")])
 								    def date_found():
 								        # Tells whether any date/time information has been collected:
 								        # either the found flag is set or one of the collected fields
 								        # has a non-empty / non-zero value.
 								        if found:
 								            return found
 								        # NOTE(review): dayOffset is assigned integers while parsing, so
 								        # "is True" only matches the literal True - confirm the intent.
 								        # Keep the or-chain: it must stop at the first true operand.
 								        return (datestr != "" or timeStr != "" or
 								                yearOffset != 0 or monthOffset != 0 or
 								                dayOffset is True or hrOffset != 0 or
 								                hrAbs != 0 or minOffset != 0 or
 								                minAbs != 0 or secOffset != 0)
-												Pt-PT translation (#1049)

Portuguese translation helpers!  
											
										
										
											2017-10-26 23:18:00 +00:00
 								    if input_str == "":
 								        return None
 								    if currentDate is None:
 								        currentDate = datetime.now()
 								    found = False
 								    daySpecified = False
 								    dayOffset = False
 								    monthOffset = 0
 								    yearOffset = 0
 								    dateNow = currentDate
 								    today = dateNow.strftime("%w")
 								    currentYear = dateNow.strftime("%Y")
 								    fromFlag = False
 								    datestr = ""
 								    hasYear = False
 								    timeQualifier = ""
 								    words = clean_string(input_str).split(" ")
 								    timeQualifiersList = ['manha', 'tarde', 'noite']
 								    time_indicators = ["em", "as", "nas", "pelas", "volta", "depois", "estas",
 								                       "no", "dia", "hora"]
 								    days = ['segunda', 'terca', 'quarta',
 								            'quinta', 'sexta', 'sabado', 'domingo']
 								    months = ['janeiro', 'febreiro', 'marco', 'abril', 'maio', 'junho',
 								              'julho', 'agosto', 'setembro', 'outubro', 'novembro',
 								              'dezembro']
 								    monthsShort = ['jan', 'feb', 'mar', 'abr', 'mai', 'jun', 'jul', 'ag',
 								                   'set', 'out', 'nov', 'dec']
 								    nexts = ["proximo", "proxima"]
 								    suffix_nexts = ["seguinte", "subsequente", "seguir"]
 								    lasts = ["ultimo", "ultima"]
 								    suffix_lasts = ["passada", "passado", "anterior", "antes"]
 								    nxts = ["depois", "seguir", "seguida", "seguinte", "proxima", "proximo"]
 								    prevs = ["antes", "ante", "previa", "previamente", "anterior"]
 								    froms = ["partir", "em", "para", "na", "no", "daqui", "seguir",
 								             "depois", "por", "proxima", "proximo", "da", "do", "de"]
 								    thises = ["este", "esta", "deste", "desta", "neste", "nesta", "nesse",
 								              "nessa"]
 								    froms += thises
 								    lists = nxts + prevs + froms + time_indicators
 								    for idx, word in enumerate(words):
 								        if word == "":
 								            continue
 								        wordPrevPrevPrev = words[idx - 3] if idx > 2 else ""
 								        wordPrevPrev = words[idx - 2] if idx > 1 else ""
 								        wordPrev = words[idx - 1] if idx > 0 else ""
 								        wordNext = words[idx + 1] if idx + 1 < len(words) else ""
 								        wordNextNext = words[idx + 2] if idx + 2 < len(words) else ""
 								        wordNextNextNext = words[idx + 3] if idx + 3 < len(words) else ""
 								        start = idx
 								        used = 0
 								        # save timequalifier for later
 								        if word in timeQualifiersList:
 								            timeQualifier = word
 								        # parse today, tomorrow, yesterday
 								        elif word == "hoje" and not fromFlag:
 								            dayOffset = 0
 								            used += 1
 								        elif word == "amanha" and not fromFlag:
 								            dayOffset = 1
 								            used += 1
 								        elif word == "ontem" and not fromFlag:
 								            dayOffset -= 1
 								            used += 1
 								        # "before yesterday" and "before before yesterday"
-												Fix PEP8 and unittest errors from PR 1049

PR 1049 introduced several cosmetic PEP8 errors that were easily fixed.
Additionally there are unittests that include non-ASCII characters which are
failing.  As Pt-PT support is a work-in-progress, I just commented them out
with TODOs next to them.

											
										
										
											2017-10-27 22:39:05 +00:00
+								        elif (word == "anteontem" or
 								              (word == "ante" and wordNext == "ontem")) and not fromFlag:
-												Pt-PT translation (#1049)

Portuguese translation helpers!  
											
										
										
											2017-10-26 23:18:00 +00:00
+								            dayOffset -= 2
 								            used += 1
 								            if wordNext == "ontem":
 								                used += 1
 								        elif word == "ante" and wordNext == "ante" and wordNextNext == \
 								                "ontem" and not fromFlag:
 								            dayOffset -= 3
 								            used += 3
 								        elif word == "anteanteontem" and not fromFlag:
 								            dayOffset -= 3
 								            used += 1
 								        # day after tomorrow
 								        elif word == "depois" and wordNext == "amanha" and not fromFlag:
 								            dayOffset += 2
 								            used = 2
 								        # day before yesterday
 								        elif word == "antes" and wordNext == "ontem" and not fromFlag:
 								            dayOffset -= 2
 								            used = 2
 								        # parse 5 days, 10 weeks, last week, next week, week after
 								        elif word == "dia":
 								            if wordNext == "depois" or wordNext == "antes":
 								                used += 1
 								                if wordPrev and wordPrev[0].isdigit():
 								                    dayOffset += int(wordPrev)
 								                    start -= 1
 								                    used += 1
-												Fix PEP8 and unittest errors from PR 1049

PR 1049 introduced several cosmetic PEP8 errors that were easily fixed.
Additionally there are unittests that include non-ASCII characters which are
failing.  As Pt-PT support is a work-in-progress, I just commented them out
with TODOs next to them.

											
										
										
											2017-10-27 22:39:05 +00:00
+								            elif (wordPrev and wordPrev[0].isdigit() and
 								                    wordNext not in months and
 								                    wordNext not in monthsShort):
-												Pt-PT translation (#1049)

Portuguese translation helpers!  
											
										
										
											2017-10-26 23:18:00 +00:00
+								                dayOffset += int(wordPrev)
 								                start -= 1
 								                used += 2
 								            elif wordNext and wordNext[0].isdigit() and wordNextNext not in \
 								                    months and wordNextNext not in monthsShort:
 								                dayOffset += int(wordNext)
 								                start -= 1
 								                used += 2
 								        elif word == "semana" and not fromFlag:
 								            if wordPrev[0].isdigit():
 								                dayOffset += int(wordPrev) * 7
 								                start -= 1
 								                used = 2
 								            for w in nexts:
 								                if wordPrev == w:
 								                    dayOffset = 7
 								                    start -= 1
 								                    used = 2
 								            for w in lasts:
 								                if wordPrev == w:
 								                    dayOffset = -7
 								                    start -= 1
 								                    used = 2
 								            for w in suffix_nexts:
 								                if wordNext == w:
 								                    dayOffset = 7
 								                    start -= 1
 								                    used = 2
 								            for w in suffix_lasts:
 								                if wordNext == w:
 								                    dayOffset = -7
 								                    start -= 1
 								                    used = 2
 								        # parse 10 months, next month, last month
 								        elif word == "mes" and not fromFlag:
 								            if wordPrev[0].isdigit():
 								                monthOffset = int(wordPrev)
 								                start -= 1
 								                used = 2
 								            for w in nexts:
 								                if wordPrev == w:
 								                    monthOffset = 7
 								                    start -= 1
 								                    used = 2
 								            for w in lasts:
 								                if wordPrev == w:
 								                    monthOffset = -7
 								                    start -= 1
 								                    used = 2
 								            for w in suffix_nexts:
 								                if wordNext == w:
 								                    monthOffset = 7
 								                    start -= 1
 								                    used = 2
 								            for w in suffix_lasts:
 								                if wordNext == w:
 								                    monthOffset = -7
 								                    start -= 1
 								                    used = 2
 								        # parse 5 years, next year, last year
 								        elif word == "ano" and not fromFlag:
 								            if wordPrev[0].isdigit():
 								                yearOffset = int(wordPrev)
 								                start -= 1
 								                used = 2
 								            for w in nexts:
 								                if wordPrev == w:
 								                    yearOffset = 7
 								                    start -= 1
 								                    used = 2
 								            for w in lasts:
 								                if wordPrev == w:
 								                    yearOffset = -7
 								                    start -= 1
 								                    used = 2
 								            for w in suffix_nexts:
 								                if wordNext == w:
 								                    yearOffset = 7
 								                    start -= 1
 								                    used = 2
 								            for w in suffix_lasts:
 								                if wordNext == w:
 								                    yearOffset = -7
 								                    start -= 1
 								                    used = 2
 								        # parse Monday, Tuesday, etc., and next Monday,
 								        # last Tuesday, etc.
 								        elif word in days and not fromFlag:
 								            d = days.index(word)
 								            dayOffset = (d + 1) - int(today)
 								            used = 1
 								            if dayOffset < 0:
 								                dayOffset += 7
 								            for w in nexts:
 								                if wordPrev == w:
 								                    dayOffset += 7
 								                    used += 1
 								                    start -= 1
 								            for w in lasts:
 								                if wordPrev == w:
 								                    dayOffset -= 7
 								                    used += 1
 								                    start -= 1
 								            for w in suffix_nexts:
 								                if wordNext == w:
 								                    dayOffset += 7
 								                    used += 1
 								                    start -= 1
 								            for w in suffix_lasts:
 								                if wordNext == w:
 								                    dayOffset -= 7
 								                    used += 1
 								                    start -= 1
 								            if wordNext == "feira":
 								                used += 1
 								        # parse 15 of July, June 20th, Feb 18, 19 of February
 								        elif word in months or word in monthsShort:
 								            try:
 								                m = months.index(word)
 								            except ValueError:
 								                m = monthsShort.index(word)
 								            used += 1
 								            datestr = months[m]
 								            if wordPrev and wordPrev[0].isdigit():
 								                # 13 maio
 								                datestr += " " + wordPrev
 								                start -= 1
 								                used += 1
 								                if wordNext and wordNext[0].isdigit():
 								                    datestr += " " + wordNext
 								                    used += 1
 								                    hasYear = True
 								                else:
 								                    hasYear = False
 								            elif wordNext and wordNext[0].isdigit():
 								                # maio 13
 								                datestr += " " + wordNext
 								                used += 1
 								                if wordNextNext and wordNextNext[0].isdigit():
 								                    datestr += " " + wordNextNext
 								                    used += 1
 								                    hasYear = True
 								                else:
 								                    hasYear = False
 								            elif wordPrevPrev and wordPrevPrev[0].isdigit():
 								                # 13 dia maio
 								                datestr += " " + wordPrevPrev
 								                start -= 2
 								                used += 2
 								                if wordNext and word[0].isdigit():
 								                    datestr += " " + wordNext
 								                    used += 1
 								                    hasYear = True
 								                else:
 								                    hasYear = False
 								            elif wordNextNext and wordNextNext[0].isdigit():
 								                # maio dia 13
 								                datestr += " " + wordNextNext
 								                used += 2
 								                if wordNextNextNext and wordNextNextNext[0].isdigit():
 								                    datestr += " " + wordNextNextNext
 								                    used += 1
 								                    hasYear = True
 								                else:
 								                    hasYear = False
 								            if datestr in months:
 								                datestr = ""
 								        # parse 5 days from tomorrow, 10 weeks from next thursday,
 								        # 2 months from July
 								        validFollowups = days + months + monthsShort
 								        validFollowups.append("hoje")
 								        validFollowups.append("amanha")
 								        validFollowups.append("ontem")
 								        validFollowups.append("anteontem")
 								        validFollowups.append("agora")
 								        validFollowups.append("ja")
 								        validFollowups.append("ante")
 								        # TODO debug word "depois" that one is failing for some reason
 								        if word in froms and wordNext in validFollowups:
 								            if not (wordNext == "amanha" and wordNext == "ontem") and not (
-												Fix PEP8 and unittest errors from PR 1049

PR 1049 introduced several cosmetic PEP8 errors that were easily fixed.
Additionally, some unit tests that include non-ASCII characters are
failing.  As Pt-PT support is a work in progress, I just commented them out
with TODOs next to them.

											
										
										
											2017-10-27 22:39:05 +00:00
+								                    word == "depois" or word == "antes" or word == "em"):
-												Pt-PT translation (#1049)

Portuguese translation helpers!  
											
										
										
											2017-10-26 23:18:00 +00:00
+								                used = 2
 								                fromFlag = True
 								            if wordNext == "amanha" and word != "depois":
 								                dayOffset += 1
 								            elif wordNext == "ontem":
 								                dayOffset -= 1
 								            elif wordNext == "anteontem":
 								                dayOffset -= 2
 								            elif wordNext == "ante" and wordNextNext == "ontem":
 								                dayOffset -= 2
-												Fix PEP8 and unittest errors from PR 1049

PR 1049 introduced several cosmetic PEP8 errors that were easily fixed.
Additionally, some unit tests that include non-ASCII characters are
failing.  As Pt-PT support is a work in progress, I just commented them out
with TODOs next to them.

											
										
										
											2017-10-27 22:39:05 +00:00
+								            elif (wordNext == "ante" and wordNext == "ante" and
 								                  wordNextNextNext == "ontem"):
-												Pt-PT translation (#1049)

Portuguese translation helpers!  
											
										
										
											2017-10-26 23:18:00 +00:00
+								                dayOffset -= 3
 								            elif wordNext in days:
 								                d = days.index(wordNext)
 								                tmpOffset = (d + 1) - int(today)
 								                used = 2
 								                if wordNextNext == "feira":
 								                    used += 1
 								                if tmpOffset < 0:
 								                    tmpOffset += 7
 								                if wordNextNext:
 								                    if wordNextNext in nxts:
 								                        tmpOffset += 7
 								                        used += 1
 								                    elif wordNextNext in prevs:
 								                        tmpOffset -= 7
 								                        used += 1
 								                dayOffset += tmpOffset
 								            elif wordNextNext and wordNextNext in days:
 								                d = days.index(wordNextNext)
 								                tmpOffset = (d + 1) - int(today)
 								                used = 3
 								                if wordNextNextNext:
 								                    if wordNextNextNext in nxts:
 								                        tmpOffset += 7
 								                        used += 1
 								                    elif wordNextNextNext in prevs:
 								                        tmpOffset -= 7
 								                        used += 1
 								                dayOffset += tmpOffset
 								                if wordNextNextNext == "feira":
 								                    used += 1
 								        if wordNext in months:
 								            used -= 1
 								        if used > 0:
 								            if start - 1 > 0 and words[start - 1] in lists:
 								                start -= 1
 								                used += 1
 								            for i in range(0, used):
 								                words[i + start] = ""
 								            if (start - 1 >= 0 and words[start - 1] in lists):
 								                words[start - 1] = ""
 								            found = True
 								            daySpecified = True
 								    # parse time
 								    timeStr = ""
 								    hrOffset = 0
 								    minOffset = 0
 								    secOffset = 0
 								    hrAbs = 0
 								    minAbs = 0
 								    military = False
 								    for idx, word in enumerate(words):
 								        if word == "":
 								            continue
 								        wordPrevPrev = words[idx - 2] if idx > 1 else ""
 								        wordPrev = words[idx - 1] if idx > 0 else ""
 								        wordNext = words[idx + 1] if idx + 1 < len(words) else ""
 								        wordNextNext = words[idx + 2] if idx + 2 < len(words) else ""
 								        wordNextNextNext = words[idx + 3] if idx + 3 < len(words) else ""
 								        # parse noon, midnight, morning, afternoon, evening
 								        used = 0
 								        if word == "meio" and wordNext == "dia":
 								            hrAbs = 12
 								            used += 2
 								        elif word == "meia" and wordNext == "noite":
 								            hrAbs = 0
 								            used += 2
 								        elif word == "manha":
 								            if hrAbs == 0:
 								                hrAbs = 8
 								            used += 1
 								        elif word == "tarde":
 								            if hrAbs == 0:
 								                hrAbs = 15
 								            used += 1
 								        elif word == "meio" and wordNext == "tarde":
 								            if hrAbs == 0:
 								                hrAbs = 17
 								            used += 2
 								        elif word == "meio" and wordNext == "manha":
 								            if hrAbs == 0:
 								                hrAbs = 10
 								            used += 2
 								        elif word == "fim" and wordNext == "tarde":
 								            if hrAbs == 0:
 								                hrAbs = 19
 								            used += 2
 								        elif word == "fim" and wordNext == "manha":
 								            if hrAbs == 0:
 								                hrAbs = 11
 								            used += 2
 								        elif word == "tantas" and wordNext == "manha":
 								            if hrAbs == 0:
 								                hrAbs = 4
 								            used += 2
 								        elif word == "noite":
 								            if hrAbs == 0:
 								                hrAbs = 22
 								            used += 1
 								        # parse half an hour, quarter hour
 								        elif word == "hora" and \
 								                (wordPrev in time_indicators or wordPrevPrev in
 								                    time_indicators):
 								            if wordPrev == "meia":
 								                minOffset = 30
 								            elif wordPrev == "quarto":
 								                minOffset = 15
 								            elif wordPrevPrev == "quarto":
 								                minOffset = 15
 								                if idx > 2 and words[idx - 3] in time_indicators:
 								                    words[idx - 3] = ""
 								                words[idx - 2] = ""
 								            else:
 								                hrOffset = 1
 								            if wordPrevPrev in time_indicators:
 								                words[idx - 2] = ""
 								            words[idx - 1] = ""
 								            used += 1
 								            hrAbs = -1
 								            minAbs = -1
 								        # parse 5:00 am, 12:00 p.m., etc
 								        elif word[0].isdigit():
 								            isTime = True
 								            strHH = ""
 								            strMM = ""
 								            remainder = ""
 								            if ':' in word:
 								                # parse colons
 								                # "3:00 in the morning"
 								                stage = 0
 								                length = len(word)
 								                for i in range(length):
 								                    if stage == 0:
 								                        if word[i].isdigit():
 								                            strHH += word[i]
 								                        elif word[i] == ":":
 								                            stage = 1
 								                        else:
 								                            stage = 2
 								                            i -= 1
 								                    elif stage == 1:
 								                        if word[i].isdigit():
 								                            strMM += word[i]
 								                        else:
 								                            stage = 2
 								                            i -= 1
 								                    elif stage == 2:
 								                        remainder = word[i:].replace(".", "")
 								                        break
 								                if remainder == "":
 								                    nextWord = wordNext.replace(".", "")
 								                    if nextWord == "am" or nextWord == "pm":
 								                        remainder = nextWord
 								                        used += 1
 								                    elif wordNext == "manha":
 								                        remainder = "am"
 								                        used += 1
 								                    elif wordNext == "tarde":
 								                        remainder = "pm"
 								                        used += 1
 								                    elif wordNext == "noite":
 								                        if 0 < int(word[0]) < 6:
 								                            remainder = "am"
 								                        else:
 								                            remainder = "pm"
 								                        used += 1
 								                    elif wordNext in thises and wordNextNext == "manha":
 								                        remainder = "am"
 								                        used = 2
 								                    elif wordNext in thises and wordNextNext == "tarde":
 								                        remainder = "pm"
 								                        used = 2
 								                    elif wordNext in thises and wordNextNext == "noite":
 								                        remainder = "pm"
 								                        used = 2
 								                    else:
 								                        if timeQualifier != "":
 								                            military = True
 								                            if strHH <= 12 and \
 								                                    (timeQualifier == "manha" or
-												Fix PEP8 and unittest errors from PR 1049

PR 1049 introduced several cosmetic PEP8 errors that were easily fixed.
Additionally, some unit tests that include non-ASCII characters are
failing.  As Pt-PT support is a work in progress, I just commented them out
with TODOs next to them.

											
										
										
											2017-10-27 22:39:05 +00:00
+								                                     timeQualifier == "tarde"):
-												Pt-PT translation (#1049)

Portuguese translation helpers!  
											
										
										
											2017-10-26 23:18:00 +00:00
+								                                strHH += 12
 								            else:
 								                # try to parse numbers without colons
 								                # 5 hours, 10 minutes etc.
 								                length = len(word)
 								                strNum = ""
 								                remainder = ""
 								                for i in range(length):
 								                    if word[i].isdigit():
 								                        strNum += word[i]
 								                    else:
 								                        remainder += word[i]
 								                if remainder == "":
 								                    remainder = wordNext.replace(".", "").lstrip().rstrip()
 								                if (
-												Fix PEP8 and unittest errors from PR 1049

PR 1049 introduced several cosmetic PEP8 errors that were easily fixed.
Additionally, some unit tests that include non-ASCII characters are
failing.  As Pt-PT support is a work in progress, I just commented them out
with TODOs next to them.

											
										
										
											2017-10-27 22:39:05 +00:00
+								                        remainder == "pm" or
 								                        wordNext == "pm" or
 								                        remainder == "p.m." or
 								                        wordNext == "p.m."):
-												Pt-PT translation (#1049)

Portuguese translation helpers!  
											
										
										
											2017-10-26 23:18:00 +00:00
+								                    strHH = strNum
 								                    remainder = "pm"
 								                    used = 1
 								                elif (
-												Fix PEP8 and unittest errors from PR 1049

PR 1049 introduced several cosmetic PEP8 errors that were easily fixed.
Additionally, some unit tests that include non-ASCII characters are
failing.  As Pt-PT support is a work in progress, I just commented them out
with TODOs next to them.

											
										
										
											2017-10-27 22:39:05 +00:00
+								                        remainder == "am" or
 								                        wordNext == "am" or
 								                        remainder == "a.m." or
 								                        wordNext == "a.m."):
-												Pt-PT translation (#1049)

Portuguese translation helpers!  
											
										
										
											2017-10-26 23:18:00 +00:00
+								                    strHH = strNum
 								                    remainder = "am"
 								                    used = 1
 								                else:
-												Fix PEP8 and unittest errors from PR 1049

PR 1049 introduced several cosmetic PEP8 errors that were easily fixed.
Additionally, some unit tests that include non-ASCII characters are
failing.  As Pt-PT support is a work in progress, I just commented them out
with TODOs next to them.

											
										
										
											2017-10-27 22:39:05 +00:00
+								                    if (wordNext == "pm" or
 								                            wordNext == "p.m." or
 								                            wordNext == "tarde"):
-												Pt-PT translation (#1049)

Portuguese translation helpers!  
											
										
										
											2017-10-26 23:18:00 +00:00
+								                        strHH = strNum
 								                        remainder = "pm"
 								                        used = 1
-												Fix PEP8 and unittest errors from PR 1049

PR 1049 introduced several cosmetic PEP8 errors that were easily fixed.
Additionally, some unit tests that include non-ASCII characters are
failing.  As Pt-PT support is a work in progress, I just commented them out
with TODOs next to them.

											
										
										
											2017-10-27 22:39:05 +00:00
+								                    elif (wordNext == "am" or
 								                          wordNext == "a.m." or
 								                          wordNext == "manha"):
-												Pt-PT translation (#1049)

Portuguese translation helpers!  
											
										
										
											2017-10-26 23:18:00 +00:00
+								                        strHH = strNum
 								                        remainder = "am"
 								                        used = 1
-												Fix PEP8 and unittest errors from PR 1049

PR 1049 introduced several cosmetic PEP8 errors that were easily fixed.
Additionally, some unit tests that include non-ASCII characters are
failing.  As Pt-PT support is a work in progress, I just commented them out
with TODOs next to them.

											
										
										
											2017-10-27 22:39:05 +00:00
+								                    elif (int(word) > 100 and
 								                            (
 								                                wordPrev == "o" or
 								                                wordPrev == "oh" or
 								                                wordPrev == "zero"
 								                            )):
-												Pt-PT translation (#1049)

Portuguese translation helpers!  
											
										
										
											2017-10-26 23:18:00 +00:00
+								                        # 0800 hours (pronounced oh-eight-hundred)
 								                        strHH = int(word) / 100
 								                        strMM = int(word) - strHH * 100
 								                        military = True
 								                        if wordNext == "hora":
 								                            used += 1
 								                    elif (
-												Fix PEP8 and unittest errors from PR 1049

PR 1049 introduced several cosmetic PEP8 errors that were easily fixed.
Additionally, some unit tests that include non-ASCII characters are
failing.  As Pt-PT support is a work in progress, I just commented them out
with TODOs next to them.

											
										
										
											2017-10-27 22:39:05 +00:00
+								                            wordNext == "hora" and
 								                            word[0] != '0' and
 								                            (
 								                                int(word) < 100 and
 								                                int(word) > 2400
 								                            )):
-												Pt-PT translation (#1049)

Portuguese translation helpers!  
											
										
										
											2017-10-26 23:18:00 +00:00
+								                        # ignores military time
 								                        # "in 3 hours"
 								                        hrOffset = int(word)
 								                        used = 2
 								                        isTime = False
 								                        hrAbs = -1
 								                        minAbs = -1
 								                    elif wordNext == "minuto":
 								                        # "in 10 minutes"
 								                        minOffset = int(word)
 								                        used = 2
 								                        isTime = False
 								                        hrAbs = -1
 								                        minAbs = -1
 								                    elif wordNext == "segundo":
 								                        # in 5 seconds
 								                        secOffset = int(word)
 								                        used = 2
 								                        isTime = False
 								                        hrAbs = -1
 								                        minAbs = -1
 								                    elif int(word) > 100:
 								                        strHH = int(word) / 100
 								                        strMM = int(word) - strHH * 100
 								                        military = True
 								                        if wordNext == "hora":
 								                            used += 1
 								                    elif wordNext == "" or (
-												Fix PEP8 and unittest errors from PR 1049

PR 1049 introduced several cosmetic PEP8 errors that were easily fixed.
Additionally, some unit tests that include non-ASCII characters are
failing.  As Pt-PT support is a work in progress, I just commented them out
with TODOs next to them.

											
										
										
											2017-10-27 22:39:05 +00:00
+								                            wordNext == "em" and wordNextNext == "ponto"):
-												Pt-PT translation (#1049)

Portuguese translation helpers!  
											
										
										
											2017-10-26 23:18:00 +00:00
+								                        strHH = word
 								                        strMM = 00
 								                        if wordNext == "em" and wordNextNext == "ponto":
 								                            used += 2
 								                            if wordNextNextNext == "tarde":
 								                                remainder = "pm"
 								                                used += 1
 								                            elif wordNextNextNext == "manha":
 								                                remainder = "am"
 								                                used += 1
 								                            elif wordNextNextNext == "noite":
 								                                if 0 > strHH > 6:
 								                                    remainder = "am"
 								                                else:
 								                                    remainder = "pm"
 								                                used += 1
 								                    elif wordNext[0].isdigit():
 								                        strHH = word
 								                        strMM = wordNext
 								                        military = True
 								                        used += 1
 								                        if wordNextNext == "hora":
 								                            used += 1
 								                    else:
 								                        isTime = False
 								            strHH = int(strHH) if strHH else 0
 								            strMM = int(strMM) if strMM else 0
-												Fix PEP8 and unittest errors from PR 1049

PR 1049 introduced several cosmetic PEP8 errors that were easily fixed.
Additionally, some unit tests that include non-ASCII characters are
failing.  As Pt-PT support is a work in progress, I just commented them out
with TODOs next to them.

											
										
										
											2017-10-27 22:39:05 +00:00
+								            strHH = strHH + 12 if (remainder == "pm" and
 < strHH < 12) else strHH
 								            strHH = strHH - 12 if (remainder == "am" and
 < strHH >= 12) else strHH
-												Pt-PT translation (#1049)

Portuguese translation helpers!  
											
										
										
											2017-10-26 23:18:00 +00:00
+								            if strHH > 24 or strMM > 59:
 								                isTime = False
 								                used = 0
 								            if isTime:
 								                hrAbs = strHH * 1
 								                minAbs = strMM * 1
 								                used += 1
 								        if used > 0:
 								            # removed parsed words from the sentence
 								            for i in range(used):
 								                words[idx + i] = ""
 								            if wordPrev == "em" or wordPrev == "ponto":
 								                words[words.index(wordPrev)] = ""
 								            if idx > 0 and wordPrev in time_indicators:
 								                words[idx - 1] = ""
 								            if idx > 1 and wordPrevPrev in time_indicators:
 								                words[idx - 2] = ""
 								            idx += used - 1
 								            found = True
 								    # check that we found a date
 								    if not date_found:
 								        return None
 								    if dayOffset is False:
 								        dayOffset = 0
 								    # perform date manipulation
 								    extractedDate = dateNow
 								    extractedDate = extractedDate.replace(microsecond=0,
 								                                          second=0,
 								                                          minute=0,
 								                                          hour=0)
 								    if datestr != "":
 								        en_months = ['january', 'february', 'march', 'april', 'may', 'june',
 								                     'july', 'august', 'september', 'october', 'november',
 								                     'december']
 								        en_monthsShort = ['jan', 'feb', 'mar', 'apr', 'may', 'june', 'july',
 								                          'aug',
 								                          'sept', 'oct', 'nov', 'dec']
 								        for idx, en_month in enumerate(en_months):
 								            datestr = datestr.replace(months[idx], en_month)
 								        for idx, en_month in enumerate(en_monthsShort):
 								            datestr = datestr.replace(monthsShort[idx], en_month)
 								        temp = datetime.strptime(datestr, "%B %d")
 								        if not hasYear:
 								            temp = temp.replace(year=extractedDate.year)
 								            if extractedDate < temp:
 								                extractedDate = extractedDate.replace(year=int(currentYear),
 								                                                      month=int(
 								                                                          temp.strftime(
 								                                                              "%m")),
 								                                                      day=int(temp.strftime(
-												Fix PEP8 and unittest errors from PR 1049

PR 1049 introduced several cosmetic PEP8 errors that were easily fixed.
Additionally, some unit tests that include non-ASCII characters are
failing.  As Pt-PT support is a work in progress, I just commented them out
with TODOs next to them.

											
										
										
											2017-10-27 22:39:05 +00:00
+								                                                          "%d")))
-												Pt-PT translation (#1049)

Portuguese translation helpers!  
											
										
										
											2017-10-26 23:18:00 +00:00
+								            else:
 								                extractedDate = extractedDate.replace(
 								                    year=int(currentYear) + 1,
-												Fix PEP8 and unittest errors from PR 1049

PR 1049 introduced several cosmetic PEP8 errors that were easily fixed.
Additionally, some unit tests that include non-ASCII characters are
failing.  As Pt-PT support is a work in progress, I just commented them out
with TODOs next to them.

											
										
										
											2017-10-27 22:39:05 +00:00
+								                    month=int(temp.strftime("%m")),
 								                    day=int(temp.strftime("%d")))
-												Pt-PT translation (#1049)

Portuguese translation helpers!  
											
										
										
											2017-10-26 23:18:00 +00:00
+								        else:
 								            extractedDate = extractedDate.replace(
 								                year=int(temp.strftime("%Y")),
-												Fix PEP8 and unittest errors from PR 1049

PR 1049 introduced several cosmetic PEP8 errors that were easily fixed.
Additionally, some unit tests that include non-ASCII characters are
failing.  As Pt-PT support is a work in progress, I just commented them out
with TODOs next to them.

											
										
										
											2017-10-27 22:39:05 +00:00
+								                month=int(temp.strftime("%m")),
-												Pt-PT translation (#1049)

Portuguese translation helpers!  
											
										
										
											2017-10-26 23:18:00 +00:00
+								                day=int(temp.strftime("%d")))
 								    if timeStr != "":
 								        temp = datetime(timeStr)
 								        extractedDate = extractedDate.replace(hour=temp.strftime("%H"),
 								                                              minute=temp.strftime("%M"),
 								                                              second=temp.strftime("%S"))
 								    if yearOffset != 0:
 								        extractedDate = extractedDate + relativedelta(years=yearOffset)
 								    if monthOffset != 0:
 								        extractedDate = extractedDate + relativedelta(months=monthOffset)
 								    if dayOffset != 0:
 								        extractedDate = extractedDate + relativedelta(days=dayOffset)
 								    if hrAbs != -1 and minAbs != -1:
 								        extractedDate = extractedDate + relativedelta(hours=hrAbs,
 								                                                      minutes=minAbs)
 								        if (hrAbs != 0 or minAbs != 0) and datestr == "":
 								            if not daySpecified and dateNow > extractedDate:
 								                extractedDate = extractedDate + relativedelta(days=1)
 								    if hrOffset != 0:
 								        extractedDate = extractedDate + relativedelta(hours=hrOffset)
 								    if minOffset != 0:
 								        extractedDate = extractedDate + relativedelta(minutes=minOffset)
 								    if secOffset != 0:
 								        extractedDate = extractedDate + relativedelta(seconds=secOffset)
 								    resultStr = " ".join(words)
 								    resultStr = ' '.join(resultStr.split())
 								    resultStr = pt_pruning(resultStr)
 								    return [extractedDate, resultStr]
 								def pt_pruning(text, symbols=True, accents=True, agressive=True):
 								    """Prune Portuguese text down to its content words.

 								    Args:
 								        text (str): the text to prune
 								        symbols (bool): remove punctuation and ordinal marks, and turn
 								                        "-" and "_" into spaces
 								        accents (bool): replace accented letters with unaccented ones
 								        agressive (bool): drop common articles, prepositions and
 								                          pronouns, then collapse runs of whitespace
 								    Returns:
 								        (str): the pruned text
 								    """
 								    # agressive pt word pruning
 								    stop_words = ("a", "o", "os", "as", "de", "dos", "das",
 								                  "lhe", "lhes", "me", "e", "no", "nas", "na", "nos",
 								                  "em", "para", "este", "esta", "deste", "desta",
 								                  "neste", "nesta", "nesse", "nessa", "foi", "que")
 								    # Non-ASCII characters are written as unicode escapes so the literals
 								    # do not depend on how the source file is encoded or decoded.
 								    # NOTE(review): the previous literals were unreadable (mojibake); the
 								    # characters below are a reconstruction -- confirm against upstream.
 								    if symbols:
 								        # punctuation plus the masculine/feminine ordinal indicators
 								        for symbol in (".", ",", ";", ":", "!", "?",
 								                       u"\u00ba", u"\u00aa"):
 								            text = text.replace(symbol, "")
 								        text = text.replace("-", " ").replace("_", " ")
 								    if accents:
 								        unaccented = {
 								            "a": (u"\u00e1", u"\u00e0", u"\u00e3", u"\u00e2"),
 								            "e": (u"\u00ea", u"\u00e8", u"\u00e9"),
 								            "i": (u"\u00ed", u"\u00ec"),
 								            "o": (u"\u00f2", u"\u00f3"),
 								            "u": (u"\u00fa", u"\u00f9"),
 								            "c": (u"\u00e7",),
 								        }
 								        for plain, variants in unaccented.items():
 								            for variant in variants:
 								                text = text.replace(variant, plain)
 								    if agressive:
 								        # Tokens are taken by splitting on single spaces (as before), so
 								        # only space-delimited stop words are dropped.
 								        kept = [word for word in text.split(" ")
 								                if word not in stop_words]
 								        # split()/join collapses whatever whitespace is left over
 								        text = " ".join(" ".join(kept).split())
 								    return text
 								def get_gender_pt(word, raw_string=""):
 								    """Guess the grammatical gender of a Portuguese word.

 								    If the word (with trailing "s" removed) appears in raw_string after
 								    another token, the gender guessed for that preceding token is used.
 								    Otherwise the guess is based on the word's last letter.

 								    Args:
 								        word (str): the word to classify
 								        raw_string (str): space-separated sentence containing the word
 								    Returns:
 								        "f", "m", or False when no guess can be made
 								    """
 								    word = word.rstrip("s")
 								    gender = False
 								    words = raw_string.split(" ")
 								    for idx, w in enumerate(words):
 								        if w == word and idx != 0:
 								            # the preceding token (e.g. an article) decides the gender
 								            gender = get_gender_pt(words[idx - 1])
 								            break
 								    # An empty word (e.g. "s" after stripping, or an empty token coming
 								    # from a double space) has no last letter to inspect.
 								    if not gender and word:
 								        if word.endswith("a"):
 								            gender = "f"
 								        elif word.endswith(("o", "e")):
 								            gender = "m"
 								    return gender
-												+

											
										
										
											2017-05-05 11:27:35 +00:00
+								####################################################################
 								# Spanish normalization
 								#
 								# TODO: numbers greater than 999999
 								####################################################################
 								# Indefinite articles ["un", "una", "unos", "unas"] cannot be suppressed:
 								# in Spanish, "un caballo" means "a horse" or "one horse".
 								# Definite articles that normalization may drop.
 								es_articles = ["el", "la", "los", "las"]
 								# Spanish number words mapped to their integer value.
 								# Accented spellings are written as unicode escapes so the keys do not
 								# depend on how the source file is encoded or decoded.
 								# NOTE(review): the accented keys were unreadable (mojibake) and have been
 								# reconstructed from the unaccented spellings -- confirm against upstream.
 								es_numbers_xlat = {
 								    "un": 1,
 								    "uno": 1,
 								    "una": 1,
 								    "dos": 2,
 								    "tres": 3,
 								    u"tr\u00e9s": 3,
 								    "cuatro": 4,
 								    "cinco": 5,
 								    "seis": 6,
 								    "siete": 7,
 								    "ocho": 8,
 								    "nueve": 9,
 								    "diez": 10,
 								    "once": 11,
 								    "doce": 12,
 								    "trece": 13,
 								    "catorce": 14,
 								    "quince": 15,
 								    "dieciseis": 16,
 								    u"diecis\u00e9is": 16,
 								    "diecisiete": 17,
 								    "dieciocho": 18,
 								    "diecinueve": 19,
 								    "veinte": 20,
 								    "veintiuno": 21,
 								    u"veintid\u00f3s": 22,
 								    u"veintitr\u00e9s": 23,
 								    "veintidos": 22,
 								    "veintitres": 23,
 								    "veinticuatro": 24,
 								    "veinticinco": 25,
 								    u"veintis\u00e9is": 26,
 								    "veintiseis": 26,
 								    "veintisiete": 27,
 								    "veintiocho": 28,
 								    "veintinueve": 29,
 								    "treinta": 30,
 								    "cuarenta": 40,
 								    "cincuenta": 50,
 								    "sesenta": 60,
 								    "setenta": 70,
 								    "ochenta": 80,
 								    "noventa": 90,
 								    "cien": 100,
 								    "ciento": 100,
 								    "doscientos": 200,
 								    "doscientas": 200,
 								    "trescientos": 300,
 								    "trescientas": 300,
 								    "cuatrocientos": 400,
 								    "cuatrocientas": 400,
 								    "quinientos": 500,
 								    "quinientas": 500,
 								    "seiscientos": 600,
 								    "seiscientas": 600,
 								    "setecientos": 700,
 								    "setecientas": 700,
 								    "ochocientos": 800,
 								    "ochocientas": 800,
 								    "novecientos": 900,
 								    "novecientas": 900}
 								def es_parse(words, i):
 								    """Parse a spelled-out Spanish number starting at words[i].

 								    Recognizes "cero" and the numbers 1-999, optionally followed by "mil"
 								    and another 1-999 group.

 								    Args:
 								        words (list): tokens of the sentence
 								        i (int): index of the first token to examine
 								    Returns:
 								        (tuple): (value, index just past the number), or None when no
 								                 number starts at words[i]
 								    """
 								    def es_cte(i, s):
 								        # the literal token s at position i
 								        if i < len(words) and s == words[i]:
 								            return s, i + 1
 								        return None

 								    def es_number_word(i, mi, ma):
 								        # a single number word whose value lies within [mi, ma]
 								        if i < len(words):
 								            v = es_numbers_xlat.get(words[i])
 								            if v and mi <= v <= ma:
 								                return v, i + 1
 								        return None

 								    def es_number_1_99(i):
 								        # [1-29]
 								        r1 = es_number_word(i, 1, 29)
 								        if r1:
 								            return r1
 								        # [30-90] ("y" [1-9])?
 								        r1 = es_number_word(i, 30, 90)
 								        if r1:
 								            v1, i1 = r1
 								            r2 = es_cte(i1, "y")
 								            if r2:
 								                r3 = es_number_word(r2[1], 1, 9)
 								                if r3:
 								                    v3, i3 = r3
 								                    return v1 + v3, i3
 								            # a dangling "y" is not consumed
 								            return r1
 								        return None

 								    def es_number_1_999(i):
 								        # [2-9]cientos [1-99]?
 								        r1 = es_number_word(i, 100, 900)
 								        if r1:
 								            v1, i1 = r1
 								            r2 = es_number_1_99(i1)
 								            if r2:
 								                v2, i2 = r2
 								                return v1 + v2, i2
 								            return r1
 								        # [1-99]
 								        return es_number_1_99(i)

 								    def es_number(i):
 								        # check for cero: matched as a literal, because es_number_word
 								        # treats a value of 0 as "no match" (truthiness test on v)
 								        r1 = es_cte(i, "cero")
 								        if r1:
 								            return 0, r1[1]
 								        # check for [1-999] (mil [0-999])?
 								        r1 = es_number_1_999(i)
 								        if r1:
 								            v1, i1 = r1
 								            r2 = es_cte(i1, "mil")
 								            if r2:
 								                i2 = r2[1]
 								                r3 = es_number_1_999(i2)
 								                if r3:
 								                    v3, i3 = r3
 								                    return v1 * 1000 + v3, i3
 								                return v1 * 1000, i2
 								            return r1
 								        return None

 								    return es_number(i)
-												normalize_es

											
										
										
											2017-05-03 09:37:00 +00:00
 								def normalize_es(text, remove_articles):
 								    """ Spanish string normalization

 								    Collapses whitespace, optionally drops the articles listed in
 								    es_articles, and replaces spelled-out numbers recognized by es_parse
 								    with digits.

 								    Args:
 								        text (str): the text to normalize
 								        remove_articles (bool): whether to drop words in es_articles
 								    Returns:
 								        (str): the normalized text, words separated by single spaces
 								    """
 								    words = text.split()  # this also removes extra spaces
 								    pieces = []
 								    i = 0
 								    while i < len(words):
 								        word = words[i]
 								        if remove_articles and word in es_articles:
 								            i += 1
 								            continue
 								        # Convert numbers into digits
 								        r = es_parse(words, i)
 								        if r:
 								            # es_parse reports where the number ends; resume from there
 								            v, i = r
 								            pieces.append(str(v))
 								            continue
 								        pieces.append(word)
 								        i += 1
 								    return " ".join(pieces)