diff --git a/mycroft/util/format.py b/mycroft/util/format.py index b23b888cdb..6c655ffda0 100755 --- a/mycroft/util/format.py +++ b/mycroft/util/format.py @@ -1,5 +1,3 @@ -# -*- coding: iso-8859-15 -*- -# # Copyright 2017 Mycroft AI Inc. # # Licensed under the Apache License, Version 2.0 (the "License"); @@ -14,52 +12,10 @@ # See the License for the specific language governing permissions and # limitations under the License. # -FRACTION_STRING_EN = { - 2: 'half', - 3: 'third', - 4: 'forth', - 5: 'fifth', - 6: 'sixth', - 7: 'seventh', - 8: 'eigth', - 9: 'ninth', - 10: 'tenth', - 11: 'eleventh', - 12: 'twelveth', - 13: 'thirteenth', - 14: 'fourteenth', - 15: 'fifteenth', - 16: 'sixteenth', - 17: 'seventeenth', - 18: 'eighteenth', - 19: 'nineteenth', - 20: 'twentyith' -} -FRACTION_STRING_PT = { - 2: 'meio', - 3: u'terço', - 4: 'quarto', - 5: 'quinto', - 6: 'sexto', - 7: u'sétimo', - 8: 'oitavo', - 9: 'nono', - 10: u'décimo', - 11: 'onze avos', - 12: 'doze avos', - 13: 'treze avos', - 14: 'catorze avos', - 15: 'quinze avos', - 16: 'dezasseis avos', - 17: 'dezassete avos', - 18: 'dezoito avos', - 19: 'dezanove avos', - 20: u'vigésimo', - 30: u'trigésimo', - 100: u'centésimo', - 1000: u'milésimo' -} +from mycroft.util.lang.format_en import * +from mycroft.util.lang.format_es import * +from mycroft.util.lang.format_pt import * def nice_number(number, lang="en-us", speech=True, denominators=None): @@ -95,55 +51,6 @@ def nice_number(number, lang="en-us", speech=True, denominators=None): return str(number) -def nice_number_en(result): - """ English conversion for nice_number """ - whole, num, den = result - if num == 0: - return str(whole) - den_str = FRACTION_STRING_EN[den] - if whole == 0: - if num == 1: - return_string = 'a {}'.format(den_str) - else: - return_string = '{} {}'.format(num, den_str) - elif num == 1: - return_string = '{} and a {}'.format(whole, den_str) - else: - return_string = '{} and {} {}'.format(whole, num, den_str) - if num > 1: - return_string += 's' 
- return return_string - - -def nice_number_pt(result): - """ Portuguese conversion for nice_number """ - whole, num, den = result - if num == 0: - return str(whole) - # denominador - den_str = FRACTION_STRING_PT[den] - # fracções - if whole == 0: - if num == 1: - # um décimo - return_string = 'um {}'.format(den_str) - else: - # três meio - return_string = '{} {}'.format(num, den_str) - # inteiros >10 - elif num == 1: - # trinta e um - return_string = '{} e {}'.format(whole, den_str) - # inteiros >10 com fracções - else: - # vinte e 3 décimo - return_string = '{} e {} {}'.format(whole, num, den_str) - # plural - if num > 1: - return_string += 's' - return return_string - - def convert_number(number, denominators): """ Convert floats to mixed fractions """ int_number = int(number) diff --git a/mycroft/util/lang/__init__.py b/mycroft/util/lang/__init__.py new file mode 100644 index 0000000000..b5d9e70242 --- /dev/null +++ b/mycroft/util/lang/__init__.py @@ -0,0 +1,14 @@ +# Copyright 2017 Mycroft AI Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# diff --git a/mycroft/util/lang/format_en.py b/mycroft/util/lang/format_en.py new file mode 100644 index 0000000000..a99343af6f --- /dev/null +++ b/mycroft/util/lang/format_en.py @@ -0,0 +1,59 @@ +# -*- coding: utf-8 -*- +# +# Copyright 2017 Mycroft AI Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
# English names of the fraction denominators spoken by nice_number_en.
# The spellings match the ones the English parser (isFractional_en) accepts.
FRACTION_STRING_EN = {
    2: 'half',
    3: 'third',
    4: 'fourth',
    5: 'fifth',
    6: 'sixth',
    7: 'seventh',
    8: 'eighth',
    9: 'ninth',
    10: 'tenth',
    11: 'eleventh',
    12: 'twelfth',
    13: 'thirteenth',
    14: 'fourteenth',
    15: 'fifteenth',
    16: 'sixteenth',
    17: 'seventeenth',
    18: 'eighteenth',
    19: 'nineteenth',
    20: 'twentieth'
}


def nice_number_en(result):
    """ English conversion for nice_number

    Args:
        result (tuple): (whole, numerator, denominator) mixed fraction
    Returns:
        (str): spoken form, e.g. (1, 1, 2) -> "1 and a half",
               (0, 3, 4) -> "3 fourths", (2, 0, 5) -> "2"
    Raises:
        KeyError: if the denominator has no entry in FRACTION_STRING_EN
    """
    whole, num, den = result
    if num == 0:
        return str(whole)

    den_str = FRACTION_STRING_EN[den]
    if num > 1:
        # "half" is the only irregular plural in the table
        den_str = 'halves' if den_str == 'half' else den_str + 's'
        if whole == 0:
            return '{} {}'.format(num, den_str)
        return '{} and {} {}'.format(whole, num, den_str)

    # single part: "a third", but "an eighth" / "an eleventh"
    article = 'an' if den_str[0] in 'aeiou' else 'a'
    if whole == 0:
        return '{} {}'.format(article, den_str)
    return '{} and {} {}'.format(whole, article, den_str)
# Portuguese names of the fraction denominators spoken by nice_number_pt.
# Entries from 11 to 19 use the "<number> avos" form, which is already plural.
FRACTION_STRING_PT = {
    2: u'meio',
    3: u'terço',
    4: u'quarto',
    5: u'quinto',
    6: u'sexto',
    7: u'sétimo',
    8: u'oitavo',
    9: u'nono',
    10: u'décimo',
    11: u'onze avos',
    12: u'doze avos',
    13: u'treze avos',
    14: u'catorze avos',
    15: u'quinze avos',
    16: u'dezasseis avos',
    17: u'dezassete avos',
    18: u'dezoito avos',
    19: u'dezanove avos',
    20: u'vigésimo',
    30: u'trigésimo',
    100: u'centésimo',
    1000: u'milésimo'
}


def nice_number_pt(result):
    """ Portuguese conversion for nice_number

    Args:
        result (tuple): (whole, numerator, denominator) mixed fraction
    Returns:
        (str): spoken form, e.g. (0, 1, 10) -> "um décimo",
               (2, 3, 10) -> "2 e 3 décimos", (5, 0, 3) -> "5"
    Raises:
        KeyError: if the denominator has no entry in FRACTION_STRING_PT
    """
    whole, num, den = result
    if num == 0:
        return str(whole)

    # denominator name
    den_str = FRACTION_STRING_PT[den]
    # plural: "... avos" is already plural, so never append a second "s"
    if num > 1 and not den_str.endswith(u's'):
        den_str += u's'

    # The templates are unicode literals so that formatting the accented
    # denominators also works on Python 2 (a byte-string template would try
    # to encode them as ASCII).
    if whole == 0:
        if num == 1:
            # e.g. "um décimo"
            return u'um {}'.format(den_str)
        # e.g. "3 meios"
        return u'{} {}'.format(num, den_str)
    if num == 1:
        # whole number plus a single part, e.g. "2 e meio"
        return u'{} e {}'.format(whole, den_str)
    # whole number plus several parts, e.g. "20 e 3 décimos"
    return u'{} e {} {}'.format(whole, num, den_str)
from datetime import datetime, timedelta


# ---------------------------------------------------------------------------
# Language independent helpers (parse_common)
# ---------------------------------------------------------------------------

def is_numeric(input_str):
    """
    Takes in a string and tests to see if it is a number.

    Args:
        input_str (str): string to test if a number
    Returns:
        (bool): True if float() accepts the value, else False

    """
    try:
        float(input_str)
        return True
    except (TypeError, ValueError):
        # TypeError covers None and other non-string, non-number input
        return False


def look_for_fractions(split_list):
    """
    This function takes a list made by splitting on '/' and determines
    if it describes a fraction.

    Args:
        split_list (list): list created by splitting on '/'
    Returns:
        (bool): False if not a fraction, otherwise True

    """
    return (len(split_list) == 2 and
            is_numeric(split_list[0]) and
            is_numeric(split_list[1]))


# ---------------------------------------------------------------------------
# English parsing (parse_en)
# ---------------------------------------------------------------------------

# spelled out numbers understood by extractnumber_en
_WORD_NUMBERS_EN = {
    "one": 1, "two": 2, "three": 3, "four": 4, "five": 5,
    "six": 6, "seven": 7, "eight": 8, "nine": 9, "ten": 10
}

# names of 1/1 .. 1/12, the position in the list is the denominator - 1
_FRACTION_NAMES_EN = ["whole", "half", "third", "fourth", "fifth", "sixth",
                      "seventh", "eighth", "ninth", "tenth", "eleventh",
                      "twelfth"]

# spelled out numbers replaced by digits in normalize_en (index == value)
_TEXT_NUMBERS_EN = ["zero", "one", "two", "three", "four", "five", "six",
                    "seven", "eight", "nine", "ten", "eleven", "twelve",
                    "thirteen", "fourteen", "fifteen", "sixteen",
                    "seventeen", "eighteen", "nineteen", "twenty"]

# contraction -> expansion, e.g. "isn't" -> "is not"
_CONTRACTIONS_EN = {
    "ain't": "is not", "aren't": "are not", "can't": "can not",
    "could've": "could have", "couldn't": "could not",
    "didn't": "did not", "doesn't": "does not", "don't": "do not",
    "gonna": "going to", "gotta": "got to", "hadn't": "had not",
    "hasn't": "has not", "haven't": "have not", "he'd": "he would",
    "he'll": "he will", "he's": "he is", "how'd": "how did",
    "how'll": "how will", "how's": "how is", "I'd": "I would",
    "I'll": "I will", "I'm": "I am", "I've": "I have",
    "isn't": "is not", "it'd": "it would", "it'll": "it will",
    "it's": "it is", "mightn't": "might not", "might've": "might have",
    "mustn't": "must not", "must've": "must have", "needn't": "need not",
    "oughtn't": "ought not", "shan't": "shall not", "she'd": "she would",
    "she'll": "she will", "she's": "she is", "shouldn't": "should not",
    "should've": "should have", "somebody's": "somebody is",
    "someone'd": "someone would", "someone'll": "someone will",
    "someone's": "someone is", "that'll": "that will",
    "that's": "that is", "that'd": "that would",
    "there'd": "there would", "there're": "there are",
    "there's": "there is", "they'd": "they would",
    "they'll": "they will", "they're": "they are",
    "they've": "they have", "wasn't": "was not", "we'd": "we would",
    "we'll": "we will", "we're": "we are", "we've": "we have",
    "weren't": "were not", "what'd": "what did", "what'll": "what will",
    "what're": "what are", "what's": "what is",
    "whats": "what is",  # technically incorrect but some STT outputs
    "what've": "what have", "when's": "when is", "when'd": "when did",
    "where'd": "where did", "where's": "where is",
    "where've": "where have", "who'd": "who would",
    "who'd've": "who would have", "who'll": "who will",
    "who're": "who are", "who's": "who is", "who've": "who have",
    "why'd": "why did", "why're": "why are", "why's": "why is",
    "won't": "will not", "won't've": "will not have",
    "would've": "would have", "wouldn't": "would not",
    "wouldn't've": "would not have", "y'all": "you all",
    "ya'll": "you all", "you'd": "you would",
    "you'd've": "you would have", "you'll": "you will",
    "y'aint": "you are not", "y'ain't": "you are not",
    "you're": "you are", "you've": "you have"
}


def extractnumber_en(text):
    """
    Extract the first number found in an English text.

    Understands digits ("2", "1.5"), "first"/"second", the words one..ten,
    fraction names ("half", "quarter"), written fractions ("3/4") and the
    combinations "three quarters" and "1 and 3/4" / "one cup and a half".

    Args:
        text (str): the string to search
    Returns:
        (int) or (float): The value of the extracted number, or False when
                          the text holds no number

    """
    aWords = [word for word in text.split()
              if word not in ("the", "a", "an")]
    andPass = False     # True once "<number> [word] and" has been seen
    valPreAnd = None    # the number found in front of that "and"
    val = None          # None means "nothing found", so 0 stays a valid hit
    count = 0
    while count < len(aWords):
        word = aWords[count]
        val = None
        if is_numeric(word):
            val = float(word)
        elif word == "first":
            val = 1
        elif word == "second":
            val = 2
        else:
            val = isFractional_en(word) or None
            if val is None and word in _WORD_NUMBERS_EN:
                val = _WORD_NUMBERS_EN[word]
                # "three quarters" -> 3 * 1/4
                wordNext = aWords[count + 1] if count + 1 < len(aWords) \
                    else ""
                valNext = isFractional_en(wordNext)
                if valNext:
                    val = val * valNext
                    # blank it so the "and" look-ahead below skips it
                    aWords[count + 1] = ""

        if val is None:
            # look for fractions like "2/3" (a zero denominator is no number)
            aPieces = word.split('/')
            if look_for_fractions(aPieces) and float(aPieces[1]) != 0:
                val = float(aPieces[0]) / float(aPieces[1])
            elif andPass:
                # nothing to add after the "and", quit with the first part
                val = valPreAnd
                break
            else:
                count += 1
                continue

        if andPass:
            val += valPreAnd
        elif count + 1 < len(aWords) and aWords[count + 1] == 'and':
            # "1 and 3/4": remember the 1 and continue behind the "and"
            andPass = True
            valPreAnd = val
            val = None
            count += 2
            continue
        elif count + 2 < len(aWords) and aWords[count + 2] == 'and':
            # "1 cup and a half": same, with one word in between
            andPass = True
            valPreAnd = val
            val = None
            count += 3
            continue

        break

    if val is None and andPass:
        # the text ended right after the "and"
        val = valPreAnd
    if val is None:
        return False
    return val


def extract_datetime_en(str, currentDate=None):
    """
    Extract a date and/or time from an English utterance.

    Handles relative days ("tomorrow", "next friday", "5 days from today"),
    explicit dates ("july 4", "15 of july 2018"), clock times ("3:30 pm",
    "3pm", "5 o'clock in the evening", "noon") and offsets ("in 10 minutes",
    "in half an hour").

    Args:
        str (str): the utterance.  The name shadows the builtin but is kept
                   because it is part of the signature.
        currentDate (datetime): reference time, defaults to datetime.now()
    Returns:
        (list): [datetime, str] - the resolved datetime and the utterance
                with the date/time words removed - or None when the input
                is empty or contains no date/time information
    Raises:
        ValueError: if a month is named with a day/year that
                    datetime.strptime can not parse (e.g. a month alone)
    """
    # Imported here so the number / normalization helpers above stay usable
    # when python-dateutil is not installed.
    from dateutil.relativedelta import relativedelta

    def clean_string(raw):
        # cleans the input string of unneeded punctuation and capitalization
        # among other things
        raw = raw.lower().replace('?', '').replace('.', '') \
            .replace(',', '').replace(' the ', ' ').replace(' a ', ' ') \
            .replace(' an ', ' ')
        cleaned = []
        for token in raw.split():
            token = token.replace("'s", "")
            # "20th" -> "20"
            if token and token[0].isdigit():
                for suffix in ("rd", "st", "nd", "th"):
                    token = token.replace(suffix, "")
            cleaned.append(token)
        return cleaned

    def starts_with_digit(token):
        # safe on "", which is what missing neighbours are represented by
        return bool(token) and token[0].isdigit()

    def word_at(position):
        # neighbour lookup that yields "" outside of the sentence
        return words[position] if 0 <= position < len(words) else ""

    def date_found():
        return found or \
            (
                datestr != "" or
                yearOffset != 0 or monthOffset != 0 or
                dayOffset != 0 or hrOffset != 0 or
                hrAbs != 0 or minOffset != 0 or
                minAbs != 0 or secOffset != 0
            )

    if str == "":
        return None
    if currentDate is None:
        currentDate = datetime.now()

    found = False
    daySpecified = False
    dayOffset = 0
    monthOffset = 0
    yearOffset = 0
    dateNow = currentDate
    today = int(dateNow.strftime("%w"))     # 0 is sunday
    currentYear = dateNow.year
    fromFlag = False
    datestr = ""
    hasYear = False
    timeQualifier = ""

    timeQualifiersList = ['morning', 'afternoon', 'evening']
    # part of the day -> am/pm
    dayParts = {'morning': 'am', 'afternoon': 'pm', 'evening': 'pm'}
    markers = ['at', 'in', 'on', 'by', 'this', 'around', 'for', 'of']
    days = ['monday', 'tuesday', 'wednesday',
            'thursday', 'friday', 'saturday', 'sunday']
    months = ['january', 'february', 'march', 'april', 'may', 'june',
              'july', 'august', 'september', 'october', 'november',
              'december']
    monthsShort = ['jan', 'feb', 'mar', 'apr', 'may', 'june', 'july', 'aug',
                   'sept', 'oct', 'nov', 'dec']
    # words that may follow "from" / "after"
    validFollowups = days + months + monthsShort + \
        ["today", "tomorrow", "next", "last", "now"]

    words = clean_string(str)

    # ---- first pass: dates -------------------------------------------------
    for idx, word in enumerate(words):
        if word == "":
            continue
        wordPrevPrev = word_at(idx - 2)
        wordPrev = word_at(idx - 1)
        wordNext = word_at(idx + 1)
        wordNextNext = word_at(idx + 2)

        # this isn't in clean string because I don't want to save back to words
        word = word.rstrip('s')
        start = idx
        used = 0
        # save timequalifier for later
        if word in timeQualifiersList:
            timeQualifier = word
        # parse today, tomorrow, day after tomorrow
        elif word == "today" and not fromFlag:
            dayOffset = 0
            used += 1
        elif word == "tomorrow" and not fromFlag:
            dayOffset = 1
            used += 1
        elif (word == "day" and
              wordNext == "after" and
              wordNextNext == "tomorrow" and
              not fromFlag and
              not starts_with_digit(wordPrev)):
            dayOffset = 2
            used = 3
            if wordPrev == "the":
                start -= 1
                used += 1
        # parse 5 days, 10 weeks, last week, next week
        elif word == "day":
            if starts_with_digit(wordPrev):
                dayOffset += int(wordPrev)
                start -= 1
                used = 2
        elif word == "week" and not fromFlag:
            if starts_with_digit(wordPrev):
                dayOffset += int(wordPrev) * 7
                start -= 1
                used = 2
            elif wordPrev == "next":
                dayOffset = 7
                start -= 1
                used = 2
            elif wordPrev == "last":
                dayOffset = -7
                start -= 1
                used = 2
        # parse 10 months, next month, last month
        elif word == "month" and not fromFlag:
            if starts_with_digit(wordPrev):
                monthOffset = int(wordPrev)
                start -= 1
                used = 2
            elif wordPrev == "next":
                monthOffset = 1
                start -= 1
                used = 2
            elif wordPrev == "last":
                monthOffset = -1
                start -= 1
                used = 2
        # parse 5 years, next year, last year
        elif word == "year" and not fromFlag:
            if starts_with_digit(wordPrev):
                yearOffset = int(wordPrev)
                start -= 1
                used = 2
            elif wordPrev == "next":
                yearOffset = 1
                start -= 1
                used = 2
            elif wordPrev == "last":
                yearOffset = -1
                start -= 1
                used = 2
        # parse Monday, Tuesday, etc., and next Monday,
        # last Tuesday, etc.
        elif word in days and not fromFlag:
            d = days.index(word)
            dayOffset = (d + 1) - today
            used = 1
            if dayOffset < 0:
                dayOffset += 7
            if wordPrev == "next":
                dayOffset += 7
                used += 1
                start -= 1
            elif wordPrev == "last":
                dayOffset -= 7
                used += 1
                start -= 1
        # parse 15 of July, June 20th, Feb 18, 19 of February
        # NOTE: "and" binds tighter than "or", so fromFlag only guards the
        # abbreviated names; the parentheses spell out that evaluation.
        elif word in months or (word in monthsShort and not fromFlag):
            try:
                m = months.index(word)
            except ValueError:
                m = monthsShort.index(word)
            used += 1
            datestr = months[m]
            dayBeforeOf = (wordPrev == "of" and
                           starts_with_digit(wordPrevPrev))
            if starts_with_digit(wordPrev) or dayBeforeOf:
                if dayBeforeOf:
                    # "15 of july"
                    datestr += " " + wordPrevPrev
                    used += 1
                    start -= 1
                else:
                    # "15 july"
                    datestr += " " + wordPrev
                start -= 1
                used += 1
                if starts_with_digit(wordNext):
                    datestr += " " + wordNext
                    used += 1
                    hasYear = True
                else:
                    hasYear = False

            elif starts_with_digit(wordNext):
                # "july 15"
                datestr += " " + wordNext
                used += 1
                if starts_with_digit(wordNextNext):
                    datestr += " " + wordNextNext
                    used += 1
                    hasYear = True
                else:
                    hasYear = False
        # parse 5 days from tomorrow, 10 weeks from next thursday,
        # 2 months from July
        if (word == "from" or word == "after") and wordNext in validFollowups:
            used = 2
            fromFlag = True
            if wordNext == "tomorrow":
                dayOffset += 1
            elif wordNext in days:
                d = days.index(wordNext)
                tmpOffset = (d + 1) - today
                used = 2
                if tmpOffset < 0:
                    tmpOffset += 7
                dayOffset += tmpOffset
            elif wordNextNext and wordNextNext in days:
                d = days.index(wordNextNext)
                tmpOffset = (d + 1) - today
                used = 3
                if wordNext == "next":
                    tmpOffset += 7
                    used += 1
                    start -= 1
                elif wordNext == "last":
                    tmpOffset -= 7
                    used += 1
                    start -= 1
                dayOffset += tmpOffset
        if used > 0:
            if start - 1 > 0 and words[start - 1] == "this":
                start -= 1
                used += 1

            # remove the parsed words, never reaching outside the sentence
            for pos in range(max(start, 0), min(start + used, len(words))):
                words[pos] = ""

            if start - 1 >= 0 and words[start - 1] in markers:
                words[start - 1] = ""
            found = True
            daySpecified = True

    # ---- second pass: time -------------------------------------------------
    hrOffset = 0
    minOffset = 0
    secOffset = 0
    hrAbs = 0       # absolute hour/minute; -1 marks "only an offset given"
    minAbs = 0

    for idx, word in enumerate(words):
        if word == "":
            continue

        wordPrevPrev = word_at(idx - 2)
        wordPrev = word_at(idx - 1)
        wordNext = word_at(idx + 1)
        wordNextNext = word_at(idx + 2)
        # parse noon, midnight, morning, afternoon, evening
        used = 0
        if word == "noon":
            hrAbs = 12
            used += 1
        elif word == "midnight":
            hrAbs = 0
            used += 1
        elif word == "morning":
            if hrAbs == 0:
                hrAbs = 8
            used += 1
        elif word == "afternoon":
            if hrAbs == 0:
                hrAbs = 15
            used += 1
        elif word == "evening":
            if hrAbs == 0:
                hrAbs = 19
            used += 1
        # parse half an hour, quarter hour
        elif word == "hour" and \
                (wordPrev in markers or wordPrevPrev in markers):
            if wordPrev == "half":
                minOffset = 30
            elif wordPrev == "quarter":
                minOffset = 15
            elif wordPrevPrev == "quarter":
                minOffset = 15
                if idx > 2 and words[idx - 3] in markers:
                    words[idx - 3] = ""
                words[idx - 2] = ""
            else:
                hrOffset = 1
            if wordPrevPrev in markers:
                words[idx - 2] = ""
            words[idx - 1] = ""
            used += 1
            hrAbs = -1
            minAbs = -1
        # parse 5:00 am, 12:00 p.m., etc
        elif word[0].isdigit():
            isTime = True
            strHH = ""
            strMM = ""
            remainder = ""
            if ':' in word:
                # parse colons
                # "3:00pm" -> hour digits, minute digits, suffix
                pos = 0
                while pos < len(word) and word[pos].isdigit():
                    pos += 1
                strHH = word[:pos]
                if pos < len(word) and word[pos] == ":":
                    minuteStart = pos + 1
                    pos = minuteStart
                    while pos < len(word) and word[pos].isdigit():
                        pos += 1
                    strMM = word[minuteStart:pos]
                remainder = word[pos:].replace(".", "")

                if remainder == "":
                    nextWord = wordNext.replace(".", "")
                    wordAfterNext = word_at(idx + 3)
                    if nextWord == "am" or nextWord == "pm":
                        remainder = nextWord
                        used += 1
                    elif nextWord == "tonight":
                        remainder = "pm"
                        used += 1
                    elif (wordNext == "in" and wordNextNext == "the" and
                          wordAfterNext in dayParts):
                        # "3:00 in the morning"
                        remainder = dayParts[wordAfterNext]
                        used += 3
                    elif (wordNext in ("in", "this") and
                          wordNextNext in dayParts):
                        # "3:00 this afternoon"
                        remainder = dayParts[wordNextNext]
                        used += 2
                    elif wordNext == "at" and wordNextNext == "night":
                        remainder = "pm" if int(strHH or 0) > 5 else "am"
                        used += 2
                    elif (timeQualifier in ("evening", "afternoon") and
                          strHH and int(strHH) < 12):
                        # a part of the day was named elsewhere in the text
                        strHH = int(strHH) + 12
            else:
                # try to parse # s without colons
                # 5 hours, 10 minutes etc.
                strNum = "".join(ch for ch in word if ch.isdigit())
                inlineSuffix = "".join(ch for ch in word
                                       if not ch.isdigit())
                remainder = inlineSuffix
                if remainder == "":
                    remainder = wordNext.replace(".", "").strip()

                if remainder in ("pm", "p.m.") or \
                        wordNext in ("pm", "p.m."):
                    strHH = strNum
                    remainder = "pm"
                    # only consume the next word if it supplied the "pm"
                    used = 0 if inlineSuffix in ("pm", "p.m.") else 1
                elif remainder in ("am", "a.m.") or \
                        wordNext in ("am", "a.m."):
                    strHH = strNum
                    remainder = "am"
                    used = 0 if inlineSuffix in ("am", "a.m.") else 1
                elif not word.isdigit():
                    # e.g. "2/3" or "10k": a number, but not a time
                    isTime = False
                else:
                    number = int(word)
                    if number > 100 and wordPrev in ("o", "oh"):
                        # 0800 hours (pronounced oh-eight-hundred)
                        strHH = number // 100
                        strMM = number - strHH * 100
                        if wordNext == "hours":
                            used += 1
                    elif (wordNext == "hours" and
                          word[0] != '0' and
                          (number < 100 or number > 2400)):
                        # ignores military time
                        # "in 3 hours"
                        hrOffset = number
                        used = 2
                        isTime = False
                        hrAbs = -1
                        minAbs = -1
                    elif wordNext == "minutes":
                        # "in 10 minutes"
                        minOffset = number
                        used = 2
                        isTime = False
                        hrAbs = -1
                        minAbs = -1
                    elif wordNext == "seconds":
                        # in 5 seconds
                        secOffset = number
                        used = 2
                        isTime = False
                        hrAbs = -1
                        minAbs = -1
                    elif number > 100:
                        # military time, "1430 hours"
                        strHH = number // 100
                        strMM = number - strHH * 100
                        if wordNext == "hours":
                            used += 1
                    elif starts_with_digit(wordNext):
                        # military time spoken in two parts, "14 30 hours"
                        strHH = word
                        strMM = wordNext
                        used += 1
                        if wordNextNext == "hours":
                            used += 1
                    elif (
                            wordNext == "" or wordNext == "o'clock" or
                            (
                                wordNext == "in" and
                                (
                                    wordNextNext == "the" or
                                    wordNextNext == timeQualifier
                                )
                            )):
                        strHH = word
                        strMM = 0
                        if wordNext == "o'clock":
                            used += 1
                        if wordNext == "in" or wordNextNext == "in":
                            used += (1 if wordNext == "in" else 2)
                            # "5 in the evening" / "5 o'clock in morning"
                            following = (wordNextNext, word_at(idx + 3))
                            if "afternoon" in following or \
                                    "evening" in following:
                                remainder = "pm"
                            if "morning" in following:
                                remainder = "am"
                    else:
                        isTime = False

            strHH = int(strHH) if strHH else 0
            strMM = int(strMM) if strMM else 0
            strHH = strHH + 12 if remainder == "pm" and strHH < 12 else strHH
            strHH = strHH - 12 if remainder == "am" and strHH >= 12 else strHH
            if strHH > 24 or strMM > 59:
                isTime = False
                used = 0
            if isTime:
                hrAbs = strHH
                minAbs = strMM
                used += 1
        if used > 0:
            # removed parsed words from the sentence
            for pos in range(idx, min(idx + used, len(words))):
                words[pos] = ""

            if wordPrev == "o" or wordPrev == "oh":
                words[idx - 1] = ""

            if wordPrev == "early":
                hrOffset = -1
                words[idx - 1] = ""
            elif wordPrev == "late":
                hrOffset = 1
                words[idx - 1] = ""
            elif idx > 0 and wordPrev in markers:
                words[idx - 1] = ""
            if idx > 1 and wordPrevPrev in markers:
                words[idx - 2] = ""

            found = True

    # check that we found a date
    if not date_found():
        return None

    # ---- perform date manipulation -----------------------------------------
    if hrAbs == -1 and minAbs == -1 and datestr == "":
        # only an offset was given ("in 10 minutes"): it counts from now
        extractedDate = dateNow.replace(microsecond=0)
    else:
        extractedDate = dateNow.replace(microsecond=0,
                                        second=0,
                                        minute=0,
                                        hour=0)
    if datestr != "":
        try:
            temp = datetime.strptime(datestr, "%B %d")
        except ValueError:
            # try again, allowing the year ("july 4 2018")
            temp = datetime.strptime(datestr, "%B %d %Y")
        if not hasYear:
            temp = temp.replace(year=extractedDate.year)
            # a date that is not in the future means next year
            if extractedDate < temp:
                extractedDate = extractedDate.replace(year=currentYear,
                                                      month=temp.month,
                                                      day=temp.day)
            else:
                extractedDate = extractedDate.replace(year=currentYear + 1,
                                                      month=temp.month,
                                                      day=temp.day)
        else:
            extractedDate = extractedDate.replace(year=temp.year,
                                                  month=temp.month,
                                                  day=temp.day)

    if yearOffset != 0:
        extractedDate = extractedDate + relativedelta(years=yearOffset)
    if monthOffset != 0:
        extractedDate = extractedDate + relativedelta(months=monthOffset)
    if dayOffset != 0:
        extractedDate = extractedDate + relativedelta(days=dayOffset)
    if hrAbs != -1 and minAbs != -1:
        extractedDate = extractedDate + relativedelta(hours=hrAbs,
                                                      minutes=minAbs)
        if (hrAbs != 0 or minAbs != 0) and datestr == "":
            # a bare time that already passed today means tomorrow
            if not daySpecified and dateNow > extractedDate:
                extractedDate = extractedDate + relativedelta(days=1)
    if hrOffset != 0:
        extractedDate = extractedDate + relativedelta(hours=hrOffset)
    if minOffset != 0:
        extractedDate = extractedDate + relativedelta(minutes=minOffset)
    if secOffset != 0:
        extractedDate = extractedDate + relativedelta(seconds=secOffset)

    # drop an "and" that only joined two removed expressions
    for idx, word in enumerate(words):
        if word == "and" and word_at(idx - 1) == "" and \
                word_at(idx + 1) == "":
            words[idx] = ""

    resultStr = " ".join(words)
    resultStr = ' '.join(resultStr.split())
    return [extractedDate, resultStr]


def isFractional_en(input_str):
    """
    This function takes the given text and checks if it is a fraction.

    Args:
        input_str (str): the string to check if fractional
    Returns:
        (bool) or (float): False if not a fraction, otherwise the fraction

    """
    if input_str.endswith('s'):
        input_str = input_str[:-1]  # e.g. "fifths"

    name = input_str.lower()
    if name in _FRACTION_NAMES_EN:
        return 1.0 / (_FRACTION_NAMES_EN.index(name) + 1)
    if input_str == "quarter":
        return 1.0 / 4

    return False


def normalize_en(text, remove_articles):
    """ English string normalization

    Collapses whitespace, expands common contractions ("isn't" -> "is not")
    and turns the words zero..twenty into digits.

    Args:
        text (str): the string to normalize
        remove_articles (bool): drop "the", "a" and "an" when True
    Returns:
        (str): the normalized string
    """
    normalized = []
    for word in text.split():  # split() also removes extra spaces
        if remove_articles and word in ("the", "a", "an"):
            continue

        # Expand common contractions, e.g. "isn't" -> "is not"
        word = _CONTRACTIONS_EN.get(word, word)

        # Convert numbers into digits, e.g. "two" -> "2"
        if word in _TEXT_NUMBERS_EN:
            word = "{}".format(_TEXT_NUMBERS_EN.index(word))

        normalized.append(word)

    return " ".join(normalized)
"""
    Parse functions for spanish (es)
    TODO: numbers greater than 999999
"""

# Undefined articles ["un", "una", "unos", "unas"] can not be supressed,
# in Spanish, "un caballo" means "a horse" or "one horse".
es_articles = ["el", "la", "los", "las"]

# Spanish number words -> value.  Both the accented and the unaccented
# spelling are listed because speech-to-text output is not consistent.
es_numbers_xlat = {
    "cero": 0,
    "un": 1,
    "uno": 1,
    "una": 1,
    "dos": 2,
    "tres": 3,
    u"trés": 3,
    "cuatro": 4,
    "cinco": 5,
    "seis": 6,
    "siete": 7,
    "ocho": 8,
    "nueve": 9,
    "diez": 10,
    "once": 11,
    "doce": 12,
    "trece": 13,
    "catorce": 14,
    "quince": 15,
    "dieciseis": 16,
    u"dieciséis": 16,
    "diecisiete": 17,
    "dieciocho": 18,
    "diecinueve": 19,
    "veinte": 20,
    "veintiuno": 21,
    "veintidos": 22,
    u"veintidós": 22,
    "veintitres": 23,
    u"veintitrés": 23,
    "veinticuatro": 24,
    "veinticinco": 25,
    u"veintiséis": 26,
    "veintiseis": 26,
    "veintisiete": 27,
    "veintiocho": 28,
    "veintinueve": 29,
    "treinta": 30,
    "cuarenta": 40,
    "cincuenta": 50,
    "sesenta": 60,
    "setenta": 70,
    "ochenta": 80,
    "noventa": 90,
    "cien": 100,
    "ciento": 100,
    "doscientos": 200,
    "doscientas": 200,
    "trescientos": 300,
    "trescientas": 300,
    "cuatrocientos": 400,
    "cuatrocientas": 400,
    "quinientos": 500,
    "quinientas": 500,
    "seiscientos": 600,
    "seiscientas": 600,
    "setecientos": 700,
    "setecientas": 700,
    "ochocientos": 800,
    "ochocientas": 800,
    "novecientos": 900,
    "novecientas": 900}


def es_parse(words, i):
    """ Parse the Spanish number that starts at words[i].

    Accepted form: cero | [1-999] (mil [1-999]?)? | mil [1-999]?

    Args:
        words (list): the words of the sentence
        i (int): index of the first word to look at
    Returns:
        (tuple): (value, index of the first word behind the number),
                 or None if there is no number at position i
    """
    def es_cte(i, s):
        # match the literal word s at position i
        if i < len(words) and s == words[i]:
            return s, i + 1
        return None

    def es_number_word(i, mi, ma):
        # match a single number word with a value inside [mi, ma]
        if i < len(words):
            v = es_numbers_xlat.get(words[i])
            # "is not None" so that the value 0 ("cero") can match
            if v is not None and mi <= v <= ma:
                return v, i + 1
        return None

    def es_number_1_99(i):
        r1 = es_number_word(i, 1, 29)
        if r1:
            return r1

        # tens, optionally followed by "y" and a unit: "treinta y cinco"
        r1 = es_number_word(i, 30, 90)
        if r1:
            v1, i1 = r1
            r2 = es_cte(i1, "y")
            if r2:
                i2 = r2[1]
                r3 = es_number_word(i2, 1, 9)
                if r3:
                    v3, i3 = r3
                    return v1 + v3, i3
            return r1
        return None

    def es_number_1_999(i):
        # [2-9]cientos [1-99]?
        r1 = es_number_word(i, 100, 900)
        if r1:
            v1, i1 = r1
            r2 = es_number_1_99(i1)
            if r2:
                v2, i2 = r2
                return v1 + v2, i2
            return r1

        # [1-99]
        return es_number_1_99(i)

    def es_thousands(multiplier, i_after_mil):
        # value of "<multiplier> mil [1-999]?"
        r3 = es_number_1_999(i_after_mil)
        if r3:
            v3, i3 = r3
            return multiplier * 1000 + v3, i3
        return multiplier * 1000, i_after_mil

    def es_number(i):
        # check for cero
        r1 = es_number_word(i, 0, 0)
        if r1:
            return r1

        # check for [1-999] (mil [0-999])?
        r1 = es_number_1_999(i)
        if r1:
            v1, i1 = r1
            r2 = es_cte(i1, "mil")
            if r2:
                return es_thousands(v1, r2[1])
            return r1

        # a bare "mil" is one thousand: "mil quinientos"
        r2 = es_cte(i, "mil")
        if r2:
            return es_thousands(1, r2[1])
        return None

    return es_number(i)


def normalize_es(text, remove_articles):
    """ Spanish string normalization

    Args:
        text (str): the string to normalize
        remove_articles (bool): drop the definite articles when True
    Returns:
        (str): the text with single spaces and number words as digits
    """
    words = text.split()  # this also removed extra spaces

    normalized = []
    i = 0
    while i < len(words):
        word = words[i]

        if remove_articles and word in es_articles:
            i += 1
            continue

        # Convert numbers into digits
        r = es_parse(words, i)
        if r:
            v, i = r
            normalized.append("{}".format(v))
            continue

        normalized.append(word)
        i += 1

    return " ".join(normalized)
+# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# +from datetime import datetime, timedelta +from dateutil.relativedelta import relativedelta +from mycroft.util.lang.parse_common import * +""" + Parse functions for Portuguese (PT-PT) + + TODO: numbers greater than 999999 + TODO: date time pt +""" + +# Undefined articles ["um", "uma", "uns", "umas"] can not be supressed, +# in PT, "um cavalo" means "a horse" or "one horse". +pt_articles = ["o", "a", "os", "as"] + +pt_numbers = { + "zero": 0, + "um": 1, + "uma": 1, + "uns": 1, + "umas": 1, + "primeiro": 1, + "segundo": 2, + "terceiro": 3, + "dois": 2, + "duas": 2, + "tres": 3, + u"três": 3, + "quatro": 4, + "cinco": 5, + "seis": 6, + "sete": 7, + "oito": 8, + "nove": 9, + "dez": 10, + "onze": 11, + "doze": 12, + "treze": 13, + "catorze": 14, + "quinze": 15, + "dezasseis": 16, + "dezassete": 17, + "dezoito": 18, + "dezanove": 19, + "vinte": 20, + "trinta": 30, + "quarenta": 40, + "cinquenta": 50, + "sessenta": 60, + "setenta": 70, + "oitenta": 80, + "noventa": 90, + "cem": 100, + "cento": 100, + "duzentos": 200, + "duzentas": 200, + "trezentos": 300, + "trezentas": 300, + "quatrocentos": 400, + "quatrocentas": 400, + "quinhentos": 500, + "quinhentas": 500, + "seiscentos": 600, + "seiscentas": 600, + "setecentos": 700, + "setecentas": 700, + "oitocentos": 800, + "oitocentas": 800, + "novecentos": 900, + "novecentas": 900, + "mil": 1000, + u"milh�o": 1000000} + + +def isFractional_pt(input_str): + """ + This function takes 
the given text and checks if it is a fraction. + + Args: + text (str): the string to check if fractional + Returns: + (bool) or (float): False if not a fraction, otherwise the fraction + + """ + if input_str.endswith('s', -1): + input_str = input_str[:len(input_str) - 1] # e.g. "fifths" + + aFrac = ["meio", u"terço", "quarto", "quinto", "sexto", + "setimo", "oitavo", "nono", u"décimo"] + + if input_str.lower() in aFrac: + return 1.0 / (aFrac.index(input_str) + 2) + if input_str == u"vigésimo": + return 1.0 / 20 + if input_str == u"trigésimo": + return 1.0 / 30 + if input_str == u"centésimo": + return 1.0 / 100 + if input_str == u"milésimo": + return 1.0 / 1000 + if (input_str == u"sétimo" or input_str == "septimo" or + input_str == u"séptimo"): + return 1.0 / 7 + + return False + + +def extractnumber_pt(text): + """ + This function prepares the given text for parsing by making + numbers consistent, getting rid of contractions, etc. + Args: + text (str): the string to normalize + Returns: + (int) or (float): The value of extracted number + + """ + aWords = text.split() + count = 0 + result = None + while count < len(aWords): + val = 0 + word = aWords[count] + next_next_word = None + if count + 1 < len(aWords): + next_word = aWords[count + 1] + if count + 2 < len(aWords): + next_next_word = aWords[count + 2] + else: + next_word = None + + # is current word a number? 
+ if word in pt_numbers: + val = pt_numbers[word] + elif word.isdigit(): # doesn't work with decimals + val = int(word) + elif is_numeric(word): + val = float(word) + elif isFractional_pt(word): + if not result: + result = 1 + result = result * isFractional_pt(word) + count += 1 + continue + + if not val: + # look for fractions like "2/3" + aPieces = word.split('/') + # if (len(aPieces) == 2 and is_numeric(aPieces[0]) + # and is_numeric(aPieces[1])): + if look_for_fractions(aPieces): + val = float(aPieces[0]) / float(aPieces[1]) + + if val: + if result is None: + result = 0 + # handle fractions + if next_word != "avos": + result += val + else: + result = float(result) / float(val) + + if next_word is None: + break + + # number word and fraction + ands = ["e"] + if next_word in ands: + zeros = 0 + if result is None: + count += 1 + continue + newWords = aWords[count + 2:] + newText = "" + for word in newWords: + newText += word + " " + + afterAndVal = extractnumber_pt(newText[:-1]) + if afterAndVal: + if result < afterAndVal or result < 20: + while afterAndVal > 1: + afterAndVal = afterAndVal / 10.0 + for word in newWords: + if word == "zero" or word == "0": + zeros += 1 + else: + break + for i in range(0, zeros): + afterAndVal = afterAndVal / 10.0 + result += afterAndVal + break + elif next_next_word is not None: + if next_next_word in ands: + newWords = aWords[count + 3:] + newText = "" + for word in newWords: + newText += word + " " + afterAndVal = extractnumber_pt(newText[:-1]) + if afterAndVal: + if result is None: + result = 0 + result += afterAndVal + break + + decimals = ["ponto", "virgula", u"v�rgula", ".", ","] + if next_word in decimals: + zeros = 0 + newWords = aWords[count + 2:] + newText = "" + for word in newWords: + newText += word + " " + for word in newWords: + if word == "zero" or word == "0": + zeros += 1 + else: + break + afterDotVal = str(extractnumber_pt(newText[:-1])) + afterDotVal = zeros * "0" + afterDotVal + result = float(str(result) + "." 
+ afterDotVal) + break + count += 1 + + if result is None: + return False + + # Return the $str with the number related words removed + # (now empty strings, so strlen == 0) + # aWords = [word for word in aWords if len(word) > 0] + # text = ' '.join(aWords) + if "." in str(result): + integer, dec = str(result).split(".") + # cast float to int + if dec == "0": + result = int(integer) + + return result + + +def pt_number_parse(words, i): + def pt_cte(i, s): + if i < len(words) and s == words[i]: + return s, i + 1 + return None + + def pt_number_word(i, mi, ma): + if i < len(words): + v = pt_numbers.get(words[i]) + if v and v >= mi and v <= ma: + return v, i + 1 + return None + + def pt_number_1_99(i): + r1 = pt_number_word(i, 1, 29) + if r1: + return r1 + + r1 = pt_number_word(i, 30, 90) + if r1: + v1, i1 = r1 + r2 = pt_cte(i1, "e") + if r2: + v2, i2 = r2 + r3 = pt_number_word(i2, 1, 9) + if r3: + v3, i3 = r3 + return v1 + v3, i3 + return r1 + return None + + def pt_number_1_999(i): + # [2-9]cientos [1-99]? + r1 = pt_number_word(i, 100, 900) + if r1: + v1, i1 = r1 + r2 = pt_number_1_99(i1) + if r2: + v2, i2 = r2 + return v1 + v2, i2 + else: + return r1 + + # [1-99] + r1 = pt_number_1_99(i) + if r1: + return r1 + + return None + + def pt_number(i): + # check for cero + r1 = pt_number_word(i, 0, 0) + if r1: + return r1 + + # check for [1-999] (mil [0-999])? + r1 = pt_number_1_999(i) + if r1: + v1, i1 = r1 + r2 = pt_cte(i1, "mil") + if r2: + v2, i2 = r2 + r3 = pt_number_1_999(i2) + if r3: + v3, i3 = r3 + return v1 * 1000 + v3, i3 + else: + return v1 * 1000, i2 + else: + return r1 + return None + + return pt_number(i) + + +def normalize_pt(text, remove_articles): + """ PT string normalization """ + + words = text.split() # this also removed extra spaces + normalized = "" + # Contractions are not common in PT + + # Convert numbers into digits, e.g. 
"dois" -> "2" + normalized = "" + i = 0 + while i < len(words): + word = words[i] + # remove articles + if remove_articles and word in pt_articles: + i += 1 + continue + + # Convert numbers into digits + r = pt_number_parse(words, i) + if r: + v, i = r + normalized += " " + str(v) + continue + + # NOTE temporary , handle some numbers above >999 + if word in pt_numbers: + word = str(pt_numbers[word]) + # end temporary + + normalized += " " + word + i += 1 + # some articles in pt-pt can not be removed, but many words can + # this is experimental and some meaning may be lost + # maybe agressive should default to False + # only usage will tell, as a native speaker this seems reasonable + return pt_pruning(normalized[1:], agressive=remove_articles) + + +def extract_datetime_pt(input_str, currentDate=None): + def clean_string(str): + # cleans the input string of unneeded punctuation and capitalization + # among other things + symbols = [".", ",", ";", "?", "!", u"º", u"ª"] + noise_words = ["o", "os", "a", "as", "do", "da", "dos", "das", "de", + "ao", "aos"] + + for word in symbols: + str = str.replace(word, "") + for word in noise_words: + str = str.replace(" " + word + " ", " ") + str = str.lower().replace( + u"á", + "a").replace( + u"ç", + "c").replace( + u"à", + "a").replace( + u"ã", + "a").replace( + u"é", + "e").replace( + u"è", + "e").replace( + u"ê", + "e").replace( + u"ó", + "o").replace( + u"ò", + "o").replace( + "-", + " ").replace( + "_", + "") + # handle synonims and equivalents, "tomorrow early = tomorrow morning + synonims = {"manha": ["manhazinha", "cedo", "cedinho"], + "tarde": ["tardinha", "tarde"], + "noite": ["noitinha", "anoitecer"], + "todos": ["ao", "aos"], + "em": ["do", "da", "dos", "das", "de"]} + for syn in synonims: + for word in synonims[syn]: + str = str.replace(" " + word + " ", " " + syn + " ") + # relevant plurals, cant just extract all s in pt + wordlist = ["manhas", "noites", "tardes", "dias", "semanas", "anos", + "minutos", "segundos", 
"nas", "nos", "proximas", + "seguintes", "horas"] + for idx, word in enumerate(wordlist): + str = str.replace(word, word.rstrip('s')) + str = str.replace("meses", "mes").replace("anteriores", "anterior") + return str + + def date_found(): + return found or \ + ( + datestr != "" or timeStr != "" or + yearOffset != 0 or monthOffset != 0 or + dayOffset is True or hrOffset != 0 or + hrAbs != 0 or minOffset != 0 or + minAbs != 0 or secOffset != 0 + ) + + if input_str == "": + return None + if currentDate is None: + currentDate = datetime.now() + + found = False + daySpecified = False + dayOffset = False + monthOffset = 0 + yearOffset = 0 + dateNow = currentDate + today = dateNow.strftime("%w") + currentYear = dateNow.strftime("%Y") + fromFlag = False + datestr = "" + hasYear = False + timeQualifier = "" + + words = clean_string(input_str).split(" ") + timeQualifiersList = ['manha', 'tarde', 'noite'] + time_indicators = ["em", "as", "nas", "pelas", "volta", "depois", "estas", + "no", "dia", "hora"] + days = ['segunda', 'terca', 'quarta', + 'quinta', 'sexta', 'sabado', 'domingo'] + months = ['janeiro', 'febreiro', 'marco', 'abril', 'maio', 'junho', + 'julho', 'agosto', 'setembro', 'outubro', 'novembro', + 'dezembro'] + monthsShort = ['jan', 'feb', 'mar', 'abr', 'mai', 'jun', 'jul', 'ag', + 'set', 'out', 'nov', 'dec'] + nexts = ["proximo", "proxima"] + suffix_nexts = ["seguinte", "subsequente", "seguir"] + lasts = ["ultimo", "ultima"] + suffix_lasts = ["passada", "passado", "anterior", "antes"] + nxts = ["depois", "seguir", "seguida", "seguinte", "proxima", "proximo"] + prevs = ["antes", "ante", "previa", "previamente", "anterior"] + froms = ["partir", "em", "para", "na", "no", "daqui", "seguir", + "depois", "por", "proxima", "proximo", "da", "do", "de"] + thises = ["este", "esta", "deste", "desta", "neste", "nesta", "nesse", + "nessa"] + froms += thises + lists = nxts + prevs + froms + time_indicators + for idx, word in enumerate(words): + if word == "": + continue + 
wordPrevPrevPrev = words[idx - 3] if idx > 2 else "" + wordPrevPrev = words[idx - 2] if idx > 1 else "" + wordPrev = words[idx - 1] if idx > 0 else "" + wordNext = words[idx + 1] if idx + 1 < len(words) else "" + wordNextNext = words[idx + 2] if idx + 2 < len(words) else "" + wordNextNextNext = words[idx + 3] if idx + 3 < len(words) else "" + + start = idx + used = 0 + # save timequalifier for later + if word in timeQualifiersList: + timeQualifier = word + + # parse today, tomorrow, yesterday + elif word == "hoje" and not fromFlag: + dayOffset = 0 + used += 1 + elif word == "amanha" and not fromFlag: + dayOffset = 1 + used += 1 + elif word == "ontem" and not fromFlag: + dayOffset -= 1 + used += 1 + # "before yesterday" and "before before yesterday" + elif (word == "anteontem" or + (word == "ante" and wordNext == "ontem")) and not fromFlag: + dayOffset -= 2 + used += 1 + if wordNext == "ontem": + used += 1 + elif word == "ante" and wordNext == "ante" and wordNextNext == \ + "ontem" and not fromFlag: + dayOffset -= 3 + used += 3 + elif word == "anteanteontem" and not fromFlag: + dayOffset -= 3 + used += 1 + # day after tomorrow + elif word == "depois" and wordNext == "amanha" and not fromFlag: + dayOffset += 2 + used = 2 + # day before yesterday + elif word == "antes" and wordNext == "ontem" and not fromFlag: + dayOffset -= 2 + used = 2 + # parse 5 days, 10 weeks, last week, next week, week after + elif word == "dia": + if wordNext == "depois" or wordNext == "antes": + used += 1 + if wordPrev and wordPrev[0].isdigit(): + dayOffset += int(wordPrev) + start -= 1 + used += 1 + elif (wordPrev and wordPrev[0].isdigit() and + wordNext not in months and + wordNext not in monthsShort): + dayOffset += int(wordPrev) + start -= 1 + used += 2 + elif wordNext and wordNext[0].isdigit() and wordNextNext not in \ + months and wordNextNext not in monthsShort: + dayOffset += int(wordNext) + start -= 1 + used += 2 + + elif word == "semana" and not fromFlag: + if wordPrev[0].isdigit(): 
+ dayOffset += int(wordPrev) * 7 + start -= 1 + used = 2 + for w in nexts: + if wordPrev == w: + dayOffset = 7 + start -= 1 + used = 2 + for w in lasts: + if wordPrev == w: + dayOffset = -7 + start -= 1 + used = 2 + for w in suffix_nexts: + if wordNext == w: + dayOffset = 7 + start -= 1 + used = 2 + for w in suffix_lasts: + if wordNext == w: + dayOffset = -7 + start -= 1 + used = 2 + # parse 10 months, next month, last month + elif word == "mes" and not fromFlag: + if wordPrev[0].isdigit(): + monthOffset = int(wordPrev) + start -= 1 + used = 2 + for w in nexts: + if wordPrev == w: + monthOffset = 7 + start -= 1 + used = 2 + for w in lasts: + if wordPrev == w: + monthOffset = -7 + start -= 1 + used = 2 + for w in suffix_nexts: + if wordNext == w: + monthOffset = 7 + start -= 1 + used = 2 + for w in suffix_lasts: + if wordNext == w: + monthOffset = -7 + start -= 1 + used = 2 + # parse 5 years, next year, last year + elif word == "ano" and not fromFlag: + if wordPrev[0].isdigit(): + yearOffset = int(wordPrev) + start -= 1 + used = 2 + for w in nexts: + if wordPrev == w: + yearOffset = 7 + start -= 1 + used = 2 + for w in lasts: + if wordPrev == w: + yearOffset = -7 + start -= 1 + used = 2 + for w in suffix_nexts: + if wordNext == w: + yearOffset = 7 + start -= 1 + used = 2 + for w in suffix_lasts: + if wordNext == w: + yearOffset = -7 + start -= 1 + used = 2 + # parse Monday, Tuesday, etc., and next Monday, + # last Tuesday, etc. 
+ elif word in days and not fromFlag: + + d = days.index(word) + dayOffset = (d + 1) - int(today) + used = 1 + if dayOffset < 0: + dayOffset += 7 + for w in nexts: + if wordPrev == w: + dayOffset += 7 + used += 1 + start -= 1 + for w in lasts: + if wordPrev == w: + dayOffset -= 7 + used += 1 + start -= 1 + for w in suffix_nexts: + if wordNext == w: + dayOffset += 7 + used += 1 + start -= 1 + for w in suffix_lasts: + if wordNext == w: + dayOffset -= 7 + used += 1 + start -= 1 + if wordNext == "feira": + used += 1 + # parse 15 of July, June 20th, Feb 18, 19 of February + elif word in months or word in monthsShort: + try: + m = months.index(word) + except ValueError: + m = monthsShort.index(word) + used += 1 + datestr = months[m] + if wordPrev and wordPrev[0].isdigit(): + # 13 maio + datestr += " " + wordPrev + start -= 1 + used += 1 + if wordNext and wordNext[0].isdigit(): + datestr += " " + wordNext + used += 1 + hasYear = True + else: + hasYear = False + + elif wordNext and wordNext[0].isdigit(): + # maio 13 + datestr += " " + wordNext + used += 1 + if wordNextNext and wordNextNext[0].isdigit(): + datestr += " " + wordNextNext + used += 1 + hasYear = True + else: + hasYear = False + + elif wordPrevPrev and wordPrevPrev[0].isdigit(): + # 13 dia maio + datestr += " " + wordPrevPrev + + start -= 2 + used += 2 + if wordNext and word[0].isdigit(): + datestr += " " + wordNext + used += 1 + hasYear = True + else: + hasYear = False + + elif wordNextNext and wordNextNext[0].isdigit(): + # maio dia 13 + datestr += " " + wordNextNext + used += 2 + if wordNextNextNext and wordNextNextNext[0].isdigit(): + datestr += " " + wordNextNextNext + used += 1 + hasYear = True + else: + hasYear = False + + if datestr in months: + datestr = "" + + # parse 5 days from tomorrow, 10 weeks from next thursday, + # 2 months from July + validFollowups = days + months + monthsShort + validFollowups.append("hoje") + validFollowups.append("amanha") + validFollowups.append("ontem") + 
validFollowups.append("anteontem") + validFollowups.append("agora") + validFollowups.append("ja") + validFollowups.append("ante") + + # TODO debug word "depois" that one is failing for some reason + if word in froms and wordNext in validFollowups: + + if not (wordNext == "amanha" and wordNext == "ontem") and not ( + word == "depois" or word == "antes" or word == "em"): + used = 2 + fromFlag = True + if wordNext == "amanha" and word != "depois": + dayOffset += 1 + elif wordNext == "ontem": + dayOffset -= 1 + elif wordNext == "anteontem": + dayOffset -= 2 + elif wordNext == "ante" and wordNextNext == "ontem": + dayOffset -= 2 + elif (wordNext == "ante" and wordNext == "ante" and + wordNextNextNext == "ontem"): + dayOffset -= 3 + elif wordNext in days: + d = days.index(wordNext) + tmpOffset = (d + 1) - int(today) + used = 2 + if wordNextNext == "feira": + used += 1 + if tmpOffset < 0: + tmpOffset += 7 + if wordNextNext: + if wordNextNext in nxts: + tmpOffset += 7 + used += 1 + elif wordNextNext in prevs: + tmpOffset -= 7 + used += 1 + dayOffset += tmpOffset + elif wordNextNext and wordNextNext in days: + d = days.index(wordNextNext) + tmpOffset = (d + 1) - int(today) + used = 3 + if wordNextNextNext: + if wordNextNextNext in nxts: + tmpOffset += 7 + used += 1 + elif wordNextNextNext in prevs: + tmpOffset -= 7 + used += 1 + dayOffset += tmpOffset + if wordNextNextNext == "feira": + used += 1 + if wordNext in months: + used -= 1 + if used > 0: + + if start - 1 > 0 and words[start - 1] in lists: + start -= 1 + used += 1 + + for i in range(0, used): + words[i + start] = "" + + if (start - 1 >= 0 and words[start - 1] in lists): + words[start - 1] = "" + found = True + daySpecified = True + + # parse time + timeStr = "" + hrOffset = 0 + minOffset = 0 + secOffset = 0 + hrAbs = 0 + minAbs = 0 + military = False + + for idx, word in enumerate(words): + if word == "": + continue + + wordPrevPrev = words[idx - 2] if idx > 1 else "" + wordPrev = words[idx - 1] if idx > 0 else "" 
+ wordNext = words[idx + 1] if idx + 1 < len(words) else "" + wordNextNext = words[idx + 2] if idx + 2 < len(words) else "" + wordNextNextNext = words[idx + 3] if idx + 3 < len(words) else "" + # parse noon, midnight, morning, afternoon, evening + used = 0 + if word == "meio" and wordNext == "dia": + hrAbs = 12 + used += 2 + elif word == "meia" and wordNext == "noite": + hrAbs = 0 + used += 2 + elif word == "manha": + if hrAbs == 0: + hrAbs = 8 + used += 1 + elif word == "tarde": + if hrAbs == 0: + hrAbs = 15 + used += 1 + elif word == "meio" and wordNext == "tarde": + if hrAbs == 0: + hrAbs = 17 + used += 2 + elif word == "meio" and wordNext == "manha": + if hrAbs == 0: + hrAbs = 10 + used += 2 + elif word == "fim" and wordNext == "tarde": + if hrAbs == 0: + hrAbs = 19 + used += 2 + elif word == "fim" and wordNext == "manha": + if hrAbs == 0: + hrAbs = 11 + used += 2 + elif word == "tantas" and wordNext == "manha": + if hrAbs == 0: + hrAbs = 4 + used += 2 + elif word == "noite": + if hrAbs == 0: + hrAbs = 22 + used += 1 + # parse half an hour, quarter hour + elif word == "hora" and \ + (wordPrev in time_indicators or wordPrevPrev in + time_indicators): + if wordPrev == "meia": + minOffset = 30 + elif wordPrev == "quarto": + minOffset = 15 + elif wordPrevPrev == "quarto": + minOffset = 15 + if idx > 2 and words[idx - 3] in time_indicators: + words[idx - 3] = "" + words[idx - 2] = "" + else: + hrOffset = 1 + if wordPrevPrev in time_indicators: + words[idx - 2] = "" + words[idx - 1] = "" + used += 1 + hrAbs = -1 + minAbs = -1 + # parse 5:00 am, 12:00 p.m., etc + elif word[0].isdigit(): + isTime = True + strHH = "" + strMM = "" + remainder = "" + if ':' in word: + # parse colons + # "3:00 in the morning" + stage = 0 + length = len(word) + for i in range(length): + if stage == 0: + if word[i].isdigit(): + strHH += word[i] + elif word[i] == ":": + stage = 1 + else: + stage = 2 + i -= 1 + elif stage == 1: + if word[i].isdigit(): + strMM += word[i] + else: + stage = 2 + i 
-= 1 + elif stage == 2: + remainder = word[i:].replace(".", "") + break + if remainder == "": + nextWord = wordNext.replace(".", "") + if nextWord == "am" or nextWord == "pm": + remainder = nextWord + used += 1 + elif wordNext == "manha": + remainder = "am" + used += 1 + elif wordNext == "tarde": + remainder = "pm" + used += 1 + elif wordNext == "noite": + if 0 < int(word[0]) < 6: + remainder = "am" + else: + remainder = "pm" + used += 1 + elif wordNext in thises and wordNextNext == "manha": + remainder = "am" + used = 2 + elif wordNext in thises and wordNextNext == "tarde": + remainder = "pm" + used = 2 + elif wordNext in thises and wordNextNext == "noite": + remainder = "pm" + used = 2 + else: + if timeQualifier != "": + military = True + if strHH <= 12 and \ + (timeQualifier == "manha" or + timeQualifier == "tarde"): + strHH += 12 + + else: + # try to parse # s without colons + # 5 hours, 10 minutes etc. + length = len(word) + strNum = "" + remainder = "" + for i in range(length): + if word[i].isdigit(): + strNum += word[i] + else: + remainder += word[i] + + if remainder == "": + remainder = wordNext.replace(".", "").lstrip().rstrip() + + if ( + remainder == "pm" or + wordNext == "pm" or + remainder == "p.m." or + wordNext == "p.m."): + strHH = strNum + remainder = "pm" + used = 1 + elif ( + remainder == "am" or + wordNext == "am" or + remainder == "a.m." or + wordNext == "a.m."): + strHH = strNum + remainder = "am" + used = 1 + else: + if (wordNext == "pm" or + wordNext == "p.m." or + wordNext == "tarde"): + strHH = strNum + remainder = "pm" + used = 1 + elif (wordNext == "am" or + wordNext == "a.m." 
or + wordNext == "manha"): + strHH = strNum + remainder = "am" + used = 1 + elif (int(word) > 100 and + ( + wordPrev == "o" or + wordPrev == "oh" or + wordPrev == "zero" + )): + # 0800 hours (pronounced oh-eight-hundred) + strHH = int(word) / 100 + strMM = int(word) - strHH * 100 + military = True + if wordNext == "hora": + used += 1 + elif ( + wordNext == "hora" and + word[0] != '0' and + ( + int(word) < 100 and + int(word) > 2400 + )): + # ignores military time + # "in 3 hours" + hrOffset = int(word) + used = 2 + isTime = False + hrAbs = -1 + minAbs = -1 + + elif wordNext == "minuto": + # "in 10 minutes" + minOffset = int(word) + used = 2 + isTime = False + hrAbs = -1 + minAbs = -1 + elif wordNext == "segundo": + # in 5 seconds + secOffset = int(word) + used = 2 + isTime = False + hrAbs = -1 + minAbs = -1 + elif int(word) > 100: + strHH = int(word) / 100 + strMM = int(word) - strHH * 100 + military = True + if wordNext == "hora": + used += 1 + + elif wordNext == "" or ( + wordNext == "em" and wordNextNext == "ponto"): + strHH = word + strMM = 00 + if wordNext == "em" and wordNextNext == "ponto": + used += 2 + if wordNextNextNext == "tarde": + remainder = "pm" + used += 1 + elif wordNextNextNext == "manha": + remainder = "am" + used += 1 + elif wordNextNextNext == "noite": + if 0 > strHH > 6: + remainder = "am" + else: + remainder = "pm" + used += 1 + + elif wordNext[0].isdigit(): + strHH = word + strMM = wordNext + military = True + used += 1 + if wordNextNext == "hora": + used += 1 + else: + isTime = False + + strHH = int(strHH) if strHH else 0 + strMM = int(strMM) if strMM else 0 + strHH = strHH + 12 if (remainder == "pm" and + 0 < strHH < 12) else strHH + strHH = strHH - 12 if (remainder == "am" and + 0 < strHH >= 12) else strHH + if strHH > 24 or strMM > 59: + isTime = False + used = 0 + if isTime: + hrAbs = strHH * 1 + minAbs = strMM * 1 + used += 1 + + if used > 0: + # removed parsed words from the sentence + for i in range(used): + words[idx + i] = "" + + 
if wordPrev == "em" or wordPrev == "ponto": + words[words.index(wordPrev)] = "" + + if idx > 0 and wordPrev in time_indicators: + words[idx - 1] = "" + if idx > 1 and wordPrevPrev in time_indicators: + words[idx - 2] = "" + + idx += used - 1 + found = True + + # check that we found a date + if not date_found: + return None + + if dayOffset is False: + dayOffset = 0 + + # perform date manipulation + + extractedDate = dateNow + extractedDate = extractedDate.replace(microsecond=0, + second=0, + minute=0, + hour=0) + if datestr != "": + en_months = ['january', 'february', 'march', 'april', 'may', 'june', + 'july', 'august', 'september', 'october', 'november', + 'december'] + en_monthsShort = ['jan', 'feb', 'mar', 'apr', 'may', 'june', 'july', + 'aug', + 'sept', 'oct', 'nov', 'dec'] + for idx, en_month in enumerate(en_months): + datestr = datestr.replace(months[idx], en_month) + for idx, en_month in enumerate(en_monthsShort): + datestr = datestr.replace(monthsShort[idx], en_month) + + temp = datetime.strptime(datestr, "%B %d") + if not hasYear: + temp = temp.replace(year=extractedDate.year) + if extractedDate < temp: + extractedDate = extractedDate.replace(year=int(currentYear), + month=int( + temp.strftime( + "%m")), + day=int(temp.strftime( + "%d"))) + else: + extractedDate = extractedDate.replace( + year=int(currentYear) + 1, + month=int(temp.strftime("%m")), + day=int(temp.strftime("%d"))) + else: + extractedDate = extractedDate.replace( + year=int(temp.strftime("%Y")), + month=int(temp.strftime("%m")), + day=int(temp.strftime("%d"))) + + if timeStr != "": + temp = datetime(timeStr) + extractedDate = extractedDate.replace(hour=temp.strftime("%H"), + minute=temp.strftime("%M"), + second=temp.strftime("%S")) + + if yearOffset != 0: + extractedDate = extractedDate + relativedelta(years=yearOffset) + if monthOffset != 0: + extractedDate = extractedDate + relativedelta(months=monthOffset) + if dayOffset != 0: + extractedDate = extractedDate + 
relativedelta(days=dayOffset) + if hrAbs != -1 and minAbs != -1: + + extractedDate = extractedDate + relativedelta(hours=hrAbs, + minutes=minAbs) + if (hrAbs != 0 or minAbs != 0) and datestr == "": + if not daySpecified and dateNow > extractedDate: + extractedDate = extractedDate + relativedelta(days=1) + if hrOffset != 0: + extractedDate = extractedDate + relativedelta(hours=hrOffset) + if minOffset != 0: + extractedDate = extractedDate + relativedelta(minutes=minOffset) + if secOffset != 0: + extractedDate = extractedDate + relativedelta(seconds=secOffset) + + resultStr = " ".join(words) + resultStr = ' '.join(resultStr.split()) + resultStr = pt_pruning(resultStr) + return [extractedDate, resultStr] + + +def pt_pruning(text, symbols=True, accents=True, agressive=True): + # agressive pt word pruning + words = ["a", "o", "os", "as", "de", "dos", "das", + "lhe", "lhes", "me", "e", "no", "nas", "na", "nos", "em", "para", + "este", + "esta", "deste", "desta", "neste", "nesta", "nesse", + "nessa", "foi", "que"] + if symbols: + symbols = [".", ",", ";", ":", "!", "?", u"�", u"�"] + for symbol in symbols: + text = text.replace(symbol, "") + text = text.replace("-", " ").replace("_", " ") + if accents: + accents = {"a": [u"á", u"à", u"ã", u"â"], + "e": [u"ê", u"è", u"é"], + "i": [u"í", u"ì"], + "o": [u"ò", u"ó"], + "u": [u"ú", u"ù"], + "c": [u"ç"]} + for char in accents: + for acc in accents[char]: + text = text.replace(acc, char) + if agressive: + text_words = text.split(" ") + for idx, word in enumerate(text_words): + if word in words: + text_words[idx] = "" + text = " ".join(text_words) + text = ' '.join(text.split()) + return text + + +def get_gender_pt(word, raw_string=""): + word = word.rstrip("s") + gender = False + words = raw_string.split(" ") + for idx, w in enumerate(words): + if w == word and idx != 0: + previous = words[idx - 1] + gender = get_gender_pt(previous) + break + if not gender: + if word[-1] == "a": + gender = "f" + if word[-1] == "o" or word[-1] == 
"e": + gender = "m" + return gender diff --git a/mycroft/util/parse.py b/mycroft/util/parse.py index fae84a78b0..752f44d71b 100644 --- a/mycroft/util/parse.py +++ b/mycroft/util/parse.py @@ -1,4 +1,4 @@ -# -*- coding: iso-8859-15 -*- +# -*- coding: utf-8 -*- # # Copyright 2017 Mycroft AI Inc. # @@ -14,10 +14,13 @@ # See the License for the specific language governing permissions and # limitations under the License. # -from datetime import datetime, timedelta -from dateutil.relativedelta import relativedelta from difflib import SequenceMatcher +from mycroft.util.lang.parse_en import * +from mycroft.util.lang.parse_pt import * +from mycroft.util.lang.parse_es import * +from mycroft.util.lang.parse_common import * + def fuzzy_match(x, against): """Perform a 'fuzzy' comparison between two strings. @@ -101,799 +104,6 @@ def extract_datetime(text, anchorDate=None, lang="en-us"): return extract_datetime_pt(text, anchorDate) return text - - -def is_numeric(input_str): - """ - Takes in a string and tests to see if it is a number. - Args: - text (str): string to test if a number - Returns: - (bool): True if a number, else False - - """ - - try: - float(input_str) - return True - except ValueError: - return False - - -def extractnumber_en(text): - """ - This function prepares the given text for parsing by making - numbers consistent, getting rid of contractions, etc. 
- Args: - text (str): the string to normalize - Returns: - (int) or (float): The value of extracted number - - """ - aWords = text.split() - aWords = [word for word in aWords if word not in ["the", "a", "an"]] - andPass = False - valPreAnd = False - val = False - count = 0 - while count < len(aWords): - word = aWords[count] - if is_numeric(word): - # if word.isdigit(): # doesn't work with decimals - val = float(word) - elif word == "first": - val = 1 - elif word == "second": - val = 2 - elif isFractional_en(word): - val = isFractional_en(word) - else: - if word == "one": - val = 1 - elif word == "two": - val = 2 - elif word == "three": - val = 3 - elif word == "four": - val = 4 - elif word == "five": - val = 5 - elif word == "six": - val = 6 - elif word == "seven": - val = 7 - elif word == "eight": - val = 8 - elif word == "nine": - val = 9 - elif word == "ten": - val = 10 - if val: - if count < (len(aWords) - 1): - wordNext = aWords[count + 1] - else: - wordNext = "" - valNext = isFractional_en(wordNext) - - if valNext: - val = val * valNext - aWords[count + 1] = "" - - # if val == False: - if not val: - # look for fractions like "2/3" - aPieces = word.split('/') - # if (len(aPieces) == 2 and is_numeric(aPieces[0]) - # and is_numeric(aPieces[1])): - if look_for_fractions(aPieces): - val = float(aPieces[0]) / float(aPieces[1]) - elif andPass: - # added to value, quit here - val = valPreAnd - break - else: - count += 1 - continue - - aWords[count] = "" - - if (andPass): - aWords[count - 1] = '' # remove "and" - val += valPreAnd - elif count + 1 < len(aWords) and aWords[count + 1] == 'and': - andPass = True - valPreAnd = val - val = False - count += 2 - continue - elif count + 2 < len(aWords) and aWords[count + 2] == 'and': - andPass = True - valPreAnd = val - val = False - count += 3 - continue - - break - - # if val == False: - if not val: - return False - - # Return the $str with the number related words removed - # (now empty strings, so strlen == 0) - aWords = 
[word for word in aWords if len(word) > 0] - text = ' '.join(aWords) - - return val - - -def extract_datetime_en(str, currentDate=None): - def clean_string(str): - # cleans the input string of unneeded punctuation and capitalization - # among other things - str = str.lower().replace('?', '').replace('.', '').replace(',', '') \ - .replace(' the ', ' ').replace(' a ', ' ').replace(' an ', ' ') - wordList = str.split() - for idx, word in enumerate(wordList): - word = word.replace("'s", "") - - ordinals = ["rd", "st", "nd", "th"] - if word[0].isdigit(): - for ord in ordinals: - if ord in word: - word = word.replace(ord, "") - wordList[idx] = word - - return wordList - - def date_found(): - return found or \ - ( - datestr != "" or timeStr != "" or - yearOffset != 0 or monthOffset != 0 or - dayOffset is True or hrOffset != 0 or - hrAbs != 0 or minOffset != 0 or - minAbs != 0 or secOffset != 0 - ) - - if str == "": - return None - if currentDate is None: - currentDate = datetime.now() - - found = False - daySpecified = False - dayOffset = False - monthOffset = 0 - yearOffset = 0 - dateNow = currentDate - today = dateNow.strftime("%w") - currentYear = dateNow.strftime("%Y") - fromFlag = False - datestr = "" - hasYear = False - timeQualifier = "" - - timeQualifiersList = ['morning', 'afternoon', 'evening'] - markers = ['at', 'in', 'on', 'by', 'this', 'around', 'for', 'of'] - days = ['monday', 'tuesday', 'wednesday', - 'thursday', 'friday', 'saturday', 'sunday'] - months = ['january', 'february', 'march', 'april', 'may', 'june', - 'july', 'august', 'september', 'october', 'november', - 'december'] - monthsShort = ['jan', 'feb', 'mar', 'apr', 'may', 'june', 'july', 'aug', - 'sept', 'oct', 'nov', 'dec'] - - words = clean_string(str) - - for idx, word in enumerate(words): - if word == "": - continue - wordPrevPrev = words[idx - 2] if idx > 1 else "" - wordPrev = words[idx - 1] if idx > 0 else "" - wordNext = words[idx + 1] if idx + 1 < len(words) else "" - wordNextNext = 
words[idx + 2] if idx + 2 < len(words) else "" - - # this isn't in clean string because I don't want to save back to words - word = word.rstrip('s') - start = idx - used = 0 - # save timequalifier for later - if word in timeQualifiersList: - timeQualifier = word - # parse today, tomorrow, day after tomorrow - elif word == "today" and not fromFlag: - dayOffset = 0 - used += 1 - elif word == "tomorrow" and not fromFlag: - dayOffset = 1 - used += 1 - elif (word == "day" and - wordNext == "after" and - wordNextNext == "tomorrow" and - not fromFlag and - not wordPrev[0].isdigit()): - dayOffset = 2 - used = 3 - if wordPrev == "the": - start -= 1 - used += 1 - # parse 5 days, 10 weeks, last week, next week - elif word == "day": - if wordPrev[0].isdigit(): - dayOffset += int(wordPrev) - start -= 1 - used = 2 - elif word == "week" and not fromFlag: - if wordPrev[0].isdigit(): - dayOffset += int(wordPrev) * 7 - start -= 1 - used = 2 - elif wordPrev == "next": - dayOffset = 7 - start -= 1 - used = 2 - elif wordPrev == "last": - dayOffset = -7 - start -= 1 - used = 2 - # parse 10 months, next month, last month - elif word == "month" and not fromFlag: - if wordPrev[0].isdigit(): - monthOffset = int(wordPrev) - start -= 1 - used = 2 - elif wordPrev == "next": - monthOffset = 1 - start -= 1 - used = 2 - elif wordPrev == "last": - monthOffset = -1 - start -= 1 - used = 2 - # parse 5 years, next year, last year - elif word == "year" and not fromFlag: - if wordPrev[0].isdigit(): - yearOffset = int(wordPrev) - start -= 1 - used = 2 - elif wordPrev == "next": - yearOffset = 1 - start -= 1 - used = 2 - elif wordPrev == "last": - yearOffset = -1 - start -= 1 - used = 2 - # parse Monday, Tuesday, etc., and next Monday, - # last Tuesday, etc. 
- elif word in days and not fromFlag: - d = days.index(word) - dayOffset = (d + 1) - int(today) - used = 1 - if dayOffset < 0: - dayOffset += 7 - if wordPrev == "next": - dayOffset += 7 - used += 1 - start -= 1 - elif wordPrev == "last": - dayOffset -= 7 - used += 1 - start -= 1 - # parse 15 of July, June 20th, Feb 18, 19 of February - elif word in months or word in monthsShort and not fromFlag: - try: - m = months.index(word) - except ValueError: - m = monthsShort.index(word) - used += 1 - datestr = months[m] - if wordPrev and (wordPrev[0].isdigit() or - (wordPrev == "of" and wordPrevPrev[0].isdigit())): - if wordPrev == "of" and wordPrevPrev[0].isdigit(): - datestr += " " + words[idx - 2] - used += 1 - start -= 1 - else: - datestr += " " + wordPrev - start -= 1 - used += 1 - if wordNext and wordNext[0].isdigit(): - datestr += " " + wordNext - used += 1 - hasYear = True - else: - hasYear = False - - elif wordNext and wordNext[0].isdigit(): - datestr += " " + wordNext - used += 1 - if wordNextNext and wordNextNext[0].isdigit(): - datestr += " " + wordNextNext - used += 1 - hasYear = True - else: - hasYear = False - # parse 5 days from tomorrow, 10 weeks from next thursday, - # 2 months from July - validFollowups = days + months + monthsShort - validFollowups.append("today") - validFollowups.append("tomorrow") - validFollowups.append("next") - validFollowups.append("last") - validFollowups.append("now") - if (word == "from" or word == "after") and wordNext in validFollowups: - used = 2 - fromFlag = True - if wordNext == "tomorrow": - dayOffset += 1 - elif wordNext in days: - d = days.index(wordNext) - tmpOffset = (d + 1) - int(today) - used = 2 - if tmpOffset < 0: - tmpOffset += 7 - dayOffset += tmpOffset - elif wordNextNext and wordNextNext in days: - d = days.index(wordNextNext) - tmpOffset = (d + 1) - int(today) - used = 3 - if wordNext == "next": - tmpOffset += 7 - used += 1 - start -= 1 - elif wordNext == "last": - tmpOffset -= 7 - used += 1 - start -= 1 - 
dayOffset += tmpOffset - if used > 0: - if start - 1 > 0 and words[start - 1] == "this": - start -= 1 - used += 1 - - for i in range(0, used): - words[i + start] = "" - - if (start - 1 >= 0 and words[start - 1] in markers): - words[start - 1] = "" - found = True - daySpecified = True - - # parse time - timeStr = "" - hrOffset = 0 - minOffset = 0 - secOffset = 0 - hrAbs = 0 - minAbs = 0 - military = False - - for idx, word in enumerate(words): - if word == "": - continue - - wordPrevPrev = words[idx - 2] if idx > 1 else "" - wordPrev = words[idx - 1] if idx > 0 else "" - wordNext = words[idx + 1] if idx + 1 < len(words) else "" - wordNextNext = words[idx + 2] if idx + 2 < len(words) else "" - # parse noon, midnight, morning, afternoon, evening - used = 0 - if word == "noon": - hrAbs = 12 - used += 1 - elif word == "midnight": - hrAbs = 0 - used += 1 - elif word == "morning": - if hrAbs == 0: - hrAbs = 8 - used += 1 - elif word == "afternoon": - if hrAbs == 0: - hrAbs = 15 - used += 1 - elif word == "evening": - if hrAbs == 0: - hrAbs = 19 - used += 1 - # parse half an hour, quarter hour - elif word == "hour" and \ - (wordPrev in markers or wordPrevPrev in markers): - if wordPrev == "half": - minOffset = 30 - elif wordPrev == "quarter": - minOffset = 15 - elif wordPrevPrev == "quarter": - minOffset = 15 - if idx > 2 and words[idx - 3] in markers: - words[idx - 3] = "" - words[idx - 2] = "" - else: - hrOffset = 1 - if wordPrevPrev in markers: - words[idx - 2] = "" - words[idx - 1] = "" - used += 1 - hrAbs = -1 - minAbs = -1 - # parse 5:00 am, 12:00 p.m., etc - elif word[0].isdigit(): - isTime = True - strHH = "" - strMM = "" - remainder = "" - if ':' in word: - # parse colons - # "3:00 in the morning" - stage = 0 - length = len(word) - for i in range(length): - if stage == 0: - if word[i].isdigit(): - strHH += word[i] - elif word[i] == ":": - stage = 1 - else: - stage = 2 - i -= 1 - elif stage == 1: - if word[i].isdigit(): - strMM += word[i] - else: - stage = 2 - i -= 
1 - elif stage == 2: - remainder = word[i:].replace(".", "") - break - if remainder == "": - nextWord = wordNext.replace(".", "") - if nextWord == "am" or nextWord == "pm": - remainder = nextWord - used += 1 - elif nextWord == "tonight": - remainder = "pm" - used += 1 - elif wordNext == "in" and wordNextNext == "the" and \ - words[idx + 3] == "morning": - reaminder = "am" - used += 3 - elif wordNext == "in" and wordNextNext == "the" and \ - words[idx + 3] == "afternoon": - remainder = "pm" - used += 3 - elif wordNext == "in" and wordNextNext == "the" and \ - words[idx + 3] == "evening": - remainder = "pm" - used += 3 - elif wordNext == "in" and wordNextNext == "morning": - remainder = "am" - used += 2 - elif wordNext == "in" and wordNextNext == "afternoon": - remainder = "pm" - used += 2 - elif wordNext == "in" and wordNextNext == "evening": - remainder = "pm" - used += 2 - elif wordNext == "this" and wordNextNext == "morning": - remainder = "am" - used = 2 - elif wordNext == "this" and wordNextNext == "afternoon": - remainder = "pm" - used = 2 - elif wordNext == "this" and wordNextNext == "evening": - remainder = "pm" - used = 2 - elif wordNext == "at" and wordNextNext == "night": - if strHH > 5: - remainder = "pm" - else: - remainder = "am" - used += 2 - else: - if timeQualifier != "": - military = True - if strHH <= 12 and \ - (timeQualifier == "evening" or - timeQualifier == "afternoon"): - strHH += 12 - else: - # try to parse # s without colons - # 5 hours, 10 minutes etc. - length = len(word) - strNum = "" - remainder = "" - for i in range(length): - if word[i].isdigit(): - strNum += word[i] - else: - remainder += word[i] - - if remainder == "": - remainder = wordNext.replace(".", "").lstrip().rstrip() - - if ( - remainder == "pm" or - wordNext == "pm" or - remainder == "p.m." or - wordNext == "p.m."): - strHH = strNum - remainder = "pm" - used = 1 - elif ( - remainder == "am" or - wordNext == "am" or - remainder == "a.m." 
or - wordNext == "a.m."): - strHH = strNum - remainder = "am" - used = 1 - else: - if wordNext == "pm" or wordNext == "p.m.": - strHH = strNum - reaminder = "pm" - used = 1 - elif wordNext == "am" or wordNext == "a.m.": - strHH = strNum - remainder = "am" - used = 1 - elif ( - int(word) > 100 and - ( - wordPrev == "o" or - wordPrev == "oh" - )): - # 0800 hours (pronounced oh-eight-hundred) - strHH = int(word) / 100 - strMM = int(word) - strHH * 100 - military = True - if wordNext == "hours": - used += 1 - elif ( - wordNext == "hours" and - word[0] != '0' and - ( - int(word) < 100 and - int(word) > 2400 - )): - # ignores military time - # "in 3 hours" - hrOffset = int(word) - used = 2 - isTime = False - hrAbs = -1 - minAbs = -1 - - elif wordNext == "minutes": - # "in 10 minutes" - minOffset = int(word) - used = 2 - isTime = False - hrAbs = -1 - minAbs = -1 - elif wordNext == "seconds": - # in 5 seconds - secOffset = int(word) - used = 2 - isTime = False - hrAbs = -1 - minAbs = -1 - elif int(word) > 100: - strHH = int(word) / 100 - strMM = int(word) - strHH * 100 - military = True - if wordNext == "hours": - used += 1 - elif wordNext[0].isdigit(): - strHH = word - strMM = wordNext - military = True - used += 1 - if wordNextNext == "hours": - used += 1 - elif ( - wordNext == "" or wordNext == "o'clock" or - ( - wordNext == "in" and - ( - wordNextNext == "the" or - wordNextNext == timeQualifier - ) - )): - strHH = word - strMM = 00 - if wordNext == "o'clock": - used += 1 - if wordNext == "in" or wordNextNext == "in": - used += (1 if wordNext == "in" else 2) - if (wordNextNext and - wordNextNext in timeQualifier or - (words[words.index(wordNextNext) + 1] and - words[words.index(wordNextNext) + 1] in - timeQualifier)): - if (wordNextNext == "afternoon" or - (len(words) > - words.index(wordNextNext) + 1 and - words[words.index( - wordNextNext) + 1] == "afternoon")): - remainder = "pm" - if (wordNextNext == "evening" or - (len(words) > - (words.index(wordNextNext) + 1) and 
- words[words.index( - wordNextNext) + 1] == "evening")): - remainder = "pm" - if (wordNextNext == "morning" or - (len(words) > - words.index(wordNextNext) + 1 and - words[words.index( - wordNextNext) + 1] == "morning")): - remainder = "am" - if timeQualifier != "": - military = True - else: - isTime = False - - strHH = int(strHH) if strHH else 0 - strMM = int(strMM) if strMM else 0 - strHH = strHH + 12 if remainder == "pm" and strHH < 12 else strHH - strHH = strHH - 12 if remainder == "am" and strHH >= 12 else strHH - if strHH > 24 or strMM > 59: - isTime = False - used = 0 - if isTime: - hrAbs = strHH * 1 - minAbs = strMM * 1 - used += 1 - if used > 0: - # removed parsed words from the sentence - for i in range(used): - words[idx + i] = "" - - if wordPrev == "o" or wordPrev == "oh": - words[words.index(wordPrev)] = "" - - if wordPrev == "early": - hrOffset = -1 - words[idx - 1] = "" - idx -= 1 - elif wordPrev == "late": - hrOffset = 1 - words[idx - 1] = "" - idx -= 1 - if idx > 0 and wordPrev in markers: - words[idx - 1] = "" - if idx > 1 and wordPrevPrev in markers: - words[idx - 2] = "" - - idx += used - 1 - found = True - - # check that we found a date - if not date_found: - return None - - if dayOffset is False: - dayOffset = 0 - - # perform date manipulation - - extractedDate = dateNow - extractedDate = extractedDate.replace(microsecond=0, - second=0, - minute=0, - hour=0) - if datestr != "": - temp = datetime.strptime(datestr, "%B %d") - if not hasYear: - temp = temp.replace(year=extractedDate.year) - if extractedDate < temp: - extractedDate = extractedDate.replace(year=int(currentYear), - month=int( - temp.strftime( - "%m")), - day=int(temp.strftime( - "%d"))) - else: - extractedDate = extractedDate.replace( - year=int(currentYear) + 1, - month=int(temp.strftime("%m")), - day=int(temp.strftime("%d"))) - else: - extractedDate = extractedDate.replace( - year=int(temp.strftime("%Y")), - month=int(temp.strftime("%m")), - day=int(temp.strftime("%d"))) - - if 
timeStr != "": - temp = datetime(timeStr) - extractedDate = extractedDate.replace(hour=temp.strftime("%H"), - minute=temp.strftime("%M"), - second=temp.strftime("%S")) - - if yearOffset != 0: - extractedDate = extractedDate + relativedelta(years=yearOffset) - if monthOffset != 0: - extractedDate = extractedDate + relativedelta(months=monthOffset) - if dayOffset != 0: - extractedDate = extractedDate + relativedelta(days=dayOffset) - if hrAbs != -1 and minAbs != -1: - - extractedDate = extractedDate + relativedelta(hours=hrAbs, - minutes=minAbs) - if (hrAbs != 0 or minAbs != 0) and datestr == "": - if not daySpecified and dateNow > extractedDate: - extractedDate = extractedDate + relativedelta(days=1) - if hrOffset != 0: - extractedDate = extractedDate + relativedelta(hours=hrOffset) - if minOffset != 0: - extractedDate = extractedDate + relativedelta(minutes=minOffset) - if secOffset != 0: - extractedDate = extractedDate + relativedelta(seconds=secOffset) - for idx, word in enumerate(words): - if words[idx] == "and" and words[idx - 1] == "" and words[ - idx + 1] == "": - words[idx] = "" - - resultStr = " ".join(words) - resultStr = ' '.join(resultStr.split()) - return [extractedDate, resultStr] - - -def look_for_fractions(split_list): - """" - This function takes a list made by fraction & determines if a fraction. - - Args: - split_list (list): list created by splitting on '/' - Returns: - (bool): False if not a fraction, otherwise True - - """ - - if len(split_list) == 2: - if is_numeric(split_list[0]) and is_numeric(split_list[1]): - return True - - return False - - -def isFractional_en(input_str): - """ - This function takes the given text and checks if it is a fraction. - - Args: - text (str): the string to check if fractional - Returns: - (bool) or (float): False if not a fraction, otherwise the fraction - - """ - if input_str.endswith('s', -1): - input_str = input_str[:len(input_str) - 1] # e.g. 
"fifths" - - aFrac = ["whole", "half", "third", "fourth", "fifth", "sixth", - "seventh", "eighth", "ninth", "tenth", "eleventh", "twelfth"] - - if input_str.lower() in aFrac: - return 1.0 / (aFrac.index(input_str) + 1) - if input_str == "quarter": - return 1.0 / 4 - - return False - - -def get_gender(word, input_string="", lang="en-us"): - ''' - guess gender of word, optionally use raw input text for context - returns "m" if the word is male, "f" if female, False if unknown - ''' - if "pt" in lang or "es" in lang: - # spanish follows same rules - return get_gender_pt(word, input_string) - return False - - # ============================================================== @@ -921,1431 +131,12 @@ def normalize(text, lang="en-us", remove_articles=True): return text -def normalize_en(text, remove_articles): - """ English string normalization """ - - words = text.split() # this also removed extra spaces - normalized = "" - for word in words: - if remove_articles and word in ["the", "a", "an"]: - continue - - # Expand common contractions, e.g. 
"isn't" -> "is not" - contraction = ["ain't", "aren't", "can't", "could've", "couldn't", - "didn't", "doesn't", "don't", "gonna", "gotta", - "hadn't", "hasn't", "haven't", "he'd", "he'll", "he's", - "how'd", "how'll", "how's", "I'd", "I'll", "I'm", - "I've", "isn't", "it'd", "it'll", "it's", "mightn't", - "might've", "mustn't", "must've", "needn't", - "oughtn't", - "shan't", "she'd", "she'll", "she's", "shouldn't", - "should've", "somebody's", "someone'd", "someone'll", - "someone's", "that'll", "that's", "that'd", "there'd", - "there're", "there's", "they'd", "they'll", "they're", - "they've", "wasn't", "we'd", "we'll", "we're", "we've", - "weren't", "what'd", "what'll", "what're", "what's", - "whats", # technically incorrect but some STT outputs - "what've", "when's", "when'd", "where'd", "where's", - "where've", "who'd", "who'd've", "who'll", "who're", - "who's", "who've", "why'd", "why're", "why's", "won't", - "won't've", "would've", "wouldn't", "wouldn't've", - "y'all", "ya'll", "you'd", "you'd've", "you'll", - "y'aint", "y'ain't", "you're", "you've"] - if word in contraction: - expansion = ["is not", "are not", "can not", "could have", - "could not", "did not", "does not", "do not", - "going to", "got to", "had not", "has not", - "have not", "he would", "he will", "he is", - "how did", - "how will", "how is", "I would", "I will", "I am", - "I have", "is not", "it would", "it will", "it is", - "might not", "might have", "must not", "must have", - "need not", "ought not", "shall not", "she would", - "she will", "she is", "should not", "should have", - "somebody is", "someone would", "someone will", - "someone is", "that will", "that is", "that would", - "there would", "there are", "there is", "they would", - "they will", "they are", "they have", "was not", - "we would", "we will", "we are", "we have", - "were not", "what did", "what will", "what are", - "what is", - "what is", "what have", "when is", "when did", - "where did", "where is", "where have", "who 
would", - "who would have", "who will", "who are", "who is", - "who have", "why did", "why are", "why is", - "will not", "will not have", "would have", - "would not", "would not have", "you all", "you all", - "you would", "you would have", "you will", - "you are not", "you are not", "you are", "you have"] - word = expansion[contraction.index(word)] - - # Convert numbers into digits, e.g. "two" -> "2" - textNumbers = ["zero", "one", "two", "three", "four", "five", "six", - "seven", "eight", "nine", "ten", "eleven", "twelve", - "thirteen", "fourteen", "fifteen", "sixteen", - "seventeen", "eighteen", "nineteen", "twenty"] - if word in textNumbers: - word = str(textNumbers.index(word)) - - normalized += " " + word - - return normalized[1:] # strip the initial space - - -#################################################################### -# PT-PT -# -# TODO: numbers greater than 999999 -# TODO: date time pt -#################################################################### - -# Undefined articles ["um", "uma", "uns", "umas"] can not be supressed, -# in PT, "um cavalo" means "a horse" or "one horse". 
-pt_articles = ["o", "a", "os", "as"] - -pt_numbers = { - "zero": 0, - "um": 1, - "uma": 1, - "uns": 1, - "umas": 1, - "primeiro": 1, - "segundo": 2, - "terceiro": 3, - "dois": 2, - "duas": 2, - "tres": 3, - u"tr�s": 3, - "quatro": 4, - "cinco": 5, - "seis": 6, - "sete": 7, - "oito": 8, - "nove": 9, - "dez": 10, - "onze": 11, - "doze": 12, - "treze": 13, - "catorze": 14, - "quinze": 15, - "dezasseis": 16, - "dezassete": 17, - "dezoito": 18, - "dezanove": 19, - "vinte": 20, - "trinta": 30, - "quarenta": 40, - "cinquenta": 50, - "sessenta": 60, - "setenta": 70, - "oitenta": 80, - "noventa": 90, - "cem": 100, - "cento": 100, - "duzentos": 200, - "duzentas": 200, - "trezentos": 300, - "trezentas": 300, - "quatrocentos": 400, - "quatrocentas": 400, - "quinhentos": 500, - "quinhentas": 500, - "seiscentos": 600, - "seiscentas": 600, - "setecentos": 700, - "setecentas": 700, - "oitocentos": 800, - "oitocentas": 800, - "novecentos": 900, - "novecentas": 900, - "mil": 1000, - u"milh�o": 1000000} - - -def isFractional_pt(input_str): - """ - This function takes the given text and checks if it is a fraction. - - Args: - text (str): the string to check if fractional - Returns: - (bool) or (float): False if not a fraction, otherwise the fraction - - """ - if input_str.endswith('s', -1): - input_str = input_str[:len(input_str) - 1] # e.g. 
"fifths" - - aFrac = ["meio", u"terço", "quarto", "quinto", "sexto", - "setimo", "oitavo", "nono", u"décimo"] - - if input_str.lower() in aFrac: - return 1.0 / (aFrac.index(input_str) + 2) - if input_str == u"vigésimo": - return 1.0 / 20 - if input_str == u"trigésimo": - return 1.0 / 30 - if input_str == u"centésimo": - return 1.0 / 100 - if input_str == u"milésimo": - return 1.0 / 1000 - if (input_str == u"sétimo" or input_str == "septimo" or - input_str == u"séptimo"): - return 1.0 / 7 - +def get_gender(word, input_string="", lang="en-us"): + ''' + guess gender of word, optionally use raw input text for context + returns "m" if the word is male, "f" if female, False if unknown + ''' + if "pt" in lang or "es" in lang: + # spanish follows same rules + return get_gender_pt(word, input_string) return False - - -def extractnumber_pt(text): - """ - This function prepares the given text for parsing by making - numbers consistent, getting rid of contractions, etc. - Args: - text (str): the string to normalize - Returns: - (int) or (float): The value of extracted number - - """ - aWords = text.split() - count = 0 - result = None - while count < len(aWords): - val = 0 - word = aWords[count] - next_next_word = None - if count + 1 < len(aWords): - next_word = aWords[count + 1] - if count + 2 < len(aWords): - next_next_word = aWords[count + 2] - else: - next_word = None - - # is current word a number? 
- if word in pt_numbers: - val = pt_numbers[word] - elif word.isdigit(): # doesn't work with decimals - val = int(word) - elif is_numeric(word): - val = float(word) - elif isFractional_pt(word): - if not result: - result = 1 - result = result * isFractional_pt(word) - count += 1 - continue - - if not val: - # look for fractions like "2/3" - aPieces = word.split('/') - # if (len(aPieces) == 2 and is_numeric(aPieces[0]) - # and is_numeric(aPieces[1])): - if look_for_fractions(aPieces): - val = float(aPieces[0]) / float(aPieces[1]) - - if val: - if result is None: - result = 0 - # handle fractions - if next_word != "avos": - result += val - else: - result = float(result) / float(val) - - if next_word is None: - break - - # number word and fraction - ands = ["e"] - if next_word in ands: - zeros = 0 - if result is None: - count += 1 - continue - newWords = aWords[count + 2:] - newText = "" - for word in newWords: - newText += word + " " - - afterAndVal = extractnumber_pt(newText[:-1]) - if afterAndVal: - if result < afterAndVal or result < 20: - while afterAndVal > 1: - afterAndVal = afterAndVal / 10.0 - for word in newWords: - if word == "zero" or word == "0": - zeros += 1 - else: - break - for i in range(0, zeros): - afterAndVal = afterAndVal / 10.0 - result += afterAndVal - break - elif next_next_word is not None: - if next_next_word in ands: - newWords = aWords[count + 3:] - newText = "" - for word in newWords: - newText += word + " " - afterAndVal = extractnumber_pt(newText[:-1]) - if afterAndVal: - if result is None: - result = 0 - result += afterAndVal - break - - decimals = ["ponto", "virgula", u"v�rgula", ".", ","] - if next_word in decimals: - zeros = 0 - newWords = aWords[count + 2:] - newText = "" - for word in newWords: - newText += word + " " - for word in newWords: - if word == "zero" or word == "0": - zeros += 1 - else: - break - afterDotVal = str(extractnumber_pt(newText[:-1])) - afterDotVal = zeros * "0" + afterDotVal - result = float(str(result) + "." 
+ afterDotVal) - break - count += 1 - - if result is None: - return False - - # Return the $str with the number related words removed - # (now empty strings, so strlen == 0) - # aWords = [word for word in aWords if len(word) > 0] - # text = ' '.join(aWords) - if "." in str(result): - integer, dec = str(result).split(".") - # cast float to int - if dec == "0": - result = int(integer) - - return result - - -def pt_number_parse(words, i): - def pt_cte(i, s): - if i < len(words) and s == words[i]: - return s, i + 1 - return None - - def pt_number_word(i, mi, ma): - if i < len(words): - v = pt_numbers.get(words[i]) - if v and v >= mi and v <= ma: - return v, i + 1 - return None - - def pt_number_1_99(i): - r1 = pt_number_word(i, 1, 29) - if r1: - return r1 - - r1 = pt_number_word(i, 30, 90) - if r1: - v1, i1 = r1 - r2 = pt_cte(i1, "e") - if r2: - v2, i2 = r2 - r3 = pt_number_word(i2, 1, 9) - if r3: - v3, i3 = r3 - return v1 + v3, i3 - return r1 - return None - - def pt_number_1_999(i): - # [2-9]cientos [1-99]? - r1 = pt_number_word(i, 100, 900) - if r1: - v1, i1 = r1 - r2 = pt_number_1_99(i1) - if r2: - v2, i2 = r2 - return v1 + v2, i2 - else: - return r1 - - # [1-99] - r1 = pt_number_1_99(i) - if r1: - return r1 - - return None - - def pt_number(i): - # check for cero - r1 = pt_number_word(i, 0, 0) - if r1: - return r1 - - # check for [1-999] (mil [0-999])? - r1 = pt_number_1_999(i) - if r1: - v1, i1 = r1 - r2 = pt_cte(i1, "mil") - if r2: - v2, i2 = r2 - r3 = pt_number_1_999(i2) - if r3: - v3, i3 = r3 - return v1 * 1000 + v3, i3 - else: - return v1 * 1000, i2 - else: - return r1 - return None - - return pt_number(i) - - -def normalize_pt(text, remove_articles): - """ PT string normalization """ - - words = text.split() # this also removed extra spaces - normalized = "" - # Contractions are not common in PT - - # Convert numbers into digits, e.g. 
"dois" -> "2" - normalized = "" - i = 0 - while i < len(words): - word = words[i] - # remove articles - if remove_articles and word in pt_articles: - i += 1 - continue - - # Convert numbers into digits - r = pt_number_parse(words, i) - if r: - v, i = r - normalized += " " + str(v) - continue - - # NOTE temporary , handle some numbers above >999 - if word in pt_numbers: - word = str(pt_numbers[word]) - # end temporary - - normalized += " " + word - i += 1 - # some articles in pt-pt can not be removed, but many words can - # this is experimental and some meaning may be lost - # maybe agressive should default to False - # only usage will tell, as a native speaker this seems reasonable - return pt_pruning(normalized[1:], agressive=remove_articles) - - -def extract_datetime_pt(input_str, currentDate=None): - def clean_string(str): - # cleans the input string of unneeded punctuation and capitalization - # among other things - symbols = [".", ",", ";", "?", "!", u"�", u"�"] - noise_words = ["o", "os", "a", "as", "do", "da", "dos", "das", "de", - "ao", "aos"] - - for word in symbols: - str = str.replace(word, "") - for word in noise_words: - str = str.replace(" " + word + " ", " ") - str = str.lower().replace( - u"á", - "a").replace( - u"ç", - "c").replace( - u"à", - "a").replace( - u"ã", - "a").replace( - u"é", - "e").replace( - u"è", - "e").replace( - u"ê", - "e").replace( - u"ó", - "o").replace( - u"ò", - "o").replace( - "-", - " ").replace( - "_", - "") - # handle synonims and equivalents, "tomorrow early = tomorrow morning - synonims = {"manha": ["manhazinha", "cedo", "cedinho"], - "tarde": ["tardinha", "tarde"], - "noite": ["noitinha", "anoitecer"], - "todos": ["ao", "aos"], - "em": ["do", "da", "dos", "das", "de"]} - for syn in synonims: - for word in synonims[syn]: - str = str.replace(" " + word + " ", " " + syn + " ") - # relevant plurals, cant just extract all s in pt - wordlist = ["manhas", "noites", "tardes", "dias", "semanas", "anos", - "minutos", "segundos", 
"nas", "nos", "proximas", - "seguintes", "horas"] - for idx, word in enumerate(wordlist): - str = str.replace(word, word.rstrip('s')) - str = str.replace("meses", "mes").replace("anteriores", "anterior") - return str - - def date_found(): - return found or \ - ( - datestr != "" or timeStr != "" or - yearOffset != 0 or monthOffset != 0 or - dayOffset is True or hrOffset != 0 or - hrAbs != 0 or minOffset != 0 or - minAbs != 0 or secOffset != 0 - ) - - if input_str == "": - return None - if currentDate is None: - currentDate = datetime.now() - - found = False - daySpecified = False - dayOffset = False - monthOffset = 0 - yearOffset = 0 - dateNow = currentDate - today = dateNow.strftime("%w") - currentYear = dateNow.strftime("%Y") - fromFlag = False - datestr = "" - hasYear = False - timeQualifier = "" - - words = clean_string(input_str).split(" ") - timeQualifiersList = ['manha', 'tarde', 'noite'] - time_indicators = ["em", "as", "nas", "pelas", "volta", "depois", "estas", - "no", "dia", "hora"] - days = ['segunda', 'terca', 'quarta', - 'quinta', 'sexta', 'sabado', 'domingo'] - months = ['janeiro', 'febreiro', 'marco', 'abril', 'maio', 'junho', - 'julho', 'agosto', 'setembro', 'outubro', 'novembro', - 'dezembro'] - monthsShort = ['jan', 'feb', 'mar', 'abr', 'mai', 'jun', 'jul', 'ag', - 'set', 'out', 'nov', 'dec'] - nexts = ["proximo", "proxima"] - suffix_nexts = ["seguinte", "subsequente", "seguir"] - lasts = ["ultimo", "ultima"] - suffix_lasts = ["passada", "passado", "anterior", "antes"] - nxts = ["depois", "seguir", "seguida", "seguinte", "proxima", "proximo"] - prevs = ["antes", "ante", "previa", "previamente", "anterior"] - froms = ["partir", "em", "para", "na", "no", "daqui", "seguir", - "depois", "por", "proxima", "proximo", "da", "do", "de"] - thises = ["este", "esta", "deste", "desta", "neste", "nesta", "nesse", - "nessa"] - froms += thises - lists = nxts + prevs + froms + time_indicators - for idx, word in enumerate(words): - if word == "": - continue - 
wordPrevPrevPrev = words[idx - 3] if idx > 2 else "" - wordPrevPrev = words[idx - 2] if idx > 1 else "" - wordPrev = words[idx - 1] if idx > 0 else "" - wordNext = words[idx + 1] if idx + 1 < len(words) else "" - wordNextNext = words[idx + 2] if idx + 2 < len(words) else "" - wordNextNextNext = words[idx + 3] if idx + 3 < len(words) else "" - - start = idx - used = 0 - # save timequalifier for later - if word in timeQualifiersList: - timeQualifier = word - - # parse today, tomorrow, yesterday - elif word == "hoje" and not fromFlag: - dayOffset = 0 - used += 1 - elif word == "amanha" and not fromFlag: - dayOffset = 1 - used += 1 - elif word == "ontem" and not fromFlag: - dayOffset -= 1 - used += 1 - # "before yesterday" and "before before yesterday" - elif (word == "anteontem" or - (word == "ante" and wordNext == "ontem")) and not fromFlag: - dayOffset -= 2 - used += 1 - if wordNext == "ontem": - used += 1 - elif word == "ante" and wordNext == "ante" and wordNextNext == \ - "ontem" and not fromFlag: - dayOffset -= 3 - used += 3 - elif word == "anteanteontem" and not fromFlag: - dayOffset -= 3 - used += 1 - # day after tomorrow - elif word == "depois" and wordNext == "amanha" and not fromFlag: - dayOffset += 2 - used = 2 - # day before yesterday - elif word == "antes" and wordNext == "ontem" and not fromFlag: - dayOffset -= 2 - used = 2 - # parse 5 days, 10 weeks, last week, next week, week after - elif word == "dia": - if wordNext == "depois" or wordNext == "antes": - used += 1 - if wordPrev and wordPrev[0].isdigit(): - dayOffset += int(wordPrev) - start -= 1 - used += 1 - elif (wordPrev and wordPrev[0].isdigit() and - wordNext not in months and - wordNext not in monthsShort): - dayOffset += int(wordPrev) - start -= 1 - used += 2 - elif wordNext and wordNext[0].isdigit() and wordNextNext not in \ - months and wordNextNext not in monthsShort: - dayOffset += int(wordNext) - start -= 1 - used += 2 - - elif word == "semana" and not fromFlag: - if wordPrev[0].isdigit(): 
- dayOffset += int(wordPrev) * 7 - start -= 1 - used = 2 - for w in nexts: - if wordPrev == w: - dayOffset = 7 - start -= 1 - used = 2 - for w in lasts: - if wordPrev == w: - dayOffset = -7 - start -= 1 - used = 2 - for w in suffix_nexts: - if wordNext == w: - dayOffset = 7 - start -= 1 - used = 2 - for w in suffix_lasts: - if wordNext == w: - dayOffset = -7 - start -= 1 - used = 2 - # parse 10 months, next month, last month - elif word == "mes" and not fromFlag: - if wordPrev[0].isdigit(): - monthOffset = int(wordPrev) - start -= 1 - used = 2 - for w in nexts: - if wordPrev == w: - monthOffset = 7 - start -= 1 - used = 2 - for w in lasts: - if wordPrev == w: - monthOffset = -7 - start -= 1 - used = 2 - for w in suffix_nexts: - if wordNext == w: - monthOffset = 7 - start -= 1 - used = 2 - for w in suffix_lasts: - if wordNext == w: - monthOffset = -7 - start -= 1 - used = 2 - # parse 5 years, next year, last year - elif word == "ano" and not fromFlag: - if wordPrev[0].isdigit(): - yearOffset = int(wordPrev) - start -= 1 - used = 2 - for w in nexts: - if wordPrev == w: - yearOffset = 7 - start -= 1 - used = 2 - for w in lasts: - if wordPrev == w: - yearOffset = -7 - start -= 1 - used = 2 - for w in suffix_nexts: - if wordNext == w: - yearOffset = 7 - start -= 1 - used = 2 - for w in suffix_lasts: - if wordNext == w: - yearOffset = -7 - start -= 1 - used = 2 - # parse Monday, Tuesday, etc., and next Monday, - # last Tuesday, etc. 
- elif word in days and not fromFlag: - - d = days.index(word) - dayOffset = (d + 1) - int(today) - used = 1 - if dayOffset < 0: - dayOffset += 7 - for w in nexts: - if wordPrev == w: - dayOffset += 7 - used += 1 - start -= 1 - for w in lasts: - if wordPrev == w: - dayOffset -= 7 - used += 1 - start -= 1 - for w in suffix_nexts: - if wordNext == w: - dayOffset += 7 - used += 1 - start -= 1 - for w in suffix_lasts: - if wordNext == w: - dayOffset -= 7 - used += 1 - start -= 1 - if wordNext == "feira": - used += 1 - # parse 15 of July, June 20th, Feb 18, 19 of February - elif word in months or word in monthsShort: - try: - m = months.index(word) - except ValueError: - m = monthsShort.index(word) - used += 1 - datestr = months[m] - if wordPrev and wordPrev[0].isdigit(): - # 13 maio - datestr += " " + wordPrev - start -= 1 - used += 1 - if wordNext and wordNext[0].isdigit(): - datestr += " " + wordNext - used += 1 - hasYear = True - else: - hasYear = False - - elif wordNext and wordNext[0].isdigit(): - # maio 13 - datestr += " " + wordNext - used += 1 - if wordNextNext and wordNextNext[0].isdigit(): - datestr += " " + wordNextNext - used += 1 - hasYear = True - else: - hasYear = False - - elif wordPrevPrev and wordPrevPrev[0].isdigit(): - # 13 dia maio - datestr += " " + wordPrevPrev - - start -= 2 - used += 2 - if wordNext and word[0].isdigit(): - datestr += " " + wordNext - used += 1 - hasYear = True - else: - hasYear = False - - elif wordNextNext and wordNextNext[0].isdigit(): - # maio dia 13 - datestr += " " + wordNextNext - used += 2 - if wordNextNextNext and wordNextNextNext[0].isdigit(): - datestr += " " + wordNextNextNext - used += 1 - hasYear = True - else: - hasYear = False - - if datestr in months: - datestr = "" - - # parse 5 days from tomorrow, 10 weeks from next thursday, - # 2 months from July - validFollowups = days + months + monthsShort - validFollowups.append("hoje") - validFollowups.append("amanha") - validFollowups.append("ontem") - 
validFollowups.append("anteontem") - validFollowups.append("agora") - validFollowups.append("ja") - validFollowups.append("ante") - - # TODO debug word "depois" that one is failing for some reason - if word in froms and wordNext in validFollowups: - - if not (wordNext == "amanha" and wordNext == "ontem") and not ( - word == "depois" or word == "antes" or word == "em"): - used = 2 - fromFlag = True - if wordNext == "amanha" and word != "depois": - dayOffset += 1 - elif wordNext == "ontem": - dayOffset -= 1 - elif wordNext == "anteontem": - dayOffset -= 2 - elif wordNext == "ante" and wordNextNext == "ontem": - dayOffset -= 2 - elif (wordNext == "ante" and wordNext == "ante" and - wordNextNextNext == "ontem"): - dayOffset -= 3 - elif wordNext in days: - d = days.index(wordNext) - tmpOffset = (d + 1) - int(today) - used = 2 - if wordNextNext == "feira": - used += 1 - if tmpOffset < 0: - tmpOffset += 7 - if wordNextNext: - if wordNextNext in nxts: - tmpOffset += 7 - used += 1 - elif wordNextNext in prevs: - tmpOffset -= 7 - used += 1 - dayOffset += tmpOffset - elif wordNextNext and wordNextNext in days: - d = days.index(wordNextNext) - tmpOffset = (d + 1) - int(today) - used = 3 - if wordNextNextNext: - if wordNextNextNext in nxts: - tmpOffset += 7 - used += 1 - elif wordNextNextNext in prevs: - tmpOffset -= 7 - used += 1 - dayOffset += tmpOffset - if wordNextNextNext == "feira": - used += 1 - if wordNext in months: - used -= 1 - if used > 0: - - if start - 1 > 0 and words[start - 1] in lists: - start -= 1 - used += 1 - - for i in range(0, used): - words[i + start] = "" - - if (start - 1 >= 0 and words[start - 1] in lists): - words[start - 1] = "" - found = True - daySpecified = True - - # parse time - timeStr = "" - hrOffset = 0 - minOffset = 0 - secOffset = 0 - hrAbs = 0 - minAbs = 0 - military = False - - for idx, word in enumerate(words): - if word == "": - continue - - wordPrevPrev = words[idx - 2] if idx > 1 else "" - wordPrev = words[idx - 1] if idx > 0 else "" 
- wordNext = words[idx + 1] if idx + 1 < len(words) else "" - wordNextNext = words[idx + 2] if idx + 2 < len(words) else "" - wordNextNextNext = words[idx + 3] if idx + 3 < len(words) else "" - # parse noon, midnight, morning, afternoon, evening - used = 0 - if word == "meio" and wordNext == "dia": - hrAbs = 12 - used += 2 - elif word == "meia" and wordNext == "noite": - hrAbs = 0 - used += 2 - elif word == "manha": - if hrAbs == 0: - hrAbs = 8 - used += 1 - elif word == "tarde": - if hrAbs == 0: - hrAbs = 15 - used += 1 - elif word == "meio" and wordNext == "tarde": - if hrAbs == 0: - hrAbs = 17 - used += 2 - elif word == "meio" and wordNext == "manha": - if hrAbs == 0: - hrAbs = 10 - used += 2 - elif word == "fim" and wordNext == "tarde": - if hrAbs == 0: - hrAbs = 19 - used += 2 - elif word == "fim" and wordNext == "manha": - if hrAbs == 0: - hrAbs = 11 - used += 2 - elif word == "tantas" and wordNext == "manha": - if hrAbs == 0: - hrAbs = 4 - used += 2 - elif word == "noite": - if hrAbs == 0: - hrAbs = 22 - used += 1 - # parse half an hour, quarter hour - elif word == "hora" and \ - (wordPrev in time_indicators or wordPrevPrev in - time_indicators): - if wordPrev == "meia": - minOffset = 30 - elif wordPrev == "quarto": - minOffset = 15 - elif wordPrevPrev == "quarto": - minOffset = 15 - if idx > 2 and words[idx - 3] in time_indicators: - words[idx - 3] = "" - words[idx - 2] = "" - else: - hrOffset = 1 - if wordPrevPrev in time_indicators: - words[idx - 2] = "" - words[idx - 1] = "" - used += 1 - hrAbs = -1 - minAbs = -1 - # parse 5:00 am, 12:00 p.m., etc - elif word[0].isdigit(): - isTime = True - strHH = "" - strMM = "" - remainder = "" - if ':' in word: - # parse colons - # "3:00 in the morning" - stage = 0 - length = len(word) - for i in range(length): - if stage == 0: - if word[i].isdigit(): - strHH += word[i] - elif word[i] == ":": - stage = 1 - else: - stage = 2 - i -= 1 - elif stage == 1: - if word[i].isdigit(): - strMM += word[i] - else: - stage = 2 - i 
-= 1 - elif stage == 2: - remainder = word[i:].replace(".", "") - break - if remainder == "": - nextWord = wordNext.replace(".", "") - if nextWord == "am" or nextWord == "pm": - remainder = nextWord - used += 1 - elif wordNext == "manha": - remainder = "am" - used += 1 - elif wordNext == "tarde": - remainder = "pm" - used += 1 - elif wordNext == "noite": - if 0 < int(word[0]) < 6: - remainder = "am" - else: - remainder = "pm" - used += 1 - elif wordNext in thises and wordNextNext == "manha": - remainder = "am" - used = 2 - elif wordNext in thises and wordNextNext == "tarde": - remainder = "pm" - used = 2 - elif wordNext in thises and wordNextNext == "noite": - remainder = "pm" - used = 2 - else: - if timeQualifier != "": - military = True - if strHH <= 12 and \ - (timeQualifier == "manha" or - timeQualifier == "tarde"): - strHH += 12 - - else: - # try to parse # s without colons - # 5 hours, 10 minutes etc. - length = len(word) - strNum = "" - remainder = "" - for i in range(length): - if word[i].isdigit(): - strNum += word[i] - else: - remainder += word[i] - - if remainder == "": - remainder = wordNext.replace(".", "").lstrip().rstrip() - - if ( - remainder == "pm" or - wordNext == "pm" or - remainder == "p.m." or - wordNext == "p.m."): - strHH = strNum - remainder = "pm" - used = 1 - elif ( - remainder == "am" or - wordNext == "am" or - remainder == "a.m." or - wordNext == "a.m."): - strHH = strNum - remainder = "am" - used = 1 - else: - if (wordNext == "pm" or - wordNext == "p.m." or - wordNext == "tarde"): - strHH = strNum - remainder = "pm" - used = 1 - elif (wordNext == "am" or - wordNext == "a.m." 
or - wordNext == "manha"): - strHH = strNum - remainder = "am" - used = 1 - elif (int(word) > 100 and - ( - wordPrev == "o" or - wordPrev == "oh" or - wordPrev == "zero" - )): - # 0800 hours (pronounced oh-eight-hundred) - strHH = int(word) / 100 - strMM = int(word) - strHH * 100 - military = True - if wordNext == "hora": - used += 1 - elif ( - wordNext == "hora" and - word[0] != '0' and - ( - int(word) < 100 and - int(word) > 2400 - )): - # ignores military time - # "in 3 hours" - hrOffset = int(word) - used = 2 - isTime = False - hrAbs = -1 - minAbs = -1 - - elif wordNext == "minuto": - # "in 10 minutes" - minOffset = int(word) - used = 2 - isTime = False - hrAbs = -1 - minAbs = -1 - elif wordNext == "segundo": - # in 5 seconds - secOffset = int(word) - used = 2 - isTime = False - hrAbs = -1 - minAbs = -1 - elif int(word) > 100: - strHH = int(word) / 100 - strMM = int(word) - strHH * 100 - military = True - if wordNext == "hora": - used += 1 - - elif wordNext == "" or ( - wordNext == "em" and wordNextNext == "ponto"): - strHH = word - strMM = 00 - if wordNext == "em" and wordNextNext == "ponto": - used += 2 - if wordNextNextNext == "tarde": - remainder = "pm" - used += 1 - elif wordNextNextNext == "manha": - remainder = "am" - used += 1 - elif wordNextNextNext == "noite": - if 0 > strHH > 6: - remainder = "am" - else: - remainder = "pm" - used += 1 - - elif wordNext[0].isdigit(): - strHH = word - strMM = wordNext - military = True - used += 1 - if wordNextNext == "hora": - used += 1 - else: - isTime = False - - strHH = int(strHH) if strHH else 0 - strMM = int(strMM) if strMM else 0 - strHH = strHH + 12 if (remainder == "pm" and - 0 < strHH < 12) else strHH - strHH = strHH - 12 if (remainder == "am" and - 0 < strHH >= 12) else strHH - if strHH > 24 or strMM > 59: - isTime = False - used = 0 - if isTime: - hrAbs = strHH * 1 - minAbs = strMM * 1 - used += 1 - - if used > 0: - # removed parsed words from the sentence - for i in range(used): - words[idx + i] = "" - - 
if wordPrev == "em" or wordPrev == "ponto": - words[words.index(wordPrev)] = "" - - if idx > 0 and wordPrev in time_indicators: - words[idx - 1] = "" - if idx > 1 and wordPrevPrev in time_indicators: - words[idx - 2] = "" - - idx += used - 1 - found = True - - # check that we found a date - if not date_found: - return None - - if dayOffset is False: - dayOffset = 0 - - # perform date manipulation - - extractedDate = dateNow - extractedDate = extractedDate.replace(microsecond=0, - second=0, - minute=0, - hour=0) - if datestr != "": - en_months = ['january', 'february', 'march', 'april', 'may', 'june', - 'july', 'august', 'september', 'october', 'november', - 'december'] - en_monthsShort = ['jan', 'feb', 'mar', 'apr', 'may', 'june', 'july', - 'aug', - 'sept', 'oct', 'nov', 'dec'] - for idx, en_month in enumerate(en_months): - datestr = datestr.replace(months[idx], en_month) - for idx, en_month in enumerate(en_monthsShort): - datestr = datestr.replace(monthsShort[idx], en_month) - - temp = datetime.strptime(datestr, "%B %d") - if not hasYear: - temp = temp.replace(year=extractedDate.year) - if extractedDate < temp: - extractedDate = extractedDate.replace(year=int(currentYear), - month=int( - temp.strftime( - "%m")), - day=int(temp.strftime( - "%d"))) - else: - extractedDate = extractedDate.replace( - year=int(currentYear) + 1, - month=int(temp.strftime("%m")), - day=int(temp.strftime("%d"))) - else: - extractedDate = extractedDate.replace( - year=int(temp.strftime("%Y")), - month=int(temp.strftime("%m")), - day=int(temp.strftime("%d"))) - - if timeStr != "": - temp = datetime(timeStr) - extractedDate = extractedDate.replace(hour=temp.strftime("%H"), - minute=temp.strftime("%M"), - second=temp.strftime("%S")) - - if yearOffset != 0: - extractedDate = extractedDate + relativedelta(years=yearOffset) - if monthOffset != 0: - extractedDate = extractedDate + relativedelta(months=monthOffset) - if dayOffset != 0: - extractedDate = extractedDate + 
def pt_pruning(text, symbols=True, accents=True, agressive=True):
    """Strip punctuation, accents and filler words from Portuguese text.

    Args:
        text (str): sentence to clean up.
        symbols (bool): when true, delete punctuation marks and the ordinal
            indicators, and turn "-" and "_" into spaces.
        accents (bool): when true, replace accented letters with their
            unaccented base letter.
        agressive (bool): when true, drop common articles, prepositions and
            pronouns and collapse runs of whitespace.  The misspelt
            parameter name is kept because callers may pass it by keyword.

    Returns:
        str: the pruned text.
    """
    stop_words = ["a", "o", "os", "as", "de", "dos", "das",
                  "lhe", "lhes", "me", "e", "no", "nas", "na", "nos", "em",
                  "para", "este",
                  "esta", "deste", "desta", "neste", "nesta", "nesse",
                  "nessa", "foi", "que"]
    if symbols:
        # The last two entries are the masculine/feminine ordinal indicators
        # (as in "13º"); they had been corrupted by an encoding mismatch.
        for symbol in [".", ",", ";", ":", "!", "?", u"º", u"ª"]:
            text = text.replace(symbol, "")
        text = text.replace("-", " ").replace("_", " ")
    if accents:
        # "õ" and "ô" are regular Portuguese letters and were missing from
        # the table, so words such as "ações" were only partially folded.
        accent_map = {"a": [u"á", u"à", u"ã", u"â"],
                      "e": [u"ê", u"è", u"é"],
                      "i": [u"í", u"ì"],
                      "o": [u"ò", u"ó", u"õ", u"ô"],
                      "u": [u"ú", u"ù"],
                      "c": [u"ç"]}
        for plain, variants in accent_map.items():
            for accented in variants:
                text = text.replace(accented, plain)
    if agressive:
        # Blank out stop words first, then normalise the whitespace left
        # behind by the removed tokens.
        kept = ["" if token in stop_words else token
                for token in text.split(" ")]
        text = " ".join(" ".join(kept).split())
    return text


def get_gender_pt(word, raw_string=""):
    """Guess the grammatical gender of a Portuguese word.

    If ``word`` (with trailing "s" removed) occurs in ``raw_string`` at any
    position but the first, the gender of the token right before it
    (typically an article) is used.  Otherwise the gender is guessed from
    the final letter: "a" is feminine, "o" or "e" is masculine.

    Args:
        word (str): word to classify.
        raw_string (str): optional sentence providing context.

    Returns:
        "f", "m", or False when no guess can be made.
    """
    word = word.rstrip("s")
    gender = False
    context = raw_string.split(" ")
    for idx, token in enumerate(context):
        if token == word and idx != 0:
            gender = get_gender_pt(context[idx - 1])
            break
    # endswith() is safe on an empty string, unlike word[-1], which raised
    # IndexError for inputs such as "" or "s".
    if not gender:
        if word.endswith("a"):
            gender = "f"
        if word.endswith(("o", "e")):
            gender = "m"
    return gender


####################################################################
# Spanish normalization
#
# TODO: numbers greater than 999999
####################################################################

# Indefinite articles ["un", "una", "unos", "unas"] cannot be suppressed:
# in Spanish, "un caballo" means "a horse" or "one horse".
es_articles = ["el", "la", "los", "las"]

# Spanish number words mapped to their value.  Accented and unaccented
# spellings are both listed so either form is recognised.
es_numbers_xlat = {
    "cero": 0,
    "un": 1,
    "uno": 1,
    "una": 1,
    "dos": 2,
    "tres": 3,
    u"trés": 3,
    "cuatro": 4,
    "cinco": 5,
    "seis": 6,
    "siete": 7,
    "ocho": 8,
    "nueve": 9,
    "diez": 10,
    "once": 11,
    "doce": 12,
    "trece": 13,
    "catorce": 14,
    "quince": 15,
    "dieciseis": 16,
    u"dieciséis": 16,
    "diecisiete": 17,
    "dieciocho": 18,
    "diecinueve": 19,
    "veinte": 20,
    "veintiuno": 21,
    u"veintidós": 22,
    u"veintitrés": 23,
    "veintidos": 22,
    "veintitres": 23,
    "veinticuatro": 24,
    "veinticinco": 25,
    u"veintiséis": 26,
    "veintiseis": 26,
    "veintisiete": 27,
    "veintiocho": 28,
    "veintinueve": 29,
    "treinta": 30,
    "cuarenta": 40,
    "cincuenta": 50,
    "sesenta": 60,
    "setenta": 70,
    "ochenta": 80,
    "noventa": 90,
    "cien": 100,
    "ciento": 100,
    "doscientos": 200,
    "doscientas": 200,
    "trescientos": 300,
    "trescientas": 300,
    "cuatrocientos": 400,
    "cuatrocientas": 400,
    "quinientos": 500,
    "quinientas": 500,
    "seiscientos": 600,
    "seiscientas": 600,
    "setecientos": 700,
    "setecientas": 700,
    "ochocientos": 800,
    "ochocientas": 800,
    "novecientos": 900,
    "novecientas": 900}


def es_parse(words, i):
    """Parse a Spanish cardinal number that starts at ``words[i]``.

    Recognised grammar (values 0 to 999999):
        number  := "cero" | n_1_999 ["mil" [n_1_999]]
        n_1_999 := hundreds [n_1_99] | n_1_99
        n_1_99  := word(1..29) | tens ["y" word(1..9)]

    A bare leading "mil" (without a multiplier in front) is not recognised.

    Args:
        words (list): tokens of the sentence.
        i (int): index of the first token to examine.

    Returns:
        tuple (value, next_index) where next_index is the index of the first
        token after the number, or None if no number starts at ``i``.
    """

    def es_cte(i, s):
        # Match the literal token s at position i.
        if i < len(words) and s == words[i]:
            return s, i + 1
        return None

    def es_number_word(i, mi, ma):
        # Match a single number word whose value lies in [mi, ma].
        if i < len(words):
            v = es_numbers_xlat.get(words[i])
            # Compare against None explicitly: a plain truthiness test
            # rejects the legitimate value 0 ("cero").
            if v is not None and mi <= v <= ma:
                return v, i + 1
        return None

    def es_number_1_99(i):
        # Single-word numbers 1..29.
        r1 = es_number_word(i, 1, 29)
        if r1:
            return r1

        # Tens, optionally followed by "y" and a unit ("treinta y cinco").
        r1 = es_number_word(i, 30, 90)
        if r1:
            v1, i1 = r1
            r2 = es_cte(i1, "y")
            if r2:
                _, i2 = r2
                r3 = es_number_word(i2, 1, 9)
                if r3:
                    v3, i3 = r3
                    return v1 + v3, i3
            # A dangling "y" is left unconsumed.
            return r1
        return None

    def es_number_1_999(i):
        # [1-9]cientos [1-99]?
        r1 = es_number_word(i, 100, 900)
        if r1:
            v1, i1 = r1
            r2 = es_number_1_99(i1)
            if r2:
                v2, i2 = r2
                return v1 + v2, i2
            return r1

        # [1-99]
        return es_number_1_99(i)

    def es_number(i):
        # check for cero
        r1 = es_number_word(i, 0, 0)
        if r1:
            return r1

        # check for [1-999] (mil [0-999])?
        r1 = es_number_1_999(i)
        if r1:
            v1, i1 = r1
            r2 = es_cte(i1, "mil")
            if r2:
                _, i2 = r2
                r3 = es_number_1_999(i2)
                if r3:
                    v3, i3 = r3
                    return v1 * 1000 + v3, i3
                return v1 * 1000, i2
            return r1
        return None

    return es_number(i)


def normalize_es(text, remove_articles):
    """Normalize a Spanish string.

    Collapses whitespace, optionally drops the definite articles listed in
    ``es_articles`` and rewrites spelled-out numbers as digits.

    Args:
        text (str): text to normalize.
        remove_articles (bool): whether definite articles are removed.

    Returns:
        str: the normalized text with tokens separated by single spaces.
    """
    words = text.split()  # this also removes extra spaces

    normalized = []
    i = 0
    while i < len(words):
        word = words[i]

        if remove_articles and word in es_articles:
            i += 1
            continue

        # Convert numbers into digits; es_parse reports how far it read.
        parsed = es_parse(words, i)
        if parsed:
            value, i = parsed
            normalized.append(str(value))
            continue

        normalized.append(word)
        i += 1

    return " ".join(normalized)
test_numbers_pt(self): - self.assertEqual(normalize(u"isto e o um dois tr�s teste", lang="pt"), + self.assertEqual(normalize(u"isto e o um dois três teste", lang="pt"), u"isto 1 2 3 teste") - self.assertEqual(normalize(u"� a sete oito nove test", lang="pt"), + self.assertEqual(normalize(u"ê a sete oito nove test", lang="pt"), u"7 8 9 test") self.assertEqual( normalize("teste zero dez onze doze treze", lang="pt"), @@ -459,9 +459,9 @@ class TestNormalize(unittest.TestCase): self.assertEqual(res[0], expected_date) self.assertEqual(res[1], expected_leftover) - testExtract(u"que dia � hoje", + testExtract(u"que dia é hoje", "2017-06-27 00:00:00", u"dia") - testExtract(u"que dia � amanha", + testExtract(u"que dia é amanha", "2017-06-28 00:00:00", u"dia") testExtract(u"que dia foi ontem", "2017-06-26 00:00:00", u"dia") @@ -513,7 +513,7 @@ class TestNormalize(unittest.TestCase): testExtract("lembra me para ligar a mae no dia 3 de agosto", "2017-08-03 00:00:00", "lembra ligar mae") - testExtract(u"compra facas no 13� dia de maio", + testExtract(u"compra facas no 13º dia de maio", "2018-05-13 00:00:00", "compra facas") testExtract(u"gasta dinheiro no maio dia 13", "2018-05-13 00:00:00", "gasta dinheiro") @@ -588,7 +588,7 @@ class TestNormalize(unittest.TestCase): self.assertEqual(normalize("diez once doce trece catorce quince", lang="es"), "10 11 12 13 14 15") - self.assertEqual(normalize(u"diecis�is diecisiete", lang="es"), + self.assertEqual(normalize(u"dieciséis diecisiete", lang="es"), "16 17") self.assertEqual(normalize(u"dieciocho diecinueve", lang="es"), "18 19") @@ -609,7 +609,7 @@ class TestNormalize(unittest.TestCase): lang="es"), "2345") self.assertEqual(normalize( - u"ciento veintitr�s mil cuatrocientas cincuenta y seis", + u"ciento veintitrés mil cuatrocientas cincuenta y seis", lang="es"), "123456") self.assertEqual(normalize(