2340 lines
79 KiB
Python
2340 lines
79 KiB
Python
# -*- coding: iso-8859-15 -*-
|
||
#
|
||
# Copyright 2017 Mycroft AI Inc.
|
||
#
|
||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||
# you may not use this file except in compliance with the License.
|
||
# You may obtain a copy of the License at
|
||
#
|
||
# http://www.apache.org/licenses/LICENSE-2.0
|
||
#
|
||
# Unless required by applicable law or agreed to in writing, software
|
||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||
# See the License for the specific language governing permissions and
|
||
# limitations under the License.
|
||
#
|
||
from datetime import datetime, timedelta
|
||
from dateutil.relativedelta import relativedelta
|
||
|
||
|
||
def extractnumber(text, lang="en-us"):
|
||
"""Takes in a string and extracts a number.
|
||
Args:
|
||
text (str): the string to extract a number from
|
||
lang (str): the code for the language text is in
|
||
Returns:
|
||
(str): The number extracted or the original text.
|
||
"""
|
||
|
||
lang_lower = str(lang).lower()
|
||
if lang_lower.startswith("en"):
|
||
# return extractnumber_en(text, remove_articles)
|
||
return extractnumber_en(text)
|
||
elif lang_lower.startswith("pt"):
|
||
return extractnumber_pt(text)
|
||
|
||
# TODO: Normalization for other languages
|
||
return text
|
||
|
||
|
||
def extract_datetime(text, anchorDate=None, lang="en-us"):
|
||
"""
|
||
Parsing function that extracts date and time information
|
||
from sentences. Parses many of the common ways that humans
|
||
express dates and times. Includes relative dates like "5 days from today".
|
||
|
||
Vague terminology are given arbitrary values, like:
|
||
- morning = 8 AM
|
||
- afternoon = 3 PM
|
||
- evening = 7 PM
|
||
|
||
If a time isn't supplied, the function defaults to 12 AM
|
||
|
||
Args:
|
||
str (string): the text to be normalized
|
||
anchortDate (:obj:`datetime`, optional): the date to be used for
|
||
relative dating (for example, what does "tomorrow" mean?).
|
||
Defaults to the current date
|
||
(acquired with datetime.datetime.now())
|
||
lang (string): the language of the sentence(s)
|
||
|
||
Returns:
|
||
[:obj:`datetime`, :obj:`str`]: 'datetime' is the extracted date
|
||
as a datetime object. Times are represented in 24 hour notation.
|
||
'leftover_string' is the original phrase with all date and time
|
||
related keywords stripped out. See examples for further
|
||
clarification
|
||
|
||
Returns 'None' if no date was extracted.
|
||
|
||
Examples:
|
||
|
||
>>> extract_datetime(
|
||
... "What is the weather like the day after tomorrow?",
|
||
... datetime(2017, 06, 30, 00, 00)
|
||
... )
|
||
[datetime.datetime(2017, 7, 2, 0, 0), 'what is weather like']
|
||
|
||
>>> extract_datetime(
|
||
... "Set up an appointment 2 weeks from Sunday at 5 pm",
|
||
... datetime(2016, 02, 19, 00, 00)
|
||
... )
|
||
[datetime.datetime(2016, 3, 6, 17, 0), 'set up appointment']
|
||
"""
|
||
|
||
lang_lower = str(lang).lower()
|
||
|
||
if lang_lower.startswith("en"):
|
||
return extract_datetime_en(text, anchorDate)
|
||
elif lang_lower.startswith("pt"):
|
||
return extract_datetime_pt(text, anchorDate)
|
||
|
||
return text
|
||
|
||
|
||
def is_numeric(input_str):
|
||
"""
|
||
Takes in a string and tests to see if it is a number.
|
||
Args:
|
||
text (str): string to test if a number
|
||
Returns:
|
||
(bool): True if a number, else False
|
||
|
||
"""
|
||
|
||
try:
|
||
float(input_str)
|
||
return True
|
||
except ValueError:
|
||
return False
|
||
|
||
|
||
def extractnumber_en(text):
|
||
"""
|
||
This function prepares the given text for parsing by making
|
||
numbers consistent, getting rid of contractions, etc.
|
||
Args:
|
||
text (str): the string to normalize
|
||
Returns:
|
||
(int) or (float): The value of extracted number
|
||
|
||
"""
|
||
aWords = text.split()
|
||
aWords = [word for word in aWords if word not in ["the", "a", "an"]]
|
||
andPass = False
|
||
valPreAnd = False
|
||
val = False
|
||
count = 0
|
||
while count < len(aWords):
|
||
word = aWords[count]
|
||
if is_numeric(word):
|
||
# if word.isdigit(): # doesn't work with decimals
|
||
val = float(word)
|
||
elif word == "first":
|
||
val = 1
|
||
elif word == "second":
|
||
val = 2
|
||
elif isFractional_en(word):
|
||
val = isFractional_en(word)
|
||
else:
|
||
if word == "one":
|
||
val = 1
|
||
elif word == "two":
|
||
val = 2
|
||
elif word == "three":
|
||
val = 3
|
||
elif word == "four":
|
||
val = 4
|
||
elif word == "five":
|
||
val = 5
|
||
elif word == "six":
|
||
val = 6
|
||
elif word == "seven":
|
||
val = 7
|
||
elif word == "eight":
|
||
val = 8
|
||
elif word == "nine":
|
||
val = 9
|
||
elif word == "ten":
|
||
val = 10
|
||
if val:
|
||
if count < (len(aWords) - 1):
|
||
wordNext = aWords[count + 1]
|
||
else:
|
||
wordNext = ""
|
||
valNext = isFractional_en(wordNext)
|
||
|
||
if valNext:
|
||
val = val * valNext
|
||
aWords[count + 1] = ""
|
||
|
||
# if val == False:
|
||
if not val:
|
||
# look for fractions like "2/3"
|
||
aPieces = word.split('/')
|
||
# if (len(aPieces) == 2 and is_numeric(aPieces[0])
|
||
# and is_numeric(aPieces[1])):
|
||
if look_for_fractions(aPieces):
|
||
val = float(aPieces[0]) / float(aPieces[1])
|
||
elif andPass:
|
||
# added to value, quit here
|
||
val = valPreAnd
|
||
break
|
||
else:
|
||
count += 1
|
||
continue
|
||
|
||
aWords[count] = ""
|
||
|
||
if (andPass):
|
||
aWords[count - 1] = '' # remove "and"
|
||
val += valPreAnd
|
||
elif count + 1 < len(aWords) and aWords[count + 1] == 'and':
|
||
andPass = True
|
||
valPreAnd = val
|
||
val = False
|
||
count += 2
|
||
continue
|
||
elif count + 2 < len(aWords) and aWords[count + 2] == 'and':
|
||
andPass = True
|
||
valPreAnd = val
|
||
val = False
|
||
count += 3
|
||
continue
|
||
|
||
break
|
||
|
||
# if val == False:
|
||
if not val:
|
||
return False
|
||
|
||
# Return the $str with the number related words removed
|
||
# (now empty strings, so strlen == 0)
|
||
aWords = [word for word in aWords if len(word) > 0]
|
||
text = ' '.join(aWords)
|
||
|
||
return val
|
||
|
||
|
||
def extract_datetime_en(str, currentDate=None):
|
||
def clean_string(str):
|
||
# cleans the input string of unneeded punctuation and capitalization
|
||
# among other things
|
||
str = str.lower().replace('?', '').replace('.', '').replace(',', '') \
|
||
.replace(' the ', ' ').replace(' a ', ' ').replace(' an ', ' ')
|
||
wordList = str.split()
|
||
for idx, word in enumerate(wordList):
|
||
word = word.replace("'s", "")
|
||
|
||
ordinals = ["rd", "st", "nd", "th"]
|
||
if word[0].isdigit():
|
||
for ord in ordinals:
|
||
if ord in word:
|
||
word = word.replace(ord, "")
|
||
wordList[idx] = word
|
||
|
||
return wordList
|
||
|
||
def date_found():
|
||
return found or \
|
||
(
|
||
datestr != "" or timeStr != "" or
|
||
yearOffset != 0 or monthOffset != 0 or
|
||
dayOffset is True or hrOffset != 0 or
|
||
hrAbs != 0 or minOffset != 0 or
|
||
minAbs != 0 or secOffset != 0
|
||
)
|
||
|
||
if str == "":
|
||
return None
|
||
if currentDate is None:
|
||
currentDate = datetime.now()
|
||
|
||
found = False
|
||
daySpecified = False
|
||
dayOffset = False
|
||
monthOffset = 0
|
||
yearOffset = 0
|
||
dateNow = currentDate
|
||
today = dateNow.strftime("%w")
|
||
currentYear = dateNow.strftime("%Y")
|
||
fromFlag = False
|
||
datestr = ""
|
||
hasYear = False
|
||
timeQualifier = ""
|
||
|
||
timeQualifiersList = ['morning', 'afternoon', 'evening']
|
||
markers = ['at', 'in', 'on', 'by', 'this', 'around', 'for', 'of']
|
||
days = ['monday', 'tuesday', 'wednesday',
|
||
'thursday', 'friday', 'saturday', 'sunday']
|
||
months = ['january', 'february', 'march', 'april', 'may', 'june',
|
||
'july', 'august', 'september', 'october', 'november',
|
||
'december']
|
||
monthsShort = ['jan', 'feb', 'mar', 'apr', 'may', 'june', 'july', 'aug',
|
||
'sept', 'oct', 'nov', 'dec']
|
||
|
||
words = clean_string(str)
|
||
|
||
for idx, word in enumerate(words):
|
||
if word == "":
|
||
continue
|
||
wordPrevPrev = words[idx - 2] if idx > 1 else ""
|
||
wordPrev = words[idx - 1] if idx > 0 else ""
|
||
wordNext = words[idx + 1] if idx + 1 < len(words) else ""
|
||
wordNextNext = words[idx + 2] if idx + 2 < len(words) else ""
|
||
|
||
# this isn't in clean string because I don't want to save back to words
|
||
word = word.rstrip('s')
|
||
start = idx
|
||
used = 0
|
||
# save timequalifier for later
|
||
if word in timeQualifiersList:
|
||
timeQualifier = word
|
||
# parse today, tomorrow, day after tomorrow
|
||
elif word == "today" and not fromFlag:
|
||
dayOffset = 0
|
||
used += 1
|
||
elif word == "tomorrow" and not fromFlag:
|
||
dayOffset = 1
|
||
used += 1
|
||
elif (word == "day" and
|
||
wordNext == "after" and
|
||
wordNextNext == "tomorrow" and
|
||
not fromFlag and
|
||
not wordPrev[0].isdigit()):
|
||
dayOffset = 2
|
||
used = 3
|
||
if wordPrev == "the":
|
||
start -= 1
|
||
used += 1
|
||
# parse 5 days, 10 weeks, last week, next week
|
||
elif word == "day":
|
||
if wordPrev[0].isdigit():
|
||
dayOffset += int(wordPrev)
|
||
start -= 1
|
||
used = 2
|
||
elif word == "week" and not fromFlag:
|
||
if wordPrev[0].isdigit():
|
||
dayOffset += int(wordPrev) * 7
|
||
start -= 1
|
||
used = 2
|
||
elif wordPrev == "next":
|
||
dayOffset = 7
|
||
start -= 1
|
||
used = 2
|
||
elif wordPrev == "last":
|
||
dayOffset = -7
|
||
start -= 1
|
||
used = 2
|
||
# parse 10 months, next month, last month
|
||
elif word == "month" and not fromFlag:
|
||
if wordPrev[0].isdigit():
|
||
monthOffset = int(wordPrev)
|
||
start -= 1
|
||
used = 2
|
||
elif wordPrev == "next":
|
||
monthOffset = 1
|
||
start -= 1
|
||
used = 2
|
||
elif wordPrev == "last":
|
||
monthOffset = -1
|
||
start -= 1
|
||
used = 2
|
||
# parse 5 years, next year, last year
|
||
elif word == "year" and not fromFlag:
|
||
if wordPrev[0].isdigit():
|
||
yearOffset = int(wordPrev)
|
||
start -= 1
|
||
used = 2
|
||
elif wordPrev == "next":
|
||
yearOffset = 1
|
||
start -= 1
|
||
used = 2
|
||
elif wordPrev == "last":
|
||
yearOffset = -1
|
||
start -= 1
|
||
used = 2
|
||
# parse Monday, Tuesday, etc., and next Monday,
|
||
# last Tuesday, etc.
|
||
elif word in days and not fromFlag:
|
||
d = days.index(word)
|
||
dayOffset = (d + 1) - int(today)
|
||
used = 1
|
||
if dayOffset < 0:
|
||
dayOffset += 7
|
||
if wordPrev == "next":
|
||
dayOffset += 7
|
||
used += 1
|
||
start -= 1
|
||
elif wordPrev == "last":
|
||
dayOffset -= 7
|
||
used += 1
|
||
start -= 1
|
||
# parse 15 of July, June 20th, Feb 18, 19 of February
|
||
elif word in months or word in monthsShort and not fromFlag:
|
||
try:
|
||
m = months.index(word)
|
||
except ValueError:
|
||
m = monthsShort.index(word)
|
||
used += 1
|
||
datestr = months[m]
|
||
if wordPrev[0].isdigit() or \
|
||
(wordPrev == "of" and wordPrevPrev[0].isdigit()):
|
||
if wordPrev == "of" and wordPrevPrev[0].isdigit():
|
||
datestr += " " + words[idx - 2]
|
||
used += 1
|
||
start -= 1
|
||
else:
|
||
datestr += " " + wordPrev
|
||
start -= 1
|
||
used += 1
|
||
if wordNext and wordNext[0].isdigit():
|
||
datestr += " " + wordNext
|
||
used += 1
|
||
hasYear = True
|
||
else:
|
||
hasYear = False
|
||
|
||
elif wordNext and wordNext[0].isdigit():
|
||
datestr += " " + wordNext
|
||
used += 1
|
||
if wordNextNext and wordNextNext[0].isdigit():
|
||
datestr += " " + wordNextNext
|
||
used += 1
|
||
hasYear = True
|
||
else:
|
||
hasYear = False
|
||
# parse 5 days from tomorrow, 10 weeks from next thursday,
|
||
# 2 months from July
|
||
validFollowups = days + months + monthsShort
|
||
validFollowups.append("today")
|
||
validFollowups.append("tomorrow")
|
||
validFollowups.append("next")
|
||
validFollowups.append("last")
|
||
validFollowups.append("now")
|
||
if (word == "from" or word == "after") and wordNext in validFollowups:
|
||
used = 2
|
||
fromFlag = True
|
||
if wordNext == "tomorrow":
|
||
dayOffset += 1
|
||
elif wordNext in days:
|
||
d = days.index(wordNext)
|
||
tmpOffset = (d + 1) - int(today)
|
||
used = 2
|
||
if tmpOffset < 0:
|
||
tmpOffset += 7
|
||
dayOffset += tmpOffset
|
||
elif wordNextNext and wordNextNext in days:
|
||
d = days.index(wordNextNext)
|
||
tmpOffset = (d + 1) - int(today)
|
||
used = 3
|
||
if wordNext == "next":
|
||
tmpOffset += 7
|
||
used += 1
|
||
start -= 1
|
||
elif wordNext == "last":
|
||
tmpOffset -= 7
|
||
used += 1
|
||
start -= 1
|
||
dayOffset += tmpOffset
|
||
if used > 0:
|
||
if start - 1 > 0 and words[start - 1] == "this":
|
||
start -= 1
|
||
used += 1
|
||
|
||
for i in range(0, used):
|
||
words[i + start] = ""
|
||
|
||
if (start - 1 >= 0 and words[start - 1] in markers):
|
||
words[start - 1] = ""
|
||
found = True
|
||
daySpecified = True
|
||
|
||
# parse time
|
||
timeStr = ""
|
||
hrOffset = 0
|
||
minOffset = 0
|
||
secOffset = 0
|
||
hrAbs = 0
|
||
minAbs = 0
|
||
military = False
|
||
|
||
for idx, word in enumerate(words):
|
||
if word == "":
|
||
continue
|
||
|
||
wordPrevPrev = words[idx - 2] if idx > 1 else ""
|
||
wordPrev = words[idx - 1] if idx > 0 else ""
|
||
wordNext = words[idx + 1] if idx + 1 < len(words) else ""
|
||
wordNextNext = words[idx + 2] if idx + 2 < len(words) else ""
|
||
# parse noon, midnight, morning, afternoon, evening
|
||
used = 0
|
||
if word == "noon":
|
||
hrAbs = 12
|
||
used += 1
|
||
elif word == "midnight":
|
||
hrAbs = 0
|
||
used += 1
|
||
elif word == "morning":
|
||
if hrAbs == 0:
|
||
hrAbs = 8
|
||
used += 1
|
||
elif word == "afternoon":
|
||
if hrAbs == 0:
|
||
hrAbs = 15
|
||
used += 1
|
||
elif word == "evening":
|
||
if hrAbs == 0:
|
||
hrAbs = 19
|
||
used += 1
|
||
# parse half an hour, quarter hour
|
||
elif word == "hour" and \
|
||
(wordPrev in markers or wordPrevPrev in markers):
|
||
if wordPrev == "half":
|
||
minOffset = 30
|
||
elif wordPrev == "quarter":
|
||
minOffset = 15
|
||
elif wordPrevPrev == "quarter":
|
||
minOffset = 15
|
||
if idx > 2 and words[idx - 3] in markers:
|
||
words[idx - 3] = ""
|
||
words[idx - 2] = ""
|
||
else:
|
||
hrOffset = 1
|
||
if wordPrevPrev in markers:
|
||
words[idx - 2] = ""
|
||
words[idx - 1] = ""
|
||
used += 1
|
||
hrAbs = -1
|
||
minAbs = -1
|
||
# parse 5:00 am, 12:00 p.m., etc
|
||
elif word[0].isdigit():
|
||
isTime = True
|
||
strHH = ""
|
||
strMM = ""
|
||
remainder = ""
|
||
if ':' in word:
|
||
# parse colons
|
||
# "3:00 in the morning"
|
||
stage = 0
|
||
length = len(word)
|
||
for i in range(length):
|
||
if stage == 0:
|
||
if word[i].isdigit():
|
||
strHH += word[i]
|
||
elif word[i] == ":":
|
||
stage = 1
|
||
else:
|
||
stage = 2
|
||
i -= 1
|
||
elif stage == 1:
|
||
if word[i].isdigit():
|
||
strMM += word[i]
|
||
else:
|
||
stage = 2
|
||
i -= 1
|
||
elif stage == 2:
|
||
remainder = word[i:].replace(".", "")
|
||
break
|
||
if remainder == "":
|
||
nextWord = wordNext.replace(".", "")
|
||
if nextWord == "am" or nextWord == "pm":
|
||
remainder = nextWord
|
||
used += 1
|
||
elif wordNext == "in" and wordNextNext == "the" and \
|
||
words[idx + 3] == "morning":
|
||
reaminder = "am"
|
||
used += 3
|
||
elif wordNext == "in" and wordNextNext == "the" and \
|
||
words[idx + 3] == "afternoon":
|
||
remainder = "pm"
|
||
used += 3
|
||
elif wordNext == "in" and wordNextNext == "the" and \
|
||
words[idx + 3] == "evening":
|
||
remainder = "pm"
|
||
used += 3
|
||
elif wordNext == "in" and wordNextNext == "morning":
|
||
remainder = "am"
|
||
used += 2
|
||
elif wordNext == "in" and wordNextNext == "afternoon":
|
||
remainder = "pm"
|
||
used += 2
|
||
elif wordNext == "in" and wordNextNext == "evening":
|
||
remainder = "pm"
|
||
used += 2
|
||
elif wordNext == "this" and wordNextNext == "morning":
|
||
remainder = "am"
|
||
used = 2
|
||
elif wordNext == "this" and wordNextNext == "afternoon":
|
||
remainder = "pm"
|
||
used = 2
|
||
elif wordNext == "this" and wordNextNext == "evening":
|
||
remainder = "pm"
|
||
used = 2
|
||
elif wordNext == "at" and wordNextNext == "night":
|
||
if strHH > 5:
|
||
remainder = "pm"
|
||
else:
|
||
remainder = "am"
|
||
used += 2
|
||
else:
|
||
if timeQualifier != "":
|
||
military = True
|
||
if strHH <= 12 and \
|
||
(timeQualifier == "evening" or
|
||
timeQualifier == "afternoon"):
|
||
strHH += 12
|
||
else:
|
||
# try to parse # s without colons
|
||
# 5 hours, 10 minutes etc.
|
||
length = len(word)
|
||
strNum = ""
|
||
remainder = ""
|
||
for i in range(length):
|
||
if word[i].isdigit():
|
||
strNum += word[i]
|
||
else:
|
||
remainder += word[i]
|
||
|
||
if remainder == "":
|
||
remainder = wordNext.replace(".", "").lstrip().rstrip()
|
||
|
||
if (
|
||
remainder == "pm" or
|
||
wordNext == "pm" or
|
||
remainder == "p.m." or
|
||
wordNext == "p.m."):
|
||
strHH = strNum
|
||
remainder = "pm"
|
||
used = 1
|
||
elif (
|
||
remainder == "am" or
|
||
wordNext == "am" or
|
||
remainder == "a.m." or
|
||
wordNext == "a.m."):
|
||
strHH = strNum
|
||
remainder = "am"
|
||
used = 1
|
||
else:
|
||
if wordNext == "pm" or wordNext == "p.m.":
|
||
strHH = strNum
|
||
reaminder = "pm"
|
||
used = 1
|
||
elif wordNext == "am" or wordNext == "a.m.":
|
||
strHH = strNum
|
||
remainder = "am"
|
||
used = 1
|
||
elif (
|
||
int(word) > 100 and
|
||
(
|
||
wordPrev == "o" or
|
||
wordPrev == "oh"
|
||
)):
|
||
# 0800 hours (pronounced oh-eight-hundred)
|
||
strHH = int(word) / 100
|
||
strMM = int(word) - strHH * 100
|
||
military = True
|
||
if wordNext == "hours":
|
||
used += 1
|
||
elif (
|
||
wordNext == "hours" and
|
||
word[0] != '0' and
|
||
(
|
||
int(word) < 100 and
|
||
int(word) > 2400
|
||
)):
|
||
# ignores military time
|
||
# "in 3 hours"
|
||
hrOffset = int(word)
|
||
used = 2
|
||
isTime = False
|
||
hrAbs = -1
|
||
minAbs = -1
|
||
|
||
elif wordNext == "minutes":
|
||
# "in 10 minutes"
|
||
minOffset = int(word)
|
||
used = 2
|
||
isTime = False
|
||
hrAbs = -1
|
||
minAbs = -1
|
||
elif wordNext == "seconds":
|
||
# in 5 seconds
|
||
secOffset = int(word)
|
||
used = 2
|
||
isTime = False
|
||
hrAbs = -1
|
||
minAbs = -1
|
||
elif int(word) > 100:
|
||
strHH = int(word) / 100
|
||
strMM = int(word) - strHH * 100
|
||
military = True
|
||
if wordNext == "hours":
|
||
used += 1
|
||
elif wordNext[0].isdigit():
|
||
strHH = word
|
||
strMM = wordNext
|
||
military = True
|
||
used += 1
|
||
if wordNextNext == "hours":
|
||
used += 1
|
||
elif (
|
||
wordNext == "" or wordNext == "o'clock" or
|
||
(
|
||
wordNext == "in" and
|
||
(
|
||
wordNextNext == "the" or
|
||
wordNextNext == timeQualifier
|
||
)
|
||
)):
|
||
strHH = word
|
||
strMM = 00
|
||
if wordNext == "o'clock":
|
||
used += 1
|
||
if wordNext == "in" or wordNextNext == "in":
|
||
used += (1 if wordNext == "in" else 2)
|
||
if (wordNextNext and
|
||
wordNextNext in timeQualifier or
|
||
(words[words.index(wordNextNext) + 1] and
|
||
words[words.index(wordNextNext) + 1] in
|
||
timeQualifier)):
|
||
if (wordNextNext == "afternoon" or
|
||
(len(words) >
|
||
words.index(wordNextNext) + 1 and
|
||
words[words.index(
|
||
wordNextNext) + 1] == "afternoon")):
|
||
remainder = "pm"
|
||
if (wordNextNext == "evening" or
|
||
(len(words) >
|
||
(words.index(wordNextNext) + 1) and
|
||
words[words.index(
|
||
wordNextNext) + 1] == "evening")):
|
||
remainder = "pm"
|
||
if (wordNextNext == "morning" or
|
||
(len(words) >
|
||
words.index(wordNextNext) + 1 and
|
||
words[words.index(
|
||
wordNextNext) + 1] == "morning")):
|
||
remainder = "am"
|
||
if timeQualifier != "":
|
||
military = True
|
||
else:
|
||
isTime = False
|
||
|
||
strHH = int(strHH) if strHH else 0
|
||
strMM = int(strMM) if strMM else 0
|
||
strHH = strHH + 12 if remainder == "pm" and strHH < 12 else strHH
|
||
strHH = strHH - 12 if remainder == "am" and strHH >= 12 else strHH
|
||
if strHH > 24 or strMM > 59:
|
||
isTime = False
|
||
used = 0
|
||
if isTime:
|
||
hrAbs = strHH * 1
|
||
minAbs = strMM * 1
|
||
used += 1
|
||
if used > 0:
|
||
# removed parsed words from the sentence
|
||
for i in range(used):
|
||
words[idx + i] = ""
|
||
|
||
if wordPrev == "o" or wordPrev == "oh":
|
||
words[words.index(wordPrev)] = ""
|
||
|
||
if wordPrev == "early":
|
||
hrOffset = -1
|
||
words[idx - 1] = ""
|
||
idx -= 1
|
||
elif wordPrev == "late":
|
||
hrOffset = 1
|
||
words[idx - 1] = ""
|
||
idx -= 1
|
||
if idx > 0 and wordPrev in markers:
|
||
words[idx - 1] = ""
|
||
if idx > 1 and wordPrevPrev in markers:
|
||
words[idx - 2] = ""
|
||
|
||
idx += used - 1
|
||
found = True
|
||
|
||
# check that we found a date
|
||
if not date_found:
|
||
return None
|
||
|
||
if dayOffset is False:
|
||
dayOffset = 0
|
||
|
||
# perform date manipulation
|
||
|
||
extractedDate = dateNow
|
||
extractedDate = extractedDate.replace(microsecond=0,
|
||
second=0,
|
||
minute=0,
|
||
hour=0)
|
||
if datestr != "":
|
||
temp = datetime.strptime(datestr, "%B %d")
|
||
if not hasYear:
|
||
temp = temp.replace(year=extractedDate.year)
|
||
if extractedDate < temp:
|
||
extractedDate = extractedDate.replace(year=int(currentYear),
|
||
month=int(
|
||
temp.strftime(
|
||
"%m")),
|
||
day=int(temp.strftime(
|
||
"%d")))
|
||
else:
|
||
extractedDate = extractedDate.replace(
|
||
year=int(currentYear) + 1,
|
||
month=int(temp.strftime("%m")),
|
||
day=int(temp.strftime("%d")))
|
||
else:
|
||
extractedDate = extractedDate.replace(
|
||
year=int(temp.strftime("%Y")),
|
||
month=int(temp.strftime("%m")),
|
||
day=int(temp.strftime("%d")))
|
||
|
||
if timeStr != "":
|
||
temp = datetime(timeStr)
|
||
extractedDate = extractedDate.replace(hour=temp.strftime("%H"),
|
||
minute=temp.strftime("%M"),
|
||
second=temp.strftime("%S"))
|
||
|
||
if yearOffset != 0:
|
||
extractedDate = extractedDate + relativedelta(years=yearOffset)
|
||
if monthOffset != 0:
|
||
extractedDate = extractedDate + relativedelta(months=monthOffset)
|
||
if dayOffset != 0:
|
||
extractedDate = extractedDate + relativedelta(days=dayOffset)
|
||
if hrAbs != -1 and minAbs != -1:
|
||
|
||
extractedDate = extractedDate + relativedelta(hours=hrAbs,
|
||
minutes=minAbs)
|
||
if (hrAbs != 0 or minAbs != 0) and datestr == "":
|
||
if not daySpecified and dateNow > extractedDate:
|
||
extractedDate = extractedDate + relativedelta(days=1)
|
||
if hrOffset != 0:
|
||
extractedDate = extractedDate + relativedelta(hours=hrOffset)
|
||
if minOffset != 0:
|
||
extractedDate = extractedDate + relativedelta(minutes=minOffset)
|
||
if secOffset != 0:
|
||
extractedDate = extractedDate + relativedelta(seconds=secOffset)
|
||
for idx, word in enumerate(words):
|
||
if words[idx] == "and" and words[idx - 1] == "" and words[
|
||
idx + 1] == "":
|
||
words[idx] = ""
|
||
|
||
resultStr = " ".join(words)
|
||
resultStr = ' '.join(resultStr.split())
|
||
return [extractedDate, resultStr]
|
||
|
||
|
||
def look_for_fractions(split_list):
|
||
""""
|
||
This function takes a list made by fraction & determines if a fraction.
|
||
|
||
Args:
|
||
split_list (list): list created by splitting on '/'
|
||
Returns:
|
||
(bool): False if not a fraction, otherwise True
|
||
|
||
"""
|
||
|
||
if len(split_list) == 2:
|
||
if is_numeric(split_list[0]) and is_numeric(split_list[1]):
|
||
return True
|
||
|
||
return False
|
||
|
||
|
||
def isFractional_en(input_str):
|
||
"""
|
||
This function takes the given text and checks if it is a fraction.
|
||
|
||
Args:
|
||
text (str): the string to check if fractional
|
||
Returns:
|
||
(bool) or (float): False if not a fraction, otherwise the fraction
|
||
|
||
"""
|
||
if input_str.endswith('s', -1):
|
||
input_str = input_str[:len(input_str) - 1] # e.g. "fifths"
|
||
|
||
aFrac = ["whole", "half", "third", "fourth", "fifth", "sixth",
|
||
"seventh", "eighth", "ninth", "tenth", "eleventh", "twelfth"]
|
||
|
||
if input_str.lower() in aFrac:
|
||
return 1.0 / (aFrac.index(input_str) + 1)
|
||
if input_str == "quarter":
|
||
return 1.0 / 4
|
||
|
||
return False
|
||
|
||
|
||
def get_gender(word, input_string="", lang="en-us"):
|
||
'''
|
||
guess gender of word, optionally use raw input text for context
|
||
returns "m" if the word is male, "f" if female, False if unknown
|
||
'''
|
||
if "pt" in lang or "es" in lang:
|
||
# spanish follows same rules
|
||
return get_gender_pt(word, input_string)
|
||
return False
|
||
|
||
|
||
# ==============================================================
|
||
|
||
|
||
def normalize(text, lang="en-us", remove_articles=True):
|
||
"""Prepare a string for parsing
|
||
|
||
This function prepares the given text for parsing by making
|
||
numbers consistent, getting rid of contractions, etc.
|
||
Args:
|
||
text (str): the string to normalize
|
||
lang (str): the code for the language text is in
|
||
remove_articles (bool): whether to remove articles (like 'a', or 'the')
|
||
Returns:
|
||
(str): The normalized string.
|
||
"""
|
||
|
||
lang_lower = str(lang).lower()
|
||
if lang_lower.startswith("en"):
|
||
return normalize_en(text, remove_articles)
|
||
elif lang_lower.startswith("es"):
|
||
return normalize_es(text, remove_articles)
|
||
elif lang_lower.startswith("pt"):
|
||
return normalize_pt(text, remove_articles)
|
||
# TODO: Normalization for other languages
|
||
return text
|
||
|
||
|
||
def normalize_en(text, remove_articles):
|
||
""" English string normalization """
|
||
|
||
words = text.split() # this also removed extra spaces
|
||
normalized = ""
|
||
for word in words:
|
||
if remove_articles and word in ["the", "a", "an"]:
|
||
continue
|
||
|
||
# Expand common contractions, e.g. "isn't" -> "is not"
|
||
contraction = ["ain't", "aren't", "can't", "could've", "couldn't",
|
||
"didn't", "doesn't", "don't", "gonna", "gotta",
|
||
"hadn't", "hasn't", "haven't", "he'd", "he'll", "he's",
|
||
"how'd", "how'll", "how's", "I'd", "I'll", "I'm",
|
||
"I've", "isn't", "it'd", "it'll", "it's", "mightn't",
|
||
"might've", "mustn't", "must've", "needn't",
|
||
"oughtn't",
|
||
"shan't", "she'd", "she'll", "she's", "shouldn't",
|
||
"should've", "somebody's", "someone'd", "someone'll",
|
||
"someone's", "that'll", "that's", "that'd", "there'd",
|
||
"there're", "there's", "they'd", "they'll", "they're",
|
||
"they've", "wasn't", "we'd", "we'll", "we're", "we've",
|
||
"weren't", "what'd", "what'll", "what're", "what's",
|
||
"whats",
|
||
# technically incorrect but some STT does this
|
||
"what've", "when's", "when'd", "where'd", "where's",
|
||
"where've", "who'd", "who'd've", "who'll", "who're",
|
||
"who's", "who've", "why'd", "why're", "why's", "won't",
|
||
"won't've", "would've", "wouldn't", "wouldn't've",
|
||
"y'all", "ya'll", "you'd", "you'd've", "you'll",
|
||
"y'aint", "y'ain't", "you're", "you've"]
|
||
if word in contraction:
|
||
expansion = ["is not", "are not", "can not", "could have",
|
||
"could not", "did not", "does not", "do not",
|
||
"going to", "got to", "had not", "has not",
|
||
"have not", "he would", "he will", "he is",
|
||
"how did",
|
||
"how will", "how is", "I would", "I will", "I am",
|
||
"I have", "is not", "it would", "it will", "it is",
|
||
"might not", "might have", "must not", "must have",
|
||
"need not", "ought not", "shall not", "she would",
|
||
"she will", "she is", "should not", "should have",
|
||
"somebody is", "someone would", "someone will",
|
||
"someone is", "that will", "that is", "that would",
|
||
"there would", "there are", "there is", "they would",
|
||
"they will", "they are", "they have", "was not",
|
||
"we would", "we will", "we are", "we have",
|
||
"were not", "what did", "what will", "what are",
|
||
"what is",
|
||
"what is", "what have", "when is", "when did",
|
||
"where did", "where is", "where have", "who would",
|
||
"who would have", "who will", "who are", "who is",
|
||
"who have", "why did", "why are", "why is",
|
||
"will not", "will not have", "would have",
|
||
"would not", "would not have", "you all", "you all",
|
||
"you would", "you would have", "you will",
|
||
"you are not", "you are not", "you are", "you have"]
|
||
word = expansion[contraction.index(word)]
|
||
|
||
# Convert numbers into digits, e.g. "two" -> "2"
|
||
textNumbers = ["zero", "one", "two", "three", "four", "five", "six",
|
||
"seven", "eight", "nine", "ten", "eleven", "twelve",
|
||
"thirteen", "fourteen", "fifteen", "sixteen",
|
||
"seventeen", "eighteen", "nineteen", "twenty"]
|
||
if word in textNumbers:
|
||
word = str(textNumbers.index(word))
|
||
|
||
normalized += " " + word
|
||
|
||
return normalized[1:] # strip the initial space
|
||
|
||
|
||
####################################################################
|
||
# PT-PT
|
||
#
|
||
# TODO: numbers greater than 999999
|
||
# TODO: date time pt
|
||
####################################################################
|
||
|
||
# Undefined articles ["um", "uma", "uns", "umas"] can not be supressed,
|
||
# in PT, "um cavalo" means "a horse" or "one horse".
|
||
pt_articles = ["o", "a", "os", "as"]
|
||
|
||
pt_numbers = {
|
||
"zero": 0,
|
||
"um": 1,
|
||
"uma": 1,
|
||
"uns": 1,
|
||
"umas": 1,
|
||
"primeiro": 1,
|
||
"segundo": 2,
|
||
"terceiro": 3,
|
||
"dois": 2,
|
||
"duas": 2,
|
||
"tres": 3,
|
||
u"tr<EFBFBD>s": 3,
|
||
"quatro": 4,
|
||
"cinco": 5,
|
||
"seis": 6,
|
||
"sete": 7,
|
||
"oito": 8,
|
||
"nove": 9,
|
||
"dez": 10,
|
||
"onze": 11,
|
||
"doze": 12,
|
||
"treze": 13,
|
||
"catorze": 14,
|
||
"quinze": 15,
|
||
"dezasseis": 16,
|
||
"dezassete": 17,
|
||
"dezoito": 18,
|
||
"dezanove": 19,
|
||
"vinte": 20,
|
||
"trinta": 30,
|
||
"quarenta": 40,
|
||
"cinquenta": 50,
|
||
"sessenta": 60,
|
||
"setenta": 70,
|
||
"oitenta": 80,
|
||
"noventa": 90,
|
||
"cem": 100,
|
||
"cento": 100,
|
||
"duzentos": 200,
|
||
"duzentas": 200,
|
||
"trezentos": 300,
|
||
"trezentas": 300,
|
||
"quatrocentos": 400,
|
||
"quatrocentas": 400,
|
||
"quinhentos": 500,
|
||
"quinhentas": 500,
|
||
"seiscentos": 600,
|
||
"seiscentas": 600,
|
||
"setecentos": 700,
|
||
"setecentas": 700,
|
||
"oitocentos": 800,
|
||
"oitocentas": 800,
|
||
"novecentos": 900,
|
||
"novecentas": 900,
|
||
"mil": 1000,
|
||
u"milh<EFBFBD>o": 1000000}
|
||
|
||
|
||
def isFractional_pt(input_str):
|
||
"""
|
||
This function takes the given text and checks if it is a fraction.
|
||
|
||
Args:
|
||
text (str): the string to check if fractional
|
||
Returns:
|
||
(bool) or (float): False if not a fraction, otherwise the fraction
|
||
|
||
"""
|
||
if input_str.endswith('s', -1):
|
||
input_str = input_str[:len(input_str) - 1] # e.g. "fifths"
|
||
|
||
aFrac = ["meio", u"ter<EFBFBD>o", "quarto", "quinto", "sexto",
|
||
"setimo", "oitavo", "nono", u"d<EFBFBD>cimo"]
|
||
|
||
if input_str.lower() in aFrac:
|
||
return 1.0 / (aFrac.index(input_str) + 2)
|
||
if input_str == u"vig<EFBFBD>simo":
|
||
return 1.0 / 20
|
||
if input_str == u"trig<EFBFBD>simo":
|
||
return 1.0 / 30
|
||
if input_str == u"cent<EFBFBD>simo":
|
||
return 1.0 / 100
|
||
if input_str == u"mil<EFBFBD>simo":
|
||
return 1.0 / 1000
|
||
if (input_str == u"s<EFBFBD>timo" or input_str == "septimo" or
|
||
input_str == u"s<EFBFBD>ptimo"):
|
||
return 1.0 / 7
|
||
|
||
return False
|
||
|
||
|
||
def extractnumber_pt(text):
|
||
"""
|
||
This function prepares the given text for parsing by making
|
||
numbers consistent, getting rid of contractions, etc.
|
||
Args:
|
||
text (str): the string to normalize
|
||
Returns:
|
||
(int) or (float): The value of extracted number
|
||
|
||
"""
|
||
aWords = text.split()
|
||
count = 0
|
||
result = None
|
||
while count < len(aWords):
|
||
val = 0
|
||
word = aWords[count]
|
||
next_next_word = None
|
||
if count + 1 < len(aWords):
|
||
next_word = aWords[count + 1]
|
||
if count + 2 < len(aWords):
|
||
next_next_word = aWords[count + 2]
|
||
else:
|
||
next_word = None
|
||
|
||
# is current word a number?
|
||
if word in pt_numbers:
|
||
val = pt_numbers[word]
|
||
elif word.isdigit(): # doesn't work with decimals
|
||
val = int(word)
|
||
elif is_numeric(word):
|
||
val = float(word)
|
||
elif isFractional_pt(word):
|
||
if not result:
|
||
result = 1
|
||
result = result * isFractional_pt(word)
|
||
count += 1
|
||
continue
|
||
|
||
if not val:
|
||
# look for fractions like "2/3"
|
||
aPieces = word.split('/')
|
||
# if (len(aPieces) == 2 and is_numeric(aPieces[0])
|
||
# and is_numeric(aPieces[1])):
|
||
if look_for_fractions(aPieces):
|
||
val = float(aPieces[0]) / float(aPieces[1])
|
||
|
||
if val:
|
||
if result is None:
|
||
result = 0
|
||
# handle fractions
|
||
if next_word != "avos":
|
||
result += val
|
||
else:
|
||
result = float(result) / float(val)
|
||
|
||
if next_word is None:
|
||
break
|
||
|
||
# number word and fraction
|
||
ands = ["e"]
|
||
if next_word in ands:
|
||
zeros = 0
|
||
if result is None:
|
||
count += 1
|
||
continue
|
||
newWords = aWords[count + 2:]
|
||
newText = ""
|
||
for word in newWords:
|
||
newText += word + " "
|
||
|
||
afterAndVal = extractnumber_pt(newText[:-1])
|
||
if afterAndVal:
|
||
if result < afterAndVal or result < 20:
|
||
while afterAndVal > 1:
|
||
afterAndVal = afterAndVal / 10.0
|
||
for word in newWords:
|
||
if word == "zero" or word == "0":
|
||
zeros += 1
|
||
else:
|
||
break
|
||
for i in range(0, zeros):
|
||
afterAndVal = afterAndVal / 10.0
|
||
result += afterAndVal
|
||
break
|
||
elif next_next_word is not None:
|
||
if next_next_word in ands:
|
||
newWords = aWords[count + 3:]
|
||
newText = ""
|
||
for word in newWords:
|
||
newText += word + " "
|
||
afterAndVal = extractnumber_pt(newText[:-1])
|
||
if afterAndVal:
|
||
if result is None:
|
||
result = 0
|
||
result += afterAndVal
|
||
break
|
||
|
||
decimals = ["ponto", "virgula", u"v<EFBFBD>rgula", ".", ","]
|
||
if next_word in decimals:
|
||
zeros = 0
|
||
newWords = aWords[count + 2:]
|
||
newText = ""
|
||
for word in newWords:
|
||
newText += word + " "
|
||
for word in newWords:
|
||
if word == "zero" or word == "0":
|
||
zeros += 1
|
||
else:
|
||
break
|
||
afterDotVal = str(extractnumber_pt(newText[:-1]))
|
||
afterDotVal = zeros * "0" + afterDotVal
|
||
result = float(str(result) + "." + afterDotVal)
|
||
break
|
||
count += 1
|
||
|
||
if result is None:
|
||
return False
|
||
|
||
# Return the $str with the number related words removed
|
||
# (now empty strings, so strlen == 0)
|
||
# aWords = [word for word in aWords if len(word) > 0]
|
||
# text = ' '.join(aWords)
|
||
if "." in str(result):
|
||
integer, dec = str(result).split(".")
|
||
# cast float to int
|
||
if dec == "0":
|
||
result = int(integer)
|
||
|
||
return result
|
||
|
||
|
||
def pt_number_parse(words, i):
|
||
def pt_cte(i, s):
|
||
if i < len(words) and s == words[i]:
|
||
return s, i + 1
|
||
return None
|
||
|
||
def pt_number_word(i, mi, ma):
|
||
if i < len(words):
|
||
v = pt_numbers.get(words[i])
|
||
if v and v >= mi and v <= ma:
|
||
return v, i + 1
|
||
return None
|
||
|
||
def pt_number_1_99(i):
|
||
r1 = pt_number_word(i, 1, 29)
|
||
if r1:
|
||
return r1
|
||
|
||
r1 = pt_number_word(i, 30, 90)
|
||
if r1:
|
||
v1, i1 = r1
|
||
r2 = pt_cte(i1, "e")
|
||
if r2:
|
||
v2, i2 = r2
|
||
r3 = pt_number_word(i2, 1, 9)
|
||
if r3:
|
||
v3, i3 = r3
|
||
return v1 + v3, i3
|
||
return r1
|
||
return None
|
||
|
||
def pt_number_1_999(i):
|
||
# [2-9]cientos [1-99]?
|
||
r1 = pt_number_word(i, 100, 900)
|
||
if r1:
|
||
v1, i1 = r1
|
||
r2 = pt_number_1_99(i1)
|
||
if r2:
|
||
v2, i2 = r2
|
||
return v1 + v2, i2
|
||
else:
|
||
return r1
|
||
|
||
# [1-99]
|
||
r1 = pt_number_1_99(i)
|
||
if r1:
|
||
return r1
|
||
|
||
return None
|
||
|
||
def pt_number(i):
|
||
# check for cero
|
||
r1 = pt_number_word(i, 0, 0)
|
||
if r1:
|
||
return r1
|
||
|
||
# check for [1-999] (mil [0-999])?
|
||
r1 = pt_number_1_999(i)
|
||
if r1:
|
||
v1, i1 = r1
|
||
r2 = pt_cte(i1, "mil")
|
||
if r2:
|
||
v2, i2 = r2
|
||
r3 = pt_number_1_999(i2)
|
||
if r3:
|
||
v3, i3 = r3
|
||
return v1 * 1000 + v3, i3
|
||
else:
|
||
return v1 * 1000, i2
|
||
else:
|
||
return r1
|
||
return None
|
||
|
||
return pt_number(i)
|
||
|
||
|
||
def normalize_pt(text, remove_articles):
|
||
""" PT string normalization """
|
||
|
||
words = text.split() # this also removed extra spaces
|
||
normalized = ""
|
||
# Contractions are not common in PT
|
||
|
||
# Convert numbers into digits, e.g. "dois" -> "2"
|
||
normalized = ""
|
||
i = 0
|
||
while i < len(words):
|
||
word = words[i]
|
||
# remove articles
|
||
if remove_articles and word in pt_articles:
|
||
i += 1
|
||
continue
|
||
|
||
# Convert numbers into digits
|
||
r = pt_number_parse(words, i)
|
||
if r:
|
||
v, i = r
|
||
normalized += " " + str(v)
|
||
continue
|
||
|
||
# NOTE temporary , handle some numbers above >999
|
||
if word in pt_numbers:
|
||
word = str(pt_numbers[word])
|
||
# end temporary
|
||
|
||
normalized += " " + word
|
||
i += 1
|
||
# some articles in pt-pt can not be removed, but many words can
|
||
# this is experimental and some meaning may be lost
|
||
# maybe agressive should default to False
|
||
# only usage will tell, as a native speaker this seems reasonable
|
||
return pt_pruning(normalized[1:], agressive=remove_articles)
|
||
|
||
|
||
def extract_datetime_pt(input_str, currentDate=None):
|
||
def clean_string(str):
|
||
# cleans the input string of unneeded punctuation and capitalization
|
||
# among other things
|
||
symbols = [".", ",", ";", "?", "!", u"<EFBFBD>", u"<EFBFBD>"]
|
||
noise_words = ["o", "os", "a", "as", "do", "da", "dos", "das", "de",
|
||
"ao", "aos"]
|
||
|
||
for word in symbols:
|
||
str = str.replace(word, "")
|
||
for word in noise_words:
|
||
str = str.replace(" " + word + " ", " ")
|
||
str = str.lower().replace(
|
||
u"<EFBFBD>",
|
||
"a").replace(
|
||
u"<EFBFBD>",
|
||
"c").replace(
|
||
u"<EFBFBD>",
|
||
"a").replace(
|
||
u"<EFBFBD>",
|
||
"a").replace(
|
||
u"<EFBFBD>",
|
||
"e").replace(
|
||
u"<EFBFBD>",
|
||
"e").replace(
|
||
u"<EFBFBD>",
|
||
"e").replace(
|
||
u"<EFBFBD>",
|
||
"o").replace(
|
||
u"<EFBFBD>",
|
||
"o").replace(
|
||
"-",
|
||
" ").replace(
|
||
"_",
|
||
"")
|
||
# handle synonims and equivalents, "tomorrow early = tomorrow morning
|
||
synonims = {"manha": ["manhazinha", "cedo", "cedinho"],
|
||
"tarde": ["tardinha", "tarde"],
|
||
"noite": ["noitinha", "anoitecer"],
|
||
"todos": ["ao", "aos"],
|
||
"em": ["do", "da", "dos", "das", "de"]}
|
||
for syn in synonims:
|
||
for word in synonims[syn]:
|
||
str = str.replace(" " + word + " ", " " + syn + " ")
|
||
# relevant plurals, cant just extract all s in pt
|
||
wordlist = ["manhas", "noites", "tardes", "dias", "semanas", "anos",
|
||
"minutos", "segundos", "nas", "nos", "proximas",
|
||
"seguintes", "horas"]
|
||
for idx, word in enumerate(wordlist):
|
||
str = str.replace(word, word.rstrip('s'))
|
||
str = str.replace("meses", "mes").replace("anteriores", "anterior")
|
||
return str
|
||
|
||
def date_found():
|
||
return found or \
|
||
(
|
||
datestr != "" or timeStr != "" or
|
||
yearOffset != 0 or monthOffset != 0 or
|
||
dayOffset is True or hrOffset != 0 or
|
||
hrAbs != 0 or minOffset != 0 or
|
||
minAbs != 0 or secOffset != 0
|
||
)
|
||
|
||
if input_str == "":
|
||
return None
|
||
if currentDate is None:
|
||
currentDate = datetime.now()
|
||
|
||
found = False
|
||
daySpecified = False
|
||
dayOffset = False
|
||
monthOffset = 0
|
||
yearOffset = 0
|
||
dateNow = currentDate
|
||
today = dateNow.strftime("%w")
|
||
currentYear = dateNow.strftime("%Y")
|
||
fromFlag = False
|
||
datestr = ""
|
||
hasYear = False
|
||
timeQualifier = ""
|
||
|
||
words = clean_string(input_str).split(" ")
|
||
timeQualifiersList = ['manha', 'tarde', 'noite']
|
||
time_indicators = ["em", "as", "nas", "pelas", "volta", "depois", "estas",
|
||
"no", "dia", "hora"]
|
||
days = ['segunda', 'terca', 'quarta',
|
||
'quinta', 'sexta', 'sabado', 'domingo']
|
||
months = ['janeiro', 'febreiro', 'marco', 'abril', 'maio', 'junho',
|
||
'julho', 'agosto', 'setembro', 'outubro', 'novembro',
|
||
'dezembro']
|
||
monthsShort = ['jan', 'feb', 'mar', 'abr', 'mai', 'jun', 'jul', 'ag',
|
||
'set', 'out', 'nov', 'dec']
|
||
nexts = ["proximo", "proxima"]
|
||
suffix_nexts = ["seguinte", "subsequente", "seguir"]
|
||
lasts = ["ultimo", "ultima"]
|
||
suffix_lasts = ["passada", "passado", "anterior", "antes"]
|
||
nxts = ["depois", "seguir", "seguida", "seguinte", "proxima", "proximo"]
|
||
prevs = ["antes", "ante", "previa", "previamente", "anterior"]
|
||
froms = ["partir", "em", "para", "na", "no", "daqui", "seguir",
|
||
"depois", "por", "proxima", "proximo", "da", "do", "de"]
|
||
thises = ["este", "esta", "deste", "desta", "neste", "nesta", "nesse",
|
||
"nessa"]
|
||
froms += thises
|
||
lists = nxts + prevs + froms + time_indicators
|
||
for idx, word in enumerate(words):
|
||
if word == "":
|
||
continue
|
||
wordPrevPrevPrev = words[idx - 3] if idx > 2 else ""
|
||
wordPrevPrev = words[idx - 2] if idx > 1 else ""
|
||
wordPrev = words[idx - 1] if idx > 0 else ""
|
||
wordNext = words[idx + 1] if idx + 1 < len(words) else ""
|
||
wordNextNext = words[idx + 2] if idx + 2 < len(words) else ""
|
||
wordNextNextNext = words[idx + 3] if idx + 3 < len(words) else ""
|
||
|
||
start = idx
|
||
used = 0
|
||
# save timequalifier for later
|
||
if word in timeQualifiersList:
|
||
timeQualifier = word
|
||
|
||
# parse today, tomorrow, yesterday
|
||
elif word == "hoje" and not fromFlag:
|
||
dayOffset = 0
|
||
used += 1
|
||
elif word == "amanha" and not fromFlag:
|
||
dayOffset = 1
|
||
used += 1
|
||
elif word == "ontem" and not fromFlag:
|
||
dayOffset -= 1
|
||
used += 1
|
||
# "before yesterday" and "before before yesterday"
|
||
elif (word == "anteontem" or
|
||
(word == "ante" and wordNext == "ontem")) and not fromFlag:
|
||
dayOffset -= 2
|
||
used += 1
|
||
if wordNext == "ontem":
|
||
used += 1
|
||
elif word == "ante" and wordNext == "ante" and wordNextNext == \
|
||
"ontem" and not fromFlag:
|
||
dayOffset -= 3
|
||
used += 3
|
||
elif word == "anteanteontem" and not fromFlag:
|
||
dayOffset -= 3
|
||
used += 1
|
||
# day after tomorrow
|
||
elif word == "depois" and wordNext == "amanha" and not fromFlag:
|
||
dayOffset += 2
|
||
used = 2
|
||
# day before yesterday
|
||
elif word == "antes" and wordNext == "ontem" and not fromFlag:
|
||
dayOffset -= 2
|
||
used = 2
|
||
# parse 5 days, 10 weeks, last week, next week, week after
|
||
elif word == "dia":
|
||
if wordNext == "depois" or wordNext == "antes":
|
||
used += 1
|
||
if wordPrev and wordPrev[0].isdigit():
|
||
dayOffset += int(wordPrev)
|
||
start -= 1
|
||
used += 1
|
||
elif (wordPrev and wordPrev[0].isdigit() and
|
||
wordNext not in months and
|
||
wordNext not in monthsShort):
|
||
dayOffset += int(wordPrev)
|
||
start -= 1
|
||
used += 2
|
||
elif wordNext and wordNext[0].isdigit() and wordNextNext not in \
|
||
months and wordNextNext not in monthsShort:
|
||
dayOffset += int(wordNext)
|
||
start -= 1
|
||
used += 2
|
||
|
||
elif word == "semana" and not fromFlag:
|
||
if wordPrev[0].isdigit():
|
||
dayOffset += int(wordPrev) * 7
|
||
start -= 1
|
||
used = 2
|
||
for w in nexts:
|
||
if wordPrev == w:
|
||
dayOffset = 7
|
||
start -= 1
|
||
used = 2
|
||
for w in lasts:
|
||
if wordPrev == w:
|
||
dayOffset = -7
|
||
start -= 1
|
||
used = 2
|
||
for w in suffix_nexts:
|
||
if wordNext == w:
|
||
dayOffset = 7
|
||
start -= 1
|
||
used = 2
|
||
for w in suffix_lasts:
|
||
if wordNext == w:
|
||
dayOffset = -7
|
||
start -= 1
|
||
used = 2
|
||
# parse 10 months, next month, last month
|
||
elif word == "mes" and not fromFlag:
|
||
if wordPrev[0].isdigit():
|
||
monthOffset = int(wordPrev)
|
||
start -= 1
|
||
used = 2
|
||
for w in nexts:
|
||
if wordPrev == w:
|
||
monthOffset = 7
|
||
start -= 1
|
||
used = 2
|
||
for w in lasts:
|
||
if wordPrev == w:
|
||
monthOffset = -7
|
||
start -= 1
|
||
used = 2
|
||
for w in suffix_nexts:
|
||
if wordNext == w:
|
||
monthOffset = 7
|
||
start -= 1
|
||
used = 2
|
||
for w in suffix_lasts:
|
||
if wordNext == w:
|
||
monthOffset = -7
|
||
start -= 1
|
||
used = 2
|
||
# parse 5 years, next year, last year
|
||
elif word == "ano" and not fromFlag:
|
||
if wordPrev[0].isdigit():
|
||
yearOffset = int(wordPrev)
|
||
start -= 1
|
||
used = 2
|
||
for w in nexts:
|
||
if wordPrev == w:
|
||
yearOffset = 7
|
||
start -= 1
|
||
used = 2
|
||
for w in lasts:
|
||
if wordPrev == w:
|
||
yearOffset = -7
|
||
start -= 1
|
||
used = 2
|
||
for w in suffix_nexts:
|
||
if wordNext == w:
|
||
yearOffset = 7
|
||
start -= 1
|
||
used = 2
|
||
for w in suffix_lasts:
|
||
if wordNext == w:
|
||
yearOffset = -7
|
||
start -= 1
|
||
used = 2
|
||
# parse Monday, Tuesday, etc., and next Monday,
|
||
# last Tuesday, etc.
|
||
elif word in days and not fromFlag:
|
||
|
||
d = days.index(word)
|
||
dayOffset = (d + 1) - int(today)
|
||
used = 1
|
||
if dayOffset < 0:
|
||
dayOffset += 7
|
||
for w in nexts:
|
||
if wordPrev == w:
|
||
dayOffset += 7
|
||
used += 1
|
||
start -= 1
|
||
for w in lasts:
|
||
if wordPrev == w:
|
||
dayOffset -= 7
|
||
used += 1
|
||
start -= 1
|
||
for w in suffix_nexts:
|
||
if wordNext == w:
|
||
dayOffset += 7
|
||
used += 1
|
||
start -= 1
|
||
for w in suffix_lasts:
|
||
if wordNext == w:
|
||
dayOffset -= 7
|
||
used += 1
|
||
start -= 1
|
||
if wordNext == "feira":
|
||
used += 1
|
||
# parse 15 of July, June 20th, Feb 18, 19 of February
|
||
elif word in months or word in monthsShort:
|
||
try:
|
||
m = months.index(word)
|
||
except ValueError:
|
||
m = monthsShort.index(word)
|
||
used += 1
|
||
datestr = months[m]
|
||
if wordPrev and wordPrev[0].isdigit():
|
||
# 13 maio
|
||
datestr += " " + wordPrev
|
||
start -= 1
|
||
used += 1
|
||
if wordNext and wordNext[0].isdigit():
|
||
datestr += " " + wordNext
|
||
used += 1
|
||
hasYear = True
|
||
else:
|
||
hasYear = False
|
||
|
||
elif wordNext and wordNext[0].isdigit():
|
||
# maio 13
|
||
datestr += " " + wordNext
|
||
used += 1
|
||
if wordNextNext and wordNextNext[0].isdigit():
|
||
datestr += " " + wordNextNext
|
||
used += 1
|
||
hasYear = True
|
||
else:
|
||
hasYear = False
|
||
|
||
elif wordPrevPrev and wordPrevPrev[0].isdigit():
|
||
# 13 dia maio
|
||
datestr += " " + wordPrevPrev
|
||
|
||
start -= 2
|
||
used += 2
|
||
if wordNext and word[0].isdigit():
|
||
datestr += " " + wordNext
|
||
used += 1
|
||
hasYear = True
|
||
else:
|
||
hasYear = False
|
||
|
||
elif wordNextNext and wordNextNext[0].isdigit():
|
||
# maio dia 13
|
||
datestr += " " + wordNextNext
|
||
used += 2
|
||
if wordNextNextNext and wordNextNextNext[0].isdigit():
|
||
datestr += " " + wordNextNextNext
|
||
used += 1
|
||
hasYear = True
|
||
else:
|
||
hasYear = False
|
||
|
||
if datestr in months:
|
||
datestr = ""
|
||
|
||
# parse 5 days from tomorrow, 10 weeks from next thursday,
|
||
# 2 months from July
|
||
validFollowups = days + months + monthsShort
|
||
validFollowups.append("hoje")
|
||
validFollowups.append("amanha")
|
||
validFollowups.append("ontem")
|
||
validFollowups.append("anteontem")
|
||
validFollowups.append("agora")
|
||
validFollowups.append("ja")
|
||
validFollowups.append("ante")
|
||
|
||
# TODO debug word "depois" that one is failing for some reason
|
||
if word in froms and wordNext in validFollowups:
|
||
|
||
if not (wordNext == "amanha" and wordNext == "ontem") and not (
|
||
word == "depois" or word == "antes" or word == "em"):
|
||
used = 2
|
||
fromFlag = True
|
||
if wordNext == "amanha" and word != "depois":
|
||
dayOffset += 1
|
||
elif wordNext == "ontem":
|
||
dayOffset -= 1
|
||
elif wordNext == "anteontem":
|
||
dayOffset -= 2
|
||
elif wordNext == "ante" and wordNextNext == "ontem":
|
||
dayOffset -= 2
|
||
elif (wordNext == "ante" and wordNext == "ante" and
|
||
wordNextNextNext == "ontem"):
|
||
dayOffset -= 3
|
||
elif wordNext in days:
|
||
d = days.index(wordNext)
|
||
tmpOffset = (d + 1) - int(today)
|
||
used = 2
|
||
if wordNextNext == "feira":
|
||
used += 1
|
||
if tmpOffset < 0:
|
||
tmpOffset += 7
|
||
if wordNextNext:
|
||
if wordNextNext in nxts:
|
||
tmpOffset += 7
|
||
used += 1
|
||
elif wordNextNext in prevs:
|
||
tmpOffset -= 7
|
||
used += 1
|
||
dayOffset += tmpOffset
|
||
elif wordNextNext and wordNextNext in days:
|
||
d = days.index(wordNextNext)
|
||
tmpOffset = (d + 1) - int(today)
|
||
used = 3
|
||
if wordNextNextNext:
|
||
if wordNextNextNext in nxts:
|
||
tmpOffset += 7
|
||
used += 1
|
||
elif wordNextNextNext in prevs:
|
||
tmpOffset -= 7
|
||
used += 1
|
||
dayOffset += tmpOffset
|
||
if wordNextNextNext == "feira":
|
||
used += 1
|
||
if wordNext in months:
|
||
used -= 1
|
||
if used > 0:
|
||
|
||
if start - 1 > 0 and words[start - 1] in lists:
|
||
start -= 1
|
||
used += 1
|
||
|
||
for i in range(0, used):
|
||
words[i + start] = ""
|
||
|
||
if (start - 1 >= 0 and words[start - 1] in lists):
|
||
words[start - 1] = ""
|
||
found = True
|
||
daySpecified = True
|
||
|
||
# parse time
|
||
timeStr = ""
|
||
hrOffset = 0
|
||
minOffset = 0
|
||
secOffset = 0
|
||
hrAbs = 0
|
||
minAbs = 0
|
||
military = False
|
||
|
||
for idx, word in enumerate(words):
|
||
if word == "":
|
||
continue
|
||
|
||
wordPrevPrev = words[idx - 2] if idx > 1 else ""
|
||
wordPrev = words[idx - 1] if idx > 0 else ""
|
||
wordNext = words[idx + 1] if idx + 1 < len(words) else ""
|
||
wordNextNext = words[idx + 2] if idx + 2 < len(words) else ""
|
||
wordNextNextNext = words[idx + 3] if idx + 3 < len(words) else ""
|
||
# parse noon, midnight, morning, afternoon, evening
|
||
used = 0
|
||
if word == "meio" and wordNext == "dia":
|
||
hrAbs = 12
|
||
used += 2
|
||
elif word == "meia" and wordNext == "noite":
|
||
hrAbs = 0
|
||
used += 2
|
||
elif word == "manha":
|
||
if hrAbs == 0:
|
||
hrAbs = 8
|
||
used += 1
|
||
elif word == "tarde":
|
||
if hrAbs == 0:
|
||
hrAbs = 15
|
||
used += 1
|
||
elif word == "meio" and wordNext == "tarde":
|
||
if hrAbs == 0:
|
||
hrAbs = 17
|
||
used += 2
|
||
elif word == "meio" and wordNext == "manha":
|
||
if hrAbs == 0:
|
||
hrAbs = 10
|
||
used += 2
|
||
elif word == "fim" and wordNext == "tarde":
|
||
if hrAbs == 0:
|
||
hrAbs = 19
|
||
used += 2
|
||
elif word == "fim" and wordNext == "manha":
|
||
if hrAbs == 0:
|
||
hrAbs = 11
|
||
used += 2
|
||
elif word == "tantas" and wordNext == "manha":
|
||
if hrAbs == 0:
|
||
hrAbs = 4
|
||
used += 2
|
||
elif word == "noite":
|
||
if hrAbs == 0:
|
||
hrAbs = 22
|
||
used += 1
|
||
# parse half an hour, quarter hour
|
||
elif word == "hora" and \
|
||
(wordPrev in time_indicators or wordPrevPrev in
|
||
time_indicators):
|
||
if wordPrev == "meia":
|
||
minOffset = 30
|
||
elif wordPrev == "quarto":
|
||
minOffset = 15
|
||
elif wordPrevPrev == "quarto":
|
||
minOffset = 15
|
||
if idx > 2 and words[idx - 3] in time_indicators:
|
||
words[idx - 3] = ""
|
||
words[idx - 2] = ""
|
||
else:
|
||
hrOffset = 1
|
||
if wordPrevPrev in time_indicators:
|
||
words[idx - 2] = ""
|
||
words[idx - 1] = ""
|
||
used += 1
|
||
hrAbs = -1
|
||
minAbs = -1
|
||
# parse 5:00 am, 12:00 p.m., etc
|
||
elif word[0].isdigit():
|
||
isTime = True
|
||
strHH = ""
|
||
strMM = ""
|
||
remainder = ""
|
||
if ':' in word:
|
||
# parse colons
|
||
# "3:00 in the morning"
|
||
stage = 0
|
||
length = len(word)
|
||
for i in range(length):
|
||
if stage == 0:
|
||
if word[i].isdigit():
|
||
strHH += word[i]
|
||
elif word[i] == ":":
|
||
stage = 1
|
||
else:
|
||
stage = 2
|
||
i -= 1
|
||
elif stage == 1:
|
||
if word[i].isdigit():
|
||
strMM += word[i]
|
||
else:
|
||
stage = 2
|
||
i -= 1
|
||
elif stage == 2:
|
||
remainder = word[i:].replace(".", "")
|
||
break
|
||
if remainder == "":
|
||
nextWord = wordNext.replace(".", "")
|
||
if nextWord == "am" or nextWord == "pm":
|
||
remainder = nextWord
|
||
used += 1
|
||
elif wordNext == "manha":
|
||
remainder = "am"
|
||
used += 1
|
||
elif wordNext == "tarde":
|
||
remainder = "pm"
|
||
used += 1
|
||
elif wordNext == "noite":
|
||
if 0 < int(word[0]) < 6:
|
||
remainder = "am"
|
||
else:
|
||
remainder = "pm"
|
||
used += 1
|
||
elif wordNext in thises and wordNextNext == "manha":
|
||
remainder = "am"
|
||
used = 2
|
||
elif wordNext in thises and wordNextNext == "tarde":
|
||
remainder = "pm"
|
||
used = 2
|
||
elif wordNext in thises and wordNextNext == "noite":
|
||
remainder = "pm"
|
||
used = 2
|
||
else:
|
||
if timeQualifier != "":
|
||
military = True
|
||
if strHH <= 12 and \
|
||
(timeQualifier == "manha" or
|
||
timeQualifier == "tarde"):
|
||
strHH += 12
|
||
|
||
else:
|
||
# try to parse # s without colons
|
||
# 5 hours, 10 minutes etc.
|
||
length = len(word)
|
||
strNum = ""
|
||
remainder = ""
|
||
for i in range(length):
|
||
if word[i].isdigit():
|
||
strNum += word[i]
|
||
else:
|
||
remainder += word[i]
|
||
|
||
if remainder == "":
|
||
remainder = wordNext.replace(".", "").lstrip().rstrip()
|
||
|
||
if (
|
||
remainder == "pm" or
|
||
wordNext == "pm" or
|
||
remainder == "p.m." or
|
||
wordNext == "p.m."):
|
||
strHH = strNum
|
||
remainder = "pm"
|
||
used = 1
|
||
elif (
|
||
remainder == "am" or
|
||
wordNext == "am" or
|
||
remainder == "a.m." or
|
||
wordNext == "a.m."):
|
||
strHH = strNum
|
||
remainder = "am"
|
||
used = 1
|
||
else:
|
||
if (wordNext == "pm" or
|
||
wordNext == "p.m." or
|
||
wordNext == "tarde"):
|
||
strHH = strNum
|
||
remainder = "pm"
|
||
used = 1
|
||
elif (wordNext == "am" or
|
||
wordNext == "a.m." or
|
||
wordNext == "manha"):
|
||
strHH = strNum
|
||
remainder = "am"
|
||
used = 1
|
||
elif (int(word) > 100 and
|
||
(
|
||
wordPrev == "o" or
|
||
wordPrev == "oh" or
|
||
wordPrev == "zero"
|
||
)):
|
||
# 0800 hours (pronounced oh-eight-hundred)
|
||
strHH = int(word) / 100
|
||
strMM = int(word) - strHH * 100
|
||
military = True
|
||
if wordNext == "hora":
|
||
used += 1
|
||
elif (
|
||
wordNext == "hora" and
|
||
word[0] != '0' and
|
||
(
|
||
int(word) < 100 and
|
||
int(word) > 2400
|
||
)):
|
||
# ignores military time
|
||
# "in 3 hours"
|
||
hrOffset = int(word)
|
||
used = 2
|
||
isTime = False
|
||
hrAbs = -1
|
||
minAbs = -1
|
||
|
||
elif wordNext == "minuto":
|
||
# "in 10 minutes"
|
||
minOffset = int(word)
|
||
used = 2
|
||
isTime = False
|
||
hrAbs = -1
|
||
minAbs = -1
|
||
elif wordNext == "segundo":
|
||
# in 5 seconds
|
||
secOffset = int(word)
|
||
used = 2
|
||
isTime = False
|
||
hrAbs = -1
|
||
minAbs = -1
|
||
elif int(word) > 100:
|
||
strHH = int(word) / 100
|
||
strMM = int(word) - strHH * 100
|
||
military = True
|
||
if wordNext == "hora":
|
||
used += 1
|
||
|
||
elif wordNext == "" or (
|
||
wordNext == "em" and wordNextNext == "ponto"):
|
||
strHH = word
|
||
strMM = 00
|
||
if wordNext == "em" and wordNextNext == "ponto":
|
||
used += 2
|
||
if wordNextNextNext == "tarde":
|
||
remainder = "pm"
|
||
used += 1
|
||
elif wordNextNextNext == "manha":
|
||
remainder = "am"
|
||
used += 1
|
||
elif wordNextNextNext == "noite":
|
||
if 0 > strHH > 6:
|
||
remainder = "am"
|
||
else:
|
||
remainder = "pm"
|
||
used += 1
|
||
|
||
elif wordNext[0].isdigit():
|
||
strHH = word
|
||
strMM = wordNext
|
||
military = True
|
||
used += 1
|
||
if wordNextNext == "hora":
|
||
used += 1
|
||
else:
|
||
isTime = False
|
||
|
||
strHH = int(strHH) if strHH else 0
|
||
strMM = int(strMM) if strMM else 0
|
||
strHH = strHH + 12 if (remainder == "pm" and
|
||
0 < strHH < 12) else strHH
|
||
strHH = strHH - 12 if (remainder == "am" and
|
||
0 < strHH >= 12) else strHH
|
||
if strHH > 24 or strMM > 59:
|
||
isTime = False
|
||
used = 0
|
||
if isTime:
|
||
hrAbs = strHH * 1
|
||
minAbs = strMM * 1
|
||
used += 1
|
||
|
||
if used > 0:
|
||
# removed parsed words from the sentence
|
||
for i in range(used):
|
||
words[idx + i] = ""
|
||
|
||
if wordPrev == "em" or wordPrev == "ponto":
|
||
words[words.index(wordPrev)] = ""
|
||
|
||
if idx > 0 and wordPrev in time_indicators:
|
||
words[idx - 1] = ""
|
||
if idx > 1 and wordPrevPrev in time_indicators:
|
||
words[idx - 2] = ""
|
||
|
||
idx += used - 1
|
||
found = True
|
||
|
||
# check that we found a date
|
||
if not date_found:
|
||
return None
|
||
|
||
if dayOffset is False:
|
||
dayOffset = 0
|
||
|
||
# perform date manipulation
|
||
|
||
extractedDate = dateNow
|
||
extractedDate = extractedDate.replace(microsecond=0,
|
||
second=0,
|
||
minute=0,
|
||
hour=0)
|
||
if datestr != "":
|
||
en_months = ['january', 'february', 'march', 'april', 'may', 'june',
|
||
'july', 'august', 'september', 'october', 'november',
|
||
'december']
|
||
en_monthsShort = ['jan', 'feb', 'mar', 'apr', 'may', 'june', 'july',
|
||
'aug',
|
||
'sept', 'oct', 'nov', 'dec']
|
||
for idx, en_month in enumerate(en_months):
|
||
datestr = datestr.replace(months[idx], en_month)
|
||
for idx, en_month in enumerate(en_monthsShort):
|
||
datestr = datestr.replace(monthsShort[idx], en_month)
|
||
|
||
temp = datetime.strptime(datestr, "%B %d")
|
||
if not hasYear:
|
||
temp = temp.replace(year=extractedDate.year)
|
||
if extractedDate < temp:
|
||
extractedDate = extractedDate.replace(year=int(currentYear),
|
||
month=int(
|
||
temp.strftime(
|
||
"%m")),
|
||
day=int(temp.strftime(
|
||
"%d")))
|
||
else:
|
||
extractedDate = extractedDate.replace(
|
||
year=int(currentYear) + 1,
|
||
month=int(temp.strftime("%m")),
|
||
day=int(temp.strftime("%d")))
|
||
else:
|
||
extractedDate = extractedDate.replace(
|
||
year=int(temp.strftime("%Y")),
|
||
month=int(temp.strftime("%m")),
|
||
day=int(temp.strftime("%d")))
|
||
|
||
if timeStr != "":
|
||
temp = datetime(timeStr)
|
||
extractedDate = extractedDate.replace(hour=temp.strftime("%H"),
|
||
minute=temp.strftime("%M"),
|
||
second=temp.strftime("%S"))
|
||
|
||
if yearOffset != 0:
|
||
extractedDate = extractedDate + relativedelta(years=yearOffset)
|
||
if monthOffset != 0:
|
||
extractedDate = extractedDate + relativedelta(months=monthOffset)
|
||
if dayOffset != 0:
|
||
extractedDate = extractedDate + relativedelta(days=dayOffset)
|
||
if hrAbs != -1 and minAbs != -1:
|
||
|
||
extractedDate = extractedDate + relativedelta(hours=hrAbs,
|
||
minutes=minAbs)
|
||
if (hrAbs != 0 or minAbs != 0) and datestr == "":
|
||
if not daySpecified and dateNow > extractedDate:
|
||
extractedDate = extractedDate + relativedelta(days=1)
|
||
if hrOffset != 0:
|
||
extractedDate = extractedDate + relativedelta(hours=hrOffset)
|
||
if minOffset != 0:
|
||
extractedDate = extractedDate + relativedelta(minutes=minOffset)
|
||
if secOffset != 0:
|
||
extractedDate = extractedDate + relativedelta(seconds=secOffset)
|
||
|
||
resultStr = " ".join(words)
|
||
resultStr = ' '.join(resultStr.split())
|
||
resultStr = pt_pruning(resultStr)
|
||
return [extractedDate, resultStr]
|
||
|
||
|
||
def pt_pruning(text, symbols=True, accents=True, agressive=True):
|
||
# agressive pt word pruning
|
||
words = ["a", "o", "os", "as", "de", "dos", "das",
|
||
"lhe", "lhes", "me", "e", "no", "nas", "na", "nos", "em", "para",
|
||
"este",
|
||
"esta", "deste", "desta", "neste", "nesta", "nesse",
|
||
"nessa", "foi", "que"]
|
||
if symbols:
|
||
symbols = [".", ",", ";", ":", "!", "?", u"<EFBFBD>", u"<EFBFBD>"]
|
||
for symbol in symbols:
|
||
text = text.replace(symbol, "")
|
||
text = text.replace("-", " ").replace("_", " ")
|
||
if accents:
|
||
accents = {"a": [u"<EFBFBD>", u"<EFBFBD>", u"<EFBFBD>", u"<EFBFBD>"],
|
||
"e": [u"<EFBFBD>", u"<EFBFBD>", u"<EFBFBD>"],
|
||
"i": [u"<EFBFBD>", u"<EFBFBD>"],
|
||
"o": [u"<EFBFBD>", u"<EFBFBD>"],
|
||
"u": [u"<EFBFBD>", u"<EFBFBD>"],
|
||
"c": [u"<EFBFBD>", u"<EFBFBD>"]}
|
||
for char in accents:
|
||
for acc in accents[char]:
|
||
text = text.replace(acc, char)
|
||
if agressive:
|
||
text_words = text.split(" ")
|
||
for idx, word in enumerate(text_words):
|
||
if word in words:
|
||
text_words[idx] = ""
|
||
text = " ".join(text_words)
|
||
text = ' '.join(text.split())
|
||
return text
|
||
|
||
|
||
def get_gender_pt(word, raw_string=""):
|
||
word = word.rstrip("s")
|
||
gender = False
|
||
words = raw_string.split(" ")
|
||
for idx, w in enumerate(words):
|
||
if w == word and idx != 0:
|
||
previous = words[idx - 1]
|
||
gender = get_gender_pt(previous)
|
||
break
|
||
if not gender:
|
||
if word[-1] == "a":
|
||
gender = "f"
|
||
if word[-1] == "o" or word[-1] == "e":
|
||
gender = "m"
|
||
return gender
|
||
|
||
|
||
####################################################################
|
||
# Spanish normalization
|
||
#
|
||
# TODO: numbers greater than 999999
|
||
####################################################################
|
||
|
||
# Undefined articles ["un", "una", "unos", "unas"] can not be supressed,
|
||
# in Spanish, "un caballo" means "a horse" or "one horse".
|
||
es_articles = ["el", "la", "los", "las"]
|
||
|
||
es_numbers_xlat = {
|
||
"un": 1,
|
||
"uno": 1,
|
||
"una": 1,
|
||
"dos": 2,
|
||
"tres": 3,
|
||
u"tr<EFBFBD>s": 3,
|
||
"cuatro": 4,
|
||
"cinco": 5,
|
||
"seis": 6,
|
||
"siete": 7,
|
||
"ocho": 8,
|
||
"nueve": 9,
|
||
"diez": 10,
|
||
"once": 11,
|
||
"doce": 12,
|
||
"trece": 13,
|
||
"catorce": 14,
|
||
"quince": 15,
|
||
"dieciseis": 16,
|
||
u"diecis<EFBFBD>is": 16,
|
||
"diecisiete": 17,
|
||
"dieciocho": 18,
|
||
"diecinueve": 19,
|
||
"veinte": 20,
|
||
"veintiuno": 21,
|
||
u"veintid<EFBFBD>s": 22,
|
||
u"veintitr<EFBFBD>s": 23,
|
||
"veintidos": 22,
|
||
"veintitres": 23,
|
||
"veinticuatro": 24,
|
||
"veinticinco": 25,
|
||
u"veintis<EFBFBD>is": 26,
|
||
"veintiseis": 26,
|
||
"veintisiete": 27,
|
||
"veintiocho": 28,
|
||
"veintinueve": 29,
|
||
"treinta": 30,
|
||
"cuarenta": 40,
|
||
"cincuenta": 50,
|
||
"sesenta": 60,
|
||
"setenta": 70,
|
||
"ochenta": 80,
|
||
"noventa": 90,
|
||
"cien": 100,
|
||
"ciento": 100,
|
||
"doscientos": 200,
|
||
"doscientas": 200,
|
||
"trescientos": 300,
|
||
"trescientas": 300,
|
||
"cuatrocientos": 400,
|
||
"cuatrocientas": 400,
|
||
"quinientos": 500,
|
||
"quinientas": 500,
|
||
"seiscientos": 600,
|
||
"seiscientas": 600,
|
||
"setecientos": 700,
|
||
"setecientas": 700,
|
||
"ochocientos": 800,
|
||
"ochocientas": 800,
|
||
"novecientos": 900,
|
||
"novecientas": 900}
|
||
|
||
|
||
def es_parse(words, i):
|
||
def es_cte(i, s):
|
||
if i < len(words) and s == words[i]:
|
||
return s, i + 1
|
||
return None
|
||
|
||
def es_number_word(i, mi, ma):
|
||
if i < len(words):
|
||
v = es_numbers_xlat.get(words[i])
|
||
if v and v >= mi and v <= ma:
|
||
return v, i + 1
|
||
return None
|
||
|
||
def es_number_1_99(i):
|
||
r1 = es_number_word(i, 1, 29)
|
||
if r1:
|
||
return r1
|
||
|
||
r1 = es_number_word(i, 30, 90)
|
||
if r1:
|
||
v1, i1 = r1
|
||
r2 = es_cte(i1, "y")
|
||
if r2:
|
||
v2, i2 = r2
|
||
r3 = es_number_word(i2, 1, 9)
|
||
if r3:
|
||
v3, i3 = r3
|
||
return v1 + v3, i3
|
||
return r1
|
||
return None
|
||
|
||
def es_number_1_999(i):
|
||
# [2-9]cientos [1-99]?
|
||
r1 = es_number_word(i, 100, 900)
|
||
if r1:
|
||
v1, i1 = r1
|
||
r2 = es_number_1_99(i1)
|
||
if r2:
|
||
v2, i2 = r2
|
||
return v1 + v2, i2
|
||
else:
|
||
return r1
|
||
|
||
# [1-99]
|
||
r1 = es_number_1_99(i)
|
||
if r1:
|
||
return r1
|
||
|
||
return None
|
||
|
||
def es_number(i):
|
||
# check for cero
|
||
r1 = es_number_word(i, 0, 0)
|
||
if r1:
|
||
return r1
|
||
|
||
# check for [1-999] (mil [0-999])?
|
||
r1 = es_number_1_999(i)
|
||
if r1:
|
||
v1, i1 = r1
|
||
r2 = es_cte(i1, "mil")
|
||
if r2:
|
||
v2, i2 = r2
|
||
r3 = es_number_1_999(i2)
|
||
if r3:
|
||
v3, i3 = r3
|
||
return v1 * 1000 + v3, i3
|
||
else:
|
||
return v1 * 1000, i2
|
||
else:
|
||
return r1
|
||
return None
|
||
|
||
return es_number(i)
|
||
|
||
|
||
def normalize_es(text, remove_articles):
|
||
""" Spanish string normalization """
|
||
|
||
words = text.split() # this also removed extra spaces
|
||
|
||
normalized = ""
|
||
i = 0
|
||
while i < len(words):
|
||
word = words[i]
|
||
|
||
if remove_articles and word in es_articles:
|
||
i += 1
|
||
continue
|
||
|
||
# Convert numbers into digits
|
||
r = es_parse(words, i)
|
||
if r:
|
||
v, i = r
|
||
normalized += " " + str(v)
|
||
continue
|
||
|
||
normalized += " " + word
|
||
i += 1
|
||
|
||
return normalized[1:] # strip the initial space
|