1158 lines
43 KiB
Python
1158 lines
43 KiB
Python
# -*- coding: utf-8 -*-
|
|
#
|
|
# Copyright 2017 Mycroft AI Inc.
|
|
#
|
|
# Licensed under the Apache License, Version 2.0 (the "License");
|
|
# you may not use this file except in compliance with the License.
|
|
# You may obtain a copy of the License at
|
|
#
|
|
# http://www.apache.org/licenses/LICENSE-2.0
|
|
#
|
|
# Unless required by applicable law or agreed to in writing, software
|
|
# distributed under the License is distributed on an "AS IS" BASIS,
|
|
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
# See the License for the specific language governing permissions and
|
|
# limitations under the License.
|
|
#
|
|
from datetime import datetime
|
|
|
|
from dateutil.relativedelta import relativedelta
|
|
|
|
from mycroft.util.lang.parse_common import is_numeric, look_for_fractions, \
|
|
extract_numbers_generic
|
|
from mycroft.util.lang.format_en import NUM_STRING_EN, LONG_SCALE_EN, \
|
|
SHORT_SCALE_EN, pronounce_number_en
|
|
|
|
# Ordinal names keyed by their numeric value, short scale
# (billion = 1e9, trillion = 1e12, ...).
SHORT_ORDINAL_STRING_EN = {
    1: 'first',
    2: 'second',
    3: 'third',
    4: 'fourth',
    5: 'fifth',
    6: 'sixth',
    7: 'seventh',
    8: 'eighth',
    9: 'ninth',
    10: 'tenth',
    11: 'eleventh',
    12: 'twelfth',
    13: 'thirteenth',
    14: 'fourteenth',
    15: 'fifteenth',
    16: 'sixteenth',
    17: 'seventeenth',
    18: 'eighteenth',
    19: 'nineteenth',
    20: 'twentieth',
    30: 'thirtieth',
    40: "fortieth",
    50: "fiftieth",
    60: "sixtieth",
    70: "seventieth",
    80: "eightieth",
    90: "ninetieth",
    # 100, not 10e3: 10e3 evaluates to 10000, which made "hundredth"
    # parse as 10000 (and as the fraction 1/10000).
    100: "hundredth",
    1e3: "thousandth",
    1e6: "millionth",
    1e9: "billionth",
    1e12: "trillionth",
    1e15: "quadrillionth",
    1e18: "quintillionth",
    1e21: "sextillionth",
    1e24: "septillionth",
    1e27: "octillionth",
    1e30: "nonillionth",
    1e33: "decillionth"
    # TODO > 1e33
}
|
|
|
|
# Ordinal names keyed by their numeric value, long scale
# (billion = 1e12, trillion = 1e18, ...).
LONG_ORDINAL_STRING_EN = {
    1: 'first',
    2: 'second',
    3: 'third',
    4: 'fourth',
    5: 'fifth',
    6: 'sixth',
    7: 'seventh',
    8: 'eighth',
    9: 'ninth',
    10: 'tenth',
    11: 'eleventh',
    12: 'twelfth',
    13: 'thirteenth',
    14: 'fourteenth',
    15: 'fifteenth',
    16: 'sixteenth',
    17: 'seventeenth',
    18: 'eighteenth',
    19: 'nineteenth',
    20: 'twentieth',
    30: 'thirtieth',
    40: "fortieth",
    50: "fiftieth",
    60: "sixtieth",
    70: "seventieth",
    80: "eightieth",
    90: "ninetieth",
    # 100, not 10e3: 10e3 evaluates to 10000, which made "hundredth"
    # parse as 10000 (and as the fraction 1/10000).
    100: "hundredth",
    1e3: "thousandth",
    1e6: "millionth",
    1e12: "billionth",
    1e18: "trillionth",
    1e24: "quadrillionth",
    1e30: "quintillionth",
    1e36: "sextillionth",
    1e42: "septillionth",
    1e48: "octillionth",
    1e54: "nonillionth",
    1e60: "decillionth"
    # TODO > 1e60
}
|
|
|
|
|
|
def extractnumber_en(text, short_scale=True, ordinals=False):
    """
    This function extracts a number from a text string,
    handles pronunciations in long scale and short scale

    https://en.wikipedia.org/wiki/Names_of_large_numbers

    Args:
        text (str): the string to normalize
        short_scale (bool): use short scale if True, long scale if False
        ordinals (bool): consider ordinal numbers, third=3 instead of 1/3
    Returns:
        (int) or (float) or False: The extracted number or False if no number
                                   was found

    """

    string_num_en = {
        "half": 0.5,
        "halves": 0.5,
        "couple": 2,
        "hundred": 100,
        "hundreds": 100,
        "thousand": 1000,
        "thousands": 1000,
        "million": 1000000,
        'millions': 1000000}

    string_num_ordinal_en = {}

    for num in NUM_STRING_EN:
        num_string = NUM_STRING_EN[num]
        string_num_en[num_string] = num

    # first, second...
    if ordinals:
        ordinal_names = SHORT_ORDINAL_STRING_EN if short_scale \
            else LONG_ORDINAL_STRING_EN
        for num in ordinal_names:
            num_string = ordinal_names[num]
            string_num_ordinal_en[num_string] = num
            string_num_en[num_string] = num

    # negate next number (-2 = 0 - 2)
    negatives = ["negative", "minus"]

    # sum the next number (twenty two = 20 + 2)
    sums = ['twenty', 'thirty', 'forty', 'fifty', 'sixty', 'seventy', 'eighty',
            'ninety']

    # multiply the previous number (one hundred = 1 * 100)
    multiplies = ["hundred", "thousand", "hundreds", "thousands", "million",
                  "millions"]

    # split sentence parse separately and sum ( 2 and a half = 2 + 0.5 )
    fraction_marker = [" and "]

    # decimal marker ( 1 point 5 = 1 + 0.5)
    decimal_marker = [" point ", " dot "]

    # register the large-number names (and their plurals) for the chosen scale
    scale_names = SHORT_SCALE_EN if short_scale else LONG_SCALE_EN
    for num in scale_names:
        num_string = scale_names[num]
        string_num_en[num_string] = num
        string_num_en[num_string + "s"] = num
        multiplies.append(num_string)
        multiplies.append(num_string + "s")

    # 2 and 3/4
    for c in fraction_marker:
        components = text.split(c)

        if len(components) == 2:
            # ensure first is not a fraction and second is a fraction.
            # The recursive calls return False (never None) when nothing is
            # found; False compares like 0 and therefore fails both range
            # checks below.  The sub-parses deliberately keep ordinals off so
            # that "third" is read as 1/3 here.
            num1 = extractnumber_en(components[0], short_scale=short_scale)
            num2 = extractnumber_en(components[1], short_scale=short_scale)
            if num1 >= 1 and 0 < num2 < 1:
                return num1 + num2

    # 2 point 5
    for c in decimal_marker:
        components = text.split(c)
        if len(components) == 2:
            number = extractnumber_en(components[0], short_scale=short_scale)
            decimal = extractnumber_en(components[1], short_scale=short_scale)
            # TODO handle number dot number number number
            # Only a plain non-negative integer can be glued behind "0.".
            # This skips floats (as before) and also the "nothing found"
            # result False, which used to raise ValueError through
            # float("0.False") on inputs such as "what point is".
            if decimal is not False and str(decimal).isdigit():
                return number + float("0." + str(decimal))

    aWords = text.split()
    aWords = [word for word in aWords if word not in ["the", "a", "an"]]
    val = False
    prev_val = None
    to_sum = []
    for idx, word in enumerate(aWords):

        if not word:
            continue
        prev_word = aWords[idx - 1] if idx > 0 else ""
        next_word = aWords[idx + 1] if idx + 1 < len(aWords) else ""

        # is this word already a number ?
        if is_numeric(word):
            # if word.isdigit():            # doesn't work with decimals
            val = float(word)

        # is this word the name of a number ?
        if word in string_num_en:
            val = string_num_en[word]

        # is the prev word an ordinal number and current word is one?
        # second one, third one
        # (value comparison: "is 1" relied on CPython's small-int cache)
        if ordinals and prev_word in string_num_ordinal_en and val == 1:
            val = prev_val

        # is the prev word a number and should we sum it?
        # twenty two, fifty six
        if prev_word in sums and word in string_num_en:
            if val and val < 10:
                val = prev_val + val

        # is the prev word a number and should we multiply it?
        # twenty hundred, six hundred
        if word in multiplies:
            if not prev_val:
                prev_val = 1
            val = prev_val * val

        # is this a spoken fraction?
        # half cup
        if val is False:
            val = isFractional_en(word, short_scale=short_scale)

        # 2 fifths
        if not ordinals:
            next_value = isFractional_en(next_word, short_scale=short_scale)
            if next_value:
                if not val:
                    val = 1
                val = val * next_value

        # is this a negative number?
        if val and prev_word and prev_word in negatives:
            val = 0 - val

        # let's make sure it isn't a fraction
        if not val:
            # look for fractions like "2/3"
            aPieces = word.split('/')
            if look_for_fractions(aPieces):
                val = float(aPieces[0]) / float(aPieces[1])

        else:
            prev_val = val

        # handle long numbers
        # six hundred sixty six
        # two million five hundred thousand
        if word in multiplies and next_word not in multiplies:
            to_sum.append(val)
            val = 0
            prev_val = 0

    # add the completed groups ("six hundred" + "sixty six"); when nothing
    # was collected val is returned untouched, so "not found" stays False
    for v in to_sum:
        val = val + v
    return val
|
|
|
|
|
|
def extract_datetime_en(string, dateNow, default_time):
    """ Convert a human date reference into an exact datetime

    Convert things like
        "today"
        "tomorrow afternoon"
        "next Tuesday at 4pm"
        "August 3rd"
    into a datetime, relative to the reference date/time dateNow.
    Also consumes the words used to define the date
    returning the remaining string.  For example, the string
       "what is Tuesday's weather forecast"
    returns the date for the forthcoming Tuesday relative to the reference
    date and the remainder string
       "what is weather forecast".

    Args:
        string (str): string containing date words
        dateNow (datetime): A reference date/time for "tomorrow", etc
        default_time (time): Time to set if no time was found in the string

    Returns:
        [datetime, str]: An array containing the datetime and the remaining
                         text not consumed in the parsing, or None if the
                         string is empty, dateNow is missing, or no
                         date or time related text was found.
    """

    def clean_string(s):
        # clean unneeded punctuation and capitalization among other things.
        s = s.lower().replace('?', '').replace('.', '').replace(',', '') \
            .replace(' the ', ' ').replace(' a ', ' ').replace(' an ', ' ') \
            .replace("o' clock", "o'clock").replace("o clock", "o'clock") \
            .replace("o ' clock", "o'clock").replace("o 'clock", "o'clock") \
            .replace("oclock", "o'clock").replace("couple", "2") \
            .replace("centuries", "century").replace("decades", "decade") \
            .replace("millenniums", "millennium")

        wordList = s.split()
        for idx, word in enumerate(wordList):
            word = word.replace("'s", "")

            ordinals = ["rd", "st", "nd", "th"]
            # a bare "'s" token is empty at this point, hence the guard
            if word and word[0].isdigit():
                for ordinal in ordinals:
                    # "second" is the only case we should not do this
                    if ordinal in word and "second" not in word:
                        word = word.replace(ordinal, "")
            wordList[idx] = word

        return wordList

    def starts_with_digit(w):
        # Neighbouring words are "" at the sentence edges and wherever an
        # earlier match was blanked out, so w[0] must not be read blindly.
        return bool(w) and w[0].isdigit()

    def date_found():
        # A bare time qualifier ("tonight", "night") is recorded by the first
        # pass without being consumed, so it has to be checked explicitly.
        return found or \
            (
                datestr != "" or timeQualifier != "" or
                yearOffset != 0 or monthOffset != 0 or
                dayOffset is True or hrOffset != 0 or
                hrAbs or minOffset != 0 or
                minAbs or secOffset != 0
            )

    if string == "" or not dateNow:
        return None

    found = False
    daySpecified = False
    dayOffset = False
    monthOffset = 0
    yearOffset = 0
    today = dateNow.strftime("%w")
    currentYear = dateNow.strftime("%Y")
    fromFlag = False
    datestr = ""
    hasYear = False
    timeQualifier = ""

    timeQualifiersAM = ['morning']
    timeQualifiersPM = ['afternoon', 'evening', 'tonight', 'night']
    timeQualifiersList = set(timeQualifiersAM + timeQualifiersPM)
    markers = ['at', 'in', 'on', 'by', 'this', 'around', 'for', 'of', "within"]
    days = ['monday', 'tuesday', 'wednesday',
            'thursday', 'friday', 'saturday', 'sunday']
    months = ['january', 'february', 'march', 'april', 'may', 'june',
              'july', 'august', 'september', 'october', 'november',
              'december']
    monthsShort = ['jan', 'feb', 'mar', 'apr', 'may', 'june', 'july', 'aug',
                   'sept', 'oct', 'nov', 'dec']
    year_multiples = ["decade", "century", "millennium"]
    day_multiples = ["weeks", "months", "years"]

    words = clean_string(string)

    # ---- first pass: dates (days, weeks, months, years, weekdays) ----
    for idx, word in enumerate(words):
        if word == "":
            continue
        wordPrevPrev = words[idx - 2] if idx > 1 else ""
        wordPrev = words[idx - 1] if idx > 0 else ""
        wordNext = words[idx + 1] if idx + 1 < len(words) else ""
        wordNextNext = words[idx + 2] if idx + 2 < len(words) else ""

        # this isn't in clean string because I don't want to save back to words
        word = word.rstrip('s')
        start = idx
        used = 0
        # save timequalifier for later

        if word == "now" and not datestr:
            resultStr = " ".join(words[idx + 1:])
            resultStr = ' '.join(resultStr.split())
            extractedDate = dateNow.replace(microsecond=0)
            return [extractedDate, resultStr]
        elif wordNext in year_multiples:
            multiplier = None
            if is_numeric(word):
                multiplier = extractnumber_en(word)
            multiplier = multiplier or 1
            multiplier = int(multiplier)
            used += 2
            if wordNext == "decade":
                yearOffset = multiplier * 10
            elif wordNext == "century":
                yearOffset = multiplier * 100
            elif wordNext == "millennium":
                yearOffset = multiplier * 1000
        # couple of
        elif word == "2" and wordNext == "of" and \
                wordNextNext in year_multiples:
            multiplier = 2
            used += 3
            if wordNextNext == "decade":
                yearOffset = multiplier * 10
            elif wordNextNext == "century":
                yearOffset = multiplier * 100
            elif wordNextNext == "millennium":
                yearOffset = multiplier * 1000
        elif word == "2" and wordNext == "of" and \
                wordNextNext in day_multiples:
            multiplier = 2
            used += 3
            if wordNextNext == "years":
                yearOffset = multiplier
            elif wordNextNext == "months":
                monthOffset = multiplier
            elif wordNextNext == "weeks":
                dayOffset = multiplier * 7
        elif word in timeQualifiersList:
            timeQualifier = word
        # parse today, tomorrow, day after tomorrow
        elif word == "today" and not fromFlag:
            dayOffset = 0
            used += 1
        elif word == "tomorrow" and not fromFlag:
            dayOffset = 1
            used += 1
        elif (word == "day" and
              wordNext == "after" and
              wordNextNext == "tomorrow" and
              not fromFlag and
              not starts_with_digit(wordPrev)):
            dayOffset = 2
            used = 3
            if wordPrev == "the":
                start -= 1
                used += 1
        # parse 5 days, 10 weeks, last week, next week
        elif word == "day":
            if starts_with_digit(wordPrev):
                dayOffset += int(wordPrev)
                start -= 1
                used = 2
        elif word == "week" and not fromFlag:
            if starts_with_digit(wordPrev):
                dayOffset += int(wordPrev) * 7
                start -= 1
                used = 2
            elif wordPrev == "next":
                dayOffset = 7
                start -= 1
                used = 2
            elif wordPrev == "last":
                dayOffset = -7
                start -= 1
                used = 2
        # parse 10 months, next month, last month
        elif word == "month" and not fromFlag:
            if starts_with_digit(wordPrev):
                monthOffset = int(wordPrev)
                start -= 1
                used = 2
            elif wordPrev == "next":
                monthOffset = 1
                start -= 1
                used = 2
            elif wordPrev == "last":
                monthOffset = -1
                start -= 1
                used = 2
        # parse 5 years, next year, last year
        elif word == "year" and not fromFlag:
            if starts_with_digit(wordPrev):
                yearOffset = int(wordPrev)
                start -= 1
                used = 2
            elif wordPrev == "next":
                yearOffset = 1
                start -= 1
                used = 2
            elif wordPrev == "last":
                yearOffset = -1
                start -= 1
                used = 2
        # parse Monday, Tuesday, etc., and next Monday,
        # last Tuesday, etc.
        elif word in days and not fromFlag:
            d = days.index(word)
            dayOffset = (d + 1) - int(today)
            used = 1
            if dayOffset < 0:
                dayOffset += 7
            if wordPrev == "next":
                dayOffset += 7
                used += 1
                start -= 1
            elif wordPrev == "last":
                dayOffset -= 7
                used += 1
                start -= 1
        # parse 15 of July, June 20th, Feb 18, 19 of February
        # NOTE(review): "and" binds tighter than "or", so fromFlag only
        # gates the short month names here -- confirm whether that is
        # intended before changing it.
        elif word in months or word in monthsShort and not fromFlag:
            try:
                m = months.index(word)
            except ValueError:
                m = monthsShort.index(word)
            used += 1
            datestr = months[m]
            dayBeforeOf = (wordPrev == "of" and
                           starts_with_digit(wordPrevPrev))
            if starts_with_digit(wordPrev) or dayBeforeOf:
                if dayBeforeOf:
                    # "15 of july": the day sits two words back
                    datestr += " " + words[idx - 2]
                    used += 1
                    start -= 1
                else:
                    datestr += " " + wordPrev
                start -= 1
                used += 1
                if wordNext and wordNext[0].isdigit():
                    datestr += " " + wordNext
                    used += 1
                    hasYear = True
                else:
                    hasYear = False

            elif wordNext and wordNext[0].isdigit():
                datestr += " " + wordNext
                used += 1
                if wordNextNext and wordNextNext[0].isdigit():
                    datestr += " " + wordNextNext
                    used += 1
                    hasYear = True
                else:
                    hasYear = False
        # parse 5 days from tomorrow, 10 weeks from next thursday,
        # 2 months from July
        validFollowups = days + months + monthsShort
        validFollowups.append("today")
        validFollowups.append("tomorrow")
        validFollowups.append("next")
        validFollowups.append("last")
        validFollowups.append("now")
        if (word == "from" or word == "after") and wordNext in validFollowups:
            used = 2
            fromFlag = True
            if wordNext == "tomorrow":
                dayOffset += 1
            elif wordNext in days:
                d = days.index(wordNext)
                tmpOffset = (d + 1) - int(today)
                used = 2
                if tmpOffset < 0:
                    tmpOffset += 7
                dayOffset += tmpOffset
            elif wordNextNext and wordNextNext in days:
                d = days.index(wordNextNext)
                tmpOffset = (d + 1) - int(today)
                used = 3
                if wordNext == "next":
                    tmpOffset += 7
                    used += 1
                    start -= 1
                elif wordNext == "last":
                    tmpOffset -= 7
                    used += 1
                    start -= 1
                dayOffset += tmpOffset
        if used > 0:
            if start - 1 > 0 and words[start - 1] == "this":
                start -= 1
                used += 1

            # blank out the words that were consumed
            for i in range(0, used):
                words[i + start] = ""

            if start - 1 >= 0 and words[start - 1] in markers:
                words[start - 1] = ""
            found = True
            daySpecified = True

    # ---- second pass: parse time ----
    hrOffset = 0
    minOffset = 0
    secOffset = 0
    hrAbs = None
    minAbs = None
    military = False

    for idx, word in enumerate(words):
        if word == "":
            continue

        wordPrevPrev = words[idx - 2] if idx > 1 else ""
        wordPrev = words[idx - 1] if idx > 0 else ""
        wordNext = words[idx + 1] if idx + 1 < len(words) else ""
        wordNextNext = words[idx + 2] if idx + 2 < len(words) else ""
        wordNextNextNext = words[idx + 3] if idx + 3 < len(words) else ""
        # parse noon, midnight, morning, afternoon, evening
        used = 0
        if word == "noon":
            hrAbs = 12
            used += 1
        elif word == "midnight":
            hrAbs = 0
            used += 1
        elif word == "morning":
            if hrAbs is None:
                hrAbs = 8
            used += 1
        elif word == "afternoon":
            if hrAbs is None:
                hrAbs = 15
            used += 1
        elif word == "evening":
            if hrAbs is None:
                hrAbs = 19
            used += 1
        # couple of time_unit
        elif word == "2" and wordNext == "of" and \
                wordNextNext in ["hours", "minutes", "seconds"]:
            used += 3
            if wordNextNext == "hours":
                hrOffset = 2
            elif wordNextNext == "minutes":
                minOffset = 2
            elif wordNextNext == "seconds":
                secOffset = 2
        # parse half an hour, quarter hour
        elif word == "hour" and \
                (wordPrev in markers or wordPrevPrev in markers):
            if wordPrev == "half":
                minOffset = 30
            elif wordPrev == "quarter":
                minOffset = 15
            elif wordPrevPrev == "quarter":
                minOffset = 15
                if idx > 2 and words[idx - 3] in markers:
                    words[idx - 3] = ""
                    # NOTE(review): never true, the slot was blanked on the
                    # line above -- kept as found; confirm intended order.
                    if words[idx - 3] == "this":
                        daySpecified = True
                words[idx - 2] = ""
            elif wordPrev == "within":
                hrOffset = 1
            else:
                hrOffset = 1
            if wordPrevPrev in markers:
                words[idx - 2] = ""
                if wordPrevPrev == "this":
                    daySpecified = True
            words[idx - 1] = ""
            used += 1
            # -1 marks "relative offset only, no absolute clock time"
            hrAbs = -1
            minAbs = -1
        # parse in a minute
        elif word == "minute" and wordPrev == "in":
            minOffset = 1
            words[idx - 1] = ""
            used += 1
        # parse in a second
        elif word == "second" and wordPrev == "in":
            secOffset = 1
            words[idx - 1] = ""
            used += 1
        # parse 5:00 am, 12:00 p.m., etc
        elif word[0].isdigit():
            isTime = True
            strHH = ""
            strMM = ""
            remainder = ""
            if ':' in word:
                # parse colons
                # "3:00 in the morning"
                stage = 0
                length = len(word)
                for i in range(length):
                    if stage == 0:
                        if word[i].isdigit():
                            strHH += word[i]
                        elif word[i] == ":":
                            stage = 1
                        else:
                            stage = 2
                            i -= 1
                    elif stage == 1:
                        if word[i].isdigit():
                            strMM += word[i]
                        else:
                            stage = 2
                            i -= 1
                    elif stage == 2:
                        remainder = word[i:].replace(".", "")
                        break
                if remainder == "":
                    nextWord = wordNext.replace(".", "")
                    if nextWord == "am" or nextWord == "pm":
                        remainder = nextWord
                        used += 1
                    elif nextWord == "tonight":
                        remainder = "pm"
                        used += 1
                    elif wordNext == "in" and wordNextNext == "the" and \
                            wordNextNextNext == "morning":
                        remainder = "am"
                        used += 3
                    elif wordNext == "in" and wordNextNext == "the" and \
                            wordNextNextNext == "afternoon":
                        remainder = "pm"
                        used += 3
                    elif wordNext == "in" and wordNextNext == "the" and \
                            wordNextNextNext == "evening":
                        remainder = "pm"
                        used += 3
                    elif wordNext == "in" and wordNextNext == "morning":
                        remainder = "am"
                        used += 2
                    elif wordNext == "in" and wordNextNext == "afternoon":
                        remainder = "pm"
                        used += 2
                    elif wordNext == "in" and wordNextNext == "evening":
                        remainder = "pm"
                        used += 2
                    elif wordNext == "this" and wordNextNext == "morning":
                        remainder = "am"
                        used = 2
                        daySpecified = True
                    elif wordNext == "this" and wordNextNext == "afternoon":
                        remainder = "pm"
                        used = 2
                        daySpecified = True
                    elif wordNext == "this" and wordNextNext == "evening":
                        remainder = "pm"
                        used = 2
                        daySpecified = True
                    elif wordNext == "at" and wordNextNext == "night":
                        if strHH and int(strHH) > 5:
                            remainder = "pm"
                        else:
                            remainder = "am"
                        used += 2
                    else:
                        if timeQualifier != "":
                            military = True
                            if strHH and int(strHH) <= 12 and \
                                    (timeQualifier in timeQualifiersPM):
                                # assign, do not append: "3" must become
                                # "15", not "315"
                                strHH = str(int(strHH) + 12)
            else:
                # try to parse numbers without colons
                # 5 hours, 10 minutes etc.
                length = len(word)
                strNum = ""
                remainder = ""
                for i in range(length):
                    if word[i].isdigit():
                        strNum += word[i]
                    else:
                        remainder += word[i]

                if remainder == "":
                    remainder = wordNext.replace(".", "").lstrip().rstrip()
                if (
                        remainder == "pm" or
                        wordNext == "pm" or
                        remainder == "p.m." or
                        wordNext == "p.m."):
                    strHH = strNum
                    remainder = "pm"
                    used = 1
                elif (
                        remainder == "am" or
                        wordNext == "am" or
                        remainder == "a.m." or
                        wordNext == "a.m."):
                    strHH = strNum
                    remainder = "am"
                    used = 1
                else:
                    if (
                            int(strNum) > 100 and
                            (
                                wordPrev == "o" or
                                wordPrev == "oh"
                            )):
                        # 0800 hours (pronounced oh-eight-hundred)
                        strHH = str(int(strNum) // 100)
                        strMM = str(int(strNum) % 100)
                        military = True
                        if wordNext == "hours":
                            used += 1
                    elif (
                            (wordNext == "hours" or wordNext == "hour" or
                             remainder == "hours" or remainder == "hour") and
                            word[0] != '0' and
                            (
                                int(strNum) < 100 or
                                int(strNum) > 2400
                            )):
                        # ignores military time
                        # "in 3 hours"
                        hrOffset = int(strNum)
                        used = 2
                        isTime = False
                        hrAbs = -1
                        minAbs = -1

                    elif wordNext == "minutes" or wordNext == "minute" or \
                            remainder == "minutes" or remainder == "minute":
                        # "in 10 minutes"
                        minOffset = int(strNum)
                        used = 2
                        isTime = False
                        hrAbs = -1
                        minAbs = -1
                    elif wordNext == "seconds" or wordNext == "second" \
                            or remainder == "seconds" or remainder == "second":
                        # in 5 seconds
                        secOffset = int(strNum)
                        used = 2
                        isTime = False
                        hrAbs = -1
                        minAbs = -1
                    elif int(strNum) > 100:
                        # military time, eg. "3300 hours"
                        strHH = str(int(strNum) // 100)
                        strMM = str(int(strNum) % 100)
                        military = True
                        if wordNext == "hours" or wordNext == "hour" or \
                                remainder == "hours" or remainder == "hour":
                            used += 1
                    elif wordNext and wordNext[0].isdigit():
                        # military time, e.g. "04 38 hours"
                        strHH = strNum
                        strMM = wordNext
                        military = True
                        used += 1
                        if (wordNextNext == "hours" or
                                wordNextNext == "hour" or
                                remainder == "hours" or remainder == "hour"):
                            used += 1
                    elif (
                            wordNext == "" or wordNext == "o'clock" or
                            (
                                wordNext == "in" and
                                (
                                    wordNextNext == "the" or
                                    wordNextNext == timeQualifier
                                )
                            )):
                        strHH = strNum
                        strMM = "00"
                        if wordNext == "o'clock":
                            used += 1
                        if wordNext == "in" or wordNextNext == "in":
                            used += (1 if wordNext == "in" else 2)

                            # NOTE: "in timeQualifier" is a substring test
                            # on a str, kept exactly as found
                            if (wordNextNext and
                                    (wordNextNext in timeQualifier or
                                     wordNextNextNext in timeQualifier)):
                                if (wordNextNext in timeQualifiersPM or
                                        wordNextNextNext in timeQualifiersPM):
                                    remainder = "pm"
                                    used += 1
                                if (wordNextNext in timeQualifiersAM or
                                        wordNextNextNext in timeQualifiersAM):
                                    remainder = "am"
                                    used += 1
                        if timeQualifier != "":
                            used += 1  # TODO: Unsure if this is 100% accurate
                            military = True
                    else:
                        isTime = False

            HH = int(strHH) if strHH else 0
            MM = int(strMM) if strMM else 0
            HH = HH + 12 if remainder == "pm" and HH < 12 else HH
            HH = HH - 12 if remainder == "am" and HH >= 12 else HH

            if (not military and
                    remainder not in ['am', 'pm', 'hours', 'minutes',
                                      "second", "seconds",
                                      "hour", "minute"] and
                    ((not daySpecified) or dayOffset < 1)):
                # ambiguous time, detect whether they mean this evening or
                # the next morning based on whether it has already passed
                if dateNow.hour < HH:
                    pass  # No modification needed
                elif dateNow.hour < HH + 12:
                    HH += 12
                else:
                    # has passed, assume the next morning
                    dayOffset += 1

            if timeQualifier in timeQualifiersPM and HH < 12:
                HH += 12

            if HH > 24 or MM > 59:
                isTime = False
                used = 0
            if isTime:
                hrAbs = HH
                minAbs = MM
                used += 1
        if used > 0:
            # removed parsed words from the sentence
            for i in range(used):
                if idx + i >= len(words):
                    break
                words[idx + i] = ""

            if wordPrev == "o" or wordPrev == "oh":
                # blank the word directly before the number, not the first
                # "o"/"oh" that happens to occur in the sentence
                words[idx - 1] = ""

            if wordPrev == "early":
                hrOffset = -1
                words[idx - 1] = ""
                idx -= 1
            elif wordPrev == "late":
                hrOffset = 1
                words[idx - 1] = ""
                idx -= 1
            if idx > 0 and wordPrev in markers:
                words[idx - 1] = ""
                if wordPrev == "this":
                    daySpecified = True
            if idx > 1 and wordPrevPrev in markers:
                words[idx - 2] = ""
                if wordPrevPrev == "this":
                    daySpecified = True

            idx += used - 1
            found = True

    # check that we found a date (call the helper: the bare function object
    # is always truthy, which made this check a no-op)
    if not date_found():
        return None

    if dayOffset is False:
        dayOffset = 0

    # perform date manipulation

    extractedDate = dateNow.replace(microsecond=0)

    if datestr != "":
        # date included an explicit date, e.g. "june 5" or "june 2, 2017"
        try:
            temp = datetime.strptime(datestr, "%B %d")
        except ValueError:
            # Try again, allowing the year
            temp = datetime.strptime(datestr, "%B %d %Y")
        extractedDate = extractedDate.replace(hour=0, minute=0, second=0)
        if not hasYear:
            temp = temp.replace(year=extractedDate.year,
                                tzinfo=extractedDate.tzinfo)
            if extractedDate < temp:
                extractedDate = extractedDate.replace(
                    year=int(currentYear),
                    month=int(temp.strftime("%m")),
                    day=int(temp.strftime("%d")),
                    tzinfo=extractedDate.tzinfo)
            else:
                # that day has already passed this year, use next year
                extractedDate = extractedDate.replace(
                    year=int(currentYear) + 1,
                    month=int(temp.strftime("%m")),
                    day=int(temp.strftime("%d")),
                    tzinfo=extractedDate.tzinfo)
        else:
            extractedDate = extractedDate.replace(
                year=int(temp.strftime("%Y")),
                month=int(temp.strftime("%m")),
                day=int(temp.strftime("%d")),
                tzinfo=extractedDate.tzinfo)
    else:
        # ignore the current HH:MM:SS if relative using days or greater
        if hrOffset == 0 and minOffset == 0 and secOffset == 0:
            extractedDate = extractedDate.replace(hour=0, minute=0, second=0)

    if yearOffset != 0:
        extractedDate = extractedDate + relativedelta(years=yearOffset)
    if monthOffset != 0:
        extractedDate = extractedDate + relativedelta(months=monthOffset)
    if dayOffset != 0:
        extractedDate = extractedDate + relativedelta(days=dayOffset)
    if hrAbs != -1 and minAbs != -1:
        # If no time was supplied in the string set the time to default
        # time if it's available
        if hrAbs is None and minAbs is None and default_time is not None:
            hrAbs, minAbs = default_time.hour, default_time.minute
        else:
            hrAbs = hrAbs or 0
            minAbs = minAbs or 0

        extractedDate = extractedDate + relativedelta(hours=hrAbs,
                                                      minutes=minAbs)
        if (hrAbs != 0 or minAbs != 0) and datestr == "":
            if not daySpecified and dateNow > extractedDate:
                extractedDate = extractedDate + relativedelta(days=1)
    if hrOffset != 0:
        extractedDate = extractedDate + relativedelta(hours=hrOffset)
    if minOffset != 0:
        extractedDate = extractedDate + relativedelta(minutes=minOffset)
    if secOffset != 0:
        extractedDate = extractedDate + relativedelta(seconds=secOffset)

    # drop an "and" left stranded between consumed words; a missing
    # neighbour (sentence start/end) counts as consumed so the index can
    # neither wrap around nor run past the end of the list
    for idx, word in enumerate(words):
        if word != "and":
            continue
        prevBlank = idx == 0 or words[idx - 1] == ""
        nextBlank = idx + 1 >= len(words) or words[idx + 1] == ""
        if prevBlank and nextBlank:
            words[idx] = ""

    resultStr = " ".join(words)
    resultStr = ' '.join(resultStr.split())
    return [extractedDate, resultStr]
|
|
|
|
|
|
def isFractional_en(input_str, short_scale=True):
    """
    Check whether the given word names a fraction ("half", "fifths", ...).

    Args:
        input_str (str): the string to check if fractional
        short_scale (bool): use short scale if True, long scale if False
    Returns:
        (bool) or (float): False if not a fraction, otherwise the fraction

    """
    # drop one trailing "s" so plurals match, e.g. "fifths" -> "fifth"
    if input_str.endswith('s', -1):
        input_str = input_str[:-1]

    if short_scale:
        ordinal_names = SHORT_ORDINAL_STRING_EN
    else:
        ordinal_names = LONG_ORDINAL_STRING_EN

    # word -> denominator; ordinals above "second" double as denominators
    denominators = {"whole": 1, "half": 2, "halve": 2, "quarter": 4}
    denominators.update(
        (ordinal_names[value], value)
        for value in ordinal_names if value > 2)

    key = input_str.lower()
    if key not in denominators:
        return False
    return 1.0 / denominators[key]
|
|
|
|
|
|
def extract_numbers_en(text, short_scale=True, ordinals=False):
    """
        Takes in a string and extracts a list of numbers.

    Delegates to extract_numbers_generic, handing it the English
    pronounce/extract helpers.

    Args:
        text (str): the string to extract a number from
        short_scale (bool): Use "short scale" or "long scale" for large
            numbers -- over a million.  The default is short scale, which
            is now common in most English speaking countries.
            See https://en.wikipedia.org/wiki/Names_of_large_numbers
        ordinals (bool): consider ordinal numbers, e.g. third=3 instead of 1/3
    Returns:
        list: list of extracted numbers as floats
    """
    parse_options = {"short_scale": short_scale, "ordinals": ordinals}
    return extract_numbers_generic(text, pronounce_number_en,
                                   extractnumber_en, **parse_options)
|
|
|
|
|
|
def normalize_en(text, remove_articles):
    """ English string normalization

    Optionally drops the articles "the", "a" and "an", expands common
    contractions and replaces spelled-out numbers with digits.

    Args:
        text (str): the string to normalize
        remove_articles (bool): drop "the", "a" and "an" when True
    Returns:
        (str): the normalized string, words separated by single spaces
    """
    # Common contractions and their expansions, e.g. "isn't" -> "is not".
    # Built once per call instead of once per word.
    contractions = {
        "ain't": "is not", "aren't": "are not", "can't": "can not",
        "could've": "could have", "couldn't": "could not",
        "didn't": "did not", "doesn't": "does not", "don't": "do not",
        "gonna": "going to", "gotta": "got to",
        "hadn't": "had not", "hasn't": "has not", "haven't": "have not",
        "he'd": "he would", "he'll": "he will", "he's": "he is",
        "how'd": "how did", "how'll": "how will", "how's": "how is",
        "I'd": "I would", "I'll": "I will", "I'm": "I am",
        "I've": "I have", "isn't": "is not",
        "it'd": "it would", "it'll": "it will", "it's": "it is",
        "mightn't": "might not", "might've": "might have",
        "mustn't": "must not", "must've": "must have",
        "needn't": "need not", "oughtn't": "ought not",
        "shan't": "shall not",
        "she'd": "she would", "she'll": "she will", "she's": "she is",
        "shouldn't": "should not", "should've": "should have",
        "somebody's": "somebody is", "someone'd": "someone would",
        "someone'll": "someone will", "someone's": "someone is",
        "that'll": "that will", "that's": "that is", "that'd": "that would",
        "there'd": "there would", "there're": "there are",
        "there's": "there is",
        "they'd": "they would", "they'll": "they will",
        "they're": "they are", "they've": "they have",
        "wasn't": "was not",
        "we'd": "we would", "we'll": "we will", "we're": "we are",
        "we've": "we have", "weren't": "were not",
        "what'd": "what did", "what'll": "what will",
        "what're": "what are", "what's": "what is",
        "whats": "what is",  # technically incorrect but some STT outputs
        "what've": "what have", "when's": "when is", "when'd": "when did",
        "where'd": "where did", "where's": "where is",
        "where've": "where have",
        "who'd": "who would", "who'd've": "who would have",
        "who'll": "who will", "who're": "who are", "who's": "who is",
        "who've": "who have",
        "why'd": "why did", "why're": "why are", "why's": "why is",
        "won't": "will not", "won't've": "will not have",
        "would've": "would have", "wouldn't": "would not",
        "wouldn't've": "would not have",
        "y'all": "you all", "ya'll": "you all",
        "you'd": "you would", "you'd've": "you would have",
        "you'll": "you will",
        "y'aint": "you are not", "y'ain't": "you are not",
        "you're": "you are", "you've": "you have"}
    articles = ("the", "a", "an")

    kept = []
    for word in text.split():  # split() also removes extra spaces
        if remove_articles and word in articles:
            continue
        kept.append(contractions.get(word, word))

    # every word carries a leading space, so the string handed to the number
    # extraction is exactly what the incremental concatenation produced
    normalized = "".join(" " + word for word in kept)

    # replace extracted numbers
    numbers = extract_numbers_en(normalized)
    # sort by string size, "twenty two" should be replaced before "two"
    numbers.sort(key=lambda s: len(pronounce_number_en(s)), reverse=True)
    for n in numbers:
        txt = pronounce_number_en(n)
        n = str(n)
        if n.endswith(".0"):
            n = n[:-2]
        normalized = normalized.replace(txt, n)
        # pronounced may be different from txt, ie
        # pronounce(0.5) != half
        # extract(half) == 0.5
        # TODO account for this

    return normalized[1:]  # strip the initial space
|