1448 lines
52 KiB
Python
1448 lines
52 KiB
Python
# -*- coding: utf-8 -*-
|
|
#
|
|
# Copyright 2017 Mycroft AI Inc.
|
|
#
|
|
# Licensed under the Apache License, Version 2.0 (the "License");
|
|
# you may not use this file except in compliance with the License.
|
|
# You may obtain a copy of the License at
|
|
#
|
|
# http://www.apache.org/licenses/LICENSE-2.0
|
|
#
|
|
# Unless required by applicable law or agreed to in writing, software
|
|
# distributed under the License is distributed on an "AS IS" BASIS,
|
|
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
# See the License for the specific language governing permissions and
|
|
# limitations under the License.
|
|
#
|
|
from datetime import datetime
|
|
|
|
from dateutil.relativedelta import relativedelta
|
|
|
|
from mycroft.util.lang.parse_common import is_numeric, look_for_fractions
|
|
from mycroft.util.lang.common_data_en import ARTICLES, NUM_STRING_EN, \
|
|
LONG_ORDINAL_STRING_EN, LONG_SCALE_EN, \
|
|
SHORT_SCALE_EN, SHORT_ORDINAL_STRING_EN
|
|
|
|
import re
|
|
|
|
|
|
class _Token():
|
|
|
|
def __init__(self, word, index):
|
|
self.word = word
|
|
self.index = index
|
|
|
|
def __setattr__(self, key, value):
|
|
try:
|
|
getattr(self, key)
|
|
except AttributeError:
|
|
super().__setattr__(key, value)
|
|
else:
|
|
raise Exception("Immutable!")
|
|
|
|
def __str__(self):
|
|
return "({w}, {i})".format(w=self.word, i=self.index)
|
|
|
|
def __repr__(self):
|
|
return "{n}({w}, {i})".format(n=self.__class__.__name__,
|
|
w=self.word, i=self.index)
|
|
|
|
|
|
def _tokenize(text):
    """
    Split text on whitespace into a list of _Token objects.

    Args:
        text str: the text to split

    Returns:
        [_Token]: one token per whitespace-separated word, numbered from 0
    """
    tokens = []
    for position, word in enumerate(text.split()):
        tokens.append(_Token(word, position))
    return tokens
|
|
|
|
|
|
class _ReplaceableNumber():
    """
    A numeric value paired with the tokens it was read from.

    Attributes:
        value: the number that was found (None or False mean "no number")
        tokens [_Token]: the tokens that make up the number

    An attribute can only be bound once: assigning to a name that already
    resolves on the instance raises an Exception.
    """

    def __init__(self, value, tokens: [_Token]):
        self.value = value
        self.tokens = tokens

    def __bool__(self):
        # A value of 0 still counts as a number; only None and False
        # make the object falsy.
        return not (self.value is None or self.value is False)

    @property
    def start_index(self):
        """Index of the first token of the number."""
        first_token = self.tokens[0]
        return first_token.index

    @property
    def end_index(self):
        """Index of the last token of the number."""
        last_token = self.tokens[-1]
        return last_token.index

    @property
    def text(self):
        """The words of all tokens, joined by single spaces."""
        return ' '.join(token.word for token in self.tokens)

    def __setattr__(self, key, value):
        # Write-once semantics: refuse to rebind any name that already
        # resolves on this object.
        if hasattr(self, key):
            raise Exception("Immutable!")
        super().__setattr__(key, value)

    def __str__(self):
        return "({}, {})".format(self.value, self.tokens)

    def __repr__(self):
        return "{}({}, {})".format(type(self).__name__,
                                   self.value,
                                   self.tokens)
|
|
|
|
|
|
def _invert_dict(original):
|
|
"""
|
|
Produce a dictionary with the keys and values
|
|
inverted, relative to the dict passed in.
|
|
|
|
Args:
|
|
original dict: The dict like object to invert
|
|
|
|
Returns:
|
|
dict
|
|
|
|
"""
|
|
return {value: key for key, value in original.items()}
|
|
|
|
|
|
def _generate_plurals(originals):
|
|
"""
|
|
Return a new set or dict containing the original values,
|
|
all with 's' appended to them.
|
|
|
|
Args:
|
|
originals set(str) or dict(str, any): values to pluralize
|
|
|
|
Returns:
|
|
set(str) or dict(str, any)
|
|
|
|
"""
|
|
if type(originals) == dict:
|
|
return {key + 's': value for key, value in originals.items()}
|
|
return {value + "s" for value in originals}
|
|
|
|
|
|
# Words that negate the number that follows them (-2 = 0 - 2)
_NEGATIVES = {
    "negative",
    "minus",
}

# Tens words/digits that are summed with the number that follows them
# (twenty two = 20 + 2)
_SUMS = {
    'twenty', '20',
    'thirty', '30',
    'forty', '40',
    'fifty', '50',
    'sixty', '60',
    'seventy', '70',
    'eighty', '80',
    'ninety', '90',
}

# Scale words, singular and plural, for the long scale
_MULTIPLIES_LONG_SCALE_EN = set(LONG_SCALE_EN.values()).union(
    _generate_plurals(LONG_SCALE_EN.values()))

# Scale words, singular and plural, for the short scale
_MULTIPLIES_SHORT_SCALE_EN = set(SHORT_SCALE_EN.values()).union(
    _generate_plurals(SHORT_SCALE_EN.values()))


# split sentence parse separately and sum ( 2 and a half = 2 + 0.5 )
_FRACTION_MARKER = {"and"}

# decimal marker ( 1 point 5 = 1 + 0.5)
_DECIMAL_MARKER = {"point", "dot"}

# Number word -> value, extended with plural forms and a few extra words
_STRING_NUM_EN = _invert_dict(NUM_STRING_EN)
_STRING_NUM_EN.update(_generate_plurals(_STRING_NUM_EN))
_STRING_NUM_EN["half"] = 0.5
_STRING_NUM_EN["halves"] = 0.5
_STRING_NUM_EN["couple"] = 2

# Ordinal word -> value, one table per scale
_STRING_SHORT_ORDINAL_EN = _invert_dict(SHORT_ORDINAL_STRING_EN)
_STRING_LONG_ORDINAL_EN = _invert_dict(LONG_ORDINAL_STRING_EN)
|
|
|
|
|
|
def _partition_list(items, split_on):
|
|
"""
|
|
Partition a list of items.
|
|
|
|
Works similarly to str.partition
|
|
|
|
Args:
|
|
items:
|
|
split_on callable:
|
|
Should return a boolean. Each item will be passed to
|
|
this callable in succession, and partitions will be
|
|
created any time it returns True.
|
|
|
|
Returns:
|
|
|
|
"""
|
|
splits = []
|
|
current_split = []
|
|
for item in items:
|
|
if split_on(item):
|
|
splits.append(current_split)
|
|
splits.append([item])
|
|
current_split = []
|
|
else:
|
|
current_split.append(item)
|
|
splits.append(current_split)
|
|
return list(filter(lambda x: len(x) != 0, splits))
|
|
|
|
|
|
def convert_words_to_numbers(text, short_scale=True, ordinals=False):
    """
    Replace the number words found in a string with their numeric value.

    The text is lower-cased and split on whitespace. Every run of tokens
    recognised as a number is replaced by str() of its value; all other
    words are kept. The result is joined with single spaces.

    Args:
        text str: the text to convert
        short_scale bool: passed through to the number extraction
        ordinals bool: passed through to the number extraction

    Returns:
        str: the lower-cased text with numbers substituted

    """
    tokens = _tokenize(text.lower())
    pending = sorted(
        _extract_numbers_with_text(tokens, short_scale, ordinals),
        key=lambda number: number.start_index)

    cursor = 0  # position in `pending` of the next number to emit
    output = []
    for token in tokens:
        current = pending[cursor] if cursor < len(pending) else None
        if current is None or token.index < current.start_index:
            output.append(token.word)
            continue
        # From here on the token is at or after the start of `current`:
        # emit the value once at the start, drop the remaining words and
        # move to the next number once the last token has been passed.
        if token.index == current.start_index:
            output.append(str(current.value))
        if token.index == current.end_index:
            cursor += 1

    return ' '.join(output)
|
|
|
|
|
|
def _extract_numbers_with_text(tokens, short_scale=True, ordinals=False,
                               fractional_numbers=True):
    """
    Extract every number from a token list, with the tokens that form it.

    Numbers are extracted one at a time. After each hit the tokens it
    covers are swapped for placeholder tokens carrying the same index, so
    the next pass finds the following number while indices stay valid.

    Args:
        tokens [_Token]: The tokens to parse.
        short_scale bool: True if short scale numbers should be used, False
                          for long scale. True by default.
        ordinals bool: True if ordinal words (first, second, third, etc)
                       should be parsed.
        fractional_numbers bool: passed through to the single-number
                                 extraction.

    Returns:
        [_ReplaceableNumber]: the numbers found, ordered by start index.

    """
    placeholder = "<placeholder>"  # inserted to maintain correct indices
    found = []
    remaining = tokens
    while True:
        match = _extract_number_with_text_en(remaining, short_scale,
                                             ordinals, fractional_numbers)
        if not match:
            break
        found.append(match)

        first, last = match.start_index, match.end_index
        masked = []
        for t in remaining:
            if first <= t.index <= last:
                masked.append(_Token(placeholder, t.index))
            else:
                masked.append(t)
        remaining = masked

    found.sort(key=lambda n: n.start_index)
    return found
|
|
|
|
|
|
def _extract_number_with_text_en(tokens, short_scale=True, ordinals=False,
                                 fractional_numbers=True):
    """
    Extract one number from a token list,
    handling pronunciations in long scale and short scale.

    https://en.wikipedia.org/wiki/Names_of_large_numbers

    Args:
        tokens [_Token]: the tokens to parse
        short_scale (bool): use short scale if True, long scale if False
        ordinals (bool): consider ordinal numbers, third=3 instead of 1/3
        fractional_numbers (bool): passed through to the helper

    Returns:
        _ReplaceableNumber: the value and the tokens it was read from,
            with any leading article tokens removed. The object is falsy
            when no number was found.

    """
    number, number_tokens = _extract_number_with_text_en_helper(
        tokens, short_scale, ordinals, fractional_numbers)

    # Leading articles are not part of the number text: strip them.
    if number_tokens:
        lead = 0
        while lead < len(number_tokens) and \
                number_tokens[lead].word in ARTICLES:
            lead += 1
        del number_tokens[:lead]

    return _ReplaceableNumber(number, number_tokens)
|
|
|
|
|
|
def _extract_number_with_text_en_helper(tokens, short_scale=True,
                                        ordinals=False,
                                        fractional_numbers=True):
    """
    Try the available number extractors in order and return the first hit.

    Order: fraction ("2 and 3/4"), decimal ("2 point 5"), whole number.

    Args:
        tokens [_Token]: the tokens to parse
        short_scale boolean: use short scale if True, long scale if False
        ordinals boolean: consider ordinal numbers
        fractional_numbers boolean: when False only the whole number
            extractor is used

    Returns:
        (number, [_Token]): the value found and the tokens it came from,
            as returned by the extractor that matched.

    """
    # NOTE(review): the source formatting lost its indentation; the decimal
    # step is assumed to sit under `fractional_numbers` together with the
    # fraction step - confirm against the upstream file.
    if not fractional_numbers:
        return _extract_whole_number_with_text_en(tokens, short_scale,
                                                  ordinals)

    fraction, fraction_text = \
        _extract_fraction_with_text_en(tokens, short_scale, ordinals)
    if fraction:
        return fraction, fraction_text

    decimal, decimal_text = \
        _extract_decimal_with_text_en(tokens, short_scale, ordinals)
    if decimal:
        return decimal, decimal_text

    return _extract_whole_number_with_text_en(tokens, short_scale, ordinals)
|
|
|
|
|
|
def _extract_fraction_with_text_en(tokens, short_scale, ordinals):
    """
    Extract a "whole and fraction" number from a token list.

    This is a helper for the number extraction and is not intended
    to be used on its own.

    This function handles text such as '2 and 3/4'.

    The tokens are split on the fraction marker. Only a split into exactly
    three parts (before, marker, after) is considered. The last number
    before the marker must be at least 1 and the first number after it
    must lie strictly between 0 and 1.

    Args:
        tokens [_Token]: words and their indexes in the original string.
        short_scale boolean: passed through to the number extraction
        ordinals boolean: passed through to the number extraction

    Returns:
        (int or float, [_Token])
        The value found, and the list of relevant tokens.
        (None, None) if no fraction value is found.

    """
    for marker in _FRACTION_MARKER:
        partitions = _partition_list(tokens, lambda t: t.word == marker)
        if len(partitions) != 3:
            continue

        before, separator, after = partitions
        whole_candidates = _extract_numbers_with_text(
            before, short_scale, ordinals, fractional_numbers=False)
        fraction_candidates = _extract_numbers_with_text(
            after, short_scale, ordinals, fractional_numbers=True)

        if not (whole_candidates and fraction_candidates):
            return None, None

        # ensure first is not a fraction and second is a fraction
        num1 = whole_candidates[-1]
        num2 = fraction_candidates[0]
        if num1.value >= 1 and 0 < num2.value < 1:
            combined_tokens = num1.tokens + separator + num2.tokens
            return num1.value + num2.value, combined_tokens

    return None, None
|
|
|
|
|
|
def _extract_decimal_with_text_en(tokens, short_scale, ordinals):
    """
    Extract decimal numbers from a token list.

    This is a helper for the number extraction and is not intended
    to be used on its own.

    This function handles text such as '2 point 5'.

    The tokens are split on each decimal marker. Only a split into exactly
    three parts (before, marker, after) is considered. The last number
    before the marker is the integer part and the first number after it
    supplies the digits placed behind the decimal point.

    This does not currently handle things like:
        number dot number number number

    Args:
        tokens [_Token]: The tokens to parse.
        short_scale boolean: passed through to the number extraction
        ordinals boolean: passed through to the number extraction

    Returns:
        (float, [_Token])
        The value found and relevant tokens.
        (None, None) if no decimal value is found.

    """
    for marker in _DECIMAL_MARKER:
        partitions = _partition_list(tokens, lambda t: t.word == marker)
        if len(partitions) != 3:
            continue

        before, separator, after = partitions
        numbers1 = _extract_numbers_with_text(
            before, short_scale, ordinals, fractional_numbers=False)
        numbers2 = _extract_numbers_with_text(
            after, short_scale, ordinals, fractional_numbers=False)

        if not numbers1 or not numbers2:
            return None, None

        number = numbers1[-1]  # type: _ReplaceableNumber
        decimal = numbers2[0]  # type: _ReplaceableNumber

        # TODO handle number dot number number number
        # The digits after the marker are glued onto '0.' as text, so they
        # must be plain digits. Values such as 0.5 ("half") or -5 would
        # build strings like '0.0.5' or '0.-5' and make float() raise
        # ValueError; treat those as "no decimal here" instead.
        fractional_digits = str(decimal.value)
        if "." not in str(decimal.text) and fractional_digits.isdigit():
            return number.value + float('0.' + fractional_digits), \
                number.tokens + separator + decimal.tokens

    return None, None
|
|
|
|
|
|
def _extract_whole_number_with_text_en(tokens, short_scale, ordinals):
    """
    Extract a number from a token list by scanning it word by word.

    Digits, number names, scale words ("hundred", "million"), optional
    ordinals, spoken fractions and "a/b" fractions are combined into one
    value. Scanning stops at the first non-number word that follows a
    number.

    Args:
        tokens [_Token]: the tokens to parse
        short_scale boolean: use short scale if True, long scale if False
        ordinals boolean: consider ordinal numbers

    Returns:
        (number or False, [_Token]): the value found and the tokens that
            were collected for it. The value is False when nothing was
            recognised.

    """
    multiplies, string_num_ordinal, string_num_scale = \
        _initialize_number_data(short_scale)

    number_words = []  # type: [_Token]
    val = False
    prev_val = None
    to_sum = []
    for idx, token in enumerate(tokens):
        word = token.word
        if word in ARTICLES or word in _NEGATIVES:
            number_words.append(token)
            continue

        prev_word = tokens[idx - 1].word if idx > 0 else ""
        next_word = tokens[idx + 1].word if idx + 1 < len(tokens) else ""

        if word not in string_num_scale and \
                word not in _STRING_NUM_EN and \
                word not in _SUMS and \
                word not in multiplies and \
                not (ordinals and word in string_num_ordinal) and \
                not is_numeric(word) and \
                not isFractional_en(word, short_scale=short_scale) and \
                not look_for_fractions(word.split('/')):
            # Not a number word. Stop if a number has already been
            # collected; otherwise discard the articles/negatives gathered
            # so far and keep looking.
            ignorable = ARTICLES | _NEGATIVES
            if number_words and \
                    not all(t.word in ignorable for t in number_words):
                break
            else:
                number_words = []
                continue
        elif word not in multiplies \
                and prev_word not in multiplies \
                and prev_word not in _SUMS \
                and not (ordinals and prev_word in string_num_ordinal) \
                and prev_word not in _NEGATIVES \
                and prev_word not in ARTICLES:
            number_words = [token]
        elif prev_word in _SUMS and word in _SUMS:
            number_words = [token]
        else:
            number_words.append(token)

        # is this word already a number ?
        if is_numeric(word):
            if word.isdigit():  # doesn't work with decimals
                val = int(word)
            else:
                val = float(word)

        # is this word the name of a number ?
        if word in _STRING_NUM_EN:
            val = _STRING_NUM_EN.get(word)
        elif word in string_num_scale:
            val = string_num_scale.get(word)
        elif ordinals and word in string_num_ordinal:
            val = string_num_ordinal[word]

        # is the prev word an ordinal number and current word is one?
        # second one, third one
        # Compared by value: identity comparison against an int literal
        # depends on interpreter caching and triggers a SyntaxWarning.
        if ordinals and prev_word in string_num_ordinal and val == 1:
            val = prev_val

        # is the prev word a number and should we sum it?
        # twenty two, fifty six
        if prev_word in _SUMS and (
                word in _STRING_NUM_EN or
                word in string_num_scale or
                (ordinals and word in string_num_ordinal)):
            if val and val < 10:
                val = prev_val + val

        # is the prev word a number and should we multiply it?
        # twenty hundred, six hundred
        if word in multiplies:
            if not prev_val:
                prev_val = 1
            val = prev_val * val

        # is this a spoken fraction?
        # half cup
        if val is False:
            val = isFractional_en(word, short_scale=short_scale)

        # 2 fifths
        if not ordinals:
            next_value = isFractional_en(next_word, short_scale=short_scale)
            if next_value:
                if not val:
                    val = 1
                val = val * next_value

        # is this a negative number?
        if val and prev_word and prev_word in _NEGATIVES:
            val = 0 - val

        # let's make sure it isn't a fraction
        # NOTE(review): the source formatting lost its indentation; the
        # `else` below is assumed to pair with `if not val` - confirm.
        if not val:
            # look for fractions like "2/3"
            aPieces = word.split('/')
            if look_for_fractions(aPieces):
                val = float(aPieces[0]) / float(aPieces[1])

        else:
            prev_val = val

        # handle long numbers
        # six hundred sixty six
        # two million five hundred thousand
        if word in multiplies and next_word not in multiplies:
            to_sum.append(val)
            val = 0
            prev_val = 0

    if val is not None and to_sum:
        val += sum(to_sum)

    return val, number_words
|
|
|
|
|
|
def _initialize_number_data(short_scale):
    """
    Select the word tables that match the requested number scale.

    This is a helper for the whole number extraction.

    Args:
        short_scale boolean: True for the short scale tables, False for
                             the long scale tables

    Returns:
        (set(str), dict(str, number), dict(str, number))
        multiplies, string_num_ordinal, string_num_scale

    """
    if short_scale:
        multiplies = _MULTIPLIES_SHORT_SCALE_EN
        string_num_ordinal_en = _STRING_SHORT_ORDINAL_EN
        scale_names = SHORT_SCALE_EN
    else:
        multiplies = _MULTIPLIES_LONG_SCALE_EN
        string_num_ordinal_en = _STRING_LONG_ORDINAL_EN
        scale_names = LONG_SCALE_EN

    # scale word -> value, including the plural form of every scale word
    string_num_scale_en = _invert_dict(scale_names)
    string_num_scale_en.update(_generate_plurals(string_num_scale_en))

    return multiplies, string_num_ordinal_en, string_num_scale_en
|
|
|
|
|
|
def extractnumber_en(text, short_scale=True, ordinals=False):
    """
    Extract a number from a text string,
    handling pronunciations in long scale and short scale.

    https://en.wikipedia.org/wiki/Names_of_large_numbers

    Args:
        text (str): the string to normalize
        short_scale (bool): use short scale if True, long scale if False
        ordinals (bool): consider ordinal numbers, third=3 instead of 1/3
    Returns:
        (int) or (float) or False: The extracted number or False if no number
                                   was found

    """
    tokens = _tokenize(text)
    number = _extract_number_with_text_en(tokens, short_scale, ordinals)
    return number.value
|
|
|
|
|
|
def extract_duration_en(text):
    """ Convert an english phrase into a number of seconds

    Convert things like:
        "10 minute"
        "2 and a half hours"
        "3 days 8 hours 10 minutes and 49 seconds"
    into a number, representing the total number of seconds.

    The words used in the duration will be consumed, and
    the remainder returned.

    As an example, "set a timer for 5 minutes" would return
    (300, "set a timer for").

    Args:
        text (str): string containing a duration

    Returns:
        (float, str): A tuple containing the duration and the remaining text
                      not consumed in the parsing. The first value will
                      be None if no duration is found. The text returned
                      will have whitespace stripped from the ends.
                      Note that for empty input a bare None is returned
                      instead of a tuple.

    """
    if not text:
        # Kept as a bare None (not a tuple) so existing callers that test
        # the result for None keep working.
        return None

    time_unit_to_seconds_map = {
        'second': 1,
        'minute': 60,
        'hour': 3600,
        'day': 86400,
        'week': 604800
    }

    # Raw string: \d and \s must reach the regex engine untouched. In a
    # normal string literal they are invalid escape sequences.
    pattern = r"(?P<value>\d+(?:\.?\d+)?)\s+{unit}s?"
    # Spelled-out numbers become digits so the regex can pick them up.
    text = convert_words_to_numbers(text)

    pieces = []
    for unit, seconds_per_unit in time_unit_to_seconds_map.items():
        unit_pattern = pattern.format(unit=unit)
        amounts = re.findall(unit_pattern, text)
        pieces.extend(float(amount) * seconds_per_unit for amount in amounts)
        # Consume the matched "<number> <unit>" phrases from the text.
        text = re.sub(unit_pattern, '', text)

    text = text.strip()
    duration = sum(pieces)

    return (duration, text) if pieces else (None, text)
|
|
|
|
|
|
def extract_datetime_en(string, dateNow, default_time):
|
|
""" Convert a human date reference into an exact datetime
|
|
|
|
Convert things like
|
|
"today"
|
|
"tomorrow afternoon"
|
|
"next Tuesday at 4pm"
|
|
"August 3rd"
|
|
into a datetime. If a reference date is not provided, the current
|
|
local time is used. Also consumes the words used to define the date
|
|
returning the remaining string. For example, the string
|
|
"what is Tuesday's weather forecast"
|
|
returns the date for the forthcoming Tuesday relative to the reference
|
|
date and the remainder string
|
|
"what is weather forecast".
|
|
|
|
Args:
|
|
string (str): string containing date words
|
|
dateNow (datetime): A reference date/time for "tommorrow", etc
|
|
default_time (time): Time to set if no time was found in the string
|
|
|
|
Returns:
|
|
[datetime, str]: An array containing the datetime and the remaining
|
|
text not consumed in the parsing, or None if no
|
|
date or time related text was found.
|
|
"""
|
|
|
|
def clean_string(s):
|
|
# clean unneeded punctuation and capitalization among other things.
|
|
s = s.lower().replace('?', '').replace('.', '').replace(',', '') \
|
|
.replace(' the ', ' ').replace(' a ', ' ').replace(' an ', ' ') \
|
|
.replace("o' clock", "o'clock").replace("o clock", "o'clock") \
|
|
.replace("o ' clock", "o'clock").replace("o 'clock", "o'clock") \
|
|
.replace("oclock", "o'clock").replace("couple", "2") \
|
|
.replace("centuries", "century").replace("decades", "decade") \
|
|
.replace("millenniums", "millennium")
|
|
|
|
wordList = s.split()
|
|
for idx, word in enumerate(wordList):
|
|
word = word.replace("'s", "")
|
|
|
|
ordinals = ["rd", "st", "nd", "th"]
|
|
if word[0].isdigit():
|
|
for ordinal in ordinals:
|
|
# "second" is the only case we should not do this
|
|
if ordinal in word and "second" not in word:
|
|
word = word.replace(ordinal, "")
|
|
wordList[idx] = word
|
|
|
|
return wordList
|
|
|
|
def date_found():
|
|
return found or \
|
|
(
|
|
datestr != "" or
|
|
yearOffset != 0 or monthOffset != 0 or
|
|
dayOffset is True or hrOffset != 0 or
|
|
hrAbs or minOffset != 0 or
|
|
minAbs or secOffset != 0
|
|
)
|
|
|
|
if string == "" or not dateNow:
|
|
return None
|
|
|
|
found = False
|
|
daySpecified = False
|
|
dayOffset = False
|
|
monthOffset = 0
|
|
yearOffset = 0
|
|
today = dateNow.strftime("%w")
|
|
currentYear = dateNow.strftime("%Y")
|
|
fromFlag = False
|
|
datestr = ""
|
|
hasYear = False
|
|
timeQualifier = ""
|
|
|
|
timeQualifiersAM = ['morning']
|
|
timeQualifiersPM = ['afternoon', 'evening', 'tonight', 'night']
|
|
timeQualifiersList = set(timeQualifiersAM + timeQualifiersPM)
|
|
markers = ['at', 'in', 'on', 'by', 'this', 'around', 'for', 'of', "within"]
|
|
days = ['monday', 'tuesday', 'wednesday',
|
|
'thursday', 'friday', 'saturday', 'sunday']
|
|
months = ['january', 'february', 'march', 'april', 'may', 'june',
|
|
'july', 'august', 'september', 'october', 'november',
|
|
'december']
|
|
monthsShort = ['jan', 'feb', 'mar', 'apr', 'may', 'june', 'july', 'aug',
|
|
'sept', 'oct', 'nov', 'dec']
|
|
year_multiples = ["decade", "century", "millennium"]
|
|
day_multiples = ["weeks", "months", "years"]
|
|
|
|
words = clean_string(string)
|
|
|
|
for idx, word in enumerate(words):
|
|
if word == "":
|
|
continue
|
|
wordPrevPrev = words[idx - 2] if idx > 1 else ""
|
|
wordPrev = words[idx - 1] if idx > 0 else ""
|
|
wordNext = words[idx + 1] if idx + 1 < len(words) else ""
|
|
wordNextNext = words[idx + 2] if idx + 2 < len(words) else ""
|
|
|
|
# this isn't in clean string because I don't want to save back to words
|
|
word = word.rstrip('s')
|
|
start = idx
|
|
used = 0
|
|
# save timequalifier for later
|
|
|
|
if word == "now" and not datestr:
|
|
resultStr = " ".join(words[idx + 1:])
|
|
resultStr = ' '.join(resultStr.split())
|
|
extractedDate = dateNow.replace(microsecond=0)
|
|
return [extractedDate, resultStr]
|
|
elif wordNext in year_multiples:
|
|
multiplier = None
|
|
if is_numeric(word):
|
|
multiplier = extractnumber_en(word)
|
|
multiplier = multiplier or 1
|
|
multiplier = int(multiplier)
|
|
used += 2
|
|
if wordNext == "decade":
|
|
yearOffset = multiplier * 10
|
|
elif wordNext == "century":
|
|
yearOffset = multiplier * 100
|
|
elif wordNext == "millennium":
|
|
yearOffset = multiplier * 1000
|
|
# couple of
|
|
elif word == "2" and wordNext == "of" and \
|
|
wordNextNext in year_multiples:
|
|
multiplier = 2
|
|
used += 3
|
|
if wordNextNext == "decade":
|
|
yearOffset = multiplier * 10
|
|
elif wordNextNext == "century":
|
|
yearOffset = multiplier * 100
|
|
elif wordNextNext == "millennium":
|
|
yearOffset = multiplier * 1000
|
|
elif word == "2" and wordNext == "of" and \
|
|
wordNextNext in day_multiples:
|
|
multiplier = 2
|
|
used += 3
|
|
if wordNextNext == "years":
|
|
yearOffset = multiplier
|
|
elif wordNextNext == "months":
|
|
monthOffset = multiplier
|
|
elif wordNextNext == "weeks":
|
|
dayOffset = multiplier * 7
|
|
elif word in timeQualifiersList:
|
|
timeQualifier = word
|
|
# parse today, tomorrow, day after tomorrow
|
|
elif word == "today" and not fromFlag:
|
|
dayOffset = 0
|
|
used += 1
|
|
elif word == "tomorrow" and not fromFlag:
|
|
dayOffset = 1
|
|
used += 1
|
|
elif (word == "day" and
|
|
wordNext == "after" and
|
|
wordNextNext == "tomorrow" and
|
|
not fromFlag and
|
|
not wordPrev[0].isdigit()):
|
|
dayOffset = 2
|
|
used = 3
|
|
if wordPrev == "the":
|
|
start -= 1
|
|
used += 1
|
|
# parse 5 days, 10 weeks, last week, next week
|
|
elif word == "day":
|
|
if wordPrev[0].isdigit():
|
|
dayOffset += int(wordPrev)
|
|
start -= 1
|
|
used = 2
|
|
elif word == "week" and not fromFlag:
|
|
if wordPrev[0].isdigit():
|
|
dayOffset += int(wordPrev) * 7
|
|
start -= 1
|
|
used = 2
|
|
elif wordPrev == "next":
|
|
dayOffset = 7
|
|
start -= 1
|
|
used = 2
|
|
elif wordPrev == "last":
|
|
dayOffset = -7
|
|
start -= 1
|
|
used = 2
|
|
# parse 10 months, next month, last month
|
|
elif word == "month" and not fromFlag:
|
|
if wordPrev[0].isdigit():
|
|
monthOffset = int(wordPrev)
|
|
start -= 1
|
|
used = 2
|
|
elif wordPrev == "next":
|
|
monthOffset = 1
|
|
start -= 1
|
|
used = 2
|
|
elif wordPrev == "last":
|
|
monthOffset = -1
|
|
start -= 1
|
|
used = 2
|
|
# parse 5 years, next year, last year
|
|
elif word == "year" and not fromFlag:
|
|
if wordPrev[0].isdigit():
|
|
yearOffset = int(wordPrev)
|
|
start -= 1
|
|
used = 2
|
|
elif wordPrev == "next":
|
|
yearOffset = 1
|
|
start -= 1
|
|
used = 2
|
|
elif wordPrev == "last":
|
|
yearOffset = -1
|
|
start -= 1
|
|
used = 2
|
|
# parse Monday, Tuesday, etc., and next Monday,
|
|
# last Tuesday, etc.
|
|
elif word in days and not fromFlag:
|
|
d = days.index(word)
|
|
dayOffset = (d + 1) - int(today)
|
|
used = 1
|
|
if dayOffset < 0:
|
|
dayOffset += 7
|
|
if wordPrev == "next":
|
|
dayOffset += 7
|
|
used += 1
|
|
start -= 1
|
|
elif wordPrev == "last":
|
|
dayOffset -= 7
|
|
used += 1
|
|
start -= 1
|
|
# parse 15 of July, June 20th, Feb 18, 19 of February
|
|
elif word in months or word in monthsShort and not fromFlag:
|
|
try:
|
|
m = months.index(word)
|
|
except ValueError:
|
|
m = monthsShort.index(word)
|
|
used += 1
|
|
datestr = months[m]
|
|
if wordPrev and (wordPrev[0].isdigit() or
|
|
(wordPrev == "of" and wordPrevPrev[0].isdigit())):
|
|
if wordPrev == "of" and wordPrevPrev[0].isdigit():
|
|
datestr += " " + words[idx - 2]
|
|
used += 1
|
|
start -= 1
|
|
else:
|
|
datestr += " " + wordPrev
|
|
start -= 1
|
|
used += 1
|
|
if wordNext and wordNext[0].isdigit():
|
|
datestr += " " + wordNext
|
|
used += 1
|
|
hasYear = True
|
|
else:
|
|
hasYear = False
|
|
|
|
elif wordNext and wordNext[0].isdigit():
|
|
datestr += " " + wordNext
|
|
used += 1
|
|
if wordNextNext and wordNextNext[0].isdigit():
|
|
datestr += " " + wordNextNext
|
|
used += 1
|
|
hasYear = True
|
|
else:
|
|
hasYear = False
|
|
# parse 5 days from tomorrow, 10 weeks from next thursday,
|
|
# 2 months from July
|
|
validFollowups = days + months + monthsShort
|
|
validFollowups.append("today")
|
|
validFollowups.append("tomorrow")
|
|
validFollowups.append("next")
|
|
validFollowups.append("last")
|
|
validFollowups.append("now")
|
|
if (word == "from" or word == "after") and wordNext in validFollowups:
|
|
used = 2
|
|
fromFlag = True
|
|
if wordNext == "tomorrow":
|
|
dayOffset += 1
|
|
elif wordNext in days:
|
|
d = days.index(wordNext)
|
|
tmpOffset = (d + 1) - int(today)
|
|
used = 2
|
|
if tmpOffset < 0:
|
|
tmpOffset += 7
|
|
dayOffset += tmpOffset
|
|
elif wordNextNext and wordNextNext in days:
|
|
d = days.index(wordNextNext)
|
|
tmpOffset = (d + 1) - int(today)
|
|
used = 3
|
|
if wordNext == "next":
|
|
tmpOffset += 7
|
|
used += 1
|
|
start -= 1
|
|
elif wordNext == "last":
|
|
tmpOffset -= 7
|
|
used += 1
|
|
start -= 1
|
|
dayOffset += tmpOffset
|
|
if used > 0:
|
|
if start - 1 > 0 and words[start - 1] == "this":
|
|
start -= 1
|
|
used += 1
|
|
|
|
for i in range(0, used):
|
|
words[i + start] = ""
|
|
|
|
if start - 1 >= 0 and words[start - 1] in markers:
|
|
words[start - 1] = ""
|
|
found = True
|
|
daySpecified = True
|
|
|
|
# parse time
|
|
hrOffset = 0
|
|
minOffset = 0
|
|
secOffset = 0
|
|
hrAbs = None
|
|
minAbs = None
|
|
military = False
|
|
|
|
for idx, word in enumerate(words):
|
|
if word == "":
|
|
continue
|
|
|
|
wordPrevPrev = words[idx - 2] if idx > 1 else ""
|
|
wordPrev = words[idx - 1] if idx > 0 else ""
|
|
wordNext = words[idx + 1] if idx + 1 < len(words) else ""
|
|
wordNextNext = words[idx + 2] if idx + 2 < len(words) else ""
|
|
# parse noon, midnight, morning, afternoon, evening
|
|
used = 0
|
|
if word == "noon":
|
|
hrAbs = 12
|
|
used += 1
|
|
elif word == "midnight":
|
|
hrAbs = 0
|
|
used += 1
|
|
elif word == "morning":
|
|
if hrAbs is None:
|
|
hrAbs = 8
|
|
used += 1
|
|
elif word == "afternoon":
|
|
if hrAbs is None:
|
|
hrAbs = 15
|
|
used += 1
|
|
elif word == "evening":
|
|
if hrAbs is None:
|
|
hrAbs = 19
|
|
used += 1
|
|
# couple of time_unit
|
|
elif word == "2" and wordNext == "of" and \
|
|
wordNextNext in ["hours", "minutes", "seconds"]:
|
|
used += 3
|
|
if wordNextNext == "hours":
|
|
hrOffset = 2
|
|
elif wordNextNext == "minutes":
|
|
minOffset = 2
|
|
elif wordNextNext == "seconds":
|
|
secOffset = 2
|
|
# parse half an hour, quarter hour
|
|
elif word == "hour" and \
|
|
(wordPrev in markers or wordPrevPrev in markers):
|
|
if wordPrev == "half":
|
|
minOffset = 30
|
|
elif wordPrev == "quarter":
|
|
minOffset = 15
|
|
elif wordPrevPrev == "quarter":
|
|
minOffset = 15
|
|
if idx > 2 and words[idx - 3] in markers:
|
|
words[idx - 3] = ""
|
|
if words[idx - 3] == "this":
|
|
daySpecified = True
|
|
words[idx - 2] = ""
|
|
elif wordPrev == "within":
|
|
hrOffset = 1
|
|
else:
|
|
hrOffset = 1
|
|
if wordPrevPrev in markers:
|
|
words[idx - 2] = ""
|
|
if wordPrevPrev == "this":
|
|
daySpecified = True
|
|
words[idx - 1] = ""
|
|
used += 1
|
|
hrAbs = -1
|
|
minAbs = -1
|
|
# parse 5:00 am, 12:00 p.m., etc
|
|
# parse in a minute
|
|
elif word == "minute" and wordPrev == "in":
|
|
minOffset = 1
|
|
words[idx - 1] = ""
|
|
used += 1
|
|
# parse in a second
|
|
elif word == "second" and wordPrev == "in":
|
|
secOffset = 1
|
|
words[idx - 1] = ""
|
|
used += 1
|
|
elif word[0].isdigit():
|
|
isTime = True
|
|
strHH = ""
|
|
strMM = ""
|
|
remainder = ""
|
|
if ':' in word:
|
|
# parse colons
|
|
# "3:00 in the morning"
|
|
stage = 0
|
|
length = len(word)
|
|
for i in range(length):
|
|
if stage == 0:
|
|
if word[i].isdigit():
|
|
strHH += word[i]
|
|
elif word[i] == ":":
|
|
stage = 1
|
|
else:
|
|
stage = 2
|
|
i -= 1
|
|
elif stage == 1:
|
|
if word[i].isdigit():
|
|
strMM += word[i]
|
|
else:
|
|
stage = 2
|
|
i -= 1
|
|
elif stage == 2:
|
|
remainder = word[i:].replace(".", "")
|
|
break
|
|
if remainder == "":
|
|
nextWord = wordNext.replace(".", "")
|
|
if nextWord == "am" or nextWord == "pm":
|
|
remainder = nextWord
|
|
used += 1
|
|
elif nextWord == "tonight":
|
|
remainder = "pm"
|
|
used += 1
|
|
elif wordNext == "in" and wordNextNext == "the" and \
|
|
words[idx + 3] == "morning":
|
|
remainder = "am"
|
|
used += 3
|
|
elif wordNext == "in" and wordNextNext == "the" and \
|
|
words[idx + 3] == "afternoon":
|
|
remainder = "pm"
|
|
used += 3
|
|
elif wordNext == "in" and wordNextNext == "the" and \
|
|
words[idx + 3] == "evening":
|
|
remainder = "pm"
|
|
used += 3
|
|
elif wordNext == "in" and wordNextNext == "morning":
|
|
remainder = "am"
|
|
used += 2
|
|
elif wordNext == "in" and wordNextNext == "afternoon":
|
|
remainder = "pm"
|
|
used += 2
|
|
elif wordNext == "in" and wordNextNext == "evening":
|
|
remainder = "pm"
|
|
used += 2
|
|
elif wordNext == "this" and wordNextNext == "morning":
|
|
remainder = "am"
|
|
used = 2
|
|
daySpecified = True
|
|
elif wordNext == "this" and wordNextNext == "afternoon":
|
|
remainder = "pm"
|
|
used = 2
|
|
daySpecified = True
|
|
elif wordNext == "this" and wordNextNext == "evening":
|
|
remainder = "pm"
|
|
used = 2
|
|
daySpecified = True
|
|
elif wordNext == "at" and wordNextNext == "night":
|
|
if strHH and int(strHH) > 5:
|
|
remainder = "pm"
|
|
else:
|
|
remainder = "am"
|
|
used += 2
|
|
else:
|
|
if timeQualifier != "":
|
|
military = True
|
|
if strHH and int(strHH) <= 12 and \
|
|
(timeQualifier in timeQualifiersPM):
|
|
strHH += str(int(strHH) + 12)
|
|
else:
|
|
# try to parse numbers without colons
|
|
# 5 hours, 10 minutes etc.
|
|
length = len(word)
|
|
strNum = ""
|
|
remainder = ""
|
|
for i in range(length):
|
|
if word[i].isdigit():
|
|
strNum += word[i]
|
|
else:
|
|
remainder += word[i]
|
|
|
|
if remainder == "":
|
|
remainder = wordNext.replace(".", "").lstrip().rstrip()
|
|
if (
|
|
remainder == "pm" or
|
|
wordNext == "pm" or
|
|
remainder == "p.m." or
|
|
wordNext == "p.m."):
|
|
strHH = strNum
|
|
remainder = "pm"
|
|
used = 1
|
|
elif (
|
|
remainder == "am" or
|
|
wordNext == "am" or
|
|
remainder == "a.m." or
|
|
wordNext == "a.m."):
|
|
strHH = strNum
|
|
remainder = "am"
|
|
used = 1
|
|
else:
|
|
if (
|
|
int(strNum) > 100 and
|
|
(
|
|
wordPrev == "o" or
|
|
wordPrev == "oh"
|
|
)):
|
|
# 0800 hours (pronounced oh-eight-hundred)
|
|
strHH = str(int(strNum) // 100)
|
|
strMM = str(int(strNum) % 100)
|
|
military = True
|
|
if wordNext == "hours":
|
|
used += 1
|
|
elif (
|
|
(wordNext == "hours" or wordNext == "hour" or
|
|
remainder == "hours" or remainder == "hour") and
|
|
word[0] != '0' and
|
|
(
|
|
int(strNum) < 100 or
|
|
int(strNum) > 2400
|
|
)):
|
|
# ignores military time
|
|
# "in 3 hours"
|
|
hrOffset = int(strNum)
|
|
used = 2
|
|
isTime = False
|
|
hrAbs = -1
|
|
minAbs = -1
|
|
|
|
elif wordNext == "minutes" or wordNext == "minute" or \
|
|
remainder == "minutes" or remainder == "minute":
|
|
# "in 10 minutes"
|
|
minOffset = int(strNum)
|
|
used = 2
|
|
isTime = False
|
|
hrAbs = -1
|
|
minAbs = -1
|
|
elif wordNext == "seconds" or wordNext == "second" \
|
|
or remainder == "seconds" or remainder == "second":
|
|
# in 5 seconds
|
|
secOffset = int(strNum)
|
|
used = 2
|
|
isTime = False
|
|
hrAbs = -1
|
|
minAbs = -1
|
|
elif int(strNum) > 100:
|
|
# military time, eg. "3300 hours"
|
|
strHH = str(int(strNum) // 100)
|
|
strMM = str(int(strNum) % 100)
|
|
military = True
|
|
if wordNext == "hours" or wordNext == "hour" or \
|
|
remainder == "hours" or remainder == "hour":
|
|
used += 1
|
|
elif wordNext and wordNext[0].isdigit():
|
|
# military time, e.g. "04 38 hours"
|
|
strHH = strNum
|
|
strMM = wordNext
|
|
military = True
|
|
used += 1
|
|
if (wordNextNext == "hours" or
|
|
wordNextNext == "hour" or
|
|
remainder == "hours" or remainder == "hour"):
|
|
used += 1
|
|
elif (
|
|
wordNext == "" or wordNext == "o'clock" or
|
|
(
|
|
wordNext == "in" and
|
|
(
|
|
wordNextNext == "the" or
|
|
wordNextNext == timeQualifier
|
|
)
|
|
)):
|
|
strHH = strNum
|
|
strMM = "00"
|
|
if wordNext == "o'clock":
|
|
used += 1
|
|
if wordNext == "in" or wordNextNext == "in":
|
|
used += (1 if wordNext == "in" else 2)
|
|
wordNextNextNext = words[idx + 3] \
|
|
if idx + 3 < len(words) else ""
|
|
|
|
if (wordNextNext and
|
|
(wordNextNext in timeQualifier or
|
|
wordNextNextNext in timeQualifier)):
|
|
if (wordNextNext in timeQualifiersPM or
|
|
wordNextNextNext in timeQualifiersPM):
|
|
remainder = "pm"
|
|
used += 1
|
|
if (wordNextNext in timeQualifiersAM or
|
|
wordNextNextNext in timeQualifiersAM):
|
|
remainder = "am"
|
|
used += 1
|
|
if timeQualifier != "":
|
|
used += 1 # TODO: Unsure if this is 100% accurate
|
|
military = True
|
|
else:
|
|
isTime = False
|
|
|
|
HH = int(strHH) if strHH else 0
|
|
MM = int(strMM) if strMM else 0
|
|
HH = HH + 12 if remainder == "pm" and HH < 12 else HH
|
|
HH = HH - 12 if remainder == "am" and HH >= 12 else HH
|
|
|
|
if (not military and
|
|
remainder not in ['am', 'pm', 'hours', 'minutes',
|
|
"second", "seconds",
|
|
"hour", "minute"] and
|
|
((not daySpecified) or dayOffset < 1)):
|
|
# ambiguous time, detect whether they mean this evening or
|
|
# the next morning based on whether it has already passed
|
|
if dateNow.hour < HH:
|
|
pass # No modification needed
|
|
elif dateNow.hour < HH + 12:
|
|
HH += 12
|
|
else:
|
|
# has passed, assume the next morning
|
|
dayOffset += 1
|
|
|
|
if timeQualifier in timeQualifiersPM and HH < 12:
|
|
HH += 12
|
|
|
|
if HH > 24 or MM > 59:
|
|
isTime = False
|
|
used = 0
|
|
if isTime:
|
|
hrAbs = HH
|
|
minAbs = MM
|
|
used += 1
|
|
if used > 0:
|
|
# removed parsed words from the sentence
|
|
for i in range(used):
|
|
if idx + i >= len(words):
|
|
break
|
|
words[idx + i] = ""
|
|
|
|
if wordPrev == "o" or wordPrev == "oh":
|
|
words[words.index(wordPrev)] = ""
|
|
|
|
if wordPrev == "early":
|
|
hrOffset = -1
|
|
words[idx - 1] = ""
|
|
idx -= 1
|
|
elif wordPrev == "late":
|
|
hrOffset = 1
|
|
words[idx - 1] = ""
|
|
idx -= 1
|
|
if idx > 0 and wordPrev in markers:
|
|
words[idx - 1] = ""
|
|
if wordPrev == "this":
|
|
daySpecified = True
|
|
if idx > 1 and wordPrevPrev in markers:
|
|
words[idx - 2] = ""
|
|
if wordPrevPrev == "this":
|
|
daySpecified = True
|
|
|
|
idx += used - 1
|
|
found = True
|
|
|
|
# check that we found a date
|
|
if not date_found:
|
|
return None
|
|
|
|
if dayOffset is False:
|
|
dayOffset = 0
|
|
|
|
# perform date manipulation
|
|
|
|
extractedDate = dateNow.replace(microsecond=0)
|
|
|
|
if datestr != "":
|
|
# date included an explicit date, e.g. "june 5" or "june 2, 2017"
|
|
try:
|
|
temp = datetime.strptime(datestr, "%B %d")
|
|
except ValueError:
|
|
# Try again, allowing the year
|
|
temp = datetime.strptime(datestr, "%B %d %Y")
|
|
extractedDate = extractedDate.replace(hour=0, minute=0, second=0)
|
|
if not hasYear:
|
|
temp = temp.replace(year=extractedDate.year,
|
|
tzinfo=extractedDate.tzinfo)
|
|
if extractedDate < temp:
|
|
extractedDate = extractedDate.replace(
|
|
year=int(currentYear),
|
|
month=int(temp.strftime("%m")),
|
|
day=int(temp.strftime("%d")),
|
|
tzinfo=extractedDate.tzinfo)
|
|
else:
|
|
extractedDate = extractedDate.replace(
|
|
year=int(currentYear) + 1,
|
|
month=int(temp.strftime("%m")),
|
|
day=int(temp.strftime("%d")),
|
|
tzinfo=extractedDate.tzinfo)
|
|
else:
|
|
extractedDate = extractedDate.replace(
|
|
year=int(temp.strftime("%Y")),
|
|
month=int(temp.strftime("%m")),
|
|
day=int(temp.strftime("%d")),
|
|
tzinfo=extractedDate.tzinfo)
|
|
else:
|
|
# ignore the current HH:MM:SS if relative using days or greater
|
|
if hrOffset == 0 and minOffset == 0 and secOffset == 0:
|
|
extractedDate = extractedDate.replace(hour=0, minute=0, second=0)
|
|
|
|
if yearOffset != 0:
|
|
extractedDate = extractedDate + relativedelta(years=yearOffset)
|
|
if monthOffset != 0:
|
|
extractedDate = extractedDate + relativedelta(months=monthOffset)
|
|
if dayOffset != 0:
|
|
extractedDate = extractedDate + relativedelta(days=dayOffset)
|
|
if hrAbs != -1 and minAbs != -1:
|
|
# If no time was supplied in the string set the time to default
|
|
# time if it's available
|
|
if hrAbs is None and minAbs is None and default_time is not None:
|
|
hrAbs, minAbs = default_time.hour, default_time.minute
|
|
else:
|
|
hrAbs = hrAbs or 0
|
|
minAbs = minAbs or 0
|
|
|
|
extractedDate = extractedDate + relativedelta(hours=hrAbs,
|
|
minutes=minAbs)
|
|
if (hrAbs != 0 or minAbs != 0) and datestr == "":
|
|
if not daySpecified and dateNow > extractedDate:
|
|
extractedDate = extractedDate + relativedelta(days=1)
|
|
if hrOffset != 0:
|
|
extractedDate = extractedDate + relativedelta(hours=hrOffset)
|
|
if minOffset != 0:
|
|
extractedDate = extractedDate + relativedelta(minutes=minOffset)
|
|
if secOffset != 0:
|
|
extractedDate = extractedDate + relativedelta(seconds=secOffset)
|
|
for idx, word in enumerate(words):
|
|
if words[idx] == "and" and \
|
|
words[idx - 1] == "" and words[idx + 1] == "":
|
|
words[idx] = ""
|
|
|
|
resultStr = " ".join(words)
|
|
resultStr = ' '.join(resultStr.split())
|
|
return [extractedDate, resultStr]
|
|
|
|
|
|
def isFractional_en(input_str, short_scale=True):
    """
    Decide whether a single word names a fraction and return its value.

    Args:
        input_str (str): the word to test, e.g. "half" or "fifths"
        short_scale (bool): read denominators from the short scale ordinal
            table if True, from the long scale ordinal table if False
    Returns:
        (bool) or (float): False when the word is not a known fraction
            name, otherwise 1.0 divided by the matching denominator

    """
    # Drop one trailing "s" so that plurals such as "fifths" can match.
    if input_str[-1:] == 's':
        input_str = input_str[:-1]

    if short_scale:
        ordinal_names = SHORT_ORDINAL_STRING_EN
    else:
        ordinal_names = LONG_ORDINAL_STRING_EN

    denominators = {"whole": 1, "half": 2, "halve": 2, "quarter": 4}
    # Table entries keyed above 2 are reused as denominators; keys 1 and 2
    # are left to the hand-written names above.
    denominators.update(
        (ordinal_names[number], number)
        for number in ordinal_names if number > 2)

    key = input_str.lower()
    if key not in denominators:
        return False
    return 1.0 / denominators[key]
|
|
|
|
|
|
def extract_numbers_en(text, short_scale=True, ordinals=False):
    """
    Pull every number out of a string and return them as a list.

    Args:
        text (str): the string to extract numbers from
        short_scale (bool): Use "short scale" or "long scale" for large
            numbers -- over a million.  The default is short scale, which
            is now common in most English speaking countries.
            See https://en.wikipedia.org/wiki/Names_of_large_numbers
        ordinals (bool): consider ordinal numbers, e.g. third=3 instead
            of 1/3
    Returns:
        list: the extracted numbers, each converted to float
    """
    tokens = _tokenize(text)
    numbers = _extract_numbers_with_text(tokens, short_scale, ordinals)
    # Each result carries its numeric value in ``.value``.
    return [float(number.value) for number in numbers]
|
|
|
|
|
|
# Contractions mapped to their expanded forms.  Built once at import time so
# normalize_en() does not rebuild and linearly search two parallel lists for
# every word.  Keys are case-sensitive.
_CONTRACTION_EXPANSIONS_EN = {
    "ain't": "is not",
    "aren't": "are not",
    "can't": "can not",
    "could've": "could have",
    "couldn't": "could not",
    "didn't": "did not",
    "doesn't": "does not",
    "don't": "do not",
    "gonna": "going to",
    "gotta": "got to",
    "hadn't": "had not",
    "hasn't": "has not",
    "haven't": "have not",
    "he'd": "he would",
    "he'll": "he will",
    "he's": "he is",
    "how'd": "how did",
    "how'll": "how will",
    "how's": "how is",
    "I'd": "I would",
    "I'll": "I will",
    "I'm": "I am",
    "I've": "I have",
    "isn't": "is not",
    "it'd": "it would",
    "it'll": "it will",
    "it's": "it is",
    "mightn't": "might not",
    "might've": "might have",
    "mustn't": "must not",
    "must've": "must have",
    "needn't": "need not",
    "oughtn't": "ought not",
    "shan't": "shall not",
    "she'd": "she would",
    "she'll": "she will",
    "she's": "she is",
    "shouldn't": "should not",
    "should've": "should have",
    "somebody's": "somebody is",
    "someone'd": "someone would",
    "someone'll": "someone will",
    "someone's": "someone is",
    "that'll": "that will",
    "that's": "that is",
    "that'd": "that would",
    "there'd": "there would",
    "there're": "there are",
    "there's": "there is",
    "they'd": "they would",
    "they'll": "they will",
    "they're": "they are",
    "they've": "they have",
    "wasn't": "was not",
    "we'd": "we would",
    "we'll": "we will",
    "we're": "we are",
    "we've": "we have",
    "weren't": "were not",
    "what'd": "what did",
    "what'll": "what will",
    "what're": "what are",
    "what's": "what is",
    "whats": "what is",  # technically incorrect but some STT outputs
    "what've": "what have",
    "when's": "when is",
    "when'd": "when did",
    "where'd": "where did",
    "where's": "where is",
    "where've": "where have",
    "who'd": "who would",
    "who'd've": "who would have",
    "who'll": "who will",
    "who're": "who are",
    "who's": "who is",
    "who've": "who have",
    "why'd": "why did",
    "why're": "why are",
    "why's": "why is",
    "won't": "will not",
    "won't've": "will not have",
    "would've": "would have",
    "wouldn't": "would not",
    "wouldn't've": "would not have",
    "y'all": "you all",
    "ya'll": "you all",
    "you'd": "you would",
    "you'd've": "you would have",
    "you'll": "you will",
    "y'aint": "you are not",
    "y'ain't": "you are not",
    "you're": "you are",
    "you've": "you have",
}

# Spelled-out numbers "zero" .. "twenty" mapped to their digit strings.
_TEXT_NUMBER_DIGITS_EN = {
    name: str(value) for value, name in enumerate([
        "zero", "one", "two", "three", "four", "five", "six",
        "seven", "eight", "nine", "ten", "eleven", "twelve",
        "thirteen", "fourteen", "fifteen", "sixteen",
        "seventeen", "eighteen", "nineteen", "twenty"])
}


def normalize_en(text, remove_articles):
    """
    English string normalization.

    Splits the text on whitespace (which also collapses repeated spaces),
    optionally drops the articles "the", "a" and "an", expands common
    contractions (e.g. "isn't" -> "is not") and converts the number words
    "zero" through "twenty" into digits (e.g. "two" -> "2").  All matching
    is case-sensitive and done on whole whitespace-separated words.

    Args:
        text (str): the string to normalize
        remove_articles (bool): drop "the", "a" and "an" when True
    Returns:
        str: the normalized words joined by single spaces
    """
    normalized = []
    for word in text.split():
        if remove_articles and word in ("the", "a", "an"):
            continue

        # Expand common contractions, e.g. "isn't" -> "is not"
        word = _CONTRACTION_EXPANSIONS_EN.get(word, word)

        # Convert numbers into digits, e.g. "two" -> "2"
        word = _TEXT_NUMBER_DIGITS_EN.get(word, word)

        normalized.append(word)

    return " ".join(normalized)
|