Split format.py and parse.py per language
Move the language-specific functions and constants into separate files. This will avoid many unnecessary conflicts due to involuntary encoding changes.
pull/1354/head
parent
5d842fd369
commit
0114ce473e
|
@ -1,5 +1,3 @@
|
||||||
# -*- coding: iso-8859-15 -*-
|
|
||||||
#
|
|
||||||
# Copyright 2017 Mycroft AI Inc.
|
# Copyright 2017 Mycroft AI Inc.
|
||||||
#
|
#
|
||||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||||
|
@ -14,52 +12,10 @@
|
||||||
# See the License for the specific language governing permissions and
|
# See the License for the specific language governing permissions and
|
||||||
# limitations under the License.
|
# limitations under the License.
|
||||||
#
|
#
|
||||||
FRACTION_STRING_EN = {
|
|
||||||
2: 'half',
|
|
||||||
3: 'third',
|
|
||||||
4: 'forth',
|
|
||||||
5: 'fifth',
|
|
||||||
6: 'sixth',
|
|
||||||
7: 'seventh',
|
|
||||||
8: 'eigth',
|
|
||||||
9: 'ninth',
|
|
||||||
10: 'tenth',
|
|
||||||
11: 'eleventh',
|
|
||||||
12: 'twelveth',
|
|
||||||
13: 'thirteenth',
|
|
||||||
14: 'fourteenth',
|
|
||||||
15: 'fifteenth',
|
|
||||||
16: 'sixteenth',
|
|
||||||
17: 'seventeenth',
|
|
||||||
18: 'eighteenth',
|
|
||||||
19: 'nineteenth',
|
|
||||||
20: 'twentyith'
|
|
||||||
}
|
|
||||||
|
|
||||||
FRACTION_STRING_PT = {
|
from mycroft.util.lang.format_en import *
|
||||||
2: 'meio',
|
from mycroft.util.lang.format_es import *
|
||||||
3: u'terço',
|
from mycroft.util.lang.format_pt import *
|
||||||
4: 'quarto',
|
|
||||||
5: 'quinto',
|
|
||||||
6: 'sexto',
|
|
||||||
7: u'sétimo',
|
|
||||||
8: 'oitavo',
|
|
||||||
9: 'nono',
|
|
||||||
10: u'décimo',
|
|
||||||
11: 'onze avos',
|
|
||||||
12: 'doze avos',
|
|
||||||
13: 'treze avos',
|
|
||||||
14: 'catorze avos',
|
|
||||||
15: 'quinze avos',
|
|
||||||
16: 'dezasseis avos',
|
|
||||||
17: 'dezassete avos',
|
|
||||||
18: 'dezoito avos',
|
|
||||||
19: 'dezanove avos',
|
|
||||||
20: u'vigésimo',
|
|
||||||
30: u'trigésimo',
|
|
||||||
100: u'centésimo',
|
|
||||||
1000: u'milésimo'
|
|
||||||
}
|
|
||||||
|
|
||||||
|
|
||||||
def nice_number(number, lang="en-us", speech=True, denominators=None):
|
def nice_number(number, lang="en-us", speech=True, denominators=None):
|
||||||
|
@ -95,55 +51,6 @@ def nice_number(number, lang="en-us", speech=True, denominators=None):
|
||||||
return str(number)
|
return str(number)
|
||||||
|
|
||||||
|
|
||||||
def nice_number_en(result):
|
|
||||||
""" English conversion for nice_number """
|
|
||||||
whole, num, den = result
|
|
||||||
if num == 0:
|
|
||||||
return str(whole)
|
|
||||||
den_str = FRACTION_STRING_EN[den]
|
|
||||||
if whole == 0:
|
|
||||||
if num == 1:
|
|
||||||
return_string = 'a {}'.format(den_str)
|
|
||||||
else:
|
|
||||||
return_string = '{} {}'.format(num, den_str)
|
|
||||||
elif num == 1:
|
|
||||||
return_string = '{} and a {}'.format(whole, den_str)
|
|
||||||
else:
|
|
||||||
return_string = '{} and {} {}'.format(whole, num, den_str)
|
|
||||||
if num > 1:
|
|
||||||
return_string += 's'
|
|
||||||
return return_string
|
|
||||||
|
|
||||||
|
|
||||||
def nice_number_pt(result):
|
|
||||||
""" Portuguese conversion for nice_number """
|
|
||||||
whole, num, den = result
|
|
||||||
if num == 0:
|
|
||||||
return str(whole)
|
|
||||||
# denominador
|
|
||||||
den_str = FRACTION_STRING_PT[den]
|
|
||||||
# fracções
|
|
||||||
if whole == 0:
|
|
||||||
if num == 1:
|
|
||||||
# um décimo
|
|
||||||
return_string = 'um {}'.format(den_str)
|
|
||||||
else:
|
|
||||||
# três meio
|
|
||||||
return_string = '{} {}'.format(num, den_str)
|
|
||||||
# inteiros >10
|
|
||||||
elif num == 1:
|
|
||||||
# trinta e um
|
|
||||||
return_string = '{} e {}'.format(whole, den_str)
|
|
||||||
# inteiros >10 com fracções
|
|
||||||
else:
|
|
||||||
# vinte e 3 décimo
|
|
||||||
return_string = '{} e {} {}'.format(whole, num, den_str)
|
|
||||||
# plural
|
|
||||||
if num > 1:
|
|
||||||
return_string += 's'
|
|
||||||
return return_string
|
|
||||||
|
|
||||||
|
|
||||||
def convert_number(number, denominators):
|
def convert_number(number, denominators):
|
||||||
""" Convert floats to mixed fractions """
|
""" Convert floats to mixed fractions """
|
||||||
int_number = int(number)
|
int_number = int(number)
|
||||||
|
|
|
@ -0,0 +1,14 @@
|
||||||
|
# Copyright 2017 Mycroft AI Inc.
|
||||||
|
#
|
||||||
|
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||||
|
# you may not use this file except in compliance with the License.
|
||||||
|
# You may obtain a copy of the License at
|
||||||
|
#
|
||||||
|
# http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
#
|
||||||
|
# Unless required by applicable law or agreed to in writing, software
|
||||||
|
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
# See the License for the specific language governing permissions and
|
||||||
|
# limitations under the License.
|
||||||
|
#
|
|
@ -0,0 +1,59 @@
|
||||||
|
# -*- coding: utf-8 -*-
|
||||||
|
#
|
||||||
|
# Copyright 2017 Mycroft AI Inc.
|
||||||
|
#
|
||||||
|
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||||
|
# you may not use this file except in compliance with the License.
|
||||||
|
# You may obtain a copy of the License at
|
||||||
|
#
|
||||||
|
# http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
#
|
||||||
|
# Unless required by applicable law or agreed to in writing, software
|
||||||
|
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
# See the License for the specific language governing permissions and
|
||||||
|
# limitations under the License.
|
||||||
|
#
|
||||||
|
|
||||||
|
|
||||||
|
# Spoken English name of each supported denominator (singular form).
FRACTION_STRING_EN = {
    2: 'half',
    3: 'third',
    4: 'fourth',
    5: 'fifth',
    6: 'sixth',
    7: 'seventh',
    8: 'eighth',
    9: 'ninth',
    10: 'tenth',
    11: 'eleventh',
    12: 'twelfth',
    13: 'thirteenth',
    14: 'fourteenth',
    15: 'fifteenth',
    16: 'sixteenth',
    17: 'seventeenth',
    18: 'eighteenth',
    19: 'nineteenth',
    20: 'twentieth'
}


def nice_number_en(result):
    """ English conversion for nice_number

    Args:
        result (tuple): (whole, num, den) - the whole part, numerator and
            denominator of a mixed fraction
    Returns:
        (str): the number spelled the way it is spoken, e.g. "a half",
            "3 eighths" or "2 and a third"; just the whole part when the
            numerator is 0
    Raises:
        KeyError: if den has no entry in FRACTION_STRING_EN
    """
    whole, num, den = result
    if num == 0:
        return str(whole)

    den_str = FRACTION_STRING_EN[den]
    if num > 1:
        # "half" is the only entry with an irregular plural
        den_str = 'halves' if den_str == 'half' else den_str + 's'

    if whole == 0:
        if num == 1:
            return 'a {}'.format(den_str)
        return '{} {}'.format(num, den_str)
    if num == 1:
        return '{} and a {}'.format(whole, den_str)
    return '{} and {} {}'.format(whole, num, den_str)
|
|
@ -0,0 +1,71 @@
|
||||||
|
# -*- coding: utf-8 -*-
|
||||||
|
#
|
||||||
|
# Copyright 2017 Mycroft AI Inc.
|
||||||
|
#
|
||||||
|
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||||
|
# you may not use this file except in compliance with the License.
|
||||||
|
# You may obtain a copy of the License at
|
||||||
|
#
|
||||||
|
# http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
#
|
||||||
|
# Unless required by applicable law or agreed to in writing, software
|
||||||
|
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
# See the License for the specific language governing permissions and
|
||||||
|
# limitations under the License.
|
||||||
|
#
|
||||||
|
|
||||||
|
|
||||||
|
# Spoken Portuguese name of each supported denominator.  Denominators
# 11-19 use the "<number> avos" form, which is already plural.
FRACTION_STRING_PT = {
    2: 'meio',
    3: u'terço',
    4: 'quarto',
    5: 'quinto',
    6: 'sexto',
    7: u'sétimo',
    8: 'oitavo',
    9: 'nono',
    10: u'décimo',
    11: 'onze avos',
    12: 'doze avos',
    13: 'treze avos',
    14: 'catorze avos',
    15: 'quinze avos',
    16: 'dezasseis avos',
    17: 'dezassete avos',
    18: 'dezoito avos',
    19: 'dezanove avos',
    20: u'vigésimo',
    30: u'trigésimo',
    100: u'centésimo',
    1000: u'milésimo'
}


def nice_number_pt(result):
    """ Portuguese conversion for nice_number

    Args:
        result (tuple): (whole, num, den) - the whole part, numerator and
            denominator of a mixed fraction
    Returns:
        (str): the number spelled the way it is spoken; just the whole
            part when the numerator is 0
    Raises:
        KeyError: if den has no entry in FRACTION_STRING_PT
    """
    whole, num, den = result
    if num == 0:
        return str(whole)
    # denominator
    den_str = FRACTION_STRING_PT[den]
    if whole == 0:
        # pure fractions
        if num == 1:
            # e.g. "um décimo"
            return_string = 'um {}'.format(den_str)
        else:
            # e.g. "3 meio" (pluralised below)
            return_string = '{} {}'.format(num, den_str)
    elif num == 1:
        # whole part plus a unit fraction, e.g. "2 e meio"
        return_string = '{} e {}'.format(whole, den_str)
    else:
        # whole part plus a non-unit fraction, e.g. "20 e 3 décimo"
        return_string = '{} e {} {}'.format(whole, num, den_str)
    # plural: the "... avos" entries already end in "s" and must not get a
    # second one
    if num > 1 and not return_string.endswith('s'):
        return_string += 's'
    return return_string
|
|
@ -0,0 +1,51 @@
|
||||||
|
# -*- coding: utf-8 -*-
|
||||||
|
#
|
||||||
|
# Copyright 2017 Mycroft AI Inc.
|
||||||
|
#
|
||||||
|
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||||
|
# you may not use this file except in compliance with the License.
|
||||||
|
# You may obtain a copy of the License at
|
||||||
|
#
|
||||||
|
# http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
#
|
||||||
|
# Unless required by applicable law or agreed to in writing, software
|
||||||
|
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
# See the License for the specific language governing permissions and
|
||||||
|
# limitations under the License.
|
||||||
|
#
|
||||||
|
|
||||||
|
|
||||||
|
def is_numeric(input_str):
    """
    Takes in a string and tests to see if it is a number.

    "Number" means anything float() accepts, so besides "3" and "-2.5"
    this also covers forms such as "1e3", "inf" and "nan".

    Args:
        input_str (str): string to test if a number
    Returns:
        (bool): True if a number, else False

    """
    try:
        float(input_str)
    except (TypeError, ValueError):
        # ValueError: text float() cannot parse
        # TypeError: not a string/number at all (e.g. None)
        return False
    return True
|
||||||
|
|
||||||
|
|
||||||
|
def look_for_fractions(split_list):
    """
    This function takes a list made by fraction & determines if a fraction.

    Args:
        split_list (list): list created by splitting on '/'
    Returns:
        (bool): False if not a fraction, otherwise True

    """
    if len(split_list) != 2:
        return False

    numerator, denominator = split_list
    if not (is_numeric(numerator) and is_numeric(denominator)):
        return False

    # "x/0" is not a usable fraction; rejecting it here keeps callers that
    # go on to divide the two pieces from raising ZeroDivisionError
    return float(denominator) != 0
|
|
@ -0,0 +1,835 @@
|
||||||
|
# -*- coding: utf-8 -*-
|
||||||
|
#
|
||||||
|
# Copyright 2017 Mycroft AI Inc.
|
||||||
|
#
|
||||||
|
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||||
|
# you may not use this file except in compliance with the License.
|
||||||
|
# You may obtain a copy of the License at
|
||||||
|
#
|
||||||
|
# http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
#
|
||||||
|
# Unless required by applicable law or agreed to in writing, software
|
||||||
|
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
# See the License for the specific language governing permissions and
|
||||||
|
# limitations under the License.
|
||||||
|
#
|
||||||
|
from datetime import datetime, timedelta
|
||||||
|
from dateutil.relativedelta import relativedelta
|
||||||
|
from mycroft.util.lang.parse_common import *
|
||||||
|
|
||||||
|
|
||||||
|
def extractnumber_en(text):
    """
    This function extracts the first number found in the given text.

    Recognized forms are digits ("2", "1.5"), the ordinals "first" and
    "second", the words "one" to "ten", fraction words ("half",
    "quarter", "thirds"), slash fractions ("3/4"), a number word followed
    by a fraction word ("three quarters") and two such values joined by
    "and" ("1 and 3/4", "one cup and a half").

    Args:
        text (str): the string to extract a number from
    Returns:
        (int) or (float): The value of extracted number, or False when no
            number was found

    """
    numberWords = {"one": 1, "two": 2, "three": 3, "four": 4, "five": 5,
                   "six": 6, "seven": 7, "eight": 8, "nine": 9, "ten": 10}

    aWords = [word for word in text.split()
              if word not in ["the", "a", "an"]]
    andPass = False      # True once the part before an "and" is parsed
    valPreAnd = False    # the value found before that "and"
    val = False
    count = 0
    while count < len(aWords):
        word = aWords[count]
        if is_numeric(word):
            # float() rather than isdigit() so that decimals work
            val = float(word)
        elif word == "first":
            val = 1
        elif word == "second":
            val = 2
        else:
            fraction = isFractional_en(word)
            if fraction:
                val = fraction
            elif word in numberWords:
                val = numberWords[word]
                # "three quarters": a number word scales a following
                # fraction word
                if count + 1 < len(aWords):
                    wordNext = aWords[count + 1]
                else:
                    wordNext = ""
                valNext = isFractional_en(wordNext)
                if valNext:
                    val = val * valNext

        if not val:
            # look for fractions like "2/3"
            aPieces = word.split('/')
            # the explicit zero test keeps "1/0" from raising
            # ZeroDivisionError below
            if look_for_fractions(aPieces) and float(aPieces[1]) != 0:
                val = float(aPieces[0]) / float(aPieces[1])
            elif andPass:
                # nothing numeric follows the "and": keep the first part
                val = valPreAnd
                break
            else:
                count += 1
                continue

        if andPass:
            # second half of "<number> and <number>"
            val += valPreAnd
        elif count + 1 < len(aWords) and aWords[count + 1] == 'and':
            # "1 and ...": remember this value and parse what follows
            andPass = True
            valPreAnd = val
            val = False
            count += 2
            continue
        elif count + 2 < len(aWords) and aWords[count + 2] == 'and':
            # "1 cup and ...": one word may sit between number and "and"
            andPass = True
            valPreAnd = val
            val = False
            count += 3
            continue

        break

    if not val:
        return False

    return val
|
||||||
|
|
||||||
|
|
||||||
|
def extract_datetime_en(str, currentDate=None):
    """
    Extract a date/time reference from an English utterance.

    The words are scanned twice: first for date expressions ("tomorrow",
    "next tuesday", "june 5", "5 days from monday"), then for time
    expressions ("noon", "5:30 pm", "10 minutes", "0800 hours").  Words
    that were understood are removed from the text.

    Args:
        str (str): the utterance to parse (the name shadows the builtin;
            it is kept so existing keyword callers keep working)
        currentDate (datetime): the moment the utterance is relative to;
            defaults to datetime.now()
    Returns:
        (list) or None: [extractedDate, resultStr] where extractedDate is
            the resolved datetime and resultStr is the utterance with the
            date/time words removed; None when str is empty or no
            date/time expression was recognized
    """
    def leads_with_digit(token):
        # True for a non-empty token whose first character is a digit
        return token != "" and token[0].isdigit()

    def clean_string(text):
        # cleans the input string of unneeded punctuation and capitalization
        # among other things
        text = text.lower().replace('?', '').replace('.', '') \
            .replace(',', '').replace(' the ', ' ').replace(' a ', ' ') \
            .replace(' an ', ' ')
        wordList = text.split()
        for pos, token in enumerate(wordList):
            token = token.replace("'s", "")
            if leads_with_digit(token):
                # "20th" -> "20"
                for suffix in ["rd", "st", "nd", "th"]:
                    token = token.replace(suffix, "")
            wordList[pos] = token
        return wordList

    def date_found():
        return found or \
            (
                datestr != "" or
                yearOffset != 0 or monthOffset != 0 or
                dayOffset is not False or hrOffset != 0 or
                hrAbs != 0 or minOffset != 0 or
                minAbs != 0 or secOffset != 0
            )

    if str == "":
        return None
    if currentDate is None:
        currentDate = datetime.now()

    found = False
    daySpecified = False
    dayOffset = False    # False = "no day mentioned"; otherwise an int
    monthOffset = 0
    yearOffset = 0
    dateNow = currentDate
    today = int(dateNow.strftime("%w"))    # 0 = Sunday ... 6 = Saturday
    currentYear = dateNow.strftime("%Y")
    fromFlag = False
    datestr = ""
    hasYear = False
    timeQualifier = ""

    timeQualifiersList = ['morning', 'afternoon', 'evening']
    meridiemByQualifier = {'morning': 'am', 'afternoon': 'pm',
                           'evening': 'pm'}
    markers = ['at', 'in', 'on', 'by', 'this', 'around', 'for', 'of']
    days = ['monday', 'tuesday', 'wednesday',
            'thursday', 'friday', 'saturday', 'sunday']
    months = ['january', 'february', 'march', 'april', 'may', 'june',
              'july', 'august', 'september', 'october', 'november',
              'december']
    monthsShort = ['jan', 'feb', 'mar', 'apr', 'may', 'june', 'july', 'aug',
                   'sept', 'oct', 'nov', 'dec']
    # words that may follow "from"/"after" in "5 days from tomorrow"
    validFollowups = days + months + monthsShort + \
        ["today", "tomorrow", "next", "last", "now"]

    words = clean_string(str)

    # ---- pass 1: dates --------------------------------------------------
    for idx, word in enumerate(words):
        if word == "":
            continue
        wordPrevPrev = words[idx - 2] if idx > 1 else ""
        wordPrev = words[idx - 1] if idx > 0 else ""
        wordNext = words[idx + 1] if idx + 1 < len(words) else ""
        wordNextNext = words[idx + 2] if idx + 2 < len(words) else ""

        # this isn't in clean string because I don't want to save back to words
        word = word.rstrip('s')
        start = idx
        used = 0
        # save timequalifier for later
        if word in timeQualifiersList:
            timeQualifier = word
        # parse today, tomorrow, day after tomorrow
        elif word == "today" and not fromFlag:
            dayOffset = 0
            used += 1
        elif word == "tomorrow" and not fromFlag:
            dayOffset = 1
            used += 1
        elif (word == "day" and
              wordNext == "after" and
              wordNextNext == "tomorrow" and
              not fromFlag and
              not leads_with_digit(wordPrev)):
            dayOffset = 2
            used = 3
            if wordPrev == "the":
                start -= 1
                used += 1
        # parse 5 days, 10 weeks, last week, next week
        elif word == "day":
            if leads_with_digit(wordPrev):
                dayOffset += int(wordPrev)
                start -= 1
                used = 2
        elif word == "week" and not fromFlag:
            if leads_with_digit(wordPrev):
                dayOffset += int(wordPrev) * 7
                start -= 1
                used = 2
            elif wordPrev == "next":
                dayOffset = 7
                start -= 1
                used = 2
            elif wordPrev == "last":
                dayOffset = -7
                start -= 1
                used = 2
        # parse 10 months, next month, last month
        elif word == "month" and not fromFlag:
            if leads_with_digit(wordPrev):
                monthOffset = int(wordPrev)
                start -= 1
                used = 2
            elif wordPrev == "next":
                monthOffset = 1
                start -= 1
                used = 2
            elif wordPrev == "last":
                monthOffset = -1
                start -= 1
                used = 2
        # parse 5 years, next year, last year
        elif word == "year" and not fromFlag:
            if leads_with_digit(wordPrev):
                yearOffset = int(wordPrev)
                start -= 1
                used = 2
            elif wordPrev == "next":
                yearOffset = 1
                start -= 1
                used = 2
            elif wordPrev == "last":
                yearOffset = -1
                start -= 1
                used = 2
        # parse Monday, Tuesday, etc., and next Monday,
        # last Tuesday, etc.
        elif word in days and not fromFlag:
            d = days.index(word)
            dayOffset = (d + 1) - today
            used = 1
            if dayOffset < 0:
                dayOffset += 7
            if wordPrev == "next":
                dayOffset += 7
                used += 1
                start -= 1
            elif wordPrev == "last":
                dayOffset -= 7
                used += 1
                start -= 1
        # parse 15 of July, June 20th, Feb 18, 19 of February
        # (parenthesized so that fromFlag applies to both month lists)
        elif (word in months or word in monthsShort) and not fromFlag:
            try:
                m = months.index(word)
            except ValueError:
                m = monthsShort.index(word)
            used += 1
            datestr = months[m]
            dayBeforeOf = wordPrev == "of" and leads_with_digit(wordPrevPrev)
            if leads_with_digit(wordPrev) or dayBeforeOf:
                if dayBeforeOf:
                    # "15 of july": also consume the "of"
                    datestr += " " + words[idx - 2]
                    used += 1
                    start -= 1
                else:
                    datestr += " " + wordPrev
                start -= 1
                used += 1
                if leads_with_digit(wordNext):
                    datestr += " " + wordNext
                    used += 1
                    hasYear = True
                else:
                    hasYear = False

            elif leads_with_digit(wordNext):
                datestr += " " + wordNext
                used += 1
                if leads_with_digit(wordNextNext):
                    datestr += " " + wordNextNext
                    used += 1
                    hasYear = True
                else:
                    hasYear = False
        # parse 5 days from tomorrow, 10 weeks from next thursday,
        # 2 months from July
        if (word == "from" or word == "after") and wordNext in validFollowups:
            used = 2
            fromFlag = True
            if wordNext == "tomorrow":
                dayOffset += 1
            elif wordNext in days:
                d = days.index(wordNext)
                tmpOffset = (d + 1) - today
                used = 2
                if tmpOffset < 0:
                    tmpOffset += 7
                dayOffset += tmpOffset
            elif wordNextNext and wordNextNext in days:
                d = days.index(wordNextNext)
                tmpOffset = (d + 1) - today
                used = 3
                if wordNext == "next":
                    tmpOffset += 7
                    used += 1
                    start -= 1
                elif wordNext == "last":
                    tmpOffset -= 7
                    used += 1
                    start -= 1
                dayOffset += tmpOffset
        if used > 0:
            if start - 1 > 0 and words[start - 1] == "this":
                start -= 1
                used += 1

            # blank out everything that was understood
            for i in range(0, used):
                words[i + start] = ""

            if (start - 1 >= 0 and words[start - 1] in markers):
                words[start - 1] = ""
            found = True
            daySpecified = True

    # ---- pass 2: times --------------------------------------------------
    hrOffset = 0
    minOffset = 0
    secOffset = 0
    hrAbs = 0     # absolute clock time; -1 means "only an offset given"
    minAbs = 0

    for idx, word in enumerate(words):
        if word == "":
            continue

        wordPrevPrev = words[idx - 2] if idx > 1 else ""
        wordPrev = words[idx - 1] if idx > 0 else ""
        wordNext = words[idx + 1] if idx + 1 < len(words) else ""
        wordNextNext = words[idx + 2] if idx + 2 < len(words) else ""
        wordNextNextNext = words[idx + 3] if idx + 3 < len(words) else ""
        # parse noon, midnight, morning, afternoon, evening
        used = 0
        if word == "noon":
            hrAbs = 12
            used += 1
        elif word == "midnight":
            hrAbs = 0
            used += 1
        elif word == "morning":
            if hrAbs == 0:
                hrAbs = 8
            used += 1
        elif word == "afternoon":
            if hrAbs == 0:
                hrAbs = 15
            used += 1
        elif word == "evening":
            if hrAbs == 0:
                hrAbs = 19
            used += 1
        # parse half an hour, quarter hour
        elif word == "hour" and \
                (wordPrev in markers or wordPrevPrev in markers):
            if wordPrev == "half":
                minOffset = 30
            elif wordPrev == "quarter":
                minOffset = 15
            elif wordPrevPrev == "quarter":
                minOffset = 15
                if idx > 2 and words[idx - 3] in markers:
                    words[idx - 3] = ""
                words[idx - 2] = ""
            else:
                hrOffset = 1
            if wordPrevPrev in markers:
                words[idx - 2] = ""
            words[idx - 1] = ""
            used += 1
            hrAbs = -1
            minAbs = -1
        # parse 5:00 am, 12:00 p.m., etc
        elif word[0].isdigit():
            isTime = True
            strHH = ""
            strMM = ""
            remainder = ""
            if ':' in word:
                # parse colons
                # "3:00 in the morning"
                stage = 0    # 0 = reading hours, 1 = reading minutes
                for i, char in enumerate(word):
                    if stage == 0 and char.isdigit():
                        strHH += char
                    elif stage == 0 and char == ":":
                        stage = 1
                    elif stage == 1 and char.isdigit():
                        strMM += char
                    else:
                        # everything from the first unexpected character
                        # on is the suffix, e.g. the "pm" of "5:00pm"
                        remainder = word[i:].replace(".", "")
                        break
                if remainder == "":
                    nextWord = wordNext.replace(".", "")
                    if nextWord == "am" or nextWord == "pm":
                        remainder = nextWord
                        used += 1
                    elif nextWord == "tonight":
                        remainder = "pm"
                        used += 1
                    elif (wordNext == "in" and wordNextNext == "the" and
                          wordNextNextNext in meridiemByQualifier):
                        remainder = meridiemByQualifier[wordNextNextNext]
                        used += 3
                    elif (wordNext == "in" and
                          wordNextNext in meridiemByQualifier):
                        remainder = meridiemByQualifier[wordNextNext]
                        used += 2
                    elif (wordNext == "this" and
                          wordNextNext in meridiemByQualifier):
                        remainder = meridiemByQualifier[wordNextNext]
                        used = 2
                    elif wordNext == "at" and wordNextNext == "night":
                        # compare numerically, not str against int
                        if int(strHH) > 5:
                            remainder = "pm"
                        else:
                            remainder = "am"
                        used += 2
                    else:
                        # a qualifier elsewhere in the utterance ("this
                        # evening ... 7:00") shifts the hour into the pm
                        # half; 12 is left alone so noon stays noon
                        if (timeQualifier in ("evening", "afternoon") and
                                int(strHH) < 12):
                            strHH = int(strHH) + 12
            else:
                # try to parse # s without colons
                # 5 hours, 10 minutes etc.
                strNum = ""
                remainder = ""
                for char in word:
                    if char.isdigit():
                        strNum += char
                    else:
                        remainder += char

                nextMeridiem = wordNext.replace(".", "").strip()
                if remainder == "":
                    remainder = nextMeridiem

                if (
                        remainder == "pm" or
                        wordNext == "pm" or
                        remainder == "p.m." or
                        wordNext == "p.m."):
                    strHH = strNum
                    remainder = "pm"
                    # consume the next word only when it is the am/pm
                    # marker, not when the suffix was attached ("5pm")
                    used = 1 if nextMeridiem == "pm" else 0
                elif (
                        remainder == "am" or
                        wordNext == "am" or
                        remainder == "a.m." or
                        wordNext == "a.m."):
                    strHH = strNum
                    remainder = "am"
                    used = 1 if nextMeridiem == "am" else 0
                else:
                    try:
                        number = int(word)
                    except ValueError:
                        # digits mixed with other characters ("3/4")
                        number = None

                    if number is None:
                        isTime = False
                    elif number > 100 and (wordPrev == "o" or
                                           wordPrev == "oh"):
                        # 0800 hours (pronounced oh-eight-hundred)
                        strHH = number // 100
                        strMM = number - strHH * 100
                        if wordNext == "hours":
                            used += 1
                    elif (
                            wordNext == "hours" and
                            word[0] != '0' and
                            (
                                number < 100 or
                                number > 2400
                            )):
                        # ignores military time
                        # "in 3 hours"
                        hrOffset = number
                        used = 2
                        isTime = False
                        hrAbs = -1
                        minAbs = -1

                    elif wordNext == "minutes":
                        # "in 10 minutes"
                        minOffset = number
                        used = 2
                        isTime = False
                        hrAbs = -1
                        minAbs = -1
                    elif wordNext == "seconds":
                        # in 5 seconds
                        secOffset = number
                        used = 2
                        isTime = False
                        hrAbs = -1
                        minAbs = -1
                    elif number > 100:
                        # 24-hour clock written as one number, e.g. 1730
                        strHH = number // 100
                        strMM = number - strHH * 100
                        if wordNext == "hours":
                            used += 1
                    elif leads_with_digit(wordNext):
                        # hours and minutes as two words, e.g. "17 30"
                        strHH = word
                        strMM = wordNext
                        used += 1
                        if wordNextNext == "hours":
                            used += 1
                    elif (
                            wordNext == "" or wordNext == "o'clock" or
                            (
                                wordNext == "in" and
                                (
                                    wordNextNext == "the" or
                                    wordNextNext == timeQualifier
                                )
                            )):
                        strHH = word
                        strMM = 0
                        if wordNext == "o'clock":
                            used += 1
                        if wordNext == "in" or wordNextNext == "in":
                            used += (1 if wordNext == "in" else 2)
                            if (wordNextNext and
                                    wordNextNext in timeQualifier or
                                    (wordNextNextNext and
                                     wordNextNextNext in timeQualifier)):
                                following = (wordNextNext, wordNextNextNext)
                                if "afternoon" in following:
                                    remainder = "pm"
                                if "evening" in following:
                                    remainder = "pm"
                                if "morning" in following:
                                    remainder = "am"
                    else:
                        isTime = False

            strHH = int(strHH) if strHH else 0
            strMM = int(strMM) if strMM else 0
            strHH = strHH + 12 if remainder == "pm" and strHH < 12 else strHH
            strHH = strHH - 12 if remainder == "am" and strHH >= 12 else strHH
            if strHH > 24 or strMM > 59:
                isTime = False
                used = 0
            if isTime:
                hrAbs = strHH * 1
                minAbs = strMM * 1
                used += 1
        if used > 0:
            # removed parsed words from the sentence
            for i in range(idx, min(idx + used, len(words))):
                words[i] = ""

            if wordPrev == "o" or wordPrev == "oh":
                words[idx - 1] = ""

            # NOTE: rebinding idx below only shifts the index arithmetic in
            # the two marker checks; enumerate() is unaffected
            if wordPrev == "early":
                hrOffset = -1
                words[idx - 1] = ""
                idx -= 1
            elif wordPrev == "late":
                hrOffset = 1
                words[idx - 1] = ""
                idx -= 1
            if idx > 0 and wordPrev in markers:
                words[idx - 1] = ""
            if idx > 1 and wordPrevPrev in markers:
                words[idx - 2] = ""

            found = True

    # check that we found a date
    if not date_found():
        return None

    if dayOffset is False:
        dayOffset = 0

    # perform date manipulation

    # NOTE(review): everything below is applied to midnight of the
    # reference day, so relative offsets such as "10 minutes" are counted
    # from 00:00, not from currentDate's time of day - confirm intended
    extractedDate = dateNow.replace(microsecond=0,
                                    second=0,
                                    minute=0,
                                    hour=0)
    if datestr != "":
        # datestr is "<month> <day>" plus a year when hasYear is set
        temp = datetime.strptime(datestr,
                                 "%B %d %Y" if hasYear else "%B %d")
        if not hasYear:
            temp = temp.replace(year=extractedDate.year)
            if extractedDate < temp:
                extractedDate = extractedDate.replace(
                    year=int(currentYear),
                    month=temp.month,
                    day=temp.day)
            else:
                # that day is not in the future this year: use next year
                extractedDate = extractedDate.replace(
                    year=int(currentYear) + 1,
                    month=temp.month,
                    day=temp.day)
        else:
            extractedDate = extractedDate.replace(
                year=temp.year,
                month=temp.month,
                day=temp.day)

    if yearOffset != 0:
        extractedDate = extractedDate + relativedelta(years=yearOffset)
    if monthOffset != 0:
        extractedDate = extractedDate + relativedelta(months=monthOffset)
    if dayOffset != 0:
        extractedDate = extractedDate + relativedelta(days=dayOffset)
    if hrAbs != -1 and minAbs != -1:

        extractedDate = extractedDate + relativedelta(hours=hrAbs,
                                                      minutes=minAbs)
        if (hrAbs != 0 or minAbs != 0) and datestr == "":
            # a bare clock time that already passed today means tomorrow
            if not daySpecified and dateNow > extractedDate:
                extractedDate = extractedDate + relativedelta(days=1)
    if hrOffset != 0:
        extractedDate = extractedDate + relativedelta(hours=hrOffset)
    if minOffset != 0:
        extractedDate = extractedDate + relativedelta(minutes=minOffset)
    if secOffset != 0:
        extractedDate = extractedDate + relativedelta(seconds=secOffset)

    # drop an "and" that is left with removed words on both sides
    for idx, word in enumerate(words):
        if (words[idx] == "and" and words[idx - 1] == "" and
                idx + 1 < len(words) and words[idx + 1] == ""):
            words[idx] = ""

    resultStr = " ".join(words)
    resultStr = ' '.join(resultStr.split())
    return [extractedDate, resultStr]
|
||||||
|
|
||||||
|
|
||||||
|
def isFractional_en(input_str):
    """
    Check whether the given text names an English fraction.

    A single trailing "s" is dropped first, so plurals such as "fifths"
    are handled like "fifth". Matching is case-insensitive.

    Args:
        input_str (str): the string to check if fractional
    Returns:
        (bool) or (float): False if not a fraction, otherwise the fraction
            (e.g. 0.5 for "half", 0.25 for "quarter", 1.0 for "whole")

    """
    # Lower-case once so the membership test and the index lookup agree.
    # Previously "Half" passed the `in` test (which lower-cased) and then
    # raised ValueError in `index` (which did not).
    name = input_str.lower()
    if name.endswith('s'):
        name = name[:-1]  # e.g. "fifths"

    # Position in this list + 1 is the denominator of the fraction.
    aFrac = ["whole", "half", "third", "fourth", "fifth", "sixth",
             "seventh", "eighth", "ninth", "tenth", "eleventh", "twelfth"]

    if name in aFrac:
        return 1.0 / (aFrac.index(name) + 1)
    if name == "quarter":
        return 1.0 / 4

    return False
|
||||||
|
|
||||||
|
|
||||||
|
def normalize_en(text, remove_articles):
    """
    English string normalization.

    Collapses whitespace, optionally drops the articles "the", "a" and
    "an", expands common contractions and replaces the number words
    "zero" .. "twenty" with digits.

    Args:
        text (str): the utterance to normalize
        remove_articles (bool): whether "the", "a" and "an" are removed
    Returns:
        (str): the normalized text, tokens separated by single spaces
    """
    articles = ("the", "a", "an")

    # Common contractions and their expansions, e.g. "isn't" -> "is not"
    expansions = {
        "ain't": "is not",
        "aren't": "are not",
        "can't": "can not",
        "could've": "could have",
        "couldn't": "could not",
        "didn't": "did not",
        "doesn't": "does not",
        "don't": "do not",
        "gonna": "going to",
        "gotta": "got to",
        "hadn't": "had not",
        "hasn't": "has not",
        "haven't": "have not",
        "he'd": "he would",
        "he'll": "he will",
        "he's": "he is",
        "how'd": "how did",
        "how'll": "how will",
        "how's": "how is",
        "I'd": "I would",
        "I'll": "I will",
        "I'm": "I am",
        "I've": "I have",
        "isn't": "is not",
        "it'd": "it would",
        "it'll": "it will",
        "it's": "it is",
        "mightn't": "might not",
        "might've": "might have",
        "mustn't": "must not",
        "must've": "must have",
        "needn't": "need not",
        "oughtn't": "ought not",
        "shan't": "shall not",
        "she'd": "she would",
        "she'll": "she will",
        "she's": "she is",
        "shouldn't": "should not",
        "should've": "should have",
        "somebody's": "somebody is",
        "someone'd": "someone would",
        "someone'll": "someone will",
        "someone's": "someone is",
        "that'll": "that will",
        "that's": "that is",
        "that'd": "that would",
        "there'd": "there would",
        "there're": "there are",
        "there's": "there is",
        "they'd": "they would",
        "they'll": "they will",
        "they're": "they are",
        "they've": "they have",
        "wasn't": "was not",
        "we'd": "we would",
        "we'll": "we will",
        "we're": "we are",
        "we've": "we have",
        "weren't": "were not",
        "what'd": "what did",
        "what'll": "what will",
        "what're": "what are",
        "what's": "what is",
        "whats": "what is",  # technically incorrect but some STT outputs
        "what've": "what have",
        "when's": "when is",
        "when'd": "when did",
        "where'd": "where did",
        "where's": "where is",
        "where've": "where have",
        "who'd": "who would",
        "who'd've": "who would have",
        "who'll": "who will",
        "who're": "who are",
        "who's": "who is",
        "who've": "who have",
        "why'd": "why did",
        "why're": "why are",
        "why's": "why is",
        "won't": "will not",
        "won't've": "will not have",
        "would've": "would have",
        "wouldn't": "would not",
        "wouldn't've": "would not have",
        "y'all": "you all",
        "ya'll": "you all",
        "you'd": "you would",
        "you'd've": "you would have",
        "you'll": "you will",
        "y'aint": "you are not",
        "y'ain't": "you are not",
        "you're": "you are",
        "you've": "you have",
    }

    # Number words and the digit string each one becomes, e.g. "two" -> "2"
    number_names = ("zero", "one", "two", "three", "four", "five", "six",
                    "seven", "eight", "nine", "ten", "eleven", "twelve",
                    "thirteen", "fourteen", "fifteen", "sixteen",
                    "seventeen", "eighteen", "nineteen", "twenty")
    digits = dict((name, str(value))
                  for value, name in enumerate(number_names))

    pieces = []
    for token in text.split():  # split() also discards extra spaces
        if remove_articles and token in articles:
            continue
        token = expansions.get(token, token)
        pieces.append(digits.get(token, token))

    return " ".join(pieces)
|
|
@ -0,0 +1,194 @@
|
||||||
|
# -*- coding: utf-8 -*-
|
||||||
|
#
|
||||||
|
# Copyright 2017 Mycroft AI Inc.
|
||||||
|
#
|
||||||
|
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||||
|
# you may not use this file except in compliance with the License.
|
||||||
|
# You may obtain a copy of the License at
|
||||||
|
#
|
||||||
|
# http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
#
|
||||||
|
# Unless required by applicable law or agreed to in writing, software
|
||||||
|
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
# See the License for the specific language governing permissions and
|
||||||
|
# limitations under the License.
|
||||||
|
#
|
||||||
|
from datetime import datetime, timedelta
|
||||||
|
from dateutil.relativedelta import relativedelta
|
||||||
|
from mycroft.util.lang.parse_common import *
|
||||||
|
"""
|
||||||
|
Parse functions for spanish (es)
|
||||||
|
TODO: numbers greater than 999999
|
||||||
|
"""
|
||||||
|
|
||||||
|
# Indefinite articles ["un", "una", "unos", "unas"] cannot be suppressed:
# in Spanish, "un caballo" means both "a horse" and "one horse".
es_articles = ["el", "la", "los", "las"]

# Spanish number words mapped to their integer value. Accented and
# unaccented spellings are both listed because input may come with or
# without diacritics.
es_numbers_xlat = {
    "un": 1,
    "uno": 1,
    "una": 1,
    "dos": 2,
    "tres": 3,
    u"trés": 3,
    "cuatro": 4,
    "cinco": 5,
    "seis": 6,
    "siete": 7,
    "ocho": 8,
    "nueve": 9,
    "diez": 10,
    "once": 11,
    "doce": 12,
    "trece": 13,
    "catorce": 14,
    "quince": 15,
    "dieciseis": 16,
    u"dieciséis": 16,
    "diecisiete": 17,
    "dieciocho": 18,
    "diecinueve": 19,
    "veinte": 20,
    "veintiuno": 21,
    u"veintidós": 22,
    "veintidos": 22,
    "veintitres": 23,
    u"veintitrés": 23,
    "veinticuatro": 24,
    "veinticinco": 25,
    u"veintiséis": 26,
    "veintiseis": 26,
    "veintisiete": 27,
    "veintiocho": 28,
    "veintinueve": 29,
    "treinta": 30,
    "cuarenta": 40,
    "cincuenta": 50,
    "sesenta": 60,
    "setenta": 70,
    "ochenta": 80,
    "noventa": 90,
    "cien": 100,
    "ciento": 100,
    "doscientos": 200,
    "doscientas": 200,
    "trescientos": 300,
    "trescientas": 300,
    "cuatrocientos": 400,
    "cuatrocientas": 400,
    "quinientos": 500,
    "quinientas": 500,
    "seiscientos": 600,
    "seiscientas": 600,
    "setecientos": 700,
    "setecientas": 700,
    "ochocientos": 800,
    "ochocientas": 800,
    "novecientos": 900,
    "novecientas": 900}
|
||||||
|
|
||||||
|
|
||||||
|
def es_parse(words, i):
    """
    Parse a Spanish number written in words, starting at words[i].

    Recognizes single number words, "<tens> y <units>" compounds,
    hundreds followed by an optional 1-99 part, and an optional "mil"
    multiplier followed by an optional 1-999 part.

    Args:
        words (list): tokens of the utterance
        i (int): index of the first token to examine
    Returns:
        (tuple) or None: (value, next_index), where next_index is the
            index of the first token after the parsed number, or None
            when no number starts at position i
    """
    def es_cte(i, s):
        # Match the literal token s at position i.
        if i < len(words) and s == words[i]:
            return s, i + 1
        return None

    def es_number_word(i, mi, ma):
        # Match one number word whose value lies in [mi, ma].
        if i < len(words):
            v = es_numbers_xlat.get(words[i])
            # Compare against None explicitly: 0 is a legitimate value
            # and must not be rejected by a truthiness test.
            if v is not None and mi <= v <= ma:
                return v, i + 1
        return None

    def es_number_1_99(i):
        # Values up to 29 are single words in the table.
        r1 = es_number_word(i, 1, 29)
        if r1:
            return r1

        # <tens> ("y" <units>)?
        r1 = es_number_word(i, 30, 90)
        if r1:
            v1, i1 = r1
            r2 = es_cte(i1, "y")
            if r2:
                _, i2 = r2
                r3 = es_number_word(i2, 1, 9)
                if r3:
                    v3, i3 = r3
                    return v1 + v3, i3
            # No complete "y <units>" tail: keep the bare tens value and
            # leave any dangling "y" unconsumed.
            return r1
        return None

    def es_number_1_999(i):
        # [2-9]cientos [1-99]?
        r1 = es_number_word(i, 100, 900)
        if r1:
            v1, i1 = r1
            r2 = es_number_1_99(i1)
            if r2:
                v2, i2 = r2
                return v1 + v2, i2
            return r1

        # [1-99]
        return es_number_1_99(i)

    def es_number(i):
        # check for cero
        r1 = es_number_word(i, 0, 0)
        if r1:
            return r1

        # check for [1-999] (mil [0-999])?
        r1 = es_number_1_999(i)
        if r1:
            v1, i1 = r1
            r2 = es_cte(i1, "mil")
            if r2:
                _, i2 = r2
                r3 = es_number_1_999(i2)
                if r3:
                    v3, i3 = r3
                    return v1 * 1000 + v3, i3
                return v1 * 1000, i2
            return r1
        return None

    return es_number(i)
|
||||||
|
|
||||||
|
|
||||||
|
def normalize_es(text, remove_articles):
    """
    Spanish string normalization.

    Collapses whitespace, optionally drops the definite articles listed
    in es_articles, and replaces numbers written in words with digits.

    Args:
        text (str): the utterance to normalize
        remove_articles (bool): whether definite articles are removed
    Returns:
        (str): the normalized text, tokens separated by single spaces
    """
    tokens = text.split()  # split() also discards extra spaces
    total = len(tokens)
    pieces = []
    pos = 0

    while pos < total:
        token = tokens[pos]

        if remove_articles and token in es_articles:
            pos += 1
            continue

        # A number may span several tokens; es_parse reports where it ends.
        parsed = es_parse(tokens, pos)
        if parsed:
            value, pos = parsed
            pieces.append(str(value))
        else:
            pieces.append(token)
            pos += 1

    return " ".join(pieces)
|
File diff suppressed because it is too large
Load Diff
File diff suppressed because it is too large
Load Diff
|
@ -375,7 +375,7 @@ class TestNormalize(unittest.TestCase):
|
||||||
self.assertEqual(
|
self.assertEqual(
|
||||||
extractnumber("tres quartos de chocolate", lang="pt"),
|
extractnumber("tres quartos de chocolate", lang="pt"),
|
||||||
3.0 / 4.0)
|
3.0 / 4.0)
|
||||||
self.assertEqual(extractnumber(u"tr<EFBFBD>s quarto de chocolate",
|
self.assertEqual(extractnumber(u"três quarto de chocolate",
|
||||||
lang="pt"), 3.0 / 4.0)
|
lang="pt"), 3.0 / 4.0)
|
||||||
self.assertEqual(extractnumber("sete ponto cinco", lang="pt"), 7.5)
|
self.assertEqual(extractnumber("sete ponto cinco", lang="pt"), 7.5)
|
||||||
self.assertEqual(extractnumber("sete ponto 5", lang="pt"), 7.5)
|
self.assertEqual(extractnumber("sete ponto 5", lang="pt"), 7.5)
|
||||||
|
@ -420,9 +420,9 @@ class TestNormalize(unittest.TestCase):
|
||||||
"isto e 1 teste")
|
"isto e 1 teste")
|
||||||
|
|
||||||
def test_numbers_pt(self):
|
def test_numbers_pt(self):
|
||||||
self.assertEqual(normalize(u"isto e o um dois tr<EFBFBD>s teste", lang="pt"),
|
self.assertEqual(normalize(u"isto e o um dois três teste", lang="pt"),
|
||||||
u"isto 1 2 3 teste")
|
u"isto 1 2 3 teste")
|
||||||
self.assertEqual(normalize(u"<EFBFBD> a sete oito nove test", lang="pt"),
|
self.assertEqual(normalize(u"ê a sete oito nove test", lang="pt"),
|
||||||
u"7 8 9 test")
|
u"7 8 9 test")
|
||||||
self.assertEqual(
|
self.assertEqual(
|
||||||
normalize("teste zero dez onze doze treze", lang="pt"),
|
normalize("teste zero dez onze doze treze", lang="pt"),
|
||||||
|
@ -459,9 +459,9 @@ class TestNormalize(unittest.TestCase):
|
||||||
self.assertEqual(res[0], expected_date)
|
self.assertEqual(res[0], expected_date)
|
||||||
self.assertEqual(res[1], expected_leftover)
|
self.assertEqual(res[1], expected_leftover)
|
||||||
|
|
||||||
testExtract(u"que dia <EFBFBD> hoje",
|
testExtract(u"que dia é hoje",
|
||||||
"2017-06-27 00:00:00", u"dia")
|
"2017-06-27 00:00:00", u"dia")
|
||||||
testExtract(u"que dia <EFBFBD> amanha",
|
testExtract(u"que dia é amanha",
|
||||||
"2017-06-28 00:00:00", u"dia")
|
"2017-06-28 00:00:00", u"dia")
|
||||||
testExtract(u"que dia foi ontem",
|
testExtract(u"que dia foi ontem",
|
||||||
"2017-06-26 00:00:00", u"dia")
|
"2017-06-26 00:00:00", u"dia")
|
||||||
|
@ -513,7 +513,7 @@ class TestNormalize(unittest.TestCase):
|
||||||
testExtract("lembra me para ligar a mae no dia 3 de agosto",
|
testExtract("lembra me para ligar a mae no dia 3 de agosto",
|
||||||
"2017-08-03 00:00:00", "lembra ligar mae")
|
"2017-08-03 00:00:00", "lembra ligar mae")
|
||||||
|
|
||||||
testExtract(u"compra facas no 13<EFBFBD> dia de maio",
|
testExtract(u"compra facas no 13º dia de maio",
|
||||||
"2018-05-13 00:00:00", "compra facas")
|
"2018-05-13 00:00:00", "compra facas")
|
||||||
testExtract(u"gasta dinheiro no maio dia 13",
|
testExtract(u"gasta dinheiro no maio dia 13",
|
||||||
"2018-05-13 00:00:00", "gasta dinheiro")
|
"2018-05-13 00:00:00", "gasta dinheiro")
|
||||||
|
@ -588,7 +588,7 @@ class TestNormalize(unittest.TestCase):
|
||||||
self.assertEqual(normalize("diez once doce trece catorce quince",
|
self.assertEqual(normalize("diez once doce trece catorce quince",
|
||||||
lang="es"),
|
lang="es"),
|
||||||
"10 11 12 13 14 15")
|
"10 11 12 13 14 15")
|
||||||
self.assertEqual(normalize(u"diecis<EFBFBD>is diecisiete", lang="es"),
|
self.assertEqual(normalize(u"dieciséis diecisiete", lang="es"),
|
||||||
"16 17")
|
"16 17")
|
||||||
self.assertEqual(normalize(u"dieciocho diecinueve", lang="es"),
|
self.assertEqual(normalize(u"dieciocho diecinueve", lang="es"),
|
||||||
"18 19")
|
"18 19")
|
||||||
|
@ -609,7 +609,7 @@ class TestNormalize(unittest.TestCase):
|
||||||
lang="es"),
|
lang="es"),
|
||||||
"2345")
|
"2345")
|
||||||
self.assertEqual(normalize(
|
self.assertEqual(normalize(
|
||||||
u"ciento veintitr<EFBFBD>s mil cuatrocientas cincuenta y seis",
|
u"ciento veintitrés mil cuatrocientas cincuenta y seis",
|
||||||
lang="es"),
|
lang="es"),
|
||||||
"123456")
|
"123456")
|
||||||
self.assertEqual(normalize(
|
self.assertEqual(normalize(
|
||||||
|
|
Loading…
Reference in New Issue