mycroft-core/mycroft/util/lang/parse_it.py

1326 lines
45 KiB
Python

# -*- coding: utf-8 -*-
#
# Copyright 2017 Mycroft AI Inc.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
"""
Parse functions for Italian (IT-IT)
"""
import collections
from datetime import datetime
from dateutil.relativedelta import relativedelta
from mycroft.util.lang.parse_common import is_numeric, look_for_fractions, \
extract_numbers_generic
from mycroft.util.lang.format_it import LONG_SCALE_IT, SHORT_SCALE_IT, \
pronounce_number_it
# Ordinal names for the short scale (each new "-ilione" name is 1000 times
# the previous one).  Keys written in exponent notation are floats; they
# compare and hash equal to the corresponding integers.
SHORT_ORDINAL_STRING_IT = {
    1: 'primo',
    2: 'secondo',
    3: 'terzo',
    4: 'quarto',
    5: 'quinto',
    6: 'sesto',
    7: 'settimo',
    8: 'ottavo',
    9: 'nono',
    10: 'decimo',
    11: 'undicesimo',
    12: 'dodicesimo',
    13: 'tredicesimo',
    14: 'quattordicesimo',
    15: 'quindicesimo',
    16: 'sedicesimo',
    17: 'diciassettesimo',
    18: 'diciottesimo',
    19: 'diciannovesimo',
    20: 'ventesimo',
    30: 'trentesimo',
    40: 'quarantesimo',
    50: 'cinquantesimo',
    60: 'sessantesimo',
    70: 'settantesimo',
    80: 'ottantesimo',
    90: 'novantesimo',
    1e2: 'centesimo',
    1e3: 'millesimo',
    1e6: 'milionesimo',
    1e9: 'miliardesimo',
    1e12: 'trilionesimo',
    1e15: 'quadrilionesimo',
    # was the truncated 'quintilionesim', which could never be matched
    1e18: 'quintilionesimo',
    1e21: 'sestilionesimo',
    1e24: 'settilionesimo',
    1e27: 'ottilionesimo',
    1e30: 'nonilionesimo',
    1e33: 'decilionesimo'
    # TODO > 1e33
}
# Ordinal names for the long scale.
# NOTE: for values above 10e12 only the word ending was adapted; the names
# still have to be reviewed once debugging is finished.
LONG_ORDINAL_STRING_IT = {
    # 1 - 10
    1: 'primo', 2: 'secondo', 3: 'terzo', 4: 'quarto', 5: 'quinto',
    6: 'sesto', 7: 'settimo', 8: 'ottavo', 9: 'nono', 10: 'decimo',
    # 11 - 19
    11: 'undicesimo', 12: 'dodicesimo', 13: 'tredicesimo',
    14: 'quattordicesimo', 15: 'quindicesimo', 16: 'sedicesimo',
    17: 'diciassettesimo', 18: 'diciottesimo', 19: 'diciannovesimo',
    # tens
    20: 'ventesimo', 30: 'trentesimo', 40: 'quarantesimo',
    50: 'cinquantesimo', 60: 'sessantesimo', 70: 'settantesimo',
    80: 'ottantesimo', 90: 'novantesimo',
    # powers of ten (float keys, equal to the matching integers)
    1e2: 'centesimo',
    1e3: 'millesimo',
    1e6: 'milionesimo',
    1e12: 'bilionesimo',
    1e18: 'trilionesimo',
    1e24: 'quadrilionesimo',
    1e30: 'quintilionesimo',
    1e36: 'sestilionesimo',
    1e42: 'settilionesimo',
    1e48: 'ottilionesimo',
    1e54: 'nonilionesimo',
    1e60: 'decilionesimo',
    # TODO > 1e60
}
# Definite articles only.  The indefinite articles ('un', 'una', "un'")
# cannot be suppressed: in Italian 'un cavallo' means both 'a horse' and
# 'one horse'.
ARTICLES_IT = [
    'il', 'lo', 'la',   # singular
    'i', 'gli', 'le',   # plural
]
# Italian number words (and a few quantity nouns) mapped to their value.
STRING_NUM_ITA = {
    # zero and the forms of "one"
    'zero': 0, 'un': 1, 'uno': 1, 'una': 1, "un'": 1,
    # 2 - 9
    'due': 2, 'tre': 3, 'quattro': 4, 'cinque': 5,
    'sei': 6, 'sette': 7, 'otto': 8, 'nove': 9,
    # 10 - 19
    'dieci': 10, 'undici': 11, 'dodici': 12, 'tredici': 13,
    'quattordici': 14, 'quindici': 15, 'sedici': 16,
    'diciassette': 17, 'diciotto': 18, 'diciannove': 19,
    # tens, full and elided form
    'venti': 20, 'vent': 20,
    'trenta': 30, 'trent': 30,
    'quaranta': 40, 'quarant': 40,
    'cinquanta': 50, 'cinquant': 50,
    'sessanta': 60, 'sessant': 60,
    'settanta': 70, 'settant': 70,
    'ottanta': 80, 'ottant': 80,
    'novanta': 90, 'novant': 90,
    # hundreds
    'cento': 100, 'duecento': 200, 'trecento': 300,
    'quattrocento': 400, 'cinquecento': 500, 'seicento': 600,
    'settecento': 700, 'ottocento': 800, 'novecento': 900,
    # thousands and above
    'mille': 1000, 'mila': 1000, 'centomila': 100000,
    'milione': 1000000, 'miliardo': 1000000000,
    # ordinals that are always treated as numbers
    'primo': 1, 'secondo': 2,
    # halves
    'mezzo': 0.5, 'mezza': 0.5,
    # quantity nouns
    'paio': 2,
    'decina': 10, 'decine': 10,
    'dozzina': 12, 'dozzine': 12,
    'centinaio': 100, 'centinaia': 100,
    'migliaio': 1000, 'migliaia': 1000,
}
def isFractional_it(input_str, short_scale=False):
    """
    Tell whether a single word names a fraction and, if so, return it.

    Adapted to Italian from the English version 18.8.9.

    Args:
        input_str (str): the word to check
        short_scale (bool): look the word up among the short scale
            ordinals if True, among the long scale ones if False
    Returns:
        (bool) or (float): False if the word is not a fraction, otherwise
            the value of the fraction (e.g. 'quarto' -> 0.25)
    """
    candidate = input_str.lower()
    # plurals: a final 'i' becomes 'o' ('quarti' -> 'quarto')
    if len(candidate) > 2 and candidate[-1:] == 'i':
        candidate = candidate[:-1] + "o"

    if short_scale:
        ordinal_names = SHORT_ORDINAL_STRING_IT
    else:
        ordinal_names = LONG_ORDINAL_STRING_IT

    # denominators by name; ordinals up to 2 are skipped ('secondo' is
    # not used as a fraction name here)
    denominators = {"intero": 1, "mezza": 2, "mezzo": 2}
    denominators.update({name: number
                         for number, name in ordinal_names.items()
                         if number > 2})

    if candidate not in denominators:
        return False
    return 1.0 / denominators[candidate]
def extractnumber_long_it(word):
    """
    Convert a number written as one compound Italian word to an integer.

    The word is rewritten step by step into a small arithmetic expression
    made of '+' and '*' (e.g. 'duemila' -> '2*1000') which is evaluated at
    the end.  Multiples are handled up to 'triliardi' (1e21); the table is
    prepared for names up to 1e63.

    Examples:
        milleventisette -> 1027
        diecimilaquarantuno -> 10041
        centottomiladuecentotredici -> 108213

    Args:
        word (str): the word to convert to a number
    Returns:
        (bool) or (int): the extracted number, or False if the word is
            not a number (including ordinary words that merely contain a
            number fragment, such as 'milano' or 'percento')
    """
    units = {'zero': 0, 'uno': 1, 'due': 2, 'tre': 3, 'quattro': 4,
             'cinque': 5, 'sei': 6, 'sette': 7, 'otto': 8, 'nove': 9}

    tens = {'dieci': 10, 'venti': 20, 'trenta': 30, 'quaranta': 40,
            'cinquanta': 50, 'sessanta': 60, 'settanta': 70, 'ottanta': 80,
            'novanta': 90}

    tens_short = {'vent': 20, 'trent': 30, 'quarant': 40, 'cinquant': 50,
                  'sessant': 60, 'settant': 70, 'ottant': 80, 'novant': 90}

    nums_long = {'undici': 11, 'dodici': 12, 'tredici': 13, 'quattordici': 14,
                 'quindici': 15, 'sedici': 16, 'diciassette': 17,
                 'diciotto': 18, 'diciannove': 19}

    multipli_it = collections.OrderedDict([
        # (1e63, 'deciliardi'),
        # (1e60, 'decilioni'),
        # (1e57, 'noviliardi'),
        # (1e54, 'novilioni'),
        # (1e51, 'ottiliardi'),
        # (1e48, 'ottilioni'),
        # (1e45, 'settiliardi'),
        # (1e42, 'settilioni'),
        # (1e39, 'sestiliardi'),
        # (1e36, 'sestilioni'),
        # (1e33, 'quintiliardi'),
        # (1e30, 'quintilioni'),
        # (1e27, 'quadriliardi'),
        # (1e24, 'quadrilioni'),  # yotta
        (1e21, 'triliardi'),  # zetta
        (1e18, 'trilioni'),  # exa
        (1e15, 'biliardi'),  # peta
        (1e12, 'bilioni'),  # tera
        (1e9, 'miliardi'),  # giga
        (1e6, 'milioni')  # mega
    ])

    multiplier = {}
    un_multiplier = {}

    for num in multipli_it:
        if num > 1000 and num <= 1e21:
            # plural names ('milioni')
            multiplier[multipli_it[num]] = int(num)
            # singular names: '*iardi' -> 'un*iardo', others -> 'un*e'
            if multipli_it[num][-5:-1] == 'iard':
                un_multiplier['un' + multipli_it[num][:-1] + 'o'] = int(num)
            else:
                un_multiplier['un' + multipli_it[num][:-1] + 'e'] = int(num)

    # ordinals, singular or plural (-esimo / -esimi): strip the suffix and
    # restore the vowel that was dropped from the cardinal
    if word[-5:-1] == 'esim':
        base = word[:-5]
        normalize_ita3 = {'tre': '', 'ttr': 'o', 'sei': '', 'ott': 'o'}
        normalize_ita2 = {'un': 'o', 'du': 'e', 'qu': 'e', 'tt': 'e',
                          'ov': 'e'}

        if base[-3:] in normalize_ita3:
            base += normalize_ita3[base[-3:]]
        elif base[-2:] in normalize_ita2:
            base += normalize_ita2[base[-2:]]

        word = base

    for item in un_multiplier:
        components = word.split(item, 1)
        if len(components) == 2:
            if not components[0]:  # starts with 'un' + multiple
                if not components[1]:  # e.g. 'unmilione'
                    word = str(int(un_multiplier[item]))
                else:  # e.g. 'unmilione' + something
                    word = str(int(un_multiplier[item]) +
                               extractnumber_long_it(components[1]))

    for item in multiplier:
        components = word.split(item, 1)
        if len(components) == 2:
            if not components[0]:  # starts with the multiple itself
                word = str(int(multiplier[item]) +
                           extractnumber_long_it(components[1]))
            else:
                if not components[1]:
                    word = str(extractnumber_long_it(components[0])) + '*' \
                        + str(int(multiplier[item]))
                else:
                    word = str(extractnumber_long_it(components[0])) + '*' \
                        + str(int(multiplier[item])) + '+' \
                        + str(extractnumber_long_it(components[1]))

    for item in tens:
        word = word.replace(item, '+' + str(tens[item]))

    for item in tens_short:
        word = word.replace(item, '+' + str(tens_short[item]))

    for item in nums_long:
        word = word.replace(item, '+' + str(nums_long[item]))

    # 'centotto' (108) elides the final 'o' of 'cento'; handle it before
    # the plain 'cento' replacement eats the first letter of 'otto'
    word = word.replace('centotto', '+1xx+8')
    word = word.replace('cento', '+1xx')
    word = word.replace('cent', '+1xx')
    word = word.replace('mille', '+1000')  # unmilionemille
    word = word.replace('mila', '*1000')  # unmilioneduemila

    for item in units:
        word = word.replace(item, '+' + str(units[item]))

    # resolve the 'cento' placeholders from right to left: after a value
    # ending in 0 the hundred is added, otherwise it multiplies the
    # preceding digit
    occorrenze = word.count('+1xx')
    for _ in range(0, occorrenze):
        components = word.rsplit('+1xx', 1)
        if len(components[0]) > 1 and components[0].endswith('0'):
            word = components[0] + '+100' + components[1]
        else:
            word = components[0] + '*100' + components[1]

    # everything before the last '*1000' is the number of thousands
    components = word.rsplit('*1000', 1)
    if len(components) == 2:
        if components[0].startswith('*'):  # centomila
            components[0] = components[0][1:]
        word = str(extractnumber_long_it(components[0])) + \
            '*1000' + str(components[1])

    # drop a leading operator
    if word.startswith('*') or word.startswith('+'):
        word = word[1:]

    addends = word.split('+')
    for c, _ in enumerate(addends):
        if '*' in addends[c]:
            factors = addends[c].split('*')
            try:
                result = int(factors[0]) * int(factors[1])
                if len(factors) == 3:
                    result *= int(factors[2])
            except ValueError:
                # a factor is not numeric: the word only happened to
                # contain a number fragment, so it is not a number
                return False
            addends[c] = str(result)

    # the word is a number only if every token is numeric
    if all(s.isdecimal() for s in addends):
        return sum(int(s) for s in addends)
    return False
def extractnumber_it(text, short_scale=False, ordinals=False):
    """
    Extract a number from a text string.

    Handles pronunciations in long scale and short scale, see
    https://en.wikipedia.org/wiki/Names_of_large_numbers

    The lookup table is a per-call copy of STRING_NUM_ITA extended with
    the requested scale names (and ordinals), so a call never changes the
    result of later calls made with different options.

    Args:
        text (str): the string to normalize
        short_scale (bool): use short scale if True, long scale if False
        ordinals (bool): consider ordinal numbers, third=3 instead of 1/3
    Returns:
        (int) or (float) or False: The extracted number or False if no
            number was found
    """
    # work on a copy: the module-level table must not be modified
    string_num = dict(STRING_NUM_ITA)

    # first, second...
    if ordinals:
        ordinal_names = SHORT_ORDINAL_STRING_IT if short_scale \
            else LONG_ORDINAL_STRING_IT
        for num in ordinal_names:
            string_num[ordinal_names[num]] = num

    # negate next number (-2 = 0 - 2)
    negatives = ['meno']  # 'negativo' is not usual in Italian

    # multiply the previous number (one hundred = 1 * 100)
    multiplies = ['decina', 'decine', 'dozzina', 'dozzine',
                  'centinaia', 'centinaio', 'migliaia', 'migliaio', 'mila']

    # split sentence parse separately and sum ( 2 and a half = 2 + 0.5 )
    fraction_marker = [' e ']

    # decimal marker ( 1 point 5 = 1 + 0.5)
    decimal_marker = [' punto ', ' virgola ']

    scale_names = SHORT_SCALE_IT if short_scale else LONG_SCALE_IT
    for num in scale_names:
        num_string = scale_names[num]
        string_num[num_string] = num
        multiplies.append(num_string)

    # '2 e 3/4' and similar cases
    for separator in fraction_marker:
        components = text.split(separator)
        zeros = 0

        if len(components) == 2:
            # count leading zeros in the fraction part
            sub_components = components[1].split(' ')
            for element in sub_components:
                if element == 'zero' or element == '0':
                    zeros += 1
                else:
                    break
            # ensure first is not a fraction and second is a fraction
            num1 = extractnumber_it(components[0], short_scale=short_scale,
                                    ordinals=ordinals)
            num2 = extractnumber_it(components[1], short_scale=short_scale,
                                    ordinals=ordinals)
            if num1 >= 1 and 0 < num2 < 1:
                return num1 + num2
            # 'sette e quaranta', 'sette e zero zero due'
            elif num1 >= 1 and num2 > 1:
                return num1 + num2 / pow(10, len(str(num2)) + zeros)

    # '2 punto 5'
    for separator in decimal_marker:
        zeros = 0
        components = text.split(separator)

        if len(components) == 2:
            # count leading zeros in the decimal part
            sub_components = components[1].split(' ')
            for element in sub_components:
                if element == 'zero' or element == '0':
                    zeros += 1
                else:
                    break

            # int() turns a False (nothing found) into 0
            number = int(extractnumber_it(components[0],
                                          short_scale=short_scale,
                                          ordinals=ordinals))
            decimal = int(extractnumber_it(components[1],
                                           short_scale=short_scale,
                                           ordinals=ordinals))
            return number + decimal / pow(10, len(str(decimal)) + zeros)

    all_words = text.split()
    val = False
    prev_val = None
    to_sum = []
    for idx, word in enumerate(all_words):

        if not word:
            continue
        prev_word = all_words[idx - 1] if idx > 0 else ''
        next_word = all_words[idx + 1] if idx + 1 < len(all_words) else ''

        # is this word already a number ?
        if is_numeric(word):
            val = float(word)

        # is this word the name of a number ?
        if word in string_num:
            val = string_num[word]

        # 'tre quarti', 'un quarto', 'trenta secondi'
        if isFractional_it(word) and prev_val:
            if word[:-1] == 'second' and not ordinals:
                val = prev_val * 2
            else:
                val = prev_val

        # is the prev word a number and should we multiply it?
        # twenty hundred, six hundred
        if word in multiplies:
            if not prev_val:
                prev_val = 1
            val = prev_val * val

        # is this a spoken fraction?
        # 'mezza tazza'
        if val is False:
            val = isFractional_it(word, short_scale=short_scale)

        # '2 quinti'
        if not ordinals:
            next_value = isFractional_it(next_word, short_scale=short_scale)
            if next_value:
                if not val:
                    val = 1
                val = val * next_value

        # is this a negative number?
        if val and prev_word and prev_word in negatives:
            val = 0 - val

        if not val:
            val = extractnumber_long_it(word)

        # let's make sure it isn't a fraction
        if not val:
            # look for fractions like '2/3'
            all_pieces = word.split('/')
            if look_for_fractions(all_pieces):
                val = float(all_pieces[0]) / float(all_pieces[1])
        else:
            prev_val = val
            # handle long numbers
            # six hundred sixty six
            # two million five hundred thousand
            if word in multiplies and next_word not in multiplies:
                to_sum.append(val)
                val = 0
                prev_val = 0
            elif extractnumber_long_it(word) > 100 and \
                    extractnumber_long_it(next_word) and \
                    next_word not in multiplies:
                to_sum.append(val)
                val = 0
                prev_val = 0

    if val is not None:
        for addend in to_sum:
            val = val + addend
    return val
def normalize_it(text, remove_articles):
    """
    IT string normalization.

    Collapses extra whitespace, optionally drops the definite articles
    listed in ARTICLES_IT and rewrites number words as digits
    (e.g. 'quarantadue' -> '42').  Contractions are not common in Italian
    and indefinite articles cannot be removed, so both are left alone.

    Args:
        text (str): the sentence to normalize
        remove_articles (bool): drop definite articles if True
    Returns:
        (str): the normalized sentence
    """
    # 'un paio' is ambiguous: treat it as the number two.
    # split() also takes care of repeated spaces
    tokens = text.replace('un paio', 'due').split()

    output = []
    for token in tokens:
        # Italian needs the article to tell the grammatical gender, so it
        # is only removed on request
        if remove_articles and token in ARTICLES_IT:
            continue

        if token in STRING_NUM_ITA:
            token = str(STRING_NUM_ITA[token])

        number = int(extractnumber_it(token))  # was extractnumber_long_it
        if number:
            token = str(number)

        output.append(token)

    return ' '.join(output)
def extract_datetime_it(string, dateNow, default_time):
    """
    Extract a date and/or time from an Italian sentence.

    The sentence is cleaned and split into words, then scanned twice: the
    first pass looks for dates and day/month/year offsets, the second for
    times and hour/minute/second offsets.  Recognised words are blanked
    out so that the remaining text can be returned to the caller.

    Args:
        string (str): the sentence to analyse
        dateNow (datetime): the reference moment that relative
            expressions ('domani', 'tra due ore', ...) are resolved from
        default_time: object with .hour and .minute used when no time is
            found in the sentence; may be None
    Returns:
        (list) or None: [extracted datetime, remaining text], or None when
            the string is empty, dateNow is missing or nothing date/time
            related was recognised
    """
    def clean_string(s):
        """
        Strip unneeded punctuation and capitalization, normalize Italian
        plurals and variants, drop noise words and return the word list.
        """
        symbols = ['.', ',', ';', '?', '!', 'º', 'ª', '°', 'l\'']

        for word in symbols:
            s = s.replace(word, '')

        s = s.lower().replace('á', 'a').replace('à', 'a').replace('è', "e'")\
            .replace('é', "e'").replace('ì', 'i').replace('ù', 'u')\
            .replace('ò', 'o').replace('-', ' ').replace('_', '')

        # normalize plurals and variants to simplify the analysis
        s = s.replace('secondi', 'secondo').replace('minuti', 'minuto')\
            .replace('ore', 'ora').replace('giorni', 'giorno')\
            .replace('settimane', 'settimana').replace('mesi', 'mese')\
            .replace('anni', 'anno').replace('mattino', 'mattina')\
            .replace('prossima', 'prossimo').replace('questa', 'questo')\
            .replace('quarti', 'quarto').replace('in punto', 'in_punto')\
            .replace('decennio', 'decenni').replace('secoli', 'secolo')\
            .replace('millennio', 'millenni').replace(' un ', ' uno ')\
            .replace('scorsa', 'scorso').replace('passata', 'passato')\
            .replace('uno paio', 'due')

        noise_words = ['dello', 'la', 'del', 'al', 'il', 'di', 'tra', 'lo',
                       'le', 'alle', 'alla', 'dai', 'delle', 'della',
                       'a', 'e\'', 'era', 'questa', 'questo', 'e', 'nel',
                       'nello', 'dallo', ' ']

        word_list = s.split()
        word_list = [x for x in word_list if x not in noise_words]

        # normalize spoken times: two consecutive numbers such as
        # '10 30' become '10:30'.  Only purely numeric tokens are
        # considered, which also skips a token emptied by a previous merge
        for idx in range(0, len(word_list) - 1):
            if word_list[idx].isdecimal() and word_list[idx+1].isdecimal():
                num0 = int(word_list[idx])
                num1 = int(word_list[idx+1])
                if 0 <= num0 <= 23 and 10 <= num1 <= 59:
                    word_list[idx] = str(num0) + ':' + str(num1)
                    word_list[idx+1] = ''

        word_list = [x for x in word_list if x]

        return word_list

    def date_found():
        """Tell whether any date or time information was recognised."""
        return found or \
            (datestr != '' or time_str != '' or year_offset != 0 or
             month_offset != 0 or day_offset is True or hr_offset != 0 or
             hr_abs or min_offset != 0 or min_abs or sec_offset != 0)

    if string == '' or not dateNow:
        return None

    found = False
    day_specified = False
    day_offset = False
    month_offset = 0
    year_offset = 0
    today = dateNow.strftime('%w')
    current_year = dateNow.strftime('%Y')
    from_flag = False
    datestr = ''
    has_year = False
    time_qualifier = ''
    time_qualifiers_am = ['mattina', 'stamani', 'stamane']
    time_qualifiers_pm = ['pomeriggio', 'sera', 'stasera', 'stanotte']
    time_qualifiers_list = set(time_qualifiers_am + time_qualifiers_pm)
    markers = ['alle', 'in', 'questo', 'per', 'di', 'tra', 'fra', 'entro']
    days = ['lunedi', 'martedi', 'mercoledi',
            'giovedi', 'venerdi', 'sabato', 'domenica']
    months = ['gennaio', 'febbraio', 'marzo', 'aprile', 'maggio', 'giugno',
              'luglio', 'agosto', 'settembre', 'ottobre', 'novembre',
              'dicembre']
    months_short = ['gen', 'feb', 'mar', 'apr', 'mag', 'giu', 'lug', 'ago',
                    'set', 'ott', 'nov', 'dic']
    # plurals/variants were already normalized by clean_string()
    year_multiples = ['decenni', 'secolo', 'millenni']
    time_multiples = ['ora', 'minuto', 'secondo']
    day_multiples = ['settimana', 'mese', 'anno']
    noise_words_2 = ['tra', 'di', 'per', 'fra', 'un ', 'uno', 'lo', 'del',
                     'l', 'in_punto', ' ', 'nella', 'dell']

    # words that may follow 'da' / 'dopo' in expressions such as
    # '5 giorni da domani' (loop invariant, built once)
    valid_followups = days + months + months_short + \
        ['oggi', 'domani', 'prossimo', 'passato', 'adesso']

    words = clean_string(string)

    # ---- first pass: dates and day/month/year offsets ----
    for idx, word in enumerate(words):
        if word == '':
            continue
        word_prev = words[idx - 1] if idx > 0 else ''
        word_next = words[idx + 1] if idx + 1 < len(words) else ''
        word_next_next = words[idx + 2] if idx + 2 < len(words) else ''
        start = idx
        used = 0
        if word == 'adesso' and not datestr:
            # "now": return immediately.
            # (word == 'ora' would clash with 'tra un ora')
            words = [x for x in words if x != 'adesso']
            words = [x for x in words if x]
            result_str = ' '.join(words)
            extracted_date = dateNow.replace(microsecond=0)
            return [extracted_date, result_str]

        # 'un paio di ...' or 'tra tre settimane' up to centuries
        elif extractnumber_it(word) and (word_next in year_multiples or
                                         word_next in day_multiples):
            multiplier = int(extractnumber_it(word))
            used += 2
            if word_next == 'decenni':
                year_offset = multiplier * 10
            elif word_next == 'secolo':
                year_offset = multiplier * 100
            elif word_next == 'millenni':
                year_offset = multiplier * 1000
            elif word_next == 'anno':
                year_offset = multiplier
            elif word_next == 'mese':
                month_offset = multiplier
            elif word_next == 'settimana':
                day_offset = multiplier * 7
        # save the time qualifier for the second pass
        elif word in time_qualifiers_list:
            time_qualifier = word
        # parse today, tomorrow, day after tomorrow
        elif word == 'oggi' and not from_flag:
            day_offset = 0
            used += 1
        elif word == 'domani' and not from_flag:
            day_offset = 1
            used += 1
        elif word == 'ieri' and not from_flag:
            day_offset -= 1
            used += 1
        elif word == 'dopodomani' and not from_flag:  # after tomorrow
            day_offset += 2
            used += 1
        elif word == 'dopo' and word_next == 'domani' and not from_flag:
            day_offset += 1
            used += 2
        elif word == 'giorno':
            # word_prev is '' when 'giorno' is the first word
            if word_prev and word_prev[0].isdigit():
                day_offset += int(word_prev)
                start -= 1
                used = 2
                if word_next == 'dopo' and word_next_next == 'domani':
                    day_offset += 1
                    used += 2
        elif word == 'settimana' and not from_flag:
            if word_prev == 'prossimo':
                day_offset = 7
                start -= 1
                used = 2
            elif word_prev == 'passato' or word_prev == 'scorso':
                day_offset = -7
                start -= 1
                used = 2
            elif word_next == 'prossimo':
                day_offset = 7
                used += 2
            elif word_next == 'passato' or word_next == 'scorso':
                day_offset = -7
                used += 2
        # parse next month, last month
        elif word == 'mese' and not from_flag:
            if word_prev == 'prossimo':
                month_offset = 1
                start -= 1
                used = 2
            elif word_prev == 'passato' or word_prev == 'scorso':
                month_offset = -1
                start -= 1
                used = 2
            elif word_next == 'prossimo':
                month_offset = 1
                used += 2
            elif word_next == 'passato' or word_next == 'scorso':
                month_offset = -1
                used += 2
        # parse next year, last year
        elif word == 'anno' and not from_flag:
            if word_prev == 'prossimo':  # prossimo anno
                year_offset = 1
                start -= 1
                used = 2
            elif word_next == 'prossimo':  # anno prossimo
                year_offset = 1
                used = 2
            elif word_prev == 'passato' or word_prev == 'scorso':
                year_offset = -1
                start -= 1
                used = 2
            elif word_next == 'passato' or word_next == 'scorso':
                year_offset = -1
                used = 2
        # parse next decade, last decade
        elif word == 'decenni' and not from_flag:
            if word_prev == 'prossimo':  # prossimo decennio
                year_offset = 10
                start -= 1
                used = 2
            elif word_next == 'prossimo':  # decennio prossimo
                year_offset = 10
                used = 2
            elif word_prev == 'passato' or word_prev == 'scorso':
                year_offset = -10
                start -= 1
                used = 2
            elif word_next == 'passato' or word_next == 'scorso':
                year_offset = -10
                used = 2
        # parse Monday, Tuesday, etc., and next Monday,
        # last Tuesday, etc.
        elif word in days and not from_flag:
            ddd = days.index(word)
            day_offset = (ddd + 1) - int(today)
            used = 1
            if day_offset < 0:
                day_offset += 7
            if word_prev == 'prossimo':
                day_offset += 7
                start -= 1
                used += 1
            elif word_prev == 'passato' or word_prev == 'scorso':
                day_offset -= 7
                start -= 1
                used += 1
            if word_next == 'prossimo':
                day_offset += 7
                used += 1
            elif word_next == 'passato' or word_next == 'scorso':
                day_offset -= 7
                used += 1
        # parse 15 of July, June 20th, Feb 18, 19 of February
        # NOTE(review): 'and' binds tighter than 'or', so from_flag only
        # guards the short month names; kept as is, confirm the intent
        elif word in months or word in months_short and not from_flag:
            try:
                mmm = months.index(word)
            except ValueError:
                mmm = months_short.index(word)
            used += 1
            datestr = months[mmm]
            if word_prev and extractnumber_it(word_prev):
                datestr += ' ' + str(int(extractnumber_it(word_prev)))
                start -= 1
                used += 1
                if word_next and extractnumber_it(word_next):
                    datestr += ' ' + str(int(extractnumber_it(word_next)))
                    used += 1
                    has_year = True
                else:
                    has_year = False
            elif word_next and word_next[0].isdigit():
                datestr += ' ' + word_next
                used += 1
                if word_next_next and word_next_next[0].isdigit():
                    datestr += ' ' + word_next_next
                    used += 1
                    has_year = True
                else:
                    has_year = False

        # parse 5 days from tomorrow, 10 weeks from next thursday,
        # 2 months from July
        if (word == 'da' or word == 'dopo') and word_next in valid_followups:
            used = 0
            from_flag = True
            if word_next == 'domani':
                day_offset += 1
                used += 2
            elif word_next == 'oggi' or word_next == 'adesso':
                used += 2
            elif word_next in days:
                ddd = days.index(word_next)
                tmp_offset = (ddd + 1) - int(today)
                used += 2
                if tmp_offset < 0:
                    tmp_offset += 7
                if word_next_next == 'prossimo':
                    tmp_offset += 7
                    used += 1
                elif word_next_next == 'passato' or \
                        word_next_next == 'scorso':
                    tmp_offset = (ddd + 1) - int(today)
                    used += 1
                day_offset += tmp_offset
            elif word_next_next and word_next_next in days:
                ddd = days.index(word_next_next)
                tmp_offset = (ddd + 1) - int(today)
                if word_next == 'prossimo':
                    tmp_offset += 7
                # elif word_next == 'passato' or word_next == 'scorso':
                #     tmp_offset -= 7
                day_offset += tmp_offset
                used += 3

        if used > 0:
            if start - 1 > 0 and words[start - 1] == 'questo':
                start -= 1
                used += 1

            # blank out the words that were consumed
            for i in range(0, used):
                words[i + start] = ''

            if start - 1 >= 0 and words[start - 1] in markers:
                words[start - 1] = ''
            found = True
            day_specified = True

    # ---- second pass: times and hour/minute/second offsets ----
    # hr_abs / min_abs: None = not set, -1 = a relative offset was given
    # so no absolute time must be applied
    time_str = ''
    hr_offset = 0
    min_offset = 0
    sec_offset = 0
    hr_abs = None
    min_abs = None
    military = False

    for idx, word in enumerate(words):
        if word == '':
            continue
        word_prev_prev = words[idx - 2] if idx > 1 else ''
        word_prev = words[idx - 1] if idx > 0 else ''
        word_next = words[idx + 1] if idx + 1 < len(words) else ''
        word_next_next = words[idx + 2] if idx + 2 < len(words) else ''
        # parse noon, midnight, morning, afternoon, evening
        used = 0
        if word == 'mezzogiorno':
            hr_abs = 12
            used += 1
        elif word == 'mezzanotte':
            hr_abs = 24
            used += 1
        if word == 'mezzo' and word_next == 'giorno':
            hr_abs = 12
            used += 2
        elif word == 'mezza' and word_next == 'notte':
            hr_abs = 24
            used += 2
        elif word == 'mattina':
            if not hr_abs:
                hr_abs = 8
            used += 1
            if word_next and word_next[0].isdigit():  # mattina alle 5
                hr_abs = int(word_next)
                used += 1
        elif word == 'pomeriggio':
            if not hr_abs:
                hr_abs = 15
            used += 1
            if word_next and word_next[0].isdigit():  # pomeriggio alle 5
                hr_abs = int(word_next)
                used += 1
                if (hr_abs or 0) < 12:
                    hr_abs = (hr_abs or 0) + 12
        elif word == 'sera':
            if not hr_abs:
                hr_abs = 19
            used += 1
            if word_next and word_next[0].isdigit() \
                    and ':' not in word_next:
                hr_abs = int(word_next)
                used += 1
                if (hr_abs or 0) < 12:
                    hr_abs = (hr_abs or 0) + 12
        # 'presto' / 'tardi' shift an hour that is already known; with no
        # hour yet there is nothing to shift.  To be verified more deeply
        elif word == 'presto' and hr_abs is not None:
            hr_abs -= 1
            used += 1
        elif word == 'tardi' and hr_abs is not None:
            hr_abs += 1
            used += 1
        # 'un paio di minuti', 'tra cinque minuti', 'tra 5 ore'
        elif extractnumber_it(word) and (word_next in time_multiples):
            d_time = int(extractnumber_it(word))
            used += 2
            if word_next == 'ora':
                hr_offset = d_time
                isTime = False
                hr_abs = -1
                min_abs = -1
            elif word_next == 'minuto':
                min_offset = d_time
                isTime = False
                hr_abs = -1
                min_abs = -1
            elif word_next == 'secondo':
                sec_offset = d_time
                isTime = False
                hr_abs = -1
                min_abs = -1
        elif word == 'mezzora':
            min_offset = 30
            used = 1
            isTime = False
            hr_abs = -1
            min_abs = -1
        # 'un quarto ora', 'tre quarto ora'
        elif extractnumber_it(word) and word_next and \
                word_next == 'quarto' and word_next_next == 'ora':
            if int(extractnumber_it(word)) == 1 \
                    or int(extractnumber_it(word)) == 3:
                min_offset = 15 * int(extractnumber_it(word))
            else:  # guard against meaningless values
                min_offset = 15
            used = 3
            isTime = False
            hr_abs = -1
            min_abs = -1
        elif word[0].isdigit():
            isTime = True
            str_hh = ''
            str_mm = ''
            remainder = ''
            if ':' in word:
                # parse colons
                # '3:00 in the morning'
                components = word.split(':')
                if len(components) == 2:
                    num0 = int(extractnumber_it(components[0]))
                    num1 = int(extractnumber_it(components[1]))
                    if num0 is not False and num1 is not False \
                            and 0 <= num0 <= 23 and 0 <= num1 <= 59:
                        str_hh = str(num0)
                        str_mm = str(num1)
            elif 0 < int(extractnumber_it(word)) < 24 \
                    and word_next != 'quarto':
                str_hh = str(int(word))
                str_mm = '00'
            elif 100 <= int(word) <= 2400:
                # military time, e.g. '1530': integer arithmetic keeps
                # the minutes
                str_hh = str(int(word) // 100)
                str_mm = str(int(word) % 100)
                military = True
                isTime = False
            if extractnumber_it(word) and word_next \
                    and word_next == 'quarto' and word_next_next != 'ora':
                if int(extractnumber_it(word)) == 1 \
                        or int(extractnumber_it(word)) == 3:
                    str_mm = str(15 * int(extractnumber_it(word)))
                else:  # guard against meaningless values
                    str_mm = '0'
                str_hh = str(hr_abs)
                used = 2
                words[idx + 1] = ''
                isTime = False
            if extractnumber_it(word) and word_next \
                    and word_next == 'in_punto':
                str_hh = str(int(extractnumber_it(word)))
                used = 2
            if word_next == 'pm':
                remainder = 'pm'
                hr_abs = int(str_hh)
                min_abs = int(str_mm)
                if hr_abs <= 12:
                    hr_abs = hr_abs + 12
                used = 2
            elif word_next == 'am':
                remainder = 'am'
                hr_abs = int(str_hh)
                min_abs = int(str_mm)
                used = 2
            elif word_next == 'mattina':
                # ' 11 del mattina'
                hh = int(str_hh)
                mm = int(str_mm)
                used = 2
                remainder = 'am'
                isTime = False
                hr_abs = hh
                min_abs = mm
            elif word_next == 'pomeriggio':
                # ' 2 del pomeriggio'
                hh = int(str_hh)
                mm = int(str_mm)
                if hh < 12:
                    hh += 12
                used = 2
                remainder = 'pm'
                isTime = False
                hr_abs = hh
                min_abs = mm
            elif word_next == 'sera':
                # 'alle 8 di sera'
                hh = int(str_hh)
                mm = int(str_mm)
                if hh < 12:
                    hh += 12
                used = 2
                remainder = 'pm'
                isTime = False
                hr_abs = hh
                min_abs = mm
            elif word_next == 'notte':
                hh = int(str_hh)
                mm = int(str_mm)
                if hh > 5:
                    remainder = 'pm'
                else:
                    remainder = 'am'
                used = 2
                isTime = False
                hr_abs = hh
                min_abs = mm
            # parse half an hour : undici e mezza
            elif word_next and word_next == 'mezza':
                hr_abs = int(str_hh)
                min_abs = 30
                used = 2
                isTime = False
            elif word_next and word_next == 'in_punto':
                hr_abs = int(str_hh)
                min_abs = 0
                str_mm = '0'
                used = 2
                isTime = False
            else:
                # 17:30
                remainder = ''
                hr_abs = int(str_hh)
                min_abs = int(str_mm)
                used = 1
                isTime = False
                if word_prev == 'ora':
                    words[idx - 1] = ''

            if time_qualifier != '':
                # military = True
                if str_hh and int(str_hh) <= 12 and \
                        (time_qualifier in time_qualifiers_pm):
                    str_hh = str(int(str_hh) + 12)
            else:
                isTime = False

            str_hh = int(str_hh) if str_hh else 0
            str_mm = int(str_mm) if str_mm else 0

            str_hh = str_hh + 12 if remainder == 'pm' \
                and str_hh < 12 else str_hh
            str_hh = str_hh - 12 if remainder == 'am' \
                and str_hh >= 12 else str_hh

            if (not military and
                    remainder not in ['am', 'pm'] and
                    ((not day_specified) or day_offset < 1)):
                # ambiguous time, detect whether they mean this evening or
                # the next morning based on whether it has already passed
                hr_abs = str_hh
                if dateNow.hour < str_hh:
                    pass  # No modification needed
                elif dateNow.hour < str_hh + 12:
                    str_hh += 12
                    hr_abs = str_hh
                else:
                    # has passed, assume the next morning
                    day_offset += 1

            if time_qualifier in time_qualifiers_pm and str_hh < 12:
                str_hh += 12

            if str_hh > 24 or str_mm > 59:
                isTime = False
                used = 0
            if isTime:
                hr_abs = str_hh * 1
                min_abs = str_mm * 1
                used += 1

            if (hr_abs or 0) <= 12 and (time_qualifier == 'sera' or
                                        time_qualifier == 'pomeriggio'):
                hr_abs = (hr_abs or 0) + 12

        if used > 0:
            # remove parsed words from the sentence
            for i in range(used):
                words[idx + i] = ''

            if word_prev == 'o' or word_prev == 'oh':
                words[words.index(word_prev)] = ''

            if idx > 0 and word_prev in markers:
                words[idx - 1] = ''
            if idx > 1 and word_prev_prev in markers:
                words[idx - 2] = ''

            found = True

    # check that we found a date (date_found must be *called*: the bare
    # function object is always truthy)
    if not date_found():
        return None

    if day_offset is False:
        day_offset = 0

    # perform date manipulation
    extracted_date = dateNow.replace(microsecond=0)

    if datestr != '':
        # strptime needs English month names for %B
        en_months = ['january', 'february', 'march', 'april', 'may', 'june',
                     'july', 'august', 'september', 'october', 'november',
                     'december']
        en_months_short = ['jan', 'feb', 'mar', 'apr', 'may', 'june', 'july',
                           'aug', 'sept', 'oct', 'nov', 'dec']

        for idx, en_month in enumerate(en_months):
            datestr = datestr.replace(months[idx], en_month)

        for idx, en_month in enumerate(en_months_short):
            datestr = datestr.replace(months_short[idx], en_month)

        try:
            temp = datetime.strptime(datestr, '%B %d')
        except ValueError:
            # Try again, allowing the year
            temp = datetime.strptime(datestr, '%B %d %Y')
        extracted_date = extracted_date.replace(hour=0, minute=0, second=0)
        if not has_year:
            temp = temp.replace(year=extracted_date.year,
                                tzinfo=extracted_date.tzinfo)
            # a date already passed this year refers to next year
            if extracted_date < temp:
                extracted_date = extracted_date.replace(
                    year=int(current_year),
                    month=int(temp.strftime('%m')),
                    day=int(temp.strftime('%d')),
                    tzinfo=extracted_date.tzinfo)
            else:
                extracted_date = extracted_date.replace(
                    year=int(current_year) + 1,
                    month=int(temp.strftime('%m')),
                    day=int(temp.strftime('%d')),
                    tzinfo=extracted_date.tzinfo)
        else:
            extracted_date = extracted_date.replace(
                year=int(temp.strftime('%Y')),
                month=int(temp.strftime('%m')),
                day=int(temp.strftime('%d')),
                tzinfo=extracted_date.tzinfo)
    else:
        # ignore the current HH:MM:SS if relative using days or greater
        if hr_offset == 0 and min_offset == 0 and sec_offset == 0:
            extracted_date = extracted_date.replace(hour=0, minute=0, second=0)

    if year_offset != 0:
        extracted_date = extracted_date + relativedelta(years=year_offset)
    if month_offset != 0:
        extracted_date = extracted_date + relativedelta(months=month_offset)
    if day_offset != 0:
        extracted_date = extracted_date + relativedelta(days=day_offset)
    if hr_abs != -1 and min_abs != -1:
        # If no time was supplied in the string set the time to default
        # time if it's available
        if hr_abs is None and min_abs is None and default_time is not None:
            hr_abs, min_abs = default_time.hour, default_time.minute
        else:
            hr_abs = hr_abs or 0
            min_abs = min_abs or 0

        extracted_date = extracted_date + relativedelta(hours=hr_abs,
                                                        minutes=min_abs)
        if (hr_abs != 0 or min_abs != 0) and datestr == '':
            if not day_specified and dateNow > extracted_date:
                extracted_date = extracted_date + relativedelta(days=1)
    if hr_offset != 0:
        extracted_date = extracted_date + relativedelta(hours=hr_offset)
    if min_offset != 0:
        extracted_date = extracted_date + relativedelta(minutes=min_offset)
    if sec_offset != 0:
        extracted_date = extracted_date + relativedelta(seconds=sec_offset)

    words = [x for x in words if x not in noise_words_2]
    words = [x for x in words if x]
    result_str = ' '.join(words)

    return [extracted_date, result_str]
def get_gender_it(word, raw_string=""):
    """
    Guess the grammatical gender of an Italian word.

    In Italian the gender is better told by the article that precedes the
    word than by the last letter of the word alone.  When the word appears
    in raw_string with another word before it, the gender is guessed from
    that preceding word; otherwise (or if that gives no answer) from the
    ending of the word itself.
    TODO: check if useful

    Args:
        word (str): the word to analyse
        raw_string (str): the sentence the word comes from (optional)
    Returns:
        (str) or None: 'f', 'm' or None when the gender cannot be guessed
            (including an empty word)
    """
    gender = None
    # split() ignores repeated or leading spaces, so the "previous word"
    # is never an empty string
    words = raw_string.split()
    for idx, w in enumerate(words):
        if w == word and idx != 0:
            gender = get_gender_it(words[idx - 1])
            break

    if not gender:
        # str.endswith() is simply False for an empty word
        if word.endswith(('a', 'e')):
            gender = 'f'
        if word.endswith(('o', 'n', 'l', 'i')):
            gender = 'm'

    return gender
def extract_numbers_it(text, short_scale=False, ordinals=False):
    """
    Take a string and extract a list of the numbers it contains.

    The work is delegated to extract_numbers_generic, which receives the
    Italian pronounce_number_it and extractnumber_it functions.

    Args:
        text (str): the string to extract numbers from
        short_scale (bool): use "short scale" if True, "long scale" if
            False, for large numbers -- over a million.  The default
            (False) is the long scale.
            See https://en.wikipedia.org/wiki/Names_of_large_numbers
        ordinals (bool): consider ordinal numbers, e.g. third=3 instead
            of 1/3
    Returns:
        list: list of extracted numbers (as produced by
            extract_numbers_generic; presumably floats -- verify there)
    """
    return extract_numbers_generic(text, pronounce_number_it, extractnumber_it,
                                   short_scale=short_scale, ordinals=ordinals)