# -*- coding: utf-8 -*-
# Copyright 2017 Mycroft AI Inc.
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# See the License for the specific language governing permissions and
# limitations under the License.
"""
Parse functions for Italian (IT-IT)

TODO: numbers greater than 999999
TODO: it_number_parse
TODO: it_pruning
"""
from datetime import datetime
from dateutil.relativedelta import relativedelta
from mycroft.util.lang.parse_common import is_numeric, look_for_fractions
# Indefinite articles ["un", "una", "un'"] cannot be suppressed:
# in Italian, "un cavallo" means both "a horse" and "one horse".
it_articles = ["il", "lo", "la", "i", "gli", "le"]

# Italian number words and their integer values.  The truncated tens
# ("vent", "trent", ...) are the forms used inside compounds such as
# "ventuno" or "trentotto", where the final vowel is elided.
it_numbers = {
    "zero": 0,
    "un": 1,
    "uno": 1,
    "una": 1,
    "un'": 1,
    "due": 2,
    "tre": 3,
    "quattro": 4,
    "cinque": 5,
    "sei": 6,
    "sette": 7,
    "otto": 8,
    "nove": 9,
    "dieci": 10,
    "undici": 11,
    "dodici": 12,
    "tredici": 13,
    "quattordici": 14,
    "quindici": 15,
    "sedici": 16,
    "diciassette": 17,
    "diciotto": 18,
    "diciannove": 19,
    "venti": 20,
    "vent": 20,
    "trenta": 30,
    "trent": 30,
    "quaranta": 40,
    "quarant": 40,
    "cinquanta": 50,
    "cinquant": 50,
    "sessanta": 60,
    "sessant": 60,
    "settanta": 70,
    "settant": 70,
    "ottanta": 80,
    "ottant": 80,
    "novanta": 90,
    "novant": 90,
    "cento": 100,
    "duecento": 200,
    "trecento": 300,
    "quattrocento": 400,
    "cinquecento": 500,
    "seicento": 600,
    "settecento": 700,
    "ottocento": 800,
    "novecento": 900,
    "primo": 1,
    "secondo": 2,
    "mille": 1000,
    "mila": 1000,
}


# Fraction stems (the word without its final gender/number vowel) mapped to
# their denominator.  "quattrodicesim" and "diciasettesim" are misspellings
# that earlier versions accepted; they are kept next to the correct forms so
# that existing callers keep working.
_IT_FRACTION_DENOMINATORS = {
    "mezz": 2,
    "terz": 3,
    "quart": 4,
    "quint": 5,
    "sest": 6,
    "settim": 7,
    "ottav": 8,
    "non": 9,
    "decim": 10,
    "undicesim": 11,
    "dodicesim": 12,
    "tredicesim": 13,
    "quattrodicesim": 14,
    "quattordicesim": 14,
    "quindicesim": 15,
    "sedicesim": 16,
    "diciasettesim": 17,
    "diciassettesim": 17,
    "diciottesim": 18,
    "diciannovesim": 19,
    "ventesim": 20,
    "centesim": 100,
    "millesim": 1000,
}


def isFractional_it(input_str):
    """Check whether an Italian word names a fraction.

    The last character of the word (the "o" of "terzo", the "i" of
    "terzi", ...) is dropped and the remaining stem is looked up,
    case-insensitively, among the known fraction stems.

    Adapted from the Portuguese parser.

    Args:
        input_str (str): the word to check.

    Returns:
        (bool) or (float): False if the word is not a fraction, otherwise
        the value of the fraction (e.g. 0.5 for "mezzo").

    TODO: verify that plurals are handled correctly
    """
    # Lower-case once so that lookup and match always agree on the stem.
    stem = input_str[:-1].lower()
    denominator = _IT_FRACTION_DENOMINATORS.get(stem)
    if denominator is None:
        return False
    return 1.0 / denominator


def extractnumber_long_it(word):
    """Convert a compound Italian number word into its integer value.

    The word is split into a known number word at the end (the suffix) and
    a known number word at the start (the prefix), e.g.
        ventisette -> venti + sette -> 27
        quarantuno -> quarant + uno -> 41
        duemila    -> due * mila    -> 2000

    Args:
        word (str): the word to convert.

    Returns:
        (int) or (bool): the value obtained using the whole word, or False
        when the word is not a two-part compound number (e.g. "qualcuno",
        or a plain number word such as "tre", which has no prefix).
    """
    # Try the longest suffix first: "centodiciotto" must be read as
    # cento + diciotto, not as "centodici" + otto.  This also makes the
    # result independent of the dict's iteration order.
    for suffix in sorted(it_numbers, key=len, reverse=True):
        if not word.endswith(suffix):
            continue
        prefix = word[:len(word) - len(suffix)]
        if prefix not in it_numbers:
            # What is left is not a number word; another suffix may fit.
            continue
        if suffix == "mila":
            # "mila" multiplies what precedes it instead of adding to it.
            return it_numbers[prefix] * it_numbers[suffix]
        return it_numbers[prefix] + it_numbers[suffix]

    return False  # not a number, e.g. "qualcuno"


def extractnumber_it(text):
    """Extract the number expressed by an Italian phrase.

    The phrase is scanned word by word.  Number words, digit strings,
    "n/d" fractions, fraction words ("mezzo", "terzo", ...) and compound
    number words ("ventuno") are accumulated into one value.  The word "e"
    and the decimal separators "punto", "virgola", "." and "," combine the
    value found so far with the number that follows them.

    This is not a usual way of writing numbers in Italian, but it can be
    useful for Mycroft.  Adapted from the Portuguese parser.

    Args:
        text (str): the phrase to analyse.

    Returns:
        (int) or (float) or (bool): the extracted value (a float whose
        fractional part is zero is returned as int), or False when no
        number was found.
    """
    aWords = text.split()
    count = 0
    result = None
    while count < len(aWords):
        val = 0
        word = aWords[count]
        next_word = aWords[count + 1] if count + 1 < len(aWords) else None
        next_next_word = (aWords[count + 2]
                          if count + 2 < len(aWords) else None)

        # is current word a number?
        if word in it_numbers:
            if word == "mila":
                # "mila" multiplies what was accumulated so far; with
                # nothing before it, it simply counts as one thousand.
                multiplier = 1 if result is None else result
                val = multiplier * it_numbers[word]
                result = 0
            else:
                val = it_numbers[word]
        elif word.isdigit():  # doesn't work with decimals
            val = int(word)
        elif is_numeric(word):
            val = float(word)
        else:
            fraction = isFractional_it(word)
            if fraction:
                if count > 0 and aWords[count - 1] == "il":
                    # "un terzo" is 1/3 but "il terzo" is 3.  round() is
                    # used because float floor division is off by one for
                    # some denominators (1.0 // 0.1 == 9.0).
                    result = int(round(1.0 / fraction))
                else:
                    result = (result or 1) * fraction
                count += 1
                continue

        if not val:
            # look for fractions like "2/3"
            aPieces = word.split('/')
            if look_for_fractions(aPieces):
                val = float(aPieces[0]) / float(aPieces[1])

        if not val:
            # look for compound numbers such as ventuno, ventitre
            val = extractnumber_long_it(word)

        if val:
            if result is None:
                result = 0
            result += val

        if next_word is None:
            break

        # number word and fraction
        ands = ["e"]
        if next_word in ands:
            if result is None:
                count += 1
                continue
            newWords = aWords[count + 2:]
            afterAndVal = extractnumber_it(" ".join(newWords))
            if afterAndVal:
                zeros = 0
                if result < afterAndVal or result < 20:
                    # The part after "e" is read as decimal digits: scale
                    # it below 1 and account for spoken leading zeros.
                    while afterAndVal > 1:
                        afterAndVal = afterAndVal / 10.0
                    zeros = _count_leading_zero_words_it(newWords)
                for _ in range(zeros):
                    afterAndVal = afterAndVal / 10.0
                result += afterAndVal
                break
        elif next_next_word is not None:
            if next_next_word in ands:
                afterAndVal = extractnumber_it(" ".join(aWords[count + 3:]))
                if afterAndVal:
                    if result is None:
                        result = 0
                    result += afterAndVal
                    break

        decimals = ["punto", "virgola", ".", ","]
        # Without an integer part there is nothing to attach decimals to;
        # keep scanning instead of building an invalid float string.
        if next_word in decimals and result is not None:
            newWords = aWords[count + 2:]
            digits = "0" * _count_leading_zero_words_it(newWords)
            afterDotVal = extractnumber_it(" ".join(newWords))
            if afterDotVal is not False:
                digits += str(afterDotVal)
            # Only plain digit strings on both sides form a valid decimal.
            if str(result).isdigit() and digits.isdigit():
                result = float(str(result) + "." + digits)
            break
        count += 1

    if result is None:
        return False

    if "." in str(result):
        integer, dec = str(result).split(".")
        # cast float to int
        if dec == "0":
            result = int(integer)

    return result


def _count_leading_zero_words_it(words):
    """Count how many of the first words are "zero" or "0"."""
    zeros = 0
    for candidate in words:
        if candidate != "zero" and candidate != "0":
            break
        zeros += 1
    return zeros


def normalize_it(text, remove_articles):
    """Normalize an Italian string.

    The text is split on whitespace (which also collapses repeated
    spaces), definite articles are optionally dropped, and number words
    are replaced by digits, e.g. "quarantadue" -> "42".  Contractions are
    not common in Italian, so none are expanded.

    Args:
        text (str): the string to normalize.
        remove_articles (bool): whether to drop the definite articles
            listed in it_articles.  Indefinite articles are never removed
            because they double as the number one.

    Returns:
        (str): the normalized words joined by single spaces.
    """
    normalized = []
    for word in text.split():
        # Italian needs the article to define the gender, so articles are
        # removed only on request.
        if remove_articles and word in it_articles:
            continue

        if word in it_numbers:
            word = str(it_numbers[word])
        else:
            val = extractnumber_long_it(word)
            if val:
                word = str(val)

        normalized.append(word)

    return " ".join(normalized)


def extract_datetime_it(string, currentDate=None):
# Extract a date/time from an Italian sentence.
#
# Parameters:
#   string      - the sentence to analyse
#   currentDate - reference datetime; a default is assigned below when None
# Returns None for an empty string, otherwise the two-element list
# [extractedDate, resultStr], where resultStr is the sentence with the
# consumed words removed.
#
# NOTE(review): this copy of the function has lost its indentation and
# appears to be missing several lines (for example the `try:` that should
# precede `except ValueError:` in the month branch, and the closing brackets
# of some multi-line literals and calls).  Restore it from a known-good
# revision before relying on it.
def clean_string(s):
cleans the input string of unneeded punctuation and capitalization
among other things.
Normalize italian plurals
symbols = [".", ",", ";", "?", "!", u"º", u"ª", u"°"]
for word in symbols:
s = s.replace(word, "")
# NOTE(review): the argument list of this replace() chain is incomplete here.
s = s.lower().replace(
" ").replace(
noise_words = ["tra", "la", "del", "al", "il", "di",
"le", "per", "alle", "alla", "dai", "delle",
"a", "e'", "era", "questa", "questo", "e"]
# Noise words are removed only when surrounded by spaces.
for word in noise_words:
s = s.replace(" " + word + " ", " ")
# normalize plurals to simplify the analysis
# NOTE(review): the arguments of this replace() call are missing here.
s = s.replace(
wordList = s.split()
# print(wordList) # debug only
return wordList
# True when any of the date/time flags of the enclosing function was set.
def date_found():
return found or \
datestr != "" or timeStr != "" or
yearOffset != 0 or monthOffset != 0 or
dayOffset is True or hrOffset != 0 or
hrAbs != 0 or minOffset != 0 or
minAbs != 0 or secOffset != 0
if string == "":
return None
if currentDate is None:
# NOTE(review): the right-hand side of this assignment is missing here.
currentDate =
found = False
daySpecified = False
# dayOffset starts as False (not 0) so "no day given" can be told apart
# from "today"; it is converted to 0 before the date arithmetic below.
dayOffset = False
monthOffset = 0
yearOffset = 0
dateNow = currentDate
# "%w" is the weekday as a decimal number string, Sunday being "0".
today = dateNow.strftime("%w")
currentYear = dateNow.strftime("%Y")
fromFlag = False
datestr = ""
hasYear = False
timeQualifier = ""
timeQualifiersList = ['mattina', 'pomeriggio', 'sera']
markers = ['alle', 'in', 'questo', 'per', 'di']
days = ['lunedi', 'martedi', 'mercoledi',
'giovedi', 'venerdi', 'sabato', 'domenica']
months = ['gennaio', 'febbraio', 'marzo', 'aprile', 'maggio', 'giugno',
'luglio', 'agosto', 'settembre', 'ottobre', 'novembre',
monthsShort = ['gen', 'feb', 'mar', 'apr', 'mag', 'giu', 'lug', 'ago',
'set', 'ott', 'nov', 'dic']
words = clean_string(string)
# First pass: date expressions.  Consumed words are blanked in `words`.
for idx, word in enumerate(words):
if word == "":
wordPrevPrev = words[idx - 2] if idx > 1 else ""
wordPrev = words[idx - 1] if idx > 0 else ""
wordNext = words[idx + 1] if idx + 1 < len(words) else ""
wordNextNext = words[idx + 2] if idx + 2 < len(words) else ""
# wordNextNextNext = words[idx + 3] if idx + 3 < len(words) else ""
# can there be cases that need a three-word lookahead?
# `start` is the index of the first consumed word, `used` how many words
# this iteration consumed.
start = idx
used = 0
# save timequalifier for later
if word in timeQualifiersList:
timeQualifier = word
# parse today, tomorrow, day after tomorrow
elif word == "oggi" and not fromFlag:
dayOffset = 0
used += 1
elif word == "domani" and not fromFlag:
dayOffset = 1
used += 1
elif word == "ieri" and not fromFlag:
dayOffset -= 1
used += 1
elif word == "dopodomani" and not fromFlag: # after tomorrow
dayOffset += 2
used += 1
elif word == "dopo" and wordNext == "domani" and \
not fromFlag:
dayOffset += 1
used += 2
# NOTE(review): wordPrev may be "" for the first word, in which case
# wordPrev[0] raises IndexError here and in the branches below.
elif word == "giorno":
if wordPrev[0].isdigit():
dayOffset += int(wordPrev)
start -= 1
used = 2
if wordNext == "dopo" and wordNextNext == "domani":
dayOffset += 1
used += 2
elif word == "settimana" and not fromFlag:
if wordPrev[0].isdigit():
dayOffset += int(wordPrev) * 7
start -= 1
used = 2
elif wordPrev == "prossimo":
dayOffset = 7
start -= 1
used = 2
elif wordPrev == "passato":
dayOffset = -7
start -= 1
used = 2
# parse 10 months, next month, last month
elif word == "mese" and not fromFlag:
if wordPrev[0].isdigit():
monthOffset = int(wordPrev)
start -= 1
used = 2
elif wordPrev == "prossimo":
monthOffset = 1
start -= 1
used = 2
elif wordPrev == "passato":
monthOffset = -1
start -= 1
used = 2
# parse 5 years, next year, last year
elif word == "anno" and not fromFlag:
if wordPrev[0].isdigit():
yearOffset = int(wordPrev)
start -= 1
used = 2
elif wordPrev == "prossimo":
yearOffset = 1
start -= 1
used = 2
elif wordPrev == "passato":
yearOffset = -1
start -= 1
used = 2
# parse Monday, Tuesday, etc., and next Monday,
# last Tuesday, etc.
elif word in days and not fromFlag:
d = days.index(word)
# `days` starts at Monday (index 0), hence the + 1 to match "%w".
dayOffset = (d + 1) - int(today)
used = 1
if dayOffset < 0:
dayOffset += 7
if wordPrev == "prossimo":
dayOffset += 7
used += 1
start -= 1
elif wordPrev == "passato":
dayOffset -= 7
used += 1
start -= 1
if wordNext == "prossimo":
# dayOffset += 7
used += 1
elif wordNext == "passato":
# dayOffset -= 7
used += 1
# parse 15 of July, June 20th, Feb 18, 19 of February
# NOTE(review): `and` binds tighter than `or`, so `not fromFlag` only
# guards the monthsShort test - confirm that this is intended.
elif word in months or word in monthsShort and not fromFlag:
m = months.index(word)
except ValueError:
m = monthsShort.index(word)
used += 1
datestr = months[m]
if wordPrev and (wordPrev[0].isdigit()):
datestr += " " + wordPrev
start -= 1
used += 1
if wordNext and wordNext[0].isdigit():
datestr += " " + wordNext
used += 1
hasYear = True
hasYear = False
elif wordNext and wordNext[0].isdigit():
datestr += " " + wordNext
used += 1
if wordNextNext and wordNextNext[0].isdigit():
datestr += " " + wordNextNext
used += 1
hasYear = True
hasYear = False
# parse 5 days from tomorrow, 10 weeks from next thursday,
# 2 months from July
validFollowups = days + months + monthsShort
if (word == "da" or word == "dopo") and wordNext in validFollowups:
used = 2
fromFlag = True
if wordNext == "domani":
dayOffset += 1
elif wordNext in days:
d = days.index(wordNext)
tmpOffset = (d + 1) - int(today)
used = 2
if tmpOffset < 0:
tmpOffset += 7
dayOffset += tmpOffset
elif wordNextNext and wordNextNext in days:
d = days.index(wordNextNext)
tmpOffset = (d + 1) - int(today)
used = 3
if wordNext == "prossimo":
tmpOffset += 7
used += 2 # was 1
start -= 1
elif wordNext == "passato":
tmpOffset -= 7
used += 1
start -= 1
dayOffset += tmpOffset
if used > 0:
# NOTE(review): `start - 1 > 0` skips index 0, while the marker check
# below uses `>= 0` - confirm which bound is intended.
if start - 1 > 0 and words[start - 1] == "questo":
start -= 1
used += 1
# Blank the consumed words, and a marker word just before them.
for i in range(0, used):
words[i + start] = ""
if start - 1 >= 0 and words[start - 1] in markers:
words[start - 1] = ""
found = True
daySpecified = True
# parse time
# NOTE(review): timeStr is assigned only here in the visible code, so the
# `timeStr != ""` branch further down looks unreachable.
timeStr = ""
hrOffset = 0
minOffset = 0
secOffset = 0
hrAbs = 0
minAbs = 0
# Second pass: time expressions on the words left by the date pass.
for idx, word in enumerate(words):
if word == "":
wordPrevPrev = words[idx - 2] if idx > 1 else ""
wordPrev = words[idx - 1] if idx > 0 else ""
wordNext = words[idx + 1] if idx + 1 < len(words) else ""
wordNextNext = words[idx + 2] if idx + 2 < len(words) else ""
# wordNextNextNext = words[idx + 3] if idx + 3 < len(words) else ""
# TODO check whether any case needs a three-word lookahead
# parse noon, midnight, morning, afternoon, evening
used = 0
if word == "mezzogiorno":
hrAbs = 12
used += 1
elif word == "mezzanotte":
hrAbs = 24
used += 1
if word == "mezzo" and wordNext == "giorno": # if stt splits the word
hrAbs = 12
used += 2
elif word == "mezza"and wordNext == "notte": # if stt splits the word
hrAbs = 24
used += 2
elif word == "mattina":
if hrAbs == 0:
hrAbs = 8
used += 1
if wordNext and wordNext[0].isdigit(): # mattina alle 5
hrAbs = int(wordNext)
used += 1
elif word == "pomeriggio":
if hrAbs == 0:
hrAbs = 15
used += 1
if wordNext and wordNext[0].isdigit(): # pomeriggio alle 5
hrAbs = int(wordNext)
used += 1
if hrAbs < 12:
hrAbs += 12
elif word == "sera":
if hrAbs == 0:
hrAbs = 19
used += 1
if wordNext and wordNext[0].isdigit(): # sera alle 8
hrAbs = int(wordNext)
used += 1
if hrAbs < 12:
hrAbs += 12
# parse 5:00 am, 12:00 p.m., etc
elif word[0].isdigit():
isTime = True
strHH = ""
strMM = ""
remainder = ""
if ':' in word:
# parse colons
# "3:00 in the morning"
# stage 0 collects hour digits, stage 1 minute digits, stage 2 takes
# the rest of the word as the remainder.
stage = 0
length = len(word)
for i in range(length):
if stage == 0:
if word[i].isdigit():
strHH += word[i]
elif word[i] == ":":
stage = 1
stage = 2
i -= 1
elif stage == 1:
if word[i].isdigit():
strMM += word[i]
stage = 2
i -= 1
elif stage == 2:
remainder = word[i:].replace(".", "")
if remainder == "":
nextWord = wordNext.replace(".", "")
if nextWord == "am" or nextWord == "pm":
remainder = nextWord
used += 1
elif nextWord == "sera":
remainder = "pm"
used += 1
elif wordNext == "mattina":
remainder = "am"
used += 1
elif wordNext == "pomeriggio":
remainder = "pm"
used += 1
elif wordNext == "notte":
remainder = "pm"
used += 1
# NOTE(review): strHH is still a str here (built by concatenation
# above) but is compared with, and added to, an int.
elif wordNext == "di" and wordNextNext == "notte":
if strHH > 5:
remainder = "pm"
remainder = "am"
used += 2
if timeQualifier != "":
if strHH <= 12 and \
(timeQualifier == "sera" or
timeQualifier == "pomeriggio"):
strHH += 12
# try to parse # s without colons
# 5 hours, 10 minutes etc.
length = len(word)
strNum = ""
remainder = ""
for i in range(length):
if word[i].isdigit():
strNum += word[i]
remainder += word[i]
if remainder == "":
remainder = wordNext.replace(".", "").lstrip().rstrip()
if (
remainder == "pm" or
wordNext == "pm" or
remainder == "p.m." or
wordNext == "p.m."):
strHH = strNum
remainder = "pm"
used = 1
elif (
remainder == "am" or
wordNext == "am" or
remainder == "a.m." or
wordNext == "a.m."):
strHH = strNum
remainder = "am"
used = 1
if wordNext == "pm" or wordNext == "p.m.":
strHH = strNum
remainder = "pm"
used = 1
elif wordNext == "am" or wordNext == "a.m.":
strHH = strNum
remainder = "am"
used = 1
# NOTE(review): `and` binds tighter than `or`, so this reads as
# (int(word) > 100 and wordPrev == "o") or wordPrev == "oh".
elif (
int(word) > 100 and
wordPrev == "o" or
wordPrev == "oh"
# 0800 hours (pronounced oh-eight-hundred)
# NOTE(review): `/` is true division under Python 3 and yields a float
# here - confirm the target interpreter.
strHH = int(word) / 100
strMM = int(word) - strHH * 100
if wordNext == "ora":
used += 1
# NOTE(review): `int(word) < 100 and int(word) > 2400` can never both
# hold, so this branch cannot be taken as written.
elif (
wordNext == "ora" and
word[0] != '0' and
int(word) < 100 and
int(word) > 2400
# ignores military time
# "in 3 hours"
hrOffset = int(word)
used = 2
isTime = False
# -1 marks "no absolute time", see the hrAbs/minAbs check below.
hrAbs = -1
minAbs = -1
elif wordNext == "mattina":
# " 11 del mattina" -> "del" has already been removed
hh = int(word)
used = 2
isTime = False
hrAbs = hh
minAbs = 00
elif wordNext == "pomeriggio":
# " 2 del pomeriggio" -> "del" has already been removed
hh = int(word)
if hh < 12:
hh += 12
used = 2
isTime = False
hrAbs = hh
minAbs = 00
elif wordNext == "sera":
# "alle 8 di sera" -> "alle" has already been removed
hh = int(word)
if hh < 12:
hh += 12
used = 2
isTime = False
hrAbs = hh
minAbs = 00
# parse half an hour : undici e mezza
elif wordNext and wordNext == "mezza":
hrAbs = int(word)
minAbs = 30
used = 2
isTime = False
# parse 1 quarter hour 3 quarters : dieci e tre quarti
elif word and wordNext and \
wordNext == "quarto" and word[0].isdigit():
minAbs = 15 * int(word)
used = 2
if minAbs > 45: # discard possible errors
minAbs = 0
used -= 2
isTime = False
elif wordNext == "minuto":
# "in 10 minutes"
minOffset = int(word)
used = 2
isTime = False
hrAbs = -1
minAbs = -1
elif wordNext == "secondo":
# in 5 seconds
secOffset = int(word)
used = 2
isTime = False
hrAbs = -1
minAbs = -1
elif int(word) > 100:
strHH = int(word) / 100
strMM = int(word) - strHH * 100
if wordNext == "ora":
used += 1
elif wordNext and wordNext[0].isdigit():
strHH = word
strMM = wordNext
used += 1
if wordNextNext == "ora":
used += 1
elif wordNext == "in" and wordNextNext == "punto":
strHH = word
strMM = 00
used += 2
isTime = False
# Convert the collected hour/minute to int and apply am/pm.
strHH = int(strHH) if strHH else 0
strMM = int(strMM) if strMM else 0
strHH = strHH + 12 if remainder == "pm" and strHH < 12 else strHH
strHH = strHH - 12 if remainder == "am" and strHH >= 12 else strHH
if strHH > 24 or strMM > 59:
isTime = False
used = 0
if isTime:
hrAbs = strHH * 1
minAbs = strMM * 1
used += 1
if hrAbs <= 12 and (timeQualifier == "sera" or
timeQualifier == "pomeriggio"):
hrAbs += 12
if used > 0:
# removed parsed words from the sentence
for i in range(used):
words[idx + i] = ""
if wordPrev == "o" or wordPrev == "oh":
words[words.index(wordPrev)] = ""
if wordPrev == "presto":
hrOffset = -1
words[idx - 1] = ""
idx -= 1
elif wordPrev == "tardi":
hrOffset = 1
words[idx - 1] = ""
idx -= 1
if idx > 0 and wordPrev in markers:
words[idx - 1] = ""
if idx > 1 and wordPrevPrev in markers:
words[idx - 2] = ""
idx += used - 1
found = True
# check that we found a date
# NOTE(review): date_found is referenced but not called; a function object
# is always truthy, so this `return None` is never reached.
if not date_found:
return None
if dayOffset is False:
dayOffset = 0
# perform date manipulation
extractedDate = dateNow
# NOTE(review): the remaining keyword arguments of this call are missing.
extractedDate = extractedDate.replace(microsecond=0,
if datestr != "":
# Month names are translated to English before strptime() parses them.
en_months = ['january', 'february', 'march', 'april', 'may', 'june',
'july', 'august', 'september', 'october', 'november',
en_monthsShort = ['jan', 'feb', 'mar', 'apr', 'may', 'june', 'july',
'sept', 'oct', 'nov', 'dec']
for idx, en_month in enumerate(en_months):
datestr = datestr.replace(months[idx], en_month)
for idx, en_month in enumerate(en_monthsShort):
datestr = datestr.replace(monthsShort[idx], en_month)
temp = datetime.strptime(datestr, "%B %d")
if not hasYear:
temp = temp.replace(year=extractedDate.year)
if extractedDate < temp:
extractedDate = extractedDate.replace(year=int(currentYear),
extractedDate = extractedDate.replace(
year=int(currentYear) + 1,
extractedDate = extractedDate.replace(
if timeStr != "":
temp = datetime(timeStr)
extractedDate = extractedDate.replace(hour=temp.strftime("%H"),
if yearOffset != 0:
extractedDate = extractedDate + relativedelta(years=yearOffset)
if monthOffset != 0:
extractedDate = extractedDate + relativedelta(months=monthOffset)
if dayOffset != 0:
extractedDate = extractedDate + relativedelta(days=dayOffset)
# The absolute time is skipped when both values are -1, which the relative
# "hours/minutes/seconds" branches above set.
if hrAbs != -1 and minAbs != -1:
extractedDate = extractedDate + relativedelta(hours=hrAbs,
if (hrAbs != 0 or minAbs != 0) and datestr == "":
# A time already past with no day given is moved one day forward.
if not daySpecified and dateNow > extractedDate:
extractedDate = extractedDate + relativedelta(days=1)
if hrOffset != 0:
extractedDate = extractedDate + relativedelta(hours=hrOffset)
if minOffset != 0:
extractedDate = extractedDate + relativedelta(minutes=minOffset)
if secOffset != 0:
extractedDate = extractedDate + relativedelta(seconds=secOffset)
# Drop an "e" left between two consumed words.
# NOTE(review): words[idx + 1] is read without a bounds check, and
# words[idx - 1] wraps to the last word when idx is 0.
for idx, word in enumerate(words):
if words[idx] == "e" and words[idx - 1] == "" and words[
idx + 1] == "":
words[idx] = ""
# Join what is left and collapse the gaps left by the blanked words.
resultStr = " ".join(words)
resultStr = ' '.join(resultStr.split())
return [extractedDate, resultStr]
def get_gender_it(word, raw_string=""):
    """Guess the grammatical gender of an Italian word.

    This may not be useful.  In Italian the gender is defined by the
    article that precedes the word rather than by the letter the word ends
    with, but other functions remove the articles from the sentence to
    simplify its analysis.

    When raw_string contains the word preceded by another word, the gender
    of that preceding word (typically the article) is used.  Otherwise the
    gender is guessed from the last letter of the word itself.

    Args:
        word (str): the word whose gender is wanted.
        raw_string (str): optional sentence containing the word.

    Returns:
        (str) or (bool): "f", "m", or False when no guess can be made.

    TODO: verify whether this is useful
    """
    if not word:
        # Nothing to inspect; also reached through the recursion below when
        # raw_string contains leading or repeated spaces.
        return False

    gender = False
    words = raw_string.split(" ")
    for idx, w in enumerate(words):
        if w == word and idx != 0:
            gender = get_gender_it(words[idx - 1])
            break

    if not gender:
        last_letter = word[-1]
        if last_letter in ("a", "e"):
            gender = "f"
        if last_letter in ("o", "n", "l", "i"):
            gender = "m"

    return gender