Update Italian parsing methods

Add extract_datetime_it
Add tests for Italian methods
pull/1389/head
Ale 2018-01-27 15:31:36 +01:00 committed by Åke Forslund
parent 02eb1b8277
commit c56b293d3d
3 changed files with 1030 additions and 8 deletions

View File

@ -19,18 +19,18 @@
TODO: numbers greater than 999999
TODO: it_number_parse
TODO: extract_datetime_it
TODO: it_pruning
"""
from datetime import datetime
from dateutil.relativedelta import relativedelta
from mycroft.util.lang.parse_common import is_numeric, look_for_fractions
# Indefinite articles ["un", "una", "un'"] cannot be suppressed:
# in Italian, "un cavallo" means "a horse" or "one horse".
it_articles = ["il", "lo", "la"]
it_articles = ["il", "lo", "la", "i", "gli", "le"]
it_numbers = {
"zero": 0,
@ -57,13 +57,21 @@ it_numbers = {
"diciotto": 18,
"diciannove": 19,
"venti": 20,
"vent": 20,
"trenta": 30,
"trent": 30,
"quaranta": 40,
"quarant": 40,
"cinquanta": 50,
"cinquant": 50,
"sessanta": 60,
"sessant": 60,
"settanta": 70,
"settant": 70,
"ottanta": 80,
"ottant": 80,
"novanta": 90,
"novant": 90,
"cento": 100,
"duecento": 200,
"trecento": 300,
@ -111,6 +119,37 @@ def isFractional_it(input_str):
return False
def extractnumber_long_it(word):
    """
    Convert a number written as a single (possibly compound) word, e.g.
        ventisette -> 27
        quarantuno -> 41
    into its numeric value.

    A word that is itself a key of it_numbers is returned directly.
    Otherwise the word is split into <prefix><suffix> where both halves
    are keys of it_numbers, and the two values are added.

    Args:
        word (str): the word to convert
    Returns:
        (int) : the value of the number, obtained using the whole word
        False : if the word is not a number, e.g. "qualcuno"
    """
    # A plain number word ("venti", "cento") is its own value.
    if word in it_numbers:
        return it_numbers[word]

    # Try the longest matching ending first and keep looking when the
    # remaining prefix is unknown: "centodiciassette" ends with both
    # "sette" and "diciassette", and only the latter leaves a known
    # prefix.  Stopping at the first match made the result depend on
    # the iteration order of the dictionary.
    for suffix in sorted(it_numbers, key=len, reverse=True):
        if not word.endswith(suffix):
            continue
        prefix = word[:len(word) - len(suffix)]
        if prefix in it_numbers:
            return it_numbers[prefix] + it_numbers[suffix]

    return False  # not a number, e.g. "qualcuno"
def extractnumber_it(text):
"""
Questa funzione prepara il testo dato per l'analisi rendendo
@ -171,6 +210,10 @@ def extractnumber_it(text):
if look_for_fractions(aPieces):
val = float(aPieces[0]) / float(aPieces[1])
if not val:
# cerca numero composto come ventuno ventitre centoventi"
val = extractnumber_long_it(word)
if val:
if result is None:
result = 0
@ -261,22 +304,25 @@ def normalize_it(text, remove_articles):
words = text.split() # this also removed extra spaces
# Contractions are not common in IT
# Convert numbers into digits, e.g. "due" -> "2"
# Convert numbers into digits, e.g. "quarantadue" -> "42"
normalized = ""
i = 0
while i < len(words):
word = words[i]
# remove articles
# Italian requires the article to define the gender of the next word
# Italian requires the article to define the gender
if remove_articles and word in it_articles:
i += 1
continue
# NOTE temporary , handle some numbers above >999
if word in it_numbers:
word = str(it_numbers[word])
# end temporary
val = extractnumber_long_it(word)
if val:
word = str(val)
normalized += " " + word
i += 1
@ -285,6 +331,672 @@ def normalize_it(text, remove_articles):
return normalized[1:]
def extract_datetime_it(string, currentDate=None):
    """
    Extract a date/time reference from an Italian sentence.

    The sentence is lower-cased and stripped of punctuation and filler
    words, then scanned twice: first for date expressions ("oggi",
    "domani", "5 giorni", week-day and month names, "da"/"dopo" anchors),
    then for time expressions ("mezzogiorno", "22:45", "8 di sera",
    "9 e mezza", "10 minuti", ...).  Every word that was understood is
    removed from the sentence.

    Args:
        string (str): the sentence to analyse
        currentDate (datetime): reference moment for relative
            expressions; defaults to datetime.now()
    Returns:
        None if `string` is empty or no date/time element was recognised,
        otherwise a list [datetime, str] with the resolved moment and the
        cleaned sentence without the consumed words.
    Raises:
        ValueError: if an explicit day number is not valid for the
            recognised month (e.g. "45 maggio").
    """
    # Fixed-length offsets only need the standard library; relativedelta
    # is kept for the calendar-aware month/year steps.
    from datetime import timedelta

    def clean_string(s):
        """
        Lower-case `s`, remove punctuation and filler words and fold the
        plural/feminine forms of the time keywords onto a single form.
        Returns the list of remaining words.
        """
        symbols = [".", ",", ";", "?", "!", u"º", u"ª", u"°"]
        for symbol in symbols:
            s = s.replace(symbol, "")

        s = s.lower()
        # flatten accented vowels; "-" splits words, "_" is dropped
        substitutions = [(u"á", "a"), (u"à", "a"), (u"è", "e'"),
                         (u"é", "e'"), (u"ì", "i"), (u"ù", "u"),
                         (u"ò", "o"), ("-", " "), ("_", "")]
        for old, new in substitutions:
            s = s.replace(old, new)

        noise_words = ["tra", "la", "del", "al", "il", "di",
                       "le", "per", "alle", "alla", "dai", "delle",
                       "a", "e'", "era", "questa", "questo", "e"]
        for noise in noise_words:
            s = s.replace(" " + noise + " ", " ")

        # normalise plurals to simplify the analysis
        # NOTE(review): these are substring replacements, so a longer
        # word containing one of the keys (e.g. "amore" -> "amora") is
        # altered in the leftover text as well.
        singulars = [("secondi", "secondo"), ("minuti", "minuto"),
                     ("ore", "ora"), ("giorni", "giorno"),
                     ("settimane", "settimana"), ("mesi", "mese"),
                     ("anni", "anno"), ("mattino", "mattina"),
                     ("prossima", "prossimo"), ("questa", "questo"),
                     ("quarti", "quarto")]
        for plural, singular in singulars:
            s = s.replace(plural, singular)

        return s.split()

    def date_found():
        """Tell whether either scan recognised a date/time element."""
        return found or (
            datestr != "" or
            yearOffset != 0 or monthOffset != 0 or
            dayOffset is not False or hrOffset != 0 or
            hrAbs != 0 or minOffset != 0 or
            minAbs != 0 or secOffset != 0)

    def blank_words(first, count):
        """Erase `count` consumed words from `first`, staying in range."""
        for pos in range(max(first, 0), min(first + count, len(words))):
            words[pos] = ""

    def is_year(token):
        """True for a plain four-digit number not starting with 0."""
        return len(token) == 4 and token.isdigit() and token[0] != "0"

    if string == "":
        return None
    if currentDate is None:
        currentDate = datetime.now()

    found = False
    daySpecified = False
    dayOffset = False  # False = no day information yet, otherwise an int
    monthOffset = 0
    yearOffset = 0
    dateNow = currentDate
    today = int(dateNow.strftime("%w"))  # 0 = Sunday ... 6 = Saturday
    currentYear = dateNow.year
    fromFlag = False
    datestr = ""  # "<month> [<day> [<year>]]" with the Italian month name
    timeQualifier = ""
    timeQualifiersList = ['mattina', 'pomeriggio', 'sera']
    markers = ['alle', 'in', 'questo', 'per', 'di']
    days = ['lunedi', 'martedi', 'mercoledi',
            'giovedi', 'venerdi', 'sabato', 'domenica']
    months = ['gennaio', 'febbraio', 'marzo', 'aprile', 'maggio', 'giugno',
              'luglio', 'agosto', 'settembre', 'ottobre', 'novembre',
              'dicembre']
    monthsShort = ['gen', 'feb', 'mar', 'apr', 'mag', 'giu', 'lug', 'ago',
                   'set', 'ott', 'nov', 'dic']
    # words allowed after "da"/"dopo" (5 days from tomorrow, ...)
    validFollowups = days + months + monthsShort + [
        "oggi", "domani", "prossimo", "passato", "ora"]

    words = clean_string(string)

    # ---- first scan: dates -------------------------------------------
    for idx, word in enumerate(words):
        if word == "":
            continue
        wordPrev = words[idx - 1] if idx > 0 else ""
        wordNext = words[idx + 1] if idx + 1 < len(words) else ""
        wordNextNext = words[idx + 2] if idx + 2 < len(words) else ""

        start = idx
        used = 0
        # save timequalifier for later
        if word in timeQualifiersList:
            timeQualifier = word
        # parse today, tomorrow, yesterday, day after tomorrow
        elif word == "oggi" and not fromFlag:
            dayOffset = 0
            used += 1
        elif word == "domani" and not fromFlag:
            dayOffset = 1
            used += 1
        elif word == "ieri" and not fromFlag:
            dayOffset -= 1
            used += 1
        elif word == "dopodomani" and not fromFlag:
            dayOffset += 2
            used += 1
        elif word == "dopo" and wordNext == "domani" and not fromFlag:
            # only one day here: the "da"/"dopo" anchor block below runs
            # for this same word and adds the second one
            dayOffset += 1
            used += 2
        # parse 5 days, 1 day after tomorrow
        elif word == "giorno":
            # str.isdigit() is False for "" and for tokens like "8:30",
            # which previously raised IndexError / ValueError
            if wordPrev.isdigit():
                dayOffset += int(wordPrev)
                start -= 1
                used = 2
                if wordNext == "dopo" and wordNextNext == "domani":
                    dayOffset += 1
                    used += 2
        # parse 3 weeks, next week, last week
        elif word == "settimana" and not fromFlag:
            if wordPrev.isdigit():
                dayOffset += int(wordPrev) * 7
                start -= 1
                used = 2
            elif wordPrev == "prossimo":
                dayOffset = 7
                start -= 1
                used = 2
            elif wordPrev == "passato":
                dayOffset = -7
                start -= 1
                used = 2
        # parse 10 months, next month, last month
        elif word == "mese" and not fromFlag:
            if wordPrev.isdigit():
                monthOffset = int(wordPrev)
                start -= 1
                used = 2
            elif wordPrev == "prossimo":
                monthOffset = 1
                start -= 1
                used = 2
            elif wordPrev == "passato":
                monthOffset = -1
                start -= 1
                used = 2
        # parse 5 years, next year, last year
        elif word == "anno" and not fromFlag:
            if wordPrev.isdigit():
                yearOffset = int(wordPrev)
                start -= 1
                used = 2
            elif wordPrev == "prossimo":
                yearOffset = 1
                start -= 1
                used = 2
            elif wordPrev == "passato":
                yearOffset = -1
                start -= 1
                used = 2
        # parse Monday, Tuesday, etc., and next Monday,
        # last Tuesday, etc.
        elif word in days and not fromFlag:
            d = days.index(word)
            dayOffset = (d + 1) - today
            used = 1
            if dayOffset < 0:
                dayOffset += 7
            if wordPrev == "prossimo":
                dayOffset += 7
                used += 1
                start -= 1
            elif wordPrev == "passato":
                dayOffset -= 7
                used += 1
                start -= 1
            # a trailing "prossimo"/"passato" is consumed but does not
            # shift the date
            if wordNext == "prossimo":
                used += 1
            elif wordNext == "passato":
                used += 1
        # parse 15 of July, June 20th, Feb 18, 19 of February
        # (parenthesised: "and" binds tighter than "or", so fromFlag used
        # to be ignored for full month names)
        elif (word in months or word in monthsShort) and not fromFlag:
            try:
                m = months.index(word)
            except ValueError:
                m = monthsShort.index(word)
            used += 1
            datestr = months[m]
            if wordPrev.isdigit():
                # "13 maggio", optionally followed by a year
                datestr += " " + wordPrev
                start -= 1
                used += 1
                if is_year(wordNext):
                    datestr += " " + wordNext
                    used += 1
            elif wordNext.isdigit() and not is_year(wordNext):
                # "maggio 13", optionally followed by a year
                datestr += " " + wordNext
                used += 1
                if is_year(wordNextNext):
                    datestr += " " + wordNextNext
                    used += 1

        # parse 5 days from tomorrow, 10 weeks from next thursday,
        # 2 months from July
        # (deliberately a separate "if": it also runs after the
        # "dopo domani" branch above)
        if (word == "da" or word == "dopo") and wordNext in validFollowups:
            used = 2
            fromFlag = True
            if wordNext == "domani":
                dayOffset += 1
            elif wordNext in days:
                d = days.index(wordNext)
                tmpOffset = (d + 1) - today
                used = 2
                if tmpOffset < 0:
                    tmpOffset += 7
                dayOffset += tmpOffset
            elif wordNextNext and wordNextNext in days:
                d = days.index(wordNextNext)
                tmpOffset = (d + 1) - today
                used = 3
                # NOTE(review): the start/used adjustments below also
                # erase the words around the anchor; kept as written,
                # confirm the intended span.
                if wordNext == "prossimo":
                    tmpOffset += 7
                    used += 2
                    start -= 1
                elif wordNext == "passato":
                    tmpOffset -= 7
                    used += 1
                    start -= 1
                dayOffset += tmpOffset

        if used > 0:
            if start - 1 >= 0 and words[start - 1] == "questo":
                start -= 1
                used += 1

            # bounded: start may be -1 and start + used may exceed the
            # sentence length in the anchor branches above
            blank_words(start, used)

            if start - 1 >= 0 and words[start - 1] in markers:
                words[start - 1] = ""
            found = True
            daySpecified = True

    # ---- second scan: times ------------------------------------------
    hrOffset = 0
    minOffset = 0
    secOffset = 0
    hrAbs = 0   # -1 (together with minAbs) means "relative offset only"
    minAbs = 0

    for idx, word in enumerate(words):
        if word == "":
            continue

        wordPrevPrev = words[idx - 2] if idx > 1 else ""
        wordPrev = words[idx - 1] if idx > 0 else ""
        wordNext = words[idx + 1] if idx + 1 < len(words) else ""
        wordNextNext = words[idx + 2] if idx + 2 < len(words) else ""

        # parse noon, midnight, morning, afternoon, evening
        used = 0
        if word == "mezzogiorno":
            hrAbs = 12
            used += 1
        elif word == "mezzanotte":
            hrAbs = 24
            used += 1
        elif word == "mezzo" and wordNext == "giorno":
            # if stt splits the word
            hrAbs = 12
            used += 2
        elif word == "mezza" and wordNext == "notte":
            # if stt splits the word
            hrAbs = 24
            used += 2
        elif word == "mattina":
            if hrAbs == 0:
                hrAbs = 8
            used += 1
            if wordNext.isdigit():  # mattina alle 5
                hrAbs = int(wordNext)
                used += 1
        elif word == "pomeriggio":
            if hrAbs == 0:
                hrAbs = 15
            used += 1
            if wordNext.isdigit():  # pomeriggio alle 5
                hrAbs = int(wordNext)
                used += 1
                if hrAbs < 12:
                    hrAbs += 12
        elif word == "sera":
            if hrAbs == 0:
                hrAbs = 19
            used += 1
            if wordNext.isdigit():  # sera alle 8
                hrAbs = int(wordNext)
                used += 1
                if hrAbs < 12:
                    hrAbs += 12
        # parse 5:00 am, 12:00 p.m., etc
        elif word[0].isdigit():
            isTime = True
            strHH = ""
            strMM = ""
            remainder = ""
            if ':' in word:
                # "22:45": digits before the colon are the hours, digits
                # after it the minutes; anything else starts the suffix.
                # The suffix is taken from the offending character
                # itself, so "3:00pm" yields "pm" and not "m".
                inMinutes = False
                for pos, char in enumerate(word):
                    if char.isdigit():
                        if inMinutes:
                            strMM += char
                        else:
                            strHH += char
                    elif char == ":" and not inMinutes:
                        inMinutes = True
                    else:
                        remainder = word[pos:].replace(".", "")
                        break
                if remainder == "":
                    nextWord = wordNext.replace(".", "")
                    if nextWord == "am" or nextWord == "pm":
                        remainder = nextWord
                        used += 1
                    elif nextWord == "sera":
                        remainder = "pm"
                        used += 1
                    elif wordNext == "mattina":
                        remainder = "am"
                        used += 1
                    elif wordNext == "pomeriggio":
                        remainder = "pm"
                        used += 1
                    elif wordNext == "notte":
                        remainder = "pm"
                        used += 1
                    elif wordNext == "di" and wordNextNext == "notte":
                        # compare numerically: strHH is still a string
                        if int(strHH) > 5:
                            remainder = "pm"
                        else:
                            remainder = "am"
                        used += 2
                    # an earlier time qualifier ("sera", "pomeriggio") is
                    # applied below, once hrAbs is known
            else:
                # try to parse numbers without colons
                # 5 hours, 10 minutes etc.
                strNum = "".join(c for c in word if c.isdigit())
                remainder = "".join(c for c in word if not c.isdigit())
                if remainder == "":
                    remainder = wordNext.replace(".", "").strip()

                if (remainder == "pm" or wordNext == "pm" or
                        remainder == "p.m." or wordNext == "p.m."):
                    strHH = strNum
                    remainder = "pm"
                    used = 1
                elif (remainder == "am" or wordNext == "am" or
                        remainder == "a.m." or wordNext == "a.m."):
                    strHH = strNum
                    remainder = "am"
                    used = 1
                elif not word.isdigit():
                    # digits glued to other characters ("3/4", "5km"):
                    # not a time, and int(word) below would fail
                    isTime = False
                elif int(word) > 100 and (wordPrev == "o" or
                                          wordPrev == "oh"):
                    # 0800 hours (pronounced oh-eight-hundred)
                    # divmod keeps integers on Python 3 as well
                    strHH, strMM = divmod(int(word), 100)
                    if wordNext == "ora":
                        used += 1
                elif (wordNext == "ora" and word[0] != '0' and
                        (int(word) < 100 or int(word) > 2400)):
                    # ignores military time
                    # "in 3 hours"
                    # ("or": the previous "and" could never be true)
                    hrOffset = int(word)
                    used = 2
                    isTime = False
                    hrAbs = -1
                    minAbs = -1
                elif wordNext == "mattina":
                    # "11 del mattina" -> "del" has been removed
                    hh = int(word)
                    used = 2
                    isTime = False
                    hrAbs = hh
                    minAbs = 0
                elif wordNext == "pomeriggio":
                    # "2 del pomeriggio" -> "del" has been removed
                    hh = int(word)
                    if hh < 12:
                        hh += 12
                    used = 2
                    isTime = False
                    hrAbs = hh
                    minAbs = 0
                elif wordNext == "sera":
                    # "alle 8 di sera" -> "alle"/"di" have been removed
                    hh = int(word)
                    if hh < 12:
                        hh += 12
                    used = 2
                    isTime = False
                    hrAbs = hh
                    minAbs = 0
                # parse half an hour: undici e mezza
                elif wordNext == "mezza":
                    hrAbs = int(word)
                    minAbs = 30
                    used = 2
                    isTime = False
                # parse 1 quarter hour, 3 quarters: dieci e tre quarti
                elif wordNext == "quarto":
                    minAbs = 15 * int(word)
                    used = 2
                    if minAbs > 45:  # discard impossible values
                        minAbs = 0
                        used -= 2
                    isTime = False
                elif wordNext == "minuto":
                    # "in 10 minutes"
                    minOffset = int(word)
                    used = 2
                    isTime = False
                    hrAbs = -1
                    minAbs = -1
                elif wordNext == "secondo":
                    # "in 5 seconds"
                    secOffset = int(word)
                    used = 2
                    isTime = False
                    hrAbs = -1
                    minAbs = -1
                elif int(word) > 100:
                    # "0730" -> 07:30
                    strHH, strMM = divmod(int(word), 100)
                    if wordNext == "ora":
                        used += 1
                elif wordNext.isdigit():
                    # "8 30" -> 08:30
                    strHH = word
                    strMM = wordNext
                    used += 1
                    if wordNextNext == "ora":
                        used += 1
                elif wordNext == "in" and wordNextNext == "punto":
                    # "7 in punto" -> 07:00
                    strHH = word
                    strMM = 0
                    used += 2
                else:
                    isTime = False

            strHH = int(strHH) if strHH else 0
            strMM = int(strMM) if strMM else 0
            strHH = strHH + 12 if remainder == "pm" and strHH < 12 else strHH
            strHH = strHH - 12 if remainder == "am" and strHH >= 12 else strHH
            if strHH > 24 or strMM > 59:
                isTime = False
                used = 0
            if isTime:
                hrAbs = strHH
                minAbs = strMM
                used += 1
                # "< 12": with "<=" 12:45 in the afternoon became 24:45
                if hrAbs < 12 and (timeQualifier == "sera" or
                                   timeQualifier == "pomeriggio"):
                    hrAbs += 12

        if used > 0:
            # remove parsed words from the sentence
            blank_words(idx, used)

            # always address the neighbours by their real position:
            # words.index() found the first "o" anywhere in the sentence
            # and the former "idx -= 1" shifted the marker clean-up
            if wordPrev == "o" or wordPrev == "oh":
                words[idx - 1] = ""

            if wordPrev == "presto":
                hrOffset = -1
                words[idx - 1] = ""
            elif wordPrev == "tardi":
                hrOffset = 1
                words[idx - 1] = ""

            if idx > 0 and wordPrev in markers:
                words[idx - 1] = ""
            if idx > 1 and wordPrevPrev in markers:
                words[idx - 2] = ""

            found = True

    # check that we found a date (date_found must be *called*)
    if not date_found():
        return None

    if dayOffset is False:
        dayOffset = 0

    # perform date manipulation
    extractedDate = dateNow.replace(microsecond=0)
    if not (hrAbs == -1 and minAbs == -1):
        # absolute or date-only request: start from midnight.  A purely
        # relative request ("tra 10 minuti") keeps the reference time,
        # otherwise the offset would be counted from 00:00.
        extractedDate = extractedDate.replace(second=0, minute=0, hour=0)

    if datestr != "":
        # datestr is "<italian month> [<day> [<year>]]"; building the
        # date directly avoids the locale-dependent strptime("%B ...")
        # call, which also rejected every date carrying a year
        dateParts = datestr.split()
        month = months.index(dateParts[0]) + 1
        day = int(dateParts[1]) if len(dateParts) > 1 else 1
        if len(dateParts) > 2:
            year = int(dateParts[2])
        else:
            # no year given: this year if the day is still ahead,
            # otherwise next year
            temp = datetime(currentYear, month, day)
            year = currentYear if extractedDate < temp else currentYear + 1
        extractedDate = extractedDate.replace(year=year, month=month,
                                              day=day)

    if yearOffset != 0:
        extractedDate = extractedDate + relativedelta(years=yearOffset)
    if monthOffset != 0:
        extractedDate = extractedDate + relativedelta(months=monthOffset)
    if dayOffset != 0:
        extractedDate = extractedDate + timedelta(days=dayOffset)

    if hrAbs != -1 and minAbs != -1:
        extractedDate = extractedDate + timedelta(hours=hrAbs,
                                                  minutes=minAbs)
        # a bare time that has already passed today means tomorrow
        if (hrAbs != 0 or minAbs != 0) and datestr == "":
            if not daySpecified and dateNow > extractedDate:
                extractedDate = extractedDate + timedelta(days=1)
    if hrOffset != 0:
        extractedDate = extractedDate + timedelta(hours=hrOffset)
    if minOffset != 0:
        extractedDate = extractedDate + timedelta(minutes=minOffset)
    if secOffset != 0:
        extractedDate = extractedDate + timedelta(seconds=secOffset)

    # drop an "e" left alone between consumed words; a missing neighbour
    # at either end of the sentence counts as consumed
    for idx, word in enumerate(words):
        if word != "e":
            continue
        prevGone = idx == 0 or words[idx - 1] == ""
        nextGone = idx + 1 >= len(words) or words[idx + 1] == ""
        if prevGone and nextGone:
            words[idx] = ""

    resultStr = " ".join(words)
    resultStr = ' '.join(resultStr.split())
    return [extractedDate, resultStr]
def get_gender_it(word, raw_string=""):
"""
Questa potrebbe non essere utile.
@ -292,7 +1004,7 @@ def get_gender_it(word, raw_string=""):
analizzare l'articolo che la precede e non la lettera
con cui finisce la parola, ma sono presenti funzioni per
la rimozione degli articoli dalla frase per semplificarne
l'analisi, in particolare se si rimuovono "i", "gli", "le"
l'analisi
TODO: verificare se utile
"""

View File

@ -105,6 +105,8 @@ def extract_datetime(text, anchorDate=None, lang="en-us"):
return extract_datetime_en(text, anchorDate)
elif lang_lower.startswith("pt"):
return extract_datetime_pt(text, anchorDate)
elif lang_lower.startswith("it"):
return extract_datetime_it(text, anchorDate)
return text
# ==============================================================

View File

@ -0,0 +1,308 @@
# -*- coding: utf-8 -*-
#
# Copyright 2017 Mycroft AI Inc.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
import unittest
from datetime import datetime
from mycroft.util.parse import get_gender
from mycroft.util.parse import extract_datetime
from mycroft.util.parse import extractnumber
from mycroft.util.parse import normalize
class TestNormalize(unittest.TestCase):
    """
    Test cases for Italian parsing
    """
    def test_articles_it(self):
        """Definite articles are dropped only when remove_articles is set."""
        self.assertEqual(normalize(u"questo è il test",
                                   lang="it", remove_articles=True),
                         u"questo è test")
        self.assertEqual(normalize(u"questa è la frase",
                                   lang="it", remove_articles=True),
                         u"questa è frase")
        self.assertEqual(normalize(u"questo è lo scopo", lang="it",
                                   remove_articles=True),
                         u"questo è scopo")
        self.assertEqual(normalize(u"questo è il test extra",
                                   lang="it", remove_articles=False),
                         u"questo è il test extra")

    def test_extractnumber_it(self):
        """Digits, number words, fractions and decimals are extracted."""
        self.assertEqual(extractnumber(u"questo è il primo test",
                                       lang="it"), 1)
        self.assertEqual(extractnumber(u"questo è il 2 test",
                                       lang="it"), 2)
        self.assertEqual(extractnumber(u"questo è il secondo test",
                                       lang="it"), 2)
        self.assertEqual(extractnumber(u"questo è un terzo di test",
                                       lang="it"), 1.0 / 3.0)
        self.assertEqual(extractnumber(u"questo è test numero 4",
                                       lang="it"), 4)
        self.assertEqual(extractnumber("un terzo di tazza",
                                       lang="it"), 1.0 / 3.0)
        self.assertEqual(extractnumber("tre tazze",
                                       lang="it"), 3)
        self.assertEqual(extractnumber("1/3 tazze",
                                       lang="it"), 1.0 / 3.0)
        self.assertEqual(extractnumber("un quarto di tazza",
                                       lang="it"), 0.25)
        self.assertEqual(extractnumber("1/4 tazza",
                                       lang="it"), 0.25)
        self.assertEqual(extractnumber("2/3 tazza",
                                       lang="it"), 2.0 / 3.0)
        self.assertEqual(extractnumber("3/4 tazza",
                                       lang="it"), 3.0 / 4.0)
        self.assertEqual(extractnumber("1 e 1/4 tazza",
                                       lang="it"), 1.25)
        self.assertEqual(extractnumber("1 tazza e mezzo",
                                       lang="it"), 1.5)
        self.assertEqual(extractnumber("una tazza e mezzo",
                                       lang="it"), 1.5)
        self.assertEqual(extractnumber("una e mezza tazza",
                                       lang="it"), 1.5)
        self.assertEqual(extractnumber("una e una mezza tazza",
                                       lang="it"), 1.5)
        self.assertEqual(extractnumber("tre quarti tazza",
                                       lang="it"), 3.0 / 4.0)
        self.assertEqual(extractnumber("tre quarto tazza",
                                       lang="it"), 3.0 / 4.0)
        self.assertEqual(extractnumber("sette punto cinque",
                                       lang="it"), 7.5)
        self.assertEqual(extractnumber("sette punto 5",
                                       lang="it"), 7.5)
        self.assertEqual(extractnumber("sette e mezzo",
                                       lang="it"), 7.5)
        self.assertEqual(extractnumber("sette e ottanta",
                                       lang="it"), 7.80)
        self.assertEqual(extractnumber("sette e otto",
                                       lang="it"), 7.8)
        self.assertEqual(extractnumber("sette e zero otto",
                                       lang="it"), 7.08)
        self.assertEqual(extractnumber("sette e zero zero otto",
                                       lang="it"), 7.008)
        self.assertEqual(extractnumber("venti tredicesimi",
                                       lang="it"), 20.0 / 13.0)
        self.assertEqual(extractnumber("sei virgola sessanta sei",
                                       lang="it"), 6.66)
        self.assertEqual(extractnumber("sei virgola sessantasei",
                                       lang="it"), 6.66)
        self.assertEqual(extractnumber("seicento sessanta sei",
                                       lang="it"), 666)
        self.assertEqual(extractnumber("seicento punto zero sei",
                                       lang="it"), 600.06)
        self.assertEqual(extractnumber("seicento punto zero zero sei",
                                       lang="it"), 600.006)
        self.assertEqual(extractnumber("seicento punto zero zero zero sei",
                                       lang="it"), 600.0006)
        self.assertEqual(extractnumber("tre decimi ",
                                       lang="it"), 0.30000000000000004)
        self.assertEqual(extractnumber("dodici centesimi",
                                       lang="it"), 0.12)
        self.assertEqual(extractnumber("cinque e quaranta due millesimi",
                                       lang="it"), 5.042)
        self.assertEqual(extractnumber("mille e uno",
                                       lang="it"), 1001)
        self.assertEqual(extractnumber("due mila venti due dollari ",
                                       lang="it"), 2022)
        self.assertEqual(extractnumber(
            "cento quattordici mila quattrocento undici dollari ",
            lang="it"), 114411)
        self.assertEqual(extractnumber("ventitre dollari ", lang="it"), 23)
        self.assertEqual(extractnumber("quarantacinque minuti ",
                                       lang="it"), 45)
        self.assertEqual(extractnumber("ventuno anni ",
                                       lang="it"), 21)
        self.assertEqual(extractnumber("ventotto euro ",
                                       lang="it"), 28)
        self.assertEqual(extractnumber("dodici e quarantacinque ",
                                       lang="it"), 12.45)
        self.assertEqual(extractnumber("quarantotto euro ",
                                       lang="it"), 48)
        self.assertEqual(extractnumber("novantanove euro ",
                                       lang="it"), 99)
        self.assertEqual(extractnumber("avvisa se qualcuno arriva ",
                                       lang="it"), False)

    def test_spaces_it(self):
        """Extra whitespace is collapsed by normalize."""
        self.assertEqual(normalize(u"questo   e'  il    test",
                                   lang="it"), u"questo e' test")
        self.assertEqual(normalize(u"questo   è    un    test  ",
                                   lang="it"), u"questo è 1 test")
        self.assertEqual(normalize(u"un  altro test  ",
                                   lang="it"), u"1 altro test")
        self.assertEqual(normalize(u"questa è un'  altra amica   ", lang="it",
                                   remove_articles=False),
                         u"questa è 1 altra amica")
        self.assertEqual(normalize(u"questo   è  un    test   ", lang="it",
                                   remove_articles=False), u"questo è 1 test")

    def test_numbers_it(self):
        """Number words are replaced by digits inside a sentence."""
        self.assertEqual(normalize(u"questo è il test uno due tre",
                                   lang="it"), u"questo è test 1 2 3")
        self.assertEqual(normalize(u"è un test sette otto nove",
                                   lang="it"), u"è 1 test 7 8 9")
        self.assertEqual(normalize("test zero dieci undici dodici tredici",
                                   lang="it"), "test 0 10 11 12 13")
        self.assertEqual(normalize("test mille seicento sessanta e sei",
                                   lang="it", remove_articles=False),
                         "test 1000 600 60 e 6")
        self.assertEqual(normalize("test sette e mezzo",
                                   lang="it", remove_articles=False),
                         "test 7 e mezzo")
        self.assertEqual(normalize("test due punto nove",
                                   lang="it"), "test 2 punto 9")
        self.assertEqual(normalize("test cento e nove",
                                   lang="it", remove_articles=False),
                         "test 100 e 9")
        self.assertEqual(normalize("test venti e 1",
                                   lang="it"), "test 20 e 1")
        self.assertEqual(normalize("test ventuno e ventisette",
                                   lang="it"), "test 21 e 27")

    def test_extractdatetime_it(self):
        """Dates and times are resolved against Saturday 2018-01-13."""
        def extractWithFormat(text):
            # plain decimal literals: leading-zero integers such as 01
            # are a syntax error on Python 3
            date = datetime(2018, 1, 13, 0, 0)
            [extractedDate, leftover] = extract_datetime(text, date,
                                                         lang="it")
            extractedDate = extractedDate.strftime("%Y-%m-%d %H:%M:%S")
            return [extractedDate, leftover]

        def testExtract(text, expected_date, expected_leftover):
            res = extractWithFormat(text)
            self.assertEqual(res[0], expected_date)
            self.assertEqual(res[1], expected_leftover)

        testExtract(u"quale giorno è oggi",
                    "2018-01-13 00:00:00", u"quale giorno")
        testExtract(u"che giorno è domani",
                    "2018-01-14 00:00:00", u"che giorno")
        testExtract(u"che giorno era ieri",
                    "2018-01-12 00:00:00", u"che giorno")
        testExtract(u"che giorno è dopo domani",
                    "2018-01-15 00:00:00", u"che giorno")
        testExtract(u"fissare la cena tra 5 giorni",
                    "2018-01-18 00:00:00", u"fissare cena")
        testExtract(u"Come è il tempo per dopodomani",
                    "2018-01-15 00:00:00", u"come tempo")
        testExtract(u"ricordami alle 22:45",
                    "2018-01-13 22:45:00", u"ricordami")
        testExtract(u"Come è il tempo venerdì mattina",
                    "2018-01-19 08:00:00", "come tempo")
        testExtract(u"Ricordami di chiamare la mamma"
                    u" in 8 settimane e 2 giorni.",
                    "2018-03-12 00:00:00", u"ricordami chiamare mamma")
        testExtract(u"Gioca a briscola 2 giorni dopo venerdì",
                    "2018-01-21 00:00:00", u"gioca briscola")
        testExtract(u"Inizia le pulizie alle 15:45 di giovedì",
                    "2018-01-18 15:45:00", u"inizia pulizie")
        testExtract("lunedi compra formaggio",
                    "2018-01-15 00:00:00", u"compra formaggio")
        testExtract("suona musica compleanno tra 5 anni da oggi",
                    "2023-01-13 00:00:00", "suona musica compleanno")
        testExtract(u"Invia Skype alla mamma alle 12:45 di giovedì prossimo.",
                    "2018-01-18 12:45:00", u"invia skype mamma")
        testExtract(u"Come è il tempo questo venerdì?",
                    "2018-01-19 00:00:00", u"come tempo")
        testExtract(u"Come è il tempo questo venerdì pomeriggio?",
                    "2018-01-19 15:00:00", u"come tempo")
        testExtract(u"Come è il tempo questo venerdì a mezza notte?",
                    "2018-01-20 00:00:00", u"come tempo")
        testExtract(u"Come è il tempo questo venerdì a mezzogiorno?",
                    "2018-01-19 12:00:00", "come tempo")
        testExtract(u"Come è il tempo questo venerdì alle 11 del mattino?",
                    "2018-01-19 11:00:00", "come tempo")
        testExtract("Ricordami di chiamare mia madre il 3 agosto.",
                    "2018-08-03 00:00:00", "ricordami chiamare mia madre")
        testExtract(u"comprare fragole il 13 maggio",
                    "2018-05-13 00:00:00", "comprare fragole")
        testExtract(u"fare acquisti il 13 maggio",
                    "2018-05-13 00:00:00", "fare acquisti")
        testExtract(u"compra le candele il 1° maggio",
                    "2018-05-01 00:00:00", "compra candele")
        testExtract(u"bere birra il 13 maggio",
                    "2018-05-13 00:00:00", "bere birra")
        testExtract(u"Come è il tempo 1 giorno dopo domani?",
                    "2018-01-15 00:00:00", "come tempo")
        testExtract(u"Come è il tempo alle ore 0700?",
                    "2018-01-13 07:00:00", "come tempo ora")
        testExtract(u"Come è il tempo domani alle 7 in punto?",
                    "2018-01-14 07:00:00", "come tempo")
        testExtract(u"Come è il tempo domani alle 2 del pomeriggio",
                    "2018-01-14 14:00:00", "come tempo")
        testExtract(u"Come è il tempo domani pomeriggio alle 2",
                    "2018-01-14 14:00:00", "come tempo")
        testExtract(u"Come è il tempo domani per le 2:00",
                    "2018-01-14 02:00:00", "come tempo")
        # adjacent literals instead of a backslash inside the string, so
        # the source indentation cannot leak into the sentence
        testExtract(u"Come è il tempo alle 2 del pomeriggio di "
                    u"venerdì prossimo?",
                    "2018-01-19 14:00:00", u"come tempo")
        testExtract(u"Ricordami di svegliarmi tra 4 anni",
                    "2022-01-13 00:00:00", u"ricordami svegliarmi")
        testExtract(u"Ricordami di svegliarmi tra 4 anni e 4 giorni",
                    "2022-01-17 00:00:00", u"ricordami svegliarmi")
        testExtract(u"Dormi 3 giorni da domani.",
                    "2018-01-17 00:00:00", u"dormi")
        testExtract(u"segna appuntamento tra 2 settimane e 6 giorni "
                    u"dopo sabato",
                    "2018-02-02 00:00:00", u"segna appuntamento")
        testExtract(u"La festa inizia alle 8 di sera di giovedì",
                    "2018-01-18 20:00:00", u"la festa inizia")
        testExtract(u"Come è il meteo 3 tra giorni?",
                    "2018-01-16 00:00:00", u"come meteo")
        testExtract(u"fissa appuntamento dicembre 3",
                    "2018-12-03 00:00:00", "fissa appuntamento")
        testExtract(u"incontriamoci questa sera alle 8 ",
                    "2018-01-13 20:00:00", "incontriamoci")
        testExtract(u"incontriamoci alle 8 questa sera",
                    "2018-01-13 20:00:00", "incontriamoci")
        testExtract(u"impostare sveglia questa sera alle 9 ",
                    "2018-01-13 21:00:00", "impostare sveglia")
        testExtract(u"impostare sveglia questa sera alle 21 ",
                    "2018-01-13 21:00:00", "impostare sveglia")
        testExtract(u"inserire appuntamento domani sera alle 23",
                    "2018-01-14 23:00:00", "inserire appuntamento")
        testExtract(u"inserire appuntamento domani alle 9 e mezza",
                    "2018-01-14 09:30:00", "inserire appuntamento")
        testExtract(u"inserire appuntamento domani sera alle 23 e 3 quarti",
                    "2018-01-14 23:45:00", "inserire appuntamento")

    def test_gender_it(self):
        """Grammatical gender is returned as "m" or "f"."""
        self.assertEqual(get_gender("mucca", lang="it"), "f")
        self.assertEqual(get_gender("cavallo", lang="it"), "m")
        self.assertEqual(get_gender("mucche", "le mucche", lang="it"), "f")
        self.assertEqual(get_gender("bue", "il bue mangia la erba",
                                    lang="it"), "m")
        self.assertEqual(get_gender("pesce", "il pesce nuota",
                                    lang="it"), "m")
        self.assertEqual(get_gender("tigre", lang="it"), "f")
        self.assertEqual(get_gender("uomini", "questi uomini mangiano pasta",
                                    lang="it"), "m")
        self.assertEqual(get_gender("ponte", "il ponte", lang="it"), "m")
        self.assertEqual(get_gender("ponte", u"questo ponte è caduto",
                                    lang="it"), "m")
        self.assertEqual(get_gender("scultrice", "questa scultrice famosa",
                                    lang="it"), "f")
        self.assertEqual(get_gender("scultore", "questo scultore famoso",
                                    lang="it"), "m")
        self.assertEqual(get_gender("scultori", "gli scultori rinascimentali",
                                    lang="it"), "m")
        self.assertEqual(get_gender("scultrici", "le scultrici moderne",
                                    lang="it"), "f")
# Run the test cases when this module is executed as a script.
if __name__ == "__main__":
    unittest.main()