Update Italian parsing methods

Add extract_datetime_it. Add tests for the Italian methods.
2018-01-27 15:31:36 +01:00 · 2018-01-27 15:31:36 +01:00 · c56b293d3d
parent 02eb1b8277
commit c56b293d3d
3 changed files with 1030 additions and 8 deletions
--- a/mycroft/util/lang/parse_it.py
+++ b/mycroft/util/lang/parse_it.py
@ -19,18 +19,18 @@
    TODO: numbers greater than 999999
    TODO: it_number_parse
    TODO: extract_datetime_it
    TODO: it_pruning
 """
-
+from datetime import datetime
 from dateutil.relativedelta import relativedelta
 from mycroft.util.lang.parse_common import is_numeric, look_for_fractions
 # Indefinite articles ["un", "una", "un'"] cannot be suppressed:
 # in Italian, "un cavallo" means both "a horse" and "one horse".
-it_articles = ["il", "lo", "la"]
+it_articles = ["il", "lo", "la", "i", "gli", "le"]
 it_numbers = {
    "zero": 0,
@ -57,13 +57,21 @@ it_numbers = {
    "diciotto": 18,
    "diciannove": 19,
    "venti": 20,
    "vent": 20,
    "trenta": 30,
    "trent": 30,
    "quaranta": 40,
    "quarant": 40,
    "cinquanta": 50,
    "cinquant": 50,
    "sessanta": 60,
    "sessant": 60,
    "settanta": 70,
    "settant": 70,
    "ottanta": 80,
    "ottant": 80,
    "novanta": 90,
    "novant": 90,
    "cento": 100,
    "duecento": 200,
    "trecento": 300,
@ -111,6 +119,37 @@ def isFractional_it(input_str):
    return False
 def extractnumber_long_it(word):
    """
    Convert a compound Italian number word into its integer value.

    The word is split into a prefix and a suffix that must both be keys
    of ``it_numbers``; the result is the sum of the two values, e.g.
        ventisette -> venti + sette -> 27
        quarantuno -> quarant + uno -> 41

    Every key that is a suffix of the word is tried, not only the first
    one found, so a short number word that happens to end a longer one
    ("otto" in "diciotto", "nove" in "diciannove") no longer hides the
    valid split.

    Args:
        word (str): a single word to convert

    Returns:
        (int): the value obtained when the whole word splits into two
               known number words
        False: when no such split exists, e.g. "qualcuno"
    """
    for number in it_numbers:
        if not word.endswith(number):
            continue
        # what is left once the suffix is removed must be a known number
        prefix = word[:len(word) - len(number)]
        if prefix in it_numbers:
            return it_numbers[prefix] + it_numbers[number]
    return False  # not a compound number, e.g. "qualcuno"
 def extractnumber_it(text):
    """
    Questa funzione prepara il testo dato per l'analisi rendendo
@ -171,6 +210,10 @@ def extractnumber_it(text):
            if look_for_fractions(aPieces):
                val = float(aPieces[0]) / float(aPieces[1])
        if not val:
            # cerca numero composto come ventuno ventitre centoventi"
            val = extractnumber_long_it(word)
        if val:
            if result is None:
                result = 0
@ -261,22 +304,25 @@ def normalize_it(text, remove_articles):
    words = text.split()  # this also removed extra spaces
    # Contractions are not common in IT
-    # Convert numbers into digits, e.g. "due" -> "2"
+    # Convert numbers into digits, e.g. "quarantadue" -> "42"
    normalized = ""
    i = 0
    while i < len(words):
        word = words[i]
        # remove articles
-        # Italian requires the article to define the gender of the next word
+        # Italian requires the article to define the gender
        if remove_articles and word in it_articles:
            i += 1
            continue
        # NOTE temporary , handle some numbers above >999
        if word in it_numbers:
            word = str(it_numbers[word])
-        # end temporary
+
        val = extractnumber_long_it(word)
        if val:
            word = str(val)
        normalized += " " + word
        i += 1
@ -285,6 +331,672 @@ def normalize_it(text, remove_articles):
    return normalized[1:]
 def extract_datetime_it(string, currentDate=None):
    """
    Extract a date and/or a time from an Italian sentence.

    The sentence is cleaned and split into words, then scanned twice: the
    first pass looks for date expressions (oggi, domani, week days, month
    names, "3 settimane", ...), the second one for time expressions
    (mezzogiorno, "10:30", "11 mattina", "10 minuti", ...).  Every word
    that has been understood is removed from the sentence.

    Args:
        string (str): the sentence to analyse
        currentDate (datetime): the reference moment used to resolve
            relative expressions; defaults to datetime.now()

    Returns:
        [datetime, str]: the extracted moment and what is left of the
            cleaned sentence once the date/time words are removed
        None: if the sentence is empty or contains no date/time
    """
    # timedelta covers the fixed-length units; relativedelta is only
    # needed for calendar-aware month/year arithmetic
    from datetime import timedelta

    def clean_string(s):
        """
            Strip punctuation, lower-case the text, replace accented
            vowels, drop noise words and turn the plurals used by the
            parser into their singular form.  Returns the list of words.
        """
        symbols = [".", ",", ";", "?", "!", u"º", u"ª", u"°"]
        for word in symbols:
            s = s.replace(word, "")

        s = s.lower()
        substitutions = ((u"á", "a"), (u"à", "a"), (u"è", "e'"),
                         (u"é", "e'"), (u"ì", "i"), (u"ù", "u"),
                         (u"ò", "o"), ("-", " "), ("_", ""))
        for old, new in substitutions:
            s = s.replace(old, new)

        noise_words = ["tra", "la", "del", "al", "il", "di",
                       "le", "per", "alle", "alla", "dai", "delle",
                       "a", "e'", "era", "questa", "questo", "e"]
        for word in noise_words:
            s = s.replace(" " + word + " ", " ")

        # Normalise plurals/variants to simplify the analysis.  This is
        # done on whole words only: a substring replacement would corrupt
        # unrelated words of the remaining sentence (signore -> signora).
        singular = {"secondi": "secondo", "minuti": "minuto",
                    "ore": "ora", "giorni": "giorno",
                    "settimane": "settimana", "mesi": "mese",
                    "anni": "anno", "mattino": "mattina",
                    "prossima": "prossimo", "questa": "questo",
                    "quarti": "quarto"}
        return [singular.get(token, token) for token in s.split()]

    def date_found():
        """True when at least one date or time element was recognised."""
        return found or \
            (
                datestr != "" or
                yearOffset != 0 or monthOffset != 0 or
                dayOffset is not False or hrOffset != 0 or
                hrAbs != 0 or minOffset != 0 or
                minAbs != 0 or secOffset != 0
            )

    def is_year(token):
        """True for a 4 digit token; a bare hour must not become a year."""
        return len(token) == 4 and token.isdigit()

    if string == "":
        return None
    if currentDate is None:
        currentDate = datetime.now()

    found = False
    daySpecified = False
    dayOffset = False  # False = no day given; an int is an offset in days
    monthOffset = 0
    yearOffset = 0
    dateNow = currentDate
    today = int(dateNow.strftime("%w"))  # 0 = Sunday ... 6 = Saturday
    currentYear = dateNow.year
    fromFlag = False
    datestr = ""  # "<month name> [<day> [<year>]]"
    hasYear = False
    timeQualifier = ""

    timeQualifiersList = ['mattina', 'pomeriggio', 'sera']
    markers = ['alle', 'in', 'questo',  'per', 'di']
    days = ['lunedi', 'martedi', 'mercoledi',
            'giovedi', 'venerdi', 'sabato', 'domenica']
    months = ['gennaio', 'febbraio', 'marzo', 'aprile', 'maggio', 'giugno',
              'luglio', 'agosto', 'settembre', 'ottobre', 'novembre',
              'dicembre']
    monthsShort = ['gen', 'feb', 'mar', 'apr', 'mag', 'giu', 'lug', 'ago',
                   'set', 'ott', 'nov', 'dic']
    # words that may follow "da"/"dopo" in "5 giorni da domani",
    # "10 settimane da giovedi", "2 mesi da luglio"
    validFollowups = days + months + monthsShort + \
        ["oggi", "domani", "prossimo", "passato", "ora"]

    words = clean_string(string)

    # ---- first pass: dates ----
    for idx, word in enumerate(words):
        if word == "":
            continue
        wordPrev = words[idx - 1] if idx > 0 else ""
        wordNext = words[idx + 1] if idx + 1 < len(words) else ""
        wordNextNext = words[idx + 2] if idx + 2 < len(words) else ""
        # an empty or partly numeric previous word ("5:30") is not a count
        prevIsNumber = wordPrev.isdigit()

        start = idx
        used = 0
        # save the time qualifier for the time pass
        if word in timeQualifiersList:
            timeQualifier = word
        # today, tomorrow, yesterday, day after tomorrow
        elif word == "oggi" and not fromFlag:
            dayOffset = 0
            used += 1
        elif word == "domani" and not fromFlag:
            dayOffset = 1
            used += 1
        elif word == "ieri" and not fromFlag:
            dayOffset -= 1
            used += 1
        elif word == "dopodomani" and not fromFlag:
            dayOffset += 2
            used += 1
        elif word == "dopo" and wordNext == "domani" and \
                not fromFlag:
            # the second day is added by the "da"/"dopo" handling below
            dayOffset += 1
            used += 2
        # "5 giorni"
        elif word == "giorno":
            if prevIsNumber:
                dayOffset += int(wordPrev)
                start -= 1
                used = 2
                if wordNext == "dopo" and wordNextNext == "domani":
                    dayOffset += 1
                    used += 2
        # "2 settimane", "prossima settimana", "settimana passata"
        elif word == "settimana" and not fromFlag:
            if prevIsNumber:
                dayOffset += int(wordPrev) * 7
                start -= 1
                used = 2
            elif wordPrev == "prossimo":
                dayOffset = 7
                start -= 1
                used = 2
            elif wordPrev == "passato":
                dayOffset = -7
                start -= 1
                used = 2
        # "10 mesi", next month, last month
        elif word == "mese" and not fromFlag:
            if prevIsNumber:
                monthOffset = int(wordPrev)
                start -= 1
                used = 2
            elif wordPrev == "prossimo":
                monthOffset = 1
                start -= 1
                used = 2
            elif wordPrev == "passato":
                monthOffset = -1
                start -= 1
                used = 2
        # "5 anni", next year, last year
        elif word == "anno" and not fromFlag:
            if prevIsNumber:
                yearOffset = int(wordPrev)
                start -= 1
                used = 2
            elif wordPrev == "prossimo":
                yearOffset = 1
                start -= 1
                used = 2
            elif wordPrev == "passato":
                yearOffset = -1
                start -= 1
                used = 2
        # Monday, Tuesday, ... and next Monday, last Tuesday, ...
        elif word in days and not fromFlag:
            d = days.index(word)
            dayOffset = (d + 1) - today
            used = 1
            if dayOffset < 0:
                dayOffset += 7
            if wordPrev == "prossimo":
                dayOffset += 7
                used += 1
                start -= 1
            elif wordPrev == "passato":
                dayOffset -= 7
                used += 1
                start -= 1
            # a trailing prossimo/passato is consumed but changes nothing
            if wordNext == "prossimo":
                used += 1
            elif wordNext == "passato":
                used += 1
        # "15 luglio", "luglio 15", "18 feb 2019"
        elif (word in months or word in monthsShort) and not fromFlag:
            try:
                m = months.index(word)
            except ValueError:
                m = monthsShort.index(word)
            used += 1
            datestr = months[m]
            hasYear = False
            if wordPrev.isdigit():
                datestr += " " + wordPrev
                start -= 1
                used += 1
                if is_year(wordNext):
                    datestr += " " + wordNext
                    used += 1
                    hasYear = True
            elif wordNext.isdigit():
                datestr += " " + wordNext
                used += 1
                if is_year(wordNextNext):
                    datestr += " " + wordNextNext
                    used += 1
                    hasYear = True

        # "5 giorni da domani", "10 settimane da giovedi prossimo"
        if (word == "da" or word == "dopo") and wordNext in validFollowups:
            used = 2
            fromFlag = True
            if wordNext == "domani":
                dayOffset += 1
            elif wordNext in days:
                d = days.index(wordNext)
                tmpOffset = (d + 1) - today
                used = 2
                if tmpOffset < 0:
                    tmpOffset += 7
                dayOffset += tmpOffset
            elif wordNextNext and wordNextNext in days:
                d = days.index(wordNextNext)
                tmpOffset = (d + 1) - today
                used = 3
                if wordNext == "prossimo":
                    tmpOffset += 7
                    used += 2
                    start -= 1
                elif wordNext == "passato":
                    tmpOffset -= 7
                    used += 1
                    start -= 1
                dayOffset += tmpOffset

        if used > 0:
            if start - 1 >= 0 and words[start - 1] == "questo":
                start -= 1
                used += 1

            # remove the parsed words; the bounds check keeps an
            # over-long span from raising or wrapping to the list end
            for i in range(used):
                if 0 <= start + i < len(words):
                    words[start + i] = ""

            if start - 1 >= 0 and words[start - 1] in markers:
                words[start - 1] = ""
            found = True
            daySpecified = True

    # ---- second pass: times ----
    hrOffset = 0
    minOffset = 0
    secOffset = 0
    hrAbs = 0  # -1 (with minAbs) means "only a relative offset was given"
    minAbs = 0

    for idx, word in enumerate(words):
        if word == "":
            continue

        wordPrevPrev = words[idx - 2] if idx > 1 else ""
        wordPrev = words[idx - 1] if idx > 0 else ""
        wordNext = words[idx + 1] if idx + 1 < len(words) else ""
        wordNextNext = words[idx + 2] if idx + 2 < len(words) else ""
        # TODO check whether a three word look-ahead is ever needed

        # noon, midnight, morning, afternoon, evening
        used = 0
        if word == "mezzogiorno":
            hrAbs = 12
            used += 1
        elif word == "mezzanotte":
            hrAbs = 24
            used += 1
        if word == "mezzo" and wordNext == "giorno":  # if stt splits the word
            hrAbs = 12
            used += 2
        elif word == "mezza" and wordNext == "notte":  # if stt splits the word
            hrAbs = 24
            used += 2
        elif word == "mattina":
            if hrAbs == 0:
                hrAbs = 8
            used += 1
            if wordNext.isdigit():  # mattina alle 5
                hrAbs = int(wordNext)
                used += 1
        elif word == "pomeriggio":
            if hrAbs == 0:
                hrAbs = 15
            used += 1
            if wordNext.isdigit():  # pomeriggio alle 5
                hrAbs = int(wordNext)
                used += 1
                if hrAbs < 12:
                    hrAbs += 12
        elif word == "sera":
            if hrAbs == 0:
                hrAbs = 19
            used += 1
            if wordNext.isdigit():  # sera alle 8
                hrAbs = int(wordNext)
                used += 1
                if hrAbs < 12:
                    hrAbs += 12
        # 5:00 am, 12:00 pm, 11 sera, 10 minuti ...
        elif word[0].isdigit():
            isTime = True
            strHH = ""
            strMM = ""
            remainder = ""
            if ':' in word:
                # split "HH:MM<remainder>"; the remainder starts at the
                # first character that does not belong to HH:MM
                stage = 0
                for i, char in enumerate(word):
                    if stage == 0 and char.isdigit():
                        strHH += char
                    elif stage == 0 and char == ":":
                        stage = 1
                    elif stage == 1 and char.isdigit():
                        strMM += char
                    else:
                        remainder = word[i:].replace(".", "")
                        break
                if remainder == "":
                    nextWord = wordNext.replace(".", "")
                    if nextWord == "am" or nextWord == "pm":
                        remainder = nextWord
                        used += 1
                    elif nextWord == "sera":
                        remainder = "pm"
                        used += 1
                    elif wordNext == "mattina":
                        remainder = "am"
                        used += 1
                    elif wordNext == "pomeriggio":
                        remainder = "pm"
                        used += 1
                    elif wordNext == "notte":
                        remainder = "pm"
                        used += 1
                    elif wordNext == "di" and wordNextNext == "notte":
                        if int(strHH) > 5:
                            remainder = "pm"
                        else:
                            remainder = "am"
                        used += 2
                    elif timeQualifier != "":
                        if int(strHH) <= 12 and \
                                (timeQualifier == "sera" or
                                 timeQualifier == "pomeriggio"):
                            strHH = str(int(strHH) + 12)
            else:
                # numbers without colon: 5 ore, 10 minuti, 3pm ...
                strNum = "".join(c for c in word if c.isdigit())
                remainder = "".join(c for c in word if not c.isdigit())

                if remainder == "":
                    remainder = wordNext.replace(".", "").lstrip().rstrip()

                if (
                        remainder == "pm" or
                        wordNext == "pm" or
                        remainder == "p.m." or
                        wordNext == "p.m."):
                    strHH = strNum
                    remainder = "pm"
                    used = 1
                elif (
                        remainder == "am" or
                        wordNext == "am" or
                        remainder == "a.m." or
                        wordNext == "a.m."):
                    strHH = strNum
                    remainder = "am"
                    used = 1
                elif not word.isdigit():
                    # e.g. "10km": every branch below needs int(word)
                    isTime = False
                elif (
                        int(word) > 100 and
                        (
                            wordPrev == "o" or
                            wordPrev == "oh"
                        )):
                    # 0800 hours (pronounced oh-eight-hundred)
                    strHH = int(word) // 100
                    strMM = int(word) - strHH * 100
                    if wordNext == "ora":
                        used += 1
                elif (
                        wordNext == "ora" and
                        word[0] != '0' and
                        (
                            int(word) < 100 or
                            int(word) > 2400
                        )):
                    # "3 ore": an offset, not a (military) time
                    hrOffset = int(word)
                    used = 2
                    isTime = False
                    hrAbs = -1
                    minAbs = -1
                elif wordNext == "mattina":
                    # "11 del mattino" -> "del" has been removed
                    hh = int(word)
                    used = 2
                    isTime = False
                    hrAbs = hh
                    minAbs = 00
                elif wordNext == "pomeriggio":
                    # "2 del pomeriggio" -> "del" has been removed
                    hh = int(word)
                    if hh < 12:
                        hh += 12
                    used = 2
                    isTime = False
                    hrAbs = hh
                    minAbs = 00
                elif wordNext == "sera":
                    # "8 di sera" -> "di" has been removed
                    hh = int(word)
                    if hh < 12:
                        hh += 12
                    used = 2
                    isTime = False
                    hrAbs = hh
                    minAbs = 00
                # half an hour: undici e mezza
                elif wordNext == "mezza":
                    hrAbs = int(word)
                    minAbs = 30
                    used = 2
                    isTime = False
                # quarters of an hour: dieci e tre quarti
                elif wordNext == "quarto":
                    minAbs = 15 * int(word)
                    used = 2
                    if minAbs > 45:  # discard impossible values
                        minAbs = 0
                        used -= 2
                    isTime = False
                elif wordNext == "minuto":
                    # "10 minuti"
                    minOffset = int(word)
                    used = 2
                    isTime = False
                    hrAbs = -1
                    minAbs = -1
                elif wordNext == "secondo":
                    # "5 secondi"
                    secOffset = int(word)
                    used = 2
                    isTime = False
                    hrAbs = -1
                    minAbs = -1
                elif int(word) > 100:
                    # "830" -> 8:30
                    strHH = int(word) // 100
                    strMM = int(word) - strHH * 100
                    if wordNext == "ora":
                        used += 1
                elif wordNext.isdigit():
                    # "10 30" -> 10:30
                    strHH = word
                    strMM = wordNext
                    used += 1
                    if wordNextNext == "ora":
                        used += 1
                elif wordNext == "in" and wordNextNext == "punto":
                    strHH = word
                    strMM = 00
                    used += 2
                else:
                    isTime = False

            strHH = int(strHH) if strHH else 0
            strMM = int(strMM) if strMM else 0
            if remainder == "pm" and strHH < 12:
                strHH += 12
            if remainder == "am" and strHH >= 12:
                strHH -= 12
            if strHH > 24 or strMM > 59:
                isTime = False
                used = 0
            if isTime:
                hrAbs = strHH * 1
                minAbs = strMM * 1
                used += 1

            if hrAbs <= 12 and (timeQualifier == "sera" or
                                timeQualifier == "pomeriggio"):
                hrAbs += 12

        if used > 0:
            # remove the parsed words from the sentence
            for i in range(used):
                if idx + i < len(words):
                    words[idx + i] = ""

            if wordPrev == "o" or wordPrev == "oh":
                words[idx - 1] = ""

            if wordPrev == "presto":
                hrOffset = -1
                words[idx - 1] = ""
            elif wordPrev == "tardi":
                hrOffset = 1
                words[idx - 1] = ""

            if idx > 0 and wordPrev in markers:
                words[idx - 1] = ""
            if idx > 1 and wordPrevPrev in markers:
                words[idx - 2] = ""

            found = True

    # check that we found a date
    if not date_found():
        return None

    if dayOffset is False:
        dayOffset = 0

    # ---- date manipulation ----
    # NOTE(review): everything below, relative hour/minute/second offsets
    # included, is applied to midnight of the reference day.
    extractedDate = dateNow.replace(microsecond=0,
                                    second=0,
                                    minute=0,
                                    hour=0)
    if datestr != "":
        # datestr is "<month name> [<day> [<year>]]", built in pass one
        dateParts = datestr.split()
        month = months.index(dateParts[0]) + 1
        day = int(dateParts[1]) if len(dateParts) > 1 else 1
        if hasYear:
            extractedDate = extractedDate.replace(year=int(dateParts[2]),
                                                  month=month,
                                                  day=day)
        else:
            temp = extractedDate.replace(month=month, day=day)
            if extractedDate < temp:
                extractedDate = temp.replace(year=currentYear)
            else:
                # the day is today or already gone: use next year
                extractedDate = temp.replace(year=currentYear + 1)

    if yearOffset != 0:
        extractedDate = extractedDate + relativedelta(years=yearOffset)
    if monthOffset != 0:
        extractedDate = extractedDate + relativedelta(months=monthOffset)
    if dayOffset != 0:
        extractedDate = extractedDate + timedelta(days=dayOffset)

    if hrAbs != -1 and minAbs != -1:
        extractedDate = extractedDate + timedelta(hours=hrAbs,
                                                  minutes=minAbs)
        if (hrAbs != 0 or minAbs != 0) and datestr == "":
            # a bare time that is already past means tomorrow
            if not daySpecified and dateNow > extractedDate:
                extractedDate = extractedDate + timedelta(days=1)
    if hrOffset != 0:
        extractedDate = extractedDate + timedelta(hours=hrOffset)
    if minOffset != 0:
        extractedDate = extractedDate + timedelta(minutes=minOffset)
    if secOffset != 0:
        extractedDate = extractedDate + timedelta(seconds=secOffset)

    # drop an "e" left alone between removed words; a missing neighbour
    # at either end of the sentence counts as removed
    for idx, word in enumerate(words):
        if word != "e":
            continue
        before = words[idx - 1] if idx > 0 else ""
        after = words[idx + 1] if idx + 1 < len(words) else ""
        if before == "" and after == "":
            words[idx] = ""

    resultStr = " ".join(words)
    resultStr = ' '.join(resultStr.split())
    return [extractedDate, resultStr]
 def get_gender_it(word, raw_string=""):
    """
    Questa potrebbe non essere utile.
@ -292,7 +1004,7 @@ def get_gender_it(word, raw_string=""):
    analizzare l'articolo che la precede e non la lettera
    con cui finisce la parola, ma sono presenti funzioni per
    la rimozione degli articoli dalla frase per semplificarne
-    l'analisi, in particolare se si rimuovono "i", "gli", "le"
+    l'analisi
    TODO:  verificare se utile
    """
--- a/mycroft/util/parse.py
+++ b/mycroft/util/parse.py
@ -105,6 +105,8 @@ def extract_datetime(text, anchorDate=None, lang="en-us"):
        return extract_datetime_en(text, anchorDate)
    elif lang_lower.startswith("pt"):
        return extract_datetime_pt(text, anchorDate)
    elif lang_lower.startswith("it"):
        return extract_datetime_it(text, anchorDate)
    return text
 # ==============================================================
--- a/test/unittests/util/test_parse_it.py
+++ b/test/unittests/util/test_parse_it.py
@ -0,0 +1,308 @@
 # -*- coding: utf-8  -*-
 #
 # Copyright 2017 Mycroft AI Inc.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
 # You may obtain a copy of the License at
 #
 #    http://www.apache.org/licenses/LICENSE-2.0
 #
 # Unless required by applicable law or agreed to in writing, software
 # distributed under the License is distributed on an "AS IS" BASIS,
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
 #
 import unittest
 from datetime import datetime
 from mycroft.util.parse import get_gender
 from mycroft.util.parse import extract_datetime
 from mycroft.util.parse import extractnumber
 from mycroft.util.parse import normalize
 class TestNormalize(unittest.TestCase):
    """
        Test cases for Italian parsing
    """
    def test_articles_it(self):
        self.assertEqual(normalize(u"questo è il test",
                                   lang="it", remove_articles=True),
                         u"questo è test")
        self.assertEqual(normalize(u"questa è la frase",
                                   lang="it", remove_articles=True),
                         u"questa è frase")
        self.assertEqual(normalize(u"questo è lo scopo", lang="it",
                                   remove_articles=True),
                         u"questo è scopo")
        self.assertEqual(normalize(u"questo è il test extra",
                                   lang="it", remove_articles=False),
                         u"questo è il test extra")
    def test_extractnumber_it(self):
        self.assertEqual(extractnumber(u"questo è il primo test",
                                       lang="it"), 1)
        self.assertEqual(extractnumber(u"questo è il 2 test",
                                       lang="it"), 2)
        self.assertEqual(extractnumber(u"questo è il secondo test",
                                       lang="it"), 2)
        self.assertEqual(extractnumber(u"questo è un terzo di test",
                                       lang="it"), 1.0 / 3.0)
        self.assertEqual(extractnumber(u"questo è test numero 4",
                                       lang="it"), 4)
        self.assertEqual(extractnumber("un terzo di tazza",
                                       lang="it"), 1.0 / 3.0)
        self.assertEqual(extractnumber("tre tazze",
                                       lang="it"), 3)
        self.assertEqual(extractnumber("1/3 tazze",
                                       lang="it"), 1.0 / 3.0)
        self.assertEqual(extractnumber("un quarto di tazza",
                                       lang="it"), 0.25)
        self.assertEqual(extractnumber("1/4 tazza",
                                       lang="it"), 0.25)
        self.assertEqual(extractnumber("2/3 tazza",
                                       lang="it"), 2.0 / 3.0)
        self.assertEqual(extractnumber("3/4 tazza",
                                       lang="it"), 3.0 / 4.0)
        self.assertEqual(extractnumber("1 e 1/4 tazza",
                                       lang="it"), 1.25)
        self.assertEqual(extractnumber("1 tazza e mezzo",
                                       lang="it"), 1.5)
        self.assertEqual(extractnumber("una tazza e mezzo",
                                       lang="it"), 1.5)
        self.assertEqual(extractnumber("una e mezza tazza",
                                       lang="it"), 1.5)
        self.assertEqual(extractnumber("una e una mezza tazza",
                                       lang="it"), 1.5)
        self.assertEqual(extractnumber("tre quarti tazza",
                                       lang="it"), 3.0 / 4.0)
        self.assertEqual(extractnumber("tre quarto tazza",
                                       lang="it"), 3.0 / 4.0)
        self.assertEqual(extractnumber("sette punto cinque",
                                       lang="it"), 7.5)
        self.assertEqual(extractnumber("sette punto 5",
                                       lang="it"), 7.5)
        self.assertEqual(extractnumber("sette e mezzo",
                                       lang="it"), 7.5)
        self.assertEqual(extractnumber("sette e ottanta",
                                       lang="it"), 7.80)
        self.assertEqual(extractnumber("sette e otto",
                                       lang="it"), 7.8)
        self.assertEqual(extractnumber("sette e zero otto",
                                       lang="it"), 7.08)
        self.assertEqual(extractnumber("sette e zero zero otto",
                                       lang="it"), 7.008)
        self.assertEqual(extractnumber("venti tredicesimi",
                                       lang="it"), 20.0 / 13.0)
        self.assertEqual(extractnumber("sei virgola sessanta sei",
                                       lang="it"), 6.66)
        self.assertEqual(extractnumber("sei virgola sessantasei",
                                       lang="it"), 6.66)
        self.assertEqual(extractnumber("seicento sessanta  sei",
                                       lang="it"), 666)
        self.assertEqual(extractnumber("seicento punto zero sei",
                                       lang="it"), 600.06)
        self.assertEqual(extractnumber("seicento punto zero zero sei",
                                       lang="it"), 600.006)
        self.assertEqual(extractnumber("seicento punto zero zero zero sei",
                                       lang="it"), 600.0006)
        self.assertEqual(extractnumber("tre decimi ",
                                       lang="it"), 0.30000000000000004)
        self.assertEqual(extractnumber("dodici centesimi",
                                       lang="it"), 0.12)
        self.assertEqual(extractnumber("cinque e quaranta due millesimi",
                                       lang="it"), 5.042)
        self.assertEqual(extractnumber("mille e uno",
                                       lang="it"), 1001)
        self.assertEqual(extractnumber("due mila venti due dollari ",
                                       lang="it"), 2022)
        self.assertEqual(extractnumber(
            "cento quattordici mila quattrocento undici dollari ",
            lang="it"), 114411)
        self.assertEqual(extractnumber("ventitre dollari ", lang="it"), 23)
        self.assertEqual(extractnumber("quarantacinque minuti ",
                                       lang="it"), 45)
        self.assertEqual(extractnumber("ventuno anni ",
                                       lang="it"), 21)
        self.assertEqual(extractnumber("ventotto euro ",
                                       lang="it"), 28)
        self.assertEqual(extractnumber("dodici e quarantacinque ",
                                       lang="it"), 12.45)
        self.assertEqual(extractnumber("quarantotto euro ",
                                       lang="it"), 48)
        self.assertEqual(extractnumber("novantanove euro ",
                                       lang="it"), 99)
        self.assertEqual(extractnumber("avvisa se qualcuno arriva ",
                                       lang="it"), False)
    def test_spaces_it(self):
        self.assertEqual(normalize(u"questo   e'  il    test",
                                   lang="it"), u"questo e' test")
        self.assertEqual(normalize(u"questo   è    un    test  ",
                                   lang="it"), u"questo è 1 test")
        self.assertEqual(normalize(u"un  altro test  ",
                                   lang="it"), u"1 altro test")
        self.assertEqual(normalize(u"questa è un'  altra amica   ", lang="it",
                                   remove_articles=False),
                         u"questa è 1 altra amica")
        self.assertEqual(normalize(u"questo   è  un    test   ", lang="it",
                                   remove_articles=False), u"questo è 1 test")
    def test_numbers_it(self):
        self.assertEqual(normalize(u"questo è il test uno due tre",
                                   lang="it"), u"questo è test 1 2 3")
        self.assertEqual(normalize(u"è un test sette otto nove",
                                   lang="it"), u"è 1 test 7 8 9")
        self.assertEqual(normalize("test zero dieci undici dodici tredici",
                                   lang="it"), "test 0 10 11 12 13")
        self.assertEqual(normalize("test mille seicento sessanta e sei",
                                   lang="it", remove_articles=False),
                         "test 1000 600 60 e 6")
        self.assertEqual(normalize("test sette e mezzo",
                                   lang="it", remove_articles=False),
                         "test 7 e mezzo")
        self.assertEqual(normalize("test due punto nove",
                                   lang="it"), "test 2 punto 9")
        self.assertEqual(normalize("test cento e nove",
                                   lang="it", remove_articles=False),
                         "test 100 e 9")
        self.assertEqual(normalize("test venti e 1",
                                   lang="it"), "test 20 e 1")
        self.assertEqual(normalize("test ventuno e ventisette",
                                   lang="it"), "test 21 e 27")
    def test_extractdatetime_it(self):
        def extractWithFormat(text):
            date = datetime(2018, 01, 13, 00, 00)
            [extractedDate, leftover] = extract_datetime(text, date,
                                                         lang="it")
            extractedDate = extractedDate.strftime("%Y-%m-%d %H:%M:%S")
            return [extractedDate, leftover]
        def testExtract(text, expected_date, expected_leftover):
            res = extractWithFormat(text)
            self.assertEqual(res[0], expected_date)
            self.assertEqual(res[1], expected_leftover)
        testExtract(u"quale giorno è oggi",
                    "2018-01-13 00:00:00", u"quale giorno")
        testExtract(u"che giorno è domani",
                    "2018-01-14 00:00:00", u"che giorno")
        testExtract(u"che giorno era ieri",
                    "2018-01-12 00:00:00", u"che giorno")
        testExtract(u"che giorno è dopo domani",
                    "2018-01-15 00:00:00", u"che giorno")
        testExtract(u"fissare la cena tra 5 giorni",
                    "2018-01-18 00:00:00", u"fissare cena")
        testExtract(u"Come è il tempo per dopodomani",
                    "2018-01-15 00:00:00", u"come tempo")
        testExtract(u"ricordami alle 22:45",
                    "2018-01-13 22:45:00", u"ricordami")
        testExtract(u"Come è il tempo venerdì mattina",
                    "2018-01-19 08:00:00", "come tempo")
        testExtract(u"Ricordami di chiamare la mamma"
                    u" in 8 settimane e 2 giorni.",
                    "2018-03-12 00:00:00", u"ricordami chiamare mamma")
        testExtract(u"Gioca a briscola 2 giorni dopo venerdì",
                    "2018-01-21 00:00:00", u"gioca briscola")
        testExtract(u"Inizia le pulizie alle 15:45 di giovedì",
                    "2018-01-18 15:45:00", u"inizia pulizie")
        testExtract("lunedi compra formaggio",
                    "2018-01-15 00:00:00", u"compra formaggio")
        testExtract("suona musica compleanno tra 5 anni da oggi",
                    "2023-01-13 00:00:00", "suona musica compleanno")
        testExtract(u"Invia Skype alla mamma alle 12:45 di giovedì prossimo.",
                    "2018-01-18 12:45:00", u"invia skype mamma")
        testExtract(u"Come è il tempo questo venerdì?",
                    "2018-01-19 00:00:00", u"come tempo")
        testExtract(u"Come è il tempo questo venerdì pomeriggio?",
                    "2018-01-19 15:00:00", u"come tempo")
        testExtract(u"Come è il tempo questo venerdì a mezza notte?",
                    "2018-01-20 00:00:00", u"come tempo")
        testExtract(u"Come è il tempo questo venerdì a mezzogiorno?",
                    "2018-01-19 12:00:00", "come tempo")
        testExtract(u"Come è il tempo questo venerdì alle 11 del mattino?",
                    "2018-01-19 11:00:00", "come tempo")
        testExtract("Ricordami di chiamare mia madre il 3 agosto.",
                    "2018-08-03 00:00:00", "ricordami chiamare mia madre")
        testExtract(u"comprare fragole il 13 maggio",
                    "2018-05-13 00:00:00", "comprare fragole")
        testExtract(u"fare acquisti il 13 maggio",
                    "2018-05-13 00:00:00", "fare acquisti")
        testExtract(u"compra le candele il 1° maggio",
                    "2018-05-01 00:00:00", "compra candele")
        testExtract(u"bere birra il 13 maggio",
                    "2018-05-13 00:00:00", "bere birra")
        testExtract(u"Come è il tempo 1 giorno dopo domani?",
                    "2018-01-15 00:00:00", "come tempo")
        testExtract(u"Come è il tempo alle ore 0700?",
                    "2018-01-13 07:00:00", "come tempo ora")
        testExtract(u"Come è il tempo domani alle 7 in punto?",
                    "2018-01-14 07:00:00", "come tempo")
        testExtract(u"Come è il tempo domani alle 2 del pomeriggio",
                    "2018-01-14 14:00:00", "come tempo")
        testExtract(u"Come è il tempo domani pomeriggio alle 2",
                    "2018-01-14 14:00:00", "come tempo")
        testExtract(u"Come è il tempo domani per le 2:00",
                    "2018-01-14 02:00:00", "come tempo")
        testExtract(u"Come è il tempo alle 2 del pomeriggio di \
                    venerdì prossimo?",
                    "2018-01-19 14:00:00", u"come tempo")
        testExtract(u"Ricordami di svegliarmi tra 4 anni",
                    "2022-01-13 00:00:00", u"ricordami svegliarmi")
        testExtract(u"Ricordami di svegliarmi tra 4 anni e 4 giorni",
                    "2022-01-17 00:00:00", u"ricordami svegliarmi")
        testExtract(u"Dormi 3 giorni da domani.",
                    "2018-01-17 00:00:00", u"dormi")
        testExtract(u"segna appuntamento tra 2 settimane e 6 giorni \
                    dopo sabato",
                    "2018-02-02 00:00:00", u"segna appuntamento")
        testExtract(u"La festa inizia alle 8 di sera di giovedì",
                    "2018-01-18 20:00:00", u"la festa inizia")
        testExtract(u"Come è il meteo 3 tra giorni?",
                    "2018-01-16 00:00:00", u"come meteo")
        testExtract(u"fissa appuntamento dicembre 3",
                    "2018-12-03 00:00:00", "fissa appuntamento")
        testExtract(u"incontriamoci questa sera alle 8 ",
                    "2018-01-13 20:00:00", "incontriamoci")
        testExtract(u"incontriamoci alle 8 questa sera",
                    "2018-01-13 20:00:00", "incontriamoci")
        testExtract(u"impostare sveglia questa sera alle 9 ",
                    "2018-01-13 21:00:00", "impostare sveglia")
        testExtract(u"impostare sveglia questa sera alle 21 ",
                    "2018-01-13 21:00:00", "impostare sveglia")
        testExtract(u"inserire appuntamento domani sera alle 23",
                    "2018-01-14 23:00:00", "inserire appuntamento")
        testExtract(u"inserire appuntamento domani alle 9 e mezza",
                    "2018-01-14 09:30:00", "inserire appuntamento")
        testExtract(u"inserire appuntamento domani sera alle 23 e 3 quarti",
                    "2018-01-14 23:45:00", "inserire appuntamento")
    def test_gender_it(self):
        self.assertEqual(get_gender("mucca", lang="it"), "f")
        self.assertEqual(get_gender("cavallo", lang="it"), "m")
        self.assertEqual(get_gender("mucche", "le mucche", lang="it"), "f")
        self.assertEqual(get_gender("bue", "il bue mangia la erba",
                                    lang="it"), "m")
        self.assertEqual(get_gender("pesce", "il pesce nuota",
                                    lang="it"), "m")
        self.assertEqual(get_gender("tigre", lang="it"), "f")
        self.assertEqual(get_gender("uomini", "questi uomini mangiano pasta",
                                    lang="it"), "m")
        self.assertEqual(get_gender("ponte", "il ponte", lang="it"), "m")
        self.assertEqual(get_gender("ponte", u"questo ponte è caduto",
                                    lang="it"), "m")
        self.assertEqual(get_gender("scultrice", "questa scultrice famosa",
                                    lang="it"), "f")
        self.assertEqual(get_gender("scultore", "questo scultore famoso",
                                    lang="it"), "m")
        self.assertEqual(get_gender("scultori", "gli scultori rinascimentali",
                                    lang="it"), "m")
        self.assertEqual(get_gender("scultrici", "le scultrici moderne",
                                    lang="it"), "f")
 # Run the test cases with unittest's runner when this file is executed
 # directly as a script.
 if __name__ == "__main__":
    unittest.main()