mycroft-core/mycroft/util/lang/parse_it.py

# -*- coding: utf-8 -*-
#
# Copyright 2017 Mycroft AI Inc.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#    http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
"""
    Parse functions for Italian (IT-IT)

"""

import collections
from datetime import datetime
from dateutil.relativedelta import relativedelta
from mycroft.util.lang.parse_common import is_numeric, look_for_fractions, \
    extract_numbers_generic
from mycroft.util.lang.format_it import LONG_SCALE_IT, SHORT_SCALE_IT, \
    pronounce_number_it

# Ordinal names keyed by numeric value, short scale (1e9 = miliardo,
# 1e12 = trilione, ...).  Every name is stored in its masculine singular
# form ending in '-o'; isFractional_it() relies on that ending when it
# normalizes plurals before looking a word up.
SHORT_ORDINAL_STRING_IT = {
    1: 'primo',
    2: 'secondo',
    3: 'terzo',
    4: 'quarto',
    5: 'quinto',
    6: 'sesto',
    7: 'settimo',
    8: 'ottavo',
    9: 'nono',
    10: 'decimo',
    11: 'undicesimo',
    12: 'dodicesimo',
    13: 'tredicesimo',
    14: 'quattordicesimo',
    15: 'quindicesimo',
    16: 'sedicesimo',
    17: 'diciassettesimo',
    18: 'diciottesimo',
    19: 'diciannovesimo',
    20: 'ventesimo',
    30: 'trentesimo',
    40: 'quarantesimo',
    50: 'cinquantesimo',
    60: 'sessantesimo',
    70: 'settantesimo',
    80: 'ottantesimo',
    90: 'novantesimo',
    1e2: 'centesimo',
    1e3: 'millesimo',
    1e6: 'milionesimo',
    1e9: 'miliardesimo',
    1e12: 'trilionesimo',
    1e15: 'quadrilionesimo',
    1e18: 'quintilionesimo',
    1e21: 'sestilionesimo',
    1e24: 'settilionesimo',
    1e27: 'ottilionesimo',
    1e30: 'nonilionesimo',
    1e33: 'decilionesimo'
    # TODO > 1e33
}

# Ordinal names keyed by numeric value, long scale.
# For the entries above 1e12 only the word ending was adapted from the
# short-scale table; they still have to be reviewed once debugging is done.
# NOTE(review): unlike SHORT_ORDINAL_STRING_IT there is no 1e9
# ('miliardesimo') entry here - confirm whether that is intentional.
LONG_ORDINAL_STRING_IT = {
    1: 'primo',
    2: 'secondo',
    3: 'terzo',
    4: 'quarto',
    5: 'quinto',
    6: 'sesto',
    7: 'settimo',
    8: 'ottavo',
    9: 'nono',
    10: 'decimo',
    11: 'undicesimo',
    12: 'dodicesimo',
    13: 'tredicesimo',
    14: 'quattordicesimo',
    15: 'quindicesimo',
    16: 'sedicesimo',
    17: 'diciassettesimo',
    18: 'diciottesimo',
    19: 'diciannovesimo',
    20: 'ventesimo',
    30: 'trentesimo',
    40: 'quarantesimo',
    50: 'cinquantesimo',
    60: 'sessantesimo',
    70: 'settantesimo',
    80: 'ottantesimo',
    90: 'novantesimo',
    1e2: 'centesimo',
    1e3: 'millesimo',
    1e6: 'milionesimo',
    1e12: 'bilionesimo',
    1e18: 'trilionesimo',
    1e24: 'quadrilionesimo',
    1e30: 'quintilionesimo',
    1e36: 'sestilionesimo',
    1e42: 'settilionesimo',
    1e48: 'ottilionesimo',
    1e54: 'nonilionesimo',
    1e60: 'decilionesimo'
    # TODO > 1e60
}

# Definite articles that normalize_it() may drop.
# Indefinite articles ['un', 'una', 'un\''] cannot be suppressed:
# in Italian, 'un cavallo' means both 'a horse' and 'one horse'.
ARTICLES_IT = ['il', 'lo', 'la', 'i', 'gli', 'le']

# Italian number words mapped to their numeric value.
# Besides the plain cardinals the table holds the elided tens used in
# compounds ('vent', 'trent', ...), the first two ordinals, the halves and
# a few collective nouns ('paio', 'dozzina', 'centinaio', ...).
STRING_NUM_ITA = {
    'zero': 0,
    'un': 1,
    'uno': 1,
    'una': 1,
    'un\'': 1,
    'due': 2,
    'tre': 3,
    'quattro': 4,
    'cinque': 5,
    'sei': 6,
    'sette': 7,
    'otto': 8,
    'nove': 9,
    'dieci': 10,
    'undici': 11,
    'dodici': 12,
    'tredici': 13,
    'quattordici': 14,
    'quindici': 15,
    'sedici': 16,
    'diciassette': 17,
    'diciotto': 18,
    'diciannove': 19,
    # tens, each followed by its elided form (as in 'ventuno', 'trentotto')
    'venti': 20,
    'vent': 20,
    'trenta': 30,
    'trent': 30,
    'quaranta': 40,
    'quarant': 40,
    'cinquanta': 50,
    'cinquant': 50,
    'sessanta': 60,
    'sessant': 60,
    'settanta': 70,
    'settant': 70,
    'ottanta': 80,
    'ottant': 80,
    'novanta': 90,
    'novant': 90,
    # hundreds and thousands
    'cento': 100,
    'duecento': 200,
    'trecento': 300,
    'quattrocento': 400,
    'cinquecento': 500,
    'seicento': 600,
    'settecento': 700,
    'ottocento': 800,
    'novecento': 900,
    'mille': 1000,
    'mila': 1000,
    'centomila': 100000,
    'milione': 1000000,
    'miliardo': 1000000000,
    # ordinals and fractions that are always recognized
    'primo': 1,
    'secondo': 2,
    'mezzo': 0.5,
    'mezza': 0.5,
    # collective nouns, singular and plural
    'paio': 2,
    'decina': 10,
    'decine': 10,
    'dozzina': 12,
    'dozzine': 12,
    'centinaio': 100,
    'centinaia': 100,
    'migliaio': 1000,
    'migliaia': 1000
}


def isFractional_it(input_str, short_scale=False):
    """
    Tell whether a single Italian word names a fraction.

    The word is lower-cased and, when it is longer than two characters and
    ends in 'i', its ending is turned into 'o' so that plurals such as
    'quarti' match the singular ordinal 'quarto'.

    Args:
        input_str (str): the word to check
        short_scale (bool): look the word up in the short-scale ordinals
                            if True, in the long-scale ordinals if False
    Returns:
        (bool) or (float): False if the word is not a fraction, otherwise
                           the value of the fraction (1 / denominator)

    """
    candidate = input_str.lower()
    if len(candidate) > 2 and candidate.endswith('i'):
        candidate = candidate[:-1] + "o"  # plural -> singular

    if short_scale:
        ordinal_names = SHORT_ORDINAL_STRING_IT
    else:
        ordinal_names = LONG_ORDINAL_STRING_IT

    # 'primo' and 'secondo' are left out: they are not used as denominators
    denominators = {"intero": 1, "mezza": 2, "mezzo": 2}
    denominators.update({name: value
                         for value, name in ordinal_names.items()
                         if value > 2})

    denominator = denominators.get(candidate)
    if denominator is None:
        return False
    return 1.0 / denominator


def extractnumber_long_it(word):
    """
    Convert a long Italian number written as a single word into an integer.

    The word is rewritten step by step into an arithmetic expression made
    of decimal digits, '+' and '*' (e.g. 'diecimilaquarantuno' becomes
    '10*1000+40+1'), which is then evaluated.  Ordinal endings
    (-esimo, -esimi, ...) are reduced to the cardinal form first.
    Multiples up to 'triliardi' (1e21) are supported.

    examples:
        milleventisette -> 1027
        diecimilaquarantuno -> 10041
        centottomiladuecentotredici -> 108213
    Args:
         word (str): the word to convert in number
    Returns:
         (bool) or (int): The extracted number or False if the word is
                          not a number
    """

    units = {'zero': 0, 'uno': 1, 'due': 2, 'tre': 3, 'quattro': 4,
             'cinque': 5, 'sei': 6, 'sette': 7, 'otto': 8, 'nove': 9}

    tens = {'dieci': 10, 'venti': 20, 'trenta': 30, 'quaranta': 40,
            'cinquanta': 50, 'sessanta': 60, 'settanta': 70, 'ottanta': 80,
            'novanta': 90}

    tens_short = {'vent': 20, 'trent': 30, 'quarant': 40, 'cinquant': 50,
                  'sessant': 60, 'settant': 70, 'ottant': 80, 'novant': 90}

    nums_long = {'undici': 11, 'dodici': 12, 'tredici': 13, 'quattordici': 14,
                 'quindici': 15, 'sedici': 16, 'diciassette': 17,
                 'diciotto': 18, 'diciannove': 19}

    # (value, plural name) of the supported multiples, largest first
    multipli_it = (
        (10 ** 21, 'triliardi'),      # zetta
        (10 ** 18, 'trilioni'),       # exa
        (10 ** 15, 'biliardi'),       # peta
        (10 ** 12, 'bilioni'),        # tera
        (10 ** 9, 'miliardi'),        # giga
        (10 ** 6, 'milioni')          # mega
    )

    multiplier = {}
    un_multiplier = {}
    for num, plural in multipli_it:
        multiplier[plural] = num
        # singular form: '*liardi' -> 'un*liardo', '*lioni' -> 'un*lione'
        if plural[-5:-1] == 'iard':
            un_multiplier['un' + plural[:-1] + 'o'] = num
        else:
            un_multiplier['un' + plural[:-1] + 'e'] = num

    # reduce singular or plural ordinals (-esimo, -esimi) to the cardinal
    if word[-5:-1] == 'esim':
        base = word[:-5]
        normalize_ita3 = {'tre': '', 'ttr': 'o', 'sei': '', 'ott': 'o'}
        normalize_ita2 = {'un': 'o', 'du': 'e', 'qu': 'e', 'tt': 'e',
                          'ov': 'e'}

        if base[-3:] in normalize_ita3:
            base += normalize_ita3[base[-3:]]
        elif base[-2:] in normalize_ita2:
            base += normalize_ita2[base[-2:]]

        word = base

    for item in un_multiplier:
        components = word.split(item, 1)
        if len(components) == 2:
            if not components[0]:  # the word starts with 'un' + multiple
                if not components[1]:  # unmilione
                    word = str(int(un_multiplier[item]))
                else:                  # unmilione + x
                    word = str(int(un_multiplier[item]) +
                               extractnumber_long_it(components[1]))

    for item in multiplier:
        components = word.split(item, 1)
        if len(components) == 2:
            if not components[0]:  # the word starts with the multiple
                word = str(int(multiplier[item]) +
                           extractnumber_long_it(components[1]))
            else:
                if not components[1]:
                    word = str(extractnumber_long_it(components[0])) + '*' \
                        + str(int(multiplier[item]))
                else:
                    word = str(extractnumber_long_it(components[0])) + '*' \
                        + str(int(multiplier[item])) + '+' \
                        + str(extractnumber_long_it(components[1]))

    for item in tens:
        word = word.replace(item, '+' + str(tens[item]))

    for item in tens_short:
        word = word.replace(item, '+' + str(tens_short[item]))

    for item in nums_long:
        word = word.replace(item, '+' + str(nums_long[item]))

    # Units go before 'cento': in 'centotto' the final 'o' belongs to
    # 'otto', and replacing 'cento' first would leave an unparsable 'tto'.
    for item in units:
        word = word.replace(item, '+' + str(units[item]))

    word = word.replace('cento', '+1xx')
    word = word.replace('cent', '+1xx')
    word = word.replace('mille', '+1000')   # unmilionemille
    word = word.replace('mila', '*1000')   # unmilioneduemila

    # resolve every hundreds marker: '+100' after a round number,
    # otherwise a factor of the digit that precedes it
    occorrenze = word.count('+1xx')
    for _ in range(0, occorrenze):
        components = word.rsplit('+1xx', 1)
        if len(components[0]) > 1 and components[0].endswith('0'):
            word = components[0] + '+100' + components[1]
        else:
            word = components[0] + '*100' + components[1]

    # everything in front of the last '*1000' is the number of thousands
    components = word.rsplit('*1000', 1)
    if len(components) == 2:
        if components[0].startswith('*'):  # centomila
            components[0] = components[0][1:]
        word = str(extractnumber_long_it(components[0])) + \
            '*1000' + str(components[1])

    # drop a leading operator left over by the replacements
    if word.startswith('*') or word.startswith('+'):
        word = word[1:]

    # Evaluate the sum of products.  Any fragment that is not made of
    # decimal digits means the word was not a number: report False rather
    # than letting int() raise on it.
    total = 0
    for addend in word.split('+'):
        product = 1
        for factor in addend.split('*'):
            if not factor.isdecimal():
                return False
            product *= int(factor)
        total += product
    return total


def extractnumber_it(text, short_scale=False, ordinals=False):
    """
    This function extracts a number from a text string,
    handles pronunciations in long scale and short scale

    https://en.wikipedia.org/wiki/Names_of_large_numbers

    The vocabulary is built on a private copy of STRING_NUM_ITA, so the
    ordinal and scale names enabled by one call do not leak into later
    calls made with different flags.  The nested parses used for
    'X e Y' and 'X punto/virgola Y' receive the same flags as the caller.

    Args:
        text (str): the string to normalize
        short_scale (bool): use short scale if True, long scale if False
        ordinals (bool): consider ordinal numbers, third=3 instead of 1/3
    Returns:
        (int) or (float) or False: The extracted number or False if no number
                                   was found

    """

    # per-call vocabulary; the module-level table must stay untouched
    string_num_ita = dict(STRING_NUM_ITA)

    # first, second...
    if ordinals:
        if short_scale:
            ordinal_names = SHORT_ORDINAL_STRING_IT
        else:
            ordinal_names = LONG_ORDINAL_STRING_IT
        for num in ordinal_names:
            string_num_ita[ordinal_names[num]] = num

    # negate next number (-2 = 0 - 2)
    negatives = ['meno']  # 'negativo' is not usual in Italian

    # multiply the previous number (one hundred = 1 * 100)
    multiplies = ['decina', 'decine', 'dozzina', 'dozzine',
                  'centinaia', 'centinaio', 'migliaia', 'migliaio', 'mila']

    # split sentence parse separately and sum ( 2 and a half = 2 + 0.5 )
    fraction_marker = [' e ']

    # decimal marker ( 1 point 5 = 1 + 0.5)
    decimal_marker = [' punto ', ' virgola ']

    if short_scale:
        scale_names = SHORT_SCALE_IT
    else:
        scale_names = LONG_SCALE_IT
    for num in scale_names:
        num_string = scale_names[num]
        string_num_ita[num_string] = num
        multiplies.append(num_string)

    # 2 e 3/4 and other cases
    for separator in fraction_marker:
        components = text.split(separator)
        zeros = 0

        if len(components) == 2:
            # count zeros in fraction part
            sub_components = components[1].split(' ')
            for element in sub_components:
                if element == 'zero' or element == '0':
                    zeros += 1
                else:
                    break
            # ensure first is not a fraction and second is a fraction
            num1 = extractnumber_it(components[0], short_scale, ordinals)
            num2 = extractnumber_it(components[1], short_scale, ordinals)
            if num1 is not None and num2 is not None \
                    and num1 >= 1 and 0 < num2 < 1:
                return num1 + num2
            # sette e quaranta  sette e zero zero due
            elif num1 is not None and num2 is not None \
                    and num1 >= 1 and num2 > 1:
                return num1 + num2 / pow(10, len(str(num2)) + zeros)

    # 2 punto 5
    for separator in decimal_marker:
        zeros = 0
        # count zeros in fraction part
        components = text.split(separator)

        if len(components) == 2:
            sub_components = components[1].split(' ')
            for element in sub_components:
                if element == 'zero' or element == '0':
                    zeros += 1
                else:
                    break

            number = int(extractnumber_it(components[0], short_scale,
                                          ordinals))
            decimal = int(extractnumber_it(components[1], short_scale,
                                           ordinals))
            if number is not None and decimal is not None:
                if '.' not in str(decimal):
                    return number + decimal / pow(10,
                                                  len(str(decimal)) + zeros)

    all_words = text.split()
    val = False
    prev_val = None
    to_sum = []
    for idx, word in enumerate(all_words):

        if not word:
            continue
        prev_word = all_words[idx - 1] if idx > 0 else ''
        next_word = all_words[idx + 1] if idx + 1 < len(all_words) else ''

        # is this word already a number ?
        if is_numeric(word):
            val = float(word)

        # is this word the name of a number ?
        if word in string_num_ita:
            val = string_num_ita[word]

        #  tre quarti  un quarto  trenta secondi
        if isFractional_it(word) and prev_val:
            if word[:-1] == 'second' and not ordinals:
                val = prev_val * 2
            else:
                val = prev_val

        # is the prev word a number and should we multiply it?
        # twenty hundred, six hundred
        if word in multiplies:
            if not prev_val:
                prev_val = 1
            val = prev_val * val

        # is this a spoken fraction?
        # mezza tazza
        if val is False:
            val = isFractional_it(word, short_scale=short_scale)

        # 2 quinti
        if not ordinals:
            next_value = isFractional_it(next_word, short_scale=short_scale)
            if next_value:
                if not val:
                    val = 1
                val = val * next_value

        # is this a negative number?
        if val and prev_word and prev_word in negatives:
            val = 0 - val

        if not val:
            val = extractnumber_long_it(word)

        # let's make sure it isn't a fraction
        if not val:
            # look for fractions like '2/3'
            all_pieces = word.split('/')
            if look_for_fractions(all_pieces):
                val = float(all_pieces[0]) / float(all_pieces[1])
        else:
            prev_val = val
            # handle long numbers
            # six hundred sixty six
            # two million five hundred thousand
            if word in multiplies and next_word not in multiplies:
                to_sum.append(val)
                val = 0
                prev_val = 0
            elif extractnumber_long_it(word) > 100 and \
                extractnumber_long_it(next_word) and \
                    next_word not in multiplies:
                to_sum.append(val)
                val = 0
                prev_val = 0

    if val is not None:
        for addend in to_sum:
            val = val + addend
    return val


def normalize_it(text, remove_articles):
    """
    IT string normalization.

    Collapses repeated whitespace, optionally drops the definite articles
    listed in ARTICLES_IT and rewrites number words as digits,
    e.g. 'quarantadue' -> '42'.  Contractions are not common in Italian and
    are left alone; indefinite articles are never removed.

    Args:
        text (str): the sentence to normalize
        remove_articles (bool): drop definite articles if True
    Returns:
        (str): the normalized sentence
    """
    # replace ambiguous words
    text = text.replace('un paio', 'due')

    normalized = []
    for word in text.split():  # split() also removes extra spaces
        # Italian requires the article to define the grammatical gender,
        # so articles are only dropped on request
        if remove_articles and word in ARTICLES_IT:
            continue

        if word in STRING_NUM_ITA:
            word = str(STRING_NUM_ITA[word])

        val = extractnumber_it(word)

        # Only whole values are rewritten as digits.  Anything else
        # ('2.5', '5/2', 'terzo', 'inf') keeps its spelling instead of
        # being truncated or making int() raise.
        if val and float(val).is_integer():
            word = str(int(val))

        normalized.append(word)

    return ' '.join(normalized)


def extract_datetime_it(string, dateNow, default_time):
    def clean_string(s):
        """
            Clean the input string of unneeded punctuation and
            capitalization, normalize Italian plurals and a few synonyms,
            drop noise words and return the remaining words as a list.

            Two consecutive numeric words that look like a time
            ('7 30') are merged into a single 'H:MM' word ('7:30').
        """
        symbols = ['.', ',', ';', '?', '!', 'º', 'ª', '°', 'l\'']

        for word in symbols:
            s = s.replace(word, '')

        s = s.lower().replace('á', 'a').replace('à', 'a').replace('è', "e'")\
            .replace('é', "e'").replace('ì', 'i').replace('ù', 'u')\
            .replace('ò', 'o').replace('-', ' ').replace('_', '')

        # normalize plurals and variants to simplify the analysis
        s = s.replace('secondi', 'secondo').replace('minuti', 'minuto')\
            .replace('ore', 'ora').replace('giorni', 'giorno')\
            .replace('settimane', 'settimana').replace('mesi', 'mese')\
            .replace('anni', 'anno').replace('mattino', 'mattina')\
            .replace('prossima', 'prossimo').replace('questa', 'questo')\
            .replace('quarti', 'quarto').replace('in punto', 'in_punto')\
            .replace('decennio', 'decenni').replace('secoli', 'secolo')\
            .replace('millennio', 'millenni').replace(' un ', ' uno ')\
            .replace('scorsa', 'scorso').replace('passata', 'passato')\
            .replace('uno paio', 'due')

        noise_words = ['dello', 'la', 'del', 'al', 'il', 'di', 'tra', 'lo',
                       'le', 'alle', 'alla', 'dai', 'delle', 'della',
                       'a', 'e\'', 'era', 'questa', 'questo', 'e', 'nel',
                       'nello', 'dallo', '  ']

        word_list = s.split()
        word_list = [x for x in word_list if x not in noise_words]
        # normalize some time formats: '7 30' -> '7:30'
        for idx in range(0, len(word_list) - 1):
            hours = word_list[idx]
            minutes = word_list[idx + 1]
            # Test the whole word, not just its first character: a word
            # blanked by a previous merge ('') or one like '15:30' must be
            # skipped instead of breaking the indexing or int().
            if hours.isdecimal() and minutes.isdecimal():
                num0 = int(hours)
                num1 = int(minutes)
                if 0 <= num0 <= 23 and 10 <= num1 <= 59:
                    word_list[idx] = str(num0) + ':' + str(num1)
                    word_list[idx + 1] = ''

        word_list = [x for x in word_list if x]

        return word_list

    def date_found():
        """
            Tell whether any date or time information has been collected.

            Reads the parsing state of the enclosing function (found,
            datestr, time_str and the various offsets / absolute values)
            and returns a truthy value as soon as one of them is set.
        """
        # NOTE(review): day_offset starts as False and the assignments
        # visible in this file give it integer values, so the identity
        # test 'day_offset is True' looks like it can never succeed -
        # confirm the intent.
        return found or \
            (datestr != '' or time_str != '' or year_offset != 0 or
             month_offset != 0 or day_offset is True or hr_offset != 0 or
             hr_abs or min_offset != 0 or min_abs or sec_offset != 0)

    if string == '' or not dateNow:
        return None

    found = False
    day_specified = False
    day_offset = False
    month_offset = 0
    year_offset = 0
    today = dateNow.strftime('%w')
    current_year = dateNow.strftime('%Y')
    from_flag = False
    datestr = ''
    has_year = False
    time_qualifier = ''
    time_qualifiers_am = ['mattina', 'stamani', 'stamane']
    time_qualifiers_pm = ['pomeriggio', 'sera', 'stasera', 'stanotte']
    time_qualifiers_list = set(time_qualifiers_am + time_qualifiers_pm)
    markers = ['alle', 'in', 'questo', 'per', 'di', 'tra', 'fra', 'entro']
    days = ['lunedi', 'martedi', 'mercoledi',
            'giovedi', 'venerdi', 'sabato', 'domenica']
    months = ['gennaio', 'febbraio', 'marzo', 'aprile', 'maggio', 'giugno',
              'luglio', 'agosto', 'settembre', 'ottobre', 'novembre',
              'dicembre']
    months_short = ['gen', 'feb', 'mar', 'apr', 'mag', 'giu', 'lug', 'ago',
                    'set', 'ott', 'nov', 'dic']
    year_multiples = ['decenni', 'secolo', 'millenni']  # decennio <- decenni
    time_multiples = ['ora', 'minuto', 'secondo']
    day_multiples = ['settimana', 'mese', 'anno']
    noise_words_2 = ['tra', 'di', 'per', 'fra', 'un ', 'uno', 'lo', 'del',
                     'l', 'in_punto', ' ', 'nella', 'dell']

    words = clean_string(string)

    for idx, word in enumerate(words):
        if word == '':
            continue
        word_prev_prev = words[idx - 2] if idx > 1 else ''
        word_prev = words[idx - 1] if idx > 0 else ''
        word_next = words[idx + 1] if idx + 1 < len(words) else ''
        word_next_next = words[idx + 2] if idx + 2 < len(words) else ''
        start = idx
        used = 0
        # save timequalifier for later
        if word == 'adesso' and not datestr:
            # word == 'ora' va in conflitto con 'tra un ora'
            words = [x for x in words if x != 'adesso']
            words = [x for x in words if x]
            result_str = ' '.join(words)
            extracted_date = dateNow.replace(microsecond=0)
            return [extracted_date, result_str]

        # un paio di  o  tra tre settimane --> secoli
        elif extractnumber_it(word) and (word_next in year_multiples or
                                         word_next in day_multiples):
            multiplier = int(extractnumber_it(word))
            used += 2
            if word_next == 'decenni':
                year_offset = multiplier * 10
            elif word_next == 'secolo':
                year_offset = multiplier * 100
            elif word_next == 'millenni':
                year_offset = multiplier * 1000
            elif word_next == 'anno':
                year_offset = multiplier
            elif word_next == 'mese':
                month_offset = multiplier
            elif word_next == 'settimana':
                day_offset = multiplier * 7
        elif word in time_qualifiers_list:
            time_qualifier = word
        # parse today, tomorrow, day after tomorrow
        elif word == 'oggi' and not from_flag:
            day_offset = 0
            used += 1
        elif word == 'domani' and not from_flag:
            day_offset = 1
            used += 1
        elif word == 'ieri' and not from_flag:
            day_offset -= 1
            used += 1
        elif word == 'dopodomani' and not from_flag:  # after tomorrow
            day_offset += 2
            used += 1
        elif word == 'dopo' and word_next == 'domani' and not from_flag:
            day_offset += 1
            used += 2
        elif word == 'giorno':
            if word_prev[0].isdigit():
                day_offset += int(word_prev)
                start -= 1
                used = 2
                if word_next == 'dopo' and word_next_next == 'domani':
                    day_offset += 1
                    used += 2
        elif word == 'settimana' and not from_flag:
            if word_prev == 'prossimo':
                day_offset = 7
                start -= 1
                used = 2
            elif word_prev == 'passato' or word_prev == 'scorso':
                day_offset = -7
                start -= 1
                used = 2
            elif word_next == 'prossimo':
                day_offset = 7
                used += 2
            elif word_next == 'passato' or word_next == 'scorso':
                day_offset = -7
                used += 2
        # parse next month, last month
        elif word == 'mese' and not from_flag:
            if word_prev == 'prossimo':
                month_offset = 1
                start -= 1
                used = 2
            elif word_prev == 'passato' or word_prev == 'scorso':
                month_offset = -1
                start -= 1
                used = 2
            elif word_next == 'prossimo':
                month_offset = 1
                used += 2
            elif word_next == 'passato' or word_next == 'scorso':
                month_offset = -1
                used += 2
        # parse next year, last year
        elif word == 'anno' and not from_flag:
            if word_prev == 'prossimo':  # prossimo anno
                year_offset = 1
                start -= 1
                used = 2
            elif word_next == 'prossimo':  # anno prossimo
                year_offset = 1
                used = 2
            elif word_prev == 'passato' or word_prev == 'scorso':
                year_offset = -1
                start -= 1
                used = 2
            elif word_next == 'passato' or word_next == 'scorso':
                year_offset = -1
                used = 2
        elif word == 'decenni' and not from_flag:
            if word_prev == 'prossimo':  # prossimo mese
                year_offset = 10
                start -= 1
                used = 2
            elif word_next == 'prossimo':  # mese prossimo
                year_offset = 10
                used = 2
            elif word_prev == 'passato' or word_prev == 'scorso':
                year_offset = -10
                start -= 1
                used = 2
            elif word_next == 'passato' or word_next == 'scorso':
                year_offset = -10
                used = 2
        # parse Monday, Tuesday, etc., and next Monday,
        # last Tuesday, etc.
        elif word in days and not from_flag:
            ddd = days.index(word)
            day_offset = (ddd + 1) - int(today)
            used = 1
            if day_offset < 0:
                day_offset += 7
            if word_prev == 'prossimo':
                day_offset += 7
                start -= 1
                used += 1
            elif word_prev == 'passato' or word_prev == 'scorso':
                day_offset -= 7
                start -= 1
                used += 1
            if word_next == 'prossimo':
                day_offset += 7
                used += 1
            elif word_next == 'passato' or word_next == 'scorso':
                day_offset -= 7
                used += 1
        # parse 15 of July, June 20th, Feb 18, 19 of February
        elif word in months or word in months_short and not from_flag:
            try:
                mmm = months.index(word)
            except ValueError:
                mmm = months_short.index(word)
            used += 1
            datestr = months[mmm]
            if word_prev and extractnumber_it(word_prev):
                datestr += ' ' + str(int(extractnumber_it(word_prev)))
                start -= 1
                used += 1
                if word_next and extractnumber_it(word_next):
                    datestr += ' ' + str(int(extractnumber_it(word_next)))
                    used += 1
                    has_year = True
                else:
                    has_year = False
            elif word_next and word_next[0].isdigit():
                datestr += ' ' + word_next
                used += 1
                if word_next_next and word_next_next[0].isdigit():
                    datestr += ' ' + word_next_next
                    used += 1
                    has_year = True
                else:
                    has_year = False
        # parse 5 days from tomorrow, 10 weeks from next thursday,
        # 2 months from July
        validFollowups = days + months + months_short
        validFollowups.append('oggi')
        validFollowups.append('domani')
        validFollowups.append('prossimo')
        validFollowups.append('passato')
        validFollowups.append('adesso')

        if (word == 'da' or word == 'dopo') and word_next in validFollowups:
            used = 0
            from_flag = True
            if word_next == 'domani':
                day_offset += 1
                used += 2
            elif word_next == 'oggi' or word_next == 'adesso':
                used += 2
            elif word_next in days:
                ddd = days.index(word_next)
                tmp_offset = (ddd + 1) - int(today)
                used += 2
                if tmp_offset < 0:
                    tmp_offset += 7
                if word_next_next == 'prossimo':
                    tmp_offset += 7
                    used += 1
                elif word_next_next == 'passato' or word_next_next == 'scorso':
                    tmp_offset = (ddd + 1) - int(today)
                    used += 1
                day_offset += tmp_offset
            elif word_next_next and word_next_next in days:
                ddd = days.index(word_next_next)
                tmp_offset = (ddd + 1) - int(today)
                if word_next == 'prossimo':
                    tmp_offset += 7
                # elif word_next == 'passato' or word_next == 'scorso':
                #    tmp_offset -= 7
                day_offset += tmp_offset
                used += 3

        if used > 0:
            if start - 1 > 0 and words[start - 1] == 'questo':
                start -= 1
                used += 1

            for i in range(0, used):
                words[i + start] = ''

            if start - 1 >= 0 and words[start - 1] in markers:
                words[start - 1] = ''
            found = True
            day_specified = True

    # parse time
    time_str = ''
    hr_offset = 0
    min_offset = 0
    sec_offset = 0
    hr_abs = None
    min_abs = None
    military = False

    for idx, word in enumerate(words):
        if word == '':
            continue
        word_prev_prev = words[idx - 2] if idx > 1 else ''
        word_prev = words[idx - 1] if idx > 0 else ''
        word_next = words[idx + 1] if idx + 1 < len(words) else ''
        word_next_next = words[idx + 2] if idx + 2 < len(words) else ''
        # parse noon, midnight, morning, afternoon, evening
        used = 0
        if word == 'mezzogiorno':
            hr_abs = 12
            used += 1
        elif word == 'mezzanotte':
            hr_abs = 24
            used += 1
        if word == 'mezzo' and word_next == 'giorno':
            hr_abs = 12
            used += 2
        elif word == 'mezza' and word_next == 'notte':
            hr_abs = 24
            used += 2
        elif word == 'mattina':
            if not hr_abs:
                hr_abs = 8
            used += 1
            if word_next and word_next[0].isdigit():  # mattina alle 5
                hr_abs = int(word_next)
                used += 1
        elif word == 'pomeriggio':
            if not hr_abs:
                hr_abs = 15
            used += 1
            if word_next and word_next[0].isdigit():  # pomeriggio alle 5
                hr_abs = int(word_next)
                used += 1
                if (hr_abs or 0) < 12:
                    hr_abs = (hr_abs or 0) + 12
        elif word == 'sera':
            if not hr_abs:
                hr_abs = 19
            used += 1
            if word_next and word_next[0].isdigit() \
               and ':' not in word_next:
                hr_abs = int(word_next)
                used += 1
                if (hr_abs or 0) < 12:
                    hr_abs = (hr_abs or 0) + 12
        # to be verified more thoroughly
        elif word == 'presto':
            hr_abs -= 1
            used += 1
        elif word == 'tardi':
            hr_abs += 1
            used += 1
        # e.g. 'un paio di minuti' (a couple of minutes),
        # 'tra cinque minuti' (in five minutes), 'tra 5 ore' (in 5 hours)
        elif extractnumber_it(word) and (word_next in time_multiples):
            d_time = int(extractnumber_it(word))
            used += 2
            if word_next == 'ora':
                hr_offset = d_time
                isTime = False
                hr_abs = -1
                min_abs = -1
            elif word_next == 'minuto':
                min_offset = d_time
                isTime = False
                hr_abs = -1
                min_abs = -1
            elif word_next == 'secondo':
                sec_offset = d_time
                isTime = False
                hr_abs = -1
                min_abs = -1
        elif word == 'mezzora':
            min_offset = 30
            used = 1
            isTime = False
            hr_abs = -1
            min_abs = -1
            # if word_prev == 'uno' or word_prev == 'una':
            #    start -= 1
            #    used += 1
        elif extractnumber_it(word) and word_next and \
                word_next == 'quarto' and word_next_next == 'ora':
            if int(extractnumber_it(word)) == 1 \
               or int(extractnumber_it(word)) == 3:
                min_offset = 15 * int(extractnumber_it(word))
            else:  # elimina eventuali errori
                min_offset = 15
            used = 3
            start -= 1
            isTime = False
            hr_abs = -1
            min_abs = -1
        elif word[0].isdigit():
            isTime = True
            str_hh = ''
            str_mm = ''
            remainder = ''
            if ':' in word:
                # parse colons
                # '3:00 in the morning'
                components = word.split(':')
                if len(components) == 2:
                    num0 = int(extractnumber_it(components[0]))
                    num1 = int(extractnumber_it(components[1]))
                    if num0 is not False and num1 is not False \
                            and 0 <= num0 <= 23 and 0 <= num1 <= 59:
                        str_hh = str(num0)
                        str_mm = str(num1)
            elif 0 < int(extractnumber_it(word)) < 24 \
                    and word_next != 'quarto':
                str_hh = str(int(word))
                str_mm = '00'
            elif 100 <= int(word) <= 2400:
                str_hh = int(word) / 100
                str_mm = int(word) - str_hh * 100
                military = True
                isTime = False
            if extractnumber_it(word) and word_next \
               and word_next == 'quarto' and word_next_next != 'ora':
                if int(extractnumber_it(word)) == 1 \
                   or int(extractnumber_it(word)) == 3:
                    str_mm = str(15 * int(extractnumber_it(word)))
                else:  # elimina eventuali errori
                    str_mm = '0'
                str_hh = str(hr_abs)
                used = 2
                words[idx + 1] = ''
                isTime = False
            if extractnumber_it(word) and word_next \
               and word_next == 'in_punto':
                str_hh = str(int(extractnumber_it(word)))
                used = 2
            if word_next == 'pm':
                remainder = 'pm'
                hr_abs = int(str_hh)
                min_abs = int(str_mm)
                if hr_abs <= 12:
                    hr_abs = hr_abs + 12
                used = 2
            elif word_next == 'am':
                remainder = 'am'
                hr_abs = int(str_hh)
                min_abs = int(str_mm)
                used = 2
            elif word_next == 'mattina':
                # ' 11 del mattina'
                hh = int(str_hh)
                mm = int(str_mm)
                used = 2
                remainder = 'am'
                isTime = False
                hr_abs = hh
                min_abs = mm
            elif word_next == 'pomeriggio':
                # ' 2 del pomeriggio'
                hh = int(str_hh)
                mm = int(str_mm)
                if hh < 12:
                    hh += 12
                used = 2
                remainder = 'pm'
                isTime = False
                hr_abs = hh
                min_abs = mm
            elif word_next == 'sera':
                # 'alle 8 di sera'
                hh = int(str_hh)
                mm = int(str_mm)
                if hh < 12:
                    hh += 12
                used = 2
                remainder = 'pm'
                isTime = False
                hr_abs = hh
                min_abs = mm
            elif word_next == 'notte':
                hh = int(str_hh)
                mm = int(str_mm)
                if hh > 5:
                    remainder = 'pm'
                else:
                    remainder = 'am'
                used = 2
                isTime = False
                hr_abs = hh
                min_abs = mm
            # parse half an hour : undici e mezza
            elif word_next and word_next == 'mezza':
                hr_abs = int(str_hh)
                min_abs = 30
                used = 2
                isTime = False
            elif word_next and word_next == 'in_punto':
                hr_abs = int(str_hh)
                min_abs = 0
                str_mm = '0'
                used = 2
                isTime = False
            else:
                # 17:30
                remainder = ''
                hr_abs = int(str_hh)
                min_abs = int(str_mm)
                used = 1
                isTime = False
                if word_prev == 'ora':
                    words[idx - 1] = ''

            if time_qualifier != '':
                # military = True
                if str_hh and int(str_hh) <= 12 and \
                   (time_qualifier in time_qualifiers_pm):
                    str_hh = str(int(str_hh) + 12)
            else:
                isTime = False

            str_hh = int(str_hh) if str_hh else 0
            str_mm = int(str_mm) if str_mm else 0

            str_hh = str_hh + 12 if remainder == 'pm' \
                and str_hh < 12 else str_hh
            str_hh = str_hh - 12 if remainder == 'am' \
                and str_hh >= 12 else str_hh

            if (not military and
                    remainder not in ['am', 'pm'] and
                    ((not day_specified) or day_offset < 1)):
                # ambiguous time, detect whether they mean this evening or
                # the next morning based on whether it has already passed
                hr_abs = str_hh
                if dateNow.hour < str_hh:
                    pass  # No modification needed
                elif dateNow.hour < str_hh + 12:
                    str_hh += 12
                    hr_abs = str_hh
                else:
                    # has passed, assume the next morning
                    day_offset += 1

            if time_qualifier in time_qualifiers_pm and str_hh < 12:
                str_hh += 12

            if str_hh > 24 or str_mm > 59:
                isTime = False
                used = 0
            if isTime:
                hr_abs = str_hh * 1
                min_abs = str_mm * 1
                used += 1

            if (hr_abs or 0) <= 12 and (time_qualifier == 'sera' or
                                        time_qualifier == 'pomeriggio'):
                hr_abs = (hr_abs or 0) + 12

        if used > 0:
            # removed parsed words from the sentence
            for i in range(used):
                words[idx + i] = ''

            if word_prev == 'o' or word_prev == 'oh':
                words[words.index(word_prev)] = ''

            if idx > 0 and word_prev in markers:
                words[idx - 1] = ''
            if idx > 1 and word_prev_prev in markers:
                words[idx - 2] = ''

            idx += used - 1
            found = True

    # check that we found a date
    if not date_found:
        return None

    if day_offset is False:
        day_offset = 0

    # perform date manipulation

    extracted_date = dateNow.replace(microsecond=0)

    if datestr != '':
        en_months = ['january', 'february', 'march', 'april', 'may', 'june',
                     'july', 'august', 'september', 'october', 'november',
                     'december']
        en_months_short = ['jan', 'feb', 'mar', 'apr', 'may', 'june', 'july',
                           'aug', 'sept', 'oct', 'nov', 'dec']

        for idx, en_month in enumerate(en_months):
            datestr = datestr.replace(months[idx], en_month)

        for idx, en_month in enumerate(en_months_short):
            datestr = datestr.replace(months_short[idx], en_month)

        try:
            temp = datetime.strptime(datestr, '%B %d')
        except ValueError:
            # Try again, allowing the year
            temp = datetime.strptime(datestr, '%B %d %Y')
        extracted_date = extracted_date.replace(hour=0, minute=0, second=0)
        if not has_year:
            temp = temp.replace(year=extracted_date.year,
                                tzinfo=extracted_date.tzinfo)
            if extracted_date < temp:
                extracted_date = extracted_date.replace(
                    year=int(current_year),
                    month=int(temp.strftime('%m')),
                    day=int(temp.strftime('%d')),
                    tzinfo=extracted_date.tzinfo)
            else:
                extracted_date = extracted_date.replace(
                    year=int(current_year) + 1,
                    month=int(temp.strftime('%m')),
                    day=int(temp.strftime('%d')),
                    tzinfo=extracted_date.tzinfo)
        else:
            extracted_date = extracted_date.replace(
                year=int(temp.strftime('%Y')),
                month=int(temp.strftime('%m')),
                day=int(temp.strftime('%d')),
                tzinfo=extracted_date.tzinfo)
    else:
        # ignore the current HH:MM:SS if relative using days or greater
        if hr_offset == 0 and min_offset == 0 and sec_offset == 0:
            extracted_date = extracted_date.replace(hour=0, minute=0, second=0)

    if year_offset != 0:
        extracted_date = extracted_date + relativedelta(years=year_offset)
    if month_offset != 0:
        extracted_date = extracted_date + relativedelta(months=month_offset)
    if day_offset != 0:
        extracted_date = extracted_date + relativedelta(days=day_offset)
    if hr_abs != -1 and min_abs != -1:
        # If no time was supplied in the string set the time to default
        # time if it's available
        if hr_abs is None and min_abs is None and default_time is not None:
            hr_abs, min_abs = default_time.hour, default_time.minute
        else:
            hr_abs = hr_abs or 0
            min_abs = min_abs or 0

        extracted_date = extracted_date + relativedelta(hours=hr_abs,
                                                        minutes=min_abs)
        if (hr_abs != 0 or min_abs != 0) and datestr == '':
            if not day_specified and dateNow > extracted_date:
                extracted_date = extracted_date + relativedelta(days=1)
    if hr_offset != 0:
        extracted_date = extracted_date + relativedelta(hours=hr_offset)
    if min_offset != 0:
        extracted_date = extracted_date + relativedelta(minutes=min_offset)
    if sec_offset != 0:
        extracted_date = extracted_date + relativedelta(seconds=sec_offset)

    words = [x for x in words if x not in noise_words_2]
    words = [x for x in words if x]
    result_str = ' '.join(words)

    return [extracted_date, result_str]


def get_gender_it(word, raw_string=""):
    """
    Guess the grammatical gender of an Italian word.

    In Italian the gender of a word is better determined from the word
    that precedes it (typically the article) than from its own last
    letter. When `word` occurs in `raw_string` at any position other
    than the first, the preceding token is classified by its final
    letter and that result is used. Otherwise, or when the preceding
    token gives no answer, the final letter of `word` itself is used.

    Args:
        word (str): the word whose gender is requested
        raw_string (str): optional sentence containing `word`; it is
            split on single spaces to find the preceding token
    Returns:
        str or None: 'f' for feminine, 'm' for masculine, None when the
            word is empty or its ending is not recognised

    TODO: check if useful
    """
    # An empty word has no last letter to inspect. This also happens in
    # the recursive call below when raw_string contains consecutive
    # spaces, because split(' ') then yields empty tokens.
    if not word:
        return None

    gender = None
    words = raw_string.split(' ')
    for idx, w in enumerate(words):
        if w == word and idx != 0:
            # Called without raw_string, so the preceding token is
            # classified purely by its final letter.
            gender = get_gender_it(words[idx - 1])
            break

    if not gender:
        last_letter = word[-1]
        if last_letter in ('a', 'e'):
            gender = 'f'
        elif last_letter in ('o', 'n', 'l', 'i'):
            gender = 'm'

    return gender


def extract_numbers_it(text, short_scale=False, ordinals=False):
    """
        Extract every number found in an Italian string.

    Thin wrapper that delegates to extract_numbers_generic, handing it
    the Italian pronounce_number_it and extractnumber_it handlers.

    Args:
        text (str): the string to extract numbers from
        short_scale (bool): use "short scale" rather than "long scale"
            for large numbers -- over a million.  Defaults to False
            in this function.
            See https://en.wikipedia.org/wiki/Names_of_large_numbers
        ordinals (bool): consider ordinal numbers, e.g. third=3 instead of 1/3
    Returns:
        list: the result of extract_numbers_generic (presumably a list
            of the extracted numbers as floats -- confirm against
            parse_common)
    """
    scale_options = {'short_scale': short_scale, 'ordinals': ordinals}
    return extract_numbers_generic(text, pronounce_number_it,
                                   extractnumber_it, **scale_options)