From 32666e9d686353210a11241700e2c3e895e9e57c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=C3=85ke=20Forslund?= Date: Fri, 13 Dec 2019 13:24:15 +0100 Subject: [PATCH] Remove content from lang specific files The files are kept for backwards compatibility but these just contains imports of lingua-franca versions of variables --- mycroft/util/lang/format_common.py | 3 +- mycroft/util/lang/format_en.py | 285 +---- mycroft/util/lang/format_es.py | 307 +----- mycroft/util/lang/format_fr.py | 289 +----- mycroft/util/lang/format_hu.py | 351 +------ mycroft/util/lang/format_it.py | 485 +-------- mycroft/util/lang/format_nl.py | 382 +------ mycroft/util/lang/format_pt.py | 209 +--- mycroft/util/lang/format_sv.py | 411 +------- mycroft/util/lang/parse_common.py | 89 +- mycroft/util/lang/parse_da.py | 919 +--------------- mycroft/util/lang/parse_de.py | 938 +---------------- mycroft/util/lang/parse_en.py | 1554 +--------------------------- mycroft/util/lang/parse_es.py | 1149 +------------------- mycroft/util/lang/parse_fr.py | 1070 +------------------ mycroft/util/lang/parse_it.py | 1312 +---------------------- mycroft/util/lang/parse_nl.py | 1467 +------------------------- mycroft/util/lang/parse_pt.py | 1127 +------------------- mycroft/util/lang/parse_sv.py | 765 +------------- 19 files changed, 78 insertions(+), 13034 deletions(-) diff --git a/mycroft/util/lang/format_common.py b/mycroft/util/lang/format_common.py index 5f600d979b..c6e0640150 100644 --- a/mycroft/util/lang/format_common.py +++ b/mycroft/util/lang/format_common.py @@ -18,5 +18,4 @@ TODO: Remove in 20.02 """ - -from lingua_franca.lang.format_common import convert_to_mixed_fraction +from lingua_franca.lang.format_common import * diff --git a/mycroft/util/lang/format_en.py b/mycroft/util/lang/format_en.py index 0a2dc66263..7926a19c71 100644 --- a/mycroft/util/lang/format_en.py +++ b/mycroft/util/lang/format_en.py @@ -14,285 +14,8 @@ # limitations under the License. # -from mycroft.util.lang.format_common import convert_to_mixed_fraction -from mycroft.util.log import LOG -from mycroft.util.lang.common_data_en import _NUM_STRING_EN, \ - _FRACTION_STRING_EN, _LONG_SCALE_EN, _SHORT_SCALE_EN +"""File kept for backwards compatibility - -def nice_number_en(number, speech, denominators=range(1, 21)): - """ English helper for nice_number - - This function formats a float to human understandable functions. Like - 4.5 becomes "4 and a half" for speech and "4 1/2" for text - - Args: - number (int or float): the float to format - speech (bool): format for speech (True) or display (False) - denominators (iter of ints): denominators to use, default [1 .. 20] - Returns: - (str): The formatted string. - """ - - result = convert_to_mixed_fraction(number, denominators) - if not result: - # Give up, just represent as a 3 decimal number - return str(round(number, 3)) - - whole, num, den = result - - if not speech: - if num == 0: - # TODO: Number grouping? E.g. "1,000,000" - return str(whole) - else: - return '{} {}/{}'.format(whole, num, den) - - if num == 0: - return str(whole) - den_str = _FRACTION_STRING_EN[den] - if whole == 0: - if num == 1: - return_string = 'a {}'.format(den_str) - else: - return_string = '{} {}'.format(num, den_str) - elif num == 1: - return_string = '{} and a {}'.format(whole, den_str) - else: - return_string = '{} and {} {}'.format(whole, num, den_str) - if num > 1: - return_string += 's' - return return_string - - -def pronounce_number_en(num, places=2, short_scale=True, scientific=False): - """ - Convert a number to its spoken equivalent - - For example, '5.2' would return 'five point two' - - Args: - num(float or int): the number to pronounce - places(int): maximum decimal places to speak - short_scale (bool) : use short (True) or long scale (False) - https://en.wikipedia.org/wiki/Names_of_large_numbers - scientific (bool): pronounce in scientific notation - Returns: - (str): The pronounced number - """ - if scientific: - number = '%E' % num - n, power = number.split("E") - power = int(power) - if power != 0: - # This handles negatives of powers separately from the normal - # handling since each call disables the scientific flag - return '{} times ten to the power of {}{}'.format( - pronounce_number_en(float(n), places, short_scale, True), - 'negative ' if power < 0 else '', - pronounce_number_en(abs(power), places, short_scale, False)) - - number_names = _NUM_STRING_EN - big_number_names = _SHORT_SCALE_EN if short_scale else _LONG_SCALE_EN - - # deal with negatives - result = "" - if num < 0: - result = "negative " if scientific else "minus " - num = abs(num) - - try: - # deal with 4 digits - # usually if it's a 4 digit num it should be said like a date - # i.e. 1972 => nineteen seventy two - if 10000 > num >= 1000 and isinstance(num, int): - # deal with 1000, 2000, 2001, 2100, 3123, etc - # is skipped as the rest of the - # functin deals with this already - if num % 1000 < 10 or num > 2000: - pass - # deal with 1900, 1300, etc - # i.e. 1900 => nineteen hundred - elif not num % 100: - first = number_names[num / 100] - last = big_number_names[100] - return first + " " + last - # deal with 1960, 1961, etc - # i.e. 1960 => nineteen sixty - # 1961 => nineteen sixty one - else: - first = number_names[num // 100] - last = number_names[num % 100 - num % 10] - if num % 10: - last += " " + number_names[num % 10] - return first + " " + last - # exception used to catch any unforseen edge cases - # will default back to normal subroutine - except Exception as e: - LOG.error('Exception in pronounce_number_en: {}' + repr(e)) - - # check for a direct match - if num in number_names: - result += number_names[num] - elif num in big_number_names: - result += "one " + big_number_names[num] - else: - hundreds = list(big_number_names.values()) - - def _sub_thousand(n): - assert 0 <= n <= 999 - if n <= 19: - return number_names[n] - elif n <= 99: - q, r = divmod(n, 10) - return number_names[q * 10] + ( - " " + _sub_thousand(r) if r else "") - else: - q, r = divmod(n, 100) - return number_names[q] + " hundred" + ( - " and " + _sub_thousand(r) if r else "") - - def _short_scale(n): - if n >= max(_SHORT_SCALE_EN): - return "infinity" - n = int(n) - assert 0 <= n - res = [] - for i, z in enumerate(_split_by(n, 1000)): - if not z: - continue - number = _sub_thousand(z) - if i: - number += " " - number += hundreds[i] - res.append(number) - - return ", ".join(reversed(res)) - - def _split_by(n, split=1000): - assert 0 <= n - res = [] - while n: - n, r = divmod(n, split) - res.append(r) - return res - - def _long_scale(n): - if n >= max(_LONG_SCALE_EN): - return "infinity" - n = int(n) - assert 0 <= n - res = [] - for i, z in enumerate(_split_by(n, 1000000)): - if not z: - continue - number = pronounce_number_en(z, places, True, scientific) - # strip off the comma after the thousand - if i: - # plus one as we skip 'thousand' - # (and 'hundred', but this is excluded by index value) - number = number.replace(',', '') - number += " " + hundreds[i+1] - res.append(number) - return ", ".join(reversed(res)) - - if short_scale: - result += _short_scale(num) - else: - result += _long_scale(num) - - # Deal with fractional part - if not num == int(num) and places > 0: - result += " point" - place = 10 - while int(num * place) % 10 > 0 and places > 0: - result += " " + number_names[int(num * place) % 10] - place *= 10 - places -= 1 - return result - - -def nice_time_en(dt, speech=True, use_24hour=False, use_ampm=False): - """ - Format a time to a comfortable human format - - For example, generate 'five thirty' for speech or '5:30' for - text display. - - Args: - dt (datetime): date to format (assumes already in local timezone) - speech (bool): format for speech (default/True) or display (False)=Fal - use_24hour (bool): output in 24-hour/military or 12-hour format - use_ampm (bool): include the am/pm for 12-hour format - Returns: - (str): The formatted time string - """ - if use_24hour: - # e.g. "03:01" or "14:22" - string = dt.strftime("%H:%M") - else: - if use_ampm: - # e.g. "3:01 AM" or "2:22 PM" - string = dt.strftime("%I:%M %p") - else: - # e.g. "3:01" or "2:22" - string = dt.strftime("%I:%M") - if string[0] == '0': - string = string[1:] # strip leading zeros - - if not speech: - return string - - # Generate a speakable version of the time - if use_24hour: - speak = "" - - # Either "0 8 hundred" or "13 hundred" - if string[0] == '0': - speak += pronounce_number_en(int(string[0])) + " " - speak += pronounce_number_en(int(string[1])) - else: - speak = pronounce_number_en(int(string[0:2])) - - speak += " " - if string[3:5] == '00': - speak += "hundred" - else: - if string[3] == '0': - speak += pronounce_number_en(0) + " " - speak += pronounce_number_en(int(string[4])) - else: - speak += pronounce_number_en(int(string[3:5])) - return speak - else: - hour = dt.hour % 12 or 12 # 12 hour clock and 0 is spoken as 12 - - if dt.hour == 0 and dt.minute == 0: - return "midnight" - if dt.hour == 12 and dt.minute == 0: - return "noon" - elif dt.minute == 15: - speak = "quarter past " + pronounce_number_en(hour) - elif dt.minute == 30: - speak = "half past " + pronounce_number_en(hour) - elif dt.minute == 45: - next_hour = (dt.hour + 1) % 12 or 12 - speak = "quarter to " + pronounce_number_en(next_hour) - else: - speak = pronounce_number_en(hour) - - if dt.minute == 0: - if not use_ampm: - return speak + " o'clock" - else: - if dt.minute < 10: - speak += " oh" - speak += " " + pronounce_number_en(dt.minute) - - if use_ampm: - if dt.hour > 11: - speak += " p.m." - else: - speak += " a.m." - - return speak +TODO: Remove in 20.02 +""" +from lingua_franca.lang.format_en import * diff --git a/mycroft/util/lang/format_es.py b/mycroft/util/lang/format_es.py index ba7b26040c..83f7a5d4c4 100644 --- a/mycroft/util/lang/format_es.py +++ b/mycroft/util/lang/format_es.py @@ -13,307 +13,8 @@ # See the License for the specific language governing permissions and # limitations under the License. # +"""File kept for backwards compatibility + +TODO: Remove in 20.02 """ -Format functions for castillian (es-es) - -""" -from mycroft.util.lang.format_common import convert_to_mixed_fraction - -NUM_STRING_ES = { - 0: 'cero', - 1: 'uno', - 2: 'dos', - 3: 'tres', - 4: 'cuatro', - 5: 'cinco', - 6: 'seis', - 7: 'siete', - 8: 'ocho', - 9: 'nueve', - 10: 'diez', - 11: 'once', - 12: 'doce', - 13: 'trece', - 14: 'catorce', - 15: 'quince', - 16: 'dieciséis', - 17: 'diecisete', - 18: 'dieciocho', - 19: 'diecinueve', - 20: 'veinte', - 30: 'treinta', - 40: 'cuarenta', - 50: 'cincuenta', - 60: 'sesenta', - 70: 'setenta', - 80: 'ochenta', - 90: 'noventa' -} - -FRACTION_STRING_ES = { - 2: 'medio', - 3: 'tercio', - 4: 'cuarto', - 5: 'quinto', - 6: 'sexto', - 7: 'séptimo', - 8: 'octavo', - 9: 'noveno', - 10: 'décimo', - 11: 'onceavo', - 12: 'doceavo', - 13: 'treceavo', - 14: 'catorceavo', - 15: 'quinceavo', - 16: 'dieciseisavo', - 17: 'diecisieteavo', - 18: 'dieciochoavo', - 19: 'diecinueveavo', - 20: 'veinteavo' -} - - -def nice_number_es(number, speech, denominators=range(1, 21)): - """ Spanish helper for nice_number - - This function formats a float to human understandable functions. Like - 4.5 becomes "4 y medio" for speech and "4 1/2" for text - - Args: - number (int or float): the float to format - speech (bool): format for speech (True) or display (False) - denominators (iter of ints): denominators to use, default [1 .. 20] - Returns: - (str): The formatted string. - """ - strNumber = "" - whole = 0 - num = 0 - den = 0 - - result = convert_to_mixed_fraction(number, denominators) - - if not result: - # Give up, just represent as a 3 decimal number - whole = round(number, 3) - else: - whole, num, den = result - - if not speech: - if num == 0: - strNumber = '{:,}'.format(whole) - strNumber = strNumber.replace(",", " ") - strNumber = strNumber.replace(".", ",") - return strNumber - else: - return '{} {}/{}'.format(whole, num, den) - else: - if num == 0: - # if the number is not a fraction, nothing to do - strNumber = str(whole) - strNumber = strNumber.replace(".", ",") - return strNumber - den_str = FRACTION_STRING_ES[den] - # if it is not an integer - if whole == 0: - # if there is no whole number - if num == 1: - # if numerator is 1, return "un medio", for example - strNumber = 'un {}'.format(den_str) - else: - # else return "cuatro tercios", for example - strNumber = '{} {}'.format(num, den_str) - elif num == 1: - # if there is a whole number and numerator is 1 - if den == 2: - # if denominator is 2, return "1 y medio", for example - strNumber = '{} y {}'.format(whole, den_str) - else: - # else return "1 y 1 tercio", for example - strNumber = '{} y 1 {}'.format(whole, den_str) - else: - # else return "2 y 3 cuarto", for example - strNumber = '{} y {} {}'.format(whole, num, den_str) - if num > 1 and den != 3: - # if the numerator is greater than 1 and the denominator - # is not 3 ("tercio"), add an s for plural - strNumber += 's' - - return strNumber - - -def pronounce_number_es(num, places=2): - """ - Convert a number to it's spoken equivalent - - For example, '5.2' would return 'cinco coma dos' - - Args: - num(float or int): the number to pronounce (under 100) - places(int): maximum decimal places to speak - Returns: - (str): The pronounced number - """ - if abs(num) >= 100: - # TODO: Soporta a números por encima de 100 - return str(num) - - result = "" - if num < 0: - result = "menos " - num = abs(num) - - # del 21 al 29 tienen una pronunciación especial - if 20 <= num <= 29: - tens = int(num-int(num) % 10) - ones = int(num - tens) - result += NUM_STRING_ES[tens] - if ones > 0: - result = result[:-1] - # a veinte le quitamos la "e" final para construir los - # números del 21 - 29. Pero primero tenemos en cuenta - # las excepciones: 22, 23 y 26, que llevan tilde. - if ones == 2: - result += "idós" - elif ones == 3: - result += "itrés" - elif ones == 6: - result += "iséis" - else: - result += "i" + NUM_STRING_ES[ones] - elif num >= 30: # de 30 en adelante - tens = int(num-int(num) % 10) - ones = int(num - tens) - result += NUM_STRING_ES[tens] - if ones > 0: - result += " y " + NUM_STRING_ES[ones] - else: - result += NUM_STRING_ES[int(num)] - - # Deal with decimal part, in spanish is commonly used the comma - # instead the dot. Decimal part can be written both with comma - # and dot, but when pronounced, its pronounced "coma" - if not num == int(num) and places > 0: - result += " coma" - place = 10 - while int(num*place) % 10 > 0 and places > 0: - result += " " + NUM_STRING_ES[int(num*place) % 10] - place *= 10 - places -= 1 - return result - - -def nice_time_es(dt, speech=True, use_24hour=False, use_ampm=False): - """ - Format a time to a comfortable human format - - For example, generate 'cinco treinta' for speech or '5:30' for - text display. - - Args: - dt (datetime): date to format (assumes already in local timezone) - speech (bool): format for speech (default/True) or display (False)=Fal - use_24hour (bool): output in 24-hour/military or 12-hour format - use_ampm (bool): include the am/pm for 12-hour format - Returns: - (str): The formatted time string - """ - if use_24hour: - # e.g. "03:01" or "14:22" - string = dt.strftime("%H:%M") - else: - if use_ampm: - # e.g. "3:01 AM" or "2:22 PM" - string = dt.strftime("%I:%M %p") - else: - # e.g. "3:01" or "2:22" - string = dt.strftime("%I:%M") - if string[0] == '0': - string = string[1:] # strip leading zeros - - if not speech: - return string - - # Generate a speakable version of the time - speak = "" - if use_24hour: - # Tenemos que tener en cuenta que cuando hablamos en formato - # 24h, no hay que especificar ninguna precisión adicional - # como "la noche", "la tarde" o "la mañana" - # http://lema.rae.es/dpd/srv/search?id=YNoTWNJnAD6bhhVBf9 - if dt.hour == 1: - speak += "la una" - else: - speak += "las " + pronounce_number_es(dt.hour) - - # las 14:04 son "las catorce cero cuatro" - if dt.minute < 10: - speak += " cero " + pronounce_number_es(dt.minute) - else: - speak += " " + pronounce_number_es(dt.minute) - - else: - # Prepare for "tres menos cuarto" ?? - if dt.minute == 35: - minute = -25 - hour = dt.hour + 1 - elif dt.minute == 40: - minute = -20 - hour = dt.hour + 1 - elif dt.minute == 45: - minute = -15 - hour = dt.hour + 1 - elif dt.minute == 50: - minute = -10 - hour = dt.hour + 1 - elif dt.minute == 55: - minute = -5 - hour = dt.hour + 1 - else: - minute = dt.minute - hour = dt.hour - - if hour == 0 or hour == 12: - speak += "las doce" - elif hour == 1 or hour == 13: - speak += "la una" - elif hour < 13: - speak = "las " + pronounce_number_es(hour) - else: - speak = "las " + pronounce_number_es(hour-12) - - if minute != 0: - # las horas especiales - if minute == 15: - speak += " y cuarto" - elif minute == 30: - speak += " y media" - elif minute == -15: - speak += " menos cuarto" - else: # seis y nueve. siete y veinticinco - if minute > 0: - speak += " y " + pronounce_number_es(minute) - else: # si son las siete menos veinte, no ponemos la "y" - speak += " " + pronounce_number_es(minute) - - # si no especificamos de la tarde, noche, mañana, etc - if minute == 0 and not use_ampm: - # 3:00 - speak += " en punto" - - if use_ampm: - # "de la noche" es desde que anochece hasta medianoche - # así que decir que es desde las 21h es algo subjetivo - # en España a las 20h se dice "de la tarde" - # en castellano, las 12h es de la mañana o mediodía - # así que diremos "de la tarde" a partir de las 13h. - # http://lema.rae.es/dpd/srv/search?id=YNoTWNJnAD6bhhVBf9 - if hour >= 0 and hour < 6: - speak += " de la madrugada" - elif hour >= 6 and hour < 13: - speak += " de la mañana" - elif hour >= 13 and hour < 21: - speak += " de la tarde" - else: - speak += " de la noche" - return speak +from lingua_franca.lang.format_es import * diff --git a/mycroft/util/lang/format_fr.py b/mycroft/util/lang/format_fr.py index 862b9d2ecc..0345bb536d 100644 --- a/mycroft/util/lang/format_fr.py +++ b/mycroft/util/lang/format_fr.py @@ -13,290 +13,9 @@ # See the License for the specific language governing permissions and # limitations under the License. # -""" Format functions for french (fr) +"""File kept for backwards compatibility. + +TODO: Remove in 20.02 """ - -from mycroft.util.lang.format_common import convert_to_mixed_fraction - -NUM_STRING_FR = { - 0: 'zéro', - 1: 'un', - 2: 'deux', - 3: 'trois', - 4: 'quatre', - 5: 'cinq', - 6: 'six', - 7: 'sept', - 8: 'huit', - 9: 'neuf', - 10: 'dix', - 11: 'onze', - 12: 'douze', - 13: 'treize', - 14: 'quatorze', - 15: 'quinze', - 16: 'seize', - 20: 'vingt', - 30: 'trente', - 40: 'quarante', - 50: 'cinquante', - 60: 'soixante', - 70: 'soixante-dix', - 80: 'quatre-vingt', - 90: 'quatre-vingt-dix' -} - -FRACTION_STRING_FR = { - 2: 'demi', - 3: 'tiers', - 4: 'quart', - 5: 'cinquième', - 6: 'sixième', - 7: 'septième', - 8: 'huitième', - 9: 'neuvième', - 10: 'dixième', - 11: 'onzième', - 12: 'douzième', - 13: 'treizième', - 14: 'quatorzième', - 15: 'quinzième', - 16: 'seizième', - 17: 'dix-septième', - 18: 'dix-huitième', - 19: 'dix-neuvième', - 20: 'vingtième' -} - - -def nice_number_fr(number, speech, denominators=range(1, 21)): - """ French helper for nice_number - - This function formats a float to human understandable functions. Like - 4.5 becomes "4 et demi" for speech and "4 1/2" for text - - Args: - number (int or float): the float to format - speech (bool): format for speech (True) or display (False) - denominators (iter of ints): denominators to use, default [1 .. 20] - Returns: - (str): The formatted string. - """ - strNumber = "" - whole = 0 - num = 0 - den = 0 - - result = convert_to_mixed_fraction(number, denominators) - - if not result: - # Give up, just represent as a 3 decimal number - whole = round(number, 3) - else: - whole, num, den = result - - if not speech: - if num == 0: - strNumber = '{:,}'.format(whole) - strNumber = strNumber.replace(",", " ") - strNumber = strNumber.replace(".", ",") - return strNumber - else: - return '{} {}/{}'.format(whole, num, den) - else: - if num == 0: - # if the number is not a fraction, nothing to do - strNumber = str(whole) - strNumber = strNumber.replace(".", ",") - return strNumber - den_str = FRACTION_STRING_FR[den] - # if it is not an integer - if whole == 0: - # if there is no whole number - if num == 1: - # if numerator is 1, return "un demi", for example - strNumber = 'un {}'.format(den_str) - else: - # else return "quatre tiers", for example - strNumber = '{} {}'.format(num, den_str) - elif num == 1: - # if there is a whole number and numerator is 1 - if den == 2: - # if denominator is 2, return "1 et demi", for example - strNumber = '{} et {}'.format(whole, den_str) - else: - # else return "1 et 1 tiers", for example - strNumber = '{} et 1 {}'.format(whole, den_str) - else: - # else return "2 et 3 quart", for example - strNumber = '{} et {} {}'.format(whole, num, den_str) - if num > 1 and den != 3: - # if the numerator is greater than 1 and the denominator - # is not 3 ("tiers"), add an s for plural - strNumber += 's' - - return strNumber - - -def pronounce_number_fr(num, places=2): - """ - Convert a number to it's spoken equivalent - - For example, '5.2' would return 'cinq virgule deux' - - Args: - num(float or int): the number to pronounce (under 100) - places(int): maximum decimal places to speak - Returns: - (str): The pronounced number - """ - if abs(num) >= 100: - # TODO: Support for numbers over 100 - return str(num) - - result = "" - if num < 0: - result = "moins " - num = abs(num) - - if num > 16: - tens = int(num-int(num) % 10) - ones = int(num-tens) - if ones != 0: - if tens > 10 and tens <= 60 and int(num-tens) == 1: - result += NUM_STRING_FR[tens] + "-et-" + NUM_STRING_FR[ones] - elif num == 71: - result += "soixante-et-onze" - elif tens == 70: - result += NUM_STRING_FR[60] + "-" - if ones < 7: - result += NUM_STRING_FR[10 + ones] - else: - result += NUM_STRING_FR[10] + "-" + NUM_STRING_FR[ones] - elif tens == 90: - result += NUM_STRING_FR[80] + "-" - if ones < 7: - result += NUM_STRING_FR[10 + ones] - else: - result += NUM_STRING_FR[10] + "-" + NUM_STRING_FR[ones] - else: - result += NUM_STRING_FR[tens] + "-" + NUM_STRING_FR[ones] - else: - if num == 80: - result += "quatre-vingts" - else: - result += NUM_STRING_FR[tens] - else: - result += NUM_STRING_FR[int(num)] - - # Deal with decimal part - if not num == int(num) and places > 0: - result += " virgule" - place = 10 - while int(num*place) % 10 > 0 and places > 0: - result += " " + NUM_STRING_FR[int(num*place) % 10] - place *= 10 - places -= 1 - return result - - -def nice_time_fr(dt, speech=True, use_24hour=False, use_ampm=False): - """ - Format a time to a comfortable human format - - For example, generate 'cinq heures trente' for speech or '5:30' for - text display. - - Args: - dt (datetime): date to format (assumes already in local timezone) - speech (bool): format for speech (default/True) or display (False)=Fal - use_24hour (bool): output in 24-hour/military or 12-hour format - use_ampm (bool): include the am/pm for 12-hour format - Returns: - (str): The formatted time string - """ - if use_24hour: - # e.g. "03:01" or "14:22" - string = dt.strftime("%H:%M") - else: - if use_ampm: - # e.g. "3:01 AM" or "2:22 PM" - string = dt.strftime("%I:%M %p") - else: - # e.g. "3:01" or "2:22" - string = dt.strftime("%I:%M") - if string[0] == '0': - string = string[1:] # strip leading zeros - - if not speech: - return string - - # Generate a speakable version of the time - speak = "" - if use_24hour: - - # "13 heures trente" - if dt.hour == 0: - speak += "minuit" - elif dt.hour == 12: - speak += "midi" - elif dt.hour == 1: - speak += "une heure" - else: - speak += pronounce_number_fr(dt.hour) + " heures" - - if dt.minute != 0: - speak += " " + pronounce_number_fr(dt.minute) - - else: - # Prepare for "trois heures moins le quart" - if dt.minute == 35: - minute = -25 - hour = dt.hour + 1 - elif dt.minute == 40: - minute = -20 - hour = dt.hour + 1 - elif dt.minute == 45: - minute = -15 - hour = dt.hour + 1 - elif dt.minute == 50: - minute = -10 - hour = dt.hour + 1 - elif dt.minute == 55: - minute = -5 - hour = dt.hour + 1 - else: - minute = dt.minute - hour = dt.hour - - if hour == 0: - speak += "minuit" - elif hour == 12: - speak += "midi" - elif hour == 1 or hour == 13: - speak += "une heure" - elif hour < 13: - speak = pronounce_number_fr(hour) + " heures" - else: - speak = pronounce_number_fr(hour-12) + " heures" - - if minute != 0: - if minute == 15: - speak += " et quart" - elif minute == 30: - speak += " et demi" - elif minute == -15: - speak += " moins le quart" - else: - speak += " " + pronounce_number_fr(minute) - - if use_ampm: - if hour > 17: - speak += " du soir" - elif hour > 12: - speak += " de l'après-midi" - elif hour > 0 and hour < 12: - speak += " du matin" - - return speak +from lingua_franca.lang.format_fr import * diff --git a/mycroft/util/lang/format_hu.py b/mycroft/util/lang/format_hu.py index 6ec62df837..25c1143b56 100644 --- a/mycroft/util/lang/format_hu.py +++ b/mycroft/util/lang/format_hu.py @@ -14,351 +14,8 @@ # limitations under the License. # -from mycroft.util.lang.format_common import convert_to_mixed_fraction -from math import floor +"""File kept for backwards compatibility. -months = ['január', 'február', 'március', 'április', 'május', 'június', - 'július', 'augusztus', 'szeptember', 'október', 'november', - 'december'] - -NUM_STRING_HU = { - 0: 'nulla', - 1: 'egy', - 2: 'kettő', - 3: 'három', - 4: 'négy', - 5: 'öt', - 6: 'hat', - 7: 'hét', - 8: 'nyolc', - 9: 'kilenc', - 10: 'tíz', - 11: 'tizenegy', - 12: 'tizenkettő', - 13: 'tizenhárom', - 14: 'tizennégy', - 15: 'tizenöt', - 16: 'tizenhat', - 17: 'tizenhét', - 18: 'tizennyolc', - 19: 'tizenkilenc', - 20: 'húsz', - 30: 'harminc', - 40: 'negyven', - 50: 'ötven', - 60: 'hatvan', - 70: 'hetven', - 80: 'nyolcvan', - 90: 'kilencven', - 100: 'száz' -} - -# Hungarian uses "long scale" -# https://en.wikipedia.org/wiki/Long_and_short_scales -# Currently, numbers are limited to 1000000000000000000000000, -# but NUM_POWERS_OF_TEN can be extended to include additional number words - -NUM_POWERS_OF_TEN = [ - '', 'ezer', 'millió', 'milliárd', 'billió', 'billiárd', 'trillió', - 'trilliárd' -] - -FRACTION_STRING_HU = { - 2: 'fél', - 3: 'harmad', - 4: 'negyed', - 5: 'ötöd', - 6: 'hatod', - 7: 'heted', - 8: 'nyolcad', - 9: 'kilenced', - 10: 'tized', - 11: 'tizenegyed', - 12: 'tizenketted', - 13: 'tizenharmad', - 14: 'tizennegyed', - 15: 'tizenötöd', - 16: 'tizenhatod', - 17: 'tizenheted', - 18: 'tizennyolcad', - 19: 'tizenkilenced', - 20: 'huszad' -} - -# Numbers below 2 thousand are written in one word in Hungarian -# Numbers above 2 thousand are separated by hyphens -# In some circumstances it may better to seperate individual words -# Set EXTRA_SPACE=" " for separating numbers below 2 thousand ( -# orthographically incorrect) -# Set EXTRA_SPACE="" for correct spelling, this is standard - -# EXTRA_SPACE = " " -EXTRA_SPACE = "" - - -def _get_vocal_type(word): - # checks the vocal attributes of a word - vowels_high = len([char for char in word if char in 'eéiíöőüű']) - vowels_low = len([char for char in word if char in 'aáoóuú']) - if vowels_high != 0 and vowels_low != 0: - return 2 # 2: type is mixed - return 0 if vowels_high == 0 else 1 # 0: type is low, 1: is high - - -def nice_number_hu(number, speech, denominators=range(1, 21)): - """ Hungarian helper for nice_number - - This function formats a float to human understandable functions. Like - 4.5 becomes "4 és fél" for speech and "4 1/2" for text - - Args: - number (int or float): the float to format - speech (bool): format for speech (True) or display (False) - denominators (iter of ints): denominators to use, default [1 .. 20] - Returns: - (str): The formatted string. - """ - - result = convert_to_mixed_fraction(number, denominators) - if not result: - # Give up, just represent as a 3 decimal number - return str(round(number, 3)).replace(".", ",") - - whole, num, den = result - - if not speech: - if num == 0: - # TODO: Number grouping? E.g. "1,000,000" - return str(whole) - else: - return '{} {}/{}'.format(whole, num, den) - - if num == 0: - return str(whole) - den_str = FRACTION_STRING_HU[den] - if whole == 0: - if num == 1: - one = 'egy ' if den != 2 else '' - return_string = '{}{}'.format(one, den_str) - else: - return_string = '{} {}'.format(num, den_str) - elif num == 1: - pointOne = 'egész egy' if den != 2 else 'és' - return_string = '{} {} {}'.format(whole, pointOne, den_str) - else: - return_string = '{} egész {} {}'.format(whole, num, den_str) - return return_string - - -def pronounce_number_hu(num, places=2): - """ - Convert a number to its spoken equivalent - - For example, '5.2' would return 'öt egész két tized' - - Args: - num(float or int): the number to pronounce (set limit below) - places(int): maximum decimal places to speak - Returns: - (str): The pronounced number - """ - - def pronounce_triplet_hu(num): - result = "" - num = floor(num) - if num > 99: - hundreds = floor(num / 100) - if hundreds > 0: - hundredConst = EXTRA_SPACE + 'száz' + EXTRA_SPACE - if hundreds == 1: - result += hundredConst - elif hundreds == 2: - result += 'két' + hundredConst - else: - result += NUM_STRING_HU[hundreds] + hundredConst - num -= hundreds * 100 - if num == 0: - result += '' # do nothing - elif num <= 20: - result += NUM_STRING_HU[num] # + EXTRA_SPACE - elif num > 20: - ones = num % 10 - tens = num - ones - if tens > 0: - if tens != 20: - result += NUM_STRING_HU[tens] + EXTRA_SPACE - else: - result += "huszon" + EXTRA_SPACE - if ones > 0: - result += NUM_STRING_HU[ones] + EXTRA_SPACE - return result - - def pronounce_whole_number_hu(num, scale_level=0): - if num == 0: - return '' - - num = floor(num) - result = '' - last_triplet = num % 1000 - - if last_triplet == 1: - if scale_level == 0: - if result != '': - result += '' + "egy" - else: - result += "egy" - elif scale_level == 1: - result += EXTRA_SPACE + NUM_POWERS_OF_TEN[1] + EXTRA_SPACE - else: - result += "egy" + NUM_POWERS_OF_TEN[scale_level] - elif last_triplet > 1: - result += pronounce_triplet_hu(last_triplet) - if scale_level != 0: - result = result.replace(NUM_STRING_HU[2], 'két') - if scale_level == 1: - result += NUM_POWERS_OF_TEN[1] + EXTRA_SPACE - if scale_level >= 2: - result += NUM_POWERS_OF_TEN[scale_level] - if scale_level > 0: - result += '-' - - num = floor(num / 1000) - scale_level += 1 - return pronounce_whole_number_hu(num, - scale_level) + result - - result = "" - if abs(num) >= 1000000000000000000000000: # cannot do more than this - return str(num) - elif num == 0: - return str(NUM_STRING_HU[0]) - elif num < 0: - return "mínusz " + pronounce_number_hu(abs(num), places) - else: - if num == int(num): - return pronounce_whole_number_hu(num).strip('-') - else: - whole_number_part = floor(num) - fractional_part = num - whole_number_part - if whole_number_part == 0: - result += NUM_STRING_HU[0] - result += pronounce_whole_number_hu(whole_number_part) - if places > 0: - result += " egész " - fraction = pronounce_whole_number_hu( - round(fractional_part * 10 ** places)) - result += fraction.replace(NUM_STRING_HU[2], 'két') - fraction_suffixes = [ - 'tized', 'század', 'ezred', 'tízezred', 'százezred'] - if places <= len(fraction_suffixes): - result += ' ' + fraction_suffixes[places - 1] - return result - - -def pronounce_ordinal_hu(num): - ordinals = ["nulladik", "első", "második", "harmadik", "negyedik", - "ötödik", "hatodik", "hetedik", "nyolcadik", "kilencedik", - "tizedik"] - big_ordinals = ["", "ezredik", "milliomodik"] - - # only for whole positive numbers including zero - if num < 0 or num != int(num): - return num - elif num < 11: - return ordinals[num] - else: - # concatenate parts and inflect them accordingly - root = pronounce_number_hu(num) - vtype = _get_vocal_type(root) - last_digit = num - floor(num/10) * 10 - if root == "húsz": - root = "husz" - if num % 1000000 == 0: - return root.replace(NUM_POWERS_OF_TEN[2], big_ordinals[2]) - if num % 1000 == 0: - return root.replace(NUM_POWERS_OF_TEN[1], big_ordinals[1]) - if last_digit == 1: - return root + "edik" - elif root[-1] == 'ő': - return root[:-1] + 'edik' - elif last_digit != 0: - return ordinals[last_digit].join( - root.rsplit(NUM_STRING_HU[last_digit], 1)) - return root + "edik" if vtype == 1 else root + "adik" - - -def nice_time_hu(dt, speech=True, use_24hour=False, use_ampm=False): - """ - Format a time to a comfortable human format - - For example, generate 'five thirty' for speech or '5:30' for - text display. - - Args: - dt (datetime): date to format (assumes already in local timezone) - speech (bool): format for speech (default/True) or display (False)=Fal - use_24hour (bool): output in 24-hour/military or 12-hour format - use_ampm (bool): include the am/pm for 12-hour format - Returns: - (str): The formatted time string - """ - if use_24hour: - # e.g. "03:01" or "14:22" - string = dt.strftime("%H:%M") - else: - if use_ampm: - # e.g. "3:01 AM" or "2:22 PM" - string = dt.strftime("%I:%M %p") - else: - # e.g. "3:01" or "2:22" - string = dt.strftime("%I:%M") - if string[0] == '0': - string = string[1:] # strip leading zeros - - if not speech: - return string - - # Generate a speakable version of the time - speak = "" - if use_24hour: - speak += pronounce_number_hu(dt.hour) - speak = speak.replace(NUM_STRING_HU[2], 'két') - speak += " óra" - if not dt.minute == 0: # zero minutes are not pronounced - speak += " " + pronounce_number_hu(dt.minute) - - return speak # ampm is ignored when use_24hour is true - else: - if dt.hour == 0 and dt.minute == 0: - return "éjfél" - if dt.hour == 12 and dt.minute == 0: - return "dél" - # TODO: "half past 3", "a quarter of 4" and other idiomatic times - - if dt.hour == 0: - speak += pronounce_number_hu(12) - elif dt.hour < 13: - speak = pronounce_number_hu(dt.hour) - else: - speak = pronounce_number_hu(dt.hour - 12) - - speak = speak.replace(NUM_STRING_HU[2], 'két') - speak += " óra" - - if not dt.minute == 0: - speak += " " + pronounce_number_hu(dt.minute) - - if use_ampm: - if dt.hour > 11: - if dt.hour < 18: - speak = "délután " + speak # 12:01 - 17:59 - elif dt.hour < 22: - speak = "este " + speak # 18:00 - 21:59 este/evening - else: - speak = "éjjel " + speak # 22:00 - 23:59 éjjel/at night - elif dt.hour < 3: - speak = "éjjel " + speak # 00:01 - 02:59 éjjel/at night - else: - speak = "reggel " + speak # 03:00 - 11:59 reggel/in t. morning - - return speak +TODO: Remove in 20.02 +""" +from lingua_franca.lang.format_hu import * diff --git a/mycroft/util/lang/format_it.py b/mycroft/util/lang/format_it.py index 8bed788fa9..0cb3900205 100644 --- a/mycroft/util/lang/format_it.py +++ b/mycroft/util/lang/format_it.py @@ -14,485 +14,8 @@ # limitations under the License. # -from mycroft.util.lang.format_common import convert_to_mixed_fraction -import collections +"""File kept for backwards compatibility. -NUM_STRING_IT = { - 0: 'zero', - 1: 'uno', - 2: 'due', - 3: 'tre', - 4: 'quattro', - 5: 'cinque', - 6: 'sei', - 7: 'sette', - 8: 'otto', - 9: 'nove', - 10: 'dieci', - 11: 'undici', - 12: 'dodici', - 13: 'tredici', - 14: 'quattordici', - 15: 'quindici', - 16: 'sedici', - 17: 'diciassette', - 18: 'diciotto', - 19: 'diciannove', - 20: 'venti', - 30: 'trenta', - 40: 'quaranta', - 50: 'cinquanta', - 60: 'sessanta', - 70: 'settanta', - 80: 'ottanta', - 90: 'novanta' -} - -FRACTION_STRING_IT = { - 2: 'mezz', - 3: 'terz', - 4: 'quart', - 5: 'quint', - 6: 'sest', - 7: 'settim', - 8: 'ottav', - 9: 'non', - 10: 'decim', - 11: 'undicesim', - 12: 'dodicesim', - 13: 'tredicesim', - 14: 'quattordicesim', - 15: 'quindicesim', - 16: 'sedicesim', - 17: 'diciassettesim', - 18: 'diciottesim', - 19: 'diciannovesim', - 20: 'ventesim' -} - -# fonte: http://tulengua.es/numeros-texto/default.aspx -LONG_SCALE_IT = collections.OrderedDict([ - (100, 'cento'), - (1000, 'mila'), - (1000000, 'milioni'), - (1e9, "miliardi"), - (1e12, "bilioni"), - (1e18, 'trilioni'), - (1e24, "quadrilioni"), - (1e30, "quintilioni"), - (1e36, "sestilioni"), - (1e42, "settilioni"), - (1e48, "ottillioni"), - (1e54, "nonillioni"), - (1e60, "decemillioni"), - (1e66, "undicilione"), - (1e72, "dodicilione"), - (1e78, "tredicilione"), - (1e84, "quattordicilione"), - (1e90, "quindicilione"), - (1e96, "sedicilione"), - (1e102, "diciasettilione"), - (1e108, "diciottilione"), - (1e114, "dicianovilione"), - (1e120, "vintilione"), - (1e306, "unquinquagintilione"), - (1e312, "duoquinquagintilione"), - (1e336, "sesquinquagintilione"), - (1e366, "unsexagintilione") -]) - - -SHORT_SCALE_IT = collections.OrderedDict([ - (100, 'cento'), - (1000, 'mila'), - (1000000, 'milioni'), - (1e9, "miliardi"), - (1e12, 'bilioni'), - (1e15, "biliardi"), - (1e18, "trilioni"), - (1e21, "triliardi"), - (1e24, "quadrilioni"), - (1e27, "quadriliardi"), - (1e30, "quintilioni"), - (1e33, "quintiliardi"), - (1e36, "sestilioni"), - (1e39, "sestiliardi"), - (1e42, "settilioni"), - (1e45, "settiliardi"), - (1e48, "ottilioni"), - (1e51, "ottiliardi"), - (1e54, "nonilioni"), - (1e57, "noniliardi"), - (1e60, "decilioni"), - (1e63, "deciliardi"), - (1e66, "undicilioni"), - (1e69, "undiciliardi"), - (1e72, "dodicilioni"), - (1e75, "dodiciliardi"), - (1e78, "tredicilioni"), - (1e81, "trediciliardi"), - (1e84, "quattordicilioni"), - (1e87, "quattordiciliardi"), - (1e90, "quindicilioni"), - (1e93, "quindiciliardi"), - (1e96, "sedicilioni"), - (1e99, "sediciliardi"), - (1e102, "diciassettilioni"), - (1e105, "diciassettiliardi"), - (1e108, "diciottilioni"), - (1e111, "diciottiliardi"), - (1e114, "dicianovilioni"), - (1e117, "dicianoviliardi"), - (1e120, "vintilioni"), - (1e123, "vintiliardi"), - (1e153, "quinquagintillion"), - (1e183, "sexagintillion"), - (1e213, "septuagintillion"), - (1e243, "ottogintilioni"), - (1e273, "nonigintillioni"), - (1e303, "centilioni"), - (1e306, "uncentilioni"), - (1e309, "duocentilioni"), - (1e312, "trecentilioni"), - (1e333, "decicentilioni"), - (1e336, "undicicentilioni"), - (1e363, "viginticentilioni"), - (1e366, "unviginticentilioni"), - (1e393, "trigintacentilioni"), - (1e423, "quadragintacentillion"), - (1e453, "quinquagintacentillion"), - (1e483, "sexagintacentillion"), - (1e513, "septuagintacentillion"), - (1e543, "ctogintacentillion"), - (1e573, "nonagintacentillion"), - (1e603, "ducentillion"), - (1e903, "trecentillion"), - (1e1203, "quadringentillion"), - (1e1503, "quingentillion"), - (1e1803, "sescentillion"), - (1e2103, "septingentillion"), - (1e2403, "octingentillion"), - (1e2703, "nongentillion"), - (1e3003, "millinillion") -]) - - -def nice_number_it(number, speech, denominators=range(1, 21)): - """ Italian helper for nice_number - - This function formats a float to human understandable functions. Like - 4.5 becomes "4 e un mezz" for speech and "4 1/2" for text - - Args: - number (int or float): the float to format - speech (bool): format for speech (True) or display (False) - denominators (iter of ints): denominators to use, default [1 .. 20] - Returns: - (str): The formatted string. - """ - - result = convert_to_mixed_fraction(number, denominators) - if not result: - # Give up, just represent as a 3 decimal number - return str(round(number, 3)) - - whole, num, den = result - - if not speech: - if num == 0: - return str(whole) - else: - return '{} {}/{}'.format(whole, num, den) - - if num == 0: - return str(whole) - # denominatore - den_str = FRACTION_STRING_IT[den] - # frazione - if whole == 0: - if num == 1: - # un decimo - return_string = 'un {}'.format(den_str) - else: - # tre mezzi - return_string = '{} {}'.format(num, den_str) - # interi >10 - elif num == 1: - # trenta e un - return_string = '{} e un {}'.format(whole, den_str) - # interi >10 con frazioni - else: - # venti e 3 decimi - return_string = '{} e {} {}'.format(whole, num, den_str) - - # gestisce il plurale del denominatore - if num > 1: - return_string += 'i' - else: - return_string += 'o' - - return return_string - - -def pronounce_number_it(num, places=2, short_scale=False, scientific=False): - """ - Convert a number to it's spoken equivalent - adapted to italian fron en version - - For example, '5.2' would return 'cinque virgola due' - - Args: - num(float or int): the number to pronounce (under 100) - places(int): maximum decimal places to speak - short_scale (bool) : use short (True) or long scale (False) - https://en.wikipedia.org/wiki/Names_of_large_numbers - scientific (bool): pronounce in scientific notation - Returns: - (str): The pronounced number - """ - # gestione infinito - if num == float("inf"): - return "infinito" - elif num == float("-inf"): - return "meno infinito" - - if scientific: - number = '%E' % num - n, power = number.replace("+", "").split("E") - power = int(power) - if power != 0: - return '{}{} per dieci elevato alla {}{}'.format( - 'meno ' if float(n) < 0 else '', - pronounce_number_it(abs(float(n)), places, short_scale, False), - 'meno ' if power < 0 else '', - pronounce_number_it(abs(power), places, short_scale, False)) - - if short_scale: - number_names = NUM_STRING_IT.copy() - number_names.update(SHORT_SCALE_IT) - else: - number_names = NUM_STRING_IT.copy() - number_names.update(LONG_SCALE_IT) - - digits = [number_names[n] for n in range(0, 20)] - - tens = [number_names[n] for n in range(10, 100, 10)] - - if short_scale: - hundreds = [SHORT_SCALE_IT[n] for n in SHORT_SCALE_IT.keys()] - else: - hundreds = [LONG_SCALE_IT[n] for n in LONG_SCALE_IT.keys()] - - # deal with negatives - result = "" - if num < 0: - result = "meno " - num = abs(num) - - # check for a direct match - if num in number_names: - if num > 90: - result += "" # inizio stringa - result += number_names[num] - else: - def _sub_thousand(n): - assert 0 <= n <= 999 - if n <= 19: - return digits[n] - elif n <= 99: - q, r = divmod(n, 10) - _deci = tens[q-1] - _unit = r - _partial = _deci - if _unit > 0: - if _unit == 1 or _unit == 8: - _partial = _partial[:-1] # ventuno ventotto - _partial += number_names[_unit] - return _partial - else: - q, r = divmod(n, 100) - if q == 1: - _partial = "cento" - else: - _partial = digits[q] + "cento" - _partial += ( - " " + _sub_thousand(r) if r else "") # separa centinaia - return _partial - - def _short_scale(n): - if n >= max(SHORT_SCALE_IT.keys()): - return "numero davvero enorme" - n = int(n) - assert 0 <= n - res = [] - for i, z in enumerate(_split_by(n, 1000)): - if not z: - continue - number = _sub_thousand(z) - if i: - number += "" # separa ordini grandezza - number += hundreds[i] - res.append(number) - - return ", ".join(reversed(res)) - - def _split_by(n, split=1000): - assert 0 <= n - res = [] - while n: - n, r = divmod(n, split) - res.append(r) - return res - - def _long_scale(n): - if n >= max(LONG_SCALE_IT.keys()): - return "numero davvero enorme" - n = int(n) - assert 0 <= n - res = [] - for i, z in enumerate(_split_by(n, 1000000)): - if not z: - continue - number = pronounce_number_it(z, places, True, scientific) - # strip off the comma after the thousand - if i: - # plus one as we skip 'thousand' - # (and 'hundred', but this is excluded by index value) - number = number.replace(',', '') - number += " " + hundreds[i+1] - res.append(number) - return ", ".join(reversed(res)) - - if short_scale: - result += _short_scale(num) - else: - result += _long_scale(num) - - # normalizza unità misura singole e 'ragionevoli' ed ad inizio stringa - if result == 'mila': - result = 'mille' - if result == 'milioni': - result = 'un milione' - if result == 'miliardi': - result = 'un miliardo' - if result[0:7] == 'unomila': - result = result.replace('unomila', 'mille', 1) - if result[0:10] == 'unomilioni': - result = result.replace('unomilioni', 'un milione', 1) - # if result[0:11] == 'unomiliardi': - # result = result.replace('unomiliardi', 'un miliardo', 1) - - # Deal with fractional part - if not num == int(num) and places > 0: - result += " virgola" - place = 10 - while int(num * place) % 10 > 0 and places > 0: - result += " " + number_names[int(num * place) % 10] - place *= 10 - places -= 1 - return result - - -def nice_time_it(dt, speech=True, use_24hour=False, use_ampm=False): - """ - Format a time to a comfortable human format - adapted to italian fron en version - - For example, generate 'cinque e trenta' for speech or '5:30' for - text display. - - Args: - dt (datetime): date to format (assumes already in local timezone) - speech (bool): format for speech (default/True) or display (False)=Fal - use_24hour (bool): output in 24-hour/military or 12-hour format - use_ampm (bool): include the am/pm for 12-hour format - Returns: - (str): The formatted time string - """ - if use_24hour: - # e.g. "03:01" or "14:22" - string = dt.strftime("%H:%M") - else: - if use_ampm: - # e.g. "3:01 AM" or "2:22 PM" - string = dt.strftime("%I:%M %p") - else: - # e.g. "3:01" or "2:22" - string = dt.strftime("%I:%M") - if string[0] == '0': - string = string[1:] # strip leading zeros - - if not speech: - return string - - # Generate a speakable version of the time - if use_24hour: - speak = "" - # Either "zero 8 zerozero" o "13 zerozero" - if string[0:2] == '00': - speak += "zerozero" - elif string[0] == '0': - speak += pronounce_number_it(int(string[0])) + " " - if int(string[1]) == 1: - speak = "una" - else: - speak += pronounce_number_it(int(string[1])) - else: - speak = pronounce_number_it(int(string[0:2])) - - # in italian "13 e 25" - speak += " e " - - if string[3:5] == '00': - speak += "zerozero" - else: - if string[3] == '0': - speak += pronounce_number_it(0) + " " - speak += pronounce_number_it(int(string[4])) - else: - speak += pronounce_number_it(int(string[3:5])) - return speak - else: - if dt.hour == 0 and dt.minute == 0: - return "mezzanotte" - if dt.hour == 12 and dt.minute == 0: - return "mezzogiorno" - # TODO: "10 e un quarto", "4 e tre quarti" and ot her idiomatic times - - if dt.hour == 0: - speak = "mezzanotte" - elif dt.hour == 1 or dt.hour == 13: - speak = "una" - elif dt.hour > 13: # era minore - speak = pronounce_number_it(dt.hour-12) - else: - speak = pronounce_number_it(dt.hour) - - speak += " e" - if dt.minute == 0: - speak = speak[:-2] - if not use_ampm: - speak += " in punto" - elif dt.minute == 15: - speak += " un quarto" - elif dt.minute == 45: - speak += " tre quarti" - else: - if dt.minute < 10: - speak += " zero" - speak += " " + pronounce_number_it(dt.minute) - - if use_ampm: - - if dt.hour < 4: - speak.strip() - elif dt.hour > 20: - speak += " della notte" - elif dt.hour > 17: - speak += " della sera" - elif dt.hour > 12: - speak += " del pomeriggio" - else: - speak += " della mattina" - - return speak +TODO: Remove in 20.02 +""" +from lingua_franca.lang.format_it import * diff --git a/mycroft/util/lang/format_nl.py b/mycroft/util/lang/format_nl.py index 4179bde693..cfab6301cb 100644 --- a/mycroft/util/lang/format_nl.py +++ b/mycroft/util/lang/format_nl.py @@ -14,382 +14,8 @@ # limitations under the License. # -from mycroft.util.lang.format_common import convert_to_mixed_fraction -from math import floor +"""File kept for backwards compatibility. -months = ['januari', 'februari', 'maart', 'april', 'mei', 'juni', - 'juli', 'augustus', 'september', 'oktober', 'november', - 'december'] - -NUM_STRING_NL = { - 0: 'nul', - 1: 'één', - 2: 'twee', - 3: 'drie', - 4: 'vier', - 5: 'vijf', - 6: 'zes', - 7: 'zeven', - 8: 'acht', - 9: 'negen', - 10: 'tien', - 11: 'elf', - 12: 'twaalf', - 13: 'dertien', - 14: 'veertien', - 15: 'vijftien', - 16: 'zestien', - 17: 'zeventien', - 18: 'actien', - 19: 'negentien', - 20: 'twintig', - 30: 'dertig', - 40: 'veertig', - 50: 'vijftig', - 60: 'zestig', - 70: 'zeventig', - 80: 'tachtig', - 90: 'negentig', - 100: 'honderd' -} - -# German uses "long scale" https://en.wikipedia.org/wiki/Long_and_short_scales -# Currently, numbers are limited to 1000000000000000000000000, -# but NUM_POWERS_OF_TEN can be extended to include additional number words - - -NUM_POWERS_OF_TEN = [ - '', 'duizend', 'miljoen', 'miljard', 'biljoen', 'biljard', 'triljoen', - 'triljard' -] - -FRACTION_STRING_NL = { - 2: 'half', - 3: 'derde', - 4: 'vierde', - 5: 'vijfde', - 6: 'zesde', - 7: 'zevende', - 8: 'achtste', - 9: 'negende', - 10: 'tiende', - 11: 'elfde', - 12: 'twaalfde', - 13: 'dertiende', - 14: 'veertiende', - 15: 'vijftiende', - 16: 'zestiende', - 17: 'zeventiende', - 18: 'achttiende', - 19: 'negentiende', - 20: 'twintigste' -} - -# Numbers below 1 million are written in one word in dutch, yielding very -# long words -# In some circumstances it may better to seperate individual words -# Set EXTRA_SPACE=" " for separating numbers below 1 million ( -# orthographically incorrect) -# Set EXTRA_SPACE="" for correct spelling, this is standard - -# EXTRA_SPACE = " " -EXTRA_SPACE = "" - - -def nice_number_nl(number, speech, denominators=range(1, 21)): - """ Dutch helper for nice_number - This function formats a float to human understandable functions. Like - 4.5 becomes "4 einhalb" for speech and "4 1/2" for text - Args: - number (int or float): the float to format - speech (bool): format for speech (True) or display (False) - denominators (iter of ints): denominators to use, default [1 .. 20] - Returns: - (str): The formatted string. - """ - result = convert_to_mixed_fraction(number, denominators) - if not result: - # Give up, just represent as a 3 decimal number - return str(round(number, 3)).replace(".", ",") - whole, num, den = result - if not speech: - if num == 0: - # TODO: Number grouping? E.g. "1,000,000" - return str(whole) - else: - return '{} {}/{}'.format(whole, num, den) - if num == 0: - return str(whole) - den_str = FRACTION_STRING_NL[den] - if whole == 0: - if num == 1: - return_string = 'één {}'.format(den_str) - else: - return_string = '{} {}'.format(num, den_str) - elif num == 1: - return_string = '{} en één {}'.format(whole, den_str) - else: - return_string = '{} en {} {}'.format(whole, num, den_str) - - return return_string - - -def pronounce_number_nl(num, places=2): - """ - Convert a number to its spoken equivalent - For example, '5.2' would return 'five point two' - Args: - num(float or int): the number to pronounce (set limit below) - places(int): maximum decimal places to speak - Returns: - (str): The pronounced number - - """ - - def pronounce_triplet_nl(num): - result = "" - num = floor(num) - if num > 99: - hundreds = floor(num / 100) - if hundreds > 0: - result += NUM_STRING_NL[ - hundreds] + EXTRA_SPACE + 'honderd' + EXTRA_SPACE - num -= hundreds * 100 - if num == 0: - result += '' # do nothing - elif num <= 20: - result += NUM_STRING_NL[num] # + EXTRA_SPACE - elif num > 20: - ones = num % 10 - tens = num - ones - if ones > 0: - result += NUM_STRING_NL[ones] + EXTRA_SPACE - if tens > 0: - result += 'en' + EXTRA_SPACE - if tens > 0: - result += NUM_STRING_NL[tens] + EXTRA_SPACE - return result - - def pronounce_fractional_nl(num, - places): # fixed number of places even with - # trailing zeros - result = "" - place = 10 - while places > 0: # doesn't work with 1.0001 and places = 2: int( - # num*place) % 10 > 0 and places > 0: - result += " " + NUM_STRING_NL[int(num * place) % 10] - if int(num * place) % 10 == 1: - result += '' # "1" is pronounced "eins" after the decimal - # point - place *= 10 - places -= 1 - return result - - def pronounce_whole_number_nl(num, scale_level=0): - if num == 0: - return '' - - num = floor(num) - result = '' - last_triplet = num % 1000 - - if last_triplet == 1: - if scale_level == 0: - if result != '': - result += '' + 'één' - else: - result += "één" - elif scale_level == 1: - result += 'één' + EXTRA_SPACE + 'duizend' + EXTRA_SPACE - else: - result += "één " + NUM_POWERS_OF_TEN[scale_level] + ' ' - elif last_triplet > 1: - result += pronounce_triplet_nl(last_triplet) - if scale_level == 1: - # result += EXTRA_SPACE - result += 'duizend' + EXTRA_SPACE - if scale_level >= 2: - # if EXTRA_SPACE == '': - # result += " " - result += " " + NUM_POWERS_OF_TEN[scale_level] + ' ' - if scale_level >= 2: - if scale_level % 2 == 0: - result += "" # Miljioen - result += "" # Miljard, Miljoen - - num = floor(num / 1000) - scale_level += 1 - return pronounce_whole_number_nl(num, - scale_level) + result + '' - - result = "" - if abs(num) >= 1000000000000000000000000: # cannot do more than this - return str(num) - elif num == 0: - return str(NUM_STRING_NL[0]) - elif num < 0: - return "min " + pronounce_number_nl(abs(num), places) - else: - if num == int(num): - return pronounce_whole_number_nl(num) - else: - whole_number_part = floor(num) - fractional_part = num - whole_number_part - result += pronounce_whole_number_nl(whole_number_part) - if places > 0: - result += " komma" - result += pronounce_fractional_nl(fractional_part, places) - return result - - -def pronounce_ordinal_nl(num): - ordinals = ["nulste", "eerste", "tweede", "derde", "vierde", "vijfde", - "zesde", "zevende", "achtste"] - - # only for whole positive numbers including zero - if num < 0 or num != int(num): - return num - if num < 4: - return ordinals[num] - if num < 8: - return pronounce_number_nl(num) + "de" - if num < 9: - return pronounce_number_nl(num) + "ste" - if num < 20: - return pronounce_number_nl(num) + "de" - return pronounce_number_nl(num) + "ste" - - -def nice_time_nl(dt, speech=True, use_24hour=False, use_ampm=False): - """ - Format a time to a comfortable human format - - For example, generate 'five thirty' for speech or '5:30' for - text display. - - Args: - dt (datetime): date to format (assumes already in local timezone) - speech (bool): format for speech (default/True) or display (False)=Fal - use_24hour (bool): output in 24-hour/military or 12-hour format - use_ampm (bool): include the am/pm for 12-hour format - Returns: - (str): The formatted time string - """ - if use_24hour: - # e.g. "03:01" or "14:22" - string = dt.strftime("%H:%M") - else: - if use_ampm: - # e.g. "3:01 AM" or "2:22 PM" - string = dt.strftime("%I:%M %p") - else: - # e.g. "3:01" or "2:22" - string = dt.strftime("%I:%M") - if string[0] == '0': - string = string[1:] # strip leading zeros - - if not speech: - return string - - # Generate a speakable version of the time - speak = "" - if use_24hour: - speak += pronounce_number_nl(dt.hour) - speak += " uur" - if not dt.minute == 0: # zero minutes are not pronounced, 13:00 is - # "13 uur" not "13 hundred hours" - speak += " " + pronounce_number_nl(dt.minute) - return speak # ampm is ignored when use_24hour is true - else: - if dt.hour == 0 and dt.minute == 0: - return "Middernacht" - hour = dt.hour % 12 - if dt.minute == 0: - hour = fix_hour(hour) - speak += pronounce_number_nl(hour) - speak += " uur" - elif dt.minute == 30: - speak += "half " - hour += 1 - hour = fix_hour(hour) - speak += pronounce_number_nl(hour) - elif dt.minute == 15: - speak += "kwart over " - hour = fix_hour(hour) - speak += pronounce_number_nl(hour) - elif dt.minute == 45: - speak += "kwart voor " - hour += 1 - hour = fix_hour(hour) - speak += pronounce_number_nl(hour) - elif dt.minute > 30: - speak += pronounce_number_nl(60 - dt.minute) - speak += " voor " - hour += 1 - hour = fix_hour(hour) - speak += pronounce_number_nl(hour) - else: - speak += pronounce_number_nl(dt.minute) - speak += " over " - hour = fix_hour(hour) - speak += pronounce_number_nl(hour) - - if use_ampm: - speak += nice_part_of_day_nl(dt) - - return speak - - -def fix_hour(hour): - hour = hour % 12 - if hour == 0: - hour = 12 - return hour - - -def nice_part_of_day_nl(dt): - if dt.hour < 6: - return " 's nachts" - if dt.hour < 12: - return " 's ochtends" - if dt.hour < 18: - return " 's middags" - if dt.hour < 24: - return " 's avonds" - raise Exception('dt.hour is bigger than 24') - - -def nice_response_nl(text): - # check for months and call nice_ordinal_nl declension of ordinals - # replace "^" with "tot de macht" (to the power of) - words = text.split() - - for idx, word in enumerate(words): - if word.lower() in months: - text = nice_ordinal_nl(text) - - if word == '^': - wordNext = words[idx + 1] if idx + 1 < len(words) else "" - if wordNext.isnumeric(): - words[idx] = "tot de macht" - text = " ".join(words) - return text - - -def nice_ordinal_nl(text): - # check for months for declension of ordinals before months - # depending on articles/prepositions - normalized_text = text - words = text.split() - for idx, word in enumerate(words): - wordNext = words[idx + 1] if idx + 1 < len(words) else "" - wordPrev = words[idx - 1] if idx > 0 else "" - if word[:-1].isdecimal(): - if wordNext.lower() in months: - if wordPrev == 'de': - word = pronounce_ordinal_nl(int(word)) - else: - word = pronounce_number_nl(int(word)) - words[idx] = word - normalized_text = " ".join(words) - return normalized_text +TODO: Remove in 20.02 +""" +from lingua_franca.lang.format_nl import * diff --git a/mycroft/util/lang/format_pt.py b/mycroft/util/lang/format_pt.py index a6da14a948..63db7c204e 100644 --- a/mycroft/util/lang/format_pt.py +++ b/mycroft/util/lang/format_pt.py @@ -14,209 +14,8 @@ # limitations under the License. # -from mycroft.util.lang.format_common import convert_to_mixed_fraction -from mycroft.util.lang.common_data_pt import _FRACTION_STRING_PT, \ - _NUM_STRING_PT +"""File kept for backwards compatibility. - -def nice_number_pt(number, speech, denominators=range(1, 21)): - """ Portuguese helper for nice_number - - This function formats a float to human understandable functions. Like - 4.5 becomes "4 e meio" for speech and "4 1/2" for text - - Args: - number (int or float): the float to format - speech (bool): format for speech (True) or display (False) - denominators (iter of ints): denominators to use, default [1 .. 20] - Returns: - (str): The formatted string. - """ - - result = convert_to_mixed_fraction(number, denominators) - if not result: - # Give up, just represent as a 3 decimal number - return str(round(number, 3)) - - whole, num, den = result - - if not speech: - if num == 0: - # TODO: Number grouping? E.g. "1,000,000" - return str(whole) - else: - return '{} {}/{}'.format(whole, num, den) - - if num == 0: - return str(whole) - # denominador - den_str = _FRACTION_STRING_PT[den] - # fracções - if whole == 0: - if num == 1: - # um décimo - return_string = 'um {}'.format(den_str) - else: - # três meio - return_string = '{} {}'.format(num, den_str) - # inteiros >10 - elif num == 1: - # trinta e um - return_string = '{} e {}'.format(whole, den_str) - # inteiros >10 com fracções - else: - # vinte e 3 décimo - return_string = '{} e {} {}'.format(whole, num, den_str) - # plural - if num > 1: - return_string += 's' - return return_string - - -def pronounce_number_pt(num, places=2): - """ - Convert a number to it's spoken equivalent - For example, '5.2' would return 'cinco virgula dois' - Args: - num(float or int): the number to pronounce (under 100) - places(int): maximum decimal places to speak - Returns: - (str): The pronounced number - """ - if abs(num) >= 100: - # TODO: Support n > 100 - return str(num) - - result = "" - if num < 0: - result = "menos " - num = abs(num) - - if num >= 20: - tens = int(num - int(num) % 10) - ones = int(num - tens) - result += _NUM_STRING_PT[tens] - if ones > 0: - result += " e " + _NUM_STRING_PT[ones] - else: - result += _NUM_STRING_PT[int(num)] - - # Deal with decimal part, in portuguese is commonly used the comma - # instead the dot. Decimal part can be written both with comma - # and dot, but when pronounced, its pronounced "virgula" - if not num == int(num) and places > 0: - result += " vírgula" - place = 10 - while int(num * place) % 10 > 0 and places > 0: - result += " " + _NUM_STRING_PT[int(num * place) % 10] - place *= 10 - places -= 1 - return result - - -def nice_time_pt(dt, speech=True, use_24hour=False, use_ampm=False): - """ - Format a time to a comfortable human format - For example, generate 'cinco treinta' for speech or '5:30' for - text display. - Args: - dt (datetime): date to format (assumes already in local timezone) - speech (bool): format for speech (default/True) or display (False)=Fal - use_24hour (bool): output in 24-hour/military or 12-hour format - use_ampm (bool): include the am/pm for 12-hour format - Returns: - (str): The formatted time string - """ - if use_24hour: - # e.g. "03:01" or "14:22" - string = dt.strftime("%H:%M") - else: - if use_ampm: - # e.g. "3:01 AM" or "2:22 PM" - string = dt.strftime("%I:%M %p") - else: - # e.g. "3:01" or "2:22" - string = dt.strftime("%I:%M") - if string[0] == '0': - string = string[1:] # strip leading zeros - - if not speech: - return string - - # Generate a speakable version of the time - speak = "" - if use_24hour: - # simply speak the number - if dt.hour == 1: - speak += "uma" - else: - speak += pronounce_number_pt(dt.hour) - - # equivalent to "quarter past ten" - if dt.minute > 0: - speak += " e " + pronounce_number_pt(dt.minute) - - else: - # speak number and add daytime identifier - # (equivalent to "in the morning") - if dt.minute == 35: - minute = -25 - hour = dt.hour + 1 - elif dt.minute == 40: - minute = -20 - hour = dt.hour + 1 - elif dt.minute == 45: - minute = -15 - hour = dt.hour + 1 - elif dt.minute == 50: - minute = -10 - hour = dt.hour + 1 - elif dt.minute == 55: - minute = -5 - hour = dt.hour + 1 - else: - minute = dt.minute - hour = dt.hour - - if hour == 0: - speak += "meia noite" - elif hour == 12: - speak += "meio dia" - # 1 and 2 are pronounced in female form when talking about hours - elif hour == 1 or hour == 13: - speak += "uma" - elif hour == 2 or hour == 14: - speak += "duas" - elif hour < 13: - speak = pronounce_number_pt(hour) - else: - speak = pronounce_number_pt(hour - 12) - - if minute != 0: - if minute == 15: - speak += " e um quarto" - elif minute == 30: - speak += " e meia" - elif minute == -15: - speak += " menos um quarto" - else: - if minute > 0: - speak += " e " + pronounce_number_pt(minute) - else: - speak += " " + pronounce_number_pt(minute) - - # exact time - if minute == 0 and not use_ampm: - # 3:00 - speak += " em ponto" - - if use_ampm: - if hour > 0 and hour < 6: - speak += " da madrugada" - elif hour >= 6 and hour < 12: - speak += " da manhã" - elif hour >= 13 and hour < 21: - speak += " da tarde" - elif hour != 0 and hour != 12: - speak += " da noite" - return speak +TODO: Remove in 20.02 +""" +from lingua_franca.lang.format_pt import * diff --git a/mycroft/util/lang/format_sv.py b/mycroft/util/lang/format_sv.py index ab36d1fec9..3f0e4a05ed 100644 --- a/mycroft/util/lang/format_sv.py +++ b/mycroft/util/lang/format_sv.py @@ -14,411 +14,8 @@ # limitations under the License. # -from mycroft.util.lang.format_common import convert_to_mixed_fraction -from math import floor +"""File kept for backwards compatibility. -months = ['januari', 'februari', 'mars', 'april', 'maj', 'juni', - 'juli', 'augusti', 'september', 'oktober', 'november', - 'december'] - -NUM_STRING_SV = { - 0: 'noll', - 1: 'en', - 2: 'två', - 3: 'tre', - 4: 'fyra', - 5: 'fem', - 6: 'sex', - 7: 'sju', - 8: 'åtta', - 9: 'nio', - 10: 'tio', - 11: 'elva', - 12: 'tolv', - 13: 'tretton', - 14: 'fjorton', - 15: 'femton', - 16: 'sexton', - 17: 'sjutton', - 18: 'arton', - 19: 'nitton', - 20: 'tjugo', - 30: 'trettio', - 40: 'fyrtio', - 50: 'femtio', - 60: 'sextio', - 70: 'sjuttio', - 80: 'åttio', - 90: 'nittio', - 100: 'hundra' -} - -NUM_POWERS_OF_TEN = [ - 'hundra', - 'tusen', - 'miljon', - 'miljard', - 'biljon', - 'biljard', - 'triljon', - 'triljard' -] - -FRACTION_STRING_SV = { - 2: 'halv', - 3: 'tredjedel', - 4: 'fjärdedel', - 5: 'femtedel', - 6: 'sjättedel', - 7: 'sjundedel', - 8: 'åttondel', - 9: 'niondel', - 10: 'tiondel', - 11: 'elftedel', - 12: 'tolftedel', - 13: 'trettondel', - 14: 'fjortondel', - 15: 'femtondel', - 16: 'sextondel', - 17: 'sjuttondel', - 18: 'artondel', - 19: 'nittondel', - 20: 'tjugondel' -} - -EXTRA_SPACE = " " - - -def nice_number_sv(number, speech, denominators=range(1, 21)): - """ Swedish helper for nice_number - - This function formats a float to human understandable functions. Like - 4.5 becomes "4 och en halv" for speech and "4 1/2" for text - - Args: - number (int or float): the float to format - speech (bool): format for speech (True) or display (False) - denominators (iter of ints): denominators to use, default [1 .. 20] - Returns: - (str): The formatted string. - """ - result = convert_to_mixed_fraction(number, denominators) - if not result: - # Give up, just represent as a 3 decimal number - return str(round(number, 3)) - - whole, num, den = result - - if not speech: - if num == 0: - # TODO: Number grouping? E.g. "1,000,000" - return str(whole) - else: - return '{} {}/{}'.format(whole, num, den) - - if num == 0: - return str(whole) - den_str = FRACTION_STRING_SV[den] - if whole == 0: - if num == 1: - return_string = 'en {}'.format(den_str) - else: - return_string = '{} {}'.format(num, den_str) - elif num == 1: - return_string = '{} och en {}'.format(whole, den_str) - else: - return_string = '{} och {} {}'.format(whole, num, den_str) - if num > 1: - return_string += 'ar' - return return_string - - -def pronounce_number_sv(num, places=2): - """ - Convert a number to its spoken equivalent - For example, '5.2' would return 'five point two' - Args: - num(float or int): the number to pronounce (set limit below) - places(int): maximum decimal places to speak - Returns: - (str): The pronounced number - - """ - - def pronounce_triplet_sv(num): - result = "" - num = floor(num) - - if num > 99: - hundreds = floor(num / 100) - if hundreds > 0: - if hundreds == 1: - result += 'ett' + 'hundra' - else: - result += NUM_STRING_SV[hundreds] + 'hundra' - - num -= hundreds * 100 - - if num == 0: - result += '' # do nothing - elif num == 1: - result += 'ett' - elif num <= 20: - result += NUM_STRING_SV[num] - elif num > 20: - tens = num % 10 - ones = num - tens - - if ones > 0: - result += NUM_STRING_SV[ones] - if tens > 0: - result += NUM_STRING_SV[tens] - - return result - - def pronounce_fractional_sv(num, places): - # fixed number of places even with trailing zeros - result = "" - place = 10 - while places > 0: - # doesn't work with 1.0001 and places = 2: int( - # num*place) % 10 > 0 and places > 0: - result += " " + NUM_STRING_SV[int(num * place) % 10] - place *= 10 - places -= 1 - return result - - def pronounce_whole_number_sv(num, scale_level=0): - if num == 0: - return '' - - num = floor(num) - result = '' - last_triplet = num % 1000 - - if last_triplet == 1: - if scale_level == 0: - if result != '': - result += '' + 'ett' - else: - result += 'en' - elif scale_level == 1: - result += 'ettusen' + EXTRA_SPACE - else: - result += 'en ' + NUM_POWERS_OF_TEN[scale_level] + EXTRA_SPACE - elif last_triplet > 1: - result += pronounce_triplet_sv(last_triplet) - if scale_level == 1: - result += 'tusen' + EXTRA_SPACE - if scale_level >= 2: - result += NUM_POWERS_OF_TEN[scale_level] - if scale_level >= 2: - result += 'er' + EXTRA_SPACE # MiljonER - - num = floor(num / 1000) - scale_level += 1 - return pronounce_whole_number_sv(num, scale_level) + result - - result = "" - if abs(num) >= 1000000000000000000000000: # cannot do more than this - return str(num) - elif num == 0: - return str(NUM_STRING_SV[0]) - elif num < 0: - return "minus " + pronounce_number_sv(abs(num), places) - else: - if num == int(num): - return pronounce_whole_number_sv(num) - else: - whole_number_part = floor(num) - fractional_part = num - whole_number_part - result += pronounce_whole_number_sv(whole_number_part) - if places > 0: - result += " komma" - result += pronounce_fractional_sv(fractional_part, places) - return result - - -def pronounce_ordinal_sv(num): - # ordinals for 1, 3, 7 and 8 are irregular - # this produces the base form, it will have to be adapted for genus, - # casus, numerus - - ordinals = ["noll", "första", "andra", "tredje", "fjärde", "femte", - "sjätte", "sjunde", "åttonde", "nionde", "tionde"] - - tens = int(floor(num / 10.0)) * 10 - ones = num % 10 - - if num < 0 or num != int(num): - return num - if num == 0: - return ordinals[num] - - result = "" - if num > 10: - result += pronounce_number_sv(tens).rstrip() - - if ones > 0: - result += ordinals[ones] - else: - result += 'de' - - return result - - -def nice_time_sv(dt, speech=True, use_24hour=False, use_ampm=False): - """ - Format a time to a comfortable human format - - For example, generate 'five thirty' for speech or '5:30' for - text display. - - Args: - dt (datetime): date to format (assumes already in local timezone) - speech (bool): format for speech (default/True) or display (False)=Fal - use_24hour (bool): output in 24-hour/military or 12-hour format - use_ampm (bool): include the am/pm for 12-hour format - Returns: - (str): The formatted time string - """ - if use_24hour: - # e.g. "03:01" or "14:22" - string = dt.strftime("%H:%M") - else: - if use_ampm: - # e.g. "3:01 AM" or "2:22 PM" - string = dt.strftime("%I:%M %p") - else: - # e.g. "3:01" or "2:22" - string = dt.strftime("%I:%M") - - if not speech: - return string - - # Generate a speakable version of the time - speak = "" - if use_24hour: - if dt.hour == 1: - speak += "ett" # 01:00 is "ett" not "en" - else: - speak += pronounce_number_sv(dt.hour) - if not dt.minute == 0: - if dt.minute < 10: - speak += ' noll' - - if dt.minute == 1: - speak += ' ett' - else: - speak += " " + pronounce_number_sv(dt.minute) - - return speak # ampm is ignored when use_24hour is true - else: - hour = dt.hour - - if not dt.minute == 0: - if dt.minute < 30: - if dt.minute != 15: - speak += pronounce_number_sv(dt.minute) - else: - speak += 'kvart' - - if dt.minute == 1: - speak += ' minut över ' - elif dt.minute != 10 and dt.minute != 5 and dt.minute != 15: - speak += ' minuter över ' - else: - speak += ' över ' - elif dt.minute > 30: - if dt.minute != 45: - speak += pronounce_number_sv((60 - dt.minute)) - else: - speak += 'kvart' - - if dt.minute == 1: - speak += ' minut i ' - elif dt.minute != 50 and dt.minute != 55 and dt.minute != 45: - speak += ' minuter i ' - else: - speak += ' i ' - - hour = (hour + 1) % 12 - elif dt.minute == 30: - speak += 'halv ' - hour = (hour + 1) % 12 - - if hour == 0 and dt.minute == 0: - return "midnatt" - if hour == 12 and dt.minute == 0: - return "middag" - # TODO: "half past 3", "a quarter of 4" and other idiomatic times - - if hour == 0: - speak += pronounce_number_sv(12) - elif hour <= 13: - if hour == 1 or hour == 13: # 01:00 and 13:00 is "ett" - speak += 'ett' - else: - speak += pronounce_number_sv(hour) - else: - speak += pronounce_number_sv(hour - 12) - - if use_ampm: - if dt.hour > 11: - if dt.hour < 18: - # 12:01 - 17:59 nachmittags/afternoon - speak += " på eftermiddagen" - elif dt.hour < 22: - # 18:00 - 21:59 abends/evening - speak += " på kvällen" - else: - # 22:00 - 23:59 nachts/at night - speak += " på natten" - elif dt.hour < 3: - # 00:01 - 02:59 nachts/at night - speak += " på natten" - else: - # 03:00 - 11:59 morgens/in the morning - speak += " på morgonen" - - return speak - - -def nice_response_sv(text): - # check for months and call nice_ordinal_sv declension of ordinals - # replace "^" with "hoch" (to the power of) - words = text.split() - - for idx, word in enumerate(words): - if word.lower() in months: - text = nice_ordinal_sv(text) - - if word == '^': - wordNext = words[idx + 1] if idx + 1 < len(words) else "" - if wordNext.isnumeric(): - words[idx] = "upphöjt till" - text = " ".join(words) - return text - - -def nice_ordinal_sv(text): - # check for months for declension of ordinals before months - # depending on articles/prepositions - normalized_text = text - words = text.split() - - for idx, word in enumerate(words): - wordNext = words[idx + 1] if idx + 1 < len(words) else "" - wordPrev = words[idx - 1] if idx > 0 else "" - if word[-1:] == ".": - if word[:-1].isdecimal(): - if wordNext.lower() in months: - word = pronounce_ordinal_sv(int(word[:-1])) - if wordPrev.lower() in ["om", "den", "från", "till", - "(från", "(om", "till"]: - word += "n" - elif wordPrev.lower() not in ["den"]: - word += "r" - words[idx] = word - normalized_text = " ".join(words) - return normalized_text +TODO: Remove in 20.02 +""" +from lingua_franca.lang.format_sv import * diff --git a/mycroft/util/lang/parse_common.py b/mycroft/util/lang/parse_common.py index 2281439a23..ccafa24e91 100644 --- a/mycroft/util/lang/parse_common.py +++ b/mycroft/util/lang/parse_common.py @@ -14,89 +14,8 @@ # limitations under the License. # +"""File kept for backwards compatibility. -def is_numeric(input_str): - """ - Takes in a string and tests to see if it is a number. - Args: - text (str): string to test if a number - Returns: - (bool): True if a number, else False - - """ - - try: - float(input_str) - return True - except ValueError: - return False - - -def look_for_fractions(split_list): - """" - This function takes a list made by fraction & determines if a fraction. - - Args: - split_list (list): list created by splitting on '/' - Returns: - (bool): False if not a fraction, otherwise True - - """ - - if len(split_list) == 2: - if is_numeric(split_list[0]) and is_numeric(split_list[1]): - return True - - return False - - -def extract_numbers_generic(text, pronounce_handler, extract_handler, - short_scale=True, ordinals=False): - """ - Takes in a string and extracts a list of numbers. - Language agnostic, per language parsers need to be provided - - Args: - text (str): the string to extract a number from - pronounce_handler (function): function that pronounces a number - extract_handler (function): function that extracts the last number - present in a string - short_scale (bool): Use "short scale" or "long scale" for large - numbers -- over a million. The default is short scale, which - is now common in most English speaking countries. - See https://en.wikipedia.org/wiki/Names_of_large_numbers - ordinals (bool): consider ordinal numbers, e.g. third=3 instead of 1/3 - Returns: - list: list of extracted numbers as floats - """ - numbers = [] - normalized = text - extract = extract_handler(normalized, short_scale, ordinals) - to_parse = normalized - while extract: - numbers.append(extract) - prev = to_parse - num_txt = pronounce_handler(extract) - extract = str(extract) - if extract.endswith(".0"): - extract = extract[:-2] - - # handle duplicate occurences, replace last one only - def replace_right(source, target, replacement, replacements=None): - return replacement.join(source.rsplit(target, replacements)) - - normalized = replace_right(normalized, num_txt, extract, 1) - # last biggest number was replaced, recurse to handle cases like - # test one two 3 - to_parse = replace_right(to_parse, num_txt, extract, 1) - to_parse = replace_right(to_parse, extract, " ", 1) - if to_parse == prev: - # avoid infinite loops, occasionally pronounced number may be - # different from extracted text, - # ie pronounce(0.5) != half and extract(half) == 0.5 - extract = False - # TODO fix this - else: - extract = extract_handler(to_parse, short_scale, ordinals) - numbers.reverse() - return numbers +TODO: Remove in 20.02 +""" +from lingua_franca.lang.parse_common import * diff --git a/mycroft/util/lang/parse_da.py b/mycroft/util/lang/parse_da.py index 0ab296bcff..3a8ed6f935 100644 --- a/mycroft/util/lang/parse_da.py +++ b/mycroft/util/lang/parse_da.py @@ -13,920 +13,9 @@ # See the License for the specific language governing permissions and # limitations under the License. # -from datetime import datetime -from dateutil.relativedelta import relativedelta -from mycroft.util.lang.parse_common import is_numeric, look_for_fractions, \ - extract_numbers_generic -from mycroft.util.lang.format_da import pronounce_number_da -da_numbers = { - 'nul': 0, - 'en': 1, - 'et': 1, - 'to': 2, - 'tre': 3, - 'fire': 4, - 'fem': 5, - 'seks': 6, - 'syv': 7, - 'otte': 8, - 'ni': 9, - 'ti': 10, - 'elve': 11, - 'tolv': 12, - 'tretten': 13, - 'fjorten': 14, - 'femten': 15, - 'seksten': 16, - 'sytten': 17, - 'atten': 18, - 'nitten': 19, - 'tyve': 20, - 'enogtyve': 21, - 'toogtyve': 22, - 'treogtyve': 23, - 'fireogtyve': 24, - 'femogtyve': 25, - 'seksogtyve': 26, - 'syvogtyve': 27, - 'otteogtyve': 28, - 'niogtyve': 29, - 'tredive': 30, - 'enogtredive': 31, - 'fyrrre': 40, - 'halvtres': 50, - 'tres': 60, - 'halvfjers': 70, - 'firs': 80, - 'halvfems': 90, - 'hunderede': 100, - 'tohundrede': 200, - 'trehundrede': 300, - 'firehundrede': 400, - 'femhundrede': 500, - 'sekshundrede': 600, - 'syvhundrede': 700, - 'ottehundrede': 800, - 'nihundrede': 900, - 'tusinde': 1000, - 'million': 1000000 -} +"""File kept for backwards compatibility. - -def extractnumber_da(text): - """ - This function prepares the given text for parsing by making - numbers consistent, getting rid of contractions, etc. - Args: - text (str): the string to normalize - Returns: - (int) or (float): The value of extracted number - - - undefined articles cannot be suppressed in German: - 'ein Pferd' means 'one horse' and 'a horse' - - """ - aWords = text.split() - aWords = [word for word in aWords if - word not in ["den", "det"]] - and_pass = False - valPreAnd = False - val = False - count = 0 - while count < len(aWords): - word = aWords[count] - if is_numeric(word): - if word.isdigit(): # doesn't work with decimals - val = float(word) - elif isFractional_da(word): - val = isFractional_da(word) - elif isOrdinal_da(word): - val = isOrdinal_da(word) - else: - if word in da_numbers: - val = da_numbers[word] - if count < (len(aWords) - 1): - wordNext = aWords[count + 1] - else: - wordNext = "" - valNext = isFractional_da(wordNext) - - if valNext: - val = val * valNext - aWords[count + 1] = "" - - if not val: - # look for fractions like "2/3" - aPieces = word.split('/') - # if (len(aPieces) == 2 and is_numeric(aPieces[0]) - # and is_numeric(aPieces[1])): - if look_for_fractions(aPieces): - val = float(aPieces[0]) / float(aPieces[1]) - elif and_pass: - # added to value, quit here - val = valPreAnd - break - else: - count += 1 - continue - - aWords[count] = "" - - if and_pass: - aWords[count - 1] = '' # remove "og" - val += valPreAnd - elif count + 1 < len(aWords) and aWords[count + 1] == 'og': - and_pass = True - valPreAnd = val - val = False - count += 2 - continue - elif count + 2 < len(aWords) and aWords[count + 2] == 'og': - and_pass = True - valPreAnd = val - val = False - count += 3 - continue - - break - - if not val: - return False - - return val - - -def extract_datetime_da(string, currentDate, default_time): - def clean_string(s): - """ - cleans the input string of unneeded punctuation - and capitalization among other things. - - 'am' is a preposition, so cannot currently be used - for 12 hour date format - """ - - s = s.lower().replace('?', '').replace('.', '').replace(',', '') \ - .replace(' den ', ' ').replace(' det ', ' ').replace(' om ', - ' ').replace( - ' om ', ' ') \ - .replace(' på ', ' ').replace(' om ', ' ') - wordList = s.split() - - for idx, word in enumerate(wordList): - if isOrdinal_da(word) is not False: - word = str(isOrdinal_da(word)) - wordList[idx] = word - - return wordList - - def date_found(): - return found or \ - ( - datestr != "" or timeStr != "" or - yearOffset != 0 or monthOffset != 0 or - dayOffset is True or hrOffset != 0 or - hrAbs or minOffset != 0 or - minAbs or secOffset != 0 - ) - - if string == "" or not currentDate: - return None - - found = False - daySpecified = False - dayOffset = False - monthOffset = 0 - yearOffset = 0 - dateNow = currentDate - today = dateNow.strftime("%w") - currentYear = dateNow.strftime("%Y") - fromFlag = False - datestr = "" - hasYear = False - timeQualifier = "" - - timeQualifiersList = ['tidlig', - 'morgen', - 'morgenen', - 'formidag', - 'formiddagen', - 'eftermiddag', - 'eftermiddagen', - 'aften', - 'aftenen', - 'nat', - 'natten'] - markers = ['i', 'om', 'på', 'klokken', 'ved'] - days = ['mandag', 'tirsdag', 'onsdag', - 'torsdag', 'fredag', 'lørdag', 'søndag'] - months = ['januar', 'februar', 'marts', 'april', 'maj', 'juni', - 'juli', 'august', 'september', 'oktober', 'november', - 'desember'] - monthsShort = ['jan', 'feb', 'mar', 'apr', 'maj', 'juni', 'juli', 'aug', - 'sep', 'okt', 'nov', 'des'] - - validFollowups = days + months + monthsShort - validFollowups.append("i dag") - validFollowups.append("morgen") - validFollowups.append("næste") - validFollowups.append("forige") - validFollowups.append("nu") - - words = clean_string(string) - - for idx, word in enumerate(words): - if word == "": - continue - wordPrevPrev = words[idx - 2] if idx > 1 else "" - wordPrev = words[idx - 1] if idx > 0 else "" - wordNext = words[idx + 1] if idx + 1 < len(words) else "" - wordNextNext = words[idx + 2] if idx + 2 < len(words) else "" - - start = idx - used = 0 - # save timequalifier for later - if word in timeQualifiersList: - timeQualifier = word - # parse today, tomorrow, day after tomorrow - elif word == "dag" and not fromFlag: - dayOffset = 0 - used += 1 - elif word == "morgen" and not fromFlag and wordPrev != "om" and \ - wordPrev not in days: # morgen means tomorrow if not "am - # Morgen" and not [day of the week] morgen - dayOffset = 1 - used += 1 - elif word == "overmorgen" and not fromFlag: - dayOffset = 2 - used += 1 - # parse 5 days, 10 weeks, last week, next week - elif word == "dag" or word == "dage": - if wordPrev[0].isdigit(): - dayOffset += int(wordPrev) - start -= 1 - used = 2 - elif word == "uge" or word == "uger" and not fromFlag: - if wordPrev[0].isdigit(): - dayOffset += int(wordPrev) * 7 - start -= 1 - used = 2 - elif wordPrev[:6] == "næste": - dayOffset = 7 - start -= 1 - used = 2 - elif wordPrev[:5] == "forige": - dayOffset = -7 - start -= 1 - used = 2 - # parse 10 months, next month, last month - elif word == "måned" and not fromFlag: - if wordPrev[0].isdigit(): - monthOffset = int(wordPrev) - start -= 1 - used = 2 - elif wordPrev[:6] == "næste": - monthOffset = 1 - start -= 1 - used = 2 - elif wordPrev[:5] == "forige": - monthOffset = -1 - start -= 1 - used = 2 - # parse 5 years, next year, last year - elif word == "år" and not fromFlag: - if wordPrev[0].isdigit(): - yearOffset = int(wordPrev) - start -= 1 - used = 2 - elif wordPrev[:6] == " næste": - yearOffset = 1 - start -= 1 - used = 2 - elif wordPrev[:6] == "næste": - yearOffset = -1 - start -= 1 - used = 2 - # parse Monday, Tuesday, etc., and next Monday, - # last Tuesday, etc. - elif word in days and not fromFlag: - d = days.index(word) - dayOffset = (d + 1) - int(today) - used = 1 - if dayOffset < 0: - dayOffset += 7 - if wordNext == "morgen": - # morgen means morning if preceded by - # the day of the week - words[idx + 1] = "tidlig" - if wordPrev[:6] == "næste": - dayOffset += 7 - used += 1 - start -= 1 - elif wordPrev[:5] == "forige": - dayOffset -= 7 - used += 1 - start -= 1 - # parse 15 of July, June 20th, Feb 18, 19 of February - elif word in months or word in monthsShort and not fromFlag: - try: - m = months.index(word) - except ValueError: - m = monthsShort.index(word) - used += 1 - datestr = months[m] - if wordPrev and (wordPrev[0].isdigit() or - (wordPrev == "of" and wordPrevPrev[0].isdigit())): - if wordPrev == "of" and wordPrevPrev[0].isdigit(): - datestr += " " + words[idx - 2] - used += 1 - start -= 1 - else: - datestr += " " + wordPrev - start -= 1 - used += 1 - if wordNext and wordNext[0].isdigit(): - datestr += " " + wordNext - used += 1 - hasYear = True - else: - hasYear = False - - elif wordNext and wordNext[0].isdigit(): - datestr += " " + wordNext - used += 1 - if wordNextNext and wordNextNext[0].isdigit(): - datestr += " " + wordNextNext - used += 1 - hasYear = True - else: - hasYear = False - # parse 5 days from tomorrow, 10 weeks from next thursday, - # 2 months from July - - if ( - word == "fra" or word == "til" or word == "om") and wordNext \ - in validFollowups: - used = 2 - fromFlag = True - if wordNext == "morgenen" and \ - wordPrev != "om" and \ - wordPrev not in days: - # morgen means tomorrow if not "am Morgen" and not - # [day of the week] morgen: - dayOffset += 1 - elif wordNext in days: - d = days.index(wordNext) - tmpOffset = (d + 1) - int(today) - used = 2 - if tmpOffset < 0: - tmpOffset += 7 - dayOffset += tmpOffset - elif wordNextNext and wordNextNext in days: - d = days.index(wordNextNext) - tmpOffset = (d + 1) - int(today) - used = 3 - if wordNext[:6] == "næste": - tmpOffset += 7 - used += 1 - start -= 1 - elif wordNext[:5] == "forige": - tmpOffset -= 7 - used += 1 - start -= 1 - dayOffset += tmpOffset - if used > 0: - if start - 1 > 0 and words[start - 1].startswith("denne"): - start -= 1 - used += 1 - - for i in range(0, used): - words[i + start] = "" - - if start - 1 >= 0 and words[start - 1] in markers: - words[start - 1] = "" - found = True - daySpecified = True - - # parse time - timeStr = "" - hrOffset = 0 - minOffset = 0 - secOffset = 0 - hrAbs = None - minAbs = None - - for idx, word in enumerate(words): - if word == "": - continue - - wordPrevPrev = words[idx - 2] if idx > 1 else "" - wordPrev = words[idx - 1] if idx > 0 else "" - wordNext = words[idx + 1] if idx + 1 < len(words) else "" - wordNextNext = words[idx + 2] if idx + 2 < len(words) else "" - wordNextNextNext = words[idx + 3] if idx + 3 < len(words) else "" - wordNextNextNextNext = words[idx + 4] if idx + 4 < len(words) else "" - - # parse noon, midnight, morning, afternoon, evening - used = 0 - if word[:6] == "middag": - hrAbs = 12 - used += 1 - elif word[:11] == "midnat": - hrAbs = 0 - used += 1 - elif word == "morgenen" or ( - wordPrev == "om" and word == "morgenen") or word == "tidlig": - if not hrAbs: - hrAbs = 8 - used += 1 - elif word[:11] == "eftermiddag": - if not hrAbs: - hrAbs = 15 - used += 1 - elif word[:5] == "aften": - if not hrAbs: - hrAbs = 19 - used += 1 - # parse half an hour, quarter hour - elif word == "time" and \ - (wordPrev in markers or wordPrevPrev in markers): - if wordPrev[:4] == "halv": - minOffset = 30 - elif wordPrev == "kvarter": - minOffset = 15 - elif wordPrev == "trekvarter": - minOffset = 45 - else: - hrOffset = 1 - if wordPrevPrev in markers: - words[idx - 2] = "" - words[idx - 1] = "" - used += 1 - hrAbs = -1 - minAbs = -1 - # parse 5:00 am, 12:00 p.m., etc - elif word[0].isdigit(): - isTime = True - strHH = "" - strMM = "" - remainder = "" - if ':' in word: - # parse colons - # "3:00 in the morning" - stage = 0 - length = len(word) - for i in range(length): - if stage == 0: - if word[i].isdigit(): - strHH += word[i] - elif word[i] == ":": - stage = 1 - else: - stage = 2 - i -= 1 - elif stage == 1: - if word[i].isdigit(): - strMM += word[i] - else: - stage = 2 - i -= 1 - elif stage == 2: - remainder = word[i:].replace(".", "") - break - if remainder == "": - nextWord = wordNext.replace(".", "") - if nextWord == "am" or nextWord == "pm": - remainder = nextWord - used += 1 - elif nextWord == "aften": - remainder = "pm" - used += 1 - elif wordNext == "om" and wordNextNext == "morgenen": - remainder = "am" - used += 2 - elif wordNext == "om" and wordNextNext == "eftermiddagen": - remainder = "pm" - used += 2 - elif wordNext == "om" and wordNextNext == "aftenen": - remainder = "pm" - used += 2 - elif wordNext == "morgen": - remainder = "am" - used += 1 - elif wordNext == "eftermiddag": - remainder = "pm" - used += 1 - elif wordNext == "aften": - remainder = "pm" - used += 1 - elif wordNext == "i" and wordNextNext == "morgen": - remainder = "am" - used = 2 - elif wordNext == "i" and wordNextNext == "eftermiddag": - remainder = "pm" - used = 2 - elif wordNext == "i" and wordNextNext == "aften": - remainder = "pm" - used = 2 - elif wordNext == "natten": - if strHH > 4: - remainder = "pm" - else: - remainder = "am" - used += 1 - else: - if timeQualifier != "": - if strHH <= 12 and \ - (timeQualifier == "aftenen" or - timeQualifier == "eftermiddagen"): - strHH += 12 # what happens when strHH is 24? - else: - # try to parse # s without colons - # 5 hours, 10 minutes etc. - length = len(word) - strNum = "" - remainder = "" - for i in range(length): - if word[i].isdigit(): - strNum += word[i] - else: - remainder += word[i] - - if remainder == "": - remainder = wordNext.replace(".", "").lstrip().rstrip() - - if ( - remainder == "pm" or - wordNext == "pm" or - remainder == "p.m." or - wordNext == "p.m."): - strHH = strNum - remainder = "pm" - used = 1 - elif ( - remainder == "am" or - wordNext == "am" or - remainder == "a.m." or - wordNext == "a.m."): - strHH = strNum - remainder = "am" - used = 1 - else: - if wordNext == "time" and int(word) < 100: - # "in 3 hours" - hrOffset = int(word) - used = 2 - isTime = False - hrAbs = -1 - minAbs = -1 - elif wordNext == "minut": - # "in 10 minutes" - minOffset = int(word) - used = 2 - isTime = False - hrAbs = -1 - minAbs = -1 - elif wordNext == "sekund": - # in 5 seconds - secOffset = int(word) - used = 2 - isTime = False - hrAbs = -1 - minAbs = -1 - - elif wordNext == "time": - strHH = word - used += 1 - isTime = True - if wordNextNext == timeQualifier: - strMM = "" - if wordNextNext[:11] == "eftermiddag": - used += 1 - remainder = "pm" - elif wordNextNext == "om" and wordNextNextNext == \ - "eftermiddagen": - used += 2 - remainder = "pm" - elif wordNextNext[:5] == "aften": - used += 1 - remainder = "pm" - elif wordNextNext == "om" and wordNextNextNext == \ - "aftenen": - used += 2 - remainder = "pm" - elif wordNextNext[:6] == "morgen": - used += 1 - remainder = "am" - elif wordNextNext == "om" and wordNextNextNext == \ - "morgenen": - used += 2 - remainder = "am" - elif wordNextNext == "natten": - used += 1 - if 8 <= int(word) <= 12: - remainder = "pm" - else: - remainder = "am" - - elif is_numeric(wordNextNext): - strMM = wordNextNext - used += 1 - if wordNextNextNext == timeQualifier: - if wordNextNextNext[:11] == "eftermiddag": - used += 1 - remainder = "pm" - elif wordNextNextNext == "om" and \ - wordNextNextNextNext == \ - "eftermiddagen": - used += 2 - remainder = "pm" - elif wordNextNextNext[:6] == "natten": - used += 1 - remainder = "pm" - elif wordNextNextNext == "am" and \ - wordNextNextNextNext == "natten": - used += 2 - remainder = "pm" - elif wordNextNextNext[:7] == "morgenen": - used += 1 - remainder = "am" - elif wordNextNextNext == "om" and \ - wordNextNextNextNext == "morgenen": - used += 2 - remainder = "am" - elif wordNextNextNext == "natten": - used += 1 - if 8 <= int(word) <= 12: - remainder = "pm" - else: - remainder = "am" - - elif wordNext == timeQualifier: - strHH = word - strMM = 00 - isTime = True - if wordNext[:10] == "eftermidag": - used += 1 - remainder = "pm" - elif wordNext == "om" and \ - wordNextNext == "eftermiddanen": - used += 2 - remainder = "pm" - elif wordNext[:7] == "aftenen": - used += 1 - remainder = "pm" - elif wordNext == "om" and wordNextNext == "aftenen": - used += 2 - remainder = "pm" - elif wordNext[:7] == "morgenen": - used += 1 - remainder = "am" - elif wordNext == "ao" and wordNextNext == "morgenen": - used += 2 - remainder = "am" - elif wordNext == "natten": - used += 1 - if 8 <= int(word) <= 12: - remainder = "pm" - else: - remainder = "am" - - # if timeQualifier != "": - # military = True - # else: - # isTime = False - - strHH = int(strHH) if strHH else 0 - strMM = int(strMM) if strMM else 0 - strHH = strHH + 12 if remainder == "pm" and strHH < 12 else strHH - strHH = strHH - 12 if remainder == "am" and strHH >= 12 else strHH - if strHH > 24 or strMM > 59: - isTime = False - used = 0 - if isTime: - hrAbs = strHH * 1 - minAbs = strMM * 1 - used += 1 - if used > 0: - # removed parsed words from the sentence - for i in range(used): - words[idx + i] = "" - - if wordPrev == "tidlig": - hrOffset = -1 - words[idx - 1] = "" - idx -= 1 - elif wordPrev == "sen": - hrOffset = 1 - words[idx - 1] = "" - idx -= 1 - if idx > 0 and wordPrev in markers: - words[idx - 1] = "" - if idx > 1 and wordPrevPrev in markers: - words[idx - 2] = "" - - idx += used - 1 - found = True - - # check that we found a date - if not date_found: - return None - - if dayOffset is False: - dayOffset = 0 - - # perform date manipulation - - extractedDate = dateNow - extractedDate = extractedDate.replace(microsecond=0, - second=0, - minute=0, - hour=0) - if datestr != "": - en_months = ['january', 'february', 'march', 'april', 'may', 'june', - 'july', 'august', 'september', 'october', 'november', - 'december'] - en_monthsShort = ['jan', 'feb', 'mar', 'apr', 'may', 'june', 'july', - 'aug', - 'sept', 'oct', 'nov', 'dec'] - for idx, en_month in enumerate(en_months): - datestr = datestr.replace(months[idx], en_month) - for idx, en_month in enumerate(en_monthsShort): - datestr = datestr.replace(monthsShort[idx], en_month) - - temp = datetime.strptime(datestr, "%B %d") - if not hasYear: - temp = temp.replace(year=extractedDate.year) - if extractedDate < temp: - extractedDate = extractedDate.replace(year=int(currentYear), - month=int( - temp.strftime( - "%m")), - day=int(temp.strftime( - "%d"))) - else: - extractedDate = extractedDate.replace( - year=int(currentYear) + 1, - month=int(temp.strftime("%m")), - day=int(temp.strftime("%d"))) - else: - extractedDate = extractedDate.replace( - year=int(temp.strftime("%Y")), - month=int(temp.strftime("%m")), - day=int(temp.strftime("%d"))) - - if timeStr != "": - temp = datetime(timeStr) - extractedDate = extractedDate.replace(hour=temp.strftime("%H"), - minute=temp.strftime("%M"), - second=temp.strftime("%S")) - - if yearOffset != 0: - extractedDate = extractedDate + relativedelta(years=yearOffset) - if monthOffset != 0: - extractedDate = extractedDate + relativedelta(months=monthOffset) - if dayOffset != 0: - extractedDate = extractedDate + relativedelta(days=dayOffset) - - if hrAbs is None and minAbs is None and default_time: - hrAbs = default_time.hour - minAbs = default_time.minute - - if hrAbs != -1 and minAbs != -1: - - extractedDate = extractedDate + relativedelta(hours=hrAbs or 0, - minutes=minAbs or 0) - if (hrAbs or minAbs) and datestr == "": - if not daySpecified and dateNow > extractedDate: - extractedDate = extractedDate + relativedelta(days=1) - if hrOffset != 0: - extractedDate = extractedDate + relativedelta(hours=hrOffset) - if minOffset != 0: - extractedDate = extractedDate + relativedelta(minutes=minOffset) - if secOffset != 0: - extractedDate = extractedDate + relativedelta(seconds=secOffset) - for idx, word in enumerate(words): - if words[idx] == "og" and words[idx - 1] == "" \ - and words[idx + 1] == "": - words[idx] = "" - - resultStr = " ".join(words) - resultStr = ' '.join(resultStr.split()) - - return [extractedDate, resultStr] - - -def isFractional_da(input_str): - """ - This function takes the given text and checks if it is a fraction. - - Args: - input_str (str): the string to check if fractional - Returns: - (bool) or (float): False if not a fraction, otherwise the fraction - - """ - if input_str.lower().startswith("halv"): - return 0.5 - - if input_str.lower() == "trediedel": - return 1.0 / 3 - elif input_str.endswith('del'): - input_str = input_str[:len(input_str) - 3] # e.g. "fünftel" - if input_str.lower() in da_numbers: - return 1.0 / (da_numbers[input_str.lower()]) - - return False - - -def isOrdinal_da(input_str): - """ - This function takes the given text and checks if it is an ordinal number. - - Args: - input_str (str): the string to check if ordinal - Returns: - (bool) or (float): False if not an ordinal, otherwise the number - corresponding to the ordinal - - ordinals for 1, 3, 7 and 8 are irregular - - only works for ordinals corresponding to the numbers in da_numbers - - """ - - lowerstr = input_str.lower() - - if lowerstr.startswith("første"): - return 1 - if lowerstr.startswith("anden"): - return 2 - if lowerstr.startswith("tredie"): - return 3 - if lowerstr.startswith("fjerde"): - return 4 - if lowerstr.startswith("femte"): - return 5 - if lowerstr.startswith("sjette"): - return 6 - if lowerstr.startswith("elfte"): - return 1 - if lowerstr.startswith("tolvfte"): - return 12 - - if lowerstr[-3:] == "nde": - # from 20 suffix is -ste* - lowerstr = lowerstr[:-3] - if lowerstr in da_numbers: - return da_numbers[lowerstr] - - if lowerstr[-4:] in ["ende"]: - lowerstr = lowerstr[:-4] - if lowerstr in da_numbers: - return da_numbers[lowerstr] - - if lowerstr[-2:] == "te": # below 20 suffix is -te* - lowerstr = lowerstr[:-2] - if lowerstr in da_numbers: - return da_numbers[lowerstr] - - return False - - -def normalize_da(text, remove_articles): - """ German string normalization """ - - words = text.split() # this also removed extra spaces - normalized = "" - for word in words: - if remove_articles and word in ["den", "det"]: - continue - - # Convert numbers into digits, e.g. "two" -> "2" - - if word in da_numbers: - word = str(da_numbers[word]) - - normalized += " " + word - - return normalized[1:] # strip the initial space - - -def extract_numbers_da(text, short_scale=True, ordinals=False): - """ - Takes in a string and extracts a list of numbers. - - Args: - text (str): the string to extract a number from - short_scale (bool): Use "short scale" or "long scale" for large - numbers -- over a million. The default is short scale, which - is now common in most English speaking countries. - See https://en.wikipedia.org/wiki/Names_of_large_numbers - ordinals (bool): consider ordinal numbers, e.g. third=3 instead of 1/3 - Returns: - list: list of extracted numbers as floats - """ - return extract_numbers_generic(text, pronounce_number_da, extractnumber_da, - short_scale=short_scale, ordinals=ordinals) +TODO: Remove in 20.02 +""" +from lingua_franca.lang.parse_da import * diff --git a/mycroft/util/lang/parse_de.py b/mycroft/util/lang/parse_de.py index 60878b4390..c789ef5255 100644 --- a/mycroft/util/lang/parse_de.py +++ b/mycroft/util/lang/parse_de.py @@ -14,938 +14,8 @@ # See the License for the specific language governing permissions and # limitations under the License. # -from datetime import datetime -from dateutil.relativedelta import relativedelta -from mycroft.util.lang.parse_common import is_numeric, look_for_fractions, \ - extract_numbers_generic -from mycroft.util.lang.format_de import pronounce_number_de +"""File kept for backwards compatibility. -de_numbers = { - 'null': 0, - 'ein': 1, - 'eins': 1, - 'eine': 1, - 'einer': 1, - 'einem': 1, - 'einen': 1, - 'eines': 1, - 'zwei': 2, - 'drei': 3, - 'vier': 4, - 'fünf': 5, - 'sechs': 6, - 'sieben': 7, - 'acht': 8, - 'neun': 9, - 'zehn': 10, - 'elf': 11, - 'zwölf': 12, - 'dreizehn': 13, - 'vierzehn': 14, - 'fünfzehn': 15, - 'sechzehn': 16, - 'siebzehn': 17, - 'achtzehn': 18, - 'neunzehn': 19, - 'zwanzig': 20, - 'einundzwanzig': 21, - 'zweiundzwanzig': 22, - 'dreiundzwanzig': 23, - 'vierundzwanzig': 24, - 'fünfundzwanzig': 25, - 'sechsundzwanzig': 26, - 'siebenundzwanzig': 27, - 'achtundzwanzig': 28, - 'neunundzwanzig': 29, - 'dreißig': 30, - 'einunddreißig': 31, - 'vierzig': 40, - 'fünfzig': 50, - 'sechzig': 60, - 'siebzig': 70, - 'achtzig': 80, - 'neunzig': 90, - 'hundert': 100, - 'zweihundert': 200, - 'dreihundert': 300, - 'vierhundert': 400, - 'fünfhundert': 500, - 'sechshundert': 600, - 'siebenhundert': 700, - 'achthundert': 800, - 'neunhundert': 900, - 'tausend': 1000, - 'million': 1000000 -} - - -def extractnumber_de(text): - """ - This function prepares the given text for parsing by making - numbers consistent, getting rid of contractions, etc. - Args: - text (str): the string to normalize - Returns: - (int) or (float): The value of extracted number - - - undefined articles cannot be suppressed in German: - 'ein Pferd' means 'one horse' and 'a horse' - - """ - aWords = text.split() - aWords = [word for word in aWords if - word not in ["der", "die", "das", "des", "den", "dem"]] - and_pass = False - valPreAnd = False - val = False - count = 0 - while count < len(aWords): - word = aWords[count] - if is_numeric(word): - # if word.isdigit(): # doesn't work with decimals - val = float(word) - elif isFractional_de(word): - val = isFractional_de(word) - elif isOrdinal_de(word): - val = isOrdinal_de(word) - else: - if word in de_numbers: - val = de_numbers[word] - if count < (len(aWords) - 1): - wordNext = aWords[count + 1] - else: - wordNext = "" - valNext = isFractional_de(wordNext) - - if valNext: - val = val * valNext - aWords[count + 1] = "" - - if not val: - # look for fractions like "2/3" - aPieces = word.split('/') - # if (len(aPieces) == 2 and is_numeric(aPieces[0]) - # and is_numeric(aPieces[1])): - if look_for_fractions(aPieces): - val = float(aPieces[0]) / float(aPieces[1]) - elif and_pass: - # added to value, quit here - val = valPreAnd - break - else: - count += 1 - continue - - aWords[count] = "" - - if and_pass: - aWords[count - 1] = '' # remove "and" - val += valPreAnd - elif count + 1 < len(aWords) and aWords[count + 1] == 'und': - and_pass = True - valPreAnd = val - val = False - count += 2 - continue - elif count + 2 < len(aWords) and aWords[count + 2] == 'und': - and_pass = True - valPreAnd = val - val = False - count += 3 - continue - - break - - if not val: - return False - - return val - - -def extract_datetime_de(string, currentDate, default_time): - def clean_string(s): - """ - cleans the input string of unneeded punctuation - and capitalization among other things. - - 'am' is a preposition, so cannot currently be used - for 12 hour date format - """ - - s = s.lower().replace('?', '').replace('.', '').replace(',', '') \ - .replace(' der ', ' ').replace(' den ', ' ').replace(' an ', - ' ').replace( - ' am ', ' ') \ - .replace(' auf ', ' ').replace(' um ', ' ') - wordList = s.split() - - for idx, word in enumerate(wordList): - if isOrdinal_de(word) is not False: - word = str(isOrdinal_de(word)) - wordList[idx] = word - - return wordList - - def date_found(): - return found or \ - ( - datestr != "" or timeStr != "" or - yearOffset != 0 or monthOffset != 0 or - dayOffset is True or hrOffset != 0 or - hrAbs or minOffset != 0 or - minAbs or secOffset != 0 - ) - - if string == "" or not currentDate: - return None - - found = False - daySpecified = False - dayOffset = False - monthOffset = 0 - yearOffset = 0 - dateNow = currentDate - today = dateNow.strftime("%w") - currentYear = dateNow.strftime("%Y") - fromFlag = False - datestr = "" - hasYear = False - timeQualifier = "" - - timeQualifiersList = ['früh', 'morgens', 'vormittag', 'vormittags', - 'nachmittag', 'nachmittags', 'abend', 'abends', - 'nachts'] - markers = ['in', 'am', 'gegen', 'bis', 'für'] - days = ['montag', 'dienstag', 'mittwoch', - 'donnerstag', 'freitag', 'samstag', 'sonntag'] - months = ['januar', 'februar', 'märz', 'april', 'mai', 'juni', - 'juli', 'august', 'september', 'october', 'november', - 'dezember'] - monthsShort = ['jan', 'feb', 'mär', 'apr', 'mai', 'juni', 'juli', 'aug', - 'sept', 'oct', 'nov', 'dez'] - - validFollowups = days + months + monthsShort - validFollowups.append("heute") - validFollowups.append("morgen") - validFollowups.append("nächste") - validFollowups.append("nächster") - validFollowups.append("nächstes") - validFollowups.append("nächsten") - validFollowups.append("nächstem") - validFollowups.append("letzte") - validFollowups.append("letzter") - validFollowups.append("letztes") - validFollowups.append("letzten") - validFollowups.append("letztem") - validFollowups.append("jetzt") - - words = clean_string(string) - - for idx, word in enumerate(words): - if word == "": - continue - wordPrevPrev = words[idx - 2] if idx > 1 else "" - wordPrev = words[idx - 1] if idx > 0 else "" - wordNext = words[idx + 1] if idx + 1 < len(words) else "" - wordNextNext = words[idx + 2] if idx + 2 < len(words) else "" - - # this isn't in clean string because I don't want to save back to words - - if word != 'morgen' and word != 'übermorgen': - if word[-2:] == "en": - word = word[:-2] # remove en - if word != 'heute': - if word[-1:] == "e": - word = word[:-1] # remove plural for most nouns - - start = idx - used = 0 - # save timequalifier for later - if word in timeQualifiersList: - timeQualifier = word - # parse today, tomorrow, day after tomorrow - elif word == "heute" and not fromFlag: - dayOffset = 0 - used += 1 - elif word == "morgen" and not fromFlag and wordPrev != "am" and \ - wordPrev not in days: # morgen means tomorrow if not "am - # Morgen" and not [day of the week] morgen - dayOffset = 1 - used += 1 - elif word == "übermorgen" and not fromFlag: - dayOffset = 2 - used += 1 - # parse 5 days, 10 weeks, last week, next week - elif word == "tag" or word == "tage": - if wordPrev[0].isdigit(): - dayOffset += int(wordPrev) - start -= 1 - used = 2 - elif word == "woch" and not fromFlag: - if wordPrev[0].isdigit(): - dayOffset += int(wordPrev) * 7 - start -= 1 - used = 2 - elif wordPrev[:6] == "nächst": - dayOffset = 7 - start -= 1 - used = 2 - elif wordPrev[:5] == "letzt": - dayOffset = -7 - start -= 1 - used = 2 - # parse 10 months, next month, last month - elif word == "monat" and not fromFlag: - if wordPrev[0].isdigit(): - monthOffset = int(wordPrev) - start -= 1 - used = 2 - elif wordPrev[:6] == "nächst": - monthOffset = 1 - start -= 1 - used = 2 - elif wordPrev[:5] == "letzt": - monthOffset = -1 - start -= 1 - used = 2 - # parse 5 years, next year, last year - elif word == "jahr" and not fromFlag: - if wordPrev[0].isdigit(): - yearOffset = int(wordPrev) - start -= 1 - used = 2 - elif wordPrev[:6] == "nächst": - yearOffset = 1 - start -= 1 - used = 2 - elif wordPrev[:6] == "nächst": - yearOffset = -1 - start -= 1 - used = 2 - # parse Monday, Tuesday, etc., and next Monday, - # last Tuesday, etc. - elif word in days and not fromFlag: - d = days.index(word) - dayOffset = (d + 1) - int(today) - used = 1 - if dayOffset < 0: - dayOffset += 7 - if wordNext == "morgen": # morgen means morning if preceded by - # the day of the week - words[idx + 1] = "früh" - if wordPrev[:6] == "nächst": - dayOffset += 7 - used += 1 - start -= 1 - elif wordPrev[:5] == "letzt": - dayOffset -= 7 - used += 1 - start -= 1 - # parse 15 of July, June 20th, Feb 18, 19 of February - elif word in months or word in monthsShort and not fromFlag: - try: - m = months.index(word) - except ValueError: - m = monthsShort.index(word) - used += 1 - datestr = months[m] - if wordPrev and (wordPrev[0].isdigit() or - (wordPrev == "of" and wordPrevPrev[0].isdigit())): - if wordPrev == "of" and wordPrevPrev[0].isdigit(): - datestr += " " + words[idx - 2] - used += 1 - start -= 1 - else: - datestr += " " + wordPrev - start -= 1 - used += 1 - if wordNext and wordNext[0].isdigit(): - datestr += " " + wordNext - used += 1 - hasYear = True - else: - hasYear = False - - elif wordNext and wordNext[0].isdigit(): - datestr += " " + wordNext - used += 1 - if wordNextNext and wordNextNext[0].isdigit(): - datestr += " " + wordNextNext - used += 1 - hasYear = True - else: - hasYear = False - # parse 5 days from tomorrow, 10 weeks from next thursday, - # 2 months from July - - if ( - word == "von" or word == "nach" or word == "ab") and wordNext \ - in validFollowups: - used = 2 - fromFlag = True - if wordNext == "morgen" and wordPrev != "am" and \ - wordPrev not in days: # morgen means tomorrow if not "am - # Morgen" and not [day of the week] morgen: - dayOffset += 1 - elif wordNext in days: - d = days.index(wordNext) - tmpOffset = (d + 1) - int(today) - used = 2 - if tmpOffset < 0: - tmpOffset += 7 - dayOffset += tmpOffset - elif wordNextNext and wordNextNext in days: - d = days.index(wordNextNext) - tmpOffset = (d + 1) - int(today) - used = 3 - if wordNext[:6] == "nächst": - tmpOffset += 7 - used += 1 - start -= 1 - elif wordNext[:5] == "letzt": - tmpOffset -= 7 - used += 1 - start -= 1 - dayOffset += tmpOffset - if used > 0: - if start - 1 > 0 and words[start - 1].startswith("diese"): - start -= 1 - used += 1 - - for i in range(0, used): - words[i + start] = "" - - if start - 1 >= 0 and words[start - 1] in markers: - words[start - 1] = "" - found = True - daySpecified = True - - # parse time - timeStr = "" - hrOffset = 0 - minOffset = 0 - secOffset = 0 - hrAbs = None - minAbs = None - - for idx, word in enumerate(words): - if word == "": - continue - - wordPrevPrev = words[idx - 2] if idx > 1 else "" - wordPrev = words[idx - 1] if idx > 0 else "" - wordNext = words[idx + 1] if idx + 1 < len(words) else "" - wordNextNext = words[idx + 2] if idx + 2 < len(words) else "" - wordNextNextNext = words[idx + 3] if idx + 3 < len(words) else "" - wordNextNextNextNext = words[idx + 4] if idx + 4 < len(words) else "" - - # parse noon, midnight, morning, afternoon, evening - used = 0 - if word[:6] == "mittag": - hrAbs = 12 - used += 1 - elif word[:11] == "mitternacht": - hrAbs = 0 - used += 1 - elif word == "morgens" or ( - wordPrev == "am" and word == "morgen") or word == "früh": - if not hrAbs: - hrAbs = 8 - used += 1 - elif word[:10] == "nachmittag": - if not hrAbs: - hrAbs = 15 - used += 1 - elif word[:5] == "abend": - if not hrAbs: - hrAbs = 19 - used += 1 - # parse half an hour, quarter hour - elif word == "stunde" and \ - (wordPrev in markers or wordPrevPrev in markers): - if wordPrev[:4] == "halb": - minOffset = 30 - elif wordPrev == "viertel": - minOffset = 15 - elif wordPrev == "dreiviertel": - minOffset = 45 - else: - hrOffset = 1 - if wordPrevPrev in markers: - words[idx - 2] = "" - words[idx - 1] = "" - used += 1 - hrAbs = -1 - minAbs = -1 - # parse 5:00 am, 12:00 p.m., etc - elif word[0].isdigit(): - isTime = True - strHH = "" - strMM = "" - remainder = "" - if ':' in word: - # parse colons - # "3:00 in the morning" - stage = 0 - length = len(word) - for i in range(length): - if stage == 0: - if word[i].isdigit(): - strHH += word[i] - elif word[i] == ":": - stage = 1 - else: - stage = 2 - i -= 1 - elif stage == 1: - if word[i].isdigit(): - strMM += word[i] - else: - stage = 2 - i -= 1 - elif stage == 2: - remainder = word[i:].replace(".", "") - break - if remainder == "": - nextWord = wordNext.replace(".", "") - if nextWord == "am" or nextWord == "pm": - remainder = nextWord - used += 1 - elif nextWord == "abends": - remainder = "pm" - used += 1 - elif wordNext == "am" and wordNextNext == "morgen": - remainder = "am" - used += 2 - elif wordNext == "am" and wordNextNext == "nachmittag": - remainder = "pm" - used += 2 - elif wordNext == "am" and wordNextNext == "abend": - remainder = "pm" - used += 2 - elif wordNext == "morgens": - remainder = "am" - used += 1 - elif wordNext == "nachmittags": - remainder = "pm" - used += 1 - elif wordNext == "abends": - remainder = "pm" - used += 1 - elif wordNext == "heute" and wordNextNext == "morgen": - remainder = "am" - used = 2 - elif wordNext == "heute" and wordNextNext == "nachmittag": - remainder = "pm" - used = 2 - elif wordNext == "heute" and wordNextNext == "abend": - remainder = "pm" - used = 2 - elif wordNext == "nachts": - if strHH > 4: - remainder = "pm" - else: - remainder = "am" - used += 1 - else: - if timeQualifier != "": - if strHH <= 12 and \ - (timeQualifier == "abends" or - timeQualifier == "nachmittags"): - strHH += 12 # what happens when strHH is 24? - else: - # try to parse # s without colons - # 5 hours, 10 minutes etc. - length = len(word) - strNum = "" - remainder = "" - for i in range(length): - if word[i].isdigit(): - strNum += word[i] - else: - remainder += word[i] - - if remainder == "": - remainder = wordNext.replace(".", "").lstrip().rstrip() - - if ( - remainder == "pm" or - wordNext == "pm" or - remainder == "p.m." or - wordNext == "p.m."): - strHH = strNum - remainder = "pm" - used = 1 - elif ( - remainder == "am" or - wordNext == "am" or - remainder == "a.m." or - wordNext == "a.m."): - strHH = strNum - remainder = "am" - used = 1 - else: - if wordNext == "stund" and int(word) < 100: - # "in 3 hours" - hrOffset = int(word) - used = 2 - isTime = False - hrAbs = -1 - minAbs = -1 - elif wordNext == "minut": - # "in 10 minutes" - minOffset = int(word) - used = 2 - isTime = False - hrAbs = -1 - minAbs = -1 - elif wordNext == "sekund": - # in 5 seconds - secOffset = int(word) - used = 2 - isTime = False - hrAbs = -1 - minAbs = -1 - - elif wordNext == "uhr": - strHH = word - used += 1 - isTime = True - if wordNextNext == timeQualifier: - strMM = "" - if wordNextNext[:10] == "nachmittag": - used += 1 - remainder = "pm" - elif wordNextNext == "am" and wordNextNextNext == \ - "nachmittag": - used += 2 - remainder = "pm" - elif wordNextNext[:5] == "abend": - used += 1 - remainder = "pm" - elif wordNextNext == "am" and wordNextNextNext == \ - "abend": - used += 2 - remainder = "pm" - elif wordNextNext[:7] == "morgens": - used += 1 - remainder = "am" - elif wordNextNext == "am" and wordNextNextNext == \ - "morgen": - used += 2 - remainder = "am" - elif wordNextNext == "nachts": - used += 1 - if 8 <= int(word) <= 12: - remainder = "pm" - else: - remainder = "am" - - elif is_numeric(wordNextNext): - strMM = wordNextNext - used += 1 - if wordNextNextNext == timeQualifier: - if wordNextNextNext[:10] == "nachmittag": - used += 1 - remainder = "pm" - elif wordNextNextNext == "am" and \ - wordNextNextNextNext == "nachmittag": - used += 2 - remainder = "pm" - elif wordNextNextNext[:5] == "abend": - used += 1 - remainder = "pm" - elif wordNextNextNext == "am" and \ - wordNextNextNextNext == "abend": - used += 2 - remainder = "pm" - elif wordNextNextNext[:7] == "morgens": - used += 1 - remainder = "am" - elif wordNextNextNext == "am" and \ - wordNextNextNextNext == "morgen": - used += 2 - remainder = "am" - elif wordNextNextNext == "nachts": - used += 1 - if 8 <= int(word) <= 12: - remainder = "pm" - else: - remainder = "am" - - elif wordNext == timeQualifier: - strHH = word - strMM = 00 - isTime = True - if wordNext[:10] == "nachmittag": - used += 1 - remainder = "pm" - elif wordNext == "am" and wordNextNext == "nachmittag": - used += 2 - remainder = "pm" - elif wordNext[:5] == "abend": - used += 1 - remainder = "pm" - elif wordNext == "am" and wordNextNext == "abend": - used += 2 - remainder = "pm" - elif wordNext[:7] == "morgens": - used += 1 - remainder = "am" - elif wordNext == "am" and wordNextNext == "morgen": - used += 2 - remainder = "am" - elif wordNext == "nachts": - used += 1 - if 8 <= int(word) <= 12: - remainder = "pm" - else: - remainder = "am" - - # if timeQualifier != "": - # military = True - # else: - # isTime = False - - strHH = int(strHH) if strHH else 0 - strMM = int(strMM) if strMM else 0 - strHH = strHH + 12 if remainder == "pm" and strHH < 12 else strHH - strHH = strHH - 12 if remainder == "am" and strHH >= 12 else strHH - if strHH > 24 or strMM > 59: - isTime = False - used = 0 - if isTime: - hrAbs = strHH * 1 - minAbs = strMM * 1 - used += 1 - if used > 0: - # removed parsed words from the sentence - for i in range(used): - words[idx + i] = "" - - if wordPrev == "Uhr": - words[words.index(wordPrev)] = "" - - if wordPrev == "früh": - hrOffset = -1 - words[idx - 1] = "" - idx -= 1 - elif wordPrev == "spät": - hrOffset = 1 - words[idx - 1] = "" - idx -= 1 - if idx > 0 and wordPrev in markers: - words[idx - 1] = "" - if idx > 1 and wordPrevPrev in markers: - words[idx - 2] = "" - - idx += used - 1 - found = True - - # check that we found a date - if not date_found: - return None - - if dayOffset is False: - dayOffset = 0 - - # perform date manipulation - - extractedDate = dateNow - extractedDate = extractedDate.replace(microsecond=0, - second=0, - minute=0, - hour=0) - if datestr != "": - en_months = ['january', 'february', 'march', 'april', 'may', 'june', - 'july', 'august', 'september', 'october', 'november', - 'december'] - en_monthsShort = ['jan', 'feb', 'mar', 'apr', 'may', 'june', 'july', - 'aug', - 'sept', 'oct', 'nov', 'dec'] - for idx, en_month in enumerate(en_months): - datestr = datestr.replace(months[idx], en_month) - for idx, en_month in enumerate(en_monthsShort): - datestr = datestr.replace(monthsShort[idx], en_month) - - temp = datetime.strptime(datestr, "%B %d") - if not hasYear: - temp = temp.replace(year=extractedDate.year) - if extractedDate < temp: - extractedDate = extractedDate.replace(year=int(currentYear), - month=int( - temp.strftime( - "%m")), - day=int(temp.strftime( - "%d"))) - else: - extractedDate = extractedDate.replace( - year=int(currentYear) + 1, - month=int(temp.strftime("%m")), - day=int(temp.strftime("%d"))) - else: - extractedDate = extractedDate.replace( - year=int(temp.strftime("%Y")), - month=int(temp.strftime("%m")), - day=int(temp.strftime("%d"))) - - if timeStr != "": - temp = datetime(timeStr) - extractedDate = extractedDate.replace(hour=temp.strftime("%H"), - minute=temp.strftime("%M"), - second=temp.strftime("%S")) - - if yearOffset != 0: - extractedDate = extractedDate + relativedelta(years=yearOffset) - if monthOffset != 0: - extractedDate = extractedDate + relativedelta(months=monthOffset) - if dayOffset != 0: - extractedDate = extractedDate + relativedelta(days=dayOffset) - - if hrAbs is None and minAbs is None and default_time: - hrAbs = default_time.hour - minAbs = default_time.minute - - if hrAbs != -1 and minAbs != -1: - - extractedDate = extractedDate + relativedelta(hours=hrAbs or 0, - minutes=minAbs or 0) - if (hrAbs or minAbs) and datestr == "": - if not daySpecified and dateNow > extractedDate: - extractedDate = extractedDate + relativedelta(days=1) - if hrOffset != 0: - extractedDate = extractedDate + relativedelta(hours=hrOffset) - if minOffset != 0: - extractedDate = extractedDate + relativedelta(minutes=minOffset) - if secOffset != 0: - extractedDate = extractedDate + relativedelta(seconds=secOffset) - for idx, word in enumerate(words): - if words[idx] == "und" and words[idx - 1] == "" \ - and words[idx + 1] == "": - words[idx] = "" - - resultStr = " ".join(words) - resultStr = ' '.join(resultStr.split()) - - return [extractedDate, resultStr] - - -def isFractional_de(input_str): - """ - This function takes the given text and checks if it is a fraction. - - Args: - input_str (str): the string to check if fractional - Returns: - (bool) or (float): False if not a fraction, otherwise the fraction - - """ - if input_str.lower().startswith("halb"): - return 0.5 - - if input_str.lower() == "drittel": - return 1.0 / 3 - elif input_str.endswith('tel'): - if input_str.endswith('stel'): - input_str = input_str[:len(input_str) - 4] # e.g. "hundertstel" - else: - input_str = input_str[:len(input_str) - 3] # e.g. "fünftel" - if input_str.lower() in de_numbers: - return 1.0 / (de_numbers[input_str.lower()]) - - return False - - -def isOrdinal_de(input_str): - """ - This function takes the given text and checks if it is an ordinal number. - - Args: - input_str (str): the string to check if ordinal - Returns: - (bool) or (float): False if not an ordinal, otherwise the number - corresponding to the ordinal - - ordinals for 1, 3, 7 and 8 are irregular - - only works for ordinals corresponding to the numbers in de_numbers - - """ - - lowerstr = input_str.lower() - - if lowerstr.startswith("erste"): - return 1 - if lowerstr.startswith("dritte"): - return 3 - if lowerstr.startswith("siebte"): - return 7 - if lowerstr.startswith("achte"): - return 8 - - if lowerstr[-3:] == "ste": # from 20 suffix is -ste* - lowerstr = lowerstr[:-3] - if lowerstr in de_numbers: - return de_numbers[lowerstr] - - if lowerstr[-4:] in ["ster", "stes", "sten", "stem"]: - lowerstr = lowerstr[:-4] - if lowerstr in de_numbers: - return de_numbers[lowerstr] - - if lowerstr[-2:] == "te": # below 20 suffix is -te* - lowerstr = lowerstr[:-2] - if lowerstr in de_numbers: - return de_numbers[lowerstr] - - if lowerstr[-3:] in ["ter", "tes", "ten", "tem"]: - lowerstr = lowerstr[:-3] - if lowerstr in de_numbers: - return de_numbers[lowerstr] - - return False - - -def normalize_de(text, remove_articles): - """ German string normalization """ - - words = text.split() # this also removed extra spaces - normalized = "" - for word in words: - if remove_articles and word in ["der", "die", "das", "des", "den", - "dem"]: - continue - - # Expand common contractions, e.g. "isn't" -> "is not" - contraction = ["net", "nett"] - if word in contraction: - expansion = ["nicht", "nicht"] - word = expansion[contraction.index(word)] - - # Convert numbers into digits, e.g. "two" -> "2" - - if word in de_numbers: - word = str(de_numbers[word]) - - normalized += " " + word - - return normalized[1:] # strip the initial space - - -def extract_numbers_de(text, short_scale=True, ordinals=False): - """ - Takes in a string and extracts a list of numbers. - - Args: - text (str): the string to extract a number from - short_scale (bool): Use "short scale" or "long scale" for large - numbers -- over a million. The default is short scale, which - is now common in most English speaking countries. - See https://en.wikipedia.org/wiki/Names_of_large_numbers - ordinals (bool): consider ordinal numbers, e.g. third=3 instead of 1/3 - Returns: - list: list of extracted numbers as floats - """ - return extract_numbers_generic(text, pronounce_number_de, extractnumber_de, - short_scale=short_scale, ordinals=ordinals) +TODO: Remove in 20.02 +""" +from lingua_franca.lang.parse_de import * diff --git a/mycroft/util/lang/parse_en.py b/mycroft/util/lang/parse_en.py index 173e8208a8..acf2bb1ee3 100644 --- a/mycroft/util/lang/parse_en.py +++ b/mycroft/util/lang/parse_en.py @@ -13,1550 +13,14 @@ # See the License for the specific language governing permissions and # limitations under the License. # -from collections import namedtuple -from datetime import datetime, timedelta +"""Backwards compatibility with mycroft-core tests to verify functionality.""" +from lingua_franca.lang.parse_en import * +from lingua_franca.lang.parse_en import _extract_whole_number_with_text_en +from lingua_franca.lang.parse_en import _extract_decimal_with_text_en +from lingua_franca.lang.parse_common import ReplaceableNumber +from lingua_franca.lang.parse_common import tokenize as _tokenize +from lingua_franca.lang.parse_common import Token as _Token -from dateutil.relativedelta import relativedelta -from mycroft.util.lang.parse_common import is_numeric, look_for_fractions -from mycroft.util.lang.common_data_en import _ARTICLES, _NUM_STRING_EN, \ - _LONG_ORDINAL_STRING_EN, _LONG_SCALE_EN, \ - _SHORT_SCALE_EN, _SHORT_ORDINAL_STRING_EN - -import re - - -def _invert_dict(original): - """ - Produce a dictionary with the keys and values - inverted, relative to the dict passed in. - - Args: - original dict: The dict like object to invert - - Returns: - dict - - """ - return {value: key for key, value in original.items()} - - -def _generate_plurals(originals): - """ - Return a new set or dict containing the original values, - all with 's' appended to them. - - Args: - originals set(str) or dict(str, any): values to pluralize - - Returns: - set(str) or dict(str, any) - - """ - if isinstance(originals, dict): - return {key + 's': value for key, value in originals.items()} - return {value + "s" for value in originals} - - -# negate next number (-2 = 0 - 2) -_NEGATIVES = {"negative", "minus"} - -# sum the next number (twenty two = 20 + 2) -_SUMS = {'twenty', '20', 'thirty', '30', 'forty', '40', 'fifty', '50', - 'sixty', '60', 'seventy', '70', 'eighty', '80', 'ninety', '90'} - -_MULTIPLIES_LONG_SCALE_EN = set(_LONG_SCALE_EN.values()) | \ - _generate_plurals(_LONG_SCALE_EN.values()) - -_MULTIPLIES_SHORT_SCALE_EN = set(_SHORT_SCALE_EN.values()) | \ - _generate_plurals(_SHORT_SCALE_EN.values()) - - -# split sentence parse separately and sum ( 2 and a half = 2 + 0.5 ) -_FRACTION_MARKER = {"and"} - -# decimal marker ( 1 point 5 = 1 + 0.5) -_DECIMAL_MARKER = {"point", "dot"} - -_STRING_NUM_EN = _invert_dict(_NUM_STRING_EN) -_STRING_NUM_EN.update(_generate_plurals(_STRING_NUM_EN)) -_STRING_NUM_EN.update({ - "half": 0.5, - "halves": 0.5, - "couple": 2 -}) - -_STRING_SHORT_ORDINAL_EN = _invert_dict(_SHORT_ORDINAL_STRING_EN) -_STRING_LONG_ORDINAL_EN = _invert_dict(_LONG_ORDINAL_STRING_EN) - - -# _Token is intended to be used in the number processing functions in -# this module. The parsing requires slicing and dividing of the original -# text. To ensure things parse correctly, we need to know where text came -# from in the original input, hence this nametuple. -_Token = namedtuple('_Token', 'word index') - - -class _ReplaceableNumber(): - """ - Similar to _Token, this class is used in number parsing. - - Once we've found a number in a string, this class contains all - the info about the value, and where it came from in the original text. - In other words, it is the text, and the number that can replace it in - the string. - """ - - def __init__(self, value, tokens: [_Token]): - self.value = value - self.tokens = tokens - - def __bool__(self): - return bool(self.value is not None and self.value is not False) - - @property - def start_index(self): - return self.tokens[0].index - - @property - def end_index(self): - return self.tokens[-1].index - - @property - def text(self): - return ' '.join([t.word for t in self.tokens]) - - def __setattr__(self, key, value): - try: - getattr(self, key) - except AttributeError: - super().__setattr__(key, value) - else: - raise Exception("Immutable!") - - def __str__(self): - return "({v}, {t})".format(v=self.value, t=self.tokens) - - def __repr__(self): - return "{n}({v}, {t})".format(n=self.__class__.__name__, v=self.value, - t=self.tokens) - - -def _tokenize(text): - """ - Generate a list of token object, given a string. - Args: - text str: Text to tokenize. - - Returns: - [_Token] - - """ - return [_Token(word, index) for index, word in enumerate(text.split())] - - -def _partition_list(items, split_on): - """ - Partition a list of items. - - Works similarly to str.partition - - Args: - items: - split_on callable: - Should return a boolean. Each item will be passed to - this callable in succession, and partitions will be - created any time it returns True. - - Returns: - [[any]] - - """ - splits = [] - current_split = [] - for item in items: - if split_on(item): - splits.append(current_split) - splits.append([item]) - current_split = [] - else: - current_split.append(item) - splits.append(current_split) - return list(filter(lambda x: len(x) != 0, splits)) - - -def _convert_words_to_numbers(text, short_scale=True, ordinals=False): - """ - Convert words in a string into their equivalent numbers. - Args: - text str: - short_scale boolean: True if short scale numbers should be used. - ordinals boolean: True if ordinals (e.g. first, second, third) should - be parsed to their number values (1, 2, 3...) - - Returns: - str - The original text, with numbers subbed in where appropriate. - - """ - text = text.lower() - tokens = _tokenize(text) - numbers_to_replace = \ - _extract_numbers_with_text(tokens, short_scale, ordinals) - numbers_to_replace.sort(key=lambda number: number.start_index) - - results = [] - for token in tokens: - if not numbers_to_replace or \ - token.index < numbers_to_replace[0].start_index: - results.append(token.word) - else: - if numbers_to_replace and \ - token.index == numbers_to_replace[0].start_index: - results.append(str(numbers_to_replace[0].value)) - if numbers_to_replace and \ - token.index == numbers_to_replace[0].end_index: - numbers_to_replace.pop(0) - - return ' '.join(results) - - -def _extract_numbers_with_text(tokens, short_scale=True, - ordinals=False, fractional_numbers=True): - """ - Extract all numbers from a list of _Tokens, with the words that - represent them. - - Args: - [_Token]: The tokens to parse. - short_scale bool: True if short scale numbers should be used, False for - long scale. True by default. - ordinals bool: True if ordinal words (first, second, third, etc) should - be parsed. - fractional_numbers bool: True if we should look for fractions and - decimals. - - Returns: - [_ReplaceableNumber]: A list of tuples, each containing a number and a - string. - - """ - placeholder = "" # inserted to maintain correct indices - results = [] - while True: - to_replace = \ - _extract_number_with_text_en(tokens, short_scale, - ordinals, fractional_numbers) - - if not to_replace: - break - - results.append(to_replace) - - tokens = [ - t if not - to_replace.start_index <= t.index <= to_replace.end_index - else - _Token(placeholder, t.index) for t in tokens - ] - results.sort(key=lambda n: n.start_index) - return results - - -def _extract_number_with_text_en(tokens, short_scale=True, - ordinals=False, fractional_numbers=True): - """ - This function extracts a number from a list of _Tokens. - - Args: - tokens str: the string to normalize - short_scale (bool): use short scale if True, long scale if False - ordinals (bool): consider ordinal numbers, third=3 instead of 1/3 - fractional_numbers (bool): True if we should look for fractions and - decimals. - Returns: - _ReplaceableNumber - - """ - number, tokens = \ - _extract_number_with_text_en_helper(tokens, short_scale, - ordinals, fractional_numbers) - while tokens and tokens[0].word in _ARTICLES: - tokens.pop(0) - return _ReplaceableNumber(number, tokens) - - -def _extract_number_with_text_en_helper(tokens, - short_scale=True, ordinals=False, - fractional_numbers=True): - """ - Helper for _extract_number_with_text_en. - - This contains the real logic for parsing, but produces - a result that needs a little cleaning (specific, it may - contain leading articles that can be trimmed off). - - Args: - tokens [_Token]: - short_scale boolean: - ordinals boolean: - fractional_numbers boolean: - - Returns: - int or float, [_Tokens] - - """ - if fractional_numbers: - fraction, fraction_text = \ - _extract_fraction_with_text_en(tokens, short_scale, ordinals) - if fraction: - return fraction, fraction_text - - decimal, decimal_text = \ - _extract_decimal_with_text_en(tokens, short_scale, ordinals) - if decimal: - return decimal, decimal_text - - return _extract_whole_number_with_text_en(tokens, short_scale, ordinals) - - -def _extract_fraction_with_text_en(tokens, short_scale, ordinals): - """ - Extract fraction numbers from a string. - - This function handles text such as '2 and 3/4'. Note that "one half" or - similar will be parsed by the whole number function. - - Args: - tokens [_Token]: words and their indexes in the original string. - short_scale boolean: - ordinals boolean: - - Returns: - (int or float, [_Token]) - The value found, and the list of relevant tokens. - (None, None) if no fraction value is found. - - """ - for c in _FRACTION_MARKER: - partitions = _partition_list(tokens, lambda t: t.word == c) - - if len(partitions) == 3: - numbers1 = \ - _extract_numbers_with_text(partitions[0], short_scale, - ordinals, fractional_numbers=False) - numbers2 = \ - _extract_numbers_with_text(partitions[2], short_scale, - ordinals, fractional_numbers=True) - - if not numbers1 or not numbers2: - return None, None - - # ensure first is not a fraction and second is a fraction - num1 = numbers1[-1] - num2 = numbers2[0] - if num1.value >= 1 and 0 < num2.value < 1: - return num1.value + num2.value, \ - num1.tokens + partitions[1] + num2.tokens - - return None, None - - -def _extract_decimal_with_text_en(tokens, short_scale, ordinals): - """ - Extract decimal numbers from a string. - - This function handles text such as '2 point 5'. - - Notes: - While this is a helper for extractnumber_en, it also depends on - extractnumber_en, to parse out the components of the decimal. - - This does not currently handle things like: - number dot number number number - - Args: - tokens [_Token]: The text to parse. - short_scale boolean: - ordinals boolean: - - Returns: - (float, [_Token]) - The value found and relevant tokens. - (None, None) if no decimal value is found. - - """ - for c in _DECIMAL_MARKER: - partitions = _partition_list(tokens, lambda t: t.word == c) - - if len(partitions) == 3: - numbers1 = \ - _extract_numbers_with_text(partitions[0], short_scale, - ordinals, fractional_numbers=False) - numbers2 = \ - _extract_numbers_with_text(partitions[2], short_scale, - ordinals, fractional_numbers=False) - - if not numbers1 or not numbers2: - return None, None - - number = numbers1[-1] - decimal = numbers2[0] - - # TODO handle number dot number number number - if "." not in str(decimal.text): - return number.value + float('0.' + str(decimal.value)), \ - number.tokens + partitions[1] + decimal.tokens - return None, None - - -def _extract_whole_number_with_text_en(tokens, short_scale, ordinals): - """ - Handle numbers not handled by the decimal or fraction functions. This is - generally whole numbers. Note that phrases such as "one half" will be - handled by this function, while "one and a half" are handled by the - fraction function. - - Args: - tokens [_Token]: - short_scale boolean: - ordinals boolean: - - Returns: - int or float, [_Tokens] - The value parsed, and tokens that it corresponds to. - - """ - multiplies, string_num_ordinal, string_num_scale = \ - _initialize_number_data(short_scale) - - number_words = [] # type: [_Token] - val = False - prev_val = None - next_val = None - to_sum = [] - for idx, token in enumerate(tokens): - current_val = None - if next_val: - next_val = None - continue - - word = token.word - if word in _ARTICLES or word in _NEGATIVES: - number_words.append(token) - continue - - prev_word = tokens[idx - 1].word if idx > 0 else "" - next_word = tokens[idx + 1].word if idx + 1 < len(tokens) else "" - - if word not in string_num_scale and \ - word not in _STRING_NUM_EN and \ - word not in _SUMS and \ - word not in multiplies and \ - not (ordinals and word in string_num_ordinal) and \ - not is_numeric(word) and \ - not isFractional_en(word, short_scale=short_scale) and \ - not look_for_fractions(word.split('/')): - words_only = [token.word for token in number_words] - if number_words and not all([w in _ARTICLES | - _NEGATIVES for w in words_only]): - break - else: - number_words = [] - continue - elif word not in multiplies \ - and prev_word not in multiplies \ - and prev_word not in _SUMS \ - and not (ordinals and prev_word in string_num_ordinal) \ - and prev_word not in _NEGATIVES \ - and prev_word not in _ARTICLES: - number_words = [token] - elif prev_word in _SUMS and word in _SUMS: - number_words = [token] - else: - number_words.append(token) - - # is this word already a number ? - if is_numeric(word): - if word.isdigit(): # doesn't work with decimals - val = int(word) - else: - val = float(word) - current_val = val - - # is this word the name of a number ? - if word in _STRING_NUM_EN: - val = _STRING_NUM_EN.get(word) - current_val = val - elif word in string_num_scale: - val = string_num_scale.get(word) - current_val = val - elif ordinals and word in string_num_ordinal: - val = string_num_ordinal[word] - current_val = val - - # is the prev word an ordinal number and current word is one? - # second one, third one - if ordinals and prev_word in string_num_ordinal and val == 1: - val = prev_val - - # is the prev word a number and should we sum it? - # twenty two, fifty six - if prev_word in _SUMS and val and val < 10: - val = prev_val + val - - # is the prev word a number and should we multiply it? - # twenty hundred, six hundred - if word in multiplies: - if not prev_val: - prev_val = 1 - val = prev_val * val - - # is this a spoken fraction? - # half cup - if val is False: - val = isFractional_en(word, short_scale=short_scale) - current_val = val - - # 2 fifths - if not ordinals: - next_val = isFractional_en(next_word, short_scale=short_scale) - if next_val: - if not val: - val = 1 - val = val * next_val - number_words.append(tokens[idx + 1]) - - # is this a negative number? - if val and prev_word and prev_word in _NEGATIVES: - val = 0 - val - - # let's make sure it isn't a fraction - if not val: - # look for fractions like "2/3" - aPieces = word.split('/') - if look_for_fractions(aPieces): - val = float(aPieces[0]) / float(aPieces[1]) - current_val = val - - else: - if prev_word in _SUMS and word not in _SUMS and current_val >= 10: - # Backtrack - we've got numbers we can't sum. - number_words.pop() - val = prev_val - break - prev_val = val - - # handle long numbers - # six hundred sixty six - # two million five hundred thousand - if word in multiplies and next_word not in multiplies: - to_sum.append(val) - val = 0 - prev_val = 0 - - if val is not None and to_sum: - val += sum(to_sum) - - return val, number_words - - -def _initialize_number_data(short_scale): - """ - Generate dictionaries of words to numbers, based on scale. - - This is a helper function for _extract_whole_number. - - Args: - short_scale boolean: - - Returns: - (set(str), dict(str, number), dict(str, number)) - multiplies, string_num_ordinal, string_num_scale - - """ - multiplies = _MULTIPLIES_SHORT_SCALE_EN if short_scale \ - else _MULTIPLIES_LONG_SCALE_EN - - string_num_ordinal_en = _STRING_SHORT_ORDINAL_EN if short_scale \ - else _STRING_LONG_ORDINAL_EN - - string_num_scale_en = _SHORT_SCALE_EN if short_scale else _LONG_SCALE_EN - string_num_scale_en = _invert_dict(string_num_scale_en) - string_num_scale_en.update(_generate_plurals(string_num_scale_en)) - - return multiplies, string_num_ordinal_en, string_num_scale_en - - -def extractnumber_en(text, short_scale=True, ordinals=False): - """ - This function extracts a number from a text string, - handles pronunciations in long scale and short scale - - https://en.wikipedia.org/wiki/Names_of_large_numbers - - Args: - text (str): the string to normalize - short_scale (bool): use short scale if True, long scale if False - ordinals (bool): consider ordinal numbers, third=3 instead of 1/3 - Returns: - (int) or (float) or False: The extracted number or False if no number - was found - - """ - return _extract_number_with_text_en(_tokenize(text.replace("%", "")), - short_scale, ordinals).value - - -def extract_duration_en(text): - """ - Convert an english phrase into a number of seconds - - Convert things like: - "10 minute" - "2 and a half hours" - "3 days 8 hours 10 minutes and 49 seconds" - into an int, representing the total number of seconds. - - The words used in the duration will be consumed, and - the remainder returned. - - As an example, "set a timer for 5 minutes" would return - (300, "set a timer for"). - - Args: - text (str): string containing a duration - - Returns: - (timedelta, str): - A tuple containing the duration and the remaining text - not consumed in the parsing. The first value will - be None if no duration is found. The text returned - will have whitespace stripped from the ends. - """ - if not text: - return None - - time_units = { - 'microseconds': None, - 'milliseconds': None, - 'seconds': None, - 'minutes': None, - 'hours': None, - 'days': None, - 'weeks': None - } - - pattern = r"(?P\d+(?:\.?\d+)?)\s+{unit}s?" - text = _convert_words_to_numbers(text) - - for unit in time_units: - unit_pattern = pattern.format(unit=unit[:-1]) # remove 's' from unit - matches = re.findall(unit_pattern, text) - value = sum(map(float, matches)) - time_units[unit] = value - text = re.sub(unit_pattern, '', text) - - text = text.strip() - duration = timedelta(**time_units) if any(time_units.values()) else None - - return (duration, text) - - -def extract_datetime_en(string, dateNow, default_time): - """ Convert a human date reference into an exact datetime - - Convert things like - "today" - "tomorrow afternoon" - "next Tuesday at 4pm" - "August 3rd" - into a datetime. If a reference date is not provided, the current - local time is used. Also consumes the words used to define the date - returning the remaining string. For example, the string - "what is Tuesday's weather forecast" - returns the date for the forthcoming Tuesday relative to the reference - date and the remainder string - "what is weather forecast". - - The "next" instance of a day or weekend is considered to be no earlier than - 48 hours in the future. On Friday, "next Monday" would be in 3 days. - On Saturday, "next Monday" would be in 9 days. - - Args: - string (str): string containing date words - dateNow (datetime): A reference date/time for "tommorrow", etc - default_time (time): Time to set if no time was found in the string - - Returns: - [datetime, str]: An array containing the datetime and the remaining - text not consumed in the parsing, or None if no - date or time related text was found. - """ - - def clean_string(s): - # clean unneeded punctuation and capitalization among other things. - s = s.lower().replace('?', '').replace('.', '').replace(',', '') \ - .replace(' the ', ' ').replace(' a ', ' ').replace(' an ', ' ') \ - .replace("o' clock", "o'clock").replace("o clock", "o'clock") \ - .replace("o ' clock", "o'clock").replace("o 'clock", "o'clock") \ - .replace("oclock", "o'clock").replace("couple", "2") \ - .replace("centuries", "century").replace("decades", "decade") \ - .replace("millenniums", "millennium") - - wordList = s.split() - for idx, word in enumerate(wordList): - word = word.replace("'s", "") - - ordinals = ["rd", "st", "nd", "th"] - if word[0].isdigit(): - for ordinal in ordinals: - # "second" is the only case we should not do this - if ordinal in word and "second" not in word: - word = word.replace(ordinal, "") - wordList[idx] = word - - return wordList - - def date_found(): - return found or \ - ( - datestr != "" or - yearOffset != 0 or monthOffset != 0 or - dayOffset is True or hrOffset != 0 or - hrAbs or minOffset != 0 or - minAbs or secOffset != 0 - ) - - if string == "" or not dateNow: - return None - - found = False - daySpecified = False - dayOffset = False - monthOffset = 0 - yearOffset = 0 - today = dateNow.strftime("%w") - currentYear = dateNow.strftime("%Y") - fromFlag = False - datestr = "" - hasYear = False - timeQualifier = "" - - timeQualifiersAM = ['morning'] - timeQualifiersPM = ['afternoon', 'evening', 'night', 'tonight'] - timeQualifiersList = set(timeQualifiersAM + timeQualifiersPM) - markers = ['at', 'in', 'on', 'by', 'this', 'around', 'for', 'of', "within"] - days = ['monday', 'tuesday', 'wednesday', - 'thursday', 'friday', 'saturday', 'sunday'] - months = ['january', 'february', 'march', 'april', 'may', 'june', - 'july', 'august', 'september', 'october', 'november', - 'december'] - recur_markers = days + [d+'s' for d in days] + ['weekend', 'weekday', - 'weekends', 'weekdays'] - monthsShort = ['jan', 'feb', 'mar', 'apr', 'may', 'june', 'july', 'aug', - 'sept', 'oct', 'nov', 'dec'] - year_multiples = ["decade", "century", "millennium"] - day_multiples = ["weeks", "months", "years"] - - words = clean_string(string) - - for idx, word in enumerate(words): - if word == "": - continue - wordPrevPrev = words[idx - 2] if idx > 1 else "" - wordPrev = words[idx - 1] if idx > 0 else "" - wordNext = words[idx + 1] if idx + 1 < len(words) else "" - wordNextNext = words[idx + 2] if idx + 2 < len(words) else "" - - # this isn't in clean string because I don't want to save back to words - word = word.rstrip('s') - start = idx - used = 0 - # save timequalifier for later - - if word == "now" and not datestr: - resultStr = " ".join(words[idx + 1:]) - resultStr = ' '.join(resultStr.split()) - extractedDate = dateNow.replace(microsecond=0) - return [extractedDate, resultStr] - elif wordNext in year_multiples: - multiplier = None - if is_numeric(word): - multiplier = extractnumber_en(word) - multiplier = multiplier or 1 - multiplier = int(multiplier) - used += 2 - if wordNext == "decade": - yearOffset = multiplier * 10 - elif wordNext == "century": - yearOffset = multiplier * 100 - elif wordNext == "millennium": - yearOffset = multiplier * 1000 - # couple of - elif word == "2" and wordNext == "of" and \ - wordNextNext in year_multiples: - multiplier = 2 - used += 3 - if wordNextNext == "decade": - yearOffset = multiplier * 10 - elif wordNextNext == "century": - yearOffset = multiplier * 100 - elif wordNextNext == "millennium": - yearOffset = multiplier * 1000 - elif word == "2" and wordNext == "of" and \ - wordNextNext in day_multiples: - multiplier = 2 - used += 3 - if wordNextNext == "years": - yearOffset = multiplier - elif wordNextNext == "months": - monthOffset = multiplier - elif wordNextNext == "weeks": - dayOffset = multiplier * 7 - elif word in timeQualifiersList: - timeQualifier = word - # parse today, tomorrow, day after tomorrow - elif word == "today" and not fromFlag: - dayOffset = 0 - used += 1 - elif word == "tomorrow" and not fromFlag: - dayOffset = 1 - used += 1 - elif (word == "day" and - wordNext == "after" and - wordNextNext == "tomorrow" and - not fromFlag and - not (wordPrev[0].isdigit() if wordPrev else False)): - dayOffset = 2 - used = 3 - if wordPrev == "the": - start -= 1 - used += 1 - # parse 5 days, 10 weeks, last week, next week - elif word == "day": - if wordPrev[0].isdigit(): - dayOffset += int(wordPrev) - start -= 1 - used = 2 - elif word == "week" and not fromFlag: - if wordPrev[0].isdigit(): - dayOffset += int(wordPrev) * 7 - start -= 1 - used = 2 - elif wordPrev == "next": - dayOffset = 7 - start -= 1 - used = 2 - elif wordPrev == "last": - dayOffset = -7 - start -= 1 - used = 2 - # parse 10 months, next month, last month - elif word == "month" and not fromFlag: - if wordPrev[0].isdigit(): - monthOffset = int(wordPrev) - start -= 1 - used = 2 - elif wordPrev == "next": - monthOffset = 1 - start -= 1 - used = 2 - elif wordPrev == "last": - monthOffset = -1 - start -= 1 - used = 2 - # parse 5 years, next year, last year - elif word == "year" and not fromFlag: - if wordPrev[0].isdigit(): - yearOffset = int(wordPrev) - start -= 1 - used = 2 - elif wordPrev == "next": - yearOffset = 1 - start -= 1 - used = 2 - elif wordPrev == "last": - yearOffset = -1 - start -= 1 - used = 2 - # parse Monday, Tuesday, etc., and next Monday, - # last Tuesday, etc. - elif word in days and not fromFlag: - d = days.index(word) - dayOffset = (d + 1) - int(today) - used = 1 - if dayOffset < 0: - dayOffset += 7 - if wordPrev == "next": - if dayOffset <= 2: - dayOffset += 7 - used += 1 - start -= 1 - elif wordPrev == "last": - dayOffset -= 7 - used += 1 - start -= 1 - # parse 15 of July, June 20th, Feb 18, 19 of February - elif word in months or word in monthsShort and not fromFlag: - try: - m = months.index(word) - except ValueError: - m = monthsShort.index(word) - used += 1 - datestr = months[m] - if wordPrev and (wordPrev[0].isdigit() or - (wordPrev == "of" and wordPrevPrev[0].isdigit())): - if wordPrev == "of" and wordPrevPrev[0].isdigit(): - datestr += " " + words[idx - 2] - used += 1 - start -= 1 - else: - datestr += " " + wordPrev - start -= 1 - used += 1 - if wordNext and wordNext[0].isdigit(): - datestr += " " + wordNext - used += 1 - hasYear = True - else: - hasYear = False - - elif wordNext and wordNext[0].isdigit(): - datestr += " " + wordNext - used += 1 - if wordNextNext and wordNextNext[0].isdigit(): - datestr += " " + wordNextNext - used += 1 - hasYear = True - else: - hasYear = False - - # if no date indicators found, it may not be the month of May - # may "i/we" ... - # "... may be" - elif word == 'may' and wordNext in ['i', 'we', 'be']: - datestr = "" - - # parse 5 days from tomorrow, 10 weeks from next thursday, - # 2 months from July - validFollowups = days + months + monthsShort - validFollowups.append("today") - validFollowups.append("tomorrow") - validFollowups.append("next") - validFollowups.append("last") - validFollowups.append("now") - if (word == "from" or word == "after") and wordNext in validFollowups: - used = 2 - fromFlag = True - if wordNext == "tomorrow": - dayOffset += 1 - elif wordNext in days: - d = days.index(wordNext) - tmpOffset = (d + 1) - int(today) - used = 2 - if tmpOffset < 0: - tmpOffset += 7 - dayOffset += tmpOffset - elif wordNextNext and wordNextNext in days: - d = days.index(wordNextNext) - tmpOffset = (d + 1) - int(today) - used = 3 - if wordNext == "next": - if dayOffset <= 2: - tmpOffset += 7 - used += 1 - start -= 1 - elif wordNext == "last": - tmpOffset -= 7 - used += 1 - start -= 1 - dayOffset += tmpOffset - if used > 0: - if start - 1 > 0 and words[start - 1] == "this": - start -= 1 - used += 1 - - for i in range(0, used): - words[i + start] = "" - - if start - 1 >= 0 and words[start - 1] in markers: - words[start - 1] = "" - found = True - daySpecified = True - - # parse time - hrOffset = 0 - minOffset = 0 - secOffset = 0 - hrAbs = None - minAbs = None - military = False - - for idx, word in enumerate(words): - if word == "": - continue - - wordPrevPrev = words[idx - 2] if idx > 1 else "" - wordPrev = words[idx - 1] if idx > 0 else "" - wordNext = words[idx + 1] if idx + 1 < len(words) else "" - wordNextNext = words[idx + 2] if idx + 2 < len(words) else "" - # parse noon, midnight, morning, afternoon, evening - used = 0 - if word == "noon": - hrAbs = 12 - used += 1 - elif word == "midnight": - hrAbs = 0 - used += 1 - elif word == "morning": - if hrAbs is None: - hrAbs = 8 - used += 1 - elif word == "afternoon": - if hrAbs is None: - hrAbs = 15 - used += 1 - elif word == "evening": - if hrAbs is None: - hrAbs = 19 - used += 1 - # couple of time_unit - elif word == "2" and wordNext == "of" and \ - wordNextNext in ["hours", "minutes", "seconds"]: - used += 3 - if wordNextNext == "hours": - hrOffset = 2 - elif wordNextNext == "minutes": - minOffset = 2 - elif wordNextNext == "seconds": - secOffset = 2 - # parse half an hour, quarter hour - elif word == "hour" and \ - (wordPrev in markers or wordPrevPrev in markers): - if wordPrev == "half": - minOffset = 30 - elif wordPrev == "quarter": - minOffset = 15 - elif wordPrevPrev == "quarter": - minOffset = 15 - if idx > 2 and words[idx - 3] in markers: - words[idx - 3] = "" - words[idx - 2] = "" - elif wordPrev == "within": - hrOffset = 1 - else: - hrOffset = 1 - if wordPrevPrev in markers: - words[idx - 2] = "" - if wordPrevPrev == "this": - daySpecified = True - words[idx - 1] = "" - used += 1 - hrAbs = -1 - minAbs = -1 - # parse 5:00 am, 12:00 p.m., etc - # parse in a minute - elif word == "minute" and wordPrev == "in": - minOffset = 1 - words[idx - 1] = "" - used += 1 - # parse in a second - elif word == "second" and wordPrev == "in": - secOffset = 1 - words[idx - 1] = "" - used += 1 - elif word[0].isdigit(): - isTime = True - strHH = "" - strMM = "" - remainder = "" - wordNextNextNext = words[idx + 3] \ - if idx + 3 < len(words) else "" - if wordNext == "tonight" or wordNextNext == "tonight" or \ - wordPrev == "tonight" or wordPrevPrev == "tonight" or \ - wordNextNextNext == "tonight": - remainder = "pm" - used += 1 - if wordPrev == "tonight": - words[idx - 1] = "" - if wordPrevPrev == "tonight": - words[idx - 2] = "" - if wordNextNext == "tonight": - used += 1 - if wordNextNextNext == "tonight": - used += 1 - - if ':' in word: - # parse colons - # "3:00 in the morning" - stage = 0 - length = len(word) - for i in range(length): - if stage == 0: - if word[i].isdigit(): - strHH += word[i] - elif word[i] == ":": - stage = 1 - else: - stage = 2 - i -= 1 - elif stage == 1: - if word[i].isdigit(): - strMM += word[i] - else: - stage = 2 - i -= 1 - elif stage == 2: - remainder = word[i:].replace(".", "") - break - if remainder == "": - nextWord = wordNext.replace(".", "") - if nextWord == "am" or nextWord == "pm": - remainder = nextWord - used += 1 - - elif wordNext == "in" and wordNextNext == "morning": - remainder = "am" - used += 2 - elif wordNext == "in" and wordNextNext == "afternoon": - remainder = "pm" - used += 2 - elif wordNext == "in" and wordNextNext == "evening": - remainder = "pm" - used += 2 - elif wordNext == "this" and wordNextNext == "morning": - remainder = "am" - used = 2 - daySpecified = True - elif wordNext == "this" and wordNextNext == "afternoon": - remainder = "pm" - used = 2 - daySpecified = True - elif wordNext == "this" and wordNextNext == "evening": - remainder = "pm" - used = 2 - daySpecified = True - elif wordNext == "at" and wordNextNext == "night": - if strHH and int(strHH) > 5: - remainder = "pm" - else: - remainder = "am" - used += 2 - - else: - if timeQualifier != "": - military = True - if strHH and int(strHH) <= 12 and \ - (timeQualifier in timeQualifiersPM): - strHH += str(int(strHH) + 12) - - else: - # try to parse numbers without colons - # 5 hours, 10 minutes etc. - length = len(word) - strNum = "" - remainder = "" - for i in range(length): - if word[i].isdigit(): - strNum += word[i] - else: - remainder += word[i] - - if remainder == "": - remainder = wordNext.replace(".", "").lstrip().rstrip() - if ( - remainder == "pm" or - wordNext == "pm" or - remainder == "p.m." or - wordNext == "p.m."): - strHH = strNum - remainder = "pm" - used = 1 - elif ( - remainder == "am" or - wordNext == "am" or - remainder == "a.m." or - wordNext == "a.m."): - strHH = strNum - remainder = "am" - used = 1 - elif ( - remainder in recur_markers or - wordNext in recur_markers or - wordNextNext in recur_markers): - # Ex: "7 on mondays" or "3 this friday" - # Set strHH so that isTime == True - # when am or pm is not specified - strHH = strNum - used = 1 - else: - if ( - int(strNum) > 100 and - ( - wordPrev == "o" or - wordPrev == "oh" - )): - # 0800 hours (pronounced oh-eight-hundred) - strHH = str(int(strNum) // 100) - strMM = str(int(strNum) % 100) - military = True - if wordNext == "hours": - used += 1 - elif ( - (wordNext == "hours" or wordNext == "hour" or - remainder == "hours" or remainder == "hour") and - word[0] != '0' and - ( - int(strNum) < 100 or - int(strNum) > 2400 - )): - # ignores military time - # "in 3 hours" - hrOffset = int(strNum) - used = 2 - isTime = False - hrAbs = -1 - minAbs = -1 - - elif wordNext == "minutes" or wordNext == "minute" or \ - remainder == "minutes" or remainder == "minute": - # "in 10 minutes" - minOffset = int(strNum) - used = 2 - isTime = False - hrAbs = -1 - minAbs = -1 - elif wordNext == "seconds" or wordNext == "second" \ - or remainder == "seconds" or remainder == "second": - # in 5 seconds - secOffset = int(strNum) - used = 2 - isTime = False - hrAbs = -1 - minAbs = -1 - elif int(strNum) > 100: - # military time, eg. "3300 hours" - strHH = str(int(strNum) // 100) - strMM = str(int(strNum) % 100) - military = True - if wordNext == "hours" or wordNext == "hour" or \ - remainder == "hours" or remainder == "hour": - used += 1 - elif wordNext and wordNext[0].isdigit(): - # military time, e.g. "04 38 hours" - strHH = strNum - strMM = wordNext - military = True - used += 1 - if (wordNextNext == "hours" or - wordNextNext == "hour" or - remainder == "hours" or remainder == "hour"): - used += 1 - elif ( - wordNext == "" or wordNext == "o'clock" or - ( - wordNext == "in" and - ( - wordNextNext == "the" or - wordNextNext == timeQualifier - ) - ) or wordNext == 'tonight' or - wordNextNext == 'tonight'): - - strHH = strNum - strMM = "00" - if wordNext == "o'clock": - used += 1 - - if wordNext == "in" or wordNextNext == "in": - used += (1 if wordNext == "in" else 2) - wordNextNextNext = words[idx + 3] \ - if idx + 3 < len(words) else "" - - if (wordNextNext and - (wordNextNext in timeQualifier or - wordNextNextNext in timeQualifier)): - if (wordNextNext in timeQualifiersPM or - wordNextNextNext in timeQualifiersPM): - remainder = "pm" - used += 1 - if (wordNextNext in timeQualifiersAM or - wordNextNextNext in timeQualifiersAM): - remainder = "am" - used += 1 - - if timeQualifier != "": - if timeQualifier in timeQualifiersPM: - remainder = "pm" - used += 1 - - elif timeQualifier in timeQualifiersAM: - remainder = "am" - used += 1 - else: - # TODO: Unsure if this is 100% accurate - used += 1 - military = True - else: - isTime = False - HH = int(strHH) if strHH else 0 - MM = int(strMM) if strMM else 0 - HH = HH + 12 if remainder == "pm" and HH < 12 else HH - HH = HH - 12 if remainder == "am" and HH >= 12 else HH - - if (not military and - remainder not in ['am', 'pm', 'hours', 'minutes', - "second", "seconds", - "hour", "minute"] and - ((not daySpecified) or dayOffset < 1)): - # ambiguous time, detect whether they mean this evening or - # the next morning based on whether it has already passed - if dateNow.hour < HH or (dateNow.hour == HH and - dateNow.minute < MM): - pass # No modification needed - elif dateNow.hour < HH + 12: - HH += 12 - else: - # has passed, assume the next morning - dayOffset += 1 - - if timeQualifier in timeQualifiersPM and HH < 12: - HH += 12 - - if HH > 24 or MM > 59: - isTime = False - used = 0 - if isTime: - hrAbs = HH - minAbs = MM - used += 1 - - if used > 0: - # removed parsed words from the sentence - for i in range(used): - if idx + i >= len(words): - break - words[idx + i] = "" - - if wordPrev == "o" or wordPrev == "oh": - words[words.index(wordPrev)] = "" - - if wordPrev == "early": - hrOffset = -1 - words[idx - 1] = "" - idx -= 1 - elif wordPrev == "late": - hrOffset = 1 - words[idx - 1] = "" - idx -= 1 - if idx > 0 and wordPrev in markers: - words[idx - 1] = "" - if wordPrev == "this": - daySpecified = True - if idx > 1 and wordPrevPrev in markers: - words[idx - 2] = "" - if wordPrevPrev == "this": - daySpecified = True - - idx += used - 1 - found = True - # check that we found a date - if not date_found: - return None - - if dayOffset is False: - dayOffset = 0 - - # perform date manipulation - - extractedDate = dateNow.replace(microsecond=0) - - if datestr != "": - # date included an explicit date, e.g. "june 5" or "june 2, 2017" - try: - temp = datetime.strptime(datestr, "%B %d") - except ValueError: - # Try again, allowing the year - temp = datetime.strptime(datestr, "%B %d %Y") - extractedDate = extractedDate.replace(hour=0, minute=0, second=0) - if not hasYear: - temp = temp.replace(year=extractedDate.year, - tzinfo=extractedDate.tzinfo) - if extractedDate < temp: - extractedDate = extractedDate.replace( - year=int(currentYear), - month=int(temp.strftime("%m")), - day=int(temp.strftime("%d")), - tzinfo=extractedDate.tzinfo) - else: - extractedDate = extractedDate.replace( - year=int(currentYear) + 1, - month=int(temp.strftime("%m")), - day=int(temp.strftime("%d")), - tzinfo=extractedDate.tzinfo) - else: - extractedDate = extractedDate.replace( - year=int(temp.strftime("%Y")), - month=int(temp.strftime("%m")), - day=int(temp.strftime("%d")), - tzinfo=extractedDate.tzinfo) - else: - # ignore the current HH:MM:SS if relative using days or greater - if hrOffset == 0 and minOffset == 0 and secOffset == 0: - extractedDate = extractedDate.replace(hour=0, minute=0, second=0) - - if yearOffset != 0: - extractedDate = extractedDate + relativedelta(years=yearOffset) - if monthOffset != 0: - extractedDate = extractedDate + relativedelta(months=monthOffset) - if dayOffset != 0: - extractedDate = extractedDate + relativedelta(days=dayOffset) - if hrAbs != -1 and minAbs != -1: - # If no time was supplied in the string set the time to default - # time if it's available - if hrAbs is None and minAbs is None and default_time is not None: - hrAbs, minAbs = default_time.hour, default_time.minute - else: - hrAbs = hrAbs or 0 - minAbs = minAbs or 0 - - extractedDate = extractedDate + relativedelta(hours=hrAbs, - minutes=minAbs) - if (hrAbs != 0 or minAbs != 0) and datestr == "": - if not daySpecified and dateNow > extractedDate: - extractedDate = extractedDate + relativedelta(days=1) - if hrOffset != 0: - extractedDate = extractedDate + relativedelta(hours=hrOffset) - if minOffset != 0: - extractedDate = extractedDate + relativedelta(minutes=minOffset) - if secOffset != 0: - extractedDate = extractedDate + relativedelta(seconds=secOffset) - for idx, word in enumerate(words): - if words[idx] == "and" and \ - words[idx - 1] == "" and words[idx + 1] == "": - words[idx] = "" - - resultStr = " ".join(words) - resultStr = ' '.join(resultStr.split()) - return [extractedDate, resultStr] - - -def isFractional_en(input_str, short_scale=True): - """ - This function takes the given text and checks if it is a fraction. - - Args: - input_str (str): the string to check if fractional - short_scale (bool): use short scale if True, long scale if False - Returns: - (bool) or (float): False if not a fraction, otherwise the fraction - - """ - if input_str.endswith('s', -1): - input_str = input_str[:len(input_str) - 1] # e.g. "fifths" - - fracts = {"whole": 1, "half": 2, "halve": 2, "quarter": 4} - if short_scale: - for num in _SHORT_ORDINAL_STRING_EN: - if num > 2: - fracts[_SHORT_ORDINAL_STRING_EN[num]] = num - else: - for num in _LONG_ORDINAL_STRING_EN: - if num > 2: - fracts[_LONG_ORDINAL_STRING_EN[num]] = num - - if input_str.lower() in fracts: - return 1.0 / fracts[input_str.lower()] - return False - - -def extract_numbers_en(text, short_scale=True, ordinals=False): - """ - Takes in a string and extracts a list of numbers. - - Args: - text (str): the string to extract a number from - short_scale (bool): Use "short scale" or "long scale" for large - numbers -- over a million. The default is short scale, which - is now common in most English speaking countries. - See https://en.wikipedia.org/wiki/Names_of_large_numbers - ordinals (bool): consider ordinal numbers, e.g. third=3 instead of 1/3 - Returns: - list: list of extracted numbers as floats - """ - results = _extract_numbers_with_text(_tokenize(text), - short_scale, ordinals) - return [float(result.value) for result in results] - - -def normalize_en(text, remove_articles): - """ English string normalization """ - - words = text.split() # this also removed extra spaces - normalized = "" - for word in words: - if remove_articles and word in ["the", "a", "an"]: - continue - - # Expand common contractions, e.g. "isn't" -> "is not" - contraction = ["ain't", "aren't", "can't", "could've", "couldn't", - "didn't", "doesn't", "don't", "gonna", "gotta", - "hadn't", "hasn't", "haven't", "he'd", "he'll", "he's", - "how'd", "how'll", "how's", "I'd", "I'll", "I'm", - "I've", "isn't", "it'd", "it'll", "it's", "mightn't", - "might've", "mustn't", "must've", "needn't", - "oughtn't", - "shan't", "she'd", "she'll", "she's", "shouldn't", - "should've", "somebody's", "someone'd", "someone'll", - "someone's", "that'll", "that's", "that'd", "there'd", - "there're", "there's", "they'd", "they'll", "they're", - "they've", "wasn't", "we'd", "we'll", "we're", "we've", - "weren't", "what'd", "what'll", "what're", "what's", - "whats", # technically incorrect but some STT outputs - "what've", "when's", "when'd", "where'd", "where's", - "where've", "who'd", "who'd've", "who'll", "who're", - "who's", "who've", "why'd", "why're", "why's", "won't", - "won't've", "would've", "wouldn't", "wouldn't've", - "y'all", "ya'll", "you'd", "you'd've", "you'll", - "y'aint", "y'ain't", "you're", "you've"] - if word in contraction: - expansion = ["is not", "are not", "can not", "could have", - "could not", "did not", "does not", "do not", - "going to", "got to", "had not", "has not", - "have not", "he would", "he will", "he is", - "how did", - "how will", "how is", "I would", "I will", "I am", - "I have", "is not", "it would", "it will", "it is", - "might not", "might have", "must not", "must have", - "need not", "ought not", "shall not", "she would", - "she will", "she is", "should not", "should have", - "somebody is", "someone would", "someone will", - "someone is", "that will", "that is", "that would", - "there would", "there are", "there is", "they would", - "they will", "they are", "they have", "was not", - "we would", "we will", "we are", "we have", - "were not", "what did", "what will", "what are", - "what is", - "what is", "what have", "when is", "when did", - "where did", "where is", "where have", "who would", - "who would have", "who will", "who are", "who is", - "who have", "why did", "why are", "why is", - "will not", "will not have", "would have", - "would not", "would not have", "you all", "you all", - "you would", "you would have", "you will", - "you are not", "you are not", "you are", "you have"] - word = expansion[contraction.index(word)] - - # Convert numbers into digits, e.g. "two" -> "2" - textNumbers = ["zero", "one", "two", "three", "four", "five", "six", - "seven", "eight", "nine", "ten", "eleven", "twelve", - "thirteen", "fourteen", "fifteen", "sixteen", - "seventeen", "eighteen", "nineteen", "twenty"] - - if word in textNumbers: - word = str(textNumbers.index(word)) - - normalized += " " + word - - return normalized[1:] # strip the initial space +class _ReplaceableNumber(ReplaceableNumber): + pass diff --git a/mycroft/util/lang/parse_es.py b/mycroft/util/lang/parse_es.py index 89f6e2768e..dbd00018e0 100644 --- a/mycroft/util/lang/parse_es.py +++ b/mycroft/util/lang/parse_es.py @@ -13,1149 +13,8 @@ # See the License for the specific language governing permissions and # limitations under the License. # +"""File kept for backwards compatibility. + +TODO: Remove in 20.02 """ - Parse functions for spanish (es) - TODO: numbers greater than 999999 -""" -from datetime import datetime -from dateutil.relativedelta import relativedelta -from mycroft.util.lang.format_es import pronounce_number_es -from mycroft.util.lang.parse_common import * - -# Undefined articles ["un", "una", "unos", "unas"] can not be supressed, -# in Spanish, "un caballo" means "a horse" or "one horse". -es_articles = ["el", "la", "los", "las"] - -es_numbers = { - "cero": 0, - "un": 1, - "uno": 1, - "una": 1, - "dos": 2, - "tres": 3, - "trés": 3, - "cuatro": 4, - "cinco": 5, - "seis": 6, - "siete": 7, - "ocho": 8, - "nueve": 9, - "diez": 10, - "once": 11, - "doce": 12, - "trece": 13, - "catorce": 14, - "quince": 15, - "dieciseis": 16, - "dieciséis": 16, - "diecisiete": 17, - "dieciocho": 18, - "diecinueve": 19, - "veinte": 20, - "veintiuno": 21, - "veintid�s": 22, - "veintitr�s": 23, - "veintidos": 22, - "veintitres": 23, - "veintitrés": 23, - "veinticuatro": 24, - "veinticinco": 25, - "veintiséis": 26, - "veintiseis": 26, - "veintisiete": 27, - "veintiocho": 28, - "veintinueve": 29, - "treinta": 30, - "cuarenta": 40, - "cincuenta": 50, - "sesenta": 60, - "setenta": 70, - "ochenta": 80, - "noventa": 90, - "cien": 100, - "ciento": 100, - "doscientos": 200, - "doscientas": 200, - "trescientos": 300, - "trescientas": 300, - "cuatrocientos": 400, - "cuatrocientas": 400, - "quinientos": 500, - "quinientas": 500, - "seiscientos": 600, - "seiscientas": 600, - "setecientos": 700, - "setecientas": 700, - "ochocientos": 800, - "ochocientas": 800, - "novecientos": 900, - "novecientas": 900, - "mil": 1000} - - -def isFractional_es(input_str): - """ - This function takes the given text and checks if it is a fraction. - - Args: - text (str): the string to check if fractional - Returns: - (bool) or (float): False if not a fraction, otherwise the fraction - - """ - if input_str.endswith('s', -1): - input_str = input_str[:len(input_str) - 1] # e.g. "fifths" - - aFrac = {"medio": 2, "media": 2, "tercio": 3, "cuarto": 4, - "cuarta": 4, "quinto": 5, "quinta": 5, "sexto": 6, "sexta": 6, - "séptimo": 7, "séptima": 7, "octavo": 8, "octava": 8, - "noveno": 9, "novena": 9, "décimo": 10, "décima": 10, - "onceavo": 11, "onceava": 11, "doceavo": 12, "doceava": 12} - - if input_str.lower() in aFrac: - return 1.0 / aFrac[input_str] - if (input_str == "vigésimo" or input_str == "vigésima"): - return 1.0 / 20 - if (input_str == "trigésimo" or input_str == "trigésima"): - return 1.0 / 30 - if (input_str == "centésimo" or input_str == "centésima"): - return 1.0 / 100 - if (input_str == "milésimo" or input_str == "milésima"): - return 1.0 / 1000 - return False - - -# TODO: short_scale and ordinals don't do anything here. -# The parameters are present in the function signature for API compatibility -# reasons. -# -# Returns incorrect output on certain fractional phrases like, "cuarto de dos" -def extractnumber_es(text, short_scale=True, ordinals=False): - """ - This function prepares the given text for parsing by making - numbers consistent, getting rid of contractions, etc. - Args: - text (str): the string to normalize - Returns: - (int) or (float): The value of extracted number - - """ - aWords = text.split() - count = 0 - result = None - while count < len(aWords): - val = 0 - word = aWords[count] - next_next_word = None - if count + 1 < len(aWords): - next_word = aWords[count + 1] - if count + 2 < len(aWords): - next_next_word = aWords[count + 2] - else: - next_word = None - - # is current word a number? - if word in es_numbers: - val = es_numbers[word] - elif word.isdigit(): # doesn't work with decimals - val = int(word) - elif is_numeric(word): - val = float(word) - elif isFractional_es(word): - if not result: - result = 1 - result = result * isFractional_es(word) - count += 1 - continue - - if not val: - # look for fractions like "2/3" - aPieces = word.split('/') - # if (len(aPieces) == 2 and is_numeric(aPieces[0]) - # and is_numeric(aPieces[1])): - if look_for_fractions(aPieces): - val = float(aPieces[0]) / float(aPieces[1]) - - if val: - if result is None: - result = 0 - # handle fractions - if next_word != "avos": - result = val - else: - result = float(result) / float(val) - - if next_word is None: - break - - # number word and fraction - ands = ["e"] - if next_word in ands: - zeros = 0 - if result is None: - count += 1 - continue - newWords = aWords[count + 2:] - newText = "" - for word in newWords: - newText += word + " " - - afterAndVal = extractnumber_es(newText[:-1]) - if afterAndVal: - if result < afterAndVal or result < 20: - while afterAndVal > 1: - afterAndVal = afterAndVal / 10.0 - for word in newWords: - if word == "cero" or word == "0": - zeros += 1 - else: - break - for _ in range(0, zeros): - afterAndVal = afterAndVal / 10.0 - result += afterAndVal - break - elif next_next_word is not None: - if next_next_word in ands: - newWords = aWords[count + 3:] - newText = "" - for word in newWords: - newText += word + " " - afterAndVal = extractnumber_es(newText[:-1]) - if afterAndVal: - if result is None: - result = 0 - result += afterAndVal - break - - decimals = ["punto", "coma", ".", ","] - if next_word in decimals: - zeros = 0 - newWords = aWords[count + 2:] - newText = "" - for word in newWords: - newText += word + " " - for word in newWords: - if word == "cero" or word == "0": - zeros += 1 - else: - break - afterDotVal = str(extractnumber_es(newText[:-1])) - afterDotVal = zeros * "0" + afterDotVal - result = float(str(result) + "." + afterDotVal) - break - count += 1 - - if result is None: - return False - - # Return the $str with the number related words removed - # (now empty strings, so strlen == 0) - # aWords = [word for word in aWords if len(word) > 0] - # text = ' '.join(aWords) - if "." in str(result): - integer, dec = str(result).split(".") - # cast float to int - if dec == "0": - result = int(integer) - - return result - - -def extract_numbers_es(text, short_scale=True, ordinals=False): - """ - Takes in a string and extracts a list of numbers. - - Args: - text (str): the string to extract a number from - short_scale (bool): Use "short scale" or "long scale" for large - numbers -- over a million. The default is short scale, which - is now common in most English speaking countries. - See https://en.wikipedia.org/wiki/Names_of_large_numbers - ordinals (bool): consider ordinal numbers, e.g. third=3 instead of 1/3 - Returns: - list: list of extracted numbers as floats - """ - return extract_numbers_generic(text, pronounce_number_es, extractnumber_es, - short_scale=short_scale, ordinals=ordinals) - - -# TODO Not parsing 'cero' -def es_number_parse(words, i): - def es_cte(i, s): - if i < len(words) and s == words[i]: - return s, i + 1 - return None - - def es_number_word(i, mi, ma): - if i < len(words): - v = es_numbers.get(words[i]) - if v and v >= mi and v <= ma: - return v, i + 1 - return None - - def es_number_1_99(i): - r1 = es_number_word(i, 1, 29) - if r1: - return r1 - - r1 = es_number_word(i, 30, 90) - if r1: - v1, i1 = r1 - r2 = es_cte(i1, "y") - if r2: - i2 = r2[1] - r3 = es_number_word(i2, 1, 9) - if r3: - v3, i3 = r3 - return v1 + v3, i3 - return r1 - return None - - def es_number_1_999(i): - # [2-9]cientos [1-99]? - r1 = es_number_word(i, 100, 900) - if r1: - v1, i1 = r1 - r2 = es_number_1_99(i1) - if r2: - v2, i2 = r2 - return v1 + v2, i2 - else: - return r1 - - # [1-99] - r1 = es_number_1_99(i) - if r1: - return r1 - - return None - - def es_number(i): - # check for cero - r1 = es_number_word(i, 0, 0) - if r1: - return r1 - - # check for [1-999] (mil [0-999])? - r1 = es_number_1_999(i) - if r1: - v1, i1 = r1 - r2 = es_cte(i1, "mil") - if r2: - i2 = r2[1] - r3 = es_number_1_999(i2) - if r3: - v3, i3 = r3 - return v1 * 1000 + v3, i3 - else: - return v1 * 1000, i2 - else: - return r1 - return None - - return es_number(i) - - -def normalize_es(text, remove_articles): - """ Spanish string normalization """ - - words = text.split() # this also removed extra spaces - - normalized = "" - i = 0 - while i < len(words): - word = words[i] - - if remove_articles and word in es_articles: - i += 1 - continue - - # Convert numbers into digits - r = es_number_parse(words, i) - if r: - v, i = r - normalized += " " + str(v) - continue - - normalized += " " + word - i += 1 - - return normalized[1:] # strip the initial space - - -# TODO MycroftAI/mycroft-core#2348 -def extract_datetime_es(input_str, currentDate=None, default_time=None): - def clean_string(s): - # cleans the input string of unneeded punctuation and capitalization - # among other things - symbols = [".", ",", ";", "?", "!", "º", "ª"] - noise_words = ["entre", "la", "del", "al", "el", "de", - "por", "para", "una", "cualquier", "a", - "e'", "esta", "este"] - - for word in symbols: - s = s.replace(word, "") - for word in noise_words: - s = s.replace(" " + word + " ", " ") - s = s.lower().replace( - "á", - "a").replace( - "é", - "e").replace( - "ó", - "o").replace( - "-", - " ").replace( - "_", - "") - # handle synonyms and equivalents, "tomorrow early = tomorrow morning - synonyms = {"mañana": ["amanecer", "temprano", "muy temprano"], - "tarde": ["media tarde", "atardecer"], - "noche": ["anochecer", "tarde"]} - for syn in synonyms: - for word in synonyms[syn]: - s = s.replace(" " + word + " ", " " + syn + " ") - # relevant plurals, cant just extract all s in pt - wordlist = ["mañanas", "tardes", "noches", "días", "semanas", - "años", "minutos", "segundos", "las", "los", "siguientes", - "próximas", "próximos", "horas"] - for _, word in enumerate(wordlist): - s = s.replace(word, word.rstrip('s')) - s = s.replace("meses", "mes").replace("anteriores", "anterior") - return s - - def date_found(): - return found or \ - ( - datestr != "" or - yearOffset != 0 or monthOffset != 0 or - dayOffset is True or hrOffset != 0 or - hrAbs or minOffset != 0 or - minAbs or secOffset != 0 - ) - - if input_str == "": - return None - if currentDate is None: - currentDate = datetime.now() - - found = False - daySpecified = False - dayOffset = False - monthOffset = 0 - yearOffset = 0 - dateNow = currentDate - today = dateNow.strftime("%w") - currentYear = dateNow.strftime("%Y") - fromFlag = False - datestr = "" - hasYear = False - timeQualifier = "" - - words = clean_string(input_str).split(" ") - timeQualifiersList = ['mañana', 'tarde', 'noche'] - time_indicators = ["en", "la", "al", "por", "pasados", - "pasadas", "día", "hora"] - days = ['lunes', 'martes', 'miércoles', - 'jueves', 'viernes', 'sábado', 'domingo'] - months = ['enero', 'febrero', 'marzo', 'abril', 'mayo', 'junio', - 'julio', 'agosto', 'septiembre', 'octubre', 'noviembre', - 'diciembre'] - monthsShort = ['ene', 'feb', 'mar', 'abr', 'may', 'jun', 'jul', 'ago', - 'sep', 'oct', 'nov', 'dic'] - nexts = ["siguiente", "próximo", "próxima"] - suffix_nexts = ["siguientes", "subsecuentes"] - lasts = ["último", "última"] - suffix_lasts = ["pasada", "pasado", "anterior", "antes"] - nxts = ["después", "siguiente", "próximo", "próxima"] - prevs = ["antes", "previa", "previo", "anterior"] - froms = ["desde", "en", "para", "después de", "por", "próximo", - "próxima", "de"] - thises = ["este", "esta"] - froms += thises - lists = nxts + prevs + froms + time_indicators - for idx, word in enumerate(words): - if word == "": - continue - wordPrevPrev = words[idx - 2] if idx > 1 else "" - wordPrev = words[idx - 1] if idx > 0 else "" - wordNext = words[idx + 1] if idx + 1 < len(words) else "" - wordNextNext = words[idx + 2] if idx + 2 < len(words) else "" - wordNextNextNext = words[idx + 3] if idx + 3 < len(words) else "" - - start = idx - used = 0 - # save timequalifier for later - if word in timeQualifiersList: - timeQualifier = word - - # parse today, tomorrow, yesterday - elif word == "hoy" and not fromFlag: - dayOffset = 0 - used += 1 - elif word == "mañana" and not fromFlag: - dayOffset = 1 - used += 1 - elif word == "ayer" and not fromFlag: - dayOffset -= 1 - used += 1 - # "before yesterday" and "before before yesterday" - elif (word == "anteayer" or - (word == "ante" and wordNext == "ayer")) and not fromFlag: - dayOffset -= 2 - used += 1 - if wordNext == "ayer": - used += 1 - elif word == "ante" and wordNext == "ante" and wordNextNext == \ - "ayer" and not fromFlag: - dayOffset -= 3 - used += 3 - elif word == "ante anteayer" and not fromFlag: - dayOffset -= 3 - used += 1 - # day after tomorrow - elif word == "pasado" and wordNext == "mañana" and not fromFlag: - dayOffset += 2 - used = 2 - # day before yesterday - elif word == "ante" and wordNext == "ayer" and not fromFlag: - dayOffset -= 2 - used = 2 - # parse 5 days, 10 weeks, last week, next week, week after - elif word == "día": - if wordNext == "pasado" or wordNext == "ante": - used += 1 - if wordPrev and wordPrev[0].isdigit(): - dayOffset += int(wordPrev) - start -= 1 - used += 1 - elif (wordPrev and wordPrev[0].isdigit() and - wordNext not in months and - wordNext not in monthsShort): - dayOffset += int(wordPrev) - start -= 1 - used += 2 - elif wordNext and wordNext[0].isdigit() and wordNextNext not in \ - months and wordNextNext not in monthsShort: - dayOffset += int(wordNext) - start -= 1 - used += 2 - - elif word == "semana" and not fromFlag: - if wordPrev[0].isdigit(): - dayOffset += int(wordPrev) * 7 - start -= 1 - used = 2 - for w in nexts: - if wordPrev == w: - dayOffset = 7 - start -= 1 - used = 2 - for w in lasts: - if wordPrev == w: - dayOffset = -7 - start -= 1 - used = 2 - for w in suffix_nexts: - if wordNext == w: - dayOffset = 7 - start -= 1 - used = 2 - for w in suffix_lasts: - if wordNext == w: - dayOffset = -7 - start -= 1 - used = 2 - # parse 10 months, next month, last month - elif word == "mes" and not fromFlag: - if wordPrev[0].isdigit(): - monthOffset = int(wordPrev) - start -= 1 - used = 2 - for w in nexts: - if wordPrev == w: - monthOffset = 7 - start -= 1 - used = 2 - for w in lasts: - if wordPrev == w: - monthOffset = -7 - start -= 1 - used = 2 - for w in suffix_nexts: - if wordNext == w: - monthOffset = 7 - start -= 1 - used = 2 - for w in suffix_lasts: - if wordNext == w: - monthOffset = -7 - start -= 1 - used = 2 - # parse 5 years, next year, last year - elif word == "año" and not fromFlag: - if wordPrev[0].isdigit(): - yearOffset = int(wordPrev) - start -= 1 - used = 2 - for w in nexts: - if wordPrev == w: - yearOffset = 7 - start -= 1 - used = 2 - for w in lasts: - if wordPrev == w: - yearOffset = -7 - start -= 1 - used = 2 - for w in suffix_nexts: - if wordNext == w: - yearOffset = 7 - start -= 1 - used = 2 - for w in suffix_lasts: - if wordNext == w: - yearOffset = -7 - start -= 1 - used = 2 - # parse Monday, Tuesday, etc., and next Monday, - # last Tuesday, etc. - elif word in days and not fromFlag: - d = days.index(word) - dayOffset = (d + 1) - int(today) - used = 1 - if dayOffset < 0: - dayOffset += 7 - if wordPrev == "siguiente": - dayOffset += 7 - used += 1 - start -= 1 - elif wordPrev == "pasado": - dayOffset -= 7 - used += 1 - start -= 1 - if wordNext == "siguiente": - # dayOffset += 7 - used += 1 - elif wordNext == "pasado": - # dayOffset -= 7 - used += 1 - # parse 15 of July, June 20th, Feb 18, 19 of February - elif word in months or word in monthsShort: - try: - m = months.index(word) - except ValueError: - m = monthsShort.index(word) - used += 1 - datestr = months[m] - if wordPrev and wordPrev[0].isdigit(): - # 13 mayo - datestr += " " + wordPrev - start -= 1 - used += 1 - if wordNext and wordNext[0].isdigit(): - datestr += " " + wordNext - used += 1 - hasYear = True - else: - hasYear = False - - elif wordNext and wordNext[0].isdigit(): - # mayo 13 - datestr += " " + wordNext - used += 1 - if wordNextNext and wordNextNext[0].isdigit(): - datestr += " " + wordNextNext - used += 1 - hasYear = True - else: - hasYear = False - - elif wordPrevPrev and wordPrevPrev[0].isdigit(): - # 13 dia mayo - datestr += " " + wordPrevPrev - - start -= 2 - used += 2 - if wordNext and word[0].isdigit(): - datestr += " " + wordNext - used += 1 - hasYear = True - else: - hasYear = False - - elif wordNextNext and wordNextNext[0].isdigit(): - # mayo dia 13 - datestr += " " + wordNextNext - used += 2 - if wordNextNextNext and wordNextNextNext[0].isdigit(): - datestr += " " + wordNextNextNext - used += 1 - hasYear = True - else: - hasYear = False - - if datestr in months: - datestr = "" - - # parse 5 days from tomorrow, 10 weeks from next thursday, - # 2 months from July - validFollowups = days + months + monthsShort - validFollowups.append("hoy") - validFollowups.append("mañana") - validFollowups.append("ayer") - validFollowups.append("anteayer") - validFollowups.append("ahora") - validFollowups.append("ya") - validFollowups.append("ante") - - # TODO debug word "depois" that one is failing for some reason - if word in froms and wordNext in validFollowups: - - if not (wordNext == "mañana" and wordNext == "ayer") and not ( - word == "pasado" or word == "antes"): - used = 2 - fromFlag = True - if wordNext == "mañana" and word != "pasado": - dayOffset += 1 - elif wordNext == "ayer": - dayOffset -= 1 - elif wordNext == "anteayer": - dayOffset -= 2 - elif wordNext == "ante" and wordNextNext == "ayer": - dayOffset -= 2 - elif (wordNext == "ante" and wordNext == "ante" and - wordNextNextNext == "ayer"): - dayOffset -= 3 - elif wordNext in days: - d = days.index(wordNext) - tmpOffset = (d + 1) - int(today) - used = 2 - # if wordNextNext == "feira": - # used += 1 - if tmpOffset < 0: - tmpOffset += 7 - if wordNextNext: - if wordNextNext in nxts: - tmpOffset += 7 - used += 1 - elif wordNextNext in prevs: - tmpOffset -= 7 - used += 1 - dayOffset += tmpOffset - elif wordNextNext and wordNextNext in days: - d = days.index(wordNextNext) - tmpOffset = (d + 1) - int(today) - used = 3 - if wordNextNextNext: - if wordNextNextNext in nxts: - tmpOffset += 7 - used += 1 - elif wordNextNextNext in prevs: - tmpOffset -= 7 - used += 1 - dayOffset += tmpOffset - # if wordNextNextNext == "feira": - # used += 1 - if wordNext in months: - used -= 1 - if used > 0: - - if start - 1 > 0 and words[start - 1] in lists: - start -= 1 - used += 1 - - for i in range(0, used): - words[i + start] = "" - - if start - 1 >= 0 and words[start - 1] in lists: - words[start - 1] = "" - found = True - daySpecified = True - - # parse time - hrOffset = 0 - minOffset = 0 - secOffset = 0 - hrAbs = None - minAbs = None - - for idx, word in enumerate(words): - if word == "": - continue - - wordPrevPrev = words[idx - 2] if idx > 1 else "" - wordPrev = words[idx - 1] if idx > 0 else "" - wordNext = words[idx + 1] if idx + 1 < len(words) else "" - wordNextNext = words[idx + 2] if idx + 2 < len(words) else "" - wordNextNextNext = words[idx + 3] if idx + 3 < len(words) else "" - # parse noon, midnight, morning, afternoon, evening - used = 0 - if word == "medio" and wordNext == "día": - hrAbs = 12 - used += 2 - elif word == "media" and wordNext == "noche": - hrAbs = 0 - used += 2 - elif word == "mañana": - if not hrAbs: - hrAbs = 8 - used += 1 - elif word == "tarde": - if not hrAbs: - hrAbs = 15 - used += 1 - elif word == "media" and wordNext == "tarde": - if not hrAbs: - hrAbs = 17 - used += 2 - elif word == "tarde" and wordNext == "noche": - if not hrAbs: - hrAbs = 20 - used += 2 - elif word == "media" and wordNext == "mañana": - if not hrAbs: - hrAbs = 10 - used += 2 - # elif word == "fim" and wordNext == "tarde": - # if not hrAbs: - # hrAbs = 19 - # used += 2 - # elif word == "fim" and wordNext == "manha": - # if not hrAbs: - # hrAbs = 11 - # used += 2 - elif word == "madrugada": - if not hrAbs: - hrAbs = 1 - used += 2 - elif word == "noche": - if not hrAbs: - hrAbs = 21 - used += 1 - # parse half an hour, quarter hour - elif word == "hora" and \ - (wordPrev in time_indicators or wordPrevPrev in - time_indicators): - if wordPrev == "media": - minOffset = 30 - elif wordPrev == "cuarto": - minOffset = 15 - elif wordPrevPrev == "cuarto": - minOffset = 15 - if idx > 2 and words[idx - 3] in time_indicators: - words[idx - 3] = "" - words[idx - 2] = "" - else: - hrOffset = 1 - if wordPrevPrev in time_indicators: - words[idx - 2] = "" - words[idx - 1] = "" - used += 1 - hrAbs = -1 - minAbs = -1 - # parse 5:00 am, 12:00 p.m., etc - elif word[0].isdigit(): - isTime = True - strHH = "" - strMM = "" - remainder = "" - if ':' in word: - # parse colons - # "3:00 in the morning" - stage = 0 - length = len(word) - for i in range(length): - if stage == 0: - if word[i].isdigit(): - strHH += word[i] - elif word[i] == ":": - stage = 1 - else: - stage = 2 - i -= 1 - elif stage == 1: - if word[i].isdigit(): - strMM += word[i] - else: - stage = 2 - i -= 1 - elif stage == 2: - remainder = word[i:].replace(".", "") - break - if remainder == "": - nextWord = wordNext.replace(".", "") - if nextWord == "am" or nextWord == "pm": - remainder = nextWord - used += 1 - elif wordNext == "mañana" or wordNext == "madrugada": - remainder = "am" - used += 1 - elif wordNext == "tarde": - remainder = "pm" - used += 1 - elif wordNext == "noche": - if 0 < int(word[0]) < 6: - remainder = "am" - else: - remainder = "pm" - used += 1 - elif wordNext in thises and wordNextNext == "mañana": - remainder = "am" - used = 2 - elif wordNext in thises and wordNextNext == "tarde": - remainder = "pm" - used = 2 - elif wordNext in thises and wordNextNext == "noche": - remainder = "pm" - used = 2 - else: - if timeQualifier != "": - if strHH <= 12 and \ - (timeQualifier == "mañana" or - timeQualifier == "tarde"): - strHH += 12 - - else: - # try to parse # s without colons - # 5 hours, 10 minutes etc. - length = len(word) - strNum = "" - remainder = "" - for i in range(length): - if word[i].isdigit(): - strNum += word[i] - else: - remainder += word[i] - - if remainder == "": - remainder = wordNext.replace(".", "").lstrip().rstrip() - - if ( - remainder == "pm" or - wordNext == "pm" or - remainder == "p.m." or - wordNext == "p.m."): - strHH = strNum - remainder = "pm" - used = 1 - elif ( - remainder == "am" or - wordNext == "am" or - remainder == "a.m." or - wordNext == "a.m."): - strHH = strNum - remainder = "am" - used = 1 - else: - if (wordNext == "pm" or - wordNext == "p.m." or - wordNext == "tarde"): - strHH = strNum - remainder = "pm" - used = 1 - elif (wordNext == "am" or - wordNext == "a.m." or - wordNext == "mañana"): - strHH = strNum - remainder = "am" - used = 1 - elif (int(word) > 100 and - ( - # wordPrev == "o" or - # wordPrev == "oh" or - wordPrev == "cero" - )): - # 0800 hours (pronounced oh-eight-hundred) - strHH = int(word) / 100 - strMM = int(word) - strHH * 100 - if wordNext == "hora": - used += 1 - elif ( - wordNext == "hora" and - word[0] != '0' and - ( - int(word) < 100 and - int(word) > 2400 - )): - # ignores military time - # "in 3 hours" - hrOffset = int(word) - used = 2 - isTime = False - hrAbs = -1 - minAbs = -1 - - elif wordNext == "minuto": - # "in 10 minutes" - minOffset = int(word) - used = 2 - isTime = False - hrAbs = -1 - minAbs = -1 - elif wordNext == "segundo": - # in 5 seconds - secOffset = int(word) - used = 2 - isTime = False - hrAbs = -1 - minAbs = -1 - elif int(word) > 100: - strHH = int(word) / 100 - strMM = int(word) - strHH * 100 - if wordNext == "hora": - used += 1 - - elif wordNext == "" or ( - wordNext == "en" and wordNextNext == "punto"): - strHH = word - strMM = 00 - if wordNext == "en" and wordNextNext == "punto": - used += 2 - if wordNextNextNext == "tarde": - remainder = "pm" - used += 1 - elif wordNextNextNext == "mañana": - remainder = "am" - used += 1 - elif wordNextNextNext == "noche": - if 0 > strHH > 6: - remainder = "am" - else: - remainder = "pm" - used += 1 - - elif wordNext[0].isdigit(): - strHH = word - strMM = wordNext - used += 1 - if wordNextNext == "hora": - used += 1 - else: - isTime = False - - strHH = int(strHH) if strHH else 0 - strMM = int(strMM) if strMM else 0 - strHH = strHH + 12 if (remainder == "pm" and - 0 < strHH < 12) else strHH - strHH = strHH - 12 if (remainder == "am" and - 0 < strHH >= 12) else strHH - if strHH > 24 or strMM > 59: - isTime = False - used = 0 - if isTime: - hrAbs = strHH * 1 - minAbs = strMM * 1 - used += 1 - - if used > 0: - # removed parsed words from the sentence - for i in range(used): - words[idx + i] = "" - - if wordPrev == "en" or wordPrev == "punto": - words[words.index(wordPrev)] = "" - - if idx > 0 and wordPrev in time_indicators: - words[idx - 1] = "" - if idx > 1 and wordPrevPrev in time_indicators: - words[idx - 2] = "" - - idx += used - 1 - found = True - - # check that we found a date - if not date_found: - return None - - if dayOffset is False: - dayOffset = 0 - - # perform date manipulation - - extractedDate = dateNow - extractedDate = extractedDate.replace(microsecond=0, - second=0, - minute=0, - hour=0) - if datestr != "": - en_months = ['january', 'february', 'march', 'april', 'may', 'june', - 'july', 'august', 'september', 'october', 'november', - 'december'] - en_monthsShort = ['jan', 'feb', 'mar', 'apr', 'may', 'june', 'july', - 'aug', - 'sept', 'oct', 'nov', 'dec'] - for idx, en_month in enumerate(en_months): - datestr = datestr.replace(months[idx], en_month) - for idx, en_month in enumerate(en_monthsShort): - datestr = datestr.replace(monthsShort[idx], en_month) - - temp = datetime.strptime(datestr, "%B %d") - if not hasYear: - temp = temp.replace(year=extractedDate.year) - if extractedDate < temp: - extractedDate = extractedDate.replace(year=int(currentYear), - month=int( - temp.strftime( - "%m")), - day=int(temp.strftime( - "%d"))) - else: - extractedDate = extractedDate.replace( - year=int(currentYear) + 1, - month=int(temp.strftime("%m")), - day=int(temp.strftime("%d"))) - else: - extractedDate = extractedDate.replace( - year=int(temp.strftime("%Y")), - month=int(temp.strftime("%m")), - day=int(temp.strftime("%d"))) - - if yearOffset != 0: - extractedDate = extractedDate + relativedelta(years=yearOffset) - if monthOffset != 0: - extractedDate = extractedDate + relativedelta(months=monthOffset) - if dayOffset != 0: - extractedDate = extractedDate + relativedelta(days=dayOffset) - - if hrAbs is None and minAbs is None and default_time: - hrAbs = default_time.hour - minAbs = default_time.minute - - if hrAbs != -1 and minAbs != -1: - extractedDate = extractedDate + relativedelta(hours=hrAbs or 0, - minutes=minAbs or 0) - if (hrAbs or minAbs) and datestr == "": - if not daySpecified and dateNow > extractedDate: - extractedDate = extractedDate + relativedelta(days=1) - if hrOffset != 0: - extractedDate = extractedDate + relativedelta(hours=hrOffset) - if minOffset != 0: - extractedDate = extractedDate + relativedelta(minutes=minOffset) - if secOffset != 0: - extractedDate = extractedDate + relativedelta(seconds=secOffset) - - resultStr = " ".join(words) - resultStr = ' '.join(resultStr.split()) - # resultStr = pt_pruning(resultStr) - return [extractedDate, resultStr] - - -def get_gender_es(word, raw_string=""): - # Next rules are imprecise and incompleted, but is a good starting point. - # For more detailed explanation, see - # http://www.wikilengua.org/index.php/Género_gramatical - word = word.rstrip("s") - gender = False - words = raw_string.split(" ") - for idx, w in enumerate(words): - if w == word and idx != 0: - previous = words[idx - 1] - gender = get_gender_es(previous) - break - if not gender: - if word[-1] == "a": - gender = "f" - if word[-1] == "o" or word[-1] == "e": - gender = "m" - return gender +from lingua_franca.lang.parse_es import * diff --git a/mycroft/util/lang/parse_fr.py b/mycroft/util/lang/parse_fr.py index 2f11850b76..271d9e7560 100644 --- a/mycroft/util/lang/parse_fr.py +++ b/mycroft/util/lang/parse_fr.py @@ -13,1072 +13,8 @@ # See the License for the specific language governing permissions and # limitations under the License. # -""" Parse functions for french (fr) +"""File kept for backwards compatibility. - Todo: - * extractnumber_fr: ordinal numbers ("cinquième") - * extractnumber_fr: numbers greater than 999 999 ("cinq millions") - * extract_datetime_fr: "quatrième lundi de janvier" - * get_gender_fr +TODO: Remove in 20.02 """ - -from datetime import datetime -from dateutil.relativedelta import relativedelta -from mycroft.util.lang.parse_common import is_numeric, look_for_fractions, \ - extract_numbers_generic -from mycroft.util.lang.format_fr import pronounce_number_fr - -# Undefined articles ["un", "une"] cannot be supressed, -# in French, "un cheval" means "a horse" or "one horse". -articles_fr = ["le", "la", "du", "de", "les", "des"] - -numbers_fr = { - "zéro": 0, - "un": 1, - "une": 1, - "deux": 2, - "trois": 3, - "quatre": 4, - "cinq": 5, - "six": 6, - "sept": 7, - "huit": 8, - "neuf": 9, - "dix": 10, - "onze": 11, - "douze": 12, - "treize": 13, - "quatorze": 14, - "quinze": 15, - "seize": 16, - "vingt": 20, - "trente": 30, - "quarante": 40, - "cinquante": 50, - "soixante": 60, - "soixante-dix": 70, - "septante": 70, - "quatre-vingt": 80, - "quatre-vingts": 80, - "octante": 80, - "huitante": 80, - "quatre-vingt-dix": 90, - "nonante": 90, - "cent": 100, - "cents": 100, - "mille": 1000, - "mil": 1000, - "millier": 1000, - "milliers": 1000, - "million": 1000000, - "millions": 1000000, - "milliard": 1000000000, - "milliards": 1000000000} - -ordinals_fr = ("er", "re", "ère", "nd", "nde" "ième", "ème", "e") - - -def number_parse_fr(words, i): - """ Parses a list of words to find a number - Takes in a list of words (strings without whitespace) and - extracts a number that starts at the given index. - Args: - words (array): the list to extract a number from - i (int): the index in words where to look for the number - Returns: - tuple with number, index of next word after the number. - - Returns None if no number was found. - """ - - def cte_fr(i, s): - # Check if string s is equal to words[i]. - # If it is return tuple with s, index of next word. - # If it is not return None. - if i < len(words) and s == words[i]: - return s, i + 1 - return None - - def number_word_fr(i, mi, ma): - # Check if words[i] is a number in numbers_fr between mi and ma. - # If it is return tuple with number, index of next word. - # If it is not return None. - if i < len(words): - val = numbers_fr.get(words[i]) - # Numbers [1-16,20,30,40,50,60,70,80,90,100,1000] - if val is not None: - if val >= mi and val <= ma: - return val, i + 1 - else: - return None - # The number may be hyphenated (numbers [17-999]) - splitWord = words[i].split('-') - if len(splitWord) > 1: - val1 = numbers_fr.get(splitWord[0]) - if val1: - i1 = 0 - val2 = 0 - val3 = 0 - if val1 < 10 and splitWord[1] == "cents": - val1 = val1 * 100 - i1 = 2 - - # For [81-99], e.g. "quatre-vingt-deux" - if len(splitWord) > i1 and splitWord[0] == "quatre" and \ - splitWord[1] == "vingt": - val1 = 80 - i1 += 2 - - # We still found a number - if i1 == 0: - i1 = 1 - - if len(splitWord) > i1: - # For [21,31,41,51,61,71] - if len(splitWord) > i1 + 1 and splitWord[i1] == "et": - val2 = numbers_fr.get(splitWord[i1 + 1]) - if val2 is not None: - i1 += 2 - # For [77-79],[97-99] e.g. "soixante-dix-sept" - elif splitWord[i1] == "dix" and \ - len(splitWord) > i1 + 1: - val2 = numbers_fr.get(splitWord[i1 + 1]) - if val2 is not None: - val2 += 10 - i1 += 2 - else: - val2 = numbers_fr.get(splitWord[i1]) - if val2 is not None: - i1 += 1 - if len(splitWord) > i1: - val3 = numbers_fr.get(splitWord[i1]) - if val3 is not None: - i1 += 1 - - if val2: - if val3: - val = val1 + val2 + val3 - else: - val = val1 + val2 - else: - return None - if i1 == len(splitWord) and val and ma >= val >= mi: - return val, i + 1 - - return None - - def number_1_99_fr(i): - # Check if words[i] is a number between 1 and 99. - # If it is return tuple with number, index of next word. - # If it is not return None. - - # Is it a number between 1 and 16? - result1 = number_word_fr(i, 1, 16) - if result1: - return result1 - - # Is it a number between 10 and 99? - result1 = number_word_fr(i, 10, 99) - if result1: - val1, i1 = result1 - result2 = cte_fr(i1, "et") - # If the number is not hyphenated [21,31,41,51,61,71] - if result2: - i2 = result2[1] - result3 = number_word_fr(i2, 1, 11) - if result3: - val3, i3 = result3 - return val1 + val3, i3 - return result1 - - # It is not a number - return None - - def number_1_999_fr(i): - # Check if words[i] is a number between 1 and 999. - # If it is return tuple with number, index of next word. - # If it is not return None. - - # Is it 100 ? - result = number_word_fr(i, 100, 100) - - # Is it [200,300,400,500,600,700,800,900]? - if not result: - resultH1 = number_word_fr(i, 2, 9) - if resultH1: - valH1, iH1 = resultH1 - resultH2 = number_word_fr(iH1, 100, 100) - if resultH2: - iH2 = resultH2[1] - result = valH1 * 100, iH2 - - if result: - val1, i1 = result - result2 = number_1_99_fr(i1) - if result2: - val2, i2 = result2 - return val1 + val2, i2 - else: - return result - - # Is it hyphenated? [101-999] - result = number_word_fr(i, 101, 999) - if result: - return result - - # [1-99] - result = number_1_99_fr(i) - if result: - return result - - return None - - def number_1_999999_fr(i): - """ Find a number in a list of words - Checks if words[i] is a number between 1 and 999,999. - - Args: - i (int): the index in words where to look for the number - Returns: - tuple with number, index of next word after the number. - - Returns None if no number was found. - """ - - # check for zero - result1 = number_word_fr(i, 0, 0) - if result1: - return result1 - - # check for [1-999] - result1 = number_1_999_fr(i) - if result1: - val1, i1 = result1 - else: - val1 = 1 - i1 = i - # check for 1000 - result2 = number_word_fr(i1, 1000, 1000) - if result2: - # it's [1000-999000] - i2 = result2[1] - # check again for [1-999] - result3 = number_1_999_fr(i2) - if result3: - val3, i3 = result3 - return val1 * 1000 + val3, i3 - else: - return val1 * 1000, i2 - elif result1: - return result1 - return None - - return number_1_999999_fr(i) - - -def getOrdinal_fr(word): - """ Get the ordinal number - Takes in a word (string without whitespace) and - extracts the ordinal number. - Args: - word (string): the word to extract the number from - Returns: - number (int) - - Returns None if no ordinal number was found. - """ - if word: - for ordinal in ordinals_fr: - if word[0].isdigit() and ordinal in word: - result = word.replace(ordinal, "") - if result.isdigit(): - return int(result) - - return None - - -def number_ordinal_fr(words, i): - """ Find an ordinal number in a list of words - Takes in a list of words (strings without whitespace) and - extracts an ordinal number that starts at the given index. - Args: - words (array): the list to extract a number from - i (int): the index in words where to look for the ordinal number - Returns: - tuple with ordinal number (str), - index of next word after the number (int). - - Returns None if no ordinal number was found. - """ - val1 = None - strOrd = "" - # it's already a digit, normalize to "1er" or "5e" - val1 = getOrdinal_fr(words[i]) - if val1 is not None: - if val1 == 1: - strOrd = "1er" - else: - strOrd = str(val1) + "e" - return strOrd, i + 1 - - # if it's a big number the beginning should be detected as a number - result = number_parse_fr(words, i) - if result: - val1, i = result - else: - val1 = 0 - - if i < len(words): - word = words[i] - if word in ["premier", "première"]: - strOrd = "1er" - elif word == "second": - strOrd = "2e" - elif word.endswith("ième"): - val2 = None - word = word[:-4] - # centième - if word == "cent": - if val1: - strOrd = str(val1 * 100) + "e" - else: - strOrd = "100e" - # millième - elif word == "mill": - if val1: - strOrd = str(val1 * 1000) + "e" - else: - strOrd = "1000e" - else: - # "cinquième", "trente-cinquième" - if word.endswith("cinqu"): - word = word[:-1] - # "neuvième", "dix-neuvième" - elif word.endswith("neuv"): - word = word[:-1] + "f" - result = number_parse_fr([word], 0) - if not result: - # "trentième", "douzième" - word = word + "e" - result = number_parse_fr([word], 0) - if result: - val2, i = result - if val2 is not None: - strOrd = str(val1 + val2) + "e" - if strOrd: - return strOrd, i + 1 - - return None - - -def extractnumber_fr(text): - """Takes in a string and extracts a number. - Args: - text (str): the string to extract a number from - Returns: - (str): The number extracted or the original text. - """ - # normalize text, keep articles for ordinals versus fractionals - text = normalize_fr(text, False) - # split words by whitespace - aWords = text.split() - count = 0 - result = None - add = False - while count < len(aWords): - val = None - word = aWords[count] - wordNext = "" - wordPrev = "" - if count < (len(aWords) - 1): - wordNext = aWords[count + 1] - if count > 0: - wordPrev = aWords[count - 1] - - if word in articles_fr: - count += 1 - continue - if word in ["et", "plus", "+"]: - count += 1 - add = True - continue - - # is current word a numeric number? - if word.isdigit(): - val = int(word) - count += 1 - elif is_numeric(word): - val = float(word) - count += 1 - elif wordPrev in articles_fr and getOrdinal_fr(word): - val = getOrdinal_fr(word) - count += 1 - # is current word the denominator of a fraction? - elif isFractional_fr(word): - val = isFractional_fr(word) - count += 1 - - # is current word the numerator of a fraction? - if val and wordNext: - valNext = isFractional_fr(wordNext) - if valNext: - val = float(val) * valNext - count += 1 - - if not val: - count += 1 - # is current word a numeric fraction like "2/3"? - aPieces = word.split('/') - # if (len(aPieces) == 2 and is_numeric(aPieces[0]) - # and is_numeric(aPieces[1])): - if look_for_fractions(aPieces): - val = float(aPieces[0]) / float(aPieces[1]) - - # is current word followed by a decimal value? - if wordNext == "virgule": - zeros = 0 - newWords = aWords[count + 1:] - # count the number of zeros after the decimal sign - for word in newWords: - if word == "zéro" or word == "0": - zeros += 1 - else: - break - afterDotVal = None - # extract the number after the zeros - if newWords[zeros].isdigit(): - afterDotVal = newWords[zeros] - countDot = count + zeros + 2 - # if a number was extracted (since comma is also a - # punctuation sign) - if afterDotVal: - count = countDot - if not val: - val = 0 - # add the zeros - afterDotString = zeros * "0" + afterDotVal - val = float(str(val) + "." + afterDotString) - if val: - if add: - result += val - add = False - else: - result = val - - # if result == False: - if not result: - return normalize_fr(text, True) - - return result - - -def extract_datetime_fr(string, currentDate, default_time): - def clean_string(s): - """ - cleans the input string of unneeded punctuation and capitalization - among other things. - """ - s = normalize_fr(s, True) - wordList = s.split() - for idx, word in enumerate(wordList): - # remove comma and dot if it's not a number - if word[-1] in [",", "."]: - word = word[:-1] - wordList[idx] = word - - return wordList - - def date_found(): - return found or \ - ( - datestr != "" or - yearOffset != 0 or monthOffset != 0 or dayOffset or - (isTime and (hrAbs or minAbs)) or - hrOffset != 0 or minOffset != 0 or secOffset != 0 - ) - - if string == "" or not currentDate: - return None - - found = False - daySpecified = False - dayOffset = False - monthOffset = 0 - yearOffset = 0 - dateNow = currentDate - today = dateNow.strftime("%w") - currentYear = dateNow.strftime("%Y") - fromFlag = False - datestr = "" - hasYear = False - timeQualifier = "" - - timeQualifiersList = ["matin", "après-midi", "soir", "nuit"] - words_in = ["dans", "après"] - markers = ["à", "dès", "autour", "vers", "environs", "ce", - "cette"] + words_in - days = ["lundi", "mardi", "mercredi", - "jeudi", "vendredi", "samedi", "dimanche"] - months = ["janvier", "février", "mars", "avril", "mai", "juin", - "juillet", "août", "septembre", "octobre", "novembre", - "décembre"] - monthsShort = ["jan", "fév", "mar", "avr", "mai", "juin", "juil", "aoû", - "sept", "oct", "nov", "déc"] - # needed for format functions - months_en = ['january', 'february', 'march', 'april', 'may', 'june', - 'july', 'august', 'september', 'october', 'november', - 'december'] - - words = clean_string(string) - - for idx, word in enumerate(words): - if word == "": - continue - wordPrevPrevPrev = words[idx - 3] if idx > 2 else "" - wordPrevPrev = words[idx - 2] if idx > 1 else "" - wordPrev = words[idx - 1] if idx > 0 else "" - wordNext = words[idx + 1] if idx + 1 < len(words) else "" - wordNextNext = words[idx + 2] if idx + 2 < len(words) else "" - - start = idx - used = 0 - # save timequalifier for later - if word in timeQualifiersList: - timeQualifier = word - used = 1 - if wordPrev in ["ce", "cet", "cette"]: - used = 2 - start -= 1 - # parse aujourd'hui, demain, après-demain - elif word == "aujourd'hui" and not fromFlag: - dayOffset = 0 - used += 1 - elif word == "demain" and not fromFlag: - dayOffset = 1 - used += 1 - elif word == "après-demain" and not fromFlag: - dayOffset = 2 - used += 1 - # parse 5 jours, 10 semaines, semaine dernière, semaine prochaine - elif word in ["jour", "jours"]: - if wordPrev.isdigit(): - dayOffset += int(wordPrev) - start -= 1 - used = 2 - # "3e jour" - elif getOrdinal_fr(wordPrev) is not None: - dayOffset += getOrdinal_fr(wordPrev) - 1 - start -= 1 - used = 2 - elif word in ["semaine", "semaines"] and not fromFlag: - if wordPrev[0].isdigit(): - dayOffset += int(wordPrev) * 7 - start -= 1 - used = 2 - elif wordNext in ["prochaine", "suivante"]: - dayOffset = 7 - used = 2 - elif wordNext in ["dernière", "précédente"]: - dayOffset = -7 - used = 2 - # parse 10 mois, mois prochain, mois dernier - elif word == "mois" and not fromFlag: - if wordPrev[0].isdigit(): - monthOffset = int(wordPrev) - start -= 1 - used = 2 - elif wordNext in ["prochain", "suivant"]: - monthOffset = 1 - used = 2 - elif wordNext in ["dernier", "précédent"]: - monthOffset = -1 - used = 2 - # parse 5 ans, an prochain, année dernière - elif word in ["an", "ans", "année", "années"] and not fromFlag: - if wordPrev[0].isdigit(): - yearOffset = int(wordPrev) - start -= 1 - used = 2 - elif wordNext in ["prochain", "prochaine", "suivant", "suivante"]: - yearOffset = 1 - used = 2 - elif wordNext in ["dernier", "dernière", "précédent", - "précédente"]: - yearOffset = -1 - used = 2 - # parse lundi, mardi etc., and lundi prochain, mardi dernier, etc. - elif word in days and not fromFlag: - d = days.index(word) - dayOffset = (d + 1) - int(today) - used = 1 - if dayOffset < 0: - dayOffset += 7 - if wordNext in ["prochain", "suivant"]: - dayOffset += 7 - used += 1 - elif wordNext in ["dernier", "précédent"]: - dayOffset -= 7 - used += 1 - # parse 15 juillet, 15 juil - elif word in months or word in monthsShort and not fromFlag: - try: - m = months.index(word) - except ValueError: - m = monthsShort.index(word) - used += 1 - datestr = months_en[m] - if wordPrev and (wordPrev[0].isdigit()): - datestr += " " + wordPrev - start -= 1 - used += 1 - else: - datestr += " 1" - if wordNext and wordNext[0].isdigit(): - datestr += " " + wordNext - used += 1 - hasYear = True - else: - hasYear = False - # parse 5 jours après demain, 10 semaines après jeudi prochain, - # 2 mois après juillet - validFollowups = days + months + monthsShort - validFollowups.append("aujourd'hui") - validFollowups.append("demain") - validFollowups.append("prochain") - validFollowups.append("prochaine") - validFollowups.append("suivant") - validFollowups.append("suivante") - validFollowups.append("dernier") - validFollowups.append("dernière") - validFollowups.append("précédent") - validFollowups.append("précédente") - validFollowups.append("maintenant") - if word in ["après", "depuis"] and wordNext in validFollowups: - used = 2 - fromFlag = True - if wordNext == "demain": - dayOffset += 1 - elif wordNext in days: - d = days.index(wordNext) - tmpOffset = (d + 1) - int(today) - used = 2 - if wordNextNext == "prochain": - tmpOffset += 7 - used += 1 - elif wordNextNext == "dernier": - tmpOffset -= 7 - used += 1 - elif tmpOffset < 0: - tmpOffset += 7 - dayOffset += tmpOffset - if used > 0: - if start - 1 > 0 and words[start - 1] in ["ce", "cette"]: - start -= 1 - used += 1 - - for i in range(0, used): - words[i + start] = "" - - if start - 1 >= 0 and words[start - 1] in markers: - words[start - 1] = "" - found = True - daySpecified = True - - # parse time - hrOffset = 0 - minOffset = 0 - secOffset = 0 - hrAbs = None - minAbs = None - ampm = "" - isTime = False - - for idx, word in enumerate(words): - if word == "": - continue - - wordPrevPrev = words[idx - 2] if idx > 1 else "" - wordPrev = words[idx - 1] if idx > 0 else "" - wordNext = words[idx + 1] if idx + 1 < len(words) else "" - wordNextNext = words[idx + 2] if idx + 2 < len(words) else "" - used = 0 - start = idx - - # parse midi et quart, minuit et demi, midi 10, minuit moins 20 - if word in ["midi", "minuit"]: - isTime = True - if word == "midi": - hrAbs = 12 - used += 1 - elif word == "minuit": - hrAbs = 0 - used += 1 - if wordNext.isdigit(): - minAbs = int(wordNext) - used += 1 - elif wordNext == "et": - if wordNextNext == "quart": - minAbs = 15 - used += 2 - elif wordNextNext == "demi": - minAbs = 30 - used += 2 - elif wordNext == "moins": - if wordNextNext.isdigit(): - minAbs = 60 - int(wordNextNext) - if not hrAbs: - hrAbs = 23 - else: - hrAbs -= 1 - used += 2 - if wordNextNext == "quart": - minAbs = 45 - if not hrAbs: - hrAbs = 23 - else: - hrAbs -= 1 - used += 2 - # parse une demi-heure, un quart d'heure - elif word == "demi-heure" or word == "heure" and \ - (wordPrevPrev in markers or wordPrevPrevPrev in markers): - used = 1 - isTime = True - if word == "demi-heure": - minOffset = 30 - elif wordPrev == "quart": - minOffset = 15 - used += 1 - start -= 1 - elif wordPrev == "quarts" and wordPrevPrev.isdigit(): - minOffset = int(wordPrevPrev) * 15 - used += 1 - start -= 1 - if wordPrev.isdigit() or wordPrevPrev.isdigit(): - start -= 1 - used += 1 - # parse 5:00 du matin, 12:00, etc - elif word[0].isdigit() and getOrdinal_fr(word) is None: - isTime = True - if ":" in word or "h" in word or "min" in word: - # parse hours on short format - # "3:00 du matin", "4h14", "3h15min" - strHH = "" - strMM = "" - stage = 0 - length = len(word) - for i in range(length): - if stage == 0: - if word[i].isdigit(): - strHH += word[i] - used = 1 - elif word[i] in [":", "h", "m"]: - stage = 1 - else: - stage = 2 - i -= 1 - elif stage == 1: - if word[i].isdigit(): - strMM += word[i] - used = 1 - else: - stage = 2 - if word[i:i + 3] == "min": - i += 1 - elif stage == 2: - break - if wordPrev in words_in: - hrOffset = int(strHH) if strHH else 0 - minOffset = int(strMM) if strMM else 0 - else: - hrAbs = int(strHH) if strHH else 0 - minAbs = int(strMM) if strMM else 0 - else: - # try to parse time without colons - # 5 hours, 10 minutes etc. - length = len(word) - ampm = "" - if ( - word.isdigit() and - wordNext in ["heures", "heure"] and word != "0" and - ( - int(word) < 100 or - int(word) > 2400 - )): - # "dans 3 heures", "à 3 heures" - if wordPrev in words_in: - hrOffset = int(word) - else: - hrAbs = int(word) - used = 2 - idxHr = idx + 2 - # "dans 1 heure 40", "à 1 heure 40" - if idxHr < len(words): - # "3 heures 45" - if words[idxHr].isdigit(): - if wordPrev in words_in: - minOffset = int(words[idxHr]) - else: - minAbs = int(words[idxHr]) - used += 1 - idxHr += 1 - # "3 heures et quart", "4 heures et demi" - elif words[idxHr] == "et" and idxHr + 1 < len(words): - if words[idxHr + 1] == "quart": - if wordPrev in words_in: - minOffset = 15 - else: - minAbs = 15 - used += 2 - idxHr += 2 - elif words[idxHr + 1] == "demi": - if wordPrev in words_in: - minOffset = 30 - else: - minAbs = 30 - used += 2 - idxHr += 2 - # "5 heures moins 20", "6 heures moins le quart" - elif words[idxHr] == "moins" and \ - idxHr + 1 < len(words): - if words[idxHr + 1].isdigit(): - if wordPrev in words_in: - hrOffset -= 1 - minOffset = 60 - int(words[idxHr + 1]) - else: - hrAbs = hrAbs - 1 - minAbs = 60 - int(words[idxHr + 1]) - used += 2 - idxHr += 2 - elif words[idxHr + 1] == "quart": - if wordPrev in words_in: - hrOffset -= 1 - minOffset = 45 - else: - hrAbs = hrAbs - 1 - minAbs = 45 - used += 2 - idxHr += 2 - # remove word minutes if present - if idxHr < len(words) and \ - words[idxHr] in ["minutes", "minute"]: - used += 1 - idxHr += 1 - elif wordNext == "minutes": - # "dans 10 minutes" - if wordPrev in words_in: - minOffset = int(word) - else: - minAbs = int(word) - used = 2 - elif wordNext == "secondes": - # "dans 5 secondes" - secOffset = int(word) - used = 2 - elif int(word) > 100: - # format militaire - hrAbs = int(word) / 100 - minAbs = int(word) - hrAbs * 100 - used = 1 - if wordNext == "heures": - used += 1 - - # handle am/pm - if timeQualifier: - if timeQualifier == "matin": - ampm = "am" - elif timeQualifier == "après-midi": - ampm = "pm" - elif timeQualifier == "soir": - ampm = "pm" - elif timeQualifier == "nuit": - if (hrAbs or 0) > 8: - ampm = "pm" - else: - ampm = "am" - hrAbs = ((hrAbs or 0) + 12 if ampm == "pm" and (hrAbs or 0) < 12 - else hrAbs) - hrAbs = ((hrAbs or 0) - 12 if ampm == "am" and (hrAbs or 0) >= 12 - else hrAbs) - if (hrAbs or 0) > 24 or ((minAbs or 0) > 59): - isTime = False - used = 0 - elif wordPrev in words_in: - isTime = False - else: - isTime = True - - elif not hrAbs and timeQualifier: - if timeQualifier == "matin": - hrAbs = 8 - elif timeQualifier == "après-midi": - hrAbs = 15 - elif timeQualifier == "soir": - hrAbs = 19 - elif timeQualifier == "nuit": - hrAbs = 2 - isTime = True - - if used > 0: - # removed parsed words from the sentence - for i in range(0, used): - words[i + start] = "" - - if start - 1 >= 0 and words[start - 1] in markers: - words[start - 1] = "" - - idx += used - 1 - found = True - - # check that we found a date - if not date_found(): - return None - - if dayOffset is False: - dayOffset = 0 - - # perform date manipulation - extractedDate = dateNow - extractedDate = extractedDate.replace(microsecond=0, - second=0, - minute=0, - hour=0) - if datestr != "": - if not hasYear: - temp = datetime.strptime(datestr, "%B %d") - temp = temp.replace(year=extractedDate.year) - if extractedDate < temp: - extractedDate = extractedDate.replace(year=int(currentYear), - month=int( - temp.strftime( - "%m")), - day=int(temp.strftime( - "%d"))) - else: - extractedDate = extractedDate.replace( - year=int(currentYear) + 1, - month=int(temp.strftime("%m")), - day=int(temp.strftime("%d"))) - else: - temp = datetime.strptime(datestr, "%B %d %Y") - extractedDate = extractedDate.replace( - year=int(temp.strftime("%Y")), - month=int(temp.strftime("%m")), - day=int(temp.strftime("%d"))) - - if yearOffset != 0: - extractedDate = extractedDate + relativedelta(years=yearOffset) - if monthOffset != 0: - extractedDate = extractedDate + relativedelta(months=monthOffset) - if dayOffset != 0: - extractedDate = extractedDate + relativedelta(days=dayOffset) - - if hrAbs is None and minAbs is None and default_time: - hrAbs = default_time.hour - minAbs = default_time.minute - if hrAbs != -1 and minAbs != -1: - extractedDate = extractedDate + relativedelta(hours=hrAbs or 0, - minutes=minAbs or 0) - if (hrAbs or minAbs) and datestr == "": - if not daySpecified and dateNow > extractedDate: - extractedDate = extractedDate + relativedelta(days=1) - if hrOffset != 0: - extractedDate = extractedDate + relativedelta(hours=hrOffset) - if minOffset != 0: - extractedDate = extractedDate + relativedelta(minutes=minOffset) - if secOffset != 0: - extractedDate = extractedDate + relativedelta(seconds=secOffset) - for idx, word in enumerate(words): - if words[idx] == "et" and words[idx - 1] == "" and \ - words[idx + 1] == "": - words[idx] = "" - - resultStr = " ".join(words) - resultStr = ' '.join(resultStr.split()) - return [extractedDate, resultStr] - - -def isFractional_fr(input_str): - """ - This function takes the given text and checks if it is a fraction. - Args: - input_str (str): the string to check if fractional - Returns: - (bool) or (float): False if not a fraction, otherwise the fraction - """ - input_str = input_str.lower() - - if input_str != "tiers" and input_str.endswith('s', -1): - input_str = input_str[:len(input_str) - 1] # e.g. "quarts" - - aFrac = ["entier", "demi", "tiers", "quart", "cinquième", "sixième", - "septième", "huitième", "neuvième", "dixième", "onzième", - "douzième", "treizième", "quatorzième", "quinzième", "seizième", - "dix-septième", "dix-huitième", "dix-neuvième", "vingtième"] - - if input_str in aFrac: - return 1.0 / (aFrac.index(input_str) + 1) - if getOrdinal_fr(input_str): - return 1.0 / getOrdinal_fr(input_str) - if input_str == "trentième": - return 1.0 / 30 - if input_str == "centième": - return 1.0 / 100 - if input_str == "millième": - return 1.0 / 1000 - - return False - - -def normalize_fr(text, remove_articles): - """ French string normalization """ - text = text.lower() - words = text.split() # this also removed extra spaces - normalized = "" - i = 0 - while i < len(words): - # remove articles - if remove_articles and words[i] in articles_fr: - i += 1 - continue - if remove_articles and words[i][:2] in ["l'", "d'"]: - words[i] = words[i][2:] - # remove useless punctuation signs - if words[i] in ["?", "!", ";", "…"]: - i += 1 - continue - # Normalize ordinal numbers - if i > 0 and words[i - 1] in articles_fr: - result = number_ordinal_fr(words, i) - if result is not None: - val, i = result - normalized += " " + str(val) - continue - # Convert numbers into digits - result = number_parse_fr(words, i) - if result is not None: - val, i = result - normalized += " " + str(val) - continue - - normalized += " " + words[i] - i += 1 - - return normalized[1:] # strip the initial space - - -def extract_numbers_fr(text, short_scale=True, ordinals=False): - """ - Takes in a string and extracts a list of numbers. - - Args: - text (str): the string to extract a number from - short_scale (bool): Use "short scale" or "long scale" for large - numbers -- over a million. The default is short scale, which - is now common in most English speaking countries. - See https://en.wikipedia.org/wiki/Names_of_large_numbers - ordinals (bool): consider ordinal numbers, e.g. third=3 instead of 1/3 - Returns: - list: list of extracted numbers as floats - """ - return extract_numbers_generic(text, pronounce_number_fr, extractnumber_fr, - short_scale=short_scale, ordinals=ordinals) +from lingua_franca.lang.parse_fr import * diff --git a/mycroft/util/lang/parse_it.py b/mycroft/util/lang/parse_it.py index 7976978b4c..5ec7ae914f 100644 --- a/mycroft/util/lang/parse_it.py +++ b/mycroft/util/lang/parse_it.py @@ -13,1312 +13,8 @@ # See the License for the specific language governing permissions and # limitations under the License. # +"""File kept for backwards compatibility. + +TODO: Remove in 20.02 """ - Parse functions for Italian (IT-IT) - -""" - -import collections -from datetime import datetime -from dateutil.relativedelta import relativedelta -from mycroft.util.lang.parse_common import is_numeric, look_for_fractions, \ - extract_numbers_generic -from mycroft.util.lang.format_it import LONG_SCALE_IT, SHORT_SCALE_IT, \ - pronounce_number_it - -SHORT_ORDINAL_STRING_IT = { - 1: 'primo', - 2: 'secondo', - 3: 'terzo', - 4: 'quarto', - 5: 'quinto', - 6: 'sesto', - 7: 'settimo', - 8: 'ottavo', - 9: 'nono', - 10: 'decimo', - 11: 'undicesimo', - 12: 'dodicesimo', - 13: 'tredicesimo', - 14: 'quattordicesimo', - 15: 'quindicesimo', - 16: 'sedicesimo', - 17: 'diciassettesimo', - 18: 'diciottesimo', - 19: 'diciannovesimo', - 20: 'ventesimo', - 30: 'trentesimo', - 40: 'quarantesimo', - 50: 'cinquantesimo', - 60: 'sessantesimo', - 70: 'settantesimo', - 80: 'ottantesimo', - 90: 'novantesimo', - 1e2: 'centesimo', - 1e3: 'millesimo', - 1e6: 'milionesimo', - 1e9: 'miliardesimo', - 1e12: 'trilionesimo', - 1e15: 'quadrilionesimo', - 1e18: 'quintilionesim', - 1e21: 'sestilionesimo', - 1e24: 'settilionesimo', - 1e27: 'ottilionesimo', - 1e30: 'nonilionesimo', - 1e33: 'decilionesimo' - # TODO > 1e-33 -} - -# per i > 10e12 modificata solo la desinenza: da sistemare a fine debug -LONG_ORDINAL_STRING_IT = { - 1: 'primo', - 2: 'secondo', - 3: 'terzo', - 4: 'quarto', - 5: 'quinto', - 6: 'sesto', - 7: 'settimo', - 8: 'ottavo', - 9: 'nono', - 10: 'decimo', - 11: 'undicesimo', - 12: 'dodicesimo', - 13: 'tredicesimo', - 14: 'quattordicesimo', - 15: 'quindicesimo', - 16: 'sedicesimo', - 17: 'diciassettesimo', - 18: 'diciottesimo', - 19: 'diciannovesimo', - 20: 'ventesimo', - 30: 'trentesimo', - 40: 'quarantesimo', - 50: 'cinquantesimo', - 60: 'sessantesimo', - 70: 'settantesimo', - 80: 'ottantesimo', - 90: 'novantesimo', - 1e2: 'centesimo', - 1e3: 'millesimo', - 1e6: 'milionesimo', - 1e12: 'bilionesimo', - 1e18: 'trilionesimo', - 1e24: 'quadrilionesimo', - 1e30: 'quintilionesimo', - 1e36: 'sestilionesimo', - 1e42: 'settilionesimo', - 1e48: 'ottilionesimo', - 1e54: 'nonilionesimo', - 1e60: 'decilionesimo' - # TODO > 1e60 -} - -# Undefined articles ['un', 'una', 'un\''] can not be supressed, -# in Italian, 'un cavallo' means 'a horse' or 'one horse'. -ARTICLES_IT = ['il', 'lo', 'la', 'i', 'gli', 'le'] - -STRING_NUM_ITA = { - 'zero': 0, - 'un': 1, - 'uno': 1, - 'una': 1, - 'un\'': 1, - 'due': 2, - 'tre': 3, - 'quattro': 4, - 'cinque': 5, - 'sei': 6, - 'sette': 7, - 'otto': 8, - 'nove': 9, - 'dieci': 10, - 'undici': 11, - 'dodici': 12, - 'tredici': 13, - 'quattordici': 14, - 'quindici': 15, - 'sedici': 16, - 'diciassette': 17, - 'diciotto': 18, - 'diciannove': 19, - 'venti': 20, - 'vent': 20, - 'trenta': 30, - 'trent': 30, - 'quaranta': 40, - 'quarant': 40, - 'cinquanta': 50, - 'cinquant': 50, - 'sessanta': 60, - 'sessant': 60, - 'settanta': 70, - 'settant': 70, - 'ottanta': 80, - 'ottant': 80, - 'novanta': 90, - 'novant': 90, - 'cento': 100, - 'duecento': 200, - 'trecento': 300, - 'quattrocento': 400, - 'cinquecento': 500, - 'seicento': 600, - 'settecento': 700, - 'ottocento': 800, - 'novecento': 900, - 'mille': 1000, - 'mila': 1000, - 'centomila': 100000, - 'milione': 1000000, - 'miliardo': 1000000000, - 'primo': 1, - 'secondo': 2, - 'mezzo': 0.5, - 'mezza': 0.5, - 'paio': 2, - 'decina': 10, - 'decine': 10, - 'dozzina': 12, - 'dozzine': 12, - 'centinaio': 100, - 'centinaia': 100, - 'migliaio': 1000, - 'migliaia': 1000 -} - - -def isFractional_it(input_str, short_scale=False): - """ - This function takes the given text and checks if it is a fraction. - Updated to italian from en version 18.8.9 - - Args: - input_str (str): the string to check if fractional - short_scale (bool): use short scale if True, long scale if False - Returns: - (bool) or (float): False if not a fraction, otherwise the fraction - - """ - input_str = input_str.lower() - if input_str.endswith('i', -1) and len(input_str) > 2: - input_str = input_str[:-1] + "o" # normalizza plurali - - fracts_it = {"intero": 1, "mezza": 2, "mezzo": 2} - - if short_scale: - for num in SHORT_ORDINAL_STRING_IT: - if num > 2: - fracts_it[SHORT_ORDINAL_STRING_IT[num]] = num - else: - for num in LONG_ORDINAL_STRING_IT: - if num > 2: - fracts_it[LONG_ORDINAL_STRING_IT[num]] = num - - if input_str in fracts_it: - return 1.0 / fracts_it[input_str] - return False - - -def extractnumber_long_it(word): - """ - This function converts a long textual number like - milleventisette -> 1027 diecimila -> 10041 in - integer value, covers from 0 to 999999999999999 - for now limited to 999_e21 but ready for 999_e63 - example: - milleventisette -> 1027 - diecimilaquarantuno-> 10041 - centottomiladuecentotredici -> 108213 - Args: - word (str): the word to convert in number - Returns: - (bool) or (int): The extracted number or False if no number - was found - """ - - units = {'zero': 0, 'uno': 1, 'due': 2, 'tre': 3, 'quattro': 4, - 'cinque': 5, 'sei': 6, 'sette': 7, 'otto': 8, 'nove': 9} - - tens = {'dieci': 10, 'venti': 20, 'trenta': 30, 'quaranta': 40, - 'cinquanta': 50, 'sessanta': 60, 'settanta': 70, 'ottanta': 80, - 'novanta': 90} - - tens_short = {'vent': 20, 'trent': 30, 'quarant': 40, 'cinquant': 50, - 'sessant': 60, 'settant': 70, 'ottant': 80, 'novant': 90} - - nums_long = {'undici': 11, 'dodici': 12, 'tredici': 13, 'quattordici': 14, - 'quindici': 15, 'sedici': 16, 'diciassette': 17, - 'diciotto': 18, 'diciannove': 19} - - multipli_it = collections.OrderedDict([ - # (1e63, 'deciliardi'), - # (1e60, 'decilioni'), - # (1e57, 'noviliardi'), - # (1e54, 'novilioni'), - # (1e51, 'ottiliardi'), - # (1e48, 'ottilioni'), - # (1e45, 'settiliardi'), - # (1e42, 'settilioni'), - # (1e39, 'sestiliardi'), - # (1e36, 'sestilioni'), - # (1e33, 'quintiliardi'), - # (1e30, 'quintilioni'), - # (1e27, 'quadriliardi'), - # (1e24, 'quadrilioni'), # yotta - (1e21, 'triliardi'), # zetta - (1e18, 'trilioni'), # exa - (1e15, 'biliardi'), # peta - (1e12, 'bilioni'), # tera - (1e9, 'miliardi'), # giga - (1e6, 'milioni') # mega - ]) - - multiplier = {} - un_multiplier = {} - - for num in multipli_it: - if num > 1000 and num <= 1e21: - # plurali - multiplier[multipli_it[num]] = int(num) - # singolari - modificare per eccezioni *liardo - if multipli_it[num][-5:-1] == 'iard': - un_multiplier['un' + multipli_it[num][:-1] + 'o'] = int(num) - else: - un_multiplier['un' + multipli_it[num][:-1] + 'e'] = int(num) - - value = False - - # normalizza ordinali singoli o plurali -esimo -esimi - if word[-5:-1] == 'esim': - base = word[:-5] - normalize_ita3 = {'tre': '', 'ttr': 'o', 'sei': '', 'ott': 'o'} - normalize_ita2 = {'un': 'o', 'du': 'e', 'qu': 'e', 'tt': 'e', - 'ov': 'e'} - - if base[-3:] in normalize_ita3: - base += normalize_ita3[base[-3:]] - elif base[-2:] in normalize_ita2: - base += normalize_ita2[base[-2:]] - - word = base - - for item in un_multiplier: - components = word.split(item, 1) - if len(components) == 2: - if not components[0]: # inizia con un1^x - if not components[1]: # unmilione - word = str(int(un_multiplier[item])) - else: # unmilione + x - word = str(int(un_multiplier[item]) + - extractnumber_long_it(components[1])) - - for item in multiplier: - components = word.split(item, 1) - if len(components) == 2: - if not components[0]: # inizia con un1^x - word = str(int(multiplier[item]) + - extractnumber_long_it(components[1])) - else: - if not components[1]: - word = str(extractnumber_long_it(components[0])) + '*' \ - + str(int(multiplier[item])) - else: - word = str(extractnumber_long_it(components[0])) + '*' \ - + str(int(multiplier[item])) + '+' \ - + str(extractnumber_long_it(components[1])) - - for item in tens: - word = word.replace(item, '+' + str(tens[item])) - - for item in tens_short: - word = word.replace(item, '+' + str(tens_short[item])) - - for item in nums_long: - word = word.replace(item, '+' + str(nums_long[item])) - - word = word.replace('cento', '+1xx') - word = word.replace('cent', '+1xx') - word = word.replace('mille', '+1000') # unmilionemille - word = word.replace('mila', '*1000') # unmilioneduemila - - for item in units: - word = word.replace(item, '+' + str(units[item])) - - # normalizzo i cento - occorrenze = word.count('+1xx') - for _ in range(0, occorrenze): - components = word.rsplit('+1xx', 1) - if len(components[0]) > 1 and components[0].endswith('0'): - word = components[0] + '+100' + components[1] - else: - word = components[0] + '*100' + components[1] - - components = word.rsplit('*1000', 1) - if len(components) == 2: - if components[0].startswith('*'): # centomila - components[0] = components[0][1:] - word = str(extractnumber_long_it(components[0])) + \ - '*1000' + str(components[1]) - - # gestione eccezioni - if word.startswith('*') or word.startswith('+'): - word = word[1:] - - addends = word.split('+') - for c, _ in enumerate(addends): - if '*' in addends[c]: - factors = addends[c].split('*') - result = int(factors[0]) * int(factors[1]) - if len(factors) == 3: - result *= int(factors[2]) - addends[c] = str(result) - - # check if all token are numbers - if all([s.isdecimal() for s in addends]): - value = sum([int(s) for s in addends]) - else: - value = False - return value - - -def extractnumber_it(text, short_scale=False, ordinals=False): - """ - This function extracts a number from a text string, - handles pronunciations in long scale and short scale - - https://en.wikipedia.org/wiki/Names_of_large_numbers - - Args: - text (str): the string to normalize - short_scale (bool): use short scale if True, long scale if False - ordinals (bool): consider ordinal numbers, third=3 instead of 1/3 - Returns: - (int) or (float) or False: The extracted number or False if no number - was found - - """ - - string_num_ordinal_it = {} - # first, second... - if ordinals: - if short_scale: - for num in SHORT_ORDINAL_STRING_IT: - num_string = SHORT_ORDINAL_STRING_IT[num] - string_num_ordinal_it[num_string] = num - STRING_NUM_ITA[num_string] = num - else: - for num in LONG_ORDINAL_STRING_IT: - num_string = LONG_ORDINAL_STRING_IT[num] - string_num_ordinal_it[num_string] = num - STRING_NUM_ITA[num_string] = num - - # negate next number (-2 = 0 - 2) - negatives = ['meno'] # 'negativo' non è usuale in italiano - - # multiply the previous number (one hundred = 1 * 100) - multiplies = ['decina', 'decine', 'dozzina', 'dozzine', - 'centinaia', 'centinaio', 'migliaia', 'migliaio', 'mila'] - - # split sentence parse separately and sum ( 2 and a half = 2 + 0.5 ) - fraction_marker = [' e '] - - # decimal marker ( 1 point 5 = 1 + 0.5) - decimal_marker = [' punto ', ' virgola '] - - if short_scale: - for num in SHORT_SCALE_IT: - num_string = SHORT_SCALE_IT[num] - STRING_NUM_ITA[num_string] = num - multiplies.append(num_string) - else: - for num in LONG_SCALE_IT: - num_string = LONG_SCALE_IT[num] - STRING_NUM_ITA[num_string] = num - multiplies.append(num_string) - - # 2 e 3/4 ed altri casi - for separator in fraction_marker: - components = text.split(separator) - zeros = 0 - - if len(components) == 2: - # count zeros in fraction part - sub_components = components[1].split(' ') - for element in sub_components: - if element == 'zero' or element == '0': - zeros += 1 - else: - break - # ensure first is not a fraction and second is a fraction - num1 = extractnumber_it(components[0]) - num2 = extractnumber_it(components[1]) - if num1 is not None and num2 is not None \ - and num1 >= 1 and 0 < num2 < 1: - return num1 + num2 - # sette e quaranta sette e zero zero due - elif num1 is not None and num2 is not None \ - and num1 >= 1 and num2 > 1: - return num1 + num2 / pow(10, len(str(num2)) + zeros) - - # 2 punto 5 - for separator in decimal_marker: - zeros = 0 - # count zeros in fraction part - components = text.split(separator) - - if len(components) == 2: - sub_components = components[1].split(' ') - for element in sub_components: - if element == 'zero' or element == '0': - zeros += 1 - else: - break - - number = int(extractnumber_it(components[0])) - decimal = int(extractnumber_it(components[1])) - if number is not None and decimal is not None: - if '.' not in str(decimal): - return number + decimal / pow(10, - len(str(decimal)) + zeros) - - all_words = text.split() - val = False - prev_val = None - to_sum = [] - for idx, word in enumerate(all_words): - - if not word: - continue - prev_word = all_words[idx - 1] if idx > 0 else '' - next_word = all_words[idx + 1] if idx + 1 < len(all_words) else '' - - # is this word already a number ? - if is_numeric(word): - val = float(word) - - # is this word the name of a number ? - if word in STRING_NUM_ITA: - val = STRING_NUM_ITA[word] - - # tre quarti un quarto trenta secondi - if isFractional_it(word) and prev_val: - if word[:-1] == 'second' and not ordinals: - val = prev_val * 2 - else: - val = prev_val - - # is the prev word a number and should we multiply it? - # twenty hundred, six hundred - if word in multiplies: - if not prev_val: - prev_val = 1 - val = prev_val * val - - # is this a spoken fraction? - # mezza tazza - if val is False: - val = isFractional_it(word, short_scale=short_scale) - - # 2 quinti - if not ordinals: - next_value = isFractional_it(next_word, short_scale=short_scale) - if next_value: - if not val: - val = 1 - val = val * next_value - - # is this a negative number? - if val and prev_word and prev_word in negatives: - val = 0 - val - - if not val: - val = extractnumber_long_it(word) - - # let's make sure it isn't a fraction - if not val: - # look for fractions like '2/3' - all_pieces = word.split('/') - if look_for_fractions(all_pieces): - val = float(all_pieces[0]) / float(all_pieces[1]) - else: - prev_val = val - # handle long numbers - # six hundred sixty six - # two million five hundred thousand - if word in multiplies and next_word not in multiplies: - to_sum.append(val) - val = 0 - prev_val = 0 - elif extractnumber_long_it(word) > 100 and \ - extractnumber_long_it(next_word) and \ - next_word not in multiplies: - to_sum.append(val) - val = 0 - prev_val = 0 - - if val is not None: - for addend in to_sum: - val = val + addend - return val - - -def normalize_it(text, remove_articles): - """ IT string normalization """ - # replace ambiguous words - text = text.replace('un paio', 'due') - - words = text.split() # this also removed extra spaces - # Contractions are not common in IT - # Convert numbers into digits, e.g. 'quarantadue' -> '42' - normalized = '' - i = 0 - - while i < len(words): - word = words[i] - # remove articles - # Italian requires the article to define the grammatical gender - if remove_articles and word in ARTICLES_IT: - i += 1 - continue - - if word in STRING_NUM_ITA: - word = str(STRING_NUM_ITA[word]) - - val = int(extractnumber_it(word)) # era extractnumber_long_it - - if val: - word = str(val) - - normalized += ' ' + word - i += 1 - # indefinite articles in it-it can not be removed - - return normalized[1:] - - -def extract_datetime_it(string, dateNow, default_time): - def clean_string(s): - """ - cleans the input string of unneeded punctuation and capitalization - among other things. - Normalize italian plurals - """ - symbols = ['.', ',', ';', '?', '!', 'º', 'ª', '°', 'l\''] - - for word in symbols: - s = s.replace(word, '') - - s = s.lower().replace('á', 'a').replace('à', 'a').replace('è', "e'")\ - .replace('é', "e'").replace('ì', 'i').replace('ù', 'u')\ - .replace('ò', 'o').replace('-', ' ').replace('_', '') - - # normalizza plurali per semplificare analisi - s = s.replace('secondi', 'secondo').replace('minuti', 'minuto')\ - .replace('ore', 'ora').replace('giorni', 'giorno')\ - .replace('settimane', 'settimana').replace('mesi', 'mese')\ - .replace('anni', 'anno').replace('mattino', 'mattina')\ - .replace('prossima', 'prossimo').replace('questa', 'questo')\ - .replace('quarti', 'quarto').replace('in punto', 'in_punto')\ - .replace('decennio', 'decenni').replace('secoli', 'secolo')\ - .replace('millennio', 'millenni').replace(' un ', ' uno ')\ - .replace('scorsa', 'scorso').replace('passata', 'passato')\ - .replace('uno paio', 'due') - - noise_words = ['dello', 'la', 'del', 'al', 'il', 'di', 'tra', 'lo', - 'le', 'alle', 'alla', 'dai', 'delle', 'della', - 'a', 'e\'', 'era', 'questa', 'questo', 'e', 'nel', - 'nello', 'dallo', ' '] - - word_list = s.split() - word_list = [x for x in word_list if x not in noise_words] - # normalizza alcuni formati orari - for idx in range(0, len(word_list) - 1): - if word_list[idx][0].isdigit() and word_list[idx+1][0].isdigit(): - num0 = int(word_list[idx]) - num1 = int(word_list[idx+1]) - if 0 <= num0 <= 23 and 10 <= num1 <= 59: - word_list[idx] = str(num0) + ':' + str(num1) - word_list[idx+1] = '' - - word_list = [x for x in word_list if x] - - return word_list - - def date_found(): - return found or \ - (datestr != '' or time_str != '' or year_offset != 0 or - month_offset != 0 or day_offset is True or hr_offset != 0 or - hr_abs or min_offset != 0 or min_abs or sec_offset != 0) - - if string == '' or not dateNow: - return None - - found = False - day_specified = False - day_offset = False - month_offset = 0 - year_offset = 0 - today = dateNow.strftime('%w') - current_year = dateNow.strftime('%Y') - from_flag = False - datestr = '' - has_year = False - time_qualifier = '' - time_qualifiers_am = ['mattina', 'stamani', 'stamane'] - time_qualifiers_pm = ['pomeriggio', 'sera', 'stasera', 'stanotte'] - time_qualifiers_list = set(time_qualifiers_am + time_qualifiers_pm) - markers = ['alle', 'in', 'questo', 'per', 'di', 'tra', 'fra', 'entro'] - days = ['lunedi', 'martedi', 'mercoledi', - 'giovedi', 'venerdi', 'sabato', 'domenica'] - months = ['gennaio', 'febbraio', 'marzo', 'aprile', 'maggio', 'giugno', - 'luglio', 'agosto', 'settembre', 'ottobre', 'novembre', - 'dicembre'] - months_short = ['gen', 'feb', 'mar', 'apr', 'mag', 'giu', 'lug', 'ago', - 'set', 'ott', 'nov', 'dic'] - year_multiples = ['decenni', 'secolo', 'millenni'] # decennio <- decenni - time_multiples = ['ora', 'minuto', 'secondo'] - day_multiples = ['settimana', 'mese', 'anno'] - noise_words_2 = ['tra', 'di', 'per', 'fra', 'un ', 'uno', 'lo', 'del', - 'l', 'in_punto', ' ', 'nella', 'dell'] - - words = clean_string(string) - - for idx, word in enumerate(words): - if word == '': - continue - word_prev_prev = words[idx - 2] if idx > 1 else '' - word_prev = words[idx - 1] if idx > 0 else '' - word_next = words[idx + 1] if idx + 1 < len(words) else '' - word_next_next = words[idx + 2] if idx + 2 < len(words) else '' - start = idx - used = 0 - # save timequalifier for later - if word == 'adesso' and not datestr: - # word == 'ora' va in conflitto con 'tra un ora' - words = [x for x in words if x != 'adesso'] - words = [x for x in words if x] - result_str = ' '.join(words) - extracted_date = dateNow.replace(microsecond=0) - return [extracted_date, result_str] - - # un paio di o tra tre settimane --> secoli - elif extractnumber_it(word) and (word_next in year_multiples or - word_next in day_multiples): - multiplier = int(extractnumber_it(word)) - used += 2 - if word_next == 'decenni': - year_offset = multiplier * 10 - elif word_next == 'secolo': - year_offset = multiplier * 100 - elif word_next == 'millenni': - year_offset = multiplier * 1000 - elif word_next == 'anno': - year_offset = multiplier - elif word_next == 'mese': - month_offset = multiplier - elif word_next == 'settimana': - day_offset = multiplier * 7 - elif word in time_qualifiers_list: - time_qualifier = word - # parse today, tomorrow, day after tomorrow - elif word == 'oggi' and not from_flag: - day_offset = 0 - used += 1 - elif word == 'domani' and not from_flag: - day_offset = 1 - used += 1 - elif word == 'ieri' and not from_flag: - day_offset -= 1 - used += 1 - elif word == 'dopodomani' and not from_flag: # after tomorrow - day_offset += 2 - used += 1 - elif word == 'dopo' and word_next == 'domani' and not from_flag: - day_offset += 1 - used += 2 - elif word == 'giorno': - if word_prev[0].isdigit(): - day_offset += int(word_prev) - start -= 1 - used = 2 - if word_next == 'dopo' and word_next_next == 'domani': - day_offset += 1 - used += 2 - elif word == 'settimana' and not from_flag: - if word_prev == 'prossimo': - day_offset = 7 - start -= 1 - used = 2 - elif word_prev == 'passato' or word_prev == 'scorso': - day_offset = -7 - start -= 1 - used = 2 - elif word_next == 'prossimo': - day_offset = 7 - used += 2 - elif word_next == 'passato' or word_next == 'scorso': - day_offset = -7 - used += 2 - # parse next month, last month - elif word == 'mese' and not from_flag: - if word_prev == 'prossimo': - month_offset = 1 - start -= 1 - used = 2 - elif word_prev == 'passato' or word_prev == 'scorso': - month_offset = -1 - start -= 1 - used = 2 - elif word_next == 'prossimo': - month_offset = 1 - used += 2 - elif word_next == 'passato' or word_next == 'scorso': - month_offset = -1 - used += 2 - # parse next year, last year - elif word == 'anno' and not from_flag: - if word_prev == 'prossimo': # prossimo anno - year_offset = 1 - start -= 1 - used = 2 - elif word_next == 'prossimo': # anno prossimo - year_offset = 1 - used = 2 - elif word_prev == 'passato' or word_prev == 'scorso': - year_offset = -1 - start -= 1 - used = 2 - elif word_next == 'passato' or word_next == 'scorso': - year_offset = -1 - used = 2 - elif word == 'decenni' and not from_flag: - if word_prev == 'prossimo': # prossimo mese - year_offset = 10 - start -= 1 - used = 2 - elif word_next == 'prossimo': # mese prossimo - year_offset = 10 - used = 2 - elif word_prev == 'passato' or word_prev == 'scorso': - year_offset = -10 - start -= 1 - used = 2 - elif word_next == 'passato' or word_next == 'scorso': - year_offset = -10 - used = 2 - # parse Monday, Tuesday, etc., and next Monday, - # last Tuesday, etc. - elif word in days and not from_flag: - ddd = days.index(word) - day_offset = (ddd + 1) - int(today) - used = 1 - if day_offset < 0: - day_offset += 7 - if word_prev == 'prossimo': - day_offset += 7 - start -= 1 - used += 1 - elif word_prev == 'passato' or word_prev == 'scorso': - day_offset -= 7 - start -= 1 - used += 1 - if word_next == 'prossimo': - day_offset += 7 - used += 1 - elif word_next == 'passato' or word_next == 'scorso': - day_offset -= 7 - used += 1 - # parse 15 of July, June 20th, Feb 18, 19 of February - elif word in months or word in months_short and not from_flag: - try: - mmm = months.index(word) - except ValueError: - mmm = months_short.index(word) - used += 1 - datestr = months[mmm] - if word_prev and extractnumber_it(word_prev): - datestr += ' ' + str(int(extractnumber_it(word_prev))) - start -= 1 - used += 1 - if word_next and extractnumber_it(word_next): - datestr += ' ' + str(int(extractnumber_it(word_next))) - used += 1 - has_year = True - else: - has_year = False - elif word_next and word_next[0].isdigit(): - datestr += ' ' + word_next - used += 1 - if word_next_next and word_next_next[0].isdigit(): - datestr += ' ' + word_next_next - used += 1 - has_year = True - else: - has_year = False - # parse 5 days from tomorrow, 10 weeks from next thursday, - # 2 months from July - validFollowups = days + months + months_short - validFollowups.append('oggi') - validFollowups.append('domani') - validFollowups.append('prossimo') - validFollowups.append('passato') - validFollowups.append('adesso') - - if (word == 'da' or word == 'dopo') and word_next in validFollowups: - used = 0 - from_flag = True - if word_next == 'domani': - day_offset += 1 - used += 2 - elif word_next == 'oggi' or word_next == 'adesso': - used += 2 - elif word_next in days: - ddd = days.index(word_next) - tmp_offset = (ddd + 1) - int(today) - used += 2 - if tmp_offset < 0: - tmp_offset += 7 - if word_next_next == 'prossimo': - tmp_offset += 7 - used += 1 - elif word_next_next == 'passato' or word_next_next == 'scorso': - tmp_offset = (ddd + 1) - int(today) - used += 1 - day_offset += tmp_offset - elif word_next_next and word_next_next in days: - ddd = days.index(word_next_next) - tmp_offset = (ddd + 1) - int(today) - if word_next == 'prossimo': - tmp_offset += 7 - # elif word_next == 'passato' or word_next == 'scorso': - # tmp_offset -= 7 - day_offset += tmp_offset - used += 3 - - if used > 0: - if start - 1 > 0 and words[start - 1] == 'questo': - start -= 1 - used += 1 - - for i in range(0, used): - words[i + start] = '' - - if start - 1 >= 0 and words[start - 1] in markers: - words[start - 1] = '' - found = True - day_specified = True - - # parse time - time_str = '' - hr_offset = 0 - min_offset = 0 - sec_offset = 0 - hr_abs = None - min_abs = None - military = False - - for idx, word in enumerate(words): - if word == '': - continue - word_prev_prev = words[idx - 2] if idx > 1 else '' - word_prev = words[idx - 1] if idx > 0 else '' - word_next = words[idx + 1] if idx + 1 < len(words) else '' - word_next_next = words[idx + 2] if idx + 2 < len(words) else '' - # parse noon, midnight, morning, afternoon, evening - used = 0 - if word == 'mezzogiorno': - hr_abs = 12 - used += 1 - elif word == 'mezzanotte': - hr_abs = 24 - used += 1 - if word == 'mezzo' and word_next == 'giorno': - hr_abs = 12 - used += 2 - elif word == 'mezza' and word_next == 'notte': - hr_abs = 24 - used += 2 - elif word == 'mattina': - if not hr_abs: - hr_abs = 8 - used += 1 - if word_next and word_next[0].isdigit(): # mattina alle 5 - hr_abs = int(word_next) - used += 1 - elif word == 'pomeriggio': - if not hr_abs: - hr_abs = 15 - used += 1 - if word_next and word_next[0].isdigit(): # pomeriggio alle 5 - hr_abs = int(word_next) - used += 1 - if (hr_abs or 0) < 12: - hr_abs = (hr_abs or 0) + 12 - elif word == 'sera': - if not hr_abs: - hr_abs = 19 - used += 1 - if word_next and word_next[0].isdigit() \ - and ':' not in word_next: - hr_abs = int(word_next) - used += 1 - if (hr_abs or 0) < 12: - hr_abs = (hr_abs or 0) + 12 - # da verificare più a fondo - elif word == 'presto': - hr_abs -= 1 - used += 1 - elif word == 'tardi': - hr_abs += 1 - used += 1 - # un paio di minuti tra cinque minuti tra 5 ore - elif extractnumber_it(word) and (word_next in time_multiples): - d_time = int(extractnumber_it(word)) - used += 2 - if word_next == 'ora': - hr_offset = d_time - isTime = False - hr_abs = -1 - min_abs = -1 - elif word_next == 'minuto': - min_offset = d_time - isTime = False - hr_abs = -1 - min_abs = -1 - elif word_next == 'secondo': - sec_offset = d_time - isTime = False - hr_abs = -1 - min_abs = -1 - elif word == 'mezzora': - min_offset = 30 - used = 1 - isTime = False - hr_abs = -1 - min_abs = -1 - # if word_prev == 'uno' or word_prev == 'una': - # start -= 1 - # used += 1 - elif extractnumber_it(word) and word_next and \ - word_next == 'quarto' and word_next_next == 'ora': - if int(extractnumber_it(word)) == 1 \ - or int(extractnumber_it(word)) == 3: - min_offset = 15 * int(extractnumber_it(word)) - else: # elimina eventuali errori - min_offset = 15 - used = 3 - start -= 1 - isTime = False - hr_abs = -1 - min_abs = -1 - elif word[0].isdigit(): - isTime = True - str_hh = '' - str_mm = '' - remainder = '' - if ':' in word: - # parse colons - # '3:00 in the morning' - components = word.split(':') - if len(components) == 2: - num0 = int(extractnumber_it(components[0])) - num1 = int(extractnumber_it(components[1])) - if num0 is not False and num1 is not False \ - and 0 <= num0 <= 23 and 0 <= num1 <= 59: - str_hh = str(num0) - str_mm = str(num1) - elif 0 < int(extractnumber_it(word)) < 24 \ - and word_next != 'quarto': - str_hh = str(int(word)) - str_mm = '00' - elif 100 <= int(word) <= 2400: - str_hh = int(word) / 100 - str_mm = int(word) - str_hh * 100 - military = True - isTime = False - if extractnumber_it(word) and word_next \ - and word_next == 'quarto' and word_next_next != 'ora': - if int(extractnumber_it(word)) == 1 \ - or int(extractnumber_it(word)) == 3: - str_mm = str(15 * int(extractnumber_it(word))) - else: # elimina eventuali errori - str_mm = '0' - str_hh = str(hr_abs) - used = 2 - words[idx + 1] = '' - isTime = False - if extractnumber_it(word) and word_next \ - and word_next == 'in_punto': - str_hh = str(int(extractnumber_it(word))) - used = 2 - if word_next == 'pm': - remainder = 'pm' - hr_abs = int(str_hh) - min_abs = int(str_mm) - if hr_abs <= 12: - hr_abs = hr_abs + 12 - used = 2 - elif word_next == 'am': - remainder = 'am' - hr_abs = int(str_hh) - min_abs = int(str_mm) - used = 2 - elif word_next == 'mattina': - # ' 11 del mattina' - hh = int(str_hh) - mm = int(str_mm) - used = 2 - remainder = 'am' - isTime = False - hr_abs = hh - min_abs = mm - elif word_next == 'pomeriggio': - # ' 2 del pomeriggio' - hh = int(str_hh) - mm = int(str_mm) - if hh < 12: - hh += 12 - used = 2 - remainder = 'pm' - isTime = False - hr_abs = hh - min_abs = mm - elif word_next == 'sera': - # 'alle 8 di sera' - hh = int(str_hh) - mm = int(str_mm) - if hh < 12: - hh += 12 - used = 2 - remainder = 'pm' - isTime = False - hr_abs = hh - min_abs = mm - elif word_next == 'notte': - hh = int(str_hh) - mm = int(str_mm) - if hh > 5: - remainder = 'pm' - else: - remainder = 'am' - used = 2 - isTime = False - hr_abs = hh - min_abs = mm - # parse half an hour : undici e mezza - elif word_next and word_next == 'mezza': - hr_abs = int(str_hh) - min_abs = 30 - used = 2 - isTime = False - elif word_next and word_next == 'in_punto': - hr_abs = int(str_hh) - min_abs = 0 - str_mm = '0' - used = 2 - isTime = False - else: - # 17:30 - remainder = '' - hr_abs = int(str_hh) - min_abs = int(str_mm) - used = 1 - isTime = False - if word_prev == 'ora': - words[idx - 1] = '' - - if time_qualifier != '': - # military = True - if str_hh and int(str_hh) <= 12 and \ - (time_qualifier in time_qualifiers_pm): - str_hh = str(int(str_hh) + 12) - else: - isTime = False - - str_hh = int(str_hh) if str_hh else 0 - str_mm = int(str_mm) if str_mm else 0 - - str_hh = str_hh + 12 if remainder == 'pm' \ - and str_hh < 12 else str_hh - str_hh = str_hh - 12 if remainder == 'am' \ - and str_hh >= 12 else str_hh - - if (not military and - remainder not in ['am', 'pm'] and - ((not day_specified) or day_offset < 1)): - # ambiguous time, detect whether they mean this evening or - # the next morning based on whether it has already passed - hr_abs = str_hh - if dateNow.hour < str_hh: - pass # No modification needed - elif dateNow.hour < str_hh + 12: - str_hh += 12 - hr_abs = str_hh - else: - # has passed, assume the next morning - day_offset += 1 - - if time_qualifier in time_qualifiers_pm and str_hh < 12: - str_hh += 12 - - if str_hh > 24 or str_mm > 59: - isTime = False - used = 0 - if isTime: - hr_abs = str_hh * 1 - min_abs = str_mm * 1 - used += 1 - - if (hr_abs or 0) <= 12 and (time_qualifier == 'sera' or - time_qualifier == 'pomeriggio'): - hr_abs = (hr_abs or 0) + 12 - - if used > 0: - # removed parsed words from the sentence - for i in range(used): - words[idx + i] = '' - - if word_prev == 'o' or word_prev == 'oh': - words[words.index(word_prev)] = '' - - if idx > 0 and word_prev in markers: - words[idx - 1] = '' - if idx > 1 and word_prev_prev in markers: - words[idx - 2] = '' - - idx += used - 1 - found = True - - # check that we found a date - if not date_found: - return None - - if day_offset is False: - day_offset = 0 - - # perform date manipulation - - extracted_date = dateNow.replace(microsecond=0) - - if datestr != '': - en_months = ['january', 'february', 'march', 'april', 'may', 'june', - 'july', 'august', 'september', 'october', 'november', - 'december'] - en_months_short = ['jan', 'feb', 'mar', 'apr', 'may', 'june', 'july', - 'aug', 'sept', 'oct', 'nov', 'dec'] - - for idx, en_month in enumerate(en_months): - datestr = datestr.replace(months[idx], en_month) - - for idx, en_month in enumerate(en_months_short): - datestr = datestr.replace(months_short[idx], en_month) - - try: - temp = datetime.strptime(datestr, '%B %d') - except ValueError: - # Try again, allowing the year - temp = datetime.strptime(datestr, '%B %d %Y') - extracted_date = extracted_date.replace(hour=0, minute=0, second=0) - if not has_year: - temp = temp.replace(year=extracted_date.year, - tzinfo=extracted_date.tzinfo) - if extracted_date < temp: - extracted_date = extracted_date.replace( - year=int(current_year), - month=int(temp.strftime('%m')), - day=int(temp.strftime('%d')), - tzinfo=extracted_date.tzinfo) - else: - extracted_date = extracted_date.replace( - year=int(current_year) + 1, - month=int(temp.strftime('%m')), - day=int(temp.strftime('%d')), - tzinfo=extracted_date.tzinfo) - else: - extracted_date = extracted_date.replace( - year=int(temp.strftime('%Y')), - month=int(temp.strftime('%m')), - day=int(temp.strftime('%d')), - tzinfo=extracted_date.tzinfo) - else: - # ignore the current HH:MM:SS if relative using days or greater - if hr_offset == 0 and min_offset == 0 and sec_offset == 0: - extracted_date = extracted_date.replace(hour=0, minute=0, second=0) - - if year_offset != 0: - extracted_date = extracted_date + relativedelta(years=year_offset) - if month_offset != 0: - extracted_date = extracted_date + relativedelta(months=month_offset) - if day_offset != 0: - extracted_date = extracted_date + relativedelta(days=day_offset) - if hr_abs != -1 and min_abs != -1: - # If no time was supplied in the string set the time to default - # time if it's available - if hr_abs is None and min_abs is None and default_time is not None: - hr_abs, min_abs = default_time.hour, default_time.minute - else: - hr_abs = hr_abs or 0 - min_abs = min_abs or 0 - - extracted_date = extracted_date + relativedelta(hours=hr_abs, - minutes=min_abs) - if (hr_abs != 0 or min_abs != 0) and datestr == '': - if not day_specified and dateNow > extracted_date: - extracted_date = extracted_date + relativedelta(days=1) - if hr_offset != 0: - extracted_date = extracted_date + relativedelta(hours=hr_offset) - if min_offset != 0: - extracted_date = extracted_date + relativedelta(minutes=min_offset) - if sec_offset != 0: - extracted_date = extracted_date + relativedelta(seconds=sec_offset) - - words = [x for x in words if x not in noise_words_2] - words = [x for x in words if x] - result_str = ' '.join(words) - - return [extracted_date, result_str] - - -def get_gender_it(word, raw_string=""): - """ - In Italian to define the grammatical gender of a word is necessary - analyze the article that precedes the word and not only the last - letter of the word. - - TODO: check if useful - """ - - gender = None - words = raw_string.split(' ') - for idx, w in enumerate(words): - if w == word and idx != 0: - previous = words[idx - 1] - gender = get_gender_it(previous) - break - - if not gender: - if word[-1] == 'a' or word[-1] == 'e': - gender = 'f' - if word[-1] == 'o' or word[-1] == 'n' \ - or word[-1] == 'l' or word[-1] == 'i': - gender = 'm' - - return gender - - -def extract_numbers_it(text, short_scale=False, ordinals=False): - """ - Takes in a string and extracts a list of numbers. - - Args: - text (str): the string to extract a number from - short_scale (bool): Use "short scale" or "long scale" for large - numbers -- over a million. The default is short scale, which - is now common in most English speaking countries. - See https://en.wikipedia.org/wiki/Names_of_large_numbers - ordinals (bool): consider ordinal numbers, e.g. third=3 instead of 1/3 - Returns: - list: list of extracted numbers as floats - """ - return extract_numbers_generic(text, pronounce_number_it, extractnumber_it, - short_scale=short_scale, ordinals=ordinals) +from lingua_franca.lang.parse_it import * diff --git a/mycroft/util/lang/parse_nl.py b/mycroft/util/lang/parse_nl.py index 711b054943..d537fa8f2c 100644 --- a/mycroft/util/lang/parse_nl.py +++ b/mycroft/util/lang/parse_nl.py @@ -1,4 +1,3 @@ -# -*- coding: utf-8 -*- # # Copyright 2019 Mycroft AI Inc. # @@ -14,1467 +13,9 @@ # See the License for the specific language governing permissions and # limitations under the License. # -from collections import namedtuple -from datetime import datetime, timedelta -from dateutil.relativedelta import relativedelta +"""File kept for backwards compatibility. -from mycroft.util.lang.parse_common import is_numeric, look_for_fractions -from mycroft.util.lang.common_data_nl import _ARTICLES, _NUM_STRING_NL, \ - _LONG_ORDINAL_STRING_NL, _LONG_SCALE_NL, \ - _SHORT_SCALE_NL, _SHORT_ORDINAL_STRING_NL - -import re - - -def _invert_dict(original): - """ - Produce a dictionary with the keys and values - inverted, relative to the dict passed in. - - Args: - original dict: The dict like object to invert - - Returns: - dict - - """ - return {value: key for key, value in original.items()} - - -# negate next number (-2 = 0 - 2) -_NEGATIVES = {"min", "minus"} - -# sum the next number (twenty two = 20 + 2) -_SUMS = {'twintig', '20', 'dertig', '30', 'veertig', '40', 'vijftig', '50', - 'zestig', '60', 'zeventig', '70', 'techtig', '80', 'negentig', '90'} - -_MULTIPLIES_LONG_SCALE_NL = set(_LONG_SCALE_NL.values()) - -_MULTIPLIES_SHORT_SCALE_NL = set(_SHORT_SCALE_NL.values()) - -# split sentence parse separately and sum ( 2 and a half = 2 + 0.5 ) -_FRACTION_MARKER = {"en"} - -# decimal marker ( 1 point 5 = 1 + 0.5) -_DECIMAL_MARKER = {"komma", "punt"} - -_STRING_NUM_NL = _invert_dict(_NUM_STRING_NL) -_STRING_NUM_NL.update({ - "half": 0.5, - "driekwart": 0.75, - "anderhalf": 1.5, - "paar": 2 -}) - -_STRING_SHORT_ORDINAL_NL = _invert_dict(_SHORT_ORDINAL_STRING_NL) -_STRING_LONG_ORDINAL_NL = _invert_dict(_LONG_ORDINAL_STRING_NL) - - -# _Token is intended to be used in the number processing functions in -# this module. The parsing requires slicing and dividing of the original -# text. To ensure things parse correctly, we need to know where text came -# from in the original input, hence this nametuple. -_Token = namedtuple('_Token', 'word index') - - -class _ReplaceableNumber(): - """ - Similar to _Token, this class is used in number parsing. - - Once we've found a number in a string, this class contains all - the info about the value, and where it came from in the original text. - In other words, it is the text, and the number that can replace it in - the string. - """ - - def __init__(self, value, tokens: [_Token]): - self.value = value - self.tokens = tokens - - def __bool__(self): - return bool(self.value is not None and self.value is not False) - - @property - def start_index(self): - return self.tokens[0].index - - @property - def end_index(self): - return self.tokens[-1].index - - @property - def text(self): - return ' '.join([t.word for t in self.tokens]) - - def __setattr__(self, key, value): - try: - getattr(self, key) - except AttributeError: - super().__setattr__(key, value) - else: - raise Exception("Immutable!") - - def __str__(self): - return "({v}, {t})".format(v=self.value, t=self.tokens) - - def __repr__(self): - return "{n}({v}, {t})".format(n=self.__class__.__name__, v=self.value, - t=self.tokens) - - -def _tokenize(text): - """ - Generate a list of token object, given a string. - Args: - text str: Text to tokenize. - - Returns: - [_Token] - - """ - return [_Token(word, index) for index, word in enumerate(text.split())] - - -def _partition_list(items, split_on): - """ - Partition a list of items. - - Works similarly to str.partition - - Args: - items: - split_on callable: - Should return a boolean. Each item will be passed to - this callable in succession, and partitions will be - created any time it returns True. - - Returns: - [[any]] - - """ - splits = [] - current_split = [] - for item in items: - if split_on(item): - splits.append(current_split) - splits.append([item]) - current_split = [] - else: - current_split.append(item) - splits.append(current_split) - return list(filter(lambda x: len(x) != 0, splits)) - - -def _convert_words_to_numbers(text, short_scale=True, ordinals=False): - """ - Convert words in a string into their equivalent numbers. - Args: - text str: - short_scale boolean: True if short scale numbers should be used. - ordinals boolean: True if ordinals (e.g. first, second, third) should - be parsed to their number values (1, 2, 3...) - - Returns: - str - The original text, with numbers subbed in where appropriate. - - """ - text = text.lower() - tokens = _tokenize(text) - numbers_to_replace = \ - _extract_numbers_with_text(tokens, short_scale, ordinals) - numbers_to_replace.sort(key=lambda number: number.start_index) - - results = [] - for token in tokens: - if not numbers_to_replace or \ - token.index < numbers_to_replace[0].start_index: - results.append(token.word) - else: - if numbers_to_replace and \ - token.index == numbers_to_replace[0].start_index: - results.append(str(numbers_to_replace[0].value)) - if numbers_to_replace and \ - token.index == numbers_to_replace[0].end_index: - numbers_to_replace.pop(0) - - return ' '.join(results) - - -def _extract_numbers_with_text(tokens, short_scale=True, - ordinals=False, fractional_numbers=True): - """ - Extract all numbers from a list of _Tokens, with the words that - represent them. - - Args: - [_Token]: The tokens to parse. - short_scale bool: True if short scale numbers should be used, False for - long scale. True by default. - ordinals bool: True if ordinal words (first, second, third, etc) should - be parsed. - fractional_numbers bool: True if we should look for fractions and - decimals. - - Returns: - [_ReplaceableNumber]: A list of tuples, each containing a number and a - string. - - """ - placeholder = "" # inserted to maintain correct indices - results = [] - while True: - to_replace = \ - _extract_number_with_text_nl(tokens, short_scale, - ordinals, fractional_numbers) - - if not to_replace: - break - - results.append(to_replace) - - tokens = [ - t if not - to_replace.start_index <= t.index <= to_replace.end_index - else - _Token(placeholder, t.index) for t in tokens - ] - results.sort(key=lambda n: n.start_index) - return results - - -def _extract_number_with_text_nl(tokens, short_scale=True, - ordinals=False, fractional_numbers=True): - """ - This function extracts a number from a list of _Tokens. - - Args: - tokens str: the string to normalize - short_scale (bool): use short scale if True, long scale if False - ordinals (bool): consider ordinal numbers, third=3 instead of 1/3 - fractional_numbers (bool): True if we should look for fractions and - decimals. - Returns: - _ReplaceableNumber - - """ - number, tokens = \ - _extract_number_with_text_nl_helper(tokens, short_scale, - ordinals, fractional_numbers) - while tokens and tokens[0].word in _ARTICLES: - tokens.pop(0) - return _ReplaceableNumber(number, tokens) - - -def _extract_number_with_text_nl_helper(tokens, - short_scale=True, ordinals=False, - fractional_numbers=True): - """ - Helper for _extract_number_with_text_en. - - This contains the real logic for parsing, but produces - a result that needs a little cleaning (specific, it may - contain leading articles that can be trimmed off). - - Args: - tokens [_Token]: - short_scale boolean: - ordinals boolean: - fractional_numbers boolean: - - Returns: - int or float, [_Tokens] - - """ - if fractional_numbers: - fraction, fraction_text = \ - _extract_fraction_with_text_nl(tokens, short_scale, ordinals) - if fraction: - return fraction, fraction_text - - decimal, decimal_text = \ - _extract_decimal_with_text_nl(tokens, short_scale, ordinals) - if decimal: - return decimal, decimal_text - - return _extract_whole_number_with_text_nl(tokens, short_scale, ordinals) - - -def _extract_fraction_with_text_nl(tokens, short_scale, ordinals): - """ - Extract fraction numbers from a string. - - This function handles text such as '2 and 3/4'. Note that "one half" or - similar will be parsed by the whole number function. - - Args: - tokens [_Token]: words and their indexes in the original string. - short_scale boolean: - ordinals boolean: - - Returns: - (int or float, [_Token]) - The value found, and the list of relevant tokens. - (None, None) if no fraction value is found. - - """ - for c in _FRACTION_MARKER: - partitions = _partition_list(tokens, lambda t: t.word == c) - - if len(partitions) == 3: - numbers1 = \ - _extract_numbers_with_text(partitions[0], short_scale, - ordinals, fractional_numbers=False) - numbers2 = \ - _extract_numbers_with_text(partitions[2], short_scale, - ordinals, fractional_numbers=True) - - if not numbers1 or not numbers2: - return None, None - - # ensure first is not a fraction and second is a fraction - num1 = numbers1[-1] - num2 = numbers2[0] - if num1.value >= 1 and 0 < num2.value < 1: - return num1.value + num2.value, \ - num1.tokens + partitions[1] + num2.tokens - - return None, None - - -def _extract_decimal_with_text_nl(tokens, short_scale, ordinals): - """ - Extract decimal numbers from a string. - - This function handles text such as '2 point 5'. - - Notes: - While this is a helper for extractnumber_en, it also depends on - extractnumber_en, to parse out the components of the decimal. - - This does not currently handle things like: - number dot number number number - - Args: - tokens [_Token]: The text to parse. - short_scale boolean: - ordinals boolean: - - Returns: - (float, [_Token]) - The value found and relevant tokens. - (None, None) if no decimal value is found. - - """ - for c in _DECIMAL_MARKER: - partitions = _partition_list(tokens, lambda t: t.word == c) - - if len(partitions) == 3: - numbers1 = \ - _extract_numbers_with_text(partitions[0], short_scale, - ordinals, fractional_numbers=False) - numbers2 = \ - _extract_numbers_with_text(partitions[2], short_scale, - ordinals, fractional_numbers=False) - - if not numbers1 or not numbers2: - return None, None - - number = numbers1[-1] - decimal = numbers2[0] - - # TODO handle number dot number number number - if "." not in str(decimal.text): - return number.value + float('0.' + str(decimal.value)), \ - number.tokens + partitions[1] + decimal.tokens - return None, None - - -def _extract_whole_number_with_text_nl(tokens, short_scale, ordinals): - """ - Handle numbers not handled by the decimal or fraction functions. This is - generally whole numbers. Note that phrases such as "one half" will be - handled by this function, while "one and a half" are handled by the - fraction function. - - Args: - tokens [_Token]: - short_scale boolean: - ordinals boolean: - - Returns: - int or float, [_Tokens] - The value parsed, and tokens that it corresponds to. - - """ - multiplies, string_num_ordinal, string_num_scale = \ - _initialize_number_data(short_scale) - - number_words = [] # type: [_Token] - val = False - prev_val = None - next_val = None - to_sum = [] - for idx, token in enumerate(tokens): - current_val = None - if next_val: - next_val = None - continue - - word = token.word - if word in _ARTICLES or word in _NEGATIVES: - number_words.append(token) - continue - - prev_word = tokens[idx - 1].word if idx > 0 else "" - next_word = tokens[idx + 1].word if idx + 1 < len(tokens) else "" - - if word not in string_num_scale and \ - word not in _STRING_NUM_NL and \ - word not in _SUMS and \ - word not in multiplies and \ - not (ordinals and word in string_num_ordinal) and \ - not is_numeric(word) and \ - not isFractional_nl(word, short_scale=short_scale) and \ - not look_for_fractions(word.split('/')): - words_only = [token.word for token in number_words] - if number_words and not all([w in _ARTICLES | - _NEGATIVES for w in words_only]): - break - else: - number_words = [] - continue - elif word not in multiplies \ - and prev_word not in multiplies \ - and prev_word not in _SUMS \ - and not (ordinals and prev_word in string_num_ordinal) \ - and prev_word not in _NEGATIVES \ - and prev_word not in _ARTICLES: - number_words = [token] - elif prev_word in _SUMS and word in _SUMS: - number_words = [token] - else: - number_words.append(token) - - # is this word already a number ? - if is_numeric(word): - if word.isdigit(): # doesn't work with decimals - val = int(word) - else: - val = float(word) - current_val = val - - # is this word the name of a number ? - if word in _STRING_NUM_NL: - val = _STRING_NUM_NL.get(word) - current_val = val - elif word in string_num_scale: - val = string_num_scale.get(word) - current_val = val - elif ordinals and word in string_num_ordinal: - val = string_num_ordinal[word] - current_val = val - - # is the prev word an ordinal number and current word is one? - # second one, third one - if ordinals and prev_word in string_num_ordinal and val == 1: - val = prev_val - - # is the prev word a number and should we sum it? - # twenty two, fifty six - if prev_word in _SUMS and val and val < 10: - val = prev_val + val - - # is the prev word a number and should we multiply it? - # twenty hundred, six hundred - if word in multiplies: - if not prev_val: - prev_val = 1 - val = prev_val * val - - # is this a spoken fraction? - # half cup - if val is False: - val = isFractional_nl(word, short_scale=short_scale) - current_val = val - - # 2 fifths - if not ordinals: - next_val = isFractional_nl(next_word, short_scale=short_scale) - if next_val: - if not val: - val = 1 - val = val * next_val - number_words.append(tokens[idx + 1]) - - # is this a negative number? - if val and prev_word and prev_word in _NEGATIVES: - val = 0 - val - - # let's make sure it isn't a fraction - if not val: - # look for fractions like "2/3" - aPieces = word.split('/') - if look_for_fractions(aPieces): - val = float(aPieces[0]) / float(aPieces[1]) - current_val = val - - else: - if prev_word in _SUMS and word not in _SUMS and current_val >= 10: - # Backtrack - we've got numbers we can't sum. - number_words.pop() - val = prev_val - break - prev_val = val - - # handle long numbers - # six hundred sixty six - # two million five hundred thousand - if word in multiplies and next_word not in multiplies: - to_sum.append(val) - val = 0 - prev_val = 0 - - if val is not None and to_sum: - val += sum(to_sum) - - return val, number_words - - -def _initialize_number_data(short_scale): - """ - Generate dictionaries of words to numbers, based on scale. - - This is a helper function for _extract_whole_number. - - Args: - short_scale boolean: - - Returns: - (set(str), dict(str, number), dict(str, number)) - multiplies, string_num_ordinal, string_num_scale - - """ - multiplies = _MULTIPLIES_SHORT_SCALE_NL if short_scale \ - else _MULTIPLIES_LONG_SCALE_NL - - string_num_ordinal_nl = _STRING_SHORT_ORDINAL_NL if short_scale \ - else _STRING_LONG_ORDINAL_NL - - string_num_scale_nl = _SHORT_SCALE_NL if short_scale else _LONG_SCALE_NL - string_num_scale_nl = _invert_dict(string_num_scale_nl) - - return multiplies, string_num_ordinal_nl, string_num_scale_nl - - -def extractnumber_nl(text, short_scale=True, ordinals=False): - """ - This function extracts a number from a text string, - handles pronunciations in long scale and short scale - - https://en.wikipedia.org/wiki/Names_of_large_numbers - - Args: - text (str): the string to normalize - short_scale (bool): use short scale if True, long scale if False - ordinals (bool): consider ordinal numbers, third=3 instead of 1/3 - Returns: - (int) or (float) or False: The extracted number or False if no number - was found - - """ - return _extract_number_with_text_nl(_tokenize(text), - short_scale, ordinals).value - - -def extract_duration_nl(text): - """ - Convert an english phrase into a number of seconds - - Convert things like: - "10 minute" - "2 and a half hours" - "3 days 8 hours 10 minutes and 49 seconds" - into an int, representing the total number of seconds. - - The words used in the duration will be consumed, and - the remainder returned. - - As an example, "set a timer for 5 minutes" would return - (300, "set a timer for"). - - Args: - text (str): string containing a duration - - Returns: - (timedelta, str): - A tuple containing the duration and the remaining text - not consumed in the parsing. The first value will - be None if no duration is found. The text returned - will have whitespace stripped from the ends. - """ - if not text: - return None - - time_units = { - 'microseconden': None, - 'milliseconden': None, - 'seconden': None, - 'minuten': None, - 'uren': None, - 'dagen': None, - 'weken': None - } - - pattern = r"(?P\d+(?:\.?\d+)?)\s+{unit}s?" - text = _convert_words_to_numbers(text) - - for unit in time_units: - unit_pattern = pattern.format(unit=unit[:-1]) # remove 's' from unit - matches = re.findall(unit_pattern, text) - value = sum(map(float, matches)) - time_units[unit] = value - text = re.sub(unit_pattern, '', text) - - text = text.strip() - duration = timedelta(**time_units) if any(time_units.values()) else None - - return (duration, text) - - -def extract_datetime_nl(string, dateNow, default_time): - """ Convert a human date reference into an exact datetime - - Convert things like - "today" - "tomorrow afternoon" - "next Tuesday at 4pm" - "August 3rd" - into a datetime. If a reference date is not provided, the current - local time is used. Also consumes the words used to define the date - returning the remaining string. For example, the string - "what is Tuesday's weather forecast" - returns the date for the forthcoming Tuesday relative to the reference - date and the remainder string - "what is weather forecast". - - The "next" instance of a day or weekend is considered to be no earlier than - 48 hours in the future. On Friday, "next Monday" would be in 3 days. - On Saturday, "next Monday" would be in 9 days. - - Args: - string (str): string containing date words - dateNow (datetime): A reference date/time for "tommorrow", etc - default_time (time): Time to set if no time was found in the string - - Returns: - [datetime, str]: An array containing the datetime and the remaining - text not consumed in the parsing, or None if no - date or time related text was found. - """ - - def clean_string(s): - # clean unneeded punctuation and capitalization among other things. - s = s.lower().replace('?', '').replace('.', '').replace(',', '') \ - .replace(' de ', ' ').replace(' het ', ' ').replace(' het ', ' ') \ - .replace("paar", "2").replace("eeuwen", "eeuw") \ - .replace("decennia", "decennium") \ - .replace("millennia", "millennium") - - wordList = s.split() - for idx, word in enumerate(wordList): - ordinals = ["ste", "de"] - if word[0].isdigit(): - for ordinal in ordinals: - # "second" is the only case we should not do this - if ordinal in word and "second" not in word: - word = word.replace(ordinal, "") - wordList[idx] = word - - return wordList - - def date_found(): - return found or \ - ( - datestr != "" or - yearOffset != 0 or monthOffset != 0 or - dayOffset is True or hrOffset != 0 or - hrAbs or minOffset != 0 or - minAbs or secOffset != 0 - ) - - if string == "" or not dateNow: - return None - - found = False - daySpecified = False - dayOffset = False - monthOffset = 0 - yearOffset = 0 - today = dateNow.strftime("%w") - currentYear = dateNow.strftime("%Y") - fromFlag = False - datestr = "" - hasYear = False - timeQualifier = "" - - timeQualifiersAM = ['ochtend'] - timeQualifiersPM = ['middag', 'avond', 'nacht'] - timeQualifiersList = timeQualifiersAM + timeQualifiersPM - timeQualifierOffsets = [8, 15, 19, 0] - markers = ['op', 'in', 'om', 'tegen', 'over', - 'deze', 'rond', 'voor', 'van', "binnen"] - days = ["maandag", "dinsdag", "woensdag", "donderdag", "vrijdag", - "zaterdag", "zondag"] - day_parts = [a + b for a in days for b in timeQualifiersList] - months = ['januari', 'februari', 'maart', 'april', 'mei', 'juni', - 'juli', 'augustus', 'september', 'oktober', 'november', - 'december'] - recur_markers = days + [d+'en' for d in days] + ['weekeinde', 'werkdag', - 'weekeinden', 'werkdagen'] - months_short = ['jan', 'feb', 'mar', 'apr', 'mei', 'jun', 'jul', 'aug', - 'sep', 'okt', 'nov', 'dec'] - year_multiples = ["decennium", "eeuw", "millennium"] - day_multiples = ["dagen", "weken", "maanden", "jaren"] - - words = clean_string(string) - - for idx, word in enumerate(words): - if word == "": - continue - wordPrevPrev = words[idx - 2] if idx > 1 else "" - wordPrev = words[idx - 1] if idx > 0 else "" - wordNext = words[idx + 1] if idx + 1 < len(words) else "" - wordNextNext = words[idx + 2] if idx + 2 < len(words) else "" - - start = idx - used = 0 - # save timequalifier for later - - if word == "nu" and not datestr: - resultStr = " ".join(words[idx + 1:]) - resultStr = ' '.join(resultStr.split()) - extractedDate = dateNow.replace(microsecond=0) - return [extractedDate, resultStr] - elif wordNext in year_multiples: - multiplier = None - if is_numeric(word): - multiplier = extractnumber_nl(word) - multiplier = multiplier or 1 - multiplier = int(multiplier) - used += 2 - if wordNext == "decennium": - yearOffset = multiplier * 10 - elif wordNext == "eeuw": - yearOffset = multiplier * 100 - elif wordNext == "millennium": - yearOffset = multiplier * 1000 - # paar - elif word == "2" and \ - wordNextNext in year_multiples: - multiplier = 2 - used += 2 - if wordNextNext == "decennia": - yearOffset = multiplier * 10 - elif wordNextNext == "eeuwen": - yearOffset = multiplier * 100 - elif wordNextNext == "millennia": - yearOffset = multiplier * 1000 - elif word == "2" and \ - wordNextNext in day_multiples: - multiplier = 2 - used += 2 - if wordNextNext == "jaren": - yearOffset = multiplier - elif wordNextNext == "maanden": - monthOffset = multiplier - elif wordNextNext == "weken": - dayOffset = multiplier * 7 - elif word in timeQualifiersList: - timeQualifier = word - # parse today, tomorrow, day after tomorrow - elif word == "vandaag" and not fromFlag: - dayOffset = 0 - used += 1 - elif word == "morgen" and not fromFlag: - dayOffset = 1 - used += 1 - elif word == "overmorgen" and not fromFlag: - dayOffset = 2 - used += 1 - # parse 5 days, 10 weeks, last week, next week - elif word == "dag" or word == "dagen": - if wordPrev[0].isdigit(): - dayOffset += int(wordPrev) - start -= 1 - used = 2 - elif word == "week" or word == "weken" and not fromFlag: - if wordPrev[0].isdigit(): - dayOffset += int(wordPrev) * 7 - start -= 1 - used = 2 - elif wordPrev == "volgende": - dayOffset = 7 - start -= 1 - used = 2 - elif wordPrev == "vorige": - dayOffset = -7 - start -= 1 - used = 2 - # parse 10 months, next month, last month - elif word == "maand" and not fromFlag: - if wordPrev[0].isdigit(): - monthOffset = int(wordPrev) - start -= 1 - used = 2 - elif wordPrev == "volgende": - monthOffset = 1 - start -= 1 - used = 2 - elif wordPrev == "vorige": - monthOffset = -1 - start -= 1 - used = 2 - # parse 5 years, next year, last year - elif word == "jaar" and not fromFlag: - if wordPrev[0].isdigit(): - yearOffset = int(wordPrev) - start -= 1 - used = 2 - elif wordPrev == "volgend": - yearOffset = 1 - start -= 1 - used = 2 - elif wordPrev == "vorig": - yearOffset = -1 - start -= 1 - used = 2 - # parse Monday, Tuesday, etc., and next Monday, - # last Tuesday, etc. - elif word in days and not fromFlag: - d = days.index(word) - dayOffset = (d + 1) - int(today) - used = 1 - if dayOffset < 0: - dayOffset += 7 - if wordPrev == "volgende": - if dayOffset <= 2: - dayOffset += 7 - used += 1 - start -= 1 - elif wordPrev == "vorige": - dayOffset -= 7 - used += 1 - start -= 1 - elif word in day_parts and not fromFlag: - d = day_parts.index(word) / len(timeQualifiersList) - dayOffset = (d + 1) - int(today) - if dayOffset < 0: - dayOffset += 7 - # parse 15 of July, June 20th, Feb 18, 19 of February - elif word in months or word in months_short and not fromFlag: - try: - m = months.index(word) - except ValueError: - m = months_short.index(word) - used += 1 - datestr = months[m] - if wordPrev and \ - (wordPrev[0].isdigit() or (wordPrev == "van" and - wordPrevPrev[0].isdigit())): - if wordPrev == "van" and wordPrevPrev[0].isdigit(): - datestr += " " + words[idx - 2] - used += 1 - start -= 1 - else: - datestr += " " + wordPrev - start -= 1 - used += 1 - if wordNext and wordNext[0].isdigit(): - datestr += " " + wordNext - used += 1 - hasYear = True - else: - hasYear = False - - elif wordNext and wordNext[0].isdigit(): - datestr += " " + wordNext - used += 1 - if wordNextNext and wordNextNext[0].isdigit(): - datestr += " " + wordNextNext - used += 1 - hasYear = True - else: - hasYear = False - - # parse 5 days from tomorrow, 10 weeks from next thursday, - # 2 months from July - validFollowups = days + months + months_short - validFollowups.append("vandaag") - validFollowups.append("morgen") - validFollowups.append("volgende") - validFollowups.append("vorige") - validFollowups.append("nu") - if (word == "van" or word == "na") and wordNext in validFollowups: - used = 2 - fromFlag = True - if wordNext == "morgen": - dayOffset += 1 - elif wordNext == "overmorgen": - dayOffset += 2 - elif wordNext in days: - d = days.index(wordNext) - tmpOffset = (d + 1) - int(today) - used = 2 - if tmpOffset < 0: - tmpOffset += 7 - dayOffset += tmpOffset - elif wordNextNext and wordNextNext in days: - d = days.index(wordNextNext) - tmpOffset = (d + 1) - int(today) - used = 3 - if wordNext == "volgende": - if dayOffset <= 2: - tmpOffset += 7 - used += 1 - start -= 1 - elif wordNext == "vorige": - tmpOffset -= 7 - used += 1 - start -= 1 - dayOffset += tmpOffset - if used > 0: - if start - 1 > 0 and words[start - 1] == "deze": - start -= 1 - used += 1 - - for i in range(0, used): - words[i + start] = "" - - if start - 1 >= 0 and words[start - 1] in markers: - words[start - 1] = "" - found = True - daySpecified = True - - # parse time - hrOffset = 0 - minOffset = 0 - secOffset = 0 - hrAbs = None - minAbs = None - military = False - - for idx, word in enumerate(words): - if word == "": - continue - - wordPrevPrev = words[idx - 2] if idx > 1 else "" - wordPrev = words[idx - 1] if idx > 0 else "" - wordNext = words[idx + 1] if idx + 1 < len(words) else "" - wordNextNext = words[idx + 2] if idx + 2 < len(words) else "" - # parse nacht ochtend, middag, avond - used = 0 - if word.startswith("gister"): - dayOffset = -1 - elif word.startswith("morgen"): - dayOffset = 1 - - if word.endswith("nacht"): - if hrAbs is None: - hrAbs = 0 - used += 1 - elif word.endswith("ochtend"): - if hrAbs is None: - hrAbs = 8 - used += 1 - elif word.endswith("middag"): - if hrAbs is None: - hrAbs = 15 - used += 1 - elif word.endswith("avond"): - if hrAbs is None: - hrAbs = 19 - used += 1 - - # "paar" time_unit - elif word == "2" and \ - wordNextNext in ["uur", "minuten", "seconden"]: - used += 2 - if wordNextNext == "uur": - hrOffset = 2 - elif wordNextNext == "minuten": - minOffset = 2 - elif wordNextNext == "seconden": - secOffset = 2 - # parse half an hour, quarter hour - elif word == "uur" and \ - (wordPrev in markers or wordPrevPrev in markers): - if wordPrev == "half": - minOffset = 30 - elif wordPrev == "kwartier": - minOffset = 15 - elif wordPrevPrev == "kwartier": - minOffset = 15 - if idx > 2 and words[idx - 3] in markers: - words[idx - 3] = "" - if words[idx - 3] == "deze": - daySpecified = True - words[idx - 2] = "" - elif wordPrev == "binnen": - hrOffset = 1 - else: - hrOffset = 1 - if wordPrevPrev in markers: - words[idx - 2] = "" - if wordPrevPrev == "deze": - daySpecified = True - words[idx - 1] = "" - used += 1 - hrAbs = -1 - minAbs = -1 - # parse 5:00 am, 12:00 p.m., etc - # parse "over een minuut" - elif word == "minuut" and wordPrev == "over": - minOffset = 1 - words[idx - 1] = "" - used += 1 - # parse "over een seconde" - elif word == "seconde" and wordPrev == "over": - secOffset = 1 - words[idx - 1] = "" - used += 1 - elif word[0].isdigit(): - isTime = True - strHH = "" - strMM = "" - remainder = "" - wordNextNextNext = words[idx + 3] \ - if idx + 3 < len(words) else "" - if wordNext == "vannacht" or wordNextNext == "vannacht" or \ - wordPrev == "vannacht" or wordPrevPrev == "vannacht" or \ - wordNextNextNext == "vannacht": - remainder = "pm" - used += 1 - if wordPrev == "vannacht": - words[idx - 1] = "" - if wordPrevPrev == "vannacht": - words[idx - 2] = "" - if wordNextNext == "vannacht": - used += 1 - if wordNextNextNext == "vannacht": - used += 1 - - if ':' in word: - # parse colons - # "3:00 in the morning" - stage = 0 - length = len(word) - for i in range(length): - if stage == 0: - if word[i].isdigit(): - strHH += word[i] - elif word[i] == ":": - stage = 1 - else: - stage = 2 - i -= 1 - elif stage == 1: - if word[i].isdigit(): - strMM += word[i] - else: - stage = 2 - i -= 1 - elif stage == 2: - remainder = word[i:].replace(".", "") - break - if remainder == "": - nextWord = wordNext.replace(".", "") - if nextWord == "am" or nextWord == "pm": - remainder = nextWord - used += 1 - - elif wordNext == "in" and wordNextNext == "ochtend": - remainder = "am" - used += 2 - elif wordNext == "in" and wordNextNext == "middag": - remainder = "pm" - used += 2 - elif wordNext == "in" and wordNextNext == "avond": - remainder = "pm" - used += 2 - elif wordNext == "'s" and wordNextNext == "ochtends": - remainder = "am" - used += 2 - elif wordNext == "'s" and wordNextNext == "middags": - remainder = "pm" - used += 2 - elif wordNext == "'s" and wordNextNext == "avonds": - remainder = "pm" - used += 2 - elif wordNext == "deze" and wordNextNext == "ochtend": - remainder = "am" - used = 2 - daySpecified = True - elif wordNext == "deze" and wordNextNext == "middag": - remainder = "pm" - used = 2 - daySpecified = True - elif wordNext == "deze" and wordNextNext == "avond": - remainder = "pm" - used = 2 - daySpecified = True - elif wordNext == "'s" and wordNextNext == "nachts": - if strHH and int(strHH) > 5: - remainder = "pm" - else: - remainder = "am" - used += 2 - - else: - if timeQualifier != "": - military = True - if strHH and int(strHH) <= 12 and \ - (timeQualifier in timeQualifiersPM): - strHH += str(int(strHH) + 12) - - else: - # try to parse numbers without colons - # 5 hours, 10 minutes etc. - length = len(word) - strNum = "" - remainder = "" - for i in range(length): - if word[i].isdigit(): - strNum += word[i] - else: - remainder += word[i] - - if remainder == "": - remainder = wordNext.replace(".", "").lstrip().rstrip() - if ( - remainder == "pm" or - wordNext == "pm" or - remainder == "p.m." or - wordNext == "p.m."): - strHH = strNum - remainder = "pm" - used = 1 - elif ( - remainder == "am" or - wordNext == "am" or - remainder == "a.m." or - wordNext == "a.m."): - strHH = strNum - remainder = "am" - used = 1 - elif ( - remainder in recur_markers or - wordNext in recur_markers or - wordNextNext in recur_markers): - # Ex: "7 on mondays" or "3 this friday" - # Set strHH so that isTime == True - # when am or pm is not specified - strHH = strNum - used = 1 - else: - if ( - (wordNext == "uren" or wordNext == "uur" or - remainder == "uren" or remainder == "uur") and - word[0] != '0' and - ( - int(strNum) < 100 or - int(strNum) > 2400 - )): - # ignores military time - # "in 3 hours" - hrOffset = int(strNum) - used = 2 - isTime = False - hrAbs = -1 - minAbs = -1 - - elif wordNext == "minuten" or wordNext == "minuut" or \ - remainder == "minuten" or remainder == "minuut": - # "in 10 minutes" - minOffset = int(strNum) - used = 2 - isTime = False - hrAbs = -1 - minAbs = -1 - elif wordNext == "seconden" or wordNext == "seconde" \ - or remainder == "seconden" or \ - remainder == "seconde": - # in 5 seconds - secOffset = int(strNum) - used = 2 - isTime = False - hrAbs = -1 - minAbs = -1 - elif int(strNum) > 100: - # military time, eg. "3300 hours" - strHH = str(int(strNum) // 100) - strMM = str(int(strNum) % 100) - military = True - if wordNext == "uur" or remainder == "uur": - used += 1 - elif wordNext and wordNext[0].isdigit(): - # military time, e.g. "04 38 hours" - strHH = strNum - strMM = wordNext - military = True - used += 1 - if (wordNextNext == "uur" or remainder == "uur"): - used += 1 - elif ( - wordNext == "" or wordNext == "uur" or - ( - wordNext == "in" and - ( - wordNextNext == "de" or - wordNextNext == timeQualifier - ) - ) or wordNext == 'vannacht' or - wordNextNext == 'vannacht'): - - strHH = strNum - strMM = "00" - if wordNext == "uur": - used += 1 - - if wordNext == "in" or wordNextNext == "in": - used += (1 if wordNext == "in" else 2) - wordNextNextNext = words[idx + 3] \ - if idx + 3 < len(words) else "" - - if (wordNextNext and - (wordNextNext in timeQualifier or - wordNextNextNext in timeQualifier)): - if (wordNextNext in timeQualifiersPM or - wordNextNextNext in timeQualifiersPM): - remainder = "pm" - used += 1 - if (wordNextNext in timeQualifiersAM or - wordNextNextNext in timeQualifiersAM): - remainder = "am" - used += 1 - - if timeQualifier != "": - if timeQualifier in timeQualifiersPM: - remainder = "pm" - used += 1 - - elif timeQualifier in timeQualifiersAM: - remainder = "am" - used += 1 - else: - # TODO: Unsure if this is 100% accurate - used += 1 - military = True - else: - isTime = False - HH = int(strHH) if strHH else 0 - MM = int(strMM) if strMM else 0 - HH = HH + 12 if remainder == "pm" and HH < 12 else HH - HH = HH - 12 if remainder == "am" and HH >= 12 else HH - - if (not military and - remainder not in ['am', 'pm', 'uren', 'minuten', - "seconde", "seconden", - "uur", "minuut"] and - ((not daySpecified) or dayOffset < 1)): - # ambiguous time, detect whether they mean this evening or - # the next morning based on whether it has already passed - if dateNow.hour < HH or (dateNow.hour == HH and - dateNow.minute < MM): - pass # No modification needed - elif dateNow.hour < HH + 12: - HH += 12 - else: - # has passed, assume the next morning - dayOffset += 1 - - if timeQualifier in timeQualifiersPM and HH < 12: - HH += 12 - - if HH > 24 or MM > 59: - isTime = False - used = 0 - if isTime: - hrAbs = HH - minAbs = MM - used += 1 - - if used > 0: - # removed parsed words from the sentence - for i in range(used): - if idx + i >= len(words): - break - words[idx + i] = "" - - if wordPrev == "vroeg": - hrOffset = -1 - words[idx - 1] = "" - idx -= 1 - elif wordPrev == "laat": - hrOffset = 1 - words[idx - 1] = "" - idx -= 1 - if idx > 0 and wordPrev in markers: - words[idx - 1] = "" - if wordPrev == "deze": - daySpecified = True - if idx > 1 and wordPrevPrev in markers: - words[idx - 2] = "" - if wordPrevPrev == "deze": - daySpecified = True - - idx += used - 1 - found = True - # check that we found a date - if not date_found: - return None - - if dayOffset is False: - dayOffset = 0 - - # perform date manipulation - - extractedDate = dateNow.replace(microsecond=0) - - if datestr != "": - # date included an explicit date, e.g. "june 5" or "june 2, 2017" - try: - temp = datetime.strptime(datestr, "%B %d") - except ValueError: - # Try again, allowing the year - temp = datetime.strptime(datestr, "%B %d %Y") - extractedDate = extractedDate.replace(hour=0, minute=0, second=0) - if not hasYear: - temp = temp.replace(year=extractedDate.year, - tzinfo=extractedDate.tzinfo) - if extractedDate < temp: - extractedDate = extractedDate.replace( - year=int(currentYear), - month=int(temp.strftime("%m")), - day=int(temp.strftime("%d")), - tzinfo=extractedDate.tzinfo) - else: - extractedDate = extractedDate.replace( - year=int(currentYear) + 1, - month=int(temp.strftime("%m")), - day=int(temp.strftime("%d")), - tzinfo=extractedDate.tzinfo) - else: - extractedDate = extractedDate.replace( - year=int(temp.strftime("%Y")), - month=int(temp.strftime("%m")), - day=int(temp.strftime("%d")), - tzinfo=extractedDate.tzinfo) - else: - # ignore the current HH:MM:SS if relative using days or greater - if hrOffset == 0 and minOffset == 0 and secOffset == 0: - extractedDate = extractedDate.replace(hour=0, minute=0, second=0) - - if yearOffset != 0: - extractedDate = extractedDate + relativedelta(years=yearOffset) - if monthOffset != 0: - extractedDate = extractedDate + relativedelta(months=monthOffset) - if dayOffset != 0: - extractedDate = extractedDate + relativedelta(days=dayOffset) - if hrAbs != -1 and minAbs != -1: - # If no time was supplied in the string set the time to default - # time if it's available - if hrAbs is None and minAbs is None and default_time is not None: - hrAbs, minAbs = default_time.hour, default_time.minute - else: - hrAbs = hrAbs or 0 - minAbs = minAbs or 0 - - extractedDate = extractedDate.replace(hour=hrAbs, - minute=minAbs) - if (hrAbs != 0 or minAbs != 0) and datestr == "": - if not daySpecified and dateNow > extractedDate: - extractedDate = extractedDate + relativedelta(days=1) - if hrOffset != 0: - extractedDate = extractedDate + relativedelta(hours=hrOffset) - if minOffset != 0: - extractedDate = extractedDate + relativedelta(minutes=minOffset) - if secOffset != 0: - extractedDate = extractedDate + relativedelta(seconds=secOffset) - for idx, word in enumerate(words): - if words[idx] == "en" and \ - words[idx - 1] == "" and words[idx + 1] == "": - words[idx] = "" - - resultStr = " ".join(words) - resultStr = ' '.join(resultStr.split()) - return [extractedDate, resultStr] - - -def isFractional_nl(input_str, short_scale=True): - """ - This function takes the given text and checks if it is a fraction. - - Args: - input_str (str): the string to check if fractional - short_scale (bool): use short scale if True, long scale if False - Returns: - (bool) or (float): False if not a fraction, otherwise the fraction - - """ - fracts = {"heel": 1, "half": 2, "halve": 2, "kwart": 4} - if short_scale: - for num in _SHORT_ORDINAL_STRING_NL: - if num > 2: - fracts[_SHORT_ORDINAL_STRING_NL[num]] = num - else: - for num in _LONG_ORDINAL_STRING_NL: - if num > 2: - fracts[_LONG_ORDINAL_STRING_NL[num]] = num - - if input_str.lower() in fracts: - return 1.0 / fracts[input_str.lower()] - return False - - -def extract_numbers_nl(text, short_scale=True, ordinals=False): - """ - Takes in a string and extracts a list of numbers. - - Args: - text (str): the string to extract a number from - short_scale (bool): Use "short scale" or "long scale" for large - numbers -- over a million. The default is short scale, which - is now common in most English speaking countries. - See https://en.wikipedia.org/wiki/Names_of_large_numbers - ordinals (bool): consider ordinal numbers, e.g. third=3 instead of 1/3 - Returns: - list: list of extracted numbers as floats - """ - results = _extract_numbers_with_text(_tokenize(text), - short_scale, ordinals) - return [float(result.value) for result in results] - - -def normalize_nl(text, remove_articles): - """ English string normalization """ - - words = text.split() # this also removed extra spaces - normalized = "" - for word in words: - if remove_articles and word in _ARTICLES: - continue - - # Convert numbers into digits, e.g. "two" -> "2" - textNumbers = ["nul", "een", "twee", "drie", "vier", "vijf", "zes", - "zeven", "acht", "negen", "tien", "elf", "twaalf", - "dertien", "veertien", "vijftien", "zestien", - "zeventien", "achttien", "negentien", "twintig"] - - if word in textNumbers: - word = str(textNumbers.index(word)) - - normalized += " " + word - - return normalized[1:] # strip the initial space +TODO: Remove in 20.02 +""" +from lingua_franca.lang.parse_nl import * diff --git a/mycroft/util/lang/parse_pt.py b/mycroft/util/lang/parse_pt.py index 8343a7f376..68b12719c0 100644 --- a/mycroft/util/lang/parse_pt.py +++ b/mycroft/util/lang/parse_pt.py @@ -13,1126 +13,9 @@ # See the License for the specific language governing permissions and # limitations under the License. # + +"""File kept for backwards compatibility. + +TODO: Remove in 20.02 """ - Parse functions for Portuguese (PT-PT) - - TODO: numbers greater than 999999 - TODO: date time pt -""" - -from datetime import datetime -from dateutil.relativedelta import relativedelta -from mycroft.util.lang.parse_common import is_numeric, look_for_fractions -from mycroft.util.lang.common_data_pt import _FRACTION_STRING_PT, \ - _PT_ARTICLES, _PT_NUMBERS - - -def isFractional_pt(input_str): - """ - This function takes the given text and checks if it is a fraction. - - Args: - text (str): the string to check if fractional - Returns: - (bool) or (float): False if not a fraction, otherwise the fraction - - """ - if input_str.endswith('s', -1): - input_str = input_str[:len(input_str) - 1] # e.g. "fifths" - - aFrac = ["meio", "terço", "quarto", "quinto", "sexto", - "setimo", "oitavo", "nono", "décimo"] - - if input_str.lower() in aFrac: - return 1.0 / (aFrac.index(input_str) + 2) - if input_str == "vigésimo": - return 1.0 / 20 - if input_str == "trigésimo": - return 1.0 / 30 - if input_str == "centésimo": - return 1.0 / 100 - if input_str == "milésimo": - return 1.0 / 1000 - if (input_str == "sétimo" or input_str == "septimo" or - input_str == "séptimo"): - return 1.0 / 7 - - return False - - -def extractnumber_pt(text): - """ - This function prepares the given text for parsing by making - numbers consistent, getting rid of contractions, etc. - Args: - text (str): the string to normalize - Returns: - (int) or (float): The value of extracted number - - """ - aWords = text.split() - count = 0 - result = None - while count < len(aWords): - val = 0 - word = aWords[count] - next_next_word = None - if count + 1 < len(aWords): - next_word = aWords[count + 1] - if count + 2 < len(aWords): - next_next_word = aWords[count + 2] - else: - next_word = None - - # is current word a number? - if word in _PT_NUMBERS: - val = _PT_NUMBERS[word] - elif word.isdigit(): # doesn't work with decimals - val = int(word) - elif is_numeric(word): - val = float(word) - elif isFractional_pt(word): - if not result: - result = 1 - result = result * isFractional_pt(word) - count += 1 - continue - - if not val: - # look for fractions like "2/3" - aPieces = word.split('/') - # if (len(aPieces) == 2 and is_numeric(aPieces[0]) - # and is_numeric(aPieces[1])): - if look_for_fractions(aPieces): - val = float(aPieces[0]) / float(aPieces[1]) - - if val: - if result is None: - result = 0 - # handle fractions - if next_word != "avos": - result += val - else: - result = float(result) / float(val) - - if next_word is None: - break - - # number word and fraction - ands = ["e"] - if next_word in ands: - zeros = 0 - if result is None: - count += 1 - continue - newWords = aWords[count + 2:] - newText = "" - for word in newWords: - newText += word + " " - - afterAndVal = extractnumber_pt(newText[:-1]) - if afterAndVal: - if result < afterAndVal or result < 20: - while afterAndVal > 1: - afterAndVal = afterAndVal / 10.0 - for word in newWords: - if word == "zero" or word == "0": - zeros += 1 - else: - break - for _ in range(0, zeros): - afterAndVal = afterAndVal / 10.0 - result += afterAndVal - break - elif next_next_word is not None: - if next_next_word in ands: - newWords = aWords[count + 3:] - newText = "" - for word in newWords: - newText += word + " " - afterAndVal = extractnumber_pt(newText[:-1]) - if afterAndVal: - if result is None: - result = 0 - result += afterAndVal - break - - decimals = ["ponto", "virgula", "vírgula", ".", ","] - if next_word in decimals: - zeros = 0 - newWords = aWords[count + 2:] - newText = "" - for word in newWords: - newText += word + " " - for word in newWords: - if word == "zero" or word == "0": - zeros += 1 - else: - break - afterDotVal = str(extractnumber_pt(newText[:-1])) - afterDotVal = zeros * "0" + afterDotVal - result = float(str(result) + "." + afterDotVal) - break - count += 1 - - if result is None: - return False - - # Return the $str with the number related words removed - # (now empty strings, so strlen == 0) - # aWords = [word for word in aWords if len(word) > 0] - # text = ' '.join(aWords) - if "." in str(result): - integer, dec = str(result).split(".") - # cast float to int - if dec == "0": - result = int(integer) - - return result - - -def pt_number_parse(words, i): - def pt_cte(i, s): - if i < len(words) and s == words[i]: - return s, i + 1 - return None - - def pt_number_word(i, mi, ma): - if i < len(words): - v = _PT_NUMBERS.get(words[i]) - if v and v >= mi and v <= ma: - return v, i + 1 - return None - - def pt_number_1_99(i): - r1 = pt_number_word(i, 1, 29) - if r1: - return r1 - - r1 = pt_number_word(i, 30, 90) - if r1: - v1, i1 = r1 - r2 = pt_cte(i1, "e") - if r2: - i2 = r2[1] - r3 = pt_number_word(i2, 1, 9) - if r3: - v3, i3 = r3 - return v1 + v3, i3 - return r1 - return None - - def pt_number_1_999(i): - # [2-9]cientos [1-99]? - r1 = pt_number_word(i, 100, 900) - if r1: - v1, i1 = r1 - r2 = pt_number_1_99(i1) - if r2: - v2, i2 = r2 - return v1 + v2, i2 - else: - return r1 - - # [1-99] - r1 = pt_number_1_99(i) - if r1: - return r1 - - return None - - def pt_number(i): - # check for cero - r1 = pt_number_word(i, 0, 0) - if r1: - return r1 - - # check for [1-999] (mil [0-999])? - r1 = pt_number_1_999(i) - if r1: - v1, i1 = r1 - r2 = pt_cte(i1, "mil") - if r2: - i2 = r2[1] - r3 = pt_number_1_999(i2) - if r3: - v3, i3 = r3 - return v1 * 1000 + v3, i3 - else: - return v1 * 1000, i2 - else: - return r1 - return None - - return pt_number(i) - - -def normalize_pt(text, remove_articles): - """ PT string normalization """ - - words = text.split() # this also removed extra spaces - normalized = "" - # Contractions are not common in PT - - # Convert numbers into digits, e.g. "dois" -> "2" - normalized = "" - i = 0 - while i < len(words): - word = words[i] - # remove articles - if remove_articles and word in _PT_ARTICLES: - i += 1 - continue - - # Convert numbers into digits - r = pt_number_parse(words, i) - if r: - v, i = r - normalized += " " + str(v) - continue - - # NOTE temporary , handle some numbers above >999 - if word in _PT_NUMBERS: - word = str(_PT_NUMBERS[word]) - # end temporary - - normalized += " " + word - i += 1 - # some articles in pt-pt can not be removed, but many words can - # this is experimental and some meaning may be lost - # maybe agressive should default to False - # only usage will tell, as a native speaker this seems reasonable - return pt_pruning(normalized[1:], agressive=remove_articles) - - -def extract_datetime_pt(input_str, currentDate, default_time): - def clean_string(s): - # cleans the input string of unneeded punctuation and capitalization - # among other things - symbols = [".", ",", ";", "?", "!", "º", "ª"] - noise_words = ["o", "os", "a", "as", "do", "da", "dos", "das", "de", - "ao", "aos"] - - for word in symbols: - s = s.replace(word, "") - for word in noise_words: - s = s.replace(" " + word + " ", " ") - s = s.lower().replace( - "á", - "a").replace( - "ç", - "c").replace( - "à", - "a").replace( - "ã", - "a").replace( - "é", - "e").replace( - "è", - "e").replace( - "ê", - "e").replace( - "ó", - "o").replace( - "ò", - "o").replace( - "-", - " ").replace( - "_", - "") - # handle synonims and equivalents, "tomorrow early = tomorrow morning - synonims = {"manha": ["manhazinha", "cedo", "cedinho"], - "tarde": ["tardinha", "tarde"], - "noite": ["noitinha", "anoitecer"], - "todos": ["ao", "aos"], - "em": ["do", "da", "dos", "das", "de"]} - for syn in synonims: - for word in synonims[syn]: - s = s.replace(" " + word + " ", " " + syn + " ") - # relevant plurals, cant just extract all s in pt - wordlist = ["manhas", "noites", "tardes", "dias", "semanas", "anos", - "minutos", "segundos", "nas", "nos", "proximas", - "seguintes", "horas"] - for _, word in enumerate(wordlist): - s = s.replace(word, word.rstrip('s')) - s = s.replace("meses", "mes").replace("anteriores", "anterior") - return s - - def date_found(): - return found or \ - ( - datestr != "" or timeStr != "" or - yearOffset != 0 or monthOffset != 0 or - dayOffset is True or hrOffset != 0 or - hrAbs or minOffset != 0 or - minAbs or secOffset != 0 - ) - - if input_str == "" or not currentDate: - return None - - found = False - daySpecified = False - dayOffset = False - monthOffset = 0 - yearOffset = 0 - dateNow = currentDate - today = dateNow.strftime("%w") - currentYear = dateNow.strftime("%Y") - fromFlag = False - datestr = "" - hasYear = False - timeQualifier = "" - - words = clean_string(input_str).split(" ") - timeQualifiersList = ['manha', 'tarde', 'noite'] - time_indicators = ["em", "as", "nas", "pelas", "volta", "depois", "estas", - "no", "dia", "hora"] - days = ['segunda', 'terca', 'quarta', - 'quinta', 'sexta', 'sabado', 'domingo'] - months = ['janeiro', 'febreiro', 'marco', 'abril', 'maio', 'junho', - 'julho', 'agosto', 'setembro', 'outubro', 'novembro', - 'dezembro'] - monthsShort = ['jan', 'feb', 'mar', 'abr', 'mai', 'jun', 'jul', 'ag', - 'set', 'out', 'nov', 'dec'] - nexts = ["proximo", "proxima"] - suffix_nexts = ["seguinte", "subsequente", "seguir"] - lasts = ["ultimo", "ultima"] - suffix_lasts = ["passada", "passado", "anterior", "antes"] - nxts = ["depois", "seguir", "seguida", "seguinte", "proxima", "proximo"] - prevs = ["antes", "ante", "previa", "previamente", "anterior"] - froms = ["partir", "em", "para", "na", "no", "daqui", "seguir", - "depois", "por", "proxima", "proximo", "da", "do", "de"] - thises = ["este", "esta", "deste", "desta", "neste", "nesta", "nesse", - "nessa"] - froms += thises - lists = nxts + prevs + froms + time_indicators - for idx, word in enumerate(words): - if word == "": - continue - wordPrevPrev = words[idx - 2] if idx > 1 else "" - wordPrev = words[idx - 1] if idx > 0 else "" - wordNext = words[idx + 1] if idx + 1 < len(words) else "" - wordNextNext = words[idx + 2] if idx + 2 < len(words) else "" - wordNextNextNext = words[idx + 3] if idx + 3 < len(words) else "" - - start = idx - used = 0 - # save timequalifier for later - if word in timeQualifiersList: - timeQualifier = word - - # parse today, tomorrow, yesterday - elif word == "hoje" and not fromFlag: - dayOffset = 0 - used += 1 - elif word == "amanha" and not fromFlag: - dayOffset = 1 - used += 1 - elif word == "ontem" and not fromFlag: - dayOffset -= 1 - used += 1 - # "before yesterday" and "before before yesterday" - elif (word == "anteontem" or - (word == "ante" and wordNext == "ontem")) and not fromFlag: - dayOffset -= 2 - used += 1 - if wordNext == "ontem": - used += 1 - elif word == "ante" and wordNext == "ante" and wordNextNext == \ - "ontem" and not fromFlag: - dayOffset -= 3 - used += 3 - elif word == "anteanteontem" and not fromFlag: - dayOffset -= 3 - used += 1 - # day after tomorrow - elif word == "depois" and wordNext == "amanha" and not fromFlag: - dayOffset += 2 - used = 2 - # day before yesterday - elif word == "antes" and wordNext == "ontem" and not fromFlag: - dayOffset -= 2 - used = 2 - # parse 5 days, 10 weeks, last week, next week, week after - elif word == "dia": - if wordNext == "depois" or wordNext == "antes": - used += 1 - if wordPrev and wordPrev[0].isdigit(): - dayOffset += int(wordPrev) - start -= 1 - used += 1 - elif (wordPrev and wordPrev[0].isdigit() and - wordNext not in months and - wordNext not in monthsShort): - dayOffset += int(wordPrev) - start -= 1 - used += 2 - elif wordNext and wordNext[0].isdigit() and wordNextNext not in \ - months and wordNextNext not in monthsShort: - dayOffset += int(wordNext) - start -= 1 - used += 2 - - elif word == "semana" and not fromFlag: - if wordPrev[0].isdigit(): - dayOffset += int(wordPrev) * 7 - start -= 1 - used = 2 - for w in nexts: - if wordPrev == w: - dayOffset = 7 - start -= 1 - used = 2 - for w in lasts: - if wordPrev == w: - dayOffset = -7 - start -= 1 - used = 2 - for w in suffix_nexts: - if wordNext == w: - dayOffset = 7 - start -= 1 - used = 2 - for w in suffix_lasts: - if wordNext == w: - dayOffset = -7 - start -= 1 - used = 2 - # parse 10 months, next month, last month - elif word == "mes" and not fromFlag: - if wordPrev[0].isdigit(): - monthOffset = int(wordPrev) - start -= 1 - used = 2 - for w in nexts: - if wordPrev == w: - monthOffset = 7 - start -= 1 - used = 2 - for w in lasts: - if wordPrev == w: - monthOffset = -7 - start -= 1 - used = 2 - for w in suffix_nexts: - if wordNext == w: - monthOffset = 7 - start -= 1 - used = 2 - for w in suffix_lasts: - if wordNext == w: - monthOffset = -7 - start -= 1 - used = 2 - # parse 5 years, next year, last year - elif word == "ano" and not fromFlag: - if wordPrev[0].isdigit(): - yearOffset = int(wordPrev) - start -= 1 - used = 2 - for w in nexts: - if wordPrev == w: - yearOffset = 7 - start -= 1 - used = 2 - for w in lasts: - if wordPrev == w: - yearOffset = -7 - start -= 1 - used = 2 - for w in suffix_nexts: - if wordNext == w: - yearOffset = 7 - start -= 1 - used = 2 - for w in suffix_lasts: - if wordNext == w: - yearOffset = -7 - start -= 1 - used = 2 - # parse Monday, Tuesday, etc., and next Monday, - # last Tuesday, etc. - elif word in days and not fromFlag: - - d = days.index(word) - dayOffset = (d + 1) - int(today) - used = 1 - if dayOffset < 0: - dayOffset += 7 - for w in nexts: - if wordPrev == w: - dayOffset += 7 - used += 1 - start -= 1 - for w in lasts: - if wordPrev == w: - dayOffset -= 7 - used += 1 - start -= 1 - for w in suffix_nexts: - if wordNext == w: - dayOffset += 7 - used += 1 - start -= 1 - for w in suffix_lasts: - if wordNext == w: - dayOffset -= 7 - used += 1 - start -= 1 - if wordNext == "feira": - used += 1 - # parse 15 of July, June 20th, Feb 18, 19 of February - elif word in months or word in monthsShort: - try: - m = months.index(word) - except ValueError: - m = monthsShort.index(word) - used += 1 - datestr = months[m] - if wordPrev and wordPrev[0].isdigit(): - # 13 maio - datestr += " " + wordPrev - start -= 1 - used += 1 - if wordNext and wordNext[0].isdigit(): - datestr += " " + wordNext - used += 1 - hasYear = True - else: - hasYear = False - - elif wordNext and wordNext[0].isdigit(): - # maio 13 - datestr += " " + wordNext - used += 1 - if wordNextNext and wordNextNext[0].isdigit(): - datestr += " " + wordNextNext - used += 1 - hasYear = True - else: - hasYear = False - - elif wordPrevPrev and wordPrevPrev[0].isdigit(): - # 13 dia maio - datestr += " " + wordPrevPrev - - start -= 2 - used += 2 - if wordNext and word[0].isdigit(): - datestr += " " + wordNext - used += 1 - hasYear = True - else: - hasYear = False - - elif wordNextNext and wordNextNext[0].isdigit(): - # maio dia 13 - datestr += " " + wordNextNext - used += 2 - if wordNextNextNext and wordNextNextNext[0].isdigit(): - datestr += " " + wordNextNextNext - used += 1 - hasYear = True - else: - hasYear = False - - if datestr in months: - datestr = "" - - # parse 5 days from tomorrow, 10 weeks from next thursday, - # 2 months from July - validFollowups = days + months + monthsShort - validFollowups.append("hoje") - validFollowups.append("amanha") - validFollowups.append("ontem") - validFollowups.append("anteontem") - validFollowups.append("agora") - validFollowups.append("ja") - validFollowups.append("ante") - - # TODO debug word "depois" that one is failing for some reason - if word in froms and wordNext in validFollowups: - - if not (wordNext == "amanha" and wordNext == "ontem") and not ( - word == "depois" or word == "antes" or word == "em"): - used = 2 - fromFlag = True - if wordNext == "amanha" and word != "depois": - dayOffset += 1 - elif wordNext == "ontem": - dayOffset -= 1 - elif wordNext == "anteontem": - dayOffset -= 2 - elif wordNext == "ante" and wordNextNext == "ontem": - dayOffset -= 2 - elif (wordNext == "ante" and wordNext == "ante" and - wordNextNextNext == "ontem"): - dayOffset -= 3 - elif wordNext in days: - d = days.index(wordNext) - tmpOffset = (d + 1) - int(today) - used = 2 - if wordNextNext == "feira": - used += 1 - if tmpOffset < 0: - tmpOffset += 7 - if wordNextNext: - if wordNextNext in nxts: - tmpOffset += 7 - used += 1 - elif wordNextNext in prevs: - tmpOffset -= 7 - used += 1 - dayOffset += tmpOffset - elif wordNextNext and wordNextNext in days: - d = days.index(wordNextNext) - tmpOffset = (d + 1) - int(today) - used = 3 - if wordNextNextNext: - if wordNextNextNext in nxts: - tmpOffset += 7 - used += 1 - elif wordNextNextNext in prevs: - tmpOffset -= 7 - used += 1 - dayOffset += tmpOffset - if wordNextNextNext == "feira": - used += 1 - if wordNext in months: - used -= 1 - if used > 0: - - if start - 1 > 0 and words[start - 1] in lists: - start -= 1 - used += 1 - - for i in range(0, used): - words[i + start] = "" - - if start - 1 >= 0 and words[start - 1] in lists: - words[start - 1] = "" - found = True - daySpecified = True - - # parse time - timeStr = "" - hrOffset = 0 - minOffset = 0 - secOffset = 0 - hrAbs = None - minAbs = None - military = False - - for idx, word in enumerate(words): - if word == "": - continue - - wordPrevPrev = words[idx - 2] if idx > 1 else "" - wordPrev = words[idx - 1] if idx > 0 else "" - wordNext = words[idx + 1] if idx + 1 < len(words) else "" - wordNextNext = words[idx + 2] if idx + 2 < len(words) else "" - wordNextNextNext = words[idx + 3] if idx + 3 < len(words) else "" - # parse noon, midnight, morning, afternoon, evening - used = 0 - if word == "meio" and wordNext == "dia": - hrAbs = 12 - used += 2 - elif word == "meia" and wordNext == "noite": - hrAbs = 0 - used += 2 - elif word == "manha": - if not hrAbs: - hrAbs = 8 - used += 1 - elif word == "tarde": - if not hrAbs: - hrAbs = 15 - used += 1 - elif word == "meio" and wordNext == "tarde": - if not hrAbs: - hrAbs = 17 - used += 2 - elif word == "meio" and wordNext == "manha": - if not hrAbs: - hrAbs = 10 - used += 2 - elif word == "fim" and wordNext == "tarde": - if not hrAbs: - hrAbs = 19 - used += 2 - elif word == "fim" and wordNext == "manha": - if not hrAbs: - hrAbs = 11 - used += 2 - elif word == "tantas" and wordNext == "manha": - if not hrAbs: - hrAbs = 4 - used += 2 - elif word == "noite": - if not hrAbs: - hrAbs = 22 - used += 1 - # parse half an hour, quarter hour - elif word == "hora" and \ - (wordPrev in time_indicators or wordPrevPrev in - time_indicators): - if wordPrev == "meia": - minOffset = 30 - elif wordPrev == "quarto": - minOffset = 15 - elif wordPrevPrev == "quarto": - minOffset = 15 - if idx > 2 and words[idx - 3] in time_indicators: - words[idx - 3] = "" - words[idx - 2] = "" - else: - hrOffset = 1 - if wordPrevPrev in time_indicators: - words[idx - 2] = "" - words[idx - 1] = "" - used += 1 - hrAbs = -1 - minAbs = -1 - # parse 5:00 am, 12:00 p.m., etc - elif word[0].isdigit(): - isTime = True - strHH = "" - strMM = "" - remainder = "" - if ':' in word: - # parse colons - # "3:00 in the morning" - stage = 0 - length = len(word) - for i in range(length): - if stage == 0: - if word[i].isdigit(): - strHH += word[i] - elif word[i] == ":": - stage = 1 - else: - stage = 2 - i -= 1 - elif stage == 1: - if word[i].isdigit(): - strMM += word[i] - else: - stage = 2 - i -= 1 - elif stage == 2: - remainder = word[i:].replace(".", "") - break - if remainder == "": - nextWord = wordNext.replace(".", "") - if nextWord == "am" or nextWord == "pm": - remainder = nextWord - used += 1 - elif wordNext == "manha": - remainder = "am" - used += 1 - elif wordNext == "tarde": - remainder = "pm" - used += 1 - elif wordNext == "noite": - if 0 < int(word[0]) < 6: - remainder = "am" - else: - remainder = "pm" - used += 1 - elif wordNext in thises and wordNextNext == "manha": - remainder = "am" - used = 2 - elif wordNext in thises and wordNextNext == "tarde": - remainder = "pm" - used = 2 - elif wordNext in thises and wordNextNext == "noite": - remainder = "pm" - used = 2 - else: - if timeQualifier != "": - military = True - if strHH <= 12 and \ - (timeQualifier == "manha" or - timeQualifier == "tarde"): - strHH += 12 - - else: - # try to parse # s without colons - # 5 hours, 10 minutes etc. - length = len(word) - strNum = "" - remainder = "" - for i in range(length): - if word[i].isdigit(): - strNum += word[i] - else: - remainder += word[i] - - if remainder == "": - remainder = wordNext.replace(".", "").lstrip().rstrip() - - if ( - remainder == "pm" or - wordNext == "pm" or - remainder == "p.m." or - wordNext == "p.m."): - strHH = strNum - remainder = "pm" - used = 1 - elif ( - remainder == "am" or - wordNext == "am" or - remainder == "a.m." or - wordNext == "a.m."): - strHH = strNum - remainder = "am" - used = 1 - else: - if (wordNext == "pm" or - wordNext == "p.m." or - wordNext == "tarde"): - strHH = strNum - remainder = "pm" - used = 1 - elif (wordNext == "am" or - wordNext == "a.m." or - wordNext == "manha"): - strHH = strNum - remainder = "am" - used = 1 - elif (int(word) > 100 and - ( - wordPrev == "o" or - wordPrev == "oh" or - wordPrev == "zero" - )): - # 0800 hours (pronounced oh-eight-hundred) - strHH = int(word) / 100 - strMM = int(word) - strHH * 100 - military = True - if wordNext == "hora": - used += 1 - elif ( - wordNext == "hora" and - word[0] != '0' and - ( - int(word) < 100 and - int(word) > 2400 - )): - # ignores military time - # "in 3 hours" - hrOffset = int(word) - used = 2 - isTime = False - hrAbs = -1 - minAbs = -1 - - elif wordNext == "minuto": - # "in 10 minutes" - minOffset = int(word) - used = 2 - isTime = False - hrAbs = -1 - minAbs = -1 - elif wordNext == "segundo": - # in 5 seconds - secOffset = int(word) - used = 2 - isTime = False - hrAbs = -1 - minAbs = -1 - elif int(word) > 100: - strHH = int(word) / 100 - strMM = int(word) - strHH * 100 - military = True - if wordNext == "hora": - used += 1 - - elif wordNext == "" or ( - wordNext == "em" and wordNextNext == "ponto"): - strHH = word - strMM = 00 - if wordNext == "em" and wordNextNext == "ponto": - used += 2 - if wordNextNextNext == "tarde": - remainder = "pm" - used += 1 - elif wordNextNextNext == "manha": - remainder = "am" - used += 1 - elif wordNextNextNext == "noite": - if 0 > int(strHH) > 6: - remainder = "am" - else: - remainder = "pm" - used += 1 - - elif wordNext[0].isdigit(): - strHH = word - strMM = wordNext - military = True - used += 1 - if wordNextNext == "hora": - used += 1 - else: - isTime = False - - strHH = int(strHH) if strHH else 0 - strMM = int(strMM) if strMM else 0 - strHH = strHH + 12 if (remainder == "pm" and - 0 < strHH < 12) else strHH - strHH = strHH - 12 if (remainder == "am" and - 0 < strHH >= 12) else strHH - if strHH > 24 or strMM > 59: - isTime = False - used = 0 - if isTime: - hrAbs = strHH * 1 - minAbs = strMM * 1 - used += 1 - - if used > 0: - # removed parsed words from the sentence - for i in range(used): - words[idx + i] = "" - - if wordPrev == "em" or wordPrev == "ponto": - words[words.index(wordPrev)] = "" - - if idx > 0 and wordPrev in time_indicators: - words[idx - 1] = "" - if idx > 1 and wordPrevPrev in time_indicators: - words[idx - 2] = "" - - idx += used - 1 - found = True - - # check that we found a date - if not date_found: - return None - - if dayOffset is False: - dayOffset = 0 - - # perform date manipulation - - extractedDate = dateNow - extractedDate = extractedDate.replace(microsecond=0, - second=0, - minute=0, - hour=0) - if datestr != "": - en_months = ['january', 'february', 'march', 'april', 'may', 'june', - 'july', 'august', 'september', 'october', 'november', - 'december'] - en_monthsShort = ['jan', 'feb', 'mar', 'apr', 'may', 'june', 'july', - 'aug', - 'sept', 'oct', 'nov', 'dec'] - for idx, en_month in enumerate(en_months): - datestr = datestr.replace(months[idx], en_month) - for idx, en_month in enumerate(en_monthsShort): - datestr = datestr.replace(monthsShort[idx], en_month) - - temp = datetime.strptime(datestr, "%B %d") - if not hasYear: - temp = temp.replace(year=extractedDate.year) - if extractedDate < temp: - extractedDate = extractedDate.replace(year=int(currentYear), - month=int( - temp.strftime( - "%m")), - day=int(temp.strftime( - "%d"))) - else: - extractedDate = extractedDate.replace( - year=int(currentYear) + 1, - month=int(temp.strftime("%m")), - day=int(temp.strftime("%d"))) - else: - extractedDate = extractedDate.replace( - year=int(temp.strftime("%Y")), - month=int(temp.strftime("%m")), - day=int(temp.strftime("%d"))) - - if timeStr != "": - temp = datetime(timeStr) - extractedDate = extractedDate.replace(hour=temp.strftime("%H"), - minute=temp.strftime("%M"), - second=temp.strftime("%S")) - - if yearOffset != 0: - extractedDate = extractedDate + relativedelta(years=yearOffset) - if monthOffset != 0: - extractedDate = extractedDate + relativedelta(months=monthOffset) - if dayOffset != 0: - extractedDate = extractedDate + relativedelta(days=dayOffset) - if (hrAbs or 0) != -1 and (minAbs or 0) != -1: - if hrAbs is None and minAbs is None and default_time: - hrAbs = default_time.hour - minAbs = default_time.minute - extractedDate = extractedDate + relativedelta(hours=hrAbs or 0, - minutes=minAbs or 0) - if (hrAbs or minAbs) and datestr == "": - if not daySpecified and dateNow > extractedDate: - extractedDate = extractedDate + relativedelta(days=1) - if hrOffset != 0: - extractedDate = extractedDate + relativedelta(hours=hrOffset) - if minOffset != 0: - extractedDate = extractedDate + relativedelta(minutes=minOffset) - if secOffset != 0: - extractedDate = extractedDate + relativedelta(seconds=secOffset) - - resultStr = " ".join(words) - resultStr = ' '.join(resultStr.split()) - resultStr = pt_pruning(resultStr) - return [extractedDate, resultStr] - - -def pt_pruning(text, symbols=True, accents=True, agressive=True): - # agressive pt word pruning - words = ["a", "o", "os", "as", "de", "dos", "das", - "lhe", "lhes", "me", "e", "no", "nas", "na", "nos", "em", "para", - "este", - "esta", "deste", "desta", "neste", "nesta", "nesse", - "nessa", "foi", "que"] - if symbols: - symbols = [".", ",", ";", ":", "!", "?", "�", "�"] - for symbol in symbols: - text = text.replace(symbol, "") - text = text.replace("-", " ").replace("_", " ") - if accents: - accents = {"a": ["á", "à", "ã", "â"], - "e": ["ê", "è", "é"], - "i": ["í", "ì"], - "o": ["ò", "ó"], - "u": ["ú", "ù"], - "c": ["ç"]} - for char in accents: - for acc in accents[char]: - text = text.replace(acc, char) - if agressive: - text_words = text.split(" ") - for idx, word in enumerate(text_words): - if word in words: - text_words[idx] = "" - text = " ".join(text_words) - text = ' '.join(text.split()) - return text - - -def get_gender_pt(word, raw_string=""): - word = word.rstrip("s") - gender = None - words = raw_string.split(" ") - for idx, w in enumerate(words): - if w == word and idx != 0: - previous = words[idx - 1] - gender = get_gender_pt(previous) - break - if not gender: - if word[-1] == "a": - gender = "f" - if word[-1] == "o" or word[-1] == "e": - gender = "m" - return gender +from lingua_franca.lang.parse_pt import * diff --git a/mycroft/util/lang/parse_sv.py b/mycroft/util/lang/parse_sv.py index 3ad7e35db0..5137cdd415 100644 --- a/mycroft/util/lang/parse_sv.py +++ b/mycroft/util/lang/parse_sv.py @@ -13,765 +13,8 @@ # See the License for the specific language governing permissions and # limitations under the License. # -from datetime import datetime -from dateutil.relativedelta import relativedelta -from mycroft.util.lang.parse_common import is_numeric, look_for_fractions +"""File kept for backwards compatibility. - -def extractnumber_sv(text): - """ - This function prepares the given text for parsing by making - numbers consistent, getting rid of contractions, etc. - Args: - text (str): the string to normalize - Returns: - (int) or (float): The value of extracted number - """ - aWords = text.split() - and_pass = False - valPreAnd = False - val = False - count = 0 - while count < len(aWords): - word = aWords[count] - if is_numeric(word): - val = float(word) - elif word == "första": - val = 1 - elif word == "andra": - val = 2 - elif word == "tredje": - val = 3 - elif word == "fjärde": - val = 4 - elif word == "femte": - val = 5 - elif word == "sjätte": - val = 6 - elif is_fractional_sv(word): - val = is_fractional_sv(word) - else: - if word == "en": - val = 1 - if word == "ett": - val = 1 - elif word == "två": - val = 2 - elif word == "tre": - val = 3 - elif word == "fyra": - val = 4 - elif word == "fem": - val = 5 - elif word == "sex": - val = 6 - elif word == "sju": - val = 7 - elif word == "åtta": - val = 8 - elif word == "nio": - val = 9 - elif word == "tio": - val = 10 - if val: - if count < (len(aWords) - 1): - wordNext = aWords[count + 1] - else: - wordNext = "" - valNext = is_fractional_sv(wordNext) - - if valNext: - val = val * valNext - aWords[count + 1] = "" - - if not val: - # look for fractions like "2/3" - aPieces = word.split('/') - if look_for_fractions(aPieces): - val = float(aPieces[0]) / float(aPieces[1]) - elif and_pass: - # added to value, quit here - val = valPreAnd - break - else: - count += 1 - continue - - aWords[count] = "" - - if and_pass: - aWords[count - 1] = '' # remove "och" - val += valPreAnd - elif count + 1 < len(aWords) and aWords[count + 1] == 'och': - and_pass = True - valPreAnd = val - val = False - count += 2 - continue - elif count + 2 < len(aWords) and aWords[count + 2] == 'och': - and_pass = True - valPreAnd = val - val = False - count += 3 - continue - - break - - if not val: - return False - - return val - - -def extract_datetime_sv(string, currentDate, default_time): - def clean_string(s): - """ - cleans the input string of unneeded punctuation and capitalization - among other things. - """ - s = s.lower().replace('?', '').replace('.', '').replace(',', '') \ - .replace(' den ', ' ').replace(' en ', ' ') - wordList = s.split() - for idx, word in enumerate(wordList): - word = word.replace("'s", "") - - ordinals = ["rd", "st", "nd", "th"] - if word[0].isdigit(): - for ordinal in ordinals: - if ordinal in word: - word = word.replace(ordinal, "") - wordList[idx] = word - - return wordList - - def date_found(): - return found or \ - ( - datestr != "" or timeStr != "" or - yearOffset != 0 or monthOffset != 0 or - dayOffset is True or hrOffset != 0 or - hrAbs or minOffset != 0 or - minAbs or secOffset != 0 - ) - - if string == "" or not currentDate: - return None - - found = False - daySpecified = False - dayOffset = False - monthOffset = 0 - yearOffset = 0 - dateNow = currentDate - today = dateNow.strftime("%w") - currentYear = dateNow.strftime("%Y") - fromFlag = False - datestr = "" - hasYear = False - timeQualifier = "" - - timeQualifiersList = ['morgon', 'förmiddag', 'eftermiddag', 'kväll'] - markers = ['på', 'i', 'den här', 'kring', 'efter'] - days = ['måndag', 'tisdag', 'onsdag', 'torsdag', - 'fredag', 'lördag', 'söndag'] - months = ['januari', 'februari', 'mars', 'april', 'maj', 'juni', - 'juli', 'augusti', 'september', 'oktober', 'november', - 'december'] - monthsShort = ['jan', 'feb', 'mar', 'apr', 'may', 'june', 'july', 'aug', - 'sept', 'oct', 'nov', 'dec'] - - words = clean_string(string) - - for idx, word in enumerate(words): - if word == "": - continue - wordPrevPrev = words[idx - 2] if idx > 1 else "" - wordPrev = words[idx - 1] if idx > 0 else "" - wordNext = words[idx + 1] if idx + 1 < len(words) else "" - wordNextNext = words[idx + 2] if idx + 2 < len(words) else "" - - # this isn't in clean string because I don't want to save back to words - word = word.rstrip('s') - start = idx - used = 0 - # save timequalifier for later - if word in timeQualifiersList: - timeQualifier = word - # parse today, tomorrow, day after tomorrow - elif word == "idag" and not fromFlag: - dayOffset = 0 - used += 1 - elif word == "imorgon" and not fromFlag: - dayOffset = 1 - used += 1 - elif word == "morgondagen" or word == "morgondagens" and not fromFlag: - dayOffset = 1 - used += 1 - elif word == "övermorgon" and not fromFlag: - dayOffset = 2 - used += 1 - # parse 5 days, 10 weeks, last week, next week - elif word == "dag" or word == "dagar": - if wordPrev[0].isdigit(): - dayOffset += int(wordPrev) - start -= 1 - used = 2 - elif word == "vecka" or word == "veckor" and not fromFlag: - if wordPrev[0].isdigit(): - dayOffset += int(wordPrev) * 7 - start -= 1 - used = 2 - elif wordPrev == "nästa": - dayOffset = 7 - start -= 1 - used = 2 - elif wordPrev == "förra": - dayOffset = -7 - start -= 1 - used = 2 - # parse 10 months, next month, last month - elif word == "månad" and not fromFlag: - if wordPrev[0].isdigit(): - monthOffset = int(wordPrev) - start -= 1 - used = 2 - elif wordPrev == "nästa": - monthOffset = 1 - start -= 1 - used = 2 - elif wordPrev == "förra": - monthOffset = -1 - start -= 1 - used = 2 - # parse 5 years, next year, last year - elif word == "år" and not fromFlag: - if wordPrev[0].isdigit(): - yearOffset = int(wordPrev) - start -= 1 - used = 2 - elif wordPrev == "nästa": - yearOffset = 1 - start -= 1 - used = 2 - elif wordPrev == "förra": - yearOffset = -1 - start -= 1 - used = 2 - # parse Monday, Tuesday, etc., and next Monday, - # last Tuesday, etc. - elif word in days and not fromFlag: - d = days.index(word) - dayOffset = (d + 1) - int(today) - used = 1 - if dayOffset < 0: - dayOffset += 7 - if wordPrev == "nästa": - dayOffset += 7 - used += 1 - start -= 1 - elif wordPrev == "förra": - dayOffset -= 7 - used += 1 - start -= 1 - # parse 15 of July, June 20th, Feb 18, 19 of February - elif word in months or word in monthsShort and not fromFlag: - try: - m = months.index(word) - except ValueError: - m = monthsShort.index(word) - used += 1 - datestr = months[m] - if wordPrev and (wordPrev[0].isdigit() or - (wordPrev == "of" and wordPrevPrev[0].isdigit())): - if wordPrev == "of" and wordPrevPrev[0].isdigit(): - datestr += " " + words[idx - 2] - used += 1 - start -= 1 - else: - datestr += " " + wordPrev - start -= 1 - used += 1 - if wordNext and wordNext[0].isdigit(): - datestr += " " + wordNext - used += 1 - hasYear = True - else: - hasYear = False - - elif wordNext and wordNext[0].isdigit(): - datestr += " " + wordNext - used += 1 - if wordNextNext and wordNextNext[0].isdigit(): - datestr += " " + wordNextNext - used += 1 - hasYear = True - else: - hasYear = False - # parse 5 days from tomorrow, 10 weeks from next thursday, - # 2 months from July - validFollowups = days + months + monthsShort - validFollowups.append("idag") - validFollowups.append("imorgon") - validFollowups.append("nästa") - validFollowups.append("förra") - validFollowups.append("nu") - if (word == "från" or word == "efter") and wordNext in validFollowups: - used = 2 - fromFlag = True - if wordNext == "imorgon": - dayOffset += 1 - elif wordNext in days: - d = days.index(wordNext) - tmpOffset = (d + 1) - int(today) - used = 2 - if tmpOffset < 0: - tmpOffset += 7 - dayOffset += tmpOffset - elif wordNextNext and wordNextNext in days: - d = days.index(wordNextNext) - tmpOffset = (d + 1) - int(today) - used = 3 - if wordNext == "nästa": - tmpOffset += 7 - used += 1 - start -= 1 - elif wordNext == "förra": - tmpOffset -= 7 - used += 1 - start -= 1 - dayOffset += tmpOffset - if used > 0: - if start - 1 > 0 and words[start - 1] == "denna": - start -= 1 - used += 1 - - for i in range(0, used): - words[i + start] = "" - - if start - 1 >= 0 and words[start - 1] in markers: - words[start - 1] = "" - found = True - daySpecified = True - - # parse time - timeStr = "" - hrOffset = 0 - minOffset = 0 - secOffset = 0 - hrAbs = None - minAbs = None - - for idx, word in enumerate(words): - if word == "": - continue - - wordPrevPrev = words[idx - 2] if idx > 1 else "" - wordPrev = words[idx - 1] if idx > 0 else "" - wordNext = words[idx + 1] if idx + 1 < len(words) else "" - wordNextNext = words[idx + 2] if idx + 2 < len(words) else "" - # parse noon, midnight, morning, afternoon, evening - used = 0 - if word == "middag": - hrAbs = 12 - used += 1 - elif word == "midnatt": - hrAbs = 0 - used += 1 - elif word == "morgon": - if not hrAbs: - hrAbs = 8 - used += 1 - elif word == "förmiddag": - if not hrAbs: - hrAbs = 10 - used += 1 - elif word == "eftermiddag": - if not hrAbs: - hrAbs = 15 - used += 1 - elif word == "kväll": - if not hrAbs: - hrAbs = 19 - used += 1 - # parse half an hour, quarter hour - elif wordPrev in markers or wordPrevPrev in markers: - if word == "halvtimme" or word == "halvtimma": - minOffset = 30 - elif word == "kvart": - minOffset = 15 - elif word == "timme" or word == "timma": - hrOffset = 1 - words[idx - 1] = "" - used += 1 - hrAbs = -1 - minAbs = -1 - # parse 5:00 am, 12:00 p.m., etc - elif word[0].isdigit(): - isTime = True - strHH = "" - strMM = "" - remainder = "" - if ':' in word: - # parse colons - # "3:00 in the morning" - stage = 0 - length = len(word) - for i in range(length): - if stage == 0: - if word[i].isdigit(): - strHH += word[i] - elif word[i] == ":": - stage = 1 - else: - stage = 2 - i -= 1 - elif stage == 1: - if word[i].isdigit(): - strMM += word[i] - else: - stage = 2 - i -= 1 - elif stage == 2: - remainder = word[i:].replace(".", "") - break - if remainder == "": - nextWord = wordNext.replace(".", "") - if nextWord == "am" or nextWord == "pm": - remainder = nextWord - used += 1 - elif nextWord == "tonight": - remainder = "pm" - used += 1 - elif wordNext == "in" and wordNextNext == "the" and \ - words[idx + 3] == "morning": - remainder = "am" - used += 3 - elif wordNext == "in" and wordNextNext == "the" and \ - words[idx + 3] == "afternoon": - remainder = "pm" - used += 3 - elif wordNext == "in" and wordNextNext == "the" and \ - words[idx + 3] == "evening": - remainder = "pm" - used += 3 - elif wordNext == "in" and wordNextNext == "morning": - remainder = "am" - used += 2 - elif wordNext == "in" and wordNextNext == "afternoon": - remainder = "pm" - used += 2 - elif wordNext == "in" and wordNextNext == "evening": - remainder = "pm" - used += 2 - elif wordNext == "this" and wordNextNext == "morning": - remainder = "am" - used = 2 - elif wordNext == "this" and wordNextNext == "afternoon": - remainder = "pm" - used = 2 - elif wordNext == "this" and wordNextNext == "evening": - remainder = "pm" - used = 2 - elif wordNext == "at" and wordNextNext == "night": - if strHH > 5: - remainder = "pm" - else: - remainder = "am" - used += 2 - else: - if timeQualifier != "": - if strHH <= 12 and \ - (timeQualifier == "evening" or - timeQualifier == "afternoon"): - strHH += 12 - else: - # try to parse # s without colons - # 5 hours, 10 minutes etc. - length = len(word) - strNum = "" - remainder = "" - for i in range(length): - if word[i].isdigit(): - strNum += word[i] - else: - remainder += word[i] - - if remainder == "": - remainder = wordNext.replace(".", "").lstrip().rstrip() - - if ( - remainder == "pm" or - wordNext == "pm" or - remainder == "p.m." or - wordNext == "p.m."): - strHH = strNum - remainder = "pm" - used = 1 - elif ( - remainder == "am" or - wordNext == "am" or - remainder == "a.m." or - wordNext == "a.m."): - strHH = strNum - remainder = "am" - used = 1 - else: - if wordNext == "pm" or wordNext == "p.m.": - strHH = strNum - remainder = "pm" - used = 1 - elif wordNext == "am" or wordNext == "a.m.": - strHH = strNum - remainder = "am" - used = 1 - elif ( - int(word) > 100 and - ( - wordPrev == "o" or - wordPrev == "oh" - )): - # 0800 hours (pronounced oh-eight-hundred) - strHH = int(word) / 100 - strMM = int(word) - strHH * 100 - if wordNext == "hours": - used += 1 - elif ( - wordNext == "hours" and - word[0] != '0' and - ( - int(word) < 100 and - int(word) > 2400 - )): - # "in 3 hours" - hrOffset = int(word) - used = 2 - isTime = False - hrAbs = -1 - minAbs = -1 - - elif wordNext == "minutes": - # "in 10 minutes" - minOffset = int(word) - used = 2 - isTime = False - hrAbs = -1 - minAbs = -1 - elif wordNext == "seconds": - # in 5 seconds - secOffset = int(word) - used = 2 - isTime = False - hrAbs = -1 - minAbs = -1 - elif int(word) > 100: - strHH = int(word) / 100 - strMM = int(word) - strHH * 100 - if wordNext == "hours": - used += 1 - elif wordNext[0].isdigit(): - strHH = word - strMM = wordNext - used += 1 - if wordNextNext == "hours": - used += 1 - elif ( - wordNext == "" or wordNext == "o'clock" or - ( - wordNext == "in" and - ( - wordNextNext == "the" or - wordNextNext == timeQualifier - ) - )): - strHH = word - strMM = 00 - if wordNext == "o'clock": - used += 1 - if wordNext == "in" or wordNextNext == "in": - used += (1 if wordNext == "in" else 2) - if (wordNextNext and - wordNextNext in timeQualifier or - (words[words.index(wordNextNext) + 1] and - words[words.index(wordNextNext) + 1] in - timeQualifier)): - if (wordNextNext == "afternoon" or - (len(words) > - words.index(wordNextNext) + 1 and - words[words.index( - wordNextNext) + 1] == "afternoon")): - remainder = "pm" - if (wordNextNext == "evening" or - (len(words) > - (words.index(wordNextNext) + 1) and - words[words.index( - wordNextNext) + 1] == "evening")): - remainder = "pm" - if (wordNextNext == "morning" or - (len(words) > - words.index(wordNextNext) + 1 and - words[words.index( - wordNextNext) + 1] == "morning")): - remainder = "am" - else: - isTime = False - - strHH = int(strHH) if strHH else 0 - strMM = int(strMM) if strMM else 0 - strHH = strHH + 12 if remainder == "pm" and strHH < 12 else strHH - strHH = strHH - 12 if remainder == "am" and strHH >= 12 else strHH - if strHH > 24 or strMM > 59: - isTime = False - used = 0 - if isTime: - hrAbs = strHH * 1 - minAbs = strMM * 1 - used += 1 - if used > 0: - # removed parsed words from the sentence - for i in range(used): - words[idx + i] = "" - - if wordPrev == "o" or wordPrev == "oh": - words[words.index(wordPrev)] = "" - - if wordPrev == "early": - hrOffset = -1 - words[idx - 1] = "" - idx -= 1 - elif wordPrev == "late": - hrOffset = 1 - words[idx - 1] = "" - idx -= 1 - if idx > 0 and wordPrev in markers: - words[idx - 1] = "" - if idx > 1 and wordPrevPrev in markers: - words[idx - 2] = "" - - idx += used - 1 - found = True - - # check that we found a date - if not date_found: - return None - - if dayOffset is False: - dayOffset = 0 - - # perform date manipulation - - extractedDate = dateNow - extractedDate = extractedDate.replace(microsecond=0, - second=0, - minute=0, - hour=0) - if datestr != "": - temp = datetime.strptime(datestr, "%B %d") - if not hasYear: - temp = temp.replace(year=extractedDate.year) - if extractedDate < temp: - extractedDate = extractedDate.replace(year=int(currentYear), - month=int( - temp.strftime( - "%m")), - day=int(temp.strftime( - "%d"))) - else: - extractedDate = extractedDate.replace( - year=int(currentYear) + 1, - month=int(temp.strftime("%m")), - day=int(temp.strftime("%d"))) - else: - extractedDate = extractedDate.replace( - year=int(temp.strftime("%Y")), - month=int(temp.strftime("%m")), - day=int(temp.strftime("%d"))) - - if timeStr != "": - temp = datetime(timeStr) - extractedDate = extractedDate.replace(hour=temp.strftime("%H"), - minute=temp.strftime("%M"), - second=temp.strftime("%S")) - - if yearOffset != 0: - extractedDate = extractedDate + relativedelta(years=yearOffset) - if monthOffset != 0: - extractedDate = extractedDate + relativedelta(months=monthOffset) - if dayOffset != 0: - extractedDate = extractedDate + relativedelta(days=dayOffset) - - if hrAbs is None and minAbs is None and default_time: - hrAbs = default_time.hour - minAbs = default_time.minute - if hrAbs != -1 and minAbs != -1: - extractedDate = extractedDate + relativedelta(hours=hrAbs or 0, - minutes=minAbs or 0) - if (hrAbs or minAbs) and datestr == "": - if not daySpecified and dateNow > extractedDate: - extractedDate = extractedDate + relativedelta(days=1) - if hrOffset != 0: - extractedDate = extractedDate + relativedelta(hours=hrOffset) - if minOffset != 0: - extractedDate = extractedDate + relativedelta(minutes=minOffset) - if secOffset != 0: - extractedDate = extractedDate + relativedelta(seconds=secOffset) - for idx, word in enumerate(words): - if words[idx] == "and" and words[idx - 1] == "" and words[ - idx + 1] == "": - words[idx] = "" - - resultStr = " ".join(words) - resultStr = ' '.join(resultStr.split()) - return [extractedDate, resultStr] - - -def is_fractional_sv(input_str): - """ - This function takes the given text and checks if it is a fraction. - - Args: - input_str (str): the string to check if fractional - Returns: - (bool) or (float): False if not a fraction, otherwise the fraction - - """ - if input_str.endswith('ars', -3): - input_str = input_str[:len(input_str) - 3] # e.g. "femtedelar" - if input_str.endswith('ar', -2): - input_str = input_str[:len(input_str) - 2] # e.g. "femtedelar" - if input_str.endswith('a', -1): - input_str = input_str[:len(input_str) - 1] # e.g. "halva" - if input_str.endswith('s', -1): - input_str = input_str[:len(input_str) - 1] # e.g. "halva" - - aFrac = ["hel", "halv", "tredjedel", "fjärdedel", "femtedel", "sjättedel", - "sjundedel", "åttondel", "niondel", "tiondel", "elftedel", - "tolftedel"] - if input_str.lower() in aFrac: - return 1.0 / (aFrac.index(input_str) + 1) - if input_str == "kvart": - return 1.0 / 4 - if input_str == "trekvart": - return 3.0 / 4 - - return False - - -def normalize_sv(text, remove_articles): - """ English string normalization """ - - words = text.split() # this also removed extra spaces - normalized = '' - for word in words: - # Convert numbers into digits, e.g. "two" -> "2" - if word == 'en': - word = 'ett' - textNumbers = ["noll", "ett", "två", "tre", "fyra", "fem", "sex", - "sju", "åtta", "nio", "tio", "elva", "tolv", - "tretton", "fjorton", "femton", "sexton", - "sjutton", "arton", "nitton", "tjugo"] - if word in textNumbers: - word = str(textNumbers.index(word)) - - normalized += " " + word - - return normalized[1:] # strip the initial space +TODO: Remove in 20.02 +""" +from lingua_franca.lang.parse_sv import *