From f7b790b660f745ab17082752dc392c0a7286c4e1 Mon Sep 17 00:00:00 2001
From: SoloVeniaASaludar
Date: Fri, 5 May 2017 13:27:35 +0200
Subject: [PATCH] +

---
Review notes (git am ignores the text between "---" and the diffstat):
 - es_number_1_999(): bare hundreds such as "doscientos caballos" returned
   the unbound name i2 (UnboundLocalError); it now returns i1.
 - es_number_word(): the truth test "if v" rejected the value 0; it now
   tests "v is not None", and "cero" is added to es_numbers_xlat so the
   0..0 lookup in es_number() can match.
 - es_number(): a bare "mil" now counts as 1000 ("mil quinientos" -> 1500).
 - Regression tests for the three cases above; docstring and comments for
   es_parse() and its helpers.
 - The blob ids on the "index" lines predate these edits.
 - Blank context lines were restored from the hunk line counts.

 mycroft/util/parse.py   | 219 ++++++++++++++++++++++++++++++++++------
 test/util/test_parse.py |  65 ++++++++++++
 2 files changed, 250 insertions(+), 34 deletions(-)

diff --git a/mycroft/util/parse.py b/mycroft/util/parse.py
index 4e883047b3..eb665d7f1c 100644
--- a/mycroft/util/parse.py
+++ b/mycroft/util/parse.py
@@ -31,9 +31,11 @@ def normalize(text, lang="en-us", remove_articles=True):
     Returns:
         (str): The normalized string.
     """
-    if str(lang).lower().startswith("en"):
+
+    lang_lower = str(lang).lower()
+    if lang_lower.startswith("en"):
         return normalize_en(text, remove_articles)
-    elif str(lang).lower().startswith("es"):
+    elif lang_lower.startswith("es"):
         return normalize_es(text, remove_articles)
 
     # TODO: Normalization for other languages
@@ -109,54 +111,203 @@ def normalize_en(text, remove_articles):
     return normalized[1:]  # strip the initial space
 
 
-# TODO: it should be modular in indepent files
-# TODO: numbers greaters than 100
+####################################################################
+# Spanish normalization
+#
+# TODO: numbers greater than 999999
+####################################################################
 
-es_articles = ["el", "la", "los", "las", "un", "una", "unos", "unas"]
-es_numbers_0_9 = [
-    "cero", "uno", "dos", "tres", "cuatro",
-    "cinco", "seis", "siete", "ocho", "nueve"]
-es_numbers_10_29 = [
-    u"diez", u"once", u"doce", u"trece", u"catorce",
-    u"quince", u"dieciséis", u"diecisiete",
-    u"dieciocho", u"diecinueve",
-    u"veinte", u"veintiuno", u"veintidós",
-    u"veintitrés", u"veinticuatro",
-    u"veinticinco", u"veintiséis", u"veintisiete",
-    u"veintiocho", u"veintinueve"]
-es_numbers_10n = ["treinta", "cuarenta", "cincuenta", "sesenta",
-                  "setenta", "ochenta", "noventa"]
+# Indefinite articles ["un", "una", "unos", "unas"] cannot be removed:
+# in Spanish, "un caballo" means "a horse" or "one horse".
+es_articles = ["el", "la", "los", "las"]
+
+es_numbers_xlat = {
+    "cero": 0,
+    "un": 1,
+    "uno": 1,
+    "una": 1,
+    "dos": 2,
+    "tres": 3,
+    "cuatro": 4,
+    "cinco": 5,
+    "seis": 6,
+    "siete": 7,
+    "ocho": 8,
+    "nueve": 9,
+    "diez": 10,
+    "once": 11,
+    "doce": 12,
+    "trece": 13,
+    "catorce": 14,
+    "quince": 15,
+    u"dieciséis": 16,
+    "diecisiete": 17,
+    "dieciocho": 18,
+    "diecinueve": 19,
+    "veinte": 20,
+    "veintiuno": 21,
+    u"veintidós": 22,
+    u"veintitrés": 23,
+    "veinticuatro": 24,
+    "veinticinco": 25,
+    u"veintiséis": 26,
+    "veintisiete": 27,
+    "veintiocho": 28,
+    "veintinueve": 29,
+    "treinta": 30,
+    "cuarenta": 40,
+    "cincuenta": 50,
+    "sesenta": 60,
+    "setenta": 70,
+    "ochenta": 80,
+    "noventa": 90,
+    "cien": 100,
+    "ciento": 100,
+    "doscientos": 200,
+    "doscientas": 200,
+    "trescientos": 300,
+    "trescientas": 300,
+    "cuatrocientos": 400,
+    "cuatrocientas": 400,
+    "quinientos": 500,
+    "quinientas": 500,
+    "seiscientos": 600,
+    "seiscientas": 600,
+    "setecientos": 700,
+    "setecientas": 700,
+    "ochocientos": 800,
+    "ochocientas": 800,
+    "novecientos": 900,
+    "novecientas": 900}
+
+
+def es_parse(words, i):
+    """ Parse the Spanish number spelled out from words[i] onwards
+
+    Handles "cero" and the values 1 to 999999.
+
+    Args:
+        words (list): the words of the utterance
+        i (int): index of the first word to look at
+
+    Returns:
+        (int, int): value of the number and index of the first word
+                    after it, or None if no number starts at words[i]
+    """
+    def es_cte(i, s):
+        # the literal word s, if it is at position i
+        if i < len(words) and s == words[i]:
+            return s, i+1
+        return None
+
+    def es_number_word(i, mi, ma):
+        # a single number word whose value lies in [mi, ma]
+        if i < len(words):
+            v = es_numbers_xlat.get(words[i])
+            # "is not None": a plain truth test would reject 0 ("cero")
+            if v is not None and v >= mi and v <= ma:
+                return v, i+1
+        return None
+
+    def es_number_1_99(i):
+        # [1-29] | [30-90] (y [1-9])?
+        r1 = es_number_word(i, 1, 29)
+        if r1:
+            return r1
+
+        r1 = es_number_word(i, 30, 90)
+        if r1:
+            v1, i1 = r1
+            r2 = es_cte(i1, "y")
+            if r2:
+                v2, i2 = r2
+                r3 = es_number_word(i2, 1, 9)
+                if r3:
+                    v3, i3 = r3
+                    return v1+v3, i3
+            return r1
+        return None
+
+    def es_number_1_999(i):
+        # [2-9]cientos ([1-99])?
+        r1 = es_number_word(i, 200, 900)
+        if r1:
+            v1, i1 = r1
+            r2 = es_number_1_99(i1)
+            if r2:
+                v2, i2 = r2
+                return v1+v2, i2
+            else:
+                # bare hundreds: resume right after the hundreds word
+                return v1, i1
+
+        # ciento [1-99]
+        r1 = es_cte(i, "ciento")
+        if r1:
+            v1, i1 = r1
+            r2 = es_number_1_99(i1)
+            if r2:
+                v2, i2 = r2
+                return (100+v2, i2)
+
+        # 100
+        r1 = es_number_word(i, 100, 100)
+        if r1:
+            return r1
+
+        # [1-99]
+        r1 = es_number_1_99(i)
+        if r1:
+            return r1
+
+        return None
+
+    def es_number(i):
+        # cero
+        r1 = es_number_word(i, 0, 0)
+        if r1:
+            return r1
+
+        # ([1-999])? mil ([1-999])? -- a bare "mil" counts as 1000
+        r1 = es_number_1_999(i)
+        v1, i1 = r1 if r1 else (1, i)
+        r2 = es_cte(i1, "mil")
+        if r2:
+            v2, i2 = r2
+            r3 = es_number_1_999(i2)
+            if r3:
+                v3, i3 = r3
+                return v1*1000+v3, i3
+            return v1*1000, i2
+
+        # no "mil": r1 is a plain [1-999] match or None
+        return r1
+
+    return es_number(i)
 
 
 def normalize_es(text, remove_articles):
     """ Spanish string normalization """
 
     words = text.split()  # this also removed extra spaces
+
     normalized = ""
     i = 0
-
     while i < len(words):
         word = words[i]
-        i += 1
 
         if remove_articles and word in es_articles:
+            i += 1
             continue
 
-        # Convert numbers into digits: from 0 to 99
-        elif word in es_numbers_0_9:
-            word = str(es_numbers_0_9.index(word))
-
-        elif word in es_numbers_10_29:
-            word = str(es_numbers_10_29.index(word)+10)
-
-        elif word in es_numbers_10n:
-            n = es_numbers_10n.index(word)*10+30
-            if i+1 < len(words) and words[i] == "y" and \
-                    words[i+1] in es_numbers_0_9:
-                n += es_numbers_0_9.index(words[i+1])
-                i += 2
-            word = str(n)
+        # Convert numbers into digits
+        r = es_parse(words, i)
+        if r:
+            v, i = r
+            normalized += " " + str(v)
+            continue
 
         normalized += " " + word
+        i += 1
 
     return normalized[1:]  # strip the initial space
 
diff --git a/test/util/test_parse.py b/test/util/test_parse.py
index 7411798e76..a41ca118e0 100644
--- a/test/util/test_parse.py
+++ b/test/util/test_parse.py
@@ -1,3 +1,6 @@
+
+# -*- coding: iso-8859-15 -*-
+
 import unittest
 
 from mycroft.util.parse import normalize
@@ -193,6 +196,68 @@ class TestNormalize(unittest.TestCase):
         self.assertEqual(normalize("whats 8 + 4"),
                          "what is 8 + 4")
 
+    #
+    # Spanish
+    #
+    def test_articles_es(self):
+        self.assertEqual(normalize("esta es la prueba", lang="es",
+                                   remove_articles=True),
+                         "esta es prueba")
+        self.assertEqual(normalize("y otra prueba", lang="es",
+                                   remove_articles=True),
+                         "y otra prueba")
+
+    def test_numbers_es(self):
+        self.assertEqual(normalize("esto es un uno una", lang="es"),
+                         "esto es 1 1 1")
+        self.assertEqual(normalize("esto es dos tres prueba", lang="es"),
+                         "esto es 2 3 prueba")
+        self.assertEqual(normalize("esto es cuatro cinco seis prueba",
+                                   lang="es"),
+                         "esto es 4 5 6 prueba")
+        self.assertEqual(normalize("siete más ocho más nueve", lang="es"),
+                         "7 más 8 más 9")
+        self.assertEqual(normalize("diez once doce trece catorce quince",
+                                   lang="es"),
+                         "10 11 12 13 14 15")
+        self.assertEqual(normalize(u"dieciséis diecisiete", lang="es"),
+                         "16 17")
+        self.assertEqual(normalize(u"dieciocho diecinueve", lang="es"),
+                         "18 19")
+        self.assertEqual(normalize(u"veinte treinta cuarenta", lang="es"),
+                         "20 30 40")
+        self.assertEqual(normalize(u"treinta y dos caballos", lang="es"),
+                         "32 caballos")
+        self.assertEqual(normalize(u"cien caballos", lang="es"),
+                         "100 caballos")
+        self.assertEqual(normalize(u"ciento once caballos", lang="es"),
+                         "111 caballos")
+        self.assertEqual(normalize(u"había cuatrocientas una vacas",
+                                   lang="es"),
+                         u"había 401 vacas")
+        self.assertEqual(normalize(u"dos mil", lang="es"),
+                         "2000")
+        self.assertEqual(normalize(u"dos mil trescientas cuarenta y cinco",
+                                   lang="es"),
+                         "2345")
+        self.assertEqual(normalize(
+            u"ciento veintitrés mil cuatrocientas cincuenta y seis",
+            lang="es"),
+            "123456")
+        self.assertEqual(normalize(
+            u"quinientas veinticinco mil", lang="es"),
+            "525000")
+        self.assertEqual(normalize(
+            u"novecientos noventa y nueve mil novecientos noventa y nueve",
+            lang="es"),
+            "999999")
+        self.assertEqual(normalize(u"cero grados", lang="es"),
+                         "0 grados")
+        self.assertEqual(normalize(u"doscientos caballos", lang="es"),
+                         "200 caballos")
+        self.assertEqual(normalize(u"mil quinientos", lang="es"),
+                         "1500")
+
 
 if __name__ == "__main__":
     unittest.main()