pull/754/head
SoloVeniaASaludar 2017-05-05 13:27:35 +02:00 committed by Augusto Monteiro 'Sparky
parent 8232772ee6
commit f7b790b660
2 changed files with 228 additions and 34 deletions

View File

@ -31,9 +31,11 @@ def normalize(text, lang="en-us", remove_articles=True):
Returns: Returns:
(str): The normalized string. (str): The normalized string.
""" """
if str(lang).lower().startswith("en"):
lang_lower = str(lang).lower()
if lang_lower.startswith("en"):
return normalize_en(text, remove_articles) return normalize_en(text, remove_articles)
elif str(lang).lower().startswith("es"): elif lang_lower.startswith("es"):
return normalize_es(text, remove_articles) return normalize_es(text, remove_articles)
# TODO: Normalization for other languages # TODO: Normalization for other languages
@ -109,54 +111,187 @@ def normalize_en(text, remove_articles):
return normalized[1:] # strip the initial space return normalized[1:] # strip the initial space
####################################################################
# Spanish normalization
#
# TODO: numbers greater than 999999
####################################################################

# Indefinite articles ("un", "una", "unos", "unas") cannot be suppressed:
# in Spanish, "un caballo" means both "a horse" and "one horse".
es_articles = ["el", "la", "los", "las"]

# Value of every single-word Spanish number name understood by es_parse().
# Compound numbers ("treinta y dos", "dos mil ...") are assembled by the
# parser from these entries plus the connector words "y" and "mil".
es_numbers_xlat = {
    "cero": 0,
    "un": 1,
    "uno": 1,
    "una": 1,
    "dos": 2,
    "tres": 3,
    "cuatro": 4,
    "cinco": 5,
    "seis": 6,
    "siete": 7,
    "ocho": 8,
    "nueve": 9,
    "diez": 10,
    "once": 11,
    "doce": 12,
    "trece": 13,
    "catorce": 14,
    "quince": 15,
    u"dieciséis": 16,
    "diecisiete": 17,
    "dieciocho": 18,
    "diecinueve": 19,
    "veinte": 20,
    "veintiuno": 21,
    u"veintidós": 22,
    u"veintitrés": 23,
    "veinticuatro": 24,
    "veinticinco": 25,
    u"veintiséis": 26,
    "veintisiete": 27,
    "veintiocho": 28,
    "veintinueve": 29,
    "treinta": 30,
    "cuarenta": 40,
    "cincuenta": 50,
    "sesenta": 60,
    "setenta": 70,
    "ochenta": 80,
    "noventa": 90,
    "cien": 100,
    "ciento": 100,
    "doscientos": 200,
    "doscientas": 200,
    "trescientos": 300,
    "trescientas": 300,
    "cuatrocientos": 400,
    "cuatrocientas": 400,
    "quinientos": 500,
    "quinientas": 500,
    "seiscientos": 600,
    "seiscientas": 600,
    "setecientos": 700,
    "setecientas": 700,
    "ochocientos": 800,
    "ochocientas": 800,
    "novecientos": 900,
    "novecientas": 900,
}


def es_parse(words, i):
    """ Parse a spelled-out Spanish number starting at words[i].

    Args:
        words (list): the words of the sentence being normalized
        i (int): index of the word where parsing starts
    Returns:
        (tuple): (value, next_index) where value is the parsed integer and
                 next_index is the index of the first word that is not
                 part of the number, or None if no number starts at i.
    """
    def match_word(pos, expected):
        # Index following words[pos] if that word is `expected`, else None.
        if pos < len(words) and words[pos] == expected:
            return pos + 1
        return None

    def number_word(pos, lowest, highest):
        # Single number word whose value lies within [lowest, highest].
        if pos < len(words):
            value = es_numbers_xlat.get(words[pos])
            # Compare against None explicitly: 0 ("cero") is a valid value.
            if value is not None and lowest <= value <= highest:
                return value, pos + 1
        return None

    def number_1_99(pos):
        # [1-29] are single words
        simple = number_word(pos, 1, 29)
        if simple is not None:
            return simple
        # [30-90] optionally followed by "y" [1-9]
        tens = number_word(pos, 30, 90)
        if tens is None:
            return None
        tens_value, after_tens = tens
        after_y = match_word(after_tens, "y")
        if after_y is not None:
            units = number_word(after_y, 1, 9)
            if units is not None:
                return tens_value + units[0], units[1]
        # A dangling "y" is not part of the number, so it is not consumed.
        return tens

    def number_1_999(pos):
        # [2-9]cientos ([1-99])?
        hundreds = number_word(pos, 200, 900)
        if hundreds is not None:
            hundreds_value, after_hundreds = hundreds
            rest = number_1_99(after_hundreds)
            if rest is not None:
                return hundreds_value + rest[0], rest[1]
            # Nothing follows the hundreds word: resume right after it.
            return hundreds_value, after_hundreds
        # ciento [1-99]
        after_ciento = match_word(pos, "ciento")
        if after_ciento is not None:
            rest = number_1_99(after_ciento)
            if rest is not None:
                return 100 + rest[0], rest[1]
        # 100 ("cien", or a "ciento" not followed by [1-99])
        exact_hundred = number_word(pos, 100, 100)
        if exact_hundred is not None:
            return exact_hundred
        # [1-99]
        return number_1_99(pos)

    def number(pos):
        # cero
        zero = number_word(pos, 0, 0)
        if zero is not None:
            return zero
        # ([1-999])? (mil ([1-999])?)?
        leading = number_1_999(pos)
        if leading is None:
            # A bare "mil" stands for 1000 ("mil quinientos" == 1500).
            after_mil = match_word(pos, "mil")
            if after_mil is None:
                return None
            thousands = 1
        else:
            thousands, after_leading = leading
            after_mil = match_word(after_leading, "mil")
            if after_mil is None:
                return leading
        rest = number_1_999(after_mil)
        if rest is not None:
            return thousands * 1000 + rest[0], rest[1]
        return thousands * 1000, after_mil

    return number(i)
def normalize_es(text, remove_articles):
    """ Spanish string normalization

    Args:
        text (str): the string to normalize
        remove_articles (bool): whether to drop the words listed in
                                es_articles
    Returns:
        (str): the words of text joined by single spaces, with
               spelled-out numbers replaced by digits
    """
    words = text.split()  # this also removes extra spaces
    normalized = []
    i = 0
    while i < len(words):
        word = words[i]

        if remove_articles and word in es_articles:
            i += 1
            continue

        # Convert numbers into digits. A number may span several words,
        # so es_parse also reports the index at which to resume.
        parsed = es_parse(words, i)
        if parsed:
            value, i = parsed
            normalized.append(str(value))
            continue

        normalized.append(word)
        i += 1

    return " ".join(normalized)

View File

@ -1,3 +1,6 @@
# -*- coding: iso-8859-15 -*-
import unittest import unittest
from mycroft.util.parse import normalize from mycroft.util.parse import normalize
@ -193,6 +196,62 @@ class TestNormalize(unittest.TestCase):
self.assertEqual(normalize("whats 8 + 4"), "what is 8 + 4") self.assertEqual(normalize("whats 8 + 4"), "what is 8 + 4")
#
# Spanish
#
def test_articles_es(self):
    """Definite articles are dropped; other words are left untouched."""
    cases = [
        ("esta es la prueba", "esta es prueba"),
        ("y otra prueba", "y otra prueba"),
    ]
    for phrase, expected in cases:
        result = normalize(phrase, lang="es", remove_articles=True)
        self.assertEqual(result, expected)
def test_numbers_es(self):
    """Spelled-out Spanish numbers are rewritten as digits."""
    # (input phrase, expected normalization), checked in order
    cases = [
        ("esto es un uno una", "esto es 1 1 1"),
        ("esto es dos tres prueba", "esto es 2 3 prueba"),
        ("esto es cuatro cinco seis prueba", "esto es 4 5 6 prueba"),
        ("siete más ocho más nueve", "7 más 8 más 9"),
        ("diez once doce trece catorce quince", "10 11 12 13 14 15"),
        (u"dieciséis diecisiete", "16 17"),
        (u"dieciocho diecinueve", "18 19"),
        (u"veinte treinta cuarenta", "20 30 40"),
        (u"treinta y dos caballos", "32 caballos"),
        (u"cien caballos", "100 caballos"),
        (u"ciento once caballos", "111 caballos"),
        (u"había cuatrocientas una vacas", u"había 401 vacas"),
        (u"dos mil", "2000"),
        (u"dos mil trescientas cuarenta y cinco", "2345"),
        (u"ciento veintitrés mil cuatrocientas cincuenta y seis",
         "123456"),
        (u"quinientas veinticinco mil", "525000"),
        (u"novecientos noventa y nueve mil novecientos noventa y nueve",
         "999999"),
    ]
    for phrase, expected in cases:
        self.assertEqual(normalize(phrase, lang="es"), expected)
# Run the test suite when this file is executed directly.
if __name__ == "__main__":
    unittest.main()