pull/754/head
parent
8232772ee6
commit
f7b790b660
|
@ -31,9 +31,11 @@ def normalize(text, lang="en-us", remove_articles=True):
|
||||||
Returns:
|
Returns:
|
||||||
(str): The normalized string.
|
(str): The normalized string.
|
||||||
"""
|
"""
|
||||||
if str(lang).lower().startswith("en"):
|
|
||||||
|
lang_lower = str(lang).lower()
|
||||||
|
if lang_lower.startswith("en"):
|
||||||
return normalize_en(text, remove_articles)
|
return normalize_en(text, remove_articles)
|
||||||
elif str(lang).lower().startswith("es"):
|
elif lang_lower.startswith("es"):
|
||||||
return normalize_es(text, remove_articles)
|
return normalize_es(text, remove_articles)
|
||||||
|
|
||||||
# TODO: Normalization for other languages
|
# TODO: Normalization for other languages
|
||||||
|
@ -109,54 +111,187 @@ def normalize_en(text, remove_articles):
|
||||||
return normalized[1:] # strip the initial space
|
return normalized[1:] # strip the initial space
|
||||||
|
|
||||||
|
|
||||||
# TODO: it should be modular, in independent files
|
####################################################################
|
||||||
# TODO: numbers greaters than 100
|
# Spanish normalization
|
||||||
|
#
|
||||||
|
# TODO: numbers greater than 999999
|
||||||
|
####################################################################
|
||||||
|
|
||||||
es_articles = ["el", "la", "los", "las", "un", "una", "unos", "unas"]
|
# Indefinite articles ["un", "una", "unos", "unas"] cannot be suppressed:
|
||||||
es_numbers_0_9 = [
|
# in Spanish, "un caballo" means "a horse" or "one horse".
|
||||||
"cero", "uno", "dos", "tres", "cuatro",
|
es_articles = ["el", "la", "los", "las"]
|
||||||
"cinco", "seis", "siete", "ocho", "nueve"]
|
|
||||||
es_numbers_10_29 = [
|
es_numbers_xlat = {
|
||||||
u"diez", u"once", u"doce", u"trece", u"catorce",
|
"un": 1,
|
||||||
u"quince", u"dieciséis", u"diecisiete",
|
"uno": 1,
|
||||||
u"dieciocho", u"diecinueve",
|
"una": 1,
|
||||||
u"veinte", u"veintiuno", u"veintidós",
|
"dos": 2,
|
||||||
u"veintitrés", u"veinticuatro",
|
"tres": 3,
|
||||||
u"veinticinco", u"veintiséis", u"veintisiete",
|
"cuatro": 4,
|
||||||
u"veintiocho", u"veintinueve"]
|
"cinco": 5,
|
||||||
es_numbers_10n = ["treinta", "cuarenta", "cincuenta", "sesenta",
|
"seis": 6,
|
||||||
"setenta", "ochenta", "noventa"]
|
"siete": 7,
|
||||||
|
"ocho": 8,
|
||||||
|
"nueve": 9,
|
||||||
|
"diez": 10,
|
||||||
|
"once": 11,
|
||||||
|
"doce": 12,
|
||||||
|
"trece": 13,
|
||||||
|
"catorce": 14,
|
||||||
|
"quince": 15,
|
||||||
|
u"dieciséis": 16,
|
||||||
|
"diecisiete": 17,
|
||||||
|
"dieciocho": 18,
|
||||||
|
"diecinueve": 19,
|
||||||
|
"veinte": 20,
|
||||||
|
"veintiuno": 21,
|
||||||
|
u"veintidós": 22,
|
||||||
|
u"veintitrés": 23,
|
||||||
|
"veinticuatro": 24,
|
||||||
|
"veinticinco": 25,
|
||||||
|
u"veintiséis": 26,
|
||||||
|
"veintisiete": 27,
|
||||||
|
"veintiocho": 28,
|
||||||
|
"veintinueve": 29,
|
||||||
|
"treinta": 30,
|
||||||
|
"cuarenta": 40,
|
||||||
|
"cincuenta": 50,
|
||||||
|
"sesenta": 60,
|
||||||
|
"setenta": 70,
|
||||||
|
"ochenta": 80,
|
||||||
|
"noventa": 90,
|
||||||
|
"cien": 100,
|
||||||
|
"ciento": 100,
|
||||||
|
"doscientos": 200,
|
||||||
|
"doscientas": 200,
|
||||||
|
"trescientos": 300,
|
||||||
|
"trescientas": 300,
|
||||||
|
"cuatrocientos": 400,
|
||||||
|
"cuatrocientas": 400,
|
||||||
|
"quinientos": 500,
|
||||||
|
"quinientas": 500,
|
||||||
|
"seiscientos": 600,
|
||||||
|
"seiscientas": 600,
|
||||||
|
"setecientos": 700,
|
||||||
|
"setecientas": 700,
|
||||||
|
"ochocientos": 800,
|
||||||
|
"ochocientas": 800,
|
||||||
|
"novecientos": 900,
|
||||||
|
"novecientas": 900}
|
||||||
|
|
||||||
|
|
||||||
|
def es_parse(words, i):
    """Parse a spelled-out Spanish number starting at ``words[i]``.

    Recognizes "cero" and the cardinals that can be composed from the
    words in ``es_numbers_xlat`` together with "y" and "mil":
    1-99, 100-999 and ``[1-999] mil [1-999]`` (at most 999999).

    Args:
        words (list): tokens of the utterance.
        i (int): index of the first token to examine.

    Returns:
        (int, int): the numeric value and the index of the first token
            after the number, or None when no number starts at ``i``.
    """

    def es_cte(i, s):
        # Match the literal token `s` at position i.
        if i < len(words) and s == words[i]:
            return s, i + 1
        return None

    def es_number_word(i, mi, ma):
        # Match a single number word whose value lies within [mi, ma].
        if i < len(words):
            v = es_numbers_xlat.get(words[i])
            # Compare with None (not truthiness) so a value of 0 in the
            # table would not be silently discarded.
            if v is not None and mi <= v <= ma:
                return v, i + 1
        return None

    def es_number_1_99(i):
        # Single-word numbers: 1 to 29
        r1 = es_number_word(i, 1, 29)
        if r1:
            return r1

        # Tens (30, 40, ... 90), optionally followed by "y" [1-9]
        r1 = es_number_word(i, 30, 90)
        if r1:
            v1, i1 = r1
            r2 = es_cte(i1, "y")
            if r2:
                _, i2 = r2
                r3 = es_number_word(i2, 1, 9)
                if r3:
                    v3, i3 = r3
                    return v1 + v3, i3
            # No units followed: only the tens word is consumed
            return r1
        return None

    def es_number_1_999(i):
        # [2-9]cientos ([1-99])?
        r1 = es_number_word(i, 200, 900)
        if r1:
            v1, i1 = r1
            r2 = es_number_1_99(i1)
            if r2:
                v2, i2 = r2
                return v1 + v2, i2
            # Bare hundreds word: resume right after it.  (The index of
            # the hundreds word must be used here; no further token was
            # consumed.)
            return v1, i1

        # ciento [1-99]
        r1 = es_cte(i, "ciento")
        if r1:
            _, i1 = r1
            r2 = es_number_1_99(i1)
            if r2:
                v2, i2 = r2
                return 100 + v2, i2

        # 100
        r1 = es_number_word(i, 100, 100)
        if r1:
            return r1

        # [1-99]
        return es_number_1_99(i)

    def es_number(i):
        # check for cero; it is matched literally because the value 0
        # cannot be told apart from "no match" by a truthiness test
        r1 = es_cte(i, "cero")
        if r1:
            return 0, r1[1]

        # check for [1-999] (mil ([1-999])?)?
        r1 = es_number_1_999(i)
        if r1:
            v1, i1 = r1
            r2 = es_cte(i1, "mil")
            if r2:
                _, i2 = r2
                r3 = es_number_1_999(i2)
                if r3:
                    v3, i3 = r3
                    return v1 * 1000 + v3, i3
                return v1 * 1000, i2
            return r1
        return None

    return es_number(i)
|
||||||
|
|
||||||
|
|
||||||
def normalize_es(text, remove_articles):
    """Normalize a Spanish utterance.

    Splits ``text`` on whitespace, optionally drops the words listed in
    ``es_articles`` and replaces every spelled-out number recognized by
    ``es_parse`` with its digits.

    Args:
        text (str): the utterance to normalize.
        remove_articles (bool): when true, words in ``es_articles`` are
            left out of the result.

    Returns:
        (str): the resulting words joined by single spaces.
    """
    tokens = text.split()  # splitting also collapses repeated whitespace
    total = len(tokens)

    pieces = []
    pos = 0
    while pos < total:
        token = tokens[pos]

        if remove_articles and token in es_articles:
            pos += 1
            continue

        # Convert numbers into digits
        parsed = es_parse(tokens, pos)
        if parsed:
            # es_parse reports where the number ends; resume from there
            value, pos = parsed
            pieces.append(str(value))
        else:
            pieces.append(token)
            pos += 1

    return " ".join(pieces)
|
||||||
|
|
|
@ -1,3 +1,6 @@
|
||||||
|
|
||||||
|
# -*- coding: iso-8859-15 -*-
|
||||||
|
|
||||||
import unittest
|
import unittest
|
||||||
from mycroft.util.parse import normalize
|
from mycroft.util.parse import normalize
|
||||||
|
|
||||||
|
@ -193,6 +196,62 @@ class TestNormalize(unittest.TestCase):
|
||||||
|
|
||||||
self.assertEqual(normalize("whats 8 + 4"), "what is 8 + 4")
|
self.assertEqual(normalize("whats 8 + 4"), "what is 8 + 4")
|
||||||
|
|
||||||
|
#
|
||||||
|
# Spanish
|
||||||
|
#
|
||||||
|
def test_articles_es(self):
    """Definite articles are removed from Spanish text on request."""
    cases = [
        ("esta es la prueba", "esta es prueba"),
        ("y otra prueba", "y otra prueba"),
    ]
    for utterance, expected in cases:
        result = normalize(utterance, lang="es", remove_articles=True)
        self.assertEqual(result, expected)
|
||||||
|
|
||||||
|
def test_numbers_es(self):
    """Spelled-out Spanish numbers are converted into digits."""
    # (utterance, expected normalization), checked in this order
    cases = [
        ("esto es un uno una", "esto es 1 1 1"),
        ("esto es dos tres prueba", "esto es 2 3 prueba"),
        ("esto es cuatro cinco seis prueba", "esto es 4 5 6 prueba"),
        ("siete más ocho más nueve", "7 más 8 más 9"),
        ("diez once doce trece catorce quince", "10 11 12 13 14 15"),
        (u"dieciséis diecisiete", "16 17"),
        (u"dieciocho diecinueve", "18 19"),
        (u"veinte treinta cuarenta", "20 30 40"),
        (u"treinta y dos caballos", "32 caballos"),
        (u"cien caballos", "100 caballos"),
        (u"ciento once caballos", "111 caballos"),
        (u"había cuatrocientas una vacas", u"había 401 vacas"),
        (u"dos mil", "2000"),
        (u"dos mil trescientas cuarenta y cinco", "2345"),
        (u"ciento veintitrés mil cuatrocientas cincuenta y seis",
         "123456"),
        (u"quinientas veinticinco mil", "525000"),
        (u"novecientos noventa y nueve mil novecientos noventa y nueve",
         "999999"),
    ]
    for utterance, expected in cases:
        self.assertEqual(normalize(utterance, lang="es"), expected)
|
||||||
|
|
||||||
|
|
||||||
if __name__ == "__main__":
|
if __name__ == "__main__":
|
||||||
unittest.main()
|
unittest.main()
|
||||||
|
|
Loading…
Reference in New Issue