2017-05-03 09:37:00 +00:00
|
|
|
|
|
|
|
# -*- coding: iso-8859-15 -*-
|
|
|
|
|
2017-02-23 12:40:46 +00:00
|
|
|
# Copyright 2017 Mycroft AI, Inc.
|
|
|
|
#
|
|
|
|
# This file is part of Mycroft Core.
|
|
|
|
#
|
|
|
|
# Mycroft Core is free software: you can redistribute it and/or modify
|
|
|
|
# it under the terms of the GNU General Public License as published by
|
|
|
|
# the Free Software Foundation, either version 3 of the License, or
|
|
|
|
# (at your option) any later version.
|
|
|
|
#
|
|
|
|
# Mycroft Core is distributed in the hope that it will be useful,
|
|
|
|
# but WITHOUT ANY WARRANTY; without even the implied warranty of
|
|
|
|
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
|
|
|
# GNU General Public License for more details.
|
|
|
|
#
|
|
|
|
# You should have received a copy of the GNU General Public License
|
|
|
|
# along with Mycroft Core. If not, see <http://www.gnu.org/licenses/>.
|
|
|
|
|
|
|
|
|
|
|
|
def normalize(text, lang="en-us", remove_articles=True):
    """Normalize a string before it is parsed.

    Dispatches to the language-specific normalizer selected by the prefix
    of ``lang``.  Languages without a normalizer are passed through
    untouched.

    Args:
        text (str): the string to normalize
        lang (str): the code for the language text is in; only the
            case-insensitive prefix ("en" or "es") is inspected
        remove_articles (bool): whether to remove articles (like 'a', or
            'the')
    Returns:
        (str): The normalized string, or ``text`` unchanged when the
            language is neither English nor Spanish.
    """
    lang_code = str(lang).lower()

    if lang_code.startswith("en"):
        return normalize_en(text, remove_articles)

    if lang_code.startswith("es"):
        return normalize_es(text, remove_articles)

    # TODO: Normalization for other languages
    return text
|
|
|
|
|
|
|
|
|
|
|
|
# Lookup tables used by normalize_en.  They are built once at import time
# rather than once per word, and are keyed by the exact (case-sensitive)
# token so every lookup is a single hash probe.

# Articles dropped when remove_articles is set.
_EN_ARTICLES = ("the", "a", "an")

# Contraction -> expanded form, e.g. "isn't" -> "is not".
_EN_CONTRACTIONS = {
    "ain't": "is not",
    "aren't": "are not",
    "can't": "can not",
    "could've": "could have",
    "couldn't": "could not",
    "didn't": "did not",
    "doesn't": "does not",
    "don't": "do not",
    "gonna": "going to",
    "gotta": "got to",
    "hadn't": "had not",
    "hasn't": "has not",
    "haven't": "have not",
    "he'd": "he would",
    "he'll": "he will",
    "he's": "he is",
    "how'd": "how did",
    "how'll": "how will",
    "how's": "how is",
    "I'd": "I would",
    "I'll": "I will",
    "I'm": "I am",
    "I've": "I have",
    "isn't": "is not",
    "it'd": "it would",
    "it'll": "it will",
    "it's": "it is",
    "mightn't": "might not",
    "might've": "might have",
    "mustn't": "must not",
    "must've": "must have",
    "needn't": "need not",
    "oughtn't": "ought not",
    "shan't": "shall not",
    "she'd": "she would",
    "she'll": "she will",
    "she's": "she is",
    "shouldn't": "should not",
    "should've": "should have",
    "somebody's": "somebody is",
    "someone'd": "someone would",
    "someone'll": "someone will",
    "someone's": "someone is",
    "that'll": "that will",
    "that's": "that is",
    "that'd": "that would",
    "there'd": "there would",
    "there're": "there are",
    "there's": "there is",
    "they'd": "they would",
    "they'll": "they will",
    "they're": "they are",
    "they've": "they have",
    "wasn't": "was not",
    "we'd": "we would",
    "we'll": "we will",
    "we're": "we are",
    "we've": "we have",
    "weren't": "were not",
    "what'd": "what did",
    "what'll": "what will",
    "what're": "what are",
    "what's": "what is",
    "whats": "what is",  # technically incorrect but some STT does this
    "what've": "what have",
    "when's": "when is",
    "when'd": "when did",
    "where'd": "where did",
    "where's": "where is",
    "where've": "where have",
    "who'd": "who would",
    "who'd've": "who would have",
    "who'll": "who will",
    "who're": "who are",
    "who's": "who is",
    "who've": "who have",
    "why'd": "why did",
    "why're": "why are",
    "why's": "why is",
    "won't": "will not",
    "won't've": "will not have",
    "would've": "would have",
    "wouldn't": "would not",
    "wouldn't've": "would not have",
    "y'all": "you all",
    "ya'll": "you all",
    "you'd": "you would",
    "you'd've": "you would have",
    "you'll": "you will",
    "y'aint": "you are not",
    "y'ain't": "you are not",
    "you're": "you are",
    "you've": "you have",
}

# Spelled-out number -> digit string, "zero" ("0") through "twenty" ("20").
_EN_NUMBERS = {
    number_word: str(value)
    for value, number_word in enumerate([
        "zero", "one", "two", "three", "four", "five", "six",
        "seven", "eight", "nine", "ten", "eleven", "twelve",
        "thirteen", "fourteen", "fifteen", "sixteen",
        "seventeen", "eighteen", "nineteen", "twenty"])
}


def normalize_en(text, remove_articles):
    """ English string normalization

    Splits ``text`` on whitespace and, word by word:
      * drops the articles "the", "a" and "an" when ``remove_articles``
        is true,
      * expands known contractions, e.g. "isn't" -> "is not",
      * replaces the number words "zero".."twenty" with digits,
        e.g. "two" -> "2".
    All matching is exact and case-sensitive, so "The" or "Two" are left
    as they are.  Each word is converted on its own; "twenty one" becomes
    "20 1".

    Args:
        text (str): the string to normalize
        remove_articles (bool): whether to remove articles
    Returns:
        (str): the normalized words joined by single spaces (leading,
            trailing and repeated whitespace is removed by the split).
    """
    normalized = []
    for word in text.split():  # split() also discards extra whitespace
        if remove_articles and word in _EN_ARTICLES:
            continue

        # Expand common contractions, e.g. "isn't" -> "is not"
        word = _EN_CONTRACTIONS.get(word, word)

        # Convert numbers into digits, e.g. "two" -> "2".  Expansions are
        # multi-word strings, so they never match a number word.
        word = _EN_NUMBERS.get(word, word)

        normalized.append(word)

    return " ".join(normalized)
|
2017-05-03 09:37:00 +00:00
|
|
|
|
|
|
|
|
2017-05-03 17:48:08 +00:00
|
|
|
# TODO: it should be modular, split into independent files
|
|
|
|
# TODO: handle numbers of 100 and above
|
|
|
|
|
|
|
|
# Spanish articles that normalize_es drops when asked to remove articles.
es_articles = [
    "el", "la", "los", "las",
    "un", "una", "unos", "unas",
]

# Number words for 0-9; a word's position in the list is its value.
es_numbers_0_9 = [
    "cero",
    "uno",
    "dos",
    "tres",
    "cuatro",
    "cinco",
    "seis",
    "siete",
    "ocho",
    "nueve",
]

# Number words for 10-29; a word's position in the list plus 10 is its value.
es_numbers_10_29 = [
    # 10-19
    u"diez", u"once", u"doce", u"trece", u"catorce",
    u"quince", u"dieciséis", u"diecisiete", u"dieciocho", u"diecinueve",
    # 20-29
    u"veinte", u"veintiuno", u"veintidós", u"veintitrés", u"veinticuatro",
    u"veinticinco", u"veintiséis", u"veintisiete", u"veintiocho",
    u"veintinueve",
]

# Tens from 30 to 90; value is position * 10 + 30.
es_numbers_10n = [
    "treinta",
    "cuarenta",
    "cincuenta",
    "sesenta",
    "setenta",
    "ochenta",
    "noventa",
]
|
2017-05-03 09:37:00 +00:00
|
|
|
|
|
|
|
|
|
|
|
def normalize_es(text, remove_articles):
    """ Spanish string normalization

    Splits ``text`` on whitespace and rebuilds it word by word:
      * words listed in ``es_articles`` are dropped when
        ``remove_articles`` is true,
      * number words from 0 to 99 are replaced by digits.  A tens word
        (from ``es_numbers_10n``) followed by "y" and a word from
        ``es_numbers_0_9`` is merged into one number, e.g.
        "treinta y dos" -> "32".
    Matching is exact, so differently-cased words are left unchanged.

    Args:
        text (str): the string to normalize
        remove_articles (bool): whether to remove articles
    Returns:
        (str): the normalized words joined by single spaces.
    """
    tokens = text.split()  # split() also discards extra whitespace
    total = len(tokens)
    output = []

    pos = 0
    while pos < total:
        token = tokens[pos]
        pos += 1  # from here on, pos indexes the token after the current one

        if remove_articles and token in es_articles:
            continue

        # Convert numbers into digits: from 0 to 99
        if token in es_numbers_0_9:
            token = str(es_numbers_0_9.index(token))

        elif token in es_numbers_10_29:
            token = str(es_numbers_10_29.index(token) + 10)

        elif token in es_numbers_10n:
            value = es_numbers_10n.index(token) * 10 + 30
            # Look ahead for "<tens> y <units>"; both extra tokens must
            # exist before they are consumed.
            has_units = (pos + 1 < total and
                         tokens[pos] == "y" and
                         tokens[pos + 1] in es_numbers_0_9)
            if has_units:
                value += es_numbers_0_9.index(tokens[pos + 1])
                pos += 2  # skip over "y" and the units word
            token = str(value)

        output.append(token)

    return " ".join(output)
|