2017-05-03 09:37:00 +00:00
# -*- coding: iso-8859-15 -*-
2017-02-23 12:40:46 +00:00
# Copyright 2017 Mycroft AI, Inc.
# This file is part of Mycroft Core.
# Mycroft Core is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
# Mycroft Core is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU General Public License for more details.
# You should have received a copy of the GNU General Public License
# along with Mycroft Core. If not, see <http://www.gnu.org/licenses/>.
def normalize(text, lang="en-us", remove_articles=True):
    """Prepare a string for parsing

    This function prepares the given text for parsing by making
    numbers consistent, getting rid of contractions, etc.

    Args:
        text (str): the string to normalize
        lang (str): the code for the language text is in
        remove_articles (bool): whether to remove articles (like 'a', or 'the')

    Returns:
        (str): The normalized string. When no normalizer matches the
            language code, the text is returned unchanged.
    """
    # Lower-case the code once; only its prefix selects the normalizer, so
    # "en-us", "EN-GB" and "en" all take the English path.
    lang_code = str(lang).lower()

    if lang_code.startswith("en"):
        return normalize_en(text, remove_articles)
    if lang_code.startswith("es"):
        return normalize_es(text, remove_articles)

    # TODO: Normalization for other languages
    return text
# Words dropped by normalize_en() when article removal is requested.
_EN_ARTICLES = ("the", "a", "an")

# Common contractions and their expansions, e.g. "isn't" -> "is not".
# Built once at import time instead of once per word.
_EN_CONTRACTIONS = {
    "ain't": "is not",
    "aren't": "are not",
    "can't": "can not",
    "could've": "could have",
    "couldn't": "could not",
    "didn't": "did not",
    "doesn't": "does not",
    "don't": "do not",
    "gonna": "going to",
    "gotta": "got to",
    "hadn't": "had not",
    "hasn't": "has not",
    "haven't": "have not",
    "he'd": "he would",
    "he'll": "he will",
    "he's": "he is",
    "how'd": "how did",
    "how'll": "how will",
    "how's": "how is",
    "I'd": "I would",
    "I'll": "I will",
    "I'm": "I am",
    "I've": "I have",
    "isn't": "is not",
    "it'd": "it would",
    "it'll": "it will",
    "it's": "it is",
    "mightn't": "might not",
    "might've": "might have",
    "mustn't": "must not",
    "must've": "must have",
    "needn't": "need not",
    "oughtn't": "ought not",
    "shan't": "shall not",
    "she'd": "she would",
    "she'll": "she will",
    "she's": "she is",
    "shouldn't": "should not",
    "should've": "should have",
    "somebody's": "somebody is",
    "someone'd": "someone would",
    "someone'll": "someone will",
    "someone's": "someone is",
    "that'll": "that will",
    "that's": "that is",
    "that'd": "that would",
    "there'd": "there would",
    "there're": "there are",
    "there's": "there is",
    "they'd": "they would",
    "they'll": "they will",
    "they're": "they are",
    "they've": "they have",
    "wasn't": "was not",
    "we'd": "we would",
    "we'll": "we will",
    "we're": "we are",
    "we've": "we have",
    "weren't": "were not",
    "what'd": "what did",
    "what'll": "what will",
    "what're": "what are",
    "what's": "what is",
    "whats": "what is",  # technically incorrect but some STT does this
    "what've": "what have",
    "when's": "when is",
    "when'd": "when did",
    "where'd": "where did",
    "where's": "where is",
    "where've": "where have",
    "who'd": "who would",
    "who'd've": "who would have",
    "who'll": "who will",
    "who're": "who are",
    "who's": "who is",
    "who've": "who have",
    "why'd": "why did",
    "why're": "why are",
    "why's": "why is",
    "won't": "will not",
    "won't've": "will not have",
    "would've": "would have",
    "wouldn't": "would not",
    "wouldn't've": "would not have",
    "y'all": "you all",
    "ya'll": "you all",
    "you'd": "you would",
    "you'd've": "you would have",
    "you'll": "you will",
    "y'aint": "you are not",
    "y'ain't": "you are not",
    "you're": "you are",
    "you've": "you have",
}

# Spelled-out numbers "zero".."twenty" mapped to their digit strings.
_EN_NUMBERS = {
    number_word: str(value)
    for value, number_word in enumerate((
        "zero", "one", "two", "three", "four", "five", "six",
        "seven", "eight", "nine", "ten", "eleven", "twelve",
        "thirteen", "fourteen", "fifteen", "sixteen",
        "seventeen", "eighteen", "nineteen", "twenty"))
}


def normalize_en(text, remove_articles):
    """ English string normalization

    Splits the text on whitespace and, word by word, optionally drops
    articles, expands common contractions and converts the number words
    "zero" to "twenty" into digits. All matching is exact and therefore
    case-sensitive (e.g. "The" is not treated as an article).

    Args:
        text (str): the string to normalize
        remove_articles (bool): whether to drop "the", "a" and "an"

    Returns:
        (str): the normalized words joined by single spaces
    """
    normalized = []
    for word in text.split():  # split() also collapses extra whitespace
        if remove_articles and word in _EN_ARTICLES:
            continue

        # Expand common contractions, e.g. "isn't" -> "is not"
        word = _EN_CONTRACTIONS.get(word, word)

        # Convert numbers into digits, e.g. "two" -> "2"
        word = _EN_NUMBERS.get(word, word)

        normalized.append(word)

    return " ".join(normalized)
2017-05-03 09:37:00 +00:00
2017-05-03 17:48:08 +00:00
# TODO: it should be modular in independent files
# TODO: support numbers of 100 and above

# Spanish articles dropped by normalize_es() when article removal is on.
es_articles = ["el", "la", "los", "las", "un", "una", "unos", "unas"]

# Units: the list index is the numeric value.
es_numbers_0_9 = [
    "cero", "uno", "dos", "tres", "cuatro",
    "cinco", "seis", "siete", "ocho", "nueve"]

# Single-word numbers 10..29: the list index + 10 is the numeric value.
es_numbers_10_29 = [
    u"diez", u"once", u"doce", u"trece", u"catorce",
    u"quince", u"dieciséis", u"diecisiete",
    u"dieciocho", u"diecinueve",
    u"veinte", u"veintiuno", u"veintidós",
    u"veintitrés", u"veinticuatro",
    u"veinticinco", u"veintiséis", u"veintisiete",
    u"veintiocho", u"veintinueve"]

# Tens 30..90: the list index * 10 + 30 is the numeric value.
es_numbers_10n = ["treinta", "cuarenta", "cincuenta", "sesenta",
                  "setenta", "ochenta", "noventa"]


def _es_units_value(word):
    """Return the units value of a word that follows "<tens> y", or None."""
    # "un"/"una" are the forms "uno" takes before a noun ("treinta y un
    # gatos"); directly after "<tens> y" they are the numeral, not an article.
    if word in ("un", "una"):
        return 1
    if word in es_numbers_0_9:
        return es_numbers_0_9.index(word)
    return None


def normalize_es(text, remove_articles):
    """ Spanish string normalization

    Splits the text on whitespace and, word by word, optionally drops
    articles and converts spelled-out numbers from 0 to 99 into digits.
    Compound numbers of the form "<tens> y <units>" (e.g. "treinta y
    cinco") are merged into a single number.

    Args:
        text (str): the string to normalize
        remove_articles (bool): whether to drop the words in es_articles

    Returns:
        (str): the normalized words joined by single spaces
    """
    words = text.split()  # split() also collapses extra whitespace
    normalized = []
    i = 0

    while i < len(words):
        word = words[i]
        i += 1  # from here on, words[i] is the word after the current one

        if remove_articles and word in es_articles:
            continue

        # Convert numbers into digits: from 0 to 99
        if word in es_numbers_0_9:
            word = str(es_numbers_0_9.index(word))
        elif word in es_numbers_10_29:
            word = str(es_numbers_10_29.index(word) + 10)
        elif word in es_numbers_10n:
            n = es_numbers_10n.index(word) * 10 + 30
            # Look ahead for "y <units>"; consume both words only when the
            # pattern is complete, otherwise leave them for later iterations.
            if i + 1 < len(words) and words[i] == "y":
                units = _es_units_value(words[i + 1])
                if units is not None:
                    n += units
                    i += 2
            word = str(n)

        normalized.append(word)

    return " ".join(normalized)