2017-02-23 12:40:46 +00:00
|
|
|
# Copyright 2017 Mycroft AI, Inc.
|
|
|
|
#
|
|
|
|
# This file is part of Mycroft Core.
|
|
|
|
#
|
|
|
|
# Mycroft Core is free software: you can redistribute it and/or modify
|
|
|
|
# it under the terms of the GNU General Public License as published by
|
|
|
|
# the Free Software Foundation, either version 3 of the License, or
|
|
|
|
# (at your option) any later version.
|
|
|
|
#
|
|
|
|
# Mycroft Core is distributed in the hope that it will be useful,
|
|
|
|
# but WITHOUT ANY WARRANTY; without even the implied warranty of
|
|
|
|
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
|
|
|
# GNU General Public License for more details.
|
|
|
|
#
|
|
|
|
# You should have received a copy of the GNU General Public License
|
|
|
|
# along with Mycroft Core. If not, see <http://www.gnu.org/licenses/>.
|
|
|
|
|
|
|
|
|
|
|
|
def normalize(text, lang="en-us", remove_articles=True):
    """Prepare a string for parsing.

    This function prepares the given text for parsing by making
    numbers consistent, getting rid of contractions, etc.

    Args:
        text (str): the string to normalize
        lang (str): the code for the language text is in
        remove_articles (bool): whether to remove articles (like 'a', or
            'the')

    Returns:
        (str): The normalized string. Text in a language without a
            normalizer is returned unchanged.
    """
    # str() lets non-string language codes (e.g. None) fall through to the
    # "unsupported language" path instead of raising AttributeError.
    if str(lang).lower().startswith("en"):
        return normalize_en(text, remove_articles)

    # TODO: Normalization for other languages
    return text
|
|
|
|
|
|
|
|
|
|
|
|
def normalize_en(text, remove_articles):
    """Normalize an English string.

    The text is split on whitespace (which also collapses repeated
    spaces), then each word is processed in order:

    1. Articles ("the", "a", "an") are dropped when ``remove_articles``
       is true. The match is case-sensitive.
    2. Common contractions are expanded, e.g. "isn't" -> "is not".
    3. Number words from "zero" to "twenty" become digits,
       e.g. "two" -> "2".

    Args:
        text (str): the English string to normalize
        remove_articles (bool): whether to remove articles

    Returns:
        (str): The normalized words joined by single spaces.
    """
    articles = ("the", "a", "an")

    # Contraction -> expansion. Kept as a single mapping so that a
    # contraction and its expansion can never drift out of alignment.
    contractions = {
        "ain't": "is not",
        "aren't": "are not",
        "can't": "can not",
        "could've": "could have",
        "couldn't": "could not",
        "didn't": "did not",
        "doesn't": "does not",
        "don't": "do not",
        "gonna": "going to",
        "gotta": "got to",
        "hadn't": "had not",
        "hasn't": "has not",
        "haven't": "have not",
        "he'd": "he would",
        "he'll": "he will",
        "he's": "he is",
        "how'd": "how did",
        "how'll": "how will",
        "how's": "how is",
        "I'd": "I would",
        "I'll": "I will",
        "I'm": "I am",
        "I've": "I have",
        "isn't": "is not",
        "it'd": "it would",
        "it'll": "it will",
        "it's": "it is",
        "mightn't": "might not",
        "might've": "might have",
        "mustn't": "must not",
        "must've": "must have",
        "needn't": "need not",
        "oughtn't": "ought not",
        "shan't": "shall not",
        "she'd": "she would",
        "she'll": "she will",
        "she's": "she is",
        "shouldn't": "should not",
        "should've": "should have",
        "somebody's": "somebody is",
        "someone'd": "someone would",
        "someone'll": "someone will",
        "someone's": "someone is",
        "that'll": "that will",
        "that's": "that is",
        "that'd": "that would",
        "there'd": "there would",
        "there're": "there are",
        "there's": "there is",
        "they'd": "they would",
        "they'll": "they will",
        "they're": "they are",
        "they've": "they have",
        "wasn't": "was not",
        "we'd": "we would",
        "we'll": "we will",
        "we're": "we are",
        "we've": "we have",
        "weren't": "were not",
        "what'd": "what did",
        "what'll": "what will",
        "what're": "what are",
        "what's": "what is",
        "whats": "what is",  # technically incorrect but some STT does this
        "what've": "what have",
        "when's": "when is",
        "when'd": "when did",
        "where'd": "where did",
        "where's": "where is",
        "where've": "where have",
        "who'd": "who would",
        "who'd've": "who would have",
        "who'll": "who will",
        "who're": "who are",
        "who's": "who is",
        "who've": "who have",
        "why'd": "why did",
        "why're": "why are",
        "why's": "why is",
        "won't": "will not",
        "won't've": "will not have",
        "would've": "would have",
        "wouldn't": "would not",
        "wouldn't've": "would not have",
        "y'all": "you all",
        "ya'll": "you all",
        "you'd": "you would",
        "you'd've": "you would have",
        "you'll": "you will",
        "y'aint": "you are not",
        "y'ain't": "you are not",
        "you're": "you are",
        "you've": "you have",
    }

    # Number words; the position of each word in the list is its value.
    number_words = ["zero", "one", "two", "three", "four", "five", "six",
                    "seven", "eight", "nine", "ten", "eleven", "twelve",
                    "thirteen", "fourteen", "fifteen", "sixteen",
                    "seventeen", "eighteen", "nineteen", "twenty"]
    digits = {name: str(value) for value, name in enumerate(number_words)}

    normalized = []
    for word in text.split():  # split() also removes extra spaces
        if remove_articles and word in articles:
            continue

        # Expand common contractions, e.g. "isn't" -> "is not"
        word = contractions.get(word, word)

        # Convert numbers into digits, e.g. "two" -> "2"
        word = digits.get(word, word)

        normalized.append(word)

    return " ".join(normalized)
|