Fix Unicode bug in Portuguese (pt) text normalization

pull/1301/head
jarbasai 2017-12-12 03:10:00 +00:00
parent b301890cc5
commit d7926cde07
1 changed file with 6 additions and 6 deletions

View File

@@ -2125,12 +2125,12 @@ def pt_pruning(text, symbols=True, accents=True, agressive=True):
text = text.replace(symbol, "")
text = text.replace("-", " ").replace("_", " ")
if accents:
accents = {"a": [u"<EFBFBD>", u"<EFBFBD>", u"<EFBFBD>", u"<EFBFBD>"],
"e": [u"<EFBFBD>", u"<EFBFBD>", u"<EFBFBD>"],
"i": [u"<EFBFBD>", u"<EFBFBD>"],
"o": [u"<EFBFBD>", u"<EFBFBD>"],
"u": [u"<EFBFBD>", u"<EFBFBD>"],
"c": [u"<EFBFBD>", u"<EFBFBD>"]}
accents = {"a": [u"á", u"à", u"ã", u"â"],
"e": [u"ê", u"è", u"é"],
"i": [u"í", u"ì"],
"o": [u"ò", u"ó"],
"u": [u"ú", u"ù"],
"c": [u"ç"]}
for char in accents:
for acc in accents[char]:
text = text.replace(acc, char)