Fix Unicode bug in Portuguese (pt) text normalization
parent
b301890cc5
commit
d7926cde07
|
@ -2125,12 +2125,12 @@ def pt_pruning(text, symbols=True, accents=True, agressive=True):
|
|||
text = text.replace(symbol, "")
|
||||
text = text.replace("-", " ").replace("_", " ")
|
||||
if accents:
|
||||
accents = {"a": [u"<EFBFBD>", u"<EFBFBD>", u"<EFBFBD>", u"<EFBFBD>"],
|
||||
"e": [u"<EFBFBD>", u"<EFBFBD>", u"<EFBFBD>"],
|
||||
"i": [u"<EFBFBD>", u"<EFBFBD>"],
|
||||
"o": [u"<EFBFBD>", u"<EFBFBD>"],
|
||||
"u": [u"<EFBFBD>", u"<EFBFBD>"],
|
||||
"c": [u"<EFBFBD>", u"<EFBFBD>"]}
|
||||
accents = {"a": [u"á", u"à", u"ã", u"â"],
|
||||
"e": [u"ê", u"è", u"é"],
|
||||
"i": [u"í", u"ì"],
|
||||
"o": [u"ò", u"ó"],
|
||||
"u": [u"ú", u"ù"],
|
||||
"c": [u"ç"]}
|
||||
for char in accents:
|
||||
for acc in accents[char]:
|
||||
text = text.replace(acc, char)
|
||||
|
|
Loading…
Reference in New Issue