Fix Unicode bug in Portuguese (pt) text normalization
parent
b301890cc5
commit
d7926cde07
|
@ -2125,12 +2125,12 @@ def pt_pruning(text, symbols=True, accents=True, agressive=True):
|
||||||
text = text.replace(symbol, "")
|
text = text.replace(symbol, "")
|
||||||
text = text.replace("-", " ").replace("_", " ")
|
text = text.replace("-", " ").replace("_", " ")
|
||||||
if accents:
|
if accents:
|
||||||
accents = {"a": [u"<EFBFBD>", u"<EFBFBD>", u"<EFBFBD>", u"<EFBFBD>"],
|
accents = {"a": [u"á", u"à", u"ã", u"â"],
|
||||||
"e": [u"<EFBFBD>", u"<EFBFBD>", u"<EFBFBD>"],
|
"e": [u"ê", u"è", u"é"],
|
||||||
"i": [u"<EFBFBD>", u"<EFBFBD>"],
|
"i": [u"í", u"ì"],
|
||||||
"o": [u"<EFBFBD>", u"<EFBFBD>"],
|
"o": [u"ò", u"ó"],
|
||||||
"u": [u"<EFBFBD>", u"<EFBFBD>"],
|
"u": [u"ú", u"ù"],
|
||||||
"c": [u"<EFBFBD>", u"<EFBFBD>"]}
|
"c": [u"ç"]}
|
||||||
for char in accents:
|
for char in accents:
|
||||||
for acc in accents[char]:
|
for acc in accents[char]:
|
||||||
text = text.replace(acc, char)
|
text = text.replace(acc, char)
|
||||||
|
|
Loading…
Reference in New Issue