[AbstractRuleBasedInterpreter] Fix Spanish tokenization (#2889)
Signed-off-by: Miguel Álvarez Díez <miguelwork92@gmail.com>
pull/2895/head
parent
b5de891ebd
commit
7442220830
|
@@ -600,7 +600,8 @@ public abstract class AbstractRuleBasedInterpreter implements HumanLanguageInter
|
||||||
split = text.toLowerCase(locale).replaceAll("[\\']", " ").replaceAll("[^\\w\\sàâäçéèêëîïôùûü]", " ")
|
split = text.toLowerCase(locale).replaceAll("[\\']", " ").replaceAll("[^\\w\\sàâäçéèêëîïôùûü]", " ")
|
||||||
.split("\\s");
|
.split("\\s");
|
||||||
} else if ("es".equalsIgnoreCase(locale.getLanguage())) {
|
} else if ("es".equalsIgnoreCase(locale.getLanguage())) {
|
||||||
split = text.toLowerCase(locale).replaceAll("[\\']", " ").replaceAll("[^\\w\\sáéíóúü]", " ").split("\\s");
|
split = text.toLowerCase(locale).replaceAll("[\\']", " ").replaceAll("[^\\w\\sáéíóúïüñç]", " ")
|
||||||
|
.split("\\s");
|
||||||
} else {
|
} else {
|
||||||
split = text.toLowerCase(locale).replaceAll("[\\']", "").replaceAll("[^\\w\\s]", " ").split("\\s");
|
split = text.toLowerCase(locale).replaceAll("[\\']", "").replaceAll("[^\\w\\s]", " ").split("\\s");
|
||||||
}
|
}
|
||||||
|
|
Loading…
Reference in New Issue