fix french_cleaners

pull/510/head
WeberJulian 2021-03-05 19:56:50 +01:00 committed by Eren Gölge
parent c1742c9928
commit a1839d3245
2 changed files with 42 additions and 38 deletions

View File

@@ -24,38 +24,44 @@ abbreviations_en = [(re.compile('\\b%s\\.' % x[0], re.IGNORECASE), x[1])
]] ]]
# List of (regular expression, replacement) pairs for abbreviations in french: # List of (regular expression, replacement) pairs for abbreviations in french:
abbreviations_fr = [(re.compile('\\b%s\\.?' % x[0], re.IGNORECASE), x[1]) abbreviations_fr = [(re.compile('\\b%s\\.' % x[0], re.IGNORECASE), x[1])
for x in [ for x in [
('M', 'monsieur'), ('M', 'monsieur'),
('Mlle', 'mademoiselle'), ('Mlle', 'mademoiselle'),
('Mlles', 'mesdemoiselles'), ('Mlles', 'mesdemoiselles'),
('Mme', 'Madame'), ('Mme', 'Madame'),
('Mmes', 'Mesdames'), ('Mmes', 'Mesdames'),
('N.B', 'nota bene'), ('N.B', 'nota bene'),
('M', 'monsieur'), ('M', 'monsieur'),
('p.c.q', 'parce que'), ('p.c.q', 'parce que'),
('Pr', 'professeur'), ('Pr', 'professeur'),
('qqch', 'quelque chose'), ('qqch', 'quelque chose'),
('rdv', 'rendez-vous'), ('rdv', 'rendez-vous'),
('max', 'maximum'), ('max', 'maximum'),
('min', 'minimum'), ('min', 'minimum'),
('no', 'numéro'), ('no', 'numéro'),
('adr', 'adresse'), ('adr', 'adresse'),
('dr', 'docteur'), ('dr', 'docteur'),
('st', 'saint'), ('st', 'saint'),
('co', 'companie'), ('co', 'companie'),
('jr', 'junior'), ('jr', 'junior'),
('sgt', 'sergent'), ('sgt', 'sergent'),
('capt', 'capitain'), ('capt', 'capitain'),
('col', 'colonel'), ('col', 'colonel'),
('av', 'avenue'), ('av', 'avenue'),
('av. J.-C', 'avant Jésus-Christ'), ('av. J.-C', 'avant Jésus-Christ'),
('apr. J.-C', 'après Jésus-Christ'), ('apr. J.-C', 'après Jésus-Christ'),
('art', 'article'), ('art', 'article'),
('boul', 'boulevard'), ('boul', 'boulevard'),
('c.-à-d', 'cest-à-dire'), ('c.-à-d', 'cest-à-dire'),
('etc', 'et cetera'), ('etc', 'et cetera'),
('ex', 'exemple'), ('ex', 'exemple'),
('excl', 'exclusivement'), ('excl', 'exclusivement'),
('boul', 'boulevard'), ('boul', 'boulevard'),
]] ]] + [(re.compile('\\b%s' % x[0]), x[1])
for x in [
('Mlle', 'mademoiselle'),
('Mlles', 'mesdemoiselles'),
('Mme', 'Madame'),
('Mmes', 'Mesdames'),
]]

View File

@@ -108,8 +108,8 @@ def english_cleaners(text):
def french_cleaners(text): def french_cleaners(text):
'''Pipeline for French text. There is no need to expand numbers, phonemizer already does that''' '''Pipeline for French text. There is no need to expand numbers, phonemizer already does that'''
text = lowercase(text)
text = expand_abbreviations(text, lang='fr') text = expand_abbreviations(text, lang='fr')
text = lowercase(text)
text = replace_symbols(text, lang='fr') text = replace_symbols(text, lang='fr')
text = remove_aux_symbols(text) text = remove_aux_symbols(text)
text = collapse_whitespace(text) text = collapse_whitespace(text)
@@ -129,8 +129,6 @@ def chinese_mandarin_cleaners(text: str) -> str:
text = replace_numbers_to_characters_in_text(text) text = replace_numbers_to_characters_in_text(text)
return text return text
def phoneme_cleaners(text): def phoneme_cleaners(text):
'''Pipeline for phonemes mode, including number and abbreviation expansion.''' '''Pipeline for phonemes mode, including number and abbreviation expansion.'''
text = expand_numbers(text) text = expand_numbers(text)