fix french_cleaners

pull/510/head
WeberJulian 2021-03-05 19:56:50 +01:00 committed by Eren Gölge
parent c1742c9928
commit a1839d3245
2 changed files with 42 additions and 38 deletions

View File

@ -24,38 +24,44 @@ abbreviations_en = [(re.compile('\\b%s\\.' % x[0], re.IGNORECASE), x[1])
]]
# List of (regular expression, replacement) pairs for abbreviations in French:
# French abbreviations as (compiled regex, replacement) pairs.
#
# Each pattern is case-insensitive and matches the abbreviation either followed
# by a literal dot (which is consumed) or standing alone as a whole word. The
# trailing "dot or word boundary" guard is what keeps a short entry such as
# 'M' from matching the first letter of an ordinary word like "Madame".
#
# Abbreviations are passed through re.escape so that the dots and hyphens they
# contain ('N.B', 'c.-à-d', ...) are matched literally instead of acting as
# regex metacharacters.
#
# Entries are ordered longest abbreviation first so that 'av. J.-C' is tried
# before its prefix 'av' (presumably the consumer applies the pairs in list
# order -- confirm against expand_abbreviations).
#
# NOTE(review): because the dot is optional, standalone words that happen to
# equal an abbreviation ('art', 'col', 'max', 'min', ...) are expanded too.
abbreviations_fr = [
    (re.compile('\\b%s(?:\\.|\\b)' % re.escape(abbr), re.IGNORECASE), expansion)
    for abbr, expansion in sorted(
        [
            ('M', 'monsieur'),
            ('Mlle', 'mademoiselle'),
            ('Mlles', 'mesdemoiselles'),
            ('Mme', 'Madame'),
            ('Mmes', 'Mesdames'),
            ('N.B', 'nota bene'),
            ('p.c.q', 'parce que'),
            ('Pr', 'professeur'),
            ('qqch', 'quelque chose'),
            ('rdv', 'rendez-vous'),
            ('max', 'maximum'),
            ('min', 'minimum'),
            ('no', 'numéro'),
            ('adr', 'adresse'),
            ('dr', 'docteur'),
            ('st', 'saint'),
            ('co', 'compagnie'),
            ('jr', 'junior'),
            ('sgt', 'sergent'),
            ('capt', 'capitaine'),
            ('col', 'colonel'),
            ('av', 'avenue'),
            ('av. J.-C', 'avant Jésus-Christ'),
            ('apr. J.-C', 'après Jésus-Christ'),
            ('art', 'article'),
            ('boul', 'boulevard'),
            ('c.-à-d', "c'est-à-dire"),
            ('etc', 'et cetera'),
            ('ex', 'exemple'),
            ('excl', 'exclusivement'),
        ],
        # sorted() is stable, so equally long abbreviations keep their order.
        key=lambda pair: len(pair[0]),
        reverse=True,
    )
]
# French abbreviations as (compiled regex, replacement) pairs.
#
# The list is built from two groups:
#   1. Case-insensitive patterns that only match an abbreviation immediately
#      followed by a literal dot; the dot is consumed by the replacement.
#   2. Case-sensitive patterns for titles that are commonly written without a
#      dot ('Mme Dupont'). They are anchored with a word boundary on both
#      sides so that 'Mlle' does not fire inside 'Mlles' (nor 'Mme' inside
#      'Mmes').
#
# Abbreviations are passed through re.escape so that the dots and hyphens they
# contain ('N.B', 'c.-à-d', ...) are matched literally instead of acting as
# regex metacharacters.
#
# The dotted group is ordered longest abbreviation first so that 'av. J.-C' is
# tried before its prefix 'av' (presumably the consumer applies the pairs in
# list order -- confirm against expand_abbreviations).
abbreviations_fr = [
    (re.compile('\\b%s\\.' % re.escape(abbr), re.IGNORECASE), expansion)
    for abbr, expansion in sorted(
        [
            ('M', 'monsieur'),
            ('Mlle', 'mademoiselle'),
            ('Mlles', 'mesdemoiselles'),
            ('Mme', 'Madame'),
            ('Mmes', 'Mesdames'),
            ('N.B', 'nota bene'),
            ('p.c.q', 'parce que'),
            ('Pr', 'professeur'),
            ('qqch', 'quelque chose'),
            ('rdv', 'rendez-vous'),
            ('max', 'maximum'),
            ('min', 'minimum'),
            ('no', 'numéro'),
            ('adr', 'adresse'),
            ('dr', 'docteur'),
            ('st', 'saint'),
            ('co', 'compagnie'),
            ('jr', 'junior'),
            ('sgt', 'sergent'),
            ('capt', 'capitaine'),
            ('col', 'colonel'),
            ('av', 'avenue'),
            ('av. J.-C', 'avant Jésus-Christ'),
            ('apr. J.-C', 'après Jésus-Christ'),
            ('art', 'article'),
            ('boul', 'boulevard'),
            ('c.-à-d', "c'est-à-dire"),
            ('etc', 'et cetera'),
            ('ex', 'exemple'),
            ('excl', 'exclusivement'),
        ],
        # sorted() is stable, so equally long abbreviations keep their order.
        key=lambda pair: len(pair[0]),
        reverse=True,
    )
] + [
    # Dot-less titles; case-sensitive on purpose, as in the original patterns.
    (re.compile('\\b%s\\b' % re.escape(abbr)), expansion)
    for abbr, expansion in [
        ('Mlle', 'mademoiselle'),
        ('Mlles', 'mesdemoiselles'),
        ('Mme', 'Madame'),
        ('Mmes', 'Mesdames'),
    ]
]

View File

@ -108,8 +108,8 @@ def english_cleaners(text):
def french_cleaners(text):
'''Pipeline for French text. There is no need to expand numbers, phonemizer already does that'''
text = lowercase(text)
text = expand_abbreviations(text, lang='fr')
text = lowercase(text)
text = replace_symbols(text, lang='fr')
text = remove_aux_symbols(text)
text = collapse_whitespace(text)
@ -129,8 +129,6 @@ def chinese_mandarin_cleaners(text: str) -> str:
text = replace_numbers_to_characters_in_text(text)
return text
def phoneme_cleaners(text):
'''Pipeline for phonemes mode, including number and abbreviation expansion.'''
text = expand_numbers(text)