From ea7c2e15c00e06afc5a949a4cfce8adbb9b4332a Mon Sep 17 00:00:00 2001
From: Julian WEBER
Date: Tue, 29 Sep 2020 15:43:39 +0200
Subject: [PATCH] Adding french abbreviations

---
Review notes (text in this section is not part of the commit message):
* abbreviations_fr: abbreviations are escaped, matched as whole tokens only
  and tried longest-first; the duplicated 'M' and 'boul' entries are dropped;
  the 'compagnie' and 'capitaine' replacements are spelled correctly.
* expand_abbreviations: a `lang` other than 'en' or 'fr' raises ValueError
  instead of failing on an unbound local variable.
* french_cleaners: abbreviations are expanded before lowercasing.
* The hunks were edited by hand; the index blob ids below are the ones of the
  original submission.

 TTS/tts/utils/text/abbreviations.py | 78 +++++++++++++++++++++++++++++
 TTS/tts/utils/text/cleaners.py      | 40 +++++----------
 2 files changed, 92 insertions(+), 26 deletions(-)
 create mode 100644 TTS/tts/utils/text/abbreviations.py

diff --git a/TTS/tts/utils/text/abbreviations.py b/TTS/tts/utils/text/abbreviations.py
new file mode 100644
index 00000000..d14426e1
--- /dev/null
+++ b/TTS/tts/utils/text/abbreviations.py
@@ -0,0 +1,78 @@
+import re
+
+# List of (regular expression, replacement) pairs for abbreviations in english:
+abbreviations_en = [(re.compile('\\b%s\\.' % x[0], re.IGNORECASE), x[1])
+                    for x in [
+                        ('mrs', 'misess'),
+                        ('mr', 'mister'),
+                        ('dr', 'doctor'),
+                        ('st', 'saint'),
+                        ('co', 'company'),
+                        ('jr', 'junior'),
+                        ('maj', 'major'),
+                        ('gen', 'general'),
+                        ('drs', 'doctors'),
+                        ('rev', 'reverend'),
+                        ('lt', 'lieutenant'),
+                        ('hon', 'honorable'),
+                        ('sgt', 'sergeant'),
+                        ('capt', 'captain'),
+                        ('esq', 'esquire'),
+                        ('ltd', 'limited'),
+                        ('col', 'colonel'),
+                        ('ft', 'fort'),
+                    ]]
+
+# (abbreviation, replacement, period required) triples for french.
+# The period is mandatory for abbreviations that would otherwise collide with
+# ordinary words or elisions ("m'appelle", "l'art", "un col", "mon ex").
+_abbreviations_fr = [
+    ('M', 'monsieur', True),
+    ('Mlle', 'mademoiselle', False),
+    ('Mlles', 'mesdemoiselles', False),
+    ('Mme', 'Madame', False),
+    ('Mmes', 'Mesdames', False),
+    ('N.B', 'nota bene', False),
+    ('p.c.q', 'parce que', False),
+    ('Pr', 'professeur', False),
+    ('qqch', 'quelque chose', False),
+    ('rdv', 'rendez-vous', False),
+    ('max', 'maximum', False),
+    ('min', 'minimum', False),
+    ('no', 'numéro', False),
+    ('adr', 'adresse', False),
+    ('dr', 'docteur', False),
+    ('st', 'saint', False),
+    ('co', 'compagnie', False),
+    ('jr', 'junior', False),
+    ('sgt', 'sergent', False),
+    ('capt', 'capitaine', False),
+    ('col', 'colonel', True),
+    ('av', 'avenue', False),
+    ('av. J.-C', 'avant Jésus-Christ', False),
+    ('apr. J.-C', 'après Jésus-Christ', False),
+    ('art', 'article', True),
+    ('boul', 'boulevard', False),
+    ('c.-à-d', 'c’est-à-dire', False),
+    ('etc', 'et cetera', False),
+    ('ex', 'exemple', True),
+    ('excl', 'exclusivement', False),
+]
+
+
+def _compile_fr(abbreviation, period_required):
+    '''Build the regular expression matching one french abbreviation.
+
+    The abbreviation is escaped because several contain '.' or '-'. The
+    trailing negative lookahead makes it match whole tokens only; without it
+    'M' would match the first letter of every word starting with "m".'''
+    period = r'\.' if period_required else r'\.?'
+    return re.compile(r'\b' + re.escape(abbreviation) + period + r'(?!\w)',
+                      re.IGNORECASE)
+
+
+# List of (regular expression, replacement) pairs for abbreviations in french.
+# Longest abbreviations come first so that 'av. J.-C' is tried before 'av'.
+abbreviations_fr = [(_compile_fr(abbreviation, period_required), replacement)
+                    for abbreviation, replacement, period_required
+                    in sorted(_abbreviations_fr, key=lambda x: len(x[0]), reverse=True)]
diff --git a/TTS/tts/utils/text/cleaners.py b/TTS/tts/utils/text/cleaners.py
index 68bbc389..492a1017 100644
--- a/TTS/tts/utils/text/cleaners.py
+++ b/TTS/tts/utils/text/cleaners.py
@@ -13,35 +13,21 @@ hyperparameter. Some cleaners are English-specific. You'll typically want to use
 import re
 from unidecode import unidecode
 from .number_norm import normalize_numbers
+from .abbreviations import abbreviations_en, abbreviations_fr
 
 # Regular expression matching whitespace:
 _whitespace_re = re.compile(r'\s+')
 
-# List of (regular expression, replacement) pairs for abbreviations:
-_abbreviations = [(re.compile('\\b%s\\.' % x[0], re.IGNORECASE), x[1])
-                  for x in [
-                      ('mrs', 'misess'),
-                      ('mr', 'mister'),
-                      ('dr', 'doctor'),
-                      ('st', 'saint'),
-                      ('co', 'company'),
-                      ('jr', 'junior'),
-                      ('maj', 'major'),
-                      ('gen', 'general'),
-                      ('drs', 'doctors'),
-                      ('rev', 'reverend'),
-                      ('lt', 'lieutenant'),
-                      ('hon', 'honorable'),
-                      ('sgt', 'sergeant'),
-                      ('capt', 'captain'),
-                      ('esq', 'esquire'),
-                      ('ltd', 'limited'),
-                      ('col', 'colonel'),
-                      ('ft', 'fort'),
-                  ]]
-
 
-def expand_abbreviations(text):
+def expand_abbreviations(text, lang='en'):
+    '''Expand the abbreviations known for language `lang` ('en' or 'fr') in `text`.'''
+    if lang == 'en':
+        _abbreviations = abbreviations_en
+    elif lang == 'fr':
+        _abbreviations = abbreviations_fr
+    else:
+        # Fail loudly instead of hitting an UnboundLocalError in the loop below.
+        raise ValueError("No abbreviation table for language '%s'" % lang)
     for regex, replacement in _abbreviations:
         text = re.sub(regex, replacement, text)
     return text
@@ -121,9 +107,11 @@ def english_cleaners(text):
     return text
 
 def french_cleaners(text):
-    '''Basic pipeline for French text. There is no need to expand abbreviation and
-    numbers, phonemizer already does that'''
+    '''Pipeline for French text. There is no need to expand numbers, phonemizer already does that'''
+    # Expand before lowercasing so that capitalised replacements
+    # (e.g. 'Madame') are lowercased along with the rest of the text.
+    text = expand_abbreviations(text, lang='fr')
     text = lowercase(text)
     text = replace_symbols(text, lang='fr')
     text = remove_aux_symbols(text)
     text = collapse_whitespace(text)