From 8e22147a194b351e1961561015d1af7cd94fdec9 Mon Sep 17 00:00:00 2001 From: Eren Golge Date: Wed, 21 Nov 2018 17:05:45 +0100 Subject: [PATCH] phonem extraction for training --- datasets/TTSDataset.py | 2 +- utils/text/__init__.py | 20 ++++++- utils/text/cleaners.py | 2 +- utils/text/cmudict.py | 78 +++++++++++++++++++++---- utils/text/numbers.py | 129 ----------------------------------------- utils/text/symbols.py | 3 +- 6 files changed, 90 insertions(+), 144 deletions(-) delete mode 100644 utils/text/numbers.py diff --git a/datasets/TTSDataset.py b/datasets/TTSDataset.py index e97b38af..d6282de3 100644 --- a/datasets/TTSDataset.py +++ b/datasets/TTSDataset.py @@ -6,7 +6,7 @@ import torch import random from torch.utils.data import Dataset -from utils.text import text_to_sequence +from utils.text import text_to_sequence, phoneme_to_sequence from utils.data import (prepare_data, pad_per_step, prepare_tensor, prepare_stop_target) diff --git a/utils/text/__init__.py b/utils/text/__init__.py index 37716fa9..ed4b6e3a 100644 --- a/utils/text/__init__.py +++ b/utils/text/__init__.py @@ -2,16 +2,26 @@ import re from utils.text import cleaners -from utils.text.symbols import symbols +from utils.text.symbols import symbols, phonemes # Mappings from symbol to numeric ID and vice versa: _symbol_to_id = {s: i for i, s in enumerate(symbols)} _id_to_symbol = {i: s for i, s in enumerate(symbols)} +_phonemes_to_id = {s: i for i, s in enumerate(phonemes)} +_id_to_phonemes = {i: s for i, s in enumerate(phonemes)} + # Regular expression matching text enclosed in curly braces: _curly_re = re.compile(r'(.*?)\{(.+?)\}(.*)') +def phoneme_to_sequence(text, cleaner_names): + sequence = [] + sequence += _phonem_to_sequence(_clean_text(text, cleaner_names)) + sequence.append(_phonemes_to_id['~']) + return sequence + + def text_to_sequence(text, cleaner_names): '''Converts a string of text to a sequence of IDs corresponding to the symbols in the text. 
@@ -69,9 +79,17 @@ def _symbols_to_sequence(symbols): return [_symbol_to_id[s] for s in symbols if _should_keep_symbol(s)] +def _phonem_to_sequence(phonemes): + return [_phonemes_to_id[s] for s in phonemes if _should_keep_phonem(s)] + + def _arpabet_to_sequence(text): return _symbols_to_sequence(['@' + s for s in text.split()]) def _should_keep_symbol(s): return s in _symbol_to_id and s is not '_' and s is not '~' + + +def _should_keep_phonem(p): + return p in _phonemes_to_id and p != '_' and p != '~' diff --git a/utils/text/cleaners.py b/utils/text/cleaners.py index 31c04ae4..a33f91b5 100644 --- a/utils/text/cleaners.py +++ b/utils/text/cleaners.py @@ -12,7 +12,7 @@ hyperparameter. Some cleaners are English-specific. You'll typically want to use import re from unidecode import unidecode -from .numbers import normalize_numbers +from .number_norm import normalize_numbers # Regular expression matching whitespace: _whitespace_re = re.compile(r'\s+') diff --git a/utils/text/cmudict.py b/utils/text/cmudict.py index 291ad33f..c74076cb 100644 --- a/utils/text/cmudict.py +++ b/utils/text/cmudict.py @@ -1,19 +1,75 @@ # -*- coding: utf-8 -*- import re +import phonemizer -valid_symbols = [ - 'AA', 'AA0', 'AA1', 'AA2', 'AE', 'AE0', 'AE1', 'AE2', 'AH', 'AH0', 'AH1', - 'AH2', 'AO', 'AO0', 'AO1', 'AO2', 'AW', 'AW0', 'AW1', 'AW2', 'AY', 'AY0', - 'AY1', 'AY2', 'B', 'CH', 'D', 'DH', 'EH', 'EH0', 'EH1', 'EH2', 'ER', 'ER0', - 'ER1', 'ER2', 'EY', 'EY0', 'EY1', 'EY2', 'F', 'G', 'HH', 'IH', 'IH0', - 'IH1', 'IH2', 'IY', 'IY0', 'IY1', 'IY2', 'JH', 'K', 'L', 'M', 'N', 'NG', - 'OW', 'OW0', 'OW1', 'OW2', 'OY', 'OY0', 'OY1', 'OY2', 'P', 'R', 'S', 'SH', - 'T', 'TH', 'UH', 'UH0', 'UH1', 'UH2', 'UW', 'UW0', 'UW1', 'UW2', 'V', 'W', - 'Y', 'Z', 'ZH' -] +# valid_symbols = [ +# 'AA', 'AA0', 'AA1', 'AA2', 'AE', 'AE0', 'AE1', 'AE2', 'AH', 'AH0', 'AH1', +# 'AH2', 'AO', 'AO0', 'AO1', 'AO2', 'AW', 'AW0', 'AW1', 'AW2', 'AY', 'AY0', +# 'AY1', 'AY2', 'B', 'CH', 'D', 'DH', 'EH', 'EH0', 'EH1', 'EH2', 
'ER', 'ER0', +# 'ER1', 'ER2', 'EY', 'EY0', 'EY1', 'EY2', 'F', 'G', 'HH', 'IH', 'IH0', +# 'IH1', 'IH2', 'IY', 'IY0', 'IY1', 'IY2', 'JH', 'K', 'L', 'M', 'N', 'NG', +# 'OW', 'OW0', 'OW1', 'OW2', 'OY', 'OY0', 'OY1', 'OY2', 'P', 'R', 'S', 'SH', +# 'T', 'TH', 'UH', 'UH0', 'UH1', 'UH2', 'UW', 'UW0', 'UW1', 'UW2', 'V', 'W', +# 'Y', 'Z', 'ZH' +# ] -_valid_symbol_set = set(valid_symbols) + +_phonemes = { + 'aa', + 'ae', + 'ah', + 'ao', + 'aw', + 'ax', + 'ay', + 'b', + 'ch', + 'd', + 'dh', + 'eh', + 'ey', + 'f', + 'g', + 'hh', + 'i', + 'ih', + 'iy', + 'jh', + 'k', + 'l', + 'm', + 'n', + 'ng', + 'ow', + 'oy', + 'p', + 'pau', + 'r', + 's', + 'sh', + 'ssil', + 't', + 'th', + 'uh', + 'uw', + 'v', + 'w', + 'y', + 'z' +} + +_phonemes = set(_phonemes) + + +def text2phone(text): + seperator = phonemizer.separator.Separator('', '', ' ') + ph = phonemizer.phonemize(text, separator=seperator) + ph = ph.split(' ') + ph = [p for p in ph if p] + + result = [char2code[p] for p in ph] + return result class CMUDict: diff --git a/utils/text/numbers.py b/utils/text/numbers.py deleted file mode 100644 index 9cc6f4df..00000000 --- a/utils/text/numbers.py +++ /dev/null @@ -1,129 +0,0 @@ -import re - -_comma_number_re = re.compile(r'([0-9][0-9\,]+[0-9])') -_decimal_number_re = re.compile(r'([0-9]+\.[0-9]+)') -_pounds_re = re.compile(r'£([0-9\,]*[0-9]+)') -_dollars_re = re.compile(r'\$([0-9\.\,]*[0-9]+)') -_ordinal_re = re.compile(r'([0-9]+)(st|nd|rd|th)') -_number_re = re.compile(r'[0-9]+') - -_units = [ - '', 'one', 'two', 'three', 'four', 'five', 'six', 'seven', 'eight', 'nine', - 'ten', 'eleven', 'twelve', 'thirteen', 'fourteen', 'fifteen', 'sixteen', - 'seventeen', 'eighteen', 'nineteen' -] - -_tens = [ - '', - 'ten', - 'twenty', - 'thirty', - 'forty', - 'fifty', - 'sixty', - 'seventy', - 'eighty', - 'ninety', -] - -_digit_groups = [ - '', - 'thousand', - 'million', - 'billion', - 'trillion', - 'quadrillion', -] - -_ordinal_suffixes = [ - ('one', 'first'), - ('two', 'second'), - ('three', 'third'), - 
('five', 'fifth'), - ('eight', 'eighth'), - ('nine', 'ninth'), - ('twelve', 'twelfth'), - ('ty', 'tieth'), -] - - -def _remove_commas(m): - return m.group(1).replace(',', '') - - -def _expand_decimal_point(m): - return m.group(1).replace('.', ' point ') - - -def _expand_dollars(m): - match = m.group(1) - parts = match.split('.') - if len(parts) > 2: - return match + ' dollars' # Unexpected format - dollars = int(parts[0]) if parts[0] else 0 - cents = int(parts[1]) if len(parts) > 1 and parts[1] else 0 - if dollars and cents: - dollar_unit = 'dollar' if dollars == 1 else 'dollars' - cent_unit = 'cent' if cents == 1 else 'cents' - return '%s %s, %s %s' % (dollars, dollar_unit, cents, cent_unit) - elif dollars: - dollar_unit = 'dollar' if dollars == 1 else 'dollars' - return '%s %s' % (dollars, dollar_unit) - elif cents: - cent_unit = 'cent' if cents == 1 else 'cents' - return '%s %s' % (cents, cent_unit) - else: - return 'zero dollars' - - -def _standard_number_to_words(n, digit_group): - parts = [] - if n >= 1000: - # Format next higher digit group. 
- parts.append(_standard_number_to_words(n // 1000, digit_group + 1)) - n = n % 1000 - - if n >= 100: - parts.append('%s hundred' % _units[n // 100]) - if n % 100 >= len(_units): - parts.append(_tens[(n % 100) // 10]) - parts.append(_units[(n % 100) % 10]) - else: - parts.append(_units[n % 100]) - if n > 0: - parts.append(_digit_groups[digit_group]) - return ' '.join([x for x in parts if x]) - - -def _number_to_words(n): - # Handle special cases first, then go to the standard case: - if n >= 1000000000000000000: - return str(n) # Too large, just return the digits - elif n == 0: - return 'zero' - elif n % 100 == 0 and n % 1000 != 0 and n < 3000: - return _standard_number_to_words(n // 100, 0) + ' hundred' - else: - return _standard_number_to_words(n, 0) - - -def _expand_number(m): - return _number_to_words(int(m.group(0))) - - -def _expand_ordinal(m): - num = _number_to_words(int(m.group(1))) - for suffix, replacement in _ordinal_suffixes: - if num.endswith(suffix): - return num[:-len(suffix)] + replacement - return num + 'th' - - -def normalize_numbers(text): - text = re.sub(_comma_number_re, _remove_commas, text) - text = re.sub(_pounds_re, r'\1 pounds', text) - text = re.sub(_dollars_re, _expand_dollars, text) - text = re.sub(_decimal_number_re, _expand_decimal_point, text) - text = re.sub(_ordinal_re, _expand_ordinal, text) - text = re.sub(_number_re, _expand_number, text) - return text diff --git a/utils/text/symbols.py b/utils/text/symbols.py index 4c8f6c43..4dc8814d 100644 --- a/utils/text/symbols.py +++ b/utils/text/symbols.py @@ -12,10 +12,11 @@ _eos = '~' _characters = 'ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz!\'(),-.:;? 
' # Prepend "@" to ARPAbet symbols to ensure uniqueness (some are the same as uppercase letters): -_arpabet = ['@' + s for s in cmudict.valid_symbols] +_arpabet = ['@' + s for s in sorted(cmudict._phonemes)] # Export all symbols: symbols = [_pad, _eos] + list(_characters) + _arpabet +phonemes = [_pad, _eos] + sorted(cmudict._phonemes) + list('!\'(),-.:;?') if __name__ == '__main__': print(symbols)