From 8e22147a194b351e1961561015d1af7cd94fdec9 Mon Sep 17 00:00:00 2001
From: Eren Golge <egolge@mozilla.com>
Date: Wed, 21 Nov 2018 17:05:45 +0100
Subject: [PATCH] phoneme extraction for training

---
 datasets/TTSDataset.py |   2 +-
 utils/text/__init__.py |  20 ++++++-
 utils/text/cleaners.py |   2 +-
 utils/text/cmudict.py  |  78 +++++++++++++++++++++----
 utils/text/numbers.py  | 129 -----------------------------------------
 utils/text/symbols.py  |   3 +-
 6 files changed, 90 insertions(+), 144 deletions(-)
 delete mode 100644 utils/text/numbers.py

diff --git a/datasets/TTSDataset.py b/datasets/TTSDataset.py
index e97b38af..d6282de3 100644
--- a/datasets/TTSDataset.py
+++ b/datasets/TTSDataset.py
@@ -6,7 +6,7 @@ import torch
 import random
 from torch.utils.data import Dataset
 
-from utils.text import text_to_sequence
+from utils.text import text_to_sequence, phoneme_to_sequence
 from utils.data import (prepare_data, pad_per_step, prepare_tensor,
                         prepare_stop_target)
 
diff --git a/utils/text/__init__.py b/utils/text/__init__.py
index 37716fa9..ed4b6e3a 100644
--- a/utils/text/__init__.py
+++ b/utils/text/__init__.py
@@ -2,16 +2,26 @@
 
 import re
 from utils.text import cleaners
-from utils.text.symbols import symbols
+from utils.text.symbols import symbols, phonemes
 
 # Mappings from symbol to numeric ID and vice versa:
 _symbol_to_id = {s: i for i, s in enumerate(symbols)}
 _id_to_symbol = {i: s for i, s in enumerate(symbols)}
 
+_phonemes_to_id = {ph: idx for idx, ph in enumerate(phonemes)}  # phoneme -> numeric ID
+_id_to_phonemes = dict(enumerate(phonemes))  # numeric ID -> phoneme
+
 # Regular expression matching text enclosed in curly braces:
 _curly_re = re.compile(r'(.*?)\{(.+?)\}(.*)')
 
 
+def phoneme_to_sequence(text, cleaner_names):
+    '''Cleans `text`, converts what `_phonem_to_sequence` keeps to IDs and appends the '~' ID.'''
+    # NOTE(review): if `_clean_text` returns a plain string it is consumed per character - confirm.
+    cleaned = _clean_text(text, cleaner_names)
+    return _phonem_to_sequence(cleaned) + [_phonemes_to_id['~']]
+
+
 def text_to_sequence(text, cleaner_names):
     '''Converts a string of text to a sequence of IDs corresponding to the symbols in the text.
 
@@ -69,9 +79,17 @@ def _symbols_to_sequence(symbols):
     return [_symbol_to_id[s] for s in symbols if _should_keep_symbol(s)]
 
 
+def _phonem_to_sequence(phonemes):  # IDs of the items accepted by _should_keep_phonem, in order
+    return [_phonemes_to_id[item] for item in phonemes if _should_keep_phonem(item)]
+
+
 def _arpabet_to_sequence(text):
     return _symbols_to_sequence(['@' + s for s in text.split()])
 
 
 def _should_keep_symbol(s):
     return s in _symbol_to_id and s is not '_' and s is not '~'
+
+
+def _should_keep_phonem(p):  # True for known phonemes other than the '_' and '~' markers
+    return p in _phonemes_to_id and p not in ('_', '~')  # compare by value, not identity
diff --git a/utils/text/cleaners.py b/utils/text/cleaners.py
index 31c04ae4..a33f91b5 100644
--- a/utils/text/cleaners.py
+++ b/utils/text/cleaners.py
@@ -12,7 +12,7 @@ hyperparameter. Some cleaners are English-specific. You'll typically want to use
 
 import re
 from unidecode import unidecode
-from .numbers import normalize_numbers
+from .number_norm import normalize_numbers
 
 # Regular expression matching whitespace:
 _whitespace_re = re.compile(r'\s+')
diff --git a/utils/text/cmudict.py b/utils/text/cmudict.py
index 291ad33f..c74076cb 100644
--- a/utils/text/cmudict.py
+++ b/utils/text/cmudict.py
@@ -1,19 +1,75 @@
 # -*- coding: utf-8 -*-
 
 import re
+import phonemizer
 
-valid_symbols = [
-    'AA', 'AA0', 'AA1', 'AA2', 'AE', 'AE0', 'AE1', 'AE2', 'AH', 'AH0', 'AH1',
-    'AH2', 'AO', 'AO0', 'AO1', 'AO2', 'AW', 'AW0', 'AW1', 'AW2', 'AY', 'AY0',
-    'AY1', 'AY2', 'B', 'CH', 'D', 'DH', 'EH', 'EH0', 'EH1', 'EH2', 'ER', 'ER0',
-    'ER1', 'ER2', 'EY', 'EY0', 'EY1', 'EY2', 'F', 'G', 'HH', 'IH', 'IH0',
-    'IH1', 'IH2', 'IY', 'IY0', 'IY1', 'IY2', 'JH', 'K', 'L', 'M', 'N', 'NG',
-    'OW', 'OW0', 'OW1', 'OW2', 'OY', 'OY0', 'OY1', 'OY2', 'P', 'R', 'S', 'SH',
-    'T', 'TH', 'UH', 'UH0', 'UH1', 'UH2', 'UW', 'UW0', 'UW1', 'UW2', 'V', 'W',
-    'Y', 'Z', 'ZH'
-]
+# valid_symbols = [
+#     'AA', 'AA0', 'AA1', 'AA2', 'AE', 'AE0', 'AE1', 'AE2', 'AH', 'AH0', 'AH1',
+#     'AH2', 'AO', 'AO0', 'AO1', 'AO2', 'AW', 'AW0', 'AW1', 'AW2', 'AY', 'AY0',
+#     'AY1', 'AY2', 'B', 'CH', 'D', 'DH', 'EH', 'EH0', 'EH1', 'EH2', 'ER', 'ER0',
+#     'ER1', 'ER2', 'EY', 'EY0', 'EY1', 'EY2', 'F', 'G', 'HH', 'IH', 'IH0',
+#     'IH1', 'IH2', 'IY', 'IY0', 'IY1', 'IY2', 'JH', 'K', 'L', 'M', 'N', 'NG',
+#     'OW', 'OW0', 'OW1', 'OW2', 'OY', 'OY0', 'OY1', 'OY2', 'P', 'R', 'S', 'SH',
+#     'T', 'TH', 'UH', 'UH0', 'UH1', 'UH2', 'UW', 'UW0', 'UW1', 'UW2', 'V', 'W',
+#     'Y', 'Z', 'ZH'
+# ]
 
-_valid_symbol_set = set(valid_symbols)
+
+# Phoneme inventory. Kept as an alphabetically ordered list, not a set, so that
+# iteration order (and every ID derived from it) is identical on every run.
+_phonemes = [
+    'aa',
+    'ae',
+    'ah',
+    'ao',
+    'aw',
+    'ax',
+    'ay',
+    'b',
+    'ch',
+    'd',
+    'dh',
+    'eh',
+    'ey',
+    'f',
+    'g',
+    'hh',
+    'i',
+    'ih',
+    'iy',
+    'jh',
+    'k',
+    'l',
+    'm',
+    'n',
+    'ng',
+    'ow',
+    'oy',
+    'p',
+    'pau',
+    'r',
+    's',
+    'sh',
+    'ssil',
+    't',
+    'th',
+    'uh',
+    'uw',
+    'v',
+    'w',
+    'y',
+    'z',
+]
+
+
+def text2phone(text):
+    '''Phonemizes `text` and maps each non-empty token through `char2code`.'''
+    # NOTE(review): `char2code` is not defined in the visible part of this module - confirm.
+    separator = phonemizer.separator.Separator('', '', ' ')
+    tokens = phonemizer.phonemize(text, separator=separator).split(' ')
+    # Filter instead of list.remove(''): remove() raises ValueError when no
+    # empty token exists and drops only the first one when several do.
+    return [char2code[tok] for tok in tokens if tok]
 
 
 class CMUDict:
diff --git a/utils/text/numbers.py b/utils/text/numbers.py
deleted file mode 100644
index 9cc6f4df..00000000
--- a/utils/text/numbers.py
+++ /dev/null
@@ -1,129 +0,0 @@
-import re
-
-_comma_number_re = re.compile(r'([0-9][0-9\,]+[0-9])')
-_decimal_number_re = re.compile(r'([0-9]+\.[0-9]+)')
-_pounds_re = re.compile(r'£([0-9\,]*[0-9]+)')
-_dollars_re = re.compile(r'\$([0-9\.\,]*[0-9]+)')
-_ordinal_re = re.compile(r'([0-9]+)(st|nd|rd|th)')
-_number_re = re.compile(r'[0-9]+')
-
-_units = [
-    '', 'one', 'two', 'three', 'four', 'five', 'six', 'seven', 'eight', 'nine',
-    'ten', 'eleven', 'twelve', 'thirteen', 'fourteen', 'fifteen', 'sixteen',
-    'seventeen', 'eighteen', 'nineteen'
-]
-
-_tens = [
-    '',
-    'ten',
-    'twenty',
-    'thirty',
-    'forty',
-    'fifty',
-    'sixty',
-    'seventy',
-    'eighty',
-    'ninety',
-]
-
-_digit_groups = [
-    '',
-    'thousand',
-    'million',
-    'billion',
-    'trillion',
-    'quadrillion',
-]
-
-_ordinal_suffixes = [
-    ('one', 'first'),
-    ('two', 'second'),
-    ('three', 'third'),
-    ('five', 'fifth'),
-    ('eight', 'eighth'),
-    ('nine', 'ninth'),
-    ('twelve', 'twelfth'),
-    ('ty', 'tieth'),
-]
-
-
-def _remove_commas(m):
-    return m.group(1).replace(',', '')
-
-
-def _expand_decimal_point(m):
-    return m.group(1).replace('.', ' point ')
-
-
-def _expand_dollars(m):
-    match = m.group(1)
-    parts = match.split('.')
-    if len(parts) > 2:
-        return match + ' dollars'  # Unexpected format
-    dollars = int(parts[0]) if parts[0] else 0
-    cents = int(parts[1]) if len(parts) > 1 and parts[1] else 0
-    if dollars and cents:
-        dollar_unit = 'dollar' if dollars == 1 else 'dollars'
-        cent_unit = 'cent' if cents == 1 else 'cents'
-        return '%s %s, %s %s' % (dollars, dollar_unit, cents, cent_unit)
-    elif dollars:
-        dollar_unit = 'dollar' if dollars == 1 else 'dollars'
-        return '%s %s' % (dollars, dollar_unit)
-    elif cents:
-        cent_unit = 'cent' if cents == 1 else 'cents'
-        return '%s %s' % (cents, cent_unit)
-    else:
-        return 'zero dollars'
-
-
-def _standard_number_to_words(n, digit_group):
-    parts = []
-    if n >= 1000:
-        # Format next higher digit group.
-        parts.append(_standard_number_to_words(n // 1000, digit_group + 1))
-        n = n % 1000
-
-    if n >= 100:
-        parts.append('%s hundred' % _units[n // 100])
-    if n % 100 >= len(_units):
-        parts.append(_tens[(n % 100) // 10])
-        parts.append(_units[(n % 100) % 10])
-    else:
-        parts.append(_units[n % 100])
-    if n > 0:
-        parts.append(_digit_groups[digit_group])
-    return ' '.join([x for x in parts if x])
-
-
-def _number_to_words(n):
-    # Handle special cases first, then go to the standard case:
-    if n >= 1000000000000000000:
-        return str(n)  # Too large, just return the digits
-    elif n == 0:
-        return 'zero'
-    elif n % 100 == 0 and n % 1000 != 0 and n < 3000:
-        return _standard_number_to_words(n // 100, 0) + ' hundred'
-    else:
-        return _standard_number_to_words(n, 0)
-
-
-def _expand_number(m):
-    return _number_to_words(int(m.group(0)))
-
-
-def _expand_ordinal(m):
-    num = _number_to_words(int(m.group(1)))
-    for suffix, replacement in _ordinal_suffixes:
-        if num.endswith(suffix):
-            return num[:-len(suffix)] + replacement
-    return num + 'th'
-
-
-def normalize_numbers(text):
-    text = re.sub(_comma_number_re, _remove_commas, text)
-    text = re.sub(_pounds_re, r'\1 pounds', text)
-    text = re.sub(_dollars_re, _expand_dollars, text)
-    text = re.sub(_decimal_number_re, _expand_decimal_point, text)
-    text = re.sub(_ordinal_re, _expand_ordinal, text)
-    text = re.sub(_number_re, _expand_number, text)
-    return text
diff --git a/utils/text/symbols.py b/utils/text/symbols.py
index 4c8f6c43..4dc8814d 100644
--- a/utils/text/symbols.py
+++ b/utils/text/symbols.py
@@ -12,10 +12,11 @@ _eos = '~'
 _characters = 'ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz!\'(),-.:;? '
 
 # Prepend "@" to ARPAbet symbols to ensure uniqueness (some are the same as uppercase letters):
-_arpabet = ['@' + s for s in cmudict.valid_symbols]
+_arpabet = ['@' + s for s in sorted(cmudict._phonemes)]  # sorted: keeps symbol order stable
 
 # Export all symbols:
 symbols = [_pad, _eos] + list(_characters) + _arpabet
+phonemes = [_pad, _eos] + sorted(cmudict._phonemes) + list('!\'(),-.:;?')  # list, stable order
 
 if __name__ == '__main__':
     print(symbols)