Implement BaseCharacters, IPAPhonemes, Graphemes

pull/1324/head
Eren Gölge 2021-11-16 13:27:25 +01:00
parent 1bee40af40
commit 2fb1f70503
1 changed files with 265 additions and 24 deletions

View File

@ -7,21 +7,33 @@ through Unidecode. For other data, you can modify _characters. See TRAINING_DATA
"""
def parse_symbols():
    """Collect the module's default symbol definitions into one dictionary.

    Returns:
        dict: Maps "pad", "eos", "bos", "characters", "punctuations" and
            "phonemes" to the corresponding module-level default values.
    """
    names = ("pad", "eos", "bos", "characters", "punctuations", "phonemes")
    defaults = (_pad, _eos, _bos, _characters, _punctuations, _phonemes)
    return dict(zip(names, defaults))
def make_symbols(
characters,
phonemes=None,
punctuations="!'(),-.:;? ",
pad="_",
eos="~",
bos="^",
pad="<PAD>",
eos="<EOS>",
bos="<BOS>",
blank="<BLNK>",
unique=True,
): # pylint: disable=redefined-outer-name
"""Function to create symbols and phonemes
TODO: create phonemes_to_id and symbols_to_id dicts here."""
"""Function to create default characters and phonemes"""
_symbols = list(characters)
_symbols = [bos] + _symbols if len(bos) > 0 and bos is not None else _symbols
_symbols = [eos] + _symbols if len(bos) > 0 and eos is not None else _symbols
_symbols = [pad] + _symbols if len(bos) > 0 and pad is not None else _symbols
_symbols = [blank] + _symbols if len(bos) > 0 and blank is not None else _symbols
_phonemes = None
if phonemes is not None:
_phonemes_sorted = (
@ -35,9 +47,10 @@ def make_symbols(
return _symbols, _phonemes
# Single-character special symbols; each of these names is rebound just below.
_pad = "_"
_eos = "~"
_bos = "^"
# Multi-character special tokens; being assigned last, these are the values in effect.
_pad = "<PAD>"
_eos = "<EOS>"
_bos = "<BOS>"
_blank = "<BLNK>" # TODO: check if we need this alongside with PAD
# Default grapheme set: ASCII upper/lower-case letters followed by the punctuation marks and space.
_characters = "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz!'(),-.:;? "
# Default punctuation marks (including space); the same characters also end `_characters`.
_punctuations = "!'(),-.:;? "
@ -52,24 +65,252 @@ _phonemes = _vowels + _non_pulmonic_consonants + _pulmonic_consonants + _suprase
# Build the module-level `symbols` and `phonemes` tables from the default definitions.
symbols, phonemes = make_symbols(_characters, _phonemes, _punctuations, _pad, _eos, _bos)
# Generate ALIEN language
# from random import shuffle
# shuffle(phonemes)
class BaseCharacters:
    """🐸BaseCharacters class

    Every vocabulary class should inherit from this class.

    The vocabulary is laid out as ``[pad, eos, bos] + characters + punctuations``.
    Special tokens that are ``None`` or empty are left out. ``blank`` is stored
    on the instance but is not inserted into the vocabulary.

    Args:
        characters (str):
            Main set of characters to be used in the vocabulary.

        punctuations (str):
            Characters to be treated as punctuation.

        pad (str):
            Special padding character that would be ignored by the model.

        eos (str):
            End of the sentence character.

        bos (str):
            Beginning of the sentence character.

        blank (str):
            Optional character used between characters by some models for better prosody.

        is_unique (bool):
            Remove duplicates from the provided characters. Punctuations that are
            already part of the vocabulary are not appended a second time.
            Defaults to True.

        is_sorted (bool):
            Sort the characters in alphabetical order. Defaults to True.
    """

    def __init__(
        self,
        characters: str,
        punctuations: str,
        pad: str,
        eos: str,
        bos: str,
        blank: str,
        is_unique: bool = True,
        is_sorted: bool = True,
    ) -> None:
        self._characters = characters
        self._punctuations = punctuations
        self._pad = pad
        self._eos = eos
        self._bos = bos
        self._blank = blank
        self.is_unique = is_unique
        self.is_sorted = is_sorted
        self._create_vocab()

    # Every setter below rebuilds the vocabulary so that `vocab` and the
    # id <-> char lookup tables never go stale.

    @property
    def characters(self):
        return self._characters

    @characters.setter
    def characters(self, characters):
        self._characters = characters
        self._create_vocab()

    @property
    def punctuations(self):
        return self._punctuations

    @punctuations.setter
    def punctuations(self, punctuations):
        self._punctuations = punctuations
        self._create_vocab()

    @property
    def pad(self):
        return self._pad

    @pad.setter
    def pad(self, pad):
        self._pad = pad
        self._create_vocab()

    @property
    def eos(self):
        return self._eos

    @eos.setter
    def eos(self, eos):
        self._eos = eos
        self._create_vocab()

    @property
    def bos(self):
        return self._bos

    @bos.setter
    def bos(self, bos):
        self._bos = bos
        self._create_vocab()

    @property
    def blank(self):
        return self._blank

    @blank.setter
    def blank(self, blank):
        self._blank = blank
        self._create_vocab()

    @property
    def vocab(self):
        return self._vocab

    @property
    def num_chars(self):
        return len(self._vocab)

    def _create_vocab(self):
        """Rebuild `vocab` and the char <-> id lookup tables from the current fields."""
        chars = self._characters or ""
        if self.is_unique:
            # dict.fromkeys drops duplicates while keeping first-seen order, so
            # the ids are reproducible across runs even when is_sorted is False.
            chars = dict.fromkeys(chars)
        if self.is_sorted:
            chars = sorted(chars)
        vocab = list(chars)

        # Insert in reverse so the final order is pad, eos, bos. Each token is
        # checked on its own; None and "" both mean "token not used".
        for token in (self._bos, self._eos, self._pad):
            if token:
                vocab.insert(0, token)

        punctuations = list(self._punctuations or "")
        if self.is_unique:
            # Skip punctuations that are already in the vocabulary (e.g. when
            # `characters` itself contains them) instead of duplicating them.
            present = set(vocab)
            punctuations = [p for p in dict.fromkeys(punctuations) if p not in present]

        self._vocab = vocab + punctuations
        self._char_to_id = {char: idx for idx, char in enumerate(self._vocab)}
        self._id_to_char = dict(enumerate(self._vocab))
        # A repeated entry would make the two lookup tables disagree with `vocab`.
        assert (
            len(self._vocab) == len(self._char_to_id) == len(self._id_to_char)
        ), " [!] There are duplicate characters in the character set."

    def char_to_id(self, char: str) -> int:
        """Return the vocabulary index of `char`; raises KeyError when it is unknown."""
        return self._char_to_id[char]

    def id_to_char(self, idx: int) -> str:
        """Return the vocabulary entry stored at `idx`; raises KeyError when out of range."""
        return self._id_to_char[idx]

    @staticmethod
    def init_from_config(config: "Coqpit"):
        """Create an instance from `config.characters`, used as keyword arguments.

        When `config.characters` is None no arguments are passed.
        """
        params = config.characters if config.characters is not None else {}
        return BaseCharacters(**params)
def parse_symbols():
    """Collect the module's default symbol definitions into one dictionary.

    Returns:
        dict: Maps "pad", "eos", "bos", "characters", "punctuations" and
            "phonemes" to the corresponding module-level default values.
    """
    names = ("pad", "eos", "bos", "characters", "punctuations", "phonemes")
    defaults = (_pad, _eos, _bos, _characters, _punctuations, _phonemes)
    return dict(zip(names, defaults))
class IPAPhonemes(BaseCharacters):
    """🐸IPAPhonemes class to manage `TTS.tts` model vocabulary

    Intended to be used with models using IPAPhonemes as input.
    It uses system defaults for the undefined class arguments.

    Args:
        characters (str):
            Main set of case-sensitive characters to be used in the vocabulary. Defaults to `_phonemes`.

        punctuations (str):
            Characters to be treated as punctuation. Defaults to `_punctuations`.

        pad (str):
            Special padding character that would be ignored by the model. Defaults to `_pad`.

        eos (str):
            End of the sentence character. Defaults to `_eos`.

        bos (str):
            Beginning of the sentence character. Defaults to `_bos`.

        is_unique (bool):
            Remove duplicates from the provided characters. Defaults to True.

        is_sorted (bool):
            Sort the characters in alphabetical order. Defaults to True.

        blank (str):
            Optional character used between characters by some models for better prosody.
            Defaults to `_blank`. Declared last so existing positional calls keep their meaning.
    """

    def __init__(
        self,
        characters: str = _phonemes,
        punctuations: str = _punctuations,
        pad: str = _pad,
        eos: str = _eos,
        bos: str = _bos,
        is_unique: bool = True,
        is_sorted: bool = True,
        blank: str = _blank,
    ) -> None:
        # Forward by keyword: the base class takes `blank` between `bos` and
        # `is_unique`, so a positional pass-through would shift the two flags.
        super().__init__(
            characters=characters,
            punctuations=punctuations,
            pad=pad,
            eos=eos,
            bos=bos,
            blank=blank,
            is_unique=is_unique,
            is_sorted=is_sorted,
        )

    @staticmethod
    def init_from_config(config: "Coqpit"):
        """Create an instance from `config.characters`, used as keyword arguments.

        When `config.characters` is None the class defaults are used.
        """
        params = config.characters if config.characters is not None else {}
        return IPAPhonemes(**params)
class Graphemes(BaseCharacters):
    """🐸Graphemes class to manage `TTS.tts` model vocabulary

    Intended to be used with models using graphemes as input.
    It uses system defaults for the undefined class arguments.

    Args:
        characters (str):
            Main set of case-sensitive characters to be used in the vocabulary. Defaults to `_characters`.

        punctuations (str):
            Characters to be treated as punctuation. Defaults to `_punctuations`.

        pad (str):
            Special padding character that would be ignored by the model. Defaults to `_pad`.

        eos (str):
            End of the sentence character. Defaults to `_eos`.

        bos (str):
            Beginning of the sentence character. Defaults to `_bos`.

        is_unique (bool):
            Remove duplicates from the provided characters. Defaults to True.

        is_sorted (bool):
            Sort the characters in alphabetical order. Defaults to True.

        blank (str):
            Optional character used between characters by some models for better prosody.
            Defaults to `_blank`. Declared last so existing positional calls keep their meaning.
    """

    def __init__(
        self,
        characters: str = _characters,
        punctuations: str = _punctuations,
        pad: str = _pad,
        eos: str = _eos,
        bos: str = _bos,
        is_unique: bool = True,
        is_sorted: bool = True,
        blank: str = _blank,
    ) -> None:
        # Forward by keyword: the base class takes `blank` between `bos` and
        # `is_unique`, so a positional pass-through would shift the two flags.
        super().__init__(
            characters=characters,
            punctuations=punctuations,
            pad=pad,
            eos=eos,
            bos=bos,
            blank=blank,
            is_unique=is_unique,
            is_sorted=is_sorted,
        )

    @staticmethod
    def init_from_config(config: "Coqpit"):
        """Create an instance from `config.characters`, used as keyword arguments.

        When `config.characters` is None the class defaults are used.
        """
        params = config.characters if config.characters is not None else {}
        return Graphemes(**params)
if __name__ == "__main__":
    # Dump the symbol tables built at import time.
    symbol_count = len(symbols)
    print(" > TTS symbols {}".format(symbol_count))
    print(symbols)
    phoneme_count = len(phonemes)
    print(" > TTS phonemes {}".format(phoneme_count))
    ordered_phonemes = sorted(phonemes)
    print("".join(ordered_phonemes))

    # Build both vocabulary classes with their defaults and show the result.
    gr = Graphemes()
    ph = IPAPhonemes()
    print(gr.vocab)
    print(ph.vocab)
    print(gr.num_chars)

    # Round trip: char -> id -> char must give the character back.
    round_trip = gr.id_to_char(gr.char_to_id("a"))
    assert round_trip == "a"