From 25c86ca715d7bc90d01f081f8f62d292815b9262 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Eren=20G=C3=B6lge?= Date: Wed, 27 Jan 2021 11:46:01 +0100 Subject: [PATCH 001/100] README update, set default models for synthesize.py and server.py. Disable verbose for ap init. --- README.md | 6 +++--- TTS/bin/synthesize.py | 7 +++++-- TTS/server/server.py | 4 ++-- TTS/utils/synthesizer.py | 4 ++-- 4 files changed, 12 insertions(+), 9 deletions(-) diff --git a/README.md b/README.md index ba036ddf..fc1598a6 100644 --- a/README.md +++ b/README.md @@ -10,11 +10,11 @@ TTS comes with [pretrained models](https://github.com/mozilla/TTS/wiki/Released- [![License]()](https://opensource.org/licenses/MPL-2.0) [![PyPI version](https://badge.fury.io/py/TTS.svg)](https://badge.fury.io/py/TTS) -:loudspeaker: [English Voice Samples](https://erogol.github.io/ddc-samples/) and [SoundCloud playlist](https://soundcloud.com/user-565970875/pocket-article-wavernn-and-tacotron2) +📢 [English Voice Samples](https://erogol.github.io/ddc-samples/) and [SoundCloud playlist](https://soundcloud.com/user-565970875/pocket-article-wavernn-and-tacotron2) -:man_cook: [TTS training recipes](https://github.com/erogol/TTS_recipes) +👩🏽‍🍳 [TTS training recipes](https://github.com/erogol/TTS_recipes) -:page_facing_up: [Text-to-Speech paper collection](https://github.com/erogol/TTS-papers) +📄 [Text-to-Speech paper collection](https://github.com/erogol/TTS-papers) ## 💬 Where to ask questions Please use our dedicated channels for questions and discussion. Help is much more valuable if it's shared publicly, so that more people can benefit from it. diff --git a/TTS/bin/synthesize.py b/TTS/bin/synthesize.py index b7ccf850..9a06c866 100755 --- a/TTS/bin/synthesize.py +++ b/TTS/bin/synthesize.py @@ -35,6 +35,9 @@ def main(): # list provided models ./TTS/bin/synthesize.py --list_models + # run tts with default models. 
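+    # (missing model files should be downloaded automatically on first use)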
+    ./TTS/bin/synthesize.py --text "Text for TTS"
+
     # run a model from the list
     ./TTS/bin/synthesize.py --text "Text for TTS" --model_name "//" --vocoder_name "//" --output_path

@@ -67,14 +70,14 @@ def main():
     parser.add_argument(
         '--model_name',
         type=str,
-        default=None,
+        default="tts_models/en/ljspeech/speedy-speech-wn",
         help=
         'Name of one of the pre-trained tts models in format //'
     )
     parser.add_argument(
         '--vocoder_name',
         type=str,
-        default=None,
+        default="vocoder_models/en/ljspeech/mulitband-melgan",
         help=
         'Name of one of the pre-trained vocoder models in format //'
     )
diff --git a/TTS/server/server.py b/TTS/server/server.py
index 1f7357af..425879cf 100644
--- a/TTS/server/server.py
+++ b/TTS/server/server.py
@@ -17,8 +17,8 @@ def create_argparser():
     parser = argparse.ArgumentParser()
     parser.add_argument('--list_models', type=convert_boolean, nargs='?', const=True, default=False,
                         help='list available pre-trained tts and vocoder models.')
-    parser.add_argument('--model_name', type=str, help='name of one of the released tts models.')
-    parser.add_argument('--vocoder_name', type=str, help='name of one of the released vocoder models.')
+    parser.add_argument('--model_name', type=str, default="tts_models/en/ljspeech/speedy-speech-wn", help='name of one of the released tts models.')
+    parser.add_argument('--vocoder_name', type=str, default="vocoder_models/en/ljspeech/mulitband-melgan", help='name of one of the released vocoder models.')
     parser.add_argument('--tts_checkpoint', type=str, help='path to custom tts checkpoint file')
     parser.add_argument('--tts_config', type=str, help='path to custom tts config.json file')
     parser.add_argument('--tts_speakers', type=str, help='path to JSON file containing speaker ids, if speaker ids are used in the model')
diff --git a/TTS/utils/synthesizer.py b/TTS/utils/synthesizer.py
index 615e0d1d..4131bc7c 100644
--- a/TTS/utils/synthesizer.py
+++ b/TTS/utils/synthesizer.py
@@ -79,7 +79,7 @@ class Synthesizer(object):
         self.tts_config = load_config(tts_config)
         self.use_phonemes = self.tts_config.use_phonemes
-        self.ap = AudioProcessor(**self.tts_config.audio)
+        self.ap = AudioProcessor(verbose=False, **self.tts_config.audio)

         if 'characters' in self.tts_config.keys():
             symbols, phonemes = make_symbols(**self.tts_config.characters)
@@ -96,7 +96,7 @@ class Synthesizer(object):

     def load_vocoder(self, model_file, model_config, use_cuda):
         self.vocoder_config = load_config(model_config)
-        self.vocoder_ap = AudioProcessor(**self.vocoder_config['audio'])
+        self.vocoder_ap = AudioProcessor(verbose=False, **self.vocoder_config['audio'])
         self.vocoder_model = setup_generator(self.vocoder_config)
         self.vocoder_model.load_checkpoint(self.vocoder_config, model_file, eval=True)
         if use_cuda:

From ccbd542eb07a8349eeeecc975bca33258252be9e Mon Sep 17 00:00:00 2001
From: Thorsten Mueller
Date: Wed, 27 Jan 2021 16:19:02 +0100
Subject: [PATCH 002/100] Added info if model already downloaded in --list_models

---
 TTS/utils/manage.py | 7 ++++++-
 1 file changed, 6 insertions(+), 1 deletion(-)

diff --git a/TTS/utils/manage.py b/TTS/utils/manage.py
index 25b3d797..af741156 100644
--- a/TTS/utils/manage.py
+++ b/TTS/utils/manage.py
@@ -52,7 +52,12 @@ class ModelManager(object):
         for lang in self.models_dict[model_type]:
             for dataset in self.models_dict[model_type][lang]:
                 for model in self.models_dict[model_type][lang][dataset]:
-                    print(f" >: {model_type}/{lang}/{dataset}/{model} ")
+                    model_full_name = f"{model_type}--{lang}--{dataset}--{model}"
+                    output_path = os.path.join(self.output_prefix,
model_full_name)
+                    if os.path.exists(output_path):
+                        print(f" >: {model_type}/{lang}/{dataset}/{model} [already downloaded]")
+                    else:
+                        print(f" >: {model_type}/{lang}/{dataset}/{model}")

     def download_model(self, model_name):
         """Download model files given the full model name.

From ca28e05ed71cea7462d9a4517a121edabf900239 Mon Sep 17 00:00:00 2001
From: Alexander Korolev
Date: Wed, 27 Jan 2021 16:33:25 +0100
Subject: [PATCH 003/100] update fixed stopnet_pos_weight parameter

The config parameter c.stopnet_pos_weight currently has no effect as it is
not used.
---
 TTS/bin/train_tacotron.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/TTS/bin/train_tacotron.py b/TTS/bin/train_tacotron.py
index ccb35a7c..be609905 100644
--- a/TTS/bin/train_tacotron.py
+++ b/TTS/bin/train_tacotron.py
@@ -534,7 +534,7 @@ def main(args):  # pylint: disable=redefined-outer-name
         optimizer_st = None

     # setup criterion
-    criterion = TacotronLoss(c, stopnet_pos_weight=10.0, ga_sigma=0.4)
+    criterion = TacotronLoss(c, stopnet_pos_weight=c.stopnet_pos_weight, ga_sigma=0.4)

     if args.restore_path:
         checkpoint = torch.load(args.restore_path, map_location='cpu')

From 8a6eee7fec46da19f486f392e3233f978ea85c5c Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Eren=20G=C3=B6lge?=
Date: Thu, 28 Jan 2021 17:04:08 +0100
Subject: [PATCH 004/100] distill import statement, check python version in setup.py

---
 TTS/utils/synthesizer.py |  2 +-
 setup.py                 | 12 +++++++++---
 2 files changed, 10 insertions(+), 4 deletions(-)

diff --git a/TTS/utils/synthesizer.py b/TTS/utils/synthesizer.py
index 4131bc7c..85e116cf 100644
--- a/TTS/utils/synthesizer.py
+++ b/TTS/utils/synthesizer.py
@@ -11,7 +11,7 @@ from TTS.tts.utils.speakers import load_speaker_mapping
 from TTS.vocoder.utils.generic_utils import setup_generator, interpolate_vocoder_input
 # pylint: disable=unused-wildcard-import
 # pylint: disable=wildcard-import
-from TTS.tts.utils.synthesis import *
+from TTS.tts.utils.synthesis import synthesis, trim_silence
 from TTS.tts.utils.text import make_symbols, phonemes, symbols

diff --git a/setup.py b/setup.py
index 6cc06f89..8df52e44 100644
--- a/setup.py
+++ b/setup.py
@@ -5,14 +5,20 @@ import os
 import shutil
 import subprocess
 import sys
+from distutils.extension import Extension
+from distutils.version import LooseVersion

 import numpy
 import setuptools.command.build_py
 import setuptools.command.develop
-
-from setuptools import find_packages, setup
-from distutils.extension import Extension
 from Cython.Build import cythonize
+from setuptools import find_packages, setup
+
+if LooseVersion(sys.version) < LooseVersion("3.6") or LooseVersion(sys.version) > LooseVersion("3.9"):
+    raise RuntimeError(
+        "TTS requires python >= 3.6 and <3.9 "
+        "but your Python version is {}".format(sys.version)
+    )

 # parameters for wheeling server.
parser = argparse.ArgumentParser(add_help=False, allow_abbrev=False)

From a926aa106de1846d72f29b5b662076720c3f5002 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Eren=20G=C3=B6lge?=
Date: Fri, 29 Jan 2021 01:36:21 +0100
Subject: [PATCH 005/100] reorder imports

---
 TTS/tts/utils/synthesis.py | 2 ++
 requirements.txt           | 1 -
 setup.py                   | 3 ++-
 3 files changed, 4 insertions(+), 2 deletions(-)

diff --git a/TTS/tts/utils/synthesis.py b/TTS/tts/utils/synthesis.py
index 7e71df64..be587211 100644
--- a/TTS/tts/utils/synthesis.py
+++ b/TTS/tts/utils/synthesis.py
@@ -1,3 +1,5 @@
+import os
+os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3'
 import pkg_resources
 installed = {pkg.key for pkg in pkg_resources.working_set}  #pylint: disable=not-an-iterable
 if 'tensorflow' in installed or 'tensorflow-gpu' in installed:
diff --git a/requirements.txt b/requirements.txt
index 31b49916..5b947f4e 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -6,7 +6,6 @@ numba==0.48
 librosa==0.7.2
 phonemizer>=2.2.0
 unidecode==0.4.20
-attrdict
 tensorboardX
 matplotlib
 Pillow
diff --git a/setup.py b/setup.py
index 8df52e44..9ea48efa 100644
--- a/setup.py
+++ b/setup.py
@@ -11,8 +11,9 @@ from distutils.version import LooseVersion
 import numpy
 import setuptools.command.build_py
 import setuptools.command.develop
-from Cython.Build import cythonize
 from setuptools import find_packages, setup
+from Cython.Build import cythonize
+

 if LooseVersion(sys.version) < LooseVersion("3.6") or LooseVersion(sys.version) > LooseVersion("3.9"):
     raise RuntimeError(

From 094b39939f394b83ad4b9a0984ac29552aa20906 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Eren=20G=C3=B6lge?=
Date: Fri, 29 Jan 2021 01:36:35 +0100
Subject: [PATCH 006/100] pyaml

---
 requirements.txt | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/requirements.txt b/requirements.txt
index 5b947f4e..1e92f17e 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -22,4 +22,4 @@ pylint==2.5.3
 gdown
 umap-learn
 cython
-pyyaml
+pyyaml
\ No newline at end of file

From 5a6abe78df8a6f1c72162a09bdd0765f92ca013c Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Eren=20G=C3=B6lge?=
Date: Fri, 29 Jan 2021 01:40:51 +0100
Subject: [PATCH 007/100] setup import reset

---
 setup.py | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/setup.py b/setup.py
index 9ea48efa..53a142a1 100644
--- a/setup.py
+++ b/setup.py
@@ -5,13 +5,12 @@ import os
 import shutil
 import subprocess
 import sys
-from distutils.extension import Extension
 from distutils.version import LooseVersion

 import numpy
 import setuptools.command.build_py
 import setuptools.command.develop
-from setuptools import find_packages, setup
+from setuptools import setup, Extension, find_packages
 from Cython.Build import cythonize

From e81ebec7a885b52d20506ffcdf6a30c4d058695f Mon Sep 17 00:00:00 2001
From: Alexander Korolev
Date: Fri, 29 Jan 2021 15:18:59 +0100
Subject: [PATCH 008/100] fix device mismatch wavegrad training

this should fix the device mismatch as seen here
https://github.com/mozilla/TTS/issues/622#issue-789802916
---
 TTS/bin/train_vocoder_wavegrad.py | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/TTS/bin/train_vocoder_wavegrad.py b/TTS/bin/train_vocoder_wavegrad.py
index 73802c63..b104652d 100644
--- a/TTS/bin/train_vocoder_wavegrad.py
+++ b/TTS/bin/train_vocoder_wavegrad.py
@@ -344,6 +344,10 @@ def main(args):  # pylint: disable=redefined-outer-name

     # setup criterion
     criterion = torch.nn.L1Loss().cuda()
+
+    if use_cuda:
+        model.cuda()
+        criterion.cuda()

     if args.restore_path:
         checkpoint = 
torch.load(args.restore_path, map_location='cpu')
@@ -378,10 +382,6 @@ def main(args):  # pylint: disable=redefined-outer-name
     else:
         args.restore_step = 0

-    if use_cuda:
-        model.cuda()
-        criterion.cuda()
-
     # DISTRUBUTED
     if num_gpus > 1:
         model = DDP_th(model, device_ids=[args.rank])

From aa5f24608a2e9529ae2e2d2a807687898de7b038 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Eren=20G=C3=B6lge?=
Date: Fri, 29 Jan 2021 15:00:33 +0000
Subject: [PATCH 009/100] hubconf.py and load .models.json from the default location by manage.py

---
 TTS/hubconf.py      | 26 ++++++++++++++++++++++++++
 TTS/utils/manage.py | 16 +++++++++++-----
 2 files changed, 37 insertions(+), 5 deletions(-)
 create mode 100644 TTS/hubconf.py

diff --git a/TTS/hubconf.py b/TTS/hubconf.py
new file mode 100644
index 00000000..c4e5bc99
--- /dev/null
+++ b/TTS/hubconf.py
@@ -0,0 +1,26 @@
+dependencies = ['torch', 'gdown']
+import torch
+import os
+import zipfile
+
+from TTS.utils.generic_utils import get_user_data_dir
+from TTS.utils.synthesizer import Synthesizer
+from TTS.utils.manage import ModelManager
+
+
+
+def tts(model_name='tts_models/en/ljspeech/tacotron2-DCA', vocoder_name='vocoder_models/en/ljspeech/mulitband-melgan', pretrained=True):
+    manager = ModelManager()
+
+    model_path, config_path = manager.download_model(model_name)
+    vocoder_path, vocoder_config_path = manager.download_model(vocoder_name)
+
+    # create synthesizer
+    synthesizer = Synthesizer(model_path, config_path, vocoder_path, vocoder_config_path)
+    return synthesizer
+
+
+if __name__ == '__main__':
+    # synthesizer = torch.hub.load('/data/rw/home/projects/TTS/TTS', 'tts', source='local')
+    synthesizer = torch.hub.load('mozilla/TTS:hub_conf', 'tts', source='github')
+    synthesizer.tts("This is a test!")
\ No newline at end of file
diff --git a/TTS/utils/manage.py b/TTS/utils/manage.py
index af741156..3cf8d67f 100644
--- a/TTS/utils/manage.py
+++ b/TTS/utils/manage.py
@@ -1,10 +1,11 @@
 import json
-import gdown
-from pathlib import Path
 import os
+from pathlib import Path

-from TTS.utils.io import load_config
+import gdown
 from TTS.utils.generic_utils import get_user_data_dir
+from TTS.utils.io import load_config
+

 class ModelManager(object):
     """Manage TTS models defined in .models.json.
@@ -17,12 +18,17 @@ class ModelManager(object):
     Args:
         models_file (str): path to .model.json
     """
-    def __init__(self, models_file):
+    def __init__(self, models_file=None):
         super().__init__()
         self.output_prefix = get_user_data_dir('tts')
         self.url_prefix = "https://drive.google.com/uc?id="
         self.models_dict = None
-        self.read_models_file(models_file)
+        if models_file is not None:
+            self.read_models_file(models_file)
+        else:
+            # try the default location
+            path = Path(__file__).parent / "../.models.json"
+            self.read_models_file(path)

     def read_models_file(self, file_path):
         """Read .models.json as a dict

From 0354b6f35ec31659a61182d4a7b32562704d08e0 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Eren=20G=C3=B6lge?=
Date: Fri, 29 Jan 2021 15:02:45 +0000
Subject: [PATCH 010/100] move hubconf

---
 TTS/hubconf.py => hubconf.py | 0
 1 file changed, 0 insertions(+), 0 deletions(-)
 rename TTS/hubconf.py => hubconf.py (100%)

diff --git a/TTS/hubconf.py b/hubconf.py
similarity index 100%
rename from TTS/hubconf.py
rename to hubconf.py

From 66c2a61f74188d506bd55afaa9d3826cfeee3983 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Eren=20G=C3=B6lge?=
Date: Fri, 29 Jan 2021 15:17:29 +0000
Subject: [PATCH 011/100] docstring hubconf

---
 hubconf.py | 17 +++++++++++++++--
 1 file changed, 15 insertions(+), 2 deletions(-)

diff --git a/hubconf.py b/hubconf.py
index c4e5bc99..0e2e60d8 100644
--- a/hubconf.py
+++ b/hubconf.py
@@ -8,8 +8,22 @@ from TTS.utils.synthesizer import Synthesizer
 from TTS.utils.manage import ModelManager


+def tts(model_name='tts_models/en/ljspeech/tacotron2-DCA', vocoder_name='vocoder_models/en/ljspeech/mulitband-melgan'):
+    """TTS entry point for PyTorch Hub that provides a Synthesizer object to synthesize speech from a given text.

-def tts(model_name='tts_models/en/ljspeech/tacotron2-DCA', vocoder_name='vocoder_models/en/ljspeech/mulitband-melgan', pretrained=True):
+    Example:
+        >>> synthesizer = torch.hub.load('mozilla/TTS', 'tts', source='github')
+        >>> wavs = synthesizer.tts("This is a test! This is also a test!!")
+        wavs - is a list of values of the synthesized speech.
+
+    Args:
+        model_name (str, optional): One of the model names from .model.json. Defaults to 'tts_models/en/ljspeech/tacotron2-DCA'.
+        vocoder_name (str, optional): One of the model names from .model.json. Defaults to 'vocoder_models/en/ljspeech/mulitband-melgan'.
+        pretrained (bool, optional): [description]. Defaults to True.
+
+    Returns:
+        TTS.utils.synthesizer.Synthesizer: Synthesizer object wrapping both vocoder and tts models.
+    """
     manager = ModelManager()

     model_path, config_path = manager.download_model(model_name)
@@ -21,6 +35,5 @@ def tts(model_name='tts_models/en/ljspeech/tacotron2-DCA', vocoder_name='vocoder


 if __name__ == '__main__':
-    # synthesizer = torch.hub.load('/data/rw/home/projects/TTS/TTS', 'tts', source='local')
     synthesizer = torch.hub.load('mozilla/TTS:hub_conf', 'tts', source='github')
     synthesizer.tts("This is a test!")
\ No newline at end of file

From 44c4a49745628692857261996fcc8014f5bc4506 Mon Sep 17 00:00:00 2001
From: Thorsten Mueller
Date: Fri, 29 Jan 2021 17:23:38 +0100
Subject: [PATCH 012/100] Set out_path to be required param.
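
The output location must now be passed explicitly. A usage sketch (file
names are illustrative):

    python TTS/bin/compute_statistics.py --config_path config.json --out_path scale_stats.npy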
---
 TTS/bin/compute_statistics.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/TTS/bin/compute_statistics.py b/TTS/bin/compute_statistics.py
index 7642f86b..5c1796ea 100755
--- a/TTS/bin/compute_statistics.py
+++ b/TTS/bin/compute_statistics.py
@@ -19,8 +19,8 @@ def main():
         description="Compute mean and variance of spectrogtram features.")
     parser.add_argument("--config_path", type=str, required=True,
                         help="TTS config file path to define audio processin parameters.")
-    parser.add_argument("--out_path", default=None, type=str,
-                        help="directory to save the output file.")
+    parser.add_argument("--out_path", type=str, required=True
+                        help="save path (directory and filename).")
     args = parser.parse_args()

     # load config

From 879d946f10cea343d83b097ae7a863cba6d07da9 Mon Sep 17 00:00:00 2001
From: Thorsten Mueller
Date: Sat, 30 Jan 2021 13:44:12 +0100
Subject: [PATCH 013/100] Oops. Added missing ,

---
 TTS/bin/compute_statistics.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/TTS/bin/compute_statistics.py b/TTS/bin/compute_statistics.py
index 5c1796ea..a74fe90a 100755
--- a/TTS/bin/compute_statistics.py
+++ b/TTS/bin/compute_statistics.py
@@ -19,7 +19,7 @@ def main():
         description="Compute mean and variance of spectrogtram features.")
     parser.add_argument("--config_path", type=str, required=True,
                         help="TTS config file path to define audio processin parameters.")
-    parser.add_argument("--out_path", type=str, required=True
+    parser.add_argument("--out_path", type=str, required=True,
                         help="save path (directory and filename).")
     args = parser.parse_args()

From c7407571fa902009ca4ebcf062f703d43eb7d3b1 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Eren=20G=C3=B6lge?=
Date: Mon, 1 Feb 2021 10:05:55 +0000
Subject: [PATCH 014/100] fix #638

---
 TTS/bin/train_vocoder_wavernn.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/TTS/bin/train_vocoder_wavernn.py b/TTS/bin/train_vocoder_wavernn.py
index cad357dc..14d57837 100644
--- a/TTS/bin/train_vocoder_wavernn.py
+++ b/TTS/bin/train_vocoder_wavernn.py
@@ -200,7 +200,7 @@ def train(model, optimizer, criterion, scheduler, scaler, ap, global_step, epoch
                 train_data[rand_idx], (tuple, list)) else train_data[rand_idx][0]
             wav = ap.load_wav(wav_path)
             ground_mel = ap.melspectrogram(wav)
-            sample_wav = model.generate(ground_mel,
+            sample_wav = model.inference(ground_mel,
                                      c.batched,
                                      c.target_samples,
                                      c.overlap_samples,
@@ -287,7 +287,7 @@ def evaluate(model, criterion, ap, global_step, epoch):
                 eval_data[rand_idx], (tuple, list)) else eval_data[rand_idx][0]
             wav = ap.load_wav(wav_path)
             ground_mel = ap.melspectrogram(wav)
-            sample_wav = model.generate(ground_mel,
+            sample_wav = model.inference(ground_mel,
                                      c.batched,
                                      c.target_samples,
                                      c.overlap_samples,

From d003e593477da90e9c3850b22350be7a01b2e7a7 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Eren=20G=C3=B6lge?=
Date: Mon, 1 Feb 2021 11:26:21 +0000
Subject: [PATCH 015/100] readme update for espeak install

---
 README.md | 7 ++++++-
 1 file changed, 6 insertions(+), 1 deletion(-)

diff --git a/README.md b/README.md
index fc1598a6..5c631140 100644
--- a/README.md
+++ b/README.md
@@ -93,7 +93,7 @@ Please use our dedicated channels for questions and discussion. Help is much mor
 You can also help us implement more models. Some TTS related work can be found [here](https://github.com/erogol/TTS-papers).

 ## Install TTS
-TTS supports **python >= 3.6, <3.9**.
+TTS is tested on Ubuntu 18.04 with **python >= 3.6, <3.9**.
 If you are only interested in [synthesizing speech](https://github.com/mozilla/TTS/tree/dev#example-synthesizing-speech-on-terminal-using-the-released-models) with the released TTS models, installing from PyPI is the easiest option.

@@ -108,6 +108,11 @@ git clone https://github.com/mozilla/TTS
 pip install -e .
 ```

+We use ```espeak``` to convert graphemes to phonemes. You might need to install it separately.
+```bash
+sudo apt-get install espeak
+```
+
 ## Directory Structure
 ```
 |- notebooks/ (Jupyter Notebooks for model evaluation, parameter selection and data analysis.)

From 699d2aa1c367d6ec21af456bb5082164771cd207 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Eren=20G=C3=B6lge?=
Date: Mon, 1 Feb 2021 11:26:46 +0000
Subject: [PATCH 016/100] pin cython version 0.29.20

---
 requirements.txt | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/requirements.txt b/requirements.txt
index 1e92f17e..b1baadd7 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -21,5 +21,5 @@ cardboardlint==1.3.0
 pylint==2.5.3
 gdown
 umap-learn
-cython
+cython==0.29.20 # > 0.29.20 breaks pyworld installation with the min numpy req of Tensorflow 2.4.1
 pyyaml
\ No newline at end of file

From 8774e374446ca1491ef9ed3dedc3bd9401c4195d Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Eren=20G=C3=B6lge?=
Date: Mon, 1 Feb 2021 11:34:05 +0000
Subject: [PATCH 017/100] unpin cython version and comment out pyworld in audio.py causing dep issues

---
 TTS/utils/audio.py | 21 +++++++++++----------
 requirements.txt   |  2 +-
 2 files changed, 12 insertions(+), 11 deletions(-)

diff --git a/TTS/utils/audio.py b/TTS/utils/audio.py
index 93a5880f..87ae4f5b 100644
--- a/TTS/utils/audio.py
+++ b/TTS/utils/audio.py
@@ -3,7 +3,7 @@ import soundfile as sf
 import numpy as np
 import scipy.io.wavfile
 import scipy.signal
-import pyworld as pw
+# import pyworld as pw

 from TTS.tts.utils.data import StandardScaler

@@ -292,15 +292,16 @@ class AudioProcessor(object):
         return pad // 2, pad // 2 + pad % 2

     ### Compute F0 ###
-    def compute_f0(self, x):
-        f0, t = pw.dio(
-            x.astype(np.double),
-            fs=self.sample_rate,
-            f0_ceil=self.mel_fmax,
-            frame_period=1000 * self.hop_length / self.sample_rate,
-        )
-        f0 = pw.stonemask(x.astype(np.double), f0, t, self.sample_rate)
-        return f0
+    # TODO: pw causes some dep issues
+    # def compute_f0(self, x):
+    #     f0, t = pw.dio(
+    #         x.astype(np.double),
+    #         fs=self.sample_rate,
+    #         f0_ceil=self.mel_fmax,
+    #         frame_period=1000 * self.hop_length / self.sample_rate,
+    #     )
+    #     f0 = pw.stonemask(x.astype(np.double), f0, t, self.sample_rate)
+    #     return f0

     ### Audio Processing ###
     def find_endpoint(self, wav, threshold_db=-40, min_silence_sec=0.8):
diff --git a/requirements.txt b/requirements.txt
index b1baadd7..1e92f17e 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -21,5 +21,5 @@ cardboardlint==1.3.0
 pylint==2.5.3
 gdown
 umap-learn
-cython==0.29.20 # > 0.29.20 breaks pyworld installation with the min numpy req of Tensorflow 2.4.1
+cython
 pyyaml
\ No newline at end of file

From 5c46543765192016a5638824cf3ff6fe88081088 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Eren=20G=C3=B6lge?=
Date: Mon, 1 Feb 2021 13:18:56 +0000
Subject: [PATCH 018/100] linter fixes and version updates for deps

---
 TTS/bin/train_vocoder_wavegrad.py  |  2 +-
 TTS/utils/audio.py                 |  2 +-
 hubconf.py                         | 15 ++++++---------
 pyproject.toml                     |  2 +-
 requirements.txt                   |  2 +-
 tests/test_vocoder_gan_datasets.py |  3 ++-
 6 files changed, 12 insertions(+), 14 deletions(-)

diff --git a/TTS/bin/train_vocoder_wavegrad.py b/TTS/bin/train_vocoder_wavegrad.py
index b104652d..fe5fb3d7 
100644 --- a/TTS/bin/train_vocoder_wavegrad.py +++ b/TTS/bin/train_vocoder_wavegrad.py @@ -344,7 +344,7 @@ def main(args): # pylint: disable=redefined-outer-name # setup criterion criterion = torch.nn.L1Loss().cuda() - + if use_cuda: model.cuda() criterion.cuda() diff --git a/TTS/utils/audio.py b/TTS/utils/audio.py index 87ae4f5b..3d31ce6e 100644 --- a/TTS/utils/audio.py +++ b/TTS/utils/audio.py @@ -292,7 +292,7 @@ class AudioProcessor(object): return pad // 2, pad // 2 + pad % 2 ### Compute F0 ### - # TODO: pw causes some dep issues + # TODO: pw causes some dep issues # def compute_f0(self, x): # f0, t = pw.dio( # x.astype(np.double), diff --git a/hubconf.py b/hubconf.py index 0e2e60d8..9de4f7b2 100644 --- a/hubconf.py +++ b/hubconf.py @@ -1,9 +1,6 @@ dependencies = ['torch', 'gdown'] import torch -import os -import zipfile -from TTS.utils.generic_utils import get_user_data_dir from TTS.utils.synthesizer import Synthesizer from TTS.utils.manage import ModelManager @@ -15,7 +12,7 @@ def tts(model_name='tts_models/en/ljspeech/tacotron2-DCA', vocoder_name='vocoder >>> synthesizer = torch.hub.load('mozilla/TTS', 'tts', source='github') >>> wavs = synthesizer.tts("This is a test! This is also a test!!") wavs - is a list of values of the synthesized speech. - + Args: model_name (str, optional): One of the model names from .model.json. Defaults to 'tts_models/en/ljspeech/tacotron2-DCA'. vocoder_name (str, optional): One of the model names from .model.json. Defaults to 'vocoder_models/en/ljspeech/mulitband-melgan'. @@ -23,15 +20,15 @@ def tts(model_name='tts_models/en/ljspeech/tacotron2-DCA', vocoder_name='vocoder Returns: TTS.utils.synthesizer.Synthesizer: Synthesizer object wrapping both vocoder and tts models. - """ + """ manager = ModelManager() - + model_path, config_path = manager.download_model(model_name) vocoder_path, vocoder_config_path = manager.download_model(vocoder_name) - + # create synthesizer - synthesizer = Synthesizer(model_path, config_path, vocoder_path, vocoder_config_path) - return synthesizer + synt = Synthesizer(model_path, config_path, vocoder_path, vocoder_config_path) + return synt if __name__ == '__main__': diff --git a/pyproject.toml b/pyproject.toml index fc0aca47..77d6b975 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,2 +1,2 @@ [build-system] -requires = ["setuptools", "wheel", "Cython", "numpy>=1.16.0"] \ No newline at end of file +requires = ["setuptools", "wheel", "Cython", "numpy==1.17.0"] \ No newline at end of file diff --git a/requirements.txt b/requirements.txt index 1e92f17e..a427062e 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,6 +1,6 @@ torch>=1.5 tensorflow==2.3.1 -numpy>=1.16.0 +numpy==1.17.0 scipy>=0.19.0 numba==0.48 librosa==0.7.2 diff --git a/tests/test_vocoder_gan_datasets.py b/tests/test_vocoder_gan_datasets.py index 2a487d9a..99a25dcf 100644 --- a/tests/test_vocoder_gan_datasets.py +++ b/tests/test_vocoder_gan_datasets.py @@ -61,7 +61,8 @@ def gan_dataset_case(batch_size, seq_len, hop_len, conv_pad, return_segments, us mel = ap.melspectrogram(audio) # the first 2 and the last 2 frames are skipped due to the padding # differences in stft - assert (feat - mel[:, :feat1.shape[-1]])[:, 2:-2].sum() <= 0, f' [!] {(feat - mel[:, :feat1.shape[-1]])[:, 2:-2].sum()}' + max_diff = abs((feat - mel[:, :feat1.shape[-1]])[:, 2:-2]).max() + assert max_diff <= 0, f' [!] 
{max_diff}' count_iter += 1 # if count_iter == max_iter: From 41f6579a746256d3d52598b1b4a7401b0f61a003 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Eren=20G=C3=B6lge?= Date: Mon, 1 Feb 2021 13:47:29 +0000 Subject: [PATCH 019/100] push numpy version up to 1.17.5 --- pyproject.toml | 2 +- requirements.txt | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/pyproject.toml b/pyproject.toml index 77d6b975..8b8da28d 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,2 +1,2 @@ [build-system] -requires = ["setuptools", "wheel", "Cython", "numpy==1.17.0"] \ No newline at end of file +requires = ["setuptools", "wheel", "Cython", "numpy==1.17.5"] \ No newline at end of file diff --git a/requirements.txt b/requirements.txt index a427062e..7a0d9f76 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,6 +1,6 @@ torch>=1.5 tensorflow==2.3.1 -numpy==1.17.0 +numpy==1.17.5 scipy>=0.19.0 numba==0.48 librosa==0.7.2 From 167bbc6a4a1f2823c281835b169d060e08438c8d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Eren=20G=C3=B6lge?= Date: Mon, 1 Feb 2021 14:06:34 +0000 Subject: [PATCH 020/100] update version number to 0.0.9.1 --- setup.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/setup.py b/setup.py index 53a142a1..eee958bb 100644 --- a/setup.py +++ b/setup.py @@ -33,7 +33,7 @@ args, unknown_args = parser.parse_known_args() # Remove our arguments from argv so that setuptools doesn't see them sys.argv = [sys.argv[0]] + unknown_args -version = '0.0.9' +version = '0.0.9.1' cwd = os.path.dirname(os.path.abspath(__file__)) # Handle Cython code From c75ea74914851ea5d6d549db33cc3a48b7a442ca Mon Sep 17 00:00:00 2001 From: Thorsten Mueller Date: Wed, 27 Jan 2021 16:19:02 +0100 Subject: [PATCH 021/100] Added info if model already downloaded in --list_models --- TTS/utils/manage.py | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/TTS/utils/manage.py b/TTS/utils/manage.py index 25b3d797..af741156 100644 --- a/TTS/utils/manage.py +++ b/TTS/utils/manage.py @@ -52,7 +52,12 @@ class ModelManager(object): for lang in self.models_dict[model_type]: for dataset in self.models_dict[model_type][lang]: for model in self.models_dict[model_type][lang][dataset]: - print(f" >: {model_type}/{lang}/{dataset}/{model} ") + model_full_name = f"{model_type}--{lang}--{dataset}--{model}" + output_path = os.path.join(self.output_prefix, model_full_name) + if os.path.exists(output_path): + print(f" >: {model_type}/{lang}/{dataset}/{model} [already downloaded]") + else: + print(f" >: {model_type}/{lang}/{dataset}/{model}") def download_model(self, model_name): """Download model files given the full model name. From 4cb4fcf02cda3624000f47fdb3c53a0a6b11cbac Mon Sep 17 00:00:00 2001 From: Thorsten Mueller Date: Fri, 29 Jan 2021 17:23:38 +0100 Subject: [PATCH 022/100] Set out_path to be required param. 
---
 TTS/bin/compute_statistics.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/TTS/bin/compute_statistics.py b/TTS/bin/compute_statistics.py
index 7642f86b..5c1796ea 100755
--- a/TTS/bin/compute_statistics.py
+++ b/TTS/bin/compute_statistics.py
@@ -19,8 +19,8 @@ def main():
         description="Compute mean and variance of spectrogtram features.")
     parser.add_argument("--config_path", type=str, required=True,
                         help="TTS config file path to define audio processin parameters.")
-    parser.add_argument("--out_path", default=None, type=str,
-                        help="directory to save the output file.")
+    parser.add_argument("--out_path", type=str, required=True
+                        help="save path (directory and filename).")
     args = parser.parse_args()

     # load config

From a82152eef353d30975c197426d66fb00d969d5cb Mon Sep 17 00:00:00 2001
From: Thorsten Mueller
Date: Sat, 30 Jan 2021 13:44:12 +0100
Subject: [PATCH 023/100] Oops. Added missing ,

---
 TTS/bin/compute_statistics.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/TTS/bin/compute_statistics.py b/TTS/bin/compute_statistics.py
index 5c1796ea..a74fe90a 100755
--- a/TTS/bin/compute_statistics.py
+++ b/TTS/bin/compute_statistics.py
@@ -19,7 +19,7 @@ def main():
         description="Compute mean and variance of spectrogtram features.")
     parser.add_argument("--config_path", type=str, required=True,
                         help="TTS config file path to define audio processin parameters.")
-    parser.add_argument("--out_path", type=str, required=True
+    parser.add_argument("--out_path", type=str, required=True,
                         help="save path (directory and filename).")
     args = parser.parse_args()

From cb77aef36c747b11ce08db7eb684760f70e99fee Mon Sep 17 00:00:00 2001
From: Branislav Gerazov
Date: Thu, 4 Feb 2021 09:52:03 +0100
Subject: [PATCH 024/100] waveRNN fix

---
 TTS/bin/train_vocoder_wavernn.py        | 28 ++++++-----
 TTS/vocoder/configs/wavernn_config.json |  3 ++
 TTS/vocoder/models/wavernn.py           |  9 ++--
 TTS/vocoder/utils/generic_utils.py      | 65 +++++++++++++------------
 4 files changed, 57 insertions(+), 48 deletions(-)

diff --git a/TTS/bin/train_vocoder_wavernn.py b/TTS/bin/train_vocoder_wavernn.py
index 14d57837..d38bdee5 100644
--- a/TTS/bin/train_vocoder_wavernn.py
+++ b/TTS/bin/train_vocoder_wavernn.py
@@ -32,7 +32,7 @@ from TTS.vocoder.datasets.preprocess import (
     load_wav_feat_data
 )
 from TTS.vocoder.utils.distribution import discretized_mix_logistic_loss, gaussian_loss
-from TTS.vocoder.utils.generic_utils import setup_wavernn
+from TTS.vocoder.utils.generic_utils import setup_generator
 from TTS.vocoder.utils.io import save_best_model, save_checkpoint

@@ -200,12 +200,14 @@ def train(model, optimizer, criterion, scheduler, scaler, ap, global_step, epoch
                 train_data[rand_idx], (tuple, list)) else train_data[rand_idx][0]
             wav = ap.load_wav(wav_path)
             ground_mel = ap.melspectrogram(wav)
+            ground_mel = torch.FloatTensor(ground_mel)
+            if use_cuda:
+                ground_mel = ground_mel.cuda(non_blocking=True)
             sample_wav = model.inference(ground_mel,
-                                     c.batched,
-                                     c.target_samples,
-                                     c.overlap_samples,
-                                     use_cuda
-                                     )
+                                         c.batched,
+                                         c.target_samples,
+                                         c.overlap_samples,
+                                         )
             predict_mel = ap.melspectrogram(sample_wav)

             # compute spectrograms
@@ -287,12 +289,14 @@ def evaluate(model, criterion, ap, global_step, epoch):
                 eval_data[rand_idx], (tuple, list)) else eval_data[rand_idx][0]
             wav = ap.load_wav(wav_path)
             ground_mel = ap.melspectrogram(wav)
+            ground_mel = torch.FloatTensor(ground_mel)
+            if use_cuda:
+                ground_mel = ground_mel.cuda(non_blocking=True)
             sample_wav = model.inference(ground_mel,
-                                     c.batched,
-                                     c.target_samples,
- c.overlap_samples, - use_cuda - ) + c.batched, + c.target_samples, + c.overlap_samples, + ) predict_mel = ap.melspectrogram(sample_wav) # Sample audio @@ -350,7 +354,7 @@ def main(args): # pylint: disable=redefined-outer-name eval_data, train_data = load_wav_data( c.data_path, c.eval_split_size) # setup model - model_wavernn = setup_wavernn(c) + model_wavernn = setup_generator(c) # setup amp scaler scaler = torch.cuda.amp.GradScaler() if c.mixed_precision else None diff --git a/TTS/vocoder/configs/wavernn_config.json b/TTS/vocoder/configs/wavernn_config.json index 58667b69..effb103b 100644 --- a/TTS/vocoder/configs/wavernn_config.json +++ b/TTS/vocoder/configs/wavernn_config.json @@ -56,6 +56,9 @@ "upsample_factors": [4, 8, 8] // this needs to correctly factorise hop_length }, +// GENERATOR - for backward compatibility + "generator_model": "WaveRNN", + // DATASET //"use_gta": true, // use computed gta features from the tts model "data_path": "/home/erogol/Data/libritts/LibriTTS/train-clean-360/", // path containing training wav files diff --git a/TTS/vocoder/models/wavernn.py b/TTS/vocoder/models/wavernn.py index cb03deb3..fdb71cff 100644 --- a/TTS/vocoder/models/wavernn.py +++ b/TTS/vocoder/models/wavernn.py @@ -260,7 +260,7 @@ class WaveRNN(nn.Module): x = F.relu(self.fc2(x)) return self.fc3(x) - def inference(self, mels, batched, target, overlap): + def inference(self, mels, batched=None, target=None, overlap=None): self.eval() device = mels.device @@ -350,10 +350,11 @@ class WaveRNN(nn.Module): self.gen_display(i, seq_len, b_size, start) output = torch.stack(output).transpose(0, 1) - output = output.cpu().numpy() - output = output.astype(np.float64) - + output = output.cpu() if batched: + output = output.numpy() + output = output.astype(np.float64) + output = self.xfade_and_unfold(output, target, overlap) else: output = output[0] diff --git a/TTS/vocoder/utils/generic_utils.py b/TTS/vocoder/utils/generic_utils.py index fb943a37..b43a1263 100644 --- a/TTS/vocoder/utils/generic_utils.py +++ b/TTS/vocoder/utils/generic_utils.py @@ -61,40 +61,37 @@ def plot_results(y_hat, y, ap, global_step, name_prefix): return figures -def to_camel(text): +def to_camel(text, cap=True): text = text.capitalize() return re.sub(r'(?!^)_([a-zA-Z])', lambda m: m.group(1).upper(), text) -def setup_wavernn(c): - print(" > Model: WaveRNN") - MyModel = importlib.import_module("TTS.vocoder.models.wavernn") - MyModel = getattr(MyModel, "WaveRNN") - model = MyModel( - rnn_dims=c.wavernn_model_params['rnn_dims'], - fc_dims=c.wavernn_model_params['fc_dims'], - mode=c.mode, - mulaw=c.mulaw, - pad=c.padding, - use_aux_net=c.wavernn_model_params['use_aux_net'], - use_upsample_net=c.wavernn_model_params['use_upsample_net'], - upsample_factors=c.wavernn_model_params['upsample_factors'], - feat_dims=c.audio['num_mels'], - compute_dims=c.wavernn_model_params['compute_dims'], - res_out_dims=c.wavernn_model_params['res_out_dims'], - num_res_blocks=c.wavernn_model_params['num_res_blocks'], - hop_length=c.audio["hop_length"], - sample_rate=c.audio["sample_rate"], - ) - return model - - def setup_generator(c): print(" > Generator Model: {}".format(c.generator_model)) MyModel = importlib.import_module('TTS.vocoder.models.' 
+ c.generator_model.lower()) - MyModel = getattr(MyModel, to_camel(c.generator_model)) - if c.generator_model.lower() in 'melgan_generator': + # this is to preserve the WaveRNN class name (instead of Wavernn) + if c.generator_model != 'WaveRNN': + MyModel = getattr(MyModel, to_camel(c.generator_model)) + else: + MyModel = getattr(MyModel, c.generator_model) + if c.generator_model.lower() in 'wavernn': + model = MyModel( + rnn_dims=c.wavernn_model_params['rnn_dims'], + fc_dims=c.wavernn_model_params['fc_dims'], + mode=c.mode, + mulaw=c.mulaw, + pad=c.padding, + use_aux_net=c.wavernn_model_params['use_aux_net'], + use_upsample_net=c.wavernn_model_params['use_upsample_net'], + upsample_factors=c.wavernn_model_params['upsample_factors'], + feat_dims=c.audio['num_mels'], + compute_dims=c.wavernn_model_params['compute_dims'], + res_out_dims=c.wavernn_model_params['res_out_dims'], + num_res_blocks=c.wavernn_model_params['num_res_blocks'], + hop_length=c.audio["hop_length"], + sample_rate=c.audio["sample_rate"],) + elif c.generator_model.lower() in 'melgan_generator': model = MyModel( in_channels=c.audio['num_mels'], out_channels=1, @@ -103,9 +100,10 @@ def setup_generator(c): upsample_factors=c.generator_model_params['upsample_factors'], res_kernel=3, num_res_blocks=c.generator_model_params['num_res_blocks']) - if c.generator_model in 'melgan_fb_generator': - pass - if c.generator_model.lower() in 'multiband_melgan_generator': + elif c.generator_model in 'melgan_fb_generator': + raise ValueError( + 'melgan_fb_generator is now fullband_melgan_generator') + elif c.generator_model.lower() in 'multiband_melgan_generator': model = MyModel( in_channels=c.audio['num_mels'], out_channels=4, @@ -114,7 +112,7 @@ def setup_generator(c): upsample_factors=c.generator_model_params['upsample_factors'], res_kernel=3, num_res_blocks=c.generator_model_params['num_res_blocks']) - if c.generator_model.lower() in 'fullband_melgan_generator': + elif c.generator_model.lower() in 'fullband_melgan_generator': model = MyModel( in_channels=c.audio['num_mels'], out_channels=1, @@ -123,7 +121,7 @@ def setup_generator(c): upsample_factors=c.generator_model_params['upsample_factors'], res_kernel=3, num_res_blocks=c.generator_model_params['num_res_blocks']) - if c.generator_model.lower() in 'parallel_wavegan_generator': + elif c.generator_model.lower() in 'parallel_wavegan_generator': model = MyModel( in_channels=1, out_channels=1, @@ -138,7 +136,7 @@ def setup_generator(c): bias=True, use_weight_norm=True, upsample_factors=c.generator_model_params['upsample_factors']) - if c.generator_model.lower() in 'wavegrad': + elif c.generator_model.lower() in 'wavegrad': model = MyModel( in_channels=c['audio']['num_mels'], out_channels=1, @@ -149,6 +147,9 @@ def setup_generator(c): ublock_out_channels=c['model_params']['ublock_out_channels'], upsample_factors=c['model_params']['upsample_factors'], upsample_dilations=c['model_params']['upsample_dilations']) + else: + raise NotImplementedError( + f'Model {c.generator_model} not implemented!') return model From 24ffa9e9f696121cb5326b3cba0659032d571b94 Mon Sep 17 00:00:00 2001 From: Branislav Gerazov Date: Fri, 5 Feb 2021 13:10:02 +0100 Subject: [PATCH 025/100] update wavernn test config, delete cap=True --- TTS/vocoder/utils/generic_utils.py | 2 +- tests/inputs/test_vocoder_wavernn_config.json | 3 +++ 2 files changed, 4 insertions(+), 1 deletion(-) diff --git a/TTS/vocoder/utils/generic_utils.py b/TTS/vocoder/utils/generic_utils.py index b43a1263..05ceba6b 100644 --- 
a/TTS/vocoder/utils/generic_utils.py +++ b/TTS/vocoder/utils/generic_utils.py @@ -61,7 +61,7 @@ def plot_results(y_hat, y, ap, global_step, name_prefix): return figures -def to_camel(text, cap=True): +def to_camel(text): text = text.capitalize() return re.sub(r'(?!^)_([a-zA-Z])', lambda m: m.group(1).upper(), text) diff --git a/tests/inputs/test_vocoder_wavernn_config.json b/tests/inputs/test_vocoder_wavernn_config.json index 9df32fef..3c6d06f5 100644 --- a/tests/inputs/test_vocoder_wavernn_config.json +++ b/tests/inputs/test_vocoder_wavernn_config.json @@ -54,6 +54,9 @@ "mulaw": false, // apply mulaw if mode is bits "padding": 2, // pad the input for resnet to see wider input length + // GENERATOR - for backward compatibility + "generator_model": "WaveRNN", + // DATASET //"use_gta": true, // use computed gta features from the tts model "data_path": "tests/data/ljspeech/wavs/", // path containing training wav files From f0635453256fd48091d26a474d6edfd300280429 Mon Sep 17 00:00:00 2001 From: Branislav Gerazov Date: Fri, 5 Feb 2021 13:26:33 +0100 Subject: [PATCH 026/100] improve robustness of defining wavernn in config file --- TTS/vocoder/utils/generic_utils.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/TTS/vocoder/utils/generic_utils.py b/TTS/vocoder/utils/generic_utils.py index 05ceba6b..0d532063 100644 --- a/TTS/vocoder/utils/generic_utils.py +++ b/TTS/vocoder/utils/generic_utils.py @@ -71,10 +71,10 @@ def setup_generator(c): MyModel = importlib.import_module('TTS.vocoder.models.' + c.generator_model.lower()) # this is to preserve the WaveRNN class name (instead of Wavernn) - if c.generator_model != 'WaveRNN': - MyModel = getattr(MyModel, to_camel(c.generator_model)) + if c.generator_model.lower() == 'wavernn': + MyModel = getattr(MyModel, 'WaveRNN') else: - MyModel = getattr(MyModel, c.generator_model) + MyModel = getattr(MyModel, to_camel(c.generator_model)) if c.generator_model.lower() in 'wavernn': model = MyModel( rnn_dims=c.wavernn_model_params['rnn_dims'], From d49757faaa97276e2c874421c466fa1545e51f75 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Eren=20G=C3=B6lge?= Date: Fri, 5 Feb 2021 13:10:43 +0000 Subject: [PATCH 027/100] linter updates --- TTS/bin/compute_attention_masks.py | 1 + TTS/bin/synthesize.py | 3 +- TTS/bin/train_encoder.py | 7 +- TTS/bin/train_glow_tts.py | 22 ++--- TTS/bin/train_speedy_speech.py | 7 +- TTS/bin/train_tacotron.py | 14 ++- TTS/bin/train_vocoder_gan.py | 18 ++-- TTS/bin/train_vocoder_wavegrad.py | 86 +++++++++---------- TTS/bin/train_vocoder_wavernn.py | 17 ++-- TTS/bin/tune_wavegrad.py | 2 - TTS/speaker_encoder/dataset.py | 9 +- TTS/speaker_encoder/utils/generic_utils.py | 1 - TTS/tts/datasets/TTSDataset.py | 42 ++++++--- TTS/tts/layers/common_layers.py | 2 +- TTS/utils/generic_utils.py | 2 - hubconf.py | 2 +- tests/inputs/test_vocoder_wavernn_config.json | 3 + 17 files changed, 122 insertions(+), 116 deletions(-) diff --git a/TTS/bin/compute_attention_masks.py b/TTS/bin/compute_attention_masks.py index fc02144a..53246e07 100644 --- a/TTS/bin/compute_attention_masks.py +++ b/TTS/bin/compute_attention_masks.py @@ -16,6 +16,7 @@ from TTS.utils.io import load_config if __name__ == '__main__': + # pylint: disable=bad-continuation parser = argparse.ArgumentParser( description='''Extract attention masks from trained Tacotron/Tacotron2 models. 
These masks can be used for different purposes including training a TTS model with a Duration Predictor.\n\n''' diff --git a/TTS/bin/synthesize.py b/TTS/bin/synthesize.py index 9a06c866..e0d214d5 100755 --- a/TTS/bin/synthesize.py +++ b/TTS/bin/synthesize.py @@ -179,7 +179,6 @@ def main(): # load models synthesizer = Synthesizer(model_path, config_path, vocoder_path, vocoder_config_path, args.use_cuda) - use_griffin_lim = vocoder_path is None print(" > Text: {}".format(args.text)) # # handle multi-speaker setting @@ -218,4 +217,4 @@ def main(): if __name__ == "__main__": - main() \ No newline at end of file + main() diff --git a/TTS/bin/train_encoder.py b/TTS/bin/train_encoder.py index 5201f548..12fba6e1 100644 --- a/TTS/bin/train_encoder.py +++ b/TTS/bin/train_encoder.py @@ -34,7 +34,9 @@ print(" > Using CUDA: ", use_cuda) print(" > Number of GPUs: ", num_gpus) -def setup_loader(ap: AudioProcessor, is_val: bool=False, verbose: bool=False): +def setup_loader(ap: AudioProcessor, + is_val: bool = False, + verbose: bool = False): if is_val: loader = None else: @@ -254,8 +256,7 @@ if __name__ == '__main__': if args.restore_path: new_fields["restore_path"] = args.restore_path new_fields["github_branch"] = get_git_branch() - copy_model_files(c, args.config_path, OUT_PATH, - new_fields) + copy_model_files(c, args.config_path, OUT_PATH, new_fields) LOG_DIR = OUT_PATH tb_logger = TensorboardLogger(LOG_DIR, model_name='Speaker_Encoder') diff --git a/TTS/bin/train_glow_tts.py b/TTS/bin/train_glow_tts.py index d03ab1ee..5cd23ce4 100644 --- a/TTS/bin/train_glow_tts.py +++ b/TTS/bin/train_glow_tts.py @@ -119,7 +119,7 @@ def format_data(data): avg_text_length, avg_spec_length, attn_mask, item_idx -def data_depended_init(data_loader, model, ap): +def data_depended_init(data_loader, model): """Data depended initialization for activation normalization.""" if hasattr(model, 'module'): for f in model.module.decoder.flows: @@ -138,7 +138,7 @@ def data_depended_init(data_loader, model, ap): # format data text_input, text_lengths, mel_input, mel_lengths, spekaer_embed,\ - _, _, attn_mask, item_idx = format_data(data) + _, _, attn_mask, _ = format_data(data) # forward pass model _ = model.forward( @@ -177,7 +177,7 @@ def train(data_loader, model, criterion, optimizer, scheduler, # format data text_input, text_lengths, mel_input, mel_lengths, speaker_c,\ - avg_text_length, avg_spec_length, attn_mask, item_idx = format_data(data) + avg_text_length, avg_spec_length, attn_mask, _ = format_data(data) loader_time = time.time() - end_time @@ -191,20 +191,20 @@ def train(data_loader, model, criterion, optimizer, scheduler, # compute loss loss_dict = criterion(z, y_mean, y_log_scale, logdet, mel_lengths, - o_dur_log, o_total_dur, text_lengths) + o_dur_log, o_total_dur, text_lengths) # backward pass with loss scaling if c.mixed_precision: scaler.scale(loss_dict['loss']).backward() scaler.unscale_(optimizer) grad_norm = torch.nn.utils.clip_grad_norm_(model.parameters(), - c.grad_clip) + c.grad_clip) scaler.step(optimizer) scaler.update() else: loss_dict['loss'].backward() grad_norm = torch.nn.utils.clip_grad_norm_(model.parameters(), - c.grad_clip) + c.grad_clip) optimizer.step() # setup lr @@ -332,7 +332,7 @@ def evaluate(data_loader, model, criterion, ap, global_step, epoch): # format data text_input, text_lengths, mel_input, mel_lengths, speaker_c,\ - _, _, attn_mask, item_idx = format_data(data) + _, _, attn_mask, _ = format_data(data) # forward pass model z, logdet, y_mean, y_log_scale, alignments, o_dur_log, 
o_total_dur = model.forward( @@ -550,13 +550,14 @@ def main(args): # pylint: disable=redefined-outer-name eval_loader = setup_loader(ap, 1, is_val=True, verbose=True) global_step = args.restore_step - model = data_depended_init(train_loader, model, ap) + model = data_depended_init(train_loader, model) for epoch in range(0, c.epochs): c_logger.print_epoch_start(epoch, c.epochs) train_avg_loss_dict, global_step = train(train_loader, model, criterion, optimizer, scheduler, ap, global_step, epoch) - eval_avg_loss_dict = evaluate(eval_loader , model, criterion, ap, global_step, epoch) + eval_avg_loss_dict = evaluate(eval_loader, model, criterion, ap, + global_step, epoch) c_logger.print_epoch_end(epoch, eval_avg_loss_dict) target_loss = train_avg_loss_dict['avg_loss'] if c.run_eval: @@ -632,8 +633,7 @@ if __name__ == '__main__': if args.restore_path: new_fields["restore_path"] = args.restore_path new_fields["github_branch"] = get_git_branch() - copy_model_files(c, args.config_path, - OUT_PATH, new_fields) + copy_model_files(c, args.config_path, OUT_PATH, new_fields) os.chmod(AUDIO_PATH, 0o775) os.chmod(OUT_PATH, 0o775) diff --git a/TTS/bin/train_speedy_speech.py b/TTS/bin/train_speedy_speech.py index a24cf8bc..667f5abd 100644 --- a/TTS/bin/train_speedy_speech.py +++ b/TTS/bin/train_speedy_speech.py @@ -175,13 +175,13 @@ def train(data_loader, model, criterion, optimizer, scheduler, scaler.scale(loss_dict['loss']).backward() scaler.unscale_(optimizer) grad_norm = torch.nn.utils.clip_grad_norm_(model.parameters(), - c.grad_clip) + c.grad_clip) scaler.step(optimizer) scaler.update() else: loss_dict['loss'].backward() grad_norm = torch.nn.utils.clip_grad_norm_(model.parameters(), - c.grad_clip) + c.grad_clip) optimizer.step() # setup lr @@ -518,7 +518,8 @@ def main(args): # pylint: disable=redefined-outer-name train_avg_loss_dict, global_step = train(train_loader, model, criterion, optimizer, scheduler, ap, global_step, epoch) - eval_avg_loss_dict = evaluate(eval_loader , model, criterion, ap, global_step, epoch) + eval_avg_loss_dict = evaluate(eval_loader, model, criterion, ap, + global_step, epoch) c_logger.print_epoch_end(epoch, eval_avg_loss_dict) target_loss = train_avg_loss_dict['avg_loss'] if c.run_eval: diff --git a/TTS/bin/train_tacotron.py b/TTS/bin/train_tacotron.py index be609905..4640a3eb 100644 --- a/TTS/bin/train_tacotron.py +++ b/TTS/bin/train_tacotron.py @@ -178,10 +178,10 @@ def train(data_loader, model, criterion, optimizer, optimizer_st, scheduler, # compute loss loss_dict = criterion(postnet_output, decoder_output, mel_input, - linear_input, stop_tokens, stop_targets, - mel_lengths, decoder_backward_output, - alignments, alignment_lengths, alignments_backward, - text_lengths) + linear_input, stop_tokens, stop_targets, + mel_lengths, decoder_backward_output, + alignments, alignment_lengths, + alignments_backward, text_lengths) # check nan loss if torch.isnan(loss_dict['loss']).any(): @@ -199,7 +199,7 @@ def train(data_loader, model, criterion, optimizer, optimizer_st, scheduler, # stopnet optimizer step if c.separate_stopnet: - scaler_st.scale( loss_dict['stopnet_loss']).backward() + scaler_st.scale(loss_dict['stopnet_loss']).backward() scaler.unscale_(optimizer_st) optimizer_st, _ = adam_weight_decay(optimizer_st) grad_norm_st, _ = check_update(model.decoder.stopnet, 1.0) @@ -535,7 +535,6 @@ def main(args): # pylint: disable=redefined-outer-name # setup criterion criterion = TacotronLoss(c, stopnet_pos_weight=c.stopnet_pos_weight, ga_sigma=0.4) - if args.restore_path: 
checkpoint = torch.load(args.restore_path, map_location='cpu') try: @@ -706,8 +705,7 @@ if __name__ == '__main__': if args.restore_path: new_fields["restore_path"] = args.restore_path new_fields["github_branch"] = get_git_branch() - copy_model_files(c, args.config_path, - OUT_PATH, new_fields) + copy_model_files(c, args.config_path, OUT_PATH, new_fields) os.chmod(AUDIO_PATH, 0o775) os.chmod(OUT_PATH, 0o775) diff --git a/TTS/bin/train_vocoder_gan.py b/TTS/bin/train_vocoder_gan.py index 5f1e8c63..a1d1b322 100644 --- a/TTS/bin/train_vocoder_gan.py +++ b/TTS/bin/train_vocoder_gan.py @@ -33,9 +33,8 @@ use_cuda, num_gpus = setup_torch_training_env(True, True) def setup_loader(ap, is_val=False, verbose=False): - if is_val and not c.run_eval: - loader = None - else: + loader = None + if not is_val or c.run_eval: dataset = GANDataset(ap=ap, items=eval_data if is_val else train_data, seq_len=c.seq_len, @@ -274,14 +273,14 @@ def train(model_G, criterion_G, optimizer_G, model_D, criterion_D, optimizer_D, # compute spectrograms figures = plot_results(y_hat_vis, y_G, ap, global_step, - 'train') + 'train') tb_logger.tb_train_figures(global_step, figures) # Sample audio sample_voice = y_hat_vis[0].squeeze(0).detach().cpu().numpy() tb_logger.tb_train_audios(global_step, - {'train/audio': sample_voice}, - c.audio["sample_rate"]) + {'train/audio': sample_voice}, + c.audio["sample_rate"]) end_time = time.time() # print epoch stats @@ -430,11 +429,11 @@ def evaluate(model_G, criterion_G, model_D, criterion_D, ap, global_step, epoch) # Sample audio sample_voice = y_hat[0].squeeze(0).detach().cpu().numpy() tb_logger.tb_eval_audios(global_step, {'eval/audio': sample_voice}, - c.audio["sample_rate"]) + c.audio["sample_rate"]) tb_logger.tb_eval_stats(global_step, keep_avg.avg_values) - # synthesize a full voice + # synthesize a full voice data_loader.return_segments = False return keep_avg.avg_values @@ -639,8 +638,7 @@ if __name__ == '__main__': if args.restore_path: new_fields["restore_path"] = args.restore_path new_fields["github_branch"] = get_git_branch() - copy_model_files(c, args.config_path, - OUT_PATH, new_fields) + copy_model_files(c, args.config_path, OUT_PATH, new_fields) os.chmod(AUDIO_PATH, 0o775) os.chmod(OUT_PATH, 0o775) diff --git a/TTS/bin/train_vocoder_wavegrad.py b/TTS/bin/train_vocoder_wavegrad.py index fe5fb3d7..c53612c2 100644 --- a/TTS/bin/train_vocoder_wavegrad.py +++ b/TTS/bin/train_vocoder_wavegrad.py @@ -34,16 +34,16 @@ def setup_loader(ap, is_val=False, verbose=False): loader = None else: dataset = WaveGradDataset(ap=ap, - items=eval_data if is_val else train_data, - seq_len=c.seq_len, - hop_len=ap.hop_length, - pad_short=c.pad_short, - conv_pad=c.conv_pad, - is_training=not is_val, - return_segments=True, - use_noise_augment=False, - use_cache=c.use_cache, - verbose=verbose) + items=eval_data if is_val else train_data, + seq_len=c.seq_len, + hop_len=ap.hop_length, + pad_short=c.pad_short, + conv_pad=c.conv_pad, + is_training=not is_val, + return_segments=True, + use_noise_augment=False, + use_cache=c.use_cache, + verbose=verbose) sampler = DistributedSampler(dataset) if num_gpus > 1 else None loader = DataLoader(dataset, batch_size=c.batch_size, @@ -54,7 +54,6 @@ def setup_loader(ap, is_val=False, verbose=False): if is_val else c.num_loader_workers, pin_memory=False) - return loader @@ -79,8 +78,8 @@ def format_test_data(data): return m, x -def train(model, criterion, optimizer, - scheduler, scaler, ap, global_step, epoch): +def train(model, criterion, optimizer, scheduler, scaler, ap, 
global_step, + epoch): data_loader = setup_loader(ap, is_val=False, verbose=(epoch == 0)) model.train() epoch_time = 0 @@ -94,7 +93,8 @@ def train(model, criterion, optimizer, c_logger.print_train_start() # setup noise schedule noise_schedule = c['train_noise_schedule'] - betas = np.linspace(noise_schedule['min_val'], noise_schedule['max_val'], noise_schedule['num_steps']) + betas = np.linspace(noise_schedule['min_val'], noise_schedule['max_val'], + noise_schedule['num_steps']) if hasattr(model, 'module'): model.module.compute_noise_level(betas) else: @@ -120,7 +120,7 @@ def train(model, criterion, optimizer, # compute losses loss = criterion(noise, noise_hat) - loss_wavegrad_dict = {'wavegrad_loss':loss} + loss_wavegrad_dict = {'wavegrad_loss': loss} # check nan loss if torch.isnan(loss).any(): @@ -133,13 +133,13 @@ def train(model, criterion, optimizer, scaler.scale(loss).backward() scaler.unscale_(optimizer) grad_norm = torch.nn.utils.clip_grad_norm_(model.parameters(), - c.clip_grad) + c.clip_grad) scaler.step(optimizer) scaler.update() else: loss.backward() grad_norm = torch.nn.utils.clip_grad_norm_(model.parameters(), - c.clip_grad) + c.clip_grad) optimizer.step() # schedule update @@ -205,7 +205,8 @@ def train(model, criterion, optimizer, epoch, OUT_PATH, model_losses=loss_dict, - scaler=scaler.state_dict() if c.mixed_precision else None) + scaler=scaler.state_dict() + if c.mixed_precision else None) end_time = time.time() @@ -246,14 +247,12 @@ def evaluate(model, criterion, ap, global_step, epoch): else: noise, x_noisy, noise_scale = model.compute_y_n(x) - # forward pass noise_hat = model(x_noisy, m, noise_scale) # compute losses loss = criterion(noise, noise_hat) - loss_wavegrad_dict = {'wavegrad_loss':loss} - + loss_wavegrad_dict = {'wavegrad_loss': loss} loss_dict = dict() for key, value in loss_wavegrad_dict.items(): @@ -284,7 +283,9 @@ def evaluate(model, criterion, ap, global_step, epoch): # setup noise schedule and inference noise_schedule = c['test_noise_schedule'] - betas = np.linspace(noise_schedule['min_val'], noise_schedule['max_val'], noise_schedule['num_steps']) + betas = np.linspace(noise_schedule['min_val'], + noise_schedule['max_val'], + noise_schedule['num_steps']) if hasattr(model, 'module'): model.module.compute_noise_level(betas) # compute voice @@ -315,7 +316,8 @@ def main(args): # pylint: disable=redefined-outer-name print(f" > Loading wavs from: {c.data_path}") if c.feature_path is not None: print(f" > Loading features from: {c.feature_path}") - eval_data, train_data = load_wav_feat_data(c.data_path, c.feature_path, c.eval_split_size) + eval_data, train_data = load_wav_feat_data(c.data_path, c.feature_path, + c.eval_split_size) else: eval_data, train_data = load_wav_data(c.data_path, c.eval_split_size) @@ -395,26 +397,25 @@ def main(args): # pylint: disable=redefined-outer-name global_step = args.restore_step for epoch in range(0, c.epochs): c_logger.print_epoch_start(epoch, c.epochs) - _, global_step = train(model, criterion, optimizer, - scheduler, scaler, ap, global_step, - epoch) - eval_avg_loss_dict = evaluate(model, criterion, ap, - global_step, epoch) + _, global_step = train(model, criterion, optimizer, scheduler, scaler, + ap, global_step, epoch) + eval_avg_loss_dict = evaluate(model, criterion, ap, global_step, epoch) c_logger.print_epoch_end(epoch, eval_avg_loss_dict) target_loss = eval_avg_loss_dict[c.target_loss] - best_loss = save_best_model(target_loss, - best_loss, - model, - optimizer, - scheduler, - None, - None, - None, - global_step, - 
epoch, - OUT_PATH, - model_losses=eval_avg_loss_dict, - scaler=scaler.state_dict() if c.mixed_precision else None) + best_loss = save_best_model( + target_loss, + best_loss, + model, + optimizer, + scheduler, + None, + None, + None, + global_step, + epoch, + OUT_PATH, + model_losses=eval_avg_loss_dict, + scaler=scaler.state_dict() if c.mixed_precision else None) if __name__ == '__main__': @@ -486,8 +487,7 @@ if __name__ == '__main__': if args.restore_path: new_fields["restore_path"] = args.restore_path new_fields["github_branch"] = get_git_branch() - copy_model_files(c, args.config_path, - OUT_PATH, new_fields) + copy_model_files(c, args.config_path, OUT_PATH, new_fields) os.chmod(AUDIO_PATH, 0o775) os.chmod(OUT_PATH, 0o775) diff --git a/TTS/bin/train_vocoder_wavernn.py b/TTS/bin/train_vocoder_wavernn.py index 14d57837..6847e011 100644 --- a/TTS/bin/train_vocoder_wavernn.py +++ b/TTS/bin/train_vocoder_wavernn.py @@ -200,12 +200,9 @@ def train(model, optimizer, criterion, scheduler, scaler, ap, global_step, epoch train_data[rand_idx], (tuple, list)) else train_data[rand_idx][0] wav = ap.load_wav(wav_path) ground_mel = ap.melspectrogram(wav) - sample_wav = model.inference(ground_mel, - c.batched, - c.target_samples, - c.overlap_samples, - use_cuda - ) + sample_wav = model.inference(ground_mel, c.batched, + c.target_samples, c.overlap_samples, + use_cuda) predict_mel = ap.melspectrogram(sample_wav) # compute spectrograms @@ -287,12 +284,8 @@ def evaluate(model, criterion, ap, global_step, epoch): eval_data[rand_idx], (tuple, list)) else eval_data[rand_idx][0] wav = ap.load_wav(wav_path) ground_mel = ap.melspectrogram(wav) - sample_wav = model.inference(ground_mel, - c.batched, - c.target_samples, - c.overlap_samples, - use_cuda - ) + sample_wav = model.inference(ground_mel, c.batched, c.target_samples, + c.overlap_samples, use_cuda) predict_mel = ap.melspectrogram(sample_wav) # Sample audio diff --git a/TTS/bin/tune_wavegrad.py b/TTS/bin/tune_wavegrad.py index 7461282d..436a2764 100644 --- a/TTS/bin/tune_wavegrad.py +++ b/TTS/bin/tune_wavegrad.py @@ -87,5 +87,3 @@ for base in tqdm(cartesian_product(base_values, repeat=args.num_iter), total=tot best_schedule = {'beta': beta} print(f" > Found a better schedule. - MSE: {mse.item()}") np.save(args.output_path, best_schedule) - - diff --git a/TTS/speaker_encoder/dataset.py b/TTS/speaker_encoder/dataset.py index 33cc4f36..6110ac4d 100644 --- a/TTS/speaker_encoder/dataset.py +++ b/TTS/speaker_encoder/dataset.py @@ -1,10 +1,9 @@ -import numpy -import numpy as np import queue -import torch import random + +import numpy as np +import torch from torch.utils.data import Dataset -from tqdm import tqdm class MyDataset(Dataset): @@ -155,7 +154,7 @@ class MyDataset(Dataset): # add random gaussian noise if self.additive_noise > 0: - noises_ = [numpy.random.normal(0, self.additive_noise, size=len(w)) for w in wavs_] + noises_ = [np.random.normal(0, self.additive_noise, size=len(w)) for w in wavs_] wavs_ = [wavs_[i] + noises_[i] for i in range(len(wavs_))] # get a random subset of each of the wavs and convert to MFCC. 
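The speaker-encoder dataset hunk above is a tidy-up: the duplicate bare `numpy` import is dropped in favor of the existing `np` alias and the unused `tqdm` import goes away, while the noise augmentation itself is unchanged. As a minimal standalone sketch of what that augmentation step does (the function name and signature are illustrative, not the project's API):

    import numpy as np

    def add_gaussian_noise(wavs, noise_std):
        # Mirrors the hunk above: draw zero-mean Gaussian noise per waveform
        # and add it sample-wise. `wavs` is assumed to be a list of 1-D float
        # arrays and `noise_std` plays the role of `self.additive_noise`.
        if noise_std <= 0:
            return wavs  # augmentation disabled
        return [w + np.random.normal(0, noise_std, size=len(w)) for w in wavs]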
diff --git a/TTS/speaker_encoder/utils/generic_utils.py b/TTS/speaker_encoder/utils/generic_utils.py index 021c7f45..47bf79cc 100644 --- a/TTS/speaker_encoder/utils/generic_utils.py +++ b/TTS/speaker_encoder/utils/generic_utils.py @@ -114,4 +114,3 @@ def check_config_speaker_encoder(c): check_argument('path', dataset_entry, restricted=True, val_type=str) check_argument('meta_file_train', dataset_entry, restricted=True, val_type=[str, list]) check_argument('meta_file_val', dataset_entry, restricted=True, val_type=str) - diff --git a/TTS/tts/datasets/TTSDataset.py b/TTS/tts/datasets/TTSDataset.py index 38dd2890..3b327cbc 100644 --- a/TTS/tts/datasets/TTSDataset.py +++ b/TTS/tts/datasets/TTSDataset.py @@ -90,7 +90,8 @@ class MyDataset(Dataset): return data @staticmethod - def _generate_and_cache_phoneme_sequence(text, cache_path, cleaners, language, tp, add_blank): + def _generate_and_cache_phoneme_sequence(text, cache_path, cleaners, + language, tp, add_blank): """generate a phoneme sequence from text. since the usage is for subsequent caching, we never add bos and eos chars here. Instead we add those dynamically later; based on the @@ -98,13 +99,16 @@ class MyDataset(Dataset): phonemes = phoneme_to_sequence(text, [cleaners], language=language, enable_eos_bos=False, - tp=tp, add_blank=add_blank) + tp=tp, + add_blank=add_blank) phonemes = np.asarray(phonemes, dtype=np.int32) np.save(cache_path, phonemes) return phonemes @staticmethod - def _load_or_generate_phoneme_sequence(wav_file, text, phoneme_cache_path, enable_eos_bos, cleaners, language, tp, add_blank): + def _load_or_generate_phoneme_sequence(wav_file, text, phoneme_cache_path, + enable_eos_bos, cleaners, language, + tp, add_blank): file_name = os.path.splitext(os.path.basename(wav_file))[0] # different names for normal phonemes and with blank chars. 
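The two `MyDataset` helpers rewrapped above implement a simple load-or-generate cache for phoneme sequences: compute the sequence once, store it as a NumPy file, and reuse it on later passes. A rough sketch of that pattern, assuming the cache path ends in `.npy` and with `to_sequence` standing in for the project's `phoneme_to_sequence` call:

    import os

    import numpy as np

    def load_or_generate(cache_path, to_sequence):
        # Reuse the cached sequence when present; otherwise compute it once
        # and persist it for later epochs and worker processes.
        if os.path.isfile(cache_path):
            return np.load(cache_path)
        sequence = np.asarray(to_sequence(), dtype=np.int32)
        np.save(cache_path, sequence)  # assumes cache_path ends in ".npy"
        return sequence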
@@ -143,12 +147,16 @@ class MyDataset(Dataset): if not self.input_seq_computed: if self.use_phonemes: - text = self._load_or_generate_phoneme_sequence(wav_file, text, self.phoneme_cache_path, self.enable_eos_bos, self.cleaners, self.phoneme_language, self.tp, self.add_blank) + text = self._load_or_generate_phoneme_sequence( + wav_file, text, self.phoneme_cache_path, + self.enable_eos_bos, self.cleaners, self.phoneme_language, + self.tp, self.add_blank) else: text = np.asarray(text_to_sequence(text, [self.cleaners], - tp=self.tp, add_blank=self.add_blank), - dtype=np.int32) + tp=self.tp, + add_blank=self.add_blank), + dtype=np.int32) assert text.size > 0, self.items[idx][1] assert wav.size > 0, self.items[idx][1] @@ -177,7 +185,8 @@ class MyDataset(Dataset): item = args[0] func_args = args[1] text, wav_file, *_ = item - phonemes = MyDataset._load_or_generate_phoneme_sequence(wav_file, text, *func_args) + phonemes = MyDataset._load_or_generate_phoneme_sequence( + wav_file, text, *func_args) return phonemes def compute_input_seq(self, num_workers=0): @@ -188,13 +197,18 @@ class MyDataset(Dataset): print(" | > Computing input sequences ...") for idx, item in enumerate(tqdm.tqdm(self.items)): text, *_ = item - sequence = np.asarray(text_to_sequence(text, [self.cleaners], - tp=self.tp, add_blank=self.add_blank), - dtype=np.int32) + sequence = np.asarray(text_to_sequence( + text, [self.cleaners], + tp=self.tp, + add_blank=self.add_blank), + dtype=np.int32) self.items[idx][0] = sequence else: - func_args = [self.phoneme_cache_path, self.enable_eos_bos, self.cleaners, self.phoneme_language, self.tp, self.add_blank] + func_args = [ + self.phoneme_cache_path, self.enable_eos_bos, self.cleaners, + self.phoneme_language, self.tp, self.add_blank + ] if self.verbose: print(" | > Computing phonemes ...") if num_workers == 0: @@ -203,7 +217,11 @@ class MyDataset(Dataset): self.items[idx][0] = phonemes else: with Pool(num_workers) as p: - phonemes = list(tqdm.tqdm(p.imap(MyDataset._phoneme_worker, [[item, func_args] for item in self.items]), total=len(self.items))) + phonemes = list( + tqdm.tqdm(p.imap(MyDataset._phoneme_worker, + [[item, func_args] + for item in self.items]), + total=len(self.items))) for idx, p in enumerate(phonemes): self.items[idx][0] = p diff --git a/TTS/tts/layers/common_layers.py b/TTS/tts/layers/common_layers.py index 5da9b49d..a23bb3f9 100644 --- a/TTS/tts/layers/common_layers.py +++ b/TTS/tts/layers/common_layers.py @@ -124,4 +124,4 @@ class Prenet(nn.Module): x = F.dropout(F.relu(linear(x)), p=0.5, training=self.training) else: x = F.relu(linear(x)) - return x \ No newline at end of file + return x diff --git a/TTS/utils/generic_utils.py b/TTS/utils/generic_utils.py index 5890f04d..9a803351 100644 --- a/TTS/utils/generic_utils.py +++ b/TTS/utils/generic_utils.py @@ -6,8 +6,6 @@ import subprocess import sys from pathlib import Path -import torch - def get_git_branch(): try: diff --git a/hubconf.py b/hubconf.py index 9de4f7b2..fc7003c9 100644 --- a/hubconf.py +++ b/hubconf.py @@ -33,4 +33,4 @@ def tts(model_name='tts_models/en/ljspeech/tacotron2-DCA', vocoder_name='vocoder if __name__ == '__main__': synthesizer = torch.hub.load('mozilla/TTS:hub_conf', 'tts', source='github') - synthesizer.tts("This is a test!") \ No newline at end of file + synthesizer.tts("This is a test!") diff --git a/tests/inputs/test_vocoder_wavernn_config.json b/tests/inputs/test_vocoder_wavernn_config.json index 9df32fef..3c6d06f5 100644 --- a/tests/inputs/test_vocoder_wavernn_config.json +++ 
b/tests/inputs/test_vocoder_wavernn_config.json @@ -54,6 +54,9 @@ "mulaw": false, // apply mulaw if mode is bits "padding": 2, // pad the input for resnet to see wider input length + // GENERATOR - for backward compatibility + "generator_model": "WaveRNN", + // DATASET //"use_gta": true, // use computed gta features from the tts model "data_path": "tests/data/ljspeech/wavs/", // path containing training wav files From 5b06c74bf418ed29e709f812bd78919795f6d204 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Eren=20G=C3=B6lge?= Date: Fri, 5 Feb 2021 13:23:00 +0000 Subject: [PATCH 028/100] hubconf dependency --- hubconf.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/hubconf.py b/hubconf.py index fc7003c9..ff65ea4f 100644 --- a/hubconf.py +++ b/hubconf.py @@ -1,4 +1,4 @@ -dependencies = ['torch', 'gdown'] +dependencies = ['torch', 'gdown', 'pysbd'] import torch from TTS.utils.synthesizer import Synthesizer From e7e880f514fee8fa4e81f2e2f4484e25f37817c9 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Eren=20G=C3=B6lge?= Date: Fri, 5 Feb 2021 13:42:24 +0000 Subject: [PATCH 029/100] fix gdown --- TTS/utils/manage.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/TTS/utils/manage.py b/TTS/utils/manage.py index 3cf8d67f..524d8dbf 100644 --- a/TTS/utils/manage.py +++ b/TTS/utils/manage.py @@ -105,7 +105,7 @@ class ModelManager(object): return output_model_path, output_config_path def _download_file(self, idx, output): - gdown.download(f"{self.url_prefix}{idx}", output=output) + gdown.download(f"{self.url_prefix}{idx}", output=output, quiet=False) From 39f65f139a46b399d5a64aaab266a91ae66fe852 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Eren=20G=C3=B6lge?= Date: Fri, 5 Feb 2021 13:57:30 +0000 Subject: [PATCH 030/100] hubconf update --- hubconf.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/hubconf.py b/hubconf.py index ff65ea4f..7fc020b5 100644 --- a/hubconf.py +++ b/hubconf.py @@ -1,11 +1,11 @@ -dependencies = ['torch', 'gdown', 'pysbd'] +dependencies = ['torch', 'gdown', 'pysbd', 'phonemizer', 'unidecode'] # apt install espeak import torch from TTS.utils.synthesizer import Synthesizer from TTS.utils.manage import ModelManager -def tts(model_name='tts_models/en/ljspeech/tacotron2-DCA', vocoder_name='vocoder_models/en/ljspeech/mulitband-melgan'): +def tts(model_name='tts_models/en/ljspeech/tacotron2-DCA', vocoder_name='vocoder_models/en/ljspeech/mulitband-melgan', use_cuda=False): """TTS entry point for PyTorch Hub that provides a Synthesizer object to synthesize speech from a give text. 
Example:
@@ -27,7 +27,7 @@ def tts(model_name='tts_models/en/ljspeech/tacotron2-DCA', vocoder_name='vocoder
     vocoder_path, vocoder_config_path = manager.download_model(vocoder_name)

     # create synthesizer
-    synt = Synthesizer(model_path, config_path, vocoder_path, vocoder_config_path)
+    synt = Synthesizer(model_path, config_path, vocoder_path, vocoder_config_path, use_cuda)
     return synt


From 4f8f274d6e20a9610a5ca8577d5786fb5e38a809 Mon Sep 17 00:00:00 2001
From: gerazov
Date: Sat, 6 Feb 2021 22:25:56 +0100
Subject: [PATCH 031/100] restructured arg parsing and processing to utils

---
 TTS/utils/arguments.py | 207 +++++++++++++++++++++++++++++++++++++++++
 1 file changed, 207 insertions(+)
 create mode 100644 TTS/utils/arguments.py

diff --git a/TTS/utils/arguments.py b/TTS/utils/arguments.py
new file mode 100644
index 00000000..c3190e50
--- /dev/null
+++ b/TTS/utils/arguments.py
@@ -0,0 +1,207 @@
+#!/usr/bin/env python3
+# -*- coding: utf-8 -*-
+"""Argument parser for training scripts."""
+
+import argparse
+import re
+import glob
+import os
+
+from TTS.utils.generic_utils import (
+    create_experiment_folder, get_git_branch)
+from TTS.utils.console_logger import ConsoleLogger
+from TTS.utils.io import copy_model_files, load_config
+from TTS.utils.tensorboard_logger import TensorboardLogger
+
+from TTS.tts.utils.generic_utils import check_config_tts
+
+
+def parse_arguments(argv):
+    """Parse command line arguments of training scripts.
+
+    Parameters
+    ----------
+    argv : list
+        This is a list of input arguments as given by sys.argv
+
+    Returns
+    -------
+    argparse.Namespace
+        Parsed arguments.
+
+    """
+    parser = argparse.ArgumentParser()
+    parser.add_argument(
+        "--continue_path",
+        type=str,
+        help=("Training output folder to continue training. Used to continue "
+              "a training. If it is used, 'config_path' is ignored."),
+        default="",
+        required="--config_path" not in argv)
+    parser.add_argument(
+        "--restore_path",
+        type=str,
+        help="Model file to be restored. Use to finetune a model.",
+        default="")
+    parser.add_argument(
+        "--config_path",
+        type=str,
+        help="Path to config file for training.",
+        required="--continue_path" not in argv)
+    parser.add_argument(
+        "--debug",
+        type=bool,
+        default=False,
+        help="Do not verify commit integrity to run training.")
+    parser.add_argument(
+        "--rank",
+        type=int,
+        default=0,
+        help="DISTRIBUTED: process rank for distributed training.")
+    parser.add_argument(
+        "--group_id",
+        type=str,
+        default="",
+        help="DISTRIBUTED: process group id.")
+
+    return parser.parse_args()
+
+
+def get_last_checkpoint(path):
+    """Get the latest checkpoint under a given path.
+
+    It is based on globbing for `*.pth.tar` and the RegEx
+    `checkpoint_([0-9]+)`.
+
+    Parameters
+    ----------
+    path : str
+        Path to the directory to search for checkpoint files.
+
+    Raises
+    ------
+    ValueError
+        If no checkpoint files are found.
+
+    Returns
+    -------
+    last_checkpoint : str
+        Last checkpoint filename.
+
+    """
+    last_checkpoint_num = 0
+    last_checkpoint = None
+    filenames = glob.glob(
+        os.path.join(path, "*.pth.tar"))
+    for filename in filenames:
+        try:
+            checkpoint_num = int(
+                re.search(r"checkpoint_([0-9]+)", filename).groups()[0])
+            if checkpoint_num > last_checkpoint_num:
+                last_checkpoint_num = checkpoint_num
+                last_checkpoint = filename
+        except AttributeError:  # if there's no match in the filename
+            pass
+    if last_checkpoint is None:
+        raise ValueError(f"No checkpoints in {path}!")
+    else:
+        return last_checkpoint
+
+
+def process_args(args, model_type):
+    """Process parsed command line arguments.
+
+    Parameters
+    ----------
+    args : argparse.Namespace or dict like
+        Parsed input arguments.
+    model_type : str
+        Model type used to check config parameters and setup the TensorBoard
+        logger. One of:
+            - tacotron
+            - glow_tts
+            - speedy_speech
+            - gan
+            - wavegrad
+            - wavernn
+
+    Raises
+    ------
+    ValueError
+        If `model_type` is not one of implemented choices.
+
+    Returns
+    -------
+    c : TTS.utils.io.AttrDict
+        Config parameters.
+    out_path : str
+        Path to save models and logging.
+    audio_path : str
+        Path to save generated test audios.
+    c_logger : TTS.utils.console_logger.ConsoleLogger
+        Class that does logging to the console.
+    tb_logger : TTS.utils.tensorboard.TensorboardLogger
+        Class that does the TensorBoard logging.
+
+    """
+    if args.continue_path != "":
+        args.output_path = args.continue_path
+        args.config_path = os.path.join(args.continue_path, "config.json")
+        list_of_files = glob.glob(
+            os.path.join(args.continue_path, "*.pth.tar")
+        )  # pick up all checkpoint files in the folder
+        args.restore_path = max(list_of_files, key=os.path.getctime)
+        # args.restore_path = get_last_checkpoint(args.continue_path)
+        print(f" > Training continues for {args.restore_path}")
+
+    # setup output paths and read configs
+    c = load_config(args.config_path)
+
+    if model_type in "tacotron glow_tts speedy_speech":
+        model_class = "TTS"
+    elif model_type in "gan wavegrad wavernn":
+        model_class = "VOCODER"
+    else:
+        raise ValueError(f"model type {model_type} not recognized!")
+
+    if model_class == "TTS":
+        check_config_tts(c)
+    elif model_class == "VOCODER":
+        print("Vocoder config checker not implemented, "
+              "skipping ...")
+    else:
+        raise ValueError(f"model type {model_type} not recognized!")
+
+    _ = os.path.dirname(os.path.realpath(__file__))
+
+    if model_type in "tacotron wavegrad wavernn" and c.mixed_precision:
+        print(" > Mixed precision mode is ON")
+
+    out_path = args.continue_path
+    if args.continue_path == "":
+        out_path = create_experiment_folder(c.output_path, c.run_name,
+                                            args.debug)
+
+    audio_path = os.path.join(out_path, "test_audios")
+
+    c_logger = ConsoleLogger()
+
+    if args.rank == 0:
+        os.makedirs(audio_path, exist_ok=True)
+        new_fields = {}
+        if args.restore_path:
+            new_fields["restore_path"] = args.restore_path
+        new_fields["github_branch"] = get_git_branch()
+        copy_model_files(c, args.config_path,
+                         out_path, new_fields)
+        os.chmod(audio_path, 0o775)
+        os.chmod(out_path, 0o775)
+
+    log_path = out_path
+
+    tb_logger = TensorboardLogger(log_path, model_name=model_class)
+
+    # write model desc to tensorboard
+    tb_logger.tb_add_text("model-description", c["run_description"], 0)
+
+    return c, out_path, audio_path, c_logger, tb_logger

From 2705d27b285fba03d1f2cbc69bbcbc737137d1b5 Mon Sep 17 00:00:00 2001
From: gerazov
Date: Sat, 6 Feb 2021 22:29:30 +0100
Subject: [PATCH 032/100] changed train scripts

---
 TTS/bin/train_glow_tts.py        | 111 +++----------
TTS/bin/train_speedy_speech.py | 92 ++------------- TTS/bin/train_tacotron.py | 110 ++++-------------- TTS/bin/train_vocoder_gan.py | 106 ++++------------- TTS/bin/train_vocoder_wavegrad.py | 181 +++++++++--------------------- TTS/bin/train_vocoder_wavernn.py | 110 +++--------------- 6 files changed, 134 insertions(+), 576 deletions(-) diff --git a/TTS/bin/train_glow_tts.py b/TTS/bin/train_glow_tts.py index 5cd23ce4..407616ec 100644 --- a/TTS/bin/train_glow_tts.py +++ b/TTS/bin/train_glow_tts.py @@ -1,8 +1,6 @@ #!/usr/bin/env python3 -# -*- coding: utf-8 -*- +"""Train Glow TTS model.""" -import argparse -import glob import os import sys import time @@ -14,10 +12,12 @@ import torch from torch.nn.parallel import DistributedDataParallel as DDP_th from torch.utils.data import DataLoader from torch.utils.data.distributed import DistributedSampler + +from TTS.utils.arguments import parse_arguments, process_args from TTS.tts.datasets.preprocess import load_meta_data from TTS.tts.datasets.TTSDataset import MyDataset from TTS.tts.layers.losses import GlowTTSLoss -from TTS.tts.utils.generic_utils import check_config_tts, setup_model +from TTS.tts.utils.generic_utils import setup_model from TTS.tts.utils.io import save_best_model, save_checkpoint from TTS.tts.utils.measures import alignment_diagonal_score from TTS.tts.utils.speakers import parse_speakers @@ -25,18 +25,15 @@ from TTS.tts.utils.synthesis import synthesis from TTS.tts.utils.text.symbols import make_symbols, phonemes, symbols from TTS.tts.utils.visual import plot_alignment, plot_spectrogram from TTS.utils.audio import AudioProcessor -from TTS.utils.console_logger import ConsoleLogger from TTS.utils.distribute import init_distributed, reduce_tensor from TTS.utils.generic_utils import (KeepAverage, count_parameters, - create_experiment_folder, get_git_branch, remove_experiment_folder, set_init_dict) -from TTS.utils.io import copy_model_files, load_config from TTS.utils.radam import RAdam -from TTS.utils.tensorboard_logger import TensorboardLogger from TTS.utils.training import NoamLR, setup_torch_training_env use_cuda, num_gpus = setup_torch_training_env(True, False) + def setup_loader(ap, r, is_val=False, verbose=False): if is_val and not c.run_eval: loader = None @@ -119,7 +116,7 @@ def format_data(data): avg_text_length, avg_spec_length, attn_mask, item_idx -def data_depended_init(data_loader, model): +def data_depended_init(data_loader, model, ap): """Data depended initialization for activation normalization.""" if hasattr(model, 'module'): for f in model.module.decoder.flows: @@ -138,7 +135,7 @@ def data_depended_init(data_loader, model): # format data text_input, text_lengths, mel_input, mel_lengths, spekaer_embed,\ - _, _, attn_mask, _ = format_data(data) + _, _, attn_mask, item_idx = format_data(data) # forward pass model _ = model.forward( @@ -177,7 +174,7 @@ def train(data_loader, model, criterion, optimizer, scheduler, # format data text_input, text_lengths, mel_input, mel_lengths, speaker_c,\ - avg_text_length, avg_spec_length, attn_mask, _ = format_data(data) + avg_text_length, avg_spec_length, attn_mask, item_idx = format_data(data) loader_time = time.time() - end_time @@ -191,20 +188,20 @@ def train(data_loader, model, criterion, optimizer, scheduler, # compute loss loss_dict = criterion(z, y_mean, y_log_scale, logdet, mel_lengths, - o_dur_log, o_total_dur, text_lengths) + o_dur_log, o_total_dur, text_lengths) # backward pass with loss scaling if c.mixed_precision: scaler.scale(loss_dict['loss']).backward() 
scaler.unscale_(optimizer) grad_norm = torch.nn.utils.clip_grad_norm_(model.parameters(), - c.grad_clip) + c.grad_clip) scaler.step(optimizer) scaler.update() else: loss_dict['loss'].backward() grad_norm = torch.nn.utils.clip_grad_norm_(model.parameters(), - c.grad_clip) + c.grad_clip) optimizer.step() # setup lr @@ -332,7 +329,7 @@ def evaluate(data_loader, model, criterion, ap, global_step, epoch): # format data text_input, text_lengths, mel_input, mel_lengths, speaker_c,\ - _, _, attn_mask, _ = format_data(data) + _, _, attn_mask, item_idx = format_data(data) # forward pass model z, logdet, y_mean, y_log_scale, alignments, o_dur_log, o_total_dur = model.forward( @@ -468,7 +465,6 @@ def evaluate(data_loader, model, criterion, ap, global_step, epoch): return keep_avg.avg_values -# FIXME: move args definition/parsing inside of main? def main(args): # pylint: disable=redefined-outer-name # pylint: disable=global-variable-undefined global meta_data_train, meta_data_eval, symbols, phonemes, speaker_mapping @@ -550,14 +546,13 @@ def main(args): # pylint: disable=redefined-outer-name eval_loader = setup_loader(ap, 1, is_val=True, verbose=True) global_step = args.restore_step - model = data_depended_init(train_loader, model) + model = data_depended_init(train_loader, model, ap) for epoch in range(0, c.epochs): c_logger.print_epoch_start(epoch, c.epochs) train_avg_loss_dict, global_step = train(train_loader, model, criterion, optimizer, scheduler, ap, global_step, epoch) - eval_avg_loss_dict = evaluate(eval_loader, model, criterion, ap, - global_step, epoch) + eval_avg_loss_dict = evaluate(eval_loader , model, criterion, ap, global_step, epoch) c_logger.print_epoch_end(epoch, eval_avg_loss_dict) target_loss = train_avg_loss_dict['avg_loss'] if c.run_eval: @@ -567,81 +562,9 @@ def main(args): # pylint: disable=redefined-outer-name if __name__ == '__main__': - parser = argparse.ArgumentParser() - parser.add_argument( - '--continue_path', - type=str, - help='Training output folder to continue training. Use to continue a training. If it is used, "config_path" is ignored.', - default='', - required='--config_path' not in sys.argv) - parser.add_argument( - '--restore_path', - type=str, - help='Model file to be restored. 
Use to finetune a model.', - default='') - parser.add_argument( - '--config_path', - type=str, - help='Path to config file for training.', - required='--continue_path' not in sys.argv - ) - parser.add_argument('--debug', - type=bool, - default=False, - help='Do not verify commit integrity to run training.') - - # DISTRUBUTED - parser.add_argument( - '--rank', - type=int, - default=0, - help='DISTRIBUTED: process rank for distributed training.') - parser.add_argument('--group_id', - type=str, - default="", - help='DISTRIBUTED: process group id.') - args = parser.parse_args() - - if args.continue_path != '': - args.output_path = args.continue_path - args.config_path = os.path.join(args.continue_path, 'config.json') - list_of_files = glob.glob(args.continue_path + "/*.pth.tar") # * means all if need specific format then *.csv - latest_model_file = max(list_of_files, key=os.path.getctime) - args.restore_path = latest_model_file - print(f" > Training continues for {args.restore_path}") - - # setup output paths and read configs - c = load_config(args.config_path) - # check_config(c) - check_config_tts(c) - _ = os.path.dirname(os.path.realpath(__file__)) - - if c.mixed_precision: - print(" > Mixed precision enabled.") - - OUT_PATH = args.continue_path - if args.continue_path == '': - OUT_PATH = create_experiment_folder(c.output_path, c.run_name, args.debug) - - AUDIO_PATH = os.path.join(OUT_PATH, 'test_audios') - - c_logger = ConsoleLogger() - - if args.rank == 0: - os.makedirs(AUDIO_PATH, exist_ok=True) - new_fields = {} - if args.restore_path: - new_fields["restore_path"] = args.restore_path - new_fields["github_branch"] = get_git_branch() - copy_model_files(c, args.config_path, OUT_PATH, new_fields) - os.chmod(AUDIO_PATH, 0o775) - os.chmod(OUT_PATH, 0o775) - - LOG_DIR = OUT_PATH - tb_logger = TensorboardLogger(LOG_DIR, model_name='TTS') - - # write model desc to tensorboard - tb_logger.tb_add_text('model-description', c['run_description'], 0) + args = parse_arguments(sys.argv) + c, OUT_PATH, AUDIO_PATH, c_logger, tb_logger = process_args( + args, model_type='glow_tts') try: main(args) diff --git a/TTS/bin/train_speedy_speech.py b/TTS/bin/train_speedy_speech.py index 667f5abd..bee37b05 100644 --- a/TTS/bin/train_speedy_speech.py +++ b/TTS/bin/train_speedy_speech.py @@ -11,6 +11,7 @@ import numpy as np from random import randrange import torch +from TTS.utils.arguments import parse_arguments, process_args # DISTRIBUTED from torch.nn.parallel import DistributedDataParallel as DDP_th from torch.utils.data import DataLoader @@ -18,7 +19,7 @@ from torch.utils.data.distributed import DistributedSampler from TTS.tts.datasets.preprocess import load_meta_data from TTS.tts.datasets.TTSDataset import MyDataset from TTS.tts.layers.losses import SpeedySpeechLoss -from TTS.tts.utils.generic_utils import check_config_tts, setup_model +from TTS.tts.utils.generic_utils import setup_model from TTS.tts.utils.io import save_best_model, save_checkpoint from TTS.tts.utils.measures import alignment_diagonal_score from TTS.tts.utils.speakers import parse_speakers @@ -26,14 +27,10 @@ from TTS.tts.utils.synthesis import synthesis from TTS.tts.utils.text.symbols import make_symbols, phonemes, symbols from TTS.tts.utils.visual import plot_alignment, plot_spectrogram from TTS.utils.audio import AudioProcessor -from TTS.utils.console_logger import ConsoleLogger from TTS.utils.distribute import init_distributed, reduce_tensor from TTS.utils.generic_utils import (KeepAverage, count_parameters, - create_experiment_folder, 
get_git_branch, remove_experiment_folder, set_init_dict) -from TTS.utils.io import copy_model_files, load_config from TTS.utils.radam import RAdam -from TTS.utils.tensorboard_logger import TensorboardLogger from TTS.utils.training import NoamLR, setup_torch_training_env use_cuda, num_gpus = setup_torch_training_env(True, False) @@ -175,13 +172,13 @@ def train(data_loader, model, criterion, optimizer, scheduler, scaler.scale(loss_dict['loss']).backward() scaler.unscale_(optimizer) grad_norm = torch.nn.utils.clip_grad_norm_(model.parameters(), - c.grad_clip) + c.grad_clip) scaler.step(optimizer) scaler.update() else: loss_dict['loss'].backward() grad_norm = torch.nn.utils.clip_grad_norm_(model.parameters(), - c.grad_clip) + c.grad_clip) optimizer.step() # setup lr @@ -518,8 +515,7 @@ def main(args): # pylint: disable=redefined-outer-name train_avg_loss_dict, global_step = train(train_loader, model, criterion, optimizer, scheduler, ap, global_step, epoch) - eval_avg_loss_dict = evaluate(eval_loader, model, criterion, ap, - global_step, epoch) + eval_avg_loss_dict = evaluate(eval_loader , model, criterion, ap, global_step, epoch) c_logger.print_epoch_end(epoch, eval_avg_loss_dict) target_loss = train_avg_loss_dict['avg_loss'] if c.run_eval: @@ -529,81 +525,9 @@ def main(args): # pylint: disable=redefined-outer-name if __name__ == '__main__': - parser = argparse.ArgumentParser() - parser.add_argument( - '--continue_path', - type=str, - help='Training output folder to continue training. Use to continue a training. If it is used, "config_path" is ignored.', - default='', - required='--config_path' not in sys.argv) - parser.add_argument( - '--restore_path', - type=str, - help='Model file to be restored. Use to finetune a model.', - default='') - parser.add_argument( - '--config_path', - type=str, - help='Path to config file for training.', - required='--continue_path' not in sys.argv - ) - parser.add_argument('--debug', - type=bool, - default=False, - help='Do not verify commit integrity to run training.') - - # DISTRUBUTED - parser.add_argument( - '--rank', - type=int, - default=0, - help='DISTRIBUTED: process rank for distributed training.') - parser.add_argument('--group_id', - type=str, - default="", - help='DISTRIBUTED: process group id.') - args = parser.parse_args() - - if args.continue_path != '': - args.output_path = args.continue_path - args.config_path = os.path.join(args.continue_path, 'config.json') - list_of_files = glob.glob(args.continue_path + "/*.pth.tar") # * means all if need specific format then *.csv - latest_model_file = max(list_of_files, key=os.path.getctime) - args.restore_path = latest_model_file - print(f" > Training continues for {args.restore_path}") - - # setup output paths and read configs - c = load_config(args.config_path) - # check_config(c) - check_config_tts(c) - _ = os.path.dirname(os.path.realpath(__file__)) - - if c.mixed_precision: - print(" > Mixed precision enabled.") - - OUT_PATH = args.continue_path - if args.continue_path == '': - OUT_PATH = create_experiment_folder(c.output_path, c.run_name, args.debug) - - AUDIO_PATH = os.path.join(OUT_PATH, 'test_audios') - - c_logger = ConsoleLogger() - - if args.rank == 0: - os.makedirs(AUDIO_PATH, exist_ok=True) - new_fields = {} - if args.restore_path: - new_fields["restore_path"] = args.restore_path - new_fields["github_branch"] = get_git_branch() - copy_model_files(c, args.config_path, OUT_PATH, new_fields) - os.chmod(AUDIO_PATH, 0o775) - os.chmod(OUT_PATH, 0o775) - - LOG_DIR = OUT_PATH - tb_logger = 
TensorboardLogger(LOG_DIR, model_name='TTS') - - # write model desc to tensorboard - tb_logger.tb_add_text('model-description', c['run_description'], 0) + args = parse_arguments(sys.argv) + c, OUT_PATH, AUDIO_PATH, c_logger, tb_logger = process_args( + args, model_type='tts') try: main(args) diff --git a/TTS/bin/train_tacotron.py b/TTS/bin/train_tacotron.py index 4640a3eb..e8b8b8e9 100644 --- a/TTS/bin/train_tacotron.py +++ b/TTS/bin/train_tacotron.py @@ -1,8 +1,6 @@ #!/usr/bin/env python3 -# -*- coding: utf-8 -*- +"""Trains Tacotron based TTS models.""" -import argparse -import glob import os import sys import time @@ -11,11 +9,12 @@ from random import randrange import numpy as np import torch +from TTS.utils.arguments import parse_arguments, process_args from torch.utils.data import DataLoader from TTS.tts.datasets.preprocess import load_meta_data from TTS.tts.datasets.TTSDataset import MyDataset from TTS.tts.layers.losses import TacotronLoss -from TTS.tts.utils.generic_utils import check_config_tts, setup_model +from TTS.tts.utils.generic_utils import setup_model from TTS.tts.utils.io import save_best_model, save_checkpoint from TTS.tts.utils.measures import alignment_diagonal_score from TTS.tts.utils.speakers import parse_speakers @@ -23,15 +22,11 @@ from TTS.tts.utils.synthesis import synthesis from TTS.tts.utils.text.symbols import make_symbols, phonemes, symbols from TTS.tts.utils.visual import plot_alignment, plot_spectrogram from TTS.utils.audio import AudioProcessor -from TTS.utils.console_logger import ConsoleLogger from TTS.utils.distribute import (DistributedSampler, apply_gradient_allreduce, init_distributed, reduce_tensor) from TTS.utils.generic_utils import (KeepAverage, count_parameters, - create_experiment_folder, get_git_branch, remove_experiment_folder, set_init_dict) -from TTS.utils.io import copy_model_files, load_config from TTS.utils.radam import RAdam -from TTS.utils.tensorboard_logger import TensorboardLogger from TTS.utils.training import (NoamLR, adam_weight_decay, check_update, gradual_training_scheduler, set_weight_decay, setup_torch_training_env) @@ -61,7 +56,13 @@ def setup_loader(ap, r, is_val=False, verbose=False, dataset=None): phoneme_language=c.phoneme_language, enable_eos_bos=c.enable_eos_bos_chars, verbose=verbose, - speaker_mapping=speaker_mapping if c.use_speaker_embedding and c.use_external_speaker_embedding_file else None) + speaker_mapping=( + speaker_mapping if ( + c.use_speaker_embedding and + c.use_external_speaker_embedding_file + ) else None + ) + ) if c.use_phonemes and c.compute_input_seq_cache: # precompute phonemes to have a better estimate of sequence lengths. 
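Every `__main__` block that patch 032 touches (Glow-TTS and SpeedySpeech above, Tacotron and the vocoders below) collapses to the same few lines built on the new `TTS.utils.arguments` module. A condensed sketch of that entry-point pattern; the `main` body here is a placeholder, not any script's real training loop:

    import sys

    from TTS.utils.arguments import parse_arguments, process_args


    def main(args):
        pass  # script-specific training loop goes here


    if __name__ == '__main__':
        args = parse_arguments(sys.argv)
        c, OUT_PATH, AUDIO_PATH, c_logger, tb_logger = process_args(
            args, model_type='tacotron')
        main(args)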
@@ -178,10 +179,10 @@ def train(data_loader, model, criterion, optimizer, optimizer_st, scheduler, # compute loss loss_dict = criterion(postnet_output, decoder_output, mel_input, - linear_input, stop_tokens, stop_targets, - mel_lengths, decoder_backward_output, - alignments, alignment_lengths, - alignments_backward, text_lengths) + linear_input, stop_tokens, stop_targets, + mel_lengths, decoder_backward_output, + alignments, alignment_lengths, alignments_backward, + text_lengths) # check nan loss if torch.isnan(loss_dict['loss']).any(): @@ -199,7 +200,7 @@ def train(data_loader, model, criterion, optimizer, optimizer_st, scheduler, # stopnet optimizer step if c.separate_stopnet: - scaler_st.scale(loss_dict['stopnet_loss']).backward() + scaler_st.scale( loss_dict['stopnet_loss']).backward() scaler.unscale_(optimizer_st) optimizer_st, _ = adam_weight_decay(optimizer_st) grad_norm_st, _ = check_update(model.decoder.stopnet, 1.0) @@ -491,7 +492,6 @@ def evaluate(data_loader, model, criterion, ap, global_step, epoch): return keep_avg.avg_values -# FIXME: move args definition/parsing inside of main? def main(args): # pylint: disable=redefined-outer-name # pylint: disable=global-variable-undefined global meta_data_train, meta_data_eval, symbols, phonemes, speaker_mapping @@ -534,7 +534,8 @@ def main(args): # pylint: disable=redefined-outer-name optimizer_st = None # setup criterion - criterion = TacotronLoss(c, stopnet_pos_weight=c.stopnet_pos_weight, ga_sigma=0.4) + criterion = TacotronLoss(c, stopnet_pos_weight=10.0, ga_sigma=0.4) + if args.restore_path: checkpoint = torch.load(args.restore_path, map_location='cpu') try: @@ -640,80 +641,9 @@ def main(args): # pylint: disable=redefined-outer-name if __name__ == '__main__': - parser = argparse.ArgumentParser() - parser.add_argument( - '--continue_path', - type=str, - help='Training output folder to continue training. Use to continue a training. If it is used, "config_path" is ignored.', - default='', - required='--config_path' not in sys.argv) - parser.add_argument( - '--restore_path', - type=str, - help='Model file to be restored. 
Use to finetune a model.', - default='') - parser.add_argument( - '--config_path', - type=str, - help='Path to config file for training.', - required='--continue_path' not in sys.argv - ) - parser.add_argument('--debug', - type=bool, - default=False, - help='Do not verify commit integrity to run training.') - - # DISTRUBUTED - parser.add_argument( - '--rank', - type=int, - default=0, - help='DISTRIBUTED: process rank for distributed training.') - parser.add_argument('--group_id', - type=str, - default="", - help='DISTRIBUTED: process group id.') - args = parser.parse_args() - - if args.continue_path != '': - print(f" > Training continues for {args.continue_path}") - args.output_path = args.continue_path - args.config_path = os.path.join(args.continue_path, 'config.json') - list_of_files = glob.glob(args.continue_path + "/*.pth.tar") # * means all if need specific format then *.csv - latest_model_file = max(list_of_files, key=os.path.getctime) - args.restore_path = latest_model_file - - # setup output paths and read configs - c = load_config(args.config_path) - check_config_tts(c) - _ = os.path.dirname(os.path.realpath(__file__)) - - if c.mixed_precision: - print(" > Mixed precision mode is ON") - - OUT_PATH = args.continue_path - if args.continue_path == '': - OUT_PATH = create_experiment_folder(c.output_path, c.run_name, args.debug) - - AUDIO_PATH = os.path.join(OUT_PATH, 'test_audios') - - c_logger = ConsoleLogger() - - if args.rank == 0: - os.makedirs(AUDIO_PATH, exist_ok=True) - new_fields = {} - if args.restore_path: - new_fields["restore_path"] = args.restore_path - new_fields["github_branch"] = get_git_branch() - copy_model_files(c, args.config_path, OUT_PATH, new_fields) - os.chmod(AUDIO_PATH, 0o775) - os.chmod(OUT_PATH, 0o775) - - LOG_DIR = OUT_PATH - tb_logger = TensorboardLogger(LOG_DIR, model_name='TTS') - - # write model desc to tensorboard - tb_logger.tb_add_text('model-description', c['run_description'], 0) + args = parse_arguments(sys.argv) + c, OUT_PATH, AUDIO_PATH, c_logger, tb_logger = process_args( + args, model_type='tacotron') try: main(args) diff --git a/TTS/bin/train_vocoder_gan.py b/TTS/bin/train_vocoder_gan.py index a1d1b322..2c1f901a 100644 --- a/TTS/bin/train_vocoder_gan.py +++ b/TTS/bin/train_vocoder_gan.py @@ -1,5 +1,6 @@ -import argparse -import glob +#!/usr/bin/env python3 +"""Trains GAN based vocoder model.""" + import os import sys import time @@ -7,15 +8,14 @@ import traceback from inspect import signature import torch +from TTS.utils.arguments import parse_arguments, process_args from torch.utils.data import DataLoader from TTS.utils.audio import AudioProcessor -from TTS.utils.console_logger import ConsoleLogger from TTS.utils.generic_utils import (KeepAverage, count_parameters, - create_experiment_folder, get_git_branch, remove_experiment_folder, set_init_dict) -from TTS.utils.io import copy_model_files, load_config + from TTS.utils.radam import RAdam -from TTS.utils.tensorboard_logger import TensorboardLogger + from TTS.utils.training import setup_torch_training_env from TTS.vocoder.datasets.gan_dataset import GANDataset from TTS.vocoder.datasets.preprocess import load_wav_data, load_wav_feat_data @@ -33,8 +33,9 @@ use_cuda, num_gpus = setup_torch_training_env(True, True) def setup_loader(ap, is_val=False, verbose=False): - loader = None - if not is_val or c.run_eval: + if is_val and not c.run_eval: + loader = None + else: dataset = GANDataset(ap=ap, items=eval_data if is_val else train_data, seq_len=c.seq_len, @@ -113,7 +114,7 @@ def train(model_G, 
criterion_G, optimizer_G, model_D, criterion_D, optimizer_D, y_hat = model_G(c_G) y_hat_sub = None y_G_sub = None - y_hat_vis = y_hat # for visualization + y_hat_vis = y_hat # for visualization # FIXME! .clone().detach() # PQMF formatting if y_hat.shape[1] > 1: @@ -273,14 +274,14 @@ def train(model_G, criterion_G, optimizer_G, model_D, criterion_D, optimizer_D, # compute spectrograms figures = plot_results(y_hat_vis, y_G, ap, global_step, - 'train') + 'train') tb_logger.tb_train_figures(global_step, figures) # Sample audio sample_voice = y_hat_vis[0].squeeze(0).detach().cpu().numpy() tb_logger.tb_train_audios(global_step, - {'train/audio': sample_voice}, - c.audio["sample_rate"]) + {'train/audio': sample_voice}, + c.audio["sample_rate"]) end_time = time.time() # print epoch stats @@ -439,7 +440,6 @@ def evaluate(model_G, criterion_G, model_D, criterion_D, ap, global_step, epoch) return keep_avg.avg_values -# FIXME: move args definition/parsing inside of main? def main(args): # pylint: disable=redefined-outer-name # pylint: disable=global-variable-undefined global train_data, eval_data @@ -506,7 +506,7 @@ def main(args): # pylint: disable=redefined-outer-name scheduler_disc.load_state_dict(checkpoint['scheduler_disc']) scheduler_disc.optimizer = optimizer_disc except RuntimeError: - # retore only matching layers. + # restore only matching layers. print(" > Partial model initialization...") model_dict = model_gen.state_dict() model_dict = set_init_dict(model_dict, checkpoint['model'], c) @@ -556,7 +556,8 @@ def main(args): # pylint: disable=redefined-outer-name model_disc, criterion_disc, optimizer_disc, scheduler_gen, scheduler_disc, ap, global_step, epoch) - eval_avg_loss_dict = evaluate(model_gen, criterion_gen, model_disc, criterion_disc, ap, + eval_avg_loss_dict = evaluate(model_gen, criterion_gen, model_disc, + criterion_disc, ap, global_step, epoch) c_logger.print_epoch_end(epoch, eval_avg_loss_dict) target_loss = eval_avg_loss_dict[c.target_loss] @@ -575,78 +576,9 @@ def main(args): # pylint: disable=redefined-outer-name if __name__ == '__main__': - parser = argparse.ArgumentParser() - parser.add_argument( - '--continue_path', - type=str, - help='Training output folder to continue training. Use to continue a training. If it is used, "config_path" is ignored.', - default='', - required='--config_path' not in sys.argv) - parser.add_argument( - '--restore_path', - type=str, - help='Model file to be restored. 
Use to finetune a model.', - default='') - parser.add_argument('--config_path', - type=str, - help='Path to config file for training.', - required='--continue_path' not in sys.argv) - parser.add_argument('--debug', - type=bool, - default=False, - help='Do not verify commit integrity to run training.') - - # DISTRUBUTED - parser.add_argument( - '--rank', - type=int, - default=0, - help='DISTRIBUTED: process rank for distributed training.') - parser.add_argument('--group_id', - type=str, - default="", - help='DISTRIBUTED: process group id.') - args = parser.parse_args() - - if args.continue_path != '': - args.output_path = args.continue_path - args.config_path = os.path.join(args.continue_path, 'config.json') - list_of_files = glob.glob( - args.continue_path + - "/*.pth.tar") # * means all if need specific format then *.csv - latest_model_file = max(list_of_files, key=os.path.getctime) - args.restore_path = latest_model_file - print(f" > Training continues for {args.restore_path}") - - # setup output paths and read configs - c = load_config(args.config_path) - # check_config(c) - _ = os.path.dirname(os.path.realpath(__file__)) - - OUT_PATH = args.continue_path - if args.continue_path == '': - OUT_PATH = create_experiment_folder(c.output_path, c.run_name, - args.debug) - - AUDIO_PATH = os.path.join(OUT_PATH, 'test_audios') - - c_logger = ConsoleLogger() - - if args.rank == 0: - os.makedirs(AUDIO_PATH, exist_ok=True) - new_fields = {} - if args.restore_path: - new_fields["restore_path"] = args.restore_path - new_fields["github_branch"] = get_git_branch() - copy_model_files(c, args.config_path, OUT_PATH, new_fields) - os.chmod(AUDIO_PATH, 0o775) - os.chmod(OUT_PATH, 0o775) - - LOG_DIR = OUT_PATH - tb_logger = TensorboardLogger(LOG_DIR, model_name='VOCODER') - - # write model desc to tensorboard - tb_logger.tb_add_text('model-description', c['run_description'], 0) + args = parse_arguments(sys.argv) + c, OUT_PATH, AUDIO_PATH, c_logger, tb_logger = process_args( + args, model_type='gan') try: main(args) diff --git a/TTS/bin/train_vocoder_wavegrad.py b/TTS/bin/train_vocoder_wavegrad.py index c53612c2..4ef6769c 100644 --- a/TTS/bin/train_vocoder_wavegrad.py +++ b/TTS/bin/train_vocoder_wavegrad.py @@ -1,5 +1,6 @@ -import argparse -import glob +#!/usr/bin/env python3 +"""Trains WaveGrad vocoder models.""" + import os import sys import time @@ -7,19 +8,16 @@ import traceback import numpy as np import torch +from TTS.utils.arguments import parse_arguments, process_args # DISTRIBUTED from torch.nn.parallel import DistributedDataParallel as DDP_th from torch.optim import Adam from torch.utils.data import DataLoader from torch.utils.data.distributed import DistributedSampler from TTS.utils.audio import AudioProcessor -from TTS.utils.console_logger import ConsoleLogger from TTS.utils.distribute import init_distributed from TTS.utils.generic_utils import (KeepAverage, count_parameters, - create_experiment_folder, get_git_branch, remove_experiment_folder, set_init_dict) -from TTS.utils.io import copy_model_files, load_config -from TTS.utils.tensorboard_logger import TensorboardLogger from TTS.utils.training import setup_torch_training_env from TTS.vocoder.datasets.preprocess import load_wav_data, load_wav_feat_data from TTS.vocoder.datasets.wavegrad_dataset import WaveGradDataset @@ -34,16 +32,16 @@ def setup_loader(ap, is_val=False, verbose=False): loader = None else: dataset = WaveGradDataset(ap=ap, - items=eval_data if is_val else train_data, - seq_len=c.seq_len, - hop_len=ap.hop_length, - 
pad_short=c.pad_short, - conv_pad=c.conv_pad, - is_training=not is_val, - return_segments=True, - use_noise_augment=False, - use_cache=c.use_cache, - verbose=verbose) + items=eval_data if is_val else train_data, + seq_len=c.seq_len, + hop_len=ap.hop_length, + pad_short=c.pad_short, + conv_pad=c.conv_pad, + is_training=not is_val, + return_segments=True, + use_noise_augment=False, + use_cache=c.use_cache, + verbose=verbose) sampler = DistributedSampler(dataset) if num_gpus > 1 else None loader = DataLoader(dataset, batch_size=c.batch_size, @@ -54,6 +52,7 @@ def setup_loader(ap, is_val=False, verbose=False): if is_val else c.num_loader_workers, pin_memory=False) + return loader @@ -78,8 +77,8 @@ def format_test_data(data): return m, x -def train(model, criterion, optimizer, scheduler, scaler, ap, global_step, - epoch): +def train(model, criterion, optimizer, + scheduler, scaler, ap, global_step, epoch): data_loader = setup_loader(ap, is_val=False, verbose=(epoch == 0)) model.train() epoch_time = 0 @@ -93,8 +92,7 @@ def train(model, criterion, optimizer, scheduler, scaler, ap, global_step, c_logger.print_train_start() # setup noise schedule noise_schedule = c['train_noise_schedule'] - betas = np.linspace(noise_schedule['min_val'], noise_schedule['max_val'], - noise_schedule['num_steps']) + betas = np.linspace(noise_schedule['min_val'], noise_schedule['max_val'], noise_schedule['num_steps']) if hasattr(model, 'module'): model.module.compute_noise_level(betas) else: @@ -120,7 +118,7 @@ def train(model, criterion, optimizer, scheduler, scaler, ap, global_step, # compute losses loss = criterion(noise, noise_hat) - loss_wavegrad_dict = {'wavegrad_loss': loss} + loss_wavegrad_dict = {'wavegrad_loss':loss} # check nan loss if torch.isnan(loss).any(): @@ -133,13 +131,13 @@ def train(model, criterion, optimizer, scheduler, scaler, ap, global_step, scaler.scale(loss).backward() scaler.unscale_(optimizer) grad_norm = torch.nn.utils.clip_grad_norm_(model.parameters(), - c.clip_grad) + c.clip_grad) scaler.step(optimizer) scaler.update() else: loss.backward() grad_norm = torch.nn.utils.clip_grad_norm_(model.parameters(), - c.clip_grad) + c.clip_grad) optimizer.step() # schedule update @@ -205,8 +203,7 @@ def train(model, criterion, optimizer, scheduler, scaler, ap, global_step, epoch, OUT_PATH, model_losses=loss_dict, - scaler=scaler.state_dict() - if c.mixed_precision else None) + scaler=scaler.state_dict() if c.mixed_precision else None) end_time = time.time() @@ -247,12 +244,14 @@ def evaluate(model, criterion, ap, global_step, epoch): else: noise, x_noisy, noise_scale = model.compute_y_n(x) + # forward pass noise_hat = model(x_noisy, m, noise_scale) # compute losses loss = criterion(noise, noise_hat) - loss_wavegrad_dict = {'wavegrad_loss': loss} + loss_wavegrad_dict = {'wavegrad_loss':loss} + loss_dict = dict() for key, value in loss_wavegrad_dict.items(): @@ -283,9 +282,7 @@ def evaluate(model, criterion, ap, global_step, epoch): # setup noise schedule and inference noise_schedule = c['test_noise_schedule'] - betas = np.linspace(noise_schedule['min_val'], - noise_schedule['max_val'], - noise_schedule['num_steps']) + betas = np.linspace(noise_schedule['min_val'], noise_schedule['max_val'], noise_schedule['num_steps']) if hasattr(model, 'module'): model.module.compute_noise_level(betas) # compute voice @@ -316,8 +313,7 @@ def main(args): # pylint: disable=redefined-outer-name print(f" > Loading wavs from: {c.data_path}") if c.feature_path is not None: print(f" > Loading features from: 
{c.feature_path}") - eval_data, train_data = load_wav_feat_data(c.data_path, c.feature_path, - c.eval_split_size) + eval_data, train_data = load_wav_feat_data(c.data_path, c.feature_path, c.eval_split_size) else: eval_data, train_data = load_wav_data(c.data_path, c.eval_split_size) @@ -347,10 +343,6 @@ def main(args): # pylint: disable=redefined-outer-name # setup criterion criterion = torch.nn.L1Loss().cuda() - if use_cuda: - model.cuda() - criterion.cuda() - if args.restore_path: checkpoint = torch.load(args.restore_path, map_location='cpu') try: @@ -384,6 +376,10 @@ def main(args): # pylint: disable=redefined-outer-name else: args.restore_step = 0 + if use_cuda: + model.cuda() + criterion.cuda() + # DISTRUBUTED if num_gpus > 1: model = DDP_th(model, device_ids=[args.rank]) @@ -397,105 +393,32 @@ def main(args): # pylint: disable=redefined-outer-name global_step = args.restore_step for epoch in range(0, c.epochs): c_logger.print_epoch_start(epoch, c.epochs) - _, global_step = train(model, criterion, optimizer, scheduler, scaler, - ap, global_step, epoch) - eval_avg_loss_dict = evaluate(model, criterion, ap, global_step, epoch) + _, global_step = train(model, criterion, optimizer, + scheduler, scaler, ap, global_step, + epoch) + eval_avg_loss_dict = evaluate(model, criterion, ap, + global_step, epoch) c_logger.print_epoch_end(epoch, eval_avg_loss_dict) target_loss = eval_avg_loss_dict[c.target_loss] - best_loss = save_best_model( - target_loss, - best_loss, - model, - optimizer, - scheduler, - None, - None, - None, - global_step, - epoch, - OUT_PATH, - model_losses=eval_avg_loss_dict, - scaler=scaler.state_dict() if c.mixed_precision else None) + best_loss = save_best_model(target_loss, + best_loss, + model, + optimizer, + scheduler, + None, + None, + None, + global_step, + epoch, + OUT_PATH, + model_losses=eval_avg_loss_dict, + scaler=scaler.state_dict() if c.mixed_precision else None) if __name__ == '__main__': - parser = argparse.ArgumentParser() - parser.add_argument( - '--continue_path', - type=str, - help= - 'Training output folder to continue training. Use to continue a training. If it is used, "config_path" is ignored.', - default='', - required='--config_path' not in sys.argv) - parser.add_argument( - '--restore_path', - type=str, - help='Model file to be restored. 
Use to finetune a model.', - default='') - parser.add_argument('--config_path', - type=str, - help='Path to config file for training.', - required='--continue_path' not in sys.argv) - parser.add_argument('--debug', - type=bool, - default=False, - help='Do not verify commit integrity to run training.') - - # DISTRUBUTED - parser.add_argument( - '--rank', - type=int, - default=0, - help='DISTRIBUTED: process rank for distributed training.') - parser.add_argument('--group_id', - type=str, - default="", - help='DISTRIBUTED: process group id.') - args = parser.parse_args() - - if args.continue_path != '': - args.output_path = args.continue_path - args.config_path = os.path.join(args.continue_path, 'config.json') - list_of_files = glob.glob( - args.continue_path + - "/*.pth.tar") # * means all if need specific format then *.csv - latest_model_file = max(list_of_files, key=os.path.getctime) - args.restore_path = latest_model_file - print(f" > Training continues for {args.restore_path}") - - # setup output paths and read configs - c = load_config(args.config_path) - # check_config(c) - _ = os.path.dirname(os.path.realpath(__file__)) - - # DISTRIBUTED - if c.mixed_precision: - print(" > Mixed precision is enabled") - - OUT_PATH = args.continue_path - if args.continue_path == '': - OUT_PATH = create_experiment_folder(c.output_path, c.run_name, - args.debug) - - AUDIO_PATH = os.path.join(OUT_PATH, 'test_audios') - - c_logger = ConsoleLogger() - - if args.rank == 0: - os.makedirs(AUDIO_PATH, exist_ok=True) - new_fields = {} - if args.restore_path: - new_fields["restore_path"] = args.restore_path - new_fields["github_branch"] = get_git_branch() - copy_model_files(c, args.config_path, OUT_PATH, new_fields) - os.chmod(AUDIO_PATH, 0o775) - os.chmod(OUT_PATH, 0o775) - - LOG_DIR = OUT_PATH - tb_logger = TensorboardLogger(LOG_DIR, model_name='VOCODER') - - # write model desc to tensorboard - tb_logger.tb_add_text('model-description', c['run_description'], 0) + args = parse_arguments(sys.argv) + c, OUT_PATH, AUDIO_PATH, c_logger, tb_logger = process_args( + args, model_type='wavegrad') try: main(args) diff --git a/TTS/bin/train_vocoder_wavernn.py b/TTS/bin/train_vocoder_wavernn.py index 7056b74a..e32301fc 100644 --- a/TTS/bin/train_vocoder_wavernn.py +++ b/TTS/bin/train_vocoder_wavernn.py @@ -1,9 +1,10 @@ -import argparse +#!/usr/bin/env python3 +"""Train WaveRNN vocoder model.""" + import os import sys import traceback import time -import glob import random import torch @@ -11,18 +12,14 @@ from torch.utils.data import DataLoader # from torch.utils.data.distributed import DistributedSampler +from TTS.utils.arguments import parse_arguments, process_args from TTS.tts.utils.visual import plot_spectrogram from TTS.utils.audio import AudioProcessor from TTS.utils.radam import RAdam -from TTS.utils.io import copy_model_files, load_config from TTS.utils.training import setup_torch_training_env -from TTS.utils.console_logger import ConsoleLogger -from TTS.utils.tensorboard_logger import TensorboardLogger from TTS.utils.generic_utils import ( KeepAverage, count_parameters, - create_experiment_folder, - get_git_branch, remove_experiment_folder, set_init_dict, ) @@ -207,7 +204,14 @@ def train(model, optimizer, criterion, scheduler, scaler, ap, global_step, epoch c.batched, c.target_samples, c.overlap_samples, + # use_cuda ) + # sample_wav = model.generate(ground_mel, + # c.batched, + # c.target_samples, + # c.overlap_samples, + # use_cuda + # ) predict_mel = ap.melspectrogram(sample_wav) # compute spectrograms @@ -296,6 
+300,7 @@ def evaluate(model, criterion, ap, global_step, epoch): c.batched, c.target_samples, c.overlap_samples, + # use_cuda ) predict_mel = ap.melspectrogram(sample_wav) @@ -306,10 +311,9 @@ def evaluate(model, criterion, ap, global_step, epoch): ) # compute spectrograms - figures = { - "eval/ground_truth": plot_spectrogram(ground_mel.T), - "eval/prediction": plot_spectrogram(predict_mel.T) - } + figures = {"eval/ground_truth": plot_spectrogram(ground_mel.T), + "eval/prediction": plot_spectrogram(predict_mel.T) + } tb_logger.tb_eval_figures(global_step, figures) tb_logger.tb_eval_stats(global_step, keep_avg.avg_values) @@ -448,87 +452,9 @@ def main(args): # pylint: disable=redefined-outer-name if __name__ == "__main__": - parser = argparse.ArgumentParser() - parser.add_argument( - "--continue_path", - type=str, - help='Training output folder to continue training. Use to continue a training. If it is used, "config_path" is ignored.', - default="", - required="--config_path" not in sys.argv, - ) - parser.add_argument( - "--restore_path", - type=str, - help="Model file to be restored. Use to finetune a model.", - default="", - ) - parser.add_argument( - "--config_path", - type=str, - help="Path to config file for training.", - required="--continue_path" not in sys.argv, - ) - parser.add_argument( - "--debug", - type=bool, - default=False, - help="Do not verify commit integrity to run training.", - ) - - # DISTRUBUTED - parser.add_argument( - "--rank", - type=int, - default=0, - help="DISTRIBUTED: process rank for distributed training.", - ) - parser.add_argument( - "--group_id", type=str, default="", help="DISTRIBUTED: process group id." - ) - args = parser.parse_args() - - if args.continue_path != "": - args.output_path = args.continue_path - args.config_path = os.path.join(args.continue_path, "config.json") - list_of_files = glob.glob( - args.continue_path + "/*.pth.tar" - ) # * means all if need specific format then *.csv - latest_model_file = max(list_of_files, key=os.path.getctime) - args.restore_path = latest_model_file - print(f" > Training continues for {args.restore_path}") - - # setup output paths and read configs - c = load_config(args.config_path) - # check_config(c) - _ = os.path.dirname(os.path.realpath(__file__)) - - OUT_PATH = args.continue_path - if args.continue_path == "": - OUT_PATH = create_experiment_folder( - c.output_path, c.run_name, args.debug - ) - - AUDIO_PATH = os.path.join(OUT_PATH, "test_audios") - - c_logger = ConsoleLogger() - - if args.rank == 0: - os.makedirs(AUDIO_PATH, exist_ok=True) - new_fields = {} - if args.restore_path: - new_fields["restore_path"] = args.restore_path - new_fields["github_branch"] = get_git_branch() - copy_model_files( - c, args.config_path, OUT_PATH, new_fields - ) - os.chmod(AUDIO_PATH, 0o775) - os.chmod(OUT_PATH, 0o775) - - LOG_DIR = OUT_PATH - tb_logger = TensorboardLogger(LOG_DIR, model_name="VOCODER") - - # write model desc to tensorboard - tb_logger.tb_add_text("model-description", c["run_description"], 0) + args = parse_arguments(sys.argv) + c, OUT_PATH, AUDIO_PATH, c_logger, tb_logger = process_args( + args, model_type='wavernn') try: main(args) From 8fdd08ea157c56c93be033dc23e1355bbf9b07d3 Mon Sep 17 00:00:00 2001 From: gerazov Date: Sat, 6 Feb 2021 22:59:52 +0100 Subject: [PATCH 033/100] updated to current dev --- TTS/bin/train_glow_tts.py | 19 ++--- TTS/bin/train_speedy_speech.py | 10 +-- TTS/bin/train_tacotron.py | 20 +++--- TTS/bin/train_vocoder_gan.py | 15 ++-- TTS/bin/train_vocoder_wavegrad.py | 112 
++++++++++++++++-------------- TTS/bin/train_vocoder_wavernn.py | 40 +++++------ 6 files changed, 109 insertions(+), 107 deletions(-) diff --git a/TTS/bin/train_glow_tts.py b/TTS/bin/train_glow_tts.py index 407616ec..3c211496 100644 --- a/TTS/bin/train_glow_tts.py +++ b/TTS/bin/train_glow_tts.py @@ -116,7 +116,7 @@ def format_data(data): avg_text_length, avg_spec_length, attn_mask, item_idx -def data_depended_init(data_loader, model, ap): +def data_depended_init(data_loader, model): """Data depended initialization for activation normalization.""" if hasattr(model, 'module'): for f in model.module.decoder.flows: @@ -135,7 +135,7 @@ def data_depended_init(data_loader, model, ap): # format data text_input, text_lengths, mel_input, mel_lengths, spekaer_embed,\ - _, _, attn_mask, item_idx = format_data(data) + _, _, attn_mask, _ = format_data(data) # forward pass model _ = model.forward( @@ -174,7 +174,7 @@ def train(data_loader, model, criterion, optimizer, scheduler, # format data text_input, text_lengths, mel_input, mel_lengths, speaker_c,\ - avg_text_length, avg_spec_length, attn_mask, item_idx = format_data(data) + avg_text_length, avg_spec_length, attn_mask, _ = format_data(data) loader_time = time.time() - end_time @@ -188,20 +188,20 @@ def train(data_loader, model, criterion, optimizer, scheduler, # compute loss loss_dict = criterion(z, y_mean, y_log_scale, logdet, mel_lengths, - o_dur_log, o_total_dur, text_lengths) + o_dur_log, o_total_dur, text_lengths) # backward pass with loss scaling if c.mixed_precision: scaler.scale(loss_dict['loss']).backward() scaler.unscale_(optimizer) grad_norm = torch.nn.utils.clip_grad_norm_(model.parameters(), - c.grad_clip) + c.grad_clip) scaler.step(optimizer) scaler.update() else: loss_dict['loss'].backward() grad_norm = torch.nn.utils.clip_grad_norm_(model.parameters(), - c.grad_clip) + c.grad_clip) optimizer.step() # setup lr @@ -329,7 +329,7 @@ def evaluate(data_loader, model, criterion, ap, global_step, epoch): # format data text_input, text_lengths, mel_input, mel_lengths, speaker_c,\ - _, _, attn_mask, item_idx = format_data(data) + _, _, attn_mask, _ = format_data(data) # forward pass model z, logdet, y_mean, y_log_scale, alignments, o_dur_log, o_total_dur = model.forward( @@ -546,13 +546,14 @@ def main(args): # pylint: disable=redefined-outer-name eval_loader = setup_loader(ap, 1, is_val=True, verbose=True) global_step = args.restore_step - model = data_depended_init(train_loader, model, ap) + model = data_depended_init(train_loader, model) for epoch in range(0, c.epochs): c_logger.print_epoch_start(epoch, c.epochs) train_avg_loss_dict, global_step = train(train_loader, model, criterion, optimizer, scheduler, ap, global_step, epoch) - eval_avg_loss_dict = evaluate(eval_loader , model, criterion, ap, global_step, epoch) + eval_avg_loss_dict = evaluate(eval_loader , model, criterion, ap, + global_step, epoch) c_logger.print_epoch_end(epoch, eval_avg_loss_dict) target_loss = train_avg_loss_dict['avg_loss'] if c.run_eval: diff --git a/TTS/bin/train_speedy_speech.py b/TTS/bin/train_speedy_speech.py index bee37b05..7d7d834c 100644 --- a/TTS/bin/train_speedy_speech.py +++ b/TTS/bin/train_speedy_speech.py @@ -172,13 +172,13 @@ def train(data_loader, model, criterion, optimizer, scheduler, scaler.scale(loss_dict['loss']).backward() scaler.unscale_(optimizer) grad_norm = torch.nn.utils.clip_grad_norm_(model.parameters(), - c.grad_clip) + c.grad_clip) scaler.step(optimizer) scaler.update() else: loss_dict['loss'].backward() grad_norm = 
torch.nn.utils.clip_grad_norm_(model.parameters(), - c.grad_clip) + c.grad_clip) optimizer.step() # setup lr @@ -515,12 +515,14 @@ def main(args): # pylint: disable=redefined-outer-name train_avg_loss_dict, global_step = train(train_loader, model, criterion, optimizer, scheduler, ap, global_step, epoch) - eval_avg_loss_dict = evaluate(eval_loader , model, criterion, ap, global_step, epoch) + eval_avg_loss_dict = evaluate(eval_loader , model, criterion, ap, + global_step, epoch) c_logger.print_epoch_end(epoch, eval_avg_loss_dict) target_loss = train_avg_loss_dict['avg_loss'] if c.run_eval: target_loss = eval_avg_loss_dict['avg_loss'] - best_loss = save_best_model(target_loss, best_loss, model, optimizer, global_step, epoch, c.r, + best_loss = save_best_model(target_loss, best_loss, model, optimizer, + global_step, epoch, c.r, OUT_PATH) diff --git a/TTS/bin/train_tacotron.py b/TTS/bin/train_tacotron.py index e8b8b8e9..53e028d3 100644 --- a/TTS/bin/train_tacotron.py +++ b/TTS/bin/train_tacotron.py @@ -9,8 +9,8 @@ from random import randrange import numpy as np import torch -from TTS.utils.arguments import parse_arguments, process_args from torch.utils.data import DataLoader +from TTS.utils.arguments import parse_arguments, process_args from TTS.tts.datasets.preprocess import load_meta_data from TTS.tts.datasets.TTSDataset import MyDataset from TTS.tts.layers.losses import TacotronLoss @@ -62,7 +62,7 @@ def setup_loader(ap, r, is_val=False, verbose=False, dataset=None): c.use_external_speaker_embedding_file ) else None ) - ) + ) if c.use_phonemes and c.compute_input_seq_cache: # precompute phonemes to have a better estimate of sequence lengths. @@ -179,10 +179,10 @@ def train(data_loader, model, criterion, optimizer, optimizer_st, scheduler, # compute loss loss_dict = criterion(postnet_output, decoder_output, mel_input, - linear_input, stop_tokens, stop_targets, - mel_lengths, decoder_backward_output, - alignments, alignment_lengths, alignments_backward, - text_lengths) + linear_input, stop_tokens, stop_targets, + mel_lengths, decoder_backward_output, + alignments, alignment_lengths, + alignments_backward, text_lengths) # check nan loss if torch.isnan(loss_dict['loss']).any(): @@ -200,7 +200,7 @@ def train(data_loader, model, criterion, optimizer, optimizer_st, scheduler, # stopnet optimizer step if c.separate_stopnet: - scaler_st.scale( loss_dict['stopnet_loss']).backward() + scaler_st.scale(loss_dict['stopnet_loss']).backward() scaler.unscale_(optimizer_st) optimizer_st, _ = adam_weight_decay(optimizer_st) grad_norm_st, _ = check_update(model.decoder.stopnet, 1.0) @@ -534,8 +534,7 @@ def main(args): # pylint: disable=redefined-outer-name optimizer_st = None # setup criterion - criterion = TacotronLoss(c, stopnet_pos_weight=10.0, ga_sigma=0.4) - + criterion = TacotronLoss(c, stopnet_pos_weight=c.stopnet_pos_weight, ga_sigma=0.4) if args.restore_path: checkpoint = torch.load(args.restore_path, map_location='cpu') try: @@ -637,7 +636,8 @@ def main(args): # pylint: disable=redefined-outer-name epoch, c.r, OUT_PATH, - scaler=scaler.state_dict() if c.mixed_precision else None) + scaler=scaler.state_dict() if c.mixed_precision else None + ) if __name__ == '__main__': diff --git a/TTS/bin/train_vocoder_gan.py b/TTS/bin/train_vocoder_gan.py index 2c1f901a..1f2beb70 100644 --- a/TTS/bin/train_vocoder_gan.py +++ b/TTS/bin/train_vocoder_gan.py @@ -8,8 +8,8 @@ import traceback from inspect import signature import torch -from TTS.utils.arguments import parse_arguments, process_args from torch.utils.data 
import DataLoader +from TTS.utils.arguments import parse_arguments, process_args from TTS.utils.audio import AudioProcessor from TTS.utils.generic_utils import (KeepAverage, count_parameters, remove_experiment_folder, set_init_dict) @@ -33,9 +33,8 @@ use_cuda, num_gpus = setup_torch_training_env(True, True) def setup_loader(ap, is_val=False, verbose=False): - if is_val and not c.run_eval: - loader = None - else: + loader = None + if not is_val or c.run_eval: dataset = GANDataset(ap=ap, items=eval_data if is_val else train_data, seq_len=c.seq_len, @@ -114,7 +113,7 @@ def train(model_G, criterion_G, optimizer_G, model_D, criterion_D, optimizer_D, y_hat = model_G(c_G) y_hat_sub = None y_G_sub = None - y_hat_vis = y_hat # for visualization # FIXME! .clone().detach() + y_hat_vis = y_hat # for visualization # PQMF formatting if y_hat.shape[1] > 1: @@ -274,14 +273,14 @@ def train(model_G, criterion_G, optimizer_G, model_D, criterion_D, optimizer_D, # compute spectrograms figures = plot_results(y_hat_vis, y_G, ap, global_step, - 'train') + 'train') tb_logger.tb_train_figures(global_step, figures) # Sample audio sample_voice = y_hat_vis[0].squeeze(0).detach().cpu().numpy() tb_logger.tb_train_audios(global_step, - {'train/audio': sample_voice}, - c.audio["sample_rate"]) + {'train/audio': sample_voice}, + c.audio["sample_rate"]) end_time = time.time() # print epoch stats diff --git a/TTS/bin/train_vocoder_wavegrad.py b/TTS/bin/train_vocoder_wavegrad.py index 4ef6769c..d8dc88e1 100644 --- a/TTS/bin/train_vocoder_wavegrad.py +++ b/TTS/bin/train_vocoder_wavegrad.py @@ -8,12 +8,12 @@ import traceback import numpy as np import torch -from TTS.utils.arguments import parse_arguments, process_args # DISTRIBUTED from torch.nn.parallel import DistributedDataParallel as DDP_th from torch.optim import Adam from torch.utils.data import DataLoader from torch.utils.data.distributed import DistributedSampler +from TTS.utils.arguments import parse_arguments, process_args from TTS.utils.audio import AudioProcessor from TTS.utils.distribute import init_distributed from TTS.utils.generic_utils import (KeepAverage, count_parameters, @@ -32,16 +32,16 @@ def setup_loader(ap, is_val=False, verbose=False): loader = None else: dataset = WaveGradDataset(ap=ap, - items=eval_data if is_val else train_data, - seq_len=c.seq_len, - hop_len=ap.hop_length, - pad_short=c.pad_short, - conv_pad=c.conv_pad, - is_training=not is_val, - return_segments=True, - use_noise_augment=False, - use_cache=c.use_cache, - verbose=verbose) + items=eval_data if is_val else train_data, + seq_len=c.seq_len, + hop_len=ap.hop_length, + pad_short=c.pad_short, + conv_pad=c.conv_pad, + is_training=not is_val, + return_segments=True, + use_noise_augment=False, + use_cache=c.use_cache, + verbose=verbose) sampler = DistributedSampler(dataset) if num_gpus > 1 else None loader = DataLoader(dataset, batch_size=c.batch_size, @@ -77,8 +77,8 @@ def format_test_data(data): return m, x -def train(model, criterion, optimizer, - scheduler, scaler, ap, global_step, epoch): +def train(model, criterion, optimizer, scheduler, scaler, ap, global_step, + epoch): data_loader = setup_loader(ap, is_val=False, verbose=(epoch == 0)) model.train() epoch_time = 0 @@ -92,7 +92,8 @@ def train(model, criterion, optimizer, c_logger.print_train_start() # setup noise schedule noise_schedule = c['train_noise_schedule'] - betas = np.linspace(noise_schedule['min_val'], noise_schedule['max_val'], noise_schedule['num_steps']) + betas = np.linspace(noise_schedule['min_val'], 
noise_schedule['max_val'], + noise_schedule['num_steps']) if hasattr(model, 'module'): model.module.compute_noise_level(betas) else: @@ -118,7 +119,7 @@ def train(model, criterion, optimizer, # compute losses loss = criterion(noise, noise_hat) - loss_wavegrad_dict = {'wavegrad_loss':loss} + loss_wavegrad_dict = {'wavegrad_loss': loss} # check nan loss if torch.isnan(loss).any(): @@ -131,13 +132,13 @@ def train(model, criterion, optimizer, scaler.scale(loss).backward() scaler.unscale_(optimizer) grad_norm = torch.nn.utils.clip_grad_norm_(model.parameters(), - c.clip_grad) + c.clip_grad) scaler.step(optimizer) scaler.update() else: loss.backward() grad_norm = torch.nn.utils.clip_grad_norm_(model.parameters(), - c.clip_grad) + c.clip_grad) optimizer.step() # schedule update @@ -193,17 +194,19 @@ def train(model, criterion, optimizer, if global_step % c.save_step == 0: if c.checkpoint: # save model - save_checkpoint(model, - optimizer, - scheduler, - None, - None, - None, - global_step, - epoch, - OUT_PATH, - model_losses=loss_dict, - scaler=scaler.state_dict() if c.mixed_precision else None) + save_checkpoint( + model, + optimizer, + scheduler, + None, + None, + None, + global_step, + epoch, + OUT_PATH, + model_losses=loss_dict, + scaler=scaler.state_dict() if c.mixed_precision else None + ) end_time = time.time() @@ -250,7 +253,7 @@ def evaluate(model, criterion, ap, global_step, epoch): # compute losses loss = criterion(noise, noise_hat) - loss_wavegrad_dict = {'wavegrad_loss':loss} + loss_wavegrad_dict = {'wavegrad_loss': loss} loss_dict = dict() @@ -282,7 +285,9 @@ def evaluate(model, criterion, ap, global_step, epoch): # setup noise schedule and inference noise_schedule = c['test_noise_schedule'] - betas = np.linspace(noise_schedule['min_val'], noise_schedule['max_val'], noise_schedule['num_steps']) + betas = np.linspace(noise_schedule['min_val'], + noise_schedule['max_val'], + noise_schedule['num_steps']) if hasattr(model, 'module'): model.module.compute_noise_level(betas) # compute voice @@ -313,7 +318,8 @@ def main(args): # pylint: disable=redefined-outer-name print(f" > Loading wavs from: {c.data_path}") if c.feature_path is not None: print(f" > Loading features from: {c.feature_path}") - eval_data, train_data = load_wav_feat_data(c.data_path, c.feature_path, c.eval_split_size) + eval_data, train_data = load_wav_feat_data(c.data_path, c.feature_path, + c.eval_split_size) else: eval_data, train_data = load_wav_data(c.data_path, c.eval_split_size) @@ -343,6 +349,10 @@ def main(args): # pylint: disable=redefined-outer-name # setup criterion criterion = torch.nn.L1Loss().cuda() + if use_cuda: + model.cuda() + criterion.cuda() + if args.restore_path: checkpoint = torch.load(args.restore_path, map_location='cpu') try: @@ -376,10 +386,6 @@ def main(args): # pylint: disable=redefined-outer-name else: args.restore_step = 0 - if use_cuda: - model.cuda() - criterion.cuda() - # DISTRUBUTED if num_gpus > 1: model = DDP_th(model, device_ids=[args.rank]) @@ -393,26 +399,26 @@ def main(args): # pylint: disable=redefined-outer-name global_step = args.restore_step for epoch in range(0, c.epochs): c_logger.print_epoch_start(epoch, c.epochs) - _, global_step = train(model, criterion, optimizer, - scheduler, scaler, ap, global_step, - epoch) - eval_avg_loss_dict = evaluate(model, criterion, ap, - global_step, epoch) + _, global_step = train(model, criterion, optimizer, scheduler, scaler, + ap, global_step, epoch) + eval_avg_loss_dict = evaluate(model, criterion, ap, global_step, epoch) 
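# ----------------------------------------------------------------------
# [Editor's note] A minimal, self-contained sketch of the noise-schedule
# setup that the wavegrad hunks above reflow. The config keys are the ones
# visible in the diff; the final cumprod step is what WaveGrad-style models
# conventionally do inside compute_noise_level() and is stated here as an
# assumption, not quoted from this repo.
import numpy as np

noise_schedule = {"min_val": 1e-6, "max_val": 1e-2, "num_steps": 1000}
betas = np.linspace(noise_schedule["min_val"],
                    noise_schedule["max_val"],
                    noise_schedule["num_steps"])
alphas = 1.0 - betas
noise_level = np.sqrt(np.cumprod(alphas))  # per-step signal scale in [0, 1]
# ----------------------------------------------------------------------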
c_logger.print_epoch_end(epoch, eval_avg_loss_dict) target_loss = eval_avg_loss_dict[c.target_loss] - best_loss = save_best_model(target_loss, - best_loss, - model, - optimizer, - scheduler, - None, - None, - None, - global_step, - epoch, - OUT_PATH, - model_losses=eval_avg_loss_dict, - scaler=scaler.state_dict() if c.mixed_precision else None) + best_loss = save_best_model( + target_loss, + best_loss, + model, + optimizer, + scheduler, + None, + None, + None, + global_step, + epoch, + OUT_PATH, + model_losses=eval_avg_loss_dict, + scaler=scaler.state_dict() if c.mixed_precision else None + ) if __name__ == '__main__': diff --git a/TTS/bin/train_vocoder_wavernn.py b/TTS/bin/train_vocoder_wavernn.py index e32301fc..b4ffe143 100644 --- a/TTS/bin/train_vocoder_wavernn.py +++ b/TTS/bin/train_vocoder_wavernn.py @@ -178,18 +178,19 @@ def train(model, optimizer, criterion, scheduler, scaler, ap, global_step, epoch if global_step % c.save_step == 0: if c.checkpoint: # save model - save_checkpoint(model, - optimizer, - scheduler, - None, - None, - None, - global_step, - epoch, - OUT_PATH, - model_losses=loss_dict, - scaler=scaler.state_dict() if c.mixed_precision else None - ) + save_checkpoint( + model, + optimizer, + scheduler, + None, + None, + None, + global_step, + epoch, + OUT_PATH, + model_losses=loss_dict, + scaler=scaler.state_dict() if c.mixed_precision else None + ) # synthesize a full voice rand_idx = random.randrange(0, len(train_data)) @@ -204,14 +205,7 @@ def train(model, optimizer, criterion, scheduler, scaler, ap, global_step, epoch c.batched, c.target_samples, c.overlap_samples, - # use_cuda ) - # sample_wav = model.generate(ground_mel, - # c.batched, - # c.target_samples, - # c.overlap_samples, - # use_cuda - # ) predict_mel = ap.melspectrogram(sample_wav) # compute spectrograms @@ -300,7 +294,6 @@ def evaluate(model, criterion, ap, global_step, epoch): c.batched, c.target_samples, c.overlap_samples, - # use_cuda ) predict_mel = ap.melspectrogram(sample_wav) @@ -311,9 +304,10 @@ def evaluate(model, criterion, ap, global_step, epoch): ) # compute spectrograms - figures = {"eval/ground_truth": plot_spectrogram(ground_mel.T), - "eval/prediction": plot_spectrogram(predict_mel.T) - } + figures = { + "eval/ground_truth": plot_spectrogram(ground_mel.T), + "eval/prediction": plot_spectrogram(predict_mel.T) + } tb_logger.tb_eval_figures(global_step, figures) tb_logger.tb_eval_stats(global_step, keep_avg.avg_values) From ad17dc9e761809ebcfbd8496bd7906e73ac2d3a0 Mon Sep 17 00:00:00 2001 From: gerazov Date: Sat, 6 Feb 2021 23:05:01 +0100 Subject: [PATCH 034/100] final fixes --- TTS/bin/train_glow_tts.py | 2 +- TTS/bin/train_tacotron.py | 10 ++++------ 2 files changed, 5 insertions(+), 7 deletions(-) diff --git a/TTS/bin/train_glow_tts.py b/TTS/bin/train_glow_tts.py index 3c211496..9db2381e 100644 --- a/TTS/bin/train_glow_tts.py +++ b/TTS/bin/train_glow_tts.py @@ -552,7 +552,7 @@ def main(args): # pylint: disable=redefined-outer-name train_avg_loss_dict, global_step = train(train_loader, model, criterion, optimizer, scheduler, ap, global_step, epoch) - eval_avg_loss_dict = evaluate(eval_loader , model, criterion, ap, + eval_avg_loss_dict = evaluate(eval_loader, model, criterion, ap, global_step, epoch) c_logger.print_epoch_end(epoch, eval_avg_loss_dict) target_loss = train_avg_loss_dict['avg_loss'] diff --git a/TTS/bin/train_tacotron.py b/TTS/bin/train_tacotron.py index 53e028d3..0a53f2a1 100644 --- a/TTS/bin/train_tacotron.py +++ b/TTS/bin/train_tacotron.py @@ -56,12 +56,10 @@ def 
setup_loader(ap, r, is_val=False, verbose=False, dataset=None): phoneme_language=c.phoneme_language, enable_eos_bos=c.enable_eos_bos_chars, verbose=verbose, - speaker_mapping=( - speaker_mapping if ( - c.use_speaker_embedding and - c.use_external_speaker_embedding_file - ) else None - ) + speaker_mapping=(speaker_mapping if ( + c.use_speaker_embedding + and c.use_external_speaker_embedding_file + ) else None) ) if c.use_phonemes and c.compute_input_seq_cache: From e507373b553b46089654727b7fbe773b187c38be Mon Sep 17 00:00:00 2001 From: gerazov Date: Sat, 6 Feb 2021 23:08:47 +0100 Subject: [PATCH 035/100] final final fixes --- TTS/utils/arguments.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/TTS/utils/arguments.py b/TTS/utils/arguments.py index c3190e50..f0b45d5e 100644 --- a/TTS/utils/arguments.py +++ b/TTS/utils/arguments.py @@ -35,7 +35,7 @@ def parse_arguments(argv): "--continue_path", type=str, help=("Training output folder to continue training. Used to continue " - "a training. If it is used, "config_path" is ignored."), + "a training. If it is used, 'config_path' is ignored."), default="", required="--config_path" not in argv) parser.add_argument( @@ -151,6 +151,7 @@ def process_args(args, model_type): os.path.join(args.continue_path, "*.pth.tar") ) # * means all if need specific format then *.csv args.restore_path = max(list_of_files, key=os.path.getctime) + # checkpoint number based continuing # args.restore_path = get_last_checkpoint(args.continue_path) print(f" > Training continues for {args.restore_path}") From acc0af760b447b060ce597d3cdf5dd047d501120 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Eren=20G=C3=B6lge?= Date: Mon, 8 Feb 2021 10:54:52 +0000 Subject: [PATCH 036/100] update nose command for circle CI --- .circleci/config.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.circleci/config.yml b/.circleci/config.yml index 8aba5c75..5f6db915 100644 --- a/.circleci/config.yml +++ b/.circleci/config.yml @@ -31,7 +31,7 @@ jobs: - run: | sudo pip install --quiet --upgrade cardboardlint pylint cardboardlinter --refspec ${CIRCLE_BRANCH} -n auto - - run: nosetests tests --nocapture + - run: nosetests tests --nocapture --processes=0 --process-timeout=20 --process-restartworker - run: | sudo ./tests/test_server_package.sh sudo ./tests/test_glow-tts_train.sh From b22d99c978b2ff14941cc0523d87fad2a3957e02 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Eren=20G=C3=B6lge?= Date: Mon, 8 Feb 2021 11:42:19 +0000 Subject: [PATCH 037/100] reduce glowtts size for testing --- tests/test_glow_tts.py | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/tests/test_glow_tts.py b/tests/test_glow_tts.py index 2d375031..d290baf5 100644 --- a/tests/test_glow_tts.py +++ b/tests/test_glow_tts.py @@ -44,8 +44,8 @@ class GlowTTSTrainTest(unittest.TestCase): # model to train model = GlowTts( num_chars=32, - hidden_channels_enc=128, - hidden_channels_dec=128, + hidden_channels_enc=48, + hidden_channels_dec=48, hidden_channels_dp=32, out_channels=80, encoder_type='rel_pos_transformer', @@ -54,7 +54,7 @@ class GlowTTSTrainTest(unittest.TestCase): 'dropout_p': 0.1, 'num_layers': 6, 'num_heads': 2, - 'hidden_channels_ffn': 768, # 4 times the hidden_channels + 'hidden_channels_ffn': 16, # 4 times the hidden_channels 'input_length': None }, use_encoder_prenet=True, @@ -73,8 +73,8 @@ class GlowTTSTrainTest(unittest.TestCase): # reference model to compare model weights model_ref = GlowTts( num_chars=32, - hidden_channels_enc=128, - hidden_channels_dec=128, 
+ hidden_channels_enc=48, + hidden_channels_dec=48, hidden_channels_dp=32, out_channels=80, encoder_type='rel_pos_transformer', @@ -83,7 +83,7 @@ class GlowTTSTrainTest(unittest.TestCase): 'dropout_p': 0.1, 'num_layers': 6, 'num_heads': 2, - 'hidden_channels_ffn': 768, # 4 times the hidden_channels + 'hidden_channels_ffn': 16, # 4 times the hidden_channels 'input_length': None }, use_encoder_prenet=True, From 7aa873b558aa1b219d6447e752992d835334f403 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Eren=20G=C3=B6lge?= Date: Mon, 8 Feb 2021 12:25:46 +0000 Subject: [PATCH 038/100] use single process dataloder in tests --- tests/inputs/test_config.json | 2 +- tests/inputs/test_glow_tts.json | 4 ++-- tests/inputs/test_train_config.json | 4 ++-- tests/inputs/test_vocoder_multiband_melgan_config.json | 4 ++-- tests/inputs/test_vocoder_wavegrad.json | 4 ++-- tests/inputs/test_vocoder_wavernn_config.json | 4 ++-- 6 files changed, 11 insertions(+), 11 deletions(-) diff --git a/tests/inputs/test_config.json b/tests/inputs/test_config.json index ca4eef03..17c369aa 100644 --- a/tests/inputs/test_config.json +++ b/tests/inputs/test_config.json @@ -43,7 +43,7 @@ "batch_size": 2, "r": 5, "mk": 1.0, - "num_loader_workers": 4, + "num_loader_workers": 0, "memory_size": 5, "save_step": 200, diff --git a/tests/inputs/test_glow_tts.json b/tests/inputs/test_glow_tts.json index ff8a81ea..e7d86eef 100644 --- a/tests/inputs/test_glow_tts.json +++ b/tests/inputs/test_glow_tts.json @@ -112,8 +112,8 @@ // DATA LOADING "text_cleaner": "phoneme_cleaners", "enable_eos_bos_chars": false, // enable/disable beginning of sentence and end of sentence chars. - "num_loader_workers": 4, // number of training data loader processes. Don't set it too big. 4-8 are good values. - "num_val_loader_workers": 4, // number of evaluation data loader processes. + "num_loader_workers": 0, // number of training data loader processes. Don't set it too big. 4-8 are good values. + "num_val_loader_workers": 0, // number of evaluation data loader processes. "batch_group_size": 0, //Number of batches to shuffle after bucketing. "min_seq_len": 3, // DATASET-RELATED: minimum text length to use in training "max_seq_len": 500, // DATASET-RELATED: maximum text length diff --git a/tests/inputs/test_train_config.json b/tests/inputs/test_train_config.json index ee0680e3..cfd33669 100644 --- a/tests/inputs/test_train_config.json +++ b/tests/inputs/test_train_config.json @@ -127,8 +127,8 @@ // DATA LOADING "text_cleaner": "phoneme_cleaners", "enable_eos_bos_chars": false, // enable/disable beginning of sentence and end of sentence chars. - "num_loader_workers": 4, // number of training data loader processes. Don't set it too big. 4-8 are good values. - "num_val_loader_workers": 4, // number of evaluation data loader processes. + "num_loader_workers": 0, // number of training data loader processes. Don't set it too big. 4-8 are good values. + "num_val_loader_workers": 0, // number of evaluation data loader processes. "batch_group_size": 0, //Number of batches to shuffle after bucketing. 
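# ----------------------------------------------------------------------
# [Editor's note] The `num_loader_workers` / `num_val_loader_workers` keys
# patched to 0 in these test configs map to torch's DataLoader
# `num_workers`; 0 builds batches in the calling process, which keeps the
# tiny CI tests deterministic and avoids worker start-up overhead. A hedged
# illustration (dataset shape and batch size are made up):
import torch
from torch.utils.data import DataLoader, TensorDataset

ds = TensorDataset(torch.zeros(4, 80))
loader = DataLoader(ds, batch_size=2, num_workers=0)  # single-process loading
# ----------------------------------------------------------------------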
"min_seq_len": 6, // DATASET-RELATED: minimum text length to use in training "max_seq_len": 153, // DATASET-RELATED: maximum text length diff --git a/tests/inputs/test_vocoder_multiband_melgan_config.json b/tests/inputs/test_vocoder_multiband_melgan_config.json index 442550c6..9540b32b 100644 --- a/tests/inputs/test_vocoder_multiband_melgan_config.json +++ b/tests/inputs/test_vocoder_multiband_melgan_config.json @@ -134,8 +134,8 @@ "tb_model_param_stats": false, // true, plots param stats per layer on tensorboard. Might be memory consuming, but good for debugging. // DATA LOADING - "num_loader_workers": 4, // number of training data loader processes. Don't set it too big. 4-8 are good values. - "num_val_loader_workers": 4, // number of evaluation data loader processes. + "num_loader_workers": 0, // number of training data loader processes. Don't set it too big. 4-8 are good values. + "num_val_loader_workers": 0, // number of evaluation data loader processes. "eval_split_size": 10, // PATHS diff --git a/tests/inputs/test_vocoder_wavegrad.json b/tests/inputs/test_vocoder_wavegrad.json index 8fa0bbe1..fc8059ec 100644 --- a/tests/inputs/test_vocoder_wavegrad.json +++ b/tests/inputs/test_vocoder_wavegrad.json @@ -104,8 +104,8 @@ "tb_model_param_stats": true, // true, plots param stats per layer on tensorboard. Might be memory consuming, but good for debugging. // DATA LOADING - "num_loader_workers": 4, // number of training data loader processes. Don't set it too big. 4-8 are good values. - "num_val_loader_workers": 4, // number of evaluation data loader processes. + "num_loader_workers": 0, // number of training data loader processes. Don't set it too big. 4-8 are good values. + "num_val_loader_workers": 0, // number of evaluation data loader processes. "eval_split_size": 4, // PATHS diff --git a/tests/inputs/test_vocoder_wavernn_config.json b/tests/inputs/test_vocoder_wavernn_config.json index 3c6d06f5..d477a66b 100644 --- a/tests/inputs/test_vocoder_wavernn_config.json +++ b/tests/inputs/test_vocoder_wavernn_config.json @@ -100,8 +100,8 @@ "tb_model_param_stats": false, // true, plots param stats per layer on tensorboard. Might be memory consuming, but good for debugging. // DATA LOADING - "num_loader_workers": 4, // number of training data loader processes. Don't set it too big. 4-8 are good values. - "num_val_loader_workers": 4, // number of evaluation data loader processes. + "num_loader_workers": 0, // number of training data loader processes. Don't set it too big. 4-8 are good values. + "num_val_loader_workers": 0, // number of evaluation data loader processes. 
"eval_split_size": 10, // number of samples for testing // PATHS From c619859a3f4c8997207eed12bc95ec1e8e691a87 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Eren=20G=C3=B6lge?= Date: Tue, 9 Feb 2021 11:43:17 +0000 Subject: [PATCH 039/100] linter fixes --- TTS/bin/train_speedy_speech.py | 2 +- TTS/utils/arguments.py | 5 ++--- tests/test_text_processing.py | 2 +- 3 files changed, 4 insertions(+), 5 deletions(-) diff --git a/TTS/bin/train_speedy_speech.py b/TTS/bin/train_speedy_speech.py index 7d7d834c..a9a83bbf 100644 --- a/TTS/bin/train_speedy_speech.py +++ b/TTS/bin/train_speedy_speech.py @@ -515,7 +515,7 @@ def main(args): # pylint: disable=redefined-outer-name train_avg_loss_dict, global_step = train(train_loader, model, criterion, optimizer, scheduler, ap, global_step, epoch) - eval_avg_loss_dict = evaluate(eval_loader , model, criterion, ap, + eval_avg_loss_dict = evaluate(eval_loader, model, criterion, ap, global_step, epoch) c_logger.print_epoch_end(epoch, eval_avg_loss_dict) target_loss = train_avg_loss_dict['avg_loss'] diff --git a/TTS/utils/arguments.py b/TTS/utils/arguments.py index f0b45d5e..948c90d3 100644 --- a/TTS/utils/arguments.py +++ b/TTS/utils/arguments.py @@ -104,8 +104,7 @@ def get_last_checkpoint(path): pass if last_checkpoint is None: raise ValueError(f"No checkpoints in {path}!") - else: - return last_checkpoint + return last_checkpoint def process_args(args, model_type): @@ -193,7 +192,7 @@ def process_args(args, model_type): if args.restore_path: new_fields["restore_path"] = args.restore_path new_fields["github_branch"] = get_git_branch() - copy_model_files(c, args.config_path, + copy_model_files(c, args.config_path, out_path, new_fields) os.chmod(audio_path, 0o775) os.chmod(out_path, 0o775) diff --git a/tests/test_text_processing.py b/tests/test_text_processing.py index 2f68c6e7..8c075d06 100644 --- a/tests/test_text_processing.py +++ b/tests/test_text_processing.py @@ -21,7 +21,7 @@ def test_phoneme_to_sequence(): text_hat_with_params = sequence_to_phoneme(sequence, tp=conf.characters) gt = "ɹiːsənt ɹɪsɜːtʃ æt hɑːɹvɚd hɐz ʃoʊn mɛdᵻteɪɾɪŋ fɔːɹ æz lɪɾəl æz eɪt wiːks kæn æktʃuːəli ɪnkɹiːs, ðə ɡɹeɪ mæɾɚɹ ɪnðə pɑːɹts ʌvðə bɹeɪn ɹɪspɑːnsəbəl fɔːɹ ɪmoʊʃənəl ɹɛɡjuːleɪʃən ænd lɜːnɪŋ!" assert text_hat == text_hat_with_params == gt - + # multiple punctuations text = "Be a voice, not an! echo?" 
sequence = phoneme_to_sequence(text, text_cleaner, lang) From cea5e517f2409475c0fb6c4fba5137cb4ec3dc2f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Eren=20G=C3=B6lge?= Date: Tue, 9 Feb 2021 14:24:14 +0000 Subject: [PATCH 040/100] download github model releases by model manager --- TTS/.models.json | 1 + TTS/utils/manage.py | 44 +++++++++++++++++++++++++++++++++++++------- 2 files changed, 38 insertions(+), 7 deletions(-) diff --git a/TTS/.models.json b/TTS/.models.json index 075861db..4805ddba 100644 --- a/TTS/.models.json +++ b/TTS/.models.json @@ -14,6 +14,7 @@ "model_file": "1CFoPDQBnhfBFu2Gc0TBSJn8o-TuNKQn7", "config_file": "1lWSscNfKet1zZSJCNirOn7v9bigUZ8C1", "stats_file": "1qevpGRVHPmzfiRBNuugLMX62x1k7B5vK", + "github_rls_url": null, "commit": "" }, "speedy-speech-wn":{ diff --git a/TTS/utils/manage.py b/TTS/utils/manage.py index 524d8dbf..db62acd1 100644 --- a/TTS/utils/manage.py +++ b/TTS/utils/manage.py @@ -1,8 +1,11 @@ +import io import json import os +import zipfile from pathlib import Path import gdown +import requests from TTS.utils.generic_utils import get_user_data_dir from TTS.utils.io import load_config @@ -71,6 +74,11 @@ class ModelManager(object): 'type/language/dataset/model' e.g. 'tts_model/en/ljspeech/tacotron' + Every model must have the following files + - *.pth.tar : pytorch model checkpoint file. + - config.json : model config file. + - scale_stats.npy (if exist): scale values for preprocessing. + Args: model_name (str): model name as explained above. @@ -91,11 +99,17 @@ class ModelManager(object): print(f" > Downloading model to {output_path}") output_stats_path = None # download files to the output path - self._download_file(model_item['model_file'], output_model_path) - self._download_file(model_item['config_file'], output_config_path) - if model_item['stats_file'] is not None and len(model_item['stats_file']) > 1: + if self._check_dict_key(model_item, 'github_rls_url'): + # download from github release + # TODO: pass output_path + self._download_zip_file(model_item['github_rls_url'], output_path) + else: + # download from gdrive + self._download_gdrive_file(model_item['model_file'], output_model_path) + self._download_gdrive_file(model_item['config_file'], output_config_path) + if self._check_dict_key(model_item, 'scale_stats'): output_stats_path = os.path.join(output_path, 'scale_stats.npy') - self._download_file(model_item['stats_file'], output_stats_path) + self._download_gdrive_file(model_item['stats_file'], output_stats_path) # set scale stats path in config.json config_path = output_config_path config = load_config(config_path) @@ -104,9 +118,25 @@ class ModelManager(object): json.dump(config, jf) return output_model_path, output_config_path - def _download_file(self, idx, output): - gdown.download(f"{self.url_prefix}{idx}", output=output, quiet=False) - + def _download_gdrive_file(self, gdrive_idx, output): + """Download files from GDrive using their file ids""" + gdown.download(f"{self.url_prefix}{gdrive_idx}", output=output, quiet=False) + + def _download_zip_file(self, file_url, output): + """Download the target zip file and extract the files + to a folder with the same name as the zip file.""" + r = requests.get(file_url) + z = zipfile.ZipFile(io.BytesIO(r.content)) + z.extractall(output) + + @staticmethod + def _check_dict_key(my_dict, key): + if key in my_dict.keys() and my_dict[key] is not None: + if not isinstance(key, str): + return True + if isinstance(key, str) and len(my_dict[key]) > 0: + return True + return False From 
0c6519c8381778215fc683c865f689bf8e6e7b8d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Eren=20G=C3=B6lge?= Date: Tue, 9 Feb 2021 14:24:58 +0000 Subject: [PATCH 041/100] test_compute_statistics.sh --- run_tests.sh | 1 + setup.py | 2 +- tests/test_compute_statistics.sh | 7 +++++++ 3 files changed, 9 insertions(+), 1 deletion(-) create mode 100755 tests/test_compute_statistics.sh diff --git a/run_tests.sh b/run_tests.sh index abfc53d4..003bfe41 100755 --- a/run_tests.sh +++ b/run_tests.sh @@ -12,6 +12,7 @@ nosetests tests -x &&\ ./tests/test_vocoder_wavernn_train.sh && \ ./tests/test_vocoder_wavegrad_train.sh && \ ./tests/test_speedy_speech_train.sh && \ +./tests/test_compute_statistics.sh && \ # linter check cardboardlinter --refspec master \ No newline at end of file diff --git a/setup.py b/setup.py index eee958bb..38aba0f5 100644 --- a/setup.py +++ b/setup.py @@ -33,7 +33,7 @@ args, unknown_args = parser.parse_known_args() # Remove our arguments from argv so that setuptools doesn't see them sys.argv = [sys.argv[0]] + unknown_args -version = '0.0.9.1' +version = '0.0.10' cwd = os.path.dirname(os.path.abspath(__file__)) # Handle Cython code diff --git a/tests/test_compute_statistics.sh b/tests/test_compute_statistics.sh new file mode 100755 index 00000000..c2b32282 --- /dev/null +++ b/tests/test_compute_statistics.sh @@ -0,0 +1,7 @@ +#!/usr/bin/env bash +set -xe +BASEDIR=$(dirname "$0") +echo "$BASEDIR" +# run training +CUDA_VISIBLE_DEVICES="" python TTS/bin/compute_statistics.py --config_path $BASEDIR/inputs/test_glow_tts.json --out_path $BASEDIR/outputs/scale_stats.npy + From 9cad4352887db95a3456c0120262be76247023d6 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Eren=20G=C3=B6lge?= Date: Tue, 9 Feb 2021 15:11:26 +0000 Subject: [PATCH 042/100] css10 dataset preprocessor --- TTS/tts/datasets/preprocess.py | 14 ++++++++++++++ 1 file changed, 14 insertions(+) diff --git a/TTS/tts/datasets/preprocess.py b/TTS/tts/datasets/preprocess.py index 56fc75f5..7815d87d 100644 --- a/TTS/tts/datasets/preprocess.py +++ b/TTS/tts/datasets/preprocess.py @@ -166,6 +166,20 @@ def ljspeech(root_path, meta_file): return items +def css10(root_path, meta_file): + """Normalizes the CSS10 dataset file to TTS format""" + txt_file = os.path.join(root_path, meta_file) + items = [] + speaker_name = "ljspeech" + with open(txt_file, 'r') as ttf: + for line in ttf: + cols = line.split('|') + wav_file = os.path.join(root_path, cols[0]) + text = cols[1] + items.append([text, wav_file, speaker_name]) + return items + + def nancy(root_path, meta_file): """Normalizes the Nancy meta data file to TTS format""" txt_file = os.path.join(root_path, meta_file) From b08b8ca2a1d22d07c2c9ec3d9702d25298ced728 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Eren=20G=C3=B6lge?= Date: Wed, 10 Feb 2021 13:30:59 +0000 Subject: [PATCH 043/100] add russian phoneme char --- TTS/tts/utils/text/symbols.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/TTS/tts/utils/text/symbols.py b/TTS/tts/utils/text/symbols.py index 544277c5..68e74585 100644 --- a/TTS/tts/utils/text/symbols.py +++ b/TTS/tts/utils/text/symbols.py @@ -30,7 +30,7 @@ _vowels = 'iyɨʉɯuɪʏʊeøɘəɵɤoɛœɜɞʌɔæɐaɶɑɒᵻ' _non_pulmonic_consonants = 'ʘɓǀɗǃʄǂɠǁʛ' _pulmonic_consonants = 'pbtdʈɖcɟkɡqɢʔɴŋɲɳnɱmʙrʀⱱɾɽɸβfvθðszʃʒʂʐçʝxɣχʁħʕhɦɬɮʋɹɻjɰlɭʎʟ' _suprasegmentals = 'ˈˌːˑ' -_other_symbols = 'ʍwɥʜʢʡɕʑɺɧ' +_other_symbols = 'ʍwɥʜʢʡɕʑɺɧʲ' _diacrilics = 'ɚ˞ɫ' _phonemes = _vowels + _non_pulmonic_consonants + _pulmonic_consonants + _suprasegmentals + _other_symbols + _diacrilics From 
6c824a6629ed6dd26c445beb45620f08dc1b41c6 Mon Sep 17 00:00:00 2001 From: Adonis Pujols Date: Thu, 11 Feb 2021 04:48:53 -0500 Subject: [PATCH 044/100] spelling error. should be multiband not mulitband --- TTS/.models.json | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/TTS/.models.json b/TTS/.models.json index 075861db..fc3e91da 100644 --- a/TTS/.models.json +++ b/TTS/.models.json @@ -65,7 +65,7 @@ }, "en": { "ljspeech":{ - "mulitband-melgan":{ + "multiband-melgan":{ "model_file": "1Ty5DZdOc0F7OTGj9oJThYbL5iVu_2G0K", "config_file": "1Rd0R_nRCrbjEdpOwq6XwZAktvugiBvmu", "stats_file": "11oY3Tv0kQtxK_JPgxrfesa99maVXHNxU", @@ -74,4 +74,4 @@ } } } -} \ No newline at end of file +} From b29a7e9645bee1eb949283d92a4986eae237fc9a Mon Sep 17 00:00:00 2001 From: Adonis Pujols Date: Thu, 11 Feb 2021 04:49:28 -0500 Subject: [PATCH 045/100] spelling error. should be multiband not mulitband --- TTS/server/README.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/TTS/server/README.md b/TTS/server/README.md index a8d8635a..54c85bd6 100644 --- a/TTS/server/README.md +++ b/TTS/server/README.md @@ -17,10 +17,10 @@ List officially released models. ```python TTS/server/server.py --list_models ``` Run the server with the official models. -```python TTS/server/server.py --model_name tts_models/en/ljspeech/tacotron2-DCA --vocoder_name vocoder_models/en/ljspeech/mulitband-melgan``` +```python TTS/server/server.py --model_name tts_models/en/ljspeech/tacotron2-DCA --vocoder_name vocoder_models/en/ljspeech/multiband-melgan``` Run the server with the official models on a GPU. -```CUDA_VISIBLE_DEVICES="0" python TTS/server/server.py --model_name tts_models/en/ljspeech/tacotron2-DCA --vocoder_name vocoder_models/en/ljspeech/mulitband-melgan --use_cuda True``` +```CUDA_VISIBLE_DEVICES="0" python TTS/server/server.py --model_name tts_models/en/ljspeech/tacotron2-DCA --vocoder_name vocoder_models/en/ljspeech/multiband-melgan --use_cuda True``` Run the server with a custom models. 
```python TTS/server/server.py --tts_checkpoint /path/to/tts/model.pth.tar --tts_config /path/to/tts/config.json --vocoder_checkpoint /path/to/vocoder/model.pth.tar --vocoder_config /path/to/vocoder/config.json``` From 48011a8b58f4364f09a325096943891da893e1b7 Mon Sep 17 00:00:00 2001 From: Adonis Pujols Date: Thu, 11 Feb 2021 05:26:06 -0500 Subject: [PATCH 046/100] add encoding="utf-8" --- TTS/utils/io.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/TTS/utils/io.py b/TTS/utils/io.py index 46abf1c8..5d0cd6f1 100644 --- a/TTS/utils/io.py +++ b/TTS/utils/io.py @@ -22,7 +22,7 @@ class AttrDict(dict): def read_json_with_comments(json_path): # fallback to json - with open(json_path, "r") as f: + with open(json_path, "r", encoding = "utf-8") as f: input_str = f.read() # handle comments input_str = re.sub(r'\\\n', '', input_str) @@ -40,7 +40,7 @@ def load_config(config_path: str) -> AttrDict: ext = os.path.splitext(config_path)[1] if ext in (".yml", ".yaml"): - with open(config_path, "r") as f: + with open(config_path, "r", encoding = "utf-8") as f: data = yaml.safe_load(f) else: data = read_json_with_comments(config_path) @@ -61,7 +61,7 @@ def copy_model_files(c, config_file, out_path, new_fields): """ # copy config.json copy_config_path = os.path.join(out_path, 'config.json') - config_lines = open(config_file, "r").readlines() + config_lines = open(config_file, "r", encoding = "utf-8").readlines() # add extra information fields for key, value in new_fields.items(): if isinstance(value, str): From 3c2e13ca5c4a9f354bb685e3e355a7950180e4a4 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Eren=20G=C3=B6lge?= Date: Thu, 11 Feb 2021 10:36:52 +0000 Subject: [PATCH 047/100] fix the default vocoder name --- TTS/bin/synthesize.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/TTS/bin/synthesize.py b/TTS/bin/synthesize.py index e0d214d5..12829acd 100755 --- a/TTS/bin/synthesize.py +++ b/TTS/bin/synthesize.py @@ -77,7 +77,7 @@ def main(): parser.add_argument( '--vocoder_name', type=str, - default="vocoder_models/en/ljspeech/mulitband-melgan", + default="vocoder_models/en/ljspeech/multiband-melgan", help= 'Name of one of the pre-trained vocoder models in format //' ) From 3baec4ea962fcd6c5ad7eb7b8a9fc59e608a1e27 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Eren=20G=C3=B6lge?= Date: Thu, 11 Feb 2021 11:14:39 +0000 Subject: [PATCH 048/100] add missing phonemes to test_config.json --- TTS/tts/utils/text/__init__.py | 8 +++++--- tests/inputs/test_config.json | 2 +- tests/test_text_processing.py | 2 +- 3 files changed, 7 insertions(+), 5 deletions(-) diff --git a/TTS/tts/utils/text/__init__.py b/TTS/tts/utils/text/__init__.py index 29f4af1d..23c5ab5f 100644 --- a/TTS/tts/utils/text/__init__.py +++ b/TTS/tts/utils/text/__init__.py @@ -1,12 +1,14 @@ # -*- coding: utf-8 -*- import re -from packaging import version + import phonemizer +from packaging import version from phonemizer.phonemize import phonemize from TTS.tts.utils.text import cleaners -from TTS.tts.utils.text.symbols import make_symbols, symbols, phonemes, _phoneme_punctuations, _bos, \ - _eos +from TTS.tts.utils.text.symbols import (_bos, _eos, _phoneme_punctuations, + make_symbols, phonemes, symbols) + # pylint: disable=unnecessary-comprehension # Mappings from symbol to numeric ID and vice versa: diff --git a/tests/inputs/test_config.json b/tests/inputs/test_config.json index 17c369aa..b28bec64 100644 --- a/tests/inputs/test_config.json +++ b/tests/inputs/test_config.json @@ -29,7 +29,7 @@ "bos": "^", "characters": 
"ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz!'(),-.:;? ", "punctuations":"!'(),-.:;? ", - "phonemes":"iyɨʉɯuɪʏʊeøɘəɵɤoɛœɜɞʌɔæɐaɶɑɒᵻʘɓǀɗǃʄǂɠǁʛpbtdʈɖcɟkɡqɢʔɴŋɲɳnɱmʙrʀⱱɾɽɸβfvθðszʃʒʂʐçʝxɣχʁħʕhɦɬɮʋɹɻjɰlɭʎʟˈˌːˑʍwɥʜʢʡɕʑɺɧɚ˞ɫ" + "phonemes":"iyɨʉɯuɪʏʊeøɘəɵɤoɛœɜɞʌɔæɐaɶɑɒᵻʘɓǀɗǃʄǂɠǁʛpbtdʈɖcɟkɡqɢʔɴŋɲɳnɱmʙrʀⱱɾɽɸβfvθðszʃʒʂʐçʝxɣχʁħʕhɦɬɮʋɹɻjɰlɭʎʟˈˌːˑʍwɥʜʢʡɕʑɺɧɚ˞ɫʲ" }, "hidden_size": 128, diff --git a/tests/test_text_processing.py b/tests/test_text_processing.py index 8c075d06..61a83fa1 100644 --- a/tests/test_text_processing.py +++ b/tests/test_text_processing.py @@ -17,7 +17,7 @@ def test_phoneme_to_sequence(): lang = "en-us" sequence = phoneme_to_sequence(text, text_cleaner, lang) text_hat = sequence_to_phoneme(sequence) - _ = phoneme_to_sequence(text, text_cleaner, lang, tp=conf.characters) + sequence_with_params = phoneme_to_sequence(text, text_cleaner, lang, tp=conf.characters) text_hat_with_params = sequence_to_phoneme(sequence, tp=conf.characters) gt = "ɹiːsənt ɹɪsɜːtʃ æt hɑːɹvɚd hɐz ʃoʊn mɛdᵻteɪɾɪŋ fɔːɹ æz lɪɾəl æz eɪt wiːks kæn æktʃuːəli ɪnkɹiːs, ðə ɡɹeɪ mæɾɚɹ ɪnðə pɑːɹts ʌvðə bɹeɪn ɹɪspɑːnsəbəl fɔːɹ ɪmoʊʃənəl ɹɛɡjuːleɪʃən ænd lɜːnɪŋ!" assert text_hat == text_hat_with_params == gt From f1799dbd6066232077074242fafdfa91ccf10e82 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Eren=20G=C3=B6lge?= Date: Thu, 11 Feb 2021 11:25:31 +0000 Subject: [PATCH 049/100] docstring update --- TTS/bin/synthesize.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/TTS/bin/synthesize.py b/TTS/bin/synthesize.py index 12829acd..ad3d6cbb 100755 --- a/TTS/bin/synthesize.py +++ b/TTS/bin/synthesize.py @@ -27,7 +27,9 @@ def main(): # pylint: disable=bad-continuation parser = argparse.ArgumentParser(description='''Synthesize speech on command line.\n\n''' - '''You can either use your trained model or choose a model from the provided list.\n'''\ + '''You can either use your trained model or choose a model from the provided list.\n\n'''\ + + '''If you don't specify any models, then it uses LJSpeech based English models\n\n'''\ ''' Example runs: From bc131208be374ba7939684ec874180577b8a7f87 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Eren=20G=C3=B6lge?= Date: Thu, 11 Feb 2021 13:04:47 +0000 Subject: [PATCH 050/100] fix spelling of a def argument and parse phonemes from config.json if use_phonemes is True --- TTS/server/server.py | 2 +- TTS/tts/utils/generic_utils.py | 2 +- TTS/tts/utils/text/symbols.py | 21 +++++++++++---------- 3 files changed, 13 insertions(+), 12 deletions(-) diff --git a/TTS/server/server.py b/TTS/server/server.py index 425879cf..da9c8079 100644 --- a/TTS/server/server.py +++ b/TTS/server/server.py @@ -18,7 +18,7 @@ def create_argparser(): parser = argparse.ArgumentParser() parser.add_argument('--list_models', type=convert_boolean, nargs='?', const=True, default=False, help='list available pre-trained tts and vocoder models.') parser.add_argument('--model_name', type=str, default="tts_models/en/ljspeech/speedy-speech-wn", help='name of one of the released tts models.') - parser.add_argument('--vocoder_name', type=str, default="vocoder_models/en/ljspeech/mulitband-melgan", help='name of one of the released vocoder models.') + parser.add_argument('--vocoder_name', type=str, default="vocoder_models/en/ljspeech/multiband-melgan", help='name of one of the released vocoder models.') parser.add_argument('--tts_checkpoint', type=str, help='path to custom tts checkpoint file') parser.add_argument('--tts_config', type=str, help='path to custom tts config.json file') 
parser.add_argument('--tts_speakers', type=str, help='path to JSON file containing speaker ids, if speaker ids are used in the model') diff --git a/TTS/tts/utils/generic_utils.py b/TTS/tts/utils/generic_utils.py index 7a4c3a30..d898aebd 100644 --- a/TTS/tts/utils/generic_utils.py +++ b/TTS/tts/utils/generic_utils.py @@ -163,7 +163,7 @@ def check_config_tts(c): check_argument('eos', c['characters'] if 'characters' in c.keys() else {}, restricted='characters' in c.keys(), val_type=str) check_argument('bos', c['characters'] if 'characters' in c.keys() else {}, restricted='characters' in c.keys(), val_type=str) check_argument('characters', c['characters'] if 'characters' in c.keys() else {}, restricted='characters' in c.keys(), val_type=str) - check_argument('phonemes', c['characters'] if 'characters' in c.keys() else {}, restricted='characters' in c.keys(), val_type=str) + check_argument('phonemes', c['characters'] if 'characters' in c.keys() else {}, restricted='characters' in c.keys() and c['use_phonemes'], val_type=str) check_argument('punctuations', c['characters'] if 'characters' in c.keys() else {}, restricted='characters' in c.keys(), val_type=str) # normalization parameters diff --git a/TTS/tts/utils/text/symbols.py b/TTS/tts/utils/text/symbols.py index 68e74585..7a7e8844 100644 --- a/TTS/tts/utils/text/symbols.py +++ b/TTS/tts/utils/text/symbols.py @@ -5,19 +5,20 @@ Defines the set of symbols used in text input to the model. The default is a set of ASCII characters that works well for English or text that has been run through Unidecode. For other data, you can modify _characters. See TRAINING_DATA.md for details. ''' -def make_symbols(characters, phonemes, punctuations='!\'(),-.:;? ', pad='_', eos='~', bos='^'):# pylint: disable=redefined-outer-name +def make_symbols(characters, phonemes=None, punctuations='!\'(),-.:;? 
', pad='_', eos='~', bos='^'):# pylint: disable=redefined-outer-name ''' Function to create symbols and phonemes ''' - _phonemes_sorted = sorted(list(phonemes)) - - # Prepend "@" to ARPAbet symbols to ensure uniqueness (some are the same as uppercase letters): - _arpabet = ['@' + s for s in _phonemes_sorted] - - # Export all symbols: - _symbols = [pad, eos, bos] + list(characters) + _arpabet - _phonemes = [pad, eos, bos] + list(_phonemes_sorted) + list(punctuations) - + _symbols = [pad, eos, bos] + list(characters) + _phonemes = None + if phonemes is not None: + _phonemes_sorted = sorted(list(phonemes)) + # Prepend "@" to ARPAbet symbols to ensure uniqueness (some are the same as uppercase letters): + _arpabet = ['@' + s for s in _phonemes_sorted] + # Export all symbols: + _phonemes = [pad, eos, bos] + list(_phonemes_sorted) + list(punctuations) + _symbols += _arpabet return _symbols, _phonemes + _pad = '_' _eos = '~' _bos = '^' From ff27690ca7b3fad0f897c5ba731de8bff4dd0bc8 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Eren=20G=C3=B6lge?= Date: Thu, 11 Feb 2021 13:43:29 +0000 Subject: [PATCH 051/100] bug fix --- TTS/utils/manage.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/TTS/utils/manage.py b/TTS/utils/manage.py index db62acd1..72894890 100644 --- a/TTS/utils/manage.py +++ b/TTS/utils/manage.py @@ -107,7 +107,7 @@ class ModelManager(object): # download from gdrive self._download_gdrive_file(model_item['model_file'], output_model_path) self._download_gdrive_file(model_item['config_file'], output_config_path) - if self._check_dict_key(model_item, 'scale_stats'): + if self._check_dict_key(model_item, 'stats_file'): output_stats_path = os.path.join(output_path, 'scale_stats.npy') self._download_gdrive_file(model_item['stats_file'], output_stats_path) # set scale stats path in config.json From 2043a9b5f596b6f1eaa97e8555e7c7c67bd75c26 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Eren=20G=C3=B6lge?= Date: Thu, 11 Feb 2021 15:25:55 +0000 Subject: [PATCH 052/100] define default vocoders --- TTS/.models.json | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/TTS/.models.json b/TTS/.models.json index 1d416245..12970797 100644 --- a/TTS/.models.json +++ b/TTS/.models.json @@ -7,6 +7,7 @@ "model_file": "1NFsfhH8W8AgcfJ-BsL8CYAwQfZ5k4T-n", "config_file": "1IAROF3yy9qTK43vG_-R67y3Py9yYbD6t", "stats_file": null, + "default_vocoder": "vocoder_models/en/ljspeech/multiband-melgan", "commit": "" }, "tacotron2-DCA": { @@ -15,6 +16,7 @@ "config_file": "1lWSscNfKet1zZSJCNirOn7v9bigUZ8C1", "stats_file": "1qevpGRVHPmzfiRBNuugLMX62x1k7B5vK", "github_rls_url": null, + "default_vocoder": "vocoder_models/en/ljspeech/multiband-melgan", "commit": "" }, "speedy-speech-wn":{ @@ -22,6 +24,7 @@ "model_file": "1VXAwiq6N-Viq3rsSXlf43bdoi0jSvMAJ", "config_file": "1KvZilhsNP3EumVggDcD46yd834eO5hR3", "stats_file": "1Ju7apZ5JlgsVECcETL-GEx3DRoNzWfkR", + "default_vocoder": "vocoder_models/en/ljspeech/multiband-melgan", "commit": "77b6145" } } @@ -32,6 +35,7 @@ "model_file": "1jZ4HvYcAXI5ZClke2iGA7qFQQJBXIovw", "config_file": "1s7g4n-B73ChCB48AQ88_DV_8oyLth8r0", "stats_file": "13st0CZ743v6Br5R5Qw_lH1OPQOr3M-Jv", + "default_vocoder": "vocoder_models/universal/libri-tts/fullband-melgan", "commit": "" } } @@ -42,6 +46,7 @@ "model_file": "1qyxrrCyoXUvBG2lqVd0KqAlHj-2nZCgS", "config_file": "1yECKeP2LI7tNv4E8yVNx1yLmCfTCpkqG", "stats_file": "13st0CZ743v6Br5R5Qw_lH1OPQOr3M-Jv", + "default_vocoder": "vocoder_models/universal/libri-tts/fullband-melgan", "commit": "" } } From 0657b38111cc6374dd18459a65a986546fc45bd5 Mon 
Sep 17 00:00:00 2001 From: =?UTF-8?q?Eren=20G=C3=B6lge?= Date: Thu, 11 Feb 2021 15:26:17 +0000 Subject: [PATCH 053/100] use default vocoder in synthesize.py --- TTS/bin/synthesize.py | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-) diff --git a/TTS/bin/synthesize.py b/TTS/bin/synthesize.py index ad3d6cbb..059cef23 100755 --- a/TTS/bin/synthesize.py +++ b/TTS/bin/synthesize.py @@ -40,7 +40,10 @@ def main(): # run tts with default models. ./TTS/bin synthesize.py --text "Text for TTS" - # run a model from the list + # run a tts model with its default vocoder model. + ./TTS/bin synthesize.py --text "Text for TTS" --model_name "//" + + # run with specific tts and vocoder models from the list ./TTS/bin/synthesize.py --text "Text for TTS" --model_name "//" --vocoder_name "//" --output_path # run your own TTS model (Using Griffin-Lim Vocoder) @@ -79,7 +82,7 @@ def main(): parser.add_argument( '--vocoder_name', type=str, - default="vocoder_models/en/ljspeech/multiband-melgan", + default=None, help= 'Name of one of the pre-trained vocoder models in format //' ) @@ -163,10 +166,11 @@ def main(): # CASE2: load pre-trained models if args.model_name is not None: - model_path, config_path = manager.download_model(args.model_name) + model_path, config_path, model_item = manager.download_model(args.model_name) + args.vocoder_name = model_item['default_vocoder'] if args.vocoder_name is None else args.vocoder_name if args.vocoder_name is not None: - vocoder_path, vocoder_config_path = manager.download_model(args.vocoder_name) + vocoder_path, vocoder_config_path, vocoder_item = manager.download_model(args.vocoder_name) # CASE3: load custome models if args.model_path is not None: From 43f54d2dce62f4b86ad363155a13713196c56d1c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Eren=20G=C3=B6lge?= Date: Thu, 11 Feb 2021 15:26:52 +0000 Subject: [PATCH 054/100] fix make_symbols --- TTS/tts/utils/text/symbols.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/TTS/tts/utils/text/symbols.py b/TTS/tts/utils/text/symbols.py index 7a7e8844..7a741a8f 100644 --- a/TTS/tts/utils/text/symbols.py +++ b/TTS/tts/utils/text/symbols.py @@ -10,7 +10,7 @@ def make_symbols(characters, phonemes=None, punctuations='!\'(),-.:;? 
', pad='_' _symbols = [pad, eos, bos] + list(characters) _phonemes = None if phonemes is not None: - _phonemes_sorted = sorted(list(phonemes)) + _phonemes_sorted = sorted(list(set(phonemes))) # Prepend "@" to ARPAbet symbols to ensure uniqueness (some are the same as uppercase letters): _arpabet = ['@' + s for s in _phonemes_sorted] # Export all symbols: @@ -45,4 +45,4 @@ if __name__ == '__main__': print(" > TTS symbols {}".format(len(symbols))) print(symbols) print(" > TTS phonemes {}".format(len(phonemes))) - print(phonemes) + print(''.join(sorted(phonemes))) From 1649ad343117ebac7778877ee7953d3e3963209a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Eren=20G=C3=B6lge?= Date: Thu, 11 Feb 2021 15:27:20 +0000 Subject: [PATCH 055/100] save_wav with a custom sampling rate --- TTS/bin/synthesize.py | 2 +- TTS/utils/audio.py | 5 +++-- 2 files changed, 4 insertions(+), 3 deletions(-) diff --git a/TTS/bin/synthesize.py b/TTS/bin/synthesize.py index 059cef23..382a4fc6 100755 --- a/TTS/bin/synthesize.py +++ b/TTS/bin/synthesize.py @@ -219,7 +219,7 @@ def main(): str.maketrans('', '', string.punctuation.replace('_', ''))) + '.wav' out_path = os.path.join(args.out_path, file_name) print(" > Saving output to {}".format(out_path)) - synthesizer.save_wav(wav, out_path) + synthesizer.save_wav(wav, out_path,) if __name__ == "__main__": diff --git a/TTS/utils/audio.py b/TTS/utils/audio.py index 3d31ce6e..af613ba3 100644 --- a/TTS/utils/audio.py +++ b/TTS/utils/audio.py @@ -342,9 +342,10 @@ class AudioProcessor(object): x = self.sound_norm(x) return x - def save_wav(self, wav, path): + def save_wav(self, wav, path, sample_rate=None): + sample_rate = self.sample_rate if sample_rate is None else sample_rate wav_norm = wav * (32767 / max(0.01, np.max(np.abs(wav)))) - scipy.io.wavfile.write(path, self.sample_rate, wav_norm.astype(np.int16)) + scipy.io.wavfile.write(path, sample_rate, wav_norm.astype(np.int16)) @staticmethod def mulaw_encode(wav, qc): From 0c52d27d65ca0c72f42247a02e7e0436935e3249 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Eren=20G=C3=B6lge?= Date: Thu, 11 Feb 2021 15:27:41 +0000 Subject: [PATCH 056/100] return the json entry of the downloaded model --- TTS/utils/manage.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/TTS/utils/manage.py b/TTS/utils/manage.py index 72894890..97cdf2b6 100644 --- a/TTS/utils/manage.py +++ b/TTS/utils/manage.py @@ -116,7 +116,7 @@ class ModelManager(object): config["audio"]['stats_path'] = output_stats_path with open(config_path, "w") as jf: json.dump(config, jf) - return output_model_path, output_config_path + return output_model_path, output_config_path, model_item def _download_gdrive_file(self, gdrive_idx, output): """Download files from GDrive using their file ids""" From 8aa6a0decb588c45d6083094371a3c1c019ee347 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Eren=20G=C3=B6lge?= Date: Thu, 11 Feb 2021 15:28:07 +0000 Subject: [PATCH 057/100] set an output_sample_rate in synthesizer and use it for writing the wav file --- TTS/utils/synthesizer.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/TTS/utils/synthesizer.py b/TTS/utils/synthesizer.py index 85e116cf..2a779e53 100644 --- a/TTS/utils/synthesizer.py +++ b/TTS/utils/synthesizer.py @@ -45,8 +45,10 @@ class Synthesizer(object): assert torch.cuda.is_available(), "CUDA is not availabe on this machine." 
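# ----------------------------------------------------------------------
# [Editor's note] PATCH 055/057 around this point cooperate: Synthesizer
# keeps the tts model's sample rate unless a loaded vocoder config
# overrides it, and AudioProcessor.save_wav() now accepts that rate
# explicitly. A self-contained sketch of the same wav-writing path,
# mirroring the body shown in the diff (the demo signal is made up):
import numpy as np
import scipy.io.wavfile

def save_wav(wav, path, sample_rate=22050):
    # peak-normalise into int16 range, guarding against all-zero audio
    wav_norm = wav * (32767 / max(0.01, np.max(np.abs(wav))))
    scipy.io.wavfile.write(path, sample_rate, wav_norm.astype(np.int16))

save_wav(np.sin(np.linspace(0, 2 * np.pi * 440, 22050)), "/tmp/demo.wav")
# ----------------------------------------------------------------------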
self.load_tts(tts_checkpoint, tts_config, use_cuda) + self.output_sample_rate = self.tts_config.audio['sample_rate'] if vocoder_checkpoint: self.load_vocoder(vocoder_checkpoint, vocoder_config, use_cuda) + self.output_sample_rate = self.vocoder_config.audio['sample_rate'] @staticmethod def get_segmenter(lang): @@ -75,6 +77,7 @@ class Synthesizer(object): def load_tts(self, tts_checkpoint, tts_config, use_cuda): # pylint: disable=global-statement + global symbols, phonemes self.tts_config = load_config(tts_config) @@ -104,7 +107,7 @@ class Synthesizer(object): def save_wav(self, wav, path): wav = np.array(wav) - self.ap.save_wav(wav, path) + self.ap.save_wav(wav, path, self.output_sample_rate) def split_into_sentences(self, text): return self.seg.segment(text) From a1e595790d32b26677d10d5b0ac4b5f1d18df2b7 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Eren=20G=C3=B6lge?= Date: Thu, 11 Feb 2021 15:31:39 +0000 Subject: [PATCH 058/100] use default vocoders in server.pu --- TTS/server/server.py | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/TTS/server/server.py b/TTS/server/server.py index da9c8079..7cf98394 100644 --- a/TTS/server/server.py +++ b/TTS/server/server.py @@ -18,7 +18,7 @@ def create_argparser(): parser = argparse.ArgumentParser() parser.add_argument('--list_models', type=convert_boolean, nargs='?', const=True, default=False, help='list available pre-trained tts and vocoder models.') parser.add_argument('--model_name', type=str, default="tts_models/en/ljspeech/speedy-speech-wn", help='name of one of the released tts models.') - parser.add_argument('--vocoder_name', type=str, default="vocoder_models/en/ljspeech/multiband-melgan", help='name of one of the released vocoder models.') + parser.add_argument('--vocoder_name', type=str, default=None, help='name of one of the released vocoder models.') parser.add_argument('--tts_checkpoint', type=str, help='path to custom tts checkpoint file') parser.add_argument('--tts_config', type=str, help='path to custom tts config.json file') parser.add_argument('--tts_speakers', type=str, help='path to JSON file containing speaker ids, if speaker ids are used in the model') @@ -58,10 +58,11 @@ if args.list_models: # set models by the released models if args.model_name is not None: - tts_checkpoint_file, tts_config_file = manager.download_model(args.model_name) + tts_checkpoint_file, tts_config_file, tts_json_dict = manager.download_model(args.model_name) + args.vocoder_name = tts_json_dict['default_vocoder'] if args.vocoder_name is None else args.vocoder_name if args.vocoder_name is not None: - vocoder_checkpoint_file, vocoder_config_file = manager.download_model(args.vocoder_name) + vocoder_checkpoint_file, vocoder_config_file, vocoder_json_dict = manager.download_model(args.vocoder_name) # If these were not specified in the CLI args, use default values with embedded model files if not args.tts_checkpoint and os.path.isfile(tts_checkpoint_file): From af46727517e63147554753e6958b348fad5fe616 Mon Sep 17 00:00:00 2001 From: gerazov Date: Fri, 12 Feb 2021 02:12:00 +0100 Subject: [PATCH 059/100] loading last checkpoint/best_model works, deleting last best models options added, loading last best_loss added --- TTS/bin/train_glow_tts.py | 18 +- TTS/bin/train_speedy_speech.py | 14 +- TTS/bin/train_tacotron.py | 12 +- TTS/bin/train_vocoder_gan.py | 15 +- TTS/bin/train_vocoder_wavegrad.py | 12 +- TTS/bin/train_vocoder_wavernn.py | 14 +- TTS/tts/configs/config.json | 346 +++++++++--------- TTS/tts/configs/glow_tts_gated_conv.json | 2 + 
TTS/tts/configs/glow_tts_ljspeech.json | 2 + .../ljspeech_tacotron2_dynamic_conv_attn.json | 344 ++++++++--------- TTS/tts/configs/speedy_speech_ljspeech.json | 2 + TTS/utils/arguments.py | 73 ++-- .../multiband-melgan_and_rwd_config.json | 2 + .../configs/multiband_melgan_config.json | 2 + .../multiband_melgan_config_mozilla.json | 2 + .../configs/parallel_wavegan_config.json | 2 + .../configs/universal_fullband_melgan.json | 2 + TTS/vocoder/configs/wavegrad_libritts.json | 2 + TTS/vocoder/configs/wavernn_config.json | 2 + TTS/vocoder/utils/io.py | 30 +- 20 files changed, 507 insertions(+), 391 deletions(-) diff --git a/TTS/bin/train_glow_tts.py b/TTS/bin/train_glow_tts.py index 9db2381e..14a20149 100644 --- a/TTS/bin/train_glow_tts.py +++ b/TTS/bin/train_glow_tts.py @@ -538,8 +538,16 @@ def main(args): # pylint: disable=redefined-outer-name num_params = count_parameters(model) print("\n > Model has {} parameters".format(num_params), flush=True) - if 'best_loss' not in locals(): + if args.restore_step == 0 or not args.best_path: best_loss = float('inf') + print(" > Starting with inf best loss.") + else: + print(args.best_path) + best_loss = torch.load(args.best_path, + map_location='cpu')['model_loss'] + print(f" > Starting with loaded last best loss {best_loss}.") + keep_best = c.get('keep_best', False) + keep_after = c.get('keep_after', 10000) # void if keep_best False # define dataloaders train_loader = setup_loader(ap, 1, is_val=False, verbose=True) @@ -549,7 +557,8 @@ def main(args): # pylint: disable=redefined-outer-name model = data_depended_init(train_loader, model) for epoch in range(0, c.epochs): c_logger.print_epoch_start(epoch, c.epochs) - train_avg_loss_dict, global_step = train(train_loader, model, criterion, optimizer, + train_avg_loss_dict, global_step = train(train_loader, model, + criterion, optimizer, scheduler, ap, global_step, epoch) eval_avg_loss_dict = evaluate(eval_loader, model, criterion, ap, @@ -558,8 +567,9 @@ def main(args): # pylint: disable=redefined-outer-name target_loss = train_avg_loss_dict['avg_loss'] if c.run_eval: target_loss = eval_avg_loss_dict['avg_loss'] - best_loss = save_best_model(target_loss, best_loss, model, optimizer, global_step, epoch, c.r, - OUT_PATH) + best_loss = save_best_model(target_loss, best_loss, model, optimizer, + global_step, epoch, c.r, OUT_PATH, + keep_best=keep_best, keep_after=keep_after) if __name__ == '__main__': diff --git a/TTS/bin/train_speedy_speech.py b/TTS/bin/train_speedy_speech.py index a9a83bbf..4e521451 100644 --- a/TTS/bin/train_speedy_speech.py +++ b/TTS/bin/train_speedy_speech.py @@ -502,8 +502,16 @@ def main(args): # pylint: disable=redefined-outer-name num_params = count_parameters(model) print("\n > Model has {} parameters".format(num_params), flush=True) - if 'best_loss' not in locals(): + if args.restore_step == 0 or not args.best_path: best_loss = float('inf') + print(" > Starting with inf best loss.") + else: + print(args.best_path) + best_loss = torch.load(args.best_path, + map_location='cpu')['model_loss'] + print(f" > Starting with loaded last best loss {best_loss}.") + keep_best = c.get('keep_best', False) + keep_after = c.get('keep_after', 10000) # void if keep_best False # define dataloaders train_loader = setup_loader(ap, 1, is_val=False, verbose=True) @@ -522,8 +530,8 @@ def main(args): # pylint: disable=redefined-outer-name if c.run_eval: target_loss = eval_avg_loss_dict['avg_loss'] best_loss = save_best_model(target_loss, best_loss, model, optimizer, - global_step, epoch, c.r, - OUT_PATH) + 
global_step, epoch, c.r, OUT_PATH, + keep_best=keep_best, keep_after=keep_after) if __name__ == '__main__': diff --git a/TTS/bin/train_tacotron.py b/TTS/bin/train_tacotron.py index 0a53f2a1..cdc68c94 100644 --- a/TTS/bin/train_tacotron.py +++ b/TTS/bin/train_tacotron.py @@ -581,8 +581,16 @@ def main(args): # pylint: disable=redefined-outer-name num_params = count_parameters(model) print("\n > Model has {} parameters".format(num_params), flush=True) - if 'best_loss' not in locals(): + if args.restore_step == 0 or not args.best_path: best_loss = float('inf') + print(" > Starting with inf best loss.") + else: + print(args.best_path) + best_loss = torch.load(args.best_path, + map_location='cpu')['model_loss'] + print(f" > Starting with loaded last best loss {best_loss}.") + keep_best = c.get('keep_best', False) + keep_after = c.get('keep_after', 10000) # void if keep_best False # define data loaders train_loader = setup_loader(ap, @@ -634,6 +642,8 @@ def main(args): # pylint: disable=redefined-outer-name epoch, c.r, OUT_PATH, + keep_best=keep_best, + keep_after=keep_after, scaler=scaler.state_dict() if c.mixed_precision else None ) diff --git a/TTS/bin/train_vocoder_gan.py b/TTS/bin/train_vocoder_gan.py index 1f2beb70..ecc33288 100644 --- a/TTS/bin/train_vocoder_gan.py +++ b/TTS/bin/train_vocoder_gan.py @@ -545,8 +545,16 @@ def main(args): # pylint: disable=redefined-outer-name num_params = count_parameters(model_disc) print(" > Discriminator has {} parameters".format(num_params), flush=True) - if 'best_loss' not in locals(): + if args.restore_step == 0 or not args.best_path: best_loss = float('inf') + print(" > Starting with inf best loss.") + else: + print(args.best_path) + best_loss = torch.load(args.best_path, + map_location='cpu')['model_loss'] + print(f" > Starting with loaded last best loss {best_loss}.") + keep_best = c.get('keep_best', False) + keep_after = c.get('keep_after', 10000) # void if keep_best False global_step = args.restore_step for epoch in range(0, c.epochs): @@ -571,7 +579,10 @@ def main(args): # pylint: disable=redefined-outer-name global_step, epoch, OUT_PATH, - model_losses=eval_avg_loss_dict) + keep_best=keep_best, + keep_after=keep_after, + model_losses=eval_avg_loss_dict, + ) if __name__ == '__main__': diff --git a/TTS/bin/train_vocoder_wavegrad.py b/TTS/bin/train_vocoder_wavegrad.py index d8dc88e1..7846aae5 100644 --- a/TTS/bin/train_vocoder_wavegrad.py +++ b/TTS/bin/train_vocoder_wavegrad.py @@ -393,8 +393,16 @@ def main(args): # pylint: disable=redefined-outer-name num_params = count_parameters(model) print(" > WaveGrad has {} parameters".format(num_params), flush=True) - if 'best_loss' not in locals(): + if args.restore_step == 0 or not args.best_path: best_loss = float('inf') + print(" > Starting with inf best loss.") + else: + print(args.best_path) + best_loss = torch.load(args.best_path, + map_location='cpu')['model_loss'] + print(f" > Starting with loaded last best loss {best_loss}.") + keep_best = c.get('keep_best', False) + keep_after = c.get('keep_after', 10000) # void if keep_best False global_step = args.restore_step for epoch in range(0, c.epochs): @@ -416,6 +424,8 @@ def main(args): # pylint: disable=redefined-outer-name global_step, epoch, OUT_PATH, + keep_best=keep_best, + keep_after=keep_after, model_losses=eval_avg_loss_dict, scaler=scaler.state_dict() if c.mixed_precision else None ) diff --git a/TTS/bin/train_vocoder_wavernn.py b/TTS/bin/train_vocoder_wavernn.py index b4ffe143..44ffef14 100644 --- a/TTS/bin/train_vocoder_wavernn.py +++ 
b/TTS/bin/train_vocoder_wavernn.py @@ -416,8 +416,16 @@ def main(args): # pylint: disable=redefined-outer-name num_parameters = count_parameters(model_wavernn) print(" > Model has {} parameters".format(num_parameters), flush=True) - if "best_loss" not in locals(): - best_loss = float("inf") + if args.restore_step == 0 or not args.best_path: + best_loss = float('inf') + print(" > Starting with inf best loss.") + else: + print(args.best_path) + best_loss = torch.load(args.best_path, + map_location='cpu')['model_loss'] + print(f" > Starting with loaded last best loss {best_loss}.") + keep_best = c.get('keep_best', False) + keep_after = c.get('keep_after', 10000) # void if keep_best False global_step = args.restore_step for epoch in range(0, c.epochs): @@ -440,6 +448,8 @@ def main(args): # pylint: disable=redefined-outer-name global_step, epoch, OUT_PATH, + keep_best=keep_best, + keep_after=keep_after, model_losses=eval_avg_loss_dict, scaler=scaler.state_dict() if c.mixed_precision else None ) diff --git a/TTS/tts/configs/config.json b/TTS/tts/configs/config.json index 48f20e8f..5bd249d9 100644 --- a/TTS/tts/configs/config.json +++ b/TTS/tts/configs/config.json @@ -1,172 +1,174 @@ -{ - "model": "Tacotron2", - "run_name": "ljspeech-ddc", - "run_description": "tacotron2 with DDC and differential spectral loss.", - - // AUDIO PARAMETERS - "audio":{ - // stft parameters - "fft_size": 1024, // number of stft frequency levels. Size of the linear spectogram frame. - "win_length": 1024, // stft window length in ms. - "hop_length": 256, // stft window hop-lengh in ms. - "frame_length_ms": null, // stft window length in ms.If null, 'win_length' is used. - "frame_shift_ms": null, // stft window hop-lengh in ms. If null, 'hop_length' is used. - - // Audio processing parameters - "sample_rate": 22050, // DATASET-RELATED: wav sample-rate. - "preemphasis": 0.0, // pre-emphasis to reduce spec noise and make it more structured. If 0.0, no -pre-emphasis. - "ref_level_db": 20, // reference level db, theoretically 20db is the sound of air. - - // Silence trimming - "do_trim_silence": true,// enable trimming of slience of audio as you load it. LJspeech (true), TWEB (false), Nancy (true) - "trim_db": 60, // threshold for timming silence. Set this according to your dataset. - - // Griffin-Lim - "power": 1.5, // value to sharpen wav signals after GL algorithm. - "griffin_lim_iters": 60,// #griffin-lim iterations. 30-60 is a good range. Larger the value, slower the generation. - - // MelSpectrogram parameters - "num_mels": 80, // size of the mel spec frame. - "mel_fmin": 50.0, // minimum freq level for mel-spec. ~50 for male and ~95 for female voices. Tune for dataset!! - "mel_fmax": 7600.0, // maximum freq level for mel-spec. Tune for dataset!! - "spec_gain": 1, - - // Normalization parameters - "signal_norm": true, // normalize spec values. Mean-Var normalization if 'stats_path' is defined otherwise range normalization defined by the other params. - "min_level_db": -100, // lower bound for normalization - "symmetric_norm": true, // move normalization to range [-1, 1] - "max_norm": 4.0, // scale normalization to range [-max_norm, max_norm] or [0, max_norm] - "clip_norm": true, // clip normalized values into the range. - "stats_path": "/home/erogol/Data/LJSpeech-1.1/scale_stats.npy" // DO NOT USE WITH MULTI_SPEAKER MODEL. scaler stats file computed by 'compute_statistics.py'. 
If it is defined, mean-std based notmalization is used and other normalization params are ignored - }, - - // VOCABULARY PARAMETERS - // if custom character set is not defined, - // default set in symbols.py is used - // "characters":{ - // "pad": "_", - // "eos": "~", - // "bos": "^", - // "characters": "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz!'(),-.:;? ", - // "punctuations":"!'(),-.:;? ", - // "phonemes":"iyɨʉɯuɪʏʊeøɘəɵɤoɛœɜɞʌɔæɐaɶɑɒᵻʘɓǀɗǃʄǂɠǁʛpbtdʈɖcɟkɡqɢʔɴŋɲɳnɱmʙrʀⱱɾɽɸβfvθðszʃʒʂʐçʝxɣχʁħʕhɦɬɮʋɹɻjɰlɭʎʟˈˌːˑʍwɥʜʢʡɕʑɺɧɚ˞ɫ" - // }, - - // DISTRIBUTED TRAINING - "distributed":{ - "backend": "nccl", - "url": "tcp:\/\/localhost:54321" - }, - - "reinit_layers": [], // give a list of layer names to restore from the given checkpoint. If not defined, it reloads all heuristically matching layers. - - // TRAINING - "batch_size": 32, // Batch size for training. Lower values than 32 might cause hard to learn attention. It is overwritten by 'gradual_training'. - "eval_batch_size":16, - "r": 7, // Number of decoder frames to predict per iteration. Set the initial values if gradual training is enabled. - "gradual_training": [[0, 7, 64], [1, 5, 64], [50000, 3, 32], [130000, 2, 32], [290000, 1, 32]], //set gradual training steps [first_step, r, batch_size]. If it is null, gradual training is disabled. For Tacotron, you might need to reduce the 'batch_size' as you proceeed. - "mixed_precision": true, // level of optimization with NVIDIA's apex feature for automatic mixed FP16/FP32 precision (AMP), NOTE: currently only O1 is supported, and use "O1" to activate. - - // LOSS SETTINGS - "loss_masking": true, // enable / disable loss masking against the sequence padding. - "decoder_loss_alpha": 0.5, // original decoder loss weight. If > 0, it is enabled - "postnet_loss_alpha": 0.25, // original postnet loss weight. If > 0, it is enabled - "postnet_diff_spec_alpha": 0.25, // differential spectral loss weight. If > 0, it is enabled - "decoder_diff_spec_alpha": 0.25, // differential spectral loss weight. If > 0, it is enabled - "decoder_ssim_alpha": 0.5, // decoder ssim loss weight. If > 0, it is enabled - "postnet_ssim_alpha": 0.25, // postnet ssim loss weight. If > 0, it is enabled - "ga_alpha": 5.0, // weight for guided attention loss. If > 0, guided attention is enabled. - "stopnet_pos_weight": 15.0, // pos class weight for stopnet loss since there are way more negative samples than positive samples. - - - // VALIDATION - "run_eval": true, - "test_delay_epochs": 10, //Until attention is aligned, testing only wastes computation time. - "test_sentences_file": null, // set a file to load sentences to be used for testing. If it is null then we use default english sentences. - - // OPTIMIZER - "noam_schedule": false, // use noam warmup and lr schedule. - "grad_clip": 1.0, // upper limit for gradients for clipping. - "epochs": 1000, // total number of epochs to train. - "lr": 0.0001, // Initial learning rate. If Noam decay is active, maximum learning rate. - "wd": 0.000001, // Weight decay weight. - "warmup_steps": 4000, // Noam decay steps to increase the learning rate from 0 to "lr" - "seq_len_norm": false, // Normalize eash sample loss with its length to alleviate imbalanced datasets. Use it if your dataset is small or has skewed distribution of sequence lengths. - - // TACOTRON PRENET - "memory_size": -1, // ONLY TACOTRON - size of the memory queue used fro storing last decoder predictions for auto-regression. If < 0, memory queue is disabled and decoder only uses the last prediction frame. 
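
Aside from the config file rewrite, the core of patch 059 is one block
repeated across all six training scripts: resume `best_loss` from the saved
best model instead of restarting from infinity. Distilled into a standalone
helper (a sketch; the scripts inline this logic):

    import torch

    def load_best_loss(restore_step, best_path):
        """Resume the best eval loss from a saved best_model checkpoint.

        Falls back to +inf for fresh runs (restore_step == 0) or when no
        best_model file exists in the continue path.
        """
        if restore_step == 0 or not best_path:
            print(" > Starting with inf best loss.")
            return float('inf')
        best_loss = torch.load(best_path, map_location='cpu')['model_loss']
        print(f" > Starting with loaded last best loss {best_loss}.")
        return best_loss
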
- "prenet_type": "original", // "original" or "bn". - "prenet_dropout": false, // enable/disable dropout at prenet. - - // TACOTRON ATTENTION - "attention_type": "original", // 'original' , 'graves', 'dynamic_convolution' - "attention_heads": 4, // number of attention heads (only for 'graves') - "attention_norm": "sigmoid", // softmax or sigmoid. - "windowing": false, // Enables attention windowing. Used only in eval mode. - "use_forward_attn": false, // if it uses forward attention. In general, it aligns faster. - "forward_attn_mask": false, // Additional masking forcing monotonicity only in eval mode. - "transition_agent": false, // enable/disable transition agent of forward attention. - "location_attn": true, // enable_disable location sensitive attention. It is enabled for TACOTRON by default. - "bidirectional_decoder": false, // use https://arxiv.org/abs/1907.09006. Use it, if attention does not work well with your dataset. - "double_decoder_consistency": true, // use DDC explained here https://erogol.com/solving-attention-problems-of-tts-models-with-double-decoder-consistency-draft/ - "ddc_r": 7, // reduction rate for coarse decoder. - - // STOPNET - "stopnet": true, // Train stopnet predicting the end of synthesis. - "separate_stopnet": true, // Train stopnet seperately if 'stopnet==true'. It prevents stopnet loss to influence the rest of the model. It causes a better model, but it trains SLOWER. - - // TENSORBOARD and LOGGING - "print_step": 25, // Number of steps to log training on console. - "tb_plot_step": 100, // Number of steps to plot TB training figures. - "print_eval": false, // If True, it prints intermediate loss values in evalulation. - "save_step": 10000, // Number of training steps expected to save traninpg stats and checkpoints. - "checkpoint": true, // If true, it saves checkpoints per "save_step" - "tb_model_param_stats": false, // true, plots param stats per layer on tensorboard. Might be memory consuming, but good for debugging. - - // DATA LOADING - "text_cleaner": "phoneme_cleaners", - "enable_eos_bos_chars": false, // enable/disable beginning of sentence and end of sentence chars. - "num_loader_workers": 4, // number of training data loader processes. Don't set it too big. 4-8 are good values. - "num_val_loader_workers": 4, // number of evaluation data loader processes. - "batch_group_size": 4, //Number of batches to shuffle after bucketing. - "min_seq_len": 6, // DATASET-RELATED: minimum text length to use in training - "max_seq_len": 153, // DATASET-RELATED: maximum text length - "compute_input_seq_cache": false, // if true, text sequences are computed before starting training. If phonemes are enabled, they are also computed at this stage. - "use_noise_augment": true, - - // PATHS - "output_path": "/home/erogol/Models/LJSpeech/", - - // PHONEMES - "phoneme_cache_path": "/home/erogol/Models/phoneme_cache/", // phoneme computation is slow, therefore, it caches results in the given folder. - "use_phonemes": true, // use phonemes instead of raw characters. It is suggested for better pronounciation. - "phoneme_language": "en-us", // depending on your target language, pick one from https://github.com/bootphon/phonemizer#languages - - // MULTI-SPEAKER and GST - "use_speaker_embedding": false, // use speaker embedding to enable multi-speaker learning. 
- "use_gst": false, // use global style tokens - "use_external_speaker_embedding_file": false, // if true, forces the model to use external embedding per sample instead of nn.embeddings, that is, it supports external embeddings such as those used at: https://arxiv.org/abs /1806.04558 - "external_speaker_embedding_file": "../../speakers-vctk-en.json", // if not null and use_external_speaker_embedding_file is true, it is used to load a specific embedding file and thus uses these embeddings instead of nn.embeddings, that is, it supports external embeddings such as those used at: https://arxiv.org/abs /1806.04558 - "gst": { // gst parameter if gst is enabled - "gst_style_input": null, // Condition the style input either on a - // -> wave file [path to wave] or - // -> dictionary using the style tokens {'token1': 'value', 'token2': 'value'} example {"0": 0.15, "1": 0.15, "5": -0.15} - // with the dictionary being len(dict) <= len(gst_style_tokens). - "gst_embedding_dim": 512, - "gst_num_heads": 4, - "gst_style_tokens": 10, - "gst_use_speaker_embedding": false - }, - - // DATASETS - "datasets": // List of datasets. They all merged and they get different speaker_ids. - [ - { - "name": "ljspeech", - "path": "/home/erogol/Data/LJSpeech-1.1/", - "meta_file_train": "metadata.csv", // for vtck if list, ignore speakers id in list for train, its useful for test cloning with new speakers - "meta_file_val": null - } - ] -} - +{ + "model": "Tacotron2", + "run_name": "ljspeech-ddc", + "run_description": "tacotron2 with DDC and differential spectral loss.", + + // AUDIO PARAMETERS + "audio":{ + // stft parameters + "fft_size": 1024, // number of stft frequency levels. Size of the linear spectogram frame. + "win_length": 1024, // stft window length in ms. + "hop_length": 256, // stft window hop-lengh in ms. + "frame_length_ms": null, // stft window length in ms.If null, 'win_length' is used. + "frame_shift_ms": null, // stft window hop-lengh in ms. If null, 'hop_length' is used. + + // Audio processing parameters + "sample_rate": 22050, // DATASET-RELATED: wav sample-rate. + "preemphasis": 0.0, // pre-emphasis to reduce spec noise and make it more structured. If 0.0, no -pre-emphasis. + "ref_level_db": 20, // reference level db, theoretically 20db is the sound of air. + + // Silence trimming + "do_trim_silence": true,// enable trimming of slience of audio as you load it. LJspeech (true), TWEB (false), Nancy (true) + "trim_db": 60, // threshold for timming silence. Set this according to your dataset. + + // Griffin-Lim + "power": 1.5, // value to sharpen wav signals after GL algorithm. + "griffin_lim_iters": 60,// #griffin-lim iterations. 30-60 is a good range. Larger the value, slower the generation. + + // MelSpectrogram parameters + "num_mels": 80, // size of the mel spec frame. + "mel_fmin": 50.0, // minimum freq level for mel-spec. ~50 for male and ~95 for female voices. Tune for dataset!! + "mel_fmax": 7600.0, // maximum freq level for mel-spec. Tune for dataset!! + "spec_gain": 1, + + // Normalization parameters + "signal_norm": true, // normalize spec values. Mean-Var normalization if 'stats_path' is defined otherwise range normalization defined by the other params. + "min_level_db": -100, // lower bound for normalization + "symmetric_norm": true, // move normalization to range [-1, 1] + "max_norm": 4.0, // scale normalization to range [-max_norm, max_norm] or [0, max_norm] + "clip_norm": true, // clip normalized values into the range. 
+ "stats_path": "/home/erogol/Data/LJSpeech-1.1/scale_stats.npy" // DO NOT USE WITH MULTI_SPEAKER MODEL. scaler stats file computed by 'compute_statistics.py'. If it is defined, mean-std based notmalization is used and other normalization params are ignored + }, + + // VOCABULARY PARAMETERS + // if custom character set is not defined, + // default set in symbols.py is used + // "characters":{ + // "pad": "_", + // "eos": "~", + // "bos": "^", + // "characters": "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz!'(),-.:;? ", + // "punctuations":"!'(),-.:;? ", + // "phonemes":"iyɨʉɯuɪʏʊeøɘəɵɤoɛœɜɞʌɔæɐaɶɑɒᵻʘɓǀɗǃʄǂɠǁʛpbtdʈɖcɟkɡqɢʔɴŋɲɳnɱmʙrʀⱱɾɽɸβfvθðszʃʒʂʐçʝxɣχʁħʕhɦɬɮʋɹɻjɰlɭʎʟˈˌːˑʍwɥʜʢʡɕʑɺɧɚ˞ɫ" + // }, + + // DISTRIBUTED TRAINING + "distributed":{ + "backend": "nccl", + "url": "tcp:\/\/localhost:54321" + }, + + "reinit_layers": [], // give a list of layer names to restore from the given checkpoint. If not defined, it reloads all heuristically matching layers. + + // TRAINING + "batch_size": 32, // Batch size for training. Lower values than 32 might cause hard to learn attention. It is overwritten by 'gradual_training'. + "eval_batch_size":16, + "r": 7, // Number of decoder frames to predict per iteration. Set the initial values if gradual training is enabled. + "gradual_training": [[0, 7, 64], [1, 5, 64], [50000, 3, 32], [130000, 2, 32], [290000, 1, 32]], //set gradual training steps [first_step, r, batch_size]. If it is null, gradual training is disabled. For Tacotron, you might need to reduce the 'batch_size' as you proceeed. + "mixed_precision": true, // level of optimization with NVIDIA's apex feature for automatic mixed FP16/FP32 precision (AMP), NOTE: currently only O1 is supported, and use "O1" to activate. + + // LOSS SETTINGS + "loss_masking": true, // enable / disable loss masking against the sequence padding. + "decoder_loss_alpha": 0.5, // original decoder loss weight. If > 0, it is enabled + "postnet_loss_alpha": 0.25, // original postnet loss weight. If > 0, it is enabled + "postnet_diff_spec_alpha": 0.25, // differential spectral loss weight. If > 0, it is enabled + "decoder_diff_spec_alpha": 0.25, // differential spectral loss weight. If > 0, it is enabled + "decoder_ssim_alpha": 0.5, // decoder ssim loss weight. If > 0, it is enabled + "postnet_ssim_alpha": 0.25, // postnet ssim loss weight. If > 0, it is enabled + "ga_alpha": 5.0, // weight for guided attention loss. If > 0, guided attention is enabled. + "stopnet_pos_weight": 15.0, // pos class weight for stopnet loss since there are way more negative samples than positive samples. + + + // VALIDATION + "run_eval": true, + "test_delay_epochs": 10, //Until attention is aligned, testing only wastes computation time. + "test_sentences_file": null, // set a file to load sentences to be used for testing. If it is null then we use default english sentences. + + // OPTIMIZER + "noam_schedule": false, // use noam warmup and lr schedule. + "grad_clip": 1.0, // upper limit for gradients for clipping. + "epochs": 1000, // total number of epochs to train. + "lr": 0.0001, // Initial learning rate. If Noam decay is active, maximum learning rate. + "wd": 0.000001, // Weight decay weight. + "warmup_steps": 4000, // Noam decay steps to increase the learning rate from 0 to "lr" + "seq_len_norm": false, // Normalize eash sample loss with its length to alleviate imbalanced datasets. Use it if your dataset is small or has skewed distribution of sequence lengths. 
+ + // TACOTRON PRENET + "memory_size": -1, // ONLY TACOTRON - size of the memory queue used fro storing last decoder predictions for auto-regression. If < 0, memory queue is disabled and decoder only uses the last prediction frame. + "prenet_type": "original", // "original" or "bn". + "prenet_dropout": false, // enable/disable dropout at prenet. + + // TACOTRON ATTENTION + "attention_type": "original", // 'original' , 'graves', 'dynamic_convolution' + "attention_heads": 4, // number of attention heads (only for 'graves') + "attention_norm": "sigmoid", // softmax or sigmoid. + "windowing": false, // Enables attention windowing. Used only in eval mode. + "use_forward_attn": false, // if it uses forward attention. In general, it aligns faster. + "forward_attn_mask": false, // Additional masking forcing monotonicity only in eval mode. + "transition_agent": false, // enable/disable transition agent of forward attention. + "location_attn": true, // enable_disable location sensitive attention. It is enabled for TACOTRON by default. + "bidirectional_decoder": false, // use https://arxiv.org/abs/1907.09006. Use it, if attention does not work well with your dataset. + "double_decoder_consistency": true, // use DDC explained here https://erogol.com/solving-attention-problems-of-tts-models-with-double-decoder-consistency-draft/ + "ddc_r": 7, // reduction rate for coarse decoder. + + // STOPNET + "stopnet": true, // Train stopnet predicting the end of synthesis. + "separate_stopnet": true, // Train stopnet seperately if 'stopnet==true'. It prevents stopnet loss to influence the rest of the model. It causes a better model, but it trains SLOWER. + + // TENSORBOARD and LOGGING + "print_step": 25, // Number of steps to log training on console. + "tb_plot_step": 100, // Number of steps to plot TB training figures. + "print_eval": false, // If True, it prints intermediate loss values in evalulation. + "save_step": 10000, // Number of training steps expected to save traninpg stats and checkpoints. + "checkpoint": true, // If true, it saves checkpoints per "save_step" + "keep_best": false, // If true, keeps all best_models after keep_after steps + "keep_after": 10000, // Global step after which to keep best models if keep_best is true + "tb_model_param_stats": false, // true, plots param stats per layer on tensorboard. Might be memory consuming, but good for debugging. + + // DATA LOADING + "text_cleaner": "phoneme_cleaners", + "enable_eos_bos_chars": false, // enable/disable beginning of sentence and end of sentence chars. + "num_loader_workers": 4, // number of training data loader processes. Don't set it too big. 4-8 are good values. + "num_val_loader_workers": 4, // number of evaluation data loader processes. + "batch_group_size": 4, //Number of batches to shuffle after bucketing. + "min_seq_len": 6, // DATASET-RELATED: minimum text length to use in training + "max_seq_len": 153, // DATASET-RELATED: maximum text length + "compute_input_seq_cache": false, // if true, text sequences are computed before starting training. If phonemes are enabled, they are also computed at this stage. + "use_noise_augment": true, + + // PATHS + "output_path": "/home/erogol/Models/LJSpeech/", + + // PHONEMES + "phoneme_cache_path": "/home/erogol/Models/phoneme_cache/", // phoneme computation is slow, therefore, it caches results in the given folder. + "use_phonemes": true, // use phonemes instead of raw characters. It is suggested for better pronounciation. 
+ "phoneme_language": "en-us", // depending on your target language, pick one from https://github.com/bootphon/phonemizer#languages + + // MULTI-SPEAKER and GST + "use_speaker_embedding": false, // use speaker embedding to enable multi-speaker learning. + "use_gst": false, // use global style tokens + "use_external_speaker_embedding_file": false, // if true, forces the model to use external embedding per sample instead of nn.embeddings, that is, it supports external embeddings such as those used at: https://arxiv.org/abs /1806.04558 + "external_speaker_embedding_file": "../../speakers-vctk-en.json", // if not null and use_external_speaker_embedding_file is true, it is used to load a specific embedding file and thus uses these embeddings instead of nn.embeddings, that is, it supports external embeddings such as those used at: https://arxiv.org/abs /1806.04558 + "gst": { // gst parameter if gst is enabled + "gst_style_input": null, // Condition the style input either on a + // -> wave file [path to wave] or + // -> dictionary using the style tokens {'token1': 'value', 'token2': 'value'} example {"0": 0.15, "1": 0.15, "5": -0.15} + // with the dictionary being len(dict) <= len(gst_style_tokens). + "gst_embedding_dim": 512, + "gst_num_heads": 4, + "gst_style_tokens": 10, + "gst_use_speaker_embedding": false + }, + + // DATASETS + "datasets": // List of datasets. They all merged and they get different speaker_ids. + [ + { + "name": "ljspeech", + "path": "/home/erogol/Data/LJSpeech-1.1/", + "meta_file_train": "metadata.csv", // for vtck if list, ignore speakers id in list for train, its useful for test cloning with new speakers + "meta_file_val": null + } + ] +} + diff --git a/TTS/tts/configs/glow_tts_gated_conv.json b/TTS/tts/configs/glow_tts_gated_conv.json index d34fbaf0..865c6f29 100644 --- a/TTS/tts/configs/glow_tts_gated_conv.json +++ b/TTS/tts/configs/glow_tts_gated_conv.json @@ -93,6 +93,8 @@ "print_eval": false, // If True, it prints intermediate loss values in evalulation. "save_step": 5000, // Number of training steps expected to save traninpg stats and checkpoints. "checkpoint": true, // If true, it saves checkpoints per "save_step" + "keep_best": false, // If true, keeps all best_models after keep_after steps + "keep_after": 10000, // Global step after which to keep best models if keep_best is true "tb_model_param_stats": false, // true, plots param stats per layer on tensorboard. Might be memory consuming, but good for debugging. "apex_amp_level": null, diff --git a/TTS/tts/configs/glow_tts_ljspeech.json b/TTS/tts/configs/glow_tts_ljspeech.json index 636d9313..6e15de10 100644 --- a/TTS/tts/configs/glow_tts_ljspeech.json +++ b/TTS/tts/configs/glow_tts_ljspeech.json @@ -105,6 +105,8 @@ "print_eval": false, // If True, it prints intermediate loss values in evalulation. "save_step": 5000, // Number of training steps expected to save traninpg stats and checkpoints. "checkpoint": true, // If true, it saves checkpoints per "save_step" + "keep_best": false, // If true, keeps all best_models after keep_after steps + "keep_after": 10000, // Global step after which to keep best models if keep_best is true "tb_model_param_stats": false, // true, plots param stats per layer on tensorboard. Might be memory consuming, but good for debugging. 
// DATA LOADING diff --git a/TTS/tts/configs/ljspeech_tacotron2_dynamic_conv_attn.json b/TTS/tts/configs/ljspeech_tacotron2_dynamic_conv_attn.json index cd5ad8ab..3cf66870 100644 --- a/TTS/tts/configs/ljspeech_tacotron2_dynamic_conv_attn.json +++ b/TTS/tts/configs/ljspeech_tacotron2_dynamic_conv_attn.json @@ -1,171 +1,173 @@ -{ - "model": "Tacotron2", - "run_name": "ljspeech-dcattn", - "run_description": "tacotron2 with dynamic convolution attention.", - - // AUDIO PARAMETERS - "audio":{ - // stft parameters - "fft_size": 1024, // number of stft frequency levels. Size of the linear spectogram frame. - "win_length": 1024, // stft window length in ms. - "hop_length": 256, // stft window hop-lengh in ms. - "frame_length_ms": null, // stft window length in ms.If null, 'win_length' is used. - "frame_shift_ms": null, // stft window hop-lengh in ms. If null, 'hop_length' is used. - - // Audio processing parameters - "sample_rate": 22050, // DATASET-RELATED: wav sample-rate. - "preemphasis": 0.0, // pre-emphasis to reduce spec noise and make it more structured. If 0.0, no -pre-emphasis. - "ref_level_db": 20, // reference level db, theoretically 20db is the sound of air. - - // Silence trimming - "do_trim_silence": true,// enable trimming of slience of audio as you load it. LJspeech (true), TWEB (false), Nancy (true) - "trim_db": 60, // threshold for timming silence. Set this according to your dataset. - - // Griffin-Lim - "power": 1.5, // value to sharpen wav signals after GL algorithm. - "griffin_lim_iters": 60,// #griffin-lim iterations. 30-60 is a good range. Larger the value, slower the generation. - - // MelSpectrogram parameters - "num_mels": 80, // size of the mel spec frame. - "mel_fmin": 50.0, // minimum freq level for mel-spec. ~50 for male and ~95 for female voices. Tune for dataset!! - "mel_fmax": 7600.0, // maximum freq level for mel-spec. Tune for dataset!! - "spec_gain": 1, - - // Normalization parameters - "signal_norm": true, // normalize spec values. Mean-Var normalization if 'stats_path' is defined otherwise range normalization defined by the other params. - "min_level_db": -100, // lower bound for normalization - "symmetric_norm": true, // move normalization to range [-1, 1] - "max_norm": 4.0, // scale normalization to range [-max_norm, max_norm] or [0, max_norm] - "clip_norm": true, // clip normalized values into the range. - "stats_path": "/home/erogol/Data/LJSpeech-1.1/scale_stats.npy" // DO NOT USE WITH MULTI_SPEAKER MODEL. scaler stats file computed by 'compute_statistics.py'. If it is defined, mean-std based notmalization is used and other normalization params are ignored - }, - - // VOCABULARY PARAMETERS - // if custom character set is not defined, - // default set in symbols.py is used - // "characters":{ - // "pad": "_", - // "eos": "~", - // "bos": "^", - // "characters": "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz!'(),-.:;? ", - // "punctuations":"!'(),-.:;? ", - // "phonemes":"iyɨʉɯuɪʏʊeøɘəɵɤoɛœɜɞʌɔæɐaɶɑɒᵻʘɓǀɗǃʄǂɠǁʛpbtdʈɖcɟkɡqɢʔɴŋɲɳnɱmʙrʀⱱɾɽɸβfvθðszʃʒʂʐçʝxɣχʁħʕhɦɬɮʋɹɻjɰlɭʎʟˈˌːˑʍwɥʜʢʡɕʑɺɧɚ˞ɫ" - // }, - - // DISTRIBUTED TRAINING - "distributed":{ - "backend": "nccl", - "url": "tcp:\/\/localhost:54321" - }, - - "reinit_layers": [], // give a list of layer names to restore from the given checkpoint. If not defined, it reloads all heuristically matching layers. - - // TRAINING - "batch_size": 32, // Batch size for training. Lower values than 32 might cause hard to learn attention. It is overwritten by 'gradual_training'. 
- "eval_batch_size":16, - "r": 7, // Number of decoder frames to predict per iteration. Set the initial values if gradual training is enabled. - "gradual_training": [[0, 7, 64], [1, 5, 64], [50000, 3, 32], [130000, 2, 32], [290000, 1, 32]], //set gradual training steps [first_step, r, batch_size]. If it is null, gradual training is disabled. For Tacotron, you might need to reduce the 'batch_size' as you proceeed. - "mixed_precision": true, // level of optimization with NVIDIA's apex feature for automatic mixed FP16/FP32 precision (AMP), NOTE: currently only O1 is supported, and use "O1" to activate. - - // LOSS SETTINGS - "loss_masking": true, // enable / disable loss masking against the sequence padding. - "decoder_loss_alpha": 0.5, // original decoder loss weight. If > 0, it is enabled - "postnet_loss_alpha": 0.25, // original postnet loss weight. If > 0, it is enabled - "postnet_diff_spec_alpha": 0.25, // differential spectral loss weight. If > 0, it is enabled - "decoder_diff_spec_alpha": 0.25, // differential spectral loss weight. If > 0, it is enabled - "decoder_ssim_alpha": 0.5, // decoder ssim loss weight. If > 0, it is enabled - "postnet_ssim_alpha": 0.25, // postnet ssim loss weight. If > 0, it is enabled - "ga_alpha": 0.0, // weight for guided attention loss. If > 0, guided attention is enabled. - "stopnet_pos_weight": 15.0, // pos class weight for stopnet loss since there are way more negative samples than positive samples. - - - // VALIDATION - "run_eval": true, - "test_delay_epochs": 10, //Until attention is aligned, testing only wastes computation time. - "test_sentences_file": null, // set a file to load sentences to be used for testing. If it is null then we use default english sentences. - - // OPTIMIZER - "noam_schedule": false, // use noam warmup and lr schedule. - "grad_clip": 1.0, // upper limit for gradients for clipping. - "epochs": 1000, // total number of epochs to train. - "lr": 0.0001, // Initial learning rate. If Noam decay is active, maximum learning rate. - "wd": 0.000001, // Weight decay weight. - "warmup_steps": 4000, // Noam decay steps to increase the learning rate from 0 to "lr" - "seq_len_norm": false, // Normalize eash sample loss with its length to alleviate imbalanced datasets. Use it if your dataset is small or has skewed distribution of sequence lengths. - - // TACOTRON PRENET - "memory_size": -1, // ONLY TACOTRON - size of the memory queue used fro storing last decoder predictions for auto-regression. If < 0, memory queue is disabled and decoder only uses the last prediction frame. - "prenet_type": "original", // "original" or "bn". - "prenet_dropout": false, // enable/disable dropout at prenet. - - // TACOTRON ATTENTION - "attention_type": "dynamic_convolution", // 'original' , 'graves', 'dynamic_convolution' - "attention_heads": 4, // number of attention heads (only for 'graves') - "attention_norm": "softmax", // softmax or sigmoid. - "windowing": false, // Enables attention windowing. Used only in eval mode. - "use_forward_attn": false, // if it uses forward attention. In general, it aligns faster. - "forward_attn_mask": false, // Additional masking forcing monotonicity only in eval mode. - "transition_agent": false, // enable/disable transition agent of forward attention. - "location_attn": true, // enable_disable location sensitive attention. It is enabled for TACOTRON by default. - "bidirectional_decoder": false, // use https://arxiv.org/abs/1907.09006. Use it, if attention does not work well with your dataset. 
- "double_decoder_consistency": false, // use DDC explained here https://erogol.com/solving-attention-problems-of-tts-models-with-double-decoder-consistency-draft/ - "ddc_r": 7, // reduction rate for coarse decoder. - - // STOPNET - "stopnet": true, // Train stopnet predicting the end of synthesis. - "separate_stopnet": true, // Train stopnet seperately if 'stopnet==true'. It prevents stopnet loss to influence the rest of the model. It causes a better model, but it trains SLOWER. - - // TENSORBOARD and LOGGING - "print_step": 25, // Number of steps to log training on console. - "tb_plot_step": 100, // Number of steps to plot TB training figures. - "print_eval": false, // If True, it prints intermediate loss values in evalulation. - "save_step": 10000, // Number of training steps expected to save traninpg stats and checkpoints. - "checkpoint": true, // If true, it saves checkpoints per "save_step" - "tb_model_param_stats": false, // true, plots param stats per layer on tensorboard. Might be memory consuming, but good for debugging. - - // DATA LOADING - "text_cleaner": "phoneme_cleaners", - "enable_eos_bos_chars": false, // enable/disable beginning of sentence and end of sentence chars. - "num_loader_workers": 4, // number of training data loader processes. Don't set it too big. 4-8 are good values. - "num_val_loader_workers": 4, // number of evaluation data loader processes. - "batch_group_size": 4, //Number of batches to shuffle after bucketing. - "min_seq_len": 6, // DATASET-RELATED: minimum text length to use in training - "max_seq_len": 153, // DATASET-RELATED: maximum text length - "compute_input_seq_cache": false, // if true, text sequences are computed before starting training. If phonemes are enabled, they are also computed at this stage. - - // PATHS - "output_path": "/home/erogol/Models/LJSpeech/", - - // PHONEMES - "phoneme_cache_path": "/home/erogol/Models/phoneme_cache/", // phoneme computation is slow, therefore, it caches results in the given folder. - "use_phonemes": true, // use phonemes instead of raw characters. It is suggested for better pronounciation. - "phoneme_language": "en-us", // depending on your target language, pick one from https://github.com/bootphon/phonemizer#languages - - // MULTI-SPEAKER and GST - "use_speaker_embedding": false, // use speaker embedding to enable multi-speaker learning. - "use_gst": false, // use global style tokens - "use_external_speaker_embedding_file": false, // if true, forces the model to use external embedding per sample instead of nn.embeddings, that is, it supports external embeddings such as those used at: https://arxiv.org/abs /1806.04558 - "external_speaker_embedding_file": "../../speakers-vctk-en.json", // if not null and use_external_speaker_embedding_file is true, it is used to load a specific embedding file and thus uses these embeddings instead of nn.embeddings, that is, it supports external embeddings such as those used at: https://arxiv.org/abs /1806.04558 - "gst": { // gst parameter if gst is enabled - "gst_style_input": null, // Condition the style input either on a - // -> wave file [path to wave] or - // -> dictionary using the style tokens {'token1': 'value', 'token2': 'value'} example {"0": 0.15, "1": 0.15, "5": -0.15} - // with the dictionary being len(dict) <= len(gst_style_tokens). - "gst_embedding_dim": 512, - "gst_num_heads": 4, - "gst_style_tokens": 10, - "gst_use_speaker_embedding": false - }, - - // DATASETS - "datasets": // List of datasets. They all merged and they get different speaker_ids. 
- [ - { - "name": "ljspeech", - "path": "/home/erogol/Data/LJSpeech-1.1/", - "meta_file_train": "metadata.csv", // for vtck if list, ignore speakers id in list for train, its useful for test cloning with new speakers - "meta_file_val": null - } - ] -} - +{ + "model": "Tacotron2", + "run_name": "ljspeech-dcattn", + "run_description": "tacotron2 with dynamic convolution attention.", + + // AUDIO PARAMETERS + "audio":{ + // stft parameters + "fft_size": 1024, // number of stft frequency levels. Size of the linear spectogram frame. + "win_length": 1024, // stft window length in ms. + "hop_length": 256, // stft window hop-lengh in ms. + "frame_length_ms": null, // stft window length in ms.If null, 'win_length' is used. + "frame_shift_ms": null, // stft window hop-lengh in ms. If null, 'hop_length' is used. + + // Audio processing parameters + "sample_rate": 22050, // DATASET-RELATED: wav sample-rate. + "preemphasis": 0.0, // pre-emphasis to reduce spec noise and make it more structured. If 0.0, no -pre-emphasis. + "ref_level_db": 20, // reference level db, theoretically 20db is the sound of air. + + // Silence trimming + "do_trim_silence": true,// enable trimming of slience of audio as you load it. LJspeech (true), TWEB (false), Nancy (true) + "trim_db": 60, // threshold for timming silence. Set this according to your dataset. + + // Griffin-Lim + "power": 1.5, // value to sharpen wav signals after GL algorithm. + "griffin_lim_iters": 60,// #griffin-lim iterations. 30-60 is a good range. Larger the value, slower the generation. + + // MelSpectrogram parameters + "num_mels": 80, // size of the mel spec frame. + "mel_fmin": 50.0, // minimum freq level for mel-spec. ~50 for male and ~95 for female voices. Tune for dataset!! + "mel_fmax": 7600.0, // maximum freq level for mel-spec. Tune for dataset!! + "spec_gain": 1, + + // Normalization parameters + "signal_norm": true, // normalize spec values. Mean-Var normalization if 'stats_path' is defined otherwise range normalization defined by the other params. + "min_level_db": -100, // lower bound for normalization + "symmetric_norm": true, // move normalization to range [-1, 1] + "max_norm": 4.0, // scale normalization to range [-max_norm, max_norm] or [0, max_norm] + "clip_norm": true, // clip normalized values into the range. + "stats_path": "/home/erogol/Data/LJSpeech-1.1/scale_stats.npy" // DO NOT USE WITH MULTI_SPEAKER MODEL. scaler stats file computed by 'compute_statistics.py'. If it is defined, mean-std based notmalization is used and other normalization params are ignored + }, + + // VOCABULARY PARAMETERS + // if custom character set is not defined, + // default set in symbols.py is used + // "characters":{ + // "pad": "_", + // "eos": "~", + // "bos": "^", + // "characters": "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz!'(),-.:;? ", + // "punctuations":"!'(),-.:;? ", + // "phonemes":"iyɨʉɯuɪʏʊeøɘəɵɤoɛœɜɞʌɔæɐaɶɑɒᵻʘɓǀɗǃʄǂɠǁʛpbtdʈɖcɟkɡqɢʔɴŋɲɳnɱmʙrʀⱱɾɽɸβfvθðszʃʒʂʐçʝxɣχʁħʕhɦɬɮʋɹɻjɰlɭʎʟˈˌːˑʍwɥʜʢʡɕʑɺɧɚ˞ɫ" + // }, + + // DISTRIBUTED TRAINING + "distributed":{ + "backend": "nccl", + "url": "tcp:\/\/localhost:54321" + }, + + "reinit_layers": [], // give a list of layer names to restore from the given checkpoint. If not defined, it reloads all heuristically matching layers. + + // TRAINING + "batch_size": 32, // Batch size for training. Lower values than 32 might cause hard to learn attention. It is overwritten by 'gradual_training'. + "eval_batch_size":16, + "r": 7, // Number of decoder frames to predict per iteration. 
Set the initial values if gradual training is enabled. + "gradual_training": [[0, 7, 64], [1, 5, 64], [50000, 3, 32], [130000, 2, 32], [290000, 1, 32]], //set gradual training steps [first_step, r, batch_size]. If it is null, gradual training is disabled. For Tacotron, you might need to reduce the 'batch_size' as you proceeed. + "mixed_precision": true, // level of optimization with NVIDIA's apex feature for automatic mixed FP16/FP32 precision (AMP), NOTE: currently only O1 is supported, and use "O1" to activate. + + // LOSS SETTINGS + "loss_masking": true, // enable / disable loss masking against the sequence padding. + "decoder_loss_alpha": 0.5, // original decoder loss weight. If > 0, it is enabled + "postnet_loss_alpha": 0.25, // original postnet loss weight. If > 0, it is enabled + "postnet_diff_spec_alpha": 0.25, // differential spectral loss weight. If > 0, it is enabled + "decoder_diff_spec_alpha": 0.25, // differential spectral loss weight. If > 0, it is enabled + "decoder_ssim_alpha": 0.5, // decoder ssim loss weight. If > 0, it is enabled + "postnet_ssim_alpha": 0.25, // postnet ssim loss weight. If > 0, it is enabled + "ga_alpha": 0.0, // weight for guided attention loss. If > 0, guided attention is enabled. + "stopnet_pos_weight": 15.0, // pos class weight for stopnet loss since there are way more negative samples than positive samples. + + + // VALIDATION + "run_eval": true, + "test_delay_epochs": 10, //Until attention is aligned, testing only wastes computation time. + "test_sentences_file": null, // set a file to load sentences to be used for testing. If it is null then we use default english sentences. + + // OPTIMIZER + "noam_schedule": false, // use noam warmup and lr schedule. + "grad_clip": 1.0, // upper limit for gradients for clipping. + "epochs": 1000, // total number of epochs to train. + "lr": 0.0001, // Initial learning rate. If Noam decay is active, maximum learning rate. + "wd": 0.000001, // Weight decay weight. + "warmup_steps": 4000, // Noam decay steps to increase the learning rate from 0 to "lr" + "seq_len_norm": false, // Normalize eash sample loss with its length to alleviate imbalanced datasets. Use it if your dataset is small or has skewed distribution of sequence lengths. + + // TACOTRON PRENET + "memory_size": -1, // ONLY TACOTRON - size of the memory queue used fro storing last decoder predictions for auto-regression. If < 0, memory queue is disabled and decoder only uses the last prediction frame. + "prenet_type": "original", // "original" or "bn". + "prenet_dropout": false, // enable/disable dropout at prenet. + + // TACOTRON ATTENTION + "attention_type": "dynamic_convolution", // 'original' , 'graves', 'dynamic_convolution' + "attention_heads": 4, // number of attention heads (only for 'graves') + "attention_norm": "softmax", // softmax or sigmoid. + "windowing": false, // Enables attention windowing. Used only in eval mode. + "use_forward_attn": false, // if it uses forward attention. In general, it aligns faster. + "forward_attn_mask": false, // Additional masking forcing monotonicity only in eval mode. + "transition_agent": false, // enable/disable transition agent of forward attention. + "location_attn": true, // enable_disable location sensitive attention. It is enabled for TACOTRON by default. + "bidirectional_decoder": false, // use https://arxiv.org/abs/1907.09006. Use it, if attention does not work well with your dataset. 
+ "double_decoder_consistency": false, // use DDC explained here https://erogol.com/solving-attention-problems-of-tts-models-with-double-decoder-consistency-draft/ + "ddc_r": 7, // reduction rate for coarse decoder. + + // STOPNET + "stopnet": true, // Train stopnet predicting the end of synthesis. + "separate_stopnet": true, // Train stopnet seperately if 'stopnet==true'. It prevents stopnet loss to influence the rest of the model. It causes a better model, but it trains SLOWER. + + // TENSORBOARD and LOGGING + "print_step": 25, // Number of steps to log training on console. + "tb_plot_step": 100, // Number of steps to plot TB training figures. + "print_eval": false, // If True, it prints intermediate loss values in evalulation. + "save_step": 10000, // Number of training steps expected to save traninpg stats and checkpoints. + "checkpoint": true, // If true, it saves checkpoints per "save_step" + "keep_best": false, // If true, keeps all best_models after keep_after steps + "keep_after": 10000, // Global step after which to keep best models if keep_best is true + "tb_model_param_stats": false, // true, plots param stats per layer on tensorboard. Might be memory consuming, but good for debugging. + + // DATA LOADING + "text_cleaner": "phoneme_cleaners", + "enable_eos_bos_chars": false, // enable/disable beginning of sentence and end of sentence chars. + "num_loader_workers": 4, // number of training data loader processes. Don't set it too big. 4-8 are good values. + "num_val_loader_workers": 4, // number of evaluation data loader processes. + "batch_group_size": 4, //Number of batches to shuffle after bucketing. + "min_seq_len": 6, // DATASET-RELATED: minimum text length to use in training + "max_seq_len": 153, // DATASET-RELATED: maximum text length + "compute_input_seq_cache": false, // if true, text sequences are computed before starting training. If phonemes are enabled, they are also computed at this stage. + + // PATHS + "output_path": "/home/erogol/Models/LJSpeech/", + + // PHONEMES + "phoneme_cache_path": "/home/erogol/Models/phoneme_cache/", // phoneme computation is slow, therefore, it caches results in the given folder. + "use_phonemes": true, // use phonemes instead of raw characters. It is suggested for better pronounciation. + "phoneme_language": "en-us", // depending on your target language, pick one from https://github.com/bootphon/phonemizer#languages + + // MULTI-SPEAKER and GST + "use_speaker_embedding": false, // use speaker embedding to enable multi-speaker learning. + "use_gst": false, // use global style tokens + "use_external_speaker_embedding_file": false, // if true, forces the model to use external embedding per sample instead of nn.embeddings, that is, it supports external embeddings such as those used at: https://arxiv.org/abs /1806.04558 + "external_speaker_embedding_file": "../../speakers-vctk-en.json", // if not null and use_external_speaker_embedding_file is true, it is used to load a specific embedding file and thus uses these embeddings instead of nn.embeddings, that is, it supports external embeddings such as those used at: https://arxiv.org/abs /1806.04558 + "gst": { // gst parameter if gst is enabled + "gst_style_input": null, // Condition the style input either on a + // -> wave file [path to wave] or + // -> dictionary using the style tokens {'token1': 'value', 'token2': 'value'} example {"0": 0.15, "1": 0.15, "5": -0.15} + // with the dictionary being len(dict) <= len(gst_style_tokens). 
+ "gst_embedding_dim": 512, + "gst_num_heads": 4, + "gst_style_tokens": 10, + "gst_use_speaker_embedding": false + }, + + // DATASETS + "datasets": // List of datasets. They all merged and they get different speaker_ids. + [ + { + "name": "ljspeech", + "path": "/home/erogol/Data/LJSpeech-1.1/", + "meta_file_train": "metadata.csv", // for vtck if list, ignore speakers id in list for train, its useful for test cloning with new speakers + "meta_file_val": null + } + ] +} + diff --git a/TTS/tts/configs/speedy_speech_ljspeech.json b/TTS/tts/configs/speedy_speech_ljspeech.json index bd511470..9f1d3f8b 100644 --- a/TTS/tts/configs/speedy_speech_ljspeech.json +++ b/TTS/tts/configs/speedy_speech_ljspeech.json @@ -109,6 +109,8 @@ "print_eval": false, // If True, it prints intermediate loss values in evalulation. "save_step": 5000, // Number of training steps expected to save traninpg stats and checkpoints. "checkpoint": true, // If true, it saves checkpoints per "save_step" + "keep_best": false, // If true, keeps all best_models after keep_after steps + "keep_after": 10000, // Global step after which to keep best models if keep_best is true "tb_model_param_stats": false, // true, plots param stats per layer on tensorboard. Might be memory consuming, but good for debugging.:set n "mixed_precision": false, diff --git a/TTS/utils/arguments.py b/TTS/utils/arguments.py index 948c90d3..d05936dc 100644 --- a/TTS/utils/arguments.py +++ b/TTS/utils/arguments.py @@ -43,6 +43,11 @@ def parse_arguments(argv): type=str, help="Model file to be restored. Use to finetune a model.", default="") + parser.add_argument( + "--best_path", + type=str, + help="Best model file to be used for extracting best loss.", + default="") parser.add_argument( "--config_path", type=str, @@ -67,11 +72,11 @@ def parse_arguments(argv): return parser.parse_args() -def get_last_checkpoint(path): - """Get latest checkpoint from a list of filenames. +def get_last_models(path): + """Get latest checkpoint or/and best model in path. It is based on globbing for `*.pth.tar` and the RegEx - `checkpoint_([0-9]+)`. + `(checkpoint|best_model)_([0-9]+)`. Parameters ---------- @@ -81,7 +86,7 @@ def get_last_checkpoint(path): Raises ------ ValueError - If no checkpoint files are found. + If no checkpoint or best_model files are found. Returns ------- @@ -89,22 +94,37 @@ def get_last_checkpoint(path): Last checkpoint filename. 
""" - last_checkpoint_num = 0 - last_checkpoint = None - filenames = glob.glob( - os.path.join(path, "/*.pth.tar")) - for filename in filenames: - try: - checkpoint_num = int( - re.search(r"checkpoint_([0-9]+)", filename).groups()[0]) - if checkpoint_num > last_checkpoint_num: - last_checkpoint_num = checkpoint_num - last_checkpoint = filename - except AttributeError: # if there's no match in the filename - pass - if last_checkpoint is None: - raise ValueError(f"No checkpoints in {path}!") - return last_checkpoint + file_names = glob.glob(os.path.join(path, "*.pth.tar")) + last_models = {} + last_model_nums = {} + for key in ['checkpoint', 'best_model']: + last_model_num = 0 + last_model = None + for file_name in file_names: + try: + model_num = int(re.search( + f"{key}_([0-9]+)", file_name).groups()[0]) + if model_num > last_model_num: + last_model_num = model_num + last_model = file_name + except AttributeError: # if there's no match in the filename + continue + last_models[key] = last_model + last_model_nums[key] = last_model_num + + # check what models were found + if not last_models: + raise ValueError(f"No models found in continue path {path}!") + elif 'checkpoint' not in last_models: # no checkpoint just best model + last_models['checkpoint'] = last_models['best_model'] + elif 'best_model' not in last_models: # no best model + # this shouldn't happen, but let's handle it just in case + last_models['best_model'] = None + # finally check if last best model is more recent than checkpoint + elif last_model_nums['best_model'] > last_model_nums['checkpoint']: + last_models['checkpoint'] = last_models['best_model'] + + return last_models['checkpoint'], last_models['best_model'] def process_args(args, model_type): @@ -143,15 +163,12 @@ def process_args(args, model_type): Class that does the TensorBoard loggind. """ - if args.continue_path != "": + if args.continue_path: args.output_path = args.continue_path args.config_path = os.path.join(args.continue_path, "config.json") - list_of_files = glob.glob( - os.path.join(args.continue_path, "*.pth.tar") - ) # * means all if need specific format then *.csv - args.restore_path = max(list_of_files, key=os.path.getctime) - # checkpoint number based continuing - # args.restore_path = get_last_checkpoint(args.continue_path) + args.restore_path, best_model = get_last_models(args.continue_path) + if not args.best_path: + args.best_path = best_model print(f" > Training continues for {args.restore_path}") # setup output paths and read configs @@ -178,7 +195,7 @@ def process_args(args, model_type): print(" > Mixed precision mode is ON") out_path = args.continue_path - if args.continue_path == "": + if not out_path: out_path = create_experiment_folder(c.output_path, c.run_name, args.debug) diff --git a/TTS/vocoder/configs/multiband-melgan_and_rwd_config.json b/TTS/vocoder/configs/multiband-melgan_and_rwd_config.json index 0b751854..b4d42f4b 100644 --- a/TTS/vocoder/configs/multiband-melgan_and_rwd_config.json +++ b/TTS/vocoder/configs/multiband-melgan_and_rwd_config.json @@ -138,6 +138,8 @@ "print_eval": false, // If True, it prints loss values for each step in eval run. "save_step": 25000, // Number of training steps expected to plot training stats on TB and save model checkpoints. 
"checkpoint": true, // If true, it saves checkpoints per "save_step" + "keep_best": false, // If true, keeps all best_models after keep_after steps + "keep_after": 10000, // Global step after which to keep best models if keep_best is true "tb_model_param_stats": false, // true, plots param stats per layer on tensorboard. Might be memory consuming, but good for debugging. // DATA LOADING diff --git a/TTS/vocoder/configs/multiband_melgan_config.json b/TTS/vocoder/configs/multiband_melgan_config.json index 7a5a13e3..af2af8a3 100644 --- a/TTS/vocoder/configs/multiband_melgan_config.json +++ b/TTS/vocoder/configs/multiband_melgan_config.json @@ -128,6 +128,8 @@ "print_eval": false, // If True, it prints loss values for each step in eval run. "save_step": 25000, // Number of training steps expected to plot training stats on TB and save model checkpoints. "checkpoint": true, // If true, it saves checkpoints per "save_step" + "keep_best": false, // If true, keeps all best_models after keep_after steps + "keep_after": 10000, // Global step after which to keep best models if keep_best is true "tb_model_param_stats": false, // true, plots param stats per layer on tensorboard. Might be memory consuming, but good for debugging. // DATA LOADING diff --git a/TTS/vocoder/configs/multiband_melgan_config_mozilla.json b/TTS/vocoder/configs/multiband_melgan_config_mozilla.json index 4978d42f..0f133fa7 100644 --- a/TTS/vocoder/configs/multiband_melgan_config_mozilla.json +++ b/TTS/vocoder/configs/multiband_melgan_config_mozilla.json @@ -141,6 +141,8 @@ "print_eval": false, // If True, it prints loss values for each step in eval run. "save_step": 25000, // Number of training steps expected to plot training stats on TB and save model checkpoints. "checkpoint": true, // If true, it saves checkpoints per "save_step" + "keep_best": false, // If true, keeps all best_models after keep_after steps + "keep_after": 10000, // Global step after which to keep best models if keep_best is true "tb_model_param_stats": false, // true, plots param stats per layer on tensorboard. Might be memory consuming, but good for debugging. // DATA LOADING diff --git a/TTS/vocoder/configs/parallel_wavegan_config.json b/TTS/vocoder/configs/parallel_wavegan_config.json index fcd765bd..85e659f4 100644 --- a/TTS/vocoder/configs/parallel_wavegan_config.json +++ b/TTS/vocoder/configs/parallel_wavegan_config.json @@ -130,6 +130,8 @@ "print_eval": false, // If True, it prints loss values for each step in eval run. "save_step": 25000, // Number of training steps expected to plot training stats on TB and save model checkpoints. "checkpoint": true, // If true, it saves checkpoints per "save_step" + "keep_best": false, // If true, keeps all best_models after keep_after steps + "keep_after": 10000, // Global step after which to keep best models if keep_best is true "tb_model_param_stats": false, // true, plots param stats per layer on tensorboard. Might be memory consuming, but good for debugging. // DATA LOADING diff --git a/TTS/vocoder/configs/universal_fullband_melgan.json b/TTS/vocoder/configs/universal_fullband_melgan.json index fe4433c2..efb6f3cd 100644 --- a/TTS/vocoder/configs/universal_fullband_melgan.json +++ b/TTS/vocoder/configs/universal_fullband_melgan.json @@ -124,6 +124,8 @@ "print_eval": false, // If True, it prints loss values for each step in eval run. "save_step": 25000, // Number of training steps expected to plot training stats on TB and save model checkpoints. 
"checkpoint": true, // If true, it saves checkpoints per "save_step" + "keep_best": false, // If true, keeps all best_models after keep_after steps + "keep_after": 10000, // Global step after which to keep best models if keep_best is true "tb_model_param_stats": false, // true, plots param stats per layer on tensorboard. Might be memory consuming, but good for debugging. // DATA LOADING diff --git a/TTS/vocoder/configs/wavegrad_libritts.json b/TTS/vocoder/configs/wavegrad_libritts.json index a271ce33..9107d556 100644 --- a/TTS/vocoder/configs/wavegrad_libritts.json +++ b/TTS/vocoder/configs/wavegrad_libritts.json @@ -103,6 +103,8 @@ "print_eval": false, // If True, it prints loss values for each step in eval run. "save_step": 5000, // Number of training steps expected to plot training stats on TB and save model checkpoints. "checkpoint": true, // If true, it saves checkpoints per "save_step" + "keep_best": false, // If true, keeps all best_models after keep_after steps + "keep_after": 10000, // Global step after which to keep best models if keep_best is true "tb_model_param_stats": true, // true, plots param stats per layer on tensorboard. Might be memory consuming, but good for debugging. // DATA LOADING diff --git a/TTS/vocoder/configs/wavernn_config.json b/TTS/vocoder/configs/wavernn_config.json index effb103b..220904c9 100644 --- a/TTS/vocoder/configs/wavernn_config.json +++ b/TTS/vocoder/configs/wavernn_config.json @@ -89,6 +89,8 @@ "print_eval": false, // If True, it prints loss values for each step in eval run. "save_step": 25000, // Number of training steps expected to plot training stats on TB and save model checkpoints. "checkpoint": true, // If true, it saves checkpoints per "save_step" + "keep_best": false, // If true, keeps all best_models after keep_after steps + "keep_after": 10000, // Global step after which to keep best models if keep_best is true "tb_model_param_stats": false, // true, plots param stats per layer on tensorboard. Might be memory consuming, but good for debugging. 
// DATA LOADING diff --git a/TTS/vocoder/utils/io.py b/TTS/vocoder/utils/io.py index 5c42dfca..232d972d 100644 --- a/TTS/vocoder/utils/io.py +++ b/TTS/vocoder/utils/io.py @@ -1,4 +1,5 @@ import os +import glob import torch import datetime import pickle as pickle_tts @@ -61,12 +62,13 @@ def save_checkpoint(model, optimizer, scheduler, model_disc, optimizer_disc, scheduler_disc, current_step, epoch, checkpoint_path, **kwargs) -def save_best_model(target_loss, best_loss, model, optimizer, scheduler, +def save_best_model(current_loss, best_loss, model, optimizer, scheduler, model_disc, optimizer_disc, scheduler_disc, current_step, - epoch, output_folder, **kwargs): - if target_loss < best_loss: - file_name = 'best_model.pth.tar' - checkpoint_path = os.path.join(output_folder, file_name) + epoch, out_path, keep_best=False, keep_after=10000, + **kwargs): + if current_loss < best_loss: + best_model_name = f'best_model_{current_step}.pth.tar' + checkpoint_path = os.path.join(out_path, best_model_name) print(" > BEST MODEL : {}".format(checkpoint_path)) save_model(model, optimizer, @@ -77,7 +79,21 @@ def save_best_model(target_loss, best_loss, model, optimizer, scheduler, current_step, epoch, checkpoint_path, - model_loss=target_loss, + model_loss=current_loss, **kwargs) - best_loss = target_loss + # only delete previous if current is saved successfully + if not keep_best or (current_step < keep_after): + model_names = glob.glob( + os.path.join(out_path, 'best_model*.pth.tar')) + for model_name in model_names: + if os.path.basename(model_name) == best_model_name: + continue + os.remove(model_name) + # create symlink to best model for convinience + link_name = 'best_model.pth.tar' + link_path = os.path.join(out_path, link_name) + if os.path.islink(link_path) or os.path.isfile(link_path): + os.remove(link_path) + os.symlink(best_model_name, os.path.join(out_path, link_name)) + best_loss = current_loss return best_loss From 8b6fd76ad2351be666849303b0ce5d8e260c0a37 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Eren=20G=C3=B6lge?= Date: Fri, 12 Feb 2021 09:46:11 +0000 Subject: [PATCH 060/100] find unique characters in a dataset --- TTS/bin/find_unique_chars.py | 48 ++++++++++++++++++++++++++++++++++++ 1 file changed, 48 insertions(+) create mode 100644 TTS/bin/find_unique_chars.py diff --git a/TTS/bin/find_unique_chars.py b/TTS/bin/find_unique_chars.py new file mode 100644 index 00000000..e6c35878 --- /dev/null +++ b/TTS/bin/find_unique_chars.py @@ -0,0 +1,48 @@ +"""Find all the unique characters in a dataset""" +import os +import argparse +from argparse import RawTextHelpFormatter + +from TTS.tts.datasets.preprocess import get_preprocessor_by_name + + +def main(): + parser = argparse.ArgumentParser(description='''Find all the unique characters or phonemes in a dataset.\n\n''' + + '''Target dataset must be defined in TTS.tts.datasets.preprocess\n\n'''\ + + ''' + Example runs: + + python TTS/bin/find_unique_chars.py --dataset ljspeech --meta_file /path/to/LJSpeech/metadata.csv + ''', + formatter_class=RawTextHelpFormatter) + + parser.add_argument( + '--dataset', + type=str, + default='', + help='One of the target dataset names in TTS.tts.datasets.preprocess.' + ) + + parser.add_argument( + '--meta_file', + type=str, + default=None, + help='Path to the transcriptions file of the dataset.' 
+ ) + + args = parser.parse_args() + + preprocessor = get_preprocessor_by_name(args.dataset) + items = preprocessor(os.path.dirname(args.meta_file), os.path.basename(args.meta_file)) + texts = " ".join([item[0] for item in items]) + chars = set(texts) + lower_chars = set(texts.lower()) + print(f" > Number of unique characters: {len(chars)}") + print(f" > Unique characters: {''.join(sorted(chars))}") + print(f" > Unique lower characters: {''.join(sorted(lower_chars))}") + + +if __name__ == "__main__": + main() \ No newline at end of file From 310d18325e8f4bc4b9c46c27fd6d4572aecb4577 Mon Sep 17 00:00:00 2001 From: gerazov Date: Fri, 12 Feb 2021 10:55:45 +0100 Subject: [PATCH 061/100] brushed up printing model load path and best loss path --- TTS/bin/train_glow_tts.py | 6 ++++-- TTS/bin/train_speedy_speech.py | 4 +++- TTS/bin/train_tacotron.py | 10 ++++++---- TTS/bin/train_vocoder_gan.py | 8 +++++--- TTS/bin/train_vocoder_wavegrad.py | 4 +++- TTS/bin/train_vocoder_wavernn.py | 4 +++- TTS/utils/arguments.py | 4 +--- 7 files changed, 25 insertions(+), 15 deletions(-) diff --git a/TTS/bin/train_glow_tts.py b/TTS/bin/train_glow_tts.py index 14a20149..072ad41b 100644 --- a/TTS/bin/train_glow_tts.py +++ b/TTS/bin/train_glow_tts.py @@ -497,6 +497,7 @@ def main(args): # pylint: disable=redefined-outer-name criterion = GlowTTSLoss() if args.restore_path: + print(f" > Restoring from {os.path.basename(args.restore_path)} ...") checkpoint = torch.load(args.restore_path, map_location='cpu') try: # TODO: fix optimizer init, model.cuda() needs to be called before @@ -514,7 +515,7 @@ def main(args): # pylint: disable=redefined-outer-name for group in optimizer.param_groups: group['initial_lr'] = c.lr - print(" > Model restored from step %d" % checkpoint['step'], + print(f" > Model restored from step {checkpoint['step']:d}", flush=True) args.restore_step = checkpoint['step'] else: @@ -542,7 +543,8 @@ def main(args): # pylint: disable=redefined-outer-name best_loss = float('inf') print(" > Starting with inf best loss.") else: - print(args.best_path) + print(" > Restoring best loss from " + f"{os.path.basename(args.best_path)} ...") best_loss = torch.load(args.best_path, map_location='cpu')['model_loss'] print(f" > Starting with loaded last best loss {best_loss}.") diff --git a/TTS/bin/train_speedy_speech.py b/TTS/bin/train_speedy_speech.py index 4e521451..8e9dbc2e 100644 --- a/TTS/bin/train_speedy_speech.py +++ b/TTS/bin/train_speedy_speech.py @@ -461,6 +461,7 @@ def main(args): # pylint: disable=redefined-outer-name criterion = SpeedySpeechLoss(c) if args.restore_path: + print(f" > Restoring from {os.path.basename(args.restore_path)} ...") checkpoint = torch.load(args.restore_path, map_location='cpu') try: # TODO: fix optimizer init, model.cuda() needs to be called before @@ -506,7 +507,8 @@ def main(args): # pylint: disable=redefined-outer-name best_loss = float('inf') print(" > Starting with inf best loss.") else: - print(args.best_path) + print(" > Restoring best loss from " + f"{os.path.basename(args.best_path)} ...") best_loss = torch.load(args.best_path, map_location='cpu')['model_loss'] print(f" > Starting with loaded last best loss {best_loss}.") diff --git a/TTS/bin/train_tacotron.py b/TTS/bin/train_tacotron.py index cdc68c94..2382c0be 100644 --- a/TTS/bin/train_tacotron.py +++ b/TTS/bin/train_tacotron.py @@ -534,12 +534,13 @@ def main(args): # pylint: disable=redefined-outer-name # setup criterion criterion = TacotronLoss(c, stopnet_pos_weight=c.stopnet_pos_weight, ga_sigma=0.4) if 
args.restore_path: + print(f" > Restoring from {os.path.basename(args.restore_path)}...") checkpoint = torch.load(args.restore_path, map_location='cpu') try: - print(" > Restoring Model.") + print(" > Restoring Model...") model.load_state_dict(checkpoint['model']) # optimizer restore - print(" > Restoring Optimizer.") + print(" > Restoring Optimizer...") optimizer.load_state_dict(checkpoint['optimizer']) if "scaler" in checkpoint and c.mixed_precision: print(" > Restoring AMP Scaler...") @@ -547,7 +548,7 @@ def main(args): # pylint: disable=redefined-outer-name if c.reinit_layers: raise RuntimeError except (KeyError, RuntimeError): - print(" > Partial model initialization.") + print(" > Partial model initialization...") model_dict = model.state_dict() model_dict = set_init_dict(model_dict, checkpoint['model'], c) # torch.save(model_dict, os.path.join(OUT_PATH, 'state_dict.pt')) @@ -585,7 +586,8 @@ def main(args): # pylint: disable=redefined-outer-name best_loss = float('inf') print(" > Starting with inf best loss.") else: - print(args.best_path) + print(" > Restoring best loss from " + f"{os.path.basename(args.best_path)} ...") best_loss = torch.load(args.best_path, map_location='cpu')['model_loss'] print(f" > Starting with loaded last best loss {best_loss}.") diff --git a/TTS/bin/train_vocoder_gan.py b/TTS/bin/train_vocoder_gan.py index ecc33288..9043a560 100644 --- a/TTS/bin/train_vocoder_gan.py +++ b/TTS/bin/train_vocoder_gan.py @@ -485,6 +485,7 @@ def main(args): # pylint: disable=redefined-outer-name criterion_disc = DiscriminatorLoss(c) if args.restore_path: + print(f" > Restoring from {os.path.basename(args.restore_path)}...") checkpoint = torch.load(args.restore_path, map_location='cpu') try: print(" > Restoring Generator Model...") @@ -523,7 +524,7 @@ def main(args): # pylint: disable=redefined-outer-name for group in optimizer_disc.param_groups: group['lr'] = c.lr_disc - print(" > Model restored from step %d" % checkpoint['step'], + print(f" > Model restored from step {checkpoint['step']:d}", flush=True) args.restore_step = checkpoint['step'] else: @@ -549,10 +550,11 @@ def main(args): # pylint: disable=redefined-outer-name best_loss = float('inf') print(" > Starting with inf best loss.") else: - print(args.best_path) + print(" > Restoring best loss from " + f"{os.path.basename(args.best_path)} ...") best_loss = torch.load(args.best_path, map_location='cpu')['model_loss'] - print(f" > Starting with loaded last best loss {best_loss}.") + print(f" > Starting with best loss of {best_loss}.") keep_best = c.get('keep_best', False) keep_after = c.get('keep_after', 10000) # void if keep_best False diff --git a/TTS/bin/train_vocoder_wavegrad.py b/TTS/bin/train_vocoder_wavegrad.py index 7846aae5..271e8d4c 100644 --- a/TTS/bin/train_vocoder_wavegrad.py +++ b/TTS/bin/train_vocoder_wavegrad.py @@ -354,6 +354,7 @@ def main(args): # pylint: disable=redefined-outer-name criterion.cuda() if args.restore_path: + print(f" > Restoring from {os.path.basename(args.restore_path)}...") checkpoint = torch.load(args.restore_path, map_location='cpu') try: print(" > Restoring Model...") @@ -397,7 +398,8 @@ def main(args): # pylint: disable=redefined-outer-name best_loss = float('inf') print(" > Starting with inf best loss.") else: - print(args.best_path) + print(" > Restoring best loss from " + f"{os.path.basename(args.best_path)} ...") best_loss = torch.load(args.best_path, map_location='cpu')['model_loss'] print(f" > Starting with loaded last best loss {best_loss}.") diff --git 
a/TTS/bin/train_vocoder_wavernn.py b/TTS/bin/train_vocoder_wavernn.py index 44ffef14..5fde5025 100644 --- a/TTS/bin/train_vocoder_wavernn.py +++ b/TTS/bin/train_vocoder_wavernn.py @@ -383,6 +383,7 @@ def main(args): # pylint: disable=redefined-outer-name # restore any checkpoint if args.restore_path: + print(f" > Restoring from {os.path.basename(args.restore_path)}...") checkpoint = torch.load(args.restore_path, map_location="cpu") try: print(" > Restoring Model...") @@ -420,7 +421,8 @@ def main(args): # pylint: disable=redefined-outer-name best_loss = float('inf') print(" > Starting with inf best loss.") else: - print(args.best_path) + print(" > Restoring best loss from " + f"{os.path.basename(args.best_path)} ...") best_loss = torch.load(args.best_path, map_location='cpu')['model_loss'] print(f" > Starting with loaded last best loss {best_loss}.") diff --git a/TTS/utils/arguments.py b/TTS/utils/arguments.py index d05936dc..44345dd5 100644 --- a/TTS/utils/arguments.py +++ b/TTS/utils/arguments.py @@ -169,7 +169,6 @@ def process_args(args, model_type): args.restore_path, best_model = get_last_models(args.continue_path) if not args.best_path: args.best_path = best_model - print(f" > Training continues for {args.restore_path}") # setup output paths and read configs c = load_config(args.config_path) @@ -184,8 +183,7 @@ def process_args(args, model_type): if model_class == "TTS": check_config_tts(c) elif model_class == "VOCODER": - print("Vocoder config checker not implemented, " - "skipping ...") + print("Vocoder config checker not implemented, skipping ...") else: raise ValueError(f"model type {model_type} not recognized!") From 702dff3edcefd4b9e7d43cc8ec618f7c21c8d75a Mon Sep 17 00:00:00 2001 From: gerazov Date: Fri, 12 Feb 2021 11:03:52 +0100 Subject: [PATCH 062/100] added keep_best and keep_after to test configs. --- tests/inputs/test_glow_tts.json | 2 + tests/inputs/test_speedy_speech.json | 2 + tests/inputs/test_train_config.json | 352 +++++++++--------- .../test_vocoder_multiband_melgan_config.json | 2 + tests/inputs/test_vocoder_wavegrad.json | 2 + tests/inputs/test_vocoder_wavernn_config.json | 2 + 6 files changed, 187 insertions(+), 175 deletions(-) diff --git a/tests/inputs/test_glow_tts.json b/tests/inputs/test_glow_tts.json index e7d86eef..338ed8ec 100644 --- a/tests/inputs/test_glow_tts.json +++ b/tests/inputs/test_glow_tts.json @@ -106,6 +106,8 @@ "print_eval": false, // If True, it prints intermediate loss values in evalulation. "save_step": 5000, // Number of training steps expected to save traninpg stats and checkpoints. "checkpoint": true, // If true, it saves checkpoints per "save_step" + "keep_best": true, // If true, keeps all best_models after keep_after steps + "keep_after": 10000, // Global step after which to keep best models if keep_best is true "tb_model_param_stats": false, // true, plots param stats per layer on tensorboard. Might be memory consuming, but good for debugging. "apex_amp_level": null, diff --git a/tests/inputs/test_speedy_speech.json b/tests/inputs/test_speedy_speech.json index ae4b8b2d..4f9f36bc 100644 --- a/tests/inputs/test_speedy_speech.json +++ b/tests/inputs/test_speedy_speech.json @@ -111,6 +111,8 @@ "print_eval": false, // If True, it prints intermediate loss values in evalulation. "save_step": 5000, // Number of training steps expected to save traninpg stats and checkpoints. 
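Taken together, `--best_path`, `get_last_models()` and the restore prints in the trainers above boil down to a small startup routine. A sketch of that flow, assuming the `model_loss` field these trainers store in every best-model checkpoint:

import os

import torch


def resolve_best_loss(best_path: str) -> float:
    """Start from +inf when no previous best model exists, otherwise resume
    from the loss stored inside the best checkpoint."""
    if not best_path:
        print(" > Starting with inf best loss.")
        return float("inf")
    print(f" > Restoring best loss from {os.path.basename(best_path)} ...")
    return torch.load(best_path, map_location="cpu")["model_loss"]


print(resolve_best_loss(""))  # -> inf on a fresh run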
"checkpoint": true, // If true, it saves checkpoints per "save_step" + "keep_best": true, // If true, keeps all best_models after keep_after steps + "keep_after": 10000, // Global step after which to keep best models if keep_best is true "tb_model_param_stats": false, // true, plots param stats per layer on tensorboard. Might be memory consuming, but good for debugging.:set n "mixed_precision": false, diff --git a/tests/inputs/test_train_config.json b/tests/inputs/test_train_config.json index cfd33669..8c9e20d3 100644 --- a/tests/inputs/test_train_config.json +++ b/tests/inputs/test_train_config.json @@ -1,175 +1,177 @@ -{ - "model": "Tacotron2", - "run_name": "test_sample_dataset_run", - "run_description": "sample dataset test run", - - // AUDIO PARAMETERS - "audio":{ - // stft parameters - "fft_size": 1024, // number of stft frequency levels. Size of the linear spectogram frame. - "win_length": 1024, // stft window length in ms. - "hop_length": 256, // stft window hop-lengh in ms. - "frame_length_ms": null, // stft window length in ms.If null, 'win_length' is used. - "frame_shift_ms": null, // stft window hop-lengh in ms. If null, 'hop_length' is used. - - // Audio processing parameters - "sample_rate": 22050, // DATASET-RELATED: wav sample-rate. - "preemphasis": 0.0, // pre-emphasis to reduce spec noise and make it more structured. If 0.0, no -pre-emphasis. - "ref_level_db": 20, // reference level db, theoretically 20db is the sound of air. - - // Silence trimming - "do_trim_silence": true,// enable trimming of slience of audio as you load it. LJspeech (true), TWEB (false), Nancy (true) - "trim_db": 60, // threshold for timming silence. Set this according to your dataset. - - // Griffin-Lim - "power": 1.5, // value to sharpen wav signals after GL algorithm. - "griffin_lim_iters": 60,// #griffin-lim iterations. 30-60 is a good range. Larger the value, slower the generation. - - // MelSpectrogram parameters - "num_mels": 80, // size of the mel spec frame. - "mel_fmin": 0.0, // minimum freq level for mel-spec. ~50 for male and ~95 for female voices. Tune for dataset!! - "mel_fmax": 8000.0, // maximum freq level for mel-spec. Tune for dataset!! - "spec_gain": 20.0, - - // Normalization parameters - "signal_norm": true, // normalize spec values. Mean-Var normalization if 'stats_path' is defined otherwise range normalization defined by the other params. - "min_level_db": -100, // lower bound for normalization - "symmetric_norm": true, // move normalization to range [-1, 1] - "max_norm": 4.0, // scale normalization to range [-max_norm, max_norm] or [0, max_norm] - "clip_norm": true, // clip normalized values into the range. - "stats_path": null // DO NOT USE WITH MULTI_SPEAKER MODEL. scaler stats file computed by 'compute_statistics.py'. If it is defined, mean-std based notmalization is used and other normalization params are ignored - }, - - // VOCABULARY PARAMETERS - // if custom character set is not defined, - // default set in symbols.py is used - // "characters":{ - // "pad": "_", - // "eos": "~", - // "bos": "^", - // "characters": "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz!'(),-.:;? ", - // "punctuations":"!'(),-.:;? ", - // "phonemes":"iyɨʉɯuɪʏʊeøɘəɵɤoɛœɜɞʌɔæɐaɶɑɒᵻʘɓǀɗǃʄǂɠǁʛpbtdʈɖcɟkɡqɢʔɴŋɲɳnɱmʙrʀⱱɾɽɸβfvθðszʃʒʂʐçʝxɣχʁħʕhɦɬɮʋɹɻjɰlɭʎʟˈˌːˑʍwɥʜʢʡɕʑɺɧɚ˞ɫ" - // }, - - // DISTRIBUTED TRAINING - "distributed":{ - "backend": "nccl", - "url": "tcp:\/\/localhost:54321" - }, - - "reinit_layers": [], // give a list of layer names to restore from the given checkpoint. 
If not defined, it reloads all heuristically matching layers. - - // TRAINING - "batch_size": 1, // Batch size for training. Lower values than 32 might cause hard to learn attention. It is overwritten by 'gradual_training'. - "eval_batch_size":1, - "r": 7, // Number of decoder frames to predict per iteration. Set the initial values if gradual training is enabled. - "gradual_training": [[0, 7, 4]], //set gradual training steps [first_step, r, batch_size]. If it is null, gradual training is disabled. For Tacotron, you might need to reduce the 'batch_size' as you proceeed. - "loss_masking": true, // enable / disable loss masking against the sequence padding. - "ga_alpha": 10.0, // weight for guided attention loss. If > 0, guided attention is enabled. - "mixed_precision": false, - - // VALIDATION - "run_eval": true, - "test_delay_epochs": 0, //Until attention is aligned, testing only wastes computation time. - "test_sentences_file": null, // set a file to load sentences to be used for testing. If it is null then we use default english sentences. - - // LOSS SETTINGS - "loss_masking": true, // enable / disable loss masking against the sequence padding. - "decoder_loss_alpha": 0.5, // original decoder loss weight. If > 0, it is enabled - "postnet_loss_alpha": 0.25, // original postnet loss weight. If > 0, it is enabled - "postnet_diff_spec_alpha": 0.25, // differential spectral loss weight. If > 0, it is enabled - "decoder_diff_spec_alpha": 0.25, // differential spectral loss weight. If > 0, it is enabled - "decoder_ssim_alpha": 0.5, // decoder ssim loss weight. If > 0, it is enabled - "postnet_ssim_alpha": 0.25, // postnet ssim loss weight. If > 0, it is enabled - "ga_alpha": 5.0, // weight for guided attention loss. If > 0, guided attention is enabled. - "stopnet_pos_weight": 15.0, // pos class weight for stopnet loss since there are way more negative samples than positive samples. - - // OPTIMIZER - "noam_schedule": false, // use noam warmup and lr schedule. - "grad_clip": 1.0, // upper limit for gradients for clipping. - "epochs": 1, // total number of epochs to train. - "lr": 0.0001, // Initial learning rate. If Noam decay is active, maximum learning rate. - "wd": 0.000001, // Weight decay weight. - "warmup_steps": 4000, // Noam decay steps to increase the learning rate from 0 to "lr" - "seq_len_norm": false, // Normalize eash sample loss with its length to alleviate imbalanced datasets. Use it if your dataset is small or has skewed distribution of sequence lengths. - - // TACOTRON PRENET - "memory_size": -1, // ONLY TACOTRON - size of the memory queue used fro storing last decoder predictions for auto-regression. If < 0, memory queue is disabled and decoder only uses the last prediction frame. - "prenet_type": "bn", // "original" or "bn". - "prenet_dropout": false, // enable/disable dropout at prenet. - - // TACOTRON ATTENTION - "attention_type": "original", // 'original' , 'graves', 'dynamic_convolution' - "attention_heads": 4, // number of attention heads (only for 'graves') - "attention_norm": "sigmoid", // softmax or sigmoid. - "windowing": false, // Enables attention windowing. Used only in eval mode. - "use_forward_attn": false, // if it uses forward attention. In general, it aligns faster. - "forward_attn_mask": false, // Additional masking forcing monotonicity only in eval mode. - "transition_agent": false, // enable/disable transition agent of forward attention. - "location_attn": true, // enable_disable location sensitive attention. It is enabled for TACOTRON by default. 
- "bidirectional_decoder": false, // use https://arxiv.org/abs/1907.09006. Use it, if attention does not work well with your dataset. - "double_decoder_consistency": true, // use DDC explained here https://erogol.com/solving-attention-problems-of-tts-models-with-double-decoder-consistency-draft/ - "ddc_r": 7, // reduction rate for coarse decoder. - - // STOPNET - "stopnet": true, // Train stopnet predicting the end of synthesis. - "separate_stopnet": true, // Train stopnet seperately if 'stopnet==true'. It prevents stopnet loss to influence the rest of the model. It causes a better model, but it trains SLOWER. - - // TENSORBOARD and LOGGING - "print_step": 1, // Number of steps to log training on console. - "tb_plot_step": 100, // Number of steps to plot TB training figures. - "print_eval": false, // If True, it prints intermediate loss values in evalulation. - "save_step": 10000, // Number of training steps expected to save traninpg stats and checkpoints. - "checkpoint": true, // If true, it saves checkpoints per "save_step" - "tb_model_param_stats": false, // true, plots param stats per layer on tensorboard. Might be memory consuming, but good for debugging. - - // DATA LOADING - "text_cleaner": "phoneme_cleaners", - "enable_eos_bos_chars": false, // enable/disable beginning of sentence and end of sentence chars. - "num_loader_workers": 0, // number of training data loader processes. Don't set it too big. 4-8 are good values. - "num_val_loader_workers": 0, // number of evaluation data loader processes. - "batch_group_size": 0, //Number of batches to shuffle after bucketing. - "min_seq_len": 6, // DATASET-RELATED: minimum text length to use in training - "max_seq_len": 153, // DATASET-RELATED: maximum text length - "compute_input_seq_cache": true, - - // PATHS - "output_path": "tests/train_outputs/", - - // PHONEMES - "phoneme_cache_path": "tests/train_outputs/phoneme_cache/", // phoneme computation is slow, therefore, it caches results in the given folder. - "use_phonemes": true, // use phonemes instead of raw characters. It is suggested for better pronounciation. - "phoneme_language": "en-us", // depending on your target language, pick one from https://github.com/bootphon/phonemizer#languages - - // MULTI-SPEAKER and GST - "use_external_speaker_embedding_file": false, - "external_speaker_embedding_file": null, - "use_speaker_embedding": false, // use speaker embedding to enable multi-speaker learning. - "use_gst": true, // use global style tokens - "gst": { // gst parameter if gst is enabled - "gst_style_input": null, // Condition the style input either on a - // -> wave file [path to wave] or - // -> dictionary using the style tokens {'token1': 'value', 'token2': 'value'} example {"0": 0.15, "1": 0.15, "5": -0.15} - // with the dictionary being len(dict) == len(gst_style_tokens). - "gst_use_speaker_embedding": true, // if true pass speaker embedding in attention input GST. - "gst_embedding_dim": 512, - "gst_num_heads": 4, - "gst_style_tokens": 10 - }, - - // DATASETS - "train_portion": 0.1, // dataset portion used for training. It is mainly for internal experiments. - "eval_portion": 0.1, // dataset portion used for training. It is mainly for internal experiments. - "datasets": // List of datasets. They all merged and they get different speaker_ids. 
- [ - { - "name": "ljspeech", - "path": "tests/data/ljspeech/", - "meta_file_train": "metadata.csv", - "meta_file_val": "metadata.csv" - } - ] - -} - +{ + "model": "Tacotron2", + "run_name": "test_sample_dataset_run", + "run_description": "sample dataset test run", + + // AUDIO PARAMETERS + "audio":{ + // stft parameters + "fft_size": 1024, // number of stft frequency levels. Size of the linear spectogram frame. + "win_length": 1024, // stft window length in ms. + "hop_length": 256, // stft window hop-lengh in ms. + "frame_length_ms": null, // stft window length in ms.If null, 'win_length' is used. + "frame_shift_ms": null, // stft window hop-lengh in ms. If null, 'hop_length' is used. + + // Audio processing parameters + "sample_rate": 22050, // DATASET-RELATED: wav sample-rate. + "preemphasis": 0.0, // pre-emphasis to reduce spec noise and make it more structured. If 0.0, no -pre-emphasis. + "ref_level_db": 20, // reference level db, theoretically 20db is the sound of air. + + // Silence trimming + "do_trim_silence": true,// enable trimming of slience of audio as you load it. LJspeech (true), TWEB (false), Nancy (true) + "trim_db": 60, // threshold for timming silence. Set this according to your dataset. + + // Griffin-Lim + "power": 1.5, // value to sharpen wav signals after GL algorithm. + "griffin_lim_iters": 60,// #griffin-lim iterations. 30-60 is a good range. Larger the value, slower the generation. + + // MelSpectrogram parameters + "num_mels": 80, // size of the mel spec frame. + "mel_fmin": 0.0, // minimum freq level for mel-spec. ~50 for male and ~95 for female voices. Tune for dataset!! + "mel_fmax": 8000.0, // maximum freq level for mel-spec. Tune for dataset!! + "spec_gain": 20.0, + + // Normalization parameters + "signal_norm": true, // normalize spec values. Mean-Var normalization if 'stats_path' is defined otherwise range normalization defined by the other params. + "min_level_db": -100, // lower bound for normalization + "symmetric_norm": true, // move normalization to range [-1, 1] + "max_norm": 4.0, // scale normalization to range [-max_norm, max_norm] or [0, max_norm] + "clip_norm": true, // clip normalized values into the range. + "stats_path": null // DO NOT USE WITH MULTI_SPEAKER MODEL. scaler stats file computed by 'compute_statistics.py'. If it is defined, mean-std based notmalization is used and other normalization params are ignored + }, + + // VOCABULARY PARAMETERS + // if custom character set is not defined, + // default set in symbols.py is used + // "characters":{ + // "pad": "_", + // "eos": "~", + // "bos": "^", + // "characters": "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz!'(),-.:;? ", + // "punctuations":"!'(),-.:;? ", + // "phonemes":"iyɨʉɯuɪʏʊeøɘəɵɤoɛœɜɞʌɔæɐaɶɑɒᵻʘɓǀɗǃʄǂɠǁʛpbtdʈɖcɟkɡqɢʔɴŋɲɳnɱmʙrʀⱱɾɽɸβfvθðszʃʒʂʐçʝxɣχʁħʕhɦɬɮʋɹɻjɰlɭʎʟˈˌːˑʍwɥʜʢʡɕʑɺɧɚ˞ɫ" + // }, + + // DISTRIBUTED TRAINING + "distributed":{ + "backend": "nccl", + "url": "tcp:\/\/localhost:54321" + }, + + "reinit_layers": [], // give a list of layer names to restore from the given checkpoint. If not defined, it reloads all heuristically matching layers. + + // TRAINING + "batch_size": 1, // Batch size for training. Lower values than 32 might cause hard to learn attention. It is overwritten by 'gradual_training'. + "eval_batch_size":1, + "r": 7, // Number of decoder frames to predict per iteration. Set the initial values if gradual training is enabled. + "gradual_training": [[0, 7, 4]], //set gradual training steps [first_step, r, batch_size]. 
If it is null, gradual training is disabled. For Tacotron, you might need to reduce the 'batch_size' as you proceeed. + "loss_masking": true, // enable / disable loss masking against the sequence padding. + "ga_alpha": 10.0, // weight for guided attention loss. If > 0, guided attention is enabled. + "mixed_precision": false, + + // VALIDATION + "run_eval": true, + "test_delay_epochs": 0, //Until attention is aligned, testing only wastes computation time. + "test_sentences_file": null, // set a file to load sentences to be used for testing. If it is null then we use default english sentences. + + // LOSS SETTINGS + "loss_masking": true, // enable / disable loss masking against the sequence padding. + "decoder_loss_alpha": 0.5, // original decoder loss weight. If > 0, it is enabled + "postnet_loss_alpha": 0.25, // original postnet loss weight. If > 0, it is enabled + "postnet_diff_spec_alpha": 0.25, // differential spectral loss weight. If > 0, it is enabled + "decoder_diff_spec_alpha": 0.25, // differential spectral loss weight. If > 0, it is enabled + "decoder_ssim_alpha": 0.5, // decoder ssim loss weight. If > 0, it is enabled + "postnet_ssim_alpha": 0.25, // postnet ssim loss weight. If > 0, it is enabled + "ga_alpha": 5.0, // weight for guided attention loss. If > 0, guided attention is enabled. + "stopnet_pos_weight": 15.0, // pos class weight for stopnet loss since there are way more negative samples than positive samples. + + // OPTIMIZER + "noam_schedule": false, // use noam warmup and lr schedule. + "grad_clip": 1.0, // upper limit for gradients for clipping. + "epochs": 1, // total number of epochs to train. + "lr": 0.0001, // Initial learning rate. If Noam decay is active, maximum learning rate. + "wd": 0.000001, // Weight decay weight. + "warmup_steps": 4000, // Noam decay steps to increase the learning rate from 0 to "lr" + "seq_len_norm": false, // Normalize eash sample loss with its length to alleviate imbalanced datasets. Use it if your dataset is small or has skewed distribution of sequence lengths. + + // TACOTRON PRENET + "memory_size": -1, // ONLY TACOTRON - size of the memory queue used fro storing last decoder predictions for auto-regression. If < 0, memory queue is disabled and decoder only uses the last prediction frame. + "prenet_type": "bn", // "original" or "bn". + "prenet_dropout": false, // enable/disable dropout at prenet. + + // TACOTRON ATTENTION + "attention_type": "original", // 'original' , 'graves', 'dynamic_convolution' + "attention_heads": 4, // number of attention heads (only for 'graves') + "attention_norm": "sigmoid", // softmax or sigmoid. + "windowing": false, // Enables attention windowing. Used only in eval mode. + "use_forward_attn": false, // if it uses forward attention. In general, it aligns faster. + "forward_attn_mask": false, // Additional masking forcing monotonicity only in eval mode. + "transition_agent": false, // enable/disable transition agent of forward attention. + "location_attn": true, // enable_disable location sensitive attention. It is enabled for TACOTRON by default. + "bidirectional_decoder": false, // use https://arxiv.org/abs/1907.09006. Use it, if attention does not work well with your dataset. + "double_decoder_consistency": true, // use DDC explained here https://erogol.com/solving-attention-problems-of-tts-models-with-double-decoder-consistency-draft/ + "ddc_r": 7, // reduction rate for coarse decoder. + + // STOPNET + "stopnet": true, // Train stopnet predicting the end of synthesis. 
+ "separate_stopnet": true, // Train stopnet seperately if 'stopnet==true'. It prevents stopnet loss to influence the rest of the model. It causes a better model, but it trains SLOWER. + + // TENSORBOARD and LOGGING + "print_step": 1, // Number of steps to log training on console. + "tb_plot_step": 100, // Number of steps to plot TB training figures. + "print_eval": false, // If True, it prints intermediate loss values in evalulation. + "save_step": 10000, // Number of training steps expected to save traninpg stats and checkpoints. + "checkpoint": true, // If true, it saves checkpoints per "save_step" + "keep_best": true, // If true, keeps all best_models after keep_after steps + "keep_after": 10000, // Global step after which to keep best models if keep_best is true + "tb_model_param_stats": false, // true, plots param stats per layer on tensorboard. Might be memory consuming, but good for debugging. + + // DATA LOADING + "text_cleaner": "phoneme_cleaners", + "enable_eos_bos_chars": false, // enable/disable beginning of sentence and end of sentence chars. + "num_loader_workers": 0, // number of training data loader processes. Don't set it too big. 4-8 are good values. + "num_val_loader_workers": 0, // number of evaluation data loader processes. + "batch_group_size": 0, //Number of batches to shuffle after bucketing. + "min_seq_len": 6, // DATASET-RELATED: minimum text length to use in training + "max_seq_len": 153, // DATASET-RELATED: maximum text length + "compute_input_seq_cache": true, + + // PATHS + "output_path": "tests/train_outputs/", + + // PHONEMES + "phoneme_cache_path": "tests/train_outputs/phoneme_cache/", // phoneme computation is slow, therefore, it caches results in the given folder. + "use_phonemes": true, // use phonemes instead of raw characters. It is suggested for better pronounciation. + "phoneme_language": "en-us", // depending on your target language, pick one from https://github.com/bootphon/phonemizer#languages + + // MULTI-SPEAKER and GST + "use_external_speaker_embedding_file": false, + "external_speaker_embedding_file": null, + "use_speaker_embedding": false, // use speaker embedding to enable multi-speaker learning. + "use_gst": true, // use global style tokens + "gst": { // gst parameter if gst is enabled + "gst_style_input": null, // Condition the style input either on a + // -> wave file [path to wave] or + // -> dictionary using the style tokens {'token1': 'value', 'token2': 'value'} example {"0": 0.15, "1": 0.15, "5": -0.15} + // with the dictionary being len(dict) == len(gst_style_tokens). + "gst_use_speaker_embedding": true, // if true pass speaker embedding in attention input GST. + "gst_embedding_dim": 512, + "gst_num_heads": 4, + "gst_style_tokens": 10 + }, + + // DATASETS + "train_portion": 0.1, // dataset portion used for training. It is mainly for internal experiments. + "eval_portion": 0.1, // dataset portion used for training. It is mainly for internal experiments. + "datasets": // List of datasets. They all merged and they get different speaker_ids. 
+ [ + { + "name": "ljspeech", + "path": "tests/data/ljspeech/", + "meta_file_train": "metadata.csv", + "meta_file_val": "metadata.csv" + } + ] + +} + diff --git a/tests/inputs/test_vocoder_multiband_melgan_config.json b/tests/inputs/test_vocoder_multiband_melgan_config.json index 9540b32b..e1d201ab 100644 --- a/tests/inputs/test_vocoder_multiband_melgan_config.json +++ b/tests/inputs/test_vocoder_multiband_melgan_config.json @@ -131,6 +131,8 @@ "print_eval": false, // If True, it prints loss values for each step in eval run. "save_step": 25000, // Number of training steps expected to plot training stats on TB and save model checkpoints. "checkpoint": true, // If true, it saves checkpoints per "save_step" + "keep_best": true, // If true, keeps all best_models after keep_after steps + "keep_after": 10000, // Global step after which to keep best models if keep_best is true "tb_model_param_stats": false, // true, plots param stats per layer on tensorboard. Might be memory consuming, but good for debugging. // DATA LOADING diff --git a/tests/inputs/test_vocoder_wavegrad.json b/tests/inputs/test_vocoder_wavegrad.json index fc8059ec..5a068751 100644 --- a/tests/inputs/test_vocoder_wavegrad.json +++ b/tests/inputs/test_vocoder_wavegrad.json @@ -101,6 +101,8 @@ "print_eval": false, // If True, it prints loss values for each step in eval run. "save_step": 10000, // Number of training steps expected to plot training stats on TB and save model checkpoints. "checkpoint": true, // If true, it saves checkpoints per "save_step" + "keep_best": true, // If true, keeps all best_models after keep_after steps + "keep_after": 10000, // Global step after which to keep best models if keep_best is true "tb_model_param_stats": true, // true, plots param stats per layer on tensorboard. Might be memory consuming, but good for debugging. // DATA LOADING diff --git a/tests/inputs/test_vocoder_wavernn_config.json b/tests/inputs/test_vocoder_wavernn_config.json index d477a66b..4239e8bd 100644 --- a/tests/inputs/test_vocoder_wavernn_config.json +++ b/tests/inputs/test_vocoder_wavernn_config.json @@ -97,6 +97,8 @@ "print_eval": false, // If True, it prints loss values for each step in eval run. "save_step": 25000, // Number of training steps expected to plot training stats on TB and save model checkpoints. "checkpoint": true, // If true, it saves checkpoints per "save_step" + "keep_best": true, // If true, keeps all best_models after keep_after steps + "keep_after": 10000, // Global step after which to keep best models if keep_best is true "tb_model_param_stats": false, // true, plots param stats per layer on tensorboard. Might be memory consuming, but good for debugging. // DATA LOADING From 0e78e31dbf98380395060593095ecebabd2bed0e Mon Sep 17 00:00:00 2001 From: gerazov Date: Fri, 12 Feb 2021 11:36:01 +0100 Subject: [PATCH 063/100] reformated docstrings in arguments.py --- TTS/utils/arguments.py | 85 ++++++++++++++++-------------------------- 1 file changed, 32 insertions(+), 53 deletions(-) diff --git a/TTS/utils/arguments.py b/TTS/utils/arguments.py index 44345dd5..0a06b562 100644 --- a/TTS/utils/arguments.py +++ b/TTS/utils/arguments.py @@ -19,16 +19,11 @@ from TTS.tts.utils.generic_utils import check_config_tts def parse_arguments(argv): """Parse command line arguments of training scripts. - Parameters - ---------- - argv : list - This is a list of input arguments as given by sys.argv - - Returns - ------- - argparse.Namespace - Parsed arguments. 
+    Args:
+        argv (list): This is a list of input arguments as given by sys.argv
 
+    Returns:
+        argparse.Namespace: Parsed arguments.
     """
     parser = argparse.ArgumentParser()
     parser.add_argument(
@@ -46,7 +41,8 @@ def parse_arguments(argv):
     parser.add_argument(
         "--best_path",
         type=str,
-        help="Best model file to be used for extracting best loss.",
+        help=("Best model file to be used for extracting best loss. "
+              "If not specified, the latest best model in the continue path is used."),
         default="")
     parser.add_argument(
         "--config_path",
@@ -78,21 +74,14 @@ def get_last_models(path):
     It is based on globbing for `*.pth.tar` and the RegEx
     `(checkpoint|best_model)_([0-9]+)`.
 
-    Parameters
-    ----------
-    path : list
-        Path to files to be compared.
+    Args:
+        path (list): Path to files to be compared.
 
-    Raises
-    ------
-    ValueError
-        If no checkpoint or best_model files are found.
-
-    Returns
-    -------
-    last_checkpoint : str
-        Last checkpoint filename.
+    Raises:
+        ValueError: If no checkpoint or best_model files are found.
 
+    Returns:
+        last_checkpoint (str): Last checkpoint filename.
     """
     file_names = glob.glob(os.path.join(path, "*.pth.tar"))
     last_models = {}
@@ -130,38 +119,28 @@ def get_last_models(path):
 def process_args(args, model_type):
     """Process parsed comand line arguments.
 
-    Parameters
-    ----------
-    args : argparse.Namespace or dict like
-        Parsed input arguments.
-    model_type : str
-        Model type used to check config parameters and setup the TensorBoard
-        logger. One of:
-            - tacotron
-            - glow_tts
-            - speedy_speech
-            - gan
-            - wavegrad
-            - wavernn
+    Args:
+        args (argparse.Namespace or dict like): Parsed input arguments.
+        model_type (str): Model type used to check config parameters and setup
+            the TensorBoard logger. One of:
+                - tacotron
+                - glow_tts
+                - speedy_speech
+                - gan
+                - wavegrad
+                - wavernn
 
-    Raises
-    ------
-    ValueError
-        If `model_type` is not one of implemented choices.
-
-    Returns
-    -------
-    c : TTS.utils.io.AttrDict
-        Config paramaters.
-    out_path : str
-        Path to save models and logging.
-    audio_path : str
-        Path to save generated test audios.
-    c_logger : TTS.utils.console_logger.ConsoleLogger
-        Class that does logging to the console.
-    tb_logger : TTS.utils.tensorboard.TensorboardLogger
-        Class that does the TensorBoard loggind.
+    Raises:
+        ValueError: If `model_type` is not one of implemented choices.
 
+    Returns:
+        c (TTS.utils.io.AttrDict): Config parameters.
+        out_path (str): Path to save models and logging.
+        audio_path (str): Path to save generated test audios.
+        c_logger (TTS.utils.console_logger.ConsoleLogger): Class that does
+            logging to the console.
+        tb_logger (TTS.utils.tensorboard.TensorboardLogger): Class that does
+            the TensorBoard logging.
""" if args.continue_path: args.output_path = args.continue_path From e774f68aeefd9dfac5a09847cb8def93a5e22184 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Eren=20G=C3=B6lge?= Date: Fri, 12 Feb 2021 12:03:42 +0000 Subject: [PATCH 064/100] save used model characters to the checkpoints --- TTS/bin/train_glow_tts.py | 11 +++++++---- TTS/bin/train_speedy_speech.py | 11 +++++++---- TTS/bin/train_tacotron.py | 7 ++++++- 3 files changed, 20 insertions(+), 9 deletions(-) diff --git a/TTS/bin/train_glow_tts.py b/TTS/bin/train_glow_tts.py index 9db2381e..a12c5581 100644 --- a/TTS/bin/train_glow_tts.py +++ b/TTS/bin/train_glow_tts.py @@ -268,7 +268,7 @@ def train(data_loader, model, criterion, optimizer, scheduler, if global_step % c.save_step == 0: if c.checkpoint: # save model - save_checkpoint(model, optimizer, global_step, epoch, 1, OUT_PATH, + save_checkpoint(model, optimizer, global_step, epoch, 1, OUT_PATH, model_characters, model_loss=loss_dict['loss']) # wait all kernels to be completed @@ -467,7 +467,7 @@ def evaluate(data_loader, model, criterion, ap, global_step, epoch): def main(args): # pylint: disable=redefined-outer-name # pylint: disable=global-variable-undefined - global meta_data_train, meta_data_eval, symbols, phonemes, speaker_mapping + global meta_data_train, meta_data_eval, symbols, phonemes, model_characters, speaker_mapping # Audio processor ap = AudioProcessor(**c.audio) if 'characters' in c.keys(): @@ -477,7 +477,10 @@ def main(args): # pylint: disable=redefined-outer-name if num_gpus > 1: init_distributed(args.rank, num_gpus, args.group_id, c.distributed["backend"], c.distributed["url"]) - num_chars = len(phonemes) if c.use_phonemes else len(symbols) + + # set model characters + model_characters = phonemes if c.use_phonemes else symbols + num_chars = len(model_characters) # load data instances meta_data_train, meta_data_eval = load_meta_data(c.datasets) @@ -559,7 +562,7 @@ def main(args): # pylint: disable=redefined-outer-name if c.run_eval: target_loss = eval_avg_loss_dict['avg_loss'] best_loss = save_best_model(target_loss, best_loss, model, optimizer, global_step, epoch, c.r, - OUT_PATH) + OUT_PATH, model_characters) if __name__ == '__main__': diff --git a/TTS/bin/train_speedy_speech.py b/TTS/bin/train_speedy_speech.py index a9a83bbf..1f32c8f6 100644 --- a/TTS/bin/train_speedy_speech.py +++ b/TTS/bin/train_speedy_speech.py @@ -247,7 +247,7 @@ def train(data_loader, model, criterion, optimizer, scheduler, if global_step % c.save_step == 0: if c.checkpoint: # save model - save_checkpoint(model, optimizer, global_step, epoch, 1, OUT_PATH, + save_checkpoint(model, optimizer, global_step, epoch, 1, OUT_PATH, model_characters, model_loss=loss_dict['loss']) # wait all kernels to be completed @@ -431,7 +431,7 @@ def evaluate(data_loader, model, criterion, ap, global_step, epoch): # FIXME: move args definition/parsing inside of main? 
def main(args): # pylint: disable=redefined-outer-name # pylint: disable=global-variable-undefined - global meta_data_train, meta_data_eval, symbols, phonemes, speaker_mapping + global meta_data_train, meta_data_eval, symbols, phonemes, model_characters, speaker_mapping # Audio processor ap = AudioProcessor(**c.audio) if 'characters' in c.keys(): @@ -441,7 +441,10 @@ def main(args): # pylint: disable=redefined-outer-name if num_gpus > 1: init_distributed(args.rank, num_gpus, args.group_id, c.distributed["backend"], c.distributed["url"]) - num_chars = len(phonemes) if c.use_phonemes else len(symbols) + + # set model characters + model_characters = phonemes if c.use_phonemes else symbols + num_chars = len(model_characters) # load data instances meta_data_train, meta_data_eval = load_meta_data(c.datasets, eval_split=True) @@ -523,7 +526,7 @@ def main(args): # pylint: disable=redefined-outer-name target_loss = eval_avg_loss_dict['avg_loss'] best_loss = save_best_model(target_loss, best_loss, model, optimizer, global_step, epoch, c.r, - OUT_PATH) + OUT_PATH, model_characters) if __name__ == '__main__': diff --git a/TTS/bin/train_tacotron.py b/TTS/bin/train_tacotron.py index 0a53f2a1..a9c0881f 100644 --- a/TTS/bin/train_tacotron.py +++ b/TTS/bin/train_tacotron.py @@ -284,6 +284,7 @@ def train(data_loader, model, criterion, optimizer, optimizer_st, scheduler, save_checkpoint(model, optimizer, global_step, epoch, model.decoder.r, OUT_PATH, optimizer_st=optimizer_st, model_loss=loss_dict['postnet_loss'], + characters=model_characters, scaler=scaler.state_dict() if c.mixed_precision else None) # Diagnostic visualizations @@ -492,9 +493,11 @@ def evaluate(data_loader, model, criterion, ap, global_step, epoch): def main(args): # pylint: disable=redefined-outer-name # pylint: disable=global-variable-undefined - global meta_data_train, meta_data_eval, symbols, phonemes, speaker_mapping + global meta_data_train, meta_data_eval, speaker_mapping, symbols, phonemes, model_characters # Audio processor ap = AudioProcessor(**c.audio) + + # setup custom characters if set in config file. if 'characters' in c.keys(): symbols, phonemes = make_symbols(**c.characters) @@ -503,6 +506,7 @@ def main(args): # pylint: disable=redefined-outer-name init_distributed(args.rank, num_gpus, args.group_id, c.distributed["backend"], c.distributed["url"]) num_chars = len(phonemes) if c.use_phonemes else len(symbols) + model_characters = phonemes if c.use_phonemes else symbols # load data instances meta_data_train, meta_data_eval = load_meta_data(c.datasets) @@ -634,6 +638,7 @@ def main(args): # pylint: disable=redefined-outer-name epoch, c.r, OUT_PATH, + model_characters, scaler=scaler.state_dict() if c.mixed_precision else None ) From 918f007a11cc9fcd603bbd398fb3718be5f8b1b7 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Eren=20G=C3=B6lge?= Date: Fri, 12 Feb 2021 12:04:07 +0000 Subject: [PATCH 065/100] docstring update --- TTS/tts/datasets/TTSDataset.py | 1 + 1 file changed, 1 insertion(+) diff --git a/TTS/tts/datasets/TTSDataset.py b/TTS/tts/datasets/TTSDataset.py index 3b327cbc..16329ad7 100644 --- a/TTS/tts/datasets/TTSDataset.py +++ b/TTS/tts/datasets/TTSDataset.py @@ -39,6 +39,7 @@ class MyDataset(Dataset): compute_linear_spec (bool): compute linear spectrogram if True. ap (TTS.tts.utils.AudioProcessor): audio processor object. meta_data (list): list of dataset instances. + tp (dict): dict of custom text characters used for converting texts to sequences. 
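Upstream of the dataset, PATCH 064 derives the active character set once and threads it through the trainers: it sizes the input embedding, is handed to the dataset as `tp`, and now travels with every checkpoint. A compact sketch of that derivation, with abbreviated stand-in symbol lists:

# abbreviated stand-ins for TTS.tts.utils.text.symbols.symbols / phonemes
symbols = ["_", "~", "^"] + list("abcdefghijklmnopqrstuvwxyz")
phonemes = ["_", "~", "^"] + list("iyɨʉɯu")

use_phonemes = True  # c.use_phonemes in the real configs

# mirrors `model_characters = phonemes if c.use_phonemes else symbols`
model_characters = phonemes if use_phonemes else symbols
num_chars = len(model_characters)  # sizes the embedding layer, which is why
print(num_chars)                   # the exact set must travel with the model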
batch_group_size (int): (0) range of batch randomization after sorting sequences by length. min_seq_len (int): (0) minimum sequence length to be processed From 2abfff17f928789576884dea72b88127b73cb71c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Eren=20G=C3=B6lge?= Date: Fri, 12 Feb 2021 12:04:41 +0000 Subject: [PATCH 066/100] enable saving model characters in io.py --- TTS/tts/utils/io.py | 26 ++++++++++++++++++++------ 1 file changed, 20 insertions(+), 6 deletions(-) diff --git a/TTS/tts/utils/io.py b/TTS/tts/utils/io.py index 63e04283..fe94d98d 100644 --- a/TTS/tts/utils/io.py +++ b/TTS/tts/utils/io.py @@ -38,7 +38,15 @@ def load_checkpoint(model, checkpoint_path, amp=None, use_cuda=False, eval=False return model, state -def save_model(model, optimizer, current_step, epoch, r, output_path, amp_state_dict=None, **kwargs): +def save_model(model, + optimizer, + current_step, + epoch, + r, + output_path, + characters, + amp_state_dict=None, + **kwargs): """Save ```TTS.tts.models``` states with extra fields. Args: @@ -48,6 +56,7 @@ def save_model(model, optimizer, current_step, epoch, r, output_path, amp_state_ epoch (int): current number of training epochs. r (int): model reduction rate for Tacotron models. output_path (str): output path to save the model file. + characters (list): list of characters used in the model. amp_state_dict (state_dict, optional): Apex.amp state dict if Apex is enabled. Defaults to None. """ if hasattr(model, 'module'): @@ -60,7 +69,8 @@ def save_model(model, optimizer, current_step, epoch, r, output_path, amp_state_ 'step': current_step, 'epoch': epoch, 'date': datetime.date.today().strftime("%B %d, %Y"), - 'r': r + 'r': r, + 'characters': characters } if amp_state_dict: state['amp'] = amp_state_dict @@ -68,7 +78,8 @@ def save_model(model, optimizer, current_step, epoch, r, output_path, amp_state_ torch.save(state, output_path) -def save_checkpoint(model, optimizer, current_step, epoch, r, output_folder, **kwargs): +def save_checkpoint(model, optimizer, current_step, epoch, r, output_folder, + characters, **kwargs): """Save model checkpoint, intended for saving checkpoints at training. Args: @@ -78,14 +89,16 @@ def save_checkpoint(model, optimizer, current_step, epoch, r, output_folder, **k epoch (int): current number of training epochs. r (int): model reduction rate for Tacotron models. output_path (str): output path to save the model file. + characters (list): list of characters used in the model. """ file_name = 'checkpoint_{}.pth.tar'.format(current_step) checkpoint_path = os.path.join(output_folder, file_name) print(" > CHECKPOINT : {}".format(checkpoint_path)) - save_model(model, optimizer, current_step, epoch, r, checkpoint_path, **kwargs) + save_model(model, optimizer, current_step, epoch, r, checkpoint_path, characters, **kwargs) -def save_best_model(target_loss, best_loss, model, optimizer, current_step, epoch, r, output_folder, **kwargs): +def save_best_model(target_loss, best_loss, model, optimizer, current_step, + epoch, r, output_folder, characters, **kwargs): """Save model checkpoint, intended for saving the best model after each epoch. It compares the current model loss with the best loss so far and saves the model if the current loss is better. @@ -99,6 +112,7 @@ def save_best_model(target_loss, best_loss, model, optimizer, current_step, epoc epoch (int): current number of training epochs. r (int): model reduction rate for Tacotron models. output_path (str): output path to save the model file. 
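With the `characters` field added above, a checkpoint can rebuild its exact input symbol table at load time. A self-contained sketch of the round trip with a stand-in model (the real state dict also carries the optimizer and, optionally, the AMP scaler):

import datetime

import torch

model = torch.nn.Linear(4, 4)  # stand-in for a TTS model
state = {
    "model": model.state_dict(),
    "step": 1000,
    "epoch": 3,
    "date": datetime.date.today().strftime("%B %d, %Y"),
    "r": 2,
    "characters": ["_", "~", "^", "a", "b", "c"],  # saved alongside the weights
}
torch.save(state, "checkpoint_1000.pth.tar")

# at load time the stored set can rebuild the exact input symbol table
restored = torch.load("checkpoint_1000.pth.tar", map_location="cpu")
assert restored["characters"] == state["characters"]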
+ characters (list): list of characters used in the model. Returns: float: updated current best loss. @@ -107,6 +121,6 @@ def save_best_model(target_loss, best_loss, model, optimizer, current_step, epoc file_name = 'best_model.pth.tar' checkpoint_path = os.path.join(output_folder, file_name) print(" >> BEST MODEL : {}".format(checkpoint_path)) - save_model(model, optimizer, current_step, epoch, r, checkpoint_path, model_loss=target_loss, **kwargs) + save_model(model, optimizer, current_step, epoch, r, checkpoint_path, characters, model_loss=target_loss, **kwargs) best_loss = target_loss return best_loss From 593cedee145a16c80b0cf6a23dab95142e71a3e9 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Eren=20G=C3=B6lge?= Date: Fri, 12 Feb 2021 12:05:56 +0000 Subject: [PATCH 067/100] parse_characters function --- TTS/tts/utils/text/symbols.py | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/TTS/tts/utils/text/symbols.py b/TTS/tts/utils/text/symbols.py index 7a741a8f..e82967fb 100644 --- a/TTS/tts/utils/text/symbols.py +++ b/TTS/tts/utils/text/symbols.py @@ -41,6 +41,16 @@ symbols, phonemes = make_symbols(_characters, _phonemes, _punctuations, _pad, _e # from random import shuffle # shuffle(phonemes) + +def parse_symbols(): + return {'pad': _pad, + 'eos': _eos, + 'bos': _bos, + 'characters': _characters, + 'punctuations': _punctuations, + 'phonemes': _phonemes} + + if __name__ == '__main__': print(" > TTS symbols {}".format(len(symbols))) print(symbols) From 7ab527d17e6ea91f6c7f9919a59bd4b98148eaa5 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Eren=20G=C3=B6lge?= Date: Fri, 12 Feb 2021 12:06:46 +0000 Subject: [PATCH 068/100] save default model chars to the training config file --- TTS/utils/arguments.py | 71 ++++++++++++++++++++---------------------- TTS/utils/io.py | 2 +- 2 files changed, 34 insertions(+), 39 deletions(-) diff --git a/TTS/utils/arguments.py b/TTS/utils/arguments.py index 948c90d3..031a3140 100644 --- a/TTS/utils/arguments.py +++ b/TTS/utils/arguments.py @@ -3,17 +3,18 @@ """Argument parser for training scripts.""" import argparse -import re import glob import os - -from TTS.utils.generic_utils import ( - create_experiment_folder, get_git_branch) -from TTS.utils.console_logger import ConsoleLogger -from TTS.utils.io import copy_model_files, load_config -from TTS.utils.tensorboard_logger import TensorboardLogger +import re +import json from TTS.tts.utils.generic_utils import check_config_tts +from TTS.utils.console_logger import ConsoleLogger +from TTS.utils.generic_utils import create_experiment_folder, get_git_branch +from TTS.utils.io import (copy_model_files, load_config, + save_characters_to_config) +from TTS.utils.tensorboard_logger import TensorboardLogger +from TTS.tts.utils.text.symbols import parse_symbols def parse_arguments(argv): @@ -110,38 +111,27 @@ def get_last_checkpoint(path): def process_args(args, model_type): """Process parsed comand line arguments. - Parameters - ---------- - args : argparse.Namespace or dict like - Parsed input arguments. - model_type : str - Model type used to check config parameters and setup the TensorBoard - logger. One of: - - tacotron - - glow_tts - - speedy_speech - - gan - - wavegrad - - wavernn + Args: + args (argparse.Namespace or dict like): Parsed input arguments. + model_type (str): Model type used to check config parameters and setup the TensorBoard + logger. One of: + - tacotron + - glow_tts + - speedy_speech + - gan + - wavegrad + - wavernn - Raises - ------ - ValueError - If `model_type` is not one of implemented choices. 
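PATCH 067 and PATCH 068 combine so that a config without a `characters` block gets the default symbol set written back on the first run; the `ensure_ascii=False` change to copy_model_files below then keeps IPA phonemes human-readable in the copied config.json. A sketch of that persistence step, with an abbreviated stand-in for parse_symbols():

import json


def parse_symbols_sketch():
    # abbreviated stand-in for TTS.tts.utils.text.symbols.parse_symbols()
    return {"pad": "_", "eos": "~", "bos": "^",
            "characters": "ABCabc", "punctuations": "!'(),-.:;? ",
            "phonemes": "iyɨʉ"}


c = {"model": "Tacotron2"}  # user config without a 'characters' block
new_fields = {}
if "characters" not in c:
    new_fields["characters"] = parse_symbols_sketch()

# ensure_ascii=False keeps the non-ASCII phonemes readable in the output file
line = '"{}":{},\n'.format("characters",
                           json.dumps(new_fields["characters"], ensure_ascii=False))
print(line)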
From 7ab527d17e6ea91f6c7f9919a59bd4b98148eaa5 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Eren=20G=C3=B6lge?=
Date: Fri, 12 Feb 2021 12:06:46 +0000
Subject: [PATCH 068/100] save default model chars to the training config file

---
 TTS/utils/arguments.py | 71 ++++++++++++++++++++----------------------
 TTS/utils/io.py        |  2 +-
 2 files changed, 34 insertions(+), 39 deletions(-)

diff --git a/TTS/utils/arguments.py b/TTS/utils/arguments.py
index 948c90d3..031a3140 100644
--- a/TTS/utils/arguments.py
+++ b/TTS/utils/arguments.py
@@ -3,17 +3,18 @@
 """Argument parser for training scripts."""

 import argparse
-import re
 import glob
 import os
-
-from TTS.utils.generic_utils import (
-    create_experiment_folder, get_git_branch)
-from TTS.utils.console_logger import ConsoleLogger
-from TTS.utils.io import copy_model_files, load_config
-from TTS.utils.tensorboard_logger import TensorboardLogger
+import re
+import json

 from TTS.tts.utils.generic_utils import check_config_tts
+from TTS.utils.console_logger import ConsoleLogger
+from TTS.utils.generic_utils import create_experiment_folder, get_git_branch
+from TTS.utils.io import (copy_model_files, load_config,
+                          save_characters_to_config)
+from TTS.utils.tensorboard_logger import TensorboardLogger
+from TTS.tts.utils.text.symbols import parse_symbols


 def parse_arguments(argv):
@@ -110,38 +111,27 @@ def get_last_checkpoint(path):
 def process_args(args, model_type):
     """Process parsed command line arguments.

-    Parameters
-    ----------
-    args : argparse.Namespace or dict like
-        Parsed input arguments.
-    model_type : str
-        Model type used to check config parameters and setup the TensorBoard
-        logger. One of:
-            - tacotron
-            - glow_tts
-            - speedy_speech
-            - gan
-            - wavegrad
-            - wavernn
+    Args:
+        args (argparse.Namespace or dict like): Parsed input arguments.
+        model_type (str): Model type used to check config parameters and setup the TensorBoard
+            logger. One of:
+                - tacotron
+                - glow_tts
+                - speedy_speech
+                - gan
+                - wavegrad
+                - wavernn

-    Raises
-    ------
-    ValueError
-        If `model_type` is not one of implemented choices.
-
-    Returns
-    -------
-    c : TTS.utils.io.AttrDict
-        Config paramaters.
-    out_path : str
-        Path to save models and logging.
-    audio_path : str
-        Path to save generated test audios.
-    c_logger : TTS.utils.console_logger.ConsoleLogger
-        Class that does logging to the console.
-    tb_logger : TTS.utils.tensorboard.TensorboardLogger
-        Class that does the TensorBoard loggind.
+    Raises:
+        ValueError
+            If `model_type` is not one of implemented choices.

+    Returns:
+        c (TTS.utils.io.AttrDict): Config parameters.
+        out_path (str): Path to save models and logging.
+        audio_path (str): Path to save generated test audios.
+        c_logger (TTS.utils.console_logger.ConsoleLogger): Class that does logging to the console.
+        tb_logger (TTS.utils.tensorboard.TensorboardLogger): Class that does the TensorBoard logging.
     """
     if args.continue_path != "":
         args.output_path = args.continue_path
@@ -156,7 +146,6 @@ def process_args(args, model_type):

     # setup output paths and read configs
     c = load_config(args.config_path)
-
     if model_type in "tacotron glow_tts speedy_speech":
         model_class = "TTS"
     elif model_type in "gan wavegrad wavernn":
@@ -192,6 +181,12 @@ def process_args(args, model_type):
         if args.restore_path:
             new_fields["restore_path"] = args.restore_path
         new_fields["github_branch"] = get_git_branch()
+        # if model characters are not set in the config file
+        # save the default set to the config file for future
+        # compatibility.
+        if model_class == 'TTS' and not 'characters' in c.keys():
+            used_characters = parse_symbols()
+            new_fields['characters'] = used_characters
         copy_model_files(c, args.config_path,
                          out_path, new_fields)
         os.chmod(audio_path, 0o775)
diff --git a/TTS/utils/io.py b/TTS/utils/io.py
index 46abf1c8..1148e0fe 100644
--- a/TTS/utils/io.py
+++ b/TTS/utils/io.py
@@ -67,7 +67,7 @@ def copy_model_files(c, config_file, out_path, new_fields):
         if isinstance(value, str):
             new_line = '"{}":"{}",\n'.format(key, value)
         else:
-            new_line = '"{}":{},\n'.format(key, value)
+            new_line = '"{}":{},\n'.format(key, json.dumps(value, ensure_ascii=False))
         config_lines.insert(1, new_line)
     config_out_file = open(copy_config_path, "w")
     config_out_file.writelines(config_lines)

From b28c724c0458b3565507508a4394ed146343bbe4 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Eren=20G=C3=B6lge?=
Date: Fri, 12 Feb 2021 12:10:57 +0000
Subject: [PATCH 069/100] remove _phoneme_punctuations

---
 TTS/tts/utils/text/__init__.py | 4 ++--
 TTS/tts/utils/text/symbols.py  | 7 ++++---
 2 files changed, 6 insertions(+), 5 deletions(-)

diff --git a/TTS/tts/utils/text/__init__.py b/TTS/tts/utils/text/__init__.py
index 23c5ab5f..90df61c4 100644
--- a/TTS/tts/utils/text/__init__.py
+++ b/TTS/tts/utils/text/__init__.py
@@ -6,7 +6,7 @@ import phonemizer
 from packaging import version
 from phonemizer.phonemize import phonemize
 from TTS.tts.utils.text import cleaners
-from TTS.tts.utils.text.symbols import (_bos, _eos, _phoneme_punctuations,
+from TTS.tts.utils.text.symbols import (_bos, _eos, _punctuations,
                                         make_symbols, phonemes, symbols)

@@ -24,7 +24,7 @@ _phonemes = phonemes
 _CURLY_RE = re.compile(r'(.*?)\{(.+?)\}(.*)')

 # Regular expression matching punctuations, ignoring empty space
-PHONEME_PUNCTUATION_PATTERN = r'['+_phoneme_punctuations+']+'
+PHONEME_PUNCTUATION_PATTERN = r'['+_punctuations+']+'


 def text2phone(text, language):
diff --git a/TTS/tts/utils/text/symbols.py b/TTS/tts/utils/text/symbols.py
index e82967fb..4bd1d6c2 100644
--- a/TTS/tts/utils/text/symbols.py
+++ b/TTS/tts/utils/text/symbols.py
@@ -5,6 +5,8 @@ Defines the set of symbols used in text input to
the model. The default is a set of ASCII characters that works well for English or text that has been run through Unidecode. For other data, you can modify _characters. See TRAINING_DATA.md for details. ''' + + def make_symbols(characters, phonemes=None, punctuations='!\'(),-.:;? ', pad='_', eos='~', bos='^'):# pylint: disable=redefined-outer-name ''' Function to create symbols and phonemes ''' _symbols = [pad, eos, bos] + list(characters) @@ -18,15 +20,14 @@ def make_symbols(characters, phonemes=None, punctuations='!\'(),-.:;? ', pad='_' _symbols += _arpabet return _symbols, _phonemes - _pad = '_' _eos = '~' _bos = '^' _characters = 'ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz!\'(),-.:;? ' _punctuations = '!\'(),-.:;? ' -_phoneme_punctuations = '.!;:,?' +# _phoneme_punctuations = '.!;:,?' -# Phonemes definition +# Phonemes definition (All IPA characters) _vowels = 'iyɨʉɯuɪʏʊeøɘəɵɤoɛœɜɞʌɔæɐaɶɑɒᵻ' _non_pulmonic_consonants = 'ʘɓǀɗǃʄǂɠǁʛ' _pulmonic_consonants = 'pbtdʈɖcɟkɡqɢʔɴŋɲɳnɱmʙrʀⱱɾɽɸβfvθðszʃʒʂʐçʝxɣχʁħʕhɦɬɮʋɹɻjɰlɭʎʟ' From 4244096ccb1f53dced2db91b13c6709a5ae6c356 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Eren=20G=C3=B6lge?= Date: Fri, 12 Feb 2021 14:07:26 +0000 Subject: [PATCH 070/100] update test_text_processing for espeak-ng --- TTS/tts/utils/text/__init__.py | 2 +- tests/test_text_processing.py | 28 ++++++++++++++-------------- 2 files changed, 15 insertions(+), 15 deletions(-) diff --git a/TTS/tts/utils/text/__init__.py b/TTS/tts/utils/text/__init__.py index 90df61c4..9771e691 100644 --- a/TTS/tts/utils/text/__init__.py +++ b/TTS/tts/utils/text/__init__.py @@ -24,7 +24,7 @@ _phonemes = phonemes _CURLY_RE = re.compile(r'(.*?)\{(.+?)\}(.*)') # Regular expression matching punctuations, ignoring empty space -PHONEME_PUNCTUATION_PATTERN = r'['+_punctuations+']+' +PHONEME_PUNCTUATION_PATTERN = r'['+_punctuations.replace(' ', '')+']+' def text2phone(text, language): diff --git a/tests/test_text_processing.py b/tests/test_text_processing.py index 61a83fa1..2ea8e8f9 100644 --- a/tests/test_text_processing.py +++ b/tests/test_text_processing.py @@ -19,7 +19,7 @@ def test_phoneme_to_sequence(): text_hat = sequence_to_phoneme(sequence) sequence_with_params = phoneme_to_sequence(text, text_cleaner, lang, tp=conf.characters) text_hat_with_params = sequence_to_phoneme(sequence, tp=conf.characters) - gt = "ɹiːsənt ɹɪsɜːtʃ æt hɑːɹvɚd hɐz ʃoʊn mɛdᵻteɪɾɪŋ fɔːɹ æz lɪɾəl æz eɪt wiːks kæn æktʃuːəli ɪnkɹiːs, ðə ɡɹeɪ mæɾɚɹ ɪnðə pɑːɹts ʌvðə bɹeɪn ɹɪspɑːnsəbəl fɔːɹ ɪmoʊʃənəl ɹɛɡjuːleɪʃən ænd lɜːnɪŋ!" + gt = 'ɹiːsənt ɹᵻsɜːtʃ æt hɑːɹvɚd hɐz ʃoʊn mɛdᵻteɪɾɪŋ fɔːɹ æz lɪɾəl æz eɪt wiːks kæn æktʃuːəli ɪŋkɹiːs, ðə ɡɹeɪ mæɾɚɹ ɪnðə pɑːɹts ʌvðə bɹeɪn ɹᵻspɑːnsᵻbəl fɔːɹ ɪmoʊʃənəl ɹɛɡjʊleɪʃən ænd lɜːnɪŋ!' assert text_hat == text_hat_with_params == gt # multiple punctuations @@ -28,7 +28,7 @@ def test_phoneme_to_sequence(): text_hat = sequence_to_phoneme(sequence) _ = phoneme_to_sequence(text, text_cleaner, lang, tp=conf.characters) text_hat_with_params = sequence_to_phoneme(sequence, tp=conf.characters) - gt = "biː ɐ vɔɪs, nɑːt ɐn! ɛkoʊ?" + gt = "biː ɐ vɔɪs, nɑːt æn! ɛkoʊ?" print(text_hat) print(len(sequence)) assert text_hat == text_hat_with_params == gt @@ -39,7 +39,7 @@ def test_phoneme_to_sequence(): text_hat = sequence_to_phoneme(sequence) _ = phoneme_to_sequence(text, text_cleaner, lang, tp=conf.characters) text_hat_with_params = sequence_to_phoneme(sequence, tp=conf.characters) - gt = "biː ɐ vɔɪs, nɑːt ɐn! ɛkoʊ" + gt = "biː ɐ vɔɪs, nɑːt æn! 
ɛkoʊ" print(text_hat) print(len(sequence)) assert text_hat == text_hat_with_params == gt @@ -61,7 +61,7 @@ def test_phoneme_to_sequence(): text_hat = sequence_to_phoneme(sequence) _ = phoneme_to_sequence(text, text_cleaner, lang, tp=conf.characters) text_hat_with_params = sequence_to_phoneme(sequence, tp=conf.characters) - gt = "biː ɐ vɔɪs, nɑːt ɐn! ɛkoʊ." + gt = "biː ɐ vɔɪs, nɑːt æn! ɛkoʊ." print(text_hat) print(len(sequence)) assert text_hat == text_hat_with_params == gt @@ -72,7 +72,7 @@ def test_phoneme_to_sequence(): text_hat = sequence_to_phoneme(sequence) _ = phoneme_to_sequence(text, text_cleaner, lang, tp=conf.characters) text_hat_with_params = sequence_to_phoneme(sequence, tp=conf.characters) - gt = "^biː ɐ vɔɪs, nɑːt ɐn! ɛkoʊ.~" + gt = "^biː ɐ vɔɪs, nɑːt æn! ɛkoʊ.~" print(text_hat) print(len(sequence)) assert text_hat == text_hat_with_params == gt @@ -83,7 +83,7 @@ def test_phoneme_to_sequence(): text_hat = sequence_to_phoneme(sequence) _ = phoneme_to_sequence(text, text_cleaner, lang, tp=conf.characters) text_hat_with_params = sequence_to_phoneme(sequence, tp=conf.characters) - gt = "biː ɐ vɔɪs, nɑːt ɐn! ɛkoʊ" + gt = "biː ɐ vɔɪs, nɑːt æn! ɛkoʊ" print(text_hat) print(len(sequence)) assert text_hat == text_hat_with_params == gt @@ -97,7 +97,7 @@ def test_phoneme_to_sequence_with_blank_token(): text_hat = sequence_to_phoneme(sequence) _ = phoneme_to_sequence(text, text_cleaner, lang, tp=conf.characters, add_blank=True) text_hat_with_params = sequence_to_phoneme(sequence, tp=conf.characters, add_blank=True) - gt = "ɹiːsənt ɹɪsɜːtʃ æt hɑːɹvɚd hɐz ʃoʊn mɛdᵻteɪɾɪŋ fɔːɹ æz lɪɾəl æz eɪt wiːks kæn æktʃuːəli ɪnkɹiːs, ðə ɡɹeɪ mæɾɚɹ ɪnðə pɑːɹts ʌvðə bɹeɪn ɹɪspɑːnsəbəl fɔːɹ ɪmoʊʃənəl ɹɛɡjuːleɪʃən ænd lɜːnɪŋ!" + gt = "ɹiːsənt ɹᵻsɜːtʃ æt hɑːɹvɚd hɐz ʃoʊn mɛdᵻteɪɾɪŋ fɔːɹ æz lɪɾəl æz eɪt wiːks kæn æktʃuːəli ɪŋkɹiːs, ðə ɡɹeɪ mæɾɚɹ ɪnðə pɑːɹts ʌvðə bɹeɪn ɹᵻspɑːnsᵻbəl fɔːɹ ɪmoʊʃənəl ɹɛɡjʊleɪʃən ænd lɜːnɪŋ!" assert text_hat == text_hat_with_params == gt # multiple punctuations @@ -106,7 +106,7 @@ def test_phoneme_to_sequence_with_blank_token(): text_hat = sequence_to_phoneme(sequence) _ = phoneme_to_sequence(text, text_cleaner, lang, tp=conf.characters, add_blank=True) text_hat_with_params = sequence_to_phoneme(sequence, tp=conf.characters, add_blank=True) - gt = "biː ɐ vɔɪs, nɑːt ɐn! ɛkoʊ?" + gt = 'biː ɐ vɔɪs, nɑːt æn! ɛkoʊ?' print(text_hat) print(len(sequence)) assert text_hat == text_hat_with_params == gt @@ -117,7 +117,7 @@ def test_phoneme_to_sequence_with_blank_token(): text_hat = sequence_to_phoneme(sequence) _ = phoneme_to_sequence(text, text_cleaner, lang, tp=conf.characters, add_blank=True) text_hat_with_params = sequence_to_phoneme(sequence, tp=conf.characters, add_blank=True) - gt = "biː ɐ vɔɪs, nɑːt ɐn! ɛkoʊ" + gt = 'biː ɐ vɔɪs, nɑːt æn! ɛkoʊ' print(text_hat) print(len(sequence)) assert text_hat == text_hat_with_params == gt @@ -128,7 +128,7 @@ def test_phoneme_to_sequence_with_blank_token(): text_hat = sequence_to_phoneme(sequence) _ = phoneme_to_sequence(text, text_cleaner, lang, tp=conf.characters, add_blank=True) text_hat_with_params = sequence_to_phoneme(sequence, tp=conf.characters, add_blank=True) - gt = "biː ɐ vɔɪs, nɑːt ɐn ɛkoʊ!" + gt = 'biː ɐ vɔɪs, nɑːt ɐn ɛkoʊ!' 
print(text_hat) print(len(sequence)) assert text_hat == text_hat_with_params == gt @@ -139,7 +139,7 @@ def test_phoneme_to_sequence_with_blank_token(): text_hat = sequence_to_phoneme(sequence) _ = phoneme_to_sequence(text, text_cleaner, lang, tp=conf.characters, add_blank=True) text_hat_with_params = sequence_to_phoneme(sequence, tp=conf.characters, add_blank=True) - gt = "biː ɐ vɔɪs, nɑːt ɐn! ɛkoʊ." + gt = 'biː ɐ vɔɪs, nɑːt æn! ɛkoʊ.' print(text_hat) print(len(sequence)) assert text_hat == text_hat_with_params == gt @@ -150,7 +150,7 @@ def test_phoneme_to_sequence_with_blank_token(): text_hat = sequence_to_phoneme(sequence) _ = phoneme_to_sequence(text, text_cleaner, lang, tp=conf.characters, add_blank=True) text_hat_with_params = sequence_to_phoneme(sequence, tp=conf.characters, add_blank=True) - gt = "^biː ɐ vɔɪs, nɑːt ɐn! ɛkoʊ.~" + gt = "^biː ɐ vɔɪs, nɑːt æn! ɛkoʊ.~" print(text_hat) print(len(sequence)) assert text_hat == text_hat_with_params == gt @@ -161,14 +161,14 @@ def test_phoneme_to_sequence_with_blank_token(): text_hat = sequence_to_phoneme(sequence) _ = phoneme_to_sequence(text, text_cleaner, lang, tp=conf.characters, add_blank=True) text_hat_with_params = sequence_to_phoneme(sequence, tp=conf.characters, add_blank=True) - gt = "biː ɐ vɔɪs, nɑːt ɐn! ɛkoʊ" + gt = "biː ɐ vɔɪs, nɑːt æn! ɛkoʊ" print(text_hat) print(len(sequence)) assert text_hat == text_hat_with_params == gt def test_text2phone(): text = "Recent research at Harvard has shown meditating for as little as 8 weeks can actually increase, the grey matter in the parts of the brain responsible for emotional regulation and learning!" - gt = "ɹ|iː|s|ə|n|t| |ɹ|ɪ|s|ɜː|tʃ| |æ|t| |h|ɑːɹ|v|ɚ|d| |h|ɐ|z| |ʃ|oʊ|n| |m|ɛ|d|ᵻ|t|eɪ|ɾ|ɪ|ŋ| |f|ɔː|ɹ| |æ|z| |l|ɪ|ɾ|əl| |æ|z| |eɪ|t| |w|iː|k|s| |k|æ|n| |æ|k|tʃ|uː|əl|i| |ɪ|n|k|ɹ|iː|s|,| |ð|ə| |ɡ|ɹ|eɪ| |m|æ|ɾ|ɚ|ɹ| |ɪ|n|ð|ə| |p|ɑːɹ|t|s| |ʌ|v|ð|ə| |b|ɹ|eɪ|n| |ɹ|ɪ|s|p|ɑː|n|s|ə|b|əl| |f|ɔː|ɹ| |ɪ|m|oʊ|ʃ|ə|n|əl| |ɹ|ɛ|ɡ|j|uː|l|eɪ|ʃ|ə|n| |æ|n|d| |l|ɜː|n|ɪ|ŋ|!" + gt = 'ɹ|iː|s|ə|n|t| |ɹ|ᵻ|s|ɜː|tʃ| |æ|t| |h|ɑːɹ|v|ɚ|d| |h|ɐ|z| |ʃ|oʊ|n| |m|ɛ|d|ᵻ|t|eɪ|ɾ|ɪ|ŋ| |f|ɔː|ɹ| |æ|z| |l|ɪ|ɾ|əl| |æ|z| |eɪ|t| |w|iː|k|s| |k|æ|n| |æ|k|tʃ|uː|əl|i| |ɪ|ŋ|k|ɹ|iː|s|,| |ð|ə| |ɡ|ɹ|eɪ| |m|æ|ɾ|ɚ|ɹ| |ɪ|n|ð|ə| |p|ɑːɹ|t|s| |ʌ|v|ð|ə| |b|ɹ|eɪ|n| |ɹ|ᵻ|s|p|ɑː|n|s|ᵻ|b|əl| |f|ɔː|ɹ| |ɪ|m|oʊ|ʃ|ə|n|əl| |ɹ|ɛ|ɡ|j|ʊ|l|eɪ|ʃ|ə|n| |æ|n|d| |l|ɜː|n|ɪ|ŋ|!' 
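+    # (gt regenerated with espeak-ng; it can be re-derived by mirroring the
+    #  phonemize() call inside text2phone, roughly:
+    #      separator = phonemizer.separator.Separator(' |', '', '|')
+    #      phonemize(text, separator=separator, backend='espeak', language='en-us')
+    #  and then re-attaching punctuation the way text2phone does)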
lang = "en-us" ph = text2phone(text, lang) assert gt == ph From 420901f4c23a2f9882dcb544cca9261f8376ec18 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Eren=20G=C3=B6lge?= Date: Fri, 12 Feb 2021 14:41:17 +0000 Subject: [PATCH 071/100] linter fixes --- TTS/bin/find_unique_chars.py | 5 ++--- TTS/utils/arguments.py | 6 ++---- TTS/utils/manage.py | 3 ++- hubconf.py | 11 ++++++----- tests/test_demo_server.py | 2 +- 5 files changed, 13 insertions(+), 14 deletions(-) diff --git a/TTS/bin/find_unique_chars.py b/TTS/bin/find_unique_chars.py index e6c35878..654a3ff9 100644 --- a/TTS/bin/find_unique_chars.py +++ b/TTS/bin/find_unique_chars.py @@ -7,16 +7,15 @@ from TTS.tts.datasets.preprocess import get_preprocessor_by_name def main(): + # pylint: disable=bad-continuation parser = argparse.ArgumentParser(description='''Find all the unique characters or phonemes in a dataset.\n\n''' '''Target dataset must be defined in TTS.tts.datasets.preprocess\n\n'''\ - ''' Example runs: python TTS/bin/find_unique_chars.py --dataset ljspeech --meta_file /path/to/LJSpeech/metadata.csv - ''', - formatter_class=RawTextHelpFormatter) + ''', formatter_class=RawTextHelpFormatter) parser.add_argument( '--dataset', diff --git a/TTS/utils/arguments.py b/TTS/utils/arguments.py index 031a3140..7d8f4adf 100644 --- a/TTS/utils/arguments.py +++ b/TTS/utils/arguments.py @@ -6,15 +6,13 @@ import argparse import glob import os import re -import json from TTS.tts.utils.generic_utils import check_config_tts +from TTS.tts.utils.text.symbols import parse_symbols from TTS.utils.console_logger import ConsoleLogger from TTS.utils.generic_utils import create_experiment_folder, get_git_branch -from TTS.utils.io import (copy_model_files, load_config, - save_characters_to_config) +from TTS.utils.io import copy_model_files, load_config from TTS.utils.tensorboard_logger import TensorboardLogger -from TTS.tts.utils.text.symbols import parse_symbols def parse_arguments(argv): diff --git a/TTS/utils/manage.py b/TTS/utils/manage.py index 97cdf2b6..bd236dda 100644 --- a/TTS/utils/manage.py +++ b/TTS/utils/manage.py @@ -122,7 +122,8 @@ class ModelManager(object): """Download files from GDrive using their file ids""" gdown.download(f"{self.url_prefix}{gdrive_idx}", output=output, quiet=False) - def _download_zip_file(self, file_url, output): + @staticmethod + def _download_zip_file(file_url, output): """Download the target zip file and extract the files to a folder with the same name as the zip file.""" r = requests.get(file_url) diff --git a/hubconf.py b/hubconf.py index 7fc020b5..13549dfe 100644 --- a/hubconf.py +++ b/hubconf.py @@ -1,11 +1,11 @@ -dependencies = ['torch', 'gdown', 'pysbd', 'phonemizer', 'unidecode'] # apt install espeak +dependencies = ['torch', 'gdown', 'pysbd', 'phonemizer', 'unidecode'] # apt install espeak-ng import torch from TTS.utils.synthesizer import Synthesizer from TTS.utils.manage import ModelManager -def tts(model_name='tts_models/en/ljspeech/tacotron2-DCA', vocoder_name='vocoder_models/en/ljspeech/mulitband-melgan', use_cuda=False): +def tts(model_name='tts_models/en/ljspeech/tacotron2-DCA', vocoder_name=None, use_cuda=False): """TTS entry point for PyTorch Hub that provides a Synthesizer object to synthesize speech from a give text. Example: @@ -15,7 +15,7 @@ def tts(model_name='tts_models/en/ljspeech/tacotron2-DCA', vocoder_name='vocoder Args: model_name (str, optional): One of the model names from .model.json. Defaults to 'tts_models/en/ljspeech/tacotron2-DCA'. 
- vocoder_name (str, optional): One of the model names from .model.json. Defaults to 'vocoder_models/en/ljspeech/mulitband-melgan'. + vocoder_name (str, optional): One of the model names from .model.json. Defaults to 'vocoder_models/en/ljspeech/multiband-melgan'. pretrained (bool, optional): [description]. Defaults to True. Returns: @@ -23,8 +23,9 @@ def tts(model_name='tts_models/en/ljspeech/tacotron2-DCA', vocoder_name='vocoder """ manager = ModelManager() - model_path, config_path = manager.download_model(model_name) - vocoder_path, vocoder_config_path = manager.download_model(vocoder_name) + model_path, config_path, model_item = manager.download_model(model_name) + vocoder_name = model_item['default_vocoder'] if vocoder_name is None else vocoder_name + vocoder_path, vocoder_config_path, _ = manager.download_model(vocoder_name) # create synthesizer synt = Synthesizer(model_path, config_path, vocoder_path, vocoder_config_path, use_cuda) diff --git a/tests/test_demo_server.py b/tests/test_demo_server.py index bccff55d..1de3f558 100644 --- a/tests/test_demo_server.py +++ b/tests/test_demo_server.py @@ -21,7 +21,7 @@ class DemoServerTest(unittest.TestCase): num_chars = len(phonemes) if config.use_phonemes else len(symbols) model = setup_model(num_chars, 0, config) output_path = os.path.join(get_tests_output_path()) - save_checkpoint(model, None, 10, 10, 1, output_path) + save_checkpoint(model, None, 10, 10, 1, output_path, None) def test_in_out(self): self._create_random_model() From c613e0142fc4e648f989e6ead42659c41ba93b8a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Eren=20G=C3=B6lge?= Date: Fri, 12 Feb 2021 14:55:08 +0000 Subject: [PATCH 072/100] update ci to espeak-ng --- .circleci/config.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.circleci/config.yml b/.circleci/config.yml index 5f6db915..6570bad6 100644 --- a/.circleci/config.yml +++ b/.circleci/config.yml @@ -25,7 +25,7 @@ jobs: - checkout - run: | sudo apt update - sudo apt install espeak git + sudo apt install espeak-ng git - run: sudo pip install --upgrade pip - run: sudo pip install -e . 
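Stepping back to the hubconf.py rework in patch 071 above: leaving vocoder_name unset now resolves it from the model item's 'default_vocoder'. A hedged usage sketch (the 'mozilla/TTS:dev' repo/branch spec is an assumption; models download on first call):

    import torch

    # 'tts' is the entry point defined in hubconf.py above
    synthesizer = torch.hub.load('mozilla/TTS:dev', 'tts',
                                 model_name='tts_models/en/ljspeech/tacotron2-DCA')
    wav = synthesizer.tts("Hello from the hub entry point.")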
- run: | From 33bcdc6ff825e1175ef4c8869a7377deae34f362 Mon Sep 17 00:00:00 2001 From: nmstoker Date: Sun, 14 Feb 2021 23:44:05 +0000 Subject: [PATCH 073/100] Updating models list to include EK1 TTS/vocoder --- TTS/.models.json | 19 +++++++++++++++++++ 1 file changed, 19 insertions(+) diff --git a/TTS/.models.json b/TTS/.models.json index 12970797..385a5021 100644 --- a/TTS/.models.json +++ b/TTS/.models.json @@ -1,6 +1,16 @@ { "tts_models":{ "en":{ + "ek1":{ + "tacotron2": { + "description": "EK1 en-rp tacotron2 by NMStoker", + "model_file": "1OJ5sLYmB03dQAf1FcY06b5X-0hiR0SNZ", + "config_file": "1hSnodL--5AFJTWvlU96e0pCnCfNU3yM_", + "stats_file": null, + "default_vocoder": "vocoder_models/en/ek1/wavegrad", + "commit": "c802255" + } + }, "ljspeech":{ "glow-tts":{ "description": "", @@ -70,6 +80,15 @@ } }, "en": { + "ek1":{ + "wavegrad": { + "description": "EK1 en-rp wavegrad by NMStoker", + "model_file": "1ShaCSrQfSRjM66vo45Bgo019uJDDloLS", + "config_file": "1otnQR5yTfN5A77yMKmUSzwh_VNvYwKai", + "stats_file": null, + "commit": "c802255" + } + }, "ljspeech":{ "multiband-melgan":{ "model_file": "1Ty5DZdOc0F7OTGj9oJThYbL5iVu_2G0K", From 77e630348e5e87fddbfbebf38f98d1972869f8ed Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Eren=20G=C3=B6lge?= Date: Mon, 15 Feb 2021 11:02:21 +0000 Subject: [PATCH 074/100] author , license and contact info in .model.json --- TTS/.models.json | 40 ++++++++++++++++++++++++++++++++-------- 1 file changed, 32 insertions(+), 8 deletions(-) diff --git a/TTS/.models.json b/TTS/.models.json index 385a5021..05997461 100644 --- a/TTS/.models.json +++ b/TTS/.models.json @@ -18,7 +18,10 @@ "config_file": "1IAROF3yy9qTK43vG_-R67y3Py9yYbD6t", "stats_file": null, "default_vocoder": "vocoder_models/en/ljspeech/multiband-melgan", - "commit": "" + "commit": "", + "author": "Eren Gölge @erogol", + "license": "MPL", + "contact":"erengolge@gmail.com" }, "tacotron2-DCA": { "description": "", @@ -27,7 +30,10 @@ "stats_file": "1qevpGRVHPmzfiRBNuugLMX62x1k7B5vK", "github_rls_url": null, "default_vocoder": "vocoder_models/en/ljspeech/multiband-melgan", - "commit": "" + "commit": "", + "author": "Eren Gölge @erogol", + "license": "MPL", + "contact":"erengolge@gmail.com" }, "speedy-speech-wn":{ "description": "Speedy Speech model with wavenet decoder.", @@ -35,7 +41,10 @@ "config_file": "1KvZilhsNP3EumVggDcD46yd834eO5hR3", "stats_file": "1Ju7apZ5JlgsVECcETL-GEx3DRoNzWfkR", "default_vocoder": "vocoder_models/en/ljspeech/multiband-melgan", - "commit": "77b6145" + "commit": "77b6145", + "author": "Eren Gölge @erogol", + "license": "MPL", + "contact":"erengolge@gmail.com" } } }, @@ -46,7 +55,10 @@ "config_file": "1s7g4n-B73ChCB48AQ88_DV_8oyLth8r0", "stats_file": "13st0CZ743v6Br5R5Qw_lH1OPQOr3M-Jv", "default_vocoder": "vocoder_models/universal/libri-tts/fullband-melgan", - "commit": "" + "commit": "", + "author": "Eren Gölge @erogol", + "license": "MPL", + "contact":"erengolge@gmail.com" } } }, @@ -57,7 +69,10 @@ "config_file": "1yECKeP2LI7tNv4E8yVNx1yLmCfTCpkqG", "stats_file": "13st0CZ743v6Br5R5Qw_lH1OPQOr3M-Jv", "default_vocoder": "vocoder_models/universal/libri-tts/fullband-melgan", - "commit": "" + "commit": "", + "author": "Eren Gölge @erogol", + "license": "MPL", + "contact":"erengolge@gmail.com" } } } @@ -69,13 +84,19 @@ "model_file": "1r2g90JaZsfCj9dJkI9ioIU6JCFMPRqi6", "config_file": "1POrrLf5YEpZyjvWyMccj1nGCVc94mR6s", "stats_file": "1Vwbv4t-N1i3jXqI0bgKAhShAEO097sK0", - "commit": "ea976b0" + "commit": "ea976b0", + "author": "Eren Gölge @erogol", + "license": "MPL", + 
"contact":"erengolge@gmail.com" }, "fullband-melgan":{ "model_file": "1Ty5DZdOc0F7OTGj9oJThYbL5iVu_2G0K", "config_file": "1Rd0R_nRCrbjEdpOwq6XwZAktvugiBvmu", "stats_file": "11oY3Tv0kQtxK_JPgxrfesa99maVXHNxU", - "commit": "4132240" + "commit": "4132240", + "author": "Eren Gölge @erogol", + "license": "MPL", + "contact":"erengolge@gmail.com" } } }, @@ -94,7 +115,10 @@ "model_file": "1Ty5DZdOc0F7OTGj9oJThYbL5iVu_2G0K", "config_file": "1Rd0R_nRCrbjEdpOwq6XwZAktvugiBvmu", "stats_file": "11oY3Tv0kQtxK_JPgxrfesa99maVXHNxU", - "commit": "ea976b0" + "commit": "ea976b0", + "author": "Eren Gölge @erogol", + "license": "MPL", + "contact":"erengolge@gmail.com" } } } From dc3596dad4322f8e8e797ea9299807ad0914d3f4 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Eren=20G=C3=B6lge?= Date: Mon, 15 Feb 2021 11:29:22 +0000 Subject: [PATCH 075/100] model_manager tests --- TTS/utils/manage.py | 10 ++++++++-- tests/test_model_manager.py | 21 +++++++++++++++++++++ 2 files changed, 29 insertions(+), 2 deletions(-) create mode 100644 tests/test_model_manager.py diff --git a/TTS/utils/manage.py b/TTS/utils/manage.py index 97cdf2b6..02e515f3 100644 --- a/TTS/utils/manage.py +++ b/TTS/utils/manage.py @@ -21,9 +21,12 @@ class ModelManager(object): Args: models_file (str): path to .model.json """ - def __init__(self, models_file=None): + def __init__(self, models_file=None, output_prefix=None): super().__init__() - self.output_prefix = get_user_data_dir('tts') + if output_prefix is None: + self.output_prefix = get_user_data_dir('tts') + else: + self.output_prefix = os.path.join(output_prefix, 'tts') self.url_prefix = "https://drive.google.com/uc?id=" self.models_dict = None if models_file is not None: @@ -57,6 +60,7 @@ class ModelManager(object): def list_models(self): print(" Name format: type/language/dataset/model") + models_name_list = [] for model_type in self.models_dict: for lang in self.models_dict[model_type]: for dataset in self.models_dict[model_type][lang]: @@ -67,6 +71,8 @@ class ModelManager(object): print(f" >: {model_type}/{lang}/{dataset}/{model} [already downloaded]") else: print(f" >: {model_type}/{lang}/{dataset}/{model}") + models_name_list.append(f'{model_type}/{lang}/{dataset}/{model}') + return models_name_list def download_model(self, model_name): """Download model files given the full model name. 
diff --git a/tests/test_model_manager.py b/tests/test_model_manager.py
new file mode 100644
index 00000000..ae0a62b8
--- /dev/null
+++ b/tests/test_model_manager.py
@@ -0,0 +1,21 @@
+#!/usr/bin/env python3
+import os
+import shutil
+import glob
+import unittest
+from tests import get_tests_output_path
+from TTS.utils.manage import ModelManager
+
+
+def test_if_all_models_available():
+    """Check if all the models are downloadable."""
+    print(" > Checking the availability of all the models under the ModelManager.")
+    manager = ModelManager(output_prefix=get_tests_output_path())
+    model_names = manager.list_models()
+    for model_name in model_names:
+        manager.download_model(model_name)
+        print(f" | > OK: {model_name}")
+
+    folders = glob.glob(os.path.join(manager.output_prefix, '*'))
+    assert len(folders) == len(model_names)
+    shutil.rmtree(manager.output_prefix)
\ No newline at end of file

From 706e8410cd7f3277f52045b550b103d96c8daa04 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Eren=20G=C3=B6lge?=
Date: Mon, 15 Feb 2021 11:42:51 +0000
Subject: [PATCH 076/100] linter update

---
 tests/test_model_manager.py | 1 -
 1 file changed, 1 deletion(-)

diff --git a/tests/test_model_manager.py b/tests/test_model_manager.py
index ae0a62b8..7807716f 100644
--- a/tests/test_model_manager.py
+++ b/tests/test_model_manager.py
@@ -2,7 +2,6 @@
 import os
 import shutil
 import glob
-import unittest
 from tests import get_tests_output_path
 from TTS.utils.manage import ModelManager

From 3b6ce04332245e4c87bfec1896b26e403882f887 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Eren=20G=C3=B6lge?=
Date: Mon, 15 Feb 2021 13:02:29 +0100
Subject: [PATCH 077/100] Update TTS/bin/find_unique_chars.py
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Co-authored-by: Jörg Thalheim

---
 TTS/bin/find_unique_chars.py | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/TTS/bin/find_unique_chars.py b/TTS/bin/find_unique_chars.py
index 654a3ff9..f9b6827b 100644
--- a/TTS/bin/find_unique_chars.py
+++ b/TTS/bin/find_unique_chars.py
@@ -35,13 +35,13 @@ def main():
     preprocessor = get_preprocessor_by_name(args.dataset)
     items = preprocessor(os.path.dirname(args.meta_file), os.path.basename(args.meta_file))

-    texts = " ".join([item[0] for item in items])
+    texts = "".join(item[0] for item in items)
     chars = set(texts)
-    lower_chars = set(texts.lower())
+    lower_chars = filter(lambda c: c.islower(), chars)
     print(f" > Number of unique characters: {len(chars)}")
     print(f" > Unique characters: {''.join(sorted(chars))}")
     print(f" > Unique lower characters: {''.join(sorted(lower_chars))}")

 if __name__ == "__main__":
-    main()
\ No newline at end of file
+    main()

From 80af8ca5e1f7059b0a10be1ddeabf01daaae57a1 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Eren=20G=C3=B6lge?=
Date: Mon, 15 Feb 2021 13:03:59 +0100
Subject: [PATCH 078/100] Update TTS/utils/arguments.py
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Co-authored-by: Jörg Thalheim

---
 TTS/utils/arguments.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/TTS/utils/arguments.py b/TTS/utils/arguments.py
index 7d8f4adf..ebacab1b 100644
--- a/TTS/utils/arguments.py
+++ b/TTS/utils/arguments.py
@@ -182,7 +182,7 @@ def process_args(args, model_type):
         # if model characters are not set in the config file
         # save the default set to the config file for future
         # compatibility.
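One behavioural note on the find_unique_chars.py review fix above: filter() yields a lazy, single-use iterator rather than a second set. A tiny self-contained sketch:

    texts = "".join(item[0] for item in [["Hello"], ["World"]])
    chars = set(texts)                                   # {'H', 'e', 'l', 'o', 'W', 'r', 'd'}
    lower_chars = filter(lambda c: c.islower(), chars)   # lazy; can be consumed only once
    print("".join(sorted(lower_chars)))                  # -> 'delor'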
- if model_class == 'TTS' and not 'characters' in c.keys(): + if model_class == 'TTS' and not 'characters' in c: used_characters = parse_symbols() new_fields['characters'] = used_characters copy_model_files(c, args.config_path, From ff218e23700a7266cd987acb690fb36d0f7dd071 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Eren=20G=C3=B6lge?= Date: Mon, 15 Feb 2021 12:07:02 +0000 Subject: [PATCH 079/100] remove redundancy --- TTS/tts/utils/text/symbols.py | 1 - 1 file changed, 1 deletion(-) diff --git a/TTS/tts/utils/text/symbols.py b/TTS/tts/utils/text/symbols.py index 4bd1d6c2..83435917 100644 --- a/TTS/tts/utils/text/symbols.py +++ b/TTS/tts/utils/text/symbols.py @@ -25,7 +25,6 @@ _eos = '~' _bos = '^' _characters = 'ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz!\'(),-.:;? ' _punctuations = '!\'(),-.:;? ' -# _phoneme_punctuations = '.!;:,?' # Phonemes definition (All IPA characters) _vowels = 'iyɨʉɯuɪʏʊeøɘəɵɤoɛœɜɞʌɔæɐaɶɑɒᵻ' From 06a3ba2fe2b71118ba60b0fea73ea04bde77cf43 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Eren=20G=C3=B6lge?= Date: Mon, 15 Feb 2021 12:10:19 +0000 Subject: [PATCH 080/100] linter update --- TTS/utils/arguments.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/TTS/utils/arguments.py b/TTS/utils/arguments.py index ebacab1b..922caaa0 100644 --- a/TTS/utils/arguments.py +++ b/TTS/utils/arguments.py @@ -182,7 +182,7 @@ def process_args(args, model_type): # if model characters are not set in the config file # save the default set to the config file for future # compatibility. - if model_class == 'TTS' and not 'characters' in c: + if model_class == 'TTS' and 'characters' not in c: used_characters = parse_symbols() new_fields['characters'] = used_characters copy_model_files(c, args.config_path, From 40f44757233fe3c80e3ea73e1bfa033cc2d375bf Mon Sep 17 00:00:00 2001 From: Adonis Pujols Date: Thu, 11 Feb 2021 05:26:06 -0500 Subject: [PATCH 081/100] add encoding="utf-8" --- TTS/utils/io.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/TTS/utils/io.py b/TTS/utils/io.py index 1148e0fe..30b7b7e2 100644 --- a/TTS/utils/io.py +++ b/TTS/utils/io.py @@ -22,7 +22,7 @@ class AttrDict(dict): def read_json_with_comments(json_path): # fallback to json - with open(json_path, "r") as f: + with open(json_path, "r", encoding = "utf-8") as f: input_str = f.read() # handle comments input_str = re.sub(r'\\\n', '', input_str) @@ -40,7 +40,7 @@ def load_config(config_path: str) -> AttrDict: ext = os.path.splitext(config_path)[1] if ext in (".yml", ".yaml"): - with open(config_path, "r") as f: + with open(config_path, "r", encoding = "utf-8") as f: data = yaml.safe_load(f) else: data = read_json_with_comments(config_path) @@ -61,7 +61,7 @@ def copy_model_files(c, config_file, out_path, new_fields): """ # copy config.json copy_config_path = os.path.join(out_path, 'config.json') - config_lines = open(config_file, "r").readlines() + config_lines = open(config_file, "r", encoding = "utf-8").readlines() # add extra information fields for key, value in new_fields.items(): if isinstance(value, str): From 9cb02aeea78826d8085b04eb34c237ab4c006264 Mon Sep 17 00:00:00 2001 From: kirianguiller Date: Mon, 15 Feb 2021 16:04:47 +0100 Subject: [PATCH 082/100] Chinese mandarin implementation (tacotron2) --- TTS/.models.json | 10 + TTS/tts/datasets/preprocess.py | 16 + TTS/tts/utils/chinese_mandarin/__init__.py | 0 TTS/tts/utils/chinese_mandarin/numbers.py | 107 ++++ TTS/tts/utils/chinese_mandarin/phonemizer.py | 41 ++ .../chinese_mandarin/pinyinToPhonemes.py | 420 
++++++++++++++ TTS/tts/utils/synthesis.py | 1 + TTS/tts/utils/text/__init__.py | 18 +- TTS/tts/utils/text/cleaners.py | 9 + TTS/utils/synthesizer.py | 9 +- ...on2_TTS_and_MultiBand_MelGAN_Example.ipynb | 529 ++++++++++++++++++ 11 files changed, 1158 insertions(+), 2 deletions(-) create mode 100644 TTS/tts/utils/chinese_mandarin/__init__.py create mode 100644 TTS/tts/utils/chinese_mandarin/numbers.py create mode 100644 TTS/tts/utils/chinese_mandarin/phonemizer.py create mode 100644 TTS/tts/utils/chinese_mandarin/pinyinToPhonemes.py create mode 100644 notebooks/Chinese_Mandarin_DDC_GST_Tacotron2_TTS_and_MultiBand_MelGAN_Example.ipynb diff --git a/TTS/.models.json b/TTS/.models.json index 05997461..0fb187a4 100644 --- a/TTS/.models.json +++ b/TTS/.models.json @@ -75,6 +75,16 @@ "contact":"erengolge@gmail.com" } } + }, + "zh":{ + "baker":{ + "tacotron2-DDC-GST":{ + "model_file": "1RR9rZdV_FMm8yvtCHALtUbJf1nxbUiAw", + "config_file": "1daY1JHGXEozJ-MGYLiWEUmzEwEvM5xpz", + "stats_file": "1vl9c-D3dW_E7pdhNpDFQLX-giJc0jOtV", + "commit": "" + } + } } }, "vocoder_models":{ diff --git a/TTS/tts/datasets/preprocess.py b/TTS/tts/datasets/preprocess.py index 7815d87d..be479376 100644 --- a/TTS/tts/datasets/preprocess.py +++ b/TTS/tts/datasets/preprocess.py @@ -352,3 +352,19 @@ def _voxcel_x(root_path, meta_file, voxcel_idx): with open(str(cache_to), 'r') as f: return [x.strip().split('|') for x in f.readlines()] + + + + +# ======================================== Baker (chinese mandarin single speaker) =========================================== +def baker(root_path, meta_file): + """Normalizes the Baker meta data file to TTS format""" + txt_file = os.path.join(root_path, meta_file) + items = [] + speaker_name = "baker" + with open(txt_file, 'r') as ttf: + for line in ttf: + wav_name, text = line.rstrip('\n').split("|") + wav_path = os.path.join(root_path, "clips_22", wav_name) + items.append([text, wav_path, speaker_name]) + return items diff --git a/TTS/tts/utils/chinese_mandarin/__init__.py b/TTS/tts/utils/chinese_mandarin/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/TTS/tts/utils/chinese_mandarin/numbers.py b/TTS/tts/utils/chinese_mandarin/numbers.py new file mode 100644 index 00000000..8d2f40ff --- /dev/null +++ b/TTS/tts/utils/chinese_mandarin/numbers.py @@ -0,0 +1,107 @@ + +#!/usr/bin/env python3 +# -*- coding: utf-8 -*- + +# Licensed under WTFPL or the Unlicense or CC0. +# This uses Python 3, but it's easy to port to Python 2 by changing +# strings to u'xx'. + +import re +import itertools + + +def _num2chinese(num :str, big=False, simp=True, o=False, twoalt=False): + """ + Converts numbers to Chinese representations. + `big` : use financial characters. + `simp` : use simplified characters instead of traditional characters. + `o` : use 〇 for zero. + `twoalt`: use 两/兩 for two when appropriate. + Note that `o` and `twoalt` is ignored when `big` is used, + and `twoalt` is ignored when `o` is used for formal representations. 
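+
+    Example (illustrative):
+        _num2chinese('1694')  ->  '一千六百九十四'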
+ """ + # check num first + nd = str(num) + if abs(float(nd)) >= 1e48: + raise ValueError('number out of range') + elif 'e' in nd: + raise ValueError('scientific notation is not supported') + c_symbol = '正负点' if simp else '正負點' + if o: # formal + twoalt = False + if big: + c_basic = '零壹贰叁肆伍陆柒捌玖' if simp else '零壹貳參肆伍陸柒捌玖' + c_unit1 = '拾佰仟' + c_twoalt = '贰' if simp else '貳' + else: + c_basic = '〇一二三四五六七八九' if o else '零一二三四五六七八九' + c_unit1 = '十百千' + if twoalt: + c_twoalt = '两' if simp else '兩' + else: + c_twoalt = '二' + c_unit2 = '万亿兆京垓秭穰沟涧正载' if simp else '萬億兆京垓秭穰溝澗正載' + revuniq = lambda l: ''.join(k for k, g in itertools.groupby(reversed(l))) + nd = str(num) + result = [] + if nd[0] == '+': + result.append(c_symbol[0]) + elif nd[0] == '-': + result.append(c_symbol[1]) + if '.' in nd: + integer, remainder = nd.lstrip('+-').split('.') + else: + integer, remainder = nd.lstrip('+-'), None + if int(integer): + splitted = [integer[max(i - 4, 0):i] + for i in range(len(integer), 0, -4)] + intresult = [] + for nu, unit in enumerate(splitted): + # special cases + if int(unit) == 0: # 0000 + intresult.append(c_basic[0]) + continue + elif nu > 0 and int(unit) == 2: # 0002 + intresult.append(c_twoalt + c_unit2[nu - 1]) + continue + ulist = [] + unit = unit.zfill(4) + for nc, ch in enumerate(reversed(unit)): + if ch == '0': + if ulist: # ???0 + ulist.append(c_basic[0]) + elif nc == 0: + ulist.append(c_basic[int(ch)]) + elif nc == 1 and ch == '1' and unit[1] == '0': + # special case for tens + # edit the 'elif' if you don't like + # 十四, 三千零十四, 三千三百一十四 + ulist.append(c_unit1[0]) + elif nc > 1 and ch == '2': + ulist.append(c_twoalt + c_unit1[nc - 1]) + else: + ulist.append(c_basic[int(ch)] + c_unit1[nc - 1]) + ustr = revuniq(ulist) + if nu == 0: + intresult.append(ustr) + else: + intresult.append(ustr + c_unit2[nu - 1]) + result.append(revuniq(intresult).strip(c_basic[0])) + else: + result.append(c_basic[0]) + if remainder: + result.append(c_symbol[2]) + result.append(''.join(c_basic[int(ch)] for ch in remainder)) + return ''.join(result) + + + + +def _number_replace(match : re.Match): + match_str: str = match.group() + return _num2chinese(match_str) + + +def replace_numbers_to_characters_in_text(text : str): + text = re.sub(r'[0-9]+', _number_replace, text) + return text \ No newline at end of file diff --git a/TTS/tts/utils/chinese_mandarin/phonemizer.py b/TTS/tts/utils/chinese_mandarin/phonemizer.py new file mode 100644 index 00000000..7742c491 --- /dev/null +++ b/TTS/tts/utils/chinese_mandarin/phonemizer.py @@ -0,0 +1,41 @@ +from typing import List + +import pypinyin + +from .pinyinToPhonemes import PINYIN_DICT + + +import jieba + + +def _chinese_character_to_pinyin(text: str) -> List[str]: + pinyins = pypinyin.pinyin( + text, style=pypinyin.Style.TONE3, heteronym=False, neutral_tone_with_five=True + ) + pinyins_flat_list = [item for sublist in pinyins for item in sublist] + return pinyins_flat_list + + +def _chinese_pinyin_to_phoneme(pinyin: str) -> str: + segment = pinyin[:-1] + tone = pinyin[-1] + phoneme = PINYIN_DICT.get(segment, [""])[0] + return phoneme + tone + + +def chinese_text_to_phonemes(text: str) -> str: + tokenized_text = jieba.cut(text, HMM=False) + tokenized_text = " ".join(tokenized_text) + pinyined_text: List[str] = _chinese_character_to_pinyin(tokenized_text) + + results: List[str] = [] + + for token in pinyined_text: + if token[-1] in "12345": # TODO transform to is_pinyin() + pinyin_phonemes = _chinese_pinyin_to_phoneme(token) + + results += list(pinyin_phonemes) + else: # is 
ponctuation or other + results += list(token) + + return "|".join(results) diff --git a/TTS/tts/utils/chinese_mandarin/pinyinToPhonemes.py b/TTS/tts/utils/chinese_mandarin/pinyinToPhonemes.py new file mode 100644 index 00000000..cdca44ac --- /dev/null +++ b/TTS/tts/utils/chinese_mandarin/pinyinToPhonemes.py @@ -0,0 +1,420 @@ + +PINYIN_DICT = { + "a": ["a"], + "ai": ["ai"], + "an": ["an"], + "ang": ["ɑŋ"], + "ao": ["aʌ"], + "ba": ["ba"], + "bai": ["bai"], + "ban": ["ban"], + "bang": ["bɑŋ"], + "bao": ["baʌ"], + # "be": ["be"], doesnt exist + "bei": ["bɛi"], + "ben": ["bœn"], + "beng": ["bɵŋ"], + "bi": ["bi"], + "bian": ["biɛn"], + "biao": ["biaʌ"], + "bie": ["bie"], + "bin": ["bin"], + "bing": ["bɨŋ"], + "bo": ["bo"], + "bu": ["bu"], + "ca": ["tsa"], + "cai": ["tsai"], + "can": ["tsan"], + "cang": ["tsɑŋ"], + "cao": ["tsaʌ"], + "ce": ["tsø"], + "cen": ["tsœn"], + "ceng": ["tsɵŋ"], + "cha": ["ʈʂa"], + "chai": ["ʈʂai"], + "chan": ["ʈʂan"], + "chang": ["ʈʂɑŋ"], + "chao": ["ʈʂaʌ"], + "che": ["ʈʂø"], + "chen": ["ʈʂœn"], + "cheng": ["ʈʂɵŋ"], + "chi": ["ʈʂʏ"], + "chong": ["ʈʂoŋ"], + "chou": ["ʈʂou"], + "chu": ["ʈʂu"], + "chua": ["ʈʂua"], + "chuai": ["ʈʂuai"], + "chuan": ["ʈʂuan"], + "chuang": ["ʈʂuɑŋ"], + "chui": ["ʈʂuei"], + "chun": ["ʈʂun"], + "chuo": ["ʈʂuo"], + "ci": ["tsɪ"], + "cong": ["tsoŋ"], + "cou": ["tsou"], + "cu": ["tsu"], + "cuan": ["tsuan"], + "cui": ["tsuei"], + "cun": ["tsun"], + "cuo": ["tsuo"], + "da": ["da"], + "dai": ["dai"], + "dan": ["dan"], + "dang": ["dɑŋ"], + "dao": ["daʌ"], + "de": ["dø"], + "dei": ["dei"], + # "den": ["dœn"], + "deng": ["dɵŋ"], + "di": ["di"], + "dia": ["dia"], + "dian": ["diɛn"], + "diao": ["diaʌ"], + "die": ["die"], + "ding": ["dɨŋ"], + "diu": ["dio"], + "dong": ["doŋ"], + "dou": ["dou"], + "du": ["du"], + "duan": ["duan"], + "dui": ["duei"], + "dun": ["dun"], + "duo": ["duo"], + "e": ["ø"], + "ei": ["ei"], + "en": ["œn"], + # "ng": ["œn"], + # "eng": ["ɵŋ"], + "er": ["er"], + "fa": ["fa"], + "fan": ["fan"], + "fang": ["fɑŋ"], + "fei": ["fei"], + "fen": ["fœn"], + "feng": ["fɵŋ"], + "fo": ["fo"], + "fou": ["fou"], + "fu": ["fu"], + "ga": ["ga"], + "gai": ["gai"], + "gan": ["gan"], + "gang": ["gɑŋ"], + "gao": ["gaʌ"], + "ge": ["gø"], + "gei": ["gei"], + "gen": ["gœn"], + "geng": ["gɵŋ"], + "gong": ["goŋ"], + "gou": ["gou"], + "gu": ["gu"], + "gua": ["gua"], + "guai": ["guai"], + "guan": ["guan"], + "guang": ["guɑŋ"], + "gui": ["guei"], + "gun": ["gun"], + "guo": ["guo"], + "ha": ["xa"], + "hai": ["xai"], + "han": ["xan"], + "hang": ["xɑŋ"], + "hao": ["xaʌ"], + "he": ["xø"], + "hei": ["xei"], + "hen": ["xœn"], + "heng": ["xɵŋ"], + "hong": ["xoŋ"], + "hou": ["xou"], + "hu": ["xu"], + "hua": ["xua"], + "huai": ["xuai"], + "huan": ["xuan"], + "huang": ["xuɑŋ"], + "hui": ["xuei"], + "hun": ["xun"], + "huo": ["xuo"], + "ji": ["dʑi"], + "jia": ["dʑia"], + "jian": ["dʑiɛn"], + "jiang": ["dʑiɑŋ"], + "jiao": ["dʑiaʌ"], + "jie": ["dʑie"], + "jin": ["dʑin"], + "jing": ["dʑɨŋ"], + "jiong": ["dʑioŋ"], + "jiu": ["dʑio"], + "ju": ["dʑy"], + "juan": ["dʑyɛn"], + "jue": ["dʑye"], + "jun": ["dʑyn"], + "ka": ["ka"], + "kai": ["kai"], + "kan": ["kan"], + "kang": ["kɑŋ"], + "kao": ["kaʌ"], + "ke": ["kø"], + "kei": ["kei"], + "ken": ["kœn"], + "keng": ["kɵŋ"], + "kong": ["koŋ"], + "kou": ["kou"], + "ku": ["ku"], + "kua": ["kua"], + "kuai": ["kuai"], + "kuan": ["kuan"], + "kuang": ["kuɑŋ"], + "kui": ["kuei"], + "kun": ["kun"], + "kuo": ["kuo"], + "la": ["la"], + "lai": ["lai"], + "lan": ["lan"], + "lang": ["lɑŋ"], + "lao": ["laʌ"], + "le": ["lø"], + "lei": ["lei"], + "leng": 
["lɵŋ"], + "li": ["li"], + "lia": ["lia"], + "lian": ["liɛn"], + "liang": ["liɑŋ"], + "liao": ["liaʌ"], + "lie": ["lie"], + "lin": ["lin"], + "ling": ["lɨŋ"], + "liu": ["lio"], + "lo": ["lo"], + "long": ["loŋ"], + "lou": ["lou"], + "lu": ["lu"], + "lv": ["ly"], + "luan": ["luan"], + "lve": ["lye"], + "lue": ["lue"], + "lun": ["lun"], + "luo": ["luo"], + "ma": ["ma"], + "mai": ["mai"], + "man": ["man"], + "mang": ["mɑŋ"], + "mao": ["maʌ"], + "me": ["mø"], + "mei": ["mei"], + "men": ["mœn"], + "meng": ["mɵŋ"], + "mi": ["mi"], + "mian": ["miɛn"], + "miao": ["miaʌ"], + "mie": ["mie"], + "min": ["min"], + "ming": ["mɨŋ"], + "miu": ["mio"], + "mo": ["mo"], + "mou": ["mou"], + "mu": ["mu"], + "na": ["na"], + "nai": ["nai"], + "nan": ["nan"], + "nang": ["nɑŋ"], + "nao": ["naʌ"], + "ne": ["nø"], + "nei": ["nei"], + "nen": ["nœn"], + "neng": ["nɵŋ"], + "ni": ["ni"], + "nia": ["nia"], + "nian": ["niɛn"], + "niang": ["niɑŋ"], + "niao": ["niaʌ"], + "nie": ["nie"], + "nin": ["nin"], + "ning": ["nɨŋ"], + "niu": ["nio"], + "nong": ["noŋ"], + "nou": ["nou"], + "nu": ["nu"], + "nv": ["ny"], + "nuan": ["nuan"], + "nve": ["nye"], + "nue": ["nye"], + "nuo": ["nuo"], + "o": ["o"], + "ou": ["ou"], + "pa": ["pa"], + "pai": ["pai"], + "pan": ["pan"], + "pang": ["pɑŋ"], + "pao": ["paʌ"], + "pe": ["pø"], + "pei": ["pei"], + "pen": ["pœn"], + "peng": ["pɵŋ"], + "pi": ["pi"], + "pian": ["piɛn"], + "piao": ["piaʌ"], + "pie": ["pie"], + "pin": ["pin"], + "ping": ["pɨŋ"], + "po": ["po"], + "pou": ["pou"], + "pu": ["pu"], + "qi": ["tɕi"], + "qia": ["tɕia"], + "qian": ["tɕiɛn"], + "qiang": ["tɕiɑŋ"], + "qiao": ["tɕiaʌ"], + "qie": ["tɕie"], + "qin": ["tɕin"], + "qing": ["tɕɨŋ"], + "qiong": ["tɕioŋ"], + "qiu": ["tɕio"], + "qu": ["tɕy"], + "quan": ["tɕyɛn"], + "que": ["tɕye"], + "qun": ["tɕyn"], + "ran": ["ʐan"], + "rang": ["ʐɑŋ"], + "rao": ["ʐaʌ"], + "re": ["ʐø"], + "ren": ["ʐœn"], + "reng": ["ʐɵŋ"], + "ri": ["ʐʏ"], + "rong": ["ʐoŋ"], + "rou": ["ʐou"], + "ru": ["ʐu"], + "rua": ["ʐua"], + "ruan": ["ʐuan"], + "rui": ["ʐuei"], + "run": ["ʐun"], + "ruo": ["ʐuo"], + "sa": ["sa"], + "sai": ["sai"], + "san": ["san"], + "sang": ["sɑŋ"], + "sao": ["saʌ"], + "se": ["sø"], + "sen": ["sœn"], + "seng": ["sɵŋ"], + "sha": ["ʂa"], + "shai": ["ʂai"], + "shan": ["ʂan"], + "shang": ["ʂɑŋ"], + "shao": ["ʂaʌ"], + "she": ["ʂø"], + "shei": ["ʂei"], + "shen": ["ʂœn"], + "sheng": ["ʂɵŋ"], + "shi": ["ʂʏ"], + "shou": ["ʂou"], + "shu": ["ʂu"], + "shua": ["ʂua"], + "shuai": ["ʂuai"], + "shuan": ["ʂuan"], + "shuang": ["ʂuɑŋ"], + "shui": ["ʂuei"], + "shun": ["ʂun"], + "shuo": ["ʂuo"], + "si": ["sɪ"], + "song": ["soŋ"], + "sou": ["sou"], + "su": ["su"], + "suan": ["suan"], + "sui": ["suei"], + "sun": ["sun"], + "suo": ["suo"], + "ta": ["ta"], + "tai": ["tai"], + "tan": ["tan"], + "tang": ["tɑŋ"], + "tao": ["taʌ"], + "te": ["tø"], + "tei": ["tei"], + "teng": ["tɵŋ"], + "ti": ["ti"], + "tian": ["tiɛn"], + "tiao": ["tiaʌ"], + "tie": ["tie"], + "ting": ["tɨŋ"], + "tong": ["toŋ"], + "tou": ["tou"], + "tu": ["tu"], + "tuan": ["tuan"], + "tui": ["tuei"], + "tun": ["tun"], + "tuo": ["tuo"], + "wa": ["wa"], + "wai": ["wai"], + "wan": ["wan"], + "wang": ["wɑŋ"], + "wei": ["wei"], + "wen": ["wœn"], + "weng": ["wɵŋ"], + "wo": ["wo"], + "wu": ["wu"], + "xi": ["ɕi"], + "xia": ["ɕia"], + "xian": ["ɕiɛn"], + "xiang": ["ɕiɑŋ"], + "xiao": ["ɕiaʌ"], + "xie": ["ɕie"], + "xin": ["ɕin"], + "xing": ["ɕɨŋ"], + "xiong": ["ɕioŋ"], + "xiu": ["ɕio"], + "xu": ["ɕy"], + "xuan": ["ɕyɛn"], + "xue": ["ɕye"], + "xun": ["ɕyn"], + "ya": ["ia"], + "yan": ["iɛn"], + "yang": ["iɑŋ"], + "yao": 
["iaʌ"], + "ye": ["ie"], + "yi": ["i"], + "yin": ["in"], + "ying": ["ɨŋ"], + "yo": ["io"], + "yong": ["ioŋ"], + "you": ["io"], + "yu": ["y"], + "yuan": ["yɛn"], + "yue": ["ye"], + "yun": ["yn"], + "za": ["dza"], + "zai": ["dzai"], + "zan": ["dzan"], + "zang": ["dzɑŋ"], + "zao": ["dzaʌ"], + "ze": ["dzø"], + "zei": ["dzei"], + "zen": ["dzœn"], + "zeng": ["dzɵŋ"], + "zha": ["dʒa"], + "zhai": ["dʒai"], + "zhan": ["dʒan"], + "zhang": ["dʒɑŋ"], + "zhao": ["dʒaʌ"], + "zhe": ["dʒø"], + # "zhei": ["dʒei"], it doesn't exist + "zhen": ["dʒœn"], + "zheng": ["dʒɵŋ"], + "zhi": ["dʒʏ"], + "zhong": ["dʒoŋ"], + "zhou": ["dʒou"], + "zhu": ["dʒu"], + "zhua": ["dʒua"], + "zhuai": ["dʒuai"], + "zhuan": ["dʒuan"], + "zhuang": ["dʒuɑŋ"], + "zhui": ["dʒuei"], + "zhun": ["dʒun"], + "zhuo": ["dʒuo"], + "zi": ["dzɪ"], + "zong": ["dzoŋ"], + "zou": ["dzou"], + "zu": ["dzu"], + "zuan": ["dzuan"], + "zui": ["dzuei"], + "zun": ["dzun"], + "zuo": ["dzuo"], +} \ No newline at end of file diff --git a/TTS/tts/utils/synthesis.py b/TTS/tts/utils/synthesis.py index be587211..e7b1546e 100644 --- a/TTS/tts/utils/synthesis.py +++ b/TTS/tts/utils/synthesis.py @@ -219,6 +219,7 @@ def synthesis(model, ap (TTS.tts.utils.audio.AudioProcessor): audio processor to process model outputs. speaker_id (int): id of speaker + style_wav (str | Dict[str, float]): Uses for style embedding of GST. style_wav (str): Uses for style embedding of GST. truncated (bool): keep model states after inference. It can be used for continuous inference at long texts. diff --git a/TTS/tts/utils/text/__init__.py b/TTS/tts/utils/text/__init__.py index 9771e691..16172596 100644 --- a/TTS/tts/utils/text/__init__.py +++ b/TTS/tts/utils/text/__init__.py @@ -8,6 +8,7 @@ from phonemizer.phonemize import phonemize from TTS.tts.utils.text import cleaners from TTS.tts.utils.text.symbols import (_bos, _eos, _punctuations, make_symbols, phonemes, symbols) +from TTS.tts.utils.chinese_mandarin.phonemizer import chinese_text_to_phonemes # pylint: disable=unnecessary-comprehension @@ -29,8 +30,23 @@ PHONEME_PUNCTUATION_PATTERN = r'['+_punctuations.replace(' ', '')+']+' def text2phone(text, language): ''' - Convert graphemes to phonemes. + Convert graphemes to phonemes. For most of the languages, it calls + the phonemizer python library that calls espeak/espeak-ng. For chinese + mandarin, it calls pypinyin + custom function for phonemizing + Parameters: + text (str): text to phonemize + language (str): language of the text + Returns: + ph (str): phonemes as a string seperated by "|" + ph = "ɪ|g|ˈ|z|æ|m|p|ə|l" ''' + + # TO REVIEW : How to have a good implementation for this? 
+    if language == "chinese-mandarin":
+        ph = chinese_text_to_phonemes(text)
+        return ph
+
+
     seperator = phonemizer.separator.Separator(' |', '', '|')
     #try:
     punctuations = re.findall(PHONEME_PUNCTUATION_PATTERN, text)
diff --git a/TTS/tts/utils/text/cleaners.py b/TTS/tts/utils/text/cleaners.py
index 7c3f1017..49a25557 100644
--- a/TTS/tts/utils/text/cleaners.py
+++ b/TTS/tts/utils/text/cleaners.py
@@ -15,6 +15,8 @@ from unidecode import unidecode
 from .number_norm import normalize_numbers
 from .abbreviations import abbreviations_en, abbreviations_fr
 from .time import expand_time_english
+from TTS.tts.utils.chinese_mandarin.numbers import replace_numbers_to_characters_in_text
+

 # Regular expression matching whitespace:
 _whitespace_re = re.compile(r'\s+')
@@ -122,6 +124,13 @@ def portuguese_cleaners(text):
     text = collapse_whitespace(text)
     return text

+def chinese_mandarin_cleaners(text: str) -> str:
+    '''Basic pipeline for Chinese Mandarin: expand numbers into characters.'''
+    text = replace_numbers_to_characters_in_text(text)
+    return text
+
+
+
 def phoneme_cleaners(text):
     '''Pipeline for phonemes mode, including number and abbreviation expansion.'''
     text = expand_numbers(text)
diff --git a/TTS/utils/synthesizer.py b/TTS/utils/synthesizer.py
index 2a779e53..4b4bc04c 100644
--- a/TTS/utils/synthesizer.py
+++ b/TTS/utils/synthesizer.py
@@ -122,6 +122,13 @@ class Synthesizer(object):
             speaker_embedding = self.init_speaker(speaker_idx)
         use_gl = self.vocoder_model is None

+
+        # compute the GST style input, if GST is enabled in the config
+        gst_style_input = None
+        if self.tts_config.use_gst:
+            if self.tts_config.gst["gst_style_input"] not in ["", {}]:
+                gst_style_input = self.tts_config.gst["gst_style_input"]
+
         for sen in sens:
             # synthesize voice
             waveform, _, _, mel_postnet_spec, _, _ = synthesis(
@@ -131,7 +138,7 @@ class Synthesizer(object):
                 self.use_cuda,
                 self.ap,
                 speaker_idx,
-                None,
+                gst_style_input,
                 False,
                 self.tts_config.enable_eos_bos_chars,
                 use_gl,
diff --git a/notebooks/Chinese_Mandarin_DDC_GST_Tacotron2_TTS_and_MultiBand_MelGAN_Example.ipynb b/notebooks/Chinese_Mandarin_DDC_GST_Tacotron2_TTS_and_MultiBand_MelGAN_Example.ipynb
new file mode 100644
index 00000000..709dbb8d
--- /dev/null
+++ b/notebooks/Chinese_Mandarin_DDC_GST_Tacotron2_TTS_and_MultiBand_MelGAN_Example.ipynb
@@ -0,0 +1,529 @@
+{
+ "cells": [
+  {
+   "cell_type": "markdown",
+   "metadata": {
+    "colab_type": "text",
+    "id": "6LWsNd3_M3MP"
+   },
+   "source": [
+    "# Mozilla TTS on CPU Real-Time Speech Synthesis "
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {
+    "colab_type": "text",
+    "id": "FAqrSIWgLyP0"
+   },
+   "source": [
+    "We use Tacotron2 and MultiBand-MelGAN models; the Tacotron2 model is trained on the Chinese Mandarin Baker dataset.\n",
+    "\n",
+    "Tacotron2 is trained using [Double Decoder Consistency](https://erogol.com/solving-attention-problems-of-tts-models-with-double-decoder-consistency/) (DDC) only for 130K steps (3 days) with a single GPU.\n",
+    "\n",
+    "MultiBand-Melgan is trained 1.45M steps with real spectrograms.\n",
+    "\n",
+    "Note that both model performances can be improved with more training."
+ ] + }, + { + "cell_type": "markdown", + "metadata": { + "colab_type": "text", + "id": "Ku-dA4DKoeXk" + }, + "source": [ + "### Download Models" + ] + }, + { + "cell_type": "code", + "execution_count": 60, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 162 + }, + "colab_type": "code", + "id": "jGIgnWhGsxU1", + "outputId": "88725e41-a8dc-4885-b3bf-cac939f38abe", + "tags": [] + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "mkdir: cannot create directory 'data/': File exists\n", + "Downloading...\n", + "From: https://drive.google.com/uc?id=1vl9c-D3dW_E7pdhNpDFQLX-giJc0jOtV\n", + "To: /root/projects/speech/mozilla-TTS_dev/notebooks/data/tts_scale_stats.npy\n", + "100%|██████████████████████████████████████| 10.5k/10.5k [00:00<00:00, 18.1MB/s]\n" + ] + } + ], + "source": [ + "! mkdir data/\n", + "! gdown --id 1RR9rZdV_FMm8yvtCHALtUbJf1nxbUiAw -O data/tts_model.pth.tar\n", + "! gdown --id 1daY1JHGXEozJ-MGYLiWEUmzEwEvM5xpz -O data/tts_config.json\n", + "! gdown --id 1vl9c-D3dW_E7pdhNpDFQLX-giJc0jOtV -O data/tts_scale_stats.npy" + ] + }, + { + "cell_type": "code", + "execution_count": 61, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 235 + }, + "colab_type": "code", + "id": "4dnpE0-kvTsu", + "outputId": "76377c6d-789c-4995-ba00-a21a6e1c401e", + "tags": [] + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Downloading...\n", + "From: https://drive.google.com/uc?id=11oY3Tv0kQtxK_JPgxrfesa99maVXHNxU\n", + "To: /root/projects/speech/mozilla-TTS_dev/notebooks/data/vocoder_scale_stats.npy\n", + "100%|██████████████████████████████████████| 10.5k/10.5k [00:00<00:00, 16.7MB/s]\n" + ] + } + ], + "source": [ + "! gdown --id 1Ty5DZdOc0F7OTGj9oJThYbL5iVu_2G0K -O data/vocoder_model.pth.tar\n", + "! gdown --id 1Rd0R_nRCrbjEdpOwq6XwZAktvugiBvmu -O data/vocoder_config.json\n", + "! 
gdown --id 11oY3Tv0kQtxK_JPgxrfesa99maVXHNxU -O data/vocoder_scale_stats.npy" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "colab_type": "text", + "id": "Zlgi8fPdpRF0" + }, + "source": [ + "### Define TTS function" + ] + }, + { + "cell_type": "code", + "execution_count": 62, + "metadata": { + "colab": {}, + "colab_type": "code", + "id": "f-Yc42nQZG5A" + }, + "outputs": [], + "source": [ + "def tts(model, text, CONFIG, use_cuda, ap, use_gl, figures=True, style_wav=None):\n", + " t_1 = time.time()\n", + " waveform, alignment, mel_spec, mel_postnet_spec, stop_tokens, inputs = synthesis(model, text, CONFIG, use_cuda, ap, speaker_id, style_wav=style_wav,\n", + " truncated=False, enable_eos_bos_chars=CONFIG.enable_eos_bos_chars)\n", + " # mel_postnet_spec = ap.denormalize(mel_postnet_spec.T)\n", + " if not use_gl:\n", + " waveform = vocoder_model.inference(torch.FloatTensor(mel_postnet_spec.T).unsqueeze(0))\n", + " waveform = waveform.flatten()\n", + " if use_cuda:\n", + " waveform = waveform.cpu()\n", + " waveform = waveform.numpy()\n", + " rtf = (time.time() - t_1) / (len(waveform) / ap.sample_rate)\n", + " tps = (time.time() - t_1) / len(waveform)\n", + " print(waveform.shape)\n", + " print(\" > Run-time: {}\".format(time.time() - t_1))\n", + " print(\" > Real-time factor: {}\".format(rtf))\n", + " print(\" > Time per step: {}\".format(tps))\n", + " IPython.display.display(IPython.display.Audio(waveform, rate=CONFIG.audio['sample_rate'])) \n", + " return alignment, mel_postnet_spec, stop_tokens, waveform" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "colab_type": "text", + "id": "ZksegYQepkFg" + }, + "source": [ + "### Load Models" + ] + }, + { + "cell_type": "code", + "execution_count": 64, + "metadata": { + "colab": {}, + "colab_type": "code", + "id": "oVa0kOamprgj" + }, + "outputs": [], + "source": [ + "import os\n", + "import torch\n", + "import time\n", + "import IPython\n", + "\n", + "from TTS.tts.utils.generic_utils import setup_model\n", + "from TTS.utils.io import load_config\n", + "from TTS.tts.utils.text.symbols import symbols, phonemes, make_symbols\n", + "from TTS.utils.audio import AudioProcessor\n", + "from TTS.tts.utils.synthesis import synthesis" + ] + }, + { + "cell_type": "code", + "execution_count": 33, + "metadata": { + "colab": {}, + "colab_type": "code", + "id": "EY-sHVO8IFSH" + }, + "outputs": [], + "source": [ + "# runtime settings\n", + "use_cuda = False" + ] + }, + { + "cell_type": "code", + "execution_count": 65, + "metadata": { + "colab": {}, + "colab_type": "code", + "id": "_1aIUp2FpxOQ" + }, + "outputs": [], + "source": [ + "# model paths\n", + "TTS_MODEL = \"/tank/models/tts/mozilla-TTS/tacotron2-DCC/chinese_mandarin/mandarin_dca_attn_gst_dcc-February-12-2021_03+13PM-5dbb48d/checkpoint_17000.pth.tar\"\n", + "TTS_CONFIG = \"/tank/models/tts/mozilla-TTS/tacotron2-DCC/chinese_mandarin/mandarin_dca_attn_gst_dcc-February-12-2021_03+13PM-5dbb48d/config.json\"\n", + "\n", + "TTS_MODEL = \"data/tts_model.pth.tar\"\n", + "TTS_CONFIG = \"data/tts_config.json\"\n", + "\n", + "VOCODER_MODEL = \"/root/.local/share/tts/vocoder_models--en--ljspeech--mulitband-melgan/model_file.pth.tar\"\n", + "VOCODER_CONFIG = \"/root/.local/share/tts/vocoder_models--en--ljspeech--mulitband-melgan/config.json\"\n", + "\n", + "VOCODER_MODEL = \"data/vocoder_model.pth.tar\"\n", + "VOCODER_CONFIG = \"data/vocoder_config.json\"" + ] + }, + { + "cell_type": "code", + "execution_count": 66, + "metadata": { + "colab": {}, + "colab_type": "code", + "id": "CpgmdBVQplbv" + }, 
+ "outputs": [], + "source": [ + "# load configs\n", + "TTS_CONFIG = load_config(TTS_CONFIG)\n", + "VOCODER_CONFIG = load_config(VOCODER_CONFIG)\n" + ] + }, + { + "cell_type": "code", + "execution_count": 67, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 471 + }, + "colab_type": "code", + "id": "zmrQxiozIUVE", + "outputId": "60c4daa0-4c5b-4a2e-fe0d-be437d003a49", + "tags": [] + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + " > Setting up Audio Processor...\n", + " | > sample_rate:22050\n", + " | > resample:False\n", + " | > num_mels:80\n", + " | > min_level_db:-100\n", + " | > frame_shift_ms:None\n", + " | > frame_length_ms:None\n", + " | > ref_level_db:0\n", + " | > fft_size:1024\n", + " | > power:1.5\n", + " | > preemphasis:0.0\n", + " | > griffin_lim_iters:60\n", + " | > signal_norm:True\n", + " | > symmetric_norm:True\n", + " | > mel_fmin:50.0\n", + " | > mel_fmax:7600.0\n", + " | > spec_gain:1.0\n", + " | > stft_pad_mode:reflect\n", + " | > max_norm:4.0\n", + " | > clip_norm:True\n", + " | > do_trim_silence:True\n", + " | > trim_db:60\n", + " | > do_sound_norm:False\n", + " | > stats_path:data/tts_scale_stats.npy\n", + " | > hop_length:256\n", + " | > win_length:1024\n" + ] + } + ], + "source": [ + "# load the audio processor\n", + "TTS_CONFIG.audio['stats_path'] = 'data/tts_scale_stats.npy'\n", + "ap = AudioProcessor(**TTS_CONFIG.audio) " + ] + }, + { + "cell_type": "code", + "execution_count": 68, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 35 + }, + "colab_type": "code", + "id": "8fLoI4ipqMeS", + "outputId": "b789066e-e305-42ad-b3ca-eba8d9267382", + "tags": [] + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + " > Using model: tacotron2\n" + ] + } + ], + "source": [ + "# LOAD TTS MODEL\n", + "# multi speaker \n", + "speaker_id = None\n", + "speakers = []\n", + "\n", + "# load the model (chinese_mandarin special characters/punctuations are in the tts_config.json)\n", + "if TTS_CONFIG.get(\"characters\"):\n", + " _characters = TTS_CONFIG[\"characters\"][\"characters\"]\n", + " _phonemes = TTS_CONFIG[\"characters\"][\"phonemes\"]\n", + " _punctuations = TTS_CONFIG[\"characters\"][\"punctuations\"]\n", + " _pad = TTS_CONFIG[\"characters\"][\"pad\"]\n", + " _eos = TTS_CONFIG[\"characters\"][\"eos\"]\n", + " _bos = TTS_CONFIG[\"characters\"][\"bos\"]\n", + " \n", + " symbols, phonemes = make_symbols(_characters, _phonemes, punctuations= _punctuations, pad=_pad, eos=_eos, bos=_bos )\n", + "\n", + "num_chars = len(phonemes) if TTS_CONFIG.use_phonemes else len(symbols)\n", + "model = setup_model(num_chars, len(speakers), TTS_CONFIG)\n", + "\n", + "# load model state\n", + "cp = torch.load(TTS_MODEL, map_location=torch.device('cpu'))\n", + "\n", + "# load the model\n", + "model.load_state_dict(cp['model'])\n", + "if use_cuda:\n", + " model.cuda()\n", + "model.eval()\n", + "\n", + "# set model stepsize\n", + "if 'r' in cp:\n", + " model.decoder.set_r(cp['r'])" + ] + }, + { + "cell_type": "code", + "execution_count": 69, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 1000 + }, + "colab_type": "code", + "id": "zKoq0GgzqzhQ", + "outputId": "234efc61-f37a-40bc-95a3-b51896018ccb", + "tags": [] + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + " > Generator Model: multiband_melgan_generator\n", + " > Setting up Audio Processor...\n", + " | > sample_rate:22050\n", + " | > 
resample:False\n",
+    "     | > num_mels:80\n",
+    "     | > min_level_db:-100\n",
+    "     | > frame_shift_ms:None\n",
+    "     | > frame_length_ms:None\n",
+    "     | > ref_level_db:0\n",
+    "     | > fft_size:1024\n",
+    "     | > power:None\n",
+    "     | > preemphasis:0.0\n",
+    "     | > griffin_lim_iters:None\n",
+    "     | > signal_norm:True\n",
+    "     | > symmetric_norm:True\n",
+    "     | > mel_fmin:50.0\n",
+    "     | > mel_fmax:7600.0\n",
+    "     | > spec_gain:1.0\n",
+    "     | > stft_pad_mode:reflect\n",
+    "     | > max_norm:4.0\n",
+    "     | > clip_norm:True\n",
+    "     | > do_trim_silence:True\n",
+    "     | > trim_db:60\n",
+    "     | > do_sound_norm:False\n",
+    "     | > stats_path:data/vocoder_scale_stats.npy\n",
+    "     | > hop_length:256\n",
+    "     | > win_length:1024\n",
+    "\n",
+    "Vocoder loaded\n"
+   ]
+  }
+ ],
+ "source": [
+  "from TTS.vocoder.utils.generic_utils import setup_generator\n",
+  "\n",
+  "# LOAD VOCODER MODEL\n",
+  "vocoder_model = setup_generator(VOCODER_CONFIG)\n",
+  "vocoder_model.load_state_dict(torch.load(VOCODER_MODEL, map_location=\"cpu\")[\"model\"])\n",
+  "vocoder_model.remove_weight_norm()\n",
+  "vocoder_model.inference_padding = 0\n",
+  "\n",
+  "\n",
+  "VOCODER_CONFIG.audio['stats_path'] = 'data/vocoder_scale_stats.npy'\n",
+  "ap_vocoder = AudioProcessor(**VOCODER_CONFIG['audio'])\n",
+  "if use_cuda:\n",
+  "    vocoder_model.cuda()\n",
+  "vocoder_model.eval()\n",
+  "print(\"\\nVocoder loaded\")"
+ ]
+},
+{
+ "cell_type": "markdown",
+ "metadata": {
+  "colab_type": "text",
+  "id": "Ws_YkPKsLgo-"
+ },
+ "source": [
+  "## Run Inference"
+ ]
+},
+{
+ "cell_type": "code",
+ "execution_count": 70,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+  "# Here are some test sentences for you to play with:\n",
+  "sentence = \"我从来不会说很标准的中文。\"\n",
+  "sentence = \"我喜欢听人工智能的博客。\"\n",
+  "sentence = \"我来自一个法国郊区的地方。\"\n",
+  "sentence = \"不比不知道,一比吓一跳!\"\n",
+  "sentence = \"台湾是一个真的很好玩的地方!\"\n",
+  "sentence = \"干一行,行一行,行行都行。\"\n",
+  "sentence = \"我要盖被子,好尴尬!\""
+ ]
+},
+{
+ "cell_type": "code",
+ "execution_count": 71,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+  "# You can also play with the style_wav global style token. However, the lady speaking in the baker dataset\n",
+  "# has no emotional variation through the sentences, so it is hard to get nice GST effects with it.\n",
+  "# That's also why adding \"!\" or \"?\" at the end of a sentence changes nothing. 
The dataset has no such prosody.\n", + "style_wav = {\"2\": 0.3, \"1\": -0.1}\n" + ] + }, + { + "cell_type": "code", + "execution_count": 78, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 134 + }, + "colab_type": "code", + "id": "FuWxZ9Ey5Puj", + "outputId": "9c06adad-5451-4393-89a1-a2e7dc39ab91", + "tags": [] + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "(50688,)\n", + " > Run-time: 1.5945854187011719\n", + " > Real-time factor: 0.6935317513786934\n", + " > Time per step: 3.145291761617468e-05\n" + ] + }, + { + "data": { + "text/html": [ + "\n", + " \n", + " " + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "sentence = \"我喜欢听人工智能的博客。\"\n", + "style_wav = {\"2\": 0.2, \"7\": -0.1}\n", + "\n", + "align, spec, stop_tokens, wav = tts(model, sentence, TTS_CONFIG, use_cuda, ap, use_gl=False, figures=True, style_wav= style_wav)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "accelerator": "GPU", + "colab": { + "collapsed_sections": [], + "name": "DDC-TTS_and_MultiBand-MelGAN_Example.ipynb", + "provenance": [], + "toc_visible": true + }, + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.7.9" + } + }, + "nbformat": 4, + "nbformat_minor": 4 +} From 8a106e0527374f9ad5a42dbc198a9fd7039e773f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Eren=20G=C3=B6lge?= Date: Mon, 15 Feb 2021 17:06:03 +0000 Subject: [PATCH 083/100] fix #655 --- TTS/utils/arguments.py | 1 + 1 file changed, 1 insertion(+) diff --git a/TTS/utils/arguments.py b/TTS/utils/arguments.py index 922caaa0..c7c0f9db 100644 --- a/TTS/utils/arguments.py +++ b/TTS/utils/arguments.py @@ -172,6 +172,7 @@ def process_args(args, model_type): audio_path = os.path.join(out_path, "test_audios") c_logger = ConsoleLogger() + tb_logger = None if args.rank == 0: os.makedirs(audio_path, exist_ok=True) From 61c88beb943db7c9a79176c17aef59757a67dbee Mon Sep 17 00:00:00 2001 From: gerazov Date: Mon, 15 Feb 2021 18:40:17 +0100 Subject: [PATCH 084/100] refactored keep_all_best --- TTS/bin/train_glow_tts.py | 6 +++--- TTS/bin/train_speedy_speech.py | 6 +++--- TTS/bin/train_tacotron.py | 6 +++--- TTS/bin/train_vocoder_gan.py | 6 +++--- TTS/bin/train_vocoder_wavegrad.py | 6 +++--- TTS/bin/train_vocoder_wavernn.py | 6 +++--- TTS/tts/configs/config.json | 4 ++-- TTS/tts/configs/glow_tts_gated_conv.json | 4 ++-- TTS/tts/configs/glow_tts_ljspeech.json | 4 ++-- TTS/tts/configs/ljspeech_tacotron2_dynamic_conv_attn.json | 4 ++-- TTS/tts/configs/speedy_speech_ljspeech.json | 4 ++-- TTS/utils/arguments.py | 4 ++-- TTS/vocoder/configs/multiband-melgan_and_rwd_config.json | 4 ++-- TTS/vocoder/configs/multiband_melgan_config.json | 4 ++-- TTS/vocoder/configs/multiband_melgan_config_mozilla.json | 4 ++-- TTS/vocoder/configs/parallel_wavegan_config.json | 4 ++-- TTS/vocoder/configs/universal_fullband_melgan.json | 4 ++-- TTS/vocoder/configs/wavegrad_libritts.json | 4 ++-- TTS/vocoder/configs/wavernn_config.json | 4 ++-- TTS/vocoder/utils/io.py | 4 ++-- tests/inputs/test_glow_tts.json | 4 ++-- tests/inputs/test_speedy_speech.json | 4 ++-- 
tests/inputs/test_train_config.json | 4 ++-- tests/inputs/test_vocoder_multiband_melgan_config.json | 4 ++-- tests/inputs/test_vocoder_wavegrad.json | 4 ++-- tests/inputs/test_vocoder_wavernn_config.json | 4 ++-- 26 files changed, 58 insertions(+), 58 deletions(-) diff --git a/TTS/bin/train_glow_tts.py b/TTS/bin/train_glow_tts.py index 072ad41b..68471559 100644 --- a/TTS/bin/train_glow_tts.py +++ b/TTS/bin/train_glow_tts.py @@ -548,8 +548,8 @@ def main(args): # pylint: disable=redefined-outer-name best_loss = torch.load(args.best_path, map_location='cpu')['model_loss'] print(f" > Starting with loaded last best loss {best_loss}.") - keep_best = c.get('keep_best', False) - keep_after = c.get('keep_after', 10000) # void if keep_best False + keep_all_best = c.get('keep_all_best', False) + keep_after = c.get('keep_after', 10000) # void if keep_all_best False # define dataloaders train_loader = setup_loader(ap, 1, is_val=False, verbose=True) @@ -571,7 +571,7 @@ def main(args): # pylint: disable=redefined-outer-name target_loss = eval_avg_loss_dict['avg_loss'] best_loss = save_best_model(target_loss, best_loss, model, optimizer, global_step, epoch, c.r, OUT_PATH, - keep_best=keep_best, keep_after=keep_after) + keep_all_best=keep_all_best, keep_after=keep_after) if __name__ == '__main__': diff --git a/TTS/bin/train_speedy_speech.py b/TTS/bin/train_speedy_speech.py index 8e9dbc2e..ebe78b21 100644 --- a/TTS/bin/train_speedy_speech.py +++ b/TTS/bin/train_speedy_speech.py @@ -512,8 +512,8 @@ def main(args): # pylint: disable=redefined-outer-name best_loss = torch.load(args.best_path, map_location='cpu')['model_loss'] print(f" > Starting with loaded last best loss {best_loss}.") - keep_best = c.get('keep_best', False) - keep_after = c.get('keep_after', 10000) # void if keep_best False + keep_all_best = c.get('keep_all_best', False) + keep_after = c.get('keep_after', 10000) # void if keep_all_best False # define dataloaders train_loader = setup_loader(ap, 1, is_val=False, verbose=True) @@ -533,7 +533,7 @@ def main(args): # pylint: disable=redefined-outer-name target_loss = eval_avg_loss_dict['avg_loss'] best_loss = save_best_model(target_loss, best_loss, model, optimizer, global_step, epoch, c.r, OUT_PATH, - keep_best=keep_best, keep_after=keep_after) + keep_all_best=keep_all_best, keep_after=keep_after) if __name__ == '__main__': diff --git a/TTS/bin/train_tacotron.py b/TTS/bin/train_tacotron.py index 2382c0be..615901e1 100644 --- a/TTS/bin/train_tacotron.py +++ b/TTS/bin/train_tacotron.py @@ -591,8 +591,8 @@ def main(args): # pylint: disable=redefined-outer-name best_loss = torch.load(args.best_path, map_location='cpu')['model_loss'] print(f" > Starting with loaded last best loss {best_loss}.") - keep_best = c.get('keep_best', False) - keep_after = c.get('keep_after', 10000) # void if keep_best False + keep_all_best = c.get('keep_all_best', False) + keep_after = c.get('keep_after', 10000) # void if keep_all_best False # define data loaders train_loader = setup_loader(ap, @@ -644,7 +644,7 @@ def main(args): # pylint: disable=redefined-outer-name epoch, c.r, OUT_PATH, - keep_best=keep_best, + keep_all_best=keep_all_best, keep_after=keep_after, scaler=scaler.state_dict() if c.mixed_precision else None ) diff --git a/TTS/bin/train_vocoder_gan.py b/TTS/bin/train_vocoder_gan.py index 9043a560..708bf350 100644 --- a/TTS/bin/train_vocoder_gan.py +++ b/TTS/bin/train_vocoder_gan.py @@ -555,8 +555,8 @@ def main(args): # pylint: disable=redefined-outer-name best_loss = torch.load(args.best_path, 
map_location='cpu')['model_loss'] print(f" > Starting with best loss of {best_loss}.") - keep_best = c.get('keep_best', False) - keep_after = c.get('keep_after', 10000) # void if keep_best False + keep_all_best = c.get('keep_all_best', False) + keep_after = c.get('keep_after', 10000) # void if keep_all_best False global_step = args.restore_step for epoch in range(0, c.epochs): @@ -581,7 +581,7 @@ def main(args): # pylint: disable=redefined-outer-name global_step, epoch, OUT_PATH, - keep_best=keep_best, + keep_all_best=keep_all_best, keep_after=keep_after, model_losses=eval_avg_loss_dict, ) diff --git a/TTS/bin/train_vocoder_wavegrad.py b/TTS/bin/train_vocoder_wavegrad.py index 271e8d4c..51a31509 100644 --- a/TTS/bin/train_vocoder_wavegrad.py +++ b/TTS/bin/train_vocoder_wavegrad.py @@ -403,8 +403,8 @@ def main(args): # pylint: disable=redefined-outer-name best_loss = torch.load(args.best_path, map_location='cpu')['model_loss'] print(f" > Starting with loaded last best loss {best_loss}.") - keep_best = c.get('keep_best', False) - keep_after = c.get('keep_after', 10000) # void if keep_best False + keep_all_best = c.get('keep_all_best', False) + keep_after = c.get('keep_after', 10000) # void if keep_all_best False global_step = args.restore_step for epoch in range(0, c.epochs): @@ -426,7 +426,7 @@ def main(args): # pylint: disable=redefined-outer-name global_step, epoch, OUT_PATH, - keep_best=keep_best, + keep_all_best=keep_all_best, keep_after=keep_after, model_losses=eval_avg_loss_dict, scaler=scaler.state_dict() if c.mixed_precision else None diff --git a/TTS/bin/train_vocoder_wavernn.py b/TTS/bin/train_vocoder_wavernn.py index 5fde5025..8e9c6a8b 100644 --- a/TTS/bin/train_vocoder_wavernn.py +++ b/TTS/bin/train_vocoder_wavernn.py @@ -426,8 +426,8 @@ def main(args): # pylint: disable=redefined-outer-name best_loss = torch.load(args.best_path, map_location='cpu')['model_loss'] print(f" > Starting with loaded last best loss {best_loss}.") - keep_best = c.get('keep_best', False) - keep_after = c.get('keep_after', 10000) # void if keep_best False + keep_all_best = c.get('keep_all_best', False) + keep_after = c.get('keep_after', 10000) # void if keep_all_best False global_step = args.restore_step for epoch in range(0, c.epochs): @@ -450,7 +450,7 @@ def main(args): # pylint: disable=redefined-outer-name global_step, epoch, OUT_PATH, - keep_best=keep_best, + keep_all_best=keep_all_best, keep_after=keep_after, model_losses=eval_avg_loss_dict, scaler=scaler.state_dict() if c.mixed_precision else None diff --git a/TTS/tts/configs/config.json b/TTS/tts/configs/config.json index 5bd249d9..ba33acc5 100644 --- a/TTS/tts/configs/config.json +++ b/TTS/tts/configs/config.json @@ -121,8 +121,8 @@ "print_eval": false, // If True, it prints intermediate loss values in evalulation. "save_step": 10000, // Number of training steps expected to save traninpg stats and checkpoints. "checkpoint": true, // If true, it saves checkpoints per "save_step" - "keep_best": false, // If true, keeps all best_models after keep_after steps - "keep_after": 10000, // Global step after which to keep best models if keep_best is true + "keep_all_best": false, // If true, keeps all best_models after keep_after steps + "keep_after": 10000, // Global step after which to keep best models if keep_all_best is true "tb_model_param_stats": false, // true, plots param stats per layer on tensorboard. Might be memory consuming, but good for debugging. 
// DATA LOADING diff --git a/TTS/tts/configs/glow_tts_gated_conv.json b/TTS/tts/configs/glow_tts_gated_conv.json index 865c6f29..c4d7b1e5 100644 --- a/TTS/tts/configs/glow_tts_gated_conv.json +++ b/TTS/tts/configs/glow_tts_gated_conv.json @@ -93,8 +93,8 @@ "print_eval": false, // If True, it prints intermediate loss values in evalulation. "save_step": 5000, // Number of training steps expected to save traninpg stats and checkpoints. "checkpoint": true, // If true, it saves checkpoints per "save_step" - "keep_best": false, // If true, keeps all best_models after keep_after steps - "keep_after": 10000, // Global step after which to keep best models if keep_best is true + "keep_all_best": false, // If true, keeps all best_models after keep_after steps + "keep_after": 10000, // Global step after which to keep best models if keep_all_best is true "tb_model_param_stats": false, // true, plots param stats per layer on tensorboard. Might be memory consuming, but good for debugging. "apex_amp_level": null, diff --git a/TTS/tts/configs/glow_tts_ljspeech.json b/TTS/tts/configs/glow_tts_ljspeech.json index 6e15de10..5a4c47c2 100644 --- a/TTS/tts/configs/glow_tts_ljspeech.json +++ b/TTS/tts/configs/glow_tts_ljspeech.json @@ -105,8 +105,8 @@ "print_eval": false, // If True, it prints intermediate loss values in evalulation. "save_step": 5000, // Number of training steps expected to save traninpg stats and checkpoints. "checkpoint": true, // If true, it saves checkpoints per "save_step" - "keep_best": false, // If true, keeps all best_models after keep_after steps - "keep_after": 10000, // Global step after which to keep best models if keep_best is true + "keep_all_best": false, // If true, keeps all best_models after keep_after steps + "keep_after": 10000, // Global step after which to keep best models if keep_all_best is true "tb_model_param_stats": false, // true, plots param stats per layer on tensorboard. Might be memory consuming, but good for debugging. // DATA LOADING diff --git a/TTS/tts/configs/ljspeech_tacotron2_dynamic_conv_attn.json b/TTS/tts/configs/ljspeech_tacotron2_dynamic_conv_attn.json index 3cf66870..11e42259 100644 --- a/TTS/tts/configs/ljspeech_tacotron2_dynamic_conv_attn.json +++ b/TTS/tts/configs/ljspeech_tacotron2_dynamic_conv_attn.json @@ -121,8 +121,8 @@ "print_eval": false, // If True, it prints intermediate loss values in evalulation. "save_step": 10000, // Number of training steps expected to save traninpg stats and checkpoints. "checkpoint": true, // If true, it saves checkpoints per "save_step" - "keep_best": false, // If true, keeps all best_models after keep_after steps - "keep_after": 10000, // Global step after which to keep best models if keep_best is true + "keep_all_best": false, // If true, keeps all best_models after keep_after steps + "keep_after": 10000, // Global step after which to keep best models if keep_all_best is true "tb_model_param_stats": false, // true, plots param stats per layer on tensorboard. Might be memory consuming, but good for debugging. // DATA LOADING diff --git a/TTS/tts/configs/speedy_speech_ljspeech.json b/TTS/tts/configs/speedy_speech_ljspeech.json index 9f1d3f8b..f61f35cd 100644 --- a/TTS/tts/configs/speedy_speech_ljspeech.json +++ b/TTS/tts/configs/speedy_speech_ljspeech.json @@ -109,8 +109,8 @@ "print_eval": false, // If True, it prints intermediate loss values in evalulation. "save_step": 5000, // Number of training steps expected to save traninpg stats and checkpoints. 
"checkpoint": true, // If true, it saves checkpoints per "save_step" - "keep_best": false, // If true, keeps all best_models after keep_after steps - "keep_after": 10000, // Global step after which to keep best models if keep_best is true + "keep_all_best": false, // If true, keeps all best_models after keep_after steps + "keep_after": 10000, // Global step after which to keep best models if keep_all_best is true "tb_model_param_stats": false, // true, plots param stats per layer on tensorboard. Might be memory consuming, but good for debugging.:set n "mixed_precision": false, diff --git a/TTS/utils/arguments.py b/TTS/utils/arguments.py index 0a06b562..87c0c885 100644 --- a/TTS/utils/arguments.py +++ b/TTS/utils/arguments.py @@ -68,7 +68,7 @@ def parse_arguments(argv): return parser.parse_args() -def get_last_models(path): +def get_last_checkpoint(path): """Get latest checkpoint or/and best model in path. It is based on globbing for `*.pth.tar` and the RegEx @@ -145,7 +145,7 @@ def process_args(args, model_type): if args.continue_path: args.output_path = args.continue_path args.config_path = os.path.join(args.continue_path, "config.json") - args.restore_path, best_model = get_last_models(args.continue_path) + args.restore_path, best_model = get_last_checkpoint(args.continue_path) if not args.best_path: args.best_path = best_model diff --git a/TTS/vocoder/configs/multiband-melgan_and_rwd_config.json b/TTS/vocoder/configs/multiband-melgan_and_rwd_config.json index b4d42f4b..2670c0f3 100644 --- a/TTS/vocoder/configs/multiband-melgan_and_rwd_config.json +++ b/TTS/vocoder/configs/multiband-melgan_and_rwd_config.json @@ -138,8 +138,8 @@ "print_eval": false, // If True, it prints loss values for each step in eval run. "save_step": 25000, // Number of training steps expected to plot training stats on TB and save model checkpoints. "checkpoint": true, // If true, it saves checkpoints per "save_step" - "keep_best": false, // If true, keeps all best_models after keep_after steps - "keep_after": 10000, // Global step after which to keep best models if keep_best is true + "keep_all_best": false, // If true, keeps all best_models after keep_after steps + "keep_after": 10000, // Global step after which to keep best models if keep_all_best is true "tb_model_param_stats": false, // true, plots param stats per layer on tensorboard. Might be memory consuming, but good for debugging. // DATA LOADING diff --git a/TTS/vocoder/configs/multiband_melgan_config.json b/TTS/vocoder/configs/multiband_melgan_config.json index af2af8a3..807f0836 100644 --- a/TTS/vocoder/configs/multiband_melgan_config.json +++ b/TTS/vocoder/configs/multiband_melgan_config.json @@ -128,8 +128,8 @@ "print_eval": false, // If True, it prints loss values for each step in eval run. "save_step": 25000, // Number of training steps expected to plot training stats on TB and save model checkpoints. "checkpoint": true, // If true, it saves checkpoints per "save_step" - "keep_best": false, // If true, keeps all best_models after keep_after steps - "keep_after": 10000, // Global step after which to keep best models if keep_best is true + "keep_all_best": false, // If true, keeps all best_models after keep_after steps + "keep_after": 10000, // Global step after which to keep best models if keep_all_best is true "tb_model_param_stats": false, // true, plots param stats per layer on tensorboard. Might be memory consuming, but good for debugging. 
// DATA LOADING diff --git a/TTS/vocoder/configs/multiband_melgan_config_mozilla.json b/TTS/vocoder/configs/multiband_melgan_config_mozilla.json index 0f133fa7..255315c8 100644 --- a/TTS/vocoder/configs/multiband_melgan_config_mozilla.json +++ b/TTS/vocoder/configs/multiband_melgan_config_mozilla.json @@ -141,8 +141,8 @@ "print_eval": false, // If True, it prints loss values for each step in eval run. "save_step": 25000, // Number of training steps expected to plot training stats on TB and save model checkpoints. "checkpoint": true, // If true, it saves checkpoints per "save_step" - "keep_best": false, // If true, keeps all best_models after keep_after steps - "keep_after": 10000, // Global step after which to keep best models if keep_best is true + "keep_all_best": false, // If true, keeps all best_models after keep_after steps + "keep_after": 10000, // Global step after which to keep best models if keep_all_best is true "tb_model_param_stats": false, // true, plots param stats per layer on tensorboard. Might be memory consuming, but good for debugging. // DATA LOADING diff --git a/TTS/vocoder/configs/parallel_wavegan_config.json b/TTS/vocoder/configs/parallel_wavegan_config.json index 85e659f4..193b1f7b 100644 --- a/TTS/vocoder/configs/parallel_wavegan_config.json +++ b/TTS/vocoder/configs/parallel_wavegan_config.json @@ -130,8 +130,8 @@ "print_eval": false, // If True, it prints loss values for each step in eval run. "save_step": 25000, // Number of training steps expected to plot training stats on TB and save model checkpoints. "checkpoint": true, // If true, it saves checkpoints per "save_step" - "keep_best": false, // If true, keeps all best_models after keep_after steps - "keep_after": 10000, // Global step after which to keep best models if keep_best is true + "keep_all_best": false, // If true, keeps all best_models after keep_after steps + "keep_after": 10000, // Global step after which to keep best models if keep_all_best is true "tb_model_param_stats": false, // true, plots param stats per layer on tensorboard. Might be memory consuming, but good for debugging. // DATA LOADING diff --git a/TTS/vocoder/configs/universal_fullband_melgan.json b/TTS/vocoder/configs/universal_fullband_melgan.json index efb6f3cd..511ae70e 100644 --- a/TTS/vocoder/configs/universal_fullband_melgan.json +++ b/TTS/vocoder/configs/universal_fullband_melgan.json @@ -124,8 +124,8 @@ "print_eval": false, // If True, it prints loss values for each step in eval run. "save_step": 25000, // Number of training steps expected to plot training stats on TB and save model checkpoints. "checkpoint": true, // If true, it saves checkpoints per "save_step" - "keep_best": false, // If true, keeps all best_models after keep_after steps - "keep_after": 10000, // Global step after which to keep best models if keep_best is true + "keep_all_best": false, // If true, keeps all best_models after keep_after steps + "keep_after": 10000, // Global step after which to keep best models if keep_all_best is true "tb_model_param_stats": false, // true, plots param stats per layer on tensorboard. Might be memory consuming, but good for debugging. // DATA LOADING diff --git a/TTS/vocoder/configs/wavegrad_libritts.json b/TTS/vocoder/configs/wavegrad_libritts.json index 9107d556..ade20a8f 100644 --- a/TTS/vocoder/configs/wavegrad_libritts.json +++ b/TTS/vocoder/configs/wavegrad_libritts.json @@ -103,8 +103,8 @@ "print_eval": false, // If True, it prints loss values for each step in eval run. 
"save_step": 5000, // Number of training steps expected to plot training stats on TB and save model checkpoints. "checkpoint": true, // If true, it saves checkpoints per "save_step" - "keep_best": false, // If true, keeps all best_models after keep_after steps - "keep_after": 10000, // Global step after which to keep best models if keep_best is true + "keep_all_best": false, // If true, keeps all best_models after keep_after steps + "keep_after": 10000, // Global step after which to keep best models if keep_all_best is true "tb_model_param_stats": true, // true, plots param stats per layer on tensorboard. Might be memory consuming, but good for debugging. // DATA LOADING diff --git a/TTS/vocoder/configs/wavernn_config.json b/TTS/vocoder/configs/wavernn_config.json index 220904c9..aa2d7b9f 100644 --- a/TTS/vocoder/configs/wavernn_config.json +++ b/TTS/vocoder/configs/wavernn_config.json @@ -89,8 +89,8 @@ "print_eval": false, // If True, it prints loss values for each step in eval run. "save_step": 25000, // Number of training steps expected to plot training stats on TB and save model checkpoints. "checkpoint": true, // If true, it saves checkpoints per "save_step" - "keep_best": false, // If true, keeps all best_models after keep_after steps - "keep_after": 10000, // Global step after which to keep best models if keep_best is true + "keep_all_best": false, // If true, keeps all best_models after keep_after steps + "keep_after": 10000, // Global step after which to keep best models if keep_all_best is true "tb_model_param_stats": false, // true, plots param stats per layer on tensorboard. Might be memory consuming, but good for debugging. // DATA LOADING diff --git a/TTS/vocoder/utils/io.py b/TTS/vocoder/utils/io.py index 232d972d..60def72a 100644 --- a/TTS/vocoder/utils/io.py +++ b/TTS/vocoder/utils/io.py @@ -64,7 +64,7 @@ def save_checkpoint(model, optimizer, scheduler, model_disc, optimizer_disc, def save_best_model(current_loss, best_loss, model, optimizer, scheduler, model_disc, optimizer_disc, scheduler_disc, current_step, - epoch, out_path, keep_best=False, keep_after=10000, + epoch, out_path, keep_all_best=False, keep_after=10000, **kwargs): if current_loss < best_loss: best_model_name = f'best_model_{current_step}.pth.tar' @@ -82,7 +82,7 @@ def save_best_model(current_loss, best_loss, model, optimizer, scheduler, model_loss=current_loss, **kwargs) # only delete previous if current is saved successfully - if not keep_best or (current_step < keep_after): + if not keep_all_best or (current_step < keep_after): model_names = glob.glob( os.path.join(out_path, 'best_model*.pth.tar')) for model_name in model_names: diff --git a/tests/inputs/test_glow_tts.json b/tests/inputs/test_glow_tts.json index 338ed8ec..0ee9395b 100644 --- a/tests/inputs/test_glow_tts.json +++ b/tests/inputs/test_glow_tts.json @@ -106,8 +106,8 @@ "print_eval": false, // If True, it prints intermediate loss values in evalulation. "save_step": 5000, // Number of training steps expected to save traninpg stats and checkpoints. "checkpoint": true, // If true, it saves checkpoints per "save_step" - "keep_best": true, // If true, keeps all best_models after keep_after steps - "keep_after": 10000, // Global step after which to keep best models if keep_best is true + "keep_all_best": true, // If true, keeps all best_models after keep_after steps + "keep_after": 10000, // Global step after which to keep best models if keep_all_best is true "tb_model_param_stats": false, // true, plots param stats per layer on tensorboard. 
Might be memory consuming, but good for debugging. "apex_amp_level": null, diff --git a/tests/inputs/test_speedy_speech.json b/tests/inputs/test_speedy_speech.json index 4f9f36bc..c4e27737 100644 --- a/tests/inputs/test_speedy_speech.json +++ b/tests/inputs/test_speedy_speech.json @@ -111,8 +111,8 @@ "print_eval": false, // If True, it prints intermediate loss values in evalulation. "save_step": 5000, // Number of training steps expected to save traninpg stats and checkpoints. "checkpoint": true, // If true, it saves checkpoints per "save_step" - "keep_best": true, // If true, keeps all best_models after keep_after steps - "keep_after": 10000, // Global step after which to keep best models if keep_best is true + "keep_all_best": true, // If true, keeps all best_models after keep_after steps + "keep_after": 10000, // Global step after which to keep best models if keep_all_best is true "tb_model_param_stats": false, // true, plots param stats per layer on tensorboard. Might be memory consuming, but good for debugging.:set n "mixed_precision": false, diff --git a/tests/inputs/test_train_config.json b/tests/inputs/test_train_config.json index 8c9e20d3..14449867 100644 --- a/tests/inputs/test_train_config.json +++ b/tests/inputs/test_train_config.json @@ -122,8 +122,8 @@ "print_eval": false, // If True, it prints intermediate loss values in evalulation. "save_step": 10000, // Number of training steps expected to save traninpg stats and checkpoints. "checkpoint": true, // If true, it saves checkpoints per "save_step" - "keep_best": true, // If true, keeps all best_models after keep_after steps - "keep_after": 10000, // Global step after which to keep best models if keep_best is true + "keep_all_best": true, // If true, keeps all best_models after keep_after steps + "keep_after": 10000, // Global step after which to keep best models if keep_all_best is true "tb_model_param_stats": false, // true, plots param stats per layer on tensorboard. Might be memory consuming, but good for debugging. // DATA LOADING diff --git a/tests/inputs/test_vocoder_multiband_melgan_config.json b/tests/inputs/test_vocoder_multiband_melgan_config.json index e1d201ab..92deaee4 100644 --- a/tests/inputs/test_vocoder_multiband_melgan_config.json +++ b/tests/inputs/test_vocoder_multiband_melgan_config.json @@ -131,8 +131,8 @@ "print_eval": false, // If True, it prints loss values for each step in eval run. "save_step": 25000, // Number of training steps expected to plot training stats on TB and save model checkpoints. "checkpoint": true, // If true, it saves checkpoints per "save_step" - "keep_best": true, // If true, keeps all best_models after keep_after steps - "keep_after": 10000, // Global step after which to keep best models if keep_best is true + "keep_all_best": true, // If true, keeps all best_models after keep_after steps + "keep_after": 10000, // Global step after which to keep best models if keep_all_best is true "tb_model_param_stats": false, // true, plots param stats per layer on tensorboard. Might be memory consuming, but good for debugging. // DATA LOADING diff --git a/tests/inputs/test_vocoder_wavegrad.json b/tests/inputs/test_vocoder_wavegrad.json index 5a068751..f6208e8d 100644 --- a/tests/inputs/test_vocoder_wavegrad.json +++ b/tests/inputs/test_vocoder_wavegrad.json @@ -101,8 +101,8 @@ "print_eval": false, // If True, it prints loss values for each step in eval run. "save_step": 10000, // Number of training steps expected to plot training stats on TB and save model checkpoints. 
"checkpoint": true, // If true, it saves checkpoints per "save_step" - "keep_best": true, // If true, keeps all best_models after keep_after steps - "keep_after": 10000, // Global step after which to keep best models if keep_best is true + "keep_all_best": true, // If true, keeps all best_models after keep_after steps + "keep_after": 10000, // Global step after which to keep best models if keep_all_best is true "tb_model_param_stats": true, // true, plots param stats per layer on tensorboard. Might be memory consuming, but good for debugging. // DATA LOADING diff --git a/tests/inputs/test_vocoder_wavernn_config.json b/tests/inputs/test_vocoder_wavernn_config.json index 4239e8bd..decafa70 100644 --- a/tests/inputs/test_vocoder_wavernn_config.json +++ b/tests/inputs/test_vocoder_wavernn_config.json @@ -97,8 +97,8 @@ "print_eval": false, // If True, it prints loss values for each step in eval run. "save_step": 25000, // Number of training steps expected to plot training stats on TB and save model checkpoints. "checkpoint": true, // If true, it saves checkpoints per "save_step" - "keep_best": true, // If true, keeps all best_models after keep_after steps - "keep_after": 10000, // Global step after which to keep best models if keep_best is true + "keep_all_best": true, // If true, keeps all best_models after keep_after steps + "keep_after": 10000, // Global step after which to keep best models if keep_all_best is true "tb_model_param_stats": false, // true, plots param stats per layer on tensorboard. Might be memory consuming, but good for debugging. // DATA LOADING From 3e59d3c28d918038c2e10cebcbc6613e675fc377 Mon Sep 17 00:00:00 2001 From: kirianguiller Date: Tue, 16 Feb 2021 17:25:18 +0100 Subject: [PATCH 085/100] modify according to PR reviews --- TTS/tts/datasets/preprocess.py | 12 ++++-- TTS/tts/utils/chinese_mandarin/numbers.py | 46 +++++++++++++++++------ TTS/tts/utils/synthesis.py | 1 - TTS/tts/utils/text/__init__.py | 15 ++++---- TTS/utils/synthesizer.py | 2 +- 5 files changed, 52 insertions(+), 24 deletions(-) diff --git a/TTS/tts/datasets/preprocess.py b/TTS/tts/datasets/preprocess.py index be479376..78bf14d1 100644 --- a/TTS/tts/datasets/preprocess.py +++ b/TTS/tts/datasets/preprocess.py @@ -3,6 +3,7 @@ from glob import glob import re import sys from pathlib import Path +from typing import List, Tuple from tqdm import tqdm @@ -355,10 +356,15 @@ def _voxcel_x(root_path, meta_file, voxcel_idx): +def baker(root_path: str, meta_file: str) -> List[List[str]]: + """Normalizes the Baker meta data file to TTS format -# ======================================== Baker (chinese mandarin single speaker) =========================================== -def baker(root_path, meta_file): - """Normalizes the Baker meta data file to TTS format""" + Args: + root_path (str): path to the baker dataset + meta_file (str): name of the meta dataset containing names of wav to select and the transcript of the sentence + Returns: + List[List[str]]: List of (text, wav_path, speaker_name) associated with each sentences + """ txt_file = os.path.join(root_path, meta_file) items = [] speaker_name = "baker" diff --git a/TTS/tts/utils/chinese_mandarin/numbers.py b/TTS/tts/utils/chinese_mandarin/numbers.py index 8d2f40ff..a662ea1c 100644 --- a/TTS/tts/utils/chinese_mandarin/numbers.py +++ b/TTS/tts/utils/chinese_mandarin/numbers.py @@ -10,16 +10,24 @@ import re import itertools -def _num2chinese(num :str, big=False, simp=True, o=False, twoalt=False): - """ - Converts numbers to Chinese representations. 
-    `big` : use financial characters.
-    `simp` : use simplified characters instead of traditional characters.
-    `o` : use 〇 for zero.
-    `twoalt`: use 两/兩 for two when appropriate.
-    Note that `o` and `twoalt` is ignored when `big` is used,
-    and `twoalt` is ignored when `o` is used for formal representations.
+def _num2chinese(num :str, big=False, simp=True, o=False, twoalt=False) -> str:
+    """Convert Arabic numbers (0->9) to Chinese hanzi numbers (〇 -> 九)
+
+    Args:
+        num (str): Arabic number to convert
+        big (bool, optional): use financial characters. Defaults to False.
+        simp (bool, optional): use simplified characters instead of traditional characters. Defaults to True.
+        o (bool, optional): use 〇 for 'zero'. Defaults to False.
+        twoalt (bool, optional): use 两/兩 for 'two' when appropriate. Defaults to False.
+
+    Raises:
+        ValueError: if the number is larger than 1e48
+        ValueError: if the number uses scientific ('e') notation
+
+    Returns:
+        str: the number converted to hanzi characters
     """
+
     # check num first
     nd = str(num)
     if abs(float(nd)) >= 1e48:
@@ -97,11 +105,27 @@ def _num2chinese(num :str, big=False, simp=True, o=False, twoalt=False):



-def _number_replace(match : re.Match):
+def _number_replace(match: re.Match) -> str:
+    """Substitution callback for re.sub: converts the matched number to Chinese characters
+
+    Args:
+        match (re.Match): regex match containing the number
+
+    Returns:
+        str: the matched number rewritten in Chinese characters
+    """
     match_str: str = match.group()
     return _num2chinese(match_str)


-def replace_numbers_to_characters_in_text(text : str):
+def replace_numbers_to_characters_in_text(text: str) -> str:
+    """Replace all Arabic numbers in a text with their equivalent in (simplified) Chinese characters
+
+    Args:
+        text (str): input text to transform
+
+    Returns:
+        str: the transformed text
+    """
     text = re.sub(r'[0-9]+', _number_replace, text)
     return text
\ No newline at end of file
diff --git a/TTS/tts/utils/synthesis.py b/TTS/tts/utils/synthesis.py
index e7b1546e..adbd0d20 100644
--- a/TTS/tts/utils/synthesis.py
+++ b/TTS/tts/utils/synthesis.py
@@ -220,7 +220,6 @@ def synthesis(model,
         model outputs.
         speaker_id (int): id of speaker
         style_wav (str | Dict[str, float]): Uses for style embedding of GST.
-        style_wav (str): Uses for style embedding of GST.
         truncated (bool): keep model states after inference. It can be used
         for continuous inference at long texts.
         enable_eos_bos_chars (bool): enable special chars for end of sentence and start of sentence.
diff --git a/TTS/tts/utils/text/__init__.py b/TTS/tts/utils/text/__init__.py
index 16172596..4f4a38ea 100644
--- a/TTS/tts/utils/text/__init__.py
+++ b/TTS/tts/utils/text/__init__.py
@@ -29,16 +29,15 @@ PHONEME_PUNCTUATION_PATTERN = r'['+_punctuations.replace(' ', '')+']+'


 def text2phone(text, language):
-    '''
-    Convert graphemes to phonemes. For most of the languages, it calls
+    '''Convert graphemes to phonemes. For most of the languages, it calls
     the phonemizer python library that calls espeak/espeak-ng. For chinese
     mandarin, it calls pypinyin + custom function for phonemizing

-    Parameters:
-        text (str): text to phonemize
-        language (str): language of the text
-    Returns:
-        ph (str): phonemes as a string seperated by "|"
-        ph = "ɪ|g|ˈ|z|æ|m|p|ə|l"
+      Parameters:
+          text (str): text to phonemize
+          language (str): language of the text
+      Returns:
+          ph (str): phonemes as a string separated by "|"
+          ph = "ɪ|g|ˈ|z|æ|m|p|ə|l"
     '''

     # TO REVIEW : How to have a good implementation for this?
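
A minimal, hand-traced sanity check of the number normalization documented above (the import path follows this patch series; the expected output is derived by reading `_num2chinese`, not from a test run):

    # Quick sanity check for the Mandarin number normalization.
    from TTS.tts.utils.chinese_mandarin.numbers import replace_numbers_to_characters_in_text

    print(replace_numbers_to_characters_in_text("我有2个苹果和1995元"))
    # -> 我有二个苹果和一千九百九十五元
    # Each run of Arabic digits is converted as one whole number by _num2chinese(),
    # so "1995" becomes 一千九百九十五 rather than the digit-by-digit 一九九五.
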
diff --git a/TTS/utils/synthesizer.py b/TTS/utils/synthesizer.py
index 4b4bc04c..3e65e175 100644
--- a/TTS/utils/synthesizer.py
+++ b/TTS/utils/synthesizer.py
@@ -127,7 +127,7 @@ class Synthesizer(object):
         gst_style_input = None
         if self.tts_config.use_gst:
             if self.tts_config.gst["gst_style_input"] not in ["", {}]:
-                style_wav = self.tts_config.gst["gst_style_input"]
+                gst_style_input = self.tts_config.gst["gst_style_input"]

         for sen in sens:
             # synthesize voice

From a9ea71c601a31256c6fa0d9e31de7d04cbbe0d8e Mon Sep 17 00:00:00 2001
From: kirianguiller
Date: Tue, 16 Feb 2021 17:41:06 +0100
Subject: [PATCH 086/100] remove re.Match typing in '_number_replace()'

---
 TTS/tts/utils/chinese_mandarin/numbers.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/TTS/tts/utils/chinese_mandarin/numbers.py b/TTS/tts/utils/chinese_mandarin/numbers.py
index a662ea1c..0befe6b1 100644
--- a/TTS/tts/utils/chinese_mandarin/numbers.py
+++ b/TTS/tts/utils/chinese_mandarin/numbers.py
@@ -105,7 +105,7 @@ def _num2chinese(num :str, big=False, simp=True, o=False, twoalt=False) -> str:



-def _number_replace(match: re.Match) -> str:
+def _number_replace(match) -> str:
     """Substitution callback for re.sub: converts the matched number to Chinese characters

     Args:

From fe049cb48091562e13a0294d4df730d21816d105 Mon Sep 17 00:00:00 2001
From: kirianguiller
Date: Tue, 16 Feb 2021 17:53:39 +0100
Subject: [PATCH 087/100] add pypinyin and jieba to requirements.txt (chinese implementation)

---
 requirements.txt | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/requirements.txt b/requirements.txt
index 7a0d9f76..659fe787 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -6,6 +6,8 @@ numba==0.48
 librosa==0.7.2
 phonemizer>=2.2.0
 unidecode==0.4.20
+pypinyin
+jieba
 tensorboardX
 matplotlib
 Pillow

From 45435624678fb756406e97f881f0bbc785748e6e Mon Sep 17 00:00:00 2001
From: kirianguiller
Date: Tue, 16 Feb 2021 20:23:02 +0100
Subject: [PATCH 088/100] remove gst handling in synthesizer.py class

---
 TTS/utils/synthesizer.py | 9 +--------
 1 file changed, 1 insertion(+), 8 deletions(-)

diff --git a/TTS/utils/synthesizer.py b/TTS/utils/synthesizer.py
index 3e65e175..2a779e53 100644
--- a/TTS/utils/synthesizer.py
+++ b/TTS/utils/synthesizer.py
@@ -122,13 +122,6 @@ class Synthesizer(object):
             speaker_embedding = self.init_speaker(speaker_idx)
         use_gl = self.vocoder_model is None

-
-        # check if compute gst style
-        gst_style_input = None
-        if self.tts_config.use_gst:
-            if self.tts_config.gst["gst_style_input"] not in ["", {}]:
-                gst_style_input = self.tts_config.gst["gst_style_input"]
-
         for sen in sens:
             # synthesize voice
             waveform, _, _, mel_postnet_spec, _, _ = synthesis(
@@ -138,7 +131,7 @@ class Synthesizer(object):
                 self.use_cuda,
                 self.ap,
                 speaker_idx,
-                gst_style_input,
+                None,
                 False,
                 self.tts_config.enable_eos_bos_chars,
                 use_gl,

From ce0c5eccbda3223fa652aa0caa84bdb09a119732 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Eren=20G=C3=B6lge?=
Date: Wed, 17 Feb 2021 00:35:43 +0000
Subject: [PATCH 089/100] do not test server and modelManager until fixing #657

---
 .circleci/config.yml  | 2 --
 TTS/bin/synthesize.py | 2 +-
 2 files changed, 1 insertion(+), 3 deletions(-)

diff --git a/.circleci/config.yml b/.circleci/config.yml
index 6570bad6..96fc11b4 100644
--- a/.circleci/config.yml
+++ b/.circleci/config.yml
@@ -33,9 +33,7 @@ jobs:
             cardboardlinter --refspec ${CIRCLE_BRANCH} -n auto
       - run: nosetests tests --nocapture --processes=0 --process-timeout=20 --process-restartworker
       - run: |
-          sudo ./tests/test_server_package.sh
           sudo ./tests/test_glow-tts_train.sh
-          sudo ./tests/test_server_package.sh
           sudo ./tests/test_tacotron_train.sh
           sudo ./tests/test_vocoder_gan_train.sh
           sudo ./tests/test_vocoder_wavegrad_train.sh
diff --git a/TTS/bin/synthesize.py b/TTS/bin/synthesize.py
index 382a4fc6..009affe5 100755
--- a/TTS/bin/synthesize.py
+++ b/TTS/bin/synthesize.py
@@ -67,7 +67,7 @@ def main():
     parser.add_argument(
         '--text',
         type=str,
-        default=None,
+        required=True,
         help='Text to generate speech.'
     )

From f6e6314910be652eb5cc2c29f8022e433bf681bb Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Eren=20G=C3=B6lge?=
Date: Wed, 17 Feb 2021 13:35:23 +0000
Subject: [PATCH 090/100] add RUSLAN dataset preprocessor

---
 TTS/tts/datasets/preprocess.py | 18 +++++++++++++++++-
 1 file changed, 17 insertions(+), 1 deletion(-)

diff --git a/TTS/tts/datasets/preprocess.py b/TTS/tts/datasets/preprocess.py
index 7815d87d..bed76c86 100644
--- a/TTS/tts/datasets/preprocess.py
+++ b/TTS/tts/datasets/preprocess.py
@@ -153,7 +153,8 @@ def mailabs(root_path, meta_files=None):

 def ljspeech(root_path, meta_file):
-    """Normalizes the Nancy meta data file to TTS format"""
+    """Normalizes the LJSpeech meta data file to TTS format
+    https://keithito.com/LJ-Speech-Dataset/"""
     txt_file = os.path.join(root_path, meta_file)
     items = []
     speaker_name = "ljspeech"
@@ -166,6 +167,21 @@ def ljspeech(root_path, meta_file):
     return items


+def ruslan(root_path, meta_file):
+    """Normalizes the RUSLAN meta data file to TTS format
+    https://ruslan-corpus.github.io/"""
+    txt_file = os.path.join(root_path, meta_file)
+    items = []
+    speaker_name = "ruslan"
+    with open(txt_file, 'r') as ttf:
+        for line in ttf:
+            cols = line.split('|')
+            wav_file = os.path.join(root_path, 'RUSLAN', cols[0] + '.wav')
+            text = cols[1]
+            items.append([text, wav_file, speaker_name])
+    return items
+
+
 def css10(root_path, meta_file):
     """Normalizes the CSS10 dataset file to TTS format"""
     txt_file = os.path.join(root_path, meta_file)

From a8ea0ea6cefb173cc1be9b414b4e40a2df392a5a Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Eren=20G=C3=B6lge?=
Date: Wed, 17 Feb 2021 13:35:41 +0000
Subject: [PATCH 091/100] Docstrings for audioprocessor

---
 TTS/utils/audio.py | 34 ++++++++++++++++++++++++++++++++++
 1 file changed, 34 insertions(+)

diff --git a/TTS/utils/audio.py b/TTS/utils/audio.py
index af613ba3..9a50d908 100644
--- a/TTS/utils/audio.py
+++ b/TTS/utils/audio.py
@@ -9,6 +9,40 @@ from TTS.tts.utils.data import StandardScaler

 #pylint: disable=too-many-public-methods
 class AudioProcessor(object):
+    """Audio Processor for TTS used by all the data pipelines.
+
+    Note:
+        All the class arguments are set to default values to enable a flexible initialization
+        of the class with the model config. They are not meaningful for all the arguments.
+
+    Args:
+        sample_rate (int, optional): target audio sampling rate. Defaults to None.
+        resample (bool, optional): enable/disable resampling of the audio clips when the target sampling rate does not match the original sampling rate. Defaults to False.
+        num_mels (int, optional): number of melspectrogram dimensions. Defaults to None.
+        min_level_db (int, optional): minimum dB threshold for the computed melspectrograms. Defaults to None.
+        frame_shift_ms (int, optional): shift between STFT columns in milliseconds. Defaults to None.
+        frame_length_ms (int, optional): STFT window length in milliseconds. Defaults to None.
+        hop_length (int, optional): number of audio samples between STFT columns. Used if ```frame_shift_ms``` is None. Defaults to None.
+        win_length (int, optional): STFT window length in samples. Used if ```frame_length_ms``` is None. Defaults to None.
+        ref_level_db (int, optional): reference dB level to avoid background noise. In general, <20 dB corresponds to air noise. Defaults to None.
+        fft_size (int, optional): FFT window size for STFT. Defaults to 1024.
+        power (int, optional): Exponent value applied to the spectrogram before GriffinLim. Defaults to None.
+        preemphasis (float, optional): Preemphasis coefficient. Preemphasis is disabled if == 0.0. Defaults to 0.0.
+        signal_norm (bool, optional): enable/disable signal normalization. Defaults to None.
+        symmetric_norm (bool, optional): enable/disable symmetric normalization. If set True, normalization is performed in the range [-k, k] else [0, k]. Defaults to None.
+        max_norm (float, optional): ```k``` defining the normalization range. Defaults to None.
+        mel_fmin (int, optional): minimum filter frequency for computing melspectrograms. Defaults to None.
+        mel_fmax (int, optional): maximum filter frequency for computing melspectrograms. Defaults to None.
+        spec_gain (int, optional): gain applied when converting amplitude to dB. Defaults to 20.
+        stft_pad_mode (str, optional): Padding mode for STFT. Defaults to 'reflect'.
+        clip_norm (bool, optional): enable/disable clipping of out-of-range values in the normalized audio signal. Defaults to True.
+        griffin_lim_iters (int, optional): Number of GriffinLim iterations. Defaults to None.
+        do_trim_silence (bool, optional): enable/disable silence trimming when loading the audio signal. Defaults to False.
+        trim_db (int, optional): dB threshold used for silence trimming. Defaults to 60.
+        do_sound_norm (bool, optional): enable/disable volume normalization of the audio signal. Defaults to False.
+        stats_path (str, optional): Path to the computed stats file. Defaults to None.
+        verbose (bool, optional): enable/disable logging. Defaults to True.
+ """ def __init__(self, sample_rate=None, resample=False, From c4c7bc1b88223af9799116158ed83590555bdb89 Mon Sep 17 00:00:00 2001 From: kirianguiller Date: Mon, 15 Feb 2021 16:04:47 +0100 Subject: [PATCH 092/100] Chinese mandarin implementation (tacotron2) --- TTS/.models.json | 10 + TTS/tts/datasets/preprocess.py | 16 + TTS/tts/utils/chinese_mandarin/__init__.py | 0 TTS/tts/utils/chinese_mandarin/numbers.py | 107 ++++ TTS/tts/utils/chinese_mandarin/phonemizer.py | 41 ++ .../chinese_mandarin/pinyinToPhonemes.py | 420 ++++++++++++++ TTS/tts/utils/synthesis.py | 1 + TTS/tts/utils/text/__init__.py | 18 +- TTS/tts/utils/text/cleaners.py | 9 + TTS/utils/synthesizer.py | 9 +- ...on2_TTS_and_MultiBand_MelGAN_Example.ipynb | 529 ++++++++++++++++++ 11 files changed, 1158 insertions(+), 2 deletions(-) create mode 100644 TTS/tts/utils/chinese_mandarin/__init__.py create mode 100644 TTS/tts/utils/chinese_mandarin/numbers.py create mode 100644 TTS/tts/utils/chinese_mandarin/phonemizer.py create mode 100644 TTS/tts/utils/chinese_mandarin/pinyinToPhonemes.py create mode 100644 notebooks/Chinese_Mandarin_DDC_GST_Tacotron2_TTS_and_MultiBand_MelGAN_Example.ipynb diff --git a/TTS/.models.json b/TTS/.models.json index 05997461..0fb187a4 100644 --- a/TTS/.models.json +++ b/TTS/.models.json @@ -75,6 +75,16 @@ "contact":"erengolge@gmail.com" } } + }, + "zh":{ + "baker":{ + "tacotron2-DDC-GST":{ + "model_file": "1RR9rZdV_FMm8yvtCHALtUbJf1nxbUiAw", + "config_file": "1daY1JHGXEozJ-MGYLiWEUmzEwEvM5xpz", + "stats_file": "1vl9c-D3dW_E7pdhNpDFQLX-giJc0jOtV", + "commit": "" + } + } } }, "vocoder_models":{ diff --git a/TTS/tts/datasets/preprocess.py b/TTS/tts/datasets/preprocess.py index 7815d87d..be479376 100644 --- a/TTS/tts/datasets/preprocess.py +++ b/TTS/tts/datasets/preprocess.py @@ -352,3 +352,19 @@ def _voxcel_x(root_path, meta_file, voxcel_idx): with open(str(cache_to), 'r') as f: return [x.strip().split('|') for x in f.readlines()] + + + + +# ======================================== Baker (chinese mandarin single speaker) =========================================== +def baker(root_path, meta_file): + """Normalizes the Baker meta data file to TTS format""" + txt_file = os.path.join(root_path, meta_file) + items = [] + speaker_name = "baker" + with open(txt_file, 'r') as ttf: + for line in ttf: + wav_name, text = line.rstrip('\n').split("|") + wav_path = os.path.join(root_path, "clips_22", wav_name) + items.append([text, wav_path, speaker_name]) + return items diff --git a/TTS/tts/utils/chinese_mandarin/__init__.py b/TTS/tts/utils/chinese_mandarin/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/TTS/tts/utils/chinese_mandarin/numbers.py b/TTS/tts/utils/chinese_mandarin/numbers.py new file mode 100644 index 00000000..8d2f40ff --- /dev/null +++ b/TTS/tts/utils/chinese_mandarin/numbers.py @@ -0,0 +1,107 @@ + +#!/usr/bin/env python3 +# -*- coding: utf-8 -*- + +# Licensed under WTFPL or the Unlicense or CC0. +# This uses Python 3, but it's easy to port to Python 2 by changing +# strings to u'xx'. + +import re +import itertools + + +def _num2chinese(num :str, big=False, simp=True, o=False, twoalt=False): + """ + Converts numbers to Chinese representations. + `big` : use financial characters. + `simp` : use simplified characters instead of traditional characters. + `o` : use 〇 for zero. + `twoalt`: use 两/兩 for two when appropriate. + Note that `o` and `twoalt` is ignored when `big` is used, + and `twoalt` is ignored when `o` is used for formal representations. 
+ """ + # check num first + nd = str(num) + if abs(float(nd)) >= 1e48: + raise ValueError('number out of range') + elif 'e' in nd: + raise ValueError('scientific notation is not supported') + c_symbol = '正负点' if simp else '正負點' + if o: # formal + twoalt = False + if big: + c_basic = '零壹贰叁肆伍陆柒捌玖' if simp else '零壹貳參肆伍陸柒捌玖' + c_unit1 = '拾佰仟' + c_twoalt = '贰' if simp else '貳' + else: + c_basic = '〇一二三四五六七八九' if o else '零一二三四五六七八九' + c_unit1 = '十百千' + if twoalt: + c_twoalt = '两' if simp else '兩' + else: + c_twoalt = '二' + c_unit2 = '万亿兆京垓秭穰沟涧正载' if simp else '萬億兆京垓秭穰溝澗正載' + revuniq = lambda l: ''.join(k for k, g in itertools.groupby(reversed(l))) + nd = str(num) + result = [] + if nd[0] == '+': + result.append(c_symbol[0]) + elif nd[0] == '-': + result.append(c_symbol[1]) + if '.' in nd: + integer, remainder = nd.lstrip('+-').split('.') + else: + integer, remainder = nd.lstrip('+-'), None + if int(integer): + splitted = [integer[max(i - 4, 0):i] + for i in range(len(integer), 0, -4)] + intresult = [] + for nu, unit in enumerate(splitted): + # special cases + if int(unit) == 0: # 0000 + intresult.append(c_basic[0]) + continue + elif nu > 0 and int(unit) == 2: # 0002 + intresult.append(c_twoalt + c_unit2[nu - 1]) + continue + ulist = [] + unit = unit.zfill(4) + for nc, ch in enumerate(reversed(unit)): + if ch == '0': + if ulist: # ???0 + ulist.append(c_basic[0]) + elif nc == 0: + ulist.append(c_basic[int(ch)]) + elif nc == 1 and ch == '1' and unit[1] == '0': + # special case for tens + # edit the 'elif' if you don't like + # 十四, 三千零十四, 三千三百一十四 + ulist.append(c_unit1[0]) + elif nc > 1 and ch == '2': + ulist.append(c_twoalt + c_unit1[nc - 1]) + else: + ulist.append(c_basic[int(ch)] + c_unit1[nc - 1]) + ustr = revuniq(ulist) + if nu == 0: + intresult.append(ustr) + else: + intresult.append(ustr + c_unit2[nu - 1]) + result.append(revuniq(intresult).strip(c_basic[0])) + else: + result.append(c_basic[0]) + if remainder: + result.append(c_symbol[2]) + result.append(''.join(c_basic[int(ch)] for ch in remainder)) + return ''.join(result) + + + + +def _number_replace(match : re.Match): + match_str: str = match.group() + return _num2chinese(match_str) + + +def replace_numbers_to_characters_in_text(text : str): + text = re.sub(r'[0-9]+', _number_replace, text) + return text \ No newline at end of file diff --git a/TTS/tts/utils/chinese_mandarin/phonemizer.py b/TTS/tts/utils/chinese_mandarin/phonemizer.py new file mode 100644 index 00000000..7742c491 --- /dev/null +++ b/TTS/tts/utils/chinese_mandarin/phonemizer.py @@ -0,0 +1,41 @@ +from typing import List + +import pypinyin + +from .pinyinToPhonemes import PINYIN_DICT + + +import jieba + + +def _chinese_character_to_pinyin(text: str) -> List[str]: + pinyins = pypinyin.pinyin( + text, style=pypinyin.Style.TONE3, heteronym=False, neutral_tone_with_five=True + ) + pinyins_flat_list = [item for sublist in pinyins for item in sublist] + return pinyins_flat_list + + +def _chinese_pinyin_to_phoneme(pinyin: str) -> str: + segment = pinyin[:-1] + tone = pinyin[-1] + phoneme = PINYIN_DICT.get(segment, [""])[0] + return phoneme + tone + + +def chinese_text_to_phonemes(text: str) -> str: + tokenized_text = jieba.cut(text, HMM=False) + tokenized_text = " ".join(tokenized_text) + pinyined_text: List[str] = _chinese_character_to_pinyin(tokenized_text) + + results: List[str] = [] + + for token in pinyined_text: + if token[-1] in "12345": # TODO transform to is_pinyin() + pinyin_phonemes = _chinese_pinyin_to_phoneme(token) + + results += list(pinyin_phonemes) + else: # is 
ponctuation or other + results += list(token) + + return "|".join(results) diff --git a/TTS/tts/utils/chinese_mandarin/pinyinToPhonemes.py b/TTS/tts/utils/chinese_mandarin/pinyinToPhonemes.py new file mode 100644 index 00000000..cdca44ac --- /dev/null +++ b/TTS/tts/utils/chinese_mandarin/pinyinToPhonemes.py @@ -0,0 +1,420 @@ + +PINYIN_DICT = { + "a": ["a"], + "ai": ["ai"], + "an": ["an"], + "ang": ["ɑŋ"], + "ao": ["aʌ"], + "ba": ["ba"], + "bai": ["bai"], + "ban": ["ban"], + "bang": ["bɑŋ"], + "bao": ["baʌ"], + # "be": ["be"], doesnt exist + "bei": ["bɛi"], + "ben": ["bœn"], + "beng": ["bɵŋ"], + "bi": ["bi"], + "bian": ["biɛn"], + "biao": ["biaʌ"], + "bie": ["bie"], + "bin": ["bin"], + "bing": ["bɨŋ"], + "bo": ["bo"], + "bu": ["bu"], + "ca": ["tsa"], + "cai": ["tsai"], + "can": ["tsan"], + "cang": ["tsɑŋ"], + "cao": ["tsaʌ"], + "ce": ["tsø"], + "cen": ["tsœn"], + "ceng": ["tsɵŋ"], + "cha": ["ʈʂa"], + "chai": ["ʈʂai"], + "chan": ["ʈʂan"], + "chang": ["ʈʂɑŋ"], + "chao": ["ʈʂaʌ"], + "che": ["ʈʂø"], + "chen": ["ʈʂœn"], + "cheng": ["ʈʂɵŋ"], + "chi": ["ʈʂʏ"], + "chong": ["ʈʂoŋ"], + "chou": ["ʈʂou"], + "chu": ["ʈʂu"], + "chua": ["ʈʂua"], + "chuai": ["ʈʂuai"], + "chuan": ["ʈʂuan"], + "chuang": ["ʈʂuɑŋ"], + "chui": ["ʈʂuei"], + "chun": ["ʈʂun"], + "chuo": ["ʈʂuo"], + "ci": ["tsɪ"], + "cong": ["tsoŋ"], + "cou": ["tsou"], + "cu": ["tsu"], + "cuan": ["tsuan"], + "cui": ["tsuei"], + "cun": ["tsun"], + "cuo": ["tsuo"], + "da": ["da"], + "dai": ["dai"], + "dan": ["dan"], + "dang": ["dɑŋ"], + "dao": ["daʌ"], + "de": ["dø"], + "dei": ["dei"], + # "den": ["dœn"], + "deng": ["dɵŋ"], + "di": ["di"], + "dia": ["dia"], + "dian": ["diɛn"], + "diao": ["diaʌ"], + "die": ["die"], + "ding": ["dɨŋ"], + "diu": ["dio"], + "dong": ["doŋ"], + "dou": ["dou"], + "du": ["du"], + "duan": ["duan"], + "dui": ["duei"], + "dun": ["dun"], + "duo": ["duo"], + "e": ["ø"], + "ei": ["ei"], + "en": ["œn"], + # "ng": ["œn"], + # "eng": ["ɵŋ"], + "er": ["er"], + "fa": ["fa"], + "fan": ["fan"], + "fang": ["fɑŋ"], + "fei": ["fei"], + "fen": ["fœn"], + "feng": ["fɵŋ"], + "fo": ["fo"], + "fou": ["fou"], + "fu": ["fu"], + "ga": ["ga"], + "gai": ["gai"], + "gan": ["gan"], + "gang": ["gɑŋ"], + "gao": ["gaʌ"], + "ge": ["gø"], + "gei": ["gei"], + "gen": ["gœn"], + "geng": ["gɵŋ"], + "gong": ["goŋ"], + "gou": ["gou"], + "gu": ["gu"], + "gua": ["gua"], + "guai": ["guai"], + "guan": ["guan"], + "guang": ["guɑŋ"], + "gui": ["guei"], + "gun": ["gun"], + "guo": ["guo"], + "ha": ["xa"], + "hai": ["xai"], + "han": ["xan"], + "hang": ["xɑŋ"], + "hao": ["xaʌ"], + "he": ["xø"], + "hei": ["xei"], + "hen": ["xœn"], + "heng": ["xɵŋ"], + "hong": ["xoŋ"], + "hou": ["xou"], + "hu": ["xu"], + "hua": ["xua"], + "huai": ["xuai"], + "huan": ["xuan"], + "huang": ["xuɑŋ"], + "hui": ["xuei"], + "hun": ["xun"], + "huo": ["xuo"], + "ji": ["dʑi"], + "jia": ["dʑia"], + "jian": ["dʑiɛn"], + "jiang": ["dʑiɑŋ"], + "jiao": ["dʑiaʌ"], + "jie": ["dʑie"], + "jin": ["dʑin"], + "jing": ["dʑɨŋ"], + "jiong": ["dʑioŋ"], + "jiu": ["dʑio"], + "ju": ["dʑy"], + "juan": ["dʑyɛn"], + "jue": ["dʑye"], + "jun": ["dʑyn"], + "ka": ["ka"], + "kai": ["kai"], + "kan": ["kan"], + "kang": ["kɑŋ"], + "kao": ["kaʌ"], + "ke": ["kø"], + "kei": ["kei"], + "ken": ["kœn"], + "keng": ["kɵŋ"], + "kong": ["koŋ"], + "kou": ["kou"], + "ku": ["ku"], + "kua": ["kua"], + "kuai": ["kuai"], + "kuan": ["kuan"], + "kuang": ["kuɑŋ"], + "kui": ["kuei"], + "kun": ["kun"], + "kuo": ["kuo"], + "la": ["la"], + "lai": ["lai"], + "lan": ["lan"], + "lang": ["lɑŋ"], + "lao": ["laʌ"], + "le": ["lø"], + "lei": ["lei"], + "leng": 
["lɵŋ"], + "li": ["li"], + "lia": ["lia"], + "lian": ["liɛn"], + "liang": ["liɑŋ"], + "liao": ["liaʌ"], + "lie": ["lie"], + "lin": ["lin"], + "ling": ["lɨŋ"], + "liu": ["lio"], + "lo": ["lo"], + "long": ["loŋ"], + "lou": ["lou"], + "lu": ["lu"], + "lv": ["ly"], + "luan": ["luan"], + "lve": ["lye"], + "lue": ["lue"], + "lun": ["lun"], + "luo": ["luo"], + "ma": ["ma"], + "mai": ["mai"], + "man": ["man"], + "mang": ["mɑŋ"], + "mao": ["maʌ"], + "me": ["mø"], + "mei": ["mei"], + "men": ["mœn"], + "meng": ["mɵŋ"], + "mi": ["mi"], + "mian": ["miɛn"], + "miao": ["miaʌ"], + "mie": ["mie"], + "min": ["min"], + "ming": ["mɨŋ"], + "miu": ["mio"], + "mo": ["mo"], + "mou": ["mou"], + "mu": ["mu"], + "na": ["na"], + "nai": ["nai"], + "nan": ["nan"], + "nang": ["nɑŋ"], + "nao": ["naʌ"], + "ne": ["nø"], + "nei": ["nei"], + "nen": ["nœn"], + "neng": ["nɵŋ"], + "ni": ["ni"], + "nia": ["nia"], + "nian": ["niɛn"], + "niang": ["niɑŋ"], + "niao": ["niaʌ"], + "nie": ["nie"], + "nin": ["nin"], + "ning": ["nɨŋ"], + "niu": ["nio"], + "nong": ["noŋ"], + "nou": ["nou"], + "nu": ["nu"], + "nv": ["ny"], + "nuan": ["nuan"], + "nve": ["nye"], + "nue": ["nye"], + "nuo": ["nuo"], + "o": ["o"], + "ou": ["ou"], + "pa": ["pa"], + "pai": ["pai"], + "pan": ["pan"], + "pang": ["pɑŋ"], + "pao": ["paʌ"], + "pe": ["pø"], + "pei": ["pei"], + "pen": ["pœn"], + "peng": ["pɵŋ"], + "pi": ["pi"], + "pian": ["piɛn"], + "piao": ["piaʌ"], + "pie": ["pie"], + "pin": ["pin"], + "ping": ["pɨŋ"], + "po": ["po"], + "pou": ["pou"], + "pu": ["pu"], + "qi": ["tɕi"], + "qia": ["tɕia"], + "qian": ["tɕiɛn"], + "qiang": ["tɕiɑŋ"], + "qiao": ["tɕiaʌ"], + "qie": ["tɕie"], + "qin": ["tɕin"], + "qing": ["tɕɨŋ"], + "qiong": ["tɕioŋ"], + "qiu": ["tɕio"], + "qu": ["tɕy"], + "quan": ["tɕyɛn"], + "que": ["tɕye"], + "qun": ["tɕyn"], + "ran": ["ʐan"], + "rang": ["ʐɑŋ"], + "rao": ["ʐaʌ"], + "re": ["ʐø"], + "ren": ["ʐœn"], + "reng": ["ʐɵŋ"], + "ri": ["ʐʏ"], + "rong": ["ʐoŋ"], + "rou": ["ʐou"], + "ru": ["ʐu"], + "rua": ["ʐua"], + "ruan": ["ʐuan"], + "rui": ["ʐuei"], + "run": ["ʐun"], + "ruo": ["ʐuo"], + "sa": ["sa"], + "sai": ["sai"], + "san": ["san"], + "sang": ["sɑŋ"], + "sao": ["saʌ"], + "se": ["sø"], + "sen": ["sœn"], + "seng": ["sɵŋ"], + "sha": ["ʂa"], + "shai": ["ʂai"], + "shan": ["ʂan"], + "shang": ["ʂɑŋ"], + "shao": ["ʂaʌ"], + "she": ["ʂø"], + "shei": ["ʂei"], + "shen": ["ʂœn"], + "sheng": ["ʂɵŋ"], + "shi": ["ʂʏ"], + "shou": ["ʂou"], + "shu": ["ʂu"], + "shua": ["ʂua"], + "shuai": ["ʂuai"], + "shuan": ["ʂuan"], + "shuang": ["ʂuɑŋ"], + "shui": ["ʂuei"], + "shun": ["ʂun"], + "shuo": ["ʂuo"], + "si": ["sɪ"], + "song": ["soŋ"], + "sou": ["sou"], + "su": ["su"], + "suan": ["suan"], + "sui": ["suei"], + "sun": ["sun"], + "suo": ["suo"], + "ta": ["ta"], + "tai": ["tai"], + "tan": ["tan"], + "tang": ["tɑŋ"], + "tao": ["taʌ"], + "te": ["tø"], + "tei": ["tei"], + "teng": ["tɵŋ"], + "ti": ["ti"], + "tian": ["tiɛn"], + "tiao": ["tiaʌ"], + "tie": ["tie"], + "ting": ["tɨŋ"], + "tong": ["toŋ"], + "tou": ["tou"], + "tu": ["tu"], + "tuan": ["tuan"], + "tui": ["tuei"], + "tun": ["tun"], + "tuo": ["tuo"], + "wa": ["wa"], + "wai": ["wai"], + "wan": ["wan"], + "wang": ["wɑŋ"], + "wei": ["wei"], + "wen": ["wœn"], + "weng": ["wɵŋ"], + "wo": ["wo"], + "wu": ["wu"], + "xi": ["ɕi"], + "xia": ["ɕia"], + "xian": ["ɕiɛn"], + "xiang": ["ɕiɑŋ"], + "xiao": ["ɕiaʌ"], + "xie": ["ɕie"], + "xin": ["ɕin"], + "xing": ["ɕɨŋ"], + "xiong": ["ɕioŋ"], + "xiu": ["ɕio"], + "xu": ["ɕy"], + "xuan": ["ɕyɛn"], + "xue": ["ɕye"], + "xun": ["ɕyn"], + "ya": ["ia"], + "yan": ["iɛn"], + "yang": ["iɑŋ"], + "yao": 
["iaʌ"], + "ye": ["ie"], + "yi": ["i"], + "yin": ["in"], + "ying": ["ɨŋ"], + "yo": ["io"], + "yong": ["ioŋ"], + "you": ["io"], + "yu": ["y"], + "yuan": ["yɛn"], + "yue": ["ye"], + "yun": ["yn"], + "za": ["dza"], + "zai": ["dzai"], + "zan": ["dzan"], + "zang": ["dzɑŋ"], + "zao": ["dzaʌ"], + "ze": ["dzø"], + "zei": ["dzei"], + "zen": ["dzœn"], + "zeng": ["dzɵŋ"], + "zha": ["dʒa"], + "zhai": ["dʒai"], + "zhan": ["dʒan"], + "zhang": ["dʒɑŋ"], + "zhao": ["dʒaʌ"], + "zhe": ["dʒø"], + # "zhei": ["dʒei"], it doesn't exist + "zhen": ["dʒœn"], + "zheng": ["dʒɵŋ"], + "zhi": ["dʒʏ"], + "zhong": ["dʒoŋ"], + "zhou": ["dʒou"], + "zhu": ["dʒu"], + "zhua": ["dʒua"], + "zhuai": ["dʒuai"], + "zhuan": ["dʒuan"], + "zhuang": ["dʒuɑŋ"], + "zhui": ["dʒuei"], + "zhun": ["dʒun"], + "zhuo": ["dʒuo"], + "zi": ["dzɪ"], + "zong": ["dzoŋ"], + "zou": ["dzou"], + "zu": ["dzu"], + "zuan": ["dzuan"], + "zui": ["dzuei"], + "zun": ["dzun"], + "zuo": ["dzuo"], +} \ No newline at end of file diff --git a/TTS/tts/utils/synthesis.py b/TTS/tts/utils/synthesis.py index be587211..e7b1546e 100644 --- a/TTS/tts/utils/synthesis.py +++ b/TTS/tts/utils/synthesis.py @@ -219,6 +219,7 @@ def synthesis(model, ap (TTS.tts.utils.audio.AudioProcessor): audio processor to process model outputs. speaker_id (int): id of speaker + style_wav (str | Dict[str, float]): Uses for style embedding of GST. style_wav (str): Uses for style embedding of GST. truncated (bool): keep model states after inference. It can be used for continuous inference at long texts. diff --git a/TTS/tts/utils/text/__init__.py b/TTS/tts/utils/text/__init__.py index 9771e691..16172596 100644 --- a/TTS/tts/utils/text/__init__.py +++ b/TTS/tts/utils/text/__init__.py @@ -8,6 +8,7 @@ from phonemizer.phonemize import phonemize from TTS.tts.utils.text import cleaners from TTS.tts.utils.text.symbols import (_bos, _eos, _punctuations, make_symbols, phonemes, symbols) +from TTS.tts.utils.chinese_mandarin.phonemizer import chinese_text_to_phonemes # pylint: disable=unnecessary-comprehension @@ -29,8 +30,23 @@ PHONEME_PUNCTUATION_PATTERN = r'['+_punctuations.replace(' ', '')+']+' def text2phone(text, language): ''' - Convert graphemes to phonemes. + Convert graphemes to phonemes. For most of the languages, it calls + the phonemizer python library that calls espeak/espeak-ng. For chinese + mandarin, it calls pypinyin + custom function for phonemizing + Parameters: + text (str): text to phonemize + language (str): language of the text + Returns: + ph (str): phonemes as a string seperated by "|" + ph = "ɪ|g|ˈ|z|æ|m|p|ə|l" ''' + + # TO REVIEW : How to have a good implementation for this? 
+ if language == "chinese-mandarin": + ph = chinese_text_to_phonemes(text) + return ph + + seperator = phonemizer.separator.Separator(' |', '', '|') #try: punctuations = re.findall(PHONEME_PUNCTUATION_PATTERN, text) diff --git a/TTS/tts/utils/text/cleaners.py b/TTS/tts/utils/text/cleaners.py index 7c3f1017..49a25557 100644 --- a/TTS/tts/utils/text/cleaners.py +++ b/TTS/tts/utils/text/cleaners.py @@ -15,6 +15,8 @@ from unidecode import unidecode from .number_norm import normalize_numbers from .abbreviations import abbreviations_en, abbreviations_fr from .time import expand_time_english +from TTS.tts.utils.chinese_mandarin.numbers import replace_numbers_to_characters_in_text + # Regular expression matching whitespace: _whitespace_re = re.compile(r'\s+') @@ -122,6 +124,13 @@ def portuguese_cleaners(text): text = collapse_whitespace(text) return text +def chinese_mandarin_cleaners(text: str) -> str: + '''Basic pipeline for chinese''' + text = replace_numbers_to_characters_in_text(text) + return text + + + def phoneme_cleaners(text): '''Pipeline for phonemes mode, including number and abbreviation expansion.''' text = expand_numbers(text) diff --git a/TTS/utils/synthesizer.py b/TTS/utils/synthesizer.py index 2a779e53..4b4bc04c 100644 --- a/TTS/utils/synthesizer.py +++ b/TTS/utils/synthesizer.py @@ -122,6 +122,13 @@ class Synthesizer(object): speaker_embedding = self.init_speaker(speaker_idx) use_gl = self.vocoder_model is None + + # check if compute gst style + gst_style_input = None + if self.tts_config.use_gst: + if self.tts_config.gst["gst_style_input"] not in ["", {}]: + style_wav = self.tts_config.gst["gst_style_input"] + for sen in sens: # synthesize voice waveform, _, _, mel_postnet_spec, _, _ = synthesis( @@ -131,7 +138,7 @@ class Synthesizer(object): self.use_cuda, self.ap, speaker_idx, - None, + gst_style_input, False, self.tts_config.enable_eos_bos_chars, use_gl, diff --git a/notebooks/Chinese_Mandarin_DDC_GST_Tacotron2_TTS_and_MultiBand_MelGAN_Example.ipynb b/notebooks/Chinese_Mandarin_DDC_GST_Tacotron2_TTS_and_MultiBand_MelGAN_Example.ipynb new file mode 100644 index 00000000..709dbb8d --- /dev/null +++ b/notebooks/Chinese_Mandarin_DDC_GST_Tacotron2_TTS_and_MultiBand_MelGAN_Example.ipynb @@ -0,0 +1,529 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": { + "colab_type": "text", + "id": "6LWsNd3_M3MP" + }, + "source": [ + "# Mozilla TTS on CPU Real-Time Speech Synthesis " + ] + }, + { + "cell_type": "markdown", + "metadata": { + "colab_type": "text", + "id": "FAqrSIWgLyP0" + }, + "source": [ + "We use Tacotron2 and MultiBand-Melgan models and LJSpeech dataset.\n", + "\n", + "Tacotron2 is trained using [Double Decoder Consistency](https://erogol.com/solving-attention-problems-of-tts-models-with-double-decoder-consistency/) (DDC) only for 130K steps (3 days) with a single GPU.\n", + "\n", + "MultiBand-Melgan is trained 1.45M steps with real spectrograms.\n", + "\n", + "Note that both model performances can be improved with more training." 
+ ] + }, + { + "cell_type": "markdown", + "metadata": { + "colab_type": "text", + "id": "Ku-dA4DKoeXk" + }, + "source": [ + "### Download Models" + ] + }, + { + "cell_type": "code", + "execution_count": 60, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 162 + }, + "colab_type": "code", + "id": "jGIgnWhGsxU1", + "outputId": "88725e41-a8dc-4885-b3bf-cac939f38abe", + "tags": [] + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "mkdir: cannot create directory 'data/': File exists\n", + "Downloading...\n", + "From: https://drive.google.com/uc?id=1vl9c-D3dW_E7pdhNpDFQLX-giJc0jOtV\n", + "To: /root/projects/speech/mozilla-TTS_dev/notebooks/data/tts_scale_stats.npy\n", + "100%|██████████████████████████████████████| 10.5k/10.5k [00:00<00:00, 18.1MB/s]\n" + ] + } + ], + "source": [ + "! mkdir data/\n", + "! gdown --id 1RR9rZdV_FMm8yvtCHALtUbJf1nxbUiAw -O data/tts_model.pth.tar\n", + "! gdown --id 1daY1JHGXEozJ-MGYLiWEUmzEwEvM5xpz -O data/tts_config.json\n", + "! gdown --id 1vl9c-D3dW_E7pdhNpDFQLX-giJc0jOtV -O data/tts_scale_stats.npy" + ] + }, + { + "cell_type": "code", + "execution_count": 61, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 235 + }, + "colab_type": "code", + "id": "4dnpE0-kvTsu", + "outputId": "76377c6d-789c-4995-ba00-a21a6e1c401e", + "tags": [] + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Downloading...\n", + "From: https://drive.google.com/uc?id=11oY3Tv0kQtxK_JPgxrfesa99maVXHNxU\n", + "To: /root/projects/speech/mozilla-TTS_dev/notebooks/data/vocoder_scale_stats.npy\n", + "100%|██████████████████████████████████████| 10.5k/10.5k [00:00<00:00, 16.7MB/s]\n" + ] + } + ], + "source": [ + "! gdown --id 1Ty5DZdOc0F7OTGj9oJThYbL5iVu_2G0K -O data/vocoder_model.pth.tar\n", + "! gdown --id 1Rd0R_nRCrbjEdpOwq6XwZAktvugiBvmu -O data/vocoder_config.json\n", + "! 
gdown --id 11oY3Tv0kQtxK_JPgxrfesa99maVXHNxU -O data/vocoder_scale_stats.npy" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "colab_type": "text", + "id": "Zlgi8fPdpRF0" + }, + "source": [ + "### Define TTS function" + ] + }, + { + "cell_type": "code", + "execution_count": 62, + "metadata": { + "colab": {}, + "colab_type": "code", + "id": "f-Yc42nQZG5A" + }, + "outputs": [], + "source": [ + "def tts(model, text, CONFIG, use_cuda, ap, use_gl, figures=True, style_wav=None):\n", + " t_1 = time.time()\n", + " waveform, alignment, mel_spec, mel_postnet_spec, stop_tokens, inputs = synthesis(model, text, CONFIG, use_cuda, ap, speaker_id, style_wav=style_wav,\n", + " truncated=False, enable_eos_bos_chars=CONFIG.enable_eos_bos_chars)\n", + " # mel_postnet_spec = ap.denormalize(mel_postnet_spec.T)\n", + " if not use_gl:\n", + " waveform = vocoder_model.inference(torch.FloatTensor(mel_postnet_spec.T).unsqueeze(0))\n", + " waveform = waveform.flatten()\n", + " if use_cuda:\n", + " waveform = waveform.cpu()\n", + " waveform = waveform.numpy()\n", + " rtf = (time.time() - t_1) / (len(waveform) / ap.sample_rate)\n", + " tps = (time.time() - t_1) / len(waveform)\n", + " print(waveform.shape)\n", + " print(\" > Run-time: {}\".format(time.time() - t_1))\n", + " print(\" > Real-time factor: {}\".format(rtf))\n", + " print(\" > Time per step: {}\".format(tps))\n", + " IPython.display.display(IPython.display.Audio(waveform, rate=CONFIG.audio['sample_rate'])) \n", + " return alignment, mel_postnet_spec, stop_tokens, waveform" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "colab_type": "text", + "id": "ZksegYQepkFg" + }, + "source": [ + "### Load Models" + ] + }, + { + "cell_type": "code", + "execution_count": 64, + "metadata": { + "colab": {}, + "colab_type": "code", + "id": "oVa0kOamprgj" + }, + "outputs": [], + "source": [ + "import os\n", + "import torch\n", + "import time\n", + "import IPython\n", + "\n", + "from TTS.tts.utils.generic_utils import setup_model\n", + "from TTS.utils.io import load_config\n", + "from TTS.tts.utils.text.symbols import symbols, phonemes, make_symbols\n", + "from TTS.utils.audio import AudioProcessor\n", + "from TTS.tts.utils.synthesis import synthesis" + ] + }, + { + "cell_type": "code", + "execution_count": 33, + "metadata": { + "colab": {}, + "colab_type": "code", + "id": "EY-sHVO8IFSH" + }, + "outputs": [], + "source": [ + "# runtime settings\n", + "use_cuda = False" + ] + }, + { + "cell_type": "code", + "execution_count": 65, + "metadata": { + "colab": {}, + "colab_type": "code", + "id": "_1aIUp2FpxOQ" + }, + "outputs": [], + "source": [ + "# model paths\n", + "TTS_MODEL = \"/tank/models/tts/mozilla-TTS/tacotron2-DCC/chinese_mandarin/mandarin_dca_attn_gst_dcc-February-12-2021_03+13PM-5dbb48d/checkpoint_17000.pth.tar\"\n", + "TTS_CONFIG = \"/tank/models/tts/mozilla-TTS/tacotron2-DCC/chinese_mandarin/mandarin_dca_attn_gst_dcc-February-12-2021_03+13PM-5dbb48d/config.json\"\n", + "\n", + "TTS_MODEL = \"data/tts_model.pth.tar\"\n", + "TTS_CONFIG = \"data/tts_config.json\"\n", + "\n", + "VOCODER_MODEL = \"/root/.local/share/tts/vocoder_models--en--ljspeech--mulitband-melgan/model_file.pth.tar\"\n", + "VOCODER_CONFIG = \"/root/.local/share/tts/vocoder_models--en--ljspeech--mulitband-melgan/config.json\"\n", + "\n", + "VOCODER_MODEL = \"data/vocoder_model.pth.tar\"\n", + "VOCODER_CONFIG = \"data/vocoder_config.json\"" + ] + }, + { + "cell_type": "code", + "execution_count": 66, + "metadata": { + "colab": {}, + "colab_type": "code", + "id": "CpgmdBVQplbv" + }, 
+ "outputs": [], + "source": [ + "# load configs\n", + "TTS_CONFIG = load_config(TTS_CONFIG)\n", + "VOCODER_CONFIG = load_config(VOCODER_CONFIG)\n" + ] + }, + { + "cell_type": "code", + "execution_count": 67, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 471 + }, + "colab_type": "code", + "id": "zmrQxiozIUVE", + "outputId": "60c4daa0-4c5b-4a2e-fe0d-be437d003a49", + "tags": [] + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + " > Setting up Audio Processor...\n", + " | > sample_rate:22050\n", + " | > resample:False\n", + " | > num_mels:80\n", + " | > min_level_db:-100\n", + " | > frame_shift_ms:None\n", + " | > frame_length_ms:None\n", + " | > ref_level_db:0\n", + " | > fft_size:1024\n", + " | > power:1.5\n", + " | > preemphasis:0.0\n", + " | > griffin_lim_iters:60\n", + " | > signal_norm:True\n", + " | > symmetric_norm:True\n", + " | > mel_fmin:50.0\n", + " | > mel_fmax:7600.0\n", + " | > spec_gain:1.0\n", + " | > stft_pad_mode:reflect\n", + " | > max_norm:4.0\n", + " | > clip_norm:True\n", + " | > do_trim_silence:True\n", + " | > trim_db:60\n", + " | > do_sound_norm:False\n", + " | > stats_path:data/tts_scale_stats.npy\n", + " | > hop_length:256\n", + " | > win_length:1024\n" + ] + } + ], + "source": [ + "# load the audio processor\n", + "TTS_CONFIG.audio['stats_path'] = 'data/tts_scale_stats.npy'\n", + "ap = AudioProcessor(**TTS_CONFIG.audio) " + ] + }, + { + "cell_type": "code", + "execution_count": 68, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 35 + }, + "colab_type": "code", + "id": "8fLoI4ipqMeS", + "outputId": "b789066e-e305-42ad-b3ca-eba8d9267382", + "tags": [] + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + " > Using model: tacotron2\n" + ] + } + ], + "source": [ + "# LOAD TTS MODEL\n", + "# multi speaker \n", + "speaker_id = None\n", + "speakers = []\n", + "\n", + "# load the model (chinese_mandarin special characters/punctuations are in the tts_config.json)\n", + "if TTS_CONFIG.get(\"characters\"):\n", + " _characters = TTS_CONFIG[\"characters\"][\"characters\"]\n", + " _phonemes = TTS_CONFIG[\"characters\"][\"phonemes\"]\n", + " _punctuations = TTS_CONFIG[\"characters\"][\"punctuations\"]\n", + " _pad = TTS_CONFIG[\"characters\"][\"pad\"]\n", + " _eos = TTS_CONFIG[\"characters\"][\"eos\"]\n", + " _bos = TTS_CONFIG[\"characters\"][\"bos\"]\n", + " \n", + " symbols, phonemes = make_symbols(_characters, _phonemes, punctuations= _punctuations, pad=_pad, eos=_eos, bos=_bos )\n", + "\n", + "num_chars = len(phonemes) if TTS_CONFIG.use_phonemes else len(symbols)\n", + "model = setup_model(num_chars, len(speakers), TTS_CONFIG)\n", + "\n", + "# load model state\n", + "cp = torch.load(TTS_MODEL, map_location=torch.device('cpu'))\n", + "\n", + "# load the model\n", + "model.load_state_dict(cp['model'])\n", + "if use_cuda:\n", + " model.cuda()\n", + "model.eval()\n", + "\n", + "# set model stepsize\n", + "if 'r' in cp:\n", + " model.decoder.set_r(cp['r'])" + ] + }, + { + "cell_type": "code", + "execution_count": 69, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 1000 + }, + "colab_type": "code", + "id": "zKoq0GgzqzhQ", + "outputId": "234efc61-f37a-40bc-95a3-b51896018ccb", + "tags": [] + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + " > Generator Model: multiband_melgan_generator\n", + " > Setting up Audio Processor...\n", + " | > sample_rate:22050\n", + " | > 
resample:False\n", + " | > num_mels:80\n", + " | > min_level_db:-100\n", + " | > frame_shift_ms:None\n", + " | > frame_length_ms:None\n", + " | > ref_level_db:0\n", + " | > fft_size:1024\n", + " | > power:None\n", + " | > preemphasis:0.0\n", + " | > griffin_lim_iters:None\n", + " | > signal_norm:True\n", + " | > symmetric_norm:True\n", + " | > mel_fmin:50.0\n", + " | > mel_fmax:7600.0\n", + " | > spec_gain:1.0\n", + " | > stft_pad_mode:reflect\n", + " | > max_norm:4.0\n", + " | > clip_norm:True\n", + " | > do_trim_silence:True\n", + " | > trim_db:60\n", + " | > do_sound_norm:False\n", + " | > stats_path:data/vocoder_scale_stats.npy\n", + " | > hop_length:256\n", + " | > win_length:1024\n", + "\n", + "Vocoder loaded\n" + ] + } + ], + "source": [ + "from TTS.vocoder.utils.generic_utils import setup_generator\n", + "\n", + "# LOAD VOCODER MODEL\n", + "vocoder_model = setup_generator(VOCODER_CONFIG)\n", + "vocoder_model.load_state_dict(torch.load(VOCODER_MODEL, map_location=\"cpu\")[\"model\"])\n", + "vocoder_model.remove_weight_norm()\n", + "vocoder_model.inference_padding = 0\n", + "\n", + "\n", + "VOCODER_CONFIG.audio['stats_path'] = 'data/vocoder_scale_stats.npy'\n", + "ap_vocoder = AudioProcessor(**VOCODER_CONFIG['audio']) \n", + "if use_cuda:\n", + " vocoder_model.cuda()\n", + "vocoder_model.eval()\n", + "print(\"\\nVocoder loaded\")" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "colab_type": "text", + "id": "Ws_YkPKsLgo-" + }, + "source": [ + "## Run Inference" + ] + }, + { + "cell_type": "code", + "execution_count": 70, + "metadata": {}, + "outputs": [], + "source": [ + "# Here some test sentences for you to play with :\n", + "sentence = \"我从来不会说很标准的中文。\"\n", + "sentence = \"我喜欢听人工智能的博客。\"\n", + "sentence = \"我来自一个法国郊区的地方。\"\n", + "sentence = \"不比不知道,一比吓一跳!\"\n", + "sentence = \"台湾是一个真的很好玩的地方!\"\n", + "sentence = \"干一行,行一行,行行都行。\"\n", + "sentence = \"我要盖被子,好尴尬!\"" + ] + }, + { + "cell_type": "code", + "execution_count": 71, + "metadata": {}, + "outputs": [], + "source": [ + "# You can also play with the style_wav global style token. However, the lady speaking in the baker dataset\n", + "# has no emotion through all the sentences. It's hard to get some nice GST with this.\n", + "# That's also why adding \"!\" or \"?\" at the end of sentence change nothing. 
The dataset has no such prosody.\n", + "style_wav = {\"2\": 0.3, \"1\": -0.1}\n" + ] + }, + { + "cell_type": "code", + "execution_count": 78, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 134 + }, + "colab_type": "code", + "id": "FuWxZ9Ey5Puj", + "outputId": "9c06adad-5451-4393-89a1-a2e7dc39ab91", + "tags": [] + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "(50688,)\n", + " > Run-time: 1.5945854187011719\n", + " > Real-time factor: 0.6935317513786934\n", + " > Time per step: 3.145291761617468e-05\n" + ] + }, + { + "data": { + "text/html": [ + "\n", + " \n", + " " + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "sentence = \"我喜欢听人工智能的博客。\"\n", + "style_wav = {\"2\": 0.2, \"7\": -0.1}\n", + "\n", + "align, spec, stop_tokens, wav = tts(model, sentence, TTS_CONFIG, use_cuda, ap, use_gl=False, figures=True, style_wav= style_wav)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "accelerator": "GPU", + "colab": { + "collapsed_sections": [], + "name": "DDC-TTS_and_MultiBand-MelGAN_Example.ipynb", + "provenance": [], + "toc_visible": true + }, + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.7.9" + } + }, + "nbformat": 4, + "nbformat_minor": 4 +} From fb0655d1e78c3348f842214e57150e860fbb9755 Mon Sep 17 00:00:00 2001 From: kirianguiller Date: Tue, 16 Feb 2021 17:25:18 +0100 Subject: [PATCH 093/100] modify according to PR reviews --- TTS/tts/datasets/preprocess.py | 12 ++++-- TTS/tts/utils/chinese_mandarin/numbers.py | 46 +++++++++++++++++------ TTS/tts/utils/synthesis.py | 1 - TTS/tts/utils/text/__init__.py | 15 ++++---- TTS/utils/synthesizer.py | 2 +- 5 files changed, 52 insertions(+), 24 deletions(-) diff --git a/TTS/tts/datasets/preprocess.py b/TTS/tts/datasets/preprocess.py index be479376..78bf14d1 100644 --- a/TTS/tts/datasets/preprocess.py +++ b/TTS/tts/datasets/preprocess.py @@ -3,6 +3,7 @@ from glob import glob import re import sys from pathlib import Path +from typing import List, Tuple from tqdm import tqdm @@ -355,10 +356,15 @@ def _voxcel_x(root_path, meta_file, voxcel_idx): +def baker(root_path: str, meta_file: str) -> List[List[str]]: + """Normalizes the Baker meta data file to TTS format -# ======================================== Baker (chinese mandarin single speaker) =========================================== -def baker(root_path, meta_file): - """Normalizes the Baker meta data file to TTS format""" + Args: + root_path (str): path to the baker dataset + meta_file (str): name of the meta dataset containing names of wav to select and the transcript of the sentence + Returns: + List[List[str]]: List of (text, wav_path, speaker_name) associated with each sentences + """ txt_file = os.path.join(root_path, meta_file) items = [] speaker_name = "baker" diff --git a/TTS/tts/utils/chinese_mandarin/numbers.py b/TTS/tts/utils/chinese_mandarin/numbers.py index 8d2f40ff..a662ea1c 100644 --- a/TTS/tts/utils/chinese_mandarin/numbers.py +++ b/TTS/tts/utils/chinese_mandarin/numbers.py @@ -10,16 +10,24 @@ import re import itertools -def _num2chinese(num :str, big=False, simp=True, 
o=False, twoalt=False):
-    """
-    Converts numbers to Chinese representations.
-    `big`   : use financial characters.
-    `simp`  : use simplified characters instead of traditional characters.
-    `o`     : use 〇 for zero.
-    `twoalt`: use 两/兩 for two when appropriate.
-    Note that `o` and `twoalt` is ignored when `big` is used,
-    and `twoalt` is ignored when `o` is used for formal representations.
+def _num2chinese(num :str, big=False, simp=True, o=False, twoalt=False) -> str:
+    """Convert Arabic numerals (0-9) to Chinese hanzi numbers (〇-九)
+
+    Args:
+        num (str): Arabic number to convert
+        big (bool, optional): use financial characters. Defaults to False.
+        simp (bool, optional): use simplified characters instead of traditional characters. Defaults to True.
+        o (bool, optional): use 〇 for 'zero'. Defaults to False.
+        twoalt (bool, optional): use 两/兩 for 'two' when appropriate. Defaults to False.
+
+    Raises:
+        ValueError: if the absolute value of the number is 1e48 or larger
+        ValueError: if the number contains an 'e' exponent (scientific notation)
+
+    Returns:
+        str: the converted number as hanzi characters
     """
+
     # check num first
     nd = str(num)
     if abs(float(nd)) >= 1e48:
@@ -97,11 +105,27 @@ def _num2chinese(num :str, big=False, simp=True, o=False, twoalt=False):
 
 
 
-def _number_replace(match : re.Match):
+def _number_replace(match: re.Match) -> str:
+    """function to apply in a match, transform all numbers in a match by chinese characters
+
+    Args:
+        match (re.Match): numbers regex matches
+
+    Returns:
+        str: replaced characters for the numbers
+    """
     match_str: str = match.group()
     return _num2chinese(match_str)
 
 
-def replace_numbers_to_characters_in_text(text : str):
+def replace_numbers_to_characters_in_text(text: str) -> str:
+    """Replace all Arabic numbers in a text by their equivalent in Chinese characters (simplified)
+
+    Args:
+        text (str): input text to transform
+
+    Returns:
+        str: output text
+    """
     text = re.sub(r'[0-9]+', _number_replace, text)
     return text
\ No newline at end of file
diff --git a/TTS/tts/utils/synthesis.py b/TTS/tts/utils/synthesis.py
index e7b1546e..adbd0d20 100644
--- a/TTS/tts/utils/synthesis.py
+++ b/TTS/tts/utils/synthesis.py
@@ -220,7 +220,6 @@ def synthesis(model,
         model outputs.
         speaker_id (int): id of speaker
         style_wav (str | Dict[str, float]): Uses for style embedding of GST.
-        style_wav (str): Uses for style embedding of GST.
         truncated (bool): keep model states after inference. It can be used
         for continuous inference at long texts.
         enable_eos_bos_chars (bool): enable special chars for end of sentence and start of sentence.
diff --git a/TTS/tts/utils/text/__init__.py b/TTS/tts/utils/text/__init__.py
index 16172596..4f4a38ea 100644
--- a/TTS/tts/utils/text/__init__.py
+++ b/TTS/tts/utils/text/__init__.py
@@ -29,16 +29,15 @@ PHONEME_PUNCTUATION_PATTERN = r'['+_punctuations.replace(' ', '')+']+'
 
 
 def text2phone(text, language):
-    '''
-    Convert graphemes to phonemes. For most of the languages, it calls
+    '''Convert graphemes to phonemes. For most of the languages, it calls
     the phonemizer python library that calls espeak/espeak-ng. For chinese
     mandarin, it calls pypinyin + custom function for phonemizing
-    Parameters:
-        text (str): text to phonemize
-        language (str): language of the text
-    Returns:
-        ph (str): phonemes as a string seperated by "|"
-            ph = "ɪ|g|ˈ|z|æ|m|p|ə|l"
+    Parameters:
+        text (str): text to phonemize
+        language (str): language of the text
+    Returns:
+        ph (str): phonemes as a string separated by "|"
+        ph = "ɪ|g|ˈ|z|æ|m|p|ə|l"
     '''
 
     # TO REVIEW : How to have a good implementation for this? 
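
A quick sanity check for the number-normalization helpers documented in this
patch (editor's sketch, not part of the patch: the import path is the one
introduced by this series, and the commented outputs are indicative):

    # Editor's sketch only -- not part of the patch.
    from TTS.tts.utils.chinese_mandarin.numbers import (
        _num2chinese,
        replace_numbers_to_characters_in_text,
    )

    print(_num2chinese("123"))          # 一百二十三
    print(_num2chinese("2", big=True))  # 贰 (financial form)
    print(replace_numbers_to_characters_in_text("我有2只猫"))  # 我有二只猫
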
diff --git a/TTS/utils/synthesizer.py b/TTS/utils/synthesizer.py index 4b4bc04c..3e65e175 100644 --- a/TTS/utils/synthesizer.py +++ b/TTS/utils/synthesizer.py @@ -127,7 +127,7 @@ class Synthesizer(object): gst_style_input = None if self.tts_config.use_gst: if self.tts_config.gst["gst_style_input"] not in ["", {}]: - style_wav = self.tts_config.gst["gst_style_input"] + gst_style_input = self.tts_config.gst["gst_style_input"] for sen in sens: # synthesize voice From 3911b87e54c451b6b1f65d89d381ad4b4a82a5d6 Mon Sep 17 00:00:00 2001 From: kirianguiller Date: Tue, 16 Feb 2021 17:41:06 +0100 Subject: [PATCH 094/100] remove re.Match typing in '_number_replace()' --- TTS/tts/utils/chinese_mandarin/numbers.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/TTS/tts/utils/chinese_mandarin/numbers.py b/TTS/tts/utils/chinese_mandarin/numbers.py index a662ea1c..0befe6b1 100644 --- a/TTS/tts/utils/chinese_mandarin/numbers.py +++ b/TTS/tts/utils/chinese_mandarin/numbers.py @@ -105,7 +105,7 @@ def _num2chinese(num :str, big=False, simp=True, o=False, twoalt=False) -> str: -def _number_replace(match: re.Match) -> str: +def _number_replace(match) -> str: """function to apply in a match, transform all numbers in a match by chinese characters Args: From 184ce077180115de1fbf3e948d3b27644b5f329e Mon Sep 17 00:00:00 2001 From: kirianguiller Date: Tue, 16 Feb 2021 17:53:39 +0100 Subject: [PATCH 095/100] add pypinyin and jieba to requierements.txt (chinese implementation) --- requirements.txt | 2 ++ 1 file changed, 2 insertions(+) diff --git a/requirements.txt b/requirements.txt index 7a0d9f76..659fe787 100644 --- a/requirements.txt +++ b/requirements.txt @@ -6,6 +6,8 @@ numba==0.48 librosa==0.7.2 phonemizer>=2.2.0 unidecode==0.4.20 +pypinyin +jieba tensorboardX matplotlib Pillow From 22a6bbfa80e049bf453f14dd3829f48c3a401ea5 Mon Sep 17 00:00:00 2001 From: kirianguiller Date: Tue, 16 Feb 2021 20:23:02 +0100 Subject: [PATCH 096/100] remove gst handling in synthetizer.py class --- TTS/utils/synthesizer.py | 9 +-------- 1 file changed, 1 insertion(+), 8 deletions(-) diff --git a/TTS/utils/synthesizer.py b/TTS/utils/synthesizer.py index 3e65e175..2a779e53 100644 --- a/TTS/utils/synthesizer.py +++ b/TTS/utils/synthesizer.py @@ -122,13 +122,6 @@ class Synthesizer(object): speaker_embedding = self.init_speaker(speaker_idx) use_gl = self.vocoder_model is None - - # check if compute gst style - gst_style_input = None - if self.tts_config.use_gst: - if self.tts_config.gst["gst_style_input"] not in ["", {}]: - gst_style_input = self.tts_config.gst["gst_style_input"] - for sen in sens: # synthesize voice waveform, _, _, mel_postnet_spec, _, _ = synthesis( @@ -138,7 +131,7 @@ class Synthesizer(object): self.use_cuda, self.ap, speaker_idx, - gst_style_input, + None, False, self.tts_config.enable_eos_bos_chars, use_gl, From 5b70c8ba4fc8fe9499fc7a47ecd707c4b09be223 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Eren=20G=C3=B6lge?= Date: Thu, 18 Feb 2021 17:20:36 +0000 Subject: [PATCH 097/100] enable backward compat for loading the best model --- TTS/utils/arguments.py | 30 ++++++++++++++++++++---------- 1 file changed, 20 insertions(+), 10 deletions(-) diff --git a/TTS/utils/arguments.py b/TTS/utils/arguments.py index e4983bfb..bad06262 100644 --- a/TTS/utils/arguments.py +++ b/TTS/utils/arguments.py @@ -86,24 +86,34 @@ def get_last_checkpoint(path): last_models = {} last_model_nums = {} for key in ['checkpoint', 'best_model']: - last_model_num = 0 + last_model_num = None last_model = None + # pass all the checkpoint 
files and find
+        # the one with the largest model number suffix.
         for file_name in file_names:
-            try:
-                model_num = int(re.search(
-                    f"{key}_([0-9]+)", file_name).groups()[0])
-                if model_num > last_model_num:
+            match = re.search(f"{key}_([0-9]+)", file_name)
+            if match is not None:
+                model_num = int(match.groups()[0])
+                if model_num > last_model_num or last_model_num is None:
                     last_model_num = model_num
                     last_model = file_name
-            except AttributeError:  # if there's no match in the filename
-                continue
-        last_models[key] = last_model
-        last_model_nums[key] = last_model_num
+
+        # if no checkpoint was found above,
+        # fall back to the checkpoint with the latest
+        # modification date.
+        key_file_names = [fn for fn in file_names if key in fn]
+        if last_model is None and len(key_file_names) > 0:
+            last_model = max(key_file_names, key=os.path.getctime)
+            last_model_num = os.path.getctime(last_model)
+
+        if last_model is not None:
+            last_models[key] = last_model
+            last_model_nums[key] = last_model_num
 
     # check what models were found
     if not last_models:
         raise ValueError(f"No models found in continue path {path}!")
-    elif 'checkpoint' not in last_models:  # no checkpoint just best model
+    if 'checkpoint' not in last_models:  # no checkpoint just best model
         last_models['checkpoint'] = last_models['best_model']
     elif 'best_model' not in last_models:  # no best model
         # this shouldn't happen, but let's handle it just in case

From 547bfc4ce99e7782f98d0403429d15ec23cdc91d Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Eren=20G=C3=B6lge?=
Date: Thu, 18 Feb 2021 18:24:03 +0000
Subject: [PATCH 098/100] bug fix

---
 TTS/utils/arguments.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/TTS/utils/arguments.py b/TTS/utils/arguments.py
index bad06262..6a09986c 100644
--- a/TTS/utils/arguments.py
+++ b/TTS/utils/arguments.py
@@ -94,7 +94,7 @@ def get_last_checkpoint(path):
             match = re.search(f"{key}_([0-9]+)", file_name)
             if match is not None:
                 model_num = int(match.groups()[0])
-                if model_num > last_model_num or last_model_num is None:
+                if last_model_num is None or model_num > last_model_num:
                     last_model_num = model_num
                     last_model = file_name

From 7f36d9113191e5d80c0b8957e1fe2988e269f3b2 Mon Sep 17 00:00:00 2001
From: kirianguiller
Date: Mon, 1 Mar 2021 14:55:05 +0100
Subject: [PATCH 099/100] update chinese model

---
 TTS/.models.json               | 8 ++++----
 TTS/tts/utils/text/__init__.py | 2 +-
 2 files changed, 5 insertions(+), 5 deletions(-)

diff --git a/TTS/.models.json b/TTS/.models.json
index 0fb187a4..440ca7a9 100644
--- a/TTS/.models.json
+++ b/TTS/.models.json
@@ -76,12 +76,12 @@
             }
         }
     },
-    "zh":{
+    "zh-CN":{
         "baker":{
             "tacotron2-DDC-GST":{
-                "model_file": "1RR9rZdV_FMm8yvtCHALtUbJf1nxbUiAw",
-                "config_file": "1daY1JHGXEozJ-MGYLiWEUmzEwEvM5xpz",
-                "stats_file": "1vl9c-D3dW_E7pdhNpDFQLX-giJc0jOtV",
+                "model_file": "1SYpv7V__QYDjKXa_vJmNXo1CSkcoZovy",
+                "config_file": "14BIvfJXnFHi3jcxYNX40__TR6RwJOZqi",
+                "stats_file": "1ECRlXybT6rAWp269CkhjUPwcZ10CkcqD",
                 "commit": ""
             }
         }
diff --git a/TTS/tts/utils/text/__init__.py b/TTS/tts/utils/text/__init__.py
index 4f4a38ea..2a724650 100644
--- a/TTS/tts/utils/text/__init__.py
+++ b/TTS/tts/utils/text/__init__.py
@@ -41,7 +41,7 @@ def text2phone(text, language):
     '''
 
     # TO REVIEW : How to have a good implementation for this? 
- if language == "chinese-mandarin": + if language == "zh-CN": ph = chinese_text_to_phonemes(text) return ph From c0ed527551aff2b69d5a0405066b0b71f6f4ab91 Mon Sep 17 00:00:00 2001 From: kirianguiller Date: Mon, 1 Mar 2021 15:30:49 +0100 Subject: [PATCH 100/100] update mandarin example notebook with new model gdrive ids --- ...C_GST_Tacotron2_TTS_and_MultiBand_MelGAN_Example.ipynb | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/notebooks/Chinese_Mandarin_DDC_GST_Tacotron2_TTS_and_MultiBand_MelGAN_Example.ipynb b/notebooks/Chinese_Mandarin_DDC_GST_Tacotron2_TTS_and_MultiBand_MelGAN_Example.ipynb index 709dbb8d..b28dd90c 100644 --- a/notebooks/Chinese_Mandarin_DDC_GST_Tacotron2_TTS_and_MultiBand_MelGAN_Example.ipynb +++ b/notebooks/Chinese_Mandarin_DDC_GST_Tacotron2_TTS_and_MultiBand_MelGAN_Example.ipynb @@ -64,9 +64,9 @@ ], "source": [ "! mkdir data/\n", - "! gdown --id 1RR9rZdV_FMm8yvtCHALtUbJf1nxbUiAw -O data/tts_model.pth.tar\n", - "! gdown --id 1daY1JHGXEozJ-MGYLiWEUmzEwEvM5xpz -O data/tts_config.json\n", - "! gdown --id 1vl9c-D3dW_E7pdhNpDFQLX-giJc0jOtV -O data/tts_scale_stats.npy" + "! gdown --id 1SYpv7V__QYDjKXa_vJmNXo1CSkcoZovy -O data/tts_model.pth.tar\n", + "! gdown --id 14BIvfJXnFHi3jcxYNX40__TR6RwJOZqi -O data/tts_config.json\n", + "! gdown --id 1ECRlXybT6rAWp269CkhjUPwcZ10CkcqD -O data/tts_scale_stats.npy" ] }, { @@ -526,4 +526,4 @@ }, "nbformat": 4, "nbformat_minor": 4 -} +} \ No newline at end of file
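
Editor's closing sketch (not part of the patch series): with the "zh-CN" tag and
the new Google Drive IDs registered in TTS/.models.json, the released Mandarin
model resolves by name as "tts_models/zh-CN/baker/tacotron2-DDC-GST" (usable
with synthesize.py's --model_name flag), and the phonemizer dispatch keys on the
same tag. The printed string below is indicative:

    # Editor's sketch only -- not part of the patch series.
    from TTS.tts.utils.text import text2phone

    # After PATCH 099, Mandarin phonemization is selected by the IETF-style tag.
    print(text2phone("谢谢", "zh-CN"))  # expected: "ɕ|i|e|4|ɕ|i|e|5"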