From 75c507c36a923f61a9c8e3869c7be5c96bd94773 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Eren=20G=C3=B6lge?=
Date: Tue, 30 Nov 2021 15:57:12 +0100
Subject: [PATCH] Update VITS LJSpeech recipe

---
 recipes/ljspeech/vits_tts/train_vits.py       | 22 ++++++++++++++-----
 .../test_vocoder_multiband_melgan_config.json |  2 +-
 2 files changed, 17 insertions(+), 7 deletions(-)

diff --git a/recipes/ljspeech/vits_tts/train_vits.py b/recipes/ljspeech/vits_tts/train_vits.py
index e86cc861..0588e9d9 100644
--- a/recipes/ljspeech/vits_tts/train_vits.py
+++ b/recipes/ljspeech/vits_tts/train_vits.py
@@ -6,6 +6,7 @@ from TTS.tts.configs.shared_configs import BaseDatasetConfig
 from TTS.tts.configs.vits_config import VitsConfig
 from TTS.tts.datasets import load_tts_samples
 from TTS.tts.models.vits import Vits
+from TTS.tts.utils.text.tokenizer import TTSTokenizer
 from TTS.utils.audio import AudioProcessor
 
 output_path = os.path.dirname(os.path.abspath(__file__))
@@ -35,7 +36,7 @@ config = VitsConfig(
     batch_size=48,
     eval_batch_size=16,
     batch_group_size=5,
-    num_loader_workers=4,
+    num_loader_workers=0,
     num_eval_loader_workers=4,
     run_eval=True,
     test_delay_epochs=-1,
@@ -53,14 +54,24 @@ config = VitsConfig(
     datasets=[dataset_config],
 )
 
-# init audio processor
-ap = AudioProcessor(**config.audio.to_dict())
+# INITIALIZE THE AUDIO PROCESSOR
+# Audio processor is used for feature extraction and audio I/O.
+# It mainly serves the dataloader and the training loggers.
+ap = AudioProcessor.init_from_config(config)
 
-# load training samples
+# INITIALIZE THE TOKENIZER
+# Tokenizer is used to convert text to sequences of token IDs.
+tokenizer = TTSTokenizer.init_from_config(config)
+
+# LOAD DATA SAMPLES
+# Each sample is a list of ```[text, audio_file_path, speaker_name]```
+# You can define your own sample loader that returns the list of samples.
+# Or define your custom formatter and pass it to `load_tts_samples`.
+# Check `TTS.tts.datasets.load_tts_samples` for more details.
 train_samples, eval_samples = load_tts_samples(dataset_config, eval_split=True)
 
 # init model
-model = Vits(config)
+model = Vits(config, ap, tokenizer, speaker_manager=None)
 
 # init the trainer and 🚀
 trainer = Trainer(
@@ -70,6 +81,5 @@ trainer = Trainer(
     model=model,
     train_samples=train_samples,
     eval_samples=eval_samples,
-    training_assets={"audio_processor": ap},
 )
 trainer.fit()
diff --git a/tests/inputs/test_vocoder_multiband_melgan_config.json b/tests/inputs/test_vocoder_multiband_melgan_config.json
index b8b192e4..82afc977 100644
--- a/tests/inputs/test_vocoder_multiband_melgan_config.json
+++ b/tests/inputs/test_vocoder_multiband_melgan_config.json
@@ -86,7 +86,7 @@
         "mel_fmax": null
     },
 
-    "target_loss": "avg_G_loss", // loss value to pick the best model to save after each epoch
+    "target_loss": "G_avg_loss", // loss value to pick the best model to save after each epoch
 
     // DISCRIMINATOR
     "discriminator_model": "melgan_multiscale_discriminator",
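
Note on the new `LOAD DATA SAMPLES` comment block: it mentions passing a custom formatter to `load_tts_samples`. Below is a minimal sketch of that path, assuming `load_tts_samples` accepts a `formatter` callable as the comment suggests and that custom formatters follow the `(root_path, meta_file, **kwargs)` convention of the built-ins in `TTS.tts.datasets.formatters`. The dataset path, the pipe-separated `metadata.csv` layout, and the `my_speaker` label are placeholders for illustration, not part of this patch.

```python
import os

from TTS.tts.configs.shared_configs import BaseDatasetConfig
from TTS.tts.datasets import load_tts_samples


def my_formatter(root_path, meta_file, **kwargs):
    """Return samples as ```[text, audio_file_path, speaker_name]``` lists.

    Assumes a hypothetical pipe-separated metadata file where the first
    column is the wav file name (without extension) and the second is
    the transcript.
    """
    samples = []
    with open(os.path.join(root_path, meta_file), encoding="utf-8") as f:
        for line in f:
            cols = line.strip().split("|")
            wav_path = os.path.join(root_path, "wavs", f"{cols[0]}.wav")
            samples.append([cols[1], wav_path, "my_speaker"])
    return samples


# Hypothetical dataset layout: /data/my_dataset/metadata.csv + /data/my_dataset/wavs/*.wav
dataset_config = BaseDatasetConfig(meta_file_train="metadata.csv", path="/data/my_dataset/")

# Pass the formatter explicitly instead of relying on a built-in one.
train_samples, eval_samples = load_tts_samples(
    dataset_config, eval_split=True, formatter=my_formatter
)
```

When a formatter is passed this way, the `name` field of `BaseDatasetConfig`, which normally selects one of the built-in formatters, should not be needed.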