From df0d58bf09c7189bebbd2e60498e9a3c300981f0 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Eren=20G=C3=B6lge?=
Date: Wed, 8 Dec 2021 15:15:56 +0000
Subject: [PATCH] Update VCTK recipes

---
 TTS/tts/datasets/formatters.py                 |  2 +-
 .../speedy_speech/train_speedy_speech.py       |  6 ---
 recipes/vctk/fast_pitch/train_fast_pitch.py    | 43 +++++++++++------
 recipes/vctk/fast_speech/train_fast_speech.py  | 48 +++++++++++--------
 recipes/vctk/glow_tts/train_glow_tts.py        | 41 +++++++++++-----
 .../vctk/speedy_speech/train_speedy_speech.py  | 44 ++++++++++-------
 .../vctk/tacotron-DDC/train_tacotron-DDC.py    | 42 ++++++++++------
 .../vctk/tacotron2-DDC/train_tacotron2-ddc.py  | 41 ++++++++++------
 recipes/vctk/tacotron2/train_tacotron2.py      | 41 ++++++++++------
 9 files changed, 192 insertions(+), 116 deletions(-)

diff --git a/TTS/tts/datasets/formatters.py b/TTS/tts/datasets/formatters.py
index 1375757a..546c3cc3 100644
--- a/TTS/tts/datasets/formatters.py
+++ b/TTS/tts/datasets/formatters.py
@@ -289,7 +289,7 @@ def brspeech(root_path, meta_file, ignored_speakers=None):
     return items
 
 
-def vctk(root_path, meta_files=None, wavs_path="wav22", mic="mic2", ignored_speakers=None):
+def vctk(root_path, meta_files=None, wavs_path="wav48_silence_trimmed", mic="mic2", ignored_speakers=None):
     """https://datashare.ed.ac.uk/bitstream/handle/10283/3443/VCTK-Corpus-0.92.zip"""
     file_ext = 'flac'
     test_speakers = meta_files
diff --git a/recipes/ljspeech/speedy_speech/train_speedy_speech.py b/recipes/ljspeech/speedy_speech/train_speedy_speech.py
index 468e8a5f..2f8896c5 100644
--- a/recipes/ljspeech/speedy_speech/train_speedy_speech.py
+++ b/recipes/ljspeech/speedy_speech/train_speedy_speech.py
@@ -68,12 +68,6 @@ tokenizer, config = TTSTokenizer.init_from_config(config)
 # Check `TTS.tts.datasets.load_tts_samples` for more details.
train_samples, eval_samples = load_tts_samples(dataset_config, eval_split=True) -# init audio processor -ap = AudioProcessor(**config.audio.to_dict()) - -# load training samples -train_samples, eval_samples = load_tts_samples(dataset_config, eval_split=True) - # init model model = ForwardTTS(config, ap, tokenizer) diff --git a/recipes/vctk/fast_pitch/train_fast_pitch.py b/recipes/vctk/fast_pitch/train_fast_pitch.py index f40587e0..f7a2ef06 100644 --- a/recipes/vctk/fast_pitch/train_fast_pitch.py +++ b/recipes/vctk/fast_pitch/train_fast_pitch.py @@ -6,6 +6,7 @@ from TTS.tts.configs.fast_pitch_config import FastPitchConfig from TTS.tts.datasets import load_tts_samples from TTS.tts.models.forward_tts import ForwardTTS from TTS.tts.utils.speakers import SpeakerManager +from TTS.tts.utils.text.tokenizer import TTSTokenizer from TTS.utils.audio import AudioProcessor output_path = os.path.dirname(os.path.abspath(__file__)) @@ -32,6 +33,7 @@ config = FastPitchConfig( num_loader_workers=8, num_eval_loader_workers=4, compute_input_seq_cache=True, + precompute_num_workers=4, compute_f0=True, f0_cache_path=os.path.join(output_path, "f0_cache"), run_eval=True, @@ -39,23 +41,35 @@ config = FastPitchConfig( epochs=1000, text_cleaner="english_cleaners", use_phonemes=True, - use_espeak_phonemes=False, phoneme_language="en-us", phoneme_cache_path=os.path.join(output_path, "phoneme_cache"), print_step=50, print_eval=False, mixed_precision=False, - sort_by_audio_len=True, - max_seq_len=500000, + min_text_len=0, + max_text_len=500, + min_audio_len=0, + max_audio_len=500000, output_path=output_path, datasets=[dataset_config], use_speaker_embedding=True, ) -# init audio processor -ap = AudioProcessor(**config.audio) +# INITIALIZE THE AUDIO PROCESSOR +# Audio processor is used for feature extraction and audio I/O. +# It mainly serves to the dataloader and the training loggers. +ap = AudioProcessor.init_from_config(config) -# load training samples +# INITIALIZE THE TOKENIZER +# Tokenizer is used to convert text to sequences of token IDs. +# If characters are not defined in the config, default characters are passed to the config +tokenizer, config = TTSTokenizer.init_from_config(config) + +# LOAD DATA SAMPLES +# Each sample is a list of ```[text, audio_file_path, speaker_name]``` +# You can define your custom sample loader returning the list of samples. +# Or define your custom formatter and pass it to the `load_tts_samples`. +# Check `TTS.tts.datasets.load_tts_samples` for more details. train_samples, eval_samples = load_tts_samples(dataset_config, eval_split=True) # init speaker manager for multi-speaker training @@ -65,16 +79,15 @@ speaker_manager.set_speaker_ids_from_data(train_samples + eval_samples) config.model_args.num_speakers = speaker_manager.num_speakers # init model -model = ForwardTTS(config, speaker_manager) +model = ForwardTTS(config, ap, tokenizer, speaker_manager=speaker_manager) -# init the trainer and 🚀 +# INITIALIZE THE TRAINER +# Trainer provides a generic API to train all the 🐸TTS models with all its perks like mixed-precision training, +# distributed training, etc. trainer = Trainer( - TrainingArgs(), - config, - output_path, - model=model, - train_samples=train_samples, - eval_samples=eval_samples, - training_assets={"audio_processor": ap}, + TrainingArgs(), config, output_path, model=model, train_samples=train_samples, eval_samples=eval_samples ) + +# AND... 3,2,1... 
🚀 trainer.fit() + diff --git a/recipes/vctk/fast_speech/train_fast_speech.py b/recipes/vctk/fast_speech/train_fast_speech.py index b2988809..853bbb54 100644 --- a/recipes/vctk/fast_speech/train_fast_speech.py +++ b/recipes/vctk/fast_speech/train_fast_speech.py @@ -6,6 +6,7 @@ from TTS.tts.configs.fast_speech_config import FastSpeechConfig from TTS.tts.datasets import load_tts_samples from TTS.tts.models.forward_tts import ForwardTTS from TTS.tts.utils.speakers import SpeakerManager +from TTS.tts.utils.text.tokenizer import TTSTokenizer from TTS.utils.audio import AudioProcessor output_path = os.path.dirname(os.path.abspath(__file__)) @@ -25,37 +26,48 @@ audio_config = BaseAudioConfig( ) config = FastSpeechConfig( - run_name="fast_pitch_ljspeech", + run_name="fast_speech_vctk", audio=audio_config, batch_size=32, eval_batch_size=16, num_loader_workers=8, num_eval_loader_workers=4, compute_input_seq_cache=True, - compute_f0=True, - f0_cache_path=os.path.join(output_path, "f0_cache"), + precompute_num_workers=4, run_eval=True, test_delay_epochs=-1, epochs=1000, text_cleaner="english_cleaners", use_phonemes=True, - use_espeak_phonemes=False, phoneme_language="en-us", phoneme_cache_path=os.path.join(output_path, "phoneme_cache"), print_step=50, print_eval=False, mixed_precision=False, - sort_by_audio_len=True, - max_seq_len=500000, + min_text_len=0, + max_text_len=500, + min_audio_len=0, + max_audio_len=500000, output_path=output_path, datasets=[dataset_config], use_speaker_embedding=True, ) -# init audio processor -ap = AudioProcessor(**config.audio) +## INITIALIZE THE AUDIO PROCESSOR +# Audio processor is used for feature extraction and audio I/O. +# It mainly serves to the dataloader and the training loggers. +ap = AudioProcessor.init_from_config(config) -# load training samples +# INITIALIZE THE TOKENIZER +# Tokenizer is used to convert text to sequences of token IDs. +# If characters are not defined in the config, default characters are passed to the config +tokenizer, config = TTSTokenizer.init_from_config(config) + +# LOAD DATA SAMPLES +# Each sample is a list of ```[text, audio_file_path, speaker_name]``` +# You can define your custom sample loader returning the list of samples. +# Or define your custom formatter and pass it to the `load_tts_samples`. +# Check `TTS.tts.datasets.load_tts_samples` for more details. train_samples, eval_samples = load_tts_samples(dataset_config, eval_split=True) # init speaker manager for multi-speaker training @@ -65,16 +77,14 @@ speaker_manager.set_speaker_ids_from_data(train_samples + eval_samples) config.model_args.num_speakers = speaker_manager.num_speakers # init model -model = ForwardTTS(config, speaker_manager) +model = ForwardTTS(config, ap, tokenizer, speaker_manager=speaker_manager) -# init the trainer and 🚀 +# INITIALIZE THE TRAINER +# Trainer provides a generic API to train all the 🐸TTS models with all its perks like mixed-precision training, +# distributed training, etc. trainer = Trainer( - TrainingArgs(), - config, - output_path, - model=model, - train_samples=train_samples, - eval_samples=eval_samples, - training_assets={"audio_processor": ap}, + TrainingArgs(), config, output_path, model=model, train_samples=train_samples, eval_samples=eval_samples ) -trainer.fit() + +# AND... 3,2,1... 
🚀 +trainer.fit() \ No newline at end of file diff --git a/recipes/vctk/glow_tts/train_glow_tts.py b/recipes/vctk/glow_tts/train_glow_tts.py index 8c9f5388..30050ef5 100644 --- a/recipes/vctk/glow_tts/train_glow_tts.py +++ b/recipes/vctk/glow_tts/train_glow_tts.py @@ -7,6 +7,7 @@ from TTS.tts.configs.shared_configs import BaseDatasetConfig from TTS.tts.datasets import load_tts_samples from TTS.tts.models.glow_tts import GlowTTS from TTS.tts.utils.speakers import SpeakerManager +from TTS.tts.utils.text.tokenizer import TTSTokenizer from TTS.utils.audio import AudioProcessor # set experiment paths @@ -32,6 +33,7 @@ config = GlowTTSConfig( eval_batch_size=16, num_loader_workers=4, num_eval_loader_workers=4, + precompute_num_workers=4, run_eval=True, test_delay_epochs=-1, epochs=1000, @@ -45,12 +47,27 @@ config = GlowTTSConfig( output_path=output_path, datasets=[dataset_config], use_speaker_embedding=True, + min_text_len=0, + max_text_len=500, + min_audio_len=0, + max_audio_len=500000, ) -# init audio processor -ap = AudioProcessor(**config.audio.to_dict()) +# INITIALIZE THE AUDIO PROCESSOR +# Audio processor is used for feature extraction and audio I/O. +# It mainly serves to the dataloader and the training loggers. +ap = AudioProcessor.init_from_config(config) -# load training samples +# INITIALIZE THE TOKENIZER +# Tokenizer is used to convert text to sequences of token IDs. +# If characters are not defined in the config, default characters are passed to the config +tokenizer, config = TTSTokenizer.init_from_config(config) + +# LOAD DATA SAMPLES +# Each sample is a list of ```[text, audio_file_path, speaker_name]``` +# You can define your custom sample loader returning the list of samples. +# Or define your custom formatter and pass it to the `load_tts_samples`. +# Check `TTS.tts.datasets.load_tts_samples` for more details. train_samples, eval_samples = load_tts_samples(dataset_config, eval_split=True) # init speaker manager for multi-speaker training @@ -60,16 +77,14 @@ speaker_manager.set_speaker_ids_from_data(train_samples + eval_samples) config.num_speakers = speaker_manager.num_speakers # init model -model = GlowTTS(config, speaker_manager) +model = GlowTTS(config, ap, tokenizer, speaker_manager=speaker_manager) -# init the trainer and 🚀 +# INITIALIZE THE TRAINER +# Trainer provides a generic API to train all the 🐸TTS models with all its perks like mixed-precision training, +# distributed training, etc. trainer = Trainer( - TrainingArgs(), - config, - output_path, - model=model, - train_samples=train_samples, - eval_samples=eval_samples, - training_assets={"audio_processor": ap}, + TrainingArgs(), config, output_path, model=model, train_samples=train_samples, eval_samples=eval_samples ) -trainer.fit() + +# AND... 3,2,1... 
🚀 +trainer.fit() \ No newline at end of file diff --git a/recipes/vctk/speedy_speech/train_speedy_speech.py b/recipes/vctk/speedy_speech/train_speedy_speech.py index 81f78d26..85e347fc 100644 --- a/recipes/vctk/speedy_speech/train_speedy_speech.py +++ b/recipes/vctk/speedy_speech/train_speedy_speech.py @@ -6,6 +6,7 @@ from TTS.tts.configs.speedy_speech_config import SpeedySpeechConfig from TTS.tts.datasets import load_tts_samples from TTS.tts.models.forward_tts import ForwardTTS from TTS.tts.utils.speakers import SpeakerManager +from TTS.tts.utils.text.tokenizer import TTSTokenizer from TTS.utils.audio import AudioProcessor output_path = os.path.dirname(os.path.abspath(__file__)) @@ -32,30 +33,41 @@ config = SpeedySpeechConfig( num_loader_workers=8, num_eval_loader_workers=4, compute_input_seq_cache=True, - compute_f0=True, - f0_cache_path=os.path.join(output_path, "f0_cache"), + precompute_num_workers=4, run_eval=True, test_delay_epochs=-1, epochs=1000, text_cleaner="english_cleaners", use_phonemes=True, - use_espeak_phonemes=False, phoneme_language="en-us", phoneme_cache_path=os.path.join(output_path, "phoneme_cache"), print_step=50, print_eval=False, mixed_precision=False, - sort_by_audio_len=True, - max_seq_len=500000, + min_text_len=0, + max_text_len=500, + min_audio_len=0, + max_audio_len=500000, output_path=output_path, datasets=[dataset_config], use_speaker_embedding=True, ) -# init audio processor -ap = AudioProcessor(**config.audio) +# INITIALIZE THE AUDIO PROCESSOR +# Audio processor is used for feature extraction and audio I/O. +# It mainly serves to the dataloader and the training loggers. +ap = AudioProcessor.init_from_config(config) -# load training samples +# INITIALIZE THE TOKENIZER +# Tokenizer is used to convert text to sequences of token IDs. +# If characters are not defined in the config, default characters are passed to the config +tokenizer, config = TTSTokenizer.init_from_config(config) + +# LOAD DATA SAMPLES +# Each sample is a list of ```[text, audio_file_path, speaker_name]``` +# You can define your custom sample loader returning the list of samples. +# Or define your custom formatter and pass it to the `load_tts_samples`. +# Check `TTS.tts.datasets.load_tts_samples` for more details. train_samples, eval_samples = load_tts_samples(dataset_config, eval_split=True) # init speaker manager for multi-speaker training @@ -65,16 +77,14 @@ speaker_manager.set_speaker_ids_from_data(train_samples + eval_samples) config.model_args.num_speakers = speaker_manager.num_speakers # init model -model = ForwardTTS(config, speaker_manager) +model = ForwardTTS(config, ap, tokenizer, speaker_manager) -# init the trainer and 🚀 +# INITIALIZE THE TRAINER +# Trainer provides a generic API to train all the 🐸TTS models with all its perks like mixed-precision training, +# distributed training, etc. trainer = Trainer( - TrainingArgs(), - config, - output_path, - model=model, - train_samples=train_samples, - eval_samples=eval_samples, - training_assets={"audio_processor": ap}, + TrainingArgs(), config, output_path, model=model, train_samples=train_samples, eval_samples=eval_samples ) + +# AND... 3,2,1... 
🚀 trainer.fit() diff --git a/recipes/vctk/tacotron-DDC/train_tacotron-DDC.py b/recipes/vctk/tacotron-DDC/train_tacotron-DDC.py index b0030f17..7960b34b 100644 --- a/recipes/vctk/tacotron-DDC/train_tacotron-DDC.py +++ b/recipes/vctk/tacotron-DDC/train_tacotron-DDC.py @@ -7,6 +7,7 @@ from TTS.tts.configs.tacotron_config import TacotronConfig from TTS.tts.datasets import load_tts_samples from TTS.tts.models.tacotron import Tacotron from TTS.tts.utils.speakers import SpeakerManager +from TTS.tts.utils.text.tokenizer import TTSTokenizer from TTS.utils.audio import AudioProcessor output_path = os.path.dirname(os.path.abspath(__file__)) @@ -32,6 +33,7 @@ config = TacotronConfig( # This is the config that is saved for the future use eval_batch_size=16, num_loader_workers=4, num_eval_loader_workers=4, + precompute_num_workers=4, run_eval=True, test_delay_epochs=-1, r=6, @@ -45,18 +47,30 @@ config = TacotronConfig( # This is the config that is saved for the future use print_step=25, print_eval=False, mixed_precision=True, - sort_by_audio_len=True, - min_seq_len=0, - max_seq_len=44000 * 10, # 44k is the original sampling rate before resampling, corresponds to 10 seconds of audio + min_text_len=0, + max_text_len=500, + min_audio_len=0, + max_audio_len=44000 * 10, # 44k is the original sampling rate before resampling, corresponds to 10 seconds of audio output_path=output_path, datasets=[dataset_config], use_speaker_embedding=True, # set this to enable multi-sepeaker training ) -# init audio processor -ap = AudioProcessor(**config.audio.to_dict()) +## INITIALIZE THE AUDIO PROCESSOR +# Audio processor is used for feature extraction and audio I/O. +# It mainly serves to the dataloader and the training loggers. +ap = AudioProcessor.init_from_config(config) -# load training samples +# INITIALIZE THE TOKENIZER +# Tokenizer is used to convert text to sequences of token IDs. +# If characters are not defined in the config, default characters are passed to the config +tokenizer, config = TTSTokenizer.init_from_config(config) + +# LOAD DATA SAMPLES +# Each sample is a list of ```[text, audio_file_path, speaker_name]``` +# You can define your custom sample loader returning the list of samples. +# Or define your custom formatter and pass it to the `load_tts_samples`. +# Check `TTS.tts.datasets.load_tts_samples` for more details. train_samples, eval_samples = load_tts_samples(dataset_config, eval_split=True) # init speaker manager for multi-speaker training @@ -65,16 +79,14 @@ speaker_manager = SpeakerManager() speaker_manager.set_speaker_ids_from_data(train_samples + eval_samples) # init model -model = Tacotron(config, speaker_manager) +model = Tacotron(config, ap, tokenizer, speaker_manager) -# init the trainer and 🚀 +# INITIALIZE THE TRAINER +# Trainer provides a generic API to train all the 🐸TTS models with all its perks like mixed-precision training, +# distributed training, etc. trainer = Trainer( - TrainingArgs(), - config, - output_path, - model=model, - train_samples=train_samples, - eval_samples=eval_samples, - training_assets={"audio_processor": ap}, + TrainingArgs(), config, output_path, model=model, train_samples=train_samples, eval_samples=eval_samples ) + +# AND... 3,2,1... 
🚀 trainer.fit() diff --git a/recipes/vctk/tacotron2-DDC/train_tacotron2-ddc.py b/recipes/vctk/tacotron2-DDC/train_tacotron2-ddc.py index 63efb784..bc7951b5 100644 --- a/recipes/vctk/tacotron2-DDC/train_tacotron2-ddc.py +++ b/recipes/vctk/tacotron2-DDC/train_tacotron2-ddc.py @@ -7,6 +7,7 @@ from TTS.tts.configs.tacotron2_config import Tacotron2Config from TTS.tts.datasets import load_tts_samples from TTS.tts.models.tacotron2 import Tacotron2 from TTS.tts.utils.speakers import SpeakerManager +from TTS.tts.utils.text.tokenizer import TTSTokenizer from TTS.utils.audio import AudioProcessor output_path = os.path.dirname(os.path.abspath(__file__)) @@ -44,9 +45,10 @@ config = Tacotron2Config( # This is the config that is saved for the future use print_step=150, print_eval=False, mixed_precision=True, - sort_by_audio_len=True, - min_seq_len=14800, - max_seq_len=22050 * 10, # 44k is the original sampling rate before resampling, corresponds to 10 seconds of audio + min_text_len=0, + max_text_len=500, + min_audio_len=0, + max_audio_len=44000 * 10, output_path=output_path, datasets=[dataset_config], use_speaker_embedding=True, # set this to enable multi-sepeaker training @@ -60,10 +62,21 @@ config = Tacotron2Config( # This is the config that is saved for the future use lr=3e-5, ) -# init audio processor -ap = AudioProcessor(**config.audio.to_dict()) +# INITIALIZE THE AUDIO PROCESSOR +# Audio processor is used for feature extraction and audio I/O. +# It mainly serves to the dataloader and the training loggers. +ap = AudioProcessor.init_from_config(config) -# load training samples +# INITIALIZE THE TOKENIZER +# Tokenizer is used to convert text to sequences of token IDs. +# If characters are not defined in the config, default characters are passed to the config +tokenizer, config = TTSTokenizer.init_from_config(config) + +# LOAD DATA SAMPLES +# Each sample is a list of ```[text, audio_file_path, speaker_name]``` +# You can define your custom sample loader returning the list of samples. +# Or define your custom formatter and pass it to the `load_tts_samples`. +# Check `TTS.tts.datasets.load_tts_samples` for more details. train_samples, eval_samples = load_tts_samples(dataset_config, eval_split=True) # init speaker manager for multi-speaker training @@ -72,16 +85,14 @@ speaker_manager = SpeakerManager() speaker_manager.set_speaker_ids_from_data(train_samples + eval_samples) # init model -model = Tacotron2(config, speaker_manager) +model = Tacotron2(config, ap, tokenizer, speaker_manager) -# init the trainer and 🚀 +# INITIALIZE THE TRAINER +# Trainer provides a generic API to train all the 🐸TTS models with all its perks like mixed-precision training, +# distributed training, etc. trainer = Trainer( - TrainingArgs(), - config, - output_path, - model=model, - train_samples=train_samples, - eval_samples=eval_samples, - training_assets={"audio_processor": ap}, + TrainingArgs(), config, output_path, model=model, train_samples=train_samples, eval_samples=eval_samples ) + +# AND... 3,2,1... 
🚀 trainer.fit() diff --git a/recipes/vctk/tacotron2/train_tacotron2.py b/recipes/vctk/tacotron2/train_tacotron2.py index 346d650b..82dedade 100644 --- a/recipes/vctk/tacotron2/train_tacotron2.py +++ b/recipes/vctk/tacotron2/train_tacotron2.py @@ -7,6 +7,7 @@ from TTS.tts.configs.tacotron2_config import Tacotron2Config from TTS.tts.datasets import load_tts_samples from TTS.tts.models.tacotron2 import Tacotron2 from TTS.tts.utils.speakers import SpeakerManager +from TTS.tts.utils.text.tokenizer import TTSTokenizer from TTS.utils.audio import AudioProcessor output_path = os.path.dirname(os.path.abspath(__file__)) @@ -44,9 +45,10 @@ config = Tacotron2Config( # This is the config that is saved for the future use print_step=150, print_eval=False, mixed_precision=True, - sort_by_audio_len=True, - min_seq_len=14800, - max_seq_len=22050 * 10, # 44k is the original sampling rate before resampling, corresponds to 10 seconds of audio + min_text_len=0, + max_text_len=500, + min_audio_len=0, + max_audio_len=44000 * 10, output_path=output_path, datasets=[dataset_config], use_speaker_embedding=True, # set this to enable multi-sepeaker training @@ -60,10 +62,21 @@ config = Tacotron2Config( # This is the config that is saved for the future use lr=3e-5, ) -# init audio processor -ap = AudioProcessor(**config.audio.to_dict()) +## INITIALIZE THE AUDIO PROCESSOR +# Audio processor is used for feature extraction and audio I/O. +# It mainly serves to the dataloader and the training loggers. +ap = AudioProcessor.init_from_config(config) -# load training samples +# INITIALIZE THE TOKENIZER +# Tokenizer is used to convert text to sequences of token IDs. +# If characters are not defined in the config, default characters are passed to the config +tokenizer, config = TTSTokenizer.init_from_config(config) + +# LOAD DATA SAMPLES +# Each sample is a list of ```[text, audio_file_path, speaker_name]``` +# You can define your custom sample loader returning the list of samples. +# Or define your custom formatter and pass it to the `load_tts_samples`. +# Check `TTS.tts.datasets.load_tts_samples` for more details. train_samples, eval_samples = load_tts_samples(dataset_config, eval_split=True) # init speaker manager for multi-speaker training @@ -72,16 +85,14 @@ speaker_manager = SpeakerManager() speaker_manager.set_speaker_ids_from_data(train_samples + eval_samples) # init model -model = Tacotron2(config, speaker_manager) +model = Tacotron2(config, ap, tokenizer, speaker_manager) -# init the trainer and 🚀 +# INITIALIZE THE TRAINER +# Trainer provides a generic API to train all the 🐸TTS models with all its perks like mixed-precision training, +# distributed training, etc. trainer = Trainer( - TrainingArgs(), - config, - output_path, - model=model, - train_samples=train_samples, - eval_samples=eval_samples, - training_assets={"audio_processor": ap}, + TrainingArgs(), config, output_path, model=model, train_samples=train_samples, eval_samples=eval_samples ) + +# AND... 3,2,1... 🚀 trainer.fit()
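
A note on the updated recipe API: every VCTK recipe in this patch now builds its pieces from the config (AudioProcessor.init_from_config, TTSTokenizer.init_from_config), passes them straight into the model constructor, and drops the old training_assets={"audio_processor": ap} argument to Trainer, while the vctk formatter now reads wav48_silence_trimmed/*.flac instead of wav22. The sketch below condenses that flow for the glow_tts recipe. It is a minimal sketch, not the full recipe: the Trainer and GlowTTSConfig import paths and the BaseDatasetConfig(name="vctk", ...) arguments do not appear in the hunks above and are assumptions about the 🐸TTS version this patch targets, and the dataset path is a placeholder for a local VCTK-Corpus-0.92 extraction.

import os

# Assumed imports: these two lines are not shown in the hunks above and may
# differ between 🐸TTS versions.
from TTS.trainer import Trainer, TrainingArgs
from TTS.tts.configs.glow_tts_config import GlowTTSConfig

from TTS.tts.configs.shared_configs import BaseDatasetConfig
from TTS.tts.datasets import load_tts_samples
from TTS.tts.models.glow_tts import GlowTTS
from TTS.tts.utils.speakers import SpeakerManager
from TTS.tts.utils.text.tokenizer import TTSTokenizer
from TTS.utils.audio import AudioProcessor

output_path = os.path.dirname(os.path.abspath(__file__))

# Assumed dataset definition: point `path` at your VCTK-Corpus-0.92 extraction.
# With the formatters.py change above, audio is read from
# <path>/wav48_silence_trimmed/<speaker>/*.flac (mic2 recordings by default).
dataset_config = BaseDatasetConfig(
    name="vctk", meta_file_train="", path=os.path.join(output_path, "../VCTK/")
)

config = GlowTTSConfig(
    run_name="glow_tts_vctk",
    epochs=1000,
    precompute_num_workers=4,
    min_text_len=0,
    max_text_len=500,
    min_audio_len=0,
    max_audio_len=500000,
    use_speaker_embedding=True,
    output_path=output_path,
    datasets=[dataset_config],
)

# Audio processor (feature extraction, audio I/O) and tokenizer (text to token IDs)
# are now both initialized from the config instead of being built by hand.
ap = AudioProcessor.init_from_config(config)
tokenizer, config = TTSTokenizer.init_from_config(config)

# Each sample is a list of [text, audio_file_path, speaker_name].
train_samples, eval_samples = load_tts_samples(dataset_config, eval_split=True)

# Multi-speaker bookkeeping is unchanged.
speaker_manager = SpeakerManager()
speaker_manager.set_speaker_ids_from_data(train_samples + eval_samples)
config.num_speakers = speaker_manager.num_speakers

# The model now receives the audio processor and tokenizer directly...
model = GlowTTS(config, ap, tokenizer, speaker_manager=speaker_manager)

# ...so Trainer no longer needs training_assets={"audio_processor": ap}.
trainer = Trainer(
    TrainingArgs(), config, output_path, model=model, train_samples=train_samples, eval_samples=eval_samples
)
trainer.fit()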