diff --git a/tests/inference_tests/test_synthesizer.py b/tests/inference_tests/test_synthesizer.py
index 4379c8ca..5972dc90 100644
--- a/tests/inference_tests/test_synthesizer.py
+++ b/tests/inference_tests/test_synthesizer.py
@@ -3,8 +3,7 @@ import unittest
 
 from TTS.config import load_config
 from TTS.tts.models import setup_model
-from TTS.tts.utils.io import save_checkpoint
-from TTS.tts.utils.text.symbols import make_symbols, phonemes, symbols
+from TTS.utils.io import save_checkpoint
 from TTS.utils.synthesizer import Synthesizer
 
 from .. import get_tests_output_path
@@ -14,15 +13,10 @@ class SynthesizerTest(unittest.TestCase):
     # pylint: disable=R0201
     def _create_random_model(self):
         # pylint: disable=global-statement
-        global symbols, phonemes
         config = load_config(os.path.join(get_tests_output_path(), "dummy_model_config.json"))
-        if config.has("characters") and config.characters:
-            symbols, phonemes = make_symbols(**config.characters.to_dict())
-
-        num_chars = len(phonemes) if config.use_phonemes else len(symbols)
-        model = setup_model(num_chars, 0, config)
+        model = setup_model(config)
         output_path = os.path.join(get_tests_output_path())
-        save_checkpoint(model, None, 10, 10, 1, output_path, None)
+        save_checkpoint(config, model, None, None, 10, 1, output_path)
 
     def test_in_out(self):
         self._create_random_model()
diff --git a/tests/test_extract_tts_spectrograms.py b/tests/test_extract_tts_spectrograms.py
index d16167ed..8c795d58 100644
--- a/tests/test_extract_tts_spectrograms.py
+++ b/tests/test_extract_tts_spectrograms.py
@@ -6,7 +6,6 @@ import torch
 
 from tests import get_tests_input_path, get_tests_output_path, run_cli
 from TTS.config import load_config
 from TTS.tts.models import setup_model
-from TTS.tts.utils.text.symbols import phonemes, symbols
 
 torch.manual_seed(1)
@@ -21,8 +20,7 @@ class TestExtractTTSSpectrograms(unittest.TestCase):
         # load config
         c = load_config(config_path)
         # create model
-        num_chars = len(phonemes if c.use_phonemes else symbols)
-        model = setup_model(num_chars, 1, c, d_vector_dim=None)
+        model = setup_model(c)
         # save model
         torch.save({"model": model.state_dict()}, checkpoint_path)
         # run test
@@ -40,8 +38,7 @@ class TestExtractTTSSpectrograms(unittest.TestCase):
         # load config
         c = load_config(config_path)
         # create model
-        num_chars = len(phonemes if c.use_phonemes else symbols)
-        model = setup_model(num_chars, 1, c, d_vector_dim=None)
+        model = setup_model(c)
         # save model
         torch.save({"model": model.state_dict()}, checkpoint_path)
         # run test
@@ -59,8 +56,7 @@ class TestExtractTTSSpectrograms(unittest.TestCase):
         # load config
         c = load_config(config_path)
         # create model
-        num_chars = len(phonemes if c.use_phonemes else symbols)
-        model = setup_model(num_chars, 1, c, d_vector_dim=None)
+        model = setup_model(c)
         # save model
         torch.save({"model": model.state_dict()}, checkpoint_path)
         # run test
diff --git a/tests/tts_tests/test_align_tts_train.py b/tests/tts_tests/test_align_tts_train.py
index 61d67c5c..3700b1d3 100644
--- a/tests/tts_tests/test_align_tts_train.py
+++ b/tests/tts_tests/test_align_tts_train.py
@@ -13,7 +13,7 @@ config = AlignTTSConfig(
     batch_size=8,
     eval_batch_size=8,
     num_loader_workers=0,
-    num_val_loader_workers=0,
+    num_eval_loader_workers=0,
     text_cleaner="english_cleaners",
     use_phonemes=False,
     phoneme_language="en-us",
diff --git a/tests/tts_tests/test_glow_tts.py b/tests/tts_tests/test_glow_tts.py
index 8a2a8fb3..171f2cdc 100644
--- a/tests/tts_tests/test_glow_tts.py
+++ b/tests/tts_tests/test_glow_tts.py
@@ -41,64 +41,11 @@ class GlowTTSTrainTest(unittest.TestCase):
         criterion = GlowTTSLoss()
 
         # model to train
-        model = GlowTTS(
-            num_chars=32,
-            hidden_channels_enc=48,
-            hidden_channels_dec=48,
-            hidden_channels_dp=32,
-            out_channels=80,
-            encoder_type="rel_pos_transformer",
-            encoder_params={
-                "kernel_size": 3,
-                "dropout_p": 0.1,
-                "num_layers": 6,
-                "num_heads": 2,
-                "hidden_channels_ffn": 16,  # 4 times the hidden_channels
-                "input_length": None,
-            },
-            use_encoder_prenet=True,
-            num_flow_blocks_dec=12,
-            kernel_size_dec=5,
-            dilation_rate=1,
-            num_block_layers=4,
-            dropout_p_dec=0.0,
-            num_speakers=0,
-            c_in_channels=0,
-            num_splits=4,
-            num_squeeze=1,
-            sigmoid_scale=False,
-            mean_only=False,
-        ).to(device)
+        config = GlowTTSConfig(num_chars=32)
+        model = GlowTTS(config).to(device)
 
         # reference model to compare model weights
-        model_ref = GlowTTS(
-            num_chars=32,
-            hidden_channels_enc=48,
-            hidden_channels_dec=48,
-            hidden_channels_dp=32,
-            out_channels=80,
-            encoder_type="rel_pos_transformer",
-            encoder_params={
-                "kernel_size": 3,
-                "dropout_p": 0.1,
-                "num_layers": 6,
-                "num_heads": 2,
-                "hidden_channels_ffn": 16,  # 4 times the hidden_channels
-                "input_length": None,
-            },
-            use_encoder_prenet=True,
-            num_flow_blocks_dec=12,
-            kernel_size_dec=5,
-            dilation_rate=1,
-            num_block_layers=4,
-            dropout_p_dec=0.0,
-            num_speakers=0,
-            c_in_channels=0,
-            num_splits=4,
-            num_squeeze=1,
-            sigmoid_scale=False,
-            mean_only=False,
-        ).to(device)
+        model_ref = GlowTTS(config).to(device)
 
         model.train()
         print(" > Num parameters for GlowTTS model:%s" % (count_parameters(model)))
@@ -149,34 +96,8 @@ class GlowTTSInferenceTest(unittest.TestCase):
         speaker_ids = torch.randint(0, 5, (8,)).long().to(device)
 
         # create model
-        model = GlowTTS(
-            num_chars=32,
-            hidden_channels_enc=48,
-            hidden_channels_dec=48,
-            hidden_channels_dp=32,
-            out_channels=80,
-            encoder_type="rel_pos_transformer",
-            encoder_params={
-                "kernel_size": 3,
-                "dropout_p": 0.1,
-                "num_layers": 6,
-                "num_heads": 2,
-                "hidden_channels_ffn": 16,  # 4 times the hidden_channels
-                "input_length": None,
-            },
-            use_encoder_prenet=True,
-            num_flow_blocks_dec=12,
-            kernel_size_dec=5,
-            dilation_rate=1,
-            num_block_layers=4,
-            dropout_p_dec=0.0,
-            num_speakers=0,
-            c_in_channels=0,
-            num_splits=4,
-            num_squeeze=1,
-            sigmoid_scale=False,
-            mean_only=False,
-        ).to(device)
+        config = GlowTTSConfig(num_chars=32)
+        model = GlowTTS(config).to(device)
 
         model.eval()
         print(" > Num parameters for GlowTTS model:%s" % (count_parameters(model)))
diff --git a/tests/tts_tests/test_glow_tts_train.py b/tests/tts_tests/test_glow_tts_train.py
index 30aaefc4..24c5c4cf 100644
--- a/tests/tts_tests/test_glow_tts_train.py
+++ b/tests/tts_tests/test_glow_tts_train.py
@@ -13,7 +13,7 @@ config = GlowTTSConfig(
     batch_size=8,
     eval_batch_size=8,
     num_loader_workers=0,
-    num_val_loader_workers=0,
+    num_eval_loader_workers=0,
     text_cleaner="english_cleaners",
     use_phonemes=True,
     use_espeak_phonemes=True,
diff --git a/tests/tts_tests/test_speedy_speech_layers.py b/tests/tts_tests/test_speedy_speech_layers.py
index d2f62d49..a5c481f1 100644
--- a/tests/tts_tests/test_speedy_speech_layers.py
+++ b/tests/tts_tests/test_speedy_speech_layers.py
@@ -1,7 +1,8 @@
 import torch
 
+from TTS.tts.configs import SpeedySpeechConfig
 from TTS.tts.layers.feed_forward.duration_predictor import DurationPredictor
-from TTS.tts.models.speedy_speech import SpeedySpeech
+from TTS.tts.models.speedy_speech import SpeedySpeech, SpeedySpeechArgs
 from TTS.tts.utils.data import sequence_mask
 
 use_cuda = torch.cuda.is_available()
@@ -40,7 +41,8 @@ def test_speedy_speech():
     y_lengths = durations.sum(1)
 
-    model = SpeedySpeech(num_chars, out_channels=80, hidden_channels=128)
+    config = SpeedySpeechConfig(model_args=SpeedySpeechArgs(num_chars=num_chars, out_channels=80, hidden_channels=128))
+    model = SpeedySpeech(config)
 
     if use_cuda:
         model.cuda()
 
@@ -55,7 +57,12 @@ def test_speedy_speech():
     assert list(o_dr.shape) == [B, T_en]
 
     # with speaker embedding
-    model = SpeedySpeech(num_chars, out_channels=80, hidden_channels=128, num_speakers=10, c_in_channels=256).to(device)
+    config = SpeedySpeechConfig(
+        model_args=SpeedySpeechArgs(
+            num_chars=num_chars, out_channels=80, hidden_channels=128, num_speakers=80, d_vector_dim=256
+        )
+    )
+    model = SpeedySpeech(config).to(device)
     model.forward(
         x_dummy, x_lengths, y_lengths, durations, aux_input={"d_vectors": torch.randint(0, 10, (B,)).to(device)}
     )
@@ -68,9 +75,17 @@ def test_speedy_speech():
     assert list(o_dr.shape) == [B, T_en]
 
     # with speaker external embedding
-    model = SpeedySpeech(
-        num_chars, out_channels=80, hidden_channels=128, num_speakers=10, external_c=True, c_in_channels=256
-    ).to(device)
+    config = SpeedySpeechConfig(
+        model_args=SpeedySpeechArgs(
+            num_chars=num_chars,
+            out_channels=80,
+            hidden_channels=128,
+            num_speakers=10,
+            use_d_vector=True,
+            d_vector_dim=256,
+        )
+    )
+    model = SpeedySpeech(config).to(device)
     model.forward(x_dummy, x_lengths, y_lengths, durations, aux_input={"d_vectors": torch.rand((B, 256)).to(device)})
     o_de = outputs["model_outputs"]
     attn = outputs["alignments"]
diff --git a/tests/tts_tests/test_speedy_speech_train.py b/tests/tts_tests/test_speedy_speech_train.py
index d677f46f..28dc7029 100644
--- a/tests/tts_tests/test_speedy_speech_train.py
+++ b/tests/tts_tests/test_speedy_speech_train.py
@@ -4,16 +4,18 @@ import shutil
 
 from tests import get_device_id, get_tests_output_path, run_cli
 from TTS.tts.configs import SpeedySpeechConfig
+from TTS.tts.models.speedy_speech import SpeedySpeechArgs
 
 config_path = os.path.join(get_tests_output_path(), "test_speedy_speech_config.json")
 output_path = os.path.join(get_tests_output_path(), "train_outputs")
 
 config = SpeedySpeechConfig(
+    model_args=SpeedySpeechArgs(num_chars=50, out_channels=80, hidden_channels=128, num_speakers=0),
     batch_size=8,
     eval_batch_size=8,
     num_loader_workers=0,
-    num_val_loader_workers=0,
+    num_eval_loader_workers=0,
     text_cleaner="english_cleaners",
     use_phonemes=True,
     phoneme_language="en-us",
diff --git a/tests/tts_tests/test_tacotron2_d-vectors_train.py b/tests/tts_tests/test_tacotron2_d-vectors_train.py
index 7fda7e09..3313b8c4 100644
--- a/tests/tts_tests/test_tacotron2_d-vectors_train.py
+++ b/tests/tts_tests/test_tacotron2_d-vectors_train.py
@@ -13,7 +13,7 @@ config = Tacotron2Config(
     batch_size=8,
     eval_batch_size=8,
     num_loader_workers=0,
-    num_val_loader_workers=0,
+    num_eval_loader_workers=0,
     text_cleaner="english_cleaners",
     use_phonemes=False,
     phoneme_language="en-us",
@@ -24,11 +24,11 @@ config = Tacotron2Config(
     print_step=1,
     print_eval=True,
     use_speaker_embedding=True,
-    use_external_speaker_embedding_file=True,
+    use_d_vector_file=True,
     test_sentences=[
         "Be a voice, not an echo.",
     ],
-    external_speaker_embedding_file="tests/data/ljspeech/speakers.json",
+    d_vector_file="tests/data/ljspeech/speakers.json",
     max_decoder_steps=50,
 )
diff --git a/tests/tts_tests/test_tacotron2_model.py b/tests/tts_tests/test_tacotron2_model.py
index fc3d9799..a8132467 100644
--- a/tests/tts_tests/test_tacotron2_model.py
+++ b/tests/tts_tests/test_tacotron2_model.py
@@ -7,6 +7,7 @@ from torch import nn, optim
 
 from tests import get_tests_input_path
 from TTS.tts.configs import Tacotron2Config
+from TTS.tts.configs.shared_configs import GSTConfig
 from TTS.tts.layers.losses import MSELossMasked
 from TTS.tts.models.tacotron2 import Tacotron2
 from TTS.utils.audio import AudioProcessor
@@ -17,19 +18,20 @@ torch.manual_seed(1)
 use_cuda = torch.cuda.is_available()
 device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
 
-c = Tacotron2Config()
+config_global = Tacotron2Config(num_chars=32, num_speakers=5, out_channels=80, decoder_output_dim=80)
 
-ap = AudioProcessor(**c.audio)
+ap = AudioProcessor(**config_global.audio)
 WAV_FILE = os.path.join(get_tests_input_path(), "example_1.wav")
 
 
 class TacotronTrainTest(unittest.TestCase):
     def test_train_step(self):  # pylint: disable=no-self-use
+        config = config_global.copy()
         input_dummy = torch.randint(0, 24, (8, 128)).long().to(device)
         input_lengths = torch.randint(100, 128, (8,)).long().to(device)
         input_lengths = torch.sort(input_lengths, descending=True)[0]
-        mel_spec = torch.rand(8, 30, c.audio["num_mels"]).to(device)
-        mel_postnet_spec = torch.rand(8, 30, c.audio["num_mels"]).to(device)
+        mel_spec = torch.rand(8, 30, config.audio["num_mels"]).to(device)
+        mel_postnet_spec = torch.rand(8, 30, config.audio["num_mels"]).to(device)
         mel_lengths = torch.randint(20, 30, (8,)).long().to(device)
         mel_lengths[0] = 30
         stop_targets = torch.zeros(8, 30, 1).float().to(device)
@@ -38,19 +40,19 @@ class TacotronTrainTest(unittest.TestCase):
         for idx in mel_lengths:
             stop_targets[:, int(idx.item()) :, 0] = 1.0
 
-        stop_targets = stop_targets.view(input_dummy.shape[0], stop_targets.size(1) // c.r, -1)
+        stop_targets = stop_targets.view(input_dummy.shape[0], stop_targets.size(1) // config.r, -1)
         stop_targets = (stop_targets.sum(2) > 0.0).unsqueeze(2).float().squeeze()
 
         criterion = MSELossMasked(seq_len_norm=False).to(device)
         criterion_st = nn.BCEWithLogitsLoss().to(device)
-        model = Tacotron2(num_chars=24, r=c.r, num_speakers=5).to(device)
+        model = Tacotron2(config).to(device)
         model.train()
         model_ref = copy.deepcopy(model)
         count = 0
         for param, param_ref in zip(model.parameters(), model_ref.parameters()):
             assert (param - param_ref).sum() == 0, param
             count += 1
-        optimizer = optim.Adam(model.parameters(), lr=c.lr)
+        optimizer = optim.Adam(model.parameters(), lr=config.lr)
         for i in range(5):
             outputs = model.forward(
                 input_dummy, input_lengths, mel_spec, mel_lengths, aux_input={"speaker_ids": speaker_ids}
             )
@@ -77,11 +79,12 @@ class TacotronTrainTest(unittest.TestCase):
 class MultiSpeakeTacotronTrainTest(unittest.TestCase):
     @staticmethod
     def test_train_step():
+        config = config_global.copy()
         input_dummy = torch.randint(0, 24, (8, 128)).long().to(device)
         input_lengths = torch.randint(100, 128, (8,)).long().to(device)
         input_lengths = torch.sort(input_lengths, descending=True)[0]
-        mel_spec = torch.rand(8, 30, c.audio["num_mels"]).to(device)
-        mel_postnet_spec = torch.rand(8, 30, c.audio["num_mels"]).to(device)
+        mel_spec = torch.rand(8, 30, config.audio["num_mels"]).to(device)
+        mel_postnet_spec = torch.rand(8, 30, config.audio["num_mels"]).to(device)
         mel_lengths = torch.randint(20, 30, (8,)).long().to(device)
         mel_lengths[0] = 30
         stop_targets = torch.zeros(8, 30, 1).float().to(device)
@@ -90,19 +93,20 @@ class MultiSpeakeTacotronTrainTest(unittest.TestCase):
         for idx in mel_lengths:
             stop_targets[:, int(idx.item()) :, 0] = 1.0
 
-        stop_targets = stop_targets.view(input_dummy.shape[0], stop_targets.size(1) // c.r, -1)
+        stop_targets = stop_targets.view(input_dummy.shape[0], stop_targets.size(1) // config.r, -1)
         stop_targets = (stop_targets.sum(2) > 0.0).unsqueeze(2).float().squeeze()
 
         criterion = MSELossMasked(seq_len_norm=False).to(device)
         criterion_st = nn.BCEWithLogitsLoss().to(device)
-        model = Tacotron2(num_chars=24, r=c.r, num_speakers=5, d_vector_dim=55).to(device)
+        config.d_vector_dim = 55
+        model = Tacotron2(config).to(device)
         model.train()
         model_ref = copy.deepcopy(model)
         count = 0
         for param, param_ref in zip(model.parameters(), model_ref.parameters()):
             assert (param - param_ref).sum() == 0, param
             count += 1
-        optimizer = optim.Adam(model.parameters(), lr=c.lr)
+        optimizer = optim.Adam(model.parameters(), lr=config.lr)
         for i in range(5):
             outputs = model.forward(
                 input_dummy, input_lengths, mel_spec, mel_lengths, aux_input={"d_vectors": speaker_ids}
             )
@@ -130,11 +134,12 @@ class TacotronGSTTrainTest(unittest.TestCase):
     # pylint: disable=no-self-use
     def test_train_step(self):
         # with random gst mel style
+        config = config_global.copy()
         input_dummy = torch.randint(0, 24, (8, 128)).long().to(device)
         input_lengths = torch.randint(100, 128, (8,)).long().to(device)
         input_lengths = torch.sort(input_lengths, descending=True)[0]
-        mel_spec = torch.rand(8, 30, c.audio["num_mels"]).to(device)
-        mel_postnet_spec = torch.rand(8, 30, c.audio["num_mels"]).to(device)
+        mel_spec = torch.rand(8, 30, config.audio["num_mels"]).to(device)
+        mel_postnet_spec = torch.rand(8, 30, config.audio["num_mels"]).to(device)
         mel_lengths = torch.randint(20, 30, (8,)).long().to(device)
         mel_lengths[0] = 30
         stop_targets = torch.zeros(8, 30, 1).float().to(device)
@@ -143,19 +148,21 @@ class TacotronGSTTrainTest(unittest.TestCase):
         for idx in mel_lengths:
             stop_targets[:, int(idx.item()) :, 0] = 1.0
 
-        stop_targets = stop_targets.view(input_dummy.shape[0], stop_targets.size(1) // c.r, -1)
+        stop_targets = stop_targets.view(input_dummy.shape[0], stop_targets.size(1) // config.r, -1)
         stop_targets = (stop_targets.sum(2) > 0.0).unsqueeze(2).float().squeeze()
 
         criterion = MSELossMasked(seq_len_norm=False).to(device)
         criterion_st = nn.BCEWithLogitsLoss().to(device)
-        model = Tacotron2(num_chars=24, r=c.r, num_speakers=5, use_gst=True, gst=c.gst).to(device)
+        config.use_gst = True
+        config.gst = GSTConfig()
+        model = Tacotron2(config).to(device)
         model.train()
         model_ref = copy.deepcopy(model)
         count = 0
         for param, param_ref in zip(model.parameters(), model_ref.parameters()):
             assert (param - param_ref).sum() == 0, param
             count += 1
-        optimizer = optim.Adam(model.parameters(), lr=c.lr)
+        optimizer = optim.Adam(model.parameters(), lr=config.lr)
         for i in range(10):
             outputs = model.forward(
                 input_dummy, input_lengths, mel_spec, mel_lengths, aux_input={"speaker_ids": speaker_ids}
             )
@@ -190,7 +197,7 @@ class TacotronGSTTrainTest(unittest.TestCase):
         input_dummy = torch.randint(0, 24, (8, 128)).long().to(device)
         input_lengths = torch.randint(100, 128, (8,)).long().to(device)
         input_lengths = torch.sort(input_lengths, descending=True)[0]
-        mel_postnet_spec = torch.rand(8, 30, c.audio["num_mels"]).to(device)
+        mel_postnet_spec = torch.rand(8, 30, config.audio["num_mels"]).to(device)
         mel_lengths = torch.randint(20, 30, (8,)).long().to(device)
         mel_lengths[0] = 30
         stop_targets = torch.zeros(8, 30, 1).float().to(device)
@@ -199,19 +206,19 @@ class TacotronGSTTrainTest(unittest.TestCase):
         for idx in mel_lengths:
             stop_targets[:, int(idx.item()) :, 0] = 1.0
 
-        stop_targets = stop_targets.view(input_dummy.shape[0], stop_targets.size(1) // c.r, -1)
+        stop_targets = stop_targets.view(input_dummy.shape[0], stop_targets.size(1) // config.r, -1)
         stop_targets = (stop_targets.sum(2) > 0.0).unsqueeze(2).float().squeeze()
 
         criterion = MSELossMasked(seq_len_norm=False).to(device)
         criterion_st = nn.BCEWithLogitsLoss().to(device)
-        model = Tacotron2(num_chars=24, r=c.r, num_speakers=5, use_gst=True, gst=c.gst).to(device)
+        model = Tacotron2(config).to(device)
         model.train()
         model_ref = copy.deepcopy(model)
         count = 0
         for param, param_ref in zip(model.parameters(), model_ref.parameters()):
             assert (param - param_ref).sum() == 0, param
             count += 1
-        optimizer = optim.Adam(model.parameters(), lr=c.lr)
+        optimizer = optim.Adam(model.parameters(), lr=config.lr)
         for i in range(10):
             outputs = model.forward(
                 input_dummy, input_lengths, mel_spec, mel_lengths, aux_input={"speaker_ids": speaker_ids}
             )
@@ -242,11 +249,12 @@ class TacotronGSTTrainTest(unittest.TestCase):
 class SCGSTMultiSpeakeTacotronTrainTest(unittest.TestCase):
     @staticmethod
     def test_train_step():
+        config = config_global.copy()
         input_dummy = torch.randint(0, 24, (8, 128)).long().to(device)
         input_lengths = torch.randint(100, 128, (8,)).long().to(device)
         input_lengths = torch.sort(input_lengths, descending=True)[0]
-        mel_spec = torch.rand(8, 30, c.audio["num_mels"]).to(device)
-        mel_postnet_spec = torch.rand(8, 30, c.audio["num_mels"]).to(device)
+        mel_spec = torch.rand(8, 30, config.audio["num_mels"]).to(device)
+        mel_postnet_spec = torch.rand(8, 30, config.audio["num_mels"]).to(device)
         mel_lengths = torch.randint(20, 30, (8,)).long().to(device)
         mel_lengths[0] = 30
         stop_targets = torch.zeros(8, 30, 1).float().to(device)
@@ -255,18 +263,19 @@ class SCGSTMultiSpeakeTacotronTrainTest(unittest.TestCase):
         for idx in mel_lengths:
             stop_targets[:, int(idx.item()) :, 0] = 1.0
 
-        stop_targets = stop_targets.view(input_dummy.shape[0], stop_targets.size(1) // c.r, -1)
+        stop_targets = stop_targets.view(input_dummy.shape[0], stop_targets.size(1) // config.r, -1)
         stop_targets = (stop_targets.sum(2) > 0.0).unsqueeze(2).float().squeeze()
 
         criterion = MSELossMasked(seq_len_norm=False).to(device)
         criterion_st = nn.BCEWithLogitsLoss().to(device)
-        model = Tacotron2(num_chars=24, r=c.r, num_speakers=5, d_vector_dim=55, use_gst=True, gst=c.gst).to(device)
+        config.d_vector_dim = 55
+        model = Tacotron2(config).to(device)
         model.train()
         model_ref = copy.deepcopy(model)
         count = 0
         for param, param_ref in zip(model.parameters(), model_ref.parameters()):
             assert (param - param_ref).sum() == 0, param
             count += 1
-        optimizer = optim.Adam(model.parameters(), lr=c.lr)
+        optimizer = optim.Adam(model.parameters(), lr=config.lr)
         for i in range(5):
             outputs = model.forward(
                 input_dummy, input_lengths, mel_spec, mel_lengths, aux_input={"d_vectors": speaker_embeddings}
diff --git a/tests/tts_tests/test_tacotron2_speaker_emb_train.py b/tests/tts_tests/test_tacotron2_speaker_emb_train.py
index a242c724..41d694f6 100644
--- a/tests/tts_tests/test_tacotron2_speaker_emb_train.py
+++ b/tests/tts_tests/test_tacotron2_speaker_emb_train.py
@@ -13,7 +13,7 @@ config = Tacotron2Config(
     batch_size=8,
     eval_batch_size=8,
     num_loader_workers=0,
-    num_val_loader_workers=0,
+    num_eval_loader_workers=0,
     text_cleaner="english_cleaners",
     use_phonemes=False,
     phoneme_language="en-us",
diff --git a/tests/tts_tests/test_tacotron2_tf_model.py b/tests/tts_tests/test_tacotron2_tf_model.py
index ee7f720b..431b0c2f 100644
--- a/tests/tts_tests/test_tacotron2_tf_model.py
+++ b/tests/tts_tests/test_tacotron2_tf_model.py
@@ -110,7 +110,7 @@ class TacotronTFTrainTest(unittest.TestCase):
             num_chars=24,
             num_speakers=0,
             r=3,
-            postnet_output_dim=80,
+            out_channels=80,
             decoder_output_dim=80,
             attn_type="original",
             attn_win=False,
diff --git a/tests/tts_tests/test_tacotron2_train.py b/tests/tts_tests/test_tacotron2_train.py
index 577de014..e947a54a 100644
--- a/tests/tts_tests/test_tacotron2_train.py
+++ b/tests/tts_tests/test_tacotron2_train.py
@@ -13,7 +13,7 @@ config = Tacotron2Config(
     batch_size=8,
     eval_batch_size=8,
     num_loader_workers=0,
-    num_val_loader_workers=0,
+    num_eval_loader_workers=0,
     text_cleaner="english_cleaners",
     use_phonemes=False,
     phoneme_language="en-us",
diff --git a/tests/tts_tests/test_tacotron_model.py b/tests/tts_tests/test_tacotron_model.py
index 2abd968d..6c673568 100644
--- a/tests/tts_tests/test_tacotron_model.py
+++ b/tests/tts_tests/test_tacotron_model.py
@@ -6,7 +6,7 @@ import torch
 from torch import nn, optim
 
 from tests import get_tests_input_path
-from TTS.tts.configs import TacotronConfig
+from TTS.tts.configs import GSTConfig, TacotronConfig
 from TTS.tts.layers.losses import L1LossMasked
 from TTS.tts.models.tacotron import Tacotron
 from TTS.utils.audio import AudioProcessor
@@ -17,9 +17,9 @@ torch.manual_seed(1)
 use_cuda = torch.cuda.is_available()
 device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
 
-c = TacotronConfig()
+config_global = TacotronConfig(num_chars=32, num_speakers=5, out_channels=513, decoder_output_dim=80)
 
-ap = AudioProcessor(**c.audio)
+ap = AudioProcessor(**config_global.audio)
 WAV_FILE = os.path.join(get_tests_input_path(), "example_1.wav")
 
@@ -31,11 +31,12 @@ def count_parameters(model):
 class TacotronTrainTest(unittest.TestCase):
     @staticmethod
     def test_train_step():
+        config = config_global.copy()
         input_dummy = torch.randint(0, 24, (8, 128)).long().to(device)
         input_lengths = torch.randint(100, 129, (8,)).long().to(device)
         input_lengths[-1] = 128
-        mel_spec = torch.rand(8, 30, c.audio["num_mels"]).to(device)
-        linear_spec = torch.rand(8, 30, c.audio["fft_size"]).to(device)
+        mel_spec = torch.rand(8, 30, config.audio["num_mels"]).to(device)
+        linear_spec = torch.rand(8, 30, config.audio["fft_size"] // 2 + 1).to(device)
         mel_lengths = torch.randint(20, 30, (8,)).long().to(device)
         mel_lengths[-1] = mel_spec.size(1)
         stop_targets = torch.zeros(8, 30, 1).float().to(device)
@@ -44,21 +45,12 @@ class TacotronTrainTest(unittest.TestCase):
         for idx in mel_lengths:
             stop_targets[:, int(idx.item()) :, 0] = 1.0
 
-        stop_targets = stop_targets.view(input_dummy.shape[0], stop_targets.size(1) // c.r, -1)
+        stop_targets = stop_targets.view(input_dummy.shape[0], stop_targets.size(1) // config.r, -1)
         stop_targets = (stop_targets.sum(2) > 0.0).unsqueeze(2).float().squeeze()
 
         criterion = L1LossMasked(seq_len_norm=False).to(device)
         criterion_st = nn.BCEWithLogitsLoss().to(device)
-        model = Tacotron(
-            num_chars=32,
-            num_speakers=5,
-            postnet_output_dim=c.audio["fft_size"],
-            decoder_output_dim=c.audio["num_mels"],
-            r=c.r,
-            memory_size=c.memory_size,
-        ).to(
-            device
-        )  # FIXME: missing num_speakers parameter to Tacotron ctor
+        model = Tacotron(config).to(device)  # FIXME: missing num_speakers parameter to Tacotron ctor
         model.train()
         print(" > Num parameters for Tacotron model:%s" % (count_parameters(model)))
         model_ref = copy.deepcopy(model)
@@ -66,7 +58,7 @@ class TacotronTrainTest(unittest.TestCase):
         for param, param_ref in zip(model.parameters(), model_ref.parameters()):
             assert (param - param_ref).sum() == 0, param
             count += 1
-        optimizer = optim.Adam(model.parameters(), lr=c.lr)
+        optimizer = optim.Adam(model.parameters(), lr=config.lr)
         for _ in range(5):
             outputs = model.forward(
                 input_dummy, input_lengths, mel_spec, mel_lengths, aux_input={"speaker_ids": speaker_ids}
             )
@@ -91,11 +83,12 @@ class MultiSpeakeTacotronTrainTest(unittest.TestCase):
     @staticmethod
     def test_train_step():
+        config = config_global.copy()
         input_dummy = torch.randint(0, 24, (8, 128)).long().to(device)
         input_lengths = torch.randint(100, 129, (8,)).long().to(device)
         input_lengths[-1] = 128
-        mel_spec = torch.rand(8, 30, c.audio["num_mels"]).to(device)
-        linear_spec = torch.rand(8, 30, c.audio["fft_size"]).to(device)
+        mel_spec = torch.rand(8, 30, config.audio["num_mels"]).to(device)
+        linear_spec = torch.rand(8, 30, config.audio["fft_size"] // 2 + 1).to(device)
         mel_lengths = torch.randint(20, 30, (8,)).long().to(device)
         mel_lengths[-1] = mel_spec.size(1)
         stop_targets = torch.zeros(8, 30, 1).float().to(device)
@@ -104,22 +97,13 @@ class MultiSpeakeTacotronTrainTest(unittest.TestCase):
         for idx in mel_lengths:
             stop_targets[:, int(idx.item()) :, 0] = 1.0
 
-        stop_targets = stop_targets.view(input_dummy.shape[0], stop_targets.size(1) // c.r, -1)
+        stop_targets = stop_targets.view(input_dummy.shape[0], stop_targets.size(1) // config.r, -1)
         stop_targets = (stop_targets.sum(2) > 0.0).unsqueeze(2).float().squeeze()
 
         criterion = L1LossMasked(seq_len_norm=False).to(device)
         criterion_st = nn.BCEWithLogitsLoss().to(device)
-        model = Tacotron(
-            num_chars=32,
-            num_speakers=5,
-            postnet_output_dim=c.audio["fft_size"],
-            decoder_output_dim=c.audio["num_mels"],
-            r=c.r,
-            memory_size=c.memory_size,
-            d_vector_dim=55,
-        ).to(
-            device
-        )  # FIXME: missing num_speakers parameter to Tacotron ctor
+        config.d_vector_dim = 55
+        model = Tacotron(config).to(device)  # FIXME: missing num_speakers parameter to Tacotron ctor
         model.train()
         print(" > Num parameters for Tacotron model:%s" % (count_parameters(model)))
         model_ref = copy.deepcopy(model)
@@ -127,7 +111,7 @@ class MultiSpeakeTacotronTrainTest(unittest.TestCase):
         for param, param_ref in zip(model.parameters(), model_ref.parameters()):
             assert (param - param_ref).sum() == 0, param
             count += 1
-        optimizer = optim.Adam(model.parameters(), lr=c.lr)
+        optimizer = optim.Adam(model.parameters(), lr=config.lr)
         for _ in range(5):
             outputs = model.forward(
                 input_dummy, input_lengths, mel_spec, mel_lengths, aux_input={"d_vectors": speaker_embeddings}
             )
@@ -152,12 +136,13 @@ class TacotronGSTTrainTest(unittest.TestCase):
     @staticmethod
     def test_train_step():
+        config = config_global.copy()
         # with random gst mel style
         input_dummy = torch.randint(0, 24, (8, 128)).long().to(device)
         input_lengths = torch.randint(100, 129, (8,)).long().to(device)
         input_lengths[-1] = 128
-        mel_spec = torch.rand(8, 120, c.audio["num_mels"]).to(device)
-        linear_spec = torch.rand(8, 120, c.audio["fft_size"]).to(device)
+        mel_spec = torch.rand(8, 120, config.audio["num_mels"]).to(device)
+        linear_spec = torch.rand(8, 120, config.audio["fft_size"] // 2 + 1).to(device)
         mel_lengths = torch.randint(20, 120, (8,)).long().to(device)
         mel_lengths[-1] = 120
         stop_targets = torch.zeros(8, 120, 1).float().to(device)
@@ -166,23 +151,14 @@ class TacotronGSTTrainTest(unittest.TestCase):
         for idx in mel_lengths:
             stop_targets[:, int(idx.item()) :, 0] = 1.0
 
-        stop_targets = stop_targets.view(input_dummy.shape[0], stop_targets.size(1) // c.r, -1)
+        stop_targets = stop_targets.view(input_dummy.shape[0], stop_targets.size(1) // config.r, -1)
         stop_targets = (stop_targets.sum(2) > 0.0).unsqueeze(2).float().squeeze()
 
         criterion = L1LossMasked(seq_len_norm=False).to(device)
         criterion_st = nn.BCEWithLogitsLoss().to(device)
-        model = Tacotron(
-            num_chars=32,
-            num_speakers=5,
-            use_gst=True,
-            gst=c.gst,
-            postnet_output_dim=c.audio["fft_size"],
-            decoder_output_dim=c.audio["num_mels"],
-            r=c.r,
-            memory_size=c.memory_size,
-        ).to(
-            device
-        )  # FIXME: missing num_speakers parameter to Tacotron ctor
+        config.use_gst = True
+        config.gst = GSTConfig()
+        model = Tacotron(config).to(device)  # FIXME: missing num_speakers parameter to Tacotron ctor
         model.train()
         # print(model)
         print(" > Num parameters for Tacotron GST model:%s" % (count_parameters(model)))
@@ -191,7 +167,7 @@ class TacotronGSTTrainTest(unittest.TestCase):
         for param, param_ref in zip(model.parameters(), model_ref.parameters()):
             assert (param - param_ref).sum() == 0, param
             count += 1
-        optimizer = optim.Adam(model.parameters(), lr=c.lr)
+        optimizer = optim.Adam(model.parameters(), lr=config.lr)
         for _ in range(10):
             outputs = model.forward(
                 input_dummy, input_lengths, mel_spec, mel_lengths, aux_input={"speaker_ids": speaker_ids}
             )
@@ -220,7 +196,7 @@ class TacotronGSTTrainTest(unittest.TestCase):
         input_dummy = torch.randint(0, 24, (8, 128)).long().to(device)
         input_lengths = torch.randint(100, 129, (8,)).long().to(device)
         input_lengths[-1] = 128
-        linear_spec = torch.rand(8, mel_spec.size(1), c.audio["fft_size"]).to(device)
+        linear_spec = torch.rand(8, mel_spec.size(1), config.audio["fft_size"] // 2 + 1).to(device)
         mel_lengths = torch.randint(20, mel_spec.size(1), (8,)).long().to(device)
         mel_lengths[-1] = mel_spec.size(1)
         stop_targets = torch.zeros(8, mel_spec.size(1), 1).float().to(device)
@@ -229,23 +205,12 @@ class TacotronGSTTrainTest(unittest.TestCase):
         for idx in mel_lengths:
             stop_targets[:, int(idx.item()) :, 0] = 1.0
 
-        stop_targets = stop_targets.view(input_dummy.shape[0], stop_targets.size(1) // c.r, -1)
+        stop_targets = stop_targets.view(input_dummy.shape[0], stop_targets.size(1) // config.r, -1)
         stop_targets = (stop_targets.sum(2) > 0.0).unsqueeze(2).float().squeeze()
 
         criterion = L1LossMasked(seq_len_norm=False).to(device)
         criterion_st = nn.BCEWithLogitsLoss().to(device)
-        model = Tacotron(
-            num_chars=32,
-            num_speakers=5,
-            use_gst=True,
-            gst=c.gst,
-            postnet_output_dim=c.audio["fft_size"],
-            decoder_output_dim=c.audio["num_mels"],
-            r=c.r,
-            memory_size=c.memory_size,
-        ).to(
-            device
-        )  # FIXME: missing num_speakers parameter to Tacotron ctor
+        model = Tacotron(config).to(device)  # FIXME: missing num_speakers parameter to Tacotron ctor
         model.train()
         # print(model)
         print(" > Num parameters for Tacotron GST model:%s" % (count_parameters(model)))
@@ -254,7 +219,7 @@ class TacotronGSTTrainTest(unittest.TestCase):
         for param, param_ref in zip(model.parameters(), model_ref.parameters()):
             assert (param - param_ref).sum() == 0, param
             count += 1
-        optimizer = optim.Adam(model.parameters(), lr=c.lr)
+        optimizer = optim.Adam(model.parameters(), lr=config.lr)
         for _ in range(10):
             outputs = model.forward(
                 input_dummy, input_lengths, mel_spec, mel_lengths, aux_input={"speaker_ids": speaker_ids}
             )
@@ -278,11 +243,12 @@ class TacotronGSTTrainTest(unittest.TestCase):
 class SCGSTMultiSpeakeTacotronTrainTest(unittest.TestCase):
     @staticmethod
     def test_train_step():
+        config = config_global.copy()
         input_dummy = torch.randint(0, 24, (8, 128)).long().to(device)
         input_lengths = torch.randint(100, 129, (8,)).long().to(device)
         input_lengths[-1] = 128
-        mel_spec = torch.rand(8, 30, c.audio["num_mels"]).to(device)
-        linear_spec = torch.rand(8, 30, c.audio["fft_size"]).to(device)
+        mel_spec = torch.rand(8, 30, config.audio["num_mels"]).to(device)
+        linear_spec = torch.rand(8, 30, config.audio["fft_size"] // 2 + 1).to(device)
         mel_lengths = torch.randint(20, 30, (8,)).long().to(device)
         mel_lengths[-1] = mel_spec.size(1)
         stop_targets = torch.zeros(8, 30, 1).float().to(device)
@@ -291,24 +257,13 @@ class SCGSTMultiSpeakeTacotronTrainTest(unittest.TestCase):
         for idx in mel_lengths:
             stop_targets[:, int(idx.item()) :, 0] = 1.0
 
-        stop_targets = stop_targets.view(input_dummy.shape[0], stop_targets.size(1) // c.r, -1)
+        stop_targets = stop_targets.view(input_dummy.shape[0], stop_targets.size(1) // config.r, -1)
         stop_targets = (stop_targets.sum(2) > 0.0).unsqueeze(2).float().squeeze()
 
         criterion = L1LossMasked(seq_len_norm=False).to(device)
         criterion_st = nn.BCEWithLogitsLoss().to(device)
-        model = Tacotron(
-            num_chars=32,
-            num_speakers=5,
-            postnet_output_dim=c.audio["fft_size"],
-            decoder_output_dim=c.audio["num_mels"],
-            use_gst=True,
-            gst=c.gst,
-            r=c.r,
-            memory_size=c.memory_size,
-            d_vector_dim=55,
-        ).to(
-            device
-        )  # FIXME: missing num_speakers parameter to Tacotron ctor
+        config.d_vector_dim = 55
+        model = Tacotron(config).to(device)  # FIXME: missing num_speakers parameter to Tacotron ctor
         model.train()
         print(" > Num parameters for Tacotron model:%s" % (count_parameters(model)))
         model_ref = copy.deepcopy(model)
@@ -316,7 +271,7 @@ class SCGSTMultiSpeakeTacotronTrainTest(unittest.TestCase):
         for param, param_ref in zip(model.parameters(), model_ref.parameters()):
             assert (param - param_ref).sum() == 0, param
             count += 1
-        optimizer = optim.Adam(model.parameters(), lr=c.lr)
+        optimizer = optim.Adam(model.parameters(), lr=config.lr)
         for _ in range(5):
             outputs = model.forward(
                 input_dummy, input_lengths, mel_spec, mel_lengths, aux_input={"d_vectors": speaker_embeddings}
diff --git a/tests/tts_tests/test_tacotron_train.py b/tests/tts_tests/test_tacotron_train.py
index 010154e2..0c35ee28 100644
--- a/tests/tts_tests/test_tacotron_train.py
+++ b/tests/tts_tests/test_tacotron_train.py
@@ -13,7 +13,7 @@ config = TacotronConfig(
     batch_size=8,
     eval_batch_size=8,
     num_loader_workers=0,
-    num_val_loader_workers=0,
+    num_eval_loader_workers=0,
     text_cleaner="english_cleaners",
     use_phonemes=False,
     phoneme_language="en-us",
diff --git a/tests/vocoder_tests/test_fullband_melgan_train.py b/tests/vocoder_tests/test_fullband_melgan_train.py
index f93a5318..9d4e1933 100644
--- a/tests/vocoder_tests/test_fullband_melgan_train.py
+++ b/tests/vocoder_tests/test_fullband_melgan_train.py
@@ -12,7 +12,7 @@ config = FullbandMelganConfig(
     batch_size=8,
     eval_batch_size=8,
     num_loader_workers=0,
-    num_val_loader_workers=0,
+    num_eval_loader_workers=0,
     run_eval=True,
     test_delay_epochs=-1,
     epochs=1,
@@ -29,9 +29,7 @@ config.audio.trim_db = 60
 config.save_json(config_path)
 
 # train the model for one epoch
-command_train = (
-    f"CUDA_VISIBLE_DEVICES='{get_device_id()}' python TTS/bin/train_vocoder_gan.py --config_path {config_path} "
-)
+command_train = f"CUDA_VISIBLE_DEVICES='{get_device_id()}' python TTS/bin/train_vocoder.py --config_path {config_path} "
 run_cli(command_train)
 
 # Find latest folder
@@ -39,7 +37,7 @@ continue_path = max(glob.glob(os.path.join(output_path, "*/")), key=os.path.getm
 
 # restore the model and continue training for one more epoch
 command_train = (
-    f"CUDA_VISIBLE_DEVICES='{get_device_id()}' python TTS/bin/train_vocoder_gan.py --continue_path {continue_path} "
+    f"CUDA_VISIBLE_DEVICES='{get_device_id()}' python TTS/bin/train_vocoder.py --continue_path {continue_path} "
 )
 run_cli(command_train)
 shutil.rmtree(continue_path)
diff --git a/tests/vocoder_tests/test_hifigan_train.py b/tests/vocoder_tests/test_hifigan_train.py
index 11057570..c506fb48 100644
--- a/tests/vocoder_tests/test_hifigan_train.py
+++ b/tests/vocoder_tests/test_hifigan_train.py
@@ -13,7 +13,7 @@ config = HifiganConfig(
     batch_size=8,
     eval_batch_size=8,
     num_loader_workers=0,
-    num_val_loader_workers=0,
+    num_eval_loader_workers=0,
     run_eval=True,
     test_delay_epochs=-1,
     epochs=1,
@@ -29,9 +29,7 @@ config.audio.trim_db = 60
 config.save_json(config_path)
 
 # train the model for one epoch
-command_train = (
-    f"CUDA_VISIBLE_DEVICES='{get_device_id()}' python TTS/bin/train_vocoder_gan.py --config_path {config_path} "
-)
+command_train = f"CUDA_VISIBLE_DEVICES='{get_device_id()}' python TTS/bin/train_vocoder.py --config_path {config_path} "
 run_cli(command_train)
 
 # Find latest folder
@@ -39,7 +37,7 @@ continue_path = max(glob.glob(os.path.join(output_path, "*/")), key=os.path.getm
 
 # restore the model and continue training for one more epoch
 command_train = (
-    f"CUDA_VISIBLE_DEVICES='{get_device_id()}' python TTS/bin/train_vocoder_gan.py --continue_path {continue_path} "
+    f"CUDA_VISIBLE_DEVICES='{get_device_id()}' python TTS/bin/train_vocoder.py --continue_path {continue_path} "
 )
 run_cli(command_train)
 shutil.rmtree(continue_path)
diff --git a/tests/vocoder_tests/test_melgan_train.py b/tests/vocoder_tests/test_melgan_train.py
index 551b786a..6ef9cd49 100644
--- a/tests/vocoder_tests/test_melgan_train.py
+++ b/tests/vocoder_tests/test_melgan_train.py
@@ -12,7 +12,7 @@ config = MelganConfig(
     batch_size=4,
     eval_batch_size=4,
     num_loader_workers=0,
-    num_val_loader_workers=0,
+    num_eval_loader_workers=0,
     run_eval=True,
     test_delay_epochs=-1,
     epochs=1,
@@ -29,9 +29,7 @@ config.audio.trim_db = 60
 config.save_json(config_path)
 
 # train the model for one epoch
-command_train = (
-    f"CUDA_VISIBLE_DEVICES='{get_device_id()}' python TTS/bin/train_vocoder_gan.py --config_path {config_path} "
-)
+command_train = f"CUDA_VISIBLE_DEVICES='{get_device_id()}' python TTS/bin/train_vocoder.py --config_path {config_path} "
 run_cli(command_train)
 
 # Find latest folder
@@ -39,7 +37,7 @@ continue_path = max(glob.glob(os.path.join(output_path, "*/")), key=os.path.getm
 
 # restore the model and continue training for one more epoch
 command_train = (
-    f"CUDA_VISIBLE_DEVICES='{get_device_id()}' python TTS/bin/train_vocoder_gan.py --continue_path {continue_path} "
+    f"CUDA_VISIBLE_DEVICES='{get_device_id()}' python TTS/bin/train_vocoder.py --continue_path {continue_path} "
 )
 run_cli(command_train)
 shutil.rmtree(continue_path)
diff --git a/tests/vocoder_tests/test_multiband_melgan_train.py b/tests/vocoder_tests/test_multiband_melgan_train.py
index 5c6a0fc8..daf2841b 100644
--- a/tests/vocoder_tests/test_multiband_melgan_train.py
+++ b/tests/vocoder_tests/test_multiband_melgan_train.py
@@ -12,7 +12,7 @@ config = MultibandMelganConfig(
     batch_size=8,
     eval_batch_size=8,
     num_loader_workers=0,
-    num_val_loader_workers=0,
+    num_eval_loader_workers=0,
     run_eval=True,
     test_delay_epochs=-1,
     epochs=1,
@@ -30,9 +30,7 @@ config.audio.trim_db = 60
 config.save_json(config_path)
 
 # train the model for one epoch
-command_train = (
-    f"CUDA_VISIBLE_DEVICES='{get_device_id()}' python TTS/bin/train_vocoder_gan.py --config_path {config_path} "
-)
+command_train = f"CUDA_VISIBLE_DEVICES='{get_device_id()}' python TTS/bin/train_vocoder.py --config_path {config_path} "
 run_cli(command_train)
 
 # Find latest folder
@@ -40,7 +38,7 @@ continue_path = max(glob.glob(os.path.join(output_path, "*/")), key=os.path.getm
 
 # restore the model and continue training for one more epoch
 command_train = (
-    f"CUDA_VISIBLE_DEVICES='{get_device_id()}' python TTS/bin/train_vocoder_gan.py --continue_path {continue_path} "
+    f"CUDA_VISIBLE_DEVICES='{get_device_id()}' python TTS/bin/train_vocoder.py --continue_path {continue_path} "
 )
 run_cli(command_train)
 shutil.rmtree(continue_path)
diff --git a/tests/vocoder_tests/test_parallel_wavegan_train.py b/tests/vocoder_tests/test_parallel_wavegan_train.py
index fb6ea87c..a126befe 100644
--- a/tests/vocoder_tests/test_parallel_wavegan_train.py
+++ b/tests/vocoder_tests/test_parallel_wavegan_train.py
@@ -12,7 +12,7 @@ config = ParallelWaveganConfig(
     batch_size=4,
     eval_batch_size=4,
     num_loader_workers=0,
-    num_val_loader_workers=0,
+    num_eval_loader_workers=0,
     run_eval=True,
     test_delay_epochs=-1,
     epochs=1,
@@ -28,9 +28,7 @@ config.audio.trim_db = 60
 config.save_json(config_path)
 
 # train the model for one epoch
-command_train = (
-    f"CUDA_VISIBLE_DEVICES='{get_device_id()}' python TTS/bin/train_vocoder_gan.py --config_path {config_path} "
-)
+command_train = f"CUDA_VISIBLE_DEVICES='{get_device_id()}' python TTS/bin/train_vocoder.py --config_path {config_path} "
 run_cli(command_train)
 
 # Find latest folder
@@ -38,7 +36,7 @@ continue_path = max(glob.glob(os.path.join(output_path, "*/")), key=os.path.getm
 
 # restore the model and continue training for one more epoch
 command_train = (
-    f"CUDA_VISIBLE_DEVICES='{get_device_id()}' python TTS/bin/train_vocoder_gan.py --continue_path {continue_path} "
+    f"CUDA_VISIBLE_DEVICES='{get_device_id()}' python TTS/bin/train_vocoder.py --continue_path {continue_path} "
 )
 run_cli(command_train)
 shutil.rmtree(continue_path)
diff --git a/tests/vocoder_tests/test_vocoder_wavernn.py b/tests/vocoder_tests/test_vocoder_wavernn.py
index 9c58fa1c..b5c769ee 100644
--- a/tests/vocoder_tests/test_vocoder_wavernn.py
+++ b/tests/vocoder_tests/test_vocoder_wavernn.py
@@ -3,11 +3,13 @@ import random
 import numpy as np
 import torch
 
-from TTS.vocoder.models.wavernn import WaveRNN
+from TTS.vocoder.configs import WavernnConfig
+from TTS.vocoder.models.wavernn import Wavernn, WavernnArgs
 
 
 def test_wavernn():
-    model = WaveRNN(
+    config = WavernnConfig()
+    config.model_args = WavernnArgs(
         rnn_dims=512,
         fc_dims=512,
         mode=10,
@@ -20,14 +22,30 @@
         compute_dims=128,
         res_out_dims=128,
         num_res_blocks=10,
-        hop_length=256,
-        sample_rate=22050,
     )
+    config.audio.hop_length = 256
+    config.audio.sample_rate = 2048
+
     dummy_x = torch.rand((2, 1280))
     dummy_m = torch.rand((2, 80, 9))
     y_size = random.randrange(20, 60)
    dummy_y = torch.rand((80, y_size))
+
+    # mode: mold
+    model = Wavernn(config)
     output = model(dummy_x, dummy_m)
-    assert np.all(output.shape == (2, 1280, 4 * 256)), output.shape
+    assert np.all(output.shape == (2, 1280, 30)), output.shape
+
+    # mode: gauss
+    config.model_params.mode = "gauss"
+    model = Wavernn(config)
+    output = model(dummy_x, dummy_m)
+    assert np.all(output.shape == (2, 1280, 2)), output.shape
+
+    # mode: quantized
+    config.model_params.mode = 4
+    model = Wavernn(config)
+    output = model(dummy_x, dummy_m)
+    assert np.all(output.shape == (2, 1280, 2 ** 4)), output.shape
 
     output = model.inference(dummy_y, True, 5500, 550)
     assert np.all(output.shape == (256 * (y_size - 1),))
diff --git a/tests/vocoder_tests/test_wavegrad.py b/tests/vocoder_tests/test_wavegrad.py
index a28409e5..43b5f080 100644
--- a/tests/vocoder_tests/test_wavegrad.py
+++ b/tests/vocoder_tests/test_wavegrad.py
@@ -4,7 +4,8 @@ import numpy as np
 import torch
 from torch import optim
 
-from TTS.vocoder.models.wavegrad import Wavegrad
+from TTS.vocoder.configs import WavegradConfig
+from TTS.vocoder.models.wavegrad import Wavegrad, WavegradArgs
 
 # pylint: disable=unused-variable
 
@@ -20,19 +21,16 @@ class WavegradTrainTest(unittest.TestCase):
         mel_spec = torch.rand(8, 80, 20).to(device)
 
         criterion = torch.nn.L1Loss().to(device)
-        model = Wavegrad(
+        args = WavegradArgs(
             in_channels=80,
             out_channels=1,
             upsample_factors=[5, 5, 3, 2, 2],
             upsample_dilations=[[1, 2, 1, 2], [1, 2, 1, 2], [1, 2, 4, 8], [1, 2, 4, 8], [1, 2, 4, 8]],
         )
+        config = WavegradConfig(model_params=args)
+        model = Wavegrad(config)
 
-        model_ref = Wavegrad(
-            in_channels=80,
-            out_channels=1,
-            upsample_factors=[5, 5, 3, 2, 2],
-            upsample_dilations=[[1, 2, 1, 2], [1, 2, 1, 2], [1, 2, 4, 8], [1, 2, 4, 8], [1, 2, 4, 8]],
-        )
+        model_ref = Wavegrad(config)
         model.train()
         model.to(device)
         betas = np.linspace(1e-6, 1e-2, 1000)
diff --git a/tests/vocoder_tests/test_wavegrad_layers.py b/tests/vocoder_tests/test_wavegrad_layers.py
index 0180eb0a..a0b021dc 100644
--- a/tests/vocoder_tests/test_wavegrad_layers.py
+++ b/tests/vocoder_tests/test_wavegrad_layers.py
@@ -1,7 +1,8 @@
 import torch
 
+from TTS.vocoder.configs import WavegradConfig
 from TTS.vocoder.layers.wavegrad import DBlock, FiLM, PositionalEncoding, UBlock
-from TTS.vocoder.models.wavegrad import Wavegrad
+from TTS.vocoder.models.wavegrad import Wavegrad, WavegradArgs
 
 
 def test_positional_encoding():
@@ -75,12 +76,14 @@ def test_wavegrad_forward():
     c = torch.rand(32, 80, 20)
     noise_scale = torch.rand(32)
 
-    model = Wavegrad(
+    args = WavegradArgs(
         in_channels=80,
         out_channels=1,
         upsample_factors=[5, 5, 3, 2, 2],
         upsample_dilations=[[1, 2, 1, 2], [1, 2, 1, 2], [1, 2, 4, 8], [1, 2, 4, 8], [1, 2, 4, 8]],
     )
+    config = WavegradConfig(model_params=args)
+    model = Wavegrad(config)
     o = model.forward(x, c, noise_scale)
 
     assert o.shape[0] == 32
diff --git a/tests/vocoder_tests/test_wavegrad_train.py b/tests/vocoder_tests/test_wavegrad_train.py
index e222de3a..fe56ee78 100644
--- a/tests/vocoder_tests/test_wavegrad_train.py
+++ b/tests/vocoder_tests/test_wavegrad_train.py
@@ -12,7 +12,7 @@ config = WavegradConfig(
     batch_size=8,
     eval_batch_size=8,
     num_loader_workers=0,
-    num_val_loader_workers=0,
+    num_eval_loader_workers=0,
     run_eval=True,
     test_delay_epochs=-1,
     epochs=1,
@@ -29,15 +29,15 @@ config.audio.trim_db = 60
 config.save_json(config_path)
 
 # train the model for one epoch
-command_train = (
-    f"CUDA_VISIBLE_DEVICES='{get_device_id()}' python TTS/bin/train_vocoder_wavegrad.py --config_path {config_path} "
-)
+command_train = f"CUDA_VISIBLE_DEVICES='{get_device_id()}' python TTS/bin/train_vocoder.py --config_path {config_path} "
 run_cli(command_train)
 
 # Find latest folder
 continue_path = max(glob.glob(os.path.join(output_path, "*/")), key=os.path.getmtime)
 
 # restore the model and continue training for one more epoch
-command_train = f"CUDA_VISIBLE_DEVICES='{get_device_id()}' python TTS/bin/train_vocoder_wavegrad.py --continue_path {continue_path} "
+command_train = (
+    f"CUDA_VISIBLE_DEVICES='{get_device_id()}' python TTS/bin/train_vocoder.py --continue_path {continue_path} "
+)
 run_cli(command_train)
 shutil.rmtree(continue_path)
diff --git a/tests/vocoder_tests/test_wavernn_train.py b/tests/vocoder_tests/test_wavernn_train.py
index 414ed719..43fc5fb1 100644
--- a/tests/vocoder_tests/test_wavernn_train.py
+++ b/tests/vocoder_tests/test_wavernn_train.py
@@ -4,15 +4,18 @@ import shutil
 
 from tests import get_device_id, get_tests_output_path, run_cli
 from TTS.vocoder.configs import WavernnConfig
+from TTS.vocoder.models.wavernn import WavernnArgs
 
 config_path = os.path.join(get_tests_output_path(), "test_vocoder_config.json")
 output_path = os.path.join(get_tests_output_path(), "train_outputs")
 
+
 config = WavernnConfig(
+    model_params=WavernnArgs(),
     batch_size=8,
     eval_batch_size=8,
     num_loader_workers=0,
-    num_val_loader_workers=0,
+    num_eval_loader_workers=0,
     run_eval=True,
     test_delay_epochs=-1,
     epochs=1,
@@ -28,9 +31,7 @@ config.audio.trim_db = 60
 config.save_json(config_path)
 
 # train the model for one epoch
-command_train = (
-    f"CUDA_VISIBLE_DEVICES='{get_device_id()}' python TTS/bin/train_vocoder_wavernn.py --config_path {config_path} "
-)
+command_train = f"CUDA_VISIBLE_DEVICES='{get_device_id()}' python TTS/bin/train_vocoder.py --config_path {config_path} "
 run_cli(command_train)
 
 # Find latest folder
@@ -38,7 +39,7 @@ continue_path = max(glob.glob(os.path.join(output_path, "*/")), key=os.path.getm
 
 # restore the model and continue training for one more epoch
 command_train = (
-    f"CUDA_VISIBLE_DEVICES='{get_device_id()}' python TTS/bin/train_vocoder_wavernn.py --continue_path {continue_path} "
+    f"CUDA_VISIBLE_DEVICES='{get_device_id()}' python TTS/bin/train_vocoder.py --continue_path {continue_path} "
 )
 run_cli(command_train)
 shutil.rmtree(continue_path)
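
Note on the pattern: every hunk above is the same migration. Models now take a single Coqpit config object instead of loose constructor arguments (model-specific settings nest under `model_args`/`model_params`), checkpoints serialize the config, `num_val_loader_workers` is renamed to `num_eval_loader_workers`, the external speaker-embedding options become `use_d_vector_file`/`d_vector_file`, and the separate `train_vocoder_gan.py`/`train_vocoder_wavegrad.py`/`train_vocoder_wavernn.py` scripts collapse into a single `TTS/bin/train_vocoder.py`. A minimal sketch of the migrated flow, using only the calls these tests exercise (the two `None` arguments to `save_checkpoint` are presumably the optimizer and scaler slots that a real training run would fill):

    from TTS.config import load_config
    from TTS.tts.models import setup_model
    from TTS.utils.io import save_checkpoint

    # The config is now the single source of truth: character set, speaker
    # setup, and model hyperparameters all travel with it, so setup_model()
    # no longer needs num_chars/num_speakers passed in by hand.
    config = load_config("dummy_model_config.json")  # path taken from the synthesizer test above
    model = setup_model(config)

    # New checkpoint layout embeds the config, as called in test_synthesizer.py:
    # save_checkpoint(config, model, optimizer, scaler, current_step, epoch, output_folder)
    save_checkpoint(config, model, None, None, 10, 1, "output/")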