diff --git a/tests/data_tests/test_loader.py b/tests/data_tests/test_loader.py
index 7f55b378..cad89d09 100644
--- a/tests/data_tests/test_loader.py
+++ b/tests/data_tests/test_loader.py
@@ -38,7 +38,7 @@ class TestTTSDataset(unittest.TestCase):

     def _create_dataloader(self, batch_size, r, bgs):
         items = ljspeech(c.data_path, "metadata.csv")
-        dataset = TTSDataset.TTSDataset(
+        dataset = TTSDataset(
             r,
             c.text_cleaner,
             compute_linear_spec=True,
diff --git a/tests/tts_tests/test_speedy_speech_layers.py b/tests/tts_tests/test_speedy_speech_layers.py
index 21a73812..66339a82 100644
--- a/tests/tts_tests/test_speedy_speech_layers.py
+++ b/tests/tts_tests/test_speedy_speech_layers.py
@@ -45,17 +45,25 @@ def test_speedy_speech():
     model.cuda()

     # forward pass
-    o_de, o_dr, attn = model(x_dummy, x_lengths, y_lengths, durations)
+    outputs = model(x_dummy, x_lengths, y_lengths, durations)
+    o_de = outputs["model_outputs"]
+    attn = outputs["alignments"]
+    o_dr = outputs["durations_log"]

-    assert list(o_de.shape) == [B, 80, T_de], f"{list(o_de.shape)}"
+    assert list(o_de.shape) == [B, T_de, 80], f"{list(o_de.shape)}"
     assert list(attn.shape) == [B, T_de, T_en]
     assert list(o_dr.shape) == [B, T_en]

     # with speaker embedding
     model = SpeedySpeech(num_chars, out_channels=80, hidden_channels=128, num_speakers=10, c_in_channels=256).to(device)
-    model.forward(x_dummy, x_lengths, y_lengths, durations, g=torch.randint(0, 10, (B,)).to(device))
+    outputs = model.forward(
+        x_dummy, x_lengths, y_lengths, durations, cond_input={"x_vectors": torch.randint(0, 10, (B,)).to(device)}
+    )
+    o_de = outputs["model_outputs"]
+    attn = outputs["alignments"]
+    o_dr = outputs["durations_log"]

-    assert list(o_de.shape) == [B, 80, T_de], f"{list(o_de.shape)}"
+    assert list(o_de.shape) == [B, T_de, 80], f"{list(o_de.shape)}"
     assert list(attn.shape) == [B, T_de, T_en]
     assert list(o_dr.shape) == [B, T_en]
@@ -63,8 +71,11 @@ def test_speedy_speech():
     model = SpeedySpeech(
         num_chars, out_channels=80, hidden_channels=128, num_speakers=10, external_c=True, c_in_channels=256
     ).to(device)
-    model.forward(x_dummy, x_lengths, y_lengths, durations, g=torch.rand((B, 256)).to(device))
+    outputs = model.forward(x_dummy, x_lengths, y_lengths, durations, cond_input={"x_vectors": torch.rand((B, 256)).to(device)})
+    o_de = outputs["model_outputs"]
+    attn = outputs["alignments"]
+    o_dr = outputs["durations_log"]

-    assert list(o_de.shape) == [B, 80, T_de], f"{list(o_de.shape)}"
+    assert list(o_de.shape) == [B, T_de, 80], f"{list(o_de.shape)}"
     assert list(attn.shape) == [B, T_de, T_en]
     assert list(o_dr.shape) == [B, T_en]
diff --git a/tests/tts_tests/test_tacotron2_model.py b/tests/tts_tests/test_tacotron2_model.py
index 4d711700..0933ec70 100644
--- a/tests/tts_tests/test_tacotron2_model.py
+++ b/tests/tts_tests/test_tacotron2_model.py
@@ -52,15 +52,15 @@ class TacotronTrainTest(unittest.TestCase):
             count += 1
         optimizer = optim.Adam(model.parameters(), lr=c.lr)
         for i in range(5):
-            mel_out, mel_postnet_out, align, stop_tokens = model.forward(
-                input_dummy, input_lengths, mel_spec, mel_lengths, speaker_ids
+            outputs = model.forward(
+                input_dummy, input_lengths, mel_spec, mel_lengths, cond_input={"speaker_ids": speaker_ids}
             )
-            assert torch.sigmoid(stop_tokens).data.max() <= 1.0
-            assert torch.sigmoid(stop_tokens).data.min() >= 0.0
+            assert torch.sigmoid(outputs["stop_tokens"]).data.max() <= 1.0
+            assert torch.sigmoid(outputs["stop_tokens"]).data.min() >= 0.0
             optimizer.zero_grad()
-            loss = criterion(mel_out, mel_spec, mel_lengths)
-            stop_loss = criterion_st(stop_tokens, stop_targets)
-            loss = loss + criterion(mel_postnet_out, mel_postnet_spec, mel_lengths) + stop_loss
+            loss = criterion(outputs["decoder_outputs"], mel_spec, mel_lengths)
+            stop_loss = criterion_st(outputs["stop_tokens"], stop_targets)
+            loss = loss + criterion(outputs["model_outputs"], mel_postnet_spec, mel_lengths) + stop_loss
             loss.backward()
             optimizer.step()
         # check parameter changes
@@ -85,7 +85,7 @@ class MultiSpeakeTacotronTrainTest(unittest.TestCase):
         mel_lengths = torch.randint(20, 30, (8,)).long().to(device)
         mel_lengths[0] = 30
         stop_targets = torch.zeros(8, 30, 1).float().to(device)
-        speaker_embeddings = torch.rand(8, 55).to(device)
+        speaker_ids = torch.rand(8, 55).to(device)

         for idx in mel_lengths:
             stop_targets[:, int(idx.item()) :, 0] = 1.0
@@ -104,15 +104,15 @@ class MultiSpeakeTacotronTrainTest(unittest.TestCase):
             count += 1
         optimizer = optim.Adam(model.parameters(), lr=c.lr)
         for i in range(5):
-            mel_out, mel_postnet_out, align, stop_tokens = model.forward(
-                input_dummy, input_lengths, mel_spec, mel_lengths, speaker_embeddings=speaker_embeddings
+            outputs = model.forward(
+                input_dummy, input_lengths, mel_spec, mel_lengths, cond_input={"x_vectors": speaker_ids}
             )
-            assert torch.sigmoid(stop_tokens).data.max() <= 1.0
-            assert torch.sigmoid(stop_tokens).data.min() >= 0.0
+            assert torch.sigmoid(outputs["stop_tokens"]).data.max() <= 1.0
+            assert torch.sigmoid(outputs["stop_tokens"]).data.min() >= 0.0
             optimizer.zero_grad()
-            loss = criterion(mel_out, mel_spec, mel_lengths)
-            stop_loss = criterion_st(stop_tokens, stop_targets)
-            loss = loss + criterion(mel_postnet_out, mel_postnet_spec, mel_lengths) + stop_loss
+            loss = criterion(outputs["decoder_outputs"], mel_spec, mel_lengths)
+            stop_loss = criterion_st(outputs["stop_tokens"], stop_targets)
+            loss = loss + criterion(outputs["model_outputs"], mel_postnet_spec, mel_lengths) + stop_loss
             loss.backward()
             optimizer.step()
         # check parameter changes
@@ -157,15 +157,15 @@ class TacotronGSTTrainTest(unittest.TestCase):
             count += 1
         optimizer = optim.Adam(model.parameters(), lr=c.lr)
         for i in range(10):
-            mel_out, mel_postnet_out, align, stop_tokens = model.forward(
-                input_dummy, input_lengths, mel_spec, mel_lengths, speaker_ids
+            outputs = model.forward(
+                input_dummy, input_lengths, mel_spec, mel_lengths, cond_input={"speaker_ids": speaker_ids}
             )
-            assert torch.sigmoid(stop_tokens).data.max() <= 1.0
-            assert torch.sigmoid(stop_tokens).data.min() >= 0.0
+            assert torch.sigmoid(outputs["stop_tokens"]).data.max() <= 1.0
+            assert torch.sigmoid(outputs["stop_tokens"]).data.min() >= 0.0
             optimizer.zero_grad()
-            loss = criterion(mel_out, mel_spec, mel_lengths)
-            stop_loss = criterion_st(stop_tokens, stop_targets)
-            loss = loss + criterion(mel_postnet_out, mel_postnet_spec, mel_lengths) + stop_loss
+            loss = criterion(outputs["decoder_outputs"], mel_spec, mel_lengths)
+            stop_loss = criterion_st(outputs["stop_tokens"], stop_targets)
+            loss = loss + criterion(outputs["model_outputs"], mel_postnet_spec, mel_lengths) + stop_loss
             loss.backward()
             optimizer.step()
         # check parameter changes
@@ -213,15 +213,15 @@ class TacotronGSTTrainTest(unittest.TestCase):
             count += 1
         optimizer = optim.Adam(model.parameters(), lr=c.lr)
         for i in range(10):
-            mel_out, mel_postnet_out, align, stop_tokens = model.forward(
-                input_dummy, input_lengths, mel_spec, mel_lengths, speaker_ids
+            outputs = model.forward(
+                input_dummy, input_lengths, mel_spec, mel_lengths, cond_input={"speaker_ids": speaker_ids}
             )
-            assert torch.sigmoid(stop_tokens).data.max() <= 1.0
-            assert torch.sigmoid(stop_tokens).data.min() >= 0.0
+            assert torch.sigmoid(outputs["stop_tokens"]).data.max() <= 1.0
+            assert torch.sigmoid(outputs["stop_tokens"]).data.min() >= 0.0
             optimizer.zero_grad()
-            loss = criterion(mel_out, mel_spec, mel_lengths)
-            stop_loss = criterion_st(stop_tokens, stop_targets)
-            loss = loss + criterion(mel_postnet_out, mel_postnet_spec, mel_lengths) + stop_loss
+            loss = criterion(outputs["decoder_outputs"], mel_spec, mel_lengths)
+            stop_loss = criterion_st(outputs["stop_tokens"], stop_targets)
+            loss = loss + criterion(outputs["model_outputs"], mel_postnet_spec, mel_lengths) + stop_loss
             loss.backward()
             optimizer.step()
         # check parameter changes
@@ -270,15 +270,15 @@ class SCGSTMultiSpeakeTacotronTrainTest(unittest.TestCase):
             count += 1
         optimizer = optim.Adam(model.parameters(), lr=c.lr)
         for i in range(5):
-            mel_out, mel_postnet_out, align, stop_tokens = model.forward(
-                input_dummy, input_lengths, mel_spec, mel_lengths, speaker_embeddings=speaker_embeddings
+            outputs = model.forward(
+                input_dummy, input_lengths, mel_spec, mel_lengths, cond_input={"x_vectors": speaker_embeddings}
             )
-            assert torch.sigmoid(stop_tokens).data.max() <= 1.0
-            assert torch.sigmoid(stop_tokens).data.min() >= 0.0
+            assert torch.sigmoid(outputs["stop_tokens"]).data.max() <= 1.0
+            assert torch.sigmoid(outputs["stop_tokens"]).data.min() >= 0.0
             optimizer.zero_grad()
-            loss = criterion(mel_out, mel_spec, mel_lengths)
-            stop_loss = criterion_st(stop_tokens, stop_targets)
-            loss = loss + criterion(mel_postnet_out, mel_postnet_spec, mel_lengths) + stop_loss
+            loss = criterion(outputs["decoder_outputs"], mel_spec, mel_lengths)
+            stop_loss = criterion_st(outputs["stop_tokens"], stop_targets)
+            loss = loss + criterion(outputs["model_outputs"], mel_postnet_spec, mel_lengths) + stop_loss
             loss.backward()
             optimizer.step()
         # check parameter changes
diff --git a/tests/tts_tests/test_tacotron_model.py b/tests/tts_tests/test_tacotron_model.py
index fcbac0f7..86de5d16 100644
--- a/tests/tts_tests/test_tacotron_model.py
+++ b/tests/tts_tests/test_tacotron_model.py
@@ -68,13 +68,13 @@ class TacotronTrainTest(unittest.TestCase):
             count += 1
         optimizer = optim.Adam(model.parameters(), lr=c.lr)
         for _ in range(5):
-            mel_out, linear_out, align, stop_tokens = model.forward(
-                input_dummy, input_lengths, mel_spec, mel_lengths, speaker_ids
+            outputs = model.forward(
+                input_dummy, input_lengths, mel_spec, mel_lengths, cond_input={"speaker_ids": speaker_ids}
             )
             optimizer.zero_grad()
-            loss = criterion(mel_out, mel_spec, mel_lengths)
-            stop_loss = criterion_st(stop_tokens, stop_targets)
-            loss = loss + criterion(linear_out, linear_spec, mel_lengths) + stop_loss
+            loss = criterion(outputs["decoder_outputs"], mel_spec, mel_lengths)
+            stop_loss = criterion_st(outputs["stop_tokens"], stop_targets)
+            loss = loss + criterion(outputs["model_outputs"], linear_spec, mel_lengths) + stop_loss
             loss.backward()
             optimizer.step()
         # check parameter changes
@@ -129,13 +129,13 @@ class MultiSpeakeTacotronTrainTest(unittest.TestCase):
             count += 1
         optimizer = optim.Adam(model.parameters(), lr=c.lr)
         for _ in range(5):
-            mel_out, linear_out, align, stop_tokens = model.forward(
-                input_dummy, input_lengths, mel_spec, mel_lengths, speaker_embeddings=speaker_embeddings
+            outputs = model.forward(
+                input_dummy, input_lengths, mel_spec, mel_lengths, cond_input={"x_vectors": speaker_embeddings}
             )
             optimizer.zero_grad()
-            loss = criterion(mel_out, mel_spec, mel_lengths)
-            stop_loss = criterion_st(stop_tokens, stop_targets)
-            loss = loss + criterion(linear_out, linear_spec, mel_lengths) + stop_loss
+            loss = criterion(outputs["decoder_outputs"], mel_spec, mel_lengths)
+            stop_loss = criterion_st(outputs["stop_tokens"], stop_targets)
+            loss = loss + criterion(outputs["model_outputs"], linear_spec, mel_lengths) + stop_loss
             loss.backward()
             optimizer.step()
         # check parameter changes
@@ -193,13 +193,13 @@ class TacotronGSTTrainTest(unittest.TestCase):
             count += 1
         optimizer = optim.Adam(model.parameters(), lr=c.lr)
         for _ in range(10):
-            mel_out, linear_out, align, stop_tokens = model.forward(
-                input_dummy, input_lengths, mel_spec, mel_lengths, speaker_ids
+            outputs = model.forward(
+                input_dummy, input_lengths, mel_spec, mel_lengths, cond_input={"speaker_ids": speaker_ids}
             )
             optimizer.zero_grad()
-            loss = criterion(mel_out, mel_spec, mel_lengths)
-            stop_loss = criterion_st(stop_tokens, stop_targets)
-            loss = loss + criterion(linear_out, linear_spec, mel_lengths) + stop_loss
+            loss = criterion(outputs["decoder_outputs"], mel_spec, mel_lengths)
+            stop_loss = criterion_st(outputs["stop_tokens"], stop_targets)
+            loss = loss + criterion(outputs["model_outputs"], linear_spec, mel_lengths) + stop_loss
             loss.backward()
             optimizer.step()
         # check parameter changes
@@ -256,13 +256,13 @@ class TacotronGSTTrainTest(unittest.TestCase):
             count += 1
         optimizer = optim.Adam(model.parameters(), lr=c.lr)
         for _ in range(10):
-            mel_out, linear_out, align, stop_tokens = model.forward(
-                input_dummy, input_lengths, mel_spec, mel_lengths, speaker_ids
+            outputs = model.forward(
+                input_dummy, input_lengths, mel_spec, mel_lengths, cond_input={"speaker_ids": speaker_ids}
             )
             optimizer.zero_grad()
-            loss = criterion(mel_out, mel_spec, mel_lengths)
-            stop_loss = criterion_st(stop_tokens, stop_targets)
-            loss = loss + criterion(linear_out, linear_spec, mel_lengths) + stop_loss
+            loss = criterion(outputs["decoder_outputs"], mel_spec, mel_lengths)
+            stop_loss = criterion_st(outputs["stop_tokens"], stop_targets)
+            loss = loss + criterion(outputs["model_outputs"], linear_spec, mel_lengths) + stop_loss
             loss.backward()
             optimizer.step()
         # check parameter changes
@@ -318,13 +318,13 @@ class SCGSTMultiSpeakeTacotronTrainTest(unittest.TestCase):
             count += 1
         optimizer = optim.Adam(model.parameters(), lr=c.lr)
         for _ in range(5):
-            mel_out, linear_out, align, stop_tokens = model.forward(
-                input_dummy, input_lengths, mel_spec, mel_lengths, speaker_embeddings=speaker_embeddings
+            outputs = model.forward(
+                input_dummy, input_lengths, mel_spec, mel_lengths, cond_input={"x_vectors": speaker_embeddings}
             )
             optimizer.zero_grad()
-            loss = criterion(mel_out, mel_spec, mel_lengths)
-            stop_loss = criterion_st(stop_tokens, stop_targets)
-            loss = loss + criterion(linear_out, linear_spec, mel_lengths) + stop_loss
+            loss = criterion(outputs["decoder_outputs"], mel_spec, mel_lengths)
+            stop_loss = criterion_st(outputs["stop_tokens"], stop_targets)
+            loss = loss + criterion(outputs["model_outputs"], linear_spec, mel_lengths) + stop_loss
             loss.backward()
             optimizer.step()
         # check parameter changes
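For reviewers, a minimal self-contained sketch of the convention every hunk above migrates to: `forward()` takes conditioning tensors through a `cond_input` dict and returns a dict keyed by `model_outputs`, `decoder_outputs`, `alignments`, and `stop_tokens` instead of a positional tuple. `ToyTTS` and its tensor shapes are hypothetical stand-ins; only the dict keys and the `cond_input` keys (`speaker_ids`, `x_vectors`) come from the diff itself.

```python
import torch

class ToyTTS(torch.nn.Module):
    """Hypothetical stand-in (not the real Tacotron2) for the dict-based API."""

    def forward(self, text, text_lengths, mel, mel_lengths, cond_input=None):
        B, T = mel.size(0), mel.size(1)
        return {
            "decoder_outputs": torch.zeros(B, T, 80),       # pre-postnet frames
            "model_outputs": torch.zeros(B, T, 80),         # final (postnet/linear) frames
            "alignments": torch.zeros(B, T, text.size(1)),  # attention weights
            "stop_tokens": torch.zeros(B, T),               # stop-token logits
        }

model = ToyTTS()
outputs = model(
    torch.randint(0, 24, (8, 128)),           # dummy character ids
    torch.full((8,), 128),                    # text lengths
    torch.rand(8, 30, 80),                    # dummy mel spectrogram
    torch.full((8,), 30),                     # mel lengths
    cond_input={"speaker_ids": torch.randint(0, 5, (8,))},  # or {"x_vectors": ...}
)
# same style of check the tests use on the dict entries
assert torch.sigmoid(outputs["stop_tokens"]).max() <= 1.0
```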