Update tests for the new input/output API of the TTS models

pull/602/head
Eren Gölge 2021-05-31 15:43:40 +02:00
parent 506189bdee
commit 9960c0c356
4 changed files with 79 additions and 68 deletions

View File

@ -38,7 +38,7 @@ class TestTTSDataset(unittest.TestCase):
def _create_dataloader(self, batch_size, r, bgs):
items = ljspeech(c.data_path, "metadata.csv")
dataset = TTSDataset.TTSDataset(
dataset = TTSDataset(
r,
c.text_cleaner,
compute_linear_spec=True,

View File

@ -45,17 +45,25 @@ def test_speedy_speech():
model.cuda()
# forward pass
o_de, o_dr, attn = model(x_dummy, x_lengths, y_lengths, durations)
outputs = model(x_dummy, x_lengths, y_lengths, durations)
o_de = outputs["model_outputs"]
attn = outputs["alignments"]
o_dr = outputs["durations_log"]
assert list(o_de.shape) == [B, 80, T_de], f"{list(o_de.shape)}"
assert list(o_de.shape) == [B, T_de, 80], f"{list(o_de.shape)}"
assert list(attn.shape) == [B, T_de, T_en]
assert list(o_dr.shape) == [B, T_en]
# with speaker embedding
model = SpeedySpeech(num_chars, out_channels=80, hidden_channels=128, num_speakers=10, c_in_channels=256).to(device)
model.forward(x_dummy, x_lengths, y_lengths, durations, g=torch.randint(0, 10, (B,)).to(device))
model.forward(
x_dummy, x_lengths, y_lengths, durations, cond_input={"x_vectors": torch.randint(0, 10, (B,)).to(device)}
)
o_de = outputs["model_outputs"]
attn = outputs["alignments"]
o_dr = outputs["durations_log"]
assert list(o_de.shape) == [B, 80, T_de], f"{list(o_de.shape)}"
assert list(o_de.shape) == [B, T_de, 80], f"{list(o_de.shape)}"
assert list(attn.shape) == [B, T_de, T_en]
assert list(o_dr.shape) == [B, T_en]
@ -63,8 +71,11 @@ def test_speedy_speech():
model = SpeedySpeech(
num_chars, out_channels=80, hidden_channels=128, num_speakers=10, external_c=True, c_in_channels=256
).to(device)
model.forward(x_dummy, x_lengths, y_lengths, durations, g=torch.rand((B, 256)).to(device))
model.forward(x_dummy, x_lengths, y_lengths, durations, cond_input={"x_vectors": torch.rand((B, 256)).to(device)})
o_de = outputs["model_outputs"]
attn = outputs["alignments"]
o_dr = outputs["durations_log"]
assert list(o_de.shape) == [B, 80, T_de], f"{list(o_de.shape)}"
assert list(o_de.shape) == [B, T_de, 80], f"{list(o_de.shape)}"
assert list(attn.shape) == [B, T_de, T_en]
assert list(o_dr.shape) == [B, T_en]

View File

@ -52,15 +52,15 @@ class TacotronTrainTest(unittest.TestCase):
count += 1
optimizer = optim.Adam(model.parameters(), lr=c.lr)
for i in range(5):
mel_out, mel_postnet_out, align, stop_tokens = model.forward(
input_dummy, input_lengths, mel_spec, mel_lengths, speaker_ids
outputs = model.forward(
input_dummy, input_lengths, mel_spec, mel_lengths, cond_input={"speaker_ids": speaker_ids}
)
assert torch.sigmoid(stop_tokens).data.max() <= 1.0
assert torch.sigmoid(stop_tokens).data.min() >= 0.0
assert torch.sigmoid(outputs["stop_tokens"]).data.max() <= 1.0
assert torch.sigmoid(outputs["stop_tokens"]).data.min() >= 0.0
optimizer.zero_grad()
loss = criterion(mel_out, mel_spec, mel_lengths)
stop_loss = criterion_st(stop_tokens, stop_targets)
loss = loss + criterion(mel_postnet_out, mel_postnet_spec, mel_lengths) + stop_loss
loss = criterion(outputs["decoder_outputs"], mel_spec, mel_lengths)
stop_loss = criterion_st(outputs["stop_tokens"], stop_targets)
loss = loss + criterion(outputs["model_outputs"], mel_postnet_spec, mel_lengths) + stop_loss
loss.backward()
optimizer.step()
# check parameter changes
@ -85,7 +85,7 @@ class MultiSpeakeTacotronTrainTest(unittest.TestCase):
mel_lengths = torch.randint(20, 30, (8,)).long().to(device)
mel_lengths[0] = 30
stop_targets = torch.zeros(8, 30, 1).float().to(device)
speaker_embeddings = torch.rand(8, 55).to(device)
speaker_ids = torch.rand(8, 55).to(device)
for idx in mel_lengths:
stop_targets[:, int(idx.item()) :, 0] = 1.0
@ -104,15 +104,15 @@ class MultiSpeakeTacotronTrainTest(unittest.TestCase):
count += 1
optimizer = optim.Adam(model.parameters(), lr=c.lr)
for i in range(5):
mel_out, mel_postnet_out, align, stop_tokens = model.forward(
input_dummy, input_lengths, mel_spec, mel_lengths, speaker_embeddings=speaker_embeddings
outputs = model.forward(
input_dummy, input_lengths, mel_spec, mel_lengths, cond_input={"x_vectors": speaker_ids}
)
assert torch.sigmoid(stop_tokens).data.max() <= 1.0
assert torch.sigmoid(stop_tokens).data.min() >= 0.0
assert torch.sigmoid(outputs["stop_tokens"]).data.max() <= 1.0
assert torch.sigmoid(outputs["stop_tokens"]).data.min() >= 0.0
optimizer.zero_grad()
loss = criterion(mel_out, mel_spec, mel_lengths)
stop_loss = criterion_st(stop_tokens, stop_targets)
loss = loss + criterion(mel_postnet_out, mel_postnet_spec, mel_lengths) + stop_loss
loss = criterion(outputs["decoder_outputs"], mel_spec, mel_lengths)
stop_loss = criterion_st(outputs["stop_tokens"], stop_targets)
loss = loss + criterion(outputs["model_outputs"], mel_postnet_spec, mel_lengths) + stop_loss
loss.backward()
optimizer.step()
# check parameter changes
@ -157,15 +157,15 @@ class TacotronGSTTrainTest(unittest.TestCase):
count += 1
optimizer = optim.Adam(model.parameters(), lr=c.lr)
for i in range(10):
mel_out, mel_postnet_out, align, stop_tokens = model.forward(
input_dummy, input_lengths, mel_spec, mel_lengths, speaker_ids
outputs = model.forward(
input_dummy, input_lengths, mel_spec, mel_lengths, cond_input={"speaker_ids": speaker_ids}
)
assert torch.sigmoid(stop_tokens).data.max() <= 1.0
assert torch.sigmoid(stop_tokens).data.min() >= 0.0
assert torch.sigmoid(outputs["stop_tokens"]).data.max() <= 1.0
assert torch.sigmoid(outputs["stop_tokens"]).data.min() >= 0.0
optimizer.zero_grad()
loss = criterion(mel_out, mel_spec, mel_lengths)
stop_loss = criterion_st(stop_tokens, stop_targets)
loss = loss + criterion(mel_postnet_out, mel_postnet_spec, mel_lengths) + stop_loss
loss = criterion(outputs["decoder_outputs"], mel_spec, mel_lengths)
stop_loss = criterion_st(outputs["stop_tokens"], stop_targets)
loss = loss + criterion(outputs["model_outputs"], mel_postnet_spec, mel_lengths) + stop_loss
loss.backward()
optimizer.step()
# check parameter changes
@ -213,15 +213,15 @@ class TacotronGSTTrainTest(unittest.TestCase):
count += 1
optimizer = optim.Adam(model.parameters(), lr=c.lr)
for i in range(10):
mel_out, mel_postnet_out, align, stop_tokens = model.forward(
input_dummy, input_lengths, mel_spec, mel_lengths, speaker_ids
outputs = model.forward(
input_dummy, input_lengths, mel_spec, mel_lengths, cond_input={"speaker_ids": speaker_ids}
)
assert torch.sigmoid(stop_tokens).data.max() <= 1.0
assert torch.sigmoid(stop_tokens).data.min() >= 0.0
assert torch.sigmoid(outputs["stop_tokens"]).data.max() <= 1.0
assert torch.sigmoid(outputs["stop_tokens"]).data.min() >= 0.0
optimizer.zero_grad()
loss = criterion(mel_out, mel_spec, mel_lengths)
stop_loss = criterion_st(stop_tokens, stop_targets)
loss = loss + criterion(mel_postnet_out, mel_postnet_spec, mel_lengths) + stop_loss
loss = criterion(outputs["decoder_outputs"], mel_spec, mel_lengths)
stop_loss = criterion_st(outputs["stop_tokens"], stop_targets)
loss = loss + criterion(outputs["model_outputs"], mel_postnet_spec, mel_lengths) + stop_loss
loss.backward()
optimizer.step()
# check parameter changes
@ -270,15 +270,15 @@ class SCGSTMultiSpeakeTacotronTrainTest(unittest.TestCase):
count += 1
optimizer = optim.Adam(model.parameters(), lr=c.lr)
for i in range(5):
mel_out, mel_postnet_out, align, stop_tokens = model.forward(
input_dummy, input_lengths, mel_spec, mel_lengths, speaker_embeddings=speaker_embeddings
outputs = model.forward(
input_dummy, input_lengths, mel_spec, mel_lengths, cond_input={"x_vectors": speaker_embeddings}
)
assert torch.sigmoid(stop_tokens).data.max() <= 1.0
assert torch.sigmoid(stop_tokens).data.min() >= 0.0
assert torch.sigmoid(outputs["stop_tokens"]).data.max() <= 1.0
assert torch.sigmoid(outputs["stop_tokens"]).data.min() >= 0.0
optimizer.zero_grad()
loss = criterion(mel_out, mel_spec, mel_lengths)
stop_loss = criterion_st(stop_tokens, stop_targets)
loss = loss + criterion(mel_postnet_out, mel_postnet_spec, mel_lengths) + stop_loss
loss = criterion(outputs["decoder_outputs"], mel_spec, mel_lengths)
stop_loss = criterion_st(outputs["stop_tokens"], stop_targets)
loss = loss + criterion(outputs["model_outputs"], mel_postnet_spec, mel_lengths) + stop_loss
loss.backward()
optimizer.step()
# check parameter changes

View File

@ -68,13 +68,13 @@ class TacotronTrainTest(unittest.TestCase):
count += 1
optimizer = optim.Adam(model.parameters(), lr=c.lr)
for _ in range(5):
mel_out, linear_out, align, stop_tokens = model.forward(
input_dummy, input_lengths, mel_spec, mel_lengths, speaker_ids
outputs = model.forward(
input_dummy, input_lengths, mel_spec, mel_lengths, cond_input={"speaker_ids": speaker_ids}
)
optimizer.zero_grad()
loss = criterion(mel_out, mel_spec, mel_lengths)
stop_loss = criterion_st(stop_tokens, stop_targets)
loss = loss + criterion(linear_out, linear_spec, mel_lengths) + stop_loss
loss = criterion(outputs["decoder_outputs"], mel_spec, mel_lengths)
stop_loss = criterion_st(outputs["stop_tokens"], stop_targets)
loss = loss + criterion(outputs["model_outputs"], linear_spec, mel_lengths) + stop_loss
loss.backward()
optimizer.step()
# check parameter changes
@ -129,13 +129,13 @@ class MultiSpeakeTacotronTrainTest(unittest.TestCase):
count += 1
optimizer = optim.Adam(model.parameters(), lr=c.lr)
for _ in range(5):
mel_out, linear_out, align, stop_tokens = model.forward(
input_dummy, input_lengths, mel_spec, mel_lengths, speaker_embeddings=speaker_embeddings
outputs = model.forward(
input_dummy, input_lengths, mel_spec, mel_lengths, cond_input={"x_vectors": speaker_embeddings}
)
optimizer.zero_grad()
loss = criterion(mel_out, mel_spec, mel_lengths)
stop_loss = criterion_st(stop_tokens, stop_targets)
loss = loss + criterion(linear_out, linear_spec, mel_lengths) + stop_loss
loss = criterion(outputs["decoder_outputs"], mel_spec, mel_lengths)
stop_loss = criterion_st(outputs["stop_tokens"], stop_targets)
loss = loss + criterion(outputs["model_outputs"], linear_spec, mel_lengths) + stop_loss
loss.backward()
optimizer.step()
# check parameter changes
@ -193,13 +193,13 @@ class TacotronGSTTrainTest(unittest.TestCase):
count += 1
optimizer = optim.Adam(model.parameters(), lr=c.lr)
for _ in range(10):
mel_out, linear_out, align, stop_tokens = model.forward(
input_dummy, input_lengths, mel_spec, mel_lengths, speaker_ids
outputs = model.forward(
input_dummy, input_lengths, mel_spec, mel_lengths, cond_input={"speaker_ids": speaker_ids}
)
optimizer.zero_grad()
loss = criterion(mel_out, mel_spec, mel_lengths)
stop_loss = criterion_st(stop_tokens, stop_targets)
loss = loss + criterion(linear_out, linear_spec, mel_lengths) + stop_loss
loss = criterion(outputs["decoder_outputs"], mel_spec, mel_lengths)
stop_loss = criterion_st(outputs["stop_tokens"], stop_targets)
loss = loss + criterion(outputs["model_outputs"], linear_spec, mel_lengths) + stop_loss
loss.backward()
optimizer.step()
# check parameter changes
@ -256,13 +256,13 @@ class TacotronGSTTrainTest(unittest.TestCase):
count += 1
optimizer = optim.Adam(model.parameters(), lr=c.lr)
for _ in range(10):
mel_out, linear_out, align, stop_tokens = model.forward(
input_dummy, input_lengths, mel_spec, mel_lengths, speaker_ids
outputs = model.forward(
input_dummy, input_lengths, mel_spec, mel_lengths, cond_input={"speaker_ids": speaker_ids}
)
optimizer.zero_grad()
loss = criterion(mel_out, mel_spec, mel_lengths)
stop_loss = criterion_st(stop_tokens, stop_targets)
loss = loss + criterion(linear_out, linear_spec, mel_lengths) + stop_loss
loss = criterion(outputs["decoder_outputs"], mel_spec, mel_lengths)
stop_loss = criterion_st(outputs["stop_tokens"], stop_targets)
loss = loss + criterion(outputs["model_outputs"], linear_spec, mel_lengths) + stop_loss
loss.backward()
optimizer.step()
# check parameter changes
@ -318,13 +318,13 @@ class SCGSTMultiSpeakeTacotronTrainTest(unittest.TestCase):
count += 1
optimizer = optim.Adam(model.parameters(), lr=c.lr)
for _ in range(5):
mel_out, linear_out, align, stop_tokens = model.forward(
input_dummy, input_lengths, mel_spec, mel_lengths, speaker_embeddings=speaker_embeddings
outputs = model.forward(
input_dummy, input_lengths, mel_spec, mel_lengths, cond_input={"x_vectors": speaker_embeddings}
)
optimizer.zero_grad()
loss = criterion(mel_out, mel_spec, mel_lengths)
stop_loss = criterion_st(stop_tokens, stop_targets)
loss = loss + criterion(linear_out, linear_spec, mel_lengths) + stop_loss
loss = criterion(outputs["decoder_outputs"], mel_spec, mel_lengths)
stop_loss = criterion_st(outputs["stop_tokens"], stop_targets)
loss = loss + criterion(outputs["model_outputs"], linear_spec, mel_lengths) + stop_loss
loss.backward()
optimizer.step()
# check parameter changes