mirror of https://github.com/coqui-ai/TTS.git
update tests for the new input/output API of the TTS models
parent 506189bdee
commit 9960c0c356
@@ -38,7 +38,7 @@ class TestTTSDataset(unittest.TestCase):
     def _create_dataloader(self, batch_size, r, bgs):
         items = ljspeech(c.data_path, "metadata.csv")
-        dataset = TTSDataset.TTSDataset(
+        dataset = TTSDataset(
             r,
             c.text_cleaner,
             compute_linear_spec=True,
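
Note: the dataset class is now used directly rather than through its module (TTSDataset.TTSDataset becomes TTSDataset). A minimal sketch of the updated construction, assuming the test's import is adjusted to bind the class itself; only the arguments visible in the hunk above are shown:

    # Sketch under assumptions: the import path and the elided keyword
    # arguments are taken from context, not from this diff.
    from TTS.tts.datasets.TTSDataset import TTSDataset  # assumed direct class import

    items = ljspeech(c.data_path, "metadata.csv")
    dataset = TTSDataset(
        r,                         # reduction factor (decoder outputs per step)
        c.text_cleaner,            # cleaner name from the test config
        compute_linear_spec=True,  # also compute linear spectrograms
        # ... remaining constructor arguments unchanged by this commit
    )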
@@ -45,17 +45,25 @@ def test_speedy_speech():
     model.cuda()

     # forward pass
-    o_de, o_dr, attn = model(x_dummy, x_lengths, y_lengths, durations)
+    outputs = model(x_dummy, x_lengths, y_lengths, durations)
+    o_de = outputs["model_outputs"]
+    attn = outputs["alignments"]
+    o_dr = outputs["durations_log"]

-    assert list(o_de.shape) == [B, 80, T_de], f"{list(o_de.shape)}"
+    assert list(o_de.shape) == [B, T_de, 80], f"{list(o_de.shape)}"
     assert list(attn.shape) == [B, T_de, T_en]
     assert list(o_dr.shape) == [B, T_en]

     # with speaker embedding
     model = SpeedySpeech(num_chars, out_channels=80, hidden_channels=128, num_speakers=10, c_in_channels=256).to(device)
-    model.forward(x_dummy, x_lengths, y_lengths, durations, g=torch.randint(0, 10, (B,)).to(device))
+    outputs = model.forward(
+        x_dummy, x_lengths, y_lengths, durations, cond_input={"x_vectors": torch.randint(0, 10, (B,)).to(device)}
+    )
+    o_de = outputs["model_outputs"]
+    attn = outputs["alignments"]
+    o_dr = outputs["durations_log"]

-    assert list(o_de.shape) == [B, 80, T_de], f"{list(o_de.shape)}"
+    assert list(o_de.shape) == [B, T_de, 80], f"{list(o_de.shape)}"
     assert list(attn.shape) == [B, T_de, T_en]
     assert list(o_dr.shape) == [B, T_en]
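
The substitutions above capture the new output API: forward calls now return a dict instead of a positional tuple, and the decoder output is time-major ([B, T_de, 80] rather than [B, 80, T_de]). A condensed sketch of the access pattern, using only the keys exercised by this test:

    outputs = model(x_dummy, x_lengths, y_lengths, durations)
    o_de = outputs["model_outputs"]  # decoder output, [B, T_de, 80]
    attn = outputs["alignments"]     # attention/alignment map, [B, T_de, T_en]
    o_dr = outputs["durations_log"]  # predicted log durations, [B, T_en]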
@@ -63,8 +71,11 @@ def test_speedy_speech():
     model = SpeedySpeech(
         num_chars, out_channels=80, hidden_channels=128, num_speakers=10, external_c=True, c_in_channels=256
     ).to(device)
-    model.forward(x_dummy, x_lengths, y_lengths, durations, g=torch.rand((B, 256)).to(device))
+    outputs = model.forward(x_dummy, x_lengths, y_lengths, durations, cond_input={"x_vectors": torch.rand((B, 256)).to(device)})
+    o_de = outputs["model_outputs"]
+    attn = outputs["alignments"]
+    o_dr = outputs["durations_log"]

-    assert list(o_de.shape) == [B, 80, T_de], f"{list(o_de.shape)}"
+    assert list(o_de.shape) == [B, T_de, 80], f"{list(o_de.shape)}"
     assert list(attn.shape) == [B, T_de, T_en]
     assert list(o_dr.shape) == [B, T_en]
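
Conditioning follows the same pattern: the loose keyword arguments (g=..., speaker_embeddings=...) are folded into a single cond_input dict. The SpeedySpeech hunks pass both the integer-ID and the external-embedding dummies under the "x_vectors" key, while the Tacotron tests below use "speaker_ids" for integer IDs. A sketch with dummy tensors as in the hunks:

    # External 256-dim speaker embeddings (external_c=True):
    outputs = model.forward(
        x_dummy, x_lengths, y_lengths, durations,
        cond_input={"x_vectors": torch.rand((B, 256)).to(device)},
    )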
@@ -52,15 +52,15 @@ class TacotronTrainTest(unittest.TestCase):
             count += 1
         optimizer = optim.Adam(model.parameters(), lr=c.lr)
         for i in range(5):
-            mel_out, mel_postnet_out, align, stop_tokens = model.forward(
-                input_dummy, input_lengths, mel_spec, mel_lengths, speaker_ids
+            outputs = model.forward(
+                input_dummy, input_lengths, mel_spec, mel_lengths, cond_input={"speaker_ids": speaker_ids}
             )
-            assert torch.sigmoid(stop_tokens).data.max() <= 1.0
-            assert torch.sigmoid(stop_tokens).data.min() >= 0.0
+            assert torch.sigmoid(outputs["stop_tokens"]).data.max() <= 1.0
+            assert torch.sigmoid(outputs["stop_tokens"]).data.min() >= 0.0
             optimizer.zero_grad()
-            loss = criterion(mel_out, mel_spec, mel_lengths)
-            stop_loss = criterion_st(stop_tokens, stop_targets)
-            loss = loss + criterion(mel_postnet_out, mel_postnet_spec, mel_lengths) + stop_loss
+            loss = criterion(outputs["decoder_outputs"], mel_spec, mel_lengths)
+            stop_loss = criterion_st(outputs["stop_tokens"], stop_targets)
+            loss = loss + criterion(outputs["model_outputs"], mel_postnet_spec, mel_lengths) + stop_loss
             loss.backward()
             optimizer.step()
         # check parameter changes
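
For the Tacotron2 train tests, the old tuple (mel_out, mel_postnet_out, align, stop_tokens) maps onto dict keys as follows (the "alignments" entry is an assumption by analogy with the SpeedySpeech test; it is not exercised in this hunk):

    outputs = model.forward(
        input_dummy, input_lengths, mel_spec, mel_lengths, cond_input={"speaker_ids": speaker_ids}
    )
    decoder_out = outputs["decoder_outputs"]  # was mel_out: pre-postnet decoder frames
    postnet_out = outputs["model_outputs"]    # was mel_postnet_out: final model output
    stop_tokens = outputs["stop_tokens"]      # raw logits; sigmoid() maps them into [0, 1]
    alignments = outputs["alignments"]        # was align (assumed key, unused in this test)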
@@ -85,7 +85,7 @@ class MultiSpeakeTacotronTrainTest(unittest.TestCase):
         mel_lengths = torch.randint(20, 30, (8,)).long().to(device)
         mel_lengths[0] = 30
         stop_targets = torch.zeros(8, 30, 1).float().to(device)
-        speaker_embeddings = torch.rand(8, 55).to(device)
+        speaker_ids = torch.rand(8, 55).to(device)

         for idx in mel_lengths:
             stop_targets[:, int(idx.item()) :, 0] = 1.0
@@ -104,15 +104,15 @@ class MultiSpeakeTacotronTrainTest(unittest.TestCase):
             count += 1
         optimizer = optim.Adam(model.parameters(), lr=c.lr)
         for i in range(5):
-            mel_out, mel_postnet_out, align, stop_tokens = model.forward(
-                input_dummy, input_lengths, mel_spec, mel_lengths, speaker_embeddings=speaker_embeddings
+            outputs = model.forward(
+                input_dummy, input_lengths, mel_spec, mel_lengths, cond_input={"x_vectors": speaker_ids}
             )
-            assert torch.sigmoid(stop_tokens).data.max() <= 1.0
-            assert torch.sigmoid(stop_tokens).data.min() >= 0.0
+            assert torch.sigmoid(outputs["stop_tokens"]).data.max() <= 1.0
+            assert torch.sigmoid(outputs["stop_tokens"]).data.min() >= 0.0
             optimizer.zero_grad()
-            loss = criterion(mel_out, mel_spec, mel_lengths)
-            stop_loss = criterion_st(stop_tokens, stop_targets)
-            loss = loss + criterion(mel_postnet_out, mel_postnet_spec, mel_lengths) + stop_loss
+            loss = criterion(outputs["decoder_outputs"], mel_spec, mel_lengths)
+            stop_loss = criterion_st(outputs["stop_tokens"], stop_targets)
+            loss = loss + criterion(outputs["model_outputs"], mel_postnet_spec, mel_lengths) + stop_loss
             loss.backward()
             optimizer.step()
         # check parameter changes
@@ -157,15 +157,15 @@ class TacotronGSTTrainTest(unittest.TestCase):
             count += 1
         optimizer = optim.Adam(model.parameters(), lr=c.lr)
         for i in range(10):
-            mel_out, mel_postnet_out, align, stop_tokens = model.forward(
-                input_dummy, input_lengths, mel_spec, mel_lengths, speaker_ids
+            outputs = model.forward(
+                input_dummy, input_lengths, mel_spec, mel_lengths, cond_input={"speaker_ids": speaker_ids}
             )
-            assert torch.sigmoid(stop_tokens).data.max() <= 1.0
-            assert torch.sigmoid(stop_tokens).data.min() >= 0.0
+            assert torch.sigmoid(outputs["stop_tokens"]).data.max() <= 1.0
+            assert torch.sigmoid(outputs["stop_tokens"]).data.min() >= 0.0
             optimizer.zero_grad()
-            loss = criterion(mel_out, mel_spec, mel_lengths)
-            stop_loss = criterion_st(stop_tokens, stop_targets)
-            loss = loss + criterion(mel_postnet_out, mel_postnet_spec, mel_lengths) + stop_loss
+            loss = criterion(outputs["decoder_outputs"], mel_spec, mel_lengths)
+            stop_loss = criterion_st(outputs["stop_tokens"], stop_targets)
+            loss = loss + criterion(outputs["model_outputs"], mel_postnet_spec, mel_lengths) + stop_loss
             loss.backward()
             optimizer.step()
         # check parameter changes
@@ -213,15 +213,15 @@ class TacotronGSTTrainTest(unittest.TestCase):
             count += 1
         optimizer = optim.Adam(model.parameters(), lr=c.lr)
         for i in range(10):
-            mel_out, mel_postnet_out, align, stop_tokens = model.forward(
-                input_dummy, input_lengths, mel_spec, mel_lengths, speaker_ids
+            outputs = model.forward(
+                input_dummy, input_lengths, mel_spec, mel_lengths, cond_input={"speaker_ids": speaker_ids}
             )
-            assert torch.sigmoid(stop_tokens).data.max() <= 1.0
-            assert torch.sigmoid(stop_tokens).data.min() >= 0.0
+            assert torch.sigmoid(outputs["stop_tokens"]).data.max() <= 1.0
+            assert torch.sigmoid(outputs["stop_tokens"]).data.min() >= 0.0
             optimizer.zero_grad()
-            loss = criterion(mel_out, mel_spec, mel_lengths)
-            stop_loss = criterion_st(stop_tokens, stop_targets)
-            loss = loss + criterion(mel_postnet_out, mel_postnet_spec, mel_lengths) + stop_loss
+            loss = criterion(outputs["decoder_outputs"], mel_spec, mel_lengths)
+            stop_loss = criterion_st(outputs["stop_tokens"], stop_targets)
+            loss = loss + criterion(outputs["model_outputs"], mel_postnet_spec, mel_lengths) + stop_loss
             loss.backward()
             optimizer.step()
         # check parameter changes
@@ -270,15 +270,15 @@ class SCGSTMultiSpeakeTacotronTrainTest(unittest.TestCase):
             count += 1
         optimizer = optim.Adam(model.parameters(), lr=c.lr)
         for i in range(5):
-            mel_out, mel_postnet_out, align, stop_tokens = model.forward(
-                input_dummy, input_lengths, mel_spec, mel_lengths, speaker_embeddings=speaker_embeddings
+            outputs = model.forward(
+                input_dummy, input_lengths, mel_spec, mel_lengths, cond_input={"x_vectors": speaker_embeddings}
             )
-            assert torch.sigmoid(stop_tokens).data.max() <= 1.0
-            assert torch.sigmoid(stop_tokens).data.min() >= 0.0
+            assert torch.sigmoid(outputs["stop_tokens"]).data.max() <= 1.0
+            assert torch.sigmoid(outputs["stop_tokens"]).data.min() >= 0.0
             optimizer.zero_grad()
-            loss = criterion(mel_out, mel_spec, mel_lengths)
-            stop_loss = criterion_st(stop_tokens, stop_targets)
-            loss = loss + criterion(mel_postnet_out, mel_postnet_spec, mel_lengths) + stop_loss
+            loss = criterion(outputs["decoder_outputs"], mel_spec, mel_lengths)
+            stop_loss = criterion_st(outputs["stop_tokens"], stop_targets)
+            loss = loss + criterion(outputs["model_outputs"], mel_postnet_spec, mel_lengths) + stop_loss
             loss.backward()
             optimizer.step()
         # check parameter changes
@@ -68,13 +68,13 @@ class TacotronTrainTest(unittest.TestCase):
             count += 1
         optimizer = optim.Adam(model.parameters(), lr=c.lr)
         for _ in range(5):
-            mel_out, linear_out, align, stop_tokens = model.forward(
-                input_dummy, input_lengths, mel_spec, mel_lengths, speaker_ids
+            outputs = model.forward(
+                input_dummy, input_lengths, mel_spec, mel_lengths, cond_input={"speaker_ids": speaker_ids}
             )
             optimizer.zero_grad()
-            loss = criterion(mel_out, mel_spec, mel_lengths)
-            stop_loss = criterion_st(stop_tokens, stop_targets)
-            loss = loss + criterion(linear_out, linear_spec, mel_lengths) + stop_loss
+            loss = criterion(outputs["decoder_outputs"], mel_spec, mel_lengths)
+            stop_loss = criterion_st(outputs["stop_tokens"], stop_targets)
+            loss = loss + criterion(outputs["model_outputs"], linear_spec, mel_lengths) + stop_loss
             loss.backward()
             optimizer.step()
         # check parameter changes
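
The Tacotron (linear-spectrogram) tests use the same keys, with "model_outputs" now holding the linear prediction that was previously returned as linear_out. The resulting train step, condensed from the hunk above:

    outputs = model.forward(
        input_dummy, input_lengths, mel_spec, mel_lengths, cond_input={"speaker_ids": speaker_ids}
    )
    optimizer.zero_grad()
    loss = criterion(outputs["decoder_outputs"], mel_spec, mel_lengths)  # mel branch
    stop_loss = criterion_st(outputs["stop_tokens"], stop_targets)       # stop-token branch
    loss = loss + criterion(outputs["model_outputs"], linear_spec, mel_lengths) + stop_loss  # linear branch
    loss.backward()
    optimizer.step()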
@@ -129,13 +129,13 @@ class MultiSpeakeTacotronTrainTest(unittest.TestCase):
             count += 1
         optimizer = optim.Adam(model.parameters(), lr=c.lr)
         for _ in range(5):
-            mel_out, linear_out, align, stop_tokens = model.forward(
-                input_dummy, input_lengths, mel_spec, mel_lengths, speaker_embeddings=speaker_embeddings
+            outputs = model.forward(
+                input_dummy, input_lengths, mel_spec, mel_lengths, cond_input={"x_vectors": speaker_embeddings}
             )
             optimizer.zero_grad()
-            loss = criterion(mel_out, mel_spec, mel_lengths)
-            stop_loss = criterion_st(stop_tokens, stop_targets)
-            loss = loss + criterion(linear_out, linear_spec, mel_lengths) + stop_loss
+            loss = criterion(outputs["decoder_outputs"], mel_spec, mel_lengths)
+            stop_loss = criterion_st(outputs["stop_tokens"], stop_targets)
+            loss = loss + criterion(outputs["model_outputs"], linear_spec, mel_lengths) + stop_loss
             loss.backward()
             optimizer.step()
         # check parameter changes
@@ -193,13 +193,13 @@ class TacotronGSTTrainTest(unittest.TestCase):
             count += 1
         optimizer = optim.Adam(model.parameters(), lr=c.lr)
         for _ in range(10):
-            mel_out, linear_out, align, stop_tokens = model.forward(
-                input_dummy, input_lengths, mel_spec, mel_lengths, speaker_ids
+            outputs = model.forward(
+                input_dummy, input_lengths, mel_spec, mel_lengths, cond_input={"speaker_ids": speaker_ids}
             )
             optimizer.zero_grad()
-            loss = criterion(mel_out, mel_spec, mel_lengths)
-            stop_loss = criterion_st(stop_tokens, stop_targets)
-            loss = loss + criterion(linear_out, linear_spec, mel_lengths) + stop_loss
+            loss = criterion(outputs["decoder_outputs"], mel_spec, mel_lengths)
+            stop_loss = criterion_st(outputs["stop_tokens"], stop_targets)
+            loss = loss + criterion(outputs["model_outputs"], linear_spec, mel_lengths) + stop_loss
             loss.backward()
             optimizer.step()
         # check parameter changes
@@ -256,13 +256,13 @@ class TacotronGSTTrainTest(unittest.TestCase):
             count += 1
         optimizer = optim.Adam(model.parameters(), lr=c.lr)
         for _ in range(10):
-            mel_out, linear_out, align, stop_tokens = model.forward(
-                input_dummy, input_lengths, mel_spec, mel_lengths, speaker_ids
+            outputs = model.forward(
+                input_dummy, input_lengths, mel_spec, mel_lengths, cond_input={"speaker_ids": speaker_ids}
             )
             optimizer.zero_grad()
-            loss = criterion(mel_out, mel_spec, mel_lengths)
-            stop_loss = criterion_st(stop_tokens, stop_targets)
-            loss = loss + criterion(linear_out, linear_spec, mel_lengths) + stop_loss
+            loss = criterion(outputs["decoder_outputs"], mel_spec, mel_lengths)
+            stop_loss = criterion_st(outputs["stop_tokens"], stop_targets)
+            loss = loss + criterion(outputs["model_outputs"], linear_spec, mel_lengths) + stop_loss
             loss.backward()
             optimizer.step()
         # check parameter changes
@@ -318,13 +318,13 @@ class SCGSTMultiSpeakeTacotronTrainTest(unittest.TestCase):
             count += 1
         optimizer = optim.Adam(model.parameters(), lr=c.lr)
         for _ in range(5):
-            mel_out, linear_out, align, stop_tokens = model.forward(
-                input_dummy, input_lengths, mel_spec, mel_lengths, speaker_embeddings=speaker_embeddings
+            outputs = model.forward(
+                input_dummy, input_lengths, mel_spec, mel_lengths, cond_input={"x_vectors": speaker_embeddings}
             )
             optimizer.zero_grad()
-            loss = criterion(mel_out, mel_spec, mel_lengths)
-            stop_loss = criterion_st(stop_tokens, stop_targets)
-            loss = loss + criterion(linear_out, linear_spec, mel_lengths) + stop_loss
+            loss = criterion(outputs["decoder_outputs"], mel_spec, mel_lengths)
+            stop_loss = criterion_st(outputs["stop_tokens"], stop_targets)
+            loss = loss + criterion(outputs["model_outputs"], linear_spec, mel_lengths) + stop_loss
             loss.backward()
             optimizer.step()
         # check parameter changes