2019-11-01 11:23:03 +00:00
|
|
|
import unittest
|
2020-08-04 12:07:47 +00:00
|
|
|
|
2019-11-01 11:23:03 +00:00
|
|
|
import torch as T
|
|
|
|
|
2021-04-12 09:47:39 +00:00
|
|
|
from tests import get_tests_input_path
|
2021-05-26 23:35:58 +00:00
|
|
|
from TTS.speaker_encoder.losses import AngleProtoLoss, GE2ELoss, SoftmaxAngleProtoLoss
|
2021-05-26 21:14:06 +00:00
|
|
|
from TTS.speaker_encoder.models.lstm import LSTMSpeakerEncoder
|
2021-05-26 23:35:58 +00:00
|
|
|
from TTS.speaker_encoder.models.resnet import ResNetSpeakerEncoder
|
2021-05-31 14:37:15 +00:00
|
|
|
|
2020-07-16 13:05:36 +00:00
|
|
|
# Location of the shared test inputs as returned by the project's test helper.
# NOTE(review): not referenced by any test visible in this section — confirm
# whether it is still needed.
file_path = get_tests_input_path()
|
2019-11-01 11:23:03 +00:00
|
|
|
|
|
|
|
|
2021-05-26 23:35:58 +00:00
|
|
|
class LSTMSpeakerEncoderTests(unittest.TestCase):
    """Shape, dtype and L2-normalization checks for ``LSTMSpeakerEncoder``."""

    # pylint: disable=R0201
    def test_in_out(self):
        """Feed random features through the encoder and validate its outputs.

        Exercises ``forward``, ``inference`` and ``compute_embedding`` and
        checks that each returns a ``(batch, proj_dim)`` tensor; the
        ``inference`` output is additionally checked to be float32 and
        already L2-normalized along dim 1.
        """
        dummy_input = T.rand(4, 20, 80)  # B x T x D
        model = LSTMSpeakerEncoder(input_dim=80, proj_dim=256, lstm_dim=768, num_lstm_layers=3)

        # computing d vectors
        output = model.forward(dummy_input)
        assert output.shape[0] == 4
        assert output.shape[1] == 256
        output = model.inference(dummy_input)
        assert output.shape[0] == 4
        assert output.shape[1] == 256

        # TODO: the check that passed an explicit LSTM hidden state to
        # ``forward`` was disabled (commented out) in the original test and
        # has been dropped together with its unused ``dummy_hidden`` fixture.

        # check normalization: re-normalizing an already unit-norm output must
        # leave it unchanged. The largest absolute element-wise difference is
        # used so that positive and negative errors cannot cancel each other.
        output_norm = T.nn.functional.normalize(output, dim=1, p=2)
        assert_diff = (output_norm - output).abs().max().item()
        assert output.type() == "torch.FloatTensor"
        assert assert_diff < 1e-4, f" [!] output_norm has wrong values - {assert_diff}"

        # compute d for a given batch
        dummy_input = T.rand(1, 240, 80)  # B x T x D
        output = model.compute_embedding(dummy_input, num_frames=160, num_eval=5)
        assert output.shape[0] == 1
        assert output.shape[1] == 256
        assert len(output.shape) == 2
|
|
|
|
|
2021-05-31 14:37:15 +00:00
|
|
|
|
2021-05-26 23:35:58 +00:00
|
|
|
class ResNetSpeakerEncoderTests(unittest.TestCase):
    """Shape, dtype and L2-normalization checks for ``ResNetSpeakerEncoder``."""

    # pylint: disable=R0201
    def test_in_out(self):
        """Feed random features through the encoder and validate its outputs.

        Exercises ``forward`` (with and without ``l2_norm=True``) and
        ``compute_embedding`` and checks that each returns a
        ``(batch, proj_dim)`` tensor; the ``l2_norm=True`` output is
        additionally checked to be float32 and L2-normalized along dim 1.
        """
        dummy_input = T.rand(4, 20, 80)  # B x T x D
        model = ResNetSpeakerEncoder(input_dim=80, proj_dim=256)

        # computing d vectors
        output = model.forward(dummy_input)
        assert output.shape[0] == 4
        assert output.shape[1] == 256
        output = model.forward(dummy_input, l2_norm=True)
        assert output.shape[0] == 4
        assert output.shape[1] == 256

        # check normalization: re-normalizing the ``l2_norm=True`` output must
        # leave it unchanged. The largest absolute element-wise difference is
        # used so that positive and negative errors cannot cancel each other.
        output_norm = T.nn.functional.normalize(output, dim=1, p=2)
        assert_diff = (output_norm - output).abs().max().item()
        assert output.type() == "torch.FloatTensor"
        assert assert_diff < 1e-4, f" [!] output_norm has wrong values - {assert_diff}"

        # compute d for a given batch
        dummy_input = T.rand(1, 240, 80)  # B x T x D
        output = model.compute_embedding(dummy_input, num_frames=160, num_eval=10)
        assert output.shape[0] == 1
        assert output.shape[1] == 256
        assert len(output.shape) == 2
|
2019-11-01 11:23:03 +00:00
|
|
|
|
2021-05-31 14:37:15 +00:00
|
|
|
|
2019-11-01 11:23:03 +00:00
|
|
|
class GE2ELossTests(unittest.TestCase):
    """Sanity checks for ``GE2ELoss`` using the ``"softmax"`` loss method."""

    # pylint: disable=R0201
    def test_in_out(self):
        """Loss is non-negative on random/constant input and ~0 on orthogonal speakers."""
        # Uniform-random embeddings, then constant all-ones embeddings; both
        # are laid out as num_speaker x num_utterance x dim.
        for dummy_input in (T.rand(4, 5, 64), T.ones(4, 5, 64)):
            criterion = GE2ELoss(loss_method="softmax")
            loss_value = criterion.forward(dummy_input)
            assert loss_value.item() >= 0.0

        # Speaker loss with orthogonal d-vectors: build three mutually
        # orthogonal vectors and give every speaker five identical utterances.
        speaker_vectors = T.nn.init.orthogonal_(T.empty(3, 64))
        dummy_input = T.cat(
            [vector.repeat(5, 1, 1).transpose(0, 1) for vector in speaker_vectors]
        )  # num_speaker x num_utterance x dim
        criterion = GE2ELoss(loss_method="softmax")
        loss_value = criterion.forward(dummy_input)
        assert loss_value.item() < 0.005
|
|
|
|
|
2021-05-31 14:37:15 +00:00
|
|
|
|
2020-07-31 04:21:31 +00:00
|
|
|
class AngleProtoLossTests(unittest.TestCase):
    """Sanity checks for ``AngleProtoLoss``."""

    # pylint: disable=R0201
    def test_in_out(self):
        """Loss is non-negative on random/constant input and ~0 on orthogonal speakers."""
        # Uniform-random embeddings, then constant all-ones embeddings; both
        # are laid out as num_speaker x num_utterance x dim.
        for dummy_input in (T.rand(4, 5, 64), T.ones(4, 5, 64)):
            criterion = AngleProtoLoss()
            loss_value = criterion.forward(dummy_input)
            assert loss_value.item() >= 0.0

        # Speaker loss with orthogonal d-vectors: build three mutually
        # orthogonal vectors and give every speaker five identical utterances.
        speaker_vectors = T.nn.init.orthogonal_(T.empty(3, 64))
        dummy_input = T.cat(
            [vector.repeat(5, 1, 1).transpose(0, 1) for vector in speaker_vectors]
        )  # num_speaker x num_utterance x dim
        criterion = AngleProtoLoss()
        loss_value = criterion.forward(dummy_input)
        assert loss_value.item() < 0.005
|
2021-05-26 23:35:58 +00:00
|
|
|
|
2021-05-31 14:37:15 +00:00
|
|
|
|
2021-05-26 23:35:58 +00:00
|
|
|
class SoftmaxAngleProtoLossTests(unittest.TestCase):
    """Non-negativity checks for ``SoftmaxAngleProtoLoss``."""

    # pylint: disable=R0201
    def test_in_out(self):
        """Loss must be non-negative for random and for constant all-ones embeddings."""
        embedding_dim, num_speakers, batch_size = 64, 5, 4
        # Embeddings are batch_size x num_speakers x embedding_dim; one integer
        # class label in [0, num_speakers) is drawn per embedding vector.
        input_shape = (batch_size, num_speakers, embedding_dim)
        dummy_label = T.randint(low=0, high=num_speakers, size=input_shape[:2])

        # Uniform-random input first, then constant all-ones input; a fresh
        # loss module is created for each case.
        for dummy_input in (T.rand(*input_shape), T.ones(*input_shape)):
            criterion = SoftmaxAngleProtoLoss(embedding_dim=embedding_dim, n_speakers=num_speakers)
            loss_value = criterion.forward(dummy_input, dummy_label)
            assert loss_value.item() >= 0.0
|