TTS/tests/tts_tests/test_tacotron_layers.py

215 lines
8.3 KiB
Python
Raw Normal View History

2018-02-08 18:10:11 +00:00
import unittest
2021-04-12 09:47:39 +00:00
2018-02-08 18:10:11 +00:00
import torch as T
2020-10-28 14:24:18 +00:00
from TTS.tts.layers.losses import L1LossMasked, SSIMLoss
2021-04-12 09:47:39 +00:00
from TTS.tts.layers.tacotron.tacotron import CBHG, Decoder, Encoder, Prenet
2021-05-26 07:53:27 +00:00
from TTS.tts.utils.data import sequence_mask
2018-02-08 18:10:11 +00:00
# pylint: disable=unused-variable
2018-02-08 18:10:11 +00:00
class PrenetTests(unittest.TestCase):
    """Shape check for the Tacotron ``Prenet`` layer."""

    def test_in_out(self):  # pylint: disable=no-self-use
        # A random (4, 128) batch pushed through
        # Prenet(128, out_features=[256, 128]) must come back with
        # size 4 on axis 0 and size 128 on axis 1.
        prenet = Prenet(128, out_features=[256, 128])
        batch = T.rand(4, 128)

        print(prenet)
        result = prenet(batch)
        for axis, expected in enumerate((4, 128)):
            assert result.shape[axis] == expected
2018-02-08 18:10:11 +00:00
class CBHGTests(unittest.TestCase):
    """Shape check for the Tacotron ``CBHG`` module."""

    def test_in_out(self):
        # pylint: disable=attribute-defined-outside-init
        # The module is also kept on the test instance as ``self.cbhg``.
        self.cbhg = CBHG(
            128,
            K=8,
            conv_bank_features=80,
            conv_projections=[160, 128],
            highway_features=80,
            gru_features=80,
            num_highways=4,
        )
        cbhg = self.cbhg
        # B x D x T
        features = T.rand(4, 128, 8)

        print(cbhg)
        result = cbhg(features)
        # Expected sizes on axes 0, 1 and 2 respectively.
        # NOTE(review): 160 is presumably 2 * gru_features — confirm
        # against the CBHG implementation.
        for axis, expected in enumerate((4, 8, 160)):
            assert result.shape[axis] == expected
2018-02-08 18:10:11 +00:00
class DecoderTests(unittest.TestCase):
    """Shape check for the Tacotron ``Decoder``."""

    @staticmethod
    def test_in_out():
        # Constructor arguments gathered in one place, then expanded as
        # keyword arguments.
        decoder_kwargs = {
            "in_channels": 256,
            "frame_channels": 80,
            "r": 2,
            "memory_size": 4,
            "attn_windowing": False,
            "attn_norm": "sigmoid",
            "attn_K": 5,
            "attn_type": "original",
            "prenet_type": "original",
            "prenet_dropout": True,
            "forward_attn": True,
            "trans_agent": True,
            "forward_attn_mask": True,
            "location_attn": True,
            "separate_stopnet": True,
        }
        decoder = Decoder(**decoder_kwargs)
        inputs = T.rand(4, 8, 256)
        memory = T.rand(4, 2, 80)

        # The decoder call is expected to return exactly three values; the
        # middle one is not inspected here.
        output, _alignment, stop_tokens = decoder(inputs, memory, mask=None)

        assert output.shape[0] == 4
        assert output.shape[1] == 80, "size not {}".format(output.shape[1])
        assert output.shape[2] == 2, "size not {}".format(output.shape[2])
        assert stop_tokens.shape[0] == 4
2018-04-03 10:24:57 +00:00
2021-04-12 09:47:39 +00:00
2018-02-13 16:08:23 +00:00
class EncoderTests(unittest.TestCase):
    """Shape check for the Tacotron ``Encoder``."""

    def test_in_out(self):  # pylint: disable=no-self-use
        encoder = Encoder(128)
        batch = T.rand(4, 8, 128)

        print(encoder)
        encoded = encoder(batch)
        print(encoded.shape)
        # Expected sizes on axes 0, 1 and 2; the last one is 128 * 2 (BiRNN).
        for axis, expected in enumerate((4, 8, 256)):
            assert encoded.shape[axis] == expected
2018-04-03 10:24:57 +00:00
2018-02-13 16:08:23 +00:00
2018-03-25 02:22:45 +00:00
class L1LossMaskedTests(unittest.TestCase):
    """Value checks for ``L1LossMasked`` with ``seq_len_norm`` off and on."""

    def test_in_out(self):  # pylint: disable=no-self-use
        def full_lengths():
            # Four sequences, each of length 8.
            return (T.ones(4) * 8).long()

        def ragged_lengths():
            # Four sequences of lengths 5, 6, 7 and 8.
            return (T.arange(5, 9)).long()

        def padding_offset(lengths):
            # Offset added to the input to disturb padded positions.
            # Assuming sequence_mask is 1 inside a sequence and 0 on padding,
            # this is 0 inside and -100 on padding — confirm against
            # sequence_mask. unsqueeze(2) appends a trailing axis.
            return ((sequence_mask(lengths).float() - 1.0) * 100.0).unsqueeze(2)

        # ---- seq_len_norm = False ----
        criterion = L1LossMasked(seq_len_norm=False)

        # test input == target
        inputs = T.ones(4, 8, 128).float()
        targets = T.ones(4, 8, 128).float()
        lengths = full_lengths()
        loss = criterion(inputs, targets, lengths)
        assert loss.item() == 0.0

        # test input != target
        inputs = T.ones(4, 8, 128).float()
        targets = T.zeros(4, 8, 128).float()
        lengths = full_lengths()
        loss = criterion(inputs, targets, lengths)
        assert loss.item() == 1.0, "1.0 vs {}".format(loss.item())

        # test if padded values of input makes any difference
        inputs = T.ones(4, 8, 128).float()
        targets = T.zeros(4, 8, 128).float()
        lengths = ragged_lengths()
        offset = padding_offset(lengths)
        loss = criterion(inputs + offset, targets, lengths)
        assert loss.item() == 1.0, "1.0 vs {}".format(loss.item())

        inputs = T.rand(4, 8, 128).float()
        targets = inputs.detach()
        lengths = ragged_lengths()
        offset = padding_offset(lengths)
        loss = criterion(inputs + offset, targets, lengths)
        assert loss.item() == 0, "0 vs {}".format(loss.item())

        # ---- seq_len_norm = True ----
        criterion = L1LossMasked(seq_len_norm=True)

        # test input == target
        inputs = T.ones(4, 8, 128).float()
        targets = T.ones(4, 8, 128).float()
        lengths = full_lengths()
        loss = criterion(inputs, targets, lengths)
        assert loss.item() == 0.0

        # test input != target
        inputs = T.ones(4, 8, 128).float()
        targets = T.zeros(4, 8, 128).float()
        lengths = full_lengths()
        loss = criterion(inputs, targets, lengths)
        assert loss.item() == 1.0, "1.0 vs {}".format(loss.item())

        # test if padded values of input makes any difference
        # (this case is compared with a 1e-5 tolerance, not exactly)
        inputs = T.ones(4, 8, 128).float()
        targets = T.zeros(4, 8, 128).float()
        lengths = ragged_lengths()
        offset = padding_offset(lengths)
        loss = criterion(inputs + offset, targets, lengths)
        assert abs(loss.item() - 1.0) < 1e-5, "1.0 vs {}".format(loss.item())

        inputs = T.rand(4, 8, 128).float()
        targets = inputs.detach()
        lengths = ragged_lengths()
        offset = padding_offset(lengths)
        loss = criterion(inputs + offset, targets, lengths)
        assert loss.item() == 0, "0 vs {}".format(loss.item())
2020-10-28 14:24:18 +00:00
class SSIMLossTests(unittest.TestCase):
2021-04-12 09:47:39 +00:00
def test_in_out(self): # pylint: disable=no-self-use
2020-10-28 14:24:18 +00:00
# test input == target
layer = SSIMLoss()
dummy_input = T.ones(4, 8, 128).float()
dummy_target = T.ones(4, 8, 128).float()
dummy_length = (T.ones(4) * 8).long()
output = layer(dummy_input, dummy_target, dummy_length)
assert output.item() == 0.0
# test input != target
dummy_input = T.ones(4, 8, 128).float()
dummy_target = T.zeros(4, 8, 128).float()
dummy_length = (T.ones(4) * 8).long()
output = layer(dummy_input, dummy_target, dummy_length)
2021-03-08 04:06:54 +00:00
assert abs(output.item() - 1.0) < 1e-4, "1.0 vs {}".format(output.item())
2020-10-28 14:24:18 +00:00
# test if padded values of input makes any difference
dummy_input = T.ones(4, 8, 128).float()
dummy_target = T.zeros(4, 8, 128).float()
dummy_length = (T.arange(5, 9)).long()
2021-04-12 09:47:39 +00:00
mask = ((sequence_mask(dummy_length).float() - 1.0) * 100.0).unsqueeze(2)
2020-10-28 14:24:18 +00:00
output = layer(dummy_input + mask, dummy_target, dummy_length)
assert abs(output.item() - 1.0) < 1e-4, "1.0 vs {}".format(output.item())
dummy_input = T.rand(4, 8, 128).float()
dummy_target = dummy_input.detach()
dummy_length = (T.arange(5, 9)).long()
2021-04-12 09:47:39 +00:00
mask = ((sequence_mask(dummy_length).float() - 1.0) * 100.0).unsqueeze(2)
2020-10-28 14:24:18 +00:00
output = layer(dummy_input + mask, dummy_target, dummy_length)
assert output.item() == 0, "0 vs {}".format(output.item())
# seq_len_norm = True
# test input == target
layer = L1LossMasked(seq_len_norm=True)
dummy_input = T.ones(4, 8, 128).float()
dummy_target = T.ones(4, 8, 128).float()
dummy_length = (T.ones(4) * 8).long()
output = layer(dummy_input, dummy_target, dummy_length)
assert output.item() == 0.0
# test input != target
dummy_input = T.ones(4, 8, 128).float()
dummy_target = T.zeros(4, 8, 128).float()
dummy_length = (T.ones(4) * 8).long()
output = layer(dummy_input, dummy_target, dummy_length)
assert output.item() == 1.0, "1.0 vs {}".format(output.item())
# test if padded values of input makes any difference
dummy_input = T.ones(4, 8, 128).float()
dummy_target = T.zeros(4, 8, 128).float()
dummy_length = (T.arange(5, 9)).long()
2021-04-12 09:47:39 +00:00
mask = ((sequence_mask(dummy_length).float() - 1.0) * 100.0).unsqueeze(2)
2020-10-28 14:24:18 +00:00
output = layer(dummy_input + mask, dummy_target, dummy_length)
assert abs(output.item() - 1.0) < 1e-5, "1.0 vs {}".format(output.item())
dummy_input = T.rand(4, 8, 128).float()
dummy_target = dummy_input.detach()
dummy_length = (T.arange(5, 9)).long()
2021-04-12 09:47:39 +00:00
mask = ((sequence_mask(dummy_length).float() - 1.0) * 100.0).unsqueeze(2)
2020-10-28 14:24:18 +00:00
output = layer(dummy_input + mask, dummy_target, dummy_length)
assert output.item() == 0, "0 vs {}".format(output.item())