mirror of https://github.com/coqui-ai/TTS.git
Update tests for the new trainer API
parent
fcfd95669a
commit
626c9d41e6
|
@ -3,8 +3,7 @@ import unittest
|
|||
|
||||
from TTS.config import load_config
|
||||
from TTS.tts.models import setup_model
|
||||
from TTS.tts.utils.io import save_checkpoint
|
||||
from TTS.tts.utils.text.symbols import make_symbols, phonemes, symbols
|
||||
from TTS.utils.io import save_checkpoint
|
||||
from TTS.utils.synthesizer import Synthesizer
|
||||
|
||||
from .. import get_tests_output_path
|
||||
|
@ -14,15 +13,10 @@ class SynthesizerTest(unittest.TestCase):
|
|||
# pylint: disable=R0201
|
||||
def _create_random_model(self):
|
||||
# pylint: disable=global-statement
|
||||
global symbols, phonemes
|
||||
config = load_config(os.path.join(get_tests_output_path(), "dummy_model_config.json"))
|
||||
if config.has("characters") and config.characters:
|
||||
symbols, phonemes = make_symbols(**config.characters.to_dict())
|
||||
|
||||
num_chars = len(phonemes) if config.use_phonemes else len(symbols)
|
||||
model = setup_model(num_chars, 0, config)
|
||||
model = setup_model(config)
|
||||
output_path = os.path.join(get_tests_output_path())
|
||||
save_checkpoint(model, None, 10, 10, 1, output_path, None)
|
||||
save_checkpoint(config, model, None, None, 10, 1, output_path)
|
||||
|
||||
def test_in_out(self):
|
||||
self._create_random_model()
|
||||
|
|
|
@ -6,7 +6,6 @@ import torch
|
|||
from tests import get_tests_input_path, get_tests_output_path, run_cli
|
||||
from TTS.config import load_config
|
||||
from TTS.tts.models import setup_model
|
||||
from TTS.tts.utils.text.symbols import phonemes, symbols
|
||||
|
||||
torch.manual_seed(1)
|
||||
|
||||
|
@ -21,8 +20,7 @@ class TestExtractTTSSpectrograms(unittest.TestCase):
|
|||
# load config
|
||||
c = load_config(config_path)
|
||||
# create model
|
||||
num_chars = len(phonemes if c.use_phonemes else symbols)
|
||||
model = setup_model(num_chars, 1, c, d_vector_dim=None)
|
||||
model = setup_model(c)
|
||||
# save model
|
||||
torch.save({"model": model.state_dict()}, checkpoint_path)
|
||||
# run test
|
||||
|
@ -40,8 +38,7 @@ class TestExtractTTSSpectrograms(unittest.TestCase):
|
|||
# load config
|
||||
c = load_config(config_path)
|
||||
# create model
|
||||
num_chars = len(phonemes if c.use_phonemes else symbols)
|
||||
model = setup_model(num_chars, 1, c, d_vector_dim=None)
|
||||
model = setup_model(c)
|
||||
# save model
|
||||
torch.save({"model": model.state_dict()}, checkpoint_path)
|
||||
# run test
|
||||
|
@ -59,8 +56,7 @@ class TestExtractTTSSpectrograms(unittest.TestCase):
|
|||
# load config
|
||||
c = load_config(config_path)
|
||||
# create model
|
||||
num_chars = len(phonemes if c.use_phonemes else symbols)
|
||||
model = setup_model(num_chars, 1, c, d_vector_dim=None)
|
||||
model = setup_model(c)
|
||||
# save model
|
||||
torch.save({"model": model.state_dict()}, checkpoint_path)
|
||||
# run test
|
||||
|
|
|
@ -13,7 +13,7 @@ config = AlignTTSConfig(
|
|||
batch_size=8,
|
||||
eval_batch_size=8,
|
||||
num_loader_workers=0,
|
||||
num_val_loader_workers=0,
|
||||
num_eval_loader_workers=0,
|
||||
text_cleaner="english_cleaners",
|
||||
use_phonemes=False,
|
||||
phoneme_language="en-us",
|
||||
|
|
|
@ -41,64 +41,11 @@ class GlowTTSTrainTest(unittest.TestCase):
|
|||
criterion = GlowTTSLoss()
|
||||
|
||||
# model to train
|
||||
model = GlowTTS(
|
||||
num_chars=32,
|
||||
hidden_channels_enc=48,
|
||||
hidden_channels_dec=48,
|
||||
hidden_channels_dp=32,
|
||||
out_channels=80,
|
||||
encoder_type="rel_pos_transformer",
|
||||
encoder_params={
|
||||
"kernel_size": 3,
|
||||
"dropout_p": 0.1,
|
||||
"num_layers": 6,
|
||||
"num_heads": 2,
|
||||
"hidden_channels_ffn": 16, # 4 times the hidden_channels
|
||||
"input_length": None,
|
||||
},
|
||||
use_encoder_prenet=True,
|
||||
num_flow_blocks_dec=12,
|
||||
kernel_size_dec=5,
|
||||
dilation_rate=1,
|
||||
num_block_layers=4,
|
||||
dropout_p_dec=0.0,
|
||||
num_speakers=0,
|
||||
c_in_channels=0,
|
||||
num_splits=4,
|
||||
num_squeeze=1,
|
||||
sigmoid_scale=False,
|
||||
mean_only=False,
|
||||
).to(device)
|
||||
config = GlowTTSConfig(num_chars=32)
|
||||
model = GlowTTS(config).to(device)
|
||||
|
||||
# reference model to compare model weights
|
||||
model_ref = GlowTTS(
|
||||
num_chars=32,
|
||||
hidden_channels_enc=48,
|
||||
hidden_channels_dec=48,
|
||||
hidden_channels_dp=32,
|
||||
out_channels=80,
|
||||
encoder_type="rel_pos_transformer",
|
||||
encoder_params={
|
||||
"kernel_size": 3,
|
||||
"dropout_p": 0.1,
|
||||
"num_layers": 6,
|
||||
"num_heads": 2,
|
||||
"hidden_channels_ffn": 16, # 4 times the hidden_channels
|
||||
"input_length": None,
|
||||
},
|
||||
use_encoder_prenet=True,
|
||||
num_flow_blocks_dec=12,
|
||||
kernel_size_dec=5,
|
||||
dilation_rate=1,
|
||||
num_block_layers=4,
|
||||
dropout_p_dec=0.0,
|
||||
num_speakers=0,
|
||||
c_in_channels=0,
|
||||
num_splits=4,
|
||||
num_squeeze=1,
|
||||
sigmoid_scale=False,
|
||||
mean_only=False,
|
||||
).to(device)
|
||||
model_ref = GlowTTS(config).to(device)
|
||||
|
||||
model.train()
|
||||
print(" > Num parameters for GlowTTS model:%s" % (count_parameters(model)))
|
||||
|
@ -149,34 +96,8 @@ class GlowTTSInferenceTest(unittest.TestCase):
|
|||
speaker_ids = torch.randint(0, 5, (8,)).long().to(device)
|
||||
|
||||
# create model
|
||||
model = GlowTTS(
|
||||
num_chars=32,
|
||||
hidden_channels_enc=48,
|
||||
hidden_channels_dec=48,
|
||||
hidden_channels_dp=32,
|
||||
out_channels=80,
|
||||
encoder_type="rel_pos_transformer",
|
||||
encoder_params={
|
||||
"kernel_size": 3,
|
||||
"dropout_p": 0.1,
|
||||
"num_layers": 6,
|
||||
"num_heads": 2,
|
||||
"hidden_channels_ffn": 16, # 4 times the hidden_channels
|
||||
"input_length": None,
|
||||
},
|
||||
use_encoder_prenet=True,
|
||||
num_flow_blocks_dec=12,
|
||||
kernel_size_dec=5,
|
||||
dilation_rate=1,
|
||||
num_block_layers=4,
|
||||
dropout_p_dec=0.0,
|
||||
num_speakers=0,
|
||||
c_in_channels=0,
|
||||
num_splits=4,
|
||||
num_squeeze=1,
|
||||
sigmoid_scale=False,
|
||||
mean_only=False,
|
||||
).to(device)
|
||||
config = GlowTTSConfig(num_chars=32)
|
||||
model = GlowTTS(config).to(device)
|
||||
|
||||
model.eval()
|
||||
print(" > Num parameters for GlowTTS model:%s" % (count_parameters(model)))
|
||||
|
|
|
@ -13,7 +13,7 @@ config = GlowTTSConfig(
|
|||
batch_size=8,
|
||||
eval_batch_size=8,
|
||||
num_loader_workers=0,
|
||||
num_val_loader_workers=0,
|
||||
num_eval_loader_workers=0,
|
||||
text_cleaner="english_cleaners",
|
||||
use_phonemes=True,
|
||||
use_espeak_phonemes=True,
|
||||
|
|
|
@ -1,7 +1,8 @@
|
|||
import torch
|
||||
|
||||
from TTS.tts.configs import SpeedySpeechConfig
|
||||
from TTS.tts.layers.feed_forward.duration_predictor import DurationPredictor
|
||||
from TTS.tts.models.speedy_speech import SpeedySpeech
|
||||
from TTS.tts.models.speedy_speech import SpeedySpeech, SpeedySpeechArgs
|
||||
from TTS.tts.utils.data import sequence_mask
|
||||
|
||||
use_cuda = torch.cuda.is_available()
|
||||
|
@ -40,7 +41,8 @@ def test_speedy_speech():
|
|||
|
||||
y_lengths = durations.sum(1)
|
||||
|
||||
model = SpeedySpeech(num_chars, out_channels=80, hidden_channels=128)
|
||||
config = SpeedySpeechConfig(model_args=SpeedySpeechArgs(num_chars=num_chars, out_channels=80, hidden_channels=128))
|
||||
model = SpeedySpeech(config)
|
||||
if use_cuda:
|
||||
model.cuda()
|
||||
|
||||
|
@ -55,7 +57,12 @@ def test_speedy_speech():
|
|||
assert list(o_dr.shape) == [B, T_en]
|
||||
|
||||
# with speaker embedding
|
||||
model = SpeedySpeech(num_chars, out_channels=80, hidden_channels=128, num_speakers=10, c_in_channels=256).to(device)
|
||||
config = SpeedySpeechConfig(
|
||||
model_args=SpeedySpeechArgs(
|
||||
num_chars=num_chars, out_channels=80, hidden_channels=128, num_speakers=80, d_vector_dim=256
|
||||
)
|
||||
)
|
||||
model = SpeedySpeech(config).to(device)
|
||||
model.forward(
|
||||
x_dummy, x_lengths, y_lengths, durations, aux_input={"d_vectors": torch.randint(0, 10, (B,)).to(device)}
|
||||
)
|
||||
|
@ -68,9 +75,17 @@ def test_speedy_speech():
|
|||
assert list(o_dr.shape) == [B, T_en]
|
||||
|
||||
# with speaker external embedding
|
||||
model = SpeedySpeech(
|
||||
num_chars, out_channels=80, hidden_channels=128, num_speakers=10, external_c=True, c_in_channels=256
|
||||
).to(device)
|
||||
config = SpeedySpeechConfig(
|
||||
model_args=SpeedySpeechArgs(
|
||||
num_chars=num_chars,
|
||||
out_channels=80,
|
||||
hidden_channels=128,
|
||||
num_speakers=10,
|
||||
use_d_vector=True,
|
||||
d_vector_dim=256,
|
||||
)
|
||||
)
|
||||
model = SpeedySpeech(config).to(device)
|
||||
model.forward(x_dummy, x_lengths, y_lengths, durations, aux_input={"d_vectors": torch.rand((B, 256)).to(device)})
|
||||
o_de = outputs["model_outputs"]
|
||||
attn = outputs["alignments"]
|
||||
|
|
|
@ -4,16 +4,18 @@ import shutil
|
|||
|
||||
from tests import get_device_id, get_tests_output_path, run_cli
|
||||
from TTS.tts.configs import SpeedySpeechConfig
|
||||
from TTS.tts.models.speedy_speech import SpeedySpeechArgs
|
||||
|
||||
config_path = os.path.join(get_tests_output_path(), "test_speedy_speech_config.json")
|
||||
output_path = os.path.join(get_tests_output_path(), "train_outputs")
|
||||
|
||||
|
||||
config = SpeedySpeechConfig(
|
||||
model_args=SpeedySpeechArgs(num_chars=50, out_channels=80, hidden_channels=128, num_speakers=0),
|
||||
batch_size=8,
|
||||
eval_batch_size=8,
|
||||
num_loader_workers=0,
|
||||
num_val_loader_workers=0,
|
||||
num_eval_loader_workers=0,
|
||||
text_cleaner="english_cleaners",
|
||||
use_phonemes=True,
|
||||
phoneme_language="en-us",
|
||||
|
|
|
@ -13,7 +13,7 @@ config = Tacotron2Config(
|
|||
batch_size=8,
|
||||
eval_batch_size=8,
|
||||
num_loader_workers=0,
|
||||
num_val_loader_workers=0,
|
||||
num_eval_loader_workers=0,
|
||||
text_cleaner="english_cleaners",
|
||||
use_phonemes=False,
|
||||
phoneme_language="en-us",
|
||||
|
@ -24,11 +24,11 @@ config = Tacotron2Config(
|
|||
print_step=1,
|
||||
print_eval=True,
|
||||
use_speaker_embedding=True,
|
||||
use_external_speaker_embedding_file=True,
|
||||
use_d_vector_file=True,
|
||||
test_sentences=[
|
||||
"Be a voice, not an echo.",
|
||||
],
|
||||
external_speaker_embedding_file="tests/data/ljspeech/speakers.json",
|
||||
d_vector_file="tests/data/ljspeech/speakers.json",
|
||||
max_decoder_steps=50,
|
||||
)
|
||||
|
||||
|
|
|
@ -7,6 +7,7 @@ from torch import nn, optim
|
|||
|
||||
from tests import get_tests_input_path
|
||||
from TTS.tts.configs import Tacotron2Config
|
||||
from TTS.tts.configs.shared_configs import GSTConfig
|
||||
from TTS.tts.layers.losses import MSELossMasked
|
||||
from TTS.tts.models.tacotron2 import Tacotron2
|
||||
from TTS.utils.audio import AudioProcessor
|
||||
|
@ -17,19 +18,20 @@ torch.manual_seed(1)
|
|||
use_cuda = torch.cuda.is_available()
|
||||
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
|
||||
|
||||
c = Tacotron2Config()
|
||||
config_global = Tacotron2Config(num_chars=32, num_speakers=5, out_channels=80, decoder_output_dim=80)
|
||||
|
||||
ap = AudioProcessor(**c.audio)
|
||||
ap = AudioProcessor(**config_global.audio)
|
||||
WAV_FILE = os.path.join(get_tests_input_path(), "example_1.wav")
|
||||
|
||||
|
||||
class TacotronTrainTest(unittest.TestCase):
|
||||
def test_train_step(self): # pylint: disable=no-self-use
|
||||
config = config_global.copy()
|
||||
input_dummy = torch.randint(0, 24, (8, 128)).long().to(device)
|
||||
input_lengths = torch.randint(100, 128, (8,)).long().to(device)
|
||||
input_lengths = torch.sort(input_lengths, descending=True)[0]
|
||||
mel_spec = torch.rand(8, 30, c.audio["num_mels"]).to(device)
|
||||
mel_postnet_spec = torch.rand(8, 30, c.audio["num_mels"]).to(device)
|
||||
mel_spec = torch.rand(8, 30, config.audio["num_mels"]).to(device)
|
||||
mel_postnet_spec = torch.rand(8, 30, config.audio["num_mels"]).to(device)
|
||||
mel_lengths = torch.randint(20, 30, (8,)).long().to(device)
|
||||
mel_lengths[0] = 30
|
||||
stop_targets = torch.zeros(8, 30, 1).float().to(device)
|
||||
|
@ -38,19 +40,19 @@ class TacotronTrainTest(unittest.TestCase):
|
|||
for idx in mel_lengths:
|
||||
stop_targets[:, int(idx.item()) :, 0] = 1.0
|
||||
|
||||
stop_targets = stop_targets.view(input_dummy.shape[0], stop_targets.size(1) // c.r, -1)
|
||||
stop_targets = stop_targets.view(input_dummy.shape[0], stop_targets.size(1) // config.r, -1)
|
||||
stop_targets = (stop_targets.sum(2) > 0.0).unsqueeze(2).float().squeeze()
|
||||
|
||||
criterion = MSELossMasked(seq_len_norm=False).to(device)
|
||||
criterion_st = nn.BCEWithLogitsLoss().to(device)
|
||||
model = Tacotron2(num_chars=24, r=c.r, num_speakers=5).to(device)
|
||||
model = Tacotron2(config).to(device)
|
||||
model.train()
|
||||
model_ref = copy.deepcopy(model)
|
||||
count = 0
|
||||
for param, param_ref in zip(model.parameters(), model_ref.parameters()):
|
||||
assert (param - param_ref).sum() == 0, param
|
||||
count += 1
|
||||
optimizer = optim.Adam(model.parameters(), lr=c.lr)
|
||||
optimizer = optim.Adam(model.parameters(), lr=config.lr)
|
||||
for i in range(5):
|
||||
outputs = model.forward(
|
||||
input_dummy, input_lengths, mel_spec, mel_lengths, aux_input={"speaker_ids": speaker_ids}
|
||||
|
@ -77,11 +79,12 @@ class TacotronTrainTest(unittest.TestCase):
|
|||
class MultiSpeakeTacotronTrainTest(unittest.TestCase):
|
||||
@staticmethod
|
||||
def test_train_step():
|
||||
config = config_global.copy()
|
||||
input_dummy = torch.randint(0, 24, (8, 128)).long().to(device)
|
||||
input_lengths = torch.randint(100, 128, (8,)).long().to(device)
|
||||
input_lengths = torch.sort(input_lengths, descending=True)[0]
|
||||
mel_spec = torch.rand(8, 30, c.audio["num_mels"]).to(device)
|
||||
mel_postnet_spec = torch.rand(8, 30, c.audio["num_mels"]).to(device)
|
||||
mel_spec = torch.rand(8, 30, config.audio["num_mels"]).to(device)
|
||||
mel_postnet_spec = torch.rand(8, 30, config.audio["num_mels"]).to(device)
|
||||
mel_lengths = torch.randint(20, 30, (8,)).long().to(device)
|
||||
mel_lengths[0] = 30
|
||||
stop_targets = torch.zeros(8, 30, 1).float().to(device)
|
||||
|
@ -90,19 +93,20 @@ class MultiSpeakeTacotronTrainTest(unittest.TestCase):
|
|||
for idx in mel_lengths:
|
||||
stop_targets[:, int(idx.item()) :, 0] = 1.0
|
||||
|
||||
stop_targets = stop_targets.view(input_dummy.shape[0], stop_targets.size(1) // c.r, -1)
|
||||
stop_targets = stop_targets.view(input_dummy.shape[0], stop_targets.size(1) // config.r, -1)
|
||||
stop_targets = (stop_targets.sum(2) > 0.0).unsqueeze(2).float().squeeze()
|
||||
|
||||
criterion = MSELossMasked(seq_len_norm=False).to(device)
|
||||
criterion_st = nn.BCEWithLogitsLoss().to(device)
|
||||
model = Tacotron2(num_chars=24, r=c.r, num_speakers=5, d_vector_dim=55).to(device)
|
||||
config.d_vector_dim = 55
|
||||
model = Tacotron2(config).to(device)
|
||||
model.train()
|
||||
model_ref = copy.deepcopy(model)
|
||||
count = 0
|
||||
for param, param_ref in zip(model.parameters(), model_ref.parameters()):
|
||||
assert (param - param_ref).sum() == 0, param
|
||||
count += 1
|
||||
optimizer = optim.Adam(model.parameters(), lr=c.lr)
|
||||
optimizer = optim.Adam(model.parameters(), lr=config.lr)
|
||||
for i in range(5):
|
||||
outputs = model.forward(
|
||||
input_dummy, input_lengths, mel_spec, mel_lengths, aux_input={"d_vectors": speaker_ids}
|
||||
|
@ -130,11 +134,12 @@ class TacotronGSTTrainTest(unittest.TestCase):
|
|||
# pylint: disable=no-self-use
|
||||
def test_train_step(self):
|
||||
# with random gst mel style
|
||||
config = config_global.copy()
|
||||
input_dummy = torch.randint(0, 24, (8, 128)).long().to(device)
|
||||
input_lengths = torch.randint(100, 128, (8,)).long().to(device)
|
||||
input_lengths = torch.sort(input_lengths, descending=True)[0]
|
||||
mel_spec = torch.rand(8, 30, c.audio["num_mels"]).to(device)
|
||||
mel_postnet_spec = torch.rand(8, 30, c.audio["num_mels"]).to(device)
|
||||
mel_spec = torch.rand(8, 30, config.audio["num_mels"]).to(device)
|
||||
mel_postnet_spec = torch.rand(8, 30, config.audio["num_mels"]).to(device)
|
||||
mel_lengths = torch.randint(20, 30, (8,)).long().to(device)
|
||||
mel_lengths[0] = 30
|
||||
stop_targets = torch.zeros(8, 30, 1).float().to(device)
|
||||
|
@ -143,19 +148,21 @@ class TacotronGSTTrainTest(unittest.TestCase):
|
|||
for idx in mel_lengths:
|
||||
stop_targets[:, int(idx.item()) :, 0] = 1.0
|
||||
|
||||
stop_targets = stop_targets.view(input_dummy.shape[0], stop_targets.size(1) // c.r, -1)
|
||||
stop_targets = stop_targets.view(input_dummy.shape[0], stop_targets.size(1) // config.r, -1)
|
||||
stop_targets = (stop_targets.sum(2) > 0.0).unsqueeze(2).float().squeeze()
|
||||
|
||||
criterion = MSELossMasked(seq_len_norm=False).to(device)
|
||||
criterion_st = nn.BCEWithLogitsLoss().to(device)
|
||||
model = Tacotron2(num_chars=24, r=c.r, num_speakers=5, use_gst=True, gst=c.gst).to(device)
|
||||
config.use_gst = True
|
||||
config.gst = GSTConfig()
|
||||
model = Tacotron2(config).to(device)
|
||||
model.train()
|
||||
model_ref = copy.deepcopy(model)
|
||||
count = 0
|
||||
for param, param_ref in zip(model.parameters(), model_ref.parameters()):
|
||||
assert (param - param_ref).sum() == 0, param
|
||||
count += 1
|
||||
optimizer = optim.Adam(model.parameters(), lr=c.lr)
|
||||
optimizer = optim.Adam(model.parameters(), lr=config.lr)
|
||||
for i in range(10):
|
||||
outputs = model.forward(
|
||||
input_dummy, input_lengths, mel_spec, mel_lengths, aux_input={"speaker_ids": speaker_ids}
|
||||
|
@ -190,7 +197,7 @@ class TacotronGSTTrainTest(unittest.TestCase):
|
|||
input_dummy = torch.randint(0, 24, (8, 128)).long().to(device)
|
||||
input_lengths = torch.randint(100, 128, (8,)).long().to(device)
|
||||
input_lengths = torch.sort(input_lengths, descending=True)[0]
|
||||
mel_postnet_spec = torch.rand(8, 30, c.audio["num_mels"]).to(device)
|
||||
mel_postnet_spec = torch.rand(8, 30, config.audio["num_mels"]).to(device)
|
||||
mel_lengths = torch.randint(20, 30, (8,)).long().to(device)
|
||||
mel_lengths[0] = 30
|
||||
stop_targets = torch.zeros(8, 30, 1).float().to(device)
|
||||
|
@ -199,19 +206,19 @@ class TacotronGSTTrainTest(unittest.TestCase):
|
|||
for idx in mel_lengths:
|
||||
stop_targets[:, int(idx.item()) :, 0] = 1.0
|
||||
|
||||
stop_targets = stop_targets.view(input_dummy.shape[0], stop_targets.size(1) // c.r, -1)
|
||||
stop_targets = stop_targets.view(input_dummy.shape[0], stop_targets.size(1) // config.r, -1)
|
||||
stop_targets = (stop_targets.sum(2) > 0.0).unsqueeze(2).float().squeeze()
|
||||
|
||||
criterion = MSELossMasked(seq_len_norm=False).to(device)
|
||||
criterion_st = nn.BCEWithLogitsLoss().to(device)
|
||||
model = Tacotron2(num_chars=24, r=c.r, num_speakers=5, use_gst=True, gst=c.gst).to(device)
|
||||
model = Tacotron2(config).to(device)
|
||||
model.train()
|
||||
model_ref = copy.deepcopy(model)
|
||||
count = 0
|
||||
for param, param_ref in zip(model.parameters(), model_ref.parameters()):
|
||||
assert (param - param_ref).sum() == 0, param
|
||||
count += 1
|
||||
optimizer = optim.Adam(model.parameters(), lr=c.lr)
|
||||
optimizer = optim.Adam(model.parameters(), lr=config.lr)
|
||||
for i in range(10):
|
||||
outputs = model.forward(
|
||||
input_dummy, input_lengths, mel_spec, mel_lengths, aux_input={"speaker_ids": speaker_ids}
|
||||
|
@ -242,11 +249,12 @@ class TacotronGSTTrainTest(unittest.TestCase):
|
|||
class SCGSTMultiSpeakeTacotronTrainTest(unittest.TestCase):
|
||||
@staticmethod
|
||||
def test_train_step():
|
||||
config = config_global.copy()
|
||||
input_dummy = torch.randint(0, 24, (8, 128)).long().to(device)
|
||||
input_lengths = torch.randint(100, 128, (8,)).long().to(device)
|
||||
input_lengths = torch.sort(input_lengths, descending=True)[0]
|
||||
mel_spec = torch.rand(8, 30, c.audio["num_mels"]).to(device)
|
||||
mel_postnet_spec = torch.rand(8, 30, c.audio["num_mels"]).to(device)
|
||||
mel_spec = torch.rand(8, 30, config.audio["num_mels"]).to(device)
|
||||
mel_postnet_spec = torch.rand(8, 30, config.audio["num_mels"]).to(device)
|
||||
mel_lengths = torch.randint(20, 30, (8,)).long().to(device)
|
||||
mel_lengths[0] = 30
|
||||
stop_targets = torch.zeros(8, 30, 1).float().to(device)
|
||||
|
@ -255,18 +263,19 @@ class SCGSTMultiSpeakeTacotronTrainTest(unittest.TestCase):
|
|||
for idx in mel_lengths:
|
||||
stop_targets[:, int(idx.item()) :, 0] = 1.0
|
||||
|
||||
stop_targets = stop_targets.view(input_dummy.shape[0], stop_targets.size(1) // c.r, -1)
|
||||
stop_targets = stop_targets.view(input_dummy.shape[0], stop_targets.size(1) // config.r, -1)
|
||||
stop_targets = (stop_targets.sum(2) > 0.0).unsqueeze(2).float().squeeze()
|
||||
criterion = MSELossMasked(seq_len_norm=False).to(device)
|
||||
criterion_st = nn.BCEWithLogitsLoss().to(device)
|
||||
model = Tacotron2(num_chars=24, r=c.r, num_speakers=5, d_vector_dim=55, use_gst=True, gst=c.gst).to(device)
|
||||
config.d_vector_dim = 55
|
||||
model = Tacotron2(config).to(device)
|
||||
model.train()
|
||||
model_ref = copy.deepcopy(model)
|
||||
count = 0
|
||||
for param, param_ref in zip(model.parameters(), model_ref.parameters()):
|
||||
assert (param - param_ref).sum() == 0, param
|
||||
count += 1
|
||||
optimizer = optim.Adam(model.parameters(), lr=c.lr)
|
||||
optimizer = optim.Adam(model.parameters(), lr=config.lr)
|
||||
for i in range(5):
|
||||
outputs = model.forward(
|
||||
input_dummy, input_lengths, mel_spec, mel_lengths, aux_input={"d_vectors": speaker_embeddings}
|
||||
|
|
|
@ -13,7 +13,7 @@ config = Tacotron2Config(
|
|||
batch_size=8,
|
||||
eval_batch_size=8,
|
||||
num_loader_workers=0,
|
||||
num_val_loader_workers=0,
|
||||
num_eval_loader_workers=0,
|
||||
text_cleaner="english_cleaners",
|
||||
use_phonemes=False,
|
||||
phoneme_language="en-us",
|
||||
|
|
|
@ -110,7 +110,7 @@ class TacotronTFTrainTest(unittest.TestCase):
|
|||
num_chars=24,
|
||||
num_speakers=0,
|
||||
r=3,
|
||||
postnet_output_dim=80,
|
||||
out_channels=80,
|
||||
decoder_output_dim=80,
|
||||
attn_type="original",
|
||||
attn_win=False,
|
||||
|
|
|
@ -13,7 +13,7 @@ config = Tacotron2Config(
|
|||
batch_size=8,
|
||||
eval_batch_size=8,
|
||||
num_loader_workers=0,
|
||||
num_val_loader_workers=0,
|
||||
num_eval_loader_workers=0,
|
||||
text_cleaner="english_cleaners",
|
||||
use_phonemes=False,
|
||||
phoneme_language="en-us",
|
||||
|
|
|
@ -6,7 +6,7 @@ import torch
|
|||
from torch import nn, optim
|
||||
|
||||
from tests import get_tests_input_path
|
||||
from TTS.tts.configs import TacotronConfig
|
||||
from TTS.tts.configs import GSTConfig, TacotronConfig
|
||||
from TTS.tts.layers.losses import L1LossMasked
|
||||
from TTS.tts.models.tacotron import Tacotron
|
||||
from TTS.utils.audio import AudioProcessor
|
||||
|
@ -17,9 +17,9 @@ torch.manual_seed(1)
|
|||
use_cuda = torch.cuda.is_available()
|
||||
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
|
||||
|
||||
c = TacotronConfig()
|
||||
config_global = TacotronConfig(num_chars=32, num_speakers=5, out_channels=513, decoder_output_dim=80)
|
||||
|
||||
ap = AudioProcessor(**c.audio)
|
||||
ap = AudioProcessor(**config_global.audio)
|
||||
WAV_FILE = os.path.join(get_tests_input_path(), "example_1.wav")
|
||||
|
||||
|
||||
|
@ -31,11 +31,12 @@ def count_parameters(model):
|
|||
class TacotronTrainTest(unittest.TestCase):
|
||||
@staticmethod
|
||||
def test_train_step():
|
||||
config = config_global.copy()
|
||||
input_dummy = torch.randint(0, 24, (8, 128)).long().to(device)
|
||||
input_lengths = torch.randint(100, 129, (8,)).long().to(device)
|
||||
input_lengths[-1] = 128
|
||||
mel_spec = torch.rand(8, 30, c.audio["num_mels"]).to(device)
|
||||
linear_spec = torch.rand(8, 30, c.audio["fft_size"]).to(device)
|
||||
mel_spec = torch.rand(8, 30, config.audio["num_mels"]).to(device)
|
||||
linear_spec = torch.rand(8, 30, config.audio["fft_size"] // 2 + 1).to(device)
|
||||
mel_lengths = torch.randint(20, 30, (8,)).long().to(device)
|
||||
mel_lengths[-1] = mel_spec.size(1)
|
||||
stop_targets = torch.zeros(8, 30, 1).float().to(device)
|
||||
|
@ -44,21 +45,12 @@ class TacotronTrainTest(unittest.TestCase):
|
|||
for idx in mel_lengths:
|
||||
stop_targets[:, int(idx.item()) :, 0] = 1.0
|
||||
|
||||
stop_targets = stop_targets.view(input_dummy.shape[0], stop_targets.size(1) // c.r, -1)
|
||||
stop_targets = stop_targets.view(input_dummy.shape[0], stop_targets.size(1) // config.r, -1)
|
||||
stop_targets = (stop_targets.sum(2) > 0.0).unsqueeze(2).float().squeeze()
|
||||
|
||||
criterion = L1LossMasked(seq_len_norm=False).to(device)
|
||||
criterion_st = nn.BCEWithLogitsLoss().to(device)
|
||||
model = Tacotron(
|
||||
num_chars=32,
|
||||
num_speakers=5,
|
||||
postnet_output_dim=c.audio["fft_size"],
|
||||
decoder_output_dim=c.audio["num_mels"],
|
||||
r=c.r,
|
||||
memory_size=c.memory_size,
|
||||
).to(
|
||||
device
|
||||
) # FIXME: missing num_speakers parameter to Tacotron ctor
|
||||
model = Tacotron(config).to(device) # FIXME: missing num_speakers parameter to Tacotron ctor
|
||||
model.train()
|
||||
print(" > Num parameters for Tacotron model:%s" % (count_parameters(model)))
|
||||
model_ref = copy.deepcopy(model)
|
||||
|
@ -66,7 +58,7 @@ class TacotronTrainTest(unittest.TestCase):
|
|||
for param, param_ref in zip(model.parameters(), model_ref.parameters()):
|
||||
assert (param - param_ref).sum() == 0, param
|
||||
count += 1
|
||||
optimizer = optim.Adam(model.parameters(), lr=c.lr)
|
||||
optimizer = optim.Adam(model.parameters(), lr=config.lr)
|
||||
for _ in range(5):
|
||||
outputs = model.forward(
|
||||
input_dummy, input_lengths, mel_spec, mel_lengths, aux_input={"speaker_ids": speaker_ids}
|
||||
|
@ -91,11 +83,12 @@ class TacotronTrainTest(unittest.TestCase):
|
|||
class MultiSpeakeTacotronTrainTest(unittest.TestCase):
|
||||
@staticmethod
|
||||
def test_train_step():
|
||||
config = config_global.copy()
|
||||
input_dummy = torch.randint(0, 24, (8, 128)).long().to(device)
|
||||
input_lengths = torch.randint(100, 129, (8,)).long().to(device)
|
||||
input_lengths[-1] = 128
|
||||
mel_spec = torch.rand(8, 30, c.audio["num_mels"]).to(device)
|
||||
linear_spec = torch.rand(8, 30, c.audio["fft_size"]).to(device)
|
||||
mel_spec = torch.rand(8, 30, config.audio["num_mels"]).to(device)
|
||||
linear_spec = torch.rand(8, 30, config.audio["fft_size"] // 2 + 1).to(device)
|
||||
mel_lengths = torch.randint(20, 30, (8,)).long().to(device)
|
||||
mel_lengths[-1] = mel_spec.size(1)
|
||||
stop_targets = torch.zeros(8, 30, 1).float().to(device)
|
||||
|
@ -104,22 +97,13 @@ class MultiSpeakeTacotronTrainTest(unittest.TestCase):
|
|||
for idx in mel_lengths:
|
||||
stop_targets[:, int(idx.item()) :, 0] = 1.0
|
||||
|
||||
stop_targets = stop_targets.view(input_dummy.shape[0], stop_targets.size(1) // c.r, -1)
|
||||
stop_targets = stop_targets.view(input_dummy.shape[0], stop_targets.size(1) // config.r, -1)
|
||||
stop_targets = (stop_targets.sum(2) > 0.0).unsqueeze(2).float().squeeze()
|
||||
|
||||
criterion = L1LossMasked(seq_len_norm=False).to(device)
|
||||
criterion_st = nn.BCEWithLogitsLoss().to(device)
|
||||
model = Tacotron(
|
||||
num_chars=32,
|
||||
num_speakers=5,
|
||||
postnet_output_dim=c.audio["fft_size"],
|
||||
decoder_output_dim=c.audio["num_mels"],
|
||||
r=c.r,
|
||||
memory_size=c.memory_size,
|
||||
d_vector_dim=55,
|
||||
).to(
|
||||
device
|
||||
) # FIXME: missing num_speakers parameter to Tacotron ctor
|
||||
config.d_vector_dim = 55
|
||||
model = Tacotron(config).to(device) # FIXME: missing num_speakers parameter to Tacotron ctor
|
||||
model.train()
|
||||
print(" > Num parameters for Tacotron model:%s" % (count_parameters(model)))
|
||||
model_ref = copy.deepcopy(model)
|
||||
|
@ -127,7 +111,7 @@ class MultiSpeakeTacotronTrainTest(unittest.TestCase):
|
|||
for param, param_ref in zip(model.parameters(), model_ref.parameters()):
|
||||
assert (param - param_ref).sum() == 0, param
|
||||
count += 1
|
||||
optimizer = optim.Adam(model.parameters(), lr=c.lr)
|
||||
optimizer = optim.Adam(model.parameters(), lr=config.lr)
|
||||
for _ in range(5):
|
||||
outputs = model.forward(
|
||||
input_dummy, input_lengths, mel_spec, mel_lengths, aux_input={"d_vectors": speaker_embeddings}
|
||||
|
@ -152,12 +136,13 @@ class MultiSpeakeTacotronTrainTest(unittest.TestCase):
|
|||
class TacotronGSTTrainTest(unittest.TestCase):
|
||||
@staticmethod
|
||||
def test_train_step():
|
||||
config = config_global.copy()
|
||||
# with random gst mel style
|
||||
input_dummy = torch.randint(0, 24, (8, 128)).long().to(device)
|
||||
input_lengths = torch.randint(100, 129, (8,)).long().to(device)
|
||||
input_lengths[-1] = 128
|
||||
mel_spec = torch.rand(8, 120, c.audio["num_mels"]).to(device)
|
||||
linear_spec = torch.rand(8, 120, c.audio["fft_size"]).to(device)
|
||||
mel_spec = torch.rand(8, 120, config.audio["num_mels"]).to(device)
|
||||
linear_spec = torch.rand(8, 120, config.audio["fft_size"] // 2 + 1).to(device)
|
||||
mel_lengths = torch.randint(20, 120, (8,)).long().to(device)
|
||||
mel_lengths[-1] = 120
|
||||
stop_targets = torch.zeros(8, 120, 1).float().to(device)
|
||||
|
@ -166,23 +151,14 @@ class TacotronGSTTrainTest(unittest.TestCase):
|
|||
for idx in mel_lengths:
|
||||
stop_targets[:, int(idx.item()) :, 0] = 1.0
|
||||
|
||||
stop_targets = stop_targets.view(input_dummy.shape[0], stop_targets.size(1) // c.r, -1)
|
||||
stop_targets = stop_targets.view(input_dummy.shape[0], stop_targets.size(1) // config.r, -1)
|
||||
stop_targets = (stop_targets.sum(2) > 0.0).unsqueeze(2).float().squeeze()
|
||||
|
||||
criterion = L1LossMasked(seq_len_norm=False).to(device)
|
||||
criterion_st = nn.BCEWithLogitsLoss().to(device)
|
||||
model = Tacotron(
|
||||
num_chars=32,
|
||||
num_speakers=5,
|
||||
use_gst=True,
|
||||
gst=c.gst,
|
||||
postnet_output_dim=c.audio["fft_size"],
|
||||
decoder_output_dim=c.audio["num_mels"],
|
||||
r=c.r,
|
||||
memory_size=c.memory_size,
|
||||
).to(
|
||||
device
|
||||
) # FIXME: missing num_speakers parameter to Tacotron ctor
|
||||
config.use_gst = True
|
||||
config.gst = GSTConfig()
|
||||
model = Tacotron(config).to(device) # FIXME: missing num_speakers parameter to Tacotron ctor
|
||||
model.train()
|
||||
# print(model)
|
||||
print(" > Num parameters for Tacotron GST model:%s" % (count_parameters(model)))
|
||||
|
@ -191,7 +167,7 @@ class TacotronGSTTrainTest(unittest.TestCase):
|
|||
for param, param_ref in zip(model.parameters(), model_ref.parameters()):
|
||||
assert (param - param_ref).sum() == 0, param
|
||||
count += 1
|
||||
optimizer = optim.Adam(model.parameters(), lr=c.lr)
|
||||
optimizer = optim.Adam(model.parameters(), lr=config.lr)
|
||||
for _ in range(10):
|
||||
outputs = model.forward(
|
||||
input_dummy, input_lengths, mel_spec, mel_lengths, aux_input={"speaker_ids": speaker_ids}
|
||||
|
@ -220,7 +196,7 @@ class TacotronGSTTrainTest(unittest.TestCase):
|
|||
input_dummy = torch.randint(0, 24, (8, 128)).long().to(device)
|
||||
input_lengths = torch.randint(100, 129, (8,)).long().to(device)
|
||||
input_lengths[-1] = 128
|
||||
linear_spec = torch.rand(8, mel_spec.size(1), c.audio["fft_size"]).to(device)
|
||||
linear_spec = torch.rand(8, mel_spec.size(1), config.audio["fft_size"] // 2 + 1).to(device)
|
||||
mel_lengths = torch.randint(20, mel_spec.size(1), (8,)).long().to(device)
|
||||
mel_lengths[-1] = mel_spec.size(1)
|
||||
stop_targets = torch.zeros(8, mel_spec.size(1), 1).float().to(device)
|
||||
|
@ -229,23 +205,12 @@ class TacotronGSTTrainTest(unittest.TestCase):
|
|||
for idx in mel_lengths:
|
||||
stop_targets[:, int(idx.item()) :, 0] = 1.0
|
||||
|
||||
stop_targets = stop_targets.view(input_dummy.shape[0], stop_targets.size(1) // c.r, -1)
|
||||
stop_targets = stop_targets.view(input_dummy.shape[0], stop_targets.size(1) // config.r, -1)
|
||||
stop_targets = (stop_targets.sum(2) > 0.0).unsqueeze(2).float().squeeze()
|
||||
|
||||
criterion = L1LossMasked(seq_len_norm=False).to(device)
|
||||
criterion_st = nn.BCEWithLogitsLoss().to(device)
|
||||
model = Tacotron(
|
||||
num_chars=32,
|
||||
num_speakers=5,
|
||||
use_gst=True,
|
||||
gst=c.gst,
|
||||
postnet_output_dim=c.audio["fft_size"],
|
||||
decoder_output_dim=c.audio["num_mels"],
|
||||
r=c.r,
|
||||
memory_size=c.memory_size,
|
||||
).to(
|
||||
device
|
||||
) # FIXME: missing num_speakers parameter to Tacotron ctor
|
||||
model = Tacotron(config).to(device) # FIXME: missing num_speakers parameter to Tacotron ctor
|
||||
model.train()
|
||||
# print(model)
|
||||
print(" > Num parameters for Tacotron GST model:%s" % (count_parameters(model)))
|
||||
|
@ -254,7 +219,7 @@ class TacotronGSTTrainTest(unittest.TestCase):
|
|||
for param, param_ref in zip(model.parameters(), model_ref.parameters()):
|
||||
assert (param - param_ref).sum() == 0, param
|
||||
count += 1
|
||||
optimizer = optim.Adam(model.parameters(), lr=c.lr)
|
||||
optimizer = optim.Adam(model.parameters(), lr=config.lr)
|
||||
for _ in range(10):
|
||||
outputs = model.forward(
|
||||
input_dummy, input_lengths, mel_spec, mel_lengths, aux_input={"speaker_ids": speaker_ids}
|
||||
|
@ -278,11 +243,12 @@ class TacotronGSTTrainTest(unittest.TestCase):
|
|||
class SCGSTMultiSpeakeTacotronTrainTest(unittest.TestCase):
|
||||
@staticmethod
|
||||
def test_train_step():
|
||||
config = config_global.copy()
|
||||
input_dummy = torch.randint(0, 24, (8, 128)).long().to(device)
|
||||
input_lengths = torch.randint(100, 129, (8,)).long().to(device)
|
||||
input_lengths[-1] = 128
|
||||
mel_spec = torch.rand(8, 30, c.audio["num_mels"]).to(device)
|
||||
linear_spec = torch.rand(8, 30, c.audio["fft_size"]).to(device)
|
||||
mel_spec = torch.rand(8, 30, config.audio["num_mels"]).to(device)
|
||||
linear_spec = torch.rand(8, 30, config.audio["fft_size"] // 2 + 1).to(device)
|
||||
mel_lengths = torch.randint(20, 30, (8,)).long().to(device)
|
||||
mel_lengths[-1] = mel_spec.size(1)
|
||||
stop_targets = torch.zeros(8, 30, 1).float().to(device)
|
||||
|
@ -291,24 +257,13 @@ class SCGSTMultiSpeakeTacotronTrainTest(unittest.TestCase):
|
|||
for idx in mel_lengths:
|
||||
stop_targets[:, int(idx.item()) :, 0] = 1.0
|
||||
|
||||
stop_targets = stop_targets.view(input_dummy.shape[0], stop_targets.size(1) // c.r, -1)
|
||||
stop_targets = stop_targets.view(input_dummy.shape[0], stop_targets.size(1) // config.r, -1)
|
||||
stop_targets = (stop_targets.sum(2) > 0.0).unsqueeze(2).float().squeeze()
|
||||
|
||||
criterion = L1LossMasked(seq_len_norm=False).to(device)
|
||||
criterion_st = nn.BCEWithLogitsLoss().to(device)
|
||||
model = Tacotron(
|
||||
num_chars=32,
|
||||
num_speakers=5,
|
||||
postnet_output_dim=c.audio["fft_size"],
|
||||
decoder_output_dim=c.audio["num_mels"],
|
||||
use_gst=True,
|
||||
gst=c.gst,
|
||||
r=c.r,
|
||||
memory_size=c.memory_size,
|
||||
d_vector_dim=55,
|
||||
).to(
|
||||
device
|
||||
) # FIXME: missing num_speakers parameter to Tacotron ctor
|
||||
config.d_vector_dim = 55
|
||||
model = Tacotron(config).to(device) # FIXME: missing num_speakers parameter to Tacotron ctor
|
||||
model.train()
|
||||
print(" > Num parameters for Tacotron model:%s" % (count_parameters(model)))
|
||||
model_ref = copy.deepcopy(model)
|
||||
|
@ -316,7 +271,7 @@ class SCGSTMultiSpeakeTacotronTrainTest(unittest.TestCase):
|
|||
for param, param_ref in zip(model.parameters(), model_ref.parameters()):
|
||||
assert (param - param_ref).sum() == 0, param
|
||||
count += 1
|
||||
optimizer = optim.Adam(model.parameters(), lr=c.lr)
|
||||
optimizer = optim.Adam(model.parameters(), lr=config.lr)
|
||||
for _ in range(5):
|
||||
outputs = model.forward(
|
||||
input_dummy, input_lengths, mel_spec, mel_lengths, aux_input={"d_vectors": speaker_embeddings}
|
||||
|
|
|
@ -13,7 +13,7 @@ config = TacotronConfig(
|
|||
batch_size=8,
|
||||
eval_batch_size=8,
|
||||
num_loader_workers=0,
|
||||
num_val_loader_workers=0,
|
||||
num_eval_loader_workers=0,
|
||||
text_cleaner="english_cleaners",
|
||||
use_phonemes=False,
|
||||
phoneme_language="en-us",
|
||||
|
|
|
@ -12,7 +12,7 @@ config = FullbandMelganConfig(
|
|||
batch_size=8,
|
||||
eval_batch_size=8,
|
||||
num_loader_workers=0,
|
||||
num_val_loader_workers=0,
|
||||
num_eval_loader_workers=0,
|
||||
run_eval=True,
|
||||
test_delay_epochs=-1,
|
||||
epochs=1,
|
||||
|
@ -29,9 +29,7 @@ config.audio.trim_db = 60
|
|||
config.save_json(config_path)
|
||||
|
||||
# train the model for one epoch
|
||||
command_train = (
|
||||
f"CUDA_VISIBLE_DEVICES='{get_device_id()}' python TTS/bin/train_vocoder_gan.py --config_path {config_path} "
|
||||
)
|
||||
command_train = f"CUDA_VISIBLE_DEVICES='{get_device_id()}' python TTS/bin/train_vocoder.py --config_path {config_path} "
|
||||
run_cli(command_train)
|
||||
|
||||
# Find latest folder
|
||||
|
@ -39,7 +37,7 @@ continue_path = max(glob.glob(os.path.join(output_path, "*/")), key=os.path.getm
|
|||
|
||||
# restore the model and continue training for one more epoch
|
||||
command_train = (
|
||||
f"CUDA_VISIBLE_DEVICES='{get_device_id()}' python TTS/bin/train_vocoder_gan.py --continue_path {continue_path} "
|
||||
f"CUDA_VISIBLE_DEVICES='{get_device_id()}' python TTS/bin/train_vocoder.py --continue_path {continue_path} "
|
||||
)
|
||||
run_cli(command_train)
|
||||
shutil.rmtree(continue_path)
|
||||
|
|
|
@ -13,7 +13,7 @@ config = HifiganConfig(
|
|||
batch_size=8,
|
||||
eval_batch_size=8,
|
||||
num_loader_workers=0,
|
||||
num_val_loader_workers=0,
|
||||
num_eval_loader_workers=0,
|
||||
run_eval=True,
|
||||
test_delay_epochs=-1,
|
||||
epochs=1,
|
||||
|
@ -29,9 +29,7 @@ config.audio.trim_db = 60
|
|||
config.save_json(config_path)
|
||||
|
||||
# train the model for one epoch
|
||||
command_train = (
|
||||
f"CUDA_VISIBLE_DEVICES='{get_device_id()}' python TTS/bin/train_vocoder_gan.py --config_path {config_path} "
|
||||
)
|
||||
command_train = f"CUDA_VISIBLE_DEVICES='{get_device_id()}' python TTS/bin/train_vocoder.py --config_path {config_path} "
|
||||
run_cli(command_train)
|
||||
|
||||
# Find latest folder
|
||||
|
@ -39,7 +37,7 @@ continue_path = max(glob.glob(os.path.join(output_path, "*/")), key=os.path.getm
|
|||
|
||||
# restore the model and continue training for one more epoch
|
||||
command_train = (
|
||||
f"CUDA_VISIBLE_DEVICES='{get_device_id()}' python TTS/bin/train_vocoder_gan.py --continue_path {continue_path} "
|
||||
f"CUDA_VISIBLE_DEVICES='{get_device_id()}' python TTS/bin/train_vocoder.py --continue_path {continue_path} "
|
||||
)
|
||||
run_cli(command_train)
|
||||
shutil.rmtree(continue_path)
|
||||
|
|
|
@ -12,7 +12,7 @@ config = MelganConfig(
|
|||
batch_size=4,
|
||||
eval_batch_size=4,
|
||||
num_loader_workers=0,
|
||||
num_val_loader_workers=0,
|
||||
num_eval_loader_workers=0,
|
||||
run_eval=True,
|
||||
test_delay_epochs=-1,
|
||||
epochs=1,
|
||||
|
@ -29,9 +29,7 @@ config.audio.trim_db = 60
|
|||
config.save_json(config_path)
|
||||
|
||||
# train the model for one epoch
|
||||
command_train = (
|
||||
f"CUDA_VISIBLE_DEVICES='{get_device_id()}' python TTS/bin/train_vocoder_gan.py --config_path {config_path} "
|
||||
)
|
||||
command_train = f"CUDA_VISIBLE_DEVICES='{get_device_id()}' python TTS/bin/train_vocoder.py --config_path {config_path} "
|
||||
run_cli(command_train)
|
||||
|
||||
# Find latest folder
|
||||
|
@ -39,7 +37,7 @@ continue_path = max(glob.glob(os.path.join(output_path, "*/")), key=os.path.getm
|
|||
|
||||
# restore the model and continue training for one more epoch
|
||||
command_train = (
|
||||
f"CUDA_VISIBLE_DEVICES='{get_device_id()}' python TTS/bin/train_vocoder_gan.py --continue_path {continue_path} "
|
||||
f"CUDA_VISIBLE_DEVICES='{get_device_id()}' python TTS/bin/train_vocoder.py --continue_path {continue_path} "
|
||||
)
|
||||
run_cli(command_train)
|
||||
shutil.rmtree(continue_path)
|
||||
|
|
|
@ -12,7 +12,7 @@ config = MultibandMelganConfig(
|
|||
batch_size=8,
|
||||
eval_batch_size=8,
|
||||
num_loader_workers=0,
|
||||
num_val_loader_workers=0,
|
||||
num_eval_loader_workers=0,
|
||||
run_eval=True,
|
||||
test_delay_epochs=-1,
|
||||
epochs=1,
|
||||
|
@ -30,9 +30,7 @@ config.audio.trim_db = 60
|
|||
config.save_json(config_path)
|
||||
|
||||
# train the model for one epoch
|
||||
command_train = (
|
||||
f"CUDA_VISIBLE_DEVICES='{get_device_id()}' python TTS/bin/train_vocoder_gan.py --config_path {config_path} "
|
||||
)
|
||||
command_train = f"CUDA_VISIBLE_DEVICES='{get_device_id()}' python TTS/bin/train_vocoder.py --config_path {config_path} "
|
||||
run_cli(command_train)
|
||||
|
||||
# Find latest folder
|
||||
|
@ -40,7 +38,7 @@ continue_path = max(glob.glob(os.path.join(output_path, "*/")), key=os.path.getm
|
|||
|
||||
# restore the model and continue training for one more epoch
|
||||
command_train = (
|
||||
f"CUDA_VISIBLE_DEVICES='{get_device_id()}' python TTS/bin/train_vocoder_gan.py --continue_path {continue_path} "
|
||||
f"CUDA_VISIBLE_DEVICES='{get_device_id()}' python TTS/bin/train_vocoder.py --continue_path {continue_path} "
|
||||
)
|
||||
run_cli(command_train)
|
||||
shutil.rmtree(continue_path)
|
||||
|
|
|
@ -12,7 +12,7 @@ config = ParallelWaveganConfig(
|
|||
batch_size=4,
|
||||
eval_batch_size=4,
|
||||
num_loader_workers=0,
|
||||
num_val_loader_workers=0,
|
||||
num_eval_loader_workers=0,
|
||||
run_eval=True,
|
||||
test_delay_epochs=-1,
|
||||
epochs=1,
|
||||
|
@ -28,9 +28,7 @@ config.audio.trim_db = 60
|
|||
config.save_json(config_path)
|
||||
|
||||
# train the model for one epoch
|
||||
command_train = (
|
||||
f"CUDA_VISIBLE_DEVICES='{get_device_id()}' python TTS/bin/train_vocoder_gan.py --config_path {config_path} "
|
||||
)
|
||||
command_train = f"CUDA_VISIBLE_DEVICES='{get_device_id()}' python TTS/bin/train_vocoder.py --config_path {config_path} "
|
||||
run_cli(command_train)
|
||||
|
||||
# Find latest folder
|
||||
|
@ -38,7 +36,7 @@ continue_path = max(glob.glob(os.path.join(output_path, "*/")), key=os.path.getm
|
|||
|
||||
# restore the model and continue training for one more epoch
|
||||
command_train = (
|
||||
f"CUDA_VISIBLE_DEVICES='{get_device_id()}' python TTS/bin/train_vocoder_gan.py --continue_path {continue_path} "
|
||||
f"CUDA_VISIBLE_DEVICES='{get_device_id()}' python TTS/bin/train_vocoder.py --continue_path {continue_path} "
|
||||
)
|
||||
run_cli(command_train)
|
||||
shutil.rmtree(continue_path)
|
||||
|
|
|
@ -3,11 +3,13 @@ import random
|
|||
import numpy as np
|
||||
import torch
|
||||
|
||||
from TTS.vocoder.models.wavernn import WaveRNN
|
||||
from TTS.vocoder.configs import WavernnConfig
|
||||
from TTS.vocoder.models.wavernn import Wavernn, WavernnArgs
|
||||
|
||||
|
||||
def test_wavernn():
|
||||
model = WaveRNN(
|
||||
config = WavernnConfig()
|
||||
config.model_args = WavernnArgs(
|
||||
rnn_dims=512,
|
||||
fc_dims=512,
|
||||
mode=10,
|
||||
|
@ -20,14 +22,30 @@ def test_wavernn():
|
|||
compute_dims=128,
|
||||
res_out_dims=128,
|
||||
num_res_blocks=10,
|
||||
hop_length=256,
|
||||
sample_rate=22050,
|
||||
)
|
||||
config.audio.hop_length = 256
|
||||
config.audio.sample_rate = 2048
|
||||
|
||||
dummy_x = torch.rand((2, 1280))
|
||||
dummy_m = torch.rand((2, 80, 9))
|
||||
y_size = random.randrange(20, 60)
|
||||
dummy_y = torch.rand((80, y_size))
|
||||
|
||||
# mode: mold
|
||||
model = Wavernn(config)
|
||||
output = model(dummy_x, dummy_m)
|
||||
assert np.all(output.shape == (2, 1280, 4 * 256)), output.shape
|
||||
assert np.all(output.shape == (2, 1280, 30)), output.shape
|
||||
|
||||
# mode: gauss
|
||||
config.model_params.mode = "gauss"
|
||||
model = Wavernn(config)
|
||||
output = model(dummy_x, dummy_m)
|
||||
assert np.all(output.shape == (2, 1280, 2)), output.shape
|
||||
|
||||
# mode: quantized
|
||||
config.model_params.mode = 4
|
||||
model = Wavernn(config)
|
||||
output = model(dummy_x, dummy_m)
|
||||
assert np.all(output.shape == (2, 1280, 2 ** 4)), output.shape
|
||||
output = model.inference(dummy_y, True, 5500, 550)
|
||||
assert np.all(output.shape == (256 * (y_size - 1),))
|
||||
|
|
|
@ -4,7 +4,8 @@ import numpy as np
|
|||
import torch
|
||||
from torch import optim
|
||||
|
||||
from TTS.vocoder.models.wavegrad import Wavegrad
|
||||
from TTS.vocoder.configs import WavegradConfig
|
||||
from TTS.vocoder.models.wavegrad import Wavegrad, WavegradArgs
|
||||
|
||||
# pylint: disable=unused-variable
|
||||
|
||||
|
@ -20,19 +21,16 @@ class WavegradTrainTest(unittest.TestCase):
|
|||
mel_spec = torch.rand(8, 80, 20).to(device)
|
||||
|
||||
criterion = torch.nn.L1Loss().to(device)
|
||||
model = Wavegrad(
|
||||
args = WavegradArgs(
|
||||
in_channels=80,
|
||||
out_channels=1,
|
||||
upsample_factors=[5, 5, 3, 2, 2],
|
||||
upsample_dilations=[[1, 2, 1, 2], [1, 2, 1, 2], [1, 2, 4, 8], [1, 2, 4, 8], [1, 2, 4, 8]],
|
||||
)
|
||||
config = WavegradConfig(model_params=args)
|
||||
model = Wavegrad(config)
|
||||
|
||||
model_ref = Wavegrad(
|
||||
in_channels=80,
|
||||
out_channels=1,
|
||||
upsample_factors=[5, 5, 3, 2, 2],
|
||||
upsample_dilations=[[1, 2, 1, 2], [1, 2, 1, 2], [1, 2, 4, 8], [1, 2, 4, 8], [1, 2, 4, 8]],
|
||||
)
|
||||
model_ref = Wavegrad(config)
|
||||
model.train()
|
||||
model.to(device)
|
||||
betas = np.linspace(1e-6, 1e-2, 1000)
|
||||
|
|
|
@ -1,7 +1,8 @@
|
|||
import torch
|
||||
|
||||
from TTS.vocoder.configs import WavegradConfig
|
||||
from TTS.vocoder.layers.wavegrad import DBlock, FiLM, PositionalEncoding, UBlock
|
||||
from TTS.vocoder.models.wavegrad import Wavegrad
|
||||
from TTS.vocoder.models.wavegrad import Wavegrad, WavegradArgs
|
||||
|
||||
|
||||
def test_positional_encoding():
|
||||
|
@ -75,12 +76,14 @@ def test_wavegrad_forward():
|
|||
c = torch.rand(32, 80, 20)
|
||||
noise_scale = torch.rand(32)
|
||||
|
||||
model = Wavegrad(
|
||||
args = WavegradArgs(
|
||||
in_channels=80,
|
||||
out_channels=1,
|
||||
upsample_factors=[5, 5, 3, 2, 2],
|
||||
upsample_dilations=[[1, 2, 1, 2], [1, 2, 1, 2], [1, 2, 4, 8], [1, 2, 4, 8], [1, 2, 4, 8]],
|
||||
)
|
||||
config = WavegradConfig(model_params=args)
|
||||
model = Wavegrad(config)
|
||||
o = model.forward(x, c, noise_scale)
|
||||
|
||||
assert o.shape[0] == 32
|
||||
|
|
|
@ -12,7 +12,7 @@ config = WavegradConfig(
|
|||
batch_size=8,
|
||||
eval_batch_size=8,
|
||||
num_loader_workers=0,
|
||||
num_val_loader_workers=0,
|
||||
num_eval_loader_workers=0,
|
||||
run_eval=True,
|
||||
test_delay_epochs=-1,
|
||||
epochs=1,
|
||||
|
@ -29,15 +29,15 @@ config.audio.trim_db = 60
|
|||
config.save_json(config_path)
|
||||
|
||||
# train the model for one epoch
|
||||
command_train = (
|
||||
f"CUDA_VISIBLE_DEVICES='{get_device_id()}' python TTS/bin/train_vocoder_wavegrad.py --config_path {config_path} "
|
||||
)
|
||||
command_train = f"CUDA_VISIBLE_DEVICES='{get_device_id()}' python TTS/bin/train_vocoder.py --config_path {config_path} "
|
||||
run_cli(command_train)
|
||||
|
||||
# Find latest folder
|
||||
continue_path = max(glob.glob(os.path.join(output_path, "*/")), key=os.path.getmtime)
|
||||
|
||||
# restore the model and continue training for one more epoch
|
||||
command_train = f"CUDA_VISIBLE_DEVICES='{get_device_id()}' python TTS/bin/train_vocoder_wavegrad.py --continue_path {continue_path} "
|
||||
command_train = (
|
||||
f"CUDA_VISIBLE_DEVICES='{get_device_id()}' python TTS/bin/train_vocoder.py --continue_path {continue_path} "
|
||||
)
|
||||
run_cli(command_train)
|
||||
shutil.rmtree(continue_path)
|
||||
|
|
|
@ -4,15 +4,18 @@ import shutil
|
|||
|
||||
from tests import get_device_id, get_tests_output_path, run_cli
|
||||
from TTS.vocoder.configs import WavernnConfig
|
||||
from TTS.vocoder.models.wavernn import WavernnArgs
|
||||
|
||||
config_path = os.path.join(get_tests_output_path(), "test_vocoder_config.json")
|
||||
output_path = os.path.join(get_tests_output_path(), "train_outputs")
|
||||
|
||||
|
||||
config = WavernnConfig(
|
||||
model_params=WavernnArgs(),
|
||||
batch_size=8,
|
||||
eval_batch_size=8,
|
||||
num_loader_workers=0,
|
||||
num_val_loader_workers=0,
|
||||
num_eval_loader_workers=0,
|
||||
run_eval=True,
|
||||
test_delay_epochs=-1,
|
||||
epochs=1,
|
||||
|
@ -28,9 +31,7 @@ config.audio.trim_db = 60
|
|||
config.save_json(config_path)
|
||||
|
||||
# train the model for one epoch
|
||||
command_train = (
|
||||
f"CUDA_VISIBLE_DEVICES='{get_device_id()}' python TTS/bin/train_vocoder_wavernn.py --config_path {config_path} "
|
||||
)
|
||||
command_train = f"CUDA_VISIBLE_DEVICES='{get_device_id()}' python TTS/bin/train_vocoder.py --config_path {config_path} "
|
||||
run_cli(command_train)
|
||||
|
||||
# Find latest folder
|
||||
|
@ -38,7 +39,7 @@ continue_path = max(glob.glob(os.path.join(output_path, "*/")), key=os.path.getm
|
|||
|
||||
# restore the model and continue training for one more epoch
|
||||
command_train = (
|
||||
f"CUDA_VISIBLE_DEVICES='{get_device_id()}' python TTS/bin/train_vocoder_wavernn.py --continue_path {continue_path} "
|
||||
f"CUDA_VISIBLE_DEVICES='{get_device_id()}' python TTS/bin/train_vocoder.py --continue_path {continue_path} "
|
||||
)
|
||||
run_cli(command_train)
|
||||
shutil.rmtree(continue_path)
|
||||
|
|
Loading…
Reference in New Issue