Update tests for the new trainer API

pull/506/head
Eren Gölge 2021-06-18 13:27:19 +02:00
parent fcfd95669a
commit 626c9d41e6
24 changed files with 174 additions and 272 deletions

View File

@@ -3,8 +3,7 @@ import unittest
from TTS.config import load_config
from TTS.tts.models import setup_model
from TTS.tts.utils.io import save_checkpoint
from TTS.tts.utils.text.symbols import make_symbols, phonemes, symbols
from TTS.utils.io import save_checkpoint
from TTS.utils.synthesizer import Synthesizer
from .. import get_tests_output_path
@@ -14,15 +13,10 @@ class SynthesizerTest(unittest.TestCase):
# pylint: disable=R0201
def _create_random_model(self):
# pylint: disable=global-statement
global symbols, phonemes
config = load_config(os.path.join(get_tests_output_path(), "dummy_model_config.json"))
if config.has("characters") and config.characters:
symbols, phonemes = make_symbols(**config.characters.to_dict())
num_chars = len(phonemes) if config.use_phonemes else len(symbols)
model = setup_model(num_chars, 0, config)
model = setup_model(config)
output_path = os.path.join(get_tests_output_path())
save_checkpoint(model, None, 10, 10, 1, output_path, None)
save_checkpoint(config, model, None, None, 10, 1, output_path)
def test_in_out(self):
self._create_random_model()
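
The hunk above is the gist of the new trainer API: `setup_model(config)` resolves the character set and model size from the config itself, so the `make_symbols` bookkeeping disappears, and `save_checkpoint` now takes the config as its first argument. A minimal sketch of the new calls, assuming the two `None` slots hold optimizer and scaler state and the trailing numbers are the test's step/epoch placeholders:

    from TTS.config import load_config
    from TTS.tts.models import setup_model
    from TTS.utils.io import save_checkpoint

    config = load_config("dummy_model_config.json")  # illustrative path
    model = setup_model(config)  # num_chars is read from the config internally
    save_checkpoint(config, model, None, None, 10, 1, "output/")  # config travels with the weights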

View File

@@ -6,7 +6,6 @@ import torch
from tests import get_tests_input_path, get_tests_output_path, run_cli
from TTS.config import load_config
from TTS.tts.models import setup_model
from TTS.tts.utils.text.symbols import phonemes, symbols
torch.manual_seed(1)
@@ -21,8 +20,7 @@ class TestExtractTTSSpectrograms(unittest.TestCase):
# load config
c = load_config(config_path)
# create model
num_chars = len(phonemes if c.use_phonemes else symbols)
model = setup_model(num_chars, 1, c, d_vector_dim=None)
model = setup_model(c)
# save model
torch.save({"model": model.state_dict()}, checkpoint_path)
# run test
@@ -40,8 +38,7 @@ class TestExtractTTSSpectrograms(unittest.TestCase):
# load config
c = load_config(config_path)
# create model
num_chars = len(phonemes if c.use_phonemes else symbols)
model = setup_model(num_chars, 1, c, d_vector_dim=None)
model = setup_model(c)
# save model
torch.save({"model": model.state_dict()}, checkpoint_path)
# run test
@@ -59,8 +56,7 @@ class TestExtractTTSSpectrograms(unittest.TestCase):
# load config
c = load_config(config_path)
# create model
num_chars = len(phonemes if c.use_phonemes else symbols)
model = setup_model(num_chars, 1, c, d_vector_dim=None)
model = setup_model(c)
# save model
torch.save({"model": model.state_dict()}, checkpoint_path)
# run test

View File

@@ -13,7 +13,7 @@ config = AlignTTSConfig(
batch_size=8,
eval_batch_size=8,
num_loader_workers=0,
num_val_loader_workers=0,
num_eval_loader_workers=0,
text_cleaner="english_cleaners",
use_phonemes=False,
phoneme_language="en-us",

View File

@@ -41,64 +41,11 @@ class GlowTTSTrainTest(unittest.TestCase):
criterion = GlowTTSLoss()
# model to train
model = GlowTTS(
num_chars=32,
hidden_channels_enc=48,
hidden_channels_dec=48,
hidden_channels_dp=32,
out_channels=80,
encoder_type="rel_pos_transformer",
encoder_params={
"kernel_size": 3,
"dropout_p": 0.1,
"num_layers": 6,
"num_heads": 2,
"hidden_channels_ffn": 16, # 4 times the hidden_channels
"input_length": None,
},
use_encoder_prenet=True,
num_flow_blocks_dec=12,
kernel_size_dec=5,
dilation_rate=1,
num_block_layers=4,
dropout_p_dec=0.0,
num_speakers=0,
c_in_channels=0,
num_splits=4,
num_squeeze=1,
sigmoid_scale=False,
mean_only=False,
).to(device)
config = GlowTTSConfig(num_chars=32)
model = GlowTTS(config).to(device)
# reference model to compare model weights
model_ref = GlowTTS(
num_chars=32,
hidden_channels_enc=48,
hidden_channels_dec=48,
hidden_channels_dp=32,
out_channels=80,
encoder_type="rel_pos_transformer",
encoder_params={
"kernel_size": 3,
"dropout_p": 0.1,
"num_layers": 6,
"num_heads": 2,
"hidden_channels_ffn": 16, # 4 times the hidden_channels
"input_length": None,
},
use_encoder_prenet=True,
num_flow_blocks_dec=12,
kernel_size_dec=5,
dilation_rate=1,
num_block_layers=4,
dropout_p_dec=0.0,
num_speakers=0,
c_in_channels=0,
num_splits=4,
num_squeeze=1,
sigmoid_scale=False,
mean_only=False,
).to(device)
model_ref = GlowTTS(config).to(device)
model.train()
print(" > Num parameters for GlowTTS model:%s" % (count_parameters(model)))
@@ -149,34 +96,8 @@ class GlowTTSInferenceTest(unittest.TestCase):
speaker_ids = torch.randint(0, 5, (8,)).long().to(device)
# create model
model = GlowTTS(
num_chars=32,
hidden_channels_enc=48,
hidden_channels_dec=48,
hidden_channels_dp=32,
out_channels=80,
encoder_type="rel_pos_transformer",
encoder_params={
"kernel_size": 3,
"dropout_p": 0.1,
"num_layers": 6,
"num_heads": 2,
"hidden_channels_ffn": 16, # 4 times the hidden_channels
"input_length": None,
},
use_encoder_prenet=True,
num_flow_blocks_dec=12,
kernel_size_dec=5,
dilation_rate=1,
num_block_layers=4,
dropout_p_dec=0.0,
num_speakers=0,
c_in_channels=0,
num_splits=4,
num_squeeze=1,
sigmoid_scale=False,
mean_only=False,
).to(device)
config = GlowTTSConfig(num_chars=32)
model = GlowTTS(config).to(device)
model.eval()
print(" > Num parameters for GlowTTS model:%s" % (count_parameters(model)))

View File

@@ -13,7 +13,7 @@ config = GlowTTSConfig(
batch_size=8,
eval_batch_size=8,
num_loader_workers=0,
num_val_loader_workers=0,
num_eval_loader_workers=0,
text_cleaner="english_cleaners",
use_phonemes=True,
use_espeak_phonemes=True,

View File

@@ -1,7 +1,8 @@
import torch
from TTS.tts.configs import SpeedySpeechConfig
from TTS.tts.layers.feed_forward.duration_predictor import DurationPredictor
from TTS.tts.models.speedy_speech import SpeedySpeech
from TTS.tts.models.speedy_speech import SpeedySpeech, SpeedySpeechArgs
from TTS.tts.utils.data import sequence_mask
use_cuda = torch.cuda.is_available()
@@ -40,7 +41,8 @@ def test_speedy_speech():
y_lengths = durations.sum(1)
model = SpeedySpeech(num_chars, out_channels=80, hidden_channels=128)
config = SpeedySpeechConfig(model_args=SpeedySpeechArgs(num_chars=num_chars, out_channels=80, hidden_channels=128))
model = SpeedySpeech(config)
if use_cuda:
model.cuda()
@@ -55,7 +57,12 @@ def test_speedy_speech():
assert list(o_dr.shape) == [B, T_en]
# with speaker embedding
model = SpeedySpeech(num_chars, out_channels=80, hidden_channels=128, num_speakers=10, c_in_channels=256).to(device)
config = SpeedySpeechConfig(
model_args=SpeedySpeechArgs(
num_chars=num_chars, out_channels=80, hidden_channels=128, num_speakers=80, d_vector_dim=256
)
)
model = SpeedySpeech(config).to(device)
model.forward(
x_dummy, x_lengths, y_lengths, durations, aux_input={"d_vectors": torch.randint(0, 10, (B,)).to(device)}
)
@@ -68,9 +75,17 @@ def test_speedy_speech():
assert list(o_dr.shape) == [B, T_en]
# with speaker external embedding
model = SpeedySpeech(
num_chars, out_channels=80, hidden_channels=128, num_speakers=10, external_c=True, c_in_channels=256
).to(device)
config = SpeedySpeechConfig(
model_args=SpeedySpeechArgs(
num_chars=num_chars,
out_channels=80,
hidden_channels=128,
num_speakers=10,
use_d_vector=True,
d_vector_dim=256,
)
)
model = SpeedySpeech(config).to(device)
model.forward(x_dummy, x_lengths, y_lengths, durations, aux_input={"d_vectors": torch.rand((B, 256)).to(device)})
o_de = outputs["model_outputs"]
attn = outputs["alignments"]
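
For SpeedySpeech the architecture options move into a nested `SpeedySpeechArgs` dataclass under `model_args`, and the old `external_c`/`c_in_channels` pair becomes `use_d_vector`/`d_vector_dim`. A sketch of the multi-speaker variant with external d-vectors, mirroring the hunk above (values are the test's placeholders):

    from TTS.tts.configs import SpeedySpeechConfig
    from TTS.tts.models.speedy_speech import SpeedySpeech, SpeedySpeechArgs

    config = SpeedySpeechConfig(
        model_args=SpeedySpeechArgs(
            num_chars=50,        # placeholder vocabulary size
            out_channels=80,
            hidden_channels=128,
            num_speakers=10,
            use_d_vector=True,   # replaces the old external_c flag
            d_vector_dim=256,    # replaces c_in_channels for external embeddings
        )
    )
    model = SpeedySpeech(config)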

View File

@@ -4,16 +4,18 @@ import shutil
from tests import get_device_id, get_tests_output_path, run_cli
from TTS.tts.configs import SpeedySpeechConfig
from TTS.tts.models.speedy_speech import SpeedySpeechArgs
config_path = os.path.join(get_tests_output_path(), "test_speedy_speech_config.json")
output_path = os.path.join(get_tests_output_path(), "train_outputs")
config = SpeedySpeechConfig(
model_args=SpeedySpeechArgs(num_chars=50, out_channels=80, hidden_channels=128, num_speakers=0),
batch_size=8,
eval_batch_size=8,
num_loader_workers=0,
num_val_loader_workers=0,
num_eval_loader_workers=0,
text_cleaner="english_cleaners",
use_phonemes=True,
phoneme_language="en-us",

View File

@@ -13,7 +13,7 @@ config = Tacotron2Config(
batch_size=8,
eval_batch_size=8,
num_loader_workers=0,
num_val_loader_workers=0,
num_eval_loader_workers=0,
text_cleaner="english_cleaners",
use_phonemes=False,
phoneme_language="en-us",
@@ -24,11 +24,11 @@ config = Tacotron2Config(
print_step=1,
print_eval=True,
use_speaker_embedding=True,
use_external_speaker_embedding_file=True,
use_d_vector_file=True,
test_sentences=[
"Be a voice, not an echo.",
],
external_speaker_embedding_file="tests/data/ljspeech/speakers.json",
d_vector_file="tests/data/ljspeech/speakers.json",
max_decoder_steps=50,
)
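
Two renames recur across the recipe configs in this commit: `num_val_loader_workers` becomes `num_eval_loader_workers`, and the external speaker embedding options adopt the d-vector terminology. A condensed sketch showing only the renamed fields:

    from TTS.tts.configs import Tacotron2Config

    config = Tacotron2Config(
        num_eval_loader_workers=0,  # was: num_val_loader_workers
        use_d_vector_file=True,     # was: use_external_speaker_embedding_file
        d_vector_file="tests/data/ljspeech/speakers.json",  # was: external_speaker_embedding_file
    )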

View File

@@ -7,6 +7,7 @@ from torch import nn, optim
from tests import get_tests_input_path
from TTS.tts.configs import Tacotron2Config
from TTS.tts.configs.shared_configs import GSTConfig
from TTS.tts.layers.losses import MSELossMasked
from TTS.tts.models.tacotron2 import Tacotron2
from TTS.utils.audio import AudioProcessor
@@ -17,19 +18,20 @@ torch.manual_seed(1)
use_cuda = torch.cuda.is_available()
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
c = Tacotron2Config()
config_global = Tacotron2Config(num_chars=32, num_speakers=5, out_channels=80, decoder_output_dim=80)
ap = AudioProcessor(**c.audio)
ap = AudioProcessor(**config_global.audio)
WAV_FILE = os.path.join(get_tests_input_path(), "example_1.wav")
class TacotronTrainTest(unittest.TestCase):
def test_train_step(self): # pylint: disable=no-self-use
config = config_global.copy()
input_dummy = torch.randint(0, 24, (8, 128)).long().to(device)
input_lengths = torch.randint(100, 128, (8,)).long().to(device)
input_lengths = torch.sort(input_lengths, descending=True)[0]
mel_spec = torch.rand(8, 30, c.audio["num_mels"]).to(device)
mel_postnet_spec = torch.rand(8, 30, c.audio["num_mels"]).to(device)
mel_spec = torch.rand(8, 30, config.audio["num_mels"]).to(device)
mel_postnet_spec = torch.rand(8, 30, config.audio["num_mels"]).to(device)
mel_lengths = torch.randint(20, 30, (8,)).long().to(device)
mel_lengths[0] = 30
stop_targets = torch.zeros(8, 30, 1).float().to(device)
@@ -38,19 +40,19 @@ class TacotronTrainTest(unittest.TestCase):
for idx in mel_lengths:
stop_targets[:, int(idx.item()) :, 0] = 1.0
stop_targets = stop_targets.view(input_dummy.shape[0], stop_targets.size(1) // c.r, -1)
stop_targets = stop_targets.view(input_dummy.shape[0], stop_targets.size(1) // config.r, -1)
stop_targets = (stop_targets.sum(2) > 0.0).unsqueeze(2).float().squeeze()
criterion = MSELossMasked(seq_len_norm=False).to(device)
criterion_st = nn.BCEWithLogitsLoss().to(device)
model = Tacotron2(num_chars=24, r=c.r, num_speakers=5).to(device)
model = Tacotron2(config).to(device)
model.train()
model_ref = copy.deepcopy(model)
count = 0
for param, param_ref in zip(model.parameters(), model_ref.parameters()):
assert (param - param_ref).sum() == 0, param
count += 1
optimizer = optim.Adam(model.parameters(), lr=c.lr)
optimizer = optim.Adam(model.parameters(), lr=config.lr)
for i in range(5):
outputs = model.forward(
input_dummy, input_lengths, mel_spec, mel_lengths, aux_input={"speaker_ids": speaker_ids}
@@ -77,11 +79,12 @@ class TacotronTrainTest(unittest.TestCase):
class MultiSpeakeTacotronTrainTest(unittest.TestCase):
@staticmethod
def test_train_step():
config = config_global.copy()
input_dummy = torch.randint(0, 24, (8, 128)).long().to(device)
input_lengths = torch.randint(100, 128, (8,)).long().to(device)
input_lengths = torch.sort(input_lengths, descending=True)[0]
mel_spec = torch.rand(8, 30, c.audio["num_mels"]).to(device)
mel_postnet_spec = torch.rand(8, 30, c.audio["num_mels"]).to(device)
mel_spec = torch.rand(8, 30, config.audio["num_mels"]).to(device)
mel_postnet_spec = torch.rand(8, 30, config.audio["num_mels"]).to(device)
mel_lengths = torch.randint(20, 30, (8,)).long().to(device)
mel_lengths[0] = 30
stop_targets = torch.zeros(8, 30, 1).float().to(device)
@@ -90,19 +93,20 @@ class MultiSpeakeTacotronTrainTest(unittest.TestCase):
for idx in mel_lengths:
stop_targets[:, int(idx.item()) :, 0] = 1.0
stop_targets = stop_targets.view(input_dummy.shape[0], stop_targets.size(1) // c.r, -1)
stop_targets = stop_targets.view(input_dummy.shape[0], stop_targets.size(1) // config.r, -1)
stop_targets = (stop_targets.sum(2) > 0.0).unsqueeze(2).float().squeeze()
criterion = MSELossMasked(seq_len_norm=False).to(device)
criterion_st = nn.BCEWithLogitsLoss().to(device)
model = Tacotron2(num_chars=24, r=c.r, num_speakers=5, d_vector_dim=55).to(device)
config.d_vector_dim = 55
model = Tacotron2(config).to(device)
model.train()
model_ref = copy.deepcopy(model)
count = 0
for param, param_ref in zip(model.parameters(), model_ref.parameters()):
assert (param - param_ref).sum() == 0, param
count += 1
optimizer = optim.Adam(model.parameters(), lr=c.lr)
optimizer = optim.Adam(model.parameters(), lr=config.lr)
for i in range(5):
outputs = model.forward(
input_dummy, input_lengths, mel_spec, mel_lengths, aux_input={"d_vectors": speaker_ids}
@@ -130,11 +134,12 @@ class TacotronGSTTrainTest(unittest.TestCase):
# pylint: disable=no-self-use
def test_train_step(self):
# with random gst mel style
config = config_global.copy()
input_dummy = torch.randint(0, 24, (8, 128)).long().to(device)
input_lengths = torch.randint(100, 128, (8,)).long().to(device)
input_lengths = torch.sort(input_lengths, descending=True)[0]
mel_spec = torch.rand(8, 30, c.audio["num_mels"]).to(device)
mel_postnet_spec = torch.rand(8, 30, c.audio["num_mels"]).to(device)
mel_spec = torch.rand(8, 30, config.audio["num_mels"]).to(device)
mel_postnet_spec = torch.rand(8, 30, config.audio["num_mels"]).to(device)
mel_lengths = torch.randint(20, 30, (8,)).long().to(device)
mel_lengths[0] = 30
stop_targets = torch.zeros(8, 30, 1).float().to(device)
@@ -143,19 +148,21 @@ class TacotronGSTTrainTest(unittest.TestCase):
for idx in mel_lengths:
stop_targets[:, int(idx.item()) :, 0] = 1.0
stop_targets = stop_targets.view(input_dummy.shape[0], stop_targets.size(1) // c.r, -1)
stop_targets = stop_targets.view(input_dummy.shape[0], stop_targets.size(1) // config.r, -1)
stop_targets = (stop_targets.sum(2) > 0.0).unsqueeze(2).float().squeeze()
criterion = MSELossMasked(seq_len_norm=False).to(device)
criterion_st = nn.BCEWithLogitsLoss().to(device)
model = Tacotron2(num_chars=24, r=c.r, num_speakers=5, use_gst=True, gst=c.gst).to(device)
config.use_gst = True
config.gst = GSTConfig()
model = Tacotron2(config).to(device)
model.train()
model_ref = copy.deepcopy(model)
count = 0
for param, param_ref in zip(model.parameters(), model_ref.parameters()):
assert (param - param_ref).sum() == 0, param
count += 1
optimizer = optim.Adam(model.parameters(), lr=c.lr)
optimizer = optim.Adam(model.parameters(), lr=config.lr)
for i in range(10):
outputs = model.forward(
input_dummy, input_lengths, mel_spec, mel_lengths, aux_input={"speaker_ids": speaker_ids}
@@ -190,7 +197,7 @@ class TacotronGSTTrainTest(unittest.TestCase):
input_dummy = torch.randint(0, 24, (8, 128)).long().to(device)
input_lengths = torch.randint(100, 128, (8,)).long().to(device)
input_lengths = torch.sort(input_lengths, descending=True)[0]
mel_postnet_spec = torch.rand(8, 30, c.audio["num_mels"]).to(device)
mel_postnet_spec = torch.rand(8, 30, config.audio["num_mels"]).to(device)
mel_lengths = torch.randint(20, 30, (8,)).long().to(device)
mel_lengths[0] = 30
stop_targets = torch.zeros(8, 30, 1).float().to(device)
@@ -199,19 +206,19 @@ class TacotronGSTTrainTest(unittest.TestCase):
for idx in mel_lengths:
stop_targets[:, int(idx.item()) :, 0] = 1.0
stop_targets = stop_targets.view(input_dummy.shape[0], stop_targets.size(1) // c.r, -1)
stop_targets = stop_targets.view(input_dummy.shape[0], stop_targets.size(1) // config.r, -1)
stop_targets = (stop_targets.sum(2) > 0.0).unsqueeze(2).float().squeeze()
criterion = MSELossMasked(seq_len_norm=False).to(device)
criterion_st = nn.BCEWithLogitsLoss().to(device)
model = Tacotron2(num_chars=24, r=c.r, num_speakers=5, use_gst=True, gst=c.gst).to(device)
model = Tacotron2(config).to(device)
model.train()
model_ref = copy.deepcopy(model)
count = 0
for param, param_ref in zip(model.parameters(), model_ref.parameters()):
assert (param - param_ref).sum() == 0, param
count += 1
optimizer = optim.Adam(model.parameters(), lr=c.lr)
optimizer = optim.Adam(model.parameters(), lr=config.lr)
for i in range(10):
outputs = model.forward(
input_dummy, input_lengths, mel_spec, mel_lengths, aux_input={"speaker_ids": speaker_ids}
@@ -242,11 +249,12 @@ class TacotronGSTTrainTest(unittest.TestCase):
class SCGSTMultiSpeakeTacotronTrainTest(unittest.TestCase):
@staticmethod
def test_train_step():
config = config_global.copy()
input_dummy = torch.randint(0, 24, (8, 128)).long().to(device)
input_lengths = torch.randint(100, 128, (8,)).long().to(device)
input_lengths = torch.sort(input_lengths, descending=True)[0]
mel_spec = torch.rand(8, 30, c.audio["num_mels"]).to(device)
mel_postnet_spec = torch.rand(8, 30, c.audio["num_mels"]).to(device)
mel_spec = torch.rand(8, 30, config.audio["num_mels"]).to(device)
mel_postnet_spec = torch.rand(8, 30, config.audio["num_mels"]).to(device)
mel_lengths = torch.randint(20, 30, (8,)).long().to(device)
mel_lengths[0] = 30
stop_targets = torch.zeros(8, 30, 1).float().to(device)
@@ -255,18 +263,19 @@ class SCGSTMultiSpeakeTacotronTrainTest(unittest.TestCase):
for idx in mel_lengths:
stop_targets[:, int(idx.item()) :, 0] = 1.0
stop_targets = stop_targets.view(input_dummy.shape[0], stop_targets.size(1) // c.r, -1)
stop_targets = stop_targets.view(input_dummy.shape[0], stop_targets.size(1) // config.r, -1)
stop_targets = (stop_targets.sum(2) > 0.0).unsqueeze(2).float().squeeze()
criterion = MSELossMasked(seq_len_norm=False).to(device)
criterion_st = nn.BCEWithLogitsLoss().to(device)
model = Tacotron2(num_chars=24, r=c.r, num_speakers=5, d_vector_dim=55, use_gst=True, gst=c.gst).to(device)
config.d_vector_dim = 55
model = Tacotron2(config).to(device)
model.train()
model_ref = copy.deepcopy(model)
count = 0
for param, param_ref in zip(model.parameters(), model_ref.parameters()):
assert (param - param_ref).sum() == 0, param
count += 1
optimizer = optim.Adam(model.parameters(), lr=c.lr)
optimizer = optim.Adam(model.parameters(), lr=config.lr)
for i in range(5):
outputs = model.forward(
input_dummy, input_lengths, mel_spec, mel_lengths, aux_input={"d_vectors": speaker_embeddings}
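
The Tacotron2 unit tests now share a module-level `config_global` and each test works on its own copy, so per-test variants (GST, d-vectors) become config mutations rather than constructor arguments. A sketch of the pattern with a hypothetical helper:

    from TTS.tts.configs import Tacotron2Config
    from TTS.tts.configs.shared_configs import GSTConfig
    from TTS.tts.models.tacotron2 import Tacotron2

    config_global = Tacotron2Config(num_chars=32, num_speakers=5, out_channels=80, decoder_output_dim=80)

    def make_gst_model():  # hypothetical helper, for illustration only
        config = config_global.copy()  # each test mutates its own copy
        config.use_gst = True
        config.gst = GSTConfig()
        return Tacotron2(config)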

View File

@@ -13,7 +13,7 @@ config = Tacotron2Config(
batch_size=8,
eval_batch_size=8,
num_loader_workers=0,
num_val_loader_workers=0,
num_eval_loader_workers=0,
text_cleaner="english_cleaners",
use_phonemes=False,
phoneme_language="en-us",

View File

@@ -110,7 +110,7 @@ class TacotronTFTrainTest(unittest.TestCase):
num_chars=24,
num_speakers=0,
r=3,
postnet_output_dim=80,
out_channels=80,
decoder_output_dim=80,
attn_type="original",
attn_win=False,

View File

@@ -13,7 +13,7 @@ config = Tacotron2Config(
batch_size=8,
eval_batch_size=8,
num_loader_workers=0,
num_val_loader_workers=0,
num_eval_loader_workers=0,
text_cleaner="english_cleaners",
use_phonemes=False,
phoneme_language="en-us",

View File

@@ -6,7 +6,7 @@ import torch
from torch import nn, optim
from tests import get_tests_input_path
from TTS.tts.configs import TacotronConfig
from TTS.tts.configs import GSTConfig, TacotronConfig
from TTS.tts.layers.losses import L1LossMasked
from TTS.tts.models.tacotron import Tacotron
from TTS.utils.audio import AudioProcessor
@@ -17,9 +17,9 @@ torch.manual_seed(1)
use_cuda = torch.cuda.is_available()
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
c = TacotronConfig()
config_global = TacotronConfig(num_chars=32, num_speakers=5, out_channels=513, decoder_output_dim=80)
ap = AudioProcessor(**c.audio)
ap = AudioProcessor(**config_global.audio)
WAV_FILE = os.path.join(get_tests_input_path(), "example_1.wav")
@@ -31,11 +31,12 @@ def count_parameters(model):
class TacotronTrainTest(unittest.TestCase):
@staticmethod
def test_train_step():
config = config_global.copy()
input_dummy = torch.randint(0, 24, (8, 128)).long().to(device)
input_lengths = torch.randint(100, 129, (8,)).long().to(device)
input_lengths[-1] = 128
mel_spec = torch.rand(8, 30, c.audio["num_mels"]).to(device)
linear_spec = torch.rand(8, 30, c.audio["fft_size"]).to(device)
mel_spec = torch.rand(8, 30, config.audio["num_mels"]).to(device)
linear_spec = torch.rand(8, 30, config.audio["fft_size"] // 2 + 1).to(device)
mel_lengths = torch.randint(20, 30, (8,)).long().to(device)
mel_lengths[-1] = mel_spec.size(1)
stop_targets = torch.zeros(8, 30, 1).float().to(device)
@@ -44,21 +45,12 @@ class TacotronTrainTest(unittest.TestCase):
for idx in mel_lengths:
stop_targets[:, int(idx.item()) :, 0] = 1.0
stop_targets = stop_targets.view(input_dummy.shape[0], stop_targets.size(1) // c.r, -1)
stop_targets = stop_targets.view(input_dummy.shape[0], stop_targets.size(1) // config.r, -1)
stop_targets = (stop_targets.sum(2) > 0.0).unsqueeze(2).float().squeeze()
criterion = L1LossMasked(seq_len_norm=False).to(device)
criterion_st = nn.BCEWithLogitsLoss().to(device)
model = Tacotron(
num_chars=32,
num_speakers=5,
postnet_output_dim=c.audio["fft_size"],
decoder_output_dim=c.audio["num_mels"],
r=c.r,
memory_size=c.memory_size,
).to(
device
) # FIXME: missing num_speakers parameter to Tacotron ctor
model = Tacotron(config).to(device) # FIXME: missing num_speakers parameter to Tacotron ctor
model.train()
print(" > Num parameters for Tacotron model:%s" % (count_parameters(model)))
model_ref = copy.deepcopy(model)
@@ -66,7 +58,7 @@ class TacotronTrainTest(unittest.TestCase):
for param, param_ref in zip(model.parameters(), model_ref.parameters()):
assert (param - param_ref).sum() == 0, param
count += 1
optimizer = optim.Adam(model.parameters(), lr=c.lr)
optimizer = optim.Adam(model.parameters(), lr=config.lr)
for _ in range(5):
outputs = model.forward(
input_dummy, input_lengths, mel_spec, mel_lengths, aux_input={"speaker_ids": speaker_ids}
@@ -91,11 +83,12 @@ class TacotronTrainTest(unittest.TestCase):
class MultiSpeakeTacotronTrainTest(unittest.TestCase):
@staticmethod
def test_train_step():
config = config_global.copy()
input_dummy = torch.randint(0, 24, (8, 128)).long().to(device)
input_lengths = torch.randint(100, 129, (8,)).long().to(device)
input_lengths[-1] = 128
mel_spec = torch.rand(8, 30, c.audio["num_mels"]).to(device)
linear_spec = torch.rand(8, 30, c.audio["fft_size"]).to(device)
mel_spec = torch.rand(8, 30, config.audio["num_mels"]).to(device)
linear_spec = torch.rand(8, 30, config.audio["fft_size"] // 2 + 1).to(device)
mel_lengths = torch.randint(20, 30, (8,)).long().to(device)
mel_lengths[-1] = mel_spec.size(1)
stop_targets = torch.zeros(8, 30, 1).float().to(device)
@@ -104,22 +97,13 @@ class MultiSpeakeTacotronTrainTest(unittest.TestCase):
for idx in mel_lengths:
stop_targets[:, int(idx.item()) :, 0] = 1.0
stop_targets = stop_targets.view(input_dummy.shape[0], stop_targets.size(1) // c.r, -1)
stop_targets = stop_targets.view(input_dummy.shape[0], stop_targets.size(1) // config.r, -1)
stop_targets = (stop_targets.sum(2) > 0.0).unsqueeze(2).float().squeeze()
criterion = L1LossMasked(seq_len_norm=False).to(device)
criterion_st = nn.BCEWithLogitsLoss().to(device)
model = Tacotron(
num_chars=32,
num_speakers=5,
postnet_output_dim=c.audio["fft_size"],
decoder_output_dim=c.audio["num_mels"],
r=c.r,
memory_size=c.memory_size,
d_vector_dim=55,
).to(
device
) # FIXME: missing num_speakers parameter to Tacotron ctor
config.d_vector_dim = 55
model = Tacotron(config).to(device) # FIXME: missing num_speakers parameter to Tacotron ctor
model.train()
print(" > Num parameters for Tacotron model:%s" % (count_parameters(model)))
model_ref = copy.deepcopy(model)
@@ -127,7 +111,7 @@ class MultiSpeakeTacotronTrainTest(unittest.TestCase):
for param, param_ref in zip(model.parameters(), model_ref.parameters()):
assert (param - param_ref).sum() == 0, param
count += 1
optimizer = optim.Adam(model.parameters(), lr=c.lr)
optimizer = optim.Adam(model.parameters(), lr=config.lr)
for _ in range(5):
outputs = model.forward(
input_dummy, input_lengths, mel_spec, mel_lengths, aux_input={"d_vectors": speaker_embeddings}
@@ -152,12 +136,13 @@ class MultiSpeakeTacotronTrainTest(unittest.TestCase):
class TacotronGSTTrainTest(unittest.TestCase):
@staticmethod
def test_train_step():
config = config_global.copy()
# with random gst mel style
input_dummy = torch.randint(0, 24, (8, 128)).long().to(device)
input_lengths = torch.randint(100, 129, (8,)).long().to(device)
input_lengths[-1] = 128
mel_spec = torch.rand(8, 120, c.audio["num_mels"]).to(device)
linear_spec = torch.rand(8, 120, c.audio["fft_size"]).to(device)
mel_spec = torch.rand(8, 120, config.audio["num_mels"]).to(device)
linear_spec = torch.rand(8, 120, config.audio["fft_size"] // 2 + 1).to(device)
mel_lengths = torch.randint(20, 120, (8,)).long().to(device)
mel_lengths[-1] = 120
stop_targets = torch.zeros(8, 120, 1).float().to(device)
@@ -166,23 +151,14 @@ class TacotronGSTTrainTest(unittest.TestCase):
for idx in mel_lengths:
stop_targets[:, int(idx.item()) :, 0] = 1.0
stop_targets = stop_targets.view(input_dummy.shape[0], stop_targets.size(1) // c.r, -1)
stop_targets = stop_targets.view(input_dummy.shape[0], stop_targets.size(1) // config.r, -1)
stop_targets = (stop_targets.sum(2) > 0.0).unsqueeze(2).float().squeeze()
criterion = L1LossMasked(seq_len_norm=False).to(device)
criterion_st = nn.BCEWithLogitsLoss().to(device)
model = Tacotron(
num_chars=32,
num_speakers=5,
use_gst=True,
gst=c.gst,
postnet_output_dim=c.audio["fft_size"],
decoder_output_dim=c.audio["num_mels"],
r=c.r,
memory_size=c.memory_size,
).to(
device
) # FIXME: missing num_speakers parameter to Tacotron ctor
config.use_gst = True
config.gst = GSTConfig()
model = Tacotron(config).to(device) # FIXME: missing num_speakers parameter to Tacotron ctor
model.train()
# print(model)
print(" > Num parameters for Tacotron GST model:%s" % (count_parameters(model)))
@@ -191,7 +167,7 @@ class TacotronGSTTrainTest(unittest.TestCase):
for param, param_ref in zip(model.parameters(), model_ref.parameters()):
assert (param - param_ref).sum() == 0, param
count += 1
optimizer = optim.Adam(model.parameters(), lr=c.lr)
optimizer = optim.Adam(model.parameters(), lr=config.lr)
for _ in range(10):
outputs = model.forward(
input_dummy, input_lengths, mel_spec, mel_lengths, aux_input={"speaker_ids": speaker_ids}
@@ -220,7 +196,7 @@ class TacotronGSTTrainTest(unittest.TestCase):
input_dummy = torch.randint(0, 24, (8, 128)).long().to(device)
input_lengths = torch.randint(100, 129, (8,)).long().to(device)
input_lengths[-1] = 128
linear_spec = torch.rand(8, mel_spec.size(1), c.audio["fft_size"]).to(device)
linear_spec = torch.rand(8, mel_spec.size(1), config.audio["fft_size"] // 2 + 1).to(device)
mel_lengths = torch.randint(20, mel_spec.size(1), (8,)).long().to(device)
mel_lengths[-1] = mel_spec.size(1)
stop_targets = torch.zeros(8, mel_spec.size(1), 1).float().to(device)
@@ -229,23 +205,12 @@ class TacotronGSTTrainTest(unittest.TestCase):
for idx in mel_lengths:
stop_targets[:, int(idx.item()) :, 0] = 1.0
stop_targets = stop_targets.view(input_dummy.shape[0], stop_targets.size(1) // c.r, -1)
stop_targets = stop_targets.view(input_dummy.shape[0], stop_targets.size(1) // config.r, -1)
stop_targets = (stop_targets.sum(2) > 0.0).unsqueeze(2).float().squeeze()
criterion = L1LossMasked(seq_len_norm=False).to(device)
criterion_st = nn.BCEWithLogitsLoss().to(device)
model = Tacotron(
num_chars=32,
num_speakers=5,
use_gst=True,
gst=c.gst,
postnet_output_dim=c.audio["fft_size"],
decoder_output_dim=c.audio["num_mels"],
r=c.r,
memory_size=c.memory_size,
).to(
device
) # FIXME: missing num_speakers parameter to Tacotron ctor
model = Tacotron(config).to(device) # FIXME: missing num_speakers parameter to Tacotron ctor
model.train()
# print(model)
print(" > Num parameters for Tacotron GST model:%s" % (count_parameters(model)))
@@ -254,7 +219,7 @@ class TacotronGSTTrainTest(unittest.TestCase):
for param, param_ref in zip(model.parameters(), model_ref.parameters()):
assert (param - param_ref).sum() == 0, param
count += 1
optimizer = optim.Adam(model.parameters(), lr=c.lr)
optimizer = optim.Adam(model.parameters(), lr=config.lr)
for _ in range(10):
outputs = model.forward(
input_dummy, input_lengths, mel_spec, mel_lengths, aux_input={"speaker_ids": speaker_ids}
@@ -278,11 +243,12 @@ class TacotronGSTTrainTest(unittest.TestCase):
class SCGSTMultiSpeakeTacotronTrainTest(unittest.TestCase):
@staticmethod
def test_train_step():
config = config_global.copy()
input_dummy = torch.randint(0, 24, (8, 128)).long().to(device)
input_lengths = torch.randint(100, 129, (8,)).long().to(device)
input_lengths[-1] = 128
mel_spec = torch.rand(8, 30, c.audio["num_mels"]).to(device)
linear_spec = torch.rand(8, 30, c.audio["fft_size"]).to(device)
mel_spec = torch.rand(8, 30, config.audio["num_mels"]).to(device)
linear_spec = torch.rand(8, 30, config.audio["fft_size"] // 2 + 1).to(device)
mel_lengths = torch.randint(20, 30, (8,)).long().to(device)
mel_lengths[-1] = mel_spec.size(1)
stop_targets = torch.zeros(8, 30, 1).float().to(device)
@@ -291,24 +257,13 @@ class SCGSTMultiSpeakeTacotronTrainTest(unittest.TestCase):
for idx in mel_lengths:
stop_targets[:, int(idx.item()) :, 0] = 1.0
stop_targets = stop_targets.view(input_dummy.shape[0], stop_targets.size(1) // c.r, -1)
stop_targets = stop_targets.view(input_dummy.shape[0], stop_targets.size(1) // config.r, -1)
stop_targets = (stop_targets.sum(2) > 0.0).unsqueeze(2).float().squeeze()
criterion = L1LossMasked(seq_len_norm=False).to(device)
criterion_st = nn.BCEWithLogitsLoss().to(device)
model = Tacotron(
num_chars=32,
num_speakers=5,
postnet_output_dim=c.audio["fft_size"],
decoder_output_dim=c.audio["num_mels"],
use_gst=True,
gst=c.gst,
r=c.r,
memory_size=c.memory_size,
d_vector_dim=55,
).to(
device
) # FIXME: missing num_speakers parameter to Tacotron ctor
config.d_vector_dim = 55
model = Tacotron(config).to(device) # FIXME: missing num_speakers parameter to Tacotron ctor
model.train()
print(" > Num parameters for Tacotron model:%s" % (count_parameters(model)))
model_ref = copy.deepcopy(model)
@@ -316,7 +271,7 @@ class SCGSTMultiSpeakeTacotronTrainTest(unittest.TestCase):
for param, param_ref in zip(model.parameters(), model_ref.parameters()):
assert (param - param_ref).sum() == 0, param
count += 1
optimizer = optim.Adam(model.parameters(), lr=c.lr)
optimizer = optim.Adam(model.parameters(), lr=config.lr)
for _ in range(5):
outputs = model.forward(
input_dummy, input_lengths, mel_spec, mel_lengths, aux_input={"d_vectors": speaker_embeddings}
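
Beyond the config migration, this file also corrects the linear spectrogram width: a magnitude STFT of a real signal has fft_size // 2 + 1 frequency bins, not fft_size, which is why `config_global` sets `out_channels=513` (matching the usual default of fft_size=1024, an assumption here). In short:

    fft_size = 1024                   # assumed default
    n_stft_bins = fft_size // 2 + 1   # 513 bins for a real-valued signal
    linear_spec = torch.rand(8, 30, n_stft_bins)  # matches out_channels=513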

View File

@@ -13,7 +13,7 @@ config = TacotronConfig(
batch_size=8,
eval_batch_size=8,
num_loader_workers=0,
num_val_loader_workers=0,
num_eval_loader_workers=0,
text_cleaner="english_cleaners",
use_phonemes=False,
phoneme_language="en-us",

View File

@@ -12,7 +12,7 @@ config = FullbandMelganConfig(
batch_size=8,
eval_batch_size=8,
num_loader_workers=0,
num_val_loader_workers=0,
num_eval_loader_workers=0,
run_eval=True,
test_delay_epochs=-1,
epochs=1,
@@ -29,9 +29,7 @@ config.audio.trim_db = 60
config.save_json(config_path)
# train the model for one epoch
command_train = (
f"CUDA_VISIBLE_DEVICES='{get_device_id()}' python TTS/bin/train_vocoder_gan.py --config_path {config_path} "
)
command_train = f"CUDA_VISIBLE_DEVICES='{get_device_id()}' python TTS/bin/train_vocoder.py --config_path {config_path} "
run_cli(command_train)
# Find latest folder
@@ -39,7 +37,7 @@ continue_path = max(glob.glob(os.path.join(output_path, "*/")), key=os.path.getm
# restore the model and continue training for one more epoch
command_train = (
f"CUDA_VISIBLE_DEVICES='{get_device_id()}' python TTS/bin/train_vocoder_gan.py --continue_path {continue_path} "
f"CUDA_VISIBLE_DEVICES='{get_device_id()}' python TTS/bin/train_vocoder.py --continue_path {continue_path} "
)
run_cli(command_train)
shutil.rmtree(continue_path)
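
This hunk also shows the unified vocoder entry point: the model-specific `train_vocoder_gan.py` (and, below, `train_vocoder_wavegrad.py` and `train_vocoder_wavernn.py`) all become `TTS/bin/train_vocoder.py`, which picks the architecture from the config. The shared pattern, as a sketch with `config_path`/`continue_path` defined as in the tests:

    from tests import get_device_id, run_cli

    # one entry point for every vocoder; the model type comes from the config file
    run_cli(f"CUDA_VISIBLE_DEVICES='{get_device_id()}' python TTS/bin/train_vocoder.py --config_path {config_path} ")
    # resuming uses the same script
    run_cli(f"CUDA_VISIBLE_DEVICES='{get_device_id()}' python TTS/bin/train_vocoder.py --continue_path {continue_path} ")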

View File

@@ -13,7 +13,7 @@ config = HifiganConfig(
batch_size=8,
eval_batch_size=8,
num_loader_workers=0,
num_val_loader_workers=0,
num_eval_loader_workers=0,
run_eval=True,
test_delay_epochs=-1,
epochs=1,
@@ -29,9 +29,7 @@ config.audio.trim_db = 60
config.save_json(config_path)
# train the model for one epoch
command_train = (
f"CUDA_VISIBLE_DEVICES='{get_device_id()}' python TTS/bin/train_vocoder_gan.py --config_path {config_path} "
)
command_train = f"CUDA_VISIBLE_DEVICES='{get_device_id()}' python TTS/bin/train_vocoder.py --config_path {config_path} "
run_cli(command_train)
# Find latest folder
@@ -39,7 +37,7 @@ continue_path = max(glob.glob(os.path.join(output_path, "*/")), key=os.path.getm
# restore the model and continue training for one more epoch
command_train = (
f"CUDA_VISIBLE_DEVICES='{get_device_id()}' python TTS/bin/train_vocoder_gan.py --continue_path {continue_path} "
f"CUDA_VISIBLE_DEVICES='{get_device_id()}' python TTS/bin/train_vocoder.py --continue_path {continue_path} "
)
run_cli(command_train)
shutil.rmtree(continue_path)

View File

@@ -12,7 +12,7 @@ config = MelganConfig(
batch_size=4,
eval_batch_size=4,
num_loader_workers=0,
num_val_loader_workers=0,
num_eval_loader_workers=0,
run_eval=True,
test_delay_epochs=-1,
epochs=1,
@@ -29,9 +29,7 @@ config.audio.trim_db = 60
config.save_json(config_path)
# train the model for one epoch
command_train = (
f"CUDA_VISIBLE_DEVICES='{get_device_id()}' python TTS/bin/train_vocoder_gan.py --config_path {config_path} "
)
command_train = f"CUDA_VISIBLE_DEVICES='{get_device_id()}' python TTS/bin/train_vocoder.py --config_path {config_path} "
run_cli(command_train)
# Find latest folder
@@ -39,7 +37,7 @@ continue_path = max(glob.glob(os.path.join(output_path, "*/")), key=os.path.getm
# restore the model and continue training for one more epoch
command_train = (
f"CUDA_VISIBLE_DEVICES='{get_device_id()}' python TTS/bin/train_vocoder_gan.py --continue_path {continue_path} "
f"CUDA_VISIBLE_DEVICES='{get_device_id()}' python TTS/bin/train_vocoder.py --continue_path {continue_path} "
)
run_cli(command_train)
shutil.rmtree(continue_path)

View File

@@ -12,7 +12,7 @@ config = MultibandMelganConfig(
batch_size=8,
eval_batch_size=8,
num_loader_workers=0,
num_val_loader_workers=0,
num_eval_loader_workers=0,
run_eval=True,
test_delay_epochs=-1,
epochs=1,
@@ -30,9 +30,7 @@ config.audio.trim_db = 60
config.save_json(config_path)
# train the model for one epoch
command_train = (
f"CUDA_VISIBLE_DEVICES='{get_device_id()}' python TTS/bin/train_vocoder_gan.py --config_path {config_path} "
)
command_train = f"CUDA_VISIBLE_DEVICES='{get_device_id()}' python TTS/bin/train_vocoder.py --config_path {config_path} "
run_cli(command_train)
# Find latest folder
@@ -40,7 +38,7 @@ continue_path = max(glob.glob(os.path.join(output_path, "*/")), key=os.path.getm
# restore the model and continue training for one more epoch
command_train = (
f"CUDA_VISIBLE_DEVICES='{get_device_id()}' python TTS/bin/train_vocoder_gan.py --continue_path {continue_path} "
f"CUDA_VISIBLE_DEVICES='{get_device_id()}' python TTS/bin/train_vocoder.py --continue_path {continue_path} "
)
run_cli(command_train)
shutil.rmtree(continue_path)

View File

@@ -12,7 +12,7 @@ config = ParallelWaveganConfig(
batch_size=4,
eval_batch_size=4,
num_loader_workers=0,
num_val_loader_workers=0,
num_eval_loader_workers=0,
run_eval=True,
test_delay_epochs=-1,
epochs=1,
@@ -28,9 +28,7 @@ config.audio.trim_db = 60
config.save_json(config_path)
# train the model for one epoch
command_train = (
f"CUDA_VISIBLE_DEVICES='{get_device_id()}' python TTS/bin/train_vocoder_gan.py --config_path {config_path} "
)
command_train = f"CUDA_VISIBLE_DEVICES='{get_device_id()}' python TTS/bin/train_vocoder.py --config_path {config_path} "
run_cli(command_train)
# Find latest folder
@@ -38,7 +36,7 @@ continue_path = max(glob.glob(os.path.join(output_path, "*/")), key=os.path.getm
# restore the model and continue training for one more epoch
command_train = (
f"CUDA_VISIBLE_DEVICES='{get_device_id()}' python TTS/bin/train_vocoder_gan.py --continue_path {continue_path} "
f"CUDA_VISIBLE_DEVICES='{get_device_id()}' python TTS/bin/train_vocoder.py --continue_path {continue_path} "
)
run_cli(command_train)
shutil.rmtree(continue_path)

View File

@@ -3,11 +3,13 @@ import random
import numpy as np
import torch
from TTS.vocoder.models.wavernn import WaveRNN
from TTS.vocoder.configs import WavernnConfig
from TTS.vocoder.models.wavernn import Wavernn, WavernnArgs
def test_wavernn():
model = WaveRNN(
config = WavernnConfig()
config.model_args = WavernnArgs(
rnn_dims=512,
fc_dims=512,
mode=10,
@@ -20,14 +22,30 @@ def test_wavernn():
compute_dims=128,
res_out_dims=128,
num_res_blocks=10,
hop_length=256,
sample_rate=22050,
)
config.audio.hop_length = 256
config.audio.sample_rate = 2048
dummy_x = torch.rand((2, 1280))
dummy_m = torch.rand((2, 80, 9))
y_size = random.randrange(20, 60)
dummy_y = torch.rand((80, y_size))
# mode: mold
model = Wavernn(config)
output = model(dummy_x, dummy_m)
assert np.all(output.shape == (2, 1280, 4 * 256)), output.shape
assert np.all(output.shape == (2, 1280, 30)), output.shape
# mode: gauss
config.model_params.mode = "gauss"
model = Wavernn(config)
output = model(dummy_x, dummy_m)
assert np.all(output.shape == (2, 1280, 2)), output.shape
# mode: quantized
config.model_params.mode = 4
model = Wavernn(config)
output = model(dummy_x, dummy_m)
assert np.all(output.shape == (2, 1280, 2 ** 4)), output.shape
output = model.inference(dummy_y, True, 5500, 550)
assert np.all(output.shape == (256 * (y_size - 1),))
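
The WaveRNN model test follows the same scheme: architecture fields move into `WavernnArgs`, audio fields move onto `config.audio`, and the different output modes are exercised by mutating the config and rebuilding the model. A condensed sketch, keeping the attribute names exactly as they appear in the hunk above:

    from TTS.vocoder.configs import WavernnConfig
    from TTS.vocoder.models.wavernn import Wavernn, WavernnArgs

    config = WavernnConfig()
    config.model_args = WavernnArgs(mode=10)    # architecture args nest under the config
    config.audio.hop_length = 256               # audio settings live on config.audio
    model = Wavernn(config)
    config.model_params.mode = "gauss"          # attribute name as written in the diff
    model = Wavernn(config)                     # rebuilt with the new output mode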

View File

@@ -4,7 +4,8 @@ import numpy as np
import torch
from torch import optim
from TTS.vocoder.models.wavegrad import Wavegrad
from TTS.vocoder.configs import WavegradConfig
from TTS.vocoder.models.wavegrad import Wavegrad, WavegradArgs
# pylint: disable=unused-variable
@@ -20,19 +21,16 @@ class WavegradTrainTest(unittest.TestCase):
mel_spec = torch.rand(8, 80, 20).to(device)
criterion = torch.nn.L1Loss().to(device)
model = Wavegrad(
args = WavegradArgs(
in_channels=80,
out_channels=1,
upsample_factors=[5, 5, 3, 2, 2],
upsample_dilations=[[1, 2, 1, 2], [1, 2, 1, 2], [1, 2, 4, 8], [1, 2, 4, 8], [1, 2, 4, 8]],
)
config = WavegradConfig(model_params=args)
model = Wavegrad(config)
model_ref = Wavegrad(
in_channels=80,
out_channels=1,
upsample_factors=[5, 5, 3, 2, 2],
upsample_dilations=[[1, 2, 1, 2], [1, 2, 1, 2], [1, 2, 4, 8], [1, 2, 4, 8], [1, 2, 4, 8]],
)
model_ref = Wavegrad(config)
model.train()
model.to(device)
betas = np.linspace(1e-6, 1e-2, 1000)
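
WaveGrad gets the same treatment, with one naming quirk worth noting: its args dataclass is passed as `model_params` rather than `model_args`. A sketch mirroring the hunk above:

    from TTS.vocoder.configs import WavegradConfig
    from TTS.vocoder.models.wavegrad import Wavegrad, WavegradArgs

    args = WavegradArgs(
        in_channels=80,
        out_channels=1,
        upsample_factors=[5, 5, 3, 2, 2],
        upsample_dilations=[[1, 2, 1, 2], [1, 2, 1, 2], [1, 2, 4, 8], [1, 2, 4, 8], [1, 2, 4, 8]],
    )
    config = WavegradConfig(model_params=args)  # note: model_params, not model_args
    model = Wavegrad(config)
    model_ref = Wavegrad(config)  # reference copy built from the identical config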

View File

@@ -1,7 +1,8 @@
import torch
from TTS.vocoder.configs import WavegradConfig
from TTS.vocoder.layers.wavegrad import DBlock, FiLM, PositionalEncoding, UBlock
from TTS.vocoder.models.wavegrad import Wavegrad
from TTS.vocoder.models.wavegrad import Wavegrad, WavegradArgs
def test_positional_encoding():
@@ -75,12 +76,14 @@ def test_wavegrad_forward():
c = torch.rand(32, 80, 20)
noise_scale = torch.rand(32)
model = Wavegrad(
args = WavegradArgs(
in_channels=80,
out_channels=1,
upsample_factors=[5, 5, 3, 2, 2],
upsample_dilations=[[1, 2, 1, 2], [1, 2, 1, 2], [1, 2, 4, 8], [1, 2, 4, 8], [1, 2, 4, 8]],
)
config = WavegradConfig(model_params=args)
model = Wavegrad(config)
o = model.forward(x, c, noise_scale)
assert o.shape[0] == 32

View File

@@ -12,7 +12,7 @@ config = WavegradConfig(
batch_size=8,
eval_batch_size=8,
num_loader_workers=0,
num_val_loader_workers=0,
num_eval_loader_workers=0,
run_eval=True,
test_delay_epochs=-1,
epochs=1,
@@ -29,15 +29,15 @@ config.audio.trim_db = 60
config.save_json(config_path)
# train the model for one epoch
command_train = (
f"CUDA_VISIBLE_DEVICES='{get_device_id()}' python TTS/bin/train_vocoder_wavegrad.py --config_path {config_path} "
)
command_train = f"CUDA_VISIBLE_DEVICES='{get_device_id()}' python TTS/bin/train_vocoder.py --config_path {config_path} "
run_cli(command_train)
# Find latest folder
continue_path = max(glob.glob(os.path.join(output_path, "*/")), key=os.path.getmtime)
# restore the model and continue training for one more epoch
command_train = f"CUDA_VISIBLE_DEVICES='{get_device_id()}' python TTS/bin/train_vocoder_wavegrad.py --continue_path {continue_path} "
command_train = (
f"CUDA_VISIBLE_DEVICES='{get_device_id()}' python TTS/bin/train_vocoder.py --continue_path {continue_path} "
)
run_cli(command_train)
shutil.rmtree(continue_path)

View File

@@ -4,15 +4,18 @@ import shutil
from tests import get_device_id, get_tests_output_path, run_cli
from TTS.vocoder.configs import WavernnConfig
from TTS.vocoder.models.wavernn import WavernnArgs
config_path = os.path.join(get_tests_output_path(), "test_vocoder_config.json")
output_path = os.path.join(get_tests_output_path(), "train_outputs")
config = WavernnConfig(
model_params=WavernnArgs(),
batch_size=8,
eval_batch_size=8,
num_loader_workers=0,
num_val_loader_workers=0,
num_eval_loader_workers=0,
run_eval=True,
test_delay_epochs=-1,
epochs=1,
@@ -28,9 +31,7 @@ config.audio.trim_db = 60
config.save_json(config_path)
# train the model for one epoch
command_train = (
f"CUDA_VISIBLE_DEVICES='{get_device_id()}' python TTS/bin/train_vocoder_wavernn.py --config_path {config_path} "
)
command_train = f"CUDA_VISIBLE_DEVICES='{get_device_id()}' python TTS/bin/train_vocoder.py --config_path {config_path} "
run_cli(command_train)
# Find latest folder
@@ -38,7 +39,7 @@ continue_path = max(glob.glob(os.path.join(output_path, "*/")), key=os.path.getm
# restore the model and continue training for one more epoch
command_train = (
f"CUDA_VISIBLE_DEVICES='{get_device_id()}' python TTS/bin/train_vocoder_wavernn.py --continue_path {continue_path} "
f"CUDA_VISIBLE_DEVICES='{get_device_id()}' python TTS/bin/train_vocoder.py --continue_path {continue_path} "
)
run_cli(command_train)
shutil.rmtree(continue_path)