mirror of https://github.com/coqui-ai/TTS.git

commit 9ee70af9bb: code styling (parent 10db2baa06)
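The reflow pattern repeated in every hunk below (multi-line calls and dict literals collapsed up to a longer line limit, trailing commas added, single quotes switched to double quotes) is consistent with an automated formatter; the commit message says only "code styling", so treating it as black with a 120-character line length is an assumption, not something the commit states. A minimal sketch of reproducing one of the exact rewrites seen later in this diff:

# Sketch only: assumes the formatter is black with line_length=120; the
# commit itself does not name a tool.
import black

src = (
    "iter_stats = {\n"
    '    "lr": current_lr,\n'
    '    "grad_norm": grad_norm,\n'
    '    "step_time": step_time\n'
    "}\n"
)
# No magic trailing comma in `src`, so black collapses the dict onto one line.
print(black.format_str(src, mode=black.Mode(line_length=120)))
# iter_stats = {"lr": current_lr, "grad_norm": grad_norm, "step_time": step_time}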
@@ -8,11 +8,10 @@ import os
 import numpy as np
 from tqdm import tqdm
 
-from TTS.tts.datasets.preprocess import load_meta_data
-from TTS.utils.audio import AudioProcessor
-
 # from TTS.utils.io import load_config
 from TTS.config import load_config
+from TTS.tts.datasets.preprocess import load_meta_data
+from TTS.utils.audio import AudioProcessor
 
 
 def main():
@@ -46,8 +46,7 @@ def setup_loader(ap, r, is_val=False, verbose=False):
             ap=ap,
             tp=config.characters,
             add_blank=config["add_blank"],
-            batch_group_size=0 if is_val else config.batch_group_size *
-            config.batch_size,
+            batch_group_size=0 if is_val else config.batch_group_size * config.batch_size,
             min_seq_len=config.min_seq_len,
             max_seq_len=config.max_seq_len,
             phoneme_cache_path=config.phoneme_cache_path,
@@ -56,8 +55,9 @@ def setup_loader(ap, r, is_val=False, verbose=False):
             enable_eos_bos=config.enable_eos_bos_chars,
             use_noise_augment=not is_val,
             verbose=verbose,
-            speaker_mapping=speaker_mapping if config.use_speaker_embedding
-            and config.use_external_speaker_embedding_file else None,
+            speaker_mapping=speaker_mapping
+            if config.use_speaker_embedding and config.use_external_speaker_embedding_file
+            else None,
         )
 
         if config.use_phonemes and config.compute_input_seq_cache:
@@ -73,8 +73,7 @@ def setup_loader(ap, r, is_val=False, verbose=False):
             collate_fn=dataset.collate_fn,
             drop_last=False,
             sampler=sampler,
-            num_workers=config.num_val_loader_workers
-            if is_val else config.num_loader_workers,
+            num_workers=config.num_val_loader_workers if is_val else config.num_loader_workers,
             pin_memory=False,
         )
     return loader
@@ -97,9 +96,7 @@ def format_data(data):
             speaker_c = data[8]
         else:
             # return speaker_id to be used by an embedding layer
-            speaker_c = [
-                speaker_mapping[speaker_name] for speaker_name in speaker_names
-            ]
+            speaker_c = [speaker_mapping[speaker_name] for speaker_name in speaker_names]
             speaker_c = torch.LongTensor(speaker_c)
     else:
         speaker_c = None
@@ -114,15 +111,13 @@ def format_data(data):
     return text_input, text_lengths, mel_input, mel_lengths, speaker_c, avg_text_length, avg_spec_length, item_idx
 
 
-def train(data_loader, model, criterion, optimizer, scheduler, ap, global_step,
-          epoch, training_phase):
+def train(data_loader, model, criterion, optimizer, scheduler, ap, global_step, epoch, training_phase):
 
     model.train()
     epoch_time = 0
     keep_avg = KeepAverage()
     if use_cuda:
-        batch_n_iter = int(
-            len(data_loader.dataset) / (config.batch_size * num_gpus))
+        batch_n_iter = int(len(data_loader.dataset) / (config.batch_size * num_gpus))
     else:
         batch_n_iter = int(len(data_loader.dataset) / config.batch_size)
     end_time = time.time()
@@ -151,12 +146,8 @@ def train(data_loader, model, criterion, optimizer, scheduler, ap, global_step,
         # forward pass model
         with torch.cuda.amp.autocast(enabled=config.mixed_precision):
             decoder_output, dur_output, dur_mas_output, alignments, _, _, logp = model.forward(
-                text_input,
-                text_lengths,
-                mel_targets,
-                mel_lengths,
-                g=speaker_c,
-                phase=training_phase)
+                text_input, text_lengths, mel_targets, mel_lengths, g=speaker_c, phase=training_phase
+            )
 
         # compute loss
         loss_dict = criterion(
@@ -175,14 +166,12 @@ def train(data_loader, model, criterion, optimizer, scheduler, ap, global_step,
         if config.mixed_precision:
             scaler.scale(loss_dict["loss"]).backward()
             scaler.unscale_(optimizer)
-            grad_norm = torch.nn.utils.clip_grad_norm_(model.parameters(),
-                                                       config.grad_clip)
+            grad_norm = torch.nn.utils.clip_grad_norm_(model.parameters(), config.grad_clip)
             scaler.step(optimizer)
             scaler.update()
         else:
             loss_dict["loss"].backward()
-            grad_norm = torch.nn.utils.clip_grad_norm_(model.parameters(),
-                                                       config.grad_clip)
+            grad_norm = torch.nn.utils.clip_grad_norm_(model.parameters(), config.grad_clip)
             optimizer.step()
 
         # setup lr
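The mixed-precision branch reflowed above follows the standard GradScaler recipe: unscale before clipping so clip_grad_norm_ sees true-magnitude gradients, then let the scaler drive the optimizer step (it skips the step when gradients contain infs/NaNs). A self-contained sketch; the model, input, and loss are placeholders, not from this file:

# Sketch of the AMP step pattern shown in the hunk above.
import torch

device = "cuda" if torch.cuda.is_available() else "cpu"
use_amp = device == "cuda"
model = torch.nn.Linear(4, 4).to(device)
optimizer = torch.optim.SGD(model.parameters(), lr=0.1)
scaler = torch.cuda.amp.GradScaler(enabled=use_amp)

x = torch.randn(2, 4, device=device)
with torch.cuda.amp.autocast(enabled=use_amp):
    loss = model(x).pow(2).mean()
scaler.scale(loss).backward()
scaler.unscale_(optimizer)  # bring gradients back to fp32 scale before clipping
grad_norm = torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)
scaler.step(optimizer)      # skips the update on inf/NaN gradients
scaler.update()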
@@ -201,12 +190,9 @@ def train(data_loader, model, criterion, optimizer, scheduler, ap, global_step,
 
         # aggregate losses from processes
         if num_gpus > 1:
-            loss_dict["loss_l1"] = reduce_tensor(loss_dict["loss_l1"].data,
-                                                 num_gpus)
-            loss_dict["loss_ssim"] = reduce_tensor(loss_dict["loss_ssim"].data,
-                                                   num_gpus)
-            loss_dict["loss_dur"] = reduce_tensor(loss_dict["loss_dur"].data,
-                                                  num_gpus)
+            loss_dict["loss_l1"] = reduce_tensor(loss_dict["loss_l1"].data, num_gpus)
+            loss_dict["loss_ssim"] = reduce_tensor(loss_dict["loss_ssim"].data, num_gpus)
+            loss_dict["loss_dur"] = reduce_tensor(loss_dict["loss_dur"].data, num_gpus)
             loss_dict["loss"] = reduce_tensor(loss_dict["loss"].data, num_gpus)
 
         # detach loss values
@@ -235,18 +221,13 @@ def train(data_loader, model, criterion, optimizer, scheduler, ap, global_step,
                 "loader_time": [loader_time, 2],
                 "current_lr": current_lr,
             }
-            c_logger.print_train_step(batch_n_iter, num_iter, global_step,
-                                      log_dict, loss_dict, keep_avg.avg_values)
+            c_logger.print_train_step(batch_n_iter, num_iter, global_step, log_dict, loss_dict, keep_avg.avg_values)
 
         if args.rank == 0:
             # Plot Training Iter Stats
             # reduce TB load
             if global_step % config.tb_plot_step == 0:
-                iter_stats = {
-                    "lr": current_lr,
-                    "grad_norm": grad_norm,
-                    "step_time": step_time
-                }
+                iter_stats = {"lr": current_lr, "grad_norm": grad_norm, "step_time": step_time}
                 iter_stats.update(loss_dict)
                 tb_logger.tb_train_iter_stats(global_step, iter_stats)
 
@@ -270,8 +251,7 @@ def train(data_loader, model, criterion, optimizer, scheduler, ap, global_step,
                 # Diagnostic visualizations
                 if decoder_output is not None:
                     idx = np.random.randint(mel_targets.shape[0])
-                    pred_spec = decoder_output[idx].detach().data.cpu().numpy(
-                    ).T
+                    pred_spec = decoder_output[idx].detach().data.cpu().numpy().T
                     gt_spec = mel_targets[idx].data.cpu().numpy().T
                     align_img = alignments[idx].data.cpu()
 
@@ -285,9 +265,7 @@ def train(data_loader, model, criterion, optimizer, scheduler, ap, global_step,
 
                 # Sample audio
                 train_audio = ap.inv_melspectrogram(pred_spec.T)
-                tb_logger.tb_train_audios(global_step,
-                                          {"TrainAudio": train_audio},
-                                          config.audio["sample_rate"])
+                tb_logger.tb_train_audios(global_step, {"TrainAudio": train_audio}, config.audio["sample_rate"])
         end_time = time.time()
 
     # print epoch stats
@@ -304,8 +282,7 @@ def train(data_loader, model, criterion, optimizer, scheduler, ap, global_step,
 
 
 @torch.no_grad()
-def evaluate(data_loader, model, criterion, ap, global_step, epoch,
-             training_phase):
+def evaluate(data_loader, model, criterion, ap, global_step, epoch, training_phase):
     model.eval()
     epoch_time = 0
     keep_avg = KeepAverage()
@@ -315,18 +292,13 @@ def evaluate(data_loader, model, criterion, ap, global_step, epoch,
         start_time = time.time()
 
         # format data
-        text_input, text_lengths, mel_targets, mel_lengths, speaker_c, _, _, _ = format_data(
-            data)
+        text_input, text_lengths, mel_targets, mel_lengths, speaker_c, _, _, _ = format_data(data)
 
         # forward pass model
         with torch.cuda.amp.autocast(enabled=config.mixed_precision):
             decoder_output, dur_output, dur_mas_output, alignments, _, _, logp = model.forward(
-                text_input,
-                text_lengths,
-                mel_targets,
-                mel_lengths,
-                g=speaker_c,
-                phase=training_phase)
+                text_input, text_lengths, mel_targets, mel_lengths, g=speaker_c, phase=training_phase
+            )
 
         # compute loss
         loss_dict = criterion(
@@ -351,14 +323,10 @@ def evaluate(data_loader, model, criterion, ap, global_step, epoch,
 
         # aggregate losses from processes
        if num_gpus > 1:
-            loss_dict["loss_l1"] = reduce_tensor(loss_dict["loss_l1"].data,
-                                                 num_gpus)
-            loss_dict["loss_ssim"] = reduce_tensor(
-                loss_dict["loss_ssim"].data, num_gpus)
-            loss_dict["loss_dur"] = reduce_tensor(
-                loss_dict["loss_dur"].data, num_gpus)
-            loss_dict["loss"] = reduce_tensor(loss_dict["loss"].data,
-                                              num_gpus)
+            loss_dict["loss_l1"] = reduce_tensor(loss_dict["loss_l1"].data, num_gpus)
+            loss_dict["loss_ssim"] = reduce_tensor(loss_dict["loss_ssim"].data, num_gpus)
+            loss_dict["loss_dur"] = reduce_tensor(loss_dict["loss_dur"].data, num_gpus)
+            loss_dict["loss"] = reduce_tensor(loss_dict["loss"].data, num_gpus)
 
         # detach loss values
         loss_dict_new = dict()
@@ -376,8 +344,7 @@ def evaluate(data_loader, model, criterion, ap, global_step, epoch,
         keep_avg.update_values(update_train_values)
 
         if config.print_eval:
-            c_logger.print_eval_step(num_iter, loss_dict,
-                                     keep_avg.avg_values)
+            c_logger.print_eval_step(num_iter, loss_dict, keep_avg.avg_values)
 
     if args.rank == 0:
         # Diagnostic visualizations
@@ -387,17 +354,14 @@ def evaluate(data_loader, model, criterion, ap, global_step, epoch,
         align_img = alignments[idx].data.cpu()
 
         eval_figures = {
-            "prediction": plot_spectrogram(pred_spec, ap,
-                                           output_fig=False),
-            "ground_truth": plot_spectrogram(gt_spec, ap,
-                                             output_fig=False),
+            "prediction": plot_spectrogram(pred_spec, ap, output_fig=False),
+            "ground_truth": plot_spectrogram(gt_spec, ap, output_fig=False),
             "alignment": plot_alignment(align_img, output_fig=False),
         }
 
         # Sample audio
         eval_audio = ap.inv_melspectrogram(pred_spec.T)
-        tb_logger.tb_eval_audios(global_step, {"ValAudio": eval_audio},
-                                 config.audio["sample_rate"])
+        tb_logger.tb_eval_audios(global_step, {"ValAudio": eval_audio}, config.audio["sample_rate"])
 
         # Plot Validation Stats
         tb_logger.tb_eval_stats(global_step, keep_avg.avg_values)
@@ -422,9 +386,9 @@ def evaluate(data_loader, model, criterion, ap, global_step, epoch,
         print(" | > Synthesizing test sentences")
         if config.use_speaker_embedding:
             if config.use_external_speaker_embedding_file:
-                speaker_embedding = speaker_mapping[list(
-                    speaker_mapping.keys())[randrange(
-                        len(speaker_mapping) - 1)]]["embedding"]
+                speaker_embedding = speaker_mapping[list(speaker_mapping.keys())[randrange(len(speaker_mapping) - 1)]][
+                    "embedding"
+                ]
                 speaker_id = None
             else:
                 speaker_id = 0
@@ -452,19 +416,15 @@ def evaluate(data_loader, model, criterion, ap, global_step, epoch,
 
                 file_path = os.path.join(AUDIO_PATH, str(global_step))
                 os.makedirs(file_path, exist_ok=True)
-                file_path = os.path.join(file_path,
-                                         "TestSentence_{}.wav".format(idx))
+                file_path = os.path.join(file_path, "TestSentence_{}.wav".format(idx))
                 ap.save_wav(wav, file_path)
                 test_audios["{}-audio".format(idx)] = wav
-                test_figures["{}-prediction".format(idx)] = plot_spectrogram(
-                    postnet_output, ap)
-                test_figures["{}-alignment".format(idx)] = plot_alignment(
-                    alignment)
+                test_figures["{}-prediction".format(idx)] = plot_spectrogram(postnet_output, ap)
+                test_figures["{}-alignment".format(idx)] = plot_alignment(alignment)
             except:  # pylint: disable=bare-except
                 print(" !! Error creating Test Sentence -", idx)
                 traceback.print_exc()
-        tb_logger.tb_test_audios(global_step, test_audios,
-                                 config.audio["sample_rate"])
+        tb_logger.tb_test_audios(global_step, test_audios, config.audio["sample_rate"])
         tb_logger.tb_test_figures(global_step, test_figures)
     return keep_avg.avg_values
 
@@ -479,32 +439,21 @@ def main(args):  # pylint: disable=redefined-outer-name
 
     # DISTRUBUTED
     if num_gpus > 1:
-        init_distributed(args.rank, num_gpus, args.group_id,
-                         config.distributed["backend"],
-                         config.distributed["url"])
+        init_distributed(args.rank, num_gpus, args.group_id, config.distributed["backend"], config.distributed["url"])
 
     # set model characters
     model_characters = phonemes if config.use_phonemes else symbols
     num_chars = len(model_characters)
 
     # load data instances
-    meta_data_train, meta_data_eval = load_meta_data(config.datasets,
-                                                     eval_split=True)
+    meta_data_train, meta_data_eval = load_meta_data(config.datasets, eval_split=True)
 
     # parse speakers
-    num_speakers, speaker_embedding_dim, speaker_mapping = parse_speakers(
-        config, args, meta_data_train, OUT_PATH)
+    num_speakers, speaker_embedding_dim, speaker_mapping = parse_speakers(config, args, meta_data_train, OUT_PATH)
 
     # setup model
-    model = setup_model(num_chars,
-                        num_speakers,
-                        config,
-                        speaker_embedding_dim=speaker_embedding_dim)
-    optimizer = RAdam(model.parameters(),
-                      lr=config.lr,
-                      weight_decay=0,
-                      betas=(0.9, 0.98),
-                      eps=1e-9)
+    model = setup_model(num_chars, num_speakers, config, speaker_embedding_dim=speaker_embedding_dim)
+    optimizer = RAdam(model.parameters(), lr=config.lr, weight_decay=0, betas=(0.9, 0.98), eps=1e-9)
     criterion = AlignTTSLoss(config)
 
     if args.restore_path:
@@ -526,8 +475,7 @@ def main(args):  # pylint: disable=redefined-outer-name
 
         for group in optimizer.param_groups:
             group["initial_lr"] = config.lr
-        print(" > Model restored from step %d" % checkpoint["step"],
-              flush=True)
+        print(" > Model restored from step %d" % checkpoint["step"], flush=True)
         args.restore_step = checkpoint["step"]
     else:
         args.restore_step = 0
@@ -541,9 +489,7 @@ def main(args):  # pylint: disable=redefined-outer-name
         model = DDP_th(model, device_ids=[args.rank])
 
     if config.noam_schedule:
-        scheduler = NoamLR(optimizer,
-                           warmup_steps=config.warmup_steps,
-                           last_epoch=args.restore_step - 1)
+        scheduler = NoamLR(optimizer, warmup_steps=config.warmup_steps, last_epoch=args.restore_step - 1)
     else:
         scheduler = None
 
@@ -554,10 +500,8 @@ def main(args):  # pylint: disable=redefined-outer-name
         best_loss = float("inf")
         print(" > Starting with inf best loss.")
     else:
-        print(" > Restoring best loss from "
-              f"{os.path.basename(args.best_path)} ...")
-        best_loss = torch.load(args.best_path,
-                               map_location="cpu")["model_loss"]
+        print(" > Restoring best loss from " f"{os.path.basename(args.best_path)} ...")
+        best_loss = torch.load(args.best_path, map_location="cpu")["model_loss"]
         print(f" > Starting with loaded last best loss {best_loss}.")
     keep_all_best = config.keep_all_best
     keep_after = config.keep_after  # void if keep_all_best False
@@ -576,9 +520,10 @@ def main(args):  # pylint: disable=redefined-outer-name
                 phase = 0
             else:
                 phase = (
-                    len(config.phase_start_steps) -
-                    [i < global_step
-                     for i in config.phase_start_steps][::-1].index(True) - 1)
+                    len(config.phase_start_steps)
+                    - [i < global_step for i in config.phase_start_steps][::-1].index(True)
+                    - 1
+                )
         else:
             phase = None
         return phase
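The phase expression reformatted above is dense; a worked example with an assumed phase_start_steps shows what it computes: the index of the last training phase whose start step is already below global_step.

# Worked example of the reflowed expression (phase_start_steps values assumed).
phase_start_steps = [0, 10000, 40000]
global_step = 12000
phase = (
    len(phase_start_steps)
    - [i < global_step for i in phase_start_steps][::-1].index(True)
    - 1
)
# [True, True, False] reversed is [False, True, True]; first True is at 1,
# so phase = 3 - 1 - 1 = 1 (training is past step 10000 but not 40000).
print(phase)  # 1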
@@ -587,12 +532,10 @@ def main(args):  # pylint: disable=redefined-outer-name
         cur_phase = set_phase()
         print(f"\n > Current AlignTTS phase: {cur_phase}")
         c_logger.print_epoch_start(epoch, config.epochs)
-        train_avg_loss_dict, global_step = train(train_loader, model,
-                                                 criterion, optimizer,
-                                                 scheduler, ap, global_step,
-                                                 epoch, cur_phase)
-        eval_avg_loss_dict = evaluate(eval_loader, model, criterion, ap,
-                                      global_step, epoch, cur_phase)
+        train_avg_loss_dict, global_step = train(
+            train_loader, model, criterion, optimizer, scheduler, ap, global_step, epoch, cur_phase
+        )
+        eval_avg_loss_dict = evaluate(eval_loader, model, criterion, ap, global_step, epoch, cur_phase)
         c_logger.print_epoch_end(epoch, eval_avg_loss_dict)
         target_loss = train_avg_loss_dict["avg_loss"]
         if config.run_eval:
@@ -613,8 +556,7 @@ def main(args):  # pylint: disable=redefined-outer-name
 
 
 if __name__ == "__main__":
-    args, config, OUT_PATH, AUDIO_PATH, c_logger, tb_logger = init_training(
-        sys.argv)
+    args, config, OUT_PATH, AUDIO_PATH, c_logger, tb_logger = init_training(sys.argv)
 
     try:
         main(args)
@@ -1,10 +1,9 @@
-from TTS.config.shared_configs import *
-
 import json
 import os
 
 import yaml
 
+from TTS.config.shared_configs import *
 from TTS.utils.generic_utils import find_module
 
 
@@ -14,20 +14,12 @@ class AlignTTSConfig(BaseTTSConfig):
     hidden_channels: int = 256
     encoder_type: str = "fftransformer"
     encoder_params: dict = field(
-        default_factory=lambda: {
-            "hidden_channels_ffn": 1024,
-            "num_heads": 2,
-            "num_layers": 6,
-            "dropout_p": 0.1
-        })
+        default_factory=lambda: {"hidden_channels_ffn": 1024, "num_heads": 2, "num_layers": 6, "dropout_p": 0.1}
+    )
     decoder_type: str = "fftransformer"
     decoder_params: dict = field(
-        default_factory=lambda: {
-            "hidden_channels_ffn": 1024,
-            "num_heads": 2,
-            "num_layers": 6,
-            "dropout_p": 0.1
-        })
+        default_factory=lambda: {"hidden_channels_ffn": 1024, "num_heads": 2, "num_layers": 6, "dropout_p": 0.1}
+    )
     phase_start_steps: list = None
 
     ssim_alpha: float = 1.0
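Many hunks in these config files reshuffle field(default_factory=...) blocks. The construct itself is required rather than stylistic: dataclasses reject mutable defaults such as dicts, so each instance must build its own copy through a factory. A minimal sketch (the class and values here are illustrative, not from this commit):

from dataclasses import dataclass, field

@dataclass
class EncoderConfig:
    # A literal dict default would raise at class creation:
    # ValueError: mutable default <class 'dict'> for field params is not allowed
    params: dict = field(default_factory=lambda: {"num_heads": 2, "dropout_p": 0.1})

a, b = EncoderConfig(), EncoderConfig()
a.params["num_heads"] = 4
print(b.params["num_heads"])  # 2: each instance gets its own dict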
@@ -256,7 +256,7 @@ def synthesis(
     """
     # GST processing
     style_mel = None
-    if CONFIG.has('gst') and CONFIG.gst and style_wav is not None:
+    if CONFIG.has("gst") and CONFIG.gst and style_wav is not None:
         if isinstance(style_wav, dict):
             style_mel = style_wav
         else:
@@ -9,11 +9,11 @@ import re
 
 import torch
 
+from TTS.config import load_config
 from TTS.tts.utils.text.symbols import parse_symbols
 from TTS.utils.console_logger import ConsoleLogger
 from TTS.utils.generic_utils import create_experiment_folder, get_git_branch
 from TTS.utils.io import copy_model_files
-from TTS.config import load_config
 from TTS.utils.tensorboard_logger import TensorboardLogger
 
 
@@ -6,22 +6,18 @@ from .shared_configs import BaseGANVocoderConfig
 @dataclass
 class FullbandMelganConfig(BaseGANVocoderConfig):
     """Defines parameters for FullbandMelGAN vocoder."""
 
     model: str = "melgan"
 
     # Model specific params
     discriminator_model: str = "melgan_multiscale_discriminator"
     discriminator_model_params: dict = field(
-        default_factory=lambda: {
-            "base_channels": 16,
-            "max_channels": 512,
-            "downsample_factors": [4, 4, 4]
-        })
+        default_factory=lambda: {"base_channels": 16, "max_channels": 512, "downsample_factors": [4, 4, 4]}
+    )
     generator_model: str = "melgan_generator"
     generator_model_params: dict = field(
-        default_factory=lambda: {
-            "upsample_factors": [8, 8, 2, 2],
-            "num_res_blocks": 4
-        })
+        default_factory=lambda: {"upsample_factors": [8, 8, 2, 2], "num_res_blocks": 4}
+    )
 
     # Training - overrides
     batch_size: int = 16
@@ -42,8 +38,9 @@ class FullbandMelganConfig(BaseGANVocoderConfig):
         default_factory=lambda: {
             "n_ffts": [1024, 2048, 512],
             "hop_lengths": [120, 240, 50],
-            "win_lengths": [600, 1200, 240]
-        })
+            "win_lengths": [600, 1200, 240],
+        }
+    )
 
     # loss weights - overrides
     stft_loss_weight: float = 0.5
@@ -18,8 +18,9 @@ class HifiganConfig(BaseGANVocoderConfig):
             "upsample_initial_channel": 512,
             "resblock_kernel_sizes": [3, 7, 11],
             "resblock_dilation_sizes": [[1, 3, 5], [1, 3, 5], [1, 3, 5]],
-            "resblock_type": "1"
-        })
+            "resblock_type": "1",
+        }
+    )
 
     # LOSS PARAMETERS - overrides
     use_stft_loss: bool = False
@@ -45,9 +46,10 @@ class HifiganConfig(BaseGANVocoderConfig):
             "win_length": 1024,
             "n_mels": 80,
             "mel_fmin": 0.0,
-            "mel_fmax": None
-        })
+            "mel_fmax": None,
+        }
+    )
 
     # optimizer parameters
     lr: float = 1e-4
     wd: float = 1e-6
@@ -6,22 +6,18 @@ from .shared_configs import BaseGANVocoderConfig
 @dataclass
 class MelganConfig(BaseGANVocoderConfig):
     """Defines parameters for MelGAN vocoder."""
 
     model: str = "melgan"
 
     # Model specific params
     discriminator_model: str = "melgan_multiscale_discriminator"
     discriminator_model_params: dict = field(
-        default_factory=lambda: {
-            "base_channels": 16,
-            "max_channels": 1024,
-            "downsample_factors": [4, 4, 4, 4]
-        })
+        default_factory=lambda: {"base_channels": 16, "max_channels": 1024, "downsample_factors": [4, 4, 4, 4]}
+    )
     generator_model: str = "melgan_generator"
     generator_model_params: dict = field(
-        default_factory=lambda: {
-            "upsample_factors": [8, 8, 2, 2],
-            "num_res_blocks": 3
-        })
+        default_factory=lambda: {"upsample_factors": [8, 8, 2, 2], "num_res_blocks": 3}
+    )
 
     # Training - overrides
     batch_size: int = 16
@@ -42,8 +38,9 @@ class MelganConfig(BaseGANVocoderConfig):
         default_factory=lambda: {
             "n_ffts": [1024, 2048, 512],
             "hop_lengths": [120, 240, 50],
-            "win_lengths": [600, 1200, 240]
-        })
+            "win_lengths": [600, 1200, 240],
+        }
+    )
 
     # loss weights - overrides
     stft_loss_weight: float = 0.5
@@ -6,42 +6,31 @@ from .shared_configs import BaseGANVocoderConfig
 @dataclass
 class MultibandMelganConfig(BaseGANVocoderConfig):
     """Defines parameters for MultiBandMelGAN vocoder."""
 
     model: str = "multiband_melgan"
 
     # Model specific params
     discriminator_model: str = "melgan_multiscale_discriminator"
     discriminator_model_params: dict = field(
-        default_factory=lambda: {
-            "base_channels": 16,
-            "max_channels": 512,
-            "downsample_factors": [4, 4, 4]
-        })
+        default_factory=lambda: {"base_channels": 16, "max_channels": 512, "downsample_factors": [4, 4, 4]}
+    )
     generator_model: str = "multiband_melgan_generator"
-    generator_model_params: dict = field(
-        default_factory=lambda: {
-            "upsample_factors": [8, 4, 2],
-            "num_res_blocks": 4
-        })
+    generator_model_params: dict = field(default_factory=lambda: {"upsample_factors": [8, 4, 2], "num_res_blocks": 4})
     use_pqmf: bool = True
 
     # optimizer - overrides
     lr_gen: float = 0.0001  # Initial learning rate.
     lr_disc: float = 0.0001  # Initial learning rate.
     optimizer: str = "AdamW"
-    optimizer_params: dict = field(default_factory=lambda: {
-        "betas": [0.8, 0.99],
-        "weight_decay": 0.0
-    })
+    optimizer_params: dict = field(default_factory=lambda: {"betas": [0.8, 0.99], "weight_decay": 0.0})
     lr_scheduler_gen: str = "MultiStepLR"  # one of the schedulers from https://pytorch.org/docs/stable/optim.html
-    lr_scheduler_gen_params: dict = field(default_factory=lambda: {
-        "gamma": 0.5,
-        "milestones": [100000, 200000, 300000, 400000, 500000, 600000]
-    })
+    lr_scheduler_gen_params: dict = field(
+        default_factory=lambda: {"gamma": 0.5, "milestones": [100000, 200000, 300000, 400000, 500000, 600000]}
+    )
     lr_scheduler_disc: str = "MultiStepLR"  # one of the schedulers from https://pytorch.org/docs/stable/optim.html
-    lr_scheduler_disc_params: dict = field(default_factory=lambda: {
-        "gamma": 0.5,
-        "milestones": [100000, 200000, 300000, 400000, 500000, 600000]
-    })
+    lr_scheduler_disc_params: dict = field(
+        default_factory=lambda: {"gamma": 0.5, "milestones": [100000, 200000, 300000, 400000, 500000, 600000]}
+    )
 
     # Training - overrides
     batch_size: int = 64
@@ -60,11 +49,8 @@ class MultibandMelganConfig(BaseGANVocoderConfig):
     use_l1_spec_loss: bool = False
 
     subband_stft_loss_params: dict = field(
-        default_factory=lambda: {
-            "n_ffts": [384, 683, 171],
-            "hop_lengths": [30, 60, 10],
-            "win_lengths": [150, 300, 60]
-        })
+        default_factory=lambda: {"n_ffts": [384, 683, 171], "hop_lengths": [30, 60, 10], "win_lengths": [150, 300, 60]}
+    )
 
     # loss weights - overrides
     stft_loss_weight: float = 0.5
@@ -6,21 +6,16 @@ from .shared_configs import BaseGANVocoderConfig
 @dataclass
 class ParallelWaveganConfig(BaseGANVocoderConfig):
     """Defines parameters for ParallelWavegan vocoder."""
 
     model: str = "parallel_wavegan"
 
     # Model specific params
     discriminator_model: str = "parallel_wavegan_discriminator"
-    discriminator_model_params: dict = field(
-        default_factory=lambda: {
-            "num_layers": 10
-        })
+    discriminator_model_params: dict = field(default_factory=lambda: {"num_layers": 10})
     generator_model: str = "parallel_wavegan_generator"
     generator_model_params: dict = field(
-        default_factory=lambda: {
-            "upsample_factors":[4, 4, 4, 4],
-            "stacks": 3,
-            "num_res_blocks": 30
-        })
+        default_factory=lambda: {"upsample_factors": [4, 4, 4, 4], "stacks": 3, "num_res_blocks": 30}
+    )
 
     # Training - overrides
     batch_size: int = 6
@@ -42,8 +37,9 @@ class ParallelWaveganConfig(BaseGANVocoderConfig):
         default_factory=lambda: {
             "n_ffts": [1024, 2048, 512],
             "hop_lengths": [120, 240, 50],
-            "win_lengths": [600, 1200, 240]
-        })
+            "win_lengths": [600, 1200, 240],
+        }
+    )
 
     # loss weights - overrides
     stft_loss_weight: float = 0.5
@@ -57,17 +53,8 @@ class ParallelWaveganConfig(BaseGANVocoderConfig):
     lr_gen: float = 0.0002  # Initial learning rate.
     lr_disc: float = 0.0002  # Initial learning rate.
     optimizer: str = "AdamW"
-    optimizer_params: dict = field(default_factory=lambda: {
-        "betas": [0.8, 0.99],
-        "weight_decay": 0.0
-    })
+    optimizer_params: dict = field(default_factory=lambda: {"betas": [0.8, 0.99], "weight_decay": 0.0})
     lr_scheduler_gen: str = "ExponentialLR"  # one of the schedulers from https://pytorch.org/docs/stable/optim.html
-    lr_scheduler_gen_params: dict = field(default_factory=lambda: {
-        "gamma": 0.999,
-        "last_epoch": -1
-    })
+    lr_scheduler_gen_params: dict = field(default_factory=lambda: {"gamma": 0.999, "last_epoch": -1})
     lr_scheduler_disc: str = "ExponentialLR"  # one of the schedulers from https://pytorch.org/docs/stable/optim.html
-    lr_scheduler_disc_params: dict = field(default_factory=lambda: {
-        "gamma": 0.999,
-        "last_epoch": -1
-    })
+    lr_scheduler_disc_params: dict = field(default_factory=lambda: {"gamma": 0.999, "last_epoch": -1})
@@ -9,6 +9,7 @@ from TTS.config import BaseAudioConfig, BaseDatasetConfig, BaseTrainingConfig
 @dataclass
 class BaseVocoderConfig(BaseTrainingConfig):
     """Shared parameters among all the vocoder models."""
+
     audio: BaseAudioConfig = field(default_factory=BaseAudioConfig)
     # dataloading
     use_noise_augment: bool = False  # enable/disable random noise augmentation in spectrograms.
@@ -29,6 +30,7 @@ class BaseVocoderConfig(BaseTrainingConfig):
 @dataclass
 class BaseGANVocoderConfig(BaseVocoderConfig):
     """Common config interface for all the GAN based vocoder models."""
+
     # LOSS PARAMETERS
     use_stft_loss: bool = True
     use_subband_stft_loss: bool = True
@@ -49,8 +51,9 @@ class BaseGANVocoderConfig(BaseVocoderConfig):
         default_factory=lambda: {
             "n_ffts": [1024, 2048, 512],
             "hop_lengths": [120, 240, 50],
-            "win_lengths": [600, 1200, 240]
-        })
+            "win_lengths": [600, 1200, 240],
+        }
+    )
 
     l1_spec_loss_params: dict = field(
         default_factory=lambda: {
@@ -61,8 +64,9 @@ class BaseGANVocoderConfig(BaseVocoderConfig):
             "win_length": 1024,
             "n_mels": 80,
             "mel_fmin": 0.0,
-            "mel_fmax": None
-        })
+            "mel_fmax": None,
+        }
+    )
 
     target_loss: str = "avg_G_loss"  # loss value to pick the best model to save after each epoch
 
@@ -72,20 +76,11 @@ class BaseGANVocoderConfig(BaseVocoderConfig):
     lr_gen: float = 0.0002  # Initial learning rate.
     lr_disc: float = 0.0002  # Initial learning rate.
     optimizer: str = "AdamW"
-    optimizer_params: dict = field(default_factory=lambda: {
-        "betas": [0.8, 0.99],
-        "weight_decay": 0.0
-    })
+    optimizer_params: dict = field(default_factory=lambda: {"betas": [0.8, 0.99], "weight_decay": 0.0})
     lr_scheduler_gen: str = "ExponentialLR"  # one of the schedulers from https://pytorch.org/docs/stable/optim.html
-    lr_scheduler_gen_params: dict = field(default_factory=lambda: {
-        "gamma": 0.999,
-        "last_epoch": -1
-    })
+    lr_scheduler_gen_params: dict = field(default_factory=lambda: {"gamma": 0.999, "last_epoch": -1})
     lr_scheduler_disc: str = "ExponentialLR"  # one of the schedulers from https://pytorch.org/docs/stable/optim.html
-    lr_scheduler_disc_params: dict = field(default_factory=lambda: {
-        "gamma": 0.999,
-        "last_epoch": -1
-    })
+    lr_scheduler_disc_params: dict = field(default_factory=lambda: {"gamma": 0.999, "last_epoch": -1})
 
     use_pqmf: bool = False  # enable/disable using pqmf for multi-band training. (Multi-band MelGAN)
     steps_to_start_discriminator = 0  # start training the discriminator after this number of steps.
@@ -6,24 +6,22 @@ from .shared_configs import BaseVocoderConfig
 @dataclass
 class WavegradConfig(BaseVocoderConfig):
     """Defines parameters for Wavernn vocoder."""
-    model: str = 'wavegrad'
+
+    model: str = "wavegrad"
     # Model specific params
     generator_model: str = "wavegrad"
     model_params: dict = field(
         default_factory=lambda: {
-            "use_weight_norm":
-            True,
-            "y_conv_channels":
-            32,
-            "x_conv_channels":
-            768,
+            "use_weight_norm": True,
+            "y_conv_channels": 32,
+            "x_conv_channels": 768,
             "ublock_out_channels": [512, 512, 256, 128, 128],
             "dblock_out_channels": [128, 128, 256, 512],
             "upsample_factors": [4, 4, 4, 2, 2],
-            "upsample_dilations": [[1, 2, 1, 2], [1, 2, 1, 2], [1, 2, 4, 8],
-                                   [1, 2, 4, 8], [1, 2, 4, 8]]
-        })
-    target_loss: str = 'avg_wavegrad_loss'  # loss value to pick the best model to save after each epoch
+            "upsample_dilations": [[1, 2, 1, 2], [1, 2, 1, 2], [1, 2, 4, 8], [1, 2, 4, 8], [1, 2, 4, 8]],
+        }
+    )
+    target_loss: str = "avg_wavegrad_loss"  # loss value to pick the best model to save after each epoch
 
     # Training - overrides
     epochs: int = 10000
@@ -35,24 +33,20 @@ class WavegradConfig(BaseVocoderConfig):
     eval_split_size: int = 50
 
     # NOISE SCHEDULE PARAMS
-    train_noise_schedule: dict = field(default_factory=lambda: {
-        "min_val": 1e-6,
-        "max_val": 1e-2,
-        "num_steps": 1000
-    })
+    train_noise_schedule: dict = field(default_factory=lambda: {"min_val": 1e-6, "max_val": 1e-2, "num_steps": 1000})
 
-    test_noise_schedule: dict = field(default_factory=lambda: {  # inference noise schedule. Try TTS/bin/tune_wavegrad.py to find the optimal values.
-        "min_val": 1e-6,
-        "max_val": 1e-2,
-        "num_steps": 50
-    })
+    test_noise_schedule: dict = field(
+        default_factory=lambda: {  # inference noise schedule. Try TTS/bin/tune_wavegrad.py to find the optimal values.
+            "min_val": 1e-6,
+            "max_val": 1e-2,
+            "num_steps": 50,
+        }
+    )
 
     # optimizer overrides
     grad_clip: float = 1.0
     lr: float = 1e-4  # Initial learning rate.
     lr_scheduler: str = "MultiStepLR"  # one of the schedulers from https://pytorch.org/docs/stable/optim.html
     lr_scheduler_params: dict = field(
-        default_factory=lambda: {
-            "gamma": 0.5,
-            "milestones": [100000, 200000, 300000, 400000, 500000, 600000]
-        })
+        default_factory=lambda: {"gamma": 0.5, "milestones": [100000, 200000, 300000, 400000, 500000, 600000]}
+    )
@@ -6,10 +6,11 @@ from .shared_configs import BaseVocoderConfig
 @dataclass
 class WavernnConfig(BaseVocoderConfig):
     """Defines parameters for Wavernn vocoder."""
+
     model: str = "wavernn"
 
     # Model specific params
-    mode: str = 'mold'  # mold [string], gauss [string], bits [int]
+    mode: str = "mold"  # mold [string], gauss [string], bits [int]
     mulaw: bool = True  # apply mulaw if mode is bits
     generator_model: str = "WaveRNN"
     wavernn_model_params: dict = field(
@@ -21,9 +22,9 @@ class WavernnConfig(BaseVocoderConfig):
             "num_res_blocks": 10,
             "use_aux_net": True,
             "use_upsample_net": True,
-            "upsample_factors":
-            [4, 8, 8]  # this needs to correctly factorise hop_length
-        })
+            "upsample_factors": [4, 8, 8],  # this needs to correctly factorise hop_length
+        }
+    )
 
     # Inference
     batched: bool = True
@@ -46,7 +47,4 @@ class WavernnConfig(BaseVocoderConfig):
     grad_clip: float = 4.0
     lr: float = 1e-4  # Initial learning rate.
     lr_scheduler: str = "MultiStepLR"  # one of the schedulers from https://pytorch.org/docs/stable/optim.html
-    lr_scheduler_params: dict = field(default_factory=lambda: {
-        "gamma": 0.5,
-        "milestones": [200000, 400000, 600000]
-    })
+    lr_scheduler_params: dict = field(default_factory=lambda: {"gamma": 0.5, "milestones": [200000, 400000, 600000]})
@@ -21,16 +21,14 @@ config = FullbandMelganConfig(
     print_step=1,
     print_eval=True,
     data_path="tests/data/ljspeech",
-    output_path=output_path
+    output_path=output_path,
 )
 config.audio.do_trim_silence = True
 config.audio.trim_db = 60
 config.save_json(config_path)
 
 # train the model for one epoch
-command_train = (
-    f"CUDA_VISIBLE_DEVICES='' python TTS/bin/train_vocoder_gan.py --config_path {config_path} "
-)
+command_train = f"CUDA_VISIBLE_DEVICES='' python TTS/bin/train_vocoder_gan.py --config_path {config_path} "
 run_cli(command_train)
 
 # Find latest folder
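Each of the test hunks here follows the same shape: serialize a small config to disk, then drive the matching training script for one epoch on CPU. A condensed sketch of the pattern; run_cli is modeled on the tests' helper, and the assert-on-exit-status detail is an assumption:

import subprocess


def run_cli(command: str) -> None:
    # Assumed behavior of the tests' helper: shell out and fail on a
    # non-zero exit status.
    assert subprocess.call(command, shell=True) == 0, f"command failed: {command}"


config_path = "/tmp/test_vocoder_config.json"
# In the real tests a *Config dataclass is populated and written first:
# config.save_json(config_path)
run_cli(f"CUDA_VISIBLE_DEVICES='' python TTS/bin/train_vocoder_gan.py --config_path {config_path} ")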
@@ -22,16 +22,14 @@ config = HifiganConfig(
     print_step=1,
     print_eval=True,
     data_path="tests/data/ljspeech",
-    output_path=output_path
+    output_path=output_path,
 )
 config.audio.do_trim_silence = True
 config.audio.trim_db = 60
 config.save_json(config_path)
 
 # train the model for one epoch
-command_train = (
-    f"CUDA_VISIBLE_DEVICES='' python TTS/bin/train_vocoder_gan.py --config_path {config_path} "
-)
+command_train = f"CUDA_VISIBLE_DEVICES='' python TTS/bin/train_vocoder_gan.py --config_path {config_path} "
 run_cli(command_train)
 
 # Find latest folder
@@ -21,16 +21,14 @@ config = MelganConfig(
     print_step=1,
     print_eval=True,
     data_path="tests/data/ljspeech",
-    output_path=output_path
+    output_path=output_path,
 )
 config.audio.do_trim_silence = True
 config.audio.trim_db = 60
 config.save_json(config_path)
 
 # train the model for one epoch
-command_train = (
-    f"CUDA_VISIBLE_DEVICES='' python TTS/bin/train_vocoder_gan.py --config_path {config_path} "
-)
+command_train = f"CUDA_VISIBLE_DEVICES='' python TTS/bin/train_vocoder_gan.py --config_path {config_path} "
 run_cli(command_train)
 
 # Find latest folder
@@ -21,16 +21,14 @@ config = MultibandMelganConfig(
     print_step=1,
     print_eval=True,
     data_path="tests/data/ljspeech",
-    output_path=output_path
+    output_path=output_path,
 )
 config.audio.do_trim_silence = True
 config.audio.trim_db = 60
 config.save_json(config_path)
 
 # train the model for one epoch
-command_train = (
-    f"CUDA_VISIBLE_DEVICES='' python TTS/bin/train_vocoder_gan.py --config_path {config_path} "
-)
+command_train = f"CUDA_VISIBLE_DEVICES='' python TTS/bin/train_vocoder_gan.py --config_path {config_path} "
 run_cli(command_train)
 
 # Find latest folder
@@ -21,16 +21,14 @@ config = ParallelWaveganConfig(
     print_step=1,
     print_eval=True,
     data_path="tests/data/ljspeech",
-    output_path=output_path
+    output_path=output_path,
 )
 config.audio.do_trim_silence = True
 config.audio.trim_db = 60
 config.save_json(config_path)
 
 # train the model for one epoch
-command_train = (
-    f"CUDA_VISIBLE_DEVICES='' python TTS/bin/train_vocoder_gan.py --config_path {config_path} "
-)
+command_train = f"CUDA_VISIBLE_DEVICES='' python TTS/bin/train_vocoder_gan.py --config_path {config_path} "
 run_cli(command_train)
 
 # Find latest folder
@@ -21,16 +21,14 @@ config = WavegradConfig(
     print_step=1,
     print_eval=True,
     data_path="tests/data/ljspeech",
-    output_path=output_path
+    output_path=output_path,
 )
 config.audio.do_trim_silence = True
 config.audio.trim_db = 60
 config.save_json(config_path)
 
 # train the model for one epoch
-command_train = (
-    f"CUDA_VISIBLE_DEVICES='' python TTS/bin/train_vocoder_wavegrad.py --config_path {config_path} "
-)
+command_train = f"CUDA_VISIBLE_DEVICES='' python TTS/bin/train_vocoder_wavegrad.py --config_path {config_path} "
 run_cli(command_train)
 
 # Find latest folder
@@ -21,16 +21,14 @@ config = WavernnConfig(
     print_step=1,
     print_eval=True,
     data_path="tests/data/ljspeech",
-    output_path=output_path
+    output_path=output_path,
 )
 config.audio.do_trim_silence = True
 config.audio.trim_db = 60
 config.save_json(config_path)
 
 # train the model for one epoch
-command_train = (
-    f"CUDA_VISIBLE_DEVICES='' python TTS/bin/train_vocoder_wavernn.py --config_path {config_path} "
-)
+command_train = f"CUDA_VISIBLE_DEVICES='' python TTS/bin/train_vocoder_wavernn.py --config_path {config_path} "
 run_cli(command_train)
 
 # Find latest folder