mirror of https://github.com/coqui-ai/TTS.git
add initial wavernn support
parent
1a87ad82e3
commit
d6bd3cd8b8
|
@ -11,20 +11,27 @@ from TTS.tts.datasets.preprocess import load_meta_data
|
|||
from TTS.utils.io import load_config
|
||||
from TTS.utils.audio import AudioProcessor
|
||||
|
||||
|
||||
def main():
|
||||
"""Run preprocessing process."""
|
||||
parser = argparse.ArgumentParser(
|
||||
description="Compute mean and variance of spectrogtram features.")
|
||||
parser.add_argument("--config_path", type=str, required=True,
|
||||
help="TTS config file path to define audio processin parameters.")
|
||||
parser.add_argument("--out_path", default=None, type=str,
|
||||
help="directory to save the output file.")
|
||||
description="Compute mean and variance of spectrogtram features."
|
||||
)
|
||||
parser.add_argument(
|
||||
"--config_path",
|
||||
type=str,
|
||||
required=True,
|
||||
help="TTS config file path to define audio processin parameters.",
|
||||
)
|
||||
parser.add_argument(
|
||||
"--out_path", default=None, type=str, help="directory to save the output file."
|
||||
)
|
||||
args = parser.parse_args()
|
||||
|
||||
# load config
|
||||
CONFIG = load_config(args.config_path)
|
||||
CONFIG.audio['signal_norm'] = False # do not apply earlier normalization
|
||||
CONFIG.audio['stats_path'] = None # discard pre-defined stats
|
||||
CONFIG.audio["signal_norm"] = False # do not apply earlier normalization
|
||||
CONFIG.audio["stats_path"] = None # discard pre-defined stats
|
||||
|
||||
# load audio processor
|
||||
ap = AudioProcessor(**CONFIG.audio)
|
||||
|
@ -58,27 +65,27 @@ def main():
|
|||
|
||||
output_file_path = os.path.join(args.out_path, "scale_stats.npy")
|
||||
stats = {}
|
||||
stats['mel_mean'] = mel_mean
|
||||
stats['mel_std'] = mel_scale
|
||||
stats['linear_mean'] = linear_mean
|
||||
stats['linear_std'] = linear_scale
|
||||
stats["mel_mean"] = mel_mean
|
||||
stats["mel_std"] = mel_scale
|
||||
stats["linear_mean"] = linear_mean
|
||||
stats["linear_std"] = linear_scale
|
||||
|
||||
print(f' > Avg mel spec mean: {mel_mean.mean()}')
|
||||
print(f' > Avg mel spec scale: {mel_scale.mean()}')
|
||||
print(f' > Avg linear spec mean: {linear_mean.mean()}')
|
||||
print(f' > Avg lienar spec scale: {linear_scale.mean()}')
|
||||
print(f" > Avg mel spec mean: {mel_mean.mean()}")
|
||||
print(f" > Avg mel spec scale: {mel_scale.mean()}")
|
||||
print(f" > Avg linear spec mean: {linear_mean.mean()}")
|
||||
print(f" > Avg lienar spec scale: {linear_scale.mean()}")
|
||||
|
||||
# set default config values for mean-var scaling
|
||||
CONFIG.audio['stats_path'] = output_file_path
|
||||
CONFIG.audio['signal_norm'] = True
|
||||
CONFIG.audio["stats_path"] = output_file_path
|
||||
CONFIG.audio["signal_norm"] = True
|
||||
# remove redundant values
|
||||
del CONFIG.audio['max_norm']
|
||||
del CONFIG.audio['min_level_db']
|
||||
del CONFIG.audio['symmetric_norm']
|
||||
del CONFIG.audio['clip_norm']
|
||||
stats['audio_config'] = CONFIG.audio
|
||||
del CONFIG.audio["max_norm"]
|
||||
del CONFIG.audio["min_level_db"]
|
||||
del CONFIG.audio["symmetric_norm"]
|
||||
del CONFIG.audio["clip_norm"]
|
||||
stats["audio_config"] = CONFIG.audio
|
||||
np.save(output_file_path, stats, allow_pickle=True)
|
||||
print(f' > scale_stats.npy is saved to {output_file_path}')
|
||||
print(f" > scale_stats.npy is saved to {output_file_path}")
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
|
|
|
@ -10,20 +10,29 @@ import torch
|
|||
from torch.utils.data import DataLoader
|
||||
from TTS.utils.audio import AudioProcessor
|
||||
from TTS.utils.console_logger import ConsoleLogger
|
||||
from TTS.utils.generic_utils import (KeepAverage, count_parameters,
|
||||
create_experiment_folder, get_git_branch,
|
||||
remove_experiment_folder, set_init_dict)
|
||||
from TTS.utils.generic_utils import (
|
||||
KeepAverage,
|
||||
count_parameters,
|
||||
create_experiment_folder,
|
||||
get_git_branch,
|
||||
remove_experiment_folder,
|
||||
set_init_dict,
|
||||
)
|
||||
from TTS.utils.io import copy_config_file, load_config
|
||||
from TTS.utils.radam import RAdam
|
||||
from TTS.utils.tensorboard_logger import TensorboardLogger
|
||||
from TTS.utils.training import setup_torch_training_env
|
||||
from TTS.vocoder.datasets.gan_dataset import GANDataset
|
||||
from TTS.vocoder.datasets.preprocess import load_wav_data, load_wav_feat_data
|
||||
|
||||
# from distribute import (DistributedSampler, apply_gradient_allreduce,
|
||||
# init_distributed, reduce_tensor)
|
||||
from TTS.vocoder.layers.losses import DiscriminatorLoss, GeneratorLoss
|
||||
from TTS.vocoder.utils.generic_utils import (plot_results, setup_discriminator,
|
||||
setup_generator)
|
||||
from TTS.vocoder.utils.generic_utils import (
|
||||
plot_results,
|
||||
setup_discriminator,
|
||||
setup_generator,
|
||||
)
|
||||
from TTS.vocoder.utils.io import save_best_model, save_checkpoint
|
||||
|
||||
use_cuda, num_gpus = setup_torch_training_env(True, True)
|
||||
|
@ -33,27 +42,30 @@ def setup_loader(ap, is_val=False, verbose=False):
|
|||
if is_val and not c.run_eval:
|
||||
loader = None
|
||||
else:
|
||||
dataset = GANDataset(ap=ap,
|
||||
items=eval_data if is_val else train_data,
|
||||
seq_len=c.seq_len,
|
||||
hop_len=ap.hop_length,
|
||||
pad_short=c.pad_short,
|
||||
conv_pad=c.conv_pad,
|
||||
is_training=not is_val,
|
||||
return_segments=not is_val,
|
||||
use_noise_augment=c.use_noise_augment,
|
||||
use_cache=c.use_cache,
|
||||
verbose=verbose)
|
||||
dataset = GANDataset(
|
||||
ap=ap,
|
||||
items=eval_data if is_val else train_data,
|
||||
seq_len=c.seq_len,
|
||||
hop_len=ap.hop_length,
|
||||
pad_short=c.pad_short,
|
||||
conv_pad=c.conv_pad,
|
||||
is_training=not is_val,
|
||||
return_segments=not is_val,
|
||||
use_noise_augment=c.use_noise_augment,
|
||||
use_cache=c.use_cache,
|
||||
verbose=verbose,
|
||||
)
|
||||
dataset.shuffle_mapping()
|
||||
# sampler = DistributedSampler(dataset) if num_gpus > 1 else None
|
||||
loader = DataLoader(dataset,
|
||||
batch_size=1 if is_val else c.batch_size,
|
||||
shuffle=True,
|
||||
drop_last=False,
|
||||
sampler=None,
|
||||
num_workers=c.num_val_loader_workers
|
||||
if is_val else c.num_loader_workers,
|
||||
pin_memory=False)
|
||||
loader = DataLoader(
|
||||
dataset,
|
||||
batch_size=1 if is_val else c.batch_size,
|
||||
shuffle=True,
|
||||
drop_last=False,
|
||||
sampler=None,
|
||||
num_workers=c.num_val_loader_workers if is_val else c.num_loader_workers,
|
||||
pin_memory=False,
|
||||
)
|
||||
return loader
|
||||
|
||||
|
||||
|
@ -80,16 +92,26 @@ def format_data(data):
|
|||
return co, x, None, None
|
||||
|
||||
|
||||
def train(model_G, criterion_G, optimizer_G, model_D, criterion_D, optimizer_D,
|
||||
scheduler_G, scheduler_D, ap, global_step, epoch):
|
||||
def train(
|
||||
model_G,
|
||||
criterion_G,
|
||||
optimizer_G,
|
||||
model_D,
|
||||
criterion_D,
|
||||
optimizer_D,
|
||||
scheduler_G,
|
||||
scheduler_D,
|
||||
ap,
|
||||
global_step,
|
||||
epoch,
|
||||
):
|
||||
data_loader = setup_loader(ap, is_val=False, verbose=(epoch == 0))
|
||||
model_G.train()
|
||||
model_D.train()
|
||||
epoch_time = 0
|
||||
keep_avg = KeepAverage()
|
||||
if use_cuda:
|
||||
batch_n_iter = int(
|
||||
len(data_loader.dataset) / (c.batch_size * num_gpus))
|
||||
batch_n_iter = int(len(data_loader.dataset) / (c.batch_size * num_gpus))
|
||||
else:
|
||||
batch_n_iter = int(len(data_loader.dataset) / c.batch_size)
|
||||
end_time = time.time()
|
||||
|
@ -145,16 +167,16 @@ def train(model_G, criterion_G, optimizer_G, model_D, criterion_D, optimizer_D,
|
|||
scores_fake = D_out_fake
|
||||
|
||||
# compute losses
|
||||
loss_G_dict = criterion_G(y_hat, y_G, scores_fake, feats_fake,
|
||||
feats_real, y_hat_sub, y_G_sub)
|
||||
loss_G = loss_G_dict['G_loss']
|
||||
loss_G_dict = criterion_G(
|
||||
y_hat, y_G, scores_fake, feats_fake, feats_real, y_hat_sub, y_G_sub
|
||||
)
|
||||
loss_G = loss_G_dict["G_loss"]
|
||||
|
||||
# optimizer generator
|
||||
optimizer_G.zero_grad()
|
||||
loss_G.backward()
|
||||
if c.gen_clip_grad > 0:
|
||||
torch.nn.utils.clip_grad_norm_(model_G.parameters(),
|
||||
c.gen_clip_grad)
|
||||
torch.nn.utils.clip_grad_norm_(model_G.parameters(), c.gen_clip_grad)
|
||||
optimizer_G.step()
|
||||
if scheduler_G is not None:
|
||||
scheduler_G.step()
|
||||
|
@ -199,14 +221,13 @@ def train(model_G, criterion_G, optimizer_G, model_D, criterion_D, optimizer_D,
|
|||
|
||||
# compute losses
|
||||
loss_D_dict = criterion_D(scores_fake, scores_real)
|
||||
loss_D = loss_D_dict['D_loss']
|
||||
loss_D = loss_D_dict["D_loss"]
|
||||
|
||||
# optimizer discriminator
|
||||
optimizer_D.zero_grad()
|
||||
loss_D.backward()
|
||||
if c.disc_clip_grad > 0:
|
||||
torch.nn.utils.clip_grad_norm_(model_D.parameters(),
|
||||
c.disc_clip_grad)
|
||||
torch.nn.utils.clip_grad_norm_(model_D.parameters(), c.disc_clip_grad)
|
||||
optimizer_D.step()
|
||||
if scheduler_D is not None:
|
||||
scheduler_D.step()
|
||||
|
@ -221,34 +242,40 @@ def train(model_G, criterion_G, optimizer_G, model_D, criterion_D, optimizer_D,
|
|||
epoch_time += step_time
|
||||
|
||||
# get current learning rates
|
||||
current_lr_G = list(optimizer_G.param_groups)[0]['lr']
|
||||
current_lr_D = list(optimizer_D.param_groups)[0]['lr']
|
||||
current_lr_G = list(optimizer_G.param_groups)[0]["lr"]
|
||||
current_lr_D = list(optimizer_D.param_groups)[0]["lr"]
|
||||
|
||||
# update avg stats
|
||||
update_train_values = dict()
|
||||
for key, value in loss_dict.items():
|
||||
update_train_values['avg_' + key] = value
|
||||
update_train_values['avg_loader_time'] = loader_time
|
||||
update_train_values['avg_step_time'] = step_time
|
||||
update_train_values["avg_" + key] = value
|
||||
update_train_values["avg_loader_time"] = loader_time
|
||||
update_train_values["avg_step_time"] = step_time
|
||||
keep_avg.update_values(update_train_values)
|
||||
|
||||
# print training stats
|
||||
if global_step % c.print_step == 0:
|
||||
log_dict = {
|
||||
'step_time': [step_time, 2],
|
||||
'loader_time': [loader_time, 4],
|
||||
"step_time": [step_time, 2],
|
||||
"loader_time": [loader_time, 4],
|
||||
"current_lr_G": current_lr_G,
|
||||
"current_lr_D": current_lr_D
|
||||
"current_lr_D": current_lr_D,
|
||||
}
|
||||
c_logger.print_train_step(batch_n_iter, num_iter, global_step,
|
||||
log_dict, loss_dict, keep_avg.avg_values)
|
||||
c_logger.print_train_step(
|
||||
batch_n_iter,
|
||||
num_iter,
|
||||
global_step,
|
||||
log_dict,
|
||||
loss_dict,
|
||||
keep_avg.avg_values,
|
||||
)
|
||||
|
||||
# plot step stats
|
||||
if global_step % 10 == 0:
|
||||
iter_stats = {
|
||||
"lr_G": current_lr_G,
|
||||
"lr_D": current_lr_D,
|
||||
"step_time": step_time
|
||||
"step_time": step_time,
|
||||
}
|
||||
iter_stats.update(loss_dict)
|
||||
tb_logger.tb_train_iter_stats(global_step, iter_stats)
|
||||
|
@ -257,27 +284,28 @@ def train(model_G, criterion_G, optimizer_G, model_D, criterion_D, optimizer_D,
|
|||
if global_step % c.save_step == 0:
|
||||
if c.checkpoint:
|
||||
# save model
|
||||
save_checkpoint(model_G,
|
||||
optimizer_G,
|
||||
scheduler_G,
|
||||
model_D,
|
||||
optimizer_D,
|
||||
scheduler_D,
|
||||
global_step,
|
||||
epoch,
|
||||
OUT_PATH,
|
||||
model_losses=loss_dict)
|
||||
save_checkpoint(
|
||||
model_G,
|
||||
optimizer_G,
|
||||
scheduler_G,
|
||||
model_D,
|
||||
optimizer_D,
|
||||
scheduler_D,
|
||||
global_step,
|
||||
epoch,
|
||||
OUT_PATH,
|
||||
model_losses=loss_dict,
|
||||
)
|
||||
|
||||
# compute spectrograms
|
||||
figures = plot_results(y_hat_vis, y_G, ap, global_step,
|
||||
'train')
|
||||
figures = plot_results(y_hat_vis, y_G, ap, global_step, "train")
|
||||
tb_logger.tb_train_figures(global_step, figures)
|
||||
|
||||
# Sample audio
|
||||
sample_voice = y_hat_vis[0].squeeze(0).detach().cpu().numpy()
|
||||
tb_logger.tb_train_audios(global_step,
|
||||
{'train/audio': sample_voice},
|
||||
c.audio["sample_rate"])
|
||||
tb_logger.tb_train_audios(
|
||||
global_step, {"train/audio": sample_voice}, c.audio["sample_rate"]
|
||||
)
|
||||
end_time = time.time()
|
||||
|
||||
# print epoch stats
|
||||
|
@ -326,7 +354,6 @@ def evaluate(model_G, criterion_G, model_D, criterion_D, ap, global_step, epoch)
|
|||
y_hat = model_G.pqmf_synthesis(y_hat)
|
||||
y_G_sub = model_G.pqmf_analysis(y_G)
|
||||
|
||||
|
||||
scores_fake, feats_fake, feats_real = None, None, None
|
||||
if global_step > c.steps_to_start_discriminator:
|
||||
|
||||
|
@ -352,8 +379,9 @@ def evaluate(model_G, criterion_G, model_D, criterion_D, ap, global_step, epoch)
|
|||
feats_fake, feats_real = None, None
|
||||
|
||||
# compute losses
|
||||
loss_G_dict = criterion_G(y_hat, y_G, scores_fake, feats_fake,
|
||||
feats_real, y_hat_sub, y_G_sub)
|
||||
loss_G_dict = criterion_G(
|
||||
y_hat, y_G, scores_fake, feats_fake, feats_real, y_hat_sub, y_G_sub
|
||||
)
|
||||
|
||||
loss_dict = dict()
|
||||
for key, value in loss_G_dict.items():
|
||||
|
@ -403,16 +431,15 @@ def evaluate(model_G, criterion_G, model_D, criterion_D, ap, global_step, epoch)
|
|||
else:
|
||||
loss_dict[key] = value.item()
|
||||
|
||||
|
||||
step_time = time.time() - start_time
|
||||
epoch_time += step_time
|
||||
|
||||
# update avg stats
|
||||
update_eval_values = dict()
|
||||
for key, value in loss_dict.items():
|
||||
update_eval_values['avg_' + key] = value
|
||||
update_eval_values['avg_loader_time'] = loader_time
|
||||
update_eval_values['avg_step_time'] = step_time
|
||||
update_eval_values["avg_" + key] = value
|
||||
update_eval_values["avg_loader_time"] = loader_time
|
||||
update_eval_values["avg_step_time"] = step_time
|
||||
keep_avg.update_values(update_eval_values)
|
||||
|
||||
# print eval stats
|
||||
|
@ -420,13 +447,14 @@ def evaluate(model_G, criterion_G, model_D, criterion_D, ap, global_step, epoch)
|
|||
c_logger.print_eval_step(num_iter, loss_dict, keep_avg.avg_values)
|
||||
|
||||
# compute spectrograms
|
||||
figures = plot_results(y_hat, y_G, ap, global_step, 'eval')
|
||||
figures = plot_results(y_hat, y_G, ap, global_step, "eval")
|
||||
tb_logger.tb_eval_figures(global_step, figures)
|
||||
|
||||
# Sample audio
|
||||
sample_voice = y_hat[0].squeeze(0).detach().cpu().numpy()
|
||||
tb_logger.tb_eval_audios(global_step, {'eval/audio': sample_voice},
|
||||
c.audio["sample_rate"])
|
||||
tb_logger.tb_eval_audios(
|
||||
global_step, {"eval/audio": sample_voice}, c.audio["sample_rate"]
|
||||
)
|
||||
|
||||
# synthesize a full voice
|
||||
data_loader.return_segments = False
|
||||
|
@ -443,7 +471,9 @@ def main(args): # pylint: disable=redefined-outer-name
|
|||
print(f" > Loading wavs from: {c.data_path}")
|
||||
if c.feature_path is not None:
|
||||
print(f" > Loading features from: {c.feature_path}")
|
||||
eval_data, train_data = load_wav_feat_data(c.data_path, c.feature_path, c.eval_split_size)
|
||||
eval_data, train_data = load_wav_feat_data(
|
||||
c.data_path, c.feature_path, c.eval_split_size
|
||||
)
|
||||
else:
|
||||
eval_data, train_data = load_wav_data(c.data_path, c.eval_split_size)
|
||||
|
||||
|
@ -461,17 +491,15 @@ def main(args): # pylint: disable=redefined-outer-name
|
|||
|
||||
# setup optimizers
|
||||
optimizer_gen = RAdam(model_gen.parameters(), lr=c.lr_gen, weight_decay=0)
|
||||
optimizer_disc = RAdam(model_disc.parameters(),
|
||||
lr=c.lr_disc,
|
||||
weight_decay=0)
|
||||
optimizer_disc = RAdam(model_disc.parameters(), lr=c.lr_disc, weight_decay=0)
|
||||
|
||||
# schedulers
|
||||
scheduler_gen = None
|
||||
scheduler_disc = None
|
||||
if 'lr_scheduler_gen' in c:
|
||||
if "lr_scheduler_gen" in c:
|
||||
scheduler_gen = getattr(torch.optim.lr_scheduler, c.lr_scheduler_gen)
|
||||
scheduler_gen = scheduler_gen(optimizer_gen, **c.lr_scheduler_gen_params)
|
||||
if 'lr_scheduler_disc' in c:
|
||||
if "lr_scheduler_disc" in c:
|
||||
scheduler_disc = getattr(torch.optim.lr_scheduler, c.lr_scheduler_disc)
|
||||
scheduler_disc = scheduler_disc(optimizer_disc, **c.lr_scheduler_disc_params)
|
||||
|
||||
|
@ -480,47 +508,46 @@ def main(args): # pylint: disable=redefined-outer-name
|
|||
criterion_disc = DiscriminatorLoss(c)
|
||||
|
||||
if args.restore_path:
|
||||
checkpoint = torch.load(args.restore_path, map_location='cpu')
|
||||
checkpoint = torch.load(args.restore_path, map_location="cpu")
|
||||
try:
|
||||
print(" > Restoring Generator Model...")
|
||||
model_gen.load_state_dict(checkpoint['model'])
|
||||
model_gen.load_state_dict(checkpoint["model"])
|
||||
print(" > Restoring Generator Optimizer...")
|
||||
optimizer_gen.load_state_dict(checkpoint['optimizer'])
|
||||
optimizer_gen.load_state_dict(checkpoint["optimizer"])
|
||||
print(" > Restoring Discriminator Model...")
|
||||
model_disc.load_state_dict(checkpoint['model_disc'])
|
||||
model_disc.load_state_dict(checkpoint["model_disc"])
|
||||
print(" > Restoring Discriminator Optimizer...")
|
||||
optimizer_disc.load_state_dict(checkpoint['optimizer_disc'])
|
||||
if 'scheduler' in checkpoint:
|
||||
optimizer_disc.load_state_dict(checkpoint["optimizer_disc"])
|
||||
if "scheduler" in checkpoint:
|
||||
print(" > Restoring Generator LR Scheduler...")
|
||||
scheduler_gen.load_state_dict(checkpoint['scheduler'])
|
||||
scheduler_gen.load_state_dict(checkpoint["scheduler"])
|
||||
# NOTE: Not sure if necessary
|
||||
scheduler_gen.optimizer = optimizer_gen
|
||||
if 'scheduler_disc' in checkpoint:
|
||||
if "scheduler_disc" in checkpoint:
|
||||
print(" > Restoring Discriminator LR Scheduler...")
|
||||
scheduler_disc.load_state_dict(checkpoint['scheduler_disc'])
|
||||
scheduler_disc.load_state_dict(checkpoint["scheduler_disc"])
|
||||
scheduler_disc.optimizer = optimizer_disc
|
||||
except RuntimeError:
|
||||
# restore only matching layers.
|
||||
print(" > Partial model initialization...")
|
||||
model_dict = model_gen.state_dict()
|
||||
model_dict = set_init_dict(model_dict, checkpoint['model'], c)
|
||||
model_dict = set_init_dict(model_dict, checkpoint["model"], c)
|
||||
model_gen.load_state_dict(model_dict)
|
||||
|
||||
model_dict = model_disc.state_dict()
|
||||
model_dict = set_init_dict(model_dict, checkpoint['model_disc'], c)
|
||||
model_dict = set_init_dict(model_dict, checkpoint["model_disc"], c)
|
||||
model_disc.load_state_dict(model_dict)
|
||||
del model_dict
|
||||
|
||||
# reset lr if not continuing training.
|
||||
for group in optimizer_gen.param_groups:
|
||||
group['lr'] = c.lr_gen
|
||||
group["lr"] = c.lr_gen
|
||||
|
||||
for group in optimizer_disc.param_groups:
|
||||
group['lr'] = c.lr_disc
|
||||
group["lr"] = c.lr_disc
|
||||
|
||||
print(" > Model restored from step %d" % checkpoint['step'],
|
||||
flush=True)
|
||||
args.restore_step = checkpoint['step']
|
||||
print(" > Model restored from step %d" % checkpoint["step"], flush=True)
|
||||
args.restore_step = checkpoint["step"]
|
||||
else:
|
||||
args.restore_step = 0
|
||||
|
||||
|
@ -539,75 +566,92 @@ def main(args): # pylint: disable=redefined-outer-name
|
|||
num_params = count_parameters(model_disc)
|
||||
print(" > Discriminator has {} parameters".format(num_params), flush=True)
|
||||
|
||||
if 'best_loss' not in locals():
|
||||
best_loss = float('inf')
|
||||
if "best_loss" not in locals():
|
||||
best_loss = float("inf")
|
||||
|
||||
global_step = args.restore_step
|
||||
for epoch in range(0, c.epochs):
|
||||
c_logger.print_epoch_start(epoch, c.epochs)
|
||||
_, global_step = train(model_gen, criterion_gen, optimizer_gen,
|
||||
model_disc, criterion_disc, optimizer_disc,
|
||||
scheduler_gen, scheduler_disc, ap, global_step,
|
||||
epoch)
|
||||
eval_avg_loss_dict = evaluate(model_gen, criterion_gen, model_disc, criterion_disc, ap,
|
||||
global_step, epoch)
|
||||
_, global_step = train(
|
||||
model_gen,
|
||||
criterion_gen,
|
||||
optimizer_gen,
|
||||
model_disc,
|
||||
criterion_disc,
|
||||
optimizer_disc,
|
||||
scheduler_gen,
|
||||
scheduler_disc,
|
||||
ap,
|
||||
global_step,
|
||||
epoch,
|
||||
)
|
||||
eval_avg_loss_dict = evaluate(
|
||||
model_gen, criterion_gen, model_disc, criterion_disc, ap, global_step, epoch
|
||||
)
|
||||
c_logger.print_epoch_end(epoch, eval_avg_loss_dict)
|
||||
target_loss = eval_avg_loss_dict[c.target_loss]
|
||||
best_loss = save_best_model(target_loss,
|
||||
best_loss,
|
||||
model_gen,
|
||||
optimizer_gen,
|
||||
scheduler_gen,
|
||||
model_disc,
|
||||
optimizer_disc,
|
||||
scheduler_disc,
|
||||
global_step,
|
||||
epoch,
|
||||
OUT_PATH,
|
||||
model_losses=eval_avg_loss_dict)
|
||||
best_loss = save_best_model(
|
||||
target_loss,
|
||||
best_loss,
|
||||
model_gen,
|
||||
optimizer_gen,
|
||||
scheduler_gen,
|
||||
model_disc,
|
||||
optimizer_disc,
|
||||
scheduler_disc,
|
||||
global_step,
|
||||
epoch,
|
||||
OUT_PATH,
|
||||
model_losses=eval_avg_loss_dict,
|
||||
)
|
||||
|
||||
|
||||
if __name__ == '__main__':
|
||||
if __name__ == "__main__":
|
||||
parser = argparse.ArgumentParser()
|
||||
parser.add_argument(
|
||||
'--continue_path',
|
||||
"--continue_path",
|
||||
type=str,
|
||||
help=
|
||||
'Training output folder to continue training. Use to continue a training. If it is used, "config_path" is ignored.',
|
||||
default='',
|
||||
required='--config_path' not in sys.argv)
|
||||
help='Training output folder to continue training. Use to continue a training. If it is used, "config_path" is ignored.',
|
||||
default="",
|
||||
required="--config_path" not in sys.argv,
|
||||
)
|
||||
parser.add_argument(
|
||||
'--restore_path',
|
||||
"--restore_path",
|
||||
type=str,
|
||||
help='Model file to be restored. Use to finetune a model.',
|
||||
default='')
|
||||
parser.add_argument('--config_path',
|
||||
type=str,
|
||||
help='Path to config file for training.',
|
||||
required='--continue_path' not in sys.argv)
|
||||
parser.add_argument('--debug',
|
||||
type=bool,
|
||||
default=False,
|
||||
help='Do not verify commit integrity to run training.')
|
||||
help="Model file to be restored. Use to finetune a model.",
|
||||
default="",
|
||||
)
|
||||
parser.add_argument(
|
||||
"--config_path",
|
||||
type=str,
|
||||
help="Path to config file for training.",
|
||||
required="--continue_path" not in sys.argv,
|
||||
)
|
||||
parser.add_argument(
|
||||
"--debug",
|
||||
type=bool,
|
||||
default=False,
|
||||
help="Do not verify commit integrity to run training.",
|
||||
)
|
||||
|
||||
# DISTRIBUTED
|
||||
parser.add_argument(
|
||||
'--rank',
|
||||
"--rank",
|
||||
type=int,
|
||||
default=0,
|
||||
help='DISTRIBUTED: process rank for distributed training.')
|
||||
parser.add_argument('--group_id',
|
||||
type=str,
|
||||
default="",
|
||||
help='DISTRIBUTED: process group id.')
|
||||
help="DISTRIBUTED: process rank for distributed training.",
|
||||
)
|
||||
parser.add_argument(
|
||||
"--group_id", type=str, default="", help="DISTRIBUTED: process group id."
|
||||
)
|
||||
args = parser.parse_args()
|
||||
|
||||
if args.continue_path != '':
|
||||
if args.continue_path != "":
|
||||
args.output_path = args.continue_path
|
||||
args.config_path = os.path.join(args.continue_path, 'config.json')
|
||||
args.config_path = os.path.join(args.continue_path, "config.json")
|
||||
list_of_files = glob.glob(
|
||||
args.continue_path +
|
||||
"/*.pth.tar") # * means all if need specific format then *.csv
|
||||
args.continue_path + "/*.pth.tar"
|
||||
) # * means all if need specific format then *.csv
|
||||
latest_model_file = max(list_of_files, key=os.path.getctime)
|
||||
args.restore_path = latest_model_file
|
||||
print(f" > Training continues for {args.restore_path}")
|
||||
|
@ -618,11 +662,10 @@ if __name__ == '__main__':
|
|||
_ = os.path.dirname(os.path.realpath(__file__))
|
||||
|
||||
OUT_PATH = args.continue_path
|
||||
if args.continue_path == '':
|
||||
OUT_PATH = create_experiment_folder(c.output_path, c.run_name,
|
||||
args.debug)
|
||||
if args.continue_path == "":
|
||||
OUT_PATH = create_experiment_folder(c.output_path, c.run_name, args.debug)
|
||||
|
||||
AUDIO_PATH = os.path.join(OUT_PATH, 'test_audios')
|
||||
AUDIO_PATH = os.path.join(OUT_PATH, "test_audios")
|
||||
|
||||
c_logger = ConsoleLogger()
|
||||
|
||||
|
@ -632,16 +675,17 @@ if __name__ == '__main__':
|
|||
if args.restore_path:
|
||||
new_fields["restore_path"] = args.restore_path
|
||||
new_fields["github_branch"] = get_git_branch()
|
||||
copy_config_file(args.config_path,
|
||||
os.path.join(OUT_PATH, 'config.json'), new_fields)
|
||||
copy_config_file(
|
||||
args.config_path, os.path.join(OUT_PATH, "config.json"), new_fields
|
||||
)
|
||||
os.chmod(AUDIO_PATH, 0o775)
|
||||
os.chmod(OUT_PATH, 0o775)
|
||||
|
||||
LOG_DIR = OUT_PATH
|
||||
tb_logger = TensorboardLogger(LOG_DIR, model_name='VOCODER')
|
||||
tb_logger = TensorboardLogger(LOG_DIR, model_name="VOCODER")
|
||||
|
||||
# write model desc to tensorboard
|
||||
tb_logger.tb_add_text('model-description', c['run_description'], 0)
|
||||
tb_logger.tb_add_text("model-description", c["run_description"], 0)
|
||||
|
||||
try:
|
||||
main(args)
|
||||
|
@ -654,4 +698,4 @@ if __name__ == '__main__':
|
|||
except Exception: # pylint: disable=broad-except
|
||||
remove_experiment_folder(OUT_PATH)
|
||||
traceback.print_exc()
|
||||
sys.exit(1)
|
||||
sys.exit(1)
|
|
@ -0,0 +1,493 @@
|
|||
import argparse
|
||||
import math
|
||||
import os
|
||||
import pickle
|
||||
import shutil
|
||||
import sys
|
||||
import traceback
|
||||
import time
|
||||
import glob
|
||||
import random
|
||||
|
||||
import torch
|
||||
from torch.utils.data import DataLoader
|
||||
from torch.utils.data.distributed import DistributedSampler
|
||||
|
||||
|
||||
from TTS.utils.audio import AudioProcessor
|
||||
from TTS.tts.utils.visual import plot_spectrogram
|
||||
from TTS.utils.io import copy_config_file, load_config
|
||||
from TTS.vocoder.datasets.wavernn_dataset import WaveRNNDataset
|
||||
from TTS.utils.tensorboard_logger import TensorboardLogger
|
||||
from TTS.vocoder.datasets.preprocess import load_wav_data, load_wav_feat_data
|
||||
from TTS.vocoder.utils.distribution import discretized_mix_logistic_loss, gaussian_loss
|
||||
from TTS.vocoder.utils.generic_utils import setup_wavernn
|
||||
from TTS.utils.training import setup_torch_training_env
|
||||
from TTS.utils.console_logger import ConsoleLogger
|
||||
from TTS.utils.generic_utils import (
|
||||
KeepAverage,
|
||||
count_parameters,
|
||||
create_experiment_folder,
|
||||
get_git_branch,
|
||||
remove_experiment_folder,
|
||||
set_init_dict,
|
||||
)
|
||||
from TTS.vocoder.utils.io import save_best_model, save_checkpoint
|
||||
|
||||
|
||||
# Module-level device flags: `use_cuda` gates the GPU transfer in format_data and,
# together with `num_gpus`, the per-epoch iteration estimate in train.
# NOTE(review): the meaning of the two positional `True` arguments is not visible
# here — confirm against TTS.utils.training.setup_torch_training_env.
use_cuda, num_gpus = setup_torch_training_env(True, True)
|
||||
|
||||
|
||||
def setup_loader(ap, is_val=False, verbose=False):
    """Build the DataLoader for the training or the evaluation split.

    Args:
        ap: audio processor; handed to the dataset and read for ``hop_length``.
        is_val: when True, use ``eval_data`` and the validation worker count;
            otherwise use ``train_data`` and the training worker count.
        verbose: forwarded to the dataset constructor.

    Returns:
        A ``DataLoader`` over a ``WaveRNNDataset``, or ``None`` when a
        validation loader is requested but ``CONFIG.run_eval`` is falsy.
    """
    # Evaluation disabled in the config: there is nothing to load.
    if is_val and not CONFIG.run_eval:
        return None

    split_items = eval_data if is_val else train_data
    wavernn_dataset = WaveRNNDataset(
        ap=ap,
        items=split_items,
        seq_len=CONFIG.seq_len,
        hop_len=ap.hop_length,
        pad=CONFIG.padding,
        mode=CONFIG.mode,
        is_training=not is_val,
        verbose=verbose,
    )
    # sampler = DistributedSampler(dataset) if num_gpus > 1 else None
    if is_val:
        worker_count = CONFIG.num_val_loader_workers
    else:
        worker_count = CONFIG.num_loader_workers
    return DataLoader(
        wavernn_dataset,
        shuffle=True,
        collate_fn=wavernn_dataset.collate,
        batch_size=CONFIG.batch_size,
        num_workers=worker_count,
        pin_memory=True,
    )
|
||||
|
||||
|
||||
def format_data(data):
    """Unpack one batch and move it to the GPU when CUDA is enabled.

    Args:
        data: indexable batch whose first three entries are used. In this
            file the first two are passed to the model and the third to the
            loss criterion as the target.

    Returns:
        tuple: ``(x, m, y)`` — the three entries, transferred with
        ``.cuda(non_blocking=True)`` if the module-level ``use_cuda`` is set.
    """
    batch = (data[0], data[1], data[2])

    # dispatch data to GPU
    if use_cuda:
        batch = tuple(item.cuda(non_blocking=True) for item in batch)

    x, m, y = batch
    return x, m, y
|
||||
|
||||
|
||||
def train(model, optimizer, criterion, scheduler, ap, global_step, epoch):
    """Run one training epoch of the WaveRNN vocoder.

    Relies on the module-level ``CONFIG``, ``c_logger``, ``tb_logger``,
    ``OUT_PATH``, ``train_data``, ``use_cuda`` and ``num_gpus``.

    Args:
        model: network called as ``model(x, m)``; must also expose ``mode``
            and ``generate``.
        optimizer: optimizer stepped once per batch.
        criterion: loss callable invoked as ``criterion(y_hat, y)``.
        scheduler: learning-rate scheduler stepped once per batch, or ``None``.
        ap: audio processor used to build the loader and to load wavs and
            compute mel spectrograms for the logged sample.
        global_step: number of steps taken before this epoch.
        epoch: current epoch index; the loader is built verbosely on epoch 0.

    Returns:
        tuple: ``(keep_avg.avg_values, global_step)`` — the running averages
        gathered over the epoch and the updated step counter.

    Raises:
        RuntimeError: if the scalar loss value is ``None``.
    """
    # create train loader
    data_loader = setup_loader(ap, is_val=False, verbose=(epoch == 0))
    model.train()
    epoch_time = 0
    keep_avg = KeepAverage()
    if use_cuda:
        batch_n_iter = int(len(data_loader.dataset) / (CONFIG.batch_size * num_gpus))
    else:
        batch_n_iter = int(len(data_loader.dataset) / CONFIG.batch_size)
    end_time = time.time()
    c_logger.print_train_start()
    # train loop
    print(" > Training", flush=True)
    for num_iter, data in enumerate(data_loader):
        start_time = time.time()
        x, m, y = format_data(data)
        loader_time = time.time() - end_time
        global_step += 1

        ##################
        # MODEL TRAINING #
        ##################
        y_hat = model(x, m)

        # An integer mode gets the prediction reshaped; any other mode gets a
        # float target instead. The target gains a trailing axis either way.
        if isinstance(model.mode, int):
            y_hat = y_hat.transpose(1, 2).unsqueeze(-1)
        else:
            y = y.float()
        y = y.unsqueeze(-1)

        # compute losses
        loss = criterion(y_hat, y)
        # Read the scalar once; the same value is reused for logging below.
        loss_value = loss.item()
        # NOTE(review): ``Tensor.item()`` returns a Python number, so this check
        # looks like it can never fire — confirm whether a NaN/inf guard was meant.
        if loss_value is None:
            raise RuntimeError(" [!] None loss. Exiting ...")
        optimizer.zero_grad()
        loss.backward()
        if CONFIG.grad_clip > 0:
            torch.nn.utils.clip_grad_norm_(model.parameters(), CONFIG.grad_clip)

        optimizer.step()
        if scheduler is not None:
            scheduler.step()

        # get the current learning rate
        cur_lr = list(optimizer.param_groups)[0]["lr"]

        step_time = time.time() - start_time
        epoch_time += step_time

        # update avg stats
        loss_dict = {"model_loss": loss_value}
        update_train_values = {"avg_" + key: value for key, value in loss_dict.items()}
        update_train_values["avg_loader_time"] = loader_time
        update_train_values["avg_step_time"] = step_time
        keep_avg.update_values(update_train_values)

        # print training stats
        if global_step % CONFIG.print_step == 0:
            log_dict = {
                "step_time": [step_time, 2],
                "loader_time": [loader_time, 4],
                "current_lr": cur_lr,
            }
            c_logger.print_train_step(
                batch_n_iter,
                num_iter,
                global_step,
                log_dict,
                loss_dict,
                keep_avg.avg_values,
            )

        # plot step stats
        if global_step % 10 == 0:
            iter_stats = {"lr": cur_lr, "step_time": step_time}
            iter_stats.update(loss_dict)
            tb_logger.tb_train_iter_stats(global_step, iter_stats)

        # save checkpoint
        if global_step % CONFIG.save_step == 0:
            if CONFIG.checkpoint:
                # save model; the three ``None`` slots are the discriminator
                # model/optimizer/scheduler positions used by the GAN trainer
                save_checkpoint(
                    model,
                    optimizer,
                    scheduler,
                    None,
                    None,
                    None,
                    global_step,
                    epoch,
                    OUT_PATH,
                    model_losses=loss_dict,
                )

            # synthesize a full voice from a randomly picked training item
            wav_path = train_data[random.randrange(0, len(train_data))][0]
            wav = ap.load_wav(wav_path)
            ground_mel = ap.melspectrogram(wav)
            sample_wav = model.generate(
                ground_mel,
                CONFIG.batched,
                CONFIG.target_samples,
                CONFIG.overlap_samples,
            )
            predict_mel = ap.melspectrogram(sample_wav)

            # Sample audio; tagged as a training sample since it comes from
            # train_data and is written through the train-audio logger
            tb_logger.tb_train_audios(
                global_step, {"train/audio": sample_wav}, CONFIG.audio["sample_rate"]
            )
            # compute spectrograms
            figures = {
                "prediction": plot_spectrogram(predict_mel.T, ap, output_fig=False),
                "ground_truth": plot_spectrogram(ground_mel.T, ap, output_fig=False),
            }
            tb_logger.tb_train_figures(global_step, figures)
        end_time = time.time()

    # print epoch stats
    c_logger.print_train_epoch_end(global_step, epoch, epoch_time, keep_avg)

    # Plot Training Epoch Stats
    epoch_stats = {"epoch_time": epoch_time}
    epoch_stats.update(keep_avg.avg_values)
    tb_logger.tb_train_epoch_stats(global_step, epoch_stats)
    # TODO: plot model stats
    # if c.tb_model_param_stats:
    # tb_logger.tb_model_weights(model, global_step)
    return keep_avg.avg_values, global_step
|
||||
|
||||
|
||||
@torch.no_grad()
def evaluate(model, criterion, ap, global_step, epoch):
    """Evaluate the model on the validation loader and log the results.

    Args:
        model: network under evaluation. It is called as ``model(x, m)`` and
            must expose ``mode`` and ``generate``.
        criterion: callable returning the loss for ``(y_hat, y)``.
        ap: audio processor used to build the loader, load wavs and compute
            mel spectrograms.
        global_step (int): logging step. It is incremented once per evaluated
            batch; the incremented value is local and is not returned.
        epoch (int): current epoch. Controls loader verbosity and whether a
            full sample is synthesized.

    Returns:
        The running averages collected in ``keep_avg.avg_values``.
    """
    # create eval loader
    data_loader = setup_loader(ap, is_val=True, verbose=(epoch == 0))
    model.eval()
    keep_avg = KeepAverage()
    end_time = time.time()
    c_logger.print_eval_start()
    # Gradients are already disabled for the whole function by the decorator,
    # so no additional ``torch.no_grad()`` context is needed here.
    for num_iter, data in enumerate(data_loader):
        start_time = time.time()
        # format data
        x, m, y = format_data(data)
        loader_time = time.time() - end_time
        global_step += 1

        y_hat = model(x, m)
        if isinstance(model.mode, int):
            # integer mode: reorder the prediction axes and add a trailing axis
            y_hat = y_hat.transpose(1, 2).unsqueeze(-1)
        else:
            # string modes ("mold" / "gauss" in the training setup): float target
            y = y.float()
        y = y.unsqueeze(-1)
        loss = criterion(y_hat, y)
        # TODO: reduce the loss across GPUs once distributed training is supported
        loss_dict = {"model_loss": loss.item()}

        step_time = time.time() - start_time

        # update avg stats
        update_eval_values = {"avg_" + key: value for key, value in loss_dict.items()}
        update_eval_values["avg_loader_time"] = loader_time
        update_eval_values["avg_step_time"] = step_time
        keep_avg.update_values(update_eval_values)

        # print eval stats
        if CONFIG.print_eval:
            c_logger.print_eval_step(num_iter, loss_dict, keep_avg.avg_values)

        # Restart the loader clock so that the next ``loader_time`` only covers
        # the wait for the next batch, not every step evaluated so far.
        end_time = time.time()

    # Runs once per evaluation, after all batches have been processed.
    if epoch > CONFIG.test_delay_epochs:
        # synthesize a full voice
        wav_path = eval_data[random.randrange(0, len(eval_data))][0]
        wav = ap.load_wav(wav_path)
        ground_mel = ap.melspectrogram(wav)
        sample_wav = model.generate(
            ground_mel,
            CONFIG.batched,
            CONFIG.target_samples,
            CONFIG.overlap_samples,
        )
        predict_mel = ap.melspectrogram(sample_wav)

        # Sample audio
        tb_logger.tb_eval_audios(
            global_step, {"eval/audio": sample_wav}, CONFIG.audio["sample_rate"]
        )
        # compute spectrograms
        figures = {
            "prediction": plot_spectrogram(predict_mel.T, ap, output_fig=False),
            "ground_truth": plot_spectrogram(ground_mel.T, ap, output_fig=False),
        }
        tb_logger.tb_eval_figures(global_step, figures)

    tb_logger.tb_eval_stats(global_step, keep_avg.avg_values)
    return keep_avg.avg_values
|
||||
|
||||
|
||||
# FIXME: move args definition/parsing inside of main?
|
||||
def main(args):  # pylint: disable=redefined-outer-name
    """Set up data, model, loss and optimizer, then run the training loop.

    Args:
        args: parsed command-line arguments. ``args.restore_path`` is read to
            decide whether a checkpoint is restored, and ``args.restore_step``
            is written with the step to resume from (0 for a fresh run).

    Raises:
        ValueError: if ``CONFIG.mode`` is neither "mold", "gauss" nor an int.
    """
    # pylint: disable=global-variable-undefined
    global train_data, eval_data

    print(f" > Loading wavs from: {CONFIG.data_path}")
    if CONFIG.feature_path is not None:
        print(f" > Loading features from: {CONFIG.feature_path}")
        eval_data, train_data = load_wav_feat_data(
            CONFIG.data_path, CONFIG.feature_path, CONFIG.eval_split_size
        )
    else:
        eval_data, train_data = load_wav_data(CONFIG.data_path, CONFIG.eval_split_size)

    # setup audio processor
    ap = AudioProcessor(**CONFIG.audio)

    # setup model
    model_wavernn = setup_wavernn(CONFIG)

    # define train functions
    if CONFIG.mode == "mold":
        criterion = discretized_mix_logistic_loss
    elif CONFIG.mode == "gauss":
        criterion = gaussian_loss
    elif isinstance(CONFIG.mode, int):
        criterion = torch.nn.CrossEntropyLoss()
    else:
        # Fail here with a clear message instead of an unbound ``criterion``
        # surfacing later when training starts.
        raise ValueError(
            "Unsupported mode {!r}: expected 'mold', 'gauss' or an int "
            "(number of bits).".format(CONFIG.mode)
        )

    if use_cuda:
        model_wavernn.cuda()
        # only the int mode uses a loss object; the other losses are plain functions
        if isinstance(CONFIG.mode, int):
            criterion.cuda()

    optimizer = optim.Adam(model_wavernn.parameters(), lr=CONFIG.lr, weight_decay=0)
    scheduler = None
    if "lr_scheduler" in CONFIG:
        scheduler_class = getattr(torch.optim.lr_scheduler, CONFIG.lr_scheduler)
        scheduler = scheduler_class(optimizer, **CONFIG.lr_scheduler_params)

    # restore any checkpoint
    if args.restore_path:
        checkpoint = torch.load(args.restore_path, map_location="cpu")
        try:
            print(" > Restoring Model...")
            model_wavernn.load_state_dict(checkpoint["model"])
            print(" > Restoring Optimizer...")
            optimizer.load_state_dict(checkpoint["optimizer"])
            # A scheduler state can only be restored when this run has a
            # scheduler and the checkpoint actually stores one.
            if (
                scheduler is not None
                and "scheduler" in checkpoint
                and checkpoint["scheduler"] is not None
            ):
                print(" > Restoring Generator LR Scheduler...")
                scheduler.load_state_dict(checkpoint["scheduler"])
                scheduler.optimizer = optimizer
            # TODO: fix resetting restored optimizer lr
        except RuntimeError:
            # restore only matching layers.
            print(" > Partial model initialization...")
            model_dict = model_wavernn.state_dict()
            model_dict = set_init_dict(model_dict, checkpoint["model"], CONFIG)
            model_wavernn.load_state_dict(model_dict)

        print(" > Model restored from step %d" % checkpoint["step"], flush=True)
        args.restore_step = checkpoint["step"]
    else:
        args.restore_step = 0

    # TODO: wrap the model for multi-GPU training once it is supported

    num_parameters = count_parameters(model_wavernn)
    print(" > Model has {} parameters".format(num_parameters), flush=True)

    best_loss = float("inf")

    global_step = args.restore_step
    for epoch in range(CONFIG.epochs):
        c_logger.print_epoch_start(epoch, CONFIG.epochs)
        _, global_step = train(
            model_wavernn, optimizer, criterion, scheduler, ap, global_step, epoch
        )
        eval_avg_loss_dict = evaluate(model_wavernn, criterion, ap, global_step, epoch)
        c_logger.print_epoch_end(epoch, eval_avg_loss_dict)
        target_loss = eval_avg_loss_dict["avg_model_loss"]
        best_loss = save_best_model(
            target_loss,
            best_loss,
            model_wavernn,
            optimizer,
            scheduler,
            None,
            None,
            None,
            global_step,
            epoch,
            OUT_PATH,
            model_losses=eval_avg_loss_dict,
        )
|
||||
|
||||
|
||||
if __name__ == "__main__":

    def _str2bool(value):
        """Parse a command-line boolean.

        ``type=bool`` would turn every non-empty string, including "False",
        into True. Only explicit false-like spellings map to False here; any
        other value keeps evaluating to True.
        """
        if isinstance(value, bool):
            return value
        return value.strip().lower() not in ("", "0", "false", "f", "no", "n", "off")

    parser = argparse.ArgumentParser()
    parser.add_argument(
        "--continue_path",
        type=str,
        help='Training output folder to continue training. Use to continue a training. If it is used, "config_path" is ignored.',
        default="",
        required="--config_path" not in sys.argv,
    )
    parser.add_argument(
        "--restore_path",
        type=str,
        help="Model file to be restored. Use to finetune a model.",
        default="",
    )
    parser.add_argument(
        "--config_path",
        type=str,
        help="Path to config file for training.",
        required="--continue_path" not in sys.argv,
    )
    parser.add_argument(
        "--debug",
        type=_str2bool,
        default=False,
        help="Do not verify commit integrity to run training.",
    )

    # DISTRIBUTED
    parser.add_argument(
        "--rank",
        type=int,
        default=0,
        help="DISTRIBUTED: process rank for distributed training.",
    )
    parser.add_argument(
        "--group_id", type=str, default="", help="DISTRIBUTED: process group id."
    )
    args = parser.parse_args()

    if args.continue_path != "":
        args.output_path = args.continue_path
        args.config_path = os.path.join(args.continue_path, "config.json")
        # every checkpoint stored in the folder that is being continued
        list_of_files = glob.glob(os.path.join(args.continue_path, "*.pth.tar"))
        if not list_of_files:
            # max() on an empty list would only report an opaque ValueError
            parser.error(
                f"no '*.pth.tar' checkpoint found in '{args.continue_path}' to continue from."
            )
        latest_model_file = max(list_of_files, key=os.path.getctime)
        args.restore_path = latest_model_file
        print(f" > Training continues for {args.restore_path}")

    # setup output paths and read configs
    CONFIG = load_config(args.config_path)
    # check_config(c)

    OUT_PATH = args.continue_path
    if args.continue_path == "":
        OUT_PATH = create_experiment_folder(
            CONFIG.output_path, CONFIG.run_name, args.debug
        )

    AUDIO_PATH = os.path.join(OUT_PATH, "test_audios")

    c_logger = ConsoleLogger()

    if args.rank == 0:
        os.makedirs(AUDIO_PATH, exist_ok=True)
        new_fields = {}
        if args.restore_path:
            new_fields["restore_path"] = args.restore_path
        new_fields["github_branch"] = get_git_branch()
        copy_config_file(
            args.config_path, os.path.join(OUT_PATH, "config.json"), new_fields
        )
        os.chmod(AUDIO_PATH, 0o775)
        os.chmod(OUT_PATH, 0o775)

        LOG_DIR = OUT_PATH
        tb_logger = TensorboardLogger(LOG_DIR, model_name="VOCODER")

        # write model desc to tensorboard
        tb_logger.tb_add_text("model-description", CONFIG["run_description"], 0)

    try:
        main(args)
    except KeyboardInterrupt:
        # an interrupted run leaves nothing worth keeping: drop its folder
        remove_experiment_folder(OUT_PATH)
        try:
            sys.exit(0)
        except SystemExit:
            os._exit(0)  # pylint: disable=protected-access
    except Exception:  # pylint: disable=broad-except
        remove_experiment_folder(OUT_PATH)
        traceback.print_exc()
        sys.exit(1)
|
|
@ -0,0 +1,95 @@
|
|||
{
|
||||
"model": "wavernn",
|
||||
"run_name": "wavernn_test",
|
||||
"run_description": "wavernn_test training",
|
||||
|
||||
// AUDIO PARAMETERS
|
||||
"audio":{
|
||||
"fft_size": 1024, // number of stft frequency levels. Size of the linear spectogram frame.
|
||||
"win_length": 1024, // stft window length in ms.
|
||||
"hop_length": 256, // stft window hop-lengh in ms.
|
||||
"frame_length_ms": null, // stft window length in ms.If null, 'win_length' is used.
|
||||
"frame_shift_ms": null, // stft window hop-lengh in ms. If null, 'hop_length' is used.
|
||||
|
||||
// Audio processing parameters
|
||||
"sample_rate": 22050, // DATASET-RELATED: wav sample-rate. If different than the original data, it is resampled.
|
||||
"preemphasis": 0.98, // pre-emphasis to reduce spec noise and make it more structured. If 0.0, no -pre-emphasis.
|
||||
"ref_level_db": 20, // reference level db, theoretically 20db is the sound of air.
|
||||
|
||||
// Silence trimming
|
||||
"do_trim_silence": false,// enable trimming of slience of audio as you load it. LJspeech (false), TWEB (false), Nancy (true)
|
||||
"trim_db": 60, // threshold for timming silence. Set this according to your dataset.
|
||||
|
||||
// MelSpectrogram parameters
|
||||
"num_mels": 80, // size of the mel spec frame.
|
||||
"mel_fmin": 40.0, // minimum freq level for mel-spec. ~50 for male and ~95 for female voices. Tune for dataset!!
|
||||
"mel_fmax": 8000.0, // maximum freq level for mel-spec. Tune for dataset!!
|
||||
"spec_gain": 20.0, // scaler value appplied after log transform of spectrogram.
|
||||
|
||||
// Normalization parameters
|
||||
"signal_norm": true, // normalize spec values. Mean-Var normalization if 'stats_path' is defined otherwise range normalization defined by the other params.
|
||||
"min_level_db": -100, // lower bound for normalization
|
||||
"symmetric_norm": true, // move normalization to range [-1, 1]
|
||||
"max_norm": 4.0, // scale normalization to range [-max_norm, max_norm] or [0, max_norm]
|
||||
"clip_norm": true, // clip normalized values into the range.
|
||||
"stats_path": null // DO NOT USE WITH MULTI_SPEAKER MODEL. scaler stats file computed by 'compute_statistics.py'. If it is defined, mean-std based notmalization is used and other normalization params are ignored
|
||||
},
|
||||
|
||||
// Generating / Synthesizing
|
||||
"batched": true,
|
||||
"target_samples": 11000, // target number of samples to be generated in each batch entry
|
||||
"overlap_samples": 550, // number of samples for crossfading between batches
|
||||
|
||||
// DISTRIBUTED TRAINING
|
||||
// "distributed":{
|
||||
// "backend": "nccl",
|
||||
// "url": "tcp:\/\/localhost:54321"
|
||||
// },
|
||||
|
||||
// MODEL PARAMETERS
|
||||
"use_aux_net": true,
|
||||
"use_upsample_net": true,
|
||||
"upsample_factors": [4, 8, 8], // this needs to correctly factorise hop_length
|
||||
"seq_len": 1280, // has to be devideable by hop_length
|
||||
"mode": "mold", // mold [string], gauss [string], bits [int]
|
||||
"mulaw": false, // apply mulaw if mode is bits
|
||||
"padding": 2, // pad the input for resnet to see wider input length
|
||||
|
||||
// DATASET
|
||||
"data_path": "/media/alexander/LinuxFS/SpeechData/GothicSpeech/NPC_Speech/", // path containing training wav files
|
||||
"feature_path": "/media/alexander/LinuxFS/SpeechData/GothicSpeech/NPC_Speech_Computed/mel/", // path containing extracted features .npy (mels / quant)
|
||||
|
||||
// TRAINING
|
||||
"batch_size": 32, // Batch size for training. Lower values than 32 might cause hard to learn attention.
|
||||
"epochs": 10000, // total number of epochs to train.
|
||||
"warmup_steps": 10,
|
||||
|
||||
// VALIDATION
|
||||
"run_eval": true,
|
||||
"test_delay_epochs": 10, // early testing only wastes computation time.
|
||||
|
||||
// OPTIMIZER
|
||||
"grad_clip": 4, // apply gradient clipping if > 0
|
||||
"lr_scheduler": "MultiStepLR", // one of the schedulers from https://pytorch.org/docs/stable/optim.html#how-to-adjust-learning-rate
|
||||
"lr_scheduler_params": {
|
||||
"gamma": 0.5,
|
||||
"milestones": [200000, 400000, 600000]
|
||||
},
|
||||
"lr": 1e-4, // initial learning rate
|
||||
|
||||
// TENSORBOARD and LOGGING
|
||||
"print_step": 25, // Number of steps to log traning on console.
|
||||
"print_eval": false, // If True, it prints loss values for each step in eval run.
|
||||
"save_step": 25000, // Number of training steps expected to plot training stats on TB and save model checkpoints.
|
||||
"checkpoint": true, // If true, it saves checkpoints per "save_step"
|
||||
"tb_model_param_stats": false, // true, plots param stats per layer on tensorboard. Might be memory consuming, but good for debugging.
|
||||
|
||||
// DATA LOADING
|
||||
"num_loader_workers": 4, // number of training data loader processes. Don't set it too big. 4-8 are good values.
|
||||
"num_val_loader_workers": 4, // number of evaluation data loader processes.
|
||||
"eval_split_size": 50, // number of samples for testing
|
||||
|
||||
// PATHS
|
||||
"output_path": "/media/alexander/LinuxFS/Projects/wavernn/Trainings/"
|
||||
}
|
||||
|
|
@ -23,8 +23,12 @@ def load_wav_data(data_path, eval_split_size):
|
|||
|
||||
|
||||
def load_wav_feat_data(data_path, feat_path, eval_split_size):
|
||||
wav_paths = sorted(find_wav_files(data_path))
|
||||
feat_paths = sorted(find_feat_files(feat_path))
|
||||
wav_paths = find_wav_files(data_path)
|
||||
feat_paths = find_feat_files(feat_path)
|
||||
|
||||
wav_paths.sort(key=lambda x: Path(x).stem)
|
||||
feat_paths.sort(key=lambda x: Path(x).stem)
|
||||
|
||||
assert len(wav_paths) == len(feat_paths)
|
||||
for wav, feat in zip(wav_paths, feat_paths):
|
||||
wav_name = Path(wav).stem
|
||||
|
|
|
@ -41,6 +41,26 @@ def to_camel(text):
|
|||
text = text.capitalize()
|
||||
return re.sub(r'(?!^)_([a-zA-Z])', lambda m: m.group(1).upper(), text)
|
||||
|
||||
def setup_wavernn(c):
    """Build a WaveRNN vocoder from a loaded config.

    Args:
        c: config object exposing ``model``, ``mode``, ``mulaw``, ``padding``,
            ``use_aux_net``, ``use_upsample_net`` and ``upsample_factors`` as
            attributes, plus an ``audio`` mapping holding ``hop_length`` and
            ``sample_rate`` (and optionally ``num_mels``).

    Returns:
        The instantiated ``WaveRNN`` model.
    """
    print(" > Model: {}".format(c.model))
    wavernn_module = importlib.import_module('TTS.vocoder.models.wavernn')
    model_class = getattr(wavernn_module, "WaveRNN")

    audio = c.audio
    # Follow the configured mel count instead of pinning the input feature
    # size to 80, so that a config with a different ``num_mels`` does not end
    # up with a mismatching model. 80 stays the fallback when the key is absent.
    # NOTE(review): assumes ``feat_dims`` is the mel-feature dimension - confirm
    # against the WaveRNN constructor.
    feat_dims = audio['num_mels'] if 'num_mels' in audio else 80

    model = model_class(
        rnn_dims=512,
        fc_dims=512,
        mode=c.mode,
        mulaw=c.mulaw,
        pad=c.padding,
        use_aux_net=c.use_aux_net,
        use_upsample_net=c.use_upsample_net,
        upsample_factors=c.upsample_factors,
        feat_dims=feat_dims,
        compute_dims=128,
        res_out_dims=128,
        res_blocks=10,
        hop_length=audio['hop_length'],
        sample_rate=audio['sample_rate'])
    return model
|
||||
|
||||
def setup_generator(c):
|
||||
print(" > Generator Model: {}".format(c.generator_model))
|
||||
|
|
Loading…
Reference in New Issue