add initial wavernn support

pull/10/head
Alex K 2020-10-15 19:14:50 +02:00
parent 1a87ad82e3
commit d6bd3cd8b8
6 changed files with 838 additions and 175 deletions

View File

@ -11,20 +11,27 @@ from TTS.tts.datasets.preprocess import load_meta_data
from TTS.utils.io import load_config
from TTS.utils.audio import AudioProcessor
def main():
"""Run preprocessing process."""
parser = argparse.ArgumentParser(
description="Compute mean and variance of spectrogtram features.")
parser.add_argument("--config_path", type=str, required=True,
help="TTS config file path to define audio processin parameters.")
parser.add_argument("--out_path", default=None, type=str,
help="directory to save the output file.")
description="Compute mean and variance of spectrogtram features."
)
parser.add_argument(
"--config_path",
type=str,
required=True,
help="TTS config file path to define audio processin parameters.",
)
parser.add_argument(
"--out_path", default=None, type=str, help="directory to save the output file."
)
args = parser.parse_args()
# load config
CONFIG = load_config(args.config_path)
CONFIG.audio['signal_norm'] = False # do not apply earlier normalization
CONFIG.audio['stats_path'] = None # discard pre-defined stats
CONFIG.audio["signal_norm"] = False # do not apply earlier normalization
CONFIG.audio["stats_path"] = None # discard pre-defined stats
# load audio processor
ap = AudioProcessor(**CONFIG.audio)
@ -58,27 +65,27 @@ def main():
output_file_path = os.path.join(args.out_path, "scale_stats.npy")
stats = {}
stats['mel_mean'] = mel_mean
stats['mel_std'] = mel_scale
stats['linear_mean'] = linear_mean
stats['linear_std'] = linear_scale
stats["mel_mean"] = mel_mean
stats["mel_std"] = mel_scale
stats["linear_mean"] = linear_mean
stats["linear_std"] = linear_scale
print(f' > Avg mel spec mean: {mel_mean.mean()}')
print(f' > Avg mel spec scale: {mel_scale.mean()}')
print(f' > Avg linear spec mean: {linear_mean.mean()}')
print(f' > Avg lienar spec scale: {linear_scale.mean()}')
print(f" > Avg mel spec mean: {mel_mean.mean()}")
print(f" > Avg mel spec scale: {mel_scale.mean()}")
print(f" > Avg linear spec mean: {linear_mean.mean()}")
print(f" > Avg lienar spec scale: {linear_scale.mean()}")
# set default config values for mean-var scaling
CONFIG.audio['stats_path'] = output_file_path
CONFIG.audio['signal_norm'] = True
CONFIG.audio["stats_path"] = output_file_path
CONFIG.audio["signal_norm"] = True
# remove redundant values
del CONFIG.audio['max_norm']
del CONFIG.audio['min_level_db']
del CONFIG.audio['symmetric_norm']
del CONFIG.audio['clip_norm']
stats['audio_config'] = CONFIG.audio
del CONFIG.audio["max_norm"]
del CONFIG.audio["min_level_db"]
del CONFIG.audio["symmetric_norm"]
del CONFIG.audio["clip_norm"]
stats["audio_config"] = CONFIG.audio
np.save(output_file_path, stats, allow_pickle=True)
print(f' > scale_stats.npy is saved to {output_file_path}')
print(f" > scale_stats.npy is saved to {output_file_path}")
if __name__ == "__main__":

View File

@ -10,20 +10,29 @@ import torch
from torch.utils.data import DataLoader
from TTS.utils.audio import AudioProcessor
from TTS.utils.console_logger import ConsoleLogger
from TTS.utils.generic_utils import (KeepAverage, count_parameters,
create_experiment_folder, get_git_branch,
remove_experiment_folder, set_init_dict)
from TTS.utils.generic_utils import (
KeepAverage,
count_parameters,
create_experiment_folder,
get_git_branch,
remove_experiment_folder,
set_init_dict,
)
from TTS.utils.io import copy_config_file, load_config
from TTS.utils.radam import RAdam
from TTS.utils.tensorboard_logger import TensorboardLogger
from TTS.utils.training import setup_torch_training_env
from TTS.vocoder.datasets.gan_dataset import GANDataset
from TTS.vocoder.datasets.preprocess import load_wav_data, load_wav_feat_data
# from distribute import (DistributedSampler, apply_gradient_allreduce,
# init_distributed, reduce_tensor)
from TTS.vocoder.layers.losses import DiscriminatorLoss, GeneratorLoss
from TTS.vocoder.utils.generic_utils import (plot_results, setup_discriminator,
setup_generator)
from TTS.vocoder.utils.generic_utils import (
plot_results,
setup_discriminator,
setup_generator,
)
from TTS.vocoder.utils.io import save_best_model, save_checkpoint
use_cuda, num_gpus = setup_torch_training_env(True, True)
@ -33,27 +42,30 @@ def setup_loader(ap, is_val=False, verbose=False):
if is_val and not c.run_eval:
loader = None
else:
dataset = GANDataset(ap=ap,
items=eval_data if is_val else train_data,
seq_len=c.seq_len,
hop_len=ap.hop_length,
pad_short=c.pad_short,
conv_pad=c.conv_pad,
is_training=not is_val,
return_segments=not is_val,
use_noise_augment=c.use_noise_augment,
use_cache=c.use_cache,
verbose=verbose)
dataset = GANDataset(
ap=ap,
items=eval_data if is_val else train_data,
seq_len=c.seq_len,
hop_len=ap.hop_length,
pad_short=c.pad_short,
conv_pad=c.conv_pad,
is_training=not is_val,
return_segments=not is_val,
use_noise_augment=c.use_noise_augment,
use_cache=c.use_cache,
verbose=verbose,
)
dataset.shuffle_mapping()
# sampler = DistributedSampler(dataset) if num_gpus > 1 else None
loader = DataLoader(dataset,
batch_size=1 if is_val else c.batch_size,
shuffle=True,
drop_last=False,
sampler=None,
num_workers=c.num_val_loader_workers
if is_val else c.num_loader_workers,
pin_memory=False)
loader = DataLoader(
dataset,
batch_size=1 if is_val else c.batch_size,
shuffle=True,
drop_last=False,
sampler=None,
num_workers=c.num_val_loader_workers if is_val else c.num_loader_workers,
pin_memory=False,
)
return loader
@ -80,16 +92,26 @@ def format_data(data):
return co, x, None, None
def train(model_G, criterion_G, optimizer_G, model_D, criterion_D, optimizer_D,
scheduler_G, scheduler_D, ap, global_step, epoch):
def train(
model_G,
criterion_G,
optimizer_G,
model_D,
criterion_D,
optimizer_D,
scheduler_G,
scheduler_D,
ap,
global_step,
epoch,
):
data_loader = setup_loader(ap, is_val=False, verbose=(epoch == 0))
model_G.train()
model_D.train()
epoch_time = 0
keep_avg = KeepAverage()
if use_cuda:
batch_n_iter = int(
len(data_loader.dataset) / (c.batch_size * num_gpus))
batch_n_iter = int(len(data_loader.dataset) / (c.batch_size * num_gpus))
else:
batch_n_iter = int(len(data_loader.dataset) / c.batch_size)
end_time = time.time()
@ -145,16 +167,16 @@ def train(model_G, criterion_G, optimizer_G, model_D, criterion_D, optimizer_D,
scores_fake = D_out_fake
# compute losses
loss_G_dict = criterion_G(y_hat, y_G, scores_fake, feats_fake,
feats_real, y_hat_sub, y_G_sub)
loss_G = loss_G_dict['G_loss']
loss_G_dict = criterion_G(
y_hat, y_G, scores_fake, feats_fake, feats_real, y_hat_sub, y_G_sub
)
loss_G = loss_G_dict["G_loss"]
# optimizer generator
optimizer_G.zero_grad()
loss_G.backward()
if c.gen_clip_grad > 0:
torch.nn.utils.clip_grad_norm_(model_G.parameters(),
c.gen_clip_grad)
torch.nn.utils.clip_grad_norm_(model_G.parameters(), c.gen_clip_grad)
optimizer_G.step()
if scheduler_G is not None:
scheduler_G.step()
@ -199,14 +221,13 @@ def train(model_G, criterion_G, optimizer_G, model_D, criterion_D, optimizer_D,
# compute losses
loss_D_dict = criterion_D(scores_fake, scores_real)
loss_D = loss_D_dict['D_loss']
loss_D = loss_D_dict["D_loss"]
# optimizer discriminator
optimizer_D.zero_grad()
loss_D.backward()
if c.disc_clip_grad > 0:
torch.nn.utils.clip_grad_norm_(model_D.parameters(),
c.disc_clip_grad)
torch.nn.utils.clip_grad_norm_(model_D.parameters(), c.disc_clip_grad)
optimizer_D.step()
if scheduler_D is not None:
scheduler_D.step()
@ -221,34 +242,40 @@ def train(model_G, criterion_G, optimizer_G, model_D, criterion_D, optimizer_D,
epoch_time += step_time
# get current learning rates
current_lr_G = list(optimizer_G.param_groups)[0]['lr']
current_lr_D = list(optimizer_D.param_groups)[0]['lr']
current_lr_G = list(optimizer_G.param_groups)[0]["lr"]
current_lr_D = list(optimizer_D.param_groups)[0]["lr"]
# update avg stats
update_train_values = dict()
for key, value in loss_dict.items():
update_train_values['avg_' + key] = value
update_train_values['avg_loader_time'] = loader_time
update_train_values['avg_step_time'] = step_time
update_train_values["avg_" + key] = value
update_train_values["avg_loader_time"] = loader_time
update_train_values["avg_step_time"] = step_time
keep_avg.update_values(update_train_values)
# print training stats
if global_step % c.print_step == 0:
log_dict = {
'step_time': [step_time, 2],
'loader_time': [loader_time, 4],
"step_time": [step_time, 2],
"loader_time": [loader_time, 4],
"current_lr_G": current_lr_G,
"current_lr_D": current_lr_D
"current_lr_D": current_lr_D,
}
c_logger.print_train_step(batch_n_iter, num_iter, global_step,
log_dict, loss_dict, keep_avg.avg_values)
c_logger.print_train_step(
batch_n_iter,
num_iter,
global_step,
log_dict,
loss_dict,
keep_avg.avg_values,
)
# plot step stats
if global_step % 10 == 0:
iter_stats = {
"lr_G": current_lr_G,
"lr_D": current_lr_D,
"step_time": step_time
"step_time": step_time,
}
iter_stats.update(loss_dict)
tb_logger.tb_train_iter_stats(global_step, iter_stats)
@ -257,27 +284,28 @@ def train(model_G, criterion_G, optimizer_G, model_D, criterion_D, optimizer_D,
if global_step % c.save_step == 0:
if c.checkpoint:
# save model
save_checkpoint(model_G,
optimizer_G,
scheduler_G,
model_D,
optimizer_D,
scheduler_D,
global_step,
epoch,
OUT_PATH,
model_losses=loss_dict)
save_checkpoint(
model_G,
optimizer_G,
scheduler_G,
model_D,
optimizer_D,
scheduler_D,
global_step,
epoch,
OUT_PATH,
model_losses=loss_dict,
)
# compute spectrograms
figures = plot_results(y_hat_vis, y_G, ap, global_step,
'train')
figures = plot_results(y_hat_vis, y_G, ap, global_step, "train")
tb_logger.tb_train_figures(global_step, figures)
# Sample audio
sample_voice = y_hat_vis[0].squeeze(0).detach().cpu().numpy()
tb_logger.tb_train_audios(global_step,
{'train/audio': sample_voice},
c.audio["sample_rate"])
tb_logger.tb_train_audios(
global_step, {"train/audio": sample_voice}, c.audio["sample_rate"]
)
end_time = time.time()
# print epoch stats
@ -326,7 +354,6 @@ def evaluate(model_G, criterion_G, model_D, criterion_D, ap, global_step, epoch)
y_hat = model_G.pqmf_synthesis(y_hat)
y_G_sub = model_G.pqmf_analysis(y_G)
scores_fake, feats_fake, feats_real = None, None, None
if global_step > c.steps_to_start_discriminator:
@ -352,8 +379,9 @@ def evaluate(model_G, criterion_G, model_D, criterion_D, ap, global_step, epoch)
feats_fake, feats_real = None, None
# compute losses
loss_G_dict = criterion_G(y_hat, y_G, scores_fake, feats_fake,
feats_real, y_hat_sub, y_G_sub)
loss_G_dict = criterion_G(
y_hat, y_G, scores_fake, feats_fake, feats_real, y_hat_sub, y_G_sub
)
loss_dict = dict()
for key, value in loss_G_dict.items():
@ -403,16 +431,15 @@ def evaluate(model_G, criterion_G, model_D, criterion_D, ap, global_step, epoch)
else:
loss_dict[key] = value.item()
step_time = time.time() - start_time
epoch_time += step_time
# update avg stats
update_eval_values = dict()
for key, value in loss_dict.items():
update_eval_values['avg_' + key] = value
update_eval_values['avg_loader_time'] = loader_time
update_eval_values['avg_step_time'] = step_time
update_eval_values["avg_" + key] = value
update_eval_values["avg_loader_time"] = loader_time
update_eval_values["avg_step_time"] = step_time
keep_avg.update_values(update_eval_values)
# print eval stats
@ -420,13 +447,14 @@ def evaluate(model_G, criterion_G, model_D, criterion_D, ap, global_step, epoch)
c_logger.print_eval_step(num_iter, loss_dict, keep_avg.avg_values)
# compute spectrograms
figures = plot_results(y_hat, y_G, ap, global_step, 'eval')
figures = plot_results(y_hat, y_G, ap, global_step, "eval")
tb_logger.tb_eval_figures(global_step, figures)
# Sample audio
sample_voice = y_hat[0].squeeze(0).detach().cpu().numpy()
tb_logger.tb_eval_audios(global_step, {'eval/audio': sample_voice},
c.audio["sample_rate"])
tb_logger.tb_eval_audios(
global_step, {"eval/audio": sample_voice}, c.audio["sample_rate"]
)
# synthesize a full voice
data_loader.return_segments = False
@ -443,7 +471,9 @@ def main(args): # pylint: disable=redefined-outer-name
print(f" > Loading wavs from: {c.data_path}")
if c.feature_path is not None:
print(f" > Loading features from: {c.feature_path}")
eval_data, train_data = load_wav_feat_data(c.data_path, c.feature_path, c.eval_split_size)
eval_data, train_data = load_wav_feat_data(
c.data_path, c.feature_path, c.eval_split_size
)
else:
eval_data, train_data = load_wav_data(c.data_path, c.eval_split_size)
@ -461,17 +491,15 @@ def main(args): # pylint: disable=redefined-outer-name
# setup optimizers
optimizer_gen = RAdam(model_gen.parameters(), lr=c.lr_gen, weight_decay=0)
optimizer_disc = RAdam(model_disc.parameters(),
lr=c.lr_disc,
weight_decay=0)
optimizer_disc = RAdam(model_disc.parameters(), lr=c.lr_disc, weight_decay=0)
# schedulers
scheduler_gen = None
scheduler_disc = None
if 'lr_scheduler_gen' in c:
if "lr_scheduler_gen" in c:
scheduler_gen = getattr(torch.optim.lr_scheduler, c.lr_scheduler_gen)
scheduler_gen = scheduler_gen(optimizer_gen, **c.lr_scheduler_gen_params)
if 'lr_scheduler_disc' in c:
if "lr_scheduler_disc" in c:
scheduler_disc = getattr(torch.optim.lr_scheduler, c.lr_scheduler_disc)
scheduler_disc = scheduler_disc(optimizer_disc, **c.lr_scheduler_disc_params)
@ -480,47 +508,46 @@ def main(args): # pylint: disable=redefined-outer-name
criterion_disc = DiscriminatorLoss(c)
if args.restore_path:
checkpoint = torch.load(args.restore_path, map_location='cpu')
checkpoint = torch.load(args.restore_path, map_location="cpu")
try:
print(" > Restoring Generator Model...")
model_gen.load_state_dict(checkpoint['model'])
model_gen.load_state_dict(checkpoint["model"])
print(" > Restoring Generator Optimizer...")
optimizer_gen.load_state_dict(checkpoint['optimizer'])
optimizer_gen.load_state_dict(checkpoint["optimizer"])
print(" > Restoring Discriminator Model...")
model_disc.load_state_dict(checkpoint['model_disc'])
model_disc.load_state_dict(checkpoint["model_disc"])
print(" > Restoring Discriminator Optimizer...")
optimizer_disc.load_state_dict(checkpoint['optimizer_disc'])
if 'scheduler' in checkpoint:
optimizer_disc.load_state_dict(checkpoint["optimizer_disc"])
if "scheduler" in checkpoint:
print(" > Restoring Generator LR Scheduler...")
scheduler_gen.load_state_dict(checkpoint['scheduler'])
scheduler_gen.load_state_dict(checkpoint["scheduler"])
# NOTE: Not sure if necessary
scheduler_gen.optimizer = optimizer_gen
if 'scheduler_disc' in checkpoint:
if "scheduler_disc" in checkpoint:
print(" > Restoring Discriminator LR Scheduler...")
scheduler_disc.load_state_dict(checkpoint['scheduler_disc'])
scheduler_disc.load_state_dict(checkpoint["scheduler_disc"])
scheduler_disc.optimizer = optimizer_disc
except RuntimeError:
# retore only matching layers.
print(" > Partial model initialization...")
model_dict = model_gen.state_dict()
model_dict = set_init_dict(model_dict, checkpoint['model'], c)
model_dict = set_init_dict(model_dict, checkpoint["model"], c)
model_gen.load_state_dict(model_dict)
model_dict = model_disc.state_dict()
model_dict = set_init_dict(model_dict, checkpoint['model_disc'], c)
model_dict = set_init_dict(model_dict, checkpoint["model_disc"], c)
model_disc.load_state_dict(model_dict)
del model_dict
# reset lr if not countinuining training.
for group in optimizer_gen.param_groups:
group['lr'] = c.lr_gen
group["lr"] = c.lr_gen
for group in optimizer_disc.param_groups:
group['lr'] = c.lr_disc
group["lr"] = c.lr_disc
print(" > Model restored from step %d" % checkpoint['step'],
flush=True)
args.restore_step = checkpoint['step']
print(" > Model restored from step %d" % checkpoint["step"], flush=True)
args.restore_step = checkpoint["step"]
else:
args.restore_step = 0
@ -539,75 +566,92 @@ def main(args): # pylint: disable=redefined-outer-name
num_params = count_parameters(model_disc)
print(" > Discriminator has {} parameters".format(num_params), flush=True)
if 'best_loss' not in locals():
best_loss = float('inf')
if "best_loss" not in locals():
best_loss = float("inf")
global_step = args.restore_step
for epoch in range(0, c.epochs):
c_logger.print_epoch_start(epoch, c.epochs)
_, global_step = train(model_gen, criterion_gen, optimizer_gen,
model_disc, criterion_disc, optimizer_disc,
scheduler_gen, scheduler_disc, ap, global_step,
epoch)
eval_avg_loss_dict = evaluate(model_gen, criterion_gen, model_disc, criterion_disc, ap,
global_step, epoch)
_, global_step = train(
model_gen,
criterion_gen,
optimizer_gen,
model_disc,
criterion_disc,
optimizer_disc,
scheduler_gen,
scheduler_disc,
ap,
global_step,
epoch,
)
eval_avg_loss_dict = evaluate(
model_gen, criterion_gen, model_disc, criterion_disc, ap, global_step, epoch
)
c_logger.print_epoch_end(epoch, eval_avg_loss_dict)
target_loss = eval_avg_loss_dict[c.target_loss]
best_loss = save_best_model(target_loss,
best_loss,
model_gen,
optimizer_gen,
scheduler_gen,
model_disc,
optimizer_disc,
scheduler_disc,
global_step,
epoch,
OUT_PATH,
model_losses=eval_avg_loss_dict)
best_loss = save_best_model(
target_loss,
best_loss,
model_gen,
optimizer_gen,
scheduler_gen,
model_disc,
optimizer_disc,
scheduler_disc,
global_step,
epoch,
OUT_PATH,
model_losses=eval_avg_loss_dict,
)
if __name__ == '__main__':
if __name__ == "__main__":
parser = argparse.ArgumentParser()
parser.add_argument(
'--continue_path',
"--continue_path",
type=str,
help=
'Training output folder to continue training. Use to continue a training. If it is used, "config_path" is ignored.',
default='',
required='--config_path' not in sys.argv)
help='Training output folder to continue training. Use to continue a training. If it is used, "config_path" is ignored.',
default="",
required="--config_path" not in sys.argv,
)
parser.add_argument(
'--restore_path',
"--restore_path",
type=str,
help='Model file to be restored. Use to finetune a model.',
default='')
parser.add_argument('--config_path',
type=str,
help='Path to config file for training.',
required='--continue_path' not in sys.argv)
parser.add_argument('--debug',
type=bool,
default=False,
help='Do not verify commit integrity to run training.')
help="Model file to be restored. Use to finetune a model.",
default="",
)
parser.add_argument(
"--config_path",
type=str,
help="Path to config file for training.",
required="--continue_path" not in sys.argv,
)
parser.add_argument(
"--debug",
type=bool,
default=False,
help="Do not verify commit integrity to run training.",
)
# DISTRUBUTED
parser.add_argument(
'--rank',
"--rank",
type=int,
default=0,
help='DISTRIBUTED: process rank for distributed training.')
parser.add_argument('--group_id',
type=str,
default="",
help='DISTRIBUTED: process group id.')
help="DISTRIBUTED: process rank for distributed training.",
)
parser.add_argument(
"--group_id", type=str, default="", help="DISTRIBUTED: process group id."
)
args = parser.parse_args()
if args.continue_path != '':
if args.continue_path != "":
args.output_path = args.continue_path
args.config_path = os.path.join(args.continue_path, 'config.json')
args.config_path = os.path.join(args.continue_path, "config.json")
list_of_files = glob.glob(
args.continue_path +
"/*.pth.tar") # * means all if need specific format then *.csv
args.continue_path + "/*.pth.tar"
) # * means all if need specific format then *.csv
latest_model_file = max(list_of_files, key=os.path.getctime)
args.restore_path = latest_model_file
print(f" > Training continues for {args.restore_path}")
@ -618,11 +662,10 @@ if __name__ == '__main__':
_ = os.path.dirname(os.path.realpath(__file__))
OUT_PATH = args.continue_path
if args.continue_path == '':
OUT_PATH = create_experiment_folder(c.output_path, c.run_name,
args.debug)
if args.continue_path == "":
OUT_PATH = create_experiment_folder(c.output_path, c.run_name, args.debug)
AUDIO_PATH = os.path.join(OUT_PATH, 'test_audios')
AUDIO_PATH = os.path.join(OUT_PATH, "test_audios")
c_logger = ConsoleLogger()
@ -632,16 +675,17 @@ if __name__ == '__main__':
if args.restore_path:
new_fields["restore_path"] = args.restore_path
new_fields["github_branch"] = get_git_branch()
copy_config_file(args.config_path,
os.path.join(OUT_PATH, 'config.json'), new_fields)
copy_config_file(
args.config_path, os.path.join(OUT_PATH, "config.json"), new_fields
)
os.chmod(AUDIO_PATH, 0o775)
os.chmod(OUT_PATH, 0o775)
LOG_DIR = OUT_PATH
tb_logger = TensorboardLogger(LOG_DIR, model_name='VOCODER')
tb_logger = TensorboardLogger(LOG_DIR, model_name="VOCODER")
# write model desc to tensorboard
tb_logger.tb_add_text('model-description', c['run_description'], 0)
tb_logger.tb_add_text("model-description", c["run_description"], 0)
try:
main(args)
@ -654,4 +698,4 @@ if __name__ == '__main__':
except Exception: # pylint: disable=broad-except
remove_experiment_folder(OUT_PATH)
traceback.print_exc()
sys.exit(1)
sys.exit(1)

View File

@ -0,0 +1,493 @@
import argparse
import math
import os
import pickle
import shutil
import sys
import traceback
import time
import glob
import random
import torch
from torch.utils.data import DataLoader
from torch.utils.data.distributed import DistributedSampler
from TTS.utils.audio import AudioProcessor
from TTS.tts.utils.visual import plot_spectrogram
from TTS.utils.io import copy_config_file, load_config
from TTS.vocoder.datasets.wavernn_dataset import WaveRNNDataset
from TTS.utils.tensorboard_logger import TensorboardLogger
from TTS.vocoder.datasets.preprocess import load_wav_data, load_wav_feat_data
from TTS.vocoder.utils.distribution import discretized_mix_logistic_loss, gaussian_loss
from TTS.vocoder.utils.generic_utils import setup_wavernn
from TTS.utils.training import setup_torch_training_env
from TTS.utils.console_logger import ConsoleLogger
from TTS.utils.generic_utils import (
KeepAverage,
count_parameters,
create_experiment_folder,
get_git_branch,
remove_experiment_folder,
set_init_dict,
)
from TTS.vocoder.utils.io import save_best_model, save_checkpoint
use_cuda, num_gpus = setup_torch_training_env(True, True)
def setup_loader(ap, is_val=False, verbose=False):
if is_val and not CONFIG.run_eval:
loader = None
else:
dataset = WaveRNNDataset(
ap=ap,
items=eval_data if is_val else train_data,
seq_len=CONFIG.seq_len,
hop_len=ap.hop_length,
pad=CONFIG.padding,
mode=CONFIG.mode,
is_training=not is_val,
verbose=verbose,
)
# sampler = DistributedSampler(dataset) if num_gpus > 1 else None
loader = DataLoader(
dataset,
shuffle=True,
collate_fn=dataset.collate,
batch_size=CONFIG.batch_size,
num_workers=CONFIG.num_val_loader_workers
if is_val
else CONFIG.num_loader_workers,
pin_memory=True,
)
return loader
def format_data(data):
# setup input data
x = data[0]
m = data[1]
y = data[2]
# dispatch data to GPU
if use_cuda:
x = x.cuda(non_blocking=True)
m = m.cuda(non_blocking=True)
y = y.cuda(non_blocking=True)
return x, m, y
def train(model, optimizer, criterion, scheduler, ap, global_step, epoch):
# create train loader
data_loader = setup_loader(ap, is_val=False, verbose=(epoch == 0))
model.train()
epoch_time = 0
keep_avg = KeepAverage()
if use_cuda:
batch_n_iter = int(len(data_loader.dataset) / (CONFIG.batch_size * num_gpus))
else:
batch_n_iter = int(len(data_loader.dataset) / CONFIG.batch_size)
end_time = time.time()
c_logger.print_train_start()
# train loop
print(" > Training", flush=True)
for num_iter, data in enumerate(data_loader):
start_time = time.time()
x, m, y = format_data(data)
loader_time = time.time() - end_time
global_step += 1
##################
# MODEL TRAINING #
##################
y_hat = model(x, m)
y_hat_vis = y_hat # for visualization
# y_hat = y_hat.transpose(1, 2)
if isinstance(model.mode, int):
y_hat = y_hat.transpose(1, 2).unsqueeze(-1)
else:
y = y.float()
y = y.unsqueeze(-1)
# m_scaled, _ = model.upsample(m)
# compute losses
loss = criterion(y_hat, y)
if loss.item() is None:
raise RuntimeError(" [!] None loss. Exiting ...")
optimizer.zero_grad()
loss.backward()
if CONFIG.grad_clip > 0:
torch.nn.utils.clip_grad_norm_(model.parameters(), CONFIG.grad_clip)
optimizer.step()
if scheduler is not None:
scheduler.step()
# get the current learning rate
cur_lr = list(optimizer.param_groups)[0]["lr"]
step_time = time.time() - start_time
epoch_time += step_time
update_train_values = dict()
loss_dict = dict()
loss_dict["model_loss"] = loss.item()
for key, value in loss_dict.items():
update_train_values["avg_" + key] = value
update_train_values["avg_loader_time"] = loader_time
update_train_values["avg_step_time"] = step_time
keep_avg.update_values(update_train_values)
# print training stats
if global_step % CONFIG.print_step == 0:
log_dict = {
"step_time": [step_time, 2],
"loader_time": [loader_time, 4],
"current_lr": cur_lr,
}
c_logger.print_train_step(
batch_n_iter,
num_iter,
global_step,
log_dict,
loss_dict,
keep_avg.avg_values,
)
# plot step stats
if global_step % 10 == 0:
iter_stats = {"lr": cur_lr, "step_time": step_time}
iter_stats.update(loss_dict)
tb_logger.tb_train_iter_stats(global_step, iter_stats)
# save checkpoint
if global_step % CONFIG.save_step == 0:
if CONFIG.checkpoint:
# save model
save_checkpoint(
model,
optimizer,
scheduler,
None,
None,
None,
global_step,
epoch,
OUT_PATH,
model_losses=loss_dict,
)
# synthesize a full voice
wav_path = train_data[random.randrange(0, len(train_data))][0]
wav = ap.load_wav(wav_path)
ground_mel = ap.melspectrogram(wav)
sample_wav = model.generate(
ground_mel,
CONFIG.batched,
CONFIG.target_samples,
CONFIG.overlap_samples,
)
predict_mel = ap.melspectrogram(sample_wav)
# Sample audio
tb_logger.tb_train_audios(
global_step, {"eval/audio": sample_wav}, CONFIG.audio["sample_rate"]
)
# compute spectrograms
figures = {
"prediction": plot_spectrogram(predict_mel.T, ap, output_fig=False),
"ground_truth": plot_spectrogram(ground_mel.T, ap, output_fig=False),
}
tb_logger.tb_train_figures(global_step, figures)
end_time = time.time()
# print epoch stats
c_logger.print_train_epoch_end(global_step, epoch, epoch_time, keep_avg)
# Plot Training Epoch Stats
epoch_stats = {"epoch_time": epoch_time}
epoch_stats.update(keep_avg.avg_values)
tb_logger.tb_train_epoch_stats(global_step, epoch_stats)
# TODO: plot model stats
# if c.tb_model_param_stats:
# tb_logger.tb_model_weights(model, global_step)
return keep_avg.avg_values, global_step
@torch.no_grad()
def evaluate(model, criterion, ap, global_step, epoch):
# create train loader
data_loader = setup_loader(ap, is_val=True, verbose=(epoch == 0))
model.eval()
epoch_time = 0
keep_avg = KeepAverage()
end_time = time.time()
c_logger.print_eval_start()
with torch.no_grad():
for num_iter, data in enumerate(data_loader):
start_time = time.time()
# format data
x, m, y = format_data(data)
loader_time = time.time() - end_time
global_step += 1
y_hat = model(x, m)
if isinstance(model.mode, int):
y_hat = y_hat.transpose(1, 2).unsqueeze(-1)
else:
y = y.float()
y = y.unsqueeze(-1)
loss = criterion(y_hat, y)
# Compute avg loss
# if num_gpus > 1:
# loss = reduce_tensor(loss.data, num_gpus)
loss_dict = dict()
loss_dict["model_loss"] = loss.item()
step_time = time.time() - start_time
epoch_time += step_time
# update avg stats
update_eval_values = dict()
for key, value in loss_dict.items():
update_eval_values["avg_" + key] = value
update_eval_values["avg_loader_time"] = loader_time
update_eval_values["avg_step_time"] = step_time
keep_avg.update_values(update_eval_values)
# print eval stats
if CONFIG.print_eval:
c_logger.print_eval_step(num_iter, loss_dict, keep_avg.avg_values)
if epoch > CONFIG.test_delay_epochs:
# synthesize a full voice
wav_path = eval_data[random.randrange(0, len(eval_data))][0]
wav = ap.load_wav(wav_path)
ground_mel = ap.melspectrogram(wav)
sample_wav = model.generate(
ground_mel,
CONFIG.batched,
CONFIG.target_samples,
CONFIG.overlap_samples,
)
predict_mel = ap.melspectrogram(sample_wav)
# Sample audio
tb_logger.tb_eval_audios(
global_step, {"eval/audio": sample_wav}, CONFIG.audio["sample_rate"]
)
# compute spectrograms
figures = {
"prediction": plot_spectrogram(predict_mel.T, ap, output_fig=False),
"ground_truth": plot_spectrogram(ground_mel.T, ap, output_fig=False),
}
tb_logger.tb_eval_figures(global_step, figures)
tb_logger.tb_eval_stats(global_step, keep_avg.avg_values)
return keep_avg.avg_values
# FIXME: move args definition/parsing inside of main?
def main(args): # pylint: disable=redefined-outer-name
# pylint: disable=global-variable-undefined
global train_data, eval_data
print(f" > Loading wavs from: {CONFIG.data_path}")
if CONFIG.feature_path is not None:
print(f" > Loading features from: {CONFIG.feature_path}")
eval_data, train_data = load_wav_feat_data(
CONFIG.data_path, CONFIG.feature_path, CONFIG.eval_split_size
)
eval_data, train_data = eval_data, train_data
else:
eval_data, train_data = load_wav_data(CONFIG.data_path, CONFIG.eval_split_size)
# setup audio processor
ap = AudioProcessor(**CONFIG.audio)
# setup model
model_wavernn = setup_wavernn(CONFIG)
# define train functions
if CONFIG.mode == "mold":
criterion = discretized_mix_logistic_loss
elif CONFIG.mode == "gauss":
criterion = gaussian_loss
elif isinstance(CONFIG.mode, int):
criterion = torch.nn.CrossEntropyLoss()
if use_cuda:
model_wavernn.cuda()
if isinstance(CONFIG.mode, int):
criterion.cuda()
optimizer = optim.Adam(model_wavernn.parameters(), lr=CONFIG.lr, weight_decay=0)
scheduler = None
if "lr_scheduler" in CONFIG:
scheduler = getattr(torch.optim.lr_scheduler, CONFIG.lr_scheduler)
scheduler = scheduler(optimizer, **CONFIG.lr_scheduler_params)
# slow start for the first 5 epochs
# lr_lambda = lambda epoch: min(epoch / CONFIG.warmup_steps, 1)
# scheduler = optim.lr_scheduler.LambdaLR(optimizer, lr_lambda)
# restore any checkpoint
if args.restore_path:
checkpoint = torch.load(args.restore_path, map_location="cpu")
try:
print(" > Restoring Model...")
model_wavernn.load_state_dict(checkpoint["model"])
print(" > Restoring Optimizer...")
optimizer.load_state_dict(checkpoint["optimizer"])
if "scheduler" in checkpoint:
print(" > Restoring Generator LR Scheduler...")
scheduler.load_state_dict(checkpoint["scheduler"])
scheduler.optimizer = optimizer
# TODO: fix resetting restored optimizer lr
# optimizer.load_state_dict(checkpoint["optimizer"])
except RuntimeError:
# retore only matching layers.
print(" > Partial model initialization...")
model_dict = model_wavernn.state_dict()
model_dict = set_init_dict(model_dict, checkpoint["model"], CONFIG)
model_wavernn.load_state_dict(model_dict)
print(" > Model restored from step %d" % checkpoint["step"], flush=True)
args.restore_step = checkpoint["step"]
else:
args.restore_step = 0
# DISTRIBUTED
# if num_gpus > 1:
# model = apply_gradient_allreduce(model)
num_parameters = count_parameters(model_wavernn)
print(" > Model has {} parameters".format(num_parameters), flush=True)
if "best_loss" not in locals():
best_loss = float("inf")
global_step = args.restore_step
for epoch in range(0, CONFIG.epochs):
c_logger.print_epoch_start(epoch, CONFIG.epochs)
_, global_step = train(
model_wavernn, optimizer, criterion, scheduler, ap, global_step, epoch
)
eval_avg_loss_dict = evaluate(model_wavernn, criterion, ap, global_step, epoch)
c_logger.print_epoch_end(epoch, eval_avg_loss_dict)
target_loss = eval_avg_loss_dict["avg_model_loss"]
best_loss = save_best_model(
target_loss,
best_loss,
model_wavernn,
optimizer,
scheduler,
None,
None,
None,
global_step,
epoch,
OUT_PATH,
model_losses=eval_avg_loss_dict,
)
if __name__ == "__main__":
parser = argparse.ArgumentParser()
parser.add_argument(
"--continue_path",
type=str,
help='Training output folder to continue training. Use to continue a training. If it is used, "config_path" is ignored.',
default="",
required="--config_path" not in sys.argv,
)
parser.add_argument(
"--restore_path",
type=str,
help="Model file to be restored. Use to finetune a model.",
default="",
)
parser.add_argument(
"--config_path",
type=str,
help="Path to config file for training.",
required="--continue_path" not in sys.argv,
)
parser.add_argument(
"--debug",
type=bool,
default=False,
help="Do not verify commit integrity to run training.",
)
# DISTRUBUTED
parser.add_argument(
"--rank",
type=int,
default=0,
help="DISTRIBUTED: process rank for distributed training.",
)
parser.add_argument(
"--group_id", type=str, default="", help="DISTRIBUTED: process group id."
)
args = parser.parse_args()
if args.continue_path != "":
args.output_path = args.continue_path
args.config_path = os.path.join(args.continue_path, "config.json")
list_of_files = glob.glob(
args.continue_path + "/*.pth.tar"
) # * means all if need specific format then *.csv
latest_model_file = max(list_of_files, key=os.path.getctime)
args.restore_path = latest_model_file
print(f" > Training continues for {args.restore_path}")
# setup output paths and read configs
CONFIG = load_config(args.config_path)
# check_config(c)
_ = os.path.dirname(os.path.realpath(__file__))
OUT_PATH = args.continue_path
if args.continue_path == "":
OUT_PATH = create_experiment_folder(
CONFIG.output_path, CONFIG.run_name, args.debug
)
AUDIO_PATH = os.path.join(OUT_PATH, "test_audios")
c_logger = ConsoleLogger()
if args.rank == 0:
os.makedirs(AUDIO_PATH, exist_ok=True)
new_fields = {}
if args.restore_path:
new_fields["restore_path"] = args.restore_path
new_fields["github_branch"] = get_git_branch()
copy_config_file(
args.config_path, os.path.join(OUT_PATH, "config.json"), new_fields
)
os.chmod(AUDIO_PATH, 0o775)
os.chmod(OUT_PATH, 0o775)
LOG_DIR = OUT_PATH
tb_logger = TensorboardLogger(LOG_DIR, model_name="VOCODER")
# write model desc to tensorboard
tb_logger.tb_add_text("model-description", CONFIG["run_description"], 0)
try:
main(args)
except KeyboardInterrupt:
remove_experiment_folder(OUT_PATH)
try:
sys.exit(0)
except SystemExit:
os._exit(0) # pylint: disable=protected-access
except Exception: # pylint: disable=broad-except
remove_experiment_folder(OUT_PATH)
traceback.print_exc()
sys.exit(1)

View File

@ -0,0 +1,95 @@
{
"model": "wavernn",
"run_name": "wavernn_test",
"run_description": "wavernn_test training",
// AUDIO PARAMETERS
"audio":{
"fft_size": 1024, // number of stft frequency levels. Size of the linear spectogram frame.
"win_length": 1024, // stft window length in ms.
"hop_length": 256, // stft window hop-lengh in ms.
"frame_length_ms": null, // stft window length in ms.If null, 'win_length' is used.
"frame_shift_ms": null, // stft window hop-lengh in ms. If null, 'hop_length' is used.
// Audio processing parameters
"sample_rate": 22050, // DATASET-RELATED: wav sample-rate. If different than the original data, it is resampled.
"preemphasis": 0.98, // pre-emphasis to reduce spec noise and make it more structured. If 0.0, no -pre-emphasis.
"ref_level_db": 20, // reference level db, theoretically 20db is the sound of air.
// Silence trimming
"do_trim_silence": false,// enable trimming of slience of audio as you load it. LJspeech (false), TWEB (false), Nancy (true)
"trim_db": 60, // threshold for timming silence. Set this according to your dataset.
// MelSpectrogram parameters
"num_mels": 80, // size of the mel spec frame.
"mel_fmin": 40.0, // minimum freq level for mel-spec. ~50 for male and ~95 for female voices. Tune for dataset!!
"mel_fmax": 8000.0, // maximum freq level for mel-spec. Tune for dataset!!
"spec_gain": 20.0, // scaler value appplied after log transform of spectrogram.
// Normalization parameters
"signal_norm": true, // normalize spec values. Mean-Var normalization if 'stats_path' is defined otherwise range normalization defined by the other params.
"min_level_db": -100, // lower bound for normalization
"symmetric_norm": true, // move normalization to range [-1, 1]
"max_norm": 4.0, // scale normalization to range [-max_norm, max_norm] or [0, max_norm]
"clip_norm": true, // clip normalized values into the range.
"stats_path": null // DO NOT USE WITH MULTI_SPEAKER MODEL. scaler stats file computed by 'compute_statistics.py'. If it is defined, mean-std based notmalization is used and other normalization params are ignored
},
// Generating / Synthesizing
"batched": true,
"target_samples": 11000, // target number of samples to be generated in each batch entry
"overlap_samples": 550, // number of samples for crossfading between batches
// DISTRIBUTED TRAINING
// "distributed":{
// "backend": "nccl",
// "url": "tcp:\/\/localhost:54321"
// },
// MODEL PARAMETERS
"use_aux_net": true,
"use_upsample_net": true,
"upsample_factors": [4, 8, 8], // this needs to correctly factorise hop_length
"seq_len": 1280, // has to be devideable by hop_length
"mode": "mold", // mold [string], gauss [string], bits [int]
"mulaw": false, // apply mulaw if mode is bits
"padding": 2, // pad the input for resnet to see wider input length
// DATASET
"data_path": "/media/alexander/LinuxFS/SpeechData/GothicSpeech/NPC_Speech/", // path containing training wav files
"feature_path": "/media/alexander/LinuxFS/SpeechData/GothicSpeech/NPC_Speech_Computed/mel/", // path containing extracted features .npy (mels / quant)
// TRAINING
"batch_size": 32, // Batch size for training. Lower values than 32 might cause hard to learn attention.
"epochs": 10000, // total number of epochs to train.
"warmup_steps": 10,
// VALIDATION
"run_eval": true,
"test_delay_epochs": 10, // early testing only wastes computation time.
// OPTIMIZER
"grad_clip": 4, // apply gradient clipping if > 0
"lr_scheduler": "MultiStepLR", // one of the schedulers from https://pytorch.org/docs/stable/optim.html#how-to-adjust-learning-rate
"lr_scheduler_params": {
"gamma": 0.5,
"milestones": [200000, 400000, 600000]
},
"lr": 1e-4, // initial learning rate
// TENSORBOARD and LOGGING
"print_step": 25, // Number of steps to log traning on console.
"print_eval": false, // If True, it prints loss values for each step in eval run.
"save_step": 25000, // Number of training steps expected to plot training stats on TB and save model checkpoints.
"checkpoint": true, // If true, it saves checkpoints per "save_step"
"tb_model_param_stats": false, // true, plots param stats per layer on tensorboard. Might be memory consuming, but good for debugging.
// DATA LOADING
"num_loader_workers": 4, // number of training data loader processes. Don't set it too big. 4-8 are good values.
"num_val_loader_workers": 4, // number of evaluation data loader processes.
"eval_split_size": 50, // number of samples for testing
// PATHS
"output_path": "/media/alexander/LinuxFS/Projects/wavernn/Trainings/"
}

View File

@ -23,8 +23,12 @@ def load_wav_data(data_path, eval_split_size):
def load_wav_feat_data(data_path, feat_path, eval_split_size):
wav_paths = sorted(find_wav_files(data_path))
feat_paths = sorted(find_feat_files(feat_path))
wav_paths = find_wav_files(data_path)
feat_paths = find_feat_files(feat_path)
wav_paths.sort(key=lambda x: Path(x).stem)
feat_paths.sort(key=lambda x: Path(x).stem)
assert len(wav_paths) == len(feat_paths)
for wav, feat in zip(wav_paths, feat_paths):
wav_name = Path(wav).stem

View File

@ -41,6 +41,26 @@ def to_camel(text):
text = text.capitalize()
return re.sub(r'(?!^)_([a-zA-Z])', lambda m: m.group(1).upper(), text)
def setup_wavernn(c):
print(" > Model: {}".format(c.model))
MyModel = importlib.import_module('TTS.vocoder.models.wavernn')
MyModel = getattr(MyModel, "WaveRNN")
model = MyModel(
rnn_dims=512,
fc_dims=512,
mode=c.mode,
mulaw=c.mulaw,
pad=c.padding,
use_aux_net=c.use_aux_net,
use_upsample_net=c.use_upsample_net,
upsample_factors=c.upsample_factors,
feat_dims=80,
compute_dims=128,
res_out_dims=128,
res_blocks=10,
hop_length=c.audio['hop_length'],
sample_rate=c.audio['sample_rate'])
return model
def setup_generator(c):
print(" > Generator Model: {}".format(c.generator_model))