Formatting changes and distributed training

pull/10/head
Eren Golge 2019-02-27 09:50:52 +01:00
parent dce1715e0f
commit bf5f18d11e
8 changed files with 277 additions and 356 deletions

config.json

@ -3,7 +3,6 @@
"model_description": "Queue memory and change lower r incrementatlly",
"audio":{
"audio_processor": "audio", // to use dictate different audio processors, if available.
// Audio processing parameters
"num_mels": 80, // size of the mel spec frame.
"num_freq": 1025, // number of stft frequency levels. Size of the linear spectogram frame.
@ -25,6 +24,11 @@
"do_trim_silence": true // enable trimming of slience of audio as you load it. LJspeech (false), TWEB (false), Nancy (true)
},
"distributed":{
"backend": "nccl",
"url": "tcp:\/\/localhost:54321"
},
"embedding_size": 256, // Character embedding vector length. You don't need to change it in general.
"text_cleaner": "phoneme_cleaners",
"epochs": 1000, // total number of epochs to train.
@ -37,14 +41,16 @@
"batch_size": 32, // Batch size for training. Lower values than 32 might cause hard to learn attention.
"eval_batch_size":32,
"r": 2, // Number of frames to predict for step.
"wd": 0.00001, // Weight decay weight.
"r": 5, // Number of frames to predict for step.
"wd": 0.00001, // Weight decay weight.
"checkpoint": true, // If true, it saves checkpoints per "save_step"
"save_step": 5000, // Number of training steps expected to save traning stats and checkpoints.
"print_step": 50, // Number of steps to log traning on console.
"tb_model_param_stats": false, // true, plots param stats per layer on tensorboard. Might be memory consuming, but good for debugging.
"batch_group_size": 8, //Number of batches to shuffle after bucketing.
"run_eval": true,
"test_delay_epochs": 100, //Until attention is aligned, testing only wastes computation time.
"data_path": "/media/erogol/data_ssd/Data/LJSpeech-1.1", // DATASET-RELATED: can overwritten from command argument
"meta_file_train": "metadata_train.csv", // DATASET-RELATED: metafile for training dataloader.
"meta_file_val": "metadata_val.csv", // DATASET-RELATED: metafile for evaluation dataloader.

datasets/TTSDataset.py

@ -25,7 +25,8 @@ class MyDataset(Dataset):
cached=False,
use_phonemes=True,
phoneme_cache_path=None,
phoneme_language="en-us"):
phoneme_language="en-us",
verbose=False):
"""
Args:
root_path (str): root path for the data folder.
@ -47,6 +48,7 @@ class MyDataset(Dataset):
phoneme_cache_path (str): path to cache phoneme features.
phoneme_language (str): one of the languages from
https://github.com/bootphon/phonemizer#languages
verbose (bool): print diagnostic information.
"""
self.root_path = root_path
self.batch_group_size = batch_group_size
@ -61,16 +63,17 @@ class MyDataset(Dataset):
self.use_phonemes = use_phonemes
self.phoneme_cache_path = phoneme_cache_path
self.phoneme_language = phoneme_language
self.verbose = verbose
if use_phonemes and not os.path.isdir(phoneme_cache_path):
os.makedirs(phoneme_cache_path)
print(" > DataLoader initialization")
print(" | > Data path: {}".format(root_path))
print(" | > Use phonemes: {}".format(self.use_phonemes))
if use_phonemes:
print(" | > phoneme language: {}".format(phoneme_language))
print(" | > Cached dataset: {}".format(self.cached))
print(" | > Number of instances : {}".format(len(self.items)))
if self.verbose:
print("\n > DataLoader initialization")
print(" | > Data path: {}".format(root_path))
print(" | > Use phonemes: {}".format(self.use_phonemes))
if use_phonemes:
print(" | > phoneme language: {}".format(phoneme_language))
print(" | > Cached dataset: {}".format(self.cached))
print(" | > Number of instances : {}".format(len(self.items)))
self.sort_items()
def load_wav(self, filename):
@ -125,11 +128,7 @@ class MyDataset(Dataset):
def sort_items(self):
r"""Sort instances based on text length in ascending order"""
lengths = np.array([len(ins[0]) for ins in self.items])
print(" | > Max length sequence: {}".format(np.max(lengths)))
print(" | > Min length sequence: {}".format(np.min(lengths)))
print(" | > Avg length sequence: {}".format(np.mean(lengths)))
idxs = np.argsort(lengths)
new_items = []
ignored = []
@ -139,11 +138,8 @@ class MyDataset(Dataset):
ignored.append(idx)
else:
new_items.append(self.items[idx])
print(" | > {} instances are ignored ({})".format(
len(ignored), self.min_seq_len))
# shuffle batch groups
if self.batch_group_size > 0:
print(" | > Batch group shuffling is active.")
for i in range(len(new_items) // self.batch_group_size):
offset = i * self.batch_group_size
end_offset = offset + self.batch_group_size
@ -152,6 +148,14 @@ class MyDataset(Dataset):
new_items[offset : end_offset] = temp_items
self.items = new_items
if self.verbose:
print(" | > Max length sequence: {}".format(np.max(lengths)))
print(" | > Min length sequence: {}".format(np.min(lengths)))
print(" | > Avg length sequence: {}".format(np.mean(lengths)))
print(" | > Num. instances discarded by max-min seq limits: {}".format(
len(ignored), self.min_seq_len))
print(" | > Batch group size: {}.".format(self.batch_group_size))
def __len__(self):
return len(self.items)
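
For readers skimming the diff: sort_items now orders instances by text length and, when batch_group_size > 0, reshuffles only within fixed-size groups, so batches stay length-homogeneous while epoch order still varies. A toy illustration of that bucketed shuffle, with plain integers standing in for dataset items:

import random

items = sorted([7, 3, 9, 2, 8, 4, 6, 5])    # ascending, as sort_items does
batch_group_size = 4
for i in range(len(items) // batch_group_size):
    offset = i * batch_group_size
    group = items[offset:offset + batch_group_size]
    random.shuffle(group)                    # shuffle within the group only
    items[offset:offset + batch_group_size] = group
print(items)   # e.g. [4, 2, 5, 3, 7, 9, 6, 8]: local order varies, global trend holds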

(deleted file: test sentences)

@ -1,5 +0,0 @@
Encouraged, he started with a minute a day.
His meditation consisted of “body scanning” which involved focusing his mind and energy on each section of the body from head to toe.
Recent research at Harvard has shown meditating for as little as 8 weeks can actually increase the grey matter in the parts of the brain responsible for emotional regulation and learning.
If he decided to watch TV he really watched it.
Often we try to bring about change through sheer effort and we put all of our energy into a new initiative.

models/tacotron.py

@ -21,7 +21,6 @@ class Tacotron(nn.Module):
self.linear_dim = linear_dim
self.embedding = nn.Embedding(
num_chars, embedding_dim, padding_idx=padding_idx)
print(" | > Number of characters : {}".format(num_chars))
self.embedding.weight.data.normal_(0, 0.3)
self.encoder = Encoder(embedding_dim)
self.decoder = Decoder(256, mel_dim, r, memory_size, attn_windowing)

train.py

@ -1,38 +1,44 @@
import os
import sys
import time
import shutil
import torch
import argparse
import importlib
import os
import shutil
import sys
import time
import traceback
import numpy as np
import torch
import torch.nn as nn
from tensorboardX import SummaryWriter
from torch import optim
from torch.utils.data import DataLoader
from tensorboardX import SummaryWriter
from utils.generic_utils import (
remove_experiment_folder, create_experiment_folder, save_checkpoint,
save_best_model, load_config, lr_decay, count_parameters, check_update,
get_commit_hash, sequence_mask, NoamLR)
from utils.text.symbols import symbols, phonemes
from utils.visual import plot_alignment, plot_spectrogram
from models.tacotron import Tacotron
from layers.losses import L1LossMasked
from datasets.TTSDataset import MyDataset
from layers.losses import L1LossMasked
from models.tacotron import Tacotron
from utils.audio import AudioProcessor
from utils.synthesis import synthesis
from utils.generic_utils import (
NoamLR, check_update, count_parameters, create_experiment_folder,
get_commit_hash, load_config, lr_decay, remove_experiment_folder,
save_best_model, save_checkpoint, sequence_mask, weight_decay)
from utils.logger import Logger
from utils.synthesis import synthesis
from utils.text.symbols import phonemes, symbols
from utils.visual import plot_alignment, plot_spectrogram
from distribute import init_distributed, apply_gradient_allreduce, reduce_tensor
from distribute import DistributedSampler
torch.manual_seed(1)
torch.backends.cudnn.enabled = True
torch.backends.cudnn.benchmark = False
torch.manual_seed(54321)
use_cuda = torch.cuda.is_available()
num_gpus = torch.cuda.device_count()
print(" > Using CUDA: ", use_cuda)
print(" > Number of GPUs: ", torch.cuda.device_count())
print(" > Number of GPUs: ", num_gpus)
def setup_loader(is_val=False):
def setup_loader(is_val=False, verbose=False):
global ap
if is_val and not c.run_eval:
loader = None
@ -44,38 +50,44 @@ def setup_loader(is_val=False):
c.text_cleaner,
preprocessor=preprocessor,
ap=ap,
batch_group_size=0 if is_val else 8 * c.batch_size,
batch_group_size=0 if is_val else c.batch_group_size * c.batch_size,
min_seq_len=0 if is_val else c.min_seq_len,
max_seq_len=float("inf") if is_val else c.max_seq_len,
cached=False if c.dataset != "tts_cache" else True,
phoneme_cache_path=c.phoneme_cache_path,
use_phonemes=c.use_phonemes,
phoneme_language=c.phoneme_language
)
phoneme_language=c.phoneme_language,
verbose=verbose)
sampler = DistributedSampler(dataset) if num_gpus > 1 else None
loader = DataLoader(
dataset,
batch_size=c.eval_batch_size if is_val else c.batch_size,
shuffle=False,
collate_fn=dataset.collate_fn,
drop_last=False,
num_workers=c.num_val_loader_workers if is_val else c.num_loader_workers,
sampler=sampler,
num_workers=c.num_val_loader_workers
if is_val else c.num_loader_workers,
pin_memory=False)
return loader
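
DistributedSampler is imported here from distribute rather than torch.utils.data; presumably it mirrors the stock sampler, which hands each rank a disjoint shard of indices, and shuffle=False above is required whenever a sampler is passed. A standalone sketch assuming stock semantics (the project ships its own copy in distribute.py):

import torch
from torch.utils.data import TensorDataset
from torch.utils.data.distributed import DistributedSampler

dataset = TensorDataset(torch.arange(10))
# Explicit num_replicas/rank so this runs without an initialized process group.
sampler = DistributedSampler(dataset, num_replicas=2, rank=0)
sampler.set_epoch(0)            # reseed the per-rank shuffle each epoch
print(list(sampler))            # this rank's shard of dataset indices (5 of 10)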
def train(model, criterion, criterion_st, optimizer, optimizer_st,
scheduler, ap, epoch):
data_loader = setup_loader(is_val=False)
def train(model, criterion, criterion_st, optimizer, optimizer_st, scheduler,
ap, epoch):
data_loader = setup_loader(is_val=False, verbose=(epoch==0))
model.train()
epoch_time = 0
avg_linear_loss = 0
avg_mel_loss = 0
avg_stop_loss = 0
avg_step_time = 0
print(" | > Epoch {}/{}".format(epoch, c.epochs), flush=True)
print("\n > Epoch {}/{}".format(epoch, c.epochs), flush=True)
n_priority_freq = int(
3000 / (c.audio['sample_rate'] * 0.5) * c.audio['num_freq'])
batch_n_iter = int(len(data_loader.dataset) / c.batch_size)
if num_gpus > 0:
batch_n_iter = int(len(data_loader.dataset) / (c.batch_size * num_gpus))
else:
batch_n_iter = int(len(data_loader.dataset) / c.batch_size)
for num_iter, data in enumerate(data_loader):
start_time = time.time()
@ -116,12 +128,8 @@ def train(model, criterion, criterion_st, optimizer, optimizer_st,
mask = sequence_mask(text_lengths)
# forward pass
if use_cuda:
mel_output, linear_output, alignments, stop_tokens = torch.nn.parallel.data_parallel(
model, (text_input, mel_input, mask))
else:
mel_output, linear_output, alignments, stop_tokens = model(
text_input, mel_input, mask)
mel_output, linear_output, alignments, stop_tokens = model(
text_input, mel_input, mask)
# loss computation
stop_loss = criterion_st(stop_tokens, stop_targets)
@ -134,29 +142,14 @@ def train(model, criterion, criterion_st, optimizer, optimizer_st,
# backpass and check the grad norm for spec losses
loss.backward(retain_graph=True)
# custom weight decay
for group in optimizer.param_groups:
for param in group['params']:
current_lr = group['lr']
param.data = param.data.add(-c.wd * group['lr'], param.data)
grad_norm, skip_flag = check_update(model, 1)
if skip_flag:
optimizer.zero_grad()
print(" | > Iteration skipped!!", flush=True)
continue
optimizer, current_lr = weight_decay(optimizer, c.wd)
grad_norm, _ = check_update(model, 1.0)
optimizer.step()
# backpass and check the grad norm for stop loss
stop_loss.backward()
# custom weight decay
for group in optimizer_st.param_groups:
for param in group['params']:
param.data = param.data.add(-c.wd * group['lr'], param.data)
grad_norm_st, skip_flag = check_update(model.decoder.stopnet, 0.5)
if skip_flag:
optimizer_st.zero_grad()
print(" | > Iteration skipped fro stopnet!!")
continue
optimizer_st, _ = weight_decay(optimizer_st, c.wd)
grad_norm_st, _ = check_update(model.decoder.stopnet, 1.0)
optimizer_st.step()
step_time = time.time() - start_time
@ -164,49 +157,62 @@ def train(model, criterion, criterion_st, optimizer, optimizer_st,
if current_step % c.print_step == 0:
print(
" | > Step:{}/{} GlobalStep:{} TotalLoss:{:.5f} LinearLoss:{:.5f} "
" | > Step:{}/{} GlobalStep:{} TotalLoss:{:.5f} LinearLoss:{:.5f} "
"MelLoss:{:.5f} StopLoss:{:.5f} GradNorm:{:.5f} "
"GradNormST:{:.5f} AvgTextLen:{:.1f} AvgSpecLen:{:.1f} StepTime:{:.2f} LR:{:.6f}".format(
num_iter, batch_n_iter, current_step, loss.item(),
linear_loss.item(), mel_loss.item(), stop_loss.item(),
grad_norm, grad_norm_st, avg_text_length, avg_spec_length, step_time, current_lr),
"GradNormST:{:.5f} AvgTextLen:{:.1f} AvgSpecLen:{:.1f} StepTime:{:.2f} LR:{:.6f}"
.format(num_iter, batch_n_iter, current_step, loss.item(),
linear_loss.item(), mel_loss.item(), stop_loss.item(),
grad_norm, grad_norm_st, avg_text_length,
avg_spec_length, step_time, current_lr),
flush=True)
avg_linear_loss += float(linear_loss.item())
avg_mel_loss += float(mel_loss.item())
avg_stop_loss += stop_loss.item()
avg_step_time += step_time
# aggregate losses from processes
if num_gpus > 1:
linear_loss = reduce_tensor(linear_loss.data, num_gpus)
mel_loss = reduce_tensor(mel_loss.data, num_gpus)
loss = reduce_tensor(loss.data, num_gpus)
stop_loss = reduce_tensor(stop_loss.data, num_gpus)
# Plot Training Iter Stats
iter_stats = {"loss_posnet": linear_loss.item(),
"loss_decoder": mel_loss.item(),
"lr": current_lr,
"grad_norm": grad_norm,
"grad_norm_st": grad_norm_st,
"step_time": step_time}
tb_logger.tb_train_iter_stats(current_step, iter_stats)
if args.rank == 0:
avg_linear_loss += float(linear_loss.item())
avg_mel_loss += float(mel_loss.item())
avg_stop_loss += stop_loss.item()
avg_step_time += step_time
if current_step % c.save_step == 0:
if c.checkpoint:
# save model
save_checkpoint(model, optimizer, optimizer_st,
linear_loss.item(), OUT_PATH, current_step,
epoch)
# Plot Training Iter Stats
iter_stats = {
"loss_posnet": linear_loss.item(),
"loss_decoder": mel_loss.item(),
"lr": current_lr,
"grad_norm": grad_norm,
"grad_norm_st": grad_norm_st,
"step_time": step_time
}
tb_logger.tb_train_iter_stats(current_step, iter_stats)
# Diagnostic visualizations
const_spec = linear_output[0].data.cpu().numpy()
gt_spec = linear_input[0].data.cpu().numpy()
align_img = alignments[0].data.cpu().numpy()
if current_step % c.save_step == 0:
if c.checkpoint:
# save model
save_checkpoint(model, optimizer, optimizer_st,
linear_loss.item(), OUT_PATH, current_step,
epoch)
figures = {"prediction": plot_spectrogram(const_spec, ap),
"ground_truth": plot_spectrogram(gt_spec, ap),
"alignment": plot_alignment(align_img)}
tb_logger.tb_train_figures(current_step, figures)
# Diagnostic visualizations
const_spec = linear_output[0].data.cpu().numpy()
gt_spec = linear_input[0].data.cpu().numpy()
align_img = alignments[0].data.cpu().numpy()
# Sample audio
tb_logger.tb_train_audios(current_step,
{'TrainAudio': ap.inv_spectrogram(const_spec.T)},
c.audio["sample_rate"])
figures = {
"prediction": plot_spectrogram(const_spec, ap),
"ground_truth": plot_spectrogram(gt_spec, ap),
"alignment": plot_alignment(align_img)
}
tb_logger.tb_train_figures(current_step, figures)
# Sample audio
tb_logger.tb_train_audios(
current_step, {'TrainAudio': ap.inv_spectrogram(const_spec.T)},
c.audio["sample_rate"])
avg_linear_loss /= (num_iter + 1)
avg_mel_loss /= (num_iter + 1)
@ -216,7 +222,7 @@ def train(model, criterion, criterion_st, optimizer, optimizer_st,
# print epoch stats
print(
" | > EPOCH END -- GlobalStep:{} AvgTotalLoss:{:.5f} "
" | > EPOCH END -- GlobalStep:{} AvgTotalLoss:{:.5f} "
"AvgLinearLoss:{:.5f} AvgMelLoss:{:.5f} "
"AvgStopLoss:{:.5f} EpochTime:{:.2f} "
"AvgStepTime:{:.2f}".format(current_step, avg_total_loss,
@ -224,25 +230,29 @@ def train(model, criterion, criterion_st, optimizer, optimizer_st,
avg_stop_loss, epoch_time, avg_step_time),
flush=True)
# Plot Training Epoch Stats
epoch_stats = {"loss_postnet": avg_linear_loss,
"loss_decoder": avg_mel_loss,
"stop_loss": avg_stop_loss,
"epoch_time": epoch_time}
tb_logger.tb_train_epoch_stats(current_step, epoch_stats)
if c.tb_model_param_stats:
tb_logger.tb_model_weights(model, current_step)
# Plot Epoch Stats
if args.rank == 0:
# Plot Training Epoch Stats
epoch_stats = {
"loss_postnet": avg_linear_loss,
"loss_decoder": avg_mel_loss,
"stop_loss": avg_stop_loss,
"epoch_time": epoch_time
}
tb_logger.tb_train_epoch_stats(current_step, epoch_stats)
if c.tb_model_param_stats:
tb_logger.tb_model_weights(model, current_step)
return avg_linear_loss, current_step
def evaluate(model, criterion, criterion_st, ap, current_step):
def evaluate(model, criterion, criterion_st, ap, current_step, epoch):
data_loader = setup_loader(is_val=True)
model.eval()
epoch_time = 0
avg_linear_loss = 0
avg_mel_loss = 0
avg_stop_loss = 0
print(" | > Validation")
print("\n > Validation")
test_sentences = [
"It took me quite a long time to develop a voice, and now that I have it I'm not going to be silent.",
"Be a voice, not an echo.",
@ -296,74 +306,95 @@ def evaluate(model, criterion, criterion_st, ap, current_step):
if num_iter % c.print_step == 0:
print(
" | > TotalLoss: {:.5f} LinearLoss: {:.5f} MelLoss:{:.5f} "
" | > TotalLoss: {:.5f} LinearLoss: {:.5f} MelLoss:{:.5f} "
"StopLoss: {:.5f} ".format(loss.item(),
linear_loss.item(),
mel_loss.item(),
stop_loss.item()),
flush=True)
# aggregate losses from processes
if num_gpus > 1:
linear_loss = reduce_tensor(linear_loss.data, num_gpus)
mel_loss = reduce_tensor(mel_loss.data, num_gpus)
stop_loss = reduce_tensor(stop_loss.data, num_gpus)
avg_linear_loss += float(linear_loss.item())
avg_mel_loss += float(mel_loss.item())
avg_stop_loss += stop_loss.item()
# Diagnostic visualizations
idx = np.random.randint(mel_input.shape[0])
const_spec = linear_output[idx].data.cpu().numpy()
gt_spec = linear_input[idx].data.cpu().numpy()
align_img = alignments[idx].data.cpu().numpy()
if args.rank == 0:
# Diagnostic visualizations
idx = np.random.randint(mel_input.shape[0])
const_spec = linear_output[idx].data.cpu().numpy()
gt_spec = linear_input[idx].data.cpu().numpy()
align_img = alignments[idx].data.cpu().numpy()
eval_figures = {"prediction": plot_spectrogram(const_spec, ap),
"ground_truth": plot_spectrogram(gt_spec, ap),
"alignment": plot_alignment(align_img)}
tb_logger.tb_eval_figures(current_step, eval_figures)
eval_figures = {
"prediction": plot_spectrogram(const_spec, ap),
"ground_truth": plot_spectrogram(gt_spec, ap),
"alignment": plot_alignment(align_img)
}
tb_logger.tb_eval_figures(current_step, eval_figures)
# Sample audio
tb_logger.tb_eval_audios(current_step, {"ValAudio": ap.inv_spectrogram(const_spec.T)}, c.audio["sample_rate"])
# Sample audio
tb_logger.tb_eval_audios(
current_step, {"ValAudio": ap.inv_spectrogram(const_spec.T)},
c.audio["sample_rate"])
# compute average losses
avg_linear_loss /= (num_iter + 1)
avg_mel_loss /= (num_iter + 1)
avg_stop_loss /= (num_iter + 1)
# compute average losses
avg_linear_loss /= (num_iter + 1)
avg_mel_loss /= (num_iter + 1)
avg_stop_loss /= (num_iter + 1)
# Plot Validation Stats
epoch_stats = {"loss_postnet": avg_linear_loss,
"loss_decoder": avg_mel_loss,
"stop_loss": avg_stop_loss}
tb_logger.tb_eval_stats(current_step, epoch_stats)
# Plot Validation Stats
epoch_stats = {
"loss_postnet": avg_linear_loss,
"loss_decoder": avg_mel_loss,
"stop_loss": avg_stop_loss
}
tb_logger.tb_eval_stats(current_step, epoch_stats)
# test sentences
test_audios = {}
test_figures = {}
for idx, test_sentence in enumerate(test_sentences):
try:
wav, alignment, linear_spec, _, stop_tokens = synthesis(
model, test_sentence, c, use_cuda, ap)
file_path = os.path.join(AUDIO_PATH, str(current_step))
os.makedirs(file_path, exist_ok=True)
file_path = os.path.join(file_path,
"TestSentence_{}.wav".format(idx))
ap.save_wav(wav, file_path)
test_audios['{}-audio'.format(idx)] = wav
test_figures['{}-prediction'.format(idx)] = plot_spectrogram(linear_spec, ap)
test_figures['{}-alignment'.format(idx)] = plot_alignment(alignment)
except:
print(" !! Error creating Test Sentence -", idx)
traceback.print_exc()
tb_logger.tb_test_audios(current_step, test_audios, c.audio['sample_rate'])
tb_logger.tb_test_figures(current_step, test_figures)
if args.rank == 0 and epoch > c.test_delay_epochs:
# test sentences
test_audios = {}
test_figures = {}
print(" | > Synthesizing test sentences")
for idx, test_sentence in enumerate(test_sentences):
try:
wav, alignment, linear_spec, _, stop_tokens = synthesis(
model, test_sentence, c, use_cuda, ap)
file_path = os.path.join(AUDIO_PATH, str(current_step))
os.makedirs(file_path, exist_ok=True)
file_path = os.path.join(file_path,
"TestSentence_{}.wav".format(idx))
ap.save_wav(wav, file_path)
test_audios['{}-audio'.format(idx)] = wav
test_figures['{}-prediction'.format(idx)] = plot_spectrogram(
linear_spec, ap)
test_figures['{}-alignment'.format(idx)] = plot_alignment(
alignment)
except:
print(" !! Error creating Test Sentence -", idx)
traceback.print_exc()
tb_logger.tb_test_audios(current_step, test_audios, c.audio['sample_rate'])
tb_logger.tb_test_figures(current_step, test_figures)
return avg_linear_loss
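
synthesis() lives in utils/synthesis.py, which this commit does not touch. Judging only from the call above, it implements roughly the pipeline below; the text_to_seq helper and the inference-mode forward signature are assumptions, not the file's actual contents:

import numpy as np
import torch

def synthesis(model, text, c, use_cuda, ap):
    seq = np.asarray(text_to_seq(text, c))            # text_to_seq: hypothetical text encoder
    chars = torch.from_numpy(seq).unsqueeze(0).long()
    if use_cuda:
        chars = chars.cuda()
    # Inference-mode forward without teacher forcing (assumed signature).
    mel_spec, linear_spec, alignments, stop_tokens = model.forward(chars)
    linear_spec = linear_spec[0].data.cpu().numpy()
    wav = ap.inv_spectrogram(linear_spec.T)           # Griffin-Lim inversion
    return wav, alignments[0], linear_spec, mel_spec, stop_tokens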
def main(args):
# DISTRIBUTED
if num_gpus > 1:
init_distributed(args.rank, num_gpus, args.group_id,
c.distributed["backend"], c.distributed["url"])
num_chars = len(phonemes) if c.use_phonemes else len(symbols)
model = Tacotron(num_chars=num_chars,
embedding_dim=c.embedding_size,
linear_dim=ap.num_freq,
mel_dim=ap.num_mels,
r=c.r,
memory_size=c.memory_size)
print(" | > Num output units : {}".format(ap.num_freq), flush=True)
model = Tacotron(
num_chars=num_chars,
embedding_dim=c.embedding_size,
linear_dim=ap.num_freq,
mel_dim=ap.num_mels,
r=c.r,
memory_size=c.memory_size)
optimizer = optim.Adam(model.parameters(), lr=c.lr, weight_decay=0)
optimizer_st = optim.Adam(
@ -385,24 +416,26 @@ def main(args):
# 1. filter out unnecessary keys
pretrained_dict = {
k: v
for k, v in checkpoint['model'].items() if k in model_dict
}
# 2. filter out different size layers
pretrained_dict = {
k: v
for k, v in pretrained_dict.items() if v.numel() == model_dict[k].numel()
for k, v in pretrained_dict.items()
if v.numel() == model_dict[k].numel()
}
# 3. overwrite entries in the existing state dict
model_dict.update(pretrained_dict)
# 4. load the new state dict
model.load_state_dict(model_dict)
print(" | > {} / {} layers are initialized".format(len(pretrained_dict), len(model_dict)))
print(" | > {} / {} layers are initialized".format(
len(pretrained_dict), len(model_dict)))
if use_cuda:
model = model.cuda()
criterion.cuda()
criterion_st.cuda()
for group in optimizer.param_groups:
group['lr'] = c.lr
print(
" > Model restored from step %d" % checkpoint['step'], flush=True)
start_epoch = checkpoint['epoch']
@ -410,12 +443,15 @@ def main(args):
args.restore_step = checkpoint['step']
else:
args.restore_step = 0
print("\n > Starting a new training", flush=True)
if use_cuda:
model = model.cuda()
criterion.cuda()
criterion_st.cuda()
# DISTRIBUTED
if num_gpus > 1:
model = apply_gradient_allreduce(model)
if c.lr_decay:
scheduler = NoamLR(
optimizer,
@ -425,22 +461,18 @@ def main(args):
scheduler = None
num_params = count_parameters(model)
print(" | > Model has {} parameters".format(num_params), flush=True)
if not os.path.exists(CHECKPOINT_PATH):
os.mkdir(CHECKPOINT_PATH)
print("\n > Model has {} parameters".format(num_params), flush=True)
if 'best_loss' not in locals():
best_loss = float('inf')
for epoch in range(0, c.epochs):
train_loss, current_step = train(model, criterion, criterion_st,
optimizer, optimizer_st,
scheduler, ap, epoch)
val_loss = evaluate(model, criterion, criterion_st, ap,
current_step)
optimizer, optimizer_st, scheduler,
ap, epoch)
val_loss = evaluate(model, criterion, criterion_st, ap, current_step, epoch)
print(
" | > Train Loss: {:.5f} Validation Loss: {:.5f}".format(
" | > Training Loss: {:.5f} Validation Loss: {:.5f}".format(
train_loss, val_loss),
flush=True)
target_loss = train_loss
@ -468,31 +500,59 @@ if __name__ == '__main__':
default=False,
help='Do not verify commit integrity to run training.')
parser.add_argument(
'--data_path', type=str, default='', help='Defines the data path. It overwrites config.json.')
'--data_path',
type=str,
default='',
help='Defines the data path. It overwrites config.json.')
parser.add_argument(
'--output_path',
type=str,
help='path for training outputs.',
default='')
# DISTRIBUTED
parser.add_argument(
'--rank',
type=int,
default=0,
help='DISTRIBUTED: process rank for distributed training.')
parser.add_argument(
'--group_id',
type=str,
default="",
help='DISTRIBUTED: process group id.')
args = parser.parse_args()
# setup output paths and read configs
c = load_config(args.config_path)
_ = os.path.dirname(os.path.realpath(__file__))
OUT_PATH = os.path.join(_, c.output_path)
OUT_PATH = create_experiment_folder(OUT_PATH, c.model_name, args.debug)
CHECKPOINT_PATH = os.path.join(OUT_PATH, 'checkpoints')
AUDIO_PATH = os.path.join(OUT_PATH, 'test_audios')
os.makedirs(AUDIO_PATH, exist_ok=True)
shutil.copyfile(args.config_path, os.path.join(OUT_PATH, 'config.json'))
if args.data_path != '':
c.data_path = args.data_path
# setup tensorboard
LOG_DIR = OUT_PATH
tb_logger = Logger(LOG_DIR)
if args.output_path == '':
OUT_PATH = os.path.join(_, c.output_path)
else:
OUT_PATH = args.output_path
if args.group_id == '':
OUT_PATH = create_experiment_folder(OUT_PATH, c.model_name, args.debug)
AUDIO_PATH = os.path.join(OUT_PATH, 'test_audios')
if args.rank == 0:
os.makedirs(AUDIO_PATH, exist_ok=True)
shutil.copyfile(args.config_path, os.path.join(OUT_PATH,
'config.json'))
os.chmod(AUDIO_PATH, 0o775)
os.chmod(OUT_PATH, 0o775)
if args.rank==0:
LOG_DIR = OUT_PATH
tb_logger = Logger(LOG_DIR)
# Conditional imports
preprocessor = importlib.import_module('datasets.preprocess')
preprocessor = getattr(preprocessor, c.dataset.lower())
audio = importlib.import_module('utils.' + c.audio['audio_processor'])
AudioProcessor = getattr(audio, 'AudioProcessor')
# Audio processor
ap = AudioProcessor(**c.audio)
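
reduce_tensor and apply_gradient_allreduce also come from distribute.py. The loss aggregation in train() and evaluate() averages each tensor across processes before logging; a minimal sketch of reduce_tensor under standard torch.distributed semantics:

import torch.distributed as dist

def reduce_tensor(tensor, num_gpus):
    rt = tensor.clone()
    dist.all_reduce(rt, op=dist.ReduceOp.SUM)   # sum the value from every rank
    rt /= num_gpus                              # then average
    return rt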

utils/audio.py

@ -50,10 +50,9 @@ class AudioProcessor(object):
self.clip_norm = clip_norm
self.do_trim_silence = do_trim_silence
self.n_fft, self.hop_length, self.win_length = self._stft_parameters()
print(" | > Audio Processor attributes.")
members = vars(self)
for key, value in members.items():
print(" | > {}:{}".format(key, value))
print(" | > {}:{}".format(key, value))
def save_wav(self, wav, path):
wav_norm = wav * (32767 / max(0.01, np.max(np.abs(wav))))
@ -118,8 +117,6 @@ class AudioProcessor(object):
n_fft = (self.num_freq - 1) * 2
hop_length = int(self.frame_shift_ms / 1000.0 * self.sample_rate)
win_length = int(self.frame_length_ms / 1000.0 * self.sample_rate)
print(" | > fft size: {}, hop length: {}, win length: {}".format(
n_fft, hop_length, win_length))
return n_fft, hop_length, win_length
def _amp_to_db(self, x):
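
The removed lines above only drop logging; the parameter derivation is unchanged. With num_freq = 1025 from config.json, and assuming LJSpeech's 22050 Hz sample rate with a 12.5 ms frame shift and 50 ms frame length (typical values for this setup, not shown in the hunk), the arithmetic works out as:

num_freq, sample_rate = 1025, 22050
frame_shift_ms, frame_length_ms = 12.5, 50.0

n_fft = (num_freq - 1) * 2                                 # 2048
hop_length = int(frame_shift_ms / 1000.0 * sample_rate)    # 275
win_length = int(frame_length_ms / 1000.0 * sample_rate)   # 1102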

(deleted file: lws-based AudioProcessor)

@ -1,151 +0,0 @@
import os
import sys
import librosa
import pickle
import copy
import numpy as np
from scipy import signal
import lws
_mel_basis = None
class AudioProcessor(object):
def __init__(
self,
sample_rate,
num_mels,
min_level_db,
frame_shift_ms,
frame_length_ms,
ref_level_db,
num_freq,
power,
preemphasis,
min_mel_freq,
max_mel_freq,
griffin_lim_iters=None,
):
print(" > Setting up Audio Processor...")
self.sample_rate = sample_rate
self.num_mels = num_mels
self.min_level_db = min_level_db
self.frame_shift_ms = frame_shift_ms
self.frame_length_ms = frame_length_ms
self.ref_level_db = ref_level_db
self.num_freq = num_freq
self.power = power
self.min_mel_freq = min_mel_freq
self.max_mel_freq = max_mel_freq
self.griffin_lim_iters = griffin_lim_iters
self.preemphasis = preemphasis
self.n_fft, self.hop_length, self.win_length = self._stft_parameters()
if preemphasis == 0:
print(" | > Preemphasis is deactive.")
def save_wav(self, wav, path):
wav *= 32767 / max(0.01, np.max(np.abs(wav)))
librosa.output.write_wav(
path, wav.astype(np.int16), self.sample_rate)
def _stft_parameters(self, ):
n_fft = int((self.num_freq - 1) * 2)
hop_length = int(self.frame_shift_ms / 1000.0 * self.sample_rate)
win_length = int(self.frame_length_ms / 1000.0 * self.sample_rate)
if n_fft % hop_length != 0:
hop_length = n_fft / 8
print(" | > hop_length is set to default ({}).".format(hop_length))
if n_fft % win_length != 0:
win_length = n_fft / 2
print(" | > win_length is set to default ({}).".format(win_length))
print(" | > fft size: {}, hop length: {}, win length: {}".format(
n_fft, hop_length, win_length))
return int(n_fft), int(hop_length), int(win_length)
def _lws_processor(self):
try:
return lws.lws(
self.win_length,
self.hop_length,
fftsize=self.n_fft,
mode="speech")
except:
raise RuntimeError(
" !! WindowLength({}) is not multiple of HopLength({}).".
format(self.win_length, self.hop_length))
def _amp_to_db(self, x):
min_level = np.exp(self.min_level_db / 20 * np.log(10))
return 20 * np.log10(np.maximum(min_level, x))
def _db_to_amp(self, x):
return np.power(10.0, x * 0.05)
def _normalize(self, S):
return np.clip((S - self.min_level_db) / -self.min_level_db, 0, 1)
def _denormalize(self, S):
return (np.clip(S, 0, 1) * -self.min_level_db) + self.min_level_db
def apply_preemphasis(self, x):
if self.preemphasis == 0:
raise RuntimeError(" !! Preemphasis is applied with factor 0.0. ")
return signal.lfilter([1, -self.preemphasis], [1], x)
def apply_inv_preemphasis(self, x):
if self.preemphasis == 0:
raise RuntimeError(" !! Preemphasis is applied with factor 0.0. ")
return signal.lfilter([1], [1, -self.preemphasis], x)
def spectrogram(self, y):
f = open(os.devnull, 'w')
old_out = sys.stdout
sys.stdout = f
if self.preemphasis:
D = self._lws_processor().stft(self.apply_preemphasis(y)).T
else:
D = self._lws_processor().stft(y).T
S = self._amp_to_db(np.abs(D)) - self.ref_level_db
sys.stdout = old_out
return self._normalize(S)
def inv_spectrogram(self, spectrogram):
'''Converts spectrogram to waveform using librosa'''
f = open(os.devnull, 'w')
old_out = sys.stdout
sys.stdout = f
S = self._denormalize(spectrogram)
S = self._db_to_amp(S + self.ref_level_db) # Convert back to linear
processor = self._lws_processor()
D = processor.run_lws(S.astype(np.float64).T**self.power)
y = processor.istft(D).astype(np.float32)
# Reconstruct phase
sys.stdout = old_out
if self.preemphasis:
return self.apply_inv_preemphasis(y)
return y
def _linear_to_mel(self, spectrogram):
global _mel_basis
if _mel_basis is None:
_mel_basis = self._build_mel_basis()
return np.dot(_mel_basis, spectrogram)
def _build_mel_basis(self, ):
return librosa.filters.mel(
self.sample_rate, self.n_fft, n_mels=self.num_mels)
# fmin=self.min_mel_freq, fmax=self.max_mel_freq)
def melspectrogram(self, y):
f = open(os.devnull, 'w')
old_out = sys.stdout
sys.stdout = f
if self.preemphasis:
D = self._lws_processor().stft(self.apply_preemphasis(y)).T
else:
D = self._lws_processor().stft(y).T
S = self._amp_to_db(self._linear_to_mel(np.abs(D))) - self.ref_level_db
sys.stdout = old_out
return self._normalize(S)
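
A note on the deleted processor's dB conversions, which survive in the new utils/audio.py: _amp_to_db floors amplitudes at 10^(min_level_db/20) before taking 20*log10, so _db_to_amp inverts it exactly for any value above that floor. A quick standalone check, assuming min_level_db = -100:

import numpy as np

min_level_db = -100
min_level = np.exp(min_level_db / 20 * np.log(10))   # same as 10 ** (min_level_db / 20), i.e. 1e-05

def amp_to_db(x):
    return 20 * np.log10(np.maximum(min_level, x))

def db_to_amp(x):
    return np.power(10.0, x * 0.05)

assert np.isclose(db_to_amp(amp_to_db(0.3)), 0.3)          # exact round-trip above the floor
assert np.isclose(db_to_amp(amp_to_db(1e-7)), min_level)   # clamped to the floor below it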

utils/generic_utils.py

@ -123,7 +123,7 @@ def save_best_model(model, optimizer, model_loss, best_loss, out_path,
best_loss = model_loss
bestmodel_path = 'best_model.pth.tar'
bestmodel_path = os.path.join(out_path, bestmodel_path)
print(" | > Best model saving with loss {0:.5f} : {1:}".format(
print("\n > BEST MODEL ({0:.5f}) : {1:}".format(
model_loss, bestmodel_path))
torch.save(state, bestmodel_path)
return best_loss
@ -148,6 +148,17 @@ def lr_decay(init_lr, global_step, warmup_steps):
return lr
def weight_decay(optimizer, wd):
"""
Custom weight decay operation, not affecting grad values.
"""
for group in optimizer.param_groups:
for param in group['params']:
current_lr = group['lr']
param.data = param.data.add(-wd * group['lr'], param.data)
return optimizer, current_lr
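
The in-place update above scales every parameter by (1 - wd * lr) after the optimizer step, i.e. decoupled (AdamW-style) weight decay rather than the L2 penalty that Adam's built-in weight_decay folds into the gradient; that is why train.py constructs Adam with weight_decay=0. A quick check of the equivalence for a single parameter p:

import torch

wd, lr = 1e-5, 1e-3
p = torch.ones(3)
# weight_decay() computes p.data.add(-wd * lr, p.data), i.e. p + (-wd * lr) * p.
# Written explicitly, the same multiplicative shrink:
assert torch.allclose(p + (-wd * lr) * p, p * (1.0 - wd * lr))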
class NoamLR(torch.optim.lr_scheduler._LRScheduler):
def __init__(self, optimizer, warmup_steps=0.1, last_epoch=-1):
self.warmup_steps = float(warmup_steps)
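
The hunk cuts NoamLR off at __init__; for reference, the Noam schedule scales each base learning rate by warmup_steps^0.5 * min(step * warmup_steps^-1.5, step^-0.5), warming up linearly and then decaying with the inverse square root of the step. A sketch of the missing get_lr under that standard formulation (the file's exact body is not shown here):

def get_lr(self):
    step = max(self.last_epoch, 1)
    scale = self.warmup_steps**0.5 * min(step * self.warmup_steps**-1.5,
                                         step**-0.5)
    return [base_lr * scale for base_lr in self.base_lrs]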