From bf5f18d11e550a791cb862bbfe813c395425e548 Mon Sep 17 00:00:00 2001
From: Eren Golge
Date: Wed, 27 Feb 2019 09:50:52 +0100
Subject: [PATCH] Formatting changes and distributed training

---
 config.json            |  12 +-
 datasets/TTSDataset.py |  38 ++--
 hard-sentences.txt     |   5 -
 models/tacotron.py     |   1 -
 train.py               | 408 +++++++++++++++++++++++------------------
 utils/audio.py         |   5 +-
 utils/audio_lws.py     | 151 ---------------
 utils/generic_utils.py |  13 +-
 8 files changed, 277 insertions(+), 356 deletions(-)
 delete mode 100644 hard-sentences.txt
 delete mode 100644 utils/audio_lws.py

diff --git a/config.json b/config.json
index f49eed79..037f8249 100644
--- a/config.json
+++ b/config.json
@@ -3,7 +3,6 @@
     "model_description": "Queue memory and change lower r incrementally",
 
     "audio":{
-        "audio_processor": "audio", // to use dictate different audio processors, if available.
         // Audio processing parameters
         "num_mels": 80, // size of the mel spec frame.
         "num_freq": 1025, // number of stft frequency levels. Size of the linear spectrogram frame.
@@ -25,6 +24,11 @@
         "do_trim_silence": true // enable trimming of silence of audio as you load it. LJSpeech (false), TWEB (false), Nancy (true)
     },
 
+    "distributed":{
+        "backend": "nccl",
+        "url": "tcp:\/\/localhost:54321"
+    },
+
     "embedding_size": 256, // Character embedding vector length. You don't need to change it in general.
     "text_cleaner": "phoneme_cleaners",
     "epochs": 1000, // total number of epochs to train.
@@ -37,14 +41,16 @@
     "batch_size": 32, // Batch size for training. Values lower than 32 might make attention hard to learn.
     "eval_batch_size":32,
-    "r": 2, // Number of frames to predict for step.
-    "wd": 0.00001, // Weight decay weight.
+    "r": 5, // Number of frames to predict for step.
+    "wd": 0.00001, // Weight decay weight.
     "checkpoint": true, // If true, it saves checkpoints per "save_step"
     "save_step": 5000, // Number of training steps expected to save training stats and checkpoints.
     "print_step": 50, // Number of steps to log training on console.
     "tb_model_param_stats": false, // true, plots param stats per layer on tensorboard. Might be memory consuming, but good for debugging.
+    "batch_group_size": 8, // Number of batches to shuffle after bucketing.
 
     "run_eval": true,
+    "test_delay_epochs": 100, // Until attention is aligned, testing only wastes computation time.
     "data_path": "/media/erogol/data_ssd/Data/LJSpeech-1.1", // DATASET-RELATED: can be overwritten from command argument
     "meta_file_train": "metadata_train.csv", // DATASET-RELATED: metafile for training dataloader.
     "meta_file_val": "metadata_val.csv", // DATASET-RELATED: metafile for evaluation dataloader.
diff --git a/datasets/TTSDataset.py b/datasets/TTSDataset.py
index 0eb5e115..f4c70c00 100644
--- a/datasets/TTSDataset.py
+++ b/datasets/TTSDataset.py
@@ -25,7 +25,8 @@ class MyDataset(Dataset):
                  cached=False,
                  use_phonemes=True,
                  phoneme_cache_path=None,
-                 phoneme_language="en-us"):
+                 phoneme_language="en-us",
+                 verbose=False):
         """
         Args:
             root_path (str): root path for the data folder.
@@ -47,6 +48,7 @@ class MyDataset(Dataset):
             phoneme_cache_path (str): path to cache phoneme features.
             phoneme_language (str): one of the languages from
                 https://github.com/bootphon/phonemizer#languages
+            verbose (bool): print diagnostic information.
""" self.root_path = root_path self.batch_group_size = batch_group_size @@ -61,16 +63,17 @@ class MyDataset(Dataset): self.use_phonemes = use_phonemes self.phoneme_cache_path = phoneme_cache_path self.phoneme_language = phoneme_language + self.verbose = verbose if use_phonemes and not os.path.isdir(phoneme_cache_path): os.makedirs(phoneme_cache_path) - print(" > DataLoader initialization") - print(" | > Data path: {}".format(root_path)) - print(" | > Use phonemes: {}".format(self.use_phonemes)) - if use_phonemes: - print(" | > phoneme language: {}".format(phoneme_language)) - print(" | > Cached dataset: {}".format(self.cached)) - print(" | > Number of instances : {}".format(len(self.items))) - + if self.verbose: + print("\n > DataLoader initialization") + print(" | > Data path: {}".format(root_path)) + print(" | > Use phonemes: {}".format(self.use_phonemes)) + if use_phonemes: + print(" | > phoneme language: {}".format(phoneme_language)) + print(" | > Cached dataset: {}".format(self.cached)) + print(" | > Number of instances : {}".format(len(self.items))) self.sort_items() def load_wav(self, filename): @@ -125,11 +128,7 @@ class MyDataset(Dataset): def sort_items(self): r"""Sort instances based on text length in ascending order""" lengths = np.array([len(ins[0]) for ins in self.items]) - - print(" | > Max length sequence: {}".format(np.max(lengths))) - print(" | > Min length sequence: {}".format(np.min(lengths))) - print(" | > Avg length sequence: {}".format(np.mean(lengths))) - + idxs = np.argsort(lengths) new_items = [] ignored = [] @@ -139,11 +138,8 @@ class MyDataset(Dataset): ignored.append(idx) else: new_items.append(self.items[idx]) - print(" | > {} instances are ignored ({})".format( - len(ignored), self.min_seq_len)) # shuffle batch groups if self.batch_group_size > 0: - print(" | > Batch group shuffling is active.") for i in range(len(new_items) // self.batch_group_size): offset = i * self.batch_group_size end_offset = offset + self.batch_group_size @@ -152,6 +148,14 @@ class MyDataset(Dataset): new_items[offset : end_offset] = temp_items self.items = new_items + if self.verbose: + print(" | > Max length sequence: {}".format(np.max(lengths))) + print(" | > Min length sequence: {}".format(np.min(lengths))) + print(" | > Avg length sequence: {}".format(np.mean(lengths))) + print(" | > Num. instances discarded by max-min seq limits: {}".format( + len(ignored), self.min_seq_len)) + print(" | > Batch group size: {}.".format(self.batch_group_size)) + def __len__(self): return len(self.items) diff --git a/hard-sentences.txt b/hard-sentences.txt deleted file mode 100644 index 42fbc655..00000000 --- a/hard-sentences.txt +++ /dev/null @@ -1,5 +0,0 @@ -Encouraged, he started with a minute a day. -His meditation consisted of “body scanning” which involved focusing his mind and energy on each section of the body from head to toe. -Recent research at Harvard has shown meditating for as little as 8 weeks can actually increase the grey matter in the parts of the brain responsible for emotional regulation and learning. -If he decided to watch TV he really watched it. -Often we try to bring about change through sheer effort and we put all of our energy into a new initiative. 
diff --git a/models/tacotron.py b/models/tacotron.py
index 6a12d257..e11d74e7 100644
--- a/models/tacotron.py
+++ b/models/tacotron.py
@@ -21,7 +21,6 @@ class Tacotron(nn.Module):
         self.linear_dim = linear_dim
         self.embedding = nn.Embedding(
             num_chars, embedding_dim, padding_idx=padding_idx)
-        print(" | > Number of characters : {}".format(num_chars))
         self.embedding.weight.data.normal_(0, 0.3)
         self.encoder = Encoder(embedding_dim)
         self.decoder = Decoder(256, mel_dim, r, memory_size, attn_windowing)
diff --git a/train.py b/train.py
index 9c3b2b8f..783b9c89 100644
--- a/train.py
+++ b/train.py
@@ -1,38 +1,44 @@
-import os
-import sys
-import time
-import shutil
-import torch
 import argparse
 import importlib
+import os
+import shutil
+import sys
+import time
 import traceback
-import numpy as np
+import numpy as np
+import torch
 import torch.nn as nn
+from tensorboardX import SummaryWriter
 from torch import optim
 from torch.utils.data import DataLoader
-from tensorboardX import SummaryWriter
-from utils.generic_utils import (
-    remove_experiment_folder, create_experiment_folder, save_checkpoint,
-    save_best_model, load_config, lr_decay, count_parameters, check_update,
-    get_commit_hash, sequence_mask, NoamLR)
-from utils.text.symbols import symbols, phonemes
-from utils.visual import plot_alignment, plot_spectrogram
-from models.tacotron import Tacotron
-from layers.losses import L1LossMasked
 from datasets.TTSDataset import MyDataset
+from layers.losses import L1LossMasked
+from models.tacotron import Tacotron
 from utils.audio import AudioProcessor
-from utils.synthesis import synthesis
+from utils.generic_utils import (
+    NoamLR, check_update, count_parameters, create_experiment_folder,
+    get_commit_hash, load_config, lr_decay, remove_experiment_folder,
+    save_best_model, save_checkpoint, sequence_mask, weight_decay)
 from utils.logger import Logger
+from utils.synthesis import synthesis
+from utils.text.symbols import phonemes, symbols
+from utils.visual import plot_alignment, plot_spectrogram
+from distribute import init_distributed, apply_gradient_allreduce, reduce_tensor
+from distribute import DistributedSampler
 
-torch.manual_seed(1)
+
+torch.backends.cudnn.enabled = True
+torch.backends.cudnn.benchmark = False
+torch.manual_seed(54321)
 use_cuda = torch.cuda.is_available()
+num_gpus = torch.cuda.device_count()
 print(" > Using CUDA: ", use_cuda)
-print(" > Number of GPUs: ", torch.cuda.device_count())
+print(" > Number of GPUs: ", num_gpus)
 
 
-def setup_loader(is_val=False):
+def setup_loader(is_val=False, verbose=False):
     global ap
     if is_val and not c.run_eval:
         loader = None
@@ -44,38 +50,44 @@ def setup_loader(is_val=False):
             c.text_cleaner,
             preprocessor=preprocessor,
             ap=ap,
-            batch_group_size=0 if is_val else 8 * c.batch_size,
+            batch_group_size=0 if is_val else c.batch_group_size * c.batch_size,
            min_seq_len=0 if is_val else c.min_seq_len,
             max_seq_len=float("inf") if is_val else c.max_seq_len,
             cached=False if c.dataset != "tts_cache" else True,
             phoneme_cache_path=c.phoneme_cache_path,
             use_phonemes=c.use_phonemes,
-            phoneme_language=c.phoneme_language
-        )
+            phoneme_language=c.phoneme_language,
+            verbose=verbose)
+        sampler = DistributedSampler(dataset) if num_gpus > 1 else None
         loader = DataLoader(
             dataset,
             batch_size=c.eval_batch_size if is_val else c.batch_size,
             shuffle=False,
             collate_fn=dataset.collate_fn,
             drop_last=False,
-            num_workers=c.num_val_loader_workers if is_val else c.num_loader_workers,
+            sampler=sampler,
+            num_workers=c.num_val_loader_workers
+            if is_val else c.num_loader_workers,
             pin_memory=False)
     return loader
 
 
-def train(model, criterion, criterion_st, optimizer, optimizer_st,
-          scheduler, ap, epoch):
-    data_loader = setup_loader(is_val=False)
+def train(model, criterion, criterion_st, optimizer, optimizer_st, scheduler,
+          ap, epoch):
+    data_loader = setup_loader(is_val=False, verbose=(epoch == 0))
     model.train()
     epoch_time = 0
     avg_linear_loss = 0
     avg_mel_loss = 0
     avg_stop_loss = 0
     avg_step_time = 0
-    print(" | > Epoch {}/{}".format(epoch, c.epochs), flush=True)
+    print("\n > Epoch {}/{}".format(epoch, c.epochs), flush=True)
     n_priority_freq = int(
         3000 / (c.audio['sample_rate'] * 0.5) * c.audio['num_freq'])
-    batch_n_iter = int(len(data_loader.dataset) / c.batch_size)
+    if num_gpus > 0:
+        batch_n_iter = int(len(data_loader.dataset) / (c.batch_size * num_gpus))
+    else:
+        batch_n_iter = int(len(data_loader.dataset) / c.batch_size)
     for num_iter, data in enumerate(data_loader):
         start_time = time.time()
@@ -116,12 +128,8 @@ def train(model, criterion, criterion_st, optimizer, optimizer_st,
         mask = sequence_mask(text_lengths)
 
         # forward pass
-        if use_cuda:
-            mel_output, linear_output, alignments, stop_tokens = torch.nn.parallel.data_parallel(
-                model, (text_input, mel_input, mask))
-        else:
-            mel_output, linear_output, alignments, stop_tokens = model(
-                text_input, mel_input, mask)
+        mel_output, linear_output, alignments, stop_tokens = model(
+            text_input, mel_input, mask)
 
         # loss computation
         stop_loss = criterion_st(stop_tokens, stop_targets)
@@ -134,29 +142,14 @@ def train(model, criterion, criterion_st, optimizer, optimizer_st,
 
         # backward pass and check the grad norm for spec losses
         loss.backward(retain_graph=True)
-        # custom weight decay
-        for group in optimizer.param_groups:
-            for param in group['params']:
-                current_lr = group['lr']
-                param.data = param.data.add(-c.wd * group['lr'], param.data)
-        grad_norm, skip_flag = check_update(model, 1)
-        if skip_flag:
-            optimizer.zero_grad()
-            print(" | > Iteration skipped!!", flush=True)
-            continue
+        optimizer, current_lr = weight_decay(optimizer, c.wd)
+        grad_norm, _ = check_update(model, 1.0)
         optimizer.step()
 
         # backward pass and check the grad norm for stop loss
         stop_loss.backward()
-        # custom weight decay
-        for group in optimizer_st.param_groups:
-            for param in group['params']:
-                param.data = param.data.add(-c.wd * group['lr'], param.data)
-        grad_norm_st, skip_flag = check_update(model.decoder.stopnet, 0.5)
-        if skip_flag:
-            optimizer_st.zero_grad()
-            print(" | > Iteration skipped fro stopnet!!")
-            continue
+        optimizer_st, _ = weight_decay(optimizer_st, c.wd)
+        grad_norm_st, _ = check_update(model.decoder.stopnet, 1.0)
         optimizer_st.step()
 
         step_time = time.time() - start_time
@@ -164,49 +157,62 @@ def train(model, criterion, criterion_st, optimizer, optimizer_st,
         epoch_time += step_time
 
         if current_step % c.print_step == 0:
             print(
-                " | > Step:{}/{} GlobalStep:{} TotalLoss:{:.5f} LinearLoss:{:.5f} "
+                " | > Step:{}/{} GlobalStep:{} TotalLoss:{:.5f} LinearLoss:{:.5f} "
                 "MelLoss:{:.5f} StopLoss:{:.5f} GradNorm:{:.5f} "
-                "GradNormST:{:.5f} AvgTextLen:{:.1f} AvgSpecLen:{:.1f} StepTime:{:.2f} LR:{:.6f}".format(
-                    num_iter, batch_n_iter, current_step, loss.item(),
-                    linear_loss.item(), mel_loss.item(), stop_loss.item(),
-                    grad_norm, grad_norm_st, avg_text_length, avg_spec_length, step_time, current_lr),
+                "GradNormST:{:.5f} AvgTextLen:{:.1f} AvgSpecLen:{:.1f} StepTime:{:.2f} LR:{:.6f}"
+                .format(num_iter, batch_n_iter, current_step, loss.item(),
+                        linear_loss.item(), mel_loss.item(), stop_loss.item(),
+                        grad_norm, grad_norm_st, avg_text_length,
+                        avg_spec_length, step_time, current_lr),
                 flush=True)
 
-        avg_linear_loss += float(linear_loss.item())
-        avg_mel_loss += float(mel_loss.item())
-        avg_stop_loss += stop_loss.item()
-        avg_step_time += step_time
+        # aggregate losses from processes
+        if num_gpus > 1:
+            linear_loss = reduce_tensor(linear_loss.data, num_gpus)
+            mel_loss = reduce_tensor(mel_loss.data, num_gpus)
+            loss = reduce_tensor(loss.data, num_gpus)
+            stop_loss = reduce_tensor(stop_loss.data, num_gpus)
 
-        # Plot Training Iter Stats
-        iter_stats = {"loss_posnet": linear_loss.item(),
-                      "loss_decoder": mel_loss.item(),
-                      "lr": current_lr,
-                      "grad_norm": grad_norm,
-                      "grad_norm_st": grad_norm_st,
-                      "step_time": step_time}
-        tb_logger.tb_train_iter_stats(current_step, iter_stats)
+        if args.rank == 0:
+            avg_linear_loss += float(linear_loss.item())
+            avg_mel_loss += float(mel_loss.item())
+            avg_stop_loss += stop_loss.item()
+            avg_step_time += step_time
 
-        if current_step % c.save_step == 0:
-            if c.checkpoint:
-                # save model
-                save_checkpoint(model, optimizer, optimizer_st,
-                                linear_loss.item(), OUT_PATH, current_step,
-                                epoch)
+            # Plot Training Iter Stats
+            iter_stats = {
+                "loss_posnet": linear_loss.item(),
+                "loss_decoder": mel_loss.item(),
+                "lr": current_lr,
+                "grad_norm": grad_norm,
+                "grad_norm_st": grad_norm_st,
+                "step_time": step_time
+            }
+            tb_logger.tb_train_iter_stats(current_step, iter_stats)
 
-            # Diagnostic visualizations
-            const_spec = linear_output[0].data.cpu().numpy()
-            gt_spec = linear_input[0].data.cpu().numpy()
-            align_img = alignments[0].data.cpu().numpy()
+            if current_step % c.save_step == 0:
+                if c.checkpoint:
+                    # save model
+                    save_checkpoint(model, optimizer, optimizer_st,
+                                    linear_loss.item(), OUT_PATH, current_step,
+                                    epoch)
 
-            figures = {"prediction": plot_spectrogram(const_spec, ap),
-                       "ground_truth": plot_spectrogram(gt_spec, ap),
-                       "alignment": plot_alignment(align_img)}
-            tb_logger.tb_train_figures(current_step, figures)
+                # Diagnostic visualizations
+                const_spec = linear_output[0].data.cpu().numpy()
+                gt_spec = linear_input[0].data.cpu().numpy()
+                align_img = alignments[0].data.cpu().numpy()
 
-            # Sample audio
-            tb_logger.tb_train_audios(current_step,
-                                      {'TrainAudio': ap.inv_spectrogram(const_spec.T)},
-                                      c.audio["sample_rate"])
+                figures = {
+                    "prediction": plot_spectrogram(const_spec, ap),
+                    "ground_truth": plot_spectrogram(gt_spec, ap),
+                    "alignment": plot_alignment(align_img)
+                }
+                tb_logger.tb_train_figures(current_step, figures)
+
+                # Sample audio
+                tb_logger.tb_train_audios(
+                    current_step, {'TrainAudio': ap.inv_spectrogram(const_spec.T)},
+                    c.audio["sample_rate"])
 
     avg_linear_loss /= (num_iter + 1)
     avg_mel_loss /= (num_iter + 1)
@@ -216,7 +222,7 @@ def train(model, criterion, criterion_st, optimizer, optimizer_st,
 
     # print epoch stats
     print(
-        " | > EPOCH END -- GlobalStep:{} AvgTotalLoss:{:.5f} "
+        " | > EPOCH END -- GlobalStep:{} AvgTotalLoss:{:.5f} "
        "AvgLinearLoss:{:.5f} AvgMelLoss:{:.5f} "
         "AvgStopLoss:{:.5f} EpochTime:{:.2f} "
         "AvgStepTime:{:.2f}".format(current_step, avg_total_loss,
@@ -224,25 +230,29 @@ def train(model, criterion, criterion_st, optimizer, optimizer_st,
                                     avg_linear_loss, avg_mel_loss,
                                     avg_stop_loss, epoch_time,
                                     avg_step_time),
         flush=True)
 
-    # Plot Training Epoch Stats
-    epoch_stats = {"loss_postnet": avg_linear_loss,
-                   "loss_decoder": avg_mel_loss,
-                   "stop_loss": avg_stop_loss,
-                   "epoch_time": epoch_time}
-    tb_logger.tb_train_epoch_stats(current_step, epoch_stats)
-    if c.tb_model_param_stats:
-        tb_logger.tb_model_weights(model, current_step)
+    # Plot Epoch Stats
+    if args.rank == 0:
+        # Plot Training Epoch Stats
+        epoch_stats = {
+            "loss_postnet": avg_linear_loss,
+            "loss_decoder": avg_mel_loss,
+            "stop_loss": avg_stop_loss,
+            "epoch_time": epoch_time
+        }
+        tb_logger.tb_train_epoch_stats(current_step, epoch_stats)
+        if c.tb_model_param_stats:
+            tb_logger.tb_model_weights(model, current_step)
 
     return avg_linear_loss, current_step
 
 
-def evaluate(model, criterion, criterion_st, ap, current_step):
+def evaluate(model, criterion, criterion_st, ap, current_step, epoch):
     data_loader = setup_loader(is_val=True)
     model.eval()
     epoch_time = 0
     avg_linear_loss = 0
     avg_mel_loss = 0
     avg_stop_loss = 0
-    print(" | > Validation")
+    print("\n > Validation")
     test_sentences = [
         "It took me quite a long time to develop a voice, and now that I have it I'm not going to be silent.",
         "Be a voice, not an echo.",
@@ -296,74 +306,95 @@ def evaluate(model, criterion, criterion_st, ap, current_step):
 
             if num_iter % c.print_step == 0:
                 print(
-                    " | > TotalLoss: {:.5f} LinearLoss: {:.5f} MelLoss:{:.5f} "
+                    " | > TotalLoss: {:.5f} LinearLoss: {:.5f} MelLoss:{:.5f} "
                     "StopLoss: {:.5f} ".format(loss.item(),
                                                linear_loss.item(),
                                                mel_loss.item(),
                                                stop_loss.item()),
                     flush=True)
 
+            # aggregate losses from processes
+            if num_gpus > 1:
+                linear_loss = reduce_tensor(linear_loss.data, num_gpus)
+                mel_loss = reduce_tensor(mel_loss.data, num_gpus)
+                stop_loss = reduce_tensor(stop_loss.data, num_gpus)
+
             avg_linear_loss += float(linear_loss.item())
             avg_mel_loss += float(mel_loss.item())
             avg_stop_loss += stop_loss.item()
 
-            # Diagnostic visualizations
-            idx = np.random.randint(mel_input.shape[0])
-            const_spec = linear_output[idx].data.cpu().numpy()
-            gt_spec = linear_input[idx].data.cpu().numpy()
-            align_img = alignments[idx].data.cpu().numpy()
+        if args.rank == 0:
+            # Diagnostic visualizations
+            idx = np.random.randint(mel_input.shape[0])
+            const_spec = linear_output[idx].data.cpu().numpy()
+            gt_spec = linear_input[idx].data.cpu().numpy()
+            align_img = alignments[idx].data.cpu().numpy()
 
-            eval_figures = {"prediction": plot_spectrogram(const_spec, ap),
-                            "ground_truth": plot_spectrogram(gt_spec, ap),
-                            "alignment": plot_alignment(align_img)}
-            tb_logger.tb_eval_figures(current_step, eval_figures)
+            eval_figures = {
+                "prediction": plot_spectrogram(const_spec, ap),
+                "ground_truth": plot_spectrogram(gt_spec, ap),
+                "alignment": plot_alignment(align_img)
+            }
+            tb_logger.tb_eval_figures(current_step, eval_figures)
 
-            # Sample audio
-            tb_logger.tb_eval_audios(current_step, {"ValAudio": ap.inv_spectrogram(const_spec.T)}, c.audio["sample_rate"])
+            # Sample audio
+            tb_logger.tb_eval_audios(
+                current_step, {"ValAudio": ap.inv_spectrogram(const_spec.T)},
+                c.audio["sample_rate"])
 
-    # compute average losses
-    avg_linear_loss /= (num_iter + 1)
-    avg_mel_loss /= (num_iter + 1)
-    avg_stop_loss /= (num_iter + 1)
+            # compute average losses
+            avg_linear_loss /= (num_iter + 1)
+            avg_mel_loss /= (num_iter + 1)
+            avg_stop_loss /= (num_iter + 1)
 
-    # Plot Validation Stats
-    epoch_stats = {"loss_postnet": avg_linear_loss,
-                   "loss_decoder": avg_mel_loss,
-                   "stop_loss": avg_stop_loss}
-    tb_logger.tb_eval_stats(current_step, epoch_stats)
+            # Plot Validation Stats
+            epoch_stats = {
+                "loss_postnet": avg_linear_loss,
+                "loss_decoder": avg_mel_loss,
+                "stop_loss": avg_stop_loss
+            }
+            tb_logger.tb_eval_stats(current_step, epoch_stats)
 
-    # test sentences
-    test_audios = {}
-    test_figures = {}
-    for idx, test_sentence in enumerate(test_sentences):
-        try:
-            wav, alignment, linear_spec, _, stop_tokens = synthesis(
-                model, test_sentence, c, use_cuda, ap)
-            file_path = os.path.join(AUDIO_PATH, str(current_step))
-            os.makedirs(file_path, exist_ok=True)
-            file_path = os.path.join(file_path,
-                                     "TestSentence_{}.wav".format(idx))
-            ap.save_wav(wav, file_path)
-            test_audios['{}-audio'.format(idx)] = wav
-            test_figures['{}-prediction'.format(idx)] = plot_spectrogram(linear_spec, ap)
-            test_figures['{}-alignment'.format(idx)] = plot_alignment(alignment)
-        except:
-            print(" !! Error creating Test Sentence -", idx)
-            traceback.print_exc()
-    tb_logger.tb_test_audios(current_step, test_audios, c.audio['sample_rate'])
-    tb_logger.tb_test_figures(current_step, test_figures)
+    if args.rank == 0 and epoch > c.test_delay_epochs:
+        # test sentences
+        test_audios = {}
+        test_figures = {}
+        print(" | > Synthesizing test sentences")
+        for idx, test_sentence in enumerate(test_sentences):
+            try:
+                wav, alignment, linear_spec, _, stop_tokens = synthesis(
+                    model, test_sentence, c, use_cuda, ap)
+                file_path = os.path.join(AUDIO_PATH, str(current_step))
+                os.makedirs(file_path, exist_ok=True)
+                file_path = os.path.join(file_path,
+                                         "TestSentence_{}.wav".format(idx))
+                ap.save_wav(wav, file_path)
+                test_audios['{}-audio'.format(idx)] = wav
+                test_figures['{}-prediction'.format(idx)] = plot_spectrogram(
+                    linear_spec, ap)
+                test_figures['{}-alignment'.format(idx)] = plot_alignment(
+                    alignment)
+            except:
+                print(" !! Error creating Test Sentence -", idx)
+                traceback.print_exc()
+        tb_logger.tb_test_audios(current_step, test_audios,
+                                 c.audio['sample_rate'])
+        tb_logger.tb_test_figures(current_step, test_figures)
     return avg_linear_loss
 
 
 def main(args):
+    # DISTRIBUTED
+    if num_gpus > 1:
+        init_distributed(args.rank, num_gpus, args.group_id,
+                         c.distributed["backend"], c.distributed["url"])
     num_chars = len(phonemes) if c.use_phonemes else len(symbols)
-    model = Tacotron(num_chars=num_chars,
-                     embedding_dim=c.embedding_size,
-                     linear_dim=ap.num_freq,
-                     mel_dim=ap.num_mels,
-                     r=c.r,
-                     memory_size=c.memory_size)
-    print(" | > Num output units : {}".format(ap.num_freq), flush=True)
+    model = Tacotron(
+        num_chars=num_chars,
+        embedding_dim=c.embedding_size,
+        linear_dim=ap.num_freq,
+        mel_dim=ap.num_mels,
+        r=c.r,
+        memory_size=c.memory_size)
 
     optimizer = optim.Adam(model.parameters(), lr=c.lr, weight_decay=0)
     optimizer_st = optim.Adam(
@@ -385,24 +416,26 @@ def main(args):
         # 1. filter out unnecessary keys
         pretrained_dict = {
             k: v
-            for k, v in checkpoint['model'].items() if k in model_dict
+            for k, v in checkpoint['model'].items() if k in model_dict
         }
         # 2. filter out different size layers
         pretrained_dict = {
             k: v
-            for k, v in pretrained_dict.items() if v.numel() == model_dict[k].numel()
+            for k, v in pretrained_dict.items()
+            if v.numel() == model_dict[k].numel()
         }
         # 3. overwrite entries in the existing state dict
         model_dict.update(pretrained_dict)
         # 4. load the new state dict
         model.load_state_dict(model_dict)
-        print(" | > {} / {} layers are initialized".format(len(pretrained_dict), len(model_dict)))
+        print(" | > {} / {} layers are initialized".format(
+            len(pretrained_dict), len(model_dict)))
         if use_cuda:
             model = model.cuda()
             criterion.cuda()
             criterion_st.cuda()
         for group in optimizer.param_groups:
-            group['lr'] = c.lr
+            group['lr'] = c.lr
         print(
             " > Model restored from step %d" % checkpoint['step'], flush=True)
         start_epoch = checkpoint['epoch']
@@ -410,12 +443,15 @@ def main(args):
         args.restore_step = checkpoint['step']
     else:
         args.restore_step = 0
-        print("\n > Starting a new training", flush=True)
 
     if use_cuda:
         model = model.cuda()
         criterion.cuda()
         criterion_st.cuda()
 
+    # DISTRIBUTED
+    if num_gpus > 1:
+        model = apply_gradient_allreduce(model)
+
     if c.lr_decay:
         scheduler = NoamLR(
             optimizer,
@@ -425,22 +461,18 @@ def main(args):
         scheduler = None
 
     num_params = count_parameters(model)
-    print(" | > Model has {} parameters".format(num_params), flush=True)
-
-    if not os.path.exists(CHECKPOINT_PATH):
-        os.mkdir(CHECKPOINT_PATH)
+    print("\n > Model has {} parameters".format(num_params), flush=True)
 
     if 'best_loss' not in locals():
         best_loss = float('inf')
 
     for epoch in range(0, c.epochs):
         train_loss, current_step = train(model, criterion, criterion_st,
-                                         optimizer, optimizer_st,
-                                         scheduler, ap, epoch)
-        val_loss = evaluate(model, criterion, criterion_st, ap,
-                            current_step)
+                                         optimizer, optimizer_st, scheduler,
+                                         ap, epoch)
+        val_loss = evaluate(model, criterion, criterion_st, ap, current_step, epoch)
         print(
-            " | > Train Loss: {:.5f}   Validation Loss: {:.5f}".format(
+            " | > Training Loss: {:.5f}   Validation Loss: {:.5f}".format(
                 train_loss, val_loss),
             flush=True)
         target_loss = train_loss
@@ -468,31 +500,59 @@ if __name__ == '__main__':
         default=False,
         help='Do not verify commit integrity to run training.')
     parser.add_argument(
-        '--data_path', type=str, default='', help='Defines the data path. It overwrites config.json.')
+        '--data_path',
+        type=str,
+        default='',
+        help='Defines the data path. It overwrites config.json.')
+    parser.add_argument(
+        '--output_path',
+        type=str,
+        help='path for training outputs.',
+        default='')
+
+    # DISTRIBUTED
+    parser.add_argument(
+        '--rank',
+        type=int,
+        default=0,
+        help='DISTRIBUTED: process rank for distributed training.')
+    parser.add_argument(
+        '--group_id',
+        type=str,
+        default="",
+        help='DISTRIBUTED: process group id.')
     args = parser.parse_args()
 
     # setup output paths and read configs
     c = load_config(args.config_path)
     _ = os.path.dirname(os.path.realpath(__file__))
-    OUT_PATH = os.path.join(_, c.output_path)
-    OUT_PATH = create_experiment_folder(OUT_PATH, c.model_name, args.debug)
-    CHECKPOINT_PATH = os.path.join(OUT_PATH, 'checkpoints')
-    AUDIO_PATH = os.path.join(OUT_PATH, 'test_audios')
-    os.makedirs(AUDIO_PATH, exist_ok=True)
-    shutil.copyfile(args.config_path, os.path.join(OUT_PATH, 'config.json'))
     if args.data_path != '':
         c.data_path = args.data_path
 
-    # setup tensorboard
-    LOG_DIR = OUT_PATH
-    tb_logger = Logger(LOG_DIR)
+    if args.output_path == '':
+        OUT_PATH = os.path.join(_, c.output_path)
+    else:
+        OUT_PATH = args.output_path
+
+    if args.group_id == '':
+        OUT_PATH = create_experiment_folder(OUT_PATH, c.model_name, args.debug)
+
+    AUDIO_PATH = os.path.join(OUT_PATH, 'test_audios')
+
+    if args.rank == 0:
+        os.makedirs(AUDIO_PATH, exist_ok=True)
+        shutil.copyfile(args.config_path, os.path.join(OUT_PATH,
+                                                       'config.json'))
+        os.chmod(AUDIO_PATH, 0o775)
+        os.chmod(OUT_PATH, 0o775)
+
+    if args.rank == 0:
+        LOG_DIR = OUT_PATH
+        tb_logger = Logger(LOG_DIR)
 
     # Conditional imports
     preprocessor = importlib.import_module('datasets.preprocess')
     preprocessor = getattr(preprocessor, c.dataset.lower())
-    audio = importlib.import_module('utils.' + c.audio['audio_processor'])
-    AudioProcessor = getattr(audio, 'AudioProcessor')
 
     # Audio processor
     ap = AudioProcessor(**c.audio)
diff --git a/utils/audio.py b/utils/audio.py
index 60d68584..08dda0a8 100644
--- a/utils/audio.py
+++ b/utils/audio.py
@@ -50,10 +50,9 @@ class AudioProcessor(object):
         self.clip_norm = clip_norm
         self.do_trim_silence = do_trim_silence
         self.n_fft, self.hop_length, self.win_length = self._stft_parameters()
-        print(" | > Audio Processor attributes.")
         members = vars(self)
         for key, value in members.items():
-            print(" | > {}:{}".format(key, value))
+            print(" | > {}:{}".format(key, value))
 
     def save_wav(self, wav, path):
         wav_norm = wav * (32767 / max(0.01, np.max(np.abs(wav))))
@@ -118,8 +117,6 @@ class AudioProcessor(object):
         n_fft = (self.num_freq - 1) * 2
         hop_length = int(self.frame_shift_ms / 1000.0 * self.sample_rate)
         win_length = int(self.frame_length_ms / 1000.0 * self.sample_rate)
-        print(" | > fft size: {}, hop length: {}, win length: {}".format(
-            n_fft, hop_length, win_length))
         return n_fft, hop_length, win_length
 
     def _amp_to_db(self, x):
diff --git a/utils/audio_lws.py b/utils/audio_lws.py
deleted file mode 100644
index 9cf3fc7a..00000000
--- a/utils/audio_lws.py
+++ /dev/null
@@ -1,151 +0,0 @@
-import os
-import sys
-import librosa
-import pickle
-import copy
-import numpy as np
-from scipy import signal
-import lws
-
-_mel_basis = None
-
-
-class AudioProcessor(object):
-    def __init__(
-            self,
-            sample_rate,
-            num_mels,
-            min_level_db,
-            frame_shift_ms,
-            frame_length_ms,
-            ref_level_db,
-            num_freq,
-            power,
-            preemphasis,
-            min_mel_freq,
-            max_mel_freq,
-            griffin_lim_iters=None,
-    ):
-        print(" > Setting up Audio Processor...")
-        self.sample_rate = sample_rate
-        self.num_mels = num_mels
-        self.min_level_db = min_level_db
-        self.frame_shift_ms = frame_shift_ms
-        self.frame_length_ms = frame_length_ms
-        self.ref_level_db = ref_level_db
-        self.num_freq = num_freq
-        self.power = power
-        self.min_mel_freq = min_mel_freq
-        self.max_mel_freq = max_mel_freq
-        self.griffin_lim_iters = griffin_lim_iters
-        self.preemphasis = preemphasis
-        self.n_fft, self.hop_length, self.win_length = self._stft_parameters()
-        if preemphasis == 0:
-            print(" | > Preemphasis is deactive.")
-
-    def save_wav(self, wav, path):
-        wav *= 32767 / max(0.01, np.max(np.abs(wav)))
-        librosa.output.write_wav(
-            path, wav.astype(np.int16), self.sample_rate)
-
-    def _stft_parameters(self, ):
-        n_fft = int((self.num_freq - 1) * 2)
-        hop_length = int(self.frame_shift_ms / 1000.0 * self.sample_rate)
-        win_length = int(self.frame_length_ms / 1000.0 * self.sample_rate)
-        if n_fft % hop_length != 0:
-            hop_length = n_fft / 8
-            print(" | > hop_length is set to default ({}).".format(hop_length))
-        if n_fft % win_length != 0:
-            win_length = n_fft / 2
-            print(" | > win_length is set to default ({}).".format(win_length))
-        print(" | > fft size: {}, hop length: {}, win length: {}".format(
-            n_fft, hop_length, win_length))
-        return int(n_fft), int(hop_length), int(win_length)
-
-    def _lws_processor(self):
-        try:
-            return lws.lws(
-                self.win_length,
-                self.hop_length,
-                fftsize=self.n_fft,
-                mode="speech")
-        except:
-            raise RuntimeError(
-                " !! WindowLength({}) is not multiple of HopLength({}).".
-                format(self.win_length, self.hop_length))
-
-    def _amp_to_db(self, x):
-        min_level = np.exp(self.min_level_db / 20 * np.log(10))
-        return 20 * np.log10(np.maximum(min_level, x))
-
-    def _db_to_amp(self, x):
-        return np.power(10.0, x * 0.05)
-
-    def _normalize(self, S):
-        return np.clip((S - self.min_level_db) / -self.min_level_db, 0, 1)
-
-    def _denormalize(self, S):
-        return (np.clip(S, 0, 1) * -self.min_level_db) + self.min_level_db
-
-    def apply_preemphasis(self, x):
-        if self.preemphasis == 0:
-            raise RuntimeError(" !! Preemphasis is applied with factor 0.0. ")
-        return signal.lfilter([1, -self.preemphasis], [1], x)
-
-    def apply_inv_preemphasis(self, x):
-        if self.preemphasis == 0:
-            raise RuntimeError(" !! Preemphasis is applied with factor 0.0. ")
") - return signal.lfilter([1], [1, -self.preemphasis], x) - - def spectrogram(self, y): - f = open(os.devnull, 'w') - old_out = sys.stdout - sys.stdout = f - if self.preemphasis: - D = self._lws_processor().stft(self.apply_preemphasis(y)).T - else: - D = self._lws_processor().stft(y).T - S = self._amp_to_db(np.abs(D)) - self.ref_level_db - sys.stdout = old_out - return self._normalize(S) - - def inv_spectrogram(self, spectrogram): - '''Converts spectrogram to waveform using librosa''' - f = open(os.devnull, 'w') - old_out = sys.stdout - sys.stdout = f - S = self._denormalize(spectrogram) - S = self._db_to_amp(S + self.ref_level_db) # Convert back to linear - processor = self._lws_processor() - D = processor.run_lws(S.astype(np.float64).T**self.power) - y = processor.istft(D).astype(np.float32) - # Reconstruct phase - sys.stdout = old_out - if self.preemphasis: - return self.apply_inv_preemphasis(y) - return y - - def _linear_to_mel(self, spectrogram): - global _mel_basis - if _mel_basis is None: - _mel_basis = self._build_mel_basis() - return np.dot(_mel_basis, spectrogram) - - def _build_mel_basis(self, ): - return librosa.filters.mel( - self.sample_rate, self.n_fft, n_mels=self.num_mels) - - -# fmin=self.min_mel_freq, fmax=self.max_mel_freq) - - def melspectrogram(self, y): - f = open(os.devnull, 'w') - old_out = sys.stdout - sys.stdout = f - if self.preemphasis: - D = self._lws_processor().stft(self.apply_preemphasis(y)).T - else: - D = self._lws_processor().stft(y).T - S = self._amp_to_db(self._linear_to_mel(np.abs(D))) - self.ref_level_db - sys.stdout = old_out - return self._normalize(S) diff --git a/utils/generic_utils.py b/utils/generic_utils.py index 1c178fd6..0161e166 100644 --- a/utils/generic_utils.py +++ b/utils/generic_utils.py @@ -123,7 +123,7 @@ def save_best_model(model, optimizer, model_loss, best_loss, out_path, best_loss = model_loss bestmodel_path = 'best_model.pth.tar' bestmodel_path = os.path.join(out_path, bestmodel_path) - print(" | > Best model saving with loss {0:.5f} : {1:}".format( + print("\n > BEST MODEL ({0:.5f}) : {1:}".format( model_loss, bestmodel_path)) torch.save(state, bestmodel_path) return best_loss @@ -148,6 +148,17 @@ def lr_decay(init_lr, global_step, warmup_steps): return lr +def weight_decay(optimizer, wd): + """ + Custom weight decay operation, not effecting grad values. + """ + for group in optimizer.param_groups: + for param in group['params']: + current_lr = group['lr'] + param.data = param.data.add(-wd * group['lr'], param.data) + return optimizer, current_lr + + class NoamLR(torch.optim.lr_scheduler._LRScheduler): def __init__(self, optimizer, warmup_steps=0.1, last_epoch=-1): self.warmup_steps = float(warmup_steps)