From 609d8efa69f3f6e089b5c5af7d167d290734514c Mon Sep 17 00:00:00 2001 From: Eren Golge Date: Wed, 11 Sep 2019 10:32:07 +0200 Subject: [PATCH 01/35] compute alignment diagonality score and encapsulate stats averaging with a class in traning --- train.py | 204 ++++++++++++++++++++++++----------------- utils/generic_utils.py | 33 ++++++- utils/measures.py | 19 ++++ 3 files changed, 170 insertions(+), 86 deletions(-) create mode 100644 utils/measures.py diff --git a/train.py b/train.py index 30133b96..1100c1f3 100644 --- a/train.py +++ b/train.py @@ -20,7 +20,7 @@ from TTS.utils.generic_utils import (NoamLR, check_update, count_parameters, load_config, remove_experiment_folder, save_best_model, save_checkpoint, weight_decay, set_init_dict, copy_config_file, setup_model, - split_dataset, gradual_training_scheduler) + split_dataset, gradual_training_scheduler, KeepAverage) from TTS.utils.logger import Logger from TTS.utils.speakers import load_speaker_mapping, save_speaker_mapping, \ get_speakers @@ -29,6 +29,7 @@ from TTS.utils.text.symbols import phonemes, symbols from TTS.utils.visual import plot_alignment, plot_spectrogram from TTS.datasets.preprocess import get_preprocessor_by_name from TTS.utils.radam import RAdam +from TTS.utils.measures import alignment_diagonal_score torch.backends.cudnn.enabled = True @@ -45,12 +46,14 @@ def setup_loader(ap, is_val=False, verbose=False): global meta_data_eval if "meta_data_train" not in globals(): if c.meta_file_train is not None: - meta_data_train = get_preprocessor_by_name(c.dataset)(c.data_path, c.meta_file_train) + meta_data_train = get_preprocessor_by_name( + c.dataset)(c.data_path, c.meta_file_train) else: meta_data_train = get_preprocessor_by_name(c.dataset)(c.data_path) if "meta_data_eval" not in globals() and c.run_eval: if c.meta_file_val is not None: - meta_data_eval = get_preprocessor_by_name(c.dataset)(c.data_path, c.meta_file_val) + meta_data_eval = get_preprocessor_by_name( + c.dataset)(c.data_path, c.meta_file_val) else: meta_data_eval, meta_data_train = split_dataset(meta_data_train) if is_val and not c.run_eval: @@ -90,14 +93,20 @@ def train(model, criterion, criterion_st, optimizer, optimizer_st, scheduler, speaker_mapping = load_speaker_mapping(OUT_PATH) model.train() epoch_time = 0 - avg_postnet_loss = 0 - avg_decoder_loss = 0 - avg_stop_loss = 0 - avg_step_time = 0 - avg_loader_time = 0 + train_values = { + 'avg_postnet_loss': 0, + 'avg_decoder_loss': 0, + 'avg_stop_loss': 0, + 'avg_align_score': 0, + 'avg_step_time': 0, + 'avg_loader_time': 0, + 'avg_alignment_score': 0} + keep_avg = KeepAverage() + keep_avg.add_values(train_values) print("\n > Epoch {}/{}".format(epoch, c.epochs), flush=True) if use_cuda: - batch_n_iter = int(len(data_loader.dataset) / (c.batch_size * num_gpus)) + batch_n_iter = int(len(data_loader.dataset) / + (c.batch_size * num_gpus)) else: batch_n_iter = int(len(data_loader.dataset) / c.batch_size) end_time = time.time() @@ -108,7 +117,8 @@ def train(model, criterion, criterion_st, optimizer, optimizer_st, scheduler, text_input = data[0] text_lengths = data[1] speaker_names = data[2] - linear_input = data[3] if c.model in ["Tacotron", "TacotronGST"] else None + linear_input = data[3] if c.model in [ + "Tacotron", "TacotronGST"] else None mel_input = data[4] mel_lengths = data[5] stop_targets = data[6] @@ -126,7 +136,8 @@ def train(model, criterion, criterion_st, optimizer, optimizer_st, scheduler, # set stop targets view, we predict a single stop token per r frames prediction stop_targets = 
stop_targets.view(text_input.shape[0], stop_targets.size(1) // c.r, -1) - stop_targets = (stop_targets.sum(2) > 0.0).unsqueeze(2).float().squeeze(2) + stop_targets = (stop_targets.sum(2) > 0.0).unsqueeze( + 2).float().squeeze(2) global_step += 1 @@ -143,7 +154,8 @@ def train(model, criterion, criterion_st, optimizer, optimizer_st, scheduler, text_lengths = text_lengths.cuda(non_blocking=True) mel_input = mel_input.cuda(non_blocking=True) mel_lengths = mel_lengths.cuda(non_blocking=True) - linear_input = linear_input.cuda(non_blocking=True) if c.model in ["Tacotron", "TacotronGST"] else None + linear_input = linear_input.cuda(non_blocking=True) if c.model in [ + "Tacotron", "TacotronGST"] else None stop_targets = stop_targets.cuda(non_blocking=True) if speaker_ids is not None: speaker_ids = speaker_ids.cuda(non_blocking=True) @@ -153,13 +165,16 @@ def train(model, criterion, criterion_st, optimizer, optimizer_st, scheduler, text_input, text_lengths, mel_input, speaker_ids=speaker_ids) # loss computation - stop_loss = criterion_st(stop_tokens, stop_targets) if c.stopnet else torch.zeros(1) + stop_loss = criterion_st( + stop_tokens, stop_targets) if c.stopnet else torch.zeros(1) if c.loss_masking: decoder_loss = criterion(decoder_output, mel_input, mel_lengths) if c.model in ["Tacotron", "TacotronGST"]: - postnet_loss = criterion(postnet_output, linear_input, mel_lengths) + postnet_loss = criterion( + postnet_output, linear_input, mel_lengths) else: - postnet_loss = criterion(postnet_output, mel_input, mel_lengths) + postnet_loss = criterion( + postnet_output, mel_input, mel_lengths) else: decoder_loss = criterion(decoder_output, mel_input) if c.model in ["Tacotron", "TacotronGST"]: @@ -175,6 +190,10 @@ def train(model, criterion, criterion_st, optimizer, optimizer_st, scheduler, grad_norm, _ = check_update(model, c.grad_clip) optimizer.step() + # compute alignment score + align_score = alignment_diagonal_score(alignments) + keep_avg.update_value('avg_align_score', align_score) + # backpass and check the grad norm for stop loss if c.separate_stopnet: stop_loss.backward() @@ -183,18 +202,18 @@ def train(model, criterion, criterion_st, optimizer, optimizer_st, scheduler, optimizer_st.step() else: grad_norm_st = 0 - + step_time = time.time() - start_time epoch_time += step_time if global_step % c.print_step == 0: print( - " | > Step:{}/{} GlobalStep:{} TotalLoss:{:.5f} PostnetLoss:{:.5f} " - "DecoderLoss:{:.5f} StopLoss:{:.5f} GradNorm:{:.5f} " + " | > Step:{}/{} GlobalStep:{} PostnetLoss:{:.5f} " + "DecoderLoss:{:.5f} StopLoss:{:.5f} AlignScore:{:.4f} GradNorm:{:.5f} " "GradNormST:{:.5f} AvgTextLen:{:.1f} AvgSpecLen:{:.1f} StepTime:{:.2f} " "LoaderTime:{:.2f} LR:{:.6f}".format( - num_iter, batch_n_iter, global_step, loss.item(), - postnet_loss.item(), decoder_loss.item(), stop_loss.item(), + num_iter, batch_n_iter, global_step, + postnet_loss.item(), decoder_loss.item(), stop_loss.item(), align_score.item(), grad_norm, grad_norm_st, avg_text_length, avg_spec_length, step_time, loader_time, current_lr), flush=True) @@ -204,14 +223,16 @@ def train(model, criterion, criterion_st, optimizer, optimizer_st, scheduler, postnet_loss = reduce_tensor(postnet_loss.data, num_gpus) decoder_loss = reduce_tensor(decoder_loss.data, num_gpus) loss = reduce_tensor(loss.data, num_gpus) - stop_loss = reduce_tensor(stop_loss.data, num_gpus) if c.stopnet else stop_loss + stop_loss = reduce_tensor( + stop_loss.data, num_gpus) if c.stopnet else stop_loss if args.rank == 0: - avg_postnet_loss += 
float(postnet_loss.item()) - avg_decoder_loss += float(decoder_loss.item()) - avg_stop_loss += stop_loss if isinstance(stop_loss, float) else float(stop_loss.item()) - avg_step_time += step_time - avg_loader_time += loader_time + update_train_values = {'avg_postnet_loss': float(postnet_loss.item()), + 'avg_decoder_loss': float(decoder_loss.item()), + 'avg_stop_loss': stop_loss if isinstance(stop_loss, float) else float(stop_loss.item()), + 'avg_step_time': step_time, + 'avg_loader_time': loader_time} + keep_avg.update_values(update_train_values) # Plot Training Iter Stats # reduce TB load @@ -233,7 +254,8 @@ def train(model, criterion, criterion_st, optimizer, optimizer_st, scheduler, # Diagnostic visualizations const_spec = postnet_output[0].data.cpu().numpy() - gt_spec = linear_input[0].data.cpu().numpy() if c.model in ["Tacotron", "TacotronGST"] else mel_input[0].data.cpu().numpy() + gt_spec = linear_input[0].data.cpu().numpy() if c.model in [ + "Tacotron", "TacotronGST"] else mel_input[0].data.cpu().numpy() align_img = alignments[0].data.cpu().numpy() figures = { @@ -253,35 +275,28 @@ def train(model, criterion, criterion_st, optimizer, optimizer_st, scheduler, c.audio["sample_rate"]) end_time = time.time() - avg_postnet_loss /= (num_iter + 1) - avg_decoder_loss /= (num_iter + 1) - avg_stop_loss /= (num_iter + 1) - avg_total_loss = avg_decoder_loss + avg_postnet_loss + avg_stop_loss - avg_step_time /= (num_iter + 1) - avg_loader_time /= (num_iter + 1) - # print epoch stats print( " | > EPOCH END -- GlobalStep:{} AvgTotalLoss:{:.5f} " "AvgPostnetLoss:{:.5f} AvgDecoderLoss:{:.5f} " "AvgStopLoss:{:.5f} EpochTime:{:.2f} " - "AvgStepTime:{:.2f} AvgLoaderTime:{:.2f}".format(global_step, avg_total_loss, - avg_postnet_loss, avg_decoder_loss, - avg_stop_loss, epoch_time, avg_step_time, - avg_loader_time), + "AvgStepTime:{:.2f} AvgLoaderTime:{:.2f}".format(global_step, keep_avg['avg_postnet_loss'], keep_avg['avg_decoder_loss'], + keep_avg['avg_stop_loss'], keep_avg['avg_align_score'], + epoch_time, keep_avg['avg_step_time'], keep_avg['avg_loader_time']), flush=True) # Plot Epoch Stats if args.rank == 0: # Plot Training Epoch Stats - epoch_stats = {"loss_postnet": avg_postnet_loss, - "loss_decoder": avg_decoder_loss, - "stop_loss": avg_stop_loss, + epoch_stats = {"loss_postnet": keep_avg['avg_postnet_loss'], + "loss_decoder": keep_avg['avg_decoder_loss'], + "stop_loss": keep_avg['avg_stop_loss'], + "alignment_score": keep_avg['avg_align_score'], "epoch_time": epoch_time} tb_logger.tb_train_epoch_stats(global_step, epoch_stats) if c.tb_model_param_stats: tb_logger.tb_model_weights(model, global_step) - return avg_postnet_loss, global_step + return keep_avg['avg_postnet_loss'], global_step def evaluate(model, criterion, criterion_st, ap, global_step, epoch): @@ -290,9 +305,12 @@ def evaluate(model, criterion, criterion_st, ap, global_step, epoch): speaker_mapping = load_speaker_mapping(OUT_PATH) model.eval() epoch_time = 0 - avg_postnet_loss = 0 - avg_decoder_loss = 0 - avg_stop_loss = 0 + eval_values_dict = {'avg_postnet_loss' : 0, + 'avg_decoder_loss' : 0, + 'avg_stop_loss' : 0, + 'avg_align_score': 0} + keep_avg = KeepAverage() + keep_avg.add_values(eval_values_dict) print("\n > Validation") if c.test_sentences_file is None: test_sentences = [ @@ -313,7 +331,8 @@ def evaluate(model, criterion, criterion_st, ap, global_step, epoch): text_input = data[0] text_lengths = data[1] speaker_names = data[2] - linear_input = data[3] if c.model in ["Tacotron", "TacotronGST"] else None + linear_input = 
data[3] if c.model in [ + "Tacotron", "TacotronGST"] else None mel_input = data[4] mel_lengths = data[5] stop_targets = data[6] @@ -329,14 +348,16 @@ def evaluate(model, criterion, criterion_st, ap, global_step, epoch): stop_targets = stop_targets.view(text_input.shape[0], stop_targets.size(1) // c.r, -1) - stop_targets = (stop_targets.sum(2) > 0.0).unsqueeze(2).float().squeeze(2) + stop_targets = (stop_targets.sum(2) > 0.0).unsqueeze( + 2).float().squeeze(2) # dispatch data to GPU if use_cuda: text_input = text_input.cuda() mel_input = mel_input.cuda() mel_lengths = mel_lengths.cuda() - linear_input = linear_input.cuda() if c.model in ["Tacotron", "TacotronGST"] else None + linear_input = linear_input.cuda() if c.model in [ + "Tacotron", "TacotronGST"] else None stop_targets = stop_targets.cuda() if speaker_ids is not None: speaker_ids = speaker_ids.cuda() @@ -347,13 +368,17 @@ def evaluate(model, criterion, criterion_st, ap, global_step, epoch): speaker_ids=speaker_ids) # loss computation - stop_loss = criterion_st(stop_tokens, stop_targets) if c.stopnet else torch.zeros(1) + stop_loss = criterion_st( + stop_tokens, stop_targets) if c.stopnet else torch.zeros(1) if c.loss_masking: - decoder_loss = criterion(decoder_output, mel_input, mel_lengths) + decoder_loss = criterion( + decoder_output, mel_input, mel_lengths) if c.model in ["Tacotron", "TacotronGST"]: - postnet_loss = criterion(postnet_output, linear_input, mel_lengths) + postnet_loss = criterion( + postnet_output, linear_input, mel_lengths) else: - postnet_loss = criterion(postnet_output, mel_input, mel_lengths) + postnet_loss = criterion( + postnet_output, mel_input, mel_lengths) else: decoder_loss = criterion(decoder_output, mel_input) if c.model in ["Tacotron", "TacotronGST"]: @@ -365,14 +390,9 @@ def evaluate(model, criterion, criterion_st, ap, global_step, epoch): step_time = time.time() - start_time epoch_time += step_time - if num_iter % c.print_step == 0: - print( - " | > TotalLoss: {:.5f} PostnetLoss: {:.5f} DecoderLoss:{:.5f} " - "StopLoss: {:.5f} ".format(loss.item(), - postnet_loss.item(), - decoder_loss.item(), - stop_loss.item()), - flush=True) + # compute alignment score + align_score = alignment_diagonal_score(alignments) + keep_avg.update_value('avg_align_score', align_score) # aggregate losses from processes if num_gpus > 1: @@ -381,15 +401,26 @@ def evaluate(model, criterion, criterion_st, ap, global_step, epoch): if c.stopnet: stop_loss = reduce_tensor(stop_loss.data, num_gpus) - avg_postnet_loss += float(postnet_loss.item()) - avg_decoder_loss += float(decoder_loss.item()) - avg_stop_loss += stop_loss.item() + keep_avg.update_values({'avg_postnet_loss' : float(postnet_loss.item()), + 'avg_decoder_loss' : float(decoder_loss.item()), + 'avg_stop_loss' : float(stop_loss.item())}) + + if num_iter % c.print_step == 0: + print( + " | > TotalLoss: {:.5f} PostnetLoss: {:.5f} - {:.5f} DecoderLoss:{:.5f} - {:.5f} " + "StopLoss: {:.5f} - {:.5f} AlignScore: {:.4f} : {:.4f}".format(loss.item(), + postnet_loss.item(), keep_avg['avg_postnet_loss'], + decoder_loss.item(), keep_avg['avg_decoder_loss'], + stop_loss.item(), keep_avg['avg_stop_loss'], + align_score.item(), keep_avg['avg_align_score']), + flush=True) if args.rank == 0: # Diagnostic visualizations idx = np.random.randint(mel_input.shape[0]) const_spec = postnet_output[idx].data.cpu().numpy() - gt_spec = linear_input[idx].data.cpu().numpy() if c.model in ["Tacotron", "TacotronGST"] else mel_input[idx].data.cpu().numpy() + gt_spec = 
linear_input[idx].data.cpu().numpy() if c.model in [ + "Tacotron", "TacotronGST"] else mel_input[idx].data.cpu().numpy() align_img = alignments[idx].data.cpu().numpy() eval_figures = { @@ -404,17 +435,13 @@ def evaluate(model, criterion, criterion_st, ap, global_step, epoch): eval_audio = ap.inv_spectrogram(const_spec.T) else: eval_audio = ap.inv_mel_spectrogram(const_spec.T) - tb_logger.tb_eval_audios(global_step, {"ValAudio": eval_audio}, c.audio["sample_rate"]) - - # compute average losses - avg_postnet_loss /= (num_iter + 1) - avg_decoder_loss /= (num_iter + 1) - avg_stop_loss /= (num_iter + 1) + tb_logger.tb_eval_audios( + global_step, {"ValAudio": eval_audio}, c.audio["sample_rate"]) # Plot Validation Stats - epoch_stats = {"loss_postnet": avg_postnet_loss, - "loss_decoder": avg_decoder_loss, - "stop_loss": avg_stop_loss} + epoch_stats = {"loss_postnet": keep_avg['avg_postnet_loss'], + "loss_decoder": keep_avg['avg_decoder_loss'], + "stop_loss": keep_avg['avg_stop_loss']} tb_logger.tb_eval_stats(global_step, epoch_stats) if args.rank == 0 and epoch > c.test_delay_epochs: @@ -436,18 +463,21 @@ def evaluate(model, criterion, criterion_st, ap, global_step, epoch): "TestSentence_{}.wav".format(idx)) ap.save_wav(wav, file_path) test_audios['{}-audio'.format(idx)] = wav - test_figures['{}-prediction'.format(idx)] = plot_spectrogram(postnet_output, ap) - test_figures['{}-alignment'.format(idx)] = plot_alignment(alignment) + test_figures['{}-prediction'.format(idx) + ] = plot_spectrogram(postnet_output, ap) + test_figures['{}-alignment'.format(idx) + ] = plot_alignment(alignment) except: print(" !! Error creating Test Sentence -", idx) traceback.print_exc() - tb_logger.tb_test_audios(global_step, test_audios, c.audio['sample_rate']) + tb_logger.tb_test_audios( + global_step, test_audios, c.audio['sample_rate']) tb_logger.tb_test_figures(global_step, test_figures) - return avg_postnet_loss + return keep_avg['avg_postnet_loss'] -#FIXME: move args definition/parsing inside of main? -def main(args): #pylint: disable=redefined-outer-name +# FIXME: move args definition/parsing inside of main? 
+def main(args): # pylint: disable=redefined-outer-name # Audio processor ap = AudioProcessor(**c.audio) @@ -488,9 +518,11 @@ def main(args): #pylint: disable=redefined-outer-name optimizer_st = None if c.loss_masking: - criterion = L1LossMasked() if c.model in ["Tacotron", "TacotronGST"] else MSELossMasked() + criterion = L1LossMasked() if c.model in [ + "Tacotron", "TacotronGST"] else MSELossMasked() else: - criterion = nn.L1Loss() if c.model in ["Tacotron", "TacotronGST"] else nn.MSELoss() + criterion = nn.L1Loss() if c.model in [ + "Tacotron", "TacotronGST"] else nn.MSELoss() criterion_st = nn.BCEWithLogitsLoss() if c.stopnet else None if args.restore_path: @@ -552,7 +584,8 @@ def main(args): #pylint: disable=redefined-outer-name train_loss, global_step = train(model, criterion, criterion_st, optimizer, optimizer_st, scheduler, ap, global_step, epoch) - val_loss = evaluate(model, criterion, criterion_st, ap, global_step, epoch) + val_loss = evaluate(model, criterion, criterion_st, + ap, global_step, epoch) print( " | > Training Loss: {:.5f} Validation Loss: {:.5f}".format( train_loss, val_loss), @@ -635,7 +668,8 @@ if __name__ == '__main__': if args.restore_path: new_fields["restore_path"] = args.restore_path new_fields["github_branch"] = get_git_branch() - copy_config_file(args.config_path, os.path.join(OUT_PATH, 'config.json'), new_fields) + copy_config_file(args.config_path, os.path.join( + OUT_PATH, 'config.json'), new_fields) os.chmod(AUDIO_PATH, 0o775) os.chmod(OUT_PATH, 0o775) @@ -650,8 +684,8 @@ if __name__ == '__main__': try: sys.exit(0) except SystemExit: - os._exit(0) #pylint: disable=protected-access - except Exception: #pylint: disable=broad-except + os._exit(0) # pylint: disable=protected-access + except Exception: # pylint: disable=broad-except remove_experiment_folder(OUT_PATH) traceback.print_exc() sys.exit(1) diff --git a/utils/generic_utils.py b/utils/generic_utils.py index 1c16834a..d72ffdd5 100644 --- a/utils/generic_utils.py +++ b/utils/generic_utils.py @@ -313,4 +313,35 @@ def gradual_training_scheduler(global_step, config): for values in config.gradual_training: if global_step >= values[0]: new_values = values - return new_values[1], new_values[2] \ No newline at end of file + return new_values[1], new_values[2] + + +class KeepAverage(): + def __init__(self): + self.avg_values = {} + self.iters = {} + + def __getitem__(self, key): + return self.avg_values[key] + + def add_value(self, name, init_val=0, init_iter=0): + self.avg_values[name] = init_val + self.iters[name] = init_iter + + def update_value(self, name, value, weighted_avg=False): + if weighted_avg: + self.avg_values[name] = 0.99 * self.avg_values[name] + 0.01 * value + self.iters[name] += 1 + else: + self.avg_values[name] = self.avg_values[name] * self.iters[name] + value + self.iters[name] += 1 + self.avg_values[name] /= self.iters[name] + + def add_values(self, name_dict): + for key, value in name_dict.items(): + self.add_value(key, init_val=value) + + def update_values(self, value_dict): + for key, value in value_dict.items(): + self.update_value(key, value) + diff --git a/utils/measures.py b/utils/measures.py new file mode 100644 index 00000000..21652cf0 --- /dev/null +++ b/utils/measures.py @@ -0,0 +1,19 @@ +import torch +import numpy as np + + +def alignment_diagonal_score(alignments): + """ + Compute how diagonal alignment predictions are. It is useful + to measure the alignment consistency of a model + Args: + alignments (torch.Tensor): batch of alignments. 
+ Shape: + alignments : batch x decoder_steps x encoder_steps + """ + return alignments.max(dim=1)[0].mean(dim=1).mean(dim=0) + + + + + From d45d963dc11c37cafb4b86c73d81a18114342e2c Mon Sep 17 00:00:00 2001 From: Eren Golge Date: Wed, 11 Sep 2019 10:39:59 +0200 Subject: [PATCH 02/35] linter fix --- train.py | 29 +++++++++++++++-------------- utils/generic_utils.py | 15 ++++++++------- utils/measures.py | 8 -------- 3 files changed, 23 insertions(+), 29 deletions(-) diff --git a/train.py b/train.py index 1100c1f3..13444c82 100644 --- a/train.py +++ b/train.py @@ -190,7 +190,7 @@ def train(model, criterion, criterion_st, optimizer, optimizer_st, scheduler, grad_norm, _ = check_update(model, c.grad_clip) optimizer.step() - # compute alignment score + # compute alignment score align_score = alignment_diagonal_score(alignments) keep_avg.update_value('avg_align_score', align_score) @@ -281,7 +281,7 @@ def train(model, criterion, criterion_st, optimizer, optimizer_st, scheduler, "AvgPostnetLoss:{:.5f} AvgDecoderLoss:{:.5f} " "AvgStopLoss:{:.5f} EpochTime:{:.2f} " "AvgStepTime:{:.2f} AvgLoaderTime:{:.2f}".format(global_step, keep_avg['avg_postnet_loss'], keep_avg['avg_decoder_loss'], - keep_avg['avg_stop_loss'], keep_avg['avg_align_score'], + keep_avg['avg_stop_loss'], keep_avg['avg_align_score'], epoch_time, keep_avg['avg_step_time'], keep_avg['avg_loader_time']), flush=True) @@ -305,11 +305,11 @@ def evaluate(model, criterion, criterion_st, ap, global_step, epoch): speaker_mapping = load_speaker_mapping(OUT_PATH) model.eval() epoch_time = 0 - eval_values_dict = {'avg_postnet_loss' : 0, - 'avg_decoder_loss' : 0, - 'avg_stop_loss' : 0, + eval_values_dict = {'avg_postnet_loss': 0, + 'avg_decoder_loss': 0, + 'avg_stop_loss': 0, 'avg_align_score': 0} - keep_avg = KeepAverage() + keep_avg = KeepAverage() keep_avg.add_values(eval_values_dict) print("\n > Validation") if c.test_sentences_file is None: @@ -401,18 +401,19 @@ def evaluate(model, criterion, criterion_st, ap, global_step, epoch): if c.stopnet: stop_loss = reduce_tensor(stop_loss.data, num_gpus) - keep_avg.update_values({'avg_postnet_loss' : float(postnet_loss.item()), - 'avg_decoder_loss' : float(decoder_loss.item()), - 'avg_stop_loss' : float(stop_loss.item())}) + keep_avg.update_values({'avg_postnet_loss': float(postnet_loss.item()), + 'avg_decoder_loss': float(decoder_loss.item()), + 'avg_stop_loss': float(stop_loss.item())}) if num_iter % c.print_step == 0: print( " | > TotalLoss: {:.5f} PostnetLoss: {:.5f} - {:.5f} DecoderLoss:{:.5f} - {:.5f} " - "StopLoss: {:.5f} - {:.5f} AlignScore: {:.4f} : {:.4f}".format(loss.item(), - postnet_loss.item(), keep_avg['avg_postnet_loss'], - decoder_loss.item(), keep_avg['avg_decoder_loss'], - stop_loss.item(), keep_avg['avg_stop_loss'], - align_score.item(), keep_avg['avg_align_score']), + "StopLoss: {:.5f} - {:.5f} AlignScore: {:.4f} : {:.4f}".format( + loss.item(), + postnet_loss.item(), keep_avg['avg_postnet_loss'], + decoder_loss.item(), keep_avg['avg_decoder_loss'], + stop_loss.item(), keep_avg['avg_stop_loss'], + align_score.item(), keep_avg['avg_align_score']), flush=True) if args.rank == 0: diff --git a/utils/generic_utils.py b/utils/generic_utils.py index d72ffdd5..1053d221 100644 --- a/utils/generic_utils.py +++ b/utils/generic_utils.py @@ -31,7 +31,8 @@ def load_config(config_path): def get_git_branch(): try: out = subprocess.check_output(["git", "branch"]).decode("utf8") - current = next(line for line in out.split("\n") if line.startswith("*")) + current = next(line for line in out.split( + 
"\n") if line.startswith("*")) current.replace("* ", "") except subprocess.CalledProcessError: current = "inside_docker" @@ -298,7 +299,7 @@ def split_dataset(items): # most stupid code ever -- Fix it ! while len(items_eval) < eval_split_size: speakers = [item[-1] for item in items] - speaker_counter = Counter(speakers) + speaker_counter = Counter(speakers) item_idx = np.random.randint(0, len(items)) if speaker_counter[items[item_idx][-1]] > 1: items_eval.append(items[item_idx]) @@ -323,20 +324,21 @@ class KeepAverage(): def __getitem__(self, key): return self.avg_values[key] - + def add_value(self, name, init_val=0, init_iter=0): self.avg_values[name] = init_val self.iters[name] = init_iter - + def update_value(self, name, value, weighted_avg=False): if weighted_avg: self.avg_values[name] = 0.99 * self.avg_values[name] + 0.01 * value self.iters[name] += 1 else: - self.avg_values[name] = self.avg_values[name] * self.iters[name] + value + self.avg_values[name] = self.avg_values[name] * \ + self.iters[name] + value self.iters[name] += 1 self.avg_values[name] /= self.iters[name] - + def add_values(self, name_dict): for key, value in name_dict.items(): self.add_value(key, init_val=value) @@ -344,4 +346,3 @@ class KeepAverage(): def update_values(self, value_dict): for key, value in value_dict.items(): self.update_value(key, value) - diff --git a/utils/measures.py b/utils/measures.py index 21652cf0..21b61298 100644 --- a/utils/measures.py +++ b/utils/measures.py @@ -1,6 +1,3 @@ -import torch -import numpy as np - def alignment_diagonal_score(alignments): """ @@ -12,8 +9,3 @@ def alignment_diagonal_score(alignments): alignments : batch x decoder_steps x encoder_steps """ return alignments.max(dim=1)[0].mean(dim=1).mean(dim=0) - - - - - From a1322530dfd63ec2b9433699b36a647774f9aaa5 Mon Sep 17 00:00:00 2001 From: Eren Golge Date: Thu, 12 Sep 2019 10:39:15 +0200 Subject: [PATCH 03/35] integrade concatinative speker embedding to tacotron --- layers/tacotron.py | 17 +++++++---- models/tacotron.py | 70 +++++++++++++++++++++++++++++++------------- tests/test_layers.py | 31 +++++++++++++++++++- 3 files changed, 91 insertions(+), 27 deletions(-) diff --git a/layers/tacotron.py b/layers/tacotron.py index 788e5230..411e7e72 100644 --- a/layers/tacotron.py +++ b/layers/tacotron.py @@ -273,7 +273,7 @@ class Decoder(nn.Module): def __init__(self, in_features, memory_dim, r, memory_size, attn_windowing, attn_norm, prenet_type, prenet_dropout, forward_attn, trans_agent, forward_attn_mask, location_attn, - separate_stopnet): + separate_stopnet, speaker_embedding_dim): super(Decoder, self).__init__() self.r_init = r self.r = r @@ -285,8 +285,9 @@ class Decoder(nn.Module): self.separate_stopnet = separate_stopnet self.query_dim = 256 # memory -> |Prenet| -> processed_memory + prenet_dim = memory_dim * self.memory_size + speaker_embedding_dim if self.use_memory_queue else memory_dim + speaker_embedding_dim self.prenet = Prenet( - memory_dim * self.memory_size if self.use_memory_queue else memory_dim, + prenet_dim, prenet_type, prenet_dropout, out_features=[256, 128]) @@ -407,7 +408,7 @@ class Decoder(nn.Module): # use only the last frame prediction self.memory_input = new_memory[:, :self.memory_dim] - def forward(self, inputs, memory, mask): + def forward(self, inputs, memory, mask, speaker_embeddings=None): """ Args: inputs: Encoder outputs. 
@@ -432,6 +433,8 @@ class Decoder(nn.Module): if t > 0: new_memory = memory[t - 1] self._update_memory_input(new_memory) + if speaker_embeddings is not None: + self.memory_input = torch.cat([self.memory_input, speaker_embeddings], dim=-1) output, stop_token, attention = self.decode(inputs, mask) outputs += [output] attentions += [attention] @@ -440,13 +443,15 @@ class Decoder(nn.Module): return self._parse_outputs(outputs, attentions, stop_tokens) - def inference(self, inputs): + def inference(self, inputs, speaker_embeddings=None): """ Args: - inputs: Encoder outputs. + inputs: encoder outputs. + speaker_embeddings: speaker vectors. Shapes: - inputs: batch x time x encoder_out_dim + - speaker_embeddings: batch x embed_dim """ outputs = [] attentions = [] @@ -459,6 +464,8 @@ class Decoder(nn.Module): if t > 0: new_memory = outputs[-1] self._update_memory_input(new_memory) + if speaker_embeddings is not None: + self.memory_input = torch.cat([self.memory_input, speaker_embeddings], dim=-1) output, stop_token, attention = self.decode(inputs, None) stop_token = torch.sigmoid(stop_token.data) outputs += [output] diff --git a/models/tacotron.py b/models/tacotron.py index 69a6fa03..bd2a3ac7 100644 --- a/models/tacotron.py +++ b/models/tacotron.py @@ -1,4 +1,5 @@ # coding: utf-8 +import torch from torch import nn from TTS.layers.tacotron import Encoder, Decoder, PostCBHG from TTS.utils.generic_utils import sequence_mask @@ -25,28 +26,50 @@ class Tacotron(nn.Module): self.r = r self.mel_dim = mel_dim self.linear_dim = linear_dim + self.num_speakers = num_speakers self.embedding = nn.Embedding(num_chars, 256) self.embedding.weight.data.normal_(0, 0.3) + decoder_dim = 512 if num_speakers > 1 else 256 + encoder_dim = 512 if num_speakers > 1 else 256 + proj_speaker_dim = 80 if num_speakers > 1 else 0 if num_speakers > 1: self.speaker_embedding = nn.Embedding(num_speakers, 256) self.speaker_embedding.weight.data.normal_(0, 0.3) - self.encoder = Encoder(256) - self.decoder = Decoder(256, mel_dim, r, memory_size, attn_win, + self.speaker_project_mel = nn.Sequential(nn.Linear(256, proj_speaker_dim), nn.Tanh()) + self.encoder = Encoder(encoder_dim) + self.decoder = Decoder(decoder_dim, mel_dim, r, memory_size, attn_win, attn_norm, prenet_type, prenet_dropout, forward_attn, trans_agent, forward_attn_mask, - location_attn, separate_stopnet) + location_attn, separate_stopnet, proj_speaker_dim) self.postnet = PostCBHG(mel_dim) self.last_linear = nn.Linear(self.postnet.cbhg.gru_features * 2, linear_dim) - + + def __init_states(self): + self.speaker_embeddings = None + self.speaker_embeddings_projected = None + + def compute_speaker_embedding(self, speaker_ids): + if hasattr(self, "speaker_embedding") and speaker_ids is None: + raise RuntimeError(" [!] 
Model has speaker embedding layer but speaker_id is not provided") + if hasattr(self, "speaker_embedding") and speaker_ids is not None: + self.speaker_embeddings = self._compute_speaker_embedding(speaker_ids) + self.speaker_embeddings_projected = self.speaker_project_mel(self.speaker_embeddings).squeeze(1) + def forward(self, characters, text_lengths, mel_specs, speaker_ids=None): B = characters.size(0) mask = sequence_mask(text_lengths).to(characters.device) inputs = self.embedding(characters) + self.__init_states() + self.compute_speaker_embedding(speaker_ids) + if self.num_speakers > 1: + inputs = self._concat_speaker_embedding(inputs, + self.speaker_embeddings) encoder_outputs = self.encoder(inputs) - encoder_outputs = self._add_speaker_embedding(encoder_outputs, - speaker_ids) + if self.num_speakers > 1: + encoder_outputs = self._concat_speaker_embedding(encoder_outputs, + self.speaker_embeddings) mel_outputs, alignments, stop_tokens = self.decoder( - encoder_outputs, mel_specs, mask) + encoder_outputs, mel_specs, mask, self.speaker_embeddings_projected) mel_outputs = mel_outputs.view(B, -1, self.mel_dim) linear_outputs = self.postnet(mel_outputs) linear_outputs = self.last_linear(linear_outputs) @@ -55,25 +78,30 @@ class Tacotron(nn.Module): def inference(self, characters, speaker_ids=None): B = characters.size(0) inputs = self.embedding(characters) + self.__init_states() + self.compute_speaker_embedding(speaker_ids) + if self.num_speakers > 1: + inputs = self._concat_speaker_embedding(inputs, + self.speaker_embeddings) encoder_outputs = self.encoder(inputs) - encoder_outputs = self._add_speaker_embedding(encoder_outputs, - speaker_ids) + if self.num_speakers > 1: + encoder_outputs = self._concat_speaker_embedding(encoder_outputs, + self.speaker_embeddings) mel_outputs, alignments, stop_tokens = self.decoder.inference( - encoder_outputs) + encoder_outputs, self.speaker_embeddings_projected) mel_outputs = mel_outputs.view(B, -1, self.mel_dim) linear_outputs = self.postnet(mel_outputs) linear_outputs = self.last_linear(linear_outputs) return mel_outputs, linear_outputs, alignments, stop_tokens - def _add_speaker_embedding(self, encoder_outputs, speaker_ids): - if hasattr(self, "speaker_embedding") and speaker_ids is None: - raise RuntimeError(" [!] 
Model has speaker embedding layer but speaker_id is not provided") - if hasattr(self, "speaker_embedding") and speaker_ids is not None: - speaker_embeddings = self.speaker_embedding(speaker_ids) + def _compute_speaker_embedding(self, speaker_ids): + speaker_embeddings = self.speaker_embedding(speaker_ids) + return speaker_embeddings.unsqueeze_(1) + + def _concat_speaker_embedding(self, outputs, speaker_embeddings): + speaker_embeddings_ = speaker_embeddings.expand(outputs.size(0), + outputs.size(1), + -1) + outputs = torch.cat([outputs, speaker_embeddings_], dim=-1) + return outputs - speaker_embeddings.unsqueeze_(1) - speaker_embeddings = speaker_embeddings.expand(encoder_outputs.size(0), - encoder_outputs.size(1), - -1) - encoder_outputs = encoder_outputs + speaker_embeddings - return encoder_outputs diff --git a/tests/test_layers.py b/tests/test_layers.py index cf27e30c..a465a898 100644 --- a/tests/test_layers.py +++ b/tests/test_layers.py @@ -54,7 +54,8 @@ class DecoderTests(unittest.TestCase): trans_agent=True, forward_attn_mask=True, location_attn=True, - separate_stopnet=True) + separate_stopnet=True, + speaker_embedding_dim=0) dummy_input = T.rand(4, 8, 256) dummy_memory = T.rand(4, 2, 80) @@ -66,6 +67,34 @@ class DecoderTests(unittest.TestCase): assert output.shape[2] == 80 * 2, "size not {}".format(output.shape[2]) assert stop_tokens.shape[0] == 4 + def test_in_out_multispeaker(self): + layer = Decoder( + in_features=256, + memory_dim=80, + r=2, + memory_size=4, + attn_windowing=False, + attn_norm="sigmoid", + prenet_type='original', + prenet_dropout=True, + forward_attn=True, + trans_agent=True, + forward_attn_mask=True, + location_attn=True, + separate_stopnet=True, + speaker_embedding_dim=80) + dummy_input = T.rand(4, 8, 256) + dummy_memory = T.rand(4, 2, 80) + dummy_embed = T.rand(4, 80) + + output, alignment, stop_tokens = layer( + dummy_input, dummy_memory, mask=None, speaker_embeddings=dummy_embed) + + assert output.shape[0] == 4 + assert output.shape[1] == 1, "size not {}".format(output.shape[1]) + assert output.shape[2] == 80 * 2, "size not {}".format(output.shape[2]) + assert stop_tokens.shape[0] == 4 + class EncoderTests(unittest.TestCase): def test_in_out(self): From 14a4d1a061872622d992ec39369bd24a78e292cc Mon Sep 17 00:00:00 2001 From: Eren Golge Date: Thu, 12 Sep 2019 23:06:59 +0200 Subject: [PATCH 04/35] update TacotronGST and its test. 
Inherit it from Tacotron class --- models/tacotron.py | 20 ++++++--- models/tacotrongst.py | 84 ++++++++++++++++++++---------------- tests/test_tacotron_model.py | 78 +++++++++++++++++++++++++++++---- 3 files changed, 129 insertions(+), 53 deletions(-) diff --git a/models/tacotron.py b/models/tacotron.py index bd2a3ac7..8f40f313 100644 --- a/models/tacotron.py +++ b/models/tacotron.py @@ -44,7 +44,7 @@ class Tacotron(nn.Module): self.postnet = PostCBHG(mel_dim) self.last_linear = nn.Linear(self.postnet.cbhg.gru_features * 2, linear_dim) - def __init_states(self): + def _init_states(self): self.speaker_embeddings = None self.speaker_embeddings_projected = None @@ -59,7 +59,7 @@ class Tacotron(nn.Module): B = characters.size(0) mask = sequence_mask(text_lengths).to(characters.device) inputs = self.embedding(characters) - self.__init_states() + self._init_states() self.compute_speaker_embedding(speaker_ids) if self.num_speakers > 1: inputs = self._concat_speaker_embedding(inputs, @@ -78,7 +78,7 @@ class Tacotron(nn.Module): def inference(self, characters, speaker_ids=None): B = characters.size(0) inputs = self.embedding(characters) - self.__init_states() + self._init_states() self.compute_speaker_embedding(speaker_ids) if self.num_speakers > 1: inputs = self._concat_speaker_embedding(inputs, @@ -98,10 +98,16 @@ class Tacotron(nn.Module): speaker_embeddings = self.speaker_embedding(speaker_ids) return speaker_embeddings.unsqueeze_(1) - def _concat_speaker_embedding(self, outputs, speaker_embeddings): + def _add_speaker_embedding(self, outputs, speaker_embeddings): speaker_embeddings_ = speaker_embeddings.expand(outputs.size(0), - outputs.size(1), - -1) - outputs = torch.cat([outputs, speaker_embeddings_], dim=-1) + outputs.size(1), + -1) + outputs = outputs + speaker_embeddings_ return outputs + def _concat_speaker_embedding(self, outputs, speaker_embeddings): + speaker_embeddings_ = speaker_embeddings.expand(outputs.size(0), + outputs.size(1), + -1) + outputs = torch.cat([outputs, speaker_embeddings_], dim=-1) + return outputs diff --git a/models/tacotrongst.py b/models/tacotrongst.py index 5ea389d9..7d2fc626 100644 --- a/models/tacotrongst.py +++ b/models/tacotrongst.py @@ -1,11 +1,13 @@ # coding: utf-8 +import torch from torch import nn from TTS.layers.tacotron import Encoder, Decoder, PostCBHG from TTS.layers.gst_layers import GST from TTS.utils.generic_utils import sequence_mask +from TTS.models.tacotron import Tacotron -class TacotronGST(nn.Module): +class TacotronGST(Tacotron): def __init__(self, num_chars, num_speakers, @@ -22,37 +24,49 @@ class TacotronGST(nn.Module): forward_attn_mask=False, location_attn=True, separate_stopnet=True): - super(TacotronGST, self).__init__() - self.r = r - self.mel_dim = mel_dim - self.linear_dim = linear_dim - self.embedding = nn.Embedding(num_chars, 256) - self.embedding.weight.data.normal_(0, 0.3) - if num_speakers > 1: - self.speaker_embedding = nn.Embedding(num_speakers, 256) - self.speaker_embedding.weight.data.normal_(0, 0.3) - self.encoder = Encoder(256) - self.gst = GST(num_mel=80, num_heads=4, num_style_tokens=10, embedding_dim=256) - self.decoder = Decoder(256, mel_dim, r, memory_size, attn_win, + super().__init__(num_chars, + num_speakers, + r, + linear_dim, + mel_dim, + memory_size, + attn_win, + attn_norm, + prenet_type, + prenet_dropout, + forward_attn, + trans_agent, + forward_attn_mask, + location_attn, + separate_stopnet) + gst_embedding_dim = 256 + decoder_dim = 512 + gst_embedding_dim if num_speakers > 1 else 256 + gst_embedding_dim 
+ proj_speaker_dim = 80 if num_speakers > 1 else 0 + self.decoder = Decoder(decoder_dim, mel_dim, r, memory_size, attn_win, attn_norm, prenet_type, prenet_dropout, forward_attn, trans_agent, forward_attn_mask, - location_attn, separate_stopnet) - self.postnet = PostCBHG(mel_dim) - self.last_linear = nn.Linear(self.postnet.cbhg.gru_features * 2, linear_dim) - + location_attn, separate_stopnet, proj_speaker_dim) + self.gst = GST(num_mel=80, num_heads=4, + num_style_tokens=10, embedding_dim=gst_embedding_dim) def forward(self, characters, text_lengths, mel_specs, speaker_ids=None): B = characters.size(0) mask = sequence_mask(text_lengths).to(characters.device) inputs = self.embedding(characters) + self._init_states() + self.compute_speaker_embedding(speaker_ids) + if self.num_speakers > 1: + inputs = self._concat_speaker_embedding(inputs, + self.speaker_embeddings) encoder_outputs = self.encoder(inputs) - encoder_outputs = self._add_speaker_embedding(encoder_outputs, - speaker_ids) + if self.num_speakers > 1: + encoder_outputs = self._concat_speaker_embedding(encoder_outputs, + self.speaker_embeddings) gst_outputs = self.gst(mel_specs) - gst_outputs = gst_outputs.expand(-1, encoder_outputs.size(1), -1) - encoder_outputs = encoder_outputs + gst_outputs + encoder_outputs = self._concat_speaker_embedding( + encoder_outputs, gst_outputs) mel_outputs, alignments, stop_tokens = self.decoder( - encoder_outputs, mel_specs, mask) + encoder_outputs, mel_specs, mask, self.speaker_embeddings_projected) mel_outputs = mel_outputs.view(B, -1, self.mel_dim) linear_outputs = self.postnet(mel_outputs) linear_outputs = self.last_linear(linear_outputs) @@ -61,27 +75,23 @@ class TacotronGST(nn.Module): def inference(self, characters, speaker_ids=None, style_mel=None): B = characters.size(0) inputs = self.embedding(characters) + self._init_states() + self.compute_speaker_embedding(speaker_ids) + if self.num_speakers > 1: + inputs = self._concat_speaker_embedding(inputs, + self.speaker_embeddings) encoder_outputs = self.encoder(inputs) - encoder_outputs = self._add_speaker_embedding(encoder_outputs, - speaker_ids) + if self.num_speakers > 1: + encoder_outputs = self._concat_speaker_embedding(encoder_outputs, + self.speaker_embeddings) if style_mel is not None: gst_outputs = self.gst(style_mel) gst_outputs = gst_outputs.expand(-1, encoder_outputs.size(1), -1) - encoder_outputs = encoder_outputs + gst_outputs + encoder_outputs = self._concat_speaker_embedding(encoder_outputs, + gst_outputs) mel_outputs, alignments, stop_tokens = self.decoder.inference( - encoder_outputs) + encoder_outputs, self.speaker_embeddings_projected) mel_outputs = mel_outputs.view(B, -1, self.mel_dim) linear_outputs = self.postnet(mel_outputs) linear_outputs = self.last_linear(linear_outputs) return mel_outputs, linear_outputs, alignments, stop_tokens - - def _add_speaker_embedding(self, encoder_outputs, speaker_ids): - if hasattr(self, "speaker_embedding") and speaker_ids is not None: - speaker_embeddings = self.speaker_embedding(speaker_ids) - - speaker_embeddings.unsqueeze_(1) - speaker_embeddings = speaker_embeddings.expand(encoder_outputs.size(0), - encoder_outputs.size(1), - -1) - encoder_outputs = encoder_outputs + speaker_embeddings - return encoder_outputs diff --git a/tests/test_tacotron_model.py b/tests/test_tacotron_model.py index acd7af41..9b8de336 100644 --- a/tests/test_tacotron_model.py +++ b/tests/test_tacotron_model.py @@ -8,6 +8,7 @@ from torch import nn from TTS.utils.generic_utils import load_config from TTS.layers.losses 
import L1LossMasked from TTS.models.tacotron import Tacotron +from TTS.models.tacotrongst import TacotronGST #pylint: disable=unused-variable @@ -24,15 +25,74 @@ def count_parameters(model): return sum(p.numel() for p in model.parameters() if p.requires_grad) -class TacotronTrainTest(unittest.TestCase): +# class TacotronTrainTest(unittest.TestCase): + # def test_train_step(self): + # input = torch.randint(0, 24, (8, 128)).long().to(device) + # input_lengths = torch.randint(100, 129, (8, )).long().to(device) + # input_lengths[-1] = 128 + # mel_spec = torch.rand(8, 30, c.audio['num_mels']).to(device) + # linear_spec = torch.rand(8, 30, c.audio['num_freq']).to(device) + # mel_lengths = torch.randint(20, 30, (8, )).long().to(device) + # stop_targets = torch.zeros(8, 30, 1).float().to(device) + # speaker_ids = torch.randint(0, 5, (8, )).long().to(device) + + # for idx in mel_lengths: + # stop_targets[:, int(idx.item()):, 0] = 1.0 + + # stop_targets = stop_targets.view(input.shape[0], + # stop_targets.size(1) // c.r, -1) + # stop_targets = (stop_targets.sum(2) > + # 0.0).unsqueeze(2).float().squeeze() + + # criterion = L1LossMasked().to(device) + # criterion_st = nn.BCEWithLogitsLoss().to(device) + # model = Tacotron( + # num_chars=32, + # num_speakers=5, + # linear_dim=c.audio['num_freq'], + # mel_dim=c.audio['num_mels'], + # r=c.r, + # memory_size=c.memory_size).to(device) #FIXME: missing num_speakers parameter to Tacotron ctor + # model.train() + # print(" > Num parameters for Tacotron model:%s"%(count_parameters(model))) + # model_ref = copy.deepcopy(model) + # count = 0 + # for param, param_ref in zip(model.parameters(), + # model_ref.parameters()): + # assert (param - param_ref).sum() == 0, param + # count += 1 + # optimizer = optim.Adam(model.parameters(), lr=c.lr) + # for _ in range(5): + # mel_out, linear_out, align, stop_tokens = model.forward( + # input, input_lengths, mel_spec, speaker_ids) + # optimizer.zero_grad() + # loss = criterion(mel_out, mel_spec, mel_lengths) + # stop_loss = criterion_st(stop_tokens, stop_targets) + # loss = loss + criterion(linear_out, linear_spec, + # mel_lengths) + stop_loss + # loss.backward() + # optimizer.step() + # # check parameter changes + # count = 0 + # for param, param_ref in zip(model.parameters(), + # model_ref.parameters()): + # # ignore pre-higway layer since it works conditional + # # if count not in [145, 59]: + # assert (param != param_ref).any( + # ), "param {} with shape {} not updated!! 
\n{}\n{}".format( + # count, param.shape, param, param_ref) + # count += 1 + + +class TacotronGSTTrainTest(unittest.TestCase): def test_train_step(self): input = torch.randint(0, 24, (8, 128)).long().to(device) input_lengths = torch.randint(100, 129, (8, )).long().to(device) input_lengths[-1] = 128 - mel_spec = torch.rand(8, 30, c.audio['num_mels']).to(device) - linear_spec = torch.rand(8, 30, c.audio['num_freq']).to(device) - mel_lengths = torch.randint(20, 30, (8, )).long().to(device) - stop_targets = torch.zeros(8, 30, 1).float().to(device) + mel_spec = torch.rand(8, 120, c.audio['num_mels']).to(device) + linear_spec = torch.rand(8, 120, c.audio['num_freq']).to(device) + mel_lengths = torch.randint(20, 120, (8, )).long().to(device) + stop_targets = torch.zeros(8, 120, 1).float().to(device) speaker_ids = torch.randint(0, 5, (8, )).long().to(device) for idx in mel_lengths: @@ -45,7 +105,7 @@ class TacotronTrainTest(unittest.TestCase): criterion = L1LossMasked().to(device) criterion_st = nn.BCEWithLogitsLoss().to(device) - model = Tacotron( + model = TacotronGST( num_chars=32, num_speakers=5, linear_dim=c.audio['num_freq'], @@ -53,7 +113,8 @@ class TacotronTrainTest(unittest.TestCase): r=c.r, memory_size=c.memory_size).to(device) #FIXME: missing num_speakers parameter to Tacotron ctor model.train() - print(" > Num parameters for Tacotron model:%s"%(count_parameters(model))) + print(model) + print(" > Num parameters for Tacotron GST model:%s"%(count_parameters(model))) model_ref = copy.deepcopy(model) count = 0 for param, param_ref in zip(model.parameters(), @@ -61,7 +122,7 @@ class TacotronTrainTest(unittest.TestCase): assert (param - param_ref).sum() == 0, param count += 1 optimizer = optim.Adam(model.parameters(), lr=c.lr) - for _ in range(5): + for _ in range(10): mel_out, linear_out, align, stop_tokens = model.forward( input, input_lengths, mel_spec, speaker_ids) optimizer.zero_grad() @@ -76,7 +137,6 @@ class TacotronTrainTest(unittest.TestCase): for param, param_ref in zip(model.parameters(), model_ref.parameters()): # ignore pre-higway layer since it works conditional - # if count not in [145, 59]: assert (param != param_ref).any( ), "param {} with shape {} not updated!! 
\n{}\n{}".format( count, param.shape, param, param_ref) From 6561013d286476624e3bffebdd394885f5e9a2d1 Mon Sep 17 00:00:00 2001 From: Eren Golge Date: Fri, 20 Sep 2019 18:46:59 +0200 Subject: [PATCH 05/35] sum style tokesn with encoder outputs instead of concat --- models/tacotrongst.py | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/models/tacotrongst.py b/models/tacotrongst.py index 7d2fc626..0a9a7385 100644 --- a/models/tacotrongst.py +++ b/models/tacotrongst.py @@ -56,14 +56,14 @@ class TacotronGST(Tacotron): self._init_states() self.compute_speaker_embedding(speaker_ids) if self.num_speakers > 1: - inputs = self._concat_speaker_embedding(inputs, + inputs = self._add_speaker_embedding(inputs, self.speaker_embeddings) encoder_outputs = self.encoder(inputs) if self.num_speakers > 1: - encoder_outputs = self._concat_speaker_embedding(encoder_outputs, + encoder_outputs = self._add_speaker_embedding(encoder_outputs, self.speaker_embeddings) gst_outputs = self.gst(mel_specs) - encoder_outputs = self._concat_speaker_embedding( + encoder_outputs = self._add_speaker_embedding( encoder_outputs, gst_outputs) mel_outputs, alignments, stop_tokens = self.decoder( encoder_outputs, mel_specs, mask, self.speaker_embeddings_projected) @@ -78,16 +78,16 @@ class TacotronGST(Tacotron): self._init_states() self.compute_speaker_embedding(speaker_ids) if self.num_speakers > 1: - inputs = self._concat_speaker_embedding(inputs, + inputs = self._add_speaker_embedding(inputs, self.speaker_embeddings) encoder_outputs = self.encoder(inputs) if self.num_speakers > 1: - encoder_outputs = self._concat_speaker_embedding(encoder_outputs, + encoder_outputs = self._add_speaker_embedding(encoder_outputs, self.speaker_embeddings) if style_mel is not None: gst_outputs = self.gst(style_mel) gst_outputs = gst_outputs.expand(-1, encoder_outputs.size(1), -1) - encoder_outputs = self._concat_speaker_embedding(encoder_outputs, + encoder_outputs = self._add_speaker_embedding(encoder_outputs, gst_outputs) mel_outputs, alignments, stop_tokens = self.decoder.inference( encoder_outputs, self.speaker_embeddings_projected) From 1f4ec804b6b0cf8345f96397ff4ea7dc21245010 Mon Sep 17 00:00:00 2001 From: Eren Golge Date: Sat, 21 Sep 2019 09:58:58 +0200 Subject: [PATCH 06/35] compute and add style tokens in gst --- .compute | 1 + models/tacotrongst.py | 2 +- 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/.compute b/.compute index 3e009cae..24578189 100644 --- a/.compute +++ b/.compute @@ -13,4 +13,5 @@ python3 setup.py develop # python3 distribute.py --config_path config.json --data_path /data/ro/shared/data/keithito/LJSpeech-1.1/ # cp -R ${USER_DIR}/Mozilla_22050 ../tmp/ # python3 distribute.py --config_path config_tacotron_gst.json --data_path ../tmp/Mozilla_22050/ +python3 distribute.py --config_path config.json --data_path /data/rw/home/LibriTTS/train-clean-360 while true; do sleep 1000000; done diff --git a/models/tacotrongst.py b/models/tacotrongst.py index 0a9a7385..9819ec53 100644 --- a/models/tacotrongst.py +++ b/models/tacotrongst.py @@ -40,7 +40,7 @@ class TacotronGST(Tacotron): location_attn, separate_stopnet) gst_embedding_dim = 256 - decoder_dim = 512 + gst_embedding_dim if num_speakers > 1 else 256 + gst_embedding_dim + decoder_dim = 512 if num_speakers > 1 else 256 proj_speaker_dim = 80 if num_speakers > 1 else 0 self.decoder = Decoder(decoder_dim, mel_dim, r, memory_size, attn_win, attn_norm, prenet_type, prenet_dropout, From 8d3775a7d695cea3bc398eb3e84da9606184299a Mon Sep 17 
00:00:00 2001 From: Eren Golge Date: Tue, 17 Sep 2019 18:43:11 +0200 Subject: [PATCH 07/35] Update tacotron2 for gradual training and chnage the indexing of prenet nputs to oick the last frame --- .compute | 2 +- config.json | 8 ++++---- layers/tacotron.py | 2 +- layers/tacotron2.py | 44 +++++++++++++++++++------------------------- train.py | 2 +- 5 files changed, 26 insertions(+), 32 deletions(-) diff --git a/.compute b/.compute index 24578189..1a93820d 100644 --- a/.compute +++ b/.compute @@ -10,7 +10,7 @@ wget https://www.dropbox.com/s/wqn5v3wkktw9lmo/install.sh?dl=0 -O install.sh sudo sh install.sh python3 setup.py develop # cp -R ${USER_DIR}/GermanData ../tmp/ -# python3 distribute.py --config_path config.json --data_path /data/ro/shared/data/keithito/LJSpeech-1.1/ +python3 distribute.py --config_path config.json --data_path /data/ro/shared/data/keithito/LJSpeech-1.1/ # cp -R ${USER_DIR}/Mozilla_22050 ../tmp/ # python3 distribute.py --config_path config_tacotron_gst.json --data_path ../tmp/Mozilla_22050/ python3 distribute.py --config_path config.json --data_path /data/rw/home/LibriTTS/train-clean-360 diff --git a/config.json b/config.json index 4d56c3dc..741b82ac 100644 --- a/config.json +++ b/config.json @@ -1,6 +1,6 @@ { "run_name": "ljspeech", - "run_description": "gradual training with prenet frame size 1 + no maxout for cbhg + symmetric norm.", + "run_description": "Tacotron2", "audio":{ // Audio processing parameters @@ -31,7 +31,7 @@ "reinit_layers": [], - "model": "Tacotron", // one of the model in models/ + "model": "Tacotron2", // one of the model in models/ "grad_clip": 1, // upper limit for gradients for clipping. "epochs": 1000, // total number of epochs to train. "lr": 0.0001, // Initial learning rate. If Noam decay is active, maximum learning rate. @@ -55,10 +55,10 @@ "batch_size": 32, // Batch size for training. Lower values than 32 might cause hard to learn attention. It is overwritten by 'gradual_training'. "eval_batch_size":16, "r": 7, // Number of decoder frames to predict per iteration. Set the initial values if gradual training is enabled. - "gradual_training": [[0, 7, 32], [10000, 5, 32], [50000, 3, 32], [130000, 2, 16], [290000, 1, 8]], // ONLY TACOTRON - set gradual training steps [first_step, r, batch_size]. If it is null, gradual training is disabled. + "gradual_training": [[0, 7, 32], [1, 5, 32], [50000, 3, 32], [130000, 2, 16], [290000, 1, 8]], // ONLY TACOTRON - set gradual training steps [first_step, r, batch_size]. If it is null, gradual training is disabled. "wd": 0.000001, // Weight decay weight. "checkpoint": true, // If true, it saves checkpoints per "save_step" - "save_step": 10000, // Number of training steps expected to save traning stats and checkpoints. + "save_step": 10000, // Number of training steps expected to save traninpg stats and checkpoints. "print_step": 25, // Number of steps to log traning on console. "batch_group_size": 0, //Number of batches to shuffle after bucketing. 
diff --git a/layers/tacotron.py b/layers/tacotron.py index 411e7e72..04781031 100644 --- a/layers/tacotron.py +++ b/layers/tacotron.py @@ -406,7 +406,7 @@ class Decoder(nn.Module): self.memory_input = new_memory[:, :self.memory_size * self.memory_dim] else: # use only the last frame prediction - self.memory_input = new_memory[:, :self.memory_dim] + self.memory_input = new_memory[:, self.memory_dim * (self.r - 1):] def forward(self, inputs, memory, mask, speaker_embeddings=None): """ diff --git a/layers/tacotron2.py b/layers/tacotron2.py index 358d1807..c87ffc78 100644 --- a/layers/tacotron2.py +++ b/layers/tacotron2.py @@ -101,6 +101,7 @@ class Decoder(nn.Module): forward_attn_mask, location_attn, separate_stopnet): super(Decoder, self).__init__() self.mel_channels = inputs_dim + self.r_init = r self.r = r self.encoder_embedding_dim = in_features self.separate_stopnet = separate_stopnet @@ -111,8 +112,7 @@ class Decoder(nn.Module): self.gate_threshold = 0.5 self.p_attention_dropout = 0.1 self.p_decoder_dropout = 0.1 - - self.prenet = Prenet(self.mel_channels * r, prenet_type, + self.prenet = Prenet(self.mel_channels, prenet_type, prenet_dropout, [self.prenet_dim, self.prenet_dim], bias=False) @@ -135,44 +135,34 @@ class Decoder(nn.Module): self.decoder_rnn_dim, 1) self.linear_projection = Linear(self.decoder_rnn_dim + in_features, - self.mel_channels * r) + self.mel_channels * self.r_init) self.stopnet = nn.Sequential( nn.Dropout(0.1), Linear( - self.decoder_rnn_dim + self.mel_channels * r, + self.decoder_rnn_dim + self.mel_channels * self.r_init, 1, bias=True, init_gain='sigmoid')) - - self.attention_rnn_init = nn.Embedding(1, self.query_dim) - self.go_frame_init = nn.Embedding(1, self.mel_channels * r) - self.decoder_rnn_inits = nn.Embedding(1, self.decoder_rnn_dim) self.memory_truncated = None + def set_r(self, new_r): + self.r = new_r + def get_go_frame(self, inputs): B = inputs.size(0) - memory = self.go_frame_init(inputs.data.new_zeros(B).long()) + memory = torch.zeros(B, self.mel_channels * self.r, device=inputs.device) return memory def _init_states(self, inputs, mask, keep_states=False): B = inputs.size(0) # T = inputs.size(1) - if not keep_states: - self.query = self.attention_rnn_init( - inputs.data.new_zeros(B).long()) - self.attention_rnn_cell_state = Variable( - inputs.data.new(B, self.query_dim).zero_()) - - self.decoder_hidden = self.decoder_rnn_inits( - inputs.data.new_zeros(B).long()) - self.decoder_cell = Variable( - inputs.data.new(B, self.decoder_rnn_dim).zero_()) - - self.context = Variable( - inputs.data.new(B, self.encoder_embedding_dim).zero_()) - + self.query = torch.zeros(B, self.query_dim, device=inputs.device) + self.attention_rnn_cell_state = torch.zeros(B, self.query_dim, device=inputs.device) + self.decoder_hidden = torch.zeros(B, self.decoder_rnn_dim, device=inputs.device) + self.decoder_cell = torch.zeros(B, self.decoder_rnn_dim, device=inputs.device) + self.context = torch.zeros(B, self.encoder_embedding_dim, device=inputs.device) self.inputs = inputs self.processed_inputs = self.attention.inputs_layer(inputs) self.mask = mask @@ -192,6 +182,9 @@ class Decoder(nn.Module): outputs = outputs.transpose(1, 2) return outputs, stop_tokens, alignments + def _update_memory(self, memory): + return memory[:, :, self.mel_channels * (self.r - 1) :] + def decode(self, memory): query_input = torch.cat((memory, self.context), -1) self.query, self.attention_rnn_cell_state = self.attention_rnn( @@ -223,13 +216,14 @@ class Decoder(nn.Module): stop_token = 
self.stopnet(stopnet_input.detach()) else: stop_token = self.stopnet(stopnet_input) + decoder_output = decoder_output[:, :self.r * self.mel_channels] return decoder_output, stop_token, self.attention.attention_weights def forward(self, inputs, memories, mask): memory = self.get_go_frame(inputs).unsqueeze(0) memories = self._reshape_memory(memories) memories = torch.cat((memory, memories), dim=0) - memories = self.prenet(memories) + memories = self.prenet(self._update_memory(memories)) self._init_states(inputs, mask=mask) self.attention.init_states(inputs) @@ -277,7 +271,7 @@ class Decoder(nn.Module): print(" | > Decoder stopped with 'max_decoder_steps") break - memory = mel_output + memory = self._update_memory(mel_output) t += 1 outputs, stop_tokens, alignments = self._parse_outputs( diff --git a/train.py b/train.py index 13444c82..d8cdf1fb 100644 --- a/train.py +++ b/train.py @@ -62,7 +62,7 @@ def setup_loader(ap, is_val=False, verbose=False): dataset = MyDataset( c.r, c.text_cleaner, - meta_data=meta_data_eval if is_val else meta_data_train, + meta_data=meta_data_eval if is_val else meta_data_train[:64], ap=ap, batch_group_size=0 if is_val else c.batch_group_size * c.batch_size, min_seq_len=c.min_seq_len, From c8a548d375ae1679169eb429636fe8c356e30359 Mon Sep 17 00:00:00 2001 From: Eren Golge Date: Tue, 17 Sep 2019 18:44:53 +0200 Subject: [PATCH 08/35] fix the debug --- train.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/train.py b/train.py index d8cdf1fb..13444c82 100644 --- a/train.py +++ b/train.py @@ -62,7 +62,7 @@ def setup_loader(ap, is_val=False, verbose=False): dataset = MyDataset( c.r, c.text_cleaner, - meta_data=meta_data_eval if is_val else meta_data_train[:64], + meta_data=meta_data_eval if is_val else meta_data_train, ap=ap, batch_group_size=0 if is_val else c.batch_group_size * c.batch_size, min_seq_len=c.min_seq_len, From e085c4757dcb7b2bf694f577ac125644d645bd9e Mon Sep 17 00:00:00 2001 From: Eren Golge Date: Wed, 18 Sep 2019 01:16:45 +0200 Subject: [PATCH 09/35] bug fix --- .compute | 3 ++- layers/tacotron2.py | 2 ++ 2 files changed, 4 insertions(+), 1 deletion(-) diff --git a/.compute b/.compute index 1a93820d..c1b3ac9d 100644 --- a/.compute +++ b/.compute @@ -10,7 +10,8 @@ wget https://www.dropbox.com/s/wqn5v3wkktw9lmo/install.sh?dl=0 -O install.sh sudo sh install.sh python3 setup.py develop # cp -R ${USER_DIR}/GermanData ../tmp/ -python3 distribute.py --config_path config.json --data_path /data/ro/shared/data/keithito/LJSpeech-1.1/ +cp -R /data/ro/shared/data/keithito/LJSpeech-1.1/ ../tmp/ +python3 distribute.py --config_path config.json --data_path ../tmp/LJSpeech-1.1/ # cp -R ${USER_DIR}/Mozilla_22050 ../tmp/ # python3 distribute.py --config_path config_tacotron_gst.json --data_path ../tmp/Mozilla_22050/ python3 distribute.py --config_path config.json --data_path /data/rw/home/LibriTTS/train-clean-360 diff --git a/layers/tacotron2.py b/layers/tacotron2.py index c87ffc78..6e914fd7 100644 --- a/layers/tacotron2.py +++ b/layers/tacotron2.py @@ -243,6 +243,8 @@ class Decoder(nn.Module): def inference(self, inputs): memory = self.get_go_frame(inputs) + memory = self._update_memory(memory) + self._init_states(inputs, mask=None) self.attention.init_win_idx() From 9a2bd7f9af6456abbb452eba0260fb3b67312405 Mon Sep 17 00:00:00 2001 From: Eren Golge Date: Wed, 18 Sep 2019 02:51:56 +0200 Subject: [PATCH 10/35] fix for 2 dim memory tensor --- layers/tacotron2.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/layers/tacotron2.py 
b/layers/tacotron2.py index 6e914fd7..0ea8b18f 100644 --- a/layers/tacotron2.py +++ b/layers/tacotron2.py @@ -183,7 +183,10 @@ class Decoder(nn.Module): return outputs, stop_tokens, alignments def _update_memory(self, memory): - return memory[:, :, self.mel_channels * (self.r - 1) :] + if len(memory.shape) == 2: + return memory[:, self.mel_channels * (self.r - 1) :] + else: + return memory[:, :, self.mel_channels * (self.r - 1) :] def decode(self, memory): query_input = torch.cat((memory, self.context), -1) From b0739e0e17788a425713807cb0be2c851bc47f5c Mon Sep 17 00:00:00 2001 From: Eren Golge Date: Fri, 20 Sep 2019 18:01:44 +0200 Subject: [PATCH 11/35] config --- config.json | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/config.json b/config.json index 741b82ac..38d865f9 100644 --- a/config.json +++ b/config.json @@ -1,6 +1,6 @@ { "run_name": "ljspeech", - "run_description": "Tacotron2", + "run_description": "Tacotron prenet fix test run - dev-memory_fix", "audio":{ // Audio processing parameters @@ -31,7 +31,7 @@ "reinit_layers": [], - "model": "Tacotron2", // one of the model in models/ + "model": "Tacotron", // one of the model in models/ "grad_clip": 1, // upper limit for gradients for clipping. "epochs": 1000, // total number of epochs to train. "lr": 0.0001, // Initial learning rate. If Noam decay is active, maximum learning rate. From 7e5c20500b2d9697132d7ceca5dde0d6cf3110c2 Mon Sep 17 00:00:00 2001 From: Eren Golge Date: Fri, 20 Sep 2019 18:45:48 +0200 Subject: [PATCH 12/35] compute --- .compute | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/.compute b/.compute index c1b3ac9d..34da13d8 100644 --- a/.compute +++ b/.compute @@ -10,8 +10,8 @@ wget https://www.dropbox.com/s/wqn5v3wkktw9lmo/install.sh?dl=0 -O install.sh sudo sh install.sh python3 setup.py develop # cp -R ${USER_DIR}/GermanData ../tmp/ -cp -R /data/ro/shared/data/keithito/LJSpeech-1.1/ ../tmp/ -python3 distribute.py --config_path config.json --data_path ../tmp/LJSpeech-1.1/ +# cp -R /data/ro/shared/data/keithito/LJSpeech-1.1/ ../tmp/ +python3 distribute.py --config_path config.json --data_path /data/ro/shared/data/keithito/LJSpeech-1.1/ # cp -R ${USER_DIR}/Mozilla_22050 ../tmp/ # python3 distribute.py --config_path config_tacotron_gst.json --data_path ../tmp/Mozilla_22050/ python3 distribute.py --config_path config.json --data_path /data/rw/home/LibriTTS/train-clean-360 From e8d29613f1a1b52f437cf97cf57d45d59877c5a3 Mon Sep 17 00:00:00 2001 From: Eren Golge Date: Tue, 24 Sep 2019 15:38:28 +0200 Subject: [PATCH 13/35] fix stop condition --- layers/tacotron2.py | 10 ++-------- 1 file changed, 2 insertions(+), 8 deletions(-) diff --git a/layers/tacotron2.py b/layers/tacotron2.py index 0ea8b18f..4d1574da 100644 --- a/layers/tacotron2.py +++ b/layers/tacotron2.py @@ -255,7 +255,6 @@ class Decoder(nn.Module): outputs, stop_tokens, alignments, t = [], [], [], 0 stop_flags = [True, False, False] - stop_count = 0 while True: memory = self.prenet(memory) mel_output, stop_token, alignment = self.decode(memory) @@ -269,9 +268,7 @@ class Decoder(nn.Module): and t > inputs.shape[1]) stop_flags[2] = t > inputs.shape[1] * 2 if all(stop_flags): - stop_count += 1 - if stop_count > 20: - break + break elif len(outputs) == self.max_decoder_steps: print(" | > Decoder stopped with 'max_decoder_steps") break @@ -298,7 +295,6 @@ class Decoder(nn.Module): self.attention.init_states(inputs) outputs, stop_tokens, alignments, t = [], [], [], 0 stop_flags = [True, False, False] - stop_count = 0 while 
True: memory = self.prenet(self.memory_truncated) mel_output, stop_token, alignment = self.decode(memory) @@ -312,9 +308,7 @@ class Decoder(nn.Module): and t > inputs.shape[1]) stop_flags[2] = t > inputs.shape[1] * 2 if all(stop_flags): - stop_count += 1 - if stop_count > 20: - break + break elif len(outputs) == self.max_decoder_steps: print(" | > Decoder stopped with 'max_decoder_steps") break From 98af061d2e38dc3ee2077a217d37398ea14fdbaa Mon Sep 17 00:00:00 2001 From: Eren Golge Date: Tue, 24 Sep 2019 16:18:48 +0200 Subject: [PATCH 14/35] formatting, merge GST model with Tacotron --- layers/tacotron2.py | 75 ++++++++++++-------- models/tacotron.py | 85 +++++++++++++++------- models/tacotrongst.py | 97 ------------------------- tests/test_layers.py | 3 +- tests/test_tacotron_model.py | 132 ++++++++++++++++++----------------- 5 files changed, 173 insertions(+), 219 deletions(-) delete mode 100644 models/tacotrongst.py diff --git a/layers/tacotron2.py b/layers/tacotron2.py index 4d1574da..a02ff95a 100644 --- a/layers/tacotron2.py +++ b/layers/tacotron2.py @@ -10,8 +10,10 @@ class ConvBNBlock(nn.Module): super(ConvBNBlock, self).__init__() assert (kernel_size - 1) % 2 == 0 padding = (kernel_size - 1) // 2 - conv1d = nn.Conv1d( - in_channels, out_channels, kernel_size, padding=padding) + conv1d = nn.Conv1d(in_channels, + out_channels, + kernel_size, + padding=padding) norm = nn.BatchNorm1d(out_channels) dropout = nn.Dropout(p=0.5) if nonlinear == 'relu': @@ -52,20 +54,20 @@ class Encoder(nn.Module): convolutions.append( ConvBNBlock(in_features, in_features, 5, 'relu')) self.convolutions = nn.Sequential(*convolutions) - self.lstm = nn.LSTM( - in_features, - int(in_features / 2), - num_layers=1, - batch_first=True, - bidirectional=True) + self.lstm = nn.LSTM(in_features, + int(in_features / 2), + num_layers=1, + batch_first=True, + bidirectional=True) self.rnn_state = None def forward(self, x, input_lengths): x = self.convolutions(x) x = x.transpose(1, 2) input_lengths = input_lengths.cpu().numpy() - x = nn.utils.rnn.pack_padded_sequence( - x, input_lengths, batch_first=True) + x = nn.utils.rnn.pack_padded_sequence(x, + input_lengths, + batch_first=True) self.lstm.flatten_parameters() outputs, _ = self.lstm(x) outputs, _ = nn.utils.rnn.pad_packed_sequence( @@ -112,9 +114,11 @@ class Decoder(nn.Module): self.gate_threshold = 0.5 self.p_attention_dropout = 0.1 self.p_decoder_dropout = 0.1 - self.prenet = Prenet(self.mel_channels, prenet_type, + self.prenet = Prenet(self.mel_channels, + prenet_type, prenet_dropout, - [self.prenet_dim, self.prenet_dim], bias=False) + [self.prenet_dim, self.prenet_dim], + bias=False) self.attention_rnn = nn.LSTMCell(self.prenet_dim + in_features, self.query_dim) @@ -139,19 +143,20 @@ class Decoder(nn.Module): self.stopnet = nn.Sequential( nn.Dropout(0.1), - Linear( - self.decoder_rnn_dim + self.mel_channels * self.r_init, - 1, - bias=True, - init_gain='sigmoid')) + Linear(self.decoder_rnn_dim + self.mel_channels * self.r_init, + 1, + bias=True, + init_gain='sigmoid')) self.memory_truncated = None def set_r(self, new_r): self.r = new_r - + def get_go_frame(self, inputs): B = inputs.size(0) - memory = torch.zeros(B, self.mel_channels * self.r, device=inputs.device) + memory = torch.zeros(B, + self.mel_channels * self.r, + device=inputs.device) return memory def _init_states(self, inputs, mask, keep_states=False): @@ -159,17 +164,25 @@ class Decoder(nn.Module): # T = inputs.size(1) if not keep_states: self.query = torch.zeros(B, self.query_dim, device=inputs.device) - 
self.attention_rnn_cell_state = torch.zeros(B, self.query_dim, device=inputs.device) - self.decoder_hidden = torch.zeros(B, self.decoder_rnn_dim, device=inputs.device) - self.decoder_cell = torch.zeros(B, self.decoder_rnn_dim, device=inputs.device) - self.context = torch.zeros(B, self.encoder_embedding_dim, device=inputs.device) + self.attention_rnn_cell_state = torch.zeros(B, + self.query_dim, + device=inputs.device) + self.decoder_hidden = torch.zeros(B, + self.decoder_rnn_dim, + device=inputs.device) + self.decoder_cell = torch.zeros(B, + self.decoder_rnn_dim, + device=inputs.device) + self.context = torch.zeros(B, + self.encoder_embedding_dim, + device=inputs.device) self.inputs = inputs self.processed_inputs = self.attention.inputs_layer(inputs) self.mask = mask def _reshape_memory(self, memories): - memories = memories.view( - memories.size(0), int(memories.size(1) / self.r), -1) + memories = memories.view(memories.size(0), + int(memories.size(1) / self.r), -1) memories = memories.transpose(0, 1) return memories @@ -184,18 +197,18 @@ class Decoder(nn.Module): def _update_memory(self, memory): if len(memory.shape) == 2: - return memory[:, self.mel_channels * (self.r - 1) :] - else: - return memory[:, :, self.mel_channels * (self.r - 1) :] + return memory[:, self.mel_channels * (self.r - 1):] + return memory[:, :, self.mel_channels * (self.r - 1):] def decode(self, memory): query_input = torch.cat((memory, self.context), -1) self.query, self.attention_rnn_cell_state = self.attention_rnn( query_input, (self.query, self.attention_rnn_cell_state)) - self.query = F.dropout( - self.query, self.p_attention_dropout, self.training) + self.query = F.dropout(self.query, self.p_attention_dropout, + self.training) self.attention_rnn_cell_state = F.dropout( - self.attention_rnn_cell_state, self.p_attention_dropout, self.training) + self.attention_rnn_cell_state, self.p_attention_dropout, + self.training) self.context = self.attention(self.query, self.inputs, self.processed_inputs, self.mask) diff --git a/models/tacotron.py b/models/tacotron.py index 8f40f313..8f711364 100644 --- a/models/tacotron.py +++ b/models/tacotron.py @@ -3,6 +3,7 @@ import torch from torch import nn from TTS.layers.tacotron import Encoder, Decoder, PostCBHG from TTS.utils.generic_utils import sequence_mask +from TTS.layers.gst_layers import GST class Tacotron(nn.Module): @@ -14,6 +15,7 @@ class Tacotron(nn.Module): mel_dim=80, memory_size=5, attn_win=False, + gst=False, attn_norm="sigmoid", prenet_type="original", prenet_dropout=True, @@ -26,35 +28,59 @@ class Tacotron(nn.Module): self.r = r self.mel_dim = mel_dim self.linear_dim = linear_dim + self.gst = gst self.num_speakers = num_speakers self.embedding = nn.Embedding(num_chars, 256) self.embedding.weight.data.normal_(0, 0.3) decoder_dim = 512 if num_speakers > 1 else 256 encoder_dim = 512 if num_speakers > 1 else 256 proj_speaker_dim = 80 if num_speakers > 1 else 0 - if num_speakers > 1: - self.speaker_embedding = nn.Embedding(num_speakers, 256) - self.speaker_embedding.weight.data.normal_(0, 0.3) - self.speaker_project_mel = nn.Sequential(nn.Linear(256, proj_speaker_dim), nn.Tanh()) + # boilerplate model self.encoder = Encoder(encoder_dim) self.decoder = Decoder(decoder_dim, mel_dim, r, memory_size, attn_win, attn_norm, prenet_type, prenet_dropout, forward_attn, trans_agent, forward_attn_mask, - location_attn, separate_stopnet, proj_speaker_dim) + location_attn, separate_stopnet, + proj_speaker_dim) self.postnet = PostCBHG(mel_dim) - self.last_linear = 
nn.Linear(self.postnet.cbhg.gru_features * 2, linear_dim) + self.last_linear = nn.Linear(self.postnet.cbhg.gru_features * 2, + linear_dim) + # speaker embedding layers + if num_speakers > 1: + self.speaker_embedding = nn.Embedding(num_speakers, 256) + self.speaker_embedding.weight.data.normal_(0, 0.3) + self.speaker_project_mel = nn.Sequential( + nn.Linear(256, proj_speaker_dim), nn.Tanh()) + self.speaker_embeddings = None + self.speaker_embeddings_projected = None + # global style token layers + if self.gst: + gst_embedding_dim = 256 + self.gst_layer = GST(num_mel=80, + num_heads=4, + num_style_tokens=10, + embedding_dim=gst_embedding_dim) def _init_states(self): - self.speaker_embeddings = None + self.speaker_embeddings = None self.speaker_embeddings_projected = None def compute_speaker_embedding(self, speaker_ids): if hasattr(self, "speaker_embedding") and speaker_ids is None: - raise RuntimeError(" [!] Model has speaker embedding layer but speaker_id is not provided") + raise RuntimeError( + " [!] Model has speaker embedding layer but speaker_id is not provided" + ) if hasattr(self, "speaker_embedding") and speaker_ids is not None: - self.speaker_embeddings = self._compute_speaker_embedding(speaker_ids) - self.speaker_embeddings_projected = self.speaker_project_mel(self.speaker_embeddings).squeeze(1) - + self.speaker_embeddings = self._compute_speaker_embedding( + speaker_ids) + self.speaker_embeddings_projected = self.speaker_project_mel( + self.speaker_embeddings).squeeze(1) + + def compute_gst(self, inputs, mel_specs): + gst_outputs = self.gst_layer(mel_specs) + inputs = self._add_speaker_embedding(inputs, gst_outputs) + return inputs + def forward(self, characters, text_lengths, mel_specs, speaker_ids=None): B = characters.size(0) mask = sequence_mask(text_lengths).to(characters.device) @@ -63,30 +89,35 @@ class Tacotron(nn.Module): self.compute_speaker_embedding(speaker_ids) if self.num_speakers > 1: inputs = self._concat_speaker_embedding(inputs, - self.speaker_embeddings) + self.speaker_embeddings) encoder_outputs = self.encoder(inputs) + if self.gst: + encoder_outputs = self.compute_gst(encoder_outputs, mel_specs) if self.num_speakers > 1: - encoder_outputs = self._concat_speaker_embedding(encoder_outputs, - self.speaker_embeddings) + encoder_outputs = self._concat_speaker_embedding( + encoder_outputs, self.speaker_embeddings) mel_outputs, alignments, stop_tokens = self.decoder( - encoder_outputs, mel_specs, mask, self.speaker_embeddings_projected) + encoder_outputs, mel_specs, mask, + self.speaker_embeddings_projected) mel_outputs = mel_outputs.view(B, -1, self.mel_dim) linear_outputs = self.postnet(mel_outputs) linear_outputs = self.last_linear(linear_outputs) return mel_outputs, linear_outputs, alignments, stop_tokens - def inference(self, characters, speaker_ids=None): + def inference(self, characters, speaker_ids=None, style_mel=None): B = characters.size(0) inputs = self.embedding(characters) self._init_states() self.compute_speaker_embedding(speaker_ids) if self.num_speakers > 1: inputs = self._concat_speaker_embedding(inputs, - self.speaker_embeddings) + self.speaker_embeddings) encoder_outputs = self.encoder(inputs) + if self.gst and style_mel is not None: + encoder_outputs = self.compute_gst(encoder_outputs, style_mel) if self.num_speakers > 1: - encoder_outputs = self._concat_speaker_embedding(encoder_outputs, - self.speaker_embeddings) + encoder_outputs = self._concat_speaker_embedding( + encoder_outputs, self.speaker_embeddings) mel_outputs, alignments, stop_tokens 
= self.decoder.inference( encoder_outputs, self.speaker_embeddings_projected) mel_outputs = mel_outputs.view(B, -1, self.mel_dim) @@ -98,16 +129,16 @@ class Tacotron(nn.Module): speaker_embeddings = self.speaker_embedding(speaker_ids) return speaker_embeddings.unsqueeze_(1) - def _add_speaker_embedding(self, outputs, speaker_embeddings): - speaker_embeddings_ = speaker_embeddings.expand(outputs.size(0), - outputs.size(1), - -1) + @staticmethod + def _add_speaker_embedding(outputs, speaker_embeddings): + speaker_embeddings_ = speaker_embeddings.expand( + outputs.size(0), outputs.size(1), -1) outputs = outputs + speaker_embeddings_ return outputs - def _concat_speaker_embedding(self, outputs, speaker_embeddings): - speaker_embeddings_ = speaker_embeddings.expand(outputs.size(0), - outputs.size(1), - -1) + @staticmethod + def _concat_speaker_embedding(outputs, speaker_embeddings): + speaker_embeddings_ = speaker_embeddings.expand( + outputs.size(0), outputs.size(1), -1) outputs = torch.cat([outputs, speaker_embeddings_], dim=-1) return outputs diff --git a/models/tacotrongst.py b/models/tacotrongst.py deleted file mode 100644 index 9819ec53..00000000 --- a/models/tacotrongst.py +++ /dev/null @@ -1,97 +0,0 @@ -# coding: utf-8 -import torch -from torch import nn -from TTS.layers.tacotron import Encoder, Decoder, PostCBHG -from TTS.layers.gst_layers import GST -from TTS.utils.generic_utils import sequence_mask -from TTS.models.tacotron import Tacotron - - -class TacotronGST(Tacotron): - def __init__(self, - num_chars, - num_speakers, - r=5, - linear_dim=1025, - mel_dim=80, - memory_size=5, - attn_win=False, - attn_norm="sigmoid", - prenet_type="original", - prenet_dropout=True, - forward_attn=False, - trans_agent=False, - forward_attn_mask=False, - location_attn=True, - separate_stopnet=True): - super().__init__(num_chars, - num_speakers, - r, - linear_dim, - mel_dim, - memory_size, - attn_win, - attn_norm, - prenet_type, - prenet_dropout, - forward_attn, - trans_agent, - forward_attn_mask, - location_attn, - separate_stopnet) - gst_embedding_dim = 256 - decoder_dim = 512 if num_speakers > 1 else 256 - proj_speaker_dim = 80 if num_speakers > 1 else 0 - self.decoder = Decoder(decoder_dim, mel_dim, r, memory_size, attn_win, - attn_norm, prenet_type, prenet_dropout, - forward_attn, trans_agent, forward_attn_mask, - location_attn, separate_stopnet, proj_speaker_dim) - self.gst = GST(num_mel=80, num_heads=4, - num_style_tokens=10, embedding_dim=gst_embedding_dim) - - def forward(self, characters, text_lengths, mel_specs, speaker_ids=None): - B = characters.size(0) - mask = sequence_mask(text_lengths).to(characters.device) - inputs = self.embedding(characters) - self._init_states() - self.compute_speaker_embedding(speaker_ids) - if self.num_speakers > 1: - inputs = self._add_speaker_embedding(inputs, - self.speaker_embeddings) - encoder_outputs = self.encoder(inputs) - if self.num_speakers > 1: - encoder_outputs = self._add_speaker_embedding(encoder_outputs, - self.speaker_embeddings) - gst_outputs = self.gst(mel_specs) - encoder_outputs = self._add_speaker_embedding( - encoder_outputs, gst_outputs) - mel_outputs, alignments, stop_tokens = self.decoder( - encoder_outputs, mel_specs, mask, self.speaker_embeddings_projected) - mel_outputs = mel_outputs.view(B, -1, self.mel_dim) - linear_outputs = self.postnet(mel_outputs) - linear_outputs = self.last_linear(linear_outputs) - return mel_outputs, linear_outputs, alignments, stop_tokens - - def inference(self, characters, speaker_ids=None, style_mel=None): 
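
The GST conditioning that this commit folds into the plain Tacotron model (compute_gst plus _add_speaker_embedding in models/tacotron.py) amounts to broadcast-adding one style vector per utterance onto the encoder outputs along the time axis. A minimal sketch of that operation, assuming [B, T, D] encoder outputs and a [B, 1, D] style embedding; the helper name add_style_embedding and the toy shapes are illustrative, not taken from the repository:

    import torch

    def add_style_embedding(encoder_outputs, style_embedding):
        # expand the single per-utterance style vector across the time axis
        style = style_embedding.expand(encoder_outputs.size(0),
                                       encoder_outputs.size(1), -1)
        # element-wise addition conditions every encoder timestep on the style
        return encoder_outputs + style

    # toy usage with random tensors
    encoder_outputs = torch.rand(2, 7, 256)   # [B, T, D]
    style_embedding = torch.rand(2, 1, 256)   # one GST summary per sample
    conditioned = add_style_embedding(encoder_outputs, style_embedding)
    print(conditioned.shape)  # torch.Size([2, 7, 256])
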
- B = characters.size(0) - inputs = self.embedding(characters) - self._init_states() - self.compute_speaker_embedding(speaker_ids) - if self.num_speakers > 1: - inputs = self._add_speaker_embedding(inputs, - self.speaker_embeddings) - encoder_outputs = self.encoder(inputs) - if self.num_speakers > 1: - encoder_outputs = self._add_speaker_embedding(encoder_outputs, - self.speaker_embeddings) - if style_mel is not None: - gst_outputs = self.gst(style_mel) - gst_outputs = gst_outputs.expand(-1, encoder_outputs.size(1), -1) - encoder_outputs = self._add_speaker_embedding(encoder_outputs, - gst_outputs) - mel_outputs, alignments, stop_tokens = self.decoder.inference( - encoder_outputs, self.speaker_embeddings_projected) - mel_outputs = mel_outputs.view(B, -1, self.mel_dim) - linear_outputs = self.postnet(mel_outputs) - linear_outputs = self.last_linear(linear_outputs) - return mel_outputs, linear_outputs, alignments, stop_tokens diff --git a/tests/test_layers.py b/tests/test_layers.py index a465a898..6b5fd80b 100644 --- a/tests/test_layers.py +++ b/tests/test_layers.py @@ -67,7 +67,8 @@ class DecoderTests(unittest.TestCase): assert output.shape[2] == 80 * 2, "size not {}".format(output.shape[2]) assert stop_tokens.shape[0] == 4 - def test_in_out_multispeaker(self): + @staticmethod + def test_in_out_multispeaker(): layer = Decoder( in_features=256, memory_dim=80, diff --git a/tests/test_tacotron_model.py b/tests/test_tacotron_model.py index 9b8de336..c8b0d7ca 100644 --- a/tests/test_tacotron_model.py +++ b/tests/test_tacotron_model.py @@ -8,7 +8,6 @@ from torch import nn from TTS.utils.generic_utils import load_config from TTS.layers.losses import L1LossMasked from TTS.models.tacotron import Tacotron -from TTS.models.tacotrongst import TacotronGST #pylint: disable=unused-variable @@ -25,68 +24,72 @@ def count_parameters(model): return sum(p.numel() for p in model.parameters() if p.requires_grad) -# class TacotronTrainTest(unittest.TestCase): - # def test_train_step(self): - # input = torch.randint(0, 24, (8, 128)).long().to(device) - # input_lengths = torch.randint(100, 129, (8, )).long().to(device) - # input_lengths[-1] = 128 - # mel_spec = torch.rand(8, 30, c.audio['num_mels']).to(device) - # linear_spec = torch.rand(8, 30, c.audio['num_freq']).to(device) - # mel_lengths = torch.randint(20, 30, (8, )).long().to(device) - # stop_targets = torch.zeros(8, 30, 1).float().to(device) - # speaker_ids = torch.randint(0, 5, (8, )).long().to(device) +class TacotronTrainTest(unittest.TestCase): + @staticmethod + def test_train_step(): + input_dummy = torch.randint(0, 24, (8, 128)).long().to(device) + input_lengths = torch.randint(100, 129, (8, )).long().to(device) + input_lengths[-1] = 128 + mel_spec = torch.rand(8, 30, c.audio['num_mels']).to(device) + linear_spec = torch.rand(8, 30, c.audio['num_freq']).to(device) + mel_lengths = torch.randint(20, 30, (8, )).long().to(device) + stop_targets = torch.zeros(8, 30, 1).float().to(device) + speaker_ids = torch.randint(0, 5, (8, )).long().to(device) - # for idx in mel_lengths: - # stop_targets[:, int(idx.item()):, 0] = 1.0 + for idx in mel_lengths: + stop_targets[:, int(idx.item()):, 0] = 1.0 - # stop_targets = stop_targets.view(input.shape[0], - # stop_targets.size(1) // c.r, -1) - # stop_targets = (stop_targets.sum(2) > - # 0.0).unsqueeze(2).float().squeeze() + stop_targets = stop_targets.view(input_dummy.shape[0], + stop_targets.size(1) // c.r, -1) + stop_targets = (stop_targets.sum(2) > + 0.0).unsqueeze(2).float().squeeze() - # criterion = 
L1LossMasked().to(device) - # criterion_st = nn.BCEWithLogitsLoss().to(device) - # model = Tacotron( - # num_chars=32, - # num_speakers=5, - # linear_dim=c.audio['num_freq'], - # mel_dim=c.audio['num_mels'], - # r=c.r, - # memory_size=c.memory_size).to(device) #FIXME: missing num_speakers parameter to Tacotron ctor - # model.train() - # print(" > Num parameters for Tacotron model:%s"%(count_parameters(model))) - # model_ref = copy.deepcopy(model) - # count = 0 - # for param, param_ref in zip(model.parameters(), - # model_ref.parameters()): - # assert (param - param_ref).sum() == 0, param - # count += 1 - # optimizer = optim.Adam(model.parameters(), lr=c.lr) - # for _ in range(5): - # mel_out, linear_out, align, stop_tokens = model.forward( - # input, input_lengths, mel_spec, speaker_ids) - # optimizer.zero_grad() - # loss = criterion(mel_out, mel_spec, mel_lengths) - # stop_loss = criterion_st(stop_tokens, stop_targets) - # loss = loss + criterion(linear_out, linear_spec, - # mel_lengths) + stop_loss - # loss.backward() - # optimizer.step() - # # check parameter changes - # count = 0 - # for param, param_ref in zip(model.parameters(), - # model_ref.parameters()): - # # ignore pre-higway layer since it works conditional - # # if count not in [145, 59]: - # assert (param != param_ref).any( - # ), "param {} with shape {} not updated!! \n{}\n{}".format( - # count, param.shape, param, param_ref) - # count += 1 + criterion = L1LossMasked().to(device) + criterion_st = nn.BCEWithLogitsLoss().to(device) + model = Tacotron( + num_chars=32, + num_speakers=5, + linear_dim=c.audio['num_freq'], + mel_dim=c.audio['num_mels'], + r=c.r, + memory_size=c.memory_size + ).to(device) #FIXME: missing num_speakers parameter to Tacotron ctor + model.train() + print(" > Num parameters for Tacotron model:%s" % + (count_parameters(model))) + model_ref = copy.deepcopy(model) + count = 0 + for param, param_ref in zip(model.parameters(), + model_ref.parameters()): + assert (param - param_ref).sum() == 0, param + count += 1 + optimizer = optim.Adam(model.parameters(), lr=c.lr) + for _ in range(5): + mel_out, linear_out, align, stop_tokens = model.forward( + input_dummy, input_lengths, mel_spec, speaker_ids) + optimizer.zero_grad() + loss = criterion(mel_out, mel_spec, mel_lengths) + stop_loss = criterion_st(stop_tokens, stop_targets) + loss = loss + criterion(linear_out, linear_spec, + mel_lengths) + stop_loss + loss.backward() + optimizer.step() + # check parameter changes + count = 0 + for param, param_ref in zip(model.parameters(), + model_ref.parameters()): + # ignore pre-higway layer since it works conditional + # if count not in [145, 59]: + assert (param != param_ref).any( + ), "param {} with shape {} not updated!! 
\n{}\n{}".format( + count, param.shape, param, param_ref) + count += 1 class TacotronGSTTrainTest(unittest.TestCase): - def test_train_step(self): - input = torch.randint(0, 24, (8, 128)).long().to(device) + @staticmethod + def test_train_step(): + input_dummy = torch.randint(0, 24, (8, 128)).long().to(device) input_lengths = torch.randint(100, 129, (8, )).long().to(device) input_lengths[-1] = 128 mel_spec = torch.rand(8, 120, c.audio['num_mels']).to(device) @@ -98,23 +101,26 @@ class TacotronGSTTrainTest(unittest.TestCase): for idx in mel_lengths: stop_targets[:, int(idx.item()):, 0] = 1.0 - stop_targets = stop_targets.view(input.shape[0], + stop_targets = stop_targets.view(input_dummy.shape[0], stop_targets.size(1) // c.r, -1) stop_targets = (stop_targets.sum(2) > 0.0).unsqueeze(2).float().squeeze() criterion = L1LossMasked().to(device) criterion_st = nn.BCEWithLogitsLoss().to(device) - model = TacotronGST( + model = Tacotron( num_chars=32, - num_speakers=5, + num_speakers=5, + gst=True, linear_dim=c.audio['num_freq'], mel_dim=c.audio['num_mels'], r=c.r, - memory_size=c.memory_size).to(device) #FIXME: missing num_speakers parameter to Tacotron ctor + memory_size=c.memory_size + ).to(device) #FIXME: missing num_speakers parameter to Tacotron ctor model.train() print(model) - print(" > Num parameters for Tacotron GST model:%s"%(count_parameters(model))) + print(" > Num parameters for Tacotron GST model:%s" % + (count_parameters(model))) model_ref = copy.deepcopy(model) count = 0 for param, param_ref in zip(model.parameters(), @@ -124,7 +130,7 @@ class TacotronGSTTrainTest(unittest.TestCase): optimizer = optim.Adam(model.parameters(), lr=c.lr) for _ in range(10): mel_out, linear_out, align, stop_tokens = model.forward( - input, input_lengths, mel_spec, speaker_ids) + input_dummy, input_lengths, mel_spec, speaker_ids) optimizer.zero_grad() loss = criterion(mel_out, mel_spec, mel_lengths) stop_loss = criterion_st(stop_tokens, stop_targets) From 5b6b1f354d4b6032c814d23082f7c0828d907b30 Mon Sep 17 00:00:00 2001 From: Eren Golge Date: Tue, 24 Sep 2019 16:24:58 +0200 Subject: [PATCH 15/35] add use_gst to enable global style token --- config.json | 3 ++- utils/generic_utils.py | 3 ++- 2 files changed, 4 insertions(+), 2 deletions(-) diff --git a/config.json b/config.json index 38d865f9..2a171ad1 100644 --- a/config.json +++ b/config.json @@ -79,6 +79,7 @@ "phoneme_language": "en-us", // depending on your target language, pick one from https://github.com/bootphon/phonemizer#languages "text_cleaner": "phoneme_cleaners", "use_speaker_embedding": false, // use speaker embedding to enable multi-speaker learning. - "style_wav_for_test": null // path to style wav file to be used in TacotronGST inference. + "style_wav_for_test": null, // path to style wav file to be used in TacotronGST inference. + "use_gst": false // TACOTRON ONLY: use global style tokens } diff --git a/utils/generic_utils.py b/utils/generic_utils.py index 1053d221..bfa72a35 100644 --- a/utils/generic_utils.py +++ b/utils/generic_utils.py @@ -253,13 +253,14 @@ def setup_model(num_chars, num_speakers, c): print(" > Using model: {}".format(c.model)) MyModel = importlib.import_module('TTS.models.' 
+ c.model.lower()) MyModel = getattr(MyModel, c.model) - if c.model.lower() in ["tacotron", "tacotrongst"]: + if c.model.lower() in "tacotron": model = MyModel( num_chars=num_chars, num_speakers=num_speakers, r=c.r, linear_dim=1025, mel_dim=80, + gst=c.use_gst, memory_size=c.memory_size, attn_win=c.windowing, attn_norm=c.attention_norm, From 113f5860b8f2dec2b3919af5e6754911a0854c5f Mon Sep 17 00:00:00 2001 From: Eren Golge Date: Tue, 24 Sep 2019 16:58:52 +0200 Subject: [PATCH 16/35] update benchmark notebook --- notebooks/Benchmark.ipynb | 444 ++++++++++++-------------------------- 1 file changed, 138 insertions(+), 306 deletions(-) diff --git a/notebooks/Benchmark.ipynb b/notebooks/Benchmark.ipynb index 4de29af9..7c528506 100644 --- a/notebooks/Benchmark.ipynb +++ b/notebooks/Benchmark.ipynb @@ -19,7 +19,7 @@ }, { "cell_type": "code", - "execution_count": 1, + "execution_count": null, "metadata": {}, "outputs": [], "source": [ @@ -29,28 +29,11 @@ }, { "cell_type": "code", - "execution_count": 2, + "execution_count": null, "metadata": { "scrolled": true }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Populating the interactive namespace from numpy and matplotlib\n" - ] - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - "/home/erogol/miniconda3/lib/python3.7/site-packages/IPython/core/magics/pylab.py:160: UserWarning: pylab import has clobbered these variables: ['plt']\n", - "`%matplotlib` prevents importing * from pylab and numpy\n", - " \"\\n`%matplotlib` prevents importing * from pylab and numpy\"\n" - ] - } - ], + "outputs": [], "source": [ "%load_ext autoreload\n", "%autoreload 2\n", @@ -59,6 +42,7 @@ "import io\n", "import torch \n", "import time\n", + "import json\n", "import numpy as np\n", "from collections import OrderedDict\n", "from matplotlib import pylab as plt\n", @@ -86,23 +70,25 @@ "from IPython.display import Audio\n", "\n", "import os\n", - "os.environ['CUDA_VISIBLE_DEVICES']='1'\n", - "os.environ['OMP_NUM_THREADS']='1'\n" + "os.environ['CUDA_VISIBLE_DEVICES']='1'" ] }, { "cell_type": "code", - "execution_count": 3, + "execution_count": null, "metadata": {}, "outputs": [], "source": [ - "def tts(model, text, CONFIG, use_cuda, ap, use_gl, speaker_id=None, figures=True):\n", + "def tts(model, text, CONFIG, use_cuda, ap, use_gl, figures=True):\n", " t_1 = time.time()\n", - " waveform, alignment, mel_spec, mel_postnet_spec, stop_tokens = synthesis(model, text, CONFIG, use_cuda, ap, truncated=False, speaker_id=speaker_id, enable_eos_bos_chars=CONFIG.enable_eos_bos_chars)\n", + " waveform, alignment, mel_spec, mel_postnet_spec, stop_tokens = synthesis(model, text, CONFIG, use_cuda, ap, speaker_id, False, CONFIG.enable_eos_bos_chars)\n", " if CONFIG.model == \"Tacotron\" and not use_gl:\n", + " # coorect the normalization differences b/w TTS and the Vocoder.\n", " mel_postnet_spec = ap.out_linear_to_mel(mel_postnet_spec.T).T\n", + " mel_postnet_spec = ap._denormalize(mel_postnet_spec)\n", + " mel_postnet_spec = ap_vocoder._normalize(mel_postnet_spec)\n", " if not use_gl:\n", - " waveform = wavernn.generate(torch.FloatTensor(mel_postnet_spec.T).unsqueeze(0).cuda(), batched=batched_wavernn, target=11000, overlap=550)\n", + " waveform = wavernn.generate(torch.FloatTensor(mel_postnet_spec.T).unsqueeze(0).cuda(), batched=batched_wavernn, target=8000, overlap=400)\n", "\n", " print(\" > Run-time: {}\".format(time.time() - t_1))\n", " if figures: \n", @@ -117,31 +103,18 @@ }, { "cell_type": "code", - "execution_count": 9, + 
"execution_count": null, "metadata": {}, - "outputs": [ - { - "ename": "FileNotFoundError", - "evalue": "[Errno 2] No such file or directory: '/media/erogol/data_ssd/Data/models/wavernn/mozilla/mozilla-May24-4763/config.json'", - "output_type": "error", - "traceback": [ - "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", - "\u001b[0;31mFileNotFoundError\u001b[0m Traceback (most recent call last)", - "\u001b[0;32m\u001b[0m in \u001b[0;36m\u001b[0;34m\u001b[0m\n\u001b[1;32m 7\u001b[0m \u001b[0mVOCODER_MODEL_PATH\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0;34m\"/media/erogol/data_ssd/Data/models/wavernn/mozilla/mozilla-May24-4763/model_checkpoints/best_model.pth.tar\"\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 8\u001b[0m \u001b[0mVOCODER_CONFIG_PATH\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0;34m\"/media/erogol/data_ssd/Data/models/wavernn/mozilla/mozilla-May24-4763/config.json\"\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m----> 9\u001b[0;31m \u001b[0mVOCODER_CONFIG\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mload_config\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mVOCODER_CONFIG_PATH\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 10\u001b[0m \u001b[0muse_cuda\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0;32mFalse\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 11\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n", - "\u001b[0;32m~/projects/TTS/tts_namespace/TTS/utils/generic_utils.py\u001b[0m in \u001b[0;36mload_config\u001b[0;34m(config_path)\u001b[0m\n\u001b[1;32m 20\u001b[0m \u001b[0;32mdef\u001b[0m \u001b[0mload_config\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mconfig_path\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 21\u001b[0m \u001b[0mconfig\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mAttrDict\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m---> 22\u001b[0;31m \u001b[0;32mwith\u001b[0m \u001b[0mopen\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mconfig_path\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m\"r\"\u001b[0m\u001b[0;34m)\u001b[0m \u001b[0;32mas\u001b[0m \u001b[0mf\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 23\u001b[0m \u001b[0minput_str\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mf\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mread\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 24\u001b[0m \u001b[0minput_str\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mre\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0msub\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34mr'\\\\\\n'\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m''\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0minput_str\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", - "\u001b[0;31mFileNotFoundError\u001b[0m: [Errno 2] No such file or directory: '/media/erogol/data_ssd/Data/models/wavernn/mozilla/mozilla-May24-4763/config.json'" - ] - } - ], + "outputs": [], "source": [ "# Set constants\n", - "ROOT_PATH = '/media/erogol/data_ssd/Models/libri_tts/5049/'\n", - "MODEL_PATH = ROOT_PATH + 'best_model.pth.tar'\n", + "ROOT_PATH = '/media/erogol/data_ssd/Models/libri_tts/5099/'\n", + "MODEL_PATH = ROOT_PATH + '/best_model.pth.tar'\n", "CONFIG_PATH = ROOT_PATH + '/config.json'\n", - "OUT_FOLDER = 
\"/home/erogol/Dropbox/AudioSamples/benchmark_samples/\"\n", + "OUT_FOLDER = '/home/erogol/Dropbox/AudioSamples/benchmark_samples/'\n", "CONFIG = load_config(CONFIG_PATH)\n", - "VOCODER_MODEL_PATH = \"/media/erogol/data_ssd/Data/models/wavernn/mozilla/mozilla-May24-4763/model_checkpoints/best_model.pth.tar\"\n", - "VOCODER_CONFIG_PATH = \"/media/erogol/data_ssd/Data/models/wavernn/mozilla/mozilla-May24-4763/config.json\"\n", + "VOCODER_MODEL_PATH = \"/media/erogol/data_ssd/Models/wavernn/ljspeech/mold_ljspeech_best_model/checkpoint_433000.pth.tar\"\n", + "VOCODER_CONFIG_PATH = \"/media/erogol/data_ssd/Models/wavernn/ljspeech/mold_ljspeech_best_model/config.json\"\n", "VOCODER_CONFIG = load_config(VOCODER_CONFIG_PATH)\n", "use_cuda = False\n", "\n", @@ -149,10 +122,12 @@ "# CONFIG.windowing = False\n", "# CONFIG.prenet_dropout = False\n", "# CONFIG.separate_stopnet = True\n", + "CONFIG.use_forward_attn = True\n", + "# CONFIG.forward_attn_mask = True\n", "# CONFIG.stopnet = True\n", "\n", "# Set the vocoder\n", - "use_gl = True # use GL if True\n", + "use_gl = False # use GL if True\n", "batched_wavernn = True # use batched wavernn inference if True" ] }, @@ -165,9 +140,17 @@ "# LOAD TTS MODEL\n", "from utils.text.symbols import symbols, phonemes\n", "\n", + "# multi speaker \n", + "if CONFIG.use_speaker_embedding:\n", + " speakers = json.load(open(f\"{ROOT_PATH}/speakers.json\", 'r'))\n", + " speakers_idx_to_id = {v: k for k, v in speakers.items()}\n", + "else:\n", + " speakers = []\n", + " speaker_id = None\n", + "\n", "# load the model\n", "num_chars = len(phonemes) if CONFIG.use_phonemes else len(symbols)\n", - "model = setup_model(num_chars, CONFIG)\n", + "model = setup_model(num_chars, len(speakers), CONFIG)\n", "\n", "# load the audio processor\n", "ap = AudioProcessor(**CONFIG.audio) \n", @@ -184,7 +167,12 @@ "if use_cuda:\n", " model.cuda()\n", "model.eval()\n", - "print(cp['step'])" + "print(cp['step'])\n", + "print(cp['r'])\n", + "\n", + "# set model stepsize\n", + "if 'r' in cp:\n", + " model.decoder.set_r(cp['r'])" ] }, { @@ -196,25 +184,28 @@ "# LOAD WAVERNN\n", "if use_gl == False:\n", " from WaveRNN.models.wavernn import Model\n", + " from WaveRNN.utils.audio import AudioProcessor as AudioProcessorVocoder\n", " bits = 10\n", - "\n", + " ap_vocoder = AudioProcessorVocoder(**VOCODER_CONFIG.audio) \n", " wavernn = Model(\n", " rnn_dims=512,\n", " fc_dims=512,\n", - " mode=\"mold\",\n", - " pad=2,\n", - " upsample_factors=VOCODER_CONFIG.upsample_factors, # set this depending on dataset\n", + " mode=VOCODER_CONFIG.mode,\n", + " mulaw=VOCODER_CONFIG.mulaw,\n", + " pad=VOCODER_CONFIG.pad,\n", + " upsample_factors=VOCODER_CONFIG.upsample_factors,\n", " feat_dims=VOCODER_CONFIG.audio[\"num_mels\"],\n", " compute_dims=128,\n", " res_out_dims=128,\n", " res_blocks=10,\n", - " hop_length=ap.hop_length,\n", - " sample_rate=ap.sample_rate,\n", + " hop_length=ap_vocoder.hop_length,\n", + " sample_rate=ap_vocoder.sample_rate,\n", + " use_upsample_net = True,\n", + " use_aux_net = True\n", " ).cuda()\n", "\n", - "\n", " check = torch.load(VOCODER_MODEL_PATH)\n", - " wavernn.load_state_dict(check['model'])\n", + " wavernn.load_state_dict(check['model'], strict=False)\n", " if use_cuda:\n", " wavernn.cuda()\n", " wavernn.eval();\n", @@ -230,111 +221,67 @@ }, { "cell_type": "code", - "execution_count": 5, + "execution_count": null, "metadata": {}, - "outputs": [ - { - "ename": "NameError", - "evalue": "name 'model' is not defined", - "output_type": "error", - "traceback": [ - 
"\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", - "\u001b[0;31mNameError\u001b[0m Traceback (most recent call last)", - "\u001b[0;32m\u001b[0m in \u001b[0;36m\u001b[0;34m\u001b[0m\n\u001b[0;32m----> 1\u001b[0;31m \u001b[0mmodel\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0meval\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 2\u001b[0m \u001b[0mmodel\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mdecoder\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mmax_decoder_steps\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0;36m2000\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 3\u001b[0m \u001b[0mspeaker_id\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0;36m0\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 4\u001b[0m \u001b[0msentence\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0;34m\"Bill got in the habit of asking himself “Is that thought true?” And if he wasn’t absolutely certain it was, he just let it go.\"\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 5\u001b[0m \u001b[0malign\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mspec\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mstop_tokens\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mwav\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mtts\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mmodel\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0msentence\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mCONFIG\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0muse_cuda\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0map\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mspeaker_id\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mspeaker_id\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0muse_gl\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0muse_gl\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mfigures\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;32mTrue\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", - "\u001b[0;31mNameError\u001b[0m: name 'model' is not defined" - ] - } - ], + "outputs": [], "source": [ "model.eval()\n", "model.decoder.max_decoder_steps = 2000\n", - "speaker_id = 0\n", - "sentence = \"Bill got in the habit of asking himself “Is that thought true?” And if he wasn’t absolutely certain it was, he just let it go.\"\n", - "align, spec, stop_tokens, wav = tts(model, sentence, CONFIG, use_cuda, ap, speaker_id=speaker_id, use_gl=use_gl, figures=True)" - ] - }, - { - "cell_type": "code", - "execution_count": 6, - "metadata": { - "scrolled": true - }, - "outputs": [ - { - "ename": "NameError", - "evalue": "name 'model' is not defined", - "output_type": "error", - "traceback": [ - "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", - "\u001b[0;31mNameError\u001b[0m Traceback (most recent call last)", - "\u001b[0;32m\u001b[0m in \u001b[0;36m\u001b[0;34m\u001b[0m\n\u001b[1;32m 1\u001b[0m \u001b[0msentence\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0;34m\"Be a voice, not an echo.\"\u001b[0m \u001b[0;31m# 'echo' is not in training set.\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m----> 2\u001b[0;31m \u001b[0malign\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mspec\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mstop_tokens\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mwav\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mtts\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mmodel\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0msentence\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mCONFIG\u001b[0m\u001b[0;34m,\u001b[0m 
\u001b[0muse_cuda\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0map\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mspeaker_id\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mspeaker_id\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0muse_gl\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0muse_gl\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mfigures\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;32mTrue\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m", - "\u001b[0;31mNameError\u001b[0m: name 'model' is not defined" - ] - } - ], - "source": [ - "sentence = \"Be a voice, not an echo.\" # 'echo' is not in training set. \n", - "align, spec, stop_tokens, wav = tts(model, sentence, CONFIG, use_cuda, ap, speaker_id=speaker_id, use_gl=use_gl, figures=True)" - ] - }, - { - "cell_type": "code", - "execution_count": 7, - "metadata": {}, - "outputs": [ - { - "ename": "NameError", - "evalue": "name 'model' is not defined", - "output_type": "error", - "traceback": [ - "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", - "\u001b[0;31mNameError\u001b[0m Traceback (most recent call last)", - "\u001b[0;32m\u001b[0m in \u001b[0;36m\u001b[0;34m\u001b[0m\n\u001b[1;32m 1\u001b[0m \u001b[0msentence\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0;34m\"The human voice is the most perfect instrument of all.\"\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m----> 2\u001b[0;31m \u001b[0malign\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mspec\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mstop_tokens\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mwav\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mtts\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mmodel\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0msentence\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mCONFIG\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0muse_cuda\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0map\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mspeaker_id\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mspeaker_id\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0muse_gl\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0muse_gl\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mfigures\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;32mTrue\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m", - "\u001b[0;31mNameError\u001b[0m: name 'model' is not defined" - ] - } - ], - "source": [ - "sentence = \"The human voice is the most perfect instrument of all.\"\n", - "align, spec, stop_tokens, wav = tts(model, sentence, CONFIG, use_cuda, ap, speaker_id=speaker_id, use_gl=use_gl, figures=True)" - ] - }, - { - "cell_type": "code", - "execution_count": 8, - "metadata": {}, - "outputs": [ - { - "ename": "NameError", - "evalue": "name 'model' is not defined", - "output_type": "error", - "traceback": [ - "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", - "\u001b[0;31mNameError\u001b[0m Traceback (most recent call last)", - "\u001b[0;32m\u001b[0m in \u001b[0;36m\u001b[0;34m\u001b[0m\n\u001b[1;32m 1\u001b[0m \u001b[0msentence\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0;34m\"I'm sorry Dave. 
I'm afraid I can't do that.\"\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m----> 2\u001b[0;31m \u001b[0malign\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mspec\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mstop_tokens\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mwav\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mtts\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mmodel\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0msentence\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mCONFIG\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0muse_cuda\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0map\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mspeaker_id\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mspeaker_id\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0muse_gl\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0muse_gl\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mfigures\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;32mTrue\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m", - "\u001b[0;31mNameError\u001b[0m: name 'model' is not defined" - ] - } - ], - "source": [ - "sentence = \"I'm sorry Dave. I'm afraid I can't do that.\"\n", - "align, spec, stop_tokens, wav = tts(model, sentence, CONFIG, use_cuda, ap, speaker_id=speaker_id, use_gl=use_gl, figures=True)" + "speaker_id = None\n", + "sentence = \"Bill got in the habit of asking himself “Is that thought true?” and if he wasn’t absolutely certain it was, he just let it go.\"\n", + "align, spec, stop_tokens, wav = tts(model, sentence, CONFIG, use_cuda, ap, use_gl=use_gl, figures=True)" ] }, { "cell_type": "code", "execution_count": null, - "metadata": { - "collapsed": true, - "jupyter": { - "outputs_hidden": true - }, - "scrolled": true - }, + "metadata": {}, + "outputs": [], + "source": [ + "model.eval()\n", + "model.decoder.max_decoder_steps = 2000\n", + "sentence = \"Bill got in the habit of asking himself “Is that thought true?” and if he wasn’t absolutely certain it was, he just let it go.\"\n", + "align, spec, stop_tokens, wav = tts(model, sentence, CONFIG, use_cuda, ap, use_gl=use_gl, figures=True)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "sentence = \"Be a voice, not an echo.\" # 'echo' is not in training set. \n", + "align, spec, stop_tokens, wav = tts(model, sentence, CONFIG, use_cuda, ap, use_gl=use_gl, figures=True)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "sentence = \"The human voice is the most perfect instrument of all.\"\n", + "align, spec, stop_tokens, wav = tts(model, sentence, CONFIG, use_cuda, ap, use_gl=use_gl, figures=True)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "sentence = \"I'm sorry Dave. I'm afraid I can't do that.\"\n", + "align, spec, stop_tokens, wav = tts(model, sentence, CONFIG, use_cuda, ap, use_gl=use_gl, figures=True)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, "outputs": [], "source": [ "sentence = \"This cake is great. 
It's so delicious and moist.\"\n", - "align, spec, stop_tokens, wav = tts(model, sentence, CONFIG, use_cuda, ap, speaker_id=speaker_id, use_gl=use_gl, figures=True)" + "align, spec, stop_tokens, wav = tts(model, sentence, CONFIG, use_cuda, ap, use_gl=use_gl, figures=True)" ] }, { @@ -347,76 +294,51 @@ { "cell_type": "code", "execution_count": null, - "metadata": { - "collapsed": true, - "jupyter": { - "outputs_hidden": true - } - }, + "metadata": {}, "outputs": [], "source": [ "sentence = \"Generative adversarial network or variational auto-encoder.\"\n", - "align, spec, stop_tokens, wav = tts(model, sentence, CONFIG, use_cuda, ap, speaker_id=speaker_id, use_gl=use_gl, figures=True)" + "align, spec, stop_tokens, wav = tts(model, sentence, CONFIG, use_cuda, ap, use_gl=use_gl, figures=True)" ] }, { "cell_type": "code", "execution_count": null, - "metadata": { - "collapsed": true, - "jupyter": { - "outputs_hidden": true - } - }, + "metadata": {}, "outputs": [], "source": [ "sentence = \"Scientists at the CERN laboratory say they have discovered a new particle.\"\n", - "align, spec, stop_tokens, wav = tts(model, sentence, CONFIG, use_cuda, ap, speaker_id=speaker_id, use_gl=use_gl, figures=True)" + "align, spec, stop_tokens, wav = tts(model, sentence, CONFIG, use_cuda, ap, use_gl=use_gl, figures=True)" ] }, { "cell_type": "code", "execution_count": null, - "metadata": { - "collapsed": true, - "jupyter": { - "outputs_hidden": true - } - }, + "metadata": {}, "outputs": [], "source": [ "sentence = \"Here’s a way to measure the acute emotional intelligence that has never gone out of style.\"\n", - "align, spec, stop_tokens, wav = tts(model, sentence, CONFIG, use_cuda, ap, speaker_id=speaker_id, use_gl=use_gl, figures=True)" + "align, spec, stop_tokens, wav = tts(model, sentence, CONFIG, use_cuda, ap, use_gl=use_gl, figures=True)" ] }, { "cell_type": "code", "execution_count": null, - "metadata": { - "collapsed": true, - "jupyter": { - "outputs_hidden": true - } - }, + "metadata": {}, "outputs": [], "source": [ "sentence = \"President Trump met with other leaders at the Group of 20 conference.\"\n", - "align, spec, stop_tokens, wav = tts(model, sentence, CONFIG, use_cuda, ap, speaker_id=speaker_id, use_gl=use_gl, figures=True)" + "align, spec, stop_tokens, wav = tts(model, sentence, CONFIG, use_cuda, ap, use_gl=use_gl, figures=True)" ] }, { "cell_type": "code", "execution_count": null, - "metadata": { - "collapsed": true, - "jupyter": { - "outputs_hidden": true - } - }, + "metadata": {}, "outputs": [], "source": [ "sentence = \"The buses aren't the problem, they actually provide a solution.\"\n", - "align, spec, stop_tokens, wav = tts(model, sentence, CONFIG, use_cuda, ap, speaker_id=speaker_id, use_gl=use_gl, figures=True)" + "align, spec, stop_tokens, wav = tts(model, sentence, CONFIG, use_cuda, ap, use_gl=use_gl, figures=True)" ] }, { @@ -429,136 +351,91 @@ { "cell_type": "code", "execution_count": null, - "metadata": { - "collapsed": true, - "jupyter": { - "outputs_hidden": true - } - }, + "metadata": {}, "outputs": [], "source": [ "sentence = \"Generative adversarial network or variational auto-encoder.\"\n", - "align, spec, stop_tokens, wav = tts(model, sentence, CONFIG, use_cuda, ap, speaker_id=speaker_id, use_gl=use_gl, figures=True)" + "align, spec, stop_tokens, wav = tts(model, sentence, CONFIG, use_cuda, ap, use_gl=use_gl, figures=True)" ] }, { "cell_type": "code", "execution_count": null, - "metadata": { - "collapsed": true, - "jupyter": { - "outputs_hidden": true - } - }, + "metadata": 
{}, "outputs": [], "source": [ "sentence = \"Basilar membrane and otolaryngology are not auto-correlations.\"\n", - "align, spec, stop_tokens, wav = tts(model, sentence, CONFIG, use_cuda, ap, speaker_id=speaker_id, use_gl=use_gl, figures=True)" + "align, spec, stop_tokens, wav = tts(model, sentence, CONFIG, use_cuda, ap, use_gl=use_gl, figures=True)" ] }, { "cell_type": "code", "execution_count": null, - "metadata": { - "collapsed": true, - "jupyter": { - "outputs_hidden": true - } - }, + "metadata": {}, "outputs": [], "source": [ "sentence = \" He has read the whole thing.\"\n", - "align, spec, stop_tokens, wav = tts(model, sentence, CONFIG, use_cuda, ap, speaker_id=speaker_id, use_gl=use_gl, figures=True)" + "align, spec, stop_tokens, wav = tts(model, sentence, CONFIG, use_cuda, ap, use_gl=use_gl, figures=True)" ] }, { "cell_type": "code", "execution_count": null, - "metadata": { - "collapsed": true, - "jupyter": { - "outputs_hidden": true - } - }, + "metadata": {}, "outputs": [], "source": [ "sentence = \"He reads books.\"\n", - "align, spec, stop_tokens, wav = tts(model, sentence, CONFIG, use_cuda, ap, speaker_id=speaker_id, use_gl=use_gl, figures=True)" + "align, spec, stop_tokens, wav = tts(model, sentence, CONFIG, use_cuda, ap, use_gl=use_gl, figures=True)" ] }, { "cell_type": "code", "execution_count": null, - "metadata": { - "collapsed": true, - "jupyter": { - "outputs_hidden": true - } - }, + "metadata": {}, "outputs": [], "source": [ "sentence = \"Thisss isrealy awhsome.\"\n", - "align, spec, stop_tokens, wav = tts(model, sentence, CONFIG, use_cuda, ap, speaker_id=speaker_id, use_gl=use_gl, figures=True)" + "align, spec, stop_tokens, wav = tts(model, sentence, CONFIG, use_cuda, ap, use_gl=use_gl, figures=True)" ] }, { "cell_type": "code", "execution_count": null, - "metadata": { - "collapsed": true, - "jupyter": { - "outputs_hidden": true - } - }, + "metadata": {}, "outputs": [], "source": [ "sentence = \"This is your internet browser, Firefox.\"\n", - "align, spec, stop_tokens, wav = tts(model, sentence, CONFIG, use_cuda, ap, speaker_id=speaker_id, use_gl=use_gl, figures=True)" + "align, spec, stop_tokens, wav = tts(model, sentence, CONFIG, use_cuda, ap, use_gl=use_gl, figures=True)" ] }, { "cell_type": "code", "execution_count": null, - "metadata": { - "collapsed": true, - "jupyter": { - "outputs_hidden": true - } - }, + "metadata": {}, "outputs": [], "source": [ "sentence = \"This is your internet browser Firefox.\"\n", - "align, spec, stop_tokens, wav = tts(model, sentence, CONFIG, use_cuda, ap, speaker_id=speaker_id, use_gl=use_gl, figures=True)" + "align, spec, stop_tokens, wav = tts(model, sentence, CONFIG, use_cuda, ap, use_gl=use_gl, figures=True)" ] }, { "cell_type": "code", "execution_count": null, - "metadata": { - "collapsed": true, - "jupyter": { - "outputs_hidden": true - } - }, + "metadata": {}, "outputs": [], "source": [ "sentence = \"The quick brown fox jumps over the lazy dog.\"\n", - "align, spec, stop_tokens, wav = tts(model, sentence, CONFIG, use_cuda, ap, speaker_id=speaker_id, use_gl=use_gl, figures=True)" + "align, spec, stop_tokens, wav = tts(model, sentence, CONFIG, use_cuda, ap, use_gl=use_gl, figures=True)" ] }, { "cell_type": "code", "execution_count": null, - "metadata": { - "collapsed": true, - "jupyter": { - "outputs_hidden": true - } - }, + "metadata": {}, "outputs": [], "source": [ "sentence = \"Does the quick brown fox jump over the lazy dog?\"\n", - "align, spec, stop_tokens, wav = tts(model, sentence, CONFIG, use_cuda, ap, 
speaker_id=speaker_id, use_gl=use_gl, figures=True)" + "align, spec, stop_tokens, wav = tts(model, sentence, CONFIG, use_cuda, ap, use_gl=use_gl, figures=True)" ] }, { @@ -568,7 +445,7 @@ "outputs": [], "source": [ "sentence = \"Eren, how are you?\"\n", - "align, spec, stop_tokens, wav = tts(model, sentence, CONFIG, use_cuda, ap, speaker_id=speaker_id, use_gl=use_gl, figures=True)" + "align, spec, stop_tokens, wav = tts(model, sentence, CONFIG, use_cuda, ap, use_gl=use_gl, figures=True)" ] }, { @@ -581,107 +458,62 @@ { "cell_type": "code", "execution_count": null, - "metadata": { - "collapsed": true, - "jupyter": { - "outputs_hidden": true - } - }, + "metadata": {}, "outputs": [], "source": [ "sentence = \"Encouraged, he started with a minute a day.\"\n", - "align, spec, stop_tokens, wav = tts(model, sentence, CONFIG, use_cuda, ap, speaker_id=speaker_id, use_gl=use_gl, figures=True)" + "align, spec, stop_tokens, wav = tts(model, sentence, CONFIG, use_cuda, ap, use_gl=use_gl, figures=True)" ] }, { "cell_type": "code", "execution_count": null, - "metadata": { - "collapsed": true, - "jupyter": { - "outputs_hidden": true - } - }, + "metadata": {}, "outputs": [], "source": [ "sentence = \"His meditation consisted of “body scanning” which involved focusing his mind and energy on each section of the body from head to toe .\"\n", - "align, spec, stop_tokens, wav = tts(model, sentence, CONFIG, use_cuda, ap, speaker_id=speaker_id, use_gl=use_gl, figures=True)" + "align, spec, stop_tokens, wav = tts(model, sentence, CONFIG, use_cuda, ap, use_gl=use_gl, figures=True)" ] }, { "cell_type": "code", "execution_count": null, - "metadata": { - "collapsed": true, - "jupyter": { - "outputs_hidden": true - } - }, + "metadata": {}, "outputs": [], "source": [ "sentence = \"Recent research at Harvard has shown meditating for as little as 8 weeks can actually increase the grey matter in the parts of the brain responsible for emotional regulation and learning . 
\"\n", - "align, spec, stop_tokens, wav = tts(model, sentence, CONFIG, use_cuda, ap, speaker_id=speaker_id, use_gl=use_gl, figures=True)" + "align, spec, stop_tokens, wav = tts(model, sentence, CONFIG, use_cuda, ap, use_gl=use_gl, figures=True)" ] }, { "cell_type": "code", "execution_count": null, - "metadata": { - "collapsed": true, - "jupyter": { - "outputs_hidden": true - } - }, + "metadata": {}, "outputs": [], "source": [ "sentence = \"If he decided to watch TV he really watched it.\"\n", - "align, spec, stop_tokens, wav = tts(model, sentence, CONFIG, use_cuda, ap, speaker_id=speaker_id, use_gl=use_gl, figures=True)" + "align, spec, stop_tokens, wav = tts(model, sentence, CONFIG, use_cuda, ap, use_gl=use_gl, figures=True)" ] }, { "cell_type": "code", "execution_count": null, - "metadata": { - "collapsed": true, - "jupyter": { - "outputs_hidden": true - }, - "scrolled": true - }, + "metadata": {}, "outputs": [], "source": [ "sentence = \"Often we try to bring about change through sheer effort and we put all of our energy into a new initiative .\"\n", - "align, spec, stop_tokens, wav = tts(model, sentence, CONFIG, use_cuda, ap, speaker_id=speaker_id, use_gl=use_gl, figures=True)" + "align, spec, stop_tokens, wav = tts(model, sentence, CONFIG, use_cuda, ap, use_gl=use_gl, figures=True)" ] }, { "cell_type": "code", "execution_count": null, - "metadata": { - "collapsed": true, - "jupyter": { - "outputs_hidden": true - } - }, + "metadata": {}, "outputs": [], "source": [ "# for twb dataset\n", "sentence = \"In our preparation for Easter, God in his providence offers us each year the season of Lent as a sacramental sign of our conversion.\"\n", - "align, spec, stop_tokens, wav = tts(model, sentence, CONFIG, use_cuda, ap, speaker_id=speaker_id, use_gl=use_gl, figures=True)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "collapsed": true, - "jupyter": { - "outputs_hidden": true - } - }, - "outputs": [], - "source": [ - "# !zip benchmark_samples/samples.zip benchmark_samples/*" + "align, spec, stop_tokens, wav = tts(model, sentence, CONFIG, use_cuda, ap, use_gl=use_gl, figures=True)" ] } ], From 23f6743ac9cf869679df8fa57faa62681ed3ebb9 Mon Sep 17 00:00:00 2001 From: Eren Golge Date: Tue, 24 Sep 2019 17:19:04 +0200 Subject: [PATCH 17/35] fix synthesize.py --- synthesize.py | 25 ++++++++++++++++++++++--- 1 file changed, 22 insertions(+), 3 deletions(-) diff --git a/synthesize.py b/synthesize.py index 23c67c73..a0bf6be6 100644 --- a/synthesize.py +++ b/synthesize.py @@ -2,6 +2,7 @@ import os import time import argparse import torch +import json import string from TTS.utils.synthesis import synthesis @@ -16,22 +17,27 @@ def tts(model, VC, text, ap, + ap_vocoder, use_cuda, batched_vocoder, + speaker_id=None, figures=False): t_1 = time.time() use_vocoder_model = vocoder_model is not None waveform, alignment, decoder_outputs, postnet_output, stop_tokens = synthesis( - model, text, C, use_cuda, ap, False, C.enable_eos_bos_chars) + model, text, C, use_cuda, ap, speaker_id, False, C.enable_eos_bos_chars) if C.model == "Tacotron" and use_vocoder_model: postnet_output = ap.out_linear_to_mel(postnet_output.T).T + # correct if there is a scale difference b/w two models + postnet_output = ap._denormalize(postnet_output) + postnet_output = ap_vocoder._normalize(postnet_output) if use_vocoder_model: vocoder_input = torch.FloatTensor(postnet_output.T).unsqueeze(0) waveform = vocoder_model.generate( vocoder_input.cuda() if use_cuda else vocoder_input, batched=batched_vocoder, - 
target=11000, - overlap=550) + target=8000, + overlap=400) print(" > Run-time: {}".format(time.time() - t_1)) return alignment, postnet_output, stop_tokens, waveform @@ -81,6 +87,12 @@ if __name__ == "__main__": help="JSON file for multi-speaker model.", default="" ) + parser.add_argument( + '--speaker_id', + type=int, + help="target speaker_id if the model is multi-speaker.", + default=None + ) args = parser.parse_args() if args.vocoder_path != "": @@ -109,10 +121,12 @@ if __name__ == "__main__": model.eval() if args.use_cuda: model.cuda() + model.decoder.set_r(cp['r']) # load vocoder model if args.vocoder_path != "": VC = load_config(args.vocoder_config_path) + ap_vocoder = AudioProcessor(**VC.audio) bits = 10 vocoder_model = VocoderModel( rnn_dims=512, @@ -127,6 +141,8 @@ if __name__ == "__main__": res_blocks=10, hop_length=ap.hop_length, sample_rate=ap.sample_rate, + use_aux_net=True, + use_upsample_net=True ) check = torch.load(args.vocoder_path) @@ -137,6 +153,7 @@ if __name__ == "__main__": else: vocoder_model = None VC = None + ap_vocoder = None # synthesize voice print(" > Text: {}".format(args.text)) @@ -147,8 +164,10 @@ if __name__ == "__main__": VC, args.text, ap, + ap_vocoder, args.use_cuda, args.batched_vocoder, + speaker_id=args.speaker_id, figures=False) # save the results From 53d658fb74c06e16ba041fbe73bb75b19c268aef Mon Sep 17 00:00:00 2001 From: Eren Golge Date: Tue, 24 Sep 2019 17:20:01 +0200 Subject: [PATCH 18/35] formatting --- synthesize.py | 102 ++++++++++++++++++++++++-------------------------- 1 file changed, 48 insertions(+), 54 deletions(-) diff --git a/synthesize.py b/synthesize.py index a0bf6be6..cb0ee8af 100644 --- a/synthesize.py +++ b/synthesize.py @@ -20,12 +20,13 @@ def tts(model, ap_vocoder, use_cuda, batched_vocoder, - speaker_id=None, + speaker_id=None, figures=False): t_1 = time.time() use_vocoder_model = vocoder_model is not None - waveform, alignment, decoder_outputs, postnet_output, stop_tokens = synthesis( - model, text, C, use_cuda, ap, speaker_id, False, C.enable_eos_bos_chars) + waveform, alignment, _, postnet_output, stop_tokens = synthesis( + model, text, C, use_cuda, ap, speaker_id, False, + C.enable_eos_bos_chars) if C.model == "Tacotron" and use_vocoder_model: postnet_output = ap.out_linear_to_mel(postnet_output.T).T # correct if there is a scale difference b/w two models @@ -45,13 +46,10 @@ def tts(model, if __name__ == "__main__": parser = argparse.ArgumentParser() - parser.add_argument( - 'text', type=str, help='Text to generate speech.') - parser.add_argument( - 'config_path', - type=str, - help='Path to model config file.' - ) + parser.add_argument('text', type=str, help='Text to generate speech.') + parser.add_argument('config_path', + type=str, + help='Path to model config file.') parser.add_argument( 'model_path', type=str, @@ -62,8 +60,10 @@ if __name__ == "__main__": type=str, help='Path to save final wav file.', ) - parser.add_argument( - '--use_cuda', type=bool, help='Run model on CUDA.', default=False) + parser.add_argument('--use_cuda', + type=bool, + help='Run model on CUDA.', + default=False) parser.add_argument( '--vocoder_path', type=str, @@ -71,28 +71,24 @@ if __name__ == "__main__": 'Path to vocoder model file. If it is not defined, model uses GL as vocoder. 
Please make sure that you installed vocoder library before (WaveRNN).', default="", ) - parser.add_argument( - '--vocoder_config_path', - type=str, - help='Path to vocoder model config file.', - default="") + parser.add_argument('--vocoder_config_path', + type=str, + help='Path to vocoder model config file.', + default="") parser.add_argument( '--batched_vocoder', type=bool, help="If True, vocoder model uses faster batch processing.", default=True) - parser.add_argument( - '--speakers_json', - type=str, - help="JSON file for multi-speaker model.", - default="" - ) + parser.add_argument('--speakers_json', + type=str, + help="JSON file for multi-speaker model.", + default="") parser.add_argument( '--speaker_id', type=int, help="target speaker_id if the model is multi-speaker.", - default=None - ) + default=None) args = parser.parse_args() if args.vocoder_path != "": @@ -128,22 +124,20 @@ if __name__ == "__main__": VC = load_config(args.vocoder_config_path) ap_vocoder = AudioProcessor(**VC.audio) bits = 10 - vocoder_model = VocoderModel( - rnn_dims=512, - fc_dims=512, - mode=VC.mode, - mulaw=VC.mulaw, - pad=VC.pad, - upsample_factors=VC.upsample_factors, - feat_dims=VC.audio["num_mels"], - compute_dims=128, - res_out_dims=128, - res_blocks=10, - hop_length=ap.hop_length, - sample_rate=ap.sample_rate, - use_aux_net=True, - use_upsample_net=True - ) + vocoder_model = VocoderModel(rnn_dims=512, + fc_dims=512, + mode=VC.mode, + mulaw=VC.mulaw, + pad=VC.pad, + upsample_factors=VC.upsample_factors, + feat_dims=VC.audio["num_mels"], + compute_dims=128, + res_out_dims=128, + res_blocks=10, + hop_length=ap.hop_length, + sample_rate=ap.sample_rate, + use_aux_net=True, + use_upsample_net=True) check = torch.load(args.vocoder_path) vocoder_model.load_state_dict(check['model']) @@ -157,22 +151,22 @@ if __name__ == "__main__": # synthesize voice print(" > Text: {}".format(args.text)) - _, _, _, wav = tts( - model, - vocoder_model, - C, - VC, - args.text, - ap, - ap_vocoder, - args.use_cuda, - args.batched_vocoder, - speaker_id=args.speaker_id, - figures=False) + _, _, _, wav = tts(model, + vocoder_model, + C, + VC, + args.text, + ap, + ap_vocoder, + args.use_cuda, + args.batched_vocoder, + speaker_id=args.speaker_id, + figures=False) # save the results file_name = args.text.replace(" ", "_") - file_name = file_name.translate(str.maketrans('', '', string.punctuation.replace('_', '')))+'.wav' + file_name = file_name.translate( + str.maketrans('', '', string.punctuation.replace('_', ''))) + '.wav' out_path = os.path.join(args.out_path, file_name) print(" > Saving output to {}".format(out_path)) ap.save_wav(wav, out_path) From b76aaf8ad46a67da06b94fc86129f3dd426a32df Mon Sep 17 00:00:00 2001 From: Eren Golge Date: Sat, 28 Sep 2019 01:09:28 +0200 Subject: [PATCH 19/35] skip weight decay for BN and biases, some formatting --- train.py | 10 ++-- utils/generic_utils.py | 107 +++++++++++++++++++++++++---------------- 2 files changed, 72 insertions(+), 45 deletions(-) diff --git a/train.py b/train.py index 13444c82..7a68e2b0 100644 --- a/train.py +++ b/train.py @@ -20,7 +20,8 @@ from TTS.utils.generic_utils import (NoamLR, check_update, count_parameters, load_config, remove_experiment_folder, save_best_model, save_checkpoint, weight_decay, set_init_dict, copy_config_file, setup_model, - split_dataset, gradual_training_scheduler, KeepAverage) + split_dataset, gradual_training_scheduler, KeepAverage, + set_weight_decay) from TTS.utils.logger import Logger from TTS.utils.speakers import load_speaker_mapping, 
save_speaker_mapping, \ get_speakers @@ -186,7 +187,7 @@ def train(model, criterion, criterion_st, optimizer, optimizer_st, scheduler, loss += stop_loss loss.backward() - optimizer, current_lr = weight_decay(optimizer, c.wd) + optimizer, current_lr = weight_decay(optimizer) grad_norm, _ = check_update(model, c.grad_clip) optimizer.step() @@ -197,7 +198,7 @@ def train(model, criterion, criterion_st, optimizer, optimizer_st, scheduler, # backpass and check the grad norm for stop loss if c.separate_stopnet: stop_loss.backward() - optimizer_st, _ = weight_decay(optimizer_st, c.wd) + optimizer_st, _ = weight_decay(optimizer_st) grad_norm_st, _ = check_update(model.decoder.stopnet, 1.0) optimizer_st.step() else: @@ -511,7 +512,8 @@ def main(args): # pylint: disable=redefined-outer-name print(" | > Num output units : {}".format(ap.num_freq), flush=True) - optimizer = RAdam(model.parameters(), lr=c.lr, weight_decay=0) + params = set_weight_decay(model, c.wd) + optimizer = RAdam(params, lr=c.lr, weight_decay=0) if c.stopnet and c.separate_stopnet: optimizer_st = RAdam( model.decoder.stopnet.parameters(), lr=c.lr, weight_decay=0) diff --git a/utils/generic_utils.py b/utils/generic_utils.py index bfa72a35..3cdf74bc 100644 --- a/utils/generic_utils.py +++ b/utils/generic_utils.py @@ -31,8 +31,8 @@ def load_config(config_path): def get_git_branch(): try: out = subprocess.check_output(["git", "branch"]).decode("utf8") - current = next(line for line in out.split( - "\n") if line.startswith("*")) + current = next(line for line in out.split("\n") + if line.startswith("*")) current.replace("* ", "") except subprocess.CalledProcessError: current = "inside_docker" @@ -48,8 +48,8 @@ def get_commit_hash(): # raise RuntimeError( # " !! Commit before training to get the commit hash.") try: - commit = subprocess.check_output(['git', 'rev-parse', '--short', - 'HEAD']).decode().strip() + commit = subprocess.check_output( + ['git', 'rev-parse', '--short', 'HEAD']).decode().strip() # Not copying .git folder into docker container except subprocess.CalledProcessError: commit = "0000000" @@ -169,17 +169,43 @@ def lr_decay(init_lr, global_step, warmup_steps): return lr -def weight_decay(optimizer, wd): +def weight_decay(optimizer): """ Custom weight decay operation, not effecting grad values. """ for group in optimizer.param_groups: for param in group['params']: current_lr = group['lr'] - param.data = param.data.add(-wd * group['lr'], param.data) + weight_decay = group['weight_decay'] + param.data = param.data.add(-weight_decay * group['lr'], + param.data) return optimizer, current_lr +def set_weight_decay(model, weight_decay, skip_list={"decoder.attention.v"}): + """ + Skip biases, BatchNorm parameters for weight decay + and attention projection layer v + """ + decay = [] + no_decay = [] + for name, param in model.named_parameters(): + if not param.requires_grad: + continue + if len(param.shape) == 1 or name in skip_list: + print(name) + no_decay.append(param) + else: + decay.append(param) + return [{ + 'params': no_decay, + 'weight_decay': 0. 
+ }, { + 'params': decay, + 'weight_decay': weight_decay + }] + + class NoamLR(torch.optim.lr_scheduler._LRScheduler): def __init__(self, optimizer, warmup_steps=0.1, last_epoch=-1): self.warmup_steps = float(warmup_steps) @@ -188,8 +214,8 @@ class NoamLR(torch.optim.lr_scheduler._LRScheduler): def get_lr(self): step = max(self.last_epoch, 1) return [ - base_lr * self.warmup_steps**0.5 * min( - step * self.warmup_steps**-1.5, step**-0.5) + base_lr * self.warmup_steps**0.5 * + min(step * self.warmup_steps**-1.5, step**-0.5) for base_lr in self.base_lrs ] @@ -244,8 +270,8 @@ def set_init_dict(model_dict, checkpoint, c): } # 4. overwrite entries in the existing state dict model_dict.update(pretrained_dict) - print(" | > {} / {} layers are restored.".format( - len(pretrained_dict), len(model_dict))) + print(" | > {} / {} layers are restored.".format(len(pretrained_dict), + len(model_dict))) return model_dict @@ -254,37 +280,35 @@ def setup_model(num_chars, num_speakers, c): MyModel = importlib.import_module('TTS.models.' + c.model.lower()) MyModel = getattr(MyModel, c.model) if c.model.lower() in "tacotron": - model = MyModel( - num_chars=num_chars, - num_speakers=num_speakers, - r=c.r, - linear_dim=1025, - mel_dim=80, - gst=c.use_gst, - memory_size=c.memory_size, - attn_win=c.windowing, - attn_norm=c.attention_norm, - prenet_type=c.prenet_type, - prenet_dropout=c.prenet_dropout, - forward_attn=c.use_forward_attn, - trans_agent=c.transition_agent, - forward_attn_mask=c.forward_attn_mask, - location_attn=c.location_attn, - separate_stopnet=c.separate_stopnet) + model = MyModel(num_chars=num_chars, + num_speakers=num_speakers, + r=c.r, + linear_dim=1025, + mel_dim=80, + gst=c.use_gst, + memory_size=c.memory_size, + attn_win=c.windowing, + attn_norm=c.attention_norm, + prenet_type=c.prenet_type, + prenet_dropout=c.prenet_dropout, + forward_attn=c.use_forward_attn, + trans_agent=c.transition_agent, + forward_attn_mask=c.forward_attn_mask, + location_attn=c.location_attn, + separate_stopnet=c.separate_stopnet) elif c.model.lower() == "tacotron2": - model = MyModel( - num_chars=num_chars, - num_speakers=num_speakers, - r=c.r, - attn_win=c.windowing, - attn_norm=c.attention_norm, - prenet_type=c.prenet_type, - prenet_dropout=c.prenet_dropout, - forward_attn=c.use_forward_attn, - trans_agent=c.transition_agent, - forward_attn_mask=c.forward_attn_mask, - location_attn=c.location_attn, - separate_stopnet=c.separate_stopnet) + model = MyModel(num_chars=num_chars, + num_speakers=num_speakers, + r=c.r, + attn_win=c.windowing, + attn_norm=c.attention_norm, + prenet_type=c.prenet_type, + prenet_dropout=c.prenet_dropout, + forward_attn=c.use_forward_attn, + trans_agent=c.transition_agent, + forward_attn_mask=c.forward_attn_mask, + location_attn=c.location_attn, + separate_stopnet=c.separate_stopnet) return model @@ -292,7 +316,8 @@ def split_dataset(items): is_multi_speaker = False speakers = [item[-1] for item in items] is_multi_speaker = len(set(speakers)) > 1 - eval_split_size = 500 if 500 < len(items) * 0.01 else int(len(items) * 0.01) + eval_split_size = 500 if 500 < len(items) * 0.01 else int( + len(items) * 0.01) np.random.seed(0) np.random.shuffle(items) if is_multi_speaker: From 8565c508e45ef52dd14887ef54b38ace8e2a1119 Mon Sep 17 00:00:00 2001 From: Eren Golge Date: Sat, 28 Sep 2019 01:11:04 +0200 Subject: [PATCH 20/35] remove debug line --- utils/generic_utils.py | 1 - 1 file changed, 1 deletion(-) diff --git a/utils/generic_utils.py b/utils/generic_utils.py index 3cdf74bc..983797ba 100644 --- 
a/utils/generic_utils.py +++ b/utils/generic_utils.py @@ -193,7 +193,6 @@ def set_weight_decay(model, weight_decay, skip_list={"decoder.attention.v"}): if not param.requires_grad: continue if len(param.shape) == 1 or name in skip_list: - print(name) no_decay.append(param) else: decay.append(param) From 99d7f2a666f61dba206036dc457b6946765f834c Mon Sep 17 00:00:00 2001 From: Eren Golge Date: Sat, 28 Sep 2019 15:31:18 +0200 Subject: [PATCH 21/35] update set_weight_decay --- utils/generic_utils.py | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/utils/generic_utils.py b/utils/generic_utils.py index 983797ba..3188067f 100644 --- a/utils/generic_utils.py +++ b/utils/generic_utils.py @@ -182,9 +182,9 @@ def weight_decay(optimizer): return optimizer, current_lr -def set_weight_decay(model, weight_decay, skip_list={"decoder.attention.v"}): +def set_weight_decay(model, weight_decay, skip_list={"decoder.attention.v", "rnn", "lstm", "gru", "embedding"}): """ - Skip biases, BatchNorm parameters for weight decay + Skip biases, BatchNorm parameters, rnns. and attention projection layer v """ decay = [] @@ -192,7 +192,8 @@ def set_weight_decay(model, weight_decay, skip_list={"decoder.attention.v"}): for name, param in model.named_parameters(): if not param.requires_grad: continue - if len(param.shape) == 1 or name in skip_list: + + if len(param.shape) == 1 or any([skip_name in name for skip_name in skip_list]): no_decay.append(param) else: decay.append(param) From acbafb456bdc460c5adeb7b1394d418f8f2f6758 Mon Sep 17 00:00:00 2001 From: Eren Golge Date: Sat, 28 Sep 2019 15:44:17 +0200 Subject: [PATCH 22/35] Weighting positive values for stopnet loss, change adam_weight_decay name --- train.py | 8 ++++---- utils/generic_utils.py | 6 +++--- 2 files changed, 7 insertions(+), 7 deletions(-) diff --git a/train.py b/train.py index 7a68e2b0..cbcfb1ec 100644 --- a/train.py +++ b/train.py @@ -18,7 +18,7 @@ from TTS.utils.audio import AudioProcessor from TTS.utils.generic_utils import (NoamLR, check_update, count_parameters, create_experiment_folder, get_git_branch, load_config, remove_experiment_folder, - save_best_model, save_checkpoint, weight_decay, + save_best_model, save_checkpoint, adam_weight_decay, set_init_dict, copy_config_file, setup_model, split_dataset, gradual_training_scheduler, KeepAverage, set_weight_decay) @@ -187,7 +187,7 @@ def train(model, criterion, criterion_st, optimizer, optimizer_st, scheduler, loss += stop_loss loss.backward() - optimizer, current_lr = weight_decay(optimizer) + optimizer, current_lr = adam_weight_decay(optimizer) grad_norm, _ = check_update(model, c.grad_clip) optimizer.step() @@ -198,7 +198,7 @@ def train(model, criterion, criterion_st, optimizer, optimizer_st, scheduler, # backpass and check the grad norm for stop loss if c.separate_stopnet: stop_loss.backward() - optimizer_st, _ = weight_decay(optimizer_st) + optimizer_st, _ = adam_weight_decay(optimizer_st) grad_norm_st, _ = check_update(model.decoder.stopnet, 1.0) optimizer_st.step() else: @@ -526,7 +526,7 @@ def main(args): # pylint: disable=redefined-outer-name else: criterion = nn.L1Loss() if c.model in [ "Tacotron", "TacotronGST"] else nn.MSELoss() - criterion_st = nn.BCEWithLogitsLoss() if c.stopnet else None + criterion_st = nn.BCEWithLogitsLoss(pos_weight=torch.tensor(20.0)) if c.stopnet else None if args.restore_path: checkpoint = torch.load(args.restore_path) diff --git a/utils/generic_utils.py b/utils/generic_utils.py index 3188067f..50d611b8 100644 --- a/utils/generic_utils.py +++ 
b/utils/generic_utils.py @@ -169,7 +169,7 @@ def lr_decay(init_lr, global_step, warmup_steps): return lr -def weight_decay(optimizer): +def adam_weight_decay(optimizer): """ Custom weight decay operation, not effecting grad values. """ @@ -181,7 +181,7 @@ def weight_decay(optimizer): param.data) return optimizer, current_lr - +# pylint: disable=dangerous-default-value def set_weight_decay(model, weight_decay, skip_list={"decoder.attention.v", "rnn", "lstm", "gru", "embedding"}): """ Skip biases, BatchNorm parameters, rnns. @@ -316,7 +316,7 @@ def split_dataset(items): is_multi_speaker = False speakers = [item[-1] for item in items] is_multi_speaker = len(set(speakers)) > 1 - eval_split_size = 500 if 500 < len(items) * 0.01 else int( + eval_split_size = 500 if len(items) * 0.01 > 500 else int( len(items) * 0.01) np.random.seed(0) np.random.shuffle(items) From 1fad04e31723edae9ef6af809b8d48191890c20a Mon Sep 17 00:00:00 2001 From: Eren Golge Date: Mon, 30 Sep 2019 14:30:33 +0200 Subject: [PATCH 23/35] load meta data function --- datasets/preprocess.py | 21 +++++++++++++++++++++ 1 file changed, 21 insertions(+) diff --git a/datasets/preprocess.py b/datasets/preprocess.py index e5f4e1a2..036b1701 100644 --- a/datasets/preprocess.py +++ b/datasets/preprocess.py @@ -2,6 +2,27 @@ import os from glob import glob import re import sys +from TTS.utils.generic_utils import split_dataset + + +def load_meta_data(datasets): + meta_data_train_all = [] + meta_data_eval_all = [] + for dataset in datasets: + name = dataset['name'] + root_path = dataset['path'] + meta_file_train = dataset['meta_file_train'] + meta_file_val = dataset['meta_file_val'] + preprocessor = get_preprocessor_by_name(name) + + meta_data_train = preprocessor(root_path, meta_file_train) + if meta_file_val is None: + meta_data_train, meta_data_eval = split_dataset(meta_data_train) + else: + meta_data_eval = preprocessor(root_path, meta_file_val) + meta_data_train_all += meta_data_train + meta_data_eval_all += meta_data_eval + return meta_data_train_all, meta_data_eval_all def get_preprocessor_by_name(name): From 64a01f584b1d8ad3de4434028c4579211de14f74 Mon Sep 17 00:00:00 2001 From: Eren Golge Date: Mon, 30 Sep 2019 15:03:18 +0200 Subject: [PATCH 24/35] load_meta_data changes --- config.json | 17 ++++++++++++----- train.py | 14 ++------------ 2 files changed, 14 insertions(+), 17 deletions(-) diff --git a/config.json b/config.json index 2a171ad1..c5434bf9 100644 --- a/config.json +++ b/config.json @@ -65,10 +65,6 @@ "run_eval": true, "test_delay_epochs": 5, //Until attention is aligned, testing only wastes computation time. "test_sentences_file": null, // set a file to load sentences to be used for testing. If it is null then we use default english sentences. - "data_path": "/home/erogol/Data/LJSpeech-1.1/", // DATASET-RELATED: can overwritten from command argument - "meta_file_train": "metadata_train.csv", // DATASET-RELATED: metafile for training dataloader. - "meta_file_val": "metadata_val.csv", // DATASET-RELATED: metafile for evaluation dataloader. - "dataset": "ljspeech", // DATASET-RELATED: one of TTS.dataset.preprocessors depending on your target dataset. Use "tts_cache" for pre-computed dataset by extract_features.py "min_seq_len": 6, // DATASET-RELATED: minimum text length to use in training "max_seq_len": 150, // DATASET-RELATED: maximum text length "output_path": "../keep/", // DATASET-RELATED: output path for all training outputs. 
@@ -80,6 +76,17 @@ "text_cleaner": "phoneme_cleaners", "use_speaker_embedding": false, // use speaker embedding to enable multi-speaker learning. "style_wav_for_test": null, // path to style wav file to be used in TacotronGST inference. - "use_gst": false // TACOTRON ONLY: use global style tokens + "use_gst": false, // TACOTRON ONLY: use global style tokens + + "datasets": // List of datasets. They all merged and they get different speaker_ids. + [ + { + "name": "ljspeech", + "path": "/home/erogol/Data/LJSpeech-1.1/", + "meta_file_train": "metadata_train.csv", + "meta_file_val": "metadata_val.csv" + } + ] + } diff --git a/train.py b/train.py index cbcfb1ec..b94f44fa 100644 --- a/train.py +++ b/train.py @@ -28,7 +28,7 @@ from TTS.utils.speakers import load_speaker_mapping, save_speaker_mapping, \ from TTS.utils.synthesis import synthesis from TTS.utils.text.symbols import phonemes, symbols from TTS.utils.visual import plot_alignment, plot_spectrogram -from TTS.datasets.preprocess import get_preprocessor_by_name +from TTS.datasets.preprocess import load_meta_data from TTS.utils.radam import RAdam from TTS.utils.measures import alignment_diagonal_score @@ -46,17 +46,7 @@ def setup_loader(ap, is_val=False, verbose=False): global meta_data_train global meta_data_eval if "meta_data_train" not in globals(): - if c.meta_file_train is not None: - meta_data_train = get_preprocessor_by_name( - c.dataset)(c.data_path, c.meta_file_train) - else: - meta_data_train = get_preprocessor_by_name(c.dataset)(c.data_path) - if "meta_data_eval" not in globals() and c.run_eval: - if c.meta_file_val is not None: - meta_data_eval = get_preprocessor_by_name( - c.dataset)(c.data_path, c.meta_file_val) - else: - meta_data_eval, meta_data_train = split_dataset(meta_data_train) + meta_data_train, meta_data_eval = load_meta_data(c.datasets) if is_val and not c.run_eval: loader = None else: From fc9af0ab3c0d9782bfb61079a509387eb248270c Mon Sep 17 00:00:00 2001 From: Eren Golge Date: Mon, 30 Sep 2019 18:04:15 +0200 Subject: [PATCH 25/35] bug fix for load__meta_data --- datasets/preprocess.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/datasets/preprocess.py b/datasets/preprocess.py index 036b1701..a78abab9 100644 --- a/datasets/preprocess.py +++ b/datasets/preprocess.py @@ -17,7 +17,7 @@ def load_meta_data(datasets): meta_data_train = preprocessor(root_path, meta_file_train) if meta_file_val is None: - meta_data_train, meta_data_eval = split_dataset(meta_data_train) + meta_data_eval, meta_data_train = split_dataset(meta_data_train) else: meta_data_eval = preprocessor(root_path, meta_file_val) meta_data_train_all += meta_data_train From 8dec2a9e95894891a72e98869185f5e39d78dd83 Mon Sep 17 00:00:00 2001 From: Eren Golge Date: Wed, 2 Oct 2019 00:30:25 +0200 Subject: [PATCH 26/35] fix memory leak duee to diagonal alingmnet score --- train.py | 4 ++-- utils/measures.py | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/train.py b/train.py index b94f44fa..5ecf99ef 100644 --- a/train.py +++ b/train.py @@ -204,7 +204,7 @@ def train(model, criterion, criterion_st, optimizer, optimizer_st, scheduler, "GradNormST:{:.5f} AvgTextLen:{:.1f} AvgSpecLen:{:.1f} StepTime:{:.2f} " "LoaderTime:{:.2f} LR:{:.6f}".format( num_iter, batch_n_iter, global_step, - postnet_loss.item(), decoder_loss.item(), stop_loss.item(), align_score.item(), + postnet_loss.item(), decoder_loss.item(), stop_loss.item(), align_score, grad_norm, grad_norm_st, avg_text_length, avg_spec_length, step_time, loader_time, current_lr), 
flush=True) @@ -404,7 +404,7 @@ def evaluate(model, criterion, criterion_st, ap, global_step, epoch): postnet_loss.item(), keep_avg['avg_postnet_loss'], decoder_loss.item(), keep_avg['avg_decoder_loss'], stop_loss.item(), keep_avg['avg_stop_loss'], - align_score.item(), keep_avg['avg_align_score']), + align_score, keep_avg['avg_align_score']), flush=True) if args.rank == 0: diff --git a/utils/measures.py b/utils/measures.py index 21b61298..a76a2225 100644 --- a/utils/measures.py +++ b/utils/measures.py @@ -8,4 +8,4 @@ def alignment_diagonal_score(alignments): Shape: alignments : batch x decoder_steps x encoder_steps """ - return alignments.max(dim=1)[0].mean(dim=1).mean(dim=0) + return alignments.max(dim=1)[0].mean(dim=1).mean(dim=0).item() From 0849e3c42ffddc89f34955ea3b7dc3dcc8e017f7 Mon Sep 17 00:00:00 2001 From: Eren Golge Date: Fri, 4 Oct 2019 18:20:30 +0200 Subject: [PATCH 27/35] sound normalization while reading, adapting get_Speaker for multiple datasets --- train.py | 11 ++++++----- utils/audio.py | 4 ++++ utils/speakers.py | 4 +--- 3 files changed, 11 insertions(+), 8 deletions(-) diff --git a/train.py b/train.py index 5ecf99ef..087eaa3f 100644 --- a/train.py +++ b/train.py @@ -43,10 +43,6 @@ print(" > Number of GPUs: ", num_gpus) def setup_loader(ap, is_val=False, verbose=False): - global meta_data_train - global meta_data_eval - if "meta_data_train" not in globals(): - meta_data_train, meta_data_eval = load_meta_data(c.datasets) if is_val and not c.run_eval: loader = None else: @@ -470,6 +466,7 @@ def evaluate(model, criterion, criterion_st, ap, global_step, epoch): # FIXME: move args definition/parsing inside of main? def main(args): # pylint: disable=redefined-outer-name + global meta_data_train, meta_data_eval # Audio processor ap = AudioProcessor(**c.audio) @@ -479,8 +476,12 @@ def main(args): # pylint: disable=redefined-outer-name c.distributed["backend"], c.distributed["url"]) num_chars = len(phonemes) if c.use_phonemes else len(symbols) + # load data instances + meta_data_train, meta_data_eval = load_meta_data(c.datasets) + + # parse speakers if c.use_speaker_embedding: - speakers = get_speakers(c.data_path, c.meta_file_train, c.dataset) + speakers = get_speakers(meta_data_train) if args.restore_path: prev_out_path = os.path.dirname(args.restore_path) speaker_mapping = load_speaker_mapping(prev_out_path) diff --git a/utils/audio.py b/utils/audio.py index 794520af..f6b8b5c3 100644 --- a/utils/audio.py +++ b/utils/audio.py @@ -24,6 +24,7 @@ class AudioProcessor(object): clip_norm=True, griffin_lim_iters=None, do_trim_silence=False, + sound_norm=False, **_): print(" > Setting up Audio Processor...") @@ -45,6 +46,7 @@ class AudioProcessor(object): self.max_norm = 1.0 if max_norm is None else float(max_norm) self.clip_norm = clip_norm self.do_trim_silence = do_trim_silence + self.sound_norm = sound_norm self.n_fft, self.hop_length, self.win_length = self._stft_parameters() members = vars(self) for key, value in members.items(): @@ -243,6 +245,8 @@ class AudioProcessor(object): except ValueError: print(f' [!] 
File cannot be trimmed for silence - {filename}') assert self.sample_rate == sr, "%s vs %s"%(self.sample_rate, sr) + if self.sound_norm: + x = x / x.max() * 0.9 return x @staticmethod diff --git a/utils/speakers.py b/utils/speakers.py index 4b11531b..8aa612a8 100644 --- a/utils/speakers.py +++ b/utils/speakers.py @@ -25,9 +25,7 @@ def save_speaker_mapping(out_path, speaker_mapping): json.dump(speaker_mapping, f, indent=4) -def get_speakers(data_root, meta_file, dataset_type): +def get_speakers(items): """Returns a sorted, unique list of speakers in a given dataset.""" - preprocessor = get_preprocessor_by_name(dataset_type) - items = preprocessor(data_root, meta_file) speakers = {e[2] for e in items} return sorted(speakers) From fbfa20e3b39330dd7f40d9456318a0f57ae218bb Mon Sep 17 00:00:00 2001 From: Eren Golge Date: Fri, 4 Oct 2019 18:36:32 +0200 Subject: [PATCH 28/35] linter fix --- train.py | 311 ++++++++++++++++++++++++++++++------------------------- 1 file changed, 170 insertions(+), 141 deletions(-) diff --git a/train.py b/train.py index 087eaa3f..eafd2d0e 100644 --- a/train.py +++ b/train.py @@ -15,13 +15,12 @@ from distribute import (DistributedSampler, apply_gradient_allreduce, init_distributed, reduce_tensor) from TTS.layers.losses import L1LossMasked, MSELossMasked from TTS.utils.audio import AudioProcessor -from TTS.utils.generic_utils import (NoamLR, check_update, count_parameters, - create_experiment_folder, get_git_branch, - load_config, remove_experiment_folder, - save_best_model, save_checkpoint, adam_weight_decay, - set_init_dict, copy_config_file, setup_model, - split_dataset, gradual_training_scheduler, KeepAverage, - set_weight_decay) +from TTS.utils.generic_utils import ( + NoamLR, check_update, count_parameters, create_experiment_folder, + get_git_branch, load_config, remove_experiment_folder, save_best_model, + save_checkpoint, adam_weight_decay, set_init_dict, copy_config_file, + setup_model, gradual_training_scheduler, KeepAverage, + set_weight_decay) from TTS.utils.logger import Logger from TTS.utils.speakers import load_speaker_mapping, save_speaker_mapping, \ get_speakers @@ -32,7 +31,6 @@ from TTS.datasets.preprocess import load_meta_data from TTS.utils.radam import RAdam from TTS.utils.measures import alignment_diagonal_score - torch.backends.cudnn.enabled = True torch.backends.cudnn.benchmark = False torch.manual_seed(54321) @@ -51,7 +49,8 @@ def setup_loader(ap, is_val=False, verbose=False): c.text_cleaner, meta_data=meta_data_eval if is_val else meta_data_train, ap=ap, - batch_group_size=0 if is_val else c.batch_group_size * c.batch_size, + batch_group_size=0 if is_val else c.batch_group_size * + c.batch_size, min_seq_len=c.min_seq_len, max_seq_len=c.max_seq_len, phoneme_cache_path=c.phoneme_cache_path, @@ -87,13 +86,14 @@ def train(model, criterion, criterion_st, optimizer, optimizer_st, scheduler, 'avg_align_score': 0, 'avg_step_time': 0, 'avg_loader_time': 0, - 'avg_alignment_score': 0} + 'avg_alignment_score': 0 + } keep_avg = KeepAverage() keep_avg.add_values(train_values) print("\n > Epoch {}/{}".format(epoch, c.epochs), flush=True) if use_cuda: - batch_n_iter = int(len(data_loader.dataset) / - (c.batch_size * num_gpus)) + batch_n_iter = int( + len(data_loader.dataset) / (c.batch_size * num_gpus)) else: batch_n_iter = int(len(data_loader.dataset) / c.batch_size) end_time = time.time() @@ -104,8 +104,8 @@ def train(model, criterion, criterion_st, optimizer, optimizer_st, scheduler, text_input = data[0] text_lengths = data[1] speaker_names = data[2] - 
linear_input = data[3] if c.model in [ - "Tacotron", "TacotronGST"] else None + linear_input = data[3] if c.model in ["Tacotron", "TacotronGST" + ] else None mel_input = data[4] mel_lengths = data[5] stop_targets = data[6] @@ -114,8 +114,9 @@ def train(model, criterion, criterion_st, optimizer, optimizer_st, scheduler, loader_time = time.time() - end_time if c.use_speaker_embedding: - speaker_ids = [speaker_mapping[speaker_name] - for speaker_name in speaker_names] + speaker_ids = [ + speaker_mapping[speaker_name] for speaker_name in speaker_names + ] speaker_ids = torch.LongTensor(speaker_ids) else: speaker_ids = None @@ -123,8 +124,8 @@ def train(model, criterion, criterion_st, optimizer, optimizer_st, scheduler, # set stop targets view, we predict a single stop token per r frames prediction stop_targets = stop_targets.view(text_input.shape[0], stop_targets.size(1) // c.r, -1) - stop_targets = (stop_targets.sum(2) > 0.0).unsqueeze( - 2).float().squeeze(2) + stop_targets = (stop_targets.sum(2) > + 0.0).unsqueeze(2).float().squeeze(2) global_step += 1 @@ -141,8 +142,9 @@ def train(model, criterion, criterion_st, optimizer, optimizer_st, scheduler, text_lengths = text_lengths.cuda(non_blocking=True) mel_input = mel_input.cuda(non_blocking=True) mel_lengths = mel_lengths.cuda(non_blocking=True) - linear_input = linear_input.cuda(non_blocking=True) if c.model in [ - "Tacotron", "TacotronGST"] else None + linear_input = linear_input.cuda( + non_blocking=True) if c.model in ["Tacotron", "TacotronGST" + ] else None stop_targets = stop_targets.cuda(non_blocking=True) if speaker_ids is not None: speaker_ids = speaker_ids.cuda(non_blocking=True) @@ -152,16 +154,16 @@ def train(model, criterion, criterion_st, optimizer, optimizer_st, scheduler, text_input, text_lengths, mel_input, speaker_ids=speaker_ids) # loss computation - stop_loss = criterion_st( - stop_tokens, stop_targets) if c.stopnet else torch.zeros(1) + stop_loss = criterion_st(stop_tokens, + stop_targets) if c.stopnet else torch.zeros(1) if c.loss_masking: decoder_loss = criterion(decoder_output, mel_input, mel_lengths) if c.model in ["Tacotron", "TacotronGST"]: - postnet_loss = criterion( - postnet_output, linear_input, mel_lengths) + postnet_loss = criterion(postnet_output, linear_input, + mel_lengths) else: - postnet_loss = criterion( - postnet_output, mel_input, mel_lengths) + postnet_loss = criterion(postnet_output, mel_input, + mel_lengths) else: decoder_loss = criterion(decoder_output, mel_input) if c.model in ["Tacotron", "TacotronGST"]: @@ -199,10 +201,10 @@ def train(model, criterion, criterion_st, optimizer, optimizer_st, scheduler, "DecoderLoss:{:.5f} StopLoss:{:.5f} AlignScore:{:.4f} GradNorm:{:.5f} " "GradNormST:{:.5f} AvgTextLen:{:.1f} AvgSpecLen:{:.1f} StepTime:{:.2f} " "LoaderTime:{:.2f} LR:{:.6f}".format( - num_iter, batch_n_iter, global_step, - postnet_loss.item(), decoder_loss.item(), stop_loss.item(), align_score, - grad_norm, grad_norm_st, avg_text_length, avg_spec_length, step_time, - loader_time, current_lr), + num_iter, batch_n_iter, global_step, postnet_loss.item(), + decoder_loss.item(), stop_loss.item(), align_score, + grad_norm, grad_norm_st, avg_text_length, avg_spec_length, + step_time, loader_time, current_lr), flush=True) # aggregate losses from processes @@ -210,26 +212,36 @@ def train(model, criterion, criterion_st, optimizer, optimizer_st, scheduler, postnet_loss = reduce_tensor(postnet_loss.data, num_gpus) decoder_loss = reduce_tensor(decoder_loss.data, num_gpus) loss = reduce_tensor(loss.data, 
num_gpus) - stop_loss = reduce_tensor( - stop_loss.data, num_gpus) if c.stopnet else stop_loss + stop_loss = reduce_tensor(stop_loss.data, + num_gpus) if c.stopnet else stop_loss if args.rank == 0: - update_train_values = {'avg_postnet_loss': float(postnet_loss.item()), - 'avg_decoder_loss': float(decoder_loss.item()), - 'avg_stop_loss': stop_loss if isinstance(stop_loss, float) else float(stop_loss.item()), - 'avg_step_time': step_time, - 'avg_loader_time': loader_time} + update_train_values = { + 'avg_postnet_loss': + float(postnet_loss.item()), + 'avg_decoder_loss': + float(decoder_loss.item()), + 'avg_stop_loss': + stop_loss + if isinstance(stop_loss, float) else float(stop_loss.item()), + 'avg_step_time': + step_time, + 'avg_loader_time': + loader_time + } keep_avg.update_values(update_train_values) # Plot Training Iter Stats # reduce TB load if global_step % 10 == 0: - iter_stats = {"loss_posnet": postnet_loss.item(), - "loss_decoder": decoder_loss.item(), - "lr": current_lr, - "grad_norm": grad_norm, - "grad_norm_st": grad_norm_st, - "step_time": step_time} + iter_stats = { + "loss_posnet": postnet_loss.item(), + "loss_decoder": decoder_loss.item(), + "lr": current_lr, + "grad_norm": grad_norm, + "grad_norm_st": grad_norm_st, + "step_time": step_time + } tb_logger.tb_train_iter_stats(global_step, iter_stats) if global_step % c.save_step == 0: @@ -242,7 +254,8 @@ def train(model, criterion, criterion_st, optimizer, optimizer_st, scheduler, # Diagnostic visualizations const_spec = postnet_output[0].data.cpu().numpy() gt_spec = linear_input[0].data.cpu().numpy() if c.model in [ - "Tacotron", "TacotronGST"] else mel_input[0].data.cpu().numpy() + "Tacotron", "TacotronGST" + ] else mel_input[0].data.cpu().numpy() align_img = alignments[0].data.cpu().numpy() figures = { @@ -263,23 +276,26 @@ def train(model, criterion, criterion_st, optimizer, optimizer_st, scheduler, end_time = time.time() # print epoch stats - print( - " | > EPOCH END -- GlobalStep:{} AvgTotalLoss:{:.5f} " - "AvgPostnetLoss:{:.5f} AvgDecoderLoss:{:.5f} " - "AvgStopLoss:{:.5f} EpochTime:{:.2f} " - "AvgStepTime:{:.2f} AvgLoaderTime:{:.2f}".format(global_step, keep_avg['avg_postnet_loss'], keep_avg['avg_decoder_loss'], - keep_avg['avg_stop_loss'], keep_avg['avg_align_score'], - epoch_time, keep_avg['avg_step_time'], keep_avg['avg_loader_time']), - flush=True) + print(" | > EPOCH END -- GlobalStep:{} AvgTotalLoss:{:.5f} " + "AvgPostnetLoss:{:.5f} AvgDecoderLoss:{:.5f} " + "AvgStopLoss:{:.5f} EpochTime:{:.2f} " + "AvgStepTime:{:.2f} AvgLoaderTime:{:.2f}".format( + global_step, keep_avg['avg_postnet_loss'], + keep_avg['avg_decoder_loss'], keep_avg['avg_stop_loss'], + keep_avg['avg_align_score'], epoch_time, + keep_avg['avg_step_time'], keep_avg['avg_loader_time']), + flush=True) # Plot Epoch Stats if args.rank == 0: # Plot Training Epoch Stats - epoch_stats = {"loss_postnet": keep_avg['avg_postnet_loss'], - "loss_decoder": keep_avg['avg_decoder_loss'], - "stop_loss": keep_avg['avg_stop_loss'], - "alignment_score": keep_avg['avg_align_score'], - "epoch_time": epoch_time} + epoch_stats = { + "loss_postnet": keep_avg['avg_postnet_loss'], + "loss_decoder": keep_avg['avg_decoder_loss'], + "stop_loss": keep_avg['avg_stop_loss'], + "alignment_score": keep_avg['avg_align_score'], + "epoch_time": epoch_time + } tb_logger.tb_train_epoch_stats(global_step, epoch_stats) if c.tb_model_param_stats: tb_logger.tb_model_weights(model, global_step) @@ -292,10 +308,12 @@ def evaluate(model, criterion, criterion_st, ap, global_step, epoch): 
speaker_mapping = load_speaker_mapping(OUT_PATH) model.eval() epoch_time = 0 - eval_values_dict = {'avg_postnet_loss': 0, - 'avg_decoder_loss': 0, - 'avg_stop_loss': 0, - 'avg_align_score': 0} + eval_values_dict = { + 'avg_postnet_loss': 0, + 'avg_decoder_loss': 0, + 'avg_stop_loss': 0, + 'avg_align_score': 0 + } keep_avg = KeepAverage() keep_avg.add_values(eval_values_dict) print("\n > Validation") @@ -319,14 +337,17 @@ def evaluate(model, criterion, criterion_st, ap, global_step, epoch): text_lengths = data[1] speaker_names = data[2] linear_input = data[3] if c.model in [ - "Tacotron", "TacotronGST"] else None + "Tacotron", "TacotronGST" + ] else None mel_input = data[4] mel_lengths = data[5] stop_targets = data[6] if c.use_speaker_embedding: - speaker_ids = [speaker_mapping[speaker_name] - for speaker_name in speaker_names] + speaker_ids = [ + speaker_mapping[speaker_name] + for speaker_name in speaker_names + ] speaker_ids = torch.LongTensor(speaker_ids) else: speaker_ids = None @@ -335,8 +356,8 @@ def evaluate(model, criterion, criterion_st, ap, global_step, epoch): stop_targets = stop_targets.view(text_input.shape[0], stop_targets.size(1) // c.r, -1) - stop_targets = (stop_targets.sum(2) > 0.0).unsqueeze( - 2).float().squeeze(2) + stop_targets = (stop_targets.sum(2) > + 0.0).unsqueeze(2).float().squeeze(2) # dispatch data to GPU if use_cuda: @@ -344,7 +365,8 @@ def evaluate(model, criterion, criterion_st, ap, global_step, epoch): mel_input = mel_input.cuda() mel_lengths = mel_lengths.cuda() linear_input = linear_input.cuda() if c.model in [ - "Tacotron", "TacotronGST"] else None + "Tacotron", "TacotronGST" + ] else None stop_targets = stop_targets.cuda() if speaker_ids is not None: speaker_ids = speaker_ids.cuda() @@ -358,14 +380,14 @@ def evaluate(model, criterion, criterion_st, ap, global_step, epoch): stop_loss = criterion_st( stop_tokens, stop_targets) if c.stopnet else torch.zeros(1) if c.loss_masking: - decoder_loss = criterion( - decoder_output, mel_input, mel_lengths) + decoder_loss = criterion(decoder_output, mel_input, + mel_lengths) if c.model in ["Tacotron", "TacotronGST"]: - postnet_loss = criterion( - postnet_output, linear_input, mel_lengths) + postnet_loss = criterion(postnet_output, linear_input, + mel_lengths) else: - postnet_loss = criterion( - postnet_output, mel_input, mel_lengths) + postnet_loss = criterion(postnet_output, mel_input, + mel_lengths) else: decoder_loss = criterion(decoder_output, mel_input) if c.model in ["Tacotron", "TacotronGST"]: @@ -388,19 +410,25 @@ def evaluate(model, criterion, criterion_st, ap, global_step, epoch): if c.stopnet: stop_loss = reduce_tensor(stop_loss.data, num_gpus) - keep_avg.update_values({'avg_postnet_loss': float(postnet_loss.item()), - 'avg_decoder_loss': float(decoder_loss.item()), - 'avg_stop_loss': float(stop_loss.item())}) + keep_avg.update_values({ + 'avg_postnet_loss': + float(postnet_loss.item()), + 'avg_decoder_loss': + float(decoder_loss.item()), + 'avg_stop_loss': + float(stop_loss.item()) + }) if num_iter % c.print_step == 0: print( " | > TotalLoss: {:.5f} PostnetLoss: {:.5f} - {:.5f} DecoderLoss:{:.5f} - {:.5f} " - "StopLoss: {:.5f} - {:.5f} AlignScore: {:.4f} : {:.4f}".format( - loss.item(), - postnet_loss.item(), keep_avg['avg_postnet_loss'], - decoder_loss.item(), keep_avg['avg_decoder_loss'], - stop_loss.item(), keep_avg['avg_stop_loss'], - align_score, keep_avg['avg_align_score']), + "StopLoss: {:.5f} - {:.5f} AlignScore: {:.4f} : {:.4f}" + .format(loss.item(), postnet_loss.item(), + 
keep_avg['avg_postnet_loss'], + decoder_loss.item(), + keep_avg['avg_decoder_loss'], stop_loss.item(), + keep_avg['avg_stop_loss'], align_score, + keep_avg['avg_align_score']), flush=True) if args.rank == 0: @@ -408,7 +436,8 @@ def evaluate(model, criterion, criterion_st, ap, global_step, epoch): idx = np.random.randint(mel_input.shape[0]) const_spec = postnet_output[idx].data.cpu().numpy() gt_spec = linear_input[idx].data.cpu().numpy() if c.model in [ - "Tacotron", "TacotronGST"] else mel_input[idx].data.cpu().numpy() + "Tacotron", "TacotronGST" + ] else mel_input[idx].data.cpu().numpy() align_img = alignments[idx].data.cpu().numpy() eval_figures = { @@ -423,13 +452,15 @@ def evaluate(model, criterion, criterion_st, ap, global_step, epoch): eval_audio = ap.inv_spectrogram(const_spec.T) else: eval_audio = ap.inv_mel_spectrogram(const_spec.T) - tb_logger.tb_eval_audios( - global_step, {"ValAudio": eval_audio}, c.audio["sample_rate"]) + tb_logger.tb_eval_audios(global_step, {"ValAudio": eval_audio}, + c.audio["sample_rate"]) # Plot Validation Stats - epoch_stats = {"loss_postnet": keep_avg['avg_postnet_loss'], - "loss_decoder": keep_avg['avg_decoder_loss'], - "stop_loss": keep_avg['avg_stop_loss']} + epoch_stats = { + "loss_postnet": keep_avg['avg_postnet_loss'], + "loss_decoder": keep_avg['avg_decoder_loss'], + "stop_loss": keep_avg['avg_stop_loss'] + } tb_logger.tb_eval_stats(global_step, epoch_stats) if args.rank == 0 and epoch > c.test_delay_epochs: @@ -442,7 +473,11 @@ def evaluate(model, criterion, criterion_st, ap, global_step, epoch): for idx, test_sentence in enumerate(test_sentences): try: wav, alignment, decoder_output, postnet_output, stop_tokens = synthesis( - model, test_sentence, c, use_cuda, ap, + model, + test_sentence, + c, + use_cuda, + ap, speaker_id=speaker_id, style_wav=style_wav) file_path = os.path.join(AUDIO_PATH, str(global_step)) @@ -451,15 +486,15 @@ def evaluate(model, criterion, criterion_st, ap, global_step, epoch): "TestSentence_{}.wav".format(idx)) ap.save_wav(wav, file_path) test_audios['{}-audio'.format(idx)] = wav - test_figures['{}-prediction'.format(idx) - ] = plot_spectrogram(postnet_output, ap) - test_figures['{}-alignment'.format(idx) - ] = plot_alignment(alignment) + test_figures['{}-prediction'.format(idx)] = plot_spectrogram( + postnet_output, ap) + test_figures['{}-alignment'.format(idx)] = plot_alignment( + alignment) except: print(" !! Error creating Test Sentence -", idx) traceback.print_exc() - tb_logger.tb_test_audios( - global_step, test_audios, c.audio['sample_rate']) + tb_logger.tb_test_audios(global_step, test_audios, + c.audio['sample_rate']) tb_logger.tb_test_figures(global_step, test_figures) return keep_avg['avg_postnet_loss'] @@ -490,8 +525,7 @@ def main(args): # pylint: disable=redefined-outer-name "introduce new speakers to " \ "a previously trained model." 
else: - speaker_mapping = {name: i - for i, name in enumerate(speakers)} + speaker_mapping = {name: i for i, name in enumerate(speakers)} save_speaker_mapping(OUT_PATH, speaker_mapping) num_speakers = len(speaker_mapping) print("Training with {} speakers: {}".format(num_speakers, @@ -506,18 +540,20 @@ def main(args): # pylint: disable=redefined-outer-name params = set_weight_decay(model, c.wd) optimizer = RAdam(params, lr=c.lr, weight_decay=0) if c.stopnet and c.separate_stopnet: - optimizer_st = RAdam( - model.decoder.stopnet.parameters(), lr=c.lr, weight_decay=0) + optimizer_st = RAdam(model.decoder.stopnet.parameters(), + lr=c.lr, + weight_decay=0) else: optimizer_st = None if c.loss_masking: - criterion = L1LossMasked() if c.model in [ - "Tacotron", "TacotronGST"] else MSELossMasked() + criterion = L1LossMasked() if c.model in ["Tacotron", "TacotronGST" + ] else MSELossMasked() else: - criterion = nn.L1Loss() if c.model in [ - "Tacotron", "TacotronGST"] else nn.MSELoss() - criterion_st = nn.BCEWithLogitsLoss(pos_weight=torch.tensor(20.0)) if c.stopnet else None + criterion = nn.L1Loss() if c.model in ["Tacotron", "TacotronGST" + ] else nn.MSELoss() + criterion_st = nn.BCEWithLogitsLoss( + pos_weight=torch.tensor(20.0)) if c.stopnet else None if args.restore_path: checkpoint = torch.load(args.restore_path) @@ -536,8 +572,8 @@ def main(args): # pylint: disable=redefined-outer-name del model_dict for group in optimizer.param_groups: group['lr'] = c.lr - print( - " > Model restored from step %d" % checkpoint['step'], flush=True) + print(" > Model restored from step %d" % checkpoint['step'], + flush=True) args.restore_step = checkpoint['step'] else: args.restore_step = 0 @@ -553,10 +589,9 @@ def main(args): # pylint: disable=redefined-outer-name model = apply_gradient_allreduce(model) if c.lr_decay: - scheduler = NoamLR( - optimizer, - warmup_steps=c.warmup_steps, - last_epoch=args.restore_step - 1) + scheduler = NoamLR(optimizer, + warmup_steps=c.warmup_steps, + last_epoch=args.restore_step - 1) else: scheduler = None @@ -576,14 +611,13 @@ def main(args): # pylint: disable=redefined-outer-name print(" > Number of outputs per iteration:", model.decoder.r) train_loss, global_step = train(model, criterion, criterion_st, - optimizer, optimizer_st, scheduler, - ap, global_step, epoch) - val_loss = evaluate(model, criterion, criterion_st, - ap, global_step, epoch) - print( - " | > Training Loss: {:.5f} Validation Loss: {:.5f}".format( - train_loss, val_loss), - flush=True) + optimizer, optimizer_st, scheduler, ap, + global_step, epoch) + val_loss = evaluate(model, criterion, criterion_st, ap, global_step, + epoch) + print(" | > Training Loss: {:.5f} Validation Loss: {:.5f}".format( + train_loss, val_loss), + flush=True) target_loss = train_loss if c.run_eval: target_loss = val_loss @@ -603,27 +637,23 @@ if __name__ == '__main__': type=str, help='Path to config file for training.', ) - parser.add_argument( - '--debug', - type=bool, - default=True, - help='Do not verify commit integrity to run training.') + parser.add_argument('--debug', + type=bool, + default=True, + help='Do not verify commit integrity to run training.') parser.add_argument( '--data_path', type=str, default='', help='Defines the data path. It overwrites config.json.') - parser.add_argument( - '--output_path', - type=str, - help='path for training outputs.', - default='') - parser.add_argument( - '--output_folder', - type=str, - default='', - help='folder name for training outputs.' 
- ) + parser.add_argument('--output_path', + type=str, + help='path for training outputs.', + default='') + parser.add_argument('--output_folder', + type=str, + default='', + help='folder name for training outputs.') # DISTRUBUTED parser.add_argument( @@ -631,11 +661,10 @@ if __name__ == '__main__': type=int, default=0, help='DISTRIBUTED: process rank for distributed training.') - parser.add_argument( - '--group_id', - type=str, - default="", - help='DISTRIBUTED: process group id.') + parser.add_argument('--group_id', + type=str, + default="", + help='DISTRIBUTED: process group id.') args = parser.parse_args() # setup output paths and read configs @@ -662,8 +691,8 @@ if __name__ == '__main__': if args.restore_path: new_fields["restore_path"] = args.restore_path new_fields["github_branch"] = get_git_branch() - copy_config_file(args.config_path, os.path.join( - OUT_PATH, 'config.json'), new_fields) + copy_config_file(args.config_path, + os.path.join(OUT_PATH, 'config.json'), new_fields) os.chmod(AUDIO_PATH, 0o775) os.chmod(OUT_PATH, 0o775) From 53ec066733237dbc215b2aeb3a1a749ba2e9170f Mon Sep 17 00:00:00 2001 From: Eren Golge Date: Sat, 12 Oct 2019 18:34:12 +0200 Subject: [PATCH 29/35] replace zeros() with a better alternative --- layers/tacotron.py | 8 ++++---- layers/tacotron2.py | 27 +++++++++++---------------- 2 files changed, 15 insertions(+), 20 deletions(-) diff --git a/layers/tacotron.py b/layers/tacotron.py index 04781031..657eefe7 100644 --- a/layers/tacotron.py +++ b/layers/tacotron.py @@ -340,13 +340,13 @@ class Decoder(nn.Module): T = inputs.size(1) # go frame as zeros matrix if self.use_memory_queue: - self.memory_input = torch.zeros(B, self.memory_dim * self.memory_size, device=inputs.device) + self.memory_input = torch.zeros(1, device=inputs.device).repeat(B, self.memory_dim * self.memory_size) else: - self.memory_input = torch.zeros(B, self.memory_dim, device=inputs.device) + self.memory_input = torch.zeros(1, device=inputs.device).repeat(B, self.memory_dim) # decoder states - self.attention_rnn_hidden = torch.zeros(B, 256, device=inputs.device) + self.attention_rnn_hidden = torch.zeros(1, device=inputs.device).repeat(B, 256) self.decoder_rnn_hiddens = [ - torch.zeros(B, 256, device=inputs.device) + torch.zeros(1, device=inputs.device).repeat(B, 256) for idx in range(len(self.decoder_rnns)) ] self.context_vec = inputs.data.new(B, self.in_features).zero_() diff --git a/layers/tacotron2.py b/layers/tacotron2.py index a02ff95a..ea55cbed 100644 --- a/layers/tacotron2.py +++ b/layers/tacotron2.py @@ -154,28 +154,23 @@ class Decoder(nn.Module): def get_go_frame(self, inputs): B = inputs.size(0) - memory = torch.zeros(B, - self.mel_channels * self.r, - device=inputs.device) + memory = torch.zeros(1, device=inputs.device).repeat(B, + self.mel_channels * self.r) return memory def _init_states(self, inputs, mask, keep_states=False): B = inputs.size(0) # T = inputs.size(1) if not keep_states: - self.query = torch.zeros(B, self.query_dim, device=inputs.device) - self.attention_rnn_cell_state = torch.zeros(B, - self.query_dim, - device=inputs.device) - self.decoder_hidden = torch.zeros(B, - self.decoder_rnn_dim, - device=inputs.device) - self.decoder_cell = torch.zeros(B, - self.decoder_rnn_dim, - device=inputs.device) - self.context = torch.zeros(B, - self.encoder_embedding_dim, - device=inputs.device) + self.query = torch.zeros(1, device=inputs.device).repeat(B, self.query_dim) + self.attention_rnn_cell_state = torch.zeros(1, device=inputs.device).repeat(B, + self.query_dim) + 
self.decoder_hidden = torch.zeros(1, device=inputs.device).repeat(B, + self.decoder_rnn_dim) + self.decoder_cell = torch.zeros(1, device=inputs.device).repeat(B, + self.decoder_rnn_dim) + self.context = torch.zeros(1, device=inputs.device).repeat(B, + self.encoder_embedding_dim) self.inputs = inputs self.processed_inputs = self.attention.inputs_layer(inputs) self.mask = mask From 2dcdc14ea6cdd56e31c7aeb4ce24ae07ccb7161c Mon Sep 17 00:00:00 2001 From: Eren Golge Date: Sat, 12 Oct 2019 18:34:28 +0200 Subject: [PATCH 30/35] UPDATE TRIM SILENCE --- utils/audio.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/utils/audio.py b/utils/audio.py index f6b8b5c3..4b8c427c 100644 --- a/utils/audio.py +++ b/utils/audio.py @@ -212,11 +212,11 @@ class AudioProcessor(object): return len(wav) def trim_silence(self, wav): - """ Trim silent parts with a threshold and 0.1 sec margin """ - margin = int(self.sample_rate * 0.1) + """ Trim silent parts with a threshold and 0.01 sec margin """ + margin = int(self.sample_rate * 0.01) wav = wav[margin:-margin] return librosa.effects.trim( - wav, top_db=40, frame_length=1024, hop_length=256)[0] + wav, top_db=60, frame_length=self.win_length, hop_length=self.hop_length)[0] @staticmethod def mulaw_encode(wav, qc): From c1f598b5d09ef4ddd329dc65ebff029ce24882ea Mon Sep 17 00:00:00 2001 From: Eren Golge Date: Sun, 13 Oct 2019 01:22:30 +0200 Subject: [PATCH 31/35] config for tacotron2 --- .compute | 5 +++-- config.json | 6 +++--- 2 files changed, 6 insertions(+), 5 deletions(-) diff --git a/.compute b/.compute index 34da13d8..de3589ae 100644 --- a/.compute +++ b/.compute @@ -11,8 +11,9 @@ sudo sh install.sh python3 setup.py develop # cp -R ${USER_DIR}/GermanData ../tmp/ # cp -R /data/ro/shared/data/keithito/LJSpeech-1.1/ ../tmp/ -python3 distribute.py --config_path config.json --data_path /data/ro/shared/data/keithito/LJSpeech-1.1/ +# python3 distribute.py --config_path config.json --data_path /data/ro/shared/data/keithito/LJSpeech-1.1/ # cp -R ${USER_DIR}/Mozilla_22050 ../tmp/ # python3 distribute.py --config_path config_tacotron_gst.json --data_path ../tmp/Mozilla_22050/ -python3 distribute.py --config_path config.json --data_path /data/rw/home/LibriTTS/train-clean-360 +# python3 distribute.py --config_path config.json --data_path /data/rw/home/LibriTTS/train-clean-360 +python3 distribute.py --config_path config.json while true; do sleep 1000000; done diff --git a/config.json b/config.json index c5434bf9..9cc4b222 100644 --- a/config.json +++ b/config.json @@ -1,6 +1,6 @@ { "run_name": "ljspeech", - "run_description": "Tacotron prenet fix test run - dev-memory_fix", + "run_description": "Tacotron2 ljspeech release training", "audio":{ // Audio processing parameters @@ -31,7 +31,7 @@ "reinit_layers": [], - "model": "Tacotron", // one of the model in models/ + "model": "Tacotron2", // one of the model in models/ "grad_clip": 1, // upper limit for gradients for clipping. "epochs": 1000, // total number of epochs to train. "lr": 0.0001, // Initial learning rate. If Noam decay is active, maximum learning rate. 
@@ -82,7 +82,7 @@ [ { "name": "ljspeech", - "path": "/home/erogol/Data/LJSpeech-1.1/", + "path": "/data/ro/shared/data/keithito/LJSpeech-1.1/", "meta_file_train": "metadata_train.csv", "meta_file_val": "metadata_val.csv" } From 77f5fd05847ab058e105dc4d651cbd006866d913 Mon Sep 17 00:00:00 2001 From: Eren Golge Date: Tue, 15 Oct 2019 15:05:42 +0200 Subject: [PATCH 32/35] compute and config update with new attention entropy loss --- .compute | 9 +++------ config.json | 4 ++-- layers/losses.py | 16 ++++++++++++++++ 3 files changed, 21 insertions(+), 8 deletions(-) diff --git a/.compute b/.compute index de3589ae..5ad456b8 100644 --- a/.compute +++ b/.compute @@ -4,16 +4,13 @@ yes | apt-get install ffmpeg yes | apt-get install espeak yes | apt-get install tmux yes | apt-get install zsh -# pip3 install https://download.pytorch.org/whl/cu100/torch-1.1.0-cp37-cp37m-linux_x86_64.whl -# wget https://www.dropbox.com/s/m8waow6b3ydpf6h/MozillaDataset.tar.gz?dl=0 -O /data/rw/home/mozilla.tar -wget https://www.dropbox.com/s/wqn5v3wkktw9lmo/install.sh?dl=0 -O install.sh +pip3 install https://download.pytorch.org/whl/cu100/torch-1.3.0%2Bcu100-cp36-cp36m-linux_x86_64.whl sudo sh install.sh +pip install pytorch==1.3.0+cu100 python3 setup.py develop -# cp -R ${USER_DIR}/GermanData ../tmp/ -# cp -R /data/ro/shared/data/keithito/LJSpeech-1.1/ ../tmp/ # python3 distribute.py --config_path config.json --data_path /data/ro/shared/data/keithito/LJSpeech-1.1/ # cp -R ${USER_DIR}/Mozilla_22050 ../tmp/ # python3 distribute.py --config_path config_tacotron_gst.json --data_path ../tmp/Mozilla_22050/ # python3 distribute.py --config_path config.json --data_path /data/rw/home/LibriTTS/train-clean-360 -python3 distribute.py --config_path config.json +# python3 distribute.py --config_path config.json while true; do sleep 1000000; done diff --git a/config.json b/config.json index 9cc4b222..1226e1ac 100644 --- a/config.json +++ b/config.json @@ -1,6 +1,6 @@ { "run_name": "ljspeech", - "run_description": "Tacotron2 ljspeech release training", + "run_description": "Tacotron ljspeech release training", "audio":{ // Audio processing parameters @@ -31,7 +31,7 @@ "reinit_layers": [], - "model": "Tacotron2", // one of the model in models/ + "model": "Tacotron", // one of the model in models/ "grad_clip": 1, // upper limit for gradients for clipping. "epochs": 1000, // total number of epochs to train. "lr": 0.0001, // Initial learning rate. If Noam decay is active, maximum learning rate. 
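The layers/losses.py hunk below introduces the new AttentionEntropyLoss; note that this patch does not touch train.py, so the penalty is defined here but not yet added to the training objective. A minimal sanity check of the class in isolation might look like the sketch below (the import path mirrors the one train.py already uses for L1LossMasked/MSELossMasked; the tensor shapes and values are made up for illustration):

# Rough sketch, not part of this patch series -- just exercising the new loss
# on dummy alignment tensors.
import torch
from TTS.layers.losses import AttentionEntropyLoss

criterion_attn = AttentionEntropyLoss()

# Fake attention weights: batch x decoder_steps x encoder_steps,
# normalized over the encoder axis like real soft attention.
diffuse = torch.softmax(torch.randn(2, 50, 30), dim=-1)
peaked = torch.zeros(2, 50, 30)
peaked[:, :, 0] = 1.0  # every decoder step attends to a single encoder step

# Diffuse alignments yield a larger penalty than sharply peaked ones.
print(criterion_attn(diffuse).item(), criterion_attn(peaked).item())

If the penalty were later wired into the training loss, it would presumably be scaled by a small weight so it only nudges the alignments rather than competing with the decoder and postnet losses.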
diff --git a/layers/losses.py b/layers/losses.py index a6bf95d3..6ccb3986 100644 --- a/layers/losses.py +++ b/layers/losses.py @@ -1,3 +1,5 @@ +import numpy as np +import torch from torch import nn from torch.nn import functional from TTS.utils.generic_utils import sequence_mask @@ -53,3 +55,17 @@ class MSELossMasked(nn.Module): x * mask, target * mask, reduction="sum") loss = loss / mask.sum() return loss + + +class AttentionEntropyLoss(nn.Module): + def forward(self, align): + """ + Forces attention to be more decisive by penalizing + soft attention weights + + TODO: arguments + TODO: unit_test + """ + entropy = torch.distributions.Categorical(probs=align).entropy() + loss = (entropy / np.log(align.shape[1])).mean() + return loss From ea32f2368d10237f77d5303db5c8bca0d6b49c7c Mon Sep 17 00:00:00 2001 From: Eren Golge Date: Thu, 24 Oct 2019 14:11:07 +0200 Subject: [PATCH 33/35] linter fix --- layers/losses.py | 2 +- layers/tacotron2.py | 23 ++++++++++++----------- 2 files changed, 13 insertions(+), 12 deletions(-) diff --git a/layers/losses.py b/layers/losses.py index 6ccb3986..ab472519 100644 --- a/layers/losses.py +++ b/layers/losses.py @@ -61,7 +61,7 @@ class AttentionEntropyLoss(nn.Module): def forward(self, align): """ Forces attention to be more decisive by penalizing - soft attention weights + soft attention weights TODO: arguments TODO: unit_test diff --git a/layers/tacotron2.py b/layers/tacotron2.py index ea55cbed..0d7472fd 100644 --- a/layers/tacotron2.py +++ b/layers/tacotron2.py @@ -162,15 +162,16 @@ class Decoder(nn.Module): B = inputs.size(0) # T = inputs.size(1) if not keep_states: - self.query = torch.zeros(1, device=inputs.device).repeat(B, self.query_dim) - self.attention_rnn_cell_state = torch.zeros(1, device=inputs.device).repeat(B, - self.query_dim) - self.decoder_hidden = torch.zeros(1, device=inputs.device).repeat(B, - self.decoder_rnn_dim) - self.decoder_cell = torch.zeros(1, device=inputs.device).repeat(B, - self.decoder_rnn_dim) - self.context = torch.zeros(1, device=inputs.device).repeat(B, - self.encoder_embedding_dim) + self.query = torch.zeros(1, device=inputs.device).repeat( + B, self.query_dim) + self.attention_rnn_cell_state = torch.zeros( + 1, device=inputs.device).repeat(B, self.query_dim) + self.decoder_hidden = torch.zeros(1, device=inputs.device).repeat( + B, self.decoder_rnn_dim) + self.decoder_cell = torch.zeros(1, device=inputs.device).repeat( + B, self.decoder_rnn_dim) + self.context = torch.zeros(1, device=inputs.device).repeat( + B, self.encoder_embedding_dim) self.inputs = inputs self.processed_inputs = self.attention.inputs_layer(inputs) self.mask = mask @@ -277,7 +278,7 @@ class Decoder(nn.Module): stop_flags[2] = t > inputs.shape[1] * 2 if all(stop_flags): break - elif len(outputs) == self.max_decoder_steps: + if len(outputs) == self.max_decoder_steps: print(" | > Decoder stopped with 'max_decoder_steps") break @@ -317,7 +318,7 @@ class Decoder(nn.Module): stop_flags[2] = t > inputs.shape[1] * 2 if all(stop_flags): break - elif len(outputs) == self.max_decoder_steps: + if len(outputs) == self.max_decoder_steps: print(" | > Decoder stopped with 'max_decoder_steps") break From 3b7aa67ed2e377169aa5f82082fd3a86b96e1449 Mon Sep 17 00:00:00 2001 From: Eren Golge Date: Thu, 24 Oct 2019 14:19:16 +0200 Subject: [PATCH 34/35] linter --- layers/losses.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/layers/losses.py b/layers/losses.py index ab472519..3b60c1f4 100644 --- a/layers/losses.py +++ b/layers/losses.py @@ -58,7 +58,7 @@ 
class MSELossMasked(nn.Module): class AttentionEntropyLoss(nn.Module): - def forward(self, align): + def forward(self, align): #pylint disable=no-self-use """ Forces attention to be more decisive by penalizing soft attention weights From 9d5a5b0764de88f5f3b256d616f40ca1bea1cce3 Mon Sep 17 00:00:00 2001 From: Eren Golge Date: Thu, 24 Oct 2019 14:34:31 +0200 Subject: [PATCH 35/35] linter --- layers/losses.py | 1 + 1 file changed, 1 insertion(+) diff --git a/layers/losses.py b/layers/losses.py index ab472519..79f5b381 100644 --- a/layers/losses.py +++ b/layers/losses.py @@ -58,6 +58,7 @@ class MSELossMasked(nn.Module): class AttentionEntropyLoss(nn.Module): + # pylint: disable=R0201 def forward(self, align): """ Forces attention to be more decisive by penalizing