From 9ee70af9bb1684f07eb9190c053159e0dc3a5563 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Eren=20G=C3=B6lge?=
Date: Fri, 7 May 2021 15:39:48 +0200
Subject: [PATCH] code styling

---
 TTS/bin/compute_statistics.py                 |   5 +-
 TTS/bin/train_align_tts.py                    | 170 ++++++------------
 TTS/config/__init__.py                        |   3 +-
 TTS/tts/configs/align_tts_config.py           |  16 +-
 TTS/tts/utils/synthesis.py                    |   2 +-
 TTS/utils/arguments.py                        |   2 +-
 TTS/vocoder/configs/fullband_melgan_config.py |  19 +-
 TTS/vocoder/configs/hifigan_config.py         |  12 +-
 TTS/vocoder/configs/melgan_config.py          |  19 +-
 .../configs/multiband_melgan_config.py        |  40 ++---
 .../configs/parallel_wavegan_config.py        |  33 ++--
 TTS/vocoder/configs/shared_configs.py         |  27 ++-
 TTS/vocoder/configs/wavegrad_config.py        |  44 ++---
 TTS/vocoder/configs/wavernn_config.py         |  14 +-
 .../test_fullband_melgan_train.py             |   6 +-
 tests/vocoder_tests/test_hifigan_train.py     |   6 +-
 tests/vocoder_tests/test_melgan_train.py      |   6 +-
 .../test_multiband_melgan_train.py            |   6 +-
 .../test_parallel_wavegan_train.py            |   6 +-
 tests/vocoder_tests/test_wavegrad_train.py    |   6 +-
 tests/vocoder_tests/test_wavernn_train.py     |   6 +-
 21 files changed, 161 insertions(+), 287 deletions(-)

diff --git a/TTS/bin/compute_statistics.py b/TTS/bin/compute_statistics.py
index b4ee6df7..2c13a960 100755
--- a/TTS/bin/compute_statistics.py
+++ b/TTS/bin/compute_statistics.py
@@ -8,11 +8,10 @@ import os
 import numpy as np
 from tqdm import tqdm
 
-from TTS.tts.datasets.preprocess import load_meta_data
-from TTS.utils.audio import AudioProcessor
-
 # from TTS.utils.io import load_config
 from TTS.config import load_config
+from TTS.tts.datasets.preprocess import load_meta_data
+from TTS.utils.audio import AudioProcessor
 
 
 def main():
diff --git a/TTS/bin/train_align_tts.py b/TTS/bin/train_align_tts.py
index 206d8b03..7e3921b0 100644
--- a/TTS/bin/train_align_tts.py
+++ b/TTS/bin/train_align_tts.py
@@ -46,8 +46,7 @@ def setup_loader(ap, r, is_val=False, verbose=False):
             ap=ap,
             tp=config.characters,
             add_blank=config["add_blank"],
-            batch_group_size=0 if is_val else config.batch_group_size *
-            config.batch_size,
+            batch_group_size=0 if is_val else config.batch_group_size * config.batch_size,
             min_seq_len=config.min_seq_len,
             max_seq_len=config.max_seq_len,
             phoneme_cache_path=config.phoneme_cache_path,
@@ -56,8 +55,9 @@ def setup_loader(ap, r, is_val=False, verbose=False):
             enable_eos_bos=config.enable_eos_bos_chars,
             use_noise_augment=not is_val,
             verbose=verbose,
-            speaker_mapping=speaker_mapping if config.use_speaker_embedding
-            and config.use_external_speaker_embedding_file else None,
+            speaker_mapping=speaker_mapping
+            if config.use_speaker_embedding and config.use_external_speaker_embedding_file
+            else None,
         )
 
         if config.use_phonemes and config.compute_input_seq_cache:
@@ -73,8 +73,7 @@ def setup_loader(ap, r, is_val=False, verbose=False):
             collate_fn=dataset.collate_fn,
             drop_last=False,
             sampler=sampler,
-            num_workers=config.num_val_loader_workers
-            if is_val else config.num_loader_workers,
+            num_workers=config.num_val_loader_workers if is_val else config.num_loader_workers,
             pin_memory=False,
         )
     return loader
@@ -97,9 +96,7 @@ def format_data(data):
             speaker_c = data[8]
         else:
             # return speaker_id to be used by an embedding layer
-            speaker_c = [
-                speaker_mapping[speaker_name] for speaker_name in speaker_names
-            ]
+            speaker_c = [speaker_mapping[speaker_name] for speaker_name in speaker_names]
             speaker_c = torch.LongTensor(speaker_c)
     else:
         speaker_c = None
@@ -114,15 +111,13 @@ def format_data(data):
     return text_input, text_lengths, mel_input, mel_lengths, speaker_c, avg_text_length, avg_spec_length, item_idx
 
 
-def train(data_loader, model, criterion, optimizer, scheduler, ap, global_step,
-          epoch, training_phase):
+def train(data_loader, model, criterion, optimizer, scheduler, ap, global_step, epoch, training_phase):
     model.train()
     epoch_time = 0
     keep_avg = KeepAverage()
     if use_cuda:
-        batch_n_iter = int(
-            len(data_loader.dataset) / (config.batch_size * num_gpus))
+        batch_n_iter = int(len(data_loader.dataset) / (config.batch_size * num_gpus))
     else:
         batch_n_iter = int(len(data_loader.dataset) / config.batch_size)
     end_time = time.time()
@@ -151,12 +146,8 @@ def train(data_loader, model, criterion, optimizer, scheduler, ap, global_step,
         # forward pass model
         with torch.cuda.amp.autocast(enabled=config.mixed_precision):
             decoder_output, dur_output, dur_mas_output, alignments, _, _, logp = model.forward(
-                text_input,
-                text_lengths,
-                mel_targets,
-                mel_lengths,
-                g=speaker_c,
-                phase=training_phase)
+                text_input, text_lengths, mel_targets, mel_lengths, g=speaker_c, phase=training_phase
+            )
 
         # compute loss
         loss_dict = criterion(
@@ -175,14 +166,12 @@ def train(data_loader, model, criterion, optimizer, scheduler, ap, global_step,
         if config.mixed_precision:
             scaler.scale(loss_dict["loss"]).backward()
             scaler.unscale_(optimizer)
-            grad_norm = torch.nn.utils.clip_grad_norm_(model.parameters(),
-                                                       config.grad_clip)
+            grad_norm = torch.nn.utils.clip_grad_norm_(model.parameters(), config.grad_clip)
             scaler.step(optimizer)
             scaler.update()
         else:
            loss_dict["loss"].backward()
-            grad_norm = torch.nn.utils.clip_grad_norm_(model.parameters(),
-                                                       config.grad_clip)
+            grad_norm = torch.nn.utils.clip_grad_norm_(model.parameters(), config.grad_clip)
             optimizer.step()
 
         # setup lr
@@ -201,12 +190,9 @@ def train(data_loader, model, criterion, optimizer, scheduler, ap, global_step,
 
         # aggregate losses from processes
         if num_gpus > 1:
-            loss_dict["loss_l1"] = reduce_tensor(loss_dict["loss_l1"].data,
-                                                 num_gpus)
-            loss_dict["loss_ssim"] = reduce_tensor(loss_dict["loss_ssim"].data,
-                                                   num_gpus)
-            loss_dict["loss_dur"] = reduce_tensor(loss_dict["loss_dur"].data,
-                                                  num_gpus)
+            loss_dict["loss_l1"] = reduce_tensor(loss_dict["loss_l1"].data, num_gpus)
+            loss_dict["loss_ssim"] = reduce_tensor(loss_dict["loss_ssim"].data, num_gpus)
+            loss_dict["loss_dur"] = reduce_tensor(loss_dict["loss_dur"].data, num_gpus)
             loss_dict["loss"] = reduce_tensor(loss_dict["loss"].data, num_gpus)
 
         # detach loss values
@@ -235,18 +221,13 @@ def train(data_loader, model, criterion, optimizer, scheduler, ap, global_step,
                 "loader_time": [loader_time, 2],
                 "current_lr": current_lr,
             }
-            c_logger.print_train_step(batch_n_iter, num_iter, global_step,
-                                      log_dict, loss_dict, keep_avg.avg_values)
+            c_logger.print_train_step(batch_n_iter, num_iter, global_step, log_dict, loss_dict, keep_avg.avg_values)
 
         if args.rank == 0:
             # Plot Training Iter Stats
             # reduce TB load
             if global_step % config.tb_plot_step == 0:
-                iter_stats = {
-                    "lr": current_lr,
-                    "grad_norm": grad_norm,
-                    "step_time": step_time
-                }
+                iter_stats = {"lr": current_lr, "grad_norm": grad_norm, "step_time": step_time}
                 iter_stats.update(loss_dict)
                 tb_logger.tb_train_iter_stats(global_step, iter_stats)
 
@@ -270,8 +251,7 @@ def train(data_loader, model, criterion, optimizer, scheduler, ap, global_step,
 
             # Diagnostic visualizations
             if decoder_output is not None:
                 idx = np.random.randint(mel_targets.shape[0])
-                pred_spec = decoder_output[idx].detach().data.cpu().numpy(
-                ).T
+                pred_spec = decoder_output[idx].detach().data.cpu().numpy().T
                 gt_spec = mel_targets[idx].data.cpu().numpy().T
                 align_img = alignments[idx].data.cpu()
@@ -285,9 +265,7 @@ def train(data_loader, model, criterion, optimizer, scheduler, ap, global_step,
 
             # Sample audio
             train_audio = ap.inv_melspectrogram(pred_spec.T)
-            tb_logger.tb_train_audios(global_step,
-                                      {"TrainAudio": train_audio},
-                                      config.audio["sample_rate"])
+            tb_logger.tb_train_audios(global_step, {"TrainAudio": train_audio}, config.audio["sample_rate"])
         end_time = time.time()
 
     # print epoch stats
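
The `reduce_tensor` calls reformatted above average a loss across distributed workers. A minimal sketch of what such a helper does, assuming the usual all-reduce-then-divide pattern (the real implementation lives in `TTS.utils.distribute` and may differ in detail):

    import torch
    import torch.distributed as dist

    def reduce_tensor(tensor: torch.Tensor, num_gpus: int) -> torch.Tensor:
        """Sum a tensor across all processes, then average by the world size."""
        rt = tensor.clone()
        dist.all_reduce(rt, op=dist.ReduceOp.SUM)  # in-place sum across ranks
        return rt / num_gpus
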
@@ -304,8 +282,7 @@ def train(data_loader, model, criterion, optimizer, scheduler, ap, global_step,
 
 
 @torch.no_grad()
-def evaluate(data_loader, model, criterion, ap, global_step, epoch,
-             training_phase):
+def evaluate(data_loader, model, criterion, ap, global_step, epoch, training_phase):
     model.eval()
     epoch_time = 0
     keep_avg = KeepAverage()
@@ -315,18 +292,13 @@ def evaluate(data_loader, model, criterion, ap, global_step, epoch,
         start_time = time.time()
 
         # format data
-        text_input, text_lengths, mel_targets, mel_lengths, speaker_c, _, _, _ = format_data(
-            data)
+        text_input, text_lengths, mel_targets, mel_lengths, speaker_c, _, _, _ = format_data(data)
 
         # forward pass model
         with torch.cuda.amp.autocast(enabled=config.mixed_precision):
             decoder_output, dur_output, dur_mas_output, alignments, _, _, logp = model.forward(
-                text_input,
-                text_lengths,
-                mel_targets,
-                mel_lengths,
-                g=speaker_c,
-                phase=training_phase)
+                text_input, text_lengths, mel_targets, mel_lengths, g=speaker_c, phase=training_phase
+            )
 
         # compute loss
         loss_dict = criterion(
@@ -351,14 +323,10 @@ def evaluate(data_loader, model, criterion, ap, global_step, epoch,
 
         # aggregate losses from processes
         if num_gpus > 1:
-            loss_dict["loss_l1"] = reduce_tensor(loss_dict["loss_l1"].data,
-                                                 num_gpus)
-            loss_dict["loss_ssim"] = reduce_tensor(
-                loss_dict["loss_ssim"].data, num_gpus)
-            loss_dict["loss_dur"] = reduce_tensor(
-                loss_dict["loss_dur"].data, num_gpus)
-            loss_dict["loss"] = reduce_tensor(loss_dict["loss"].data,
-                                              num_gpus)
+            loss_dict["loss_l1"] = reduce_tensor(loss_dict["loss_l1"].data, num_gpus)
+            loss_dict["loss_ssim"] = reduce_tensor(loss_dict["loss_ssim"].data, num_gpus)
+            loss_dict["loss_dur"] = reduce_tensor(loss_dict["loss_dur"].data, num_gpus)
+            loss_dict["loss"] = reduce_tensor(loss_dict["loss"].data, num_gpus)
 
         # detach loss values
         loss_dict_new = dict()
@@ -376,8 +344,7 @@ def evaluate(data_loader, model, criterion, ap, global_step, epoch,
         keep_avg.update_values(update_train_values)
 
         if config.print_eval:
-            c_logger.print_eval_step(num_iter, loss_dict,
-                                     keep_avg.avg_values)
+            c_logger.print_eval_step(num_iter, loss_dict, keep_avg.avg_values)
 
     if args.rank == 0:
         # Diagnostic visualizations
@@ -387,17 +354,14 @@ def evaluate(data_loader, model, criterion, ap, global_step, epoch,
         align_img = alignments[idx].data.cpu()
 
         eval_figures = {
-            "prediction": plot_spectrogram(pred_spec, ap,
-                                           output_fig=False),
-            "ground_truth": plot_spectrogram(gt_spec, ap,
-                                             output_fig=False),
+            "prediction": plot_spectrogram(pred_spec, ap, output_fig=False),
+            "ground_truth": plot_spectrogram(gt_spec, ap, output_fig=False),
             "alignment": plot_alignment(align_img, output_fig=False),
         }
 
         # Sample audio
         eval_audio = ap.inv_melspectrogram(pred_spec.T)
-        tb_logger.tb_eval_audios(global_step, {"ValAudio": eval_audio},
-                                 config.audio["sample_rate"])
+        tb_logger.tb_eval_audios(global_step, {"ValAudio": eval_audio}, config.audio["sample_rate"])
 
         # Plot Validation Stats
         tb_logger.tb_eval_stats(global_step, keep_avg.avg_values)
@@ -422,9 +386,9 @@ def evaluate(data_loader, model, criterion, ap, global_step, epoch,
         print(" | > Synthesizing test sentences")
         if config.use_speaker_embedding:
             if config.use_external_speaker_embedding_file:
-                speaker_embedding = speaker_mapping[list(
-                    speaker_mapping.keys())[randrange(
-                        len(speaker_mapping) - 1)]]["embedding"]
+                speaker_embedding = speaker_mapping[list(speaker_mapping.keys())[randrange(len(speaker_mapping) - 1)]][
+                    "embedding"
+                ]
                 speaker_id = None
             else:
                 speaker_id = 0
@@ -452,19 +416,15 @@ def evaluate(data_loader, model, criterion, ap, global_step, epoch,
 
                 file_path = os.path.join(AUDIO_PATH, str(global_step))
                 os.makedirs(file_path, exist_ok=True)
-                file_path = os.path.join(file_path,
-                                         "TestSentence_{}.wav".format(idx))
+                file_path = os.path.join(file_path, "TestSentence_{}.wav".format(idx))
                 ap.save_wav(wav, file_path)
                 test_audios["{}-audio".format(idx)] = wav
-                test_figures["{}-prediction".format(idx)] = plot_spectrogram(
-                    postnet_output, ap)
-                test_figures["{}-alignment".format(idx)] = plot_alignment(
-                    alignment)
+                test_figures["{}-prediction".format(idx)] = plot_spectrogram(postnet_output, ap)
+                test_figures["{}-alignment".format(idx)] = plot_alignment(alignment)
             except:  # pylint: disable=bare-except
                 print(" !! Error creating Test Sentence -", idx)
                 traceback.print_exc()
-        tb_logger.tb_test_audios(global_step, test_audios,
-                                 config.audio["sample_rate"])
+        tb_logger.tb_test_audios(global_step, test_audios, config.audio["sample_rate"])
         tb_logger.tb_test_figures(global_step, test_figures)
     return keep_avg.avg_values
 
@@ -479,32 +439,21 @@ def main(args):  # pylint: disable=redefined-outer-name
 
     # DISTRIBUTED
     if num_gpus > 1:
-        init_distributed(args.rank, num_gpus, args.group_id,
-                         config.distributed["backend"],
-                         config.distributed["url"])
+        init_distributed(args.rank, num_gpus, args.group_id, config.distributed["backend"], config.distributed["url"])
 
     # set model characters
     model_characters = phonemes if config.use_phonemes else symbols
     num_chars = len(model_characters)
 
     # load data instances
-    meta_data_train, meta_data_eval = load_meta_data(config.datasets,
-                                                     eval_split=True)
+    meta_data_train, meta_data_eval = load_meta_data(config.datasets, eval_split=True)
 
     # parse speakers
-    num_speakers, speaker_embedding_dim, speaker_mapping = parse_speakers(
-        config, args, meta_data_train, OUT_PATH)
+    num_speakers, speaker_embedding_dim, speaker_mapping = parse_speakers(config, args, meta_data_train, OUT_PATH)
 
     # setup model
-    model = setup_model(num_chars,
-                        num_speakers,
-                        config,
-                        speaker_embedding_dim=speaker_embedding_dim)
-    optimizer = RAdam(model.parameters(),
-                      lr=config.lr,
-                      weight_decay=0,
-                      betas=(0.9, 0.98),
-                      eps=1e-9)
+    model = setup_model(num_chars, num_speakers, config, speaker_embedding_dim=speaker_embedding_dim)
+    optimizer = RAdam(model.parameters(), lr=config.lr, weight_decay=0, betas=(0.9, 0.98), eps=1e-9)
     criterion = AlignTTSLoss(config)
 
     if args.restore_path:
@@ -526,8 +475,7 @@ def main(args):  # pylint: disable=redefined-outer-name
         for group in optimizer.param_groups:
             group["initial_lr"] = config.lr
 
-        print(" > Model restored from step %d" % checkpoint["step"],
-              flush=True)
+        print(" > Model restored from step %d" % checkpoint["step"], flush=True)
         args.restore_step = checkpoint["step"]
     else:
         args.restore_step = 0
@@ -541,9 +489,7 @@ def main(args):  # pylint: disable=redefined-outer-name
         model = DDP_th(model, device_ids=[args.rank])
 
     if config.noam_schedule:
-        scheduler = NoamLR(optimizer,
-                           warmup_steps=config.warmup_steps,
-                           last_epoch=args.restore_step - 1)
+        scheduler = NoamLR(optimizer, warmup_steps=config.warmup_steps, last_epoch=args.restore_step - 1)
     else:
         scheduler = None
 
@@ -554,10 +500,8 @@ def main(args):  # pylint: disable=redefined-outer-name
         best_loss = float("inf")
         print(" > Starting with inf best loss.")
     else:
-        print(" > Restoring best loss from "
-              f"{os.path.basename(args.best_path)} ...")
-        best_loss = torch.load(args.best_path,
-                               map_location="cpu")["model_loss"]
+        print(" > Restoring best loss from " f"{os.path.basename(args.best_path)} ...")
+        best_loss = torch.load(args.best_path, map_location="cpu")["model_loss"]
         print(f" > Starting with loaded last best loss {best_loss}.")
     keep_all_best = config.keep_all_best
     keep_after = config.keep_after  # void if keep_all_best False
@@ -576,9 +520,10 @@ def main(args):  # pylint: disable=redefined-outer-name
                 phase = 0
             else:
                 phase = (
-                    len(config.phase_start_steps) -
-                    [i < global_step
-                     for i in config.phase_start_steps][::-1].index(True) - 1)
+                    len(config.phase_start_steps)
+                    - [i < global_step for i in config.phase_start_steps][::-1].index(True)
+                    - 1
+                )
         else:
             phase = None
         return phase
@@ -587,12 +532,10 @@ def main(args):  # pylint: disable=redefined-outer-name
         cur_phase = set_phase()
         print(f"\n > Current AlignTTS phase: {cur_phase}")
         c_logger.print_epoch_start(epoch, config.epochs)
-        train_avg_loss_dict, global_step = train(train_loader, model,
-                                                 criterion, optimizer,
-                                                 scheduler, ap, global_step,
-                                                 epoch, cur_phase)
-        eval_avg_loss_dict = evaluate(eval_loader, model, criterion, ap,
-                                      global_step, epoch, cur_phase)
+        train_avg_loss_dict, global_step = train(
+            train_loader, model, criterion, optimizer, scheduler, ap, global_step, epoch, cur_phase
+        )
+        eval_avg_loss_dict = evaluate(eval_loader, model, criterion, ap, global_step, epoch, cur_phase)
         c_logger.print_epoch_end(epoch, eval_avg_loss_dict)
         target_loss = train_avg_loss_dict["avg_loss"]
         if config.run_eval:
@@ -613,8 +556,7 @@ def main(args):  # pylint: disable=redefined-outer-name
 
 
 if __name__ == "__main__":
-    args, config, OUT_PATH, AUDIO_PATH, c_logger, tb_logger = init_training(
-        sys.argv)
+    args, config, OUT_PATH, AUDIO_PATH, c_logger, tb_logger = init_training(sys.argv)
 
     try:
         main(args)
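
`NoamLR` above applies the Noam warmup schedule: the learning rate ramps up for `warmup_steps`, then decays with the inverse square root of the step. A sketch of the rule, assuming the standard normalized Noam form (the class itself lives elsewhere in the repo):

    def noam_lr(base_lr: float, step: int, warmup_steps: int) -> float:
        # linear warmup for `warmup_steps` steps, then 1/sqrt(step) decay;
        # the multiplier equals 1.0 exactly at step == warmup_steps
        step = max(step, 1)
        return base_lr * warmup_steps**0.5 * min(step * warmup_steps**-1.5, step**-0.5)
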
diff --git a/TTS/config/__init__.py b/TTS/config/__init__.py
index 85e7d9b9..29ba1190 100644
--- a/TTS/config/__init__.py
+++ b/TTS/config/__init__.py
@@ -1,10 +1,9 @@
-from TTS.config.shared_configs import *
-
 import json
 import os
 
 import yaml
 
+from TTS.config.shared_configs import *
 from TTS.utils.generic_utils import find_module
 
 
diff --git a/TTS/tts/configs/align_tts_config.py b/TTS/tts/configs/align_tts_config.py
index fae4c608..6e09e398 100644
--- a/TTS/tts/configs/align_tts_config.py
+++ b/TTS/tts/configs/align_tts_config.py
@@ -14,20 +14,12 @@ class AlignTTSConfig(BaseTTSConfig):
     hidden_channels: int = 256
     encoder_type: str = "fftransformer"
     encoder_params: dict = field(
-        default_factory=lambda: {
-            "hidden_channels_ffn": 1024,
-            "num_heads": 2,
-            "num_layers": 6,
-            "dropout_p": 0.1
-        })
+        default_factory=lambda: {"hidden_channels_ffn": 1024, "num_heads": 2, "num_layers": 6, "dropout_p": 0.1}
+    )
     decoder_type: str = "fftransformer"
     decoder_params: dict = field(
-        default_factory=lambda: {
-            "hidden_channels_ffn": 1024,
-            "num_heads": 2,
-            "num_layers": 6,
-            "dropout_p": 0.1
-        })
+        default_factory=lambda: {"hidden_channels_ffn": 1024, "num_heads": 2, "num_layers": 6, "dropout_p": 0.1}
+    )
     phase_start_steps: list = None
 
     ssim_alpha: float = 1.0
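
`phase_start_steps` above feeds the `set_phase` logic reformatted earlier in `train_align_tts.py`; the expression picks the index of the last start step already passed. A worked example with hypothetical values:

    phase_start_steps = [0, 40000, 80000, 160000, 170000]  # hypothetical schedule
    global_step = 90000
    mask = [i < global_step for i in phase_start_steps]  # [True, True, True, False, False]
    phase = len(phase_start_steps) - mask[::-1].index(True) - 1
    assert phase == 2  # 80000 is the last start step below global_step
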
diff --git a/TTS/tts/utils/synthesis.py b/TTS/tts/utils/synthesis.py
index 405cf2dc..9f417a1d 100644
--- a/TTS/tts/utils/synthesis.py
+++ b/TTS/tts/utils/synthesis.py
@@ -256,7 +256,7 @@ def synthesis(
     """
     # GST processing
     style_mel = None
-    if CONFIG.has('gst') and CONFIG.gst and style_wav is not None:
+    if CONFIG.has("gst") and CONFIG.gst and style_wav is not None:
         if isinstance(style_wav, dict):
             style_mel = style_wav
         else:
diff --git a/TTS/utils/arguments.py b/TTS/utils/arguments.py
index cf64edae..fc969593 100644
--- a/TTS/utils/arguments.py
+++ b/TTS/utils/arguments.py
@@ -9,11 +9,11 @@ import re
 
 import torch
 
+from TTS.config import load_config
 from TTS.tts.utils.text.symbols import parse_symbols
 from TTS.utils.console_logger import ConsoleLogger
 from TTS.utils.generic_utils import create_experiment_folder, get_git_branch
 from TTS.utils.io import copy_model_files
-from TTS.config import load_config
 from TTS.utils.tensorboard_logger import TensorboardLogger
 
 
diff --git a/TTS/vocoder/configs/fullband_melgan_config.py b/TTS/vocoder/configs/fullband_melgan_config.py
index 9698d36d..d206451f 100644
--- a/TTS/vocoder/configs/fullband_melgan_config.py
+++ b/TTS/vocoder/configs/fullband_melgan_config.py
@@ -6,22 +6,18 @@ from .shared_configs import BaseGANVocoderConfig
 @dataclass
 class FullbandMelganConfig(BaseGANVocoderConfig):
     """Defines parameters for FullbandMelGAN vocoder."""
+
     model: str = "melgan"
 
     # Model specific params
     discriminator_model: str = "melgan_multiscale_discriminator"
     discriminator_model_params: dict = field(
-        default_factory=lambda: {
-            "base_channels": 16,
-            "max_channels": 512,
-            "downsample_factors": [4, 4, 4]
-        })
+        default_factory=lambda: {"base_channels": 16, "max_channels": 512, "downsample_factors": [4, 4, 4]}
+    )
     generator_model: str = "melgan_generator"
     generator_model_params: dict = field(
-        default_factory=lambda: {
-            "upsample_factors": [8, 8, 2, 2],
-            "num_res_blocks": 4
-        })
+        default_factory=lambda: {"upsample_factors": [8, 8, 2, 2], "num_res_blocks": 4}
+    )
 
     # Training - overrides
     batch_size: int = 16
@@ -42,8 +38,9 @@ class FullbandMelganConfig(BaseGANVocoderConfig):
         default_factory=lambda: {
             "n_ffts": [1024, 2048, 512],
             "hop_lengths": [120, 240, 50],
-            "win_lengths": [600, 1200, 240]
-        })
+            "win_lengths": [600, 1200, 240],
+        }
+    )
 
     # loss weights - overrides
     stft_loss_weight: float = 0.5
diff --git a/TTS/vocoder/configs/hifigan_config.py b/TTS/vocoder/configs/hifigan_config.py
index 072bd27f..40b5fc26 100644
--- a/TTS/vocoder/configs/hifigan_config.py
+++ b/TTS/vocoder/configs/hifigan_config.py
@@ -18,8 +18,9 @@ class HifiganConfig(BaseGANVocoderConfig):
             "upsample_initial_channel": 512,
             "resblock_kernel_sizes": [3, 7, 11],
             "resblock_dilation_sizes": [[1, 3, 5], [1, 3, 5], [1, 3, 5]],
-            "resblock_type": "1"
-        })
+            "resblock_type": "1",
+        }
+    )
 
     # LOSS PARAMETERS - overrides
     use_stft_loss: bool = False
@@ -45,9 +46,10 @@ class HifiganConfig(BaseGANVocoderConfig):
             "win_length": 1024,
             "n_mels": 80,
             "mel_fmin": 0.0,
-            "mel_fmax": None
-        })
+            "mel_fmax": None,
+        }
+    )
 
     # optimizer parameters
     lr: float = 1e-4
-    wd: float = 1e-6
\ No newline at end of file
+    wd: float = 1e-6
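
The dict reshaping running through these config diffs follows black's "magic trailing comma" rule: a trailing comma after the last entry pins one item per line, while omitting it lets black collapse a literal that fits within the line length (the repo appears to use an extended ~120-character limit — an assumption, not stated in the patch):

    # Exploded: the trailing comma keeps one entry per line.
    stft_loss_params = {
        "n_ffts": [1024, 2048, 512],
        "hop_lengths": [120, 240, 50],
        "win_lengths": [600, 1200, 240],
    }
    # Collapsed: no trailing comma, and the literal fits on one line.
    generator_model_params = {"upsample_factors": [8, 8, 2, 2], "num_res_blocks": 4}
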
diff --git a/TTS/vocoder/configs/melgan_config.py b/TTS/vocoder/configs/melgan_config.py
index f000be6a..f67c7d1e 100644
--- a/TTS/vocoder/configs/melgan_config.py
+++ b/TTS/vocoder/configs/melgan_config.py
@@ -6,22 +6,18 @@ from .shared_configs import BaseGANVocoderConfig
 @dataclass
 class MelganConfig(BaseGANVocoderConfig):
     """Defines parameters for MelGAN vocoder."""
+
     model: str = "melgan"
 
     # Model specific params
     discriminator_model: str = "melgan_multiscale_discriminator"
     discriminator_model_params: dict = field(
-        default_factory=lambda: {
-            "base_channels": 16,
-            "max_channels": 1024,
-            "downsample_factors": [4, 4, 4, 4]
-        })
+        default_factory=lambda: {"base_channels": 16, "max_channels": 1024, "downsample_factors": [4, 4, 4, 4]}
+    )
     generator_model: str = "melgan_generator"
     generator_model_params: dict = field(
-        default_factory=lambda: {
-            "upsample_factors": [8, 8, 2, 2],
-            "num_res_blocks": 3
-        })
+        default_factory=lambda: {"upsample_factors": [8, 8, 2, 2], "num_res_blocks": 3}
+    )
 
     # Training - overrides
     batch_size: int = 16
@@ -42,8 +38,9 @@ class MelganConfig(BaseGANVocoderConfig):
         default_factory=lambda: {
             "n_ffts": [1024, 2048, 512],
             "hop_lengths": [120, 240, 50],
-            "win_lengths": [600, 1200, 240]
-        })
+            "win_lengths": [600, 1200, 240],
+        }
+    )
 
     # loss weights - overrides
     stft_loss_weight: float = 0.5
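
For context on the `stft_loss_params` being reformatted here: the three `(n_fft, hop_length, win_length)` settings define a multi-resolution STFT loss over magnitude spectra. A minimal sketch of the idea, under the assumption of a plain L1 magnitude distance (the actual loss in `TTS.vocoder` adds further terms):

    import torch
    import torch.nn.functional as F

    def stft_mag(x, n_fft, hop, win):
        # magnitude spectrogram at one resolution
        window = torch.hann_window(win)
        return torch.stft(x, n_fft, hop_length=hop, win_length=win, window=window, return_complex=True).abs()

    y, y_hat = torch.randn(2, 16000), torch.randn(2, 16000)  # reference / generated audio
    resolutions = zip([1024, 2048, 512], [120, 240, 50], [600, 1200, 240])
    loss = sum(F.l1_loss(stft_mag(y_hat, n, h, w), stft_mag(y, n, h, w)) for n, h, w in resolutions) / 3
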
diff --git a/TTS/vocoder/configs/multiband_melgan_config.py b/TTS/vocoder/configs/multiband_melgan_config.py
index 70745b5c..f8a99152 100644
--- a/TTS/vocoder/configs/multiband_melgan_config.py
+++ b/TTS/vocoder/configs/multiband_melgan_config.py
@@ -6,42 +6,31 @@ from .shared_configs import BaseGANVocoderConfig
 @dataclass
 class MultibandMelganConfig(BaseGANVocoderConfig):
     """Defines parameters for MultiBandMelGAN vocoder."""
+
     model: str = "multiband_melgan"
 
     # Model specific params
     discriminator_model: str = "melgan_multiscale_discriminator"
     discriminator_model_params: dict = field(
-        default_factory=lambda: {
-            "base_channels": 16,
-            "max_channels": 512,
-            "downsample_factors": [4, 4, 4]
-        })
+        default_factory=lambda: {"base_channels": 16, "max_channels": 512, "downsample_factors": [4, 4, 4]}
+    )
     generator_model: str = "multiband_melgan_generator"
-    generator_model_params: dict = field(
-        default_factory=lambda: {
-            "upsample_factors": [8, 4, 2],
-            "num_res_blocks": 4
-        })
+    generator_model_params: dict = field(default_factory=lambda: {"upsample_factors": [8, 4, 2], "num_res_blocks": 4})
     use_pqmf: bool = True
 
     # optimizer - overrides
     lr_gen: float = 0.0001  # Initial learning rate.
     lr_disc: float = 0.0001  # Initial learning rate.
     optimizer: str = "AdamW"
-    optimizer_params: dict = field(default_factory=lambda: {
-        "betas": [0.8, 0.99],
-        "weight_decay": 0.0
-    })
+    optimizer_params: dict = field(default_factory=lambda: {"betas": [0.8, 0.99], "weight_decay": 0.0})
     lr_scheduler_gen: str = "MultiStepLR"  # one of the schedulers from https://pytorch.org/docs/stable/optim.html
-    lr_scheduler_gen_params: dict = field(default_factory=lambda: {
-        "gamma": 0.5,
-        "milestones": [100000, 200000, 300000, 400000, 500000, 600000]
-    })
+    lr_scheduler_gen_params: dict = field(
+        default_factory=lambda: {"gamma": 0.5, "milestones": [100000, 200000, 300000, 400000, 500000, 600000]}
+    )
     lr_scheduler_disc: str = "MultiStepLR"  # one of the schedulers from https://pytorch.org/docs/stable/optim.html
-    lr_scheduler_disc_params: dict = field(default_factory=lambda: {
-        "gamma": 0.5,
-        "milestones": [100000, 200000, 300000, 400000, 500000, 600000]
-    })
+    lr_scheduler_disc_params: dict = field(
+        default_factory=lambda: {"gamma": 0.5, "milestones": [100000, 200000, 300000, 400000, 500000, 600000]}
+    )
 
     # Training - overrides
     batch_size: int = 64
@@ -60,11 +49,8 @@ class MultibandMelganConfig(BaseGANVocoderConfig):
     use_l1_spec_loss: bool = False
 
     subband_stft_loss_params: dict = field(
-        default_factory=lambda: {
-            "n_ffts": [384, 683, 171],
-            "hop_lengths": [30, 60, 10],
-            "win_lengths": [150, 300, 60]
-        })
+        default_factory=lambda: {"n_ffts": [384, 683, 171], "hop_lengths": [30, 60, 10], "win_lengths": [150, 300, 60]}
+    )
 
     # loss weights - overrides
     stft_loss_weight: float = 0.5
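
The scheduler fields above map onto `torch.optim.lr_scheduler.MultiStepLR`: with `gamma=0.5`, the learning rate halves at each milestone step. A sketch of the wiring, with a dummy module standing in for the vocoder:

    import torch

    model = torch.nn.Linear(10, 1)  # stand-in for the generator
    optimizer = torch.optim.AdamW(model.parameters(), lr=0.0001, betas=(0.8, 0.99), weight_decay=0.0)
    scheduler = torch.optim.lr_scheduler.MultiStepLR(
        optimizer, gamma=0.5, milestones=[100000, 200000, 300000, 400000, 500000, 600000]
    )
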
diff --git a/TTS/vocoder/configs/parallel_wavegan_config.py b/TTS/vocoder/configs/parallel_wavegan_config.py
index 28d8107f..79afa228 100644
--- a/TTS/vocoder/configs/parallel_wavegan_config.py
+++ b/TTS/vocoder/configs/parallel_wavegan_config.py
@@ -6,21 +6,16 @@ from .shared_configs import BaseGANVocoderConfig
 @dataclass
 class ParallelWaveganConfig(BaseGANVocoderConfig):
     """Defines parameters for ParallelWavegan vocoder."""
+
     model: str = "parallel_wavegan"
 
     # Model specific params
     discriminator_model: str = "parallel_wavegan_discriminator"
-    discriminator_model_params: dict = field(
-        default_factory=lambda: {
-            "num_layers": 10
-        })
+    discriminator_model_params: dict = field(default_factory=lambda: {"num_layers": 10})
     generator_model: str = "parallel_wavegan_generator"
     generator_model_params: dict = field(
-        default_factory=lambda: {
-            "upsample_factors":[4, 4, 4, 4],
-            "stacks": 3,
-            "num_res_blocks": 30
-        })
+        default_factory=lambda: {"upsample_factors": [4, 4, 4, 4], "stacks": 3, "num_res_blocks": 30}
+    )
 
     # Training - overrides
     batch_size: int = 6
@@ -42,8 +37,9 @@ class ParallelWaveganConfig(BaseGANVocoderConfig):
         default_factory=lambda: {
             "n_ffts": [1024, 2048, 512],
             "hop_lengths": [120, 240, 50],
-            "win_lengths": [600, 1200, 240]
-        })
+            "win_lengths": [600, 1200, 240],
+        }
+    )
 
     # loss weights - overrides
     stft_loss_weight: float = 0.5
@@ -57,17 +53,8 @@ class ParallelWaveganConfig(BaseGANVocoderConfig):
     lr_gen: float = 0.0002  # Initial learning rate.
     lr_disc: float = 0.0002  # Initial learning rate.
     optimizer: str = "AdamW"
-    optimizer_params: dict = field(default_factory=lambda: {
-        "betas": [0.8, 0.99],
-        "weight_decay": 0.0
-    })
+    optimizer_params: dict = field(default_factory=lambda: {"betas": [0.8, 0.99], "weight_decay": 0.0})
     lr_scheduler_gen: str = "ExponentialLR"  # one of the schedulers from https://pytorch.org/docs/stable/optim.html
-    lr_scheduler_gen_params: dict = field(default_factory=lambda: {
-        "gamma": 0.999,
-        "last_epoch": -1
-    })
+    lr_scheduler_gen_params: dict = field(default_factory=lambda: {"gamma": 0.999, "last_epoch": -1})
     lr_scheduler_disc: str = "ExponentialLR"  # one of the schedulers from https://pytorch.org/docs/stable/optim.html
-    lr_scheduler_disc_params: dict = field(default_factory=lambda: {
-        "gamma": 0.999,
-        "last_epoch": -1
-    })
+    lr_scheduler_disc_params: dict = field(default_factory=lambda: {"gamma": 0.999, "last_epoch": -1})
diff --git a/TTS/vocoder/configs/shared_configs.py b/TTS/vocoder/configs/shared_configs.py
index 0d64c622..d403f84c 100644
--- a/TTS/vocoder/configs/shared_configs.py
+++ b/TTS/vocoder/configs/shared_configs.py
@@ -9,6 +9,7 @@ from TTS.config import BaseAudioConfig, BaseDatasetConfig, BaseTrainingConfig
 @dataclass
 class BaseVocoderConfig(BaseTrainingConfig):
     """Shared parameters among all the vocoder models."""
+
     audio: BaseAudioConfig = field(default_factory=BaseAudioConfig)
     # dataloading
     use_noise_augment: bool = False  # enable/disable random noise augmentation in spectrograms.
@@ -29,6 +30,7 @@ class BaseVocoderConfig(BaseTrainingConfig):
 @dataclass
 class BaseGANVocoderConfig(BaseVocoderConfig):
     """Common config interface for all the GAN based vocoder models."""
+
     # LOSS PARAMETERS
     use_stft_loss: bool = True
     use_subband_stft_loss: bool = True
@@ -49,8 +51,9 @@ class BaseGANVocoderConfig(BaseVocoderConfig):
         default_factory=lambda: {
             "n_ffts": [1024, 2048, 512],
             "hop_lengths": [120, 240, 50],
-            "win_lengths": [600, 1200, 240]
-        })
+            "win_lengths": [600, 1200, 240],
+        }
+    )
 
     l1_spec_loss_params: dict = field(
         default_factory=lambda: {
@@ -61,8 +64,9 @@ class BaseGANVocoderConfig(BaseVocoderConfig):
             "win_length": 1024,
             "n_mels": 80,
             "mel_fmin": 0.0,
-            "mel_fmax": None
-        })
+            "mel_fmax": None,
+        }
+    )
 
     target_loss: str = "avg_G_loss"  # loss value to pick the best model to save after each epoch
 
@@ -72,20 +76,11 @@ class BaseGANVocoderConfig(BaseVocoderConfig):
     lr_gen: float = 0.0002  # Initial learning rate.
     lr_disc: float = 0.0002  # Initial learning rate.
     optimizer: str = "AdamW"
-    optimizer_params: dict = field(default_factory=lambda: {
-        "betas": [0.8, 0.99],
-        "weight_decay": 0.0
-    })
+    optimizer_params: dict = field(default_factory=lambda: {"betas": [0.8, 0.99], "weight_decay": 0.0})
     lr_scheduler_gen: str = "ExponentialLR"  # one of the schedulers from https://pytorch.org/docs/stable/optim.html
-    lr_scheduler_gen_params: dict = field(default_factory=lambda: {
-        "gamma": 0.999,
-        "last_epoch": -1
-    })
+    lr_scheduler_gen_params: dict = field(default_factory=lambda: {"gamma": 0.999, "last_epoch": -1})
     lr_scheduler_disc: str = "ExponentialLR"  # one of the schedulers from https://pytorch.org/docs/stable/optim.html
-    lr_scheduler_disc_params: dict = field(default_factory=lambda: {
-        "gamma": 0.999,
-        "last_epoch": -1
-    })
+    lr_scheduler_disc_params: dict = field(default_factory=lambda: {"gamma": 0.999, "last_epoch": -1})
     use_pqmf: bool = False  # enable/disable using pqmf for multi-band training. (Multi-band MelGAN)
     steps_to_start_discriminator = 0  # start training the discriminator after this number of steps.
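
`optimizer` and `optimizer_params` in these GAN configs name a `torch.optim` class and its keyword arguments. A hedged sketch of how a trainer can resolve them by name — whether TTS does exactly this is not shown in the patch:

    import torch

    optimizer_name, optimizer_params = "AdamW", {"betas": [0.8, 0.99], "weight_decay": 0.0}
    model = torch.nn.Linear(10, 1)  # stand-in for the generator
    optimizer = getattr(torch.optim, optimizer_name)(model.parameters(), lr=0.0002, **optimizer_params)
    scheduler = torch.optim.lr_scheduler.ExponentialLR(optimizer, gamma=0.999, last_epoch=-1)
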
diff --git a/TTS/vocoder/configs/wavegrad_config.py b/TTS/vocoder/configs/wavegrad_config.py
index 46ff5290..7638988f 100644
--- a/TTS/vocoder/configs/wavegrad_config.py
+++ b/TTS/vocoder/configs/wavegrad_config.py
@@ -6,24 +6,22 @@ from .shared_configs import BaseVocoderConfig
 @dataclass
 class WavegradConfig(BaseVocoderConfig):
     """Defines parameters for Wavegrad vocoder."""
-    model: str = 'wavegrad'
+
+    model: str = "wavegrad"
     # Model specific params
     generator_model: str = "wavegrad"
     model_params: dict = field(
         default_factory=lambda: {
-            "use_weight_norm":
-            True,
-            "y_conv_channels":
-            32,
-            "x_conv_channels":
-            768,
+            "use_weight_norm": True,
+            "y_conv_channels": 32,
+            "x_conv_channels": 768,
             "ublock_out_channels": [512, 512, 256, 128, 128],
             "dblock_out_channels": [128, 128, 256, 512],
             "upsample_factors": [4, 4, 4, 2, 2],
-            "upsample_dilations": [[1, 2, 1, 2], [1, 2, 1, 2], [1, 2, 4, 8],
-                                   [1, 2, 4, 8], [1, 2, 4, 8]]
-        })
-    target_loss: str = 'avg_wavegrad_loss'  # loss value to pick the best model to save after each epoch
+            "upsample_dilations": [[1, 2, 1, 2], [1, 2, 1, 2], [1, 2, 4, 8], [1, 2, 4, 8], [1, 2, 4, 8]],
+        }
+    )
+    target_loss: str = "avg_wavegrad_loss"  # loss value to pick the best model to save after each epoch
 
     # Training - overrides
     epochs: int = 10000
@@ -35,24 +33,20 @@ class WavegradConfig(BaseVocoderConfig):
     eval_split_size: int = 50
 
     # NOISE SCHEDULE PARAMS
-    train_noise_schedule: dict = field(default_factory=lambda: {
-        "min_val": 1e-6,
-        "max_val": 1e-2,
-        "num_steps": 1000
-    })
+    train_noise_schedule: dict = field(default_factory=lambda: {"min_val": 1e-6, "max_val": 1e-2, "num_steps": 1000})
 
-    test_noise_schedule: dict = field(default_factory=lambda: {  # inference noise schedule. Try TTS/bin/tune_wavegrad.py to find the optimal values.
-        "min_val": 1e-6,
-        "max_val": 1e-2,
-        "num_steps": 50
-    })
+    test_noise_schedule: dict = field(
+        default_factory=lambda: {  # inference noise schedule. Try TTS/bin/tune_wavegrad.py to find the optimal values.
+            "min_val": 1e-6,
+            "max_val": 1e-2,
+            "num_steps": 50,
+        }
+    )
 
     # optimizer overrides
     grad_clip: float = 1.0
     lr: float = 1e-4  # Initial learning rate.
     lr_scheduler: str = "MultiStepLR"  # one of the schedulers from https://pytorch.org/docs/stable/optim.html
     lr_scheduler_params: dict = field(
-        default_factory=lambda: {
-            "gamma": 0.5,
-            "milestones": [100000, 200000, 300000, 400000, 500000, 600000]
-        })
+        default_factory=lambda: {"gamma": 0.5, "milestones": [100000, 200000, 300000, 400000, 500000, 600000]}
+    )
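
The noise-schedule dicts above only store endpoints and a step count; the inline comment points to `TTS/bin/tune_wavegrad.py` for tuning. One plausible expansion is a linear beta grid in the usual WaveGrad setup — an assumption here, not code from this patch:

    import numpy as np

    schedule = {"min_val": 1e-6, "max_val": 1e-2, "num_steps": 1000}
    betas = np.linspace(schedule["min_val"], schedule["max_val"], schedule["num_steps"])
    alphas = 1.0 - betas
    noise_levels = np.sqrt(np.cumprod(alphas))  # assumed per-step noise level for the diffusion process
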
diff --git a/TTS/vocoder/configs/wavernn_config.py b/TTS/vocoder/configs/wavernn_config.py
index 636b0b23..daa586f6 100644
--- a/TTS/vocoder/configs/wavernn_config.py
+++ b/TTS/vocoder/configs/wavernn_config.py
@@ -6,10 +6,11 @@ from .shared_configs import BaseVocoderConfig
 @dataclass
 class WavernnConfig(BaseVocoderConfig):
     """Defines parameters for Wavernn vocoder."""
+
     model: str = "wavernn"
 
     # Model specific params
-    mode: str = 'mold'  # mold [string], gauss [string], bits [int]
+    mode: str = "mold"  # mold [string], gauss [string], bits [int]
     mulaw: bool = True  # apply mulaw if mode is bits
     generator_model: str = "WaveRNN"
     wavernn_model_params: dict = field(
@@ -21,9 +22,9 @@ class WavernnConfig(BaseVocoderConfig):
             "num_res_blocks": 10,
             "use_aux_net": True,
             "use_upsample_net": True,
-            "upsample_factors":
-            [4, 8, 8]  # this needs to correctly factorise hop_length
-        })
+            "upsample_factors": [4, 8, 8],  # this needs to correctly factorise hop_length
+        }
+    )
 
     # Inference
     batched: bool = True
@@ -46,7 +47,4 @@ class WavernnConfig(BaseVocoderConfig):
     grad_clip: float = 4.0
     lr: float = 1e-4  # Initial learning rate.
     lr_scheduler: str = "MultiStepLR"  # one of the schedulers from https://pytorch.org/docs/stable/optim.html
-    lr_scheduler_params: dict = field(default_factory=lambda: {
-        "gamma": 0.5,
-        "milestones": [200000, 400000, 600000]
-    })
+    lr_scheduler_params: dict = field(default_factory=lambda: {"gamma": 0.5, "milestones": [200000, 400000, 600000]})
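
The retained comment above ("this needs to correctly factorise hop_length") is a hard constraint: WaveRNN's upsample network must expand each mel frame to exactly `hop_length` samples, so the product of `upsample_factors` has to equal `audio.hop_length` (256 is the assumed default here):

    upsample_factors = [4, 8, 8]
    hop_length = 256  # assumed default; keep audio.hop_length consistent with the factors

    product = 1
    for factor in upsample_factors:
        product *= factor
    assert product == hop_length  # 4 * 8 * 8 == 256
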
diff --git a/tests/vocoder_tests/test_fullband_melgan_train.py b/tests/vocoder_tests/test_fullband_melgan_train.py
index 358552c4..64355af9 100644
--- a/tests/vocoder_tests/test_fullband_melgan_train.py
+++ b/tests/vocoder_tests/test_fullband_melgan_train.py
@@ -21,16 +21,14 @@ config = FullbandMelganConfig(
     print_step=1,
     print_eval=True,
     data_path="tests/data/ljspeech",
-    output_path=output_path
+    output_path=output_path,
 )
 config.audio.do_trim_silence = True
 config.audio.trim_db = 60
 config.save_json(config_path)
 
 # train the model for one epoch
-command_train = (
-    f"CUDA_VISIBLE_DEVICES='' python TTS/bin/train_vocoder_gan.py --config_path {config_path} "
-)
+command_train = f"CUDA_VISIBLE_DEVICES='' python TTS/bin/train_vocoder_gan.py --config_path {config_path} "
 run_cli(command_train)
 
 # Find latest folder
diff --git a/tests/vocoder_tests/test_hifigan_train.py b/tests/vocoder_tests/test_hifigan_train.py
index 83a3f4b8..fa431eb3 100644
--- a/tests/vocoder_tests/test_hifigan_train.py
+++ b/tests/vocoder_tests/test_hifigan_train.py
@@ -22,16 +22,14 @@ config = HifiganConfig(
     print_step=1,
     print_eval=True,
     data_path="tests/data/ljspeech",
-    output_path=output_path
+    output_path=output_path,
 )
 config.audio.do_trim_silence = True
 config.audio.trim_db = 60
 config.save_json(config_path)
 
 # train the model for one epoch
-command_train = (
-    f"CUDA_VISIBLE_DEVICES='' python TTS/bin/train_vocoder_gan.py --config_path {config_path} "
-)
+command_train = f"CUDA_VISIBLE_DEVICES='' python TTS/bin/train_vocoder_gan.py --config_path {config_path} "
 run_cli(command_train)
 
 # Find latest folder
diff --git a/tests/vocoder_tests/test_melgan_train.py b/tests/vocoder_tests/test_melgan_train.py
index 65b7346a..b362ce86 100644
--- a/tests/vocoder_tests/test_melgan_train.py
+++ b/tests/vocoder_tests/test_melgan_train.py
@@ -21,16 +21,14 @@ config = MelganConfig(
     print_step=1,
     print_eval=True,
     data_path="tests/data/ljspeech",
-    output_path=output_path
+    output_path=output_path,
 )
 config.audio.do_trim_silence = True
 config.audio.trim_db = 60
 config.save_json(config_path)
 
 # train the model for one epoch
-command_train = (
-    f"CUDA_VISIBLE_DEVICES='' python TTS/bin/train_vocoder_gan.py --config_path {config_path} "
-)
+command_train = f"CUDA_VISIBLE_DEVICES='' python TTS/bin/train_vocoder_gan.py --config_path {config_path} "
 run_cli(command_train)
 
 # Find latest folder
diff --git a/tests/vocoder_tests/test_multiband_melgan_train.py b/tests/vocoder_tests/test_multiband_melgan_train.py
index 8ededcce..bd2ae86f 100644
--- a/tests/vocoder_tests/test_multiband_melgan_train.py
+++ b/tests/vocoder_tests/test_multiband_melgan_train.py
@@ -21,16 +21,14 @@ config = MultibandMelganConfig(
     print_step=1,
     print_eval=True,
     data_path="tests/data/ljspeech",
-    output_path=output_path
+    output_path=output_path,
 )
 config.audio.do_trim_silence = True
 config.audio.trim_db = 60
 config.save_json(config_path)
 
 # train the model for one epoch
-command_train = (
-    f"CUDA_VISIBLE_DEVICES='' python TTS/bin/train_vocoder_gan.py --config_path {config_path} "
-)
+command_train = f"CUDA_VISIBLE_DEVICES='' python TTS/bin/train_vocoder_gan.py --config_path {config_path} "
 run_cli(command_train)
 
 # Find latest folder
diff --git a/tests/vocoder_tests/test_parallel_wavegan_train.py b/tests/vocoder_tests/test_parallel_wavegan_train.py
index a2edd0c5..5d89d069 100644
--- a/tests/vocoder_tests/test_parallel_wavegan_train.py
+++ b/tests/vocoder_tests/test_parallel_wavegan_train.py
@@ -21,16 +21,14 @@ config = ParallelWaveganConfig(
     print_step=1,
     print_eval=True,
     data_path="tests/data/ljspeech",
-    output_path=output_path
+    output_path=output_path,
 )
 config.audio.do_trim_silence = True
 config.audio.trim_db = 60
 config.save_json(config_path)
 
 # train the model for one epoch
-command_train = (
-    f"CUDA_VISIBLE_DEVICES='' python TTS/bin/train_vocoder_gan.py --config_path {config_path} "
-)
+command_train = f"CUDA_VISIBLE_DEVICES='' python TTS/bin/train_vocoder_gan.py --config_path {config_path} "
 run_cli(command_train)
 
 # Find latest folder
diff --git a/tests/vocoder_tests/test_wavegrad_train.py b/tests/vocoder_tests/test_wavegrad_train.py
index ffa450b8..c2269bbd 100644
--- a/tests/vocoder_tests/test_wavegrad_train.py
+++ b/tests/vocoder_tests/test_wavegrad_train.py
@@ -21,16 +21,14 @@ config = WavegradConfig(
     print_step=1,
     print_eval=True,
     data_path="tests/data/ljspeech",
-    output_path=output_path
+    output_path=output_path,
 )
 config.audio.do_trim_silence = True
 config.audio.trim_db = 60
 config.save_json(config_path)
 
 # train the model for one epoch
-command_train = (
-    f"CUDA_VISIBLE_DEVICES='' python TTS/bin/train_vocoder_wavegrad.py --config_path {config_path} "
-)
+command_train = f"CUDA_VISIBLE_DEVICES='' python TTS/bin/train_vocoder_wavegrad.py --config_path {config_path} "
 run_cli(command_train)
 
 # Find latest folder
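
All of these tests drive training through the same helper. A hedged sketch of what `run_cli` (imported from the `tests` package) amounts to; the real helper may differ in detail:

    import os

    def run_cli(command: str) -> None:
        # run a shell command and fail the test on a non-zero exit status
        exit_status = os.system(command)
        assert exit_status == 0, f" [!] command `{command}` failed."
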
diff --git a/tests/vocoder_tests/test_wavernn_train.py b/tests/vocoder_tests/test_wavernn_train.py
index 33fc4e57..1ac9d9eb 100644
--- a/tests/vocoder_tests/test_wavernn_train.py
+++ b/tests/vocoder_tests/test_wavernn_train.py
@@ -21,16 +21,14 @@ config = WavernnConfig(
     print_step=1,
     print_eval=True,
     data_path="tests/data/ljspeech",
-    output_path=output_path
+    output_path=output_path,
 )
 config.audio.do_trim_silence = True
 config.audio.trim_db = 60
 config.save_json(config_path)
 
 # train the model for one epoch
-command_train = (
-    f"CUDA_VISIBLE_DEVICES='' python TTS/bin/train_vocoder_wavernn.py --config_path {config_path} "
-)
+command_train = f"CUDA_VISIBLE_DEVICES='' python TTS/bin/train_vocoder_wavernn.py --config_path {config_path} "
 run_cli(command_train)
 
 # Find latest folder