From 4a989e3cebf68ef9ae2ab4f675fcfbbeb983288a Mon Sep 17 00:00:00 2001 From: sanjaesc Date: Sun, 25 Oct 2020 09:45:37 +0100 Subject: [PATCH] compute audio feat on dataload --- TTS/bin/train_wavernn_vocoder.py | 175 ++++++++++++------------ TTS/vocoder/configs/wavernn_config.json | 143 +++++++++---------- TTS/vocoder/datasets/wavernn_dataset.py | 68 ++++++--- TTS/vocoder/models/wavernn.py | 60 ++++---- 4 files changed, 243 insertions(+), 203 deletions(-) diff --git a/TTS/bin/train_wavernn_vocoder.py b/TTS/bin/train_wavernn_vocoder.py index 66a7c913..91a62cbe 100644 --- a/TTS/bin/train_wavernn_vocoder.py +++ b/TTS/bin/train_wavernn_vocoder.py @@ -29,8 +29,8 @@ from TTS.utils.generic_utils import ( from TTS.vocoder.datasets.wavernn_dataset import WaveRNNDataset from TTS.vocoder.datasets.preprocess import ( find_feat_files, - load_wav_feat_data, - preprocess_wav_files, + load_wav_data, + load_wav_feat_data ) from TTS.vocoder.utils.distribution import discretized_mix_logistic_loss, gaussian_loss from TTS.vocoder.utils.generic_utils import setup_wavernn @@ -41,15 +41,16 @@ use_cuda, num_gpus = setup_torch_training_env(True, True) def setup_loader(ap, is_val=False, verbose=False): - if is_val and not CONFIG.run_eval: + if is_val and not c.run_eval: loader = None else: dataset = WaveRNNDataset(ap=ap, items=eval_data if is_val else train_data, - seq_len=CONFIG.seq_len, + seq_len=c.seq_len, hop_len=ap.hop_length, - pad=CONFIG.padding, - mode=CONFIG.mode, + pad=c.padding, + mode=c.mode, + mulaw=c.mulaw, is_training=not is_val, verbose=verbose, ) @@ -57,10 +58,10 @@ def setup_loader(ap, is_val=False, verbose=False): loader = DataLoader(dataset, shuffle=True, collate_fn=dataset.collate, - batch_size=CONFIG.batch_size, - num_workers=CONFIG.num_val_loader_workers + batch_size=c.batch_size, + num_workers=c.num_val_loader_workers if is_val - else CONFIG.num_loader_workers, + else c.num_loader_workers, pin_memory=True, ) return loader @@ -89,9 +90,9 @@ def train(model, optimizer, criterion, scheduler, ap, global_step, epoch): keep_avg = KeepAverage() if use_cuda: batch_n_iter = int(len(data_loader.dataset) / - (CONFIG.batch_size * num_gpus)) + (c.batch_size * num_gpus)) else: - batch_n_iter = int(len(data_loader.dataset) / CONFIG.batch_size) + batch_n_iter = int(len(data_loader.dataset) / c.batch_size) end_time = time.time() c_logger.print_train_start() # train loop @@ -102,9 +103,6 @@ def train(model, optimizer, criterion, scheduler, ap, global_step, epoch): loader_time = time.time() - end_time global_step += 1 - ################## - # MODEL TRAINING # - ################## y_hat = model(x_input, mels) if isinstance(model.mode, int): @@ -112,7 +110,6 @@ def train(model, optimizer, criterion, scheduler, ap, global_step, epoch): else: y_coarse = y_coarse.float() y_coarse = y_coarse.unsqueeze(-1) - # m_scaled, _ = model.upsample(m) # compute losses loss = criterion(y_hat, y_coarse) @@ -120,11 +117,11 @@ def train(model, optimizer, criterion, scheduler, ap, global_step, epoch): raise RuntimeError(" [!] None loss. 
Exiting ...") optimizer.zero_grad() loss.backward() - if CONFIG.grad_clip > 0: + if c.grad_clip > 0: torch.nn.utils.clip_grad_norm_( - model.parameters(), CONFIG.grad_clip) - + model.parameters(), c.grad_clip) optimizer.step() + if scheduler is not None: scheduler.step() @@ -144,7 +141,7 @@ def train(model, optimizer, criterion, scheduler, ap, global_step, epoch): keep_avg.update_values(update_train_values) # print training stats - if global_step % CONFIG.print_step == 0: + if global_step % c.print_step == 0: log_dict = {"step_time": [step_time, 2], "loader_time": [loader_time, 4], "current_lr": cur_lr, @@ -164,8 +161,8 @@ def train(model, optimizer, criterion, scheduler, ap, global_step, epoch): tb_logger.tb_train_iter_stats(global_step, iter_stats) # save checkpoint - if global_step % CONFIG.save_step == 0: - if CONFIG.checkpoint: + if global_step % c.save_step == 0: + if c.checkpoint: # save model save_checkpoint(model, optimizer, @@ -180,28 +177,30 @@ def train(model, optimizer, criterion, scheduler, ap, global_step, epoch): ) # synthesize a full voice - wav_path = train_data[random.randrange(0, len(train_data))][0] + rand_idx = random.randrange(0, len(train_data)) + wav_path = train_data[rand_idx] if not isinstance( + train_data[rand_idx], (tuple, list)) else train_data[rand_idx][0] wav = ap.load_wav(wav_path) ground_mel = ap.melspectrogram(wav) sample_wav = model.generate(ground_mel, - CONFIG.batched, - CONFIG.target_samples, - CONFIG.overlap_samples, + c.batched, + c.target_samples, + c.overlap_samples, + use_cuda ) predict_mel = ap.melspectrogram(sample_wav) # compute spectrograms figures = {"train/ground_truth": plot_spectrogram(ground_mel.T), - "train/prediction": plot_spectrogram(predict_mel.T), + "train/prediction": plot_spectrogram(predict_mel.T) } + tb_logger.tb_train_figures(global_step, figures) # Sample audio tb_logger.tb_train_audios( global_step, { - "train/audio": sample_wav}, CONFIG.audio["sample_rate"] + "train/audio": sample_wav}, c.audio["sample_rate"] ) - - tb_logger.tb_train_figures(global_step, figures) end_time = time.time() # print epoch stats @@ -259,34 +258,35 @@ def evaluate(model, criterion, ap, global_step, epoch): keep_avg.update_values(update_eval_values) # print eval stats - if CONFIG.print_eval: + if c.print_eval: c_logger.print_eval_step( num_iter, loss_dict, keep_avg.avg_values) - if epoch % CONFIG.test_every_epochs == 0 and epoch != 0: - # synthesize a part of data - wav_path = eval_data[random.randrange(0, len(eval_data))][0] + if epoch % c.test_every_epochs == 0 and epoch != 0: + # synthesize a full voice + rand_idx = random.randrange(0, len(eval_data)) + wav_path = eval_data[rand_idx] if not isinstance( + eval_data[rand_idx], (tuple, list)) else eval_data[rand_idx][0] wav = ap.load_wav(wav_path) - ground_mel = ap.melspectrogram(wav[:22000]) + ground_mel = ap.melspectrogram(wav) sample_wav = model.generate(ground_mel, - CONFIG.batched, - CONFIG.target_samples, - CONFIG.overlap_samples, + c.batched, + c.target_samples, + c.overlap_samples, use_cuda ) predict_mel = ap.melspectrogram(sample_wav) - # compute spectrograms - figures = {"eval/ground_truth": plot_spectrogram(ground_mel.T), - "eval/prediction": plot_spectrogram(predict_mel.T), - } - # Sample audio tb_logger.tb_eval_audios( global_step, { - "eval/audio": sample_wav}, CONFIG.audio["sample_rate"] + "eval/audio": sample_wav}, c.audio["sample_rate"] ) + # compute spectrograms + figures = {"eval/ground_truth": plot_spectrogram(ground_mel.T), + "eval/prediction": plot_spectrogram(predict_mel.T) + 
} tb_logger.tb_eval_figures(global_step, figures) tb_logger.tb_eval_stats(global_step, keep_avg.avg_values) @@ -299,53 +299,62 @@ def main(args): # pylint: disable=redefined-outer-name global train_data, eval_data # setup audio processor - ap = AudioProcessor(**CONFIG.audio) + ap = AudioProcessor(**c.audio) - print(f" > Loading wavs from: {CONFIG.data_path}") - if CONFIG.feature_path is not None: - print(f" > Loading features from: {CONFIG.feature_path}") + # print(f" > Loading wavs from: {c.data_path}") + # if c.feature_path is not None: + # print(f" > Loading features from: {c.feature_path}") + # eval_data, train_data = load_wav_feat_data( + # c.data_path, c.feature_path, c.eval_split_size + # ) + # else: + # mel_feat_path = os.path.join(OUT_PATH, "mel") + # feat_data = find_feat_files(mel_feat_path) + # if feat_data: + # print(f" > Loading features from: {mel_feat_path}") + # eval_data, train_data = load_wav_feat_data( + # c.data_path, mel_feat_path, c.eval_split_size + # ) + # else: + # print(" > No feature data found. Preprocessing...") + # # preprocessing feature data from given wav files + # preprocess_wav_files(OUT_PATH, CONFIG, ap) + # eval_data, train_data = load_wav_feat_data( + # c.data_path, mel_feat_path, c.eval_split_size + # ) + + print(f" > Loading wavs from: {c.data_path}") + if c.feature_path is not None: + print(f" > Loading features from: {c.feature_path}") eval_data, train_data = load_wav_feat_data( - CONFIG.data_path, CONFIG.feature_path, CONFIG.eval_split_size - ) + c.data_path, c.feature_path, c.eval_split_size) else: - mel_feat_path = os.path.join(OUT_PATH, "mel") - feat_data = find_feat_files(mel_feat_path) - if feat_data: - print(f" > Loading features from: {mel_feat_path}") - eval_data, train_data = load_wav_feat_data( - CONFIG.data_path, mel_feat_path, CONFIG.eval_split_size - ) - else: - print(" > No feature data found. Preprocessing...") - # preprocessing feature data from given wav files - preprocess_wav_files(OUT_PATH, CONFIG, ap) - eval_data, train_data = load_wav_feat_data( - CONFIG.data_path, mel_feat_path, CONFIG.eval_split_size - ) + eval_data, train_data = load_wav_data( + c.data_path, c.eval_split_size) # setup model - model_wavernn = setup_wavernn(CONFIG) + model_wavernn = setup_wavernn(c) # define train functions - if CONFIG.mode == "mold": + if c.mode == "mold": criterion = discretized_mix_logistic_loss - elif CONFIG.mode == "gauss": + elif c.mode == "gauss": criterion = gaussian_loss - elif isinstance(CONFIG.mode, int): + elif isinstance(c.mode, int): criterion = torch.nn.CrossEntropyLoss() if use_cuda: model_wavernn.cuda() - if isinstance(CONFIG.mode, int): + if isinstance(c.mode, int): criterion.cuda() - optimizer = RAdam(model_wavernn.parameters(), lr=CONFIG.lr, weight_decay=0) + optimizer = RAdam(model_wavernn.parameters(), lr=c.lr, weight_decay=0) scheduler = None - if "lr_scheduler" in CONFIG: - scheduler = getattr(torch.optim.lr_scheduler, CONFIG.lr_scheduler) - scheduler = scheduler(optimizer, **CONFIG.lr_scheduler_params) + if "lr_scheduler" in c: + scheduler = getattr(torch.optim.lr_scheduler, c.lr_scheduler) + scheduler = scheduler(optimizer, **c.lr_scheduler_params) # slow start for the first 5 epochs - # lr_lambda = lambda epoch: min(epoch / CONFIG.warmup_steps, 1) + # lr_lambda = lambda epoch: min(epoch / c.warmup_steps, 1) # scheduler = optim.lr_scheduler.LambdaLR(optimizer, lr_lambda) # restore any checkpoint @@ -366,7 +375,7 @@ def main(args): # pylint: disable=redefined-outer-name # retore only matching layers. 
print(" > Partial model initialization...") model_dict = model_wavernn.state_dict() - model_dict = set_init_dict(model_dict, checkpoint["model"], CONFIG) + model_dict = set_init_dict(model_dict, checkpoint["model"], c) model_wavernn.load_state_dict(model_dict) print(" > Model restored from step %d" % @@ -386,11 +395,10 @@ def main(args): # pylint: disable=redefined-outer-name best_loss = float("inf") global_step = args.restore_step - for epoch in range(0, CONFIG.epochs): - c_logger.print_epoch_start(epoch, CONFIG.epochs) - _, global_step = train( - model_wavernn, optimizer, criterion, scheduler, ap, global_step, epoch - ) + for epoch in range(0, c.epochs): + c_logger.print_epoch_start(epoch, c.epochs) + _, global_step = train(model_wavernn, optimizer, + criterion, scheduler, ap, global_step, epoch) eval_avg_loss_dict = evaluate( model_wavernn, criterion, ap, global_step, epoch) c_logger.print_epoch_end(epoch, eval_avg_loss_dict) @@ -462,14 +470,14 @@ if __name__ == "__main__": print(f" > Training continues for {args.restore_path}") # setup output paths and read configs - CONFIG = load_config(args.config_path) + c = load_config(args.config_path) # check_config(c) _ = os.path.dirname(os.path.realpath(__file__)) OUT_PATH = args.continue_path if args.continue_path == "": OUT_PATH = create_experiment_folder( - CONFIG.output_path, CONFIG.run_name, args.debug + c.output_path, c.run_name, args.debug ) AUDIO_PATH = os.path.join(OUT_PATH, "test_audios") @@ -483,7 +491,7 @@ if __name__ == "__main__": new_fields["restore_path"] = args.restore_path new_fields["github_branch"] = get_git_branch() copy_config_file( - args.config_path, os.path.join(OUT_PATH, "config.json"), new_fields + args.config_path, os.path.join(OUT_PATH, "c.json"), new_fields ) os.chmod(AUDIO_PATH, 0o775) os.chmod(OUT_PATH, 0o775) @@ -492,8 +500,7 @@ if __name__ == "__main__": tb_logger = TensorboardLogger(LOG_DIR, model_name="VOCODER") # write model desc to tensorboard - tb_logger.tb_add_text("model-description", - CONFIG["run_description"], 0) + tb_logger.tb_add_text("model-description", c["run_description"], 0) try: main(args) diff --git a/TTS/vocoder/configs/wavernn_config.json b/TTS/vocoder/configs/wavernn_config.json index 8e6a8c32..9a9fbdae 100644 --- a/TTS/vocoder/configs/wavernn_config.json +++ b/TTS/vocoder/configs/wavernn_config.json @@ -1,94 +1,97 @@ { "run_name": "wavernn_test", "run_description": "wavernn_test training", - - // AUDIO PARAMETERS - "audio":{ - "fft_size": 1024, // number of stft frequency levels. Size of the linear spectogram frame. - "win_length": 1024, // stft window length in ms. - "hop_length": 256, // stft window hop-lengh in ms. + +// AUDIO PARAMETERS + "audio": { + "fft_size": 1024, // number of stft frequency levels. Size of the linear spectogram frame. + "win_length": 1024, // stft window length in ms. + "hop_length": 256, // stft window hop-lengh in ms. "frame_length_ms": null, // stft window length in ms.If null, 'win_length' is used. - "frame_shift_ms": null, // stft window hop-lengh in ms. If null, 'hop_length' is used. - + "frame_shift_ms": null, // stft window hop-lengh in ms. If null, 'hop_length' is used. // Audio processing parameters - "sample_rate": 22050, // DATASET-RELATED: wav sample-rate. If different than the original data, it is resampled. - "preemphasis": 0.98, // pre-emphasis to reduce spec noise and make it more structured. If 0.0, no -pre-emphasis. - "ref_level_db": 20, // reference level db, theoretically 20db is the sound of air. 
- + "sample_rate": 22050, // DATASET-RELATED: wav sample-rate. If different than the original data, it is resampled. + "preemphasis": 0.98, // pre-emphasis to reduce spec noise and make it more structured. If 0.0, no -pre-emphasis. + "ref_level_db": 20, // reference level db, theoretically 20db is the sound of air. // Silence trimming - "do_trim_silence": false,// enable trimming of slience of audio as you load it. LJspeech (false), TWEB (false), Nancy (true) - "trim_db": 60, // threshold for timming silence. Set this according to your dataset. - + "do_trim_silence": false, // enable trimming of slience of audio as you load it. LJspeech (false), TWEB (false), Nancy (true) + "trim_db": 60, // threshold for timming silence. Set this according to your dataset. // MelSpectrogram parameters - "num_mels": 80, // size of the mel spec frame. - "mel_fmin": 40.0, // minimum freq level for mel-spec. ~50 for male and ~95 for female voices. Tune for dataset!! - "mel_fmax": 8000.0, // maximum freq level for mel-spec. Tune for dataset!! - "spec_gain": 20.0, // scaler value appplied after log transform of spectrogram. - + "num_mels": 80, // size of the mel spec frame. + "mel_fmin": 40.0, // minimum freq level for mel-spec. ~50 for male and ~95 for female voices. Tune for dataset!! + "mel_fmax": 8000.0, // maximum freq level for mel-spec. Tune for dataset!! + "spec_gain": 20.0, // scaler value appplied after log transform of spectrogram. // Normalization parameters - "signal_norm": true, // normalize spec values. Mean-Var normalization if 'stats_path' is defined otherwise range normalization defined by the other params. - "min_level_db": -100, // lower bound for normalization + "signal_norm": true, // normalize spec values. Mean-Var normalization if 'stats_path' is defined otherwise range normalization defined by the other params. + "min_level_db": -100, // lower bound for normalization "symmetric_norm": true, // move normalization to range [-1, 1] - "max_norm": 4.0, // scale normalization to range [-max_norm, max_norm] or [0, max_norm] - "clip_norm": true, // clip normalized values into the range. - "stats_path": null // DO NOT USE WITH MULTI_SPEAKER MODEL. scaler stats file computed by 'compute_statistics.py'. If it is defined, mean-std based notmalization is used and other normalization params are ignored + "max_norm": 4.0, // scale normalization to range [-max_norm, max_norm] or [0, max_norm] + "clip_norm": true, // clip normalized values into the range. + "stats_path": null // DO NOT USE WITH MULTI_SPEAKER MODEL. scaler stats file computed by 'compute_statistics.py'. 
If it is defined, mean-std based notmalization is used and other normalization params are ignored }, - - // Generating / Synthesizing - "batched": true, - "target_samples": 11000, // target number of samples to be generated in each batch entry - "overlap_samples": 550, // number of samples for crossfading between batches - + +// Generating / Synthesizing + "batched": true, + "target_samples": 11000, // target number of samples to be generated in each batch entry + "overlap_samples": 550, // number of samples for crossfading between batches // DISTRIBUTED TRAINING // "distributed":{ // "backend": "nccl", // "url": "tcp:\/\/localhost:54321" // }, - - // MODEL PARAMETERS - "use_aux_net": true, - "use_upsample_net": true, - "upsample_factors": [4, 8, 8], // this needs to correctly factorise hop_length - "seq_len": 1280, // has to be devideable by hop_length - "mode": "mold", // mold [string], gauss [string], bits [int] - "mulaw": false, // apply mulaw if mode is bits - "padding": 2, // pad the input for resnet to see wider input length - // DATASET - //"use_gta": true, // use computed gta features from the tts model - "data_path": "path/to/wav/files", // path containing training wav files - "feature_path": null, // path containing computed features from wav files if null compute them +// MODEL MODE + "mode": 10, // mold [string], gauss [string], bits [int] + "mulaw": true, // apply mulaw if mode is bits + +// MODEL PARAMETERS + "wavernn_model_params": { + "rnn_dims": 512, + "fc_dims": 512, + "compute_dims": 128, + "res_out_dims": 128, + "num_res_blocks": 10, + "use_aux_net": true, + "use_upsample_net": true, + "upsample_factors": [4, 8, 8] // this needs to correctly factorise hop_length + }, + +// DATASET + //"use_gta": true, // use computed gta features from the tts model + "data_path": "/media/alexander/LinuxFS/SpeechData/GothicSpeech/NPC_Speech", // path containing training wav files + "feature_path": null, // path containing computed features from wav files if null compute them + "seq_len": 1280, // has to be devideable by hop_length + "padding": 2, // pad the input for resnet to see wider input length + +// TRAINING + "batch_size": 64, // Batch size for training. + "epochs": 10000, // total number of epochs to train. - // TRAINING - "batch_size": 64, // Batch size for training. Lower values than 32 might cause hard to learn attention. - "epochs": 10000, // total number of epochs to train. - - // VALIDATION +// VALIDATION "run_eval": true, - "test_every_epochs": 10, // Test after set number of epochs (Test every 20 epochs for example) - - // OPTIMIZER - "grad_clip": 4, // apply gradient clipping if > 0 - "lr_scheduler": "MultiStepLR", // one of the schedulers from https://pytorch.org/docs/stable/optim.html#how-to-adjust-learning-rate + "test_every_epochs": 10, // Test after set number of epochs (Test every 10 epochs for example) + +// OPTIMIZER + "grad_clip": 4, // apply gradient clipping if > 0 + "lr_scheduler": "MultiStepLR", // one of the schedulers from https://pytorch.org/docs/stable/optim.html#how-to-adjust-learning-rate "lr_scheduler_params": { "gamma": 0.5, "milestones": [200000, 400000, 600000] }, - "lr": 1e-4, // initial learning rate - - // TENSORBOARD and LOGGING - "print_step": 25, // Number of steps to log traning on console. - "print_eval": false, // If True, it prints loss values for each step in eval run. - "save_step": 25000, // Number of training steps expected to plot training stats on TB and save model checkpoints. 
-    "checkpoint": true,      // If true, it saves checkpoints per "save_step"
-    "tb_model_param_stats": false,     // true, plots param stats per layer on tensorboard. Might be memory consuming, but good for debugging.
-
-    // DATA LOADING
-    "num_loader_workers": 4,        // number of training data loader processes. Don't set it too big. 4-8 are good values.
-    "num_val_loader_workers": 4,    // number of evaluation data loader processes.
-    "eval_split_size": 50,     // number of samples for testing
-
-    // PATHS
+    "lr": 1e-4, // initial learning rate
+
+// TENSORBOARD and LOGGING
+    "print_step": 25, // Number of steps to log training on console.
+    "print_eval": false, // If True, it prints loss values for each step in eval run.
+    "save_step": 25000, // Number of training steps expected to plot training stats on TB and save model checkpoints.
+    "checkpoint": true, // If true, it saves checkpoints per "save_step"
+    "tb_model_param_stats": false, // true, plots param stats per layer on tensorboard. Might be memory consuming, but good for debugging.
+
+// DATA LOADING
+    "num_loader_workers": 4, // number of training data loader processes. Don't set it too big. 4-8 are good values.
+    "num_val_loader_workers": 4, // number of evaluation data loader processes.
+    "eval_split_size": 50, // number of samples for testing
+
+// PATHS
     "output_path": "output/training/path"
 }
-
diff --git a/TTS/vocoder/datasets/wavernn_dataset.py b/TTS/vocoder/datasets/wavernn_dataset.py
index 194344a9..3dbb2194 100644
--- a/TTS/vocoder/datasets/wavernn_dataset.py
+++ b/TTS/vocoder/datasets/wavernn_dataset.py
@@ -1,11 +1,13 @@
 import torch
 import numpy as np
 from torch.utils.data import Dataset
+from multiprocessing import Manager


 class WaveRNNDataset(Dataset):
     """
-    WaveRNN Dataset searchs for all the wav files under root path.
+    WaveRNN Dataset searches for all the wav files under the root path
+    and converts them to acoustic features on the fly.
     """

     def __init__(self,
@@ -15,16 +17,19 @@ class WaveRNNDataset(Dataset):
                  hop_len,
                  pad,
                  mode,
+                 mulaw,
                  is_training=True,
                  verbose=False,
                  ):

         self.ap = ap
+        self.compute_feat = not isinstance(items[0], (tuple, list))
         self.item_list = items
         self.seq_len = seq_len
         self.hop_len = hop_len
         self.pad = pad
         self.mode = mode
+        self.mulaw = mulaw
         self.is_training = is_training
         self.verbose = verbose

@@ -36,22 +41,47 @@ class WaveRNNDataset(Dataset):
         return item

     def load_item(self, index):
-        wavpath, feat_path = self.item_list[index]
-        m = np.load(feat_path.replace("/quant/", "/mel/"))
-        # x = self.wav_cache[index]
-        if m.shape[-1] < 5:
-            print(" [!] Instance is too short! : {}".format(wavpath))
-            self.item_list[index] = self.item_list[index + 1]
-            feat_path = self.item_list[index]
-            m = np.load(feat_path.replace("/quant/", "/mel/"))
-        if self.mode in ["gauss", "mold"]:
-            # x = np.load(feat_path.replace("/mel/", "/quant/"))
-            x = self.ap.load_wav(wavpath)
-        elif isinstance(self.mode, int):
-            x = np.load(feat_path.replace("/mel/", "/quant/"))
+        """
+        Load a (mel, audio) pair from precomputed features if feature_path is set,
+        otherwise compute both on the fly.
+        """
+        if self.compute_feat:
+
+            wavpath = self.item_list[index]
+            audio = self.ap.load_wav(wavpath)
+            mel = self.ap.melspectrogram(audio)
+
+            if mel.shape[-1] < 5:
+                print(" [!] Instance is too short! : {}".format(wavpath))
+                self.item_list[index] = self.item_list[index + 1]
+                audio = self.ap.load_wav(wavpath)
+                mel = self.ap.melspectrogram(audio)
+            if self.mode in ["gauss", "mold"]:
+                x_input = audio
+            elif isinstance(self.mode, int):
+                x_input = (self.ap.mulaw_encode(audio, qc=self.mode)
+                           if self.mulaw else self.ap.quantize(audio, bits=self.mode))
+            else:
+                raise RuntimeError("Unknown dataset mode - ", self.mode)
+
         else:
-            raise RuntimeError("Unknown dataset mode - ", self.mode)
-        return m, x
+
+            wavpath, feat_path = self.item_list[index]
+            mel = np.load(feat_path.replace("/quant/", "/mel/"))
+
+            if mel.shape[-1] < 5:
+                print(" [!] Instance is too short! : {}".format(wavpath))
+                self.item_list[index] = self.item_list[index + 1]
+                feat_path = self.item_list[index]
+                mel = np.load(feat_path.replace("/quant/", "/mel/"))
+            if self.mode in ["gauss", "mold"]:
+                x_input = self.ap.load_wav(wavpath)
+            elif isinstance(self.mode, int):
+                x_input = np.load(feat_path.replace("/mel/", "/quant/"))
+            else:
+                raise RuntimeError("Unknown dataset mode - ", self.mode)
+
+        return mel, x_input

     def collate(self, batch):
         mel_win = self.seq_len // self.hop_len + 2 * self.pad
@@ -79,10 +109,8 @@ class WaveRNNDataset(Dataset):
         elif isinstance(self.mode, int):
             coarse = np.stack(coarse).astype(np.int64)
             coarse = torch.LongTensor(coarse)
-            x_input = (
-                2 * coarse[:, : self.seq_len].float() /
-                (2 ** self.mode - 1.0) - 1.0
-            )
+            x_input = (2 * coarse[:, : self.seq_len].float() /
+                       (2 ** self.mode - 1.0) - 1.0)
             y_coarse = coarse[:, 1:]
         mels = torch.FloatTensor(mels)
         return x_input, mels, y_coarse
diff --git a/TTS/vocoder/models/wavernn.py b/TTS/vocoder/models/wavernn.py
index 8a45d9e3..f771175c 100644
--- a/TTS/vocoder/models/wavernn.py
+++ b/TTS/vocoder/models/wavernn.py
@@ -36,14 +36,14 @@ class ResBlock(nn.Module):


 class MelResNet(nn.Module):
-    def __init__(self, res_blocks, in_dims, compute_dims, res_out_dims, pad):
+    def __init__(self, num_res_blocks, in_dims, compute_dims, res_out_dims, pad):
         super().__init__()
         k_size = pad * 2 + 1
         self.conv_in = nn.Conv1d(
             in_dims, compute_dims, kernel_size=k_size, bias=False)
         self.batch_norm = nn.BatchNorm1d(compute_dims)
         self.layers = nn.ModuleList()
-        for _ in range(res_blocks):
+        for _ in range(num_res_blocks):
             self.layers.append(ResBlock(compute_dims))
         self.conv_out = nn.Conv1d(compute_dims, res_out_dims, kernel_size=1)

@@ -76,7 +76,7 @@ class UpsampleNetwork(nn.Module):
         feat_dims,
         upsample_scales,
         compute_dims,
-        res_blocks,
+        num_res_blocks,
         res_out_dims,
         pad,
         use_aux_net,
@@ -87,7 +87,7 @@ class UpsampleNetwork(nn.Module):
         self.use_aux_net = use_aux_net
         if use_aux_net:
             self.resnet = MelResNet(
-                res_blocks, feat_dims, compute_dims, res_out_dims, pad
+                num_res_blocks, feat_dims, compute_dims, res_out_dims, pad
             )
             self.resnet_stretch = Stretch2d(self.total_scale, 1)
         self.up_layers = nn.ModuleList()
@@ -118,14 +118,14 @@ class UpsampleNetwork(nn.Module):

 class Upsample(nn.Module):
     def __init__(
-        self, scale, pad, res_blocks, feat_dims, compute_dims, res_out_dims, use_aux_net
+        self, scale, pad, num_res_blocks, feat_dims, compute_dims, res_out_dims, use_aux_net
     ):
         super().__init__()
         self.scale = scale
         self.pad = pad
         self.indent = pad * scale
         self.use_aux_net = use_aux_net
-        self.resnet = MelResNet(res_blocks, feat_dims,
+        self.resnet = MelResNet(num_res_blocks, feat_dims,
                                 compute_dims, res_out_dims, pad)

     def forward(self, m):
@@ -147,23 +147,22 @@ class Upsample(nn.Module):

 class WaveRNN(nn.Module):
-    def __init__(
-        self,
-        rnn_dims,
-        fc_dims,
-        mode,
-        mulaw,
-        pad,
- use_aux_net, - use_upsample_net, - upsample_factors, - feat_dims, - compute_dims, - res_out_dims, - res_blocks, - hop_length, - sample_rate, - ): + def __init__(self, + rnn_dims, + fc_dims, + mode, + mulaw, + pad, + use_aux_net, + use_upsample_net, + upsample_factors, + feat_dims, + compute_dims, + res_out_dims, + num_res_blocks, + hop_length, + sample_rate, + ): super().__init__() self.mode = mode self.mulaw = mulaw @@ -177,7 +176,7 @@ class WaveRNN(nn.Module): elif self.mode == "gauss": self.n_classes = 2 else: - raise RuntimeError(" > Unknown training mode") + raise RuntimeError("Unknown model mode value - ", self.mode) self.rnn_dims = rnn_dims self.aux_dims = res_out_dims // 4 @@ -192,7 +191,7 @@ class WaveRNN(nn.Module): feat_dims, upsample_factors, compute_dims, - res_blocks, + num_res_blocks, res_out_dims, pad, use_aux_net, @@ -201,7 +200,7 @@ class WaveRNN(nn.Module): self.upsample = Upsample( hop_length, pad, - res_blocks, + num_res_blocks, feat_dims, compute_dims, res_out_dims, @@ -260,7 +259,7 @@ class WaveRNN(nn.Module): x = F.relu(self.fc2(x)) return self.fc3(x) - def generate(self, mels, batched, target, overlap, use_cuda): + def generate(self, mels, batched, target, overlap, use_cuda=False): self.eval() device = 'cuda' if use_cuda else 'cpu' @@ -360,7 +359,9 @@ class WaveRNN(nn.Module): # Fade-out at the end to avoid signal cutting out suddenly fade_out = np.linspace(1, 0, 20 * self.hop_length) output = output[:wave_len] - output[-20 * self.hop_length:] *= fade_out + + if wave_len > len(fade_out): + output[-20 * self.hop_length:] *= fade_out self.train() return output @@ -405,7 +406,8 @@ class WaveRNN(nn.Module): padding = target + 2 * overlap - remaining x = self.pad_tensor(x, padding, side="after") - folded = torch.zeros(num_folds, target + 2 * overlap, features).to(x.device) + folded = torch.zeros(num_folds, target + 2 * + overlap, features).to(x.device) # Get the values for the folded tensor for i in range(num_folds):
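Note on the new "mode" / "mulaw" options: with the config above ("mode": 10, "mulaw": true), WaveRNNDataset.load_item() now builds the training target directly from the waveform, using ap.mulaw_encode(audio, qc=self.mode) when "mulaw" is true and ap.quantize(audio, bits=self.mode) otherwise, and collate() rescales the integer classes back to [-1, 1] for the model input. The snippet below is a self-contained numpy sketch of that mu-law / uniform quantization round trip; it only illustrates the math, it is not the AudioProcessor implementation, and the helper names are made up for this example.

    # Illustrative sketch only (not part of the patch): 10-bit targets from a [-1, 1] waveform.
    import numpy as np

    def mulaw_encode(wav, bits=10):
        # Mu-law compand, then map [-1, 1] to integer classes in [0, 2**bits - 1].
        mu = 2 ** bits - 1
        companded = np.sign(wav) * np.log1p(mu * np.abs(wav)) / np.log1p(mu)
        return np.floor((companded + 1) / 2 * mu + 0.5).astype(np.int64)

    def mulaw_decode(labels, bits=10):
        # Invert mulaw_encode back to a float waveform in [-1, 1].
        mu = 2 ** bits - 1
        companded = 2 * labels.astype(np.float64) / mu - 1
        return np.sign(companded) * ((1 + mu) ** np.abs(companded) - 1) / mu

    def uniform_quantize(wav, bits=10):
        # Plain uniform quantization, the branch taken when "mulaw" is false.
        return np.floor((wav + 1.0) * (2 ** bits - 1) / 2 + 0.5).astype(np.int64)

    wav = 0.5 * np.sin(np.linspace(0, 8 * np.pi, 2205))   # toy waveform in [-1, 1]
    coarse = mulaw_encode(wav, bits=10)                   # integer classes in [0, 1023]
    x_input = 2 * coarse / (2 ** 10 - 1.0) - 1.0          # same rescaling collate() applies for the model input
    print(coarse.min(), coarse.max(), np.abs(mulaw_decode(coarse) - wav).max())

Mu-law companding spends more of the 1024 classes on low amplitudes, which is why a 10-bit mu-law target is usually preferred over uniform quantization at the same bit depth; the "mold" and "gauss" modes skip quantization entirely and train on the raw waveform.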