From ea9d8755defd9c1b86b4e3925b1133d8340bd238 Mon Sep 17 00:00:00 2001
From: sanjaesc
Date: Thu, 22 Oct 2020 10:39:20 +0200
Subject: [PATCH] add wavernn tests + name refactoring

---
 tests/inputs/test_vocoder_wavernn_config.json | 94 +++++++++++++++++++
 ...tasets.py => test_vocoder_gan_datasets.py} |  0
 ...der_train.sh => test_vocoder_gan_train.sh} |  4 +-
 tests/test_vocoder_wavernn.py                 | 31 ++++++
 tests/test_vocoder_wavernn_datasets.py        | 91 ++++++++++++++++++
 tests/test_vocoder_wavernn_train.sh           | 15 +++
 6 files changed, 233 insertions(+), 2 deletions(-)
 create mode 100644 tests/inputs/test_vocoder_wavernn_config.json
 rename tests/{test_vocoder_datasets.py => test_vocoder_gan_datasets.py} (100%)
 rename tests/{test_vocoder_train.sh => test_vocoder_gan_train.sh} (57%)
 create mode 100644 tests/test_vocoder_wavernn.py
 create mode 100644 tests/test_vocoder_wavernn_datasets.py
 create mode 100755 tests/test_vocoder_wavernn_train.sh

diff --git a/tests/inputs/test_vocoder_wavernn_config.json b/tests/inputs/test_vocoder_wavernn_config.json
new file mode 100644
index 00000000..28c0f059
--- /dev/null
+++ b/tests/inputs/test_vocoder_wavernn_config.json
@@ -0,0 +1,94 @@
+{
+    "run_name": "wavernn_test",
+    "run_description": "wavernn_test training",
+
+    // AUDIO PARAMETERS
+    "audio": {
+        "fft_size": 1024,         // number of stft frequency levels. Size of the linear spectrogram frame.
+        "win_length": 1024,       // stft window length in samples.
+        "hop_length": 256,        // stft window hop length in samples.
+        "frame_length_ms": null,  // stft window length in ms. If null, 'win_length' is used.
+        "frame_shift_ms": null,   // stft window hop length in ms. If null, 'hop_length' is used.
+
+        // Audio processing parameters
+        "sample_rate": 22050,  // DATASET-RELATED: wav sample rate. If different from the original data, it is resampled.
+        "preemphasis": 0.0,    // pre-emphasis to reduce spec noise and make it more structured. If 0.0, no pre-emphasis is applied.
+        "ref_level_db": 0,     // reference level dB; theoretically 20 dB is the sound of air.
+
+        // Silence trimming
+        "do_trim_silence": true,  // enable trimming of silence in audio as you load it. LJSpeech (false), TWEB (false), Nancy (true)
+        "trim_db": 60,            // threshold for trimming silence. Set this according to your dataset.
+
+        // MelSpectrogram parameters
+        "num_mels": 80,      // size of the mel spec frame.
+        "mel_fmin": 0.0,     // minimum freq level for mel-spec. ~50 for male and ~95 for female voices. Tune for your dataset!!
+        "mel_fmax": 8000.0,  // maximum freq level for mel-spec. Tune for your dataset!!
+        "spec_gain": 20.0,   // scaler value applied after log transform of the spectrogram.
+
+        // Normalization parameters
+        "signal_norm": true,     // normalize spec values. Mean-Var normalization if 'stats_path' is defined, otherwise range normalization defined by the other params.
+        "min_level_db": -100,    // lower bound for normalization
+        "symmetric_norm": true,  // move normalization to range [-1, 1]
+        "max_norm": 4.0,         // scale normalization to range [-max_norm, max_norm] or [0, max_norm]
+        "clip_norm": true,       // clip normalized values into the range.
+        "stats_path": null       // DO NOT USE WITH MULTI_SPEAKER MODEL. scaler stats file computed by 'compute_statistics.py'.
+                               // If it is defined, mean-std based normalization is used and the other normalization params are ignored.
+    },
+
+    // GENERATING / SYNTHESIZING
+    "batched": true,
+    "target_samples": 11000,  // target number of samples to be generated in each batch entry
+    "overlap_samples": 550,   // number of samples for crossfading between batches
+
+    // DISTRIBUTED TRAINING
+    // "distributed":{
+    //     "backend": "nccl",
+    //     "url": "tcp:\/\/localhost:54321"
+    // },
+
+    // MODEL PARAMETERS
+    "use_aux_net": true,
+    "use_upsample_net": true,
+    "upsample_factors": [4, 8, 8],  // the product of these factors must equal hop_length (4 * 8 * 8 = 256)
+    "seq_len": 1280,                // has to be divisible by hop_length
+    "mode": "mold",                 // "mold" (mixture of logistics), "gauss" (single Gaussian), or an int (number of output bits)
+    "mulaw": false,                 // apply mu-law companding if mode is bits
+    "padding": 2,                   // pad the input so the resnet sees a wider input length
+
+    // DATASET
+    //"use_gta": true,  // use ground-truth-aligned (GTA) features computed by the tts model
+    "data_path": "tests/data/ljspeech/wavs/",  // path containing training wav files
+    "feature_path": null,  // path containing features computed from the wav files; if null, they are computed on the fly
+
+    // TRAINING
+    "batch_size": 4,  // Batch size for training.
+    "epochs": 1,      // total number of epochs to train.
+
+    // VALIDATION
+    "run_eval": true,
+    "test_every_epochs": 10,  // run testing after this many epochs (e.g., with 10, tests run every 10 epochs)
+
+    // OPTIMIZER
+    "grad_clip": 4,                 // apply gradient clipping if > 0
+    "lr_scheduler": "MultiStepLR",  // one of the schedulers from https://pytorch.org/docs/stable/optim.html#how-to-adjust-learning-rate
+    "lr_scheduler_params": {
+        "gamma": 0.5,
+        "milestones": [200000, 400000, 600000]
+    },
+    "lr": 1e-4,  // initial learning rate
+
+    // TENSORBOARD and LOGGING
+    "print_step": 25,     // number of steps between logging training status on the console.
+    "print_eval": false,  // if true, it prints loss values for each step in the eval run.
+    "save_step": 25000,   // number of training steps between plotting training stats on TB and saving model checkpoints.
+    "checkpoint": true,   // if true, it saves checkpoints per "save_step".
+    "tb_model_param_stats": false,  // if true, plots param stats per layer on tensorboard. Might be memory consuming, but good for debugging.
+
+    // DATA LOADING
+    "num_loader_workers": 4,      // number of training data loader processes. Don't set it too big. 4-8 are good values.
+    "num_val_loader_workers": 4,  // number of evaluation data loader processes.
+ "eval_split_size": 10, // number of samples for testing + + // PATHS + "output_path": "tests/train_outputs/" +} + diff --git a/tests/test_vocoder_datasets.py b/tests/test_vocoder_gan_datasets.py similarity index 100% rename from tests/test_vocoder_datasets.py rename to tests/test_vocoder_gan_datasets.py diff --git a/tests/test_vocoder_train.sh b/tests/test_vocoder_gan_train.sh similarity index 57% rename from tests/test_vocoder_train.sh rename to tests/test_vocoder_gan_train.sh index fa99b4bd..75773cc3 100755 --- a/tests/test_vocoder_train.sh +++ b/tests/test_vocoder_gan_train.sh @@ -5,11 +5,11 @@ echo "$BASEDIR" # create run dir mkdir $BASEDIR/train_outputs # run training -CUDA_VISIBLE_DEVICES="" python TTS/bin/train_vocoder.py --config_path $BASEDIR/inputs/test_vocoder_multiband_melgan_config.json +CUDA_VISIBLE_DEVICES="" python TTS/bin/train_gan_vocoder.py --config_path $BASEDIR/inputs/test_vocoder_multiband_melgan_config.json # find the training folder LATEST_FOLDER=$(ls $BASEDIR/train_outputs/| sort | tail -1) echo $LATEST_FOLDER # continue the previous training -CUDA_VISIBLE_DEVICES="" python TTS/bin/train_vocoder.py --continue_path $BASEDIR/train_outputs/$LATEST_FOLDER +CUDA_VISIBLE_DEVICES="" python TTS/bin/train_gan_vocoder.py --continue_path $BASEDIR/train_outputs/$LATEST_FOLDER # remove all the outputs rm -rf $BASEDIR/train_outputs/$LATEST_FOLDER diff --git a/tests/test_vocoder_wavernn.py b/tests/test_vocoder_wavernn.py new file mode 100644 index 00000000..fdb338f9 --- /dev/null +++ b/tests/test_vocoder_wavernn.py @@ -0,0 +1,31 @@ +import numpy as np +import torch +import random +from TTS.vocoder.models.wavernn import WaveRNN + + +def test_wavernn(): + model = WaveRNN( + rnn_dims=512, + fc_dims=512, + mode=10, + mulaw=False, + pad=2, + use_aux_net=True, + use_upsample_net=True, + upsample_factors=[4, 8, 8], + feat_dims=80, + compute_dims=128, + res_out_dims=128, + res_blocks=10, + hop_length=256, + sample_rate=22050, + ) + dummy_x = torch.rand((2, 1280)) + dummy_m = torch.rand((2, 80, 9)) + y_size = random.randrange(20, 60) + dummy_y = torch.rand((80, y_size)) + output = model(dummy_x, dummy_m) + assert np.all(output.shape == (2, 1280, 4 * 256)), output.shape + output = model.generate(dummy_y, True, 5500, 550, False) + assert np.all(output.shape == (256 * (y_size - 1),)) diff --git a/tests/test_vocoder_wavernn_datasets.py b/tests/test_vocoder_wavernn_datasets.py new file mode 100644 index 00000000..0f4e939a --- /dev/null +++ b/tests/test_vocoder_wavernn_datasets.py @@ -0,0 +1,91 @@ +import os +import shutil + +import numpy as np +from tests import get_tests_path, get_tests_input_path, get_tests_output_path +from torch.utils.data import DataLoader + +from TTS.utils.audio import AudioProcessor +from TTS.utils.io import load_config +from TTS.vocoder.datasets.wavernn_dataset import WaveRNNDataset +from TTS.vocoder.datasets.preprocess import load_wav_feat_data, preprocess_wav_files + +file_path = os.path.dirname(os.path.realpath(__file__)) +OUTPATH = os.path.join(get_tests_output_path(), "loader_tests/") +os.makedirs(OUTPATH, exist_ok=True) + +C = load_config(os.path.join(get_tests_input_path(), + "test_vocoder_wavernn_config.json")) + +test_data_path = os.path.join(get_tests_path(), "data/ljspeech/") +test_mel_feat_path = os.path.join(test_data_path, "mel") +test_quant_feat_path = os.path.join(test_data_path, "quant") +ok_ljspeech = os.path.exists(test_data_path) + + +def wavernn_dataset_case(batch_size, seq_len, hop_len, pad, mode, num_workers): + """ run dataloader with given 
+    parameters and check the output shapes against the expected values."""
+    ap = AudioProcessor(**C.audio)
+
+    C.batch_size = batch_size
+    C.mode = mode
+    C.seq_len = seq_len
+    C.data_path = test_data_path
+
+    preprocess_wav_files(test_data_path, C, ap)
+    _, train_items = load_wav_feat_data(
+        test_data_path, test_mel_feat_path, 5)
+
+    dataset = WaveRNNDataset(ap=ap,
+                             items=train_items,
+                             seq_len=seq_len,
+                             hop_len=hop_len,
+                             pad=pad,
+                             mode=mode,
+                             )
+    # sampler = DistributedSampler(dataset) if num_gpus > 1 else None
+    loader = DataLoader(dataset,
+                        shuffle=True,
+                        collate_fn=dataset.collate,
+                        batch_size=batch_size,
+                        num_workers=num_workers,
+                        pin_memory=True,
+                        )
+
+    max_iter = 10
+    count_iter = 0
+
+    try:
+        for data in loader:
+            x_input, mels, _ = data
+            # each batch entry carries seq_len // hop_len mel frames plus pad frames on both sides
+            expected_feat_shape = (ap.num_mels,
+                                   (x_input.shape[-1] // hop_len) + (pad * 2))
+            assert np.all(
+                mels.shape[1:] == expected_feat_shape), f" [!] {mels.shape} vs {expected_feat_shape}"
+
+            assert (mels.shape[2] - pad * 2) * hop_len == x_input.shape[1]
+            count_iter += 1
+            if count_iter == max_iter:
+                break
+    finally:
+        # clean up the computed features whether the assertions pass or fail
+        shutil.rmtree(test_mel_feat_path)
+        shutil.rmtree(test_quant_feat_path)
+
+
+def test_parametrized_wavernn_dataset():
+    """Test the dataloader with different parameters."""
+    params = [
+        [16, C.audio['hop_length'] * 10, C.audio['hop_length'], 2, 10, 0],
+        [16, C.audio['hop_length'] * 10, C.audio['hop_length'], 2, "mold", 4],
+        [1, C.audio['hop_length'] * 10, C.audio['hop_length'], 2, 9, 0],
+        [1, C.audio['hop_length'], C.audio['hop_length'], 2, 10, 0],
+        [1, C.audio['hop_length'], C.audio['hop_length'], 2, "mold", 0],
+        [1, C.audio['hop_length'] * 5, C.audio['hop_length'], 4, 10, 2],
+        [1, C.audio['hop_length'] * 5, C.audio['hop_length'], 2, "mold", 0],
+    ]
+    for param in params:
+        print(param)
+        wavernn_dataset_case(*param)
diff --git a/tests/test_vocoder_wavernn_train.sh b/tests/test_vocoder_wavernn_train.sh
new file mode 100755
index 00000000..f2e32116
--- /dev/null
+++ b/tests/test_vocoder_wavernn_train.sh
@@ -0,0 +1,15 @@
+#!/usr/bin/env bash
+
+BASEDIR=$(dirname "$0")
+echo "$BASEDIR"
+# create run dir
+mkdir $BASEDIR/train_outputs
+# run training
+CUDA_VISIBLE_DEVICES="" python TTS/bin/train_wavernn_vocoder.py --config_path $BASEDIR/inputs/test_vocoder_wavernn_config.json
+# find the training folder
+LATEST_FOLDER=$(ls $BASEDIR/train_outputs/| sort | tail -1)
+echo $LATEST_FOLDER
+# continue the previous training
+CUDA_VISIBLE_DEVICES="" python TTS/bin/train_wavernn_vocoder.py --continue_path $BASEDIR/train_outputs/$LATEST_FOLDER
+# remove all the outputs
+rm -rf $BASEDIR/train_outputs/$LATEST_FOLDER
\ No newline at end of file
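
--
Note for reviewers: every shape assertion in these tests follows from the same
arithmetic relating seq_len, hop_length, "padding", and upsample_factors. Below
is a minimal, dependency-free sketch of that arithmetic, assuming the values
from test_vocoder_wavernn_config.json; the variable names are illustrative only
and do not exist in the TTS codebase.

    from math import prod

    # values taken from tests/inputs/test_vocoder_wavernn_config.json
    hop_length = 256
    seq_len = 1280                # "seq_len": has to be divisible by hop_length
    pad = 2                       # "padding": resnet context frames on each side
    upsample_factors = [4, 8, 8]  # their product must equal hop_length
    bits = 10                     # "mode": 10 -> 10-bit discretized output

    assert seq_len % hop_length == 0
    assert prod(upsample_factors) == hop_length

    # mel frames per batch entry, matching dummy_m = torch.rand((2, 80, 9)) in
    # test_vocoder_wavernn.py and expected_feat_shape in the dataset test
    frames = seq_len // hop_length + 2 * pad
    assert frames == 9

    # forward() output classes in bit mode, matching the (2, 1280, 4 * 256) assert
    assert 2 ** bits == 4 * 256

    # generate() output length for a y_size-frame spectrogram, matching the
    # (256 * (y_size - 1),) assert in test_vocoder_wavernn.py
    y_size = 29  # stand-in for random.randrange(20, 60)
    print(frames, hop_length * (y_size - 1))  # -> 9 7168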