From ea9d8755defd9c1b86b4e3925b1133d8340bd238 Mon Sep 17 00:00:00 2001
From: sanjaesc
Date: Thu, 22 Oct 2020 10:39:20 +0200
Subject: [PATCH] add wavernn tests + name refactoring

---
 tests/inputs/test_vocoder_wavernn_config.json | 94 +++++++++++++++++++
 ...tasets.py => test_vocoder_gan_datasets.py} |  0
 ...der_train.sh => test_vocoder_gan_train.sh} |  4 +-
 tests/test_vocoder_wavernn.py                 | 31 ++++++
 tests/test_vocoder_wavernn_datasets.py        | 91 ++++++++++++++++++
 tests/test_vocoder_wavernn_train.sh           | 15 +++
 6 files changed, 233 insertions(+), 2 deletions(-)
 create mode 100644 tests/inputs/test_vocoder_wavernn_config.json
 rename tests/{test_vocoder_datasets.py => test_vocoder_gan_datasets.py} (100%)
 rename tests/{test_vocoder_train.sh => test_vocoder_gan_train.sh} (57%)
 create mode 100644 tests/test_vocoder_wavernn.py
 create mode 100644 tests/test_vocoder_wavernn_datasets.py
 create mode 100755 tests/test_vocoder_wavernn_train.sh

diff --git a/tests/inputs/test_vocoder_wavernn_config.json b/tests/inputs/test_vocoder_wavernn_config.json
new file mode 100644
index 00000000..28c0f059
--- /dev/null
+++ b/tests/inputs/test_vocoder_wavernn_config.json
@@ -0,0 +1,94 @@
+{
+    "run_name": "wavernn_test",
+    "run_description": "wavernn_test training",
+
+    // AUDIO PARAMETERS
+    "audio": {
+        "fft_size": 1024,         // number of stft frequency levels. Size of the linear spectrogram frame.
+        "win_length": 1024,       // stft window length in samples.
+        "hop_length": 256,        // stft window hop length in samples.
+        "frame_length_ms": null,  // stft window length in ms. If null, 'win_length' is used.
+        "frame_shift_ms": null,   // stft window hop length in ms. If null, 'hop_length' is used.
+
+        // Audio processing parameters
+        "sample_rate": 22050,  // DATASET-RELATED: wav sample rate. If different from the original data, it is resampled.
+        "preemphasis": 0.0,    // pre-emphasis to reduce spec noise and make it more structured. If 0.0, no pre-emphasis is applied.
+        "ref_level_db": 0,     // reference level dB; theoretically 20 dB is the sound of air.
+
+        // Silence trimming
+        "do_trim_silence": true,  // enable trimming of silence in audio as you load it. LJSpeech (false), TWEB (false), Nancy (true)
+        "trim_db": 60,            // threshold for trimming silence. Set this according to your dataset.
+
+        // MelSpectrogram parameters
+        "num_mels": 80,      // size of the mel spec frame.
+        "mel_fmin": 0.0,     // minimum freq level for mel-spec. ~50 for male and ~95 for female voices. Tune for your dataset!!
+        "mel_fmax": 8000.0,  // maximum freq level for mel-spec. Tune for your dataset!!
+        "spec_gain": 20.0,   // scaler value applied after log transform of the spectrogram.
+
+        // Normalization parameters
+        "signal_norm": true,     // normalize spec values. Mean-Var normalization if 'stats_path' is defined, otherwise range normalization defined by the other params.
+        "min_level_db": -100,    // lower bound for normalization
+        "symmetric_norm": true,  // move normalization to range [-1, 1]
+        "max_norm": 4.0,         // scale normalization to range [-max_norm, max_norm] or [0, max_norm]
+        "clip_norm": true,       // clip normalized values into the range.
+        "stats_path": null       // DO NOT USE WITH MULTI_SPEAKER MODEL. scaler stats file computed by 'compute_statistics.py'.
+                               // If it is defined, mean-std based normalization is used and the other normalization params are ignored.
+    },
+
+    // GENERATING / SYNTHESIZING
+    "batched": true,
+    "target_samples": 11000,  // target number of samples to be generated in each batch entry
+    "overlap_samples": 550,   // number of samples for crossfading between batches
+
+    // DISTRIBUTED TRAINING
+    // "distributed":{
+    //     "backend": "nccl",
+    //     "url": "tcp:\/\/localhost:54321"
+    // },
+
+    // MODEL PARAMETERS
+    "use_aux_net": true,
+    "use_upsample_net": true,
+    "upsample_factors": [4, 8, 8],  // the product of these factors must equal hop_length (4 * 8 * 8 = 256)
+    "seq_len": 1280,                // has to be divisible by hop_length
+    "mode": "mold",                 // "mold" (mixture of logistics), "gauss" (single Gaussian), or an int (number of output bits)
+    "mulaw": false,                 // apply mu-law companding if mode is bits
+    "padding": 2,                   // pad the input so the resnet sees a wider input length
+
+    // DATASET
+    //"use_gta": true,  // use ground-truth-aligned (GTA) features computed by the tts model
+    "data_path": "tests/data/ljspeech/wavs/",  // path containing training wav files
+    "feature_path": null,  // path containing features computed from the wav files; if null, they are computed on the fly
+
+    // TRAINING
+    "batch_size": 4,  // Batch size for training.
+    "epochs": 1,      // total number of epochs to train.
+
+    // VALIDATION
+    "run_eval": true,
+    "test_every_epochs": 10,  // run testing after this many epochs (e.g., with 10, tests run every 10 epochs)
+
+    // OPTIMIZER
+    "grad_clip": 4,                 // apply gradient clipping if > 0
+    "lr_scheduler": "MultiStepLR",  // one of the schedulers from https://pytorch.org/docs/stable/optim.html#how-to-adjust-learning-rate
+    "lr_scheduler_params": {
+        "gamma": 0.5,
+        "milestones": [200000, 400000, 600000]
+    },
+    "lr": 1e-4,  // initial learning rate
+
+    // TENSORBOARD and LOGGING
+    "print_step": 25,     // number of steps between logging training status on the console.
+    "print_eval": false,  // if true, it prints loss values for each step in the eval run.
+    "save_step": 25000,   // number of training steps between plotting training stats on TB and saving model checkpoints.
+    "checkpoint": true,   // if true, it saves checkpoints per "save_step".
+    "tb_model_param_stats": false,  // if true, plots param stats per layer on tensorboard. Might be memory consuming, but good for debugging.
+
+    // DATA LOADING
+    "num_loader_workers": 4,      // number of training data loader processes. Don't set it too big. 4-8 are good values.
+    "num_val_loader_workers": 4,  // number of evaluation data loader processes.
+ "eval_split_size": 10, // number of samples for testing + + // PATHS + "output_path": "tests/train_outputs/" +} + diff --git a/tests/test_vocoder_datasets.py b/tests/test_vocoder_gan_datasets.py similarity index 100% rename from tests/test_vocoder_datasets.py rename to tests/test_vocoder_gan_datasets.py diff --git a/tests/test_vocoder_train.sh b/tests/test_vocoder_gan_train.sh similarity index 57% rename from tests/test_vocoder_train.sh rename to tests/test_vocoder_gan_train.sh index fa99b4bd..75773cc3 100755 --- a/tests/test_vocoder_train.sh +++ b/tests/test_vocoder_gan_train.sh @@ -5,11 +5,11 @@ echo "$BASEDIR" # create run dir mkdir $BASEDIR/train_outputs # run training -CUDA_VISIBLE_DEVICES="" python TTS/bin/train_vocoder.py --config_path $BASEDIR/inputs/test_vocoder_multiband_melgan_config.json +CUDA_VISIBLE_DEVICES="" python TTS/bin/train_gan_vocoder.py --config_path $BASEDIR/inputs/test_vocoder_multiband_melgan_config.json # find the training folder LATEST_FOLDER=$(ls $BASEDIR/train_outputs/| sort | tail -1) echo $LATEST_FOLDER # continue the previous training -CUDA_VISIBLE_DEVICES="" python TTS/bin/train_vocoder.py --continue_path $BASEDIR/train_outputs/$LATEST_FOLDER +CUDA_VISIBLE_DEVICES="" python TTS/bin/train_gan_vocoder.py --continue_path $BASEDIR/train_outputs/$LATEST_FOLDER # remove all the outputs rm -rf $BASEDIR/train_outputs/$LATEST_FOLDER diff --git a/tests/test_vocoder_wavernn.py b/tests/test_vocoder_wavernn.py new file mode 100644 index 00000000..fdb338f9 --- /dev/null +++ b/tests/test_vocoder_wavernn.py @@ -0,0 +1,31 @@ +import numpy as np +import torch +import random +from TTS.vocoder.models.wavernn import WaveRNN + + +def test_wavernn(): + model = WaveRNN( + rnn_dims=512, + fc_dims=512, + mode=10, + mulaw=False, + pad=2, + use_aux_net=True, + use_upsample_net=True, + upsample_factors=[4, 8, 8], + feat_dims=80, + compute_dims=128, + res_out_dims=128, + res_blocks=10, + hop_length=256, + sample_rate=22050, + ) + dummy_x = torch.rand((2, 1280)) + dummy_m = torch.rand((2, 80, 9)) + y_size = random.randrange(20, 60) + dummy_y = torch.rand((80, y_size)) + output = model(dummy_x, dummy_m) + assert np.all(output.shape == (2, 1280, 4 * 256)), output.shape + output = model.generate(dummy_y, True, 5500, 550, False) + assert np.all(output.shape == (256 * (y_size - 1),)) diff --git a/tests/test_vocoder_wavernn_datasets.py b/tests/test_vocoder_wavernn_datasets.py new file mode 100644 index 00000000..0f4e939a --- /dev/null +++ b/tests/test_vocoder_wavernn_datasets.py @@ -0,0 +1,91 @@ +import os +import shutil + +import numpy as np +from tests import get_tests_path, get_tests_input_path, get_tests_output_path +from torch.utils.data import DataLoader + +from TTS.utils.audio import AudioProcessor +from TTS.utils.io import load_config +from TTS.vocoder.datasets.wavernn_dataset import WaveRNNDataset +from TTS.vocoder.datasets.preprocess import load_wav_feat_data, preprocess_wav_files + +file_path = os.path.dirname(os.path.realpath(__file__)) +OUTPATH = os.path.join(get_tests_output_path(), "loader_tests/") +os.makedirs(OUTPATH, exist_ok=True) + +C = load_config(os.path.join(get_tests_input_path(), + "test_vocoder_wavernn_config.json")) + +test_data_path = os.path.join(get_tests_path(), "data/ljspeech/") +test_mel_feat_path = os.path.join(test_data_path, "mel") +test_quant_feat_path = os.path.join(test_data_path, "quant") +ok_ljspeech = os.path.exists(test_data_path) + + +def wavernn_dataset_case(batch_size, seq_len, hop_len, pad, mode, num_workers): + """ run dataloader with given 
+    parameters and check the output shapes against the expected values."""
+    ap = AudioProcessor(**C.audio)
+
+    C.batch_size = batch_size
+    C.mode = mode
+    C.seq_len = seq_len
+    C.data_path = test_data_path
+
+    preprocess_wav_files(test_data_path, C, ap)
+    _, train_items = load_wav_feat_data(
+        test_data_path, test_mel_feat_path, 5)
+
+    dataset = WaveRNNDataset(ap=ap,
+                             items=train_items,
+                             seq_len=seq_len,
+                             hop_len=hop_len,
+                             pad=pad,
+                             mode=mode,
+                             )
+    # sampler = DistributedSampler(dataset) if num_gpus > 1 else None
+    loader = DataLoader(dataset,
+                        shuffle=True,
+                        collate_fn=dataset.collate,
+                        batch_size=batch_size,
+                        num_workers=num_workers,
+                        pin_memory=True,
+                        )
+
+    max_iter = 10
+    count_iter = 0
+
+    try:
+        for data in loader:
+            x_input, mels, _ = data
+            # each batch entry carries seq_len // hop_len mel frames plus pad frames on both sides
+            expected_feat_shape = (ap.num_mels,
+                                   (x_input.shape[-1] // hop_len) + (pad * 2))
+            assert np.all(
+                mels.shape[1:] == expected_feat_shape), f" [!] {mels.shape} vs {expected_feat_shape}"
+
+            assert (mels.shape[2] - pad * 2) * hop_len == x_input.shape[1]
+            count_iter += 1
+            if count_iter == max_iter:
+                break
+    finally:
+        # clean up the computed features whether the assertions pass or fail
+        shutil.rmtree(test_mel_feat_path)
+        shutil.rmtree(test_quant_feat_path)
+
+
+def test_parametrized_wavernn_dataset():
+    """Test the dataloader with different parameters."""
+    params = [
+        [16, C.audio['hop_length'] * 10, C.audio['hop_length'], 2, 10, 0],
+        [16, C.audio['hop_length'] * 10, C.audio['hop_length'], 2, "mold", 4],
+        [1, C.audio['hop_length'] * 10, C.audio['hop_length'], 2, 9, 0],
+        [1, C.audio['hop_length'], C.audio['hop_length'], 2, 10, 0],
+        [1, C.audio['hop_length'], C.audio['hop_length'], 2, "mold", 0],
+        [1, C.audio['hop_length'] * 5, C.audio['hop_length'], 4, 10, 2],
+        [1, C.audio['hop_length'] * 5, C.audio['hop_length'], 2, "mold", 0],
+    ]
+    for param in params:
+        print(param)
+        wavernn_dataset_case(*param)
diff --git a/tests/test_vocoder_wavernn_train.sh b/tests/test_vocoder_wavernn_train.sh
new file mode 100755
index 00000000..f2e32116
--- /dev/null
+++ b/tests/test_vocoder_wavernn_train.sh
@@ -0,0 +1,15 @@
+#!/usr/bin/env bash
+
+BASEDIR=$(dirname "$0")
+echo "$BASEDIR"
+# create run dir
+mkdir $BASEDIR/train_outputs
+# run training
+CUDA_VISIBLE_DEVICES="" python TTS/bin/train_wavernn_vocoder.py --config_path $BASEDIR/inputs/test_vocoder_wavernn_config.json
+# find the training folder
+LATEST_FOLDER=$(ls $BASEDIR/train_outputs/| sort | tail -1)
+echo $LATEST_FOLDER
+# continue the previous training
+CUDA_VISIBLE_DEVICES="" python TTS/bin/train_wavernn_vocoder.py --continue_path $BASEDIR/train_outputs/$LATEST_FOLDER
+# remove all the outputs
+rm -rf $BASEDIR/train_outputs/$LATEST_FOLDER
\ No newline at end of file
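
--
Note for reviewers: every shape assertion in these tests follows from the same
arithmetic relating seq_len, hop_length, "padding", and upsample_factors. Below
is a minimal, dependency-free sketch of that arithmetic, assuming the values
from test_vocoder_wavernn_config.json; the variable names are illustrative only
and do not exist in the TTS codebase.

    from math import prod

    # values taken from tests/inputs/test_vocoder_wavernn_config.json
    hop_length = 256
    seq_len = 1280                # "seq_len": has to be divisible by hop_length
    pad = 2                       # "padding": resnet context frames on each side
    upsample_factors = [4, 8, 8]  # their product must equal hop_length
    bits = 10                     # "mode": 10 -> 10-bit discretized output

    assert seq_len % hop_length == 0
    assert prod(upsample_factors) == hop_length

    # mel frames per batch entry, matching dummy_m = torch.rand((2, 80, 9)) in
    # test_vocoder_wavernn.py and expected_feat_shape in the dataset test
    frames = seq_len // hop_length + 2 * pad
    assert frames == 9

    # forward() output classes in bit mode, matching the (2, 1280, 4 * 256) assert
    assert 2 ** bits == 4 * 256

    # generate() output length for a y_size-frame spectrogram, matching the
    # (256 * (y_size - 1),) assert in test_vocoder_wavernn.py
    y_size = 29  # stand-in for random.randrange(20, 60)
    print(frames, hop_length * (y_size - 1))  # -> 9 7168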