From 07d2d28ae6716854256aaaa32a0ac6573a52fb01 Mon Sep 17 00:00:00 2001
From: erogol
Date: Wed, 8 Jul 2020 10:21:45 +0200
Subject: [PATCH] update tests

---
 tests/inputs/server_config.json       |  7 ++-
 tests/outputs/dummy_model_config.json |  6 +--
 tests/test_config.json                |  5 ++-
 tests/test_loader.py                  |  2 +-
 tests/test_tacotron2_model.py         |  1 +
 tests/test_tacotron2_tf_model.py      | 63 ---------------------------
 6 files changed, 11 insertions(+), 73 deletions(-)
 delete mode 100644 tests/test_tacotron2_tf_model.py

diff --git a/tests/inputs/server_config.json b/tests/inputs/server_config.json
index 7f5a60fb..9eb7f09f 100644
--- a/tests/inputs/server_config.json
+++ b/tests/inputs/server_config.json
@@ -5,10 +5,9 @@
     "wavernn_lib_path": null, // Rootpath to wavernn project folder to be imported. If this is null, model uses GL for speech synthesis.
     "wavernn_file": null, // wavernn checkpoint file name
     "wavernn_config": null, // wavernn config file
-    "pwgan_lib_path": null,
-    "pwgan_file": null,
-    "pwgan_config": null,
-    "is_wavernn_batched":true,
+    "vocoder_config": null, // vocoder config file
+    "vocoder_file": null, // vocoder checkpoint file name
+    "is_wavernn_batched": true,
     "port": 5002,
     "use_cuda": false,
     "debug": true

diff --git a/tests/outputs/dummy_model_config.json b/tests/outputs/dummy_model_config.json
index d301b61a..36fac3e5 100644
--- a/tests/outputs/dummy_model_config.json
+++ b/tests/outputs/dummy_model_config.json
@@ -5,10 +5,10 @@
     "audio":{
         // Audio processing parameters
         "num_mels": 80, // size of the mel spec frame.
-        "num_freq": 1025, // number of stft frequency levels. Size of the linear spectogram frame.
+        "fft_size": 1024, // stft frame size in samples. The linear spectrogram frame has fft_size // 2 + 1 frequency bins.
         "sample_rate": 22050, // DATASET-RELATED: wav sample-rate. If different than the original data, it is resampled.
-        "frame_length_ms": 50, // stft window length in ms.
-        "frame_shift_ms": 12.5, // stft window hop-lengh in ms.
+        "hop_length": 256, // stft hop length in samples (replaces frame_shift_ms).
+        "win_length": 1024, // stft window length in samples (replaces frame_length_ms).
         "preemphasis": 0.98, // pre-emphasis to reduce spec noise and make it more structured. If 0.0, no -pre-emphasis.
         "min_level_db": -100, // normalization range
         "ref_level_db": 20, // reference level db, theoretically 20db is the sound of air.
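
For reference, the move from millisecond-based STFT settings to sample-based
ones keeps roughly the same analysis window at the configured 22050 Hz sample
rate, and fft_size now determines the linear spectrogram frame size asserted
further down in test_loader.py. A quick sanity check in plain Python (values
copied from the config above; the script itself is only illustrative):

    sample_rate = 22050
    hop_length = 256   # replaces "frame_shift_ms": 12.5
    win_length = 1024  # replaces "frame_length_ms": 50
    fft_size = 1024    # replaces "num_freq": 1025

    print(hop_length / sample_rate * 1000)  # ~11.6 ms, close to the old 12.5 ms
    print(win_length / sample_rate * 1000)  # ~46.4 ms, close to the old 50 ms
    print(fft_size // 2 + 1)                # 513 bins per linear spectrogram frame

Note that the linear frame therefore shrinks from the old 1025 bins to 513,
which is exactly what the updated assertion in test_loader.py reflects.
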
- "do_trim_silence": false + "do_trim_silence": false, + "spec_gain": 20 }, "characters":{ diff --git a/tests/test_loader.py b/tests/test_loader.py index 9edd233f..52d24c7a 100644 --- a/tests/test_loader.py +++ b/tests/test_loader.py @@ -76,7 +76,7 @@ class TestTTSDataset(unittest.TestCase): # TODO: more assertion here assert type(speaker_name[0]) is str assert linear_input.shape[0] == c.batch_size - assert linear_input.shape[2] == self.ap.num_freq + assert linear_input.shape[2] == self.ap.fft_size // 2 + 1 assert mel_input.shape[0] == c.batch_size assert mel_input.shape[2] == c.audio['num_mels'] # check normalization ranges diff --git a/tests/test_tacotron2_model.py b/tests/test_tacotron2_model.py index cf1d0778..ae9f20a2 100644 --- a/tests/test_tacotron2_model.py +++ b/tests/test_tacotron2_model.py @@ -28,6 +28,7 @@ class TacotronTrainTest(unittest.TestCase): mel_spec = torch.rand(8, 30, c.audio['num_mels']).to(device) mel_postnet_spec = torch.rand(8, 30, c.audio['num_mels']).to(device) mel_lengths = torch.randint(20, 30, (8, )).long().to(device) + mel_lengths[0] = 30 stop_targets = torch.zeros(8, 30, 1).float().to(device) speaker_ids = torch.randint(0, 5, (8, )).long().to(device) diff --git a/tests/test_tacotron2_tf_model.py b/tests/test_tacotron2_tf_model.py deleted file mode 100644 index aca363a8..00000000 --- a/tests/test_tacotron2_tf_model.py +++ /dev/null @@ -1,63 +0,0 @@ -import os -import torch -import unittest -import numpy as np -import tensorflow as tf - -from TTS.utils.io import load_config -from TTS.tf.models.tacotron2 import Tacotron2 - -#pylint: disable=unused-variable - -torch.manual_seed(1) -use_cuda = torch.cuda.is_available() -device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu") - -file_path = os.path.dirname(os.path.realpath(__file__)) -c = load_config(os.path.join(file_path, 'test_config.json')) - - -class TacotronTFTrainTest(unittest.TestCase): - - @staticmethod - def generate_dummy_inputs(): - chars_seq = torch.randint(0, 24, (8, 128)).long().to(device) - chars_seq_lengths = torch.randint(100, 128, (8, )).long().to(device) - chars_seq_lengths = torch.sort(chars_seq_lengths, descending=True)[0] - mel_spec = torch.rand(8, 30, c.audio['num_mels']).to(device) - mel_postnet_spec = torch.rand(8, 30, c.audio['num_mels']).to(device) - mel_lengths = torch.randint(20, 30, (8, )).long().to(device) - stop_targets = torch.zeros(8, 30, 1).float().to(device) - speaker_ids = torch.randint(0, 5, (8, )).long().to(device) - - chars_seq = tf.convert_to_tensor(chars_seq.cpu().numpy()) - chars_seq_lengths = tf.convert_to_tensor(chars_seq_lengths.cpu().numpy()) - mel_spec = tf.convert_to_tensor(mel_spec.cpu().numpy()) - return chars_seq, chars_seq_lengths, mel_spec, mel_postnet_spec, mel_lengths,\ - stop_targets, speaker_ids - - def test_train_step(self): - ''' test forward pass ''' - chars_seq, chars_seq_lengths, mel_spec, mel_postnet_spec, mel_lengths,\ - stop_targets, speaker_ids = self.generate_dummy_inputs() - - for idx in mel_lengths: - stop_targets[:, int(idx.item()):, 0] = 1.0 - - stop_targets = stop_targets.view(chars_seq.shape[0], - stop_targets.size(1) // c.r, -1) - stop_targets = (stop_targets.sum(2) > 0.0).unsqueeze(2).float().squeeze() - - model = Tacotron2(num_chars=24, r=c.r, num_speakers=5) - # training pass - output = model(chars_seq, chars_seq_lengths, mel_spec, training=True) - - # check model output shapes - assert np.all(output[0].shape == mel_spec.shape) - assert np.all(output[1].shape == mel_spec.shape) - assert output[2].shape[2] == 
diff --git a/tests/test_tacotron2_tf_model.py b/tests/test_tacotron2_tf_model.py
deleted file mode 100644
index aca363a8..00000000
--- a/tests/test_tacotron2_tf_model.py
+++ /dev/null
@@ -1,63 +0,0 @@
-import os
-import torch
-import unittest
-import numpy as np
-import tensorflow as tf
-
-from TTS.utils.io import load_config
-from TTS.tf.models.tacotron2 import Tacotron2
-
-#pylint: disable=unused-variable
-
-torch.manual_seed(1)
-use_cuda = torch.cuda.is_available()
-device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
-
-file_path = os.path.dirname(os.path.realpath(__file__))
-c = load_config(os.path.join(file_path, 'test_config.json'))
-
-
-class TacotronTFTrainTest(unittest.TestCase):
-
-    @staticmethod
-    def generate_dummy_inputs():
-        chars_seq = torch.randint(0, 24, (8, 128)).long().to(device)
-        chars_seq_lengths = torch.randint(100, 128, (8, )).long().to(device)
-        chars_seq_lengths = torch.sort(chars_seq_lengths, descending=True)[0]
-        mel_spec = torch.rand(8, 30, c.audio['num_mels']).to(device)
-        mel_postnet_spec = torch.rand(8, 30, c.audio['num_mels']).to(device)
-        mel_lengths = torch.randint(20, 30, (8, )).long().to(device)
-        stop_targets = torch.zeros(8, 30, 1).float().to(device)
-        speaker_ids = torch.randint(0, 5, (8, )).long().to(device)
-
-        chars_seq = tf.convert_to_tensor(chars_seq.cpu().numpy())
-        chars_seq_lengths = tf.convert_to_tensor(chars_seq_lengths.cpu().numpy())
-        mel_spec = tf.convert_to_tensor(mel_spec.cpu().numpy())
-        return chars_seq, chars_seq_lengths, mel_spec, mel_postnet_spec, mel_lengths,\
-            stop_targets, speaker_ids
-
-    def test_train_step(self):
-        ''' test forward pass '''
-        chars_seq, chars_seq_lengths, mel_spec, mel_postnet_spec, mel_lengths,\
-            stop_targets, speaker_ids = self.generate_dummy_inputs()
-
-        for idx in mel_lengths:
-            stop_targets[:, int(idx.item()):, 0] = 1.0
-
-        stop_targets = stop_targets.view(chars_seq.shape[0],
-                                         stop_targets.size(1) // c.r, -1)
-        stop_targets = (stop_targets.sum(2) > 0.0).unsqueeze(2).float().squeeze()
-
-        model = Tacotron2(num_chars=24, r=c.r, num_speakers=5)
-        # training pass
-        output = model(chars_seq, chars_seq_lengths, mel_spec, training=True)
-
-        # check model output shapes
-        assert np.all(output[0].shape == mel_spec.shape)
-        assert np.all(output[1].shape == mel_spec.shape)
-        assert output[2].shape[2] == chars_seq.shape[1]
-        assert output[2].shape[1] == (mel_spec.shape[1] // model.decoder.r)
-        assert output[3].shape[1] == (mel_spec.shape[1] // model.decoder.r)
-
-        # inference pass
-        output = model(chars_seq, training=False)
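
The deleted TF test prepared stop targets the same way: per-frame stop flags
are folded into groups of r decoder steps, since the decoder emits r frames
per step. A small self-contained sketch of that folding (batch size, lengths,
and r = 5 are illustrative; unlike the deleted code, the loop here assigns
flags per batch item rather than batch-wide):

    import torch

    B, T, r = 8, 30, 5  # batch, padded frames, reduction factor
    mel_lengths = torch.randint(20, 30, (B,)).long()
    mel_lengths[0] = T

    stop_targets = torch.zeros(B, T, 1)
    for b, length in enumerate(mel_lengths):
        stop_targets[b, int(length):, 0] = 1.0  # flag padded frames as "stop"

    # Fold T frames into T // r decoder steps; a step counts as a stop step
    # when any of its r frames is flagged.
    stop_targets = stop_targets.view(B, T // r, -1)
    stop_targets = (stop_targets.sum(2) > 0.0).float()
    print(stop_targets.shape)  # torch.Size([8, 6])
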