update tests

pull/10/head
erogol 2020-07-08 10:21:45 +02:00
parent 6c60c182b5
commit 07d2d28ae6
6 changed files with 11 additions and 73 deletions

View File

@ -5,10 +5,9 @@
"wavernn_lib_path": null, // Rootpath to wavernn project folder to be imported. If this is null, model uses GL for speech synthesis.
"wavernn_file": null, // wavernn checkpoint file name
"wavernn_config": null, // wavernn config file
"pwgan_lib_path": null,
"pwgan_file": null,
"pwgan_config": null,
"is_wavernn_batched":true,
"vocoder_config":null,
"vocoder_file": null,
"is_wavernn_batched":true,
"port": 5002,
"use_cuda": false,
"debug": true

View File

@ -5,10 +5,10 @@
"audio":{
// Audio processing parameters
"num_mels": 80, // size of the mel spec frame.
"num_freq": 1025, // number of stft frequency levels. Size of the linear spectogram frame.
"fft_size": 1024, // number of stft frequency levels. Size of the linear spectogram frame.
"sample_rate": 22050, // DATASET-RELATED: wav sample-rate. If different than the original data, it is resampled.
"frame_length_ms": 50, // stft window length in ms.
"frame_shift_ms": 12.5, // stft window hop-lengh in ms.
"hop_length": 256,
"win_length": 1024,
"preemphasis": 0.98, // pre-emphasis to reduce spec noise and make it more structured. If 0.0, no -pre-emphasis.
"min_level_db": -100, // normalization range
"ref_level_db": 20, // reference level db, theoretically 20db is the sound of air.

View File

@ -1,7 +1,7 @@
 {
     "audio":{
         "audio_processor": "audio", // to use dictate different audio processors, if available.
-        "num_mels": 80, // size of the mel spec frame.
+        "num_mels": 80, // size of the mel spec frame.
         "num_freq": 513, // number of stft frequency levels. Size of the linear spectogram frame.
         "sample_rate": 22050, // wav sample-rate. If different than the original data, it is resampled.
         "frame_length_ms": null, // stft window length in ms.
@ -19,7 +19,8 @@
"max_norm": 4, // scale normalization to range [-max_norm, max_norm] or [0, max_norm]
"mel_fmin": 0, // minimum freq level for mel-spec. ~50 for male and ~95 for female voices. Tune for dataset!!
"mel_fmax": 8000, // maximum freq level for mel-spec. Tune for dataset!!
"do_trim_silence": false
"do_trim_silence": false,
"spec_gain": 20
},
"characters":{

View File

@ -76,7 +76,7 @@ class TestTTSDataset(unittest.TestCase):
             # TODO: more assertion here
             assert type(speaker_name[0]) is str
             assert linear_input.shape[0] == c.batch_size
-            assert linear_input.shape[2] == self.ap.num_freq
+            assert linear_input.shape[2] == self.ap.fft_size // 2 + 1
             assert mel_input.shape[0] == c.batch_size
             assert mel_input.shape[2] == c.audio['num_mels']
             # check normalization ranges
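
The assertion now derives the linear-spectrogram bin count from fft_size rather than reading a stored num_freq. A quick sanity check of that relationship, assuming librosa's STFT conventions:

import numpy as np
import librosa

y = np.random.randn(22050).astype(np.float32)  # one second of noise as a stand-in
S = librosa.stft(y, n_fft=1024, hop_length=256, win_length=1024)
assert S.shape[0] == 1024 // 2 + 1  # 513 frequency bins, matching the test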

View File

@ -28,6 +28,7 @@ class TacotronTrainTest(unittest.TestCase):
         mel_spec = torch.rand(8, 30, c.audio['num_mels']).to(device)
         mel_postnet_spec = torch.rand(8, 30, c.audio['num_mels']).to(device)
         mel_lengths = torch.randint(20, 30, (8, )).long().to(device)
+        mel_lengths[0] = 30
         stop_targets = torch.zeros(8, 30, 1).float().to(device)
         speaker_ids = torch.randint(0, 5, (8, )).long().to(device)
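
Pinning mel_lengths[0] = 30 guarantees at least one batch item spans the full padded length, so any mask built from the lengths covers the whole time axis of the [8, 30, num_mels] dummies. A minimal illustration (the mask construction is generic, not the project's helper):

import torch

T = 30  # padded time axis of the dummy spectrograms above
mel_lengths = torch.randint(20, 30, (8, )).long()
mel_lengths[0] = T  # without this, max(mel_lengths) can fall short of T

# a generic length mask now always covers the padded axis
mask = torch.arange(T)[None, :] < mel_lengths[:, None]
assert mask.shape == (8, T) and mel_lengths.max().item() == T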

View File

@ -1,63 +0,0 @@
-import os
-import torch
-import unittest
-import numpy as np
-import tensorflow as tf
-
-from TTS.utils.io import load_config
-from TTS.tf.models.tacotron2 import Tacotron2
-
-#pylint: disable=unused-variable
-
-torch.manual_seed(1)
-use_cuda = torch.cuda.is_available()
-device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
-
-file_path = os.path.dirname(os.path.realpath(__file__))
-c = load_config(os.path.join(file_path, 'test_config.json'))
-
-
-class TacotronTFTrainTest(unittest.TestCase):
-
-    @staticmethod
-    def generate_dummy_inputs():
-        chars_seq = torch.randint(0, 24, (8, 128)).long().to(device)
-        chars_seq_lengths = torch.randint(100, 128, (8, )).long().to(device)
-        chars_seq_lengths = torch.sort(chars_seq_lengths, descending=True)[0]
-        mel_spec = torch.rand(8, 30, c.audio['num_mels']).to(device)
-        mel_postnet_spec = torch.rand(8, 30, c.audio['num_mels']).to(device)
-        mel_lengths = torch.randint(20, 30, (8, )).long().to(device)
-        stop_targets = torch.zeros(8, 30, 1).float().to(device)
-        speaker_ids = torch.randint(0, 5, (8, )).long().to(device)
-
-        chars_seq = tf.convert_to_tensor(chars_seq.cpu().numpy())
-        chars_seq_lengths = tf.convert_to_tensor(chars_seq_lengths.cpu().numpy())
-        mel_spec = tf.convert_to_tensor(mel_spec.cpu().numpy())
-        return chars_seq, chars_seq_lengths, mel_spec, mel_postnet_spec, mel_lengths,\
-            stop_targets, speaker_ids
-
-    def test_train_step(self):
-        ''' test forward pass '''
-        chars_seq, chars_seq_lengths, mel_spec, mel_postnet_spec, mel_lengths,\
-            stop_targets, speaker_ids = self.generate_dummy_inputs()
-
-        for idx in mel_lengths:
-            stop_targets[:, int(idx.item()):, 0] = 1.0
-
-        stop_targets = stop_targets.view(chars_seq.shape[0],
-                                         stop_targets.size(1) // c.r, -1)
-        stop_targets = (stop_targets.sum(2) > 0.0).unsqueeze(2).float().squeeze()
-
-        model = Tacotron2(num_chars=24, r=c.r, num_speakers=5)
-        # training pass
-        output = model(chars_seq, chars_seq_lengths, mel_spec, training=True)
-
-        # check model output shapes
-        assert np.all(output[0].shape == mel_spec.shape)
-        assert np.all(output[1].shape == mel_spec.shape)
-        assert output[2].shape[2] == chars_seq.shape[1]
-        assert output[2].shape[1] == (mel_spec.shape[1] // model.decoder.r)
-        assert output[3].shape[1] == (mel_spec.shape[1] // model.decoder.r)
-
-        # inference pass
-        output = model(chars_seq, training=False)
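
For context, the stop-target reshaping performed by the deleted test (and still exercised by the remaining PyTorch tests) folds per-frame stop flags down to one target per decoder step of r frames. A standalone illustration:

import torch

B, T, r = 8, 30, 5  # batch, padded frames, decoder reduction factor
stop_targets = torch.zeros(B, T, 1)
stop_targets[:, 25:, 0] = 1.0  # frames at/after an example stop point

# group every r consecutive frames; a decoder step is a stop step if any
# frame in its group carries a stop flag -- the view/sum/>0 chain above
grouped = stop_targets.view(B, T // r, -1)
stop_per_step = (grouped.sum(2) > 0.0).float()
print(stop_per_step.shape)  # torch.Size([8, 6])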