From 07d2d28ae6716854256aaaa32a0ac6573a52fb01 Mon Sep 17 00:00:00 2001
From: erogol
Date: Wed, 8 Jul 2020 10:21:45 +0200
Subject: [PATCH] update tests

---
 tests/inputs/server_config.json       |  7 ++-
 tests/outputs/dummy_model_config.json |  6 +--
 tests/test_config.json                |  5 ++-
 tests/test_loader.py                  |  2 +-
 tests/test_tacotron2_model.py         |  1 +
 tests/test_tacotron2_tf_model.py      | 63 ---------------------------
 6 files changed, 11 insertions(+), 73 deletions(-)
 delete mode 100644 tests/test_tacotron2_tf_model.py

diff --git a/tests/inputs/server_config.json b/tests/inputs/server_config.json
index 7f5a60fb..9eb7f09f 100644
--- a/tests/inputs/server_config.json
+++ b/tests/inputs/server_config.json
@@ -5,10 +5,9 @@
     "wavernn_lib_path": null, // Rootpath to wavernn project folder to be imported. If this is null, model uses GL for speech synthesis.
     "wavernn_file": null, // wavernn checkpoint file name
     "wavernn_config": null, // wavernn config file
-    "pwgan_lib_path": null,
-    "pwgan_file": null,
-    "pwgan_config": null,
-    "is_wavernn_batched":true,
+    "vocoder_config": null, // vocoder config file
+    "vocoder_file": null, // vocoder checkpoint file name
+    "is_wavernn_batched": true,
     "port": 5002,
     "use_cuda": false,
     "debug": true

diff --git a/tests/outputs/dummy_model_config.json b/tests/outputs/dummy_model_config.json
index d301b61a..36fac3e5 100644
--- a/tests/outputs/dummy_model_config.json
+++ b/tests/outputs/dummy_model_config.json
@@ -5,10 +5,10 @@
     "audio":{
         // Audio processing parameters
         "num_mels": 80, // size of the mel spec frame.
-        "num_freq": 1025, // number of stft frequency levels. Size of the linear spectogram frame.
+        "fft_size": 1024, // stft frame size in samples. The linear spectrogram frame has fft_size // 2 + 1 frequency bins.
         "sample_rate": 22050, // DATASET-RELATED: wav sample-rate. If different than the original data, it is resampled.
-        "frame_length_ms": 50, // stft window length in ms.
-        "frame_shift_ms": 12.5, // stft window hop-lengh in ms.
+        "hop_length": 256, // stft hop length in samples (replaces frame_shift_ms).
+        "win_length": 1024, // stft window length in samples (replaces frame_length_ms).
         "preemphasis": 0.98, // pre-emphasis to reduce spec noise and make it more structured. If 0.0, no -pre-emphasis.
         "min_level_db": -100, // normalization range
         "ref_level_db": 20, // reference level db, theoretically 20db is the sound of air.
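
For reference, the move from millisecond-based STFT settings to sample-based
ones keeps roughly the same analysis window at the configured 22050 Hz sample
rate, and fft_size now determines the linear spectrogram frame size asserted
further down in test_loader.py. A quick sanity check in plain Python (values
copied from the config above; the script itself is only illustrative):

    sample_rate = 22050
    hop_length = 256   # replaces "frame_shift_ms": 12.5
    win_length = 1024  # replaces "frame_length_ms": 50
    fft_size = 1024    # replaces "num_freq": 1025

    print(hop_length / sample_rate * 1000)  # ~11.6 ms, close to the old 12.5 ms
    print(win_length / sample_rate * 1000)  # ~46.4 ms, close to the old 50 ms
    print(fft_size // 2 + 1)                # 513 bins per linear spectrogram frame

Note that the linear frame therefore shrinks from the old 1025 bins to 513,
which is exactly what the updated assertion in test_loader.py reflects.
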
- "do_trim_silence": false + "do_trim_silence": false, + "spec_gain": 20 }, "characters":{ diff --git a/tests/test_loader.py b/tests/test_loader.py index 9edd233f..52d24c7a 100644 --- a/tests/test_loader.py +++ b/tests/test_loader.py @@ -76,7 +76,7 @@ class TestTTSDataset(unittest.TestCase): # TODO: more assertion here assert type(speaker_name[0]) is str assert linear_input.shape[0] == c.batch_size - assert linear_input.shape[2] == self.ap.num_freq + assert linear_input.shape[2] == self.ap.fft_size // 2 + 1 assert mel_input.shape[0] == c.batch_size assert mel_input.shape[2] == c.audio['num_mels'] # check normalization ranges diff --git a/tests/test_tacotron2_model.py b/tests/test_tacotron2_model.py index cf1d0778..ae9f20a2 100644 --- a/tests/test_tacotron2_model.py +++ b/tests/test_tacotron2_model.py @@ -28,6 +28,7 @@ class TacotronTrainTest(unittest.TestCase): mel_spec = torch.rand(8, 30, c.audio['num_mels']).to(device) mel_postnet_spec = torch.rand(8, 30, c.audio['num_mels']).to(device) mel_lengths = torch.randint(20, 30, (8, )).long().to(device) + mel_lengths[0] = 30 stop_targets = torch.zeros(8, 30, 1).float().to(device) speaker_ids = torch.randint(0, 5, (8, )).long().to(device) diff --git a/tests/test_tacotron2_tf_model.py b/tests/test_tacotron2_tf_model.py deleted file mode 100644 index aca363a8..00000000 --- a/tests/test_tacotron2_tf_model.py +++ /dev/null @@ -1,63 +0,0 @@ -import os -import torch -import unittest -import numpy as np -import tensorflow as tf - -from TTS.utils.io import load_config -from TTS.tf.models.tacotron2 import Tacotron2 - -#pylint: disable=unused-variable - -torch.manual_seed(1) -use_cuda = torch.cuda.is_available() -device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu") - -file_path = os.path.dirname(os.path.realpath(__file__)) -c = load_config(os.path.join(file_path, 'test_config.json')) - - -class TacotronTFTrainTest(unittest.TestCase): - - @staticmethod - def generate_dummy_inputs(): - chars_seq = torch.randint(0, 24, (8, 128)).long().to(device) - chars_seq_lengths = torch.randint(100, 128, (8, )).long().to(device) - chars_seq_lengths = torch.sort(chars_seq_lengths, descending=True)[0] - mel_spec = torch.rand(8, 30, c.audio['num_mels']).to(device) - mel_postnet_spec = torch.rand(8, 30, c.audio['num_mels']).to(device) - mel_lengths = torch.randint(20, 30, (8, )).long().to(device) - stop_targets = torch.zeros(8, 30, 1).float().to(device) - speaker_ids = torch.randint(0, 5, (8, )).long().to(device) - - chars_seq = tf.convert_to_tensor(chars_seq.cpu().numpy()) - chars_seq_lengths = tf.convert_to_tensor(chars_seq_lengths.cpu().numpy()) - mel_spec = tf.convert_to_tensor(mel_spec.cpu().numpy()) - return chars_seq, chars_seq_lengths, mel_spec, mel_postnet_spec, mel_lengths,\ - stop_targets, speaker_ids - - def test_train_step(self): - ''' test forward pass ''' - chars_seq, chars_seq_lengths, mel_spec, mel_postnet_spec, mel_lengths,\ - stop_targets, speaker_ids = self.generate_dummy_inputs() - - for idx in mel_lengths: - stop_targets[:, int(idx.item()):, 0] = 1.0 - - stop_targets = stop_targets.view(chars_seq.shape[0], - stop_targets.size(1) // c.r, -1) - stop_targets = (stop_targets.sum(2) > 0.0).unsqueeze(2).float().squeeze() - - model = Tacotron2(num_chars=24, r=c.r, num_speakers=5) - # training pass - output = model(chars_seq, chars_seq_lengths, mel_spec, training=True) - - # check model output shapes - assert np.all(output[0].shape == mel_spec.shape) - assert np.all(output[1].shape == mel_spec.shape) - assert output[2].shape[2] == 
diff --git a/tests/test_tacotron2_tf_model.py b/tests/test_tacotron2_tf_model.py
deleted file mode 100644
index aca363a8..00000000
--- a/tests/test_tacotron2_tf_model.py
+++ /dev/null
@@ -1,63 +0,0 @@
-import os
-import torch
-import unittest
-import numpy as np
-import tensorflow as tf
-
-from TTS.utils.io import load_config
-from TTS.tf.models.tacotron2 import Tacotron2
-
-#pylint: disable=unused-variable
-
-torch.manual_seed(1)
-use_cuda = torch.cuda.is_available()
-device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
-
-file_path = os.path.dirname(os.path.realpath(__file__))
-c = load_config(os.path.join(file_path, 'test_config.json'))
-
-
-class TacotronTFTrainTest(unittest.TestCase):
-
-    @staticmethod
-    def generate_dummy_inputs():
-        chars_seq = torch.randint(0, 24, (8, 128)).long().to(device)
-        chars_seq_lengths = torch.randint(100, 128, (8, )).long().to(device)
-        chars_seq_lengths = torch.sort(chars_seq_lengths, descending=True)[0]
-        mel_spec = torch.rand(8, 30, c.audio['num_mels']).to(device)
-        mel_postnet_spec = torch.rand(8, 30, c.audio['num_mels']).to(device)
-        mel_lengths = torch.randint(20, 30, (8, )).long().to(device)
-        stop_targets = torch.zeros(8, 30, 1).float().to(device)
-        speaker_ids = torch.randint(0, 5, (8, )).long().to(device)
-
-        chars_seq = tf.convert_to_tensor(chars_seq.cpu().numpy())
-        chars_seq_lengths = tf.convert_to_tensor(chars_seq_lengths.cpu().numpy())
-        mel_spec = tf.convert_to_tensor(mel_spec.cpu().numpy())
-        return chars_seq, chars_seq_lengths, mel_spec, mel_postnet_spec, mel_lengths,\
-            stop_targets, speaker_ids
-
-    def test_train_step(self):
-        ''' test forward pass '''
-        chars_seq, chars_seq_lengths, mel_spec, mel_postnet_spec, mel_lengths,\
-            stop_targets, speaker_ids = self.generate_dummy_inputs()
-
-        for idx in mel_lengths:
-            stop_targets[:, int(idx.item()):, 0] = 1.0
-
-        stop_targets = stop_targets.view(chars_seq.shape[0],
-                                         stop_targets.size(1) // c.r, -1)
-        stop_targets = (stop_targets.sum(2) > 0.0).unsqueeze(2).float().squeeze()
-
-        model = Tacotron2(num_chars=24, r=c.r, num_speakers=5)
-        # training pass
-        output = model(chars_seq, chars_seq_lengths, mel_spec, training=True)
-
-        # check model output shapes
-        assert np.all(output[0].shape == mel_spec.shape)
-        assert np.all(output[1].shape == mel_spec.shape)
-        assert output[2].shape[2] == chars_seq.shape[1]
-        assert output[2].shape[1] == (mel_spec.shape[1] // model.decoder.r)
-        assert output[3].shape[1] == (mel_spec.shape[1] // model.decoder.r)
-
-        # inference pass
-        output = model(chars_seq, training=False)
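
The deleted TF test prepared stop targets the same way: per-frame stop flags
are folded into groups of r decoder steps, since the decoder emits r frames
per step. A small self-contained sketch of that folding (batch size, lengths,
and r = 5 are illustrative; unlike the deleted code, the loop here assigns
flags per batch item rather than batch-wide):

    import torch

    B, T, r = 8, 30, 5  # batch, padded frames, reduction factor
    mel_lengths = torch.randint(20, 30, (B,)).long()
    mel_lengths[0] = T

    stop_targets = torch.zeros(B, T, 1)
    for b, length in enumerate(mel_lengths):
        stop_targets[b, int(length):, 0] = 1.0  # flag padded frames as "stop"

    # Fold T frames into T // r decoder steps; a step counts as a stop step
    # when any of its r frames is flagged.
    stop_targets = stop_targets.view(B, T // r, -1)
    stop_targets = (stop_targets.sum(2) > 0.0).float()
    print(stop_targets.shape)  # torch.Size([8, 6])
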