mirror of https://github.com/coqui-ai/TTS.git
update tests
parent 6c60c182b5
commit 07d2d28ae6
@@ -5,10 +5,9 @@
 "wavernn_lib_path": null, // Rootpath to wavernn project folder to be imported. If this is null, model uses GL for speech synthesis.
 "wavernn_file": null, // wavernn checkpoint file name
 "wavernn_config": null, // wavernn config file
-"pwgan_lib_path": null,
-"pwgan_file": null,
-"pwgan_config": null,
-"is_wavernn_batched":true,
+"vocoder_config":null,
+"vocoder_file": null,
+"is_wavernn_batched":true,
 "port": 5002,
 "use_cuda": false,
 "debug": true
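The server config drops the ParallelWaveGAN-specific pwgan_* keys in favor of vocoder-agnostic vocoder_file / vocoder_config. A minimal sketch of how such generic keys can be consumed; the path and fallback logic are illustrative, not the project's actual loader (load_config is TTS.utils.io.load_config, imported the same way in the deleted test further below):

    from TTS.utils.io import load_config

    conf = load_config("server/conf.json")  # hypothetical path
    if conf["vocoder_file"] is not None:
        # any neural vocoder checkpoint (WaveRNN, PWGAN, MelGAN, ...) can sit
        # behind the generic keys; the old pwgan_* names hardwired one backend
        vocoder_config = load_config(conf["vocoder_config"])
    else:
        # per the wavernn_lib_path comment above: with no vocoder configured,
        # Griffin-Lim (GL) is used for waveform synthesis
        vocoder_config = None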
@@ -5,10 +5,10 @@
 "audio":{
     // Audio processing parameters
     "num_mels": 80, // size of the mel spec frame.
-    "num_freq": 1025, // number of stft frequency levels. Size of the linear spectrogram frame.
+    "fft_size": 1024, // number of stft frequency levels. Size of the linear spectrogram frame.
     "sample_rate": 22050, // DATASET-RELATED: wav sample-rate. If different than the original data, it is resampled.
-    "frame_length_ms": 50, // stft window length in ms.
-    "frame_shift_ms": 12.5, // stft window hop-length in ms.
+    "hop_length": 256, // stft hop length in samples.
+    "win_length": 1024, // stft window length in samples.
     "preemphasis": 0.98, // pre-emphasis to reduce spec noise and make it more structured. If 0.0, no pre-emphasis.
     "min_level_db": -100, // normalization range
     "ref_level_db": 20, // reference level db, theoretically 20db is the sound of air.
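This hunk replaces the millisecond-based STFT settings with sample-based ones. A quick sanity check (my arithmetic, not from the commit): at 22050 Hz the new values are the nearest powers of two to the old millisecond settings.

    sample_rate = 22050
    hop_length = 256    # replaces "frame_shift_ms": 12.5 -> 0.0125 * 22050 = 275.625 samples
    win_length = 1024   # replaces "frame_length_ms": 50  -> 0.050  * 22050 = 1102.5 samples

    print(1000 * hop_length / sample_rate)   # ~11.61 ms, close to the old 12.5 ms hop
    print(1000 * win_length / sample_rate)   # ~46.44 ms, close to the old 50 ms window

Note also that "fft_size": 1024 yields 1024 // 2 + 1 = 513 linear bins, whereas the old "num_freq": 1025 implied a 2048-point FFT, so the linear spectrogram resolution is halved here.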
@@ -1,7 +1,7 @@
 {
     "audio":{
         "audio_processor": "audio", // dictates the audio processor to use, if available.
         "num_mels": 80, // size of the mel spec frame.
         "num_freq": 513, // number of stft frequency levels. Size of the linear spectrogram frame.
         "sample_rate": 22050, // wav sample-rate. If different than the original data, it is resampled.
         "frame_length_ms": null, // stft window length in ms.
@@ -19,7 +19,8 @@
     "max_norm": 4, // scale normalization to range [-max_norm, max_norm] or [0, max_norm]
     "mel_fmin": 0, // minimum freq level for mel-spec. ~50 for male and ~95 for female voices. Tune for dataset!!
     "mel_fmax": 8000, // maximum freq level for mel-spec. Tune for dataset!!
-    "do_trim_silence": false
+    "do_trim_silence": false,
+    "spec_gain": 20
     },

     "characters":{
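spec_gain is newly exposed in the test config. A sketch of where such a gain factor conventionally enters, assuming the usual amplitude-to-decibel conversion (the function name and clamp value are illustrative, not necessarily the AudioProcessor's exact code):

    import numpy as np

    def amp_to_db(x, spec_gain=20):
        # spec_gain * log10(|S|) with spec_gain = 20 is the standard
        # amplitude-to-dB formula; promoting the factor to a config key
        # makes it tunable instead of hardcoded
        return spec_gain * np.log10(np.maximum(1e-5, x))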
@@ -76,7 +76,7 @@ class TestTTSDataset(unittest.TestCase):
         # TODO: more assertion here
         assert type(speaker_name[0]) is str
         assert linear_input.shape[0] == c.batch_size
-        assert linear_input.shape[2] == self.ap.num_freq
+        assert linear_input.shape[2] == self.ap.fft_size // 2 + 1
         assert mel_input.shape[0] == c.batch_size
         assert mel_input.shape[2] == c.audio['num_mels']
         # check normalization ranges
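The updated assertion tracks the config rename: a real-valued STFT with n_fft points keeps only the n_fft // 2 + 1 non-redundant frequency bins (the spectrum of a real signal is conjugate-symmetric), so "fft_size": 1024 gives 513 bins, matching the "num_freq": 513 already present in test_config.json. A quick check:

    import numpy as np

    fft_size = 1024
    spectrum = np.fft.rfft(np.zeros(fft_size))      # rfft keeps only non-negative frequencies
    assert spectrum.shape[0] == fft_size // 2 + 1   # 513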
@@ -28,6 +28,7 @@ class TacotronTrainTest(unittest.TestCase):
         mel_spec = torch.rand(8, 30, c.audio['num_mels']).to(device)
         mel_postnet_spec = torch.rand(8, 30, c.audio['num_mels']).to(device)
         mel_lengths = torch.randint(20, 30, (8, )).long().to(device)
+        mel_lengths[0] = 30
         stop_targets = torch.zeros(8, 30, 1).float().to(device)
         speaker_ids = torch.randint(0, 5, (8, )).long().to(device)
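Why pin mel_lengths[0]? torch.randint's upper bound is exclusive, so randint(20, 30, ...) alone never produces a sequence spanning all 30 padded frames; forcing one entry to 30 makes the dummy batch look like a real collated batch, where the longest sequence defines the padded length. In isolation:

    import torch

    mel_lengths = torch.randint(20, 30, (8,))  # values in [20, 29]; 30 never occurs
    mel_lengths[0] = 30                        # one sequence now fills all padded frames
    assert mel_lengths.max().item() == 30      # consistent with mel_spec.shape[1] == 30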
@@ -1,63 +0,0 @@
-import os
-import torch
-import unittest
-import numpy as np
-import tensorflow as tf
-
-from TTS.utils.io import load_config
-from TTS.tf.models.tacotron2 import Tacotron2
-
-#pylint: disable=unused-variable
-
-torch.manual_seed(1)
-use_cuda = torch.cuda.is_available()
-device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
-
-file_path = os.path.dirname(os.path.realpath(__file__))
-c = load_config(os.path.join(file_path, 'test_config.json'))
-
-
-class TacotronTFTrainTest(unittest.TestCase):
-
-    @staticmethod
-    def generate_dummy_inputs():
-        chars_seq = torch.randint(0, 24, (8, 128)).long().to(device)
-        chars_seq_lengths = torch.randint(100, 128, (8, )).long().to(device)
-        chars_seq_lengths = torch.sort(chars_seq_lengths, descending=True)[0]
-        mel_spec = torch.rand(8, 30, c.audio['num_mels']).to(device)
-        mel_postnet_spec = torch.rand(8, 30, c.audio['num_mels']).to(device)
-        mel_lengths = torch.randint(20, 30, (8, )).long().to(device)
-        stop_targets = torch.zeros(8, 30, 1).float().to(device)
-        speaker_ids = torch.randint(0, 5, (8, )).long().to(device)
-
-        chars_seq = tf.convert_to_tensor(chars_seq.cpu().numpy())
-        chars_seq_lengths = tf.convert_to_tensor(chars_seq_lengths.cpu().numpy())
-        mel_spec = tf.convert_to_tensor(mel_spec.cpu().numpy())
-        return chars_seq, chars_seq_lengths, mel_spec, mel_postnet_spec, mel_lengths,\
-            stop_targets, speaker_ids
-
-    def test_train_step(self):
-        ''' test forward pass '''
-        chars_seq, chars_seq_lengths, mel_spec, mel_postnet_spec, mel_lengths,\
-            stop_targets, speaker_ids = self.generate_dummy_inputs()
-
-        for idx in mel_lengths:
-            stop_targets[:, int(idx.item()):, 0] = 1.0
-
-        stop_targets = stop_targets.view(chars_seq.shape[0],
-                                         stop_targets.size(1) // c.r, -1)
-        stop_targets = (stop_targets.sum(2) > 0.0).unsqueeze(2).float().squeeze()
-
-        model = Tacotron2(num_chars=24, r=c.r, num_speakers=5)
-        # training pass
-        output = model(chars_seq, chars_seq_lengths, mel_spec, training=True)
-
-        # check model output shapes
-        assert np.all(output[0].shape == mel_spec.shape)
-        assert np.all(output[1].shape == mel_spec.shape)
-        assert output[2].shape[2] == chars_seq.shape[1]
-        assert output[2].shape[1] == (mel_spec.shape[1] // model.decoder.r)
-        assert output[3].shape[1] == (mel_spec.shape[1] // model.decoder.r)
-
-        # inference pass
-        output = model(chars_seq, training=False)
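The deleted TF test used the same stop-target folding found in the remaining torch tests: frames are grouped by the decoder's reduction factor r, and a group counts as a stop step if any of its frames lies past the sequence end. A standalone sketch with illustrative values (per-sequence masking shown here for clarity; the original loop indexed all batch rows at once):

    import torch

    B, T, r = 8, 30, 2                      # batch size, padded frames, reduction factor
    stop_targets = torch.zeros(B, T, 1)
    mel_lengths = torch.randint(20, 30, (B,))
    for b, length in enumerate(mel_lengths):
        stop_targets[b, length:, 0] = 1.0   # mark every frame past the sequence end

    folded = stop_targets.view(B, T // r, -1)        # (B, T // r, r)
    stop_per_step = (folded.sum(2) > 0.0).float()    # 1.0 where the decoder should stop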