mirror of https://github.com/coqui-ai/TTS.git
pylint and test fixes
parent
2abe3df153
commit
fede46e96e
|
@ -13,7 +13,6 @@ Sample run on LJSpeech dataset.
|
|||
|
||||
|
||||
import argparse
|
||||
import glob
|
||||
import importlib
|
||||
import os
|
||||
|
||||
|
@ -22,7 +21,7 @@ import torch
|
|||
from torch.utils.data import DataLoader
|
||||
from tqdm import tqdm
|
||||
from TTS.tts.datasets.TTSDataset import MyDataset
|
||||
from TTS.tts.utils.generic_utils import sequence_mask, setup_model
|
||||
from TTS.tts.utils.generic_utils import setup_model
|
||||
from TTS.tts.utils.io import load_checkpoint
|
||||
from TTS.tts.utils.text.symbols import make_symbols, phonemes, symbols
|
||||
from TTS.utils.audio import AudioProcessor
|
||||
|
@ -101,7 +100,7 @@ if __name__ == '__main__':
|
|||
ap=ap,
|
||||
meta_data=meta_data,
|
||||
tp=C.characters if 'characters' in C.keys() else None,
|
||||
add_blank=c['add_blank'] if 'add_blank' in C.keys() else False,
|
||||
add_blank=C['add_blank'] if 'add_blank' in C.keys() else False,
|
||||
use_phonemes=C.use_phonemes,
|
||||
phoneme_cache_path=C.phoneme_cache_path,
|
||||
phoneme_language=C.phoneme_language,
|
||||
|
|
|
@ -155,7 +155,7 @@ def train(data_loader, model, criterion, optimizer, scheduler,
|
|||
|
||||
# format data
|
||||
text_input, text_lengths, mel_targets, mel_lengths, speaker_c,\
|
||||
avg_text_length, avg_spec_length, attn_mask, dur_target, item_idx = format_data(data)
|
||||
avg_text_length, avg_spec_length, _, dur_target, _ = format_data(data)
|
||||
|
||||
loader_time = time.time() - end_time
|
||||
|
||||
|
@ -302,7 +302,7 @@ def evaluate(data_loader, model, criterion, ap, global_step, epoch):
|
|||
|
||||
# format data
|
||||
text_input, text_lengths, mel_targets, mel_lengths, speaker_c,\
|
||||
avg_text_length, avg_spec_length, attn_mask, dur_target, item_idx = format_data(data)
|
||||
_, _, _, dur_target, _ = format_data(data)
|
||||
|
||||
# forward pass model
|
||||
with torch.cuda.amp.autocast(enabled=c.mixed_precision):
|
||||
|
|
|
@ -379,7 +379,7 @@ class MonotonicDynamicConvolutionAttention(nn.Module):
|
|||
def __init__(
|
||||
self,
|
||||
query_dim,
|
||||
embedding_dim,
|
||||
embedding_dim, # pylint: disable=unused-argument
|
||||
attention_dim,
|
||||
static_filter_dim,
|
||||
static_kernel_size,
|
||||
|
@ -447,7 +447,7 @@ class MonotonicDynamicConvolutionAttention(nn.Module):
|
|||
context = torch.bmm(attention_weights.unsqueeze(1), inputs).squeeze(1)
|
||||
return context
|
||||
|
||||
def preprocess_inputs(self, inputs):  # pylint: disable=no-self-use
    """Return ``None``; this attention variant does no input preprocessing.

    Args:
        inputs: ignored.

    Returns:
        None, unconditionally.
    """
    # NOTE(review): presumably kept so this class exposes the same method as
    # other attention modules -- confirm against the callers.
    return None
|
||||
|
||||
def init_states(self, inputs):
|
||||
|
|
|
@ -1,15 +1,17 @@
|
|||
import torch
|
||||
from torch import nn
|
||||
from .normalization import TemporalBatchNorm1d
|
||||
|
||||
|
||||
class ZeroTemporalPad(nn.Module):
    """Pad sequences to equal length in the temporal dimension.

    Wraps an ``nn.ZeroPad2d`` layer that zero-pads the second-to-last
    dimension of the input and leaves the last dimension untouched.
    (assumes the second-to-last dimension is time -- TODO confirm with callers)

    Args:
        kernel_size: kernel size of the convolution the padding is meant for.
        dilation: dilation of that convolution.
    """

    def __init__(self, kernel_size, dilation):
        super().__init__()
        # Total amount of padding that keeps the length unchanged after a
        # stride-1 convolution with this kernel size and dilation.
        total_pad = dilation * (kernel_size - 1)
        # Split the padding; when it is odd the extra element goes at the end.
        begin = total_pad // 2
        end = total_pad - begin
        # ZeroPad2d takes (left, right, top, bottom): no padding on the last
        # dimension, (begin, end) on the second-to-last one.
        self.pad_layer = nn.ZeroPad2d((0, 0, begin, end))

    def forward(self, x):
        """Return ``x`` zero-padded along its second-to-last dimension."""
        return self.pad_layer(x)
|
||||
|
||||
|
||||
class ConvBN(nn.Module):
|
||||
|
|
|
@ -116,6 +116,8 @@ class Encoder(nn.Module):
|
|||
hidden_channels,
|
||||
kernel_size=5,
|
||||
num_layers=3 + num_layers)
|
||||
else:
|
||||
raise ValueError(" [!] Unkown encoder type.")
|
||||
|
||||
# final projection layers
|
||||
self.proj_m = nn.Conv1d(hidden_channels, out_channels, 1)
|
||||
|
|
|
@ -241,6 +241,7 @@ class GuidedAttentionLoss(torch.nn.Module):
|
|||
|
||||
|
||||
class Huber(nn.Module):
|
||||
# pylint: disable=R0201
|
||||
def forward(self, x, y, length=None):
|
||||
"""
|
||||
Shapes:
|
||||
|
|
|
@ -1,11 +1,5 @@
|
|||
import torch
|
||||
from torch import nn
|
||||
from torch.nn import functional as F
|
||||
|
||||
from TTS.tts.layers.glow_tts.transformer import Transformer
|
||||
from TTS.tts.layers.glow_tts.glow import ConvLayerNorm
|
||||
from TTS.tts.utils.generic_utils import sequence_mask
|
||||
from TTS.tts.layers.generic.res_conv_bn import ResidualConvBNBlock, ConvBNBlock
|
||||
from TTS.tts.layers.generic.res_conv_bn import ConvBNBlock, ResidualConvBNBlock
|
||||
|
||||
|
||||
class Decoder(nn.Module):
|
||||
|
@ -35,7 +29,8 @@ class Decoder(nn.Module):
|
|||
nn.Conv1d(hidden_channels, out_channels, 1),
|
||||
)
|
||||
|
||||
def forward(self, x, x_mask, g=None):  # pylint: disable=unused-argument
    """Run the decoder stack and project the result to the output channels.

    Args:
        x: input passed to ``self.decoder`` and re-added as a residual.
        x_mask: mask forwarded to ``self.decoder`` together with ``x``.
        g: unused for now (multi-speaker conditioning is not implemented).

    Returns:
        Output of ``self.post_net`` applied to the residual sum.
    """
    # TODO: implement multi-speaker
    o = self.decoder(x, x_mask)
    # residual connection around the decoder + post convolution
    o = self.post_conv(o) + x
    return self.post_net(o)
|
||||
|
|
|
@ -17,6 +17,7 @@ class PositionalEncoding(nn.Module):
|
|||
"""
|
||||
|
||||
def __init__(self, dim, dropout=0.0, max_len=5000):
|
||||
super().__init__()
|
||||
if dim % 2 != 0:
|
||||
raise ValueError("Cannot use sin/cos positional encoding with "
|
||||
"odd dim (got dim={:d})".format(dim))
|
||||
|
@ -27,7 +28,6 @@ class PositionalEncoding(nn.Module):
|
|||
pe[:, 0::2] = torch.sin(position.float() * div_term)
|
||||
pe[:, 1::2] = torch.cos(position.float() * div_term)
|
||||
pe = pe.unsqueeze(0).transpose(1, 2)
|
||||
super(PositionalEncoding, self).__init__()
|
||||
self.register_buffer('pe', pe)
|
||||
if dropout > 0:
|
||||
self.dropout = nn.Dropout(p=dropout)
|
||||
|
@ -125,7 +125,7 @@ class Encoder(nn.Module):
|
|||
num_layers=3,
|
||||
dropout_p=0.5)
|
||||
# text encoder
|
||||
self.encoder = Transformer(hidden_channels, **encoder_params)
|
||||
self.encoder = Transformer(hidden_channels, **encoder_params) # pylint: disable=unexpected-keyword-arg
|
||||
elif encoder_type.lower() == 'residual_conv_bn':
|
||||
self.pre = nn.Sequential(
|
||||
nn.Conv1d(hidden_channels, hidden_channels, 1), nn.ReLU())
|
||||
|
@ -139,7 +139,8 @@ class Encoder(nn.Module):
|
|||
self.post_bn = nn.BatchNorm1d(hidden_channels)
|
||||
self.post_conv2 = nn.Conv1d(hidden_channels, out_channels, 1)
|
||||
|
||||
def forward(self, x, x_mask, g=None):
|
||||
def forward(self, x, x_mask, g=None): # pylint: disable=unused-argument
|
||||
# TODO: implement multi-speaker
|
||||
if self.encoder_type == 'transformer':
|
||||
o = self.pre(x, x_mask)
|
||||
else:
|
||||
|
|
|
@ -8,6 +8,7 @@ from TTS.tts.layers.glow_tts.monotonic_align import generate_path
|
|||
|
||||
|
||||
class SpeedySpeech(nn.Module):
|
||||
# pylint: disable=dangerous-default-value
|
||||
def __init__(
|
||||
self,
|
||||
num_chars,
|
||||
|
@ -40,7 +41,8 @@ class SpeedySpeech(nn.Module):
|
|||
decoder_residual_conv_bn_params)
|
||||
self.duration_predictor = DurationPredictor(hidden_channels)
|
||||
|
||||
def expand_encoder_outputs(self, en, dr, x_mask, y_mask):
|
||||
@staticmethod
|
||||
def expand_encoder_outputs(en, dr, x_mask, y_mask):
|
||||
attn_mask = torch.unsqueeze(x_mask, -1) * torch.unsqueeze(y_mask, 2)
|
||||
attn = generate_path(dr, attn_mask.squeeze(1)).to(en.dtype)
|
||||
o_en_ex = torch.matmul(
|
||||
|
@ -54,10 +56,8 @@ class SpeedySpeech(nn.Module):
|
|||
o_dr = torch.round(o_dr)
|
||||
return o_dr
|
||||
|
||||
def forward(self, x, x_lengths, y_lengths, dr, g=None):
|
||||
"""
|
||||
docstring
|
||||
"""
|
||||
def forward(self, x, x_lengths, y_lengths, dr, g=None): # pylint: disable=unused-argument
|
||||
# TODO: multi-speaker
|
||||
# [B, T, C]
|
||||
x_emb = self.emb(x)
|
||||
# [B, C, T]
|
||||
|
@ -88,7 +88,8 @@ class SpeedySpeech(nn.Module):
|
|||
|
||||
return o_de, o_dr_log.squeeze(1), attn.transpose(1, 2)
|
||||
|
||||
def inference(self, x, x_lengths, g=None):
|
||||
def inference(self, x, x_lengths, g=None): # pylint: disable=unused-argument
|
||||
# TODO: multi-speaker
|
||||
# pad input to prevent dropping the last word
|
||||
x = torch.nn.functional.pad(x, pad=(0, 5), mode='constant', value=0)
|
||||
|
||||
|
|
|
@ -107,7 +107,6 @@ def setup_model(num_chars, num_speakers, c, speaker_embedding_dim=None):
|
|||
hidden_channels_ffn=768,
|
||||
hidden_channels_dp=256,
|
||||
out_channels=c.audio['num_mels'],
|
||||
kernel_size=3,
|
||||
num_heads=2,
|
||||
num_layers_enc=6,
|
||||
encoder_type=c.encoder_type,
|
||||
|
|
|
@ -86,7 +86,7 @@
|
|||
"warmup_steps": 4000, // Noam decay steps to increase the learning rate from 0 to "lr"
|
||||
"seq_len_norm": false, // Normalize each sample loss with its length to alleviate imbalanced datasets. Use it if your dataset is small or has skewed distribution of sequence lengths.
|
||||
|
||||
"encoder_type": "gatedconv",
|
||||
"encoder_type": "transformer",
|
||||
|
||||
// TENSORBOARD and LOGGING
|
||||
"print_step": 25, // Number of steps to log training on console.
|
||||
|
|
|
@ -0,0 +1,149 @@
|
|||
{
|
||||
"model": "speedy_speech",
|
||||
"run_name": "test_sample_dataset_run",
|
||||
"run_description": "sample dataset test run",
|
||||
|
||||
// AUDIO PARAMETERS
|
||||
"audio":{
|
||||
// stft parameters
|
||||
"fft_size": 1024, // number of stft frequency levels. Size of the linear spectrogram frame.
|
||||
"win_length": 1024, // stft window length in ms.
|
||||
"hop_length": 256, // stft window hop-length in ms.
|
||||
"frame_length_ms": null, // stft window length in ms. If null, 'win_length' is used.
|
||||
"frame_shift_ms": null, // stft window hop-length in ms. If null, 'hop_length' is used.
|
||||
|
||||
// Audio processing parameters
|
||||
"sample_rate": 22050, // DATASET-RELATED: wav sample-rate.
|
||||
"preemphasis": 0.0, // pre-emphasis to reduce spec noise and make it more structured. If 0.0, no -pre-emphasis.
|
||||
"ref_level_db": 20, // reference level db, theoretically 20db is the sound of air.
|
||||
|
||||
// Silence trimming
|
||||
"do_trim_silence": true,// enable trimming of silence of audio as you load it. LJspeech (true), TWEB (false), Nancy (true)
|
||||
"trim_db": 60, // threshold for trimming silence. Set this according to your dataset.
|
||||
|
||||
// Griffin-Lim
|
||||
"power": 1.5, // value to sharpen wav signals after GL algorithm.
|
||||
"griffin_lim_iters": 60,// #griffin-lim iterations. 30-60 is a good range. Larger the value, slower the generation.
|
||||
|
||||
// MelSpectrogram parameters
|
||||
"num_mels": 80, // size of the mel spec frame.
|
||||
"mel_fmin": 50.0, // minimum freq level for mel-spec. ~50 for male and ~95 for female voices. Tune for dataset!!
|
||||
"mel_fmax": 7600.0, // maximum freq level for mel-spec. Tune for dataset!!
|
||||
"spec_gain": 1,
|
||||
|
||||
// Normalization parameters
|
||||
"signal_norm": true, // normalize spec values. Mean-Var normalization if 'stats_path' is defined otherwise range normalization defined by the other params.
|
||||
"min_level_db": -100, // lower bound for normalization
|
||||
"symmetric_norm": true, // move normalization to range [-1, 1]
|
||||
"max_norm": 4.0, // scale normalization to range [-max_norm, max_norm] or [0, max_norm]
|
||||
"clip_norm": true, // clip normalized values into the range.
|
||||
"stats_path": "/home/erogol/Data/LJSpeech-1.1/scale_stats.npy" // DO NOT USE WITH MULTI_SPEAKER MODEL. scaler stats file computed by 'compute_statistics.py'. If it is defined, mean-std based normalization is used and other normalization params are ignored
|
||||
},
|
||||
|
||||
// VOCABULARY PARAMETERS
|
||||
// if custom character set is not defined,
|
||||
// default set in symbols.py is used
|
||||
// "characters":{
|
||||
// "pad": "_",
|
||||
// "eos": "&",
|
||||
// "bos": "*",
|
||||
// "characters": "ABCDEFGHIJKLMNOPQRSTUVWXYZÇÃÀÁÂÊÉÍÓÔÕÚÛabcdefghijklmnopqrstuvwxyzçãàáâêéíóôõúû!(),-.:;? ",
|
||||
// "punctuations":"!'(),-.:;? ",
|
||||
// "phonemes":"iyɨʉɯuɪʏʊeøɘəɵɤoɛœɜɞʌɔæɐaɶɑɒᵻʘɓǀɗǃʄǂɠǁʛpbtdʈɖcɟkɡqɢʔɴŋɲɳnɱmʙrʀⱱɾɽɸβfvθðszʃʒʂʐçʝxɣχʁħʕhɦɬɮʋɹɻjɰlɭʎʟˈˌːˑʍwɥʜʢʡɕʑɺɧɚ˞ɫ'̃' "
|
||||
// },
|
||||
|
||||
"add_blank": false, // if true add a new token after each token of the sentence. This increases the size of the input sequence, but has considerably improved the prosody of the GlowTTS model.
|
||||
|
||||
// DISTRIBUTED TRAINING
|
||||
"distributed":{
|
||||
"backend": "nccl",
|
||||
"url": "tcp:\/\/localhost:54321"
|
||||
},
|
||||
|
||||
"reinit_layers": [], // give a list of layer names to restore from the given checkpoint. If not defined, it reloads all heuristically matching layers.
|
||||
|
||||
// MODEL PARAMETERS
|
||||
"positional_encoding": true,
|
||||
"encoder_type": "residual_conv_bn",
|
||||
"encoder_params":{
|
||||
"kernel_size": 4,
|
||||
"dilations": [1, 2, 4, 1, 2, 4, 1, 2, 4, 1, 2, 4, 1],
|
||||
"num_conv_blocks": 2,
|
||||
"num_res_blocks": 13
|
||||
},
|
||||
"decoder_residual_conv_bn_params":{
|
||||
"kernel_size": 4,
|
||||
"dilations": [1, 2, 4, 8, 1, 2, 4, 8, 1, 2, 4, 8, 1, 2, 4, 8, 1],
|
||||
"num_conv_blocks": 2,
|
||||
"num_res_blocks": 17
|
||||
},
|
||||
|
||||
// TRAINING
|
||||
"batch_size":64, // Batch size for training. Lower values than 32 might cause hard to learn attention. It is overwritten by 'gradual_training'.
|
||||
"eval_batch_size":32,
|
||||
"r": 1, // Number of decoder frames to predict per iteration. Set the initial values if gradual training is enabled.
|
||||
"loss_masking": true, // enable / disable loss masking against the sequence padding.
|
||||
|
||||
// LOSS PARAMETERS
|
||||
"ssim_alpha": 1,
|
||||
"l1_alpha": 1,
|
||||
"huber_alpha": 1,
|
||||
|
||||
// VALIDATION
|
||||
"run_eval": true,
|
||||
"test_delay_epochs": -1, //Until attention is aligned, testing only wastes computation time.
|
||||
"test_sentences_file": null, // set a file to load sentences to be used for testing. If it is null then we use default english sentences.
|
||||
|
||||
// OPTIMIZER
|
||||
"noam_schedule": true, // use noam warmup and lr schedule.
|
||||
"grad_clip": 1.0, // upper limit for gradients for clipping.
|
||||
"epochs": 1, // total number of epochs to train.
|
||||
"lr": 0.002, // Initial learning rate. If Noam decay is active, maximum learning rate.
|
||||
"warmup_steps": 4000, // Noam decay steps to increase the learning rate from 0 to "lr"
|
||||
|
||||
// TENSORBOARD and LOGGING
|
||||
"print_step": 1, // Number of steps to log training on console.
|
||||
"tb_plot_step": 100, // Number of steps to plot TB training figures.
|
||||
"print_eval": false, // If True, it prints intermediate loss values in evaluation.
|
||||
"save_step": 5000, // Number of training steps expected to save training stats and checkpoints.
|
||||
"checkpoint": true, // If true, it saves checkpoints per "save_step"
|
||||
"tb_model_param_stats": false, // true, plots param stats per layer on tensorboard. Might be memory consuming, but good for debugging.
|
||||
"mixed_precision": false,
|
||||
|
||||
// DATA LOADING
|
||||
"text_cleaner": "english_cleaners",
|
||||
"enable_eos_bos_chars": false, // enable/disable beginning of sentence and end of sentence chars.
|
||||
"num_loader_workers": 0, // number of training data loader processes. Don't set it too big. 4-8 are good values.
|
||||
"num_val_loader_workers": 0, // number of evaluation data loader processes.
|
||||
"batch_group_size": 0, //Number of batches to shuffle after bucketing.
|
||||
"min_seq_len": 2, // DATASET-RELATED: minimum text length to use in training
|
||||
"max_seq_len": 300, // DATASET-RELATED: maximum text length
|
||||
"compute_f0": false, // compute f0 values in data-loader
|
||||
"compute_input_seq_cache": false, // if true, text sequences are computed before starting training. If phonemes are enabled, they are also computed at this stage.
|
||||
|
||||
// PATHS
|
||||
"output_path": "tests/train_outputs/",
|
||||
|
||||
// PHONEMES
|
||||
"phoneme_cache_path": "tests/train_outputs/phoneme_cache/", // phoneme computation is slow, therefore, it caches results in the given folder.
|
||||
"use_phonemes": true, // use phonemes instead of raw characters. It is suggested for better pronunciation.
|
||||
"phoneme_language": "en-us", // depending on your target language, pick one from https://github.com/bootphon/phonemizer#languages
|
||||
|
||||
// MULTI-SPEAKER and GST
|
||||
"use_speaker_embedding": false, // use speaker embedding to enable multi-speaker learning.
|
||||
"use_external_speaker_embedding_file": false, // if true, forces the model to use external embedding per sample instead of nn.embeddings, that is, it supports external embeddings such as those used at: https://arxiv.org/abs/1806.04558
|
||||
"external_speaker_embedding_file": "/home/erogol/Data/libritts/speakers.json", // if not null and use_external_speaker_embedding_file is true, it is used to load a specific embedding file and thus uses these embeddings instead of nn.embeddings, that is, it supports external embeddings such as those used at: https://arxiv.org/abs/1806.04558
|
||||
|
||||
|
||||
// DATASETS
|
||||
"datasets": // List of datasets. They all merged and they get different speaker_ids.
|
||||
[
|
||||
{
|
||||
"name": "ljspeech",
|
||||
"path": "tests/data/ljspeech/",
|
||||
"meta_file_train": "metadata.csv",
|
||||
"meta_file_val": "metadata.csv",
|
||||
"meta_file_attn_mask": "tests/data/ljspeech/metadata_attn_mask.txt"
|
||||
}
|
||||
]
|
||||
}
|
|
@ -44,10 +44,9 @@ class GlowTTSTrainTest(unittest.TestCase):
|
|||
# model to train
|
||||
model = GlowTts(num_chars=32,
|
||||
hidden_channels=128,
|
||||
filter_channels=32,
|
||||
filter_channels_dp=32,
|
||||
hidden_channels_ffn=32,
|
||||
hidden_channels_dp=32,
|
||||
out_channels=80,
|
||||
kernel_size=3,
|
||||
num_heads=2,
|
||||
num_layers_enc=6,
|
||||
dropout_p=0.1,
|
||||
|
@ -72,10 +71,9 @@ class GlowTTSTrainTest(unittest.TestCase):
|
|||
# reference model to compare model weights
|
||||
model_ref = GlowTts(num_chars=32,
|
||||
hidden_channels=128,
|
||||
filter_channels=32,
|
||||
filter_channels_dp=32,
|
||||
hidden_channels_ffn=32,
|
||||
hidden_channels_dp=32,
|
||||
out_channels=80,
|
||||
kernel_size=3,
|
||||
num_heads=2,
|
||||
num_layers_enc=6,
|
||||
dropout_p=0.1,
|
||||
|
|
|
@ -0,0 +1,98 @@
|
|||
import torch
|
||||
|
||||
from TTS.tts.layers.speedy_speech.encoder import Encoder
|
||||
from TTS.tts.layers.speedy_speech.decoder import Decoder
|
||||
from TTS.tts.layers.speedy_speech.duration_predictor import DurationPredictor
|
||||
from TTS.tts.utils.generic_utils import sequence_mask
|
||||
from TTS.tts.models.speedy_speech import SpeedySpeech
|
||||
|
||||
|
||||
use_cuda = torch.cuda.is_available()
|
||||
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
|
||||
|
||||
|
||||
def test_encoder():
    """Both encoder variants must turn an [8, 14, 37] input into [8, 11, 37]."""
    batch_size, in_channels, out_channels, max_len = 8, 14, 11, 37
    expected_shape = [batch_size, out_channels, max_len]

    input_dummy = torch.rand(batch_size, in_channels, max_len).to(device)
    input_lengths = torch.randint(31, 37, (batch_size, )).long().to(device)
    # make sure one sequence spans the full length
    input_lengths[-1] = max_len
    mask_2d = sequence_mask(input_lengths, input_dummy.size(2))
    input_mask = mask_2d.unsqueeze(1).to(device)

    # residual bn conv encoder
    conv_encoder = Encoder(out_channels=out_channels,
                           hidden_channels=in_channels,
                           encoder_type='residual_conv_bn').to(device)
    output = conv_encoder(input_dummy, input_mask)
    assert list(output.shape) == expected_shape

    # transformer encoder
    transformer_params = {
        'hidden_channels_ffn': 768,
        'num_heads': 2,
        "kernel_size": 3,
        "dropout_p": 0.1,
        "num_layers": 6,
        "rel_attn_window_size": 4,
        "input_length": None
    }
    transformer_encoder = Encoder(out_channels=out_channels,
                                  hidden_channels=in_channels,
                                  encoder_type='transformer',
                                  encoder_params=transformer_params).to(device)
    output = transformer_encoder(input_dummy, input_mask)
    assert list(output.shape) == expected_shape
|
||||
|
||||
|
||||
def test_decoder():
    """The decoder must turn an [8, 128, 37] input into an [8, 11, 37] output."""
    batch_size, hidden_channels, out_channels, max_len = 8, 128, 11, 37

    input_dummy = torch.rand(batch_size, hidden_channels, max_len).to(device)
    input_lengths = torch.randint(31, 37, (batch_size, )).long().to(device)
    # make sure one sequence spans the full length
    input_lengths[-1] = max_len

    mask_2d = sequence_mask(input_lengths, input_dummy.size(2))
    input_mask = mask_2d.unsqueeze(1).to(device)

    decoder = Decoder(out_channels=out_channels,
                      hidden_channels=hidden_channels).to(device)
    output = decoder(input_dummy, input_mask)
    assert list(output.shape) == [batch_size, out_channels, max_len]
|
||||
|
||||
|
||||
def test_duration_predictor():
    """The duration predictor must return one value per step: [8, 1, 27]."""
    batch_size, hidden_channels, max_len = 8, 128, 27

    input_dummy = torch.rand(batch_size, hidden_channels, max_len).to(device)
    input_lengths = torch.randint(20, 27, (batch_size, )).long().to(device)
    # make sure one sequence spans the full length
    input_lengths[-1] = max_len

    mask_2d = sequence_mask(input_lengths, input_dummy.size(2))
    x_mask = mask_2d.unsqueeze(1).to(device)

    predictor = DurationPredictor(hidden_channels=hidden_channels).to(device)

    output = predictor(input_dummy, x_mask)
    assert list(output.shape) == [batch_size, 1, max_len]
|
||||
|
||||
|
||||
def test_speedy_speech():
    """Run a SpeedySpeech forward pass and check the shapes of its three outputs."""
    num_chars = 7
    B = 8
    T_en = 37
    T_de = 74

    x_dummy = torch.randint(0, num_chars, (B, T_en)).long().to(device)
    x_lengths = torch.randint(31, T_en, (B, )).long().to(device)
    x_lengths[-1] = T_en

    # set durations. max total duration should be equal to T_de
    raw_durations = torch.randint(1, 4, (B, T_en))
    scale = (T_de / raw_durations.sum(1)).unsqueeze(1)
    durations = (raw_durations * scale).to(torch.long).to(device)
    max_dur = durations.sum(1).max()
    # casting to long truncates, so top up the first step by the amount the
    # longest row falls short of T_de
    shortfall = T_de - max_dur if T_de > max_dur else 0
    durations[:, 0] += shortfall

    y_lengths = durations.sum(1)

    model = SpeedySpeech(num_chars, out_channels=80, hidden_channels=128)
    model = model.to(device)

    # forward pass
    o_de, o_dr, attn = model(x_dummy, x_lengths, y_lengths, durations)

    assert list(o_de.shape) == [B, 80, T_de], f"{list(o_de.shape)}"
    assert list(attn.shape) == [B, T_de, T_en]
    assert list(o_dr.shape) == [B, T_en]
|
Loading…
Reference in New Issue