pylint and test fixes

pull/10/head
erogol 2020-12-28 17:52:55 +01:00
parent 2abe3df153
commit fede46e96e
14 changed files with 282 additions and 37 deletions

View File

@ -13,7 +13,6 @@ Sample run on LJSpeech dataset.
import argparse
import glob
import importlib
import os
@ -22,7 +21,7 @@ import torch
from torch.utils.data import DataLoader
from tqdm import tqdm
from TTS.tts.datasets.TTSDataset import MyDataset
from TTS.tts.utils.generic_utils import sequence_mask, setup_model
from TTS.tts.utils.generic_utils import setup_model
from TTS.tts.utils.io import load_checkpoint
from TTS.tts.utils.text.symbols import make_symbols, phonemes, symbols
from TTS.utils.audio import AudioProcessor
@ -101,7 +100,7 @@ if __name__ == '__main__':
ap=ap,
meta_data=meta_data,
tp=C.characters if 'characters' in C.keys() else None,
add_blank=c['add_blank'] if 'add_blank' in C.keys() else False,
add_blank=C['add_blank'] if 'add_blank' in C.keys() else False,
use_phonemes=C.use_phonemes,
phoneme_cache_path=C.phoneme_cache_path,
phoneme_language=C.phoneme_language,

View File

@ -155,7 +155,7 @@ def train(data_loader, model, criterion, optimizer, scheduler,
# format data
text_input, text_lengths, mel_targets, mel_lengths, speaker_c,\
avg_text_length, avg_spec_length, attn_mask, dur_target, item_idx = format_data(data)
avg_text_length, avg_spec_length, _, dur_target, _ = format_data(data)
loader_time = time.time() - end_time
@ -302,7 +302,7 @@ def evaluate(data_loader, model, criterion, ap, global_step, epoch):
# format data
text_input, text_lengths, mel_targets, mel_lengths, speaker_c,\
avg_text_length, avg_spec_length, attn_mask, dur_target, item_idx = format_data(data)
_, _, _, dur_target, _ = format_data(data)
# forward pass model
with torch.cuda.amp.autocast(enabled=c.mixed_precision):

View File

@ -379,7 +379,7 @@ class MonotonicDynamicConvolutionAttention(nn.Module):
def __init__(
self,
query_dim,
embedding_dim,
embedding_dim, # pylint: disable=unused-argument
attention_dim,
static_filter_dim,
static_kernel_size,
@ -447,7 +447,7 @@ class MonotonicDynamicConvolutionAttention(nn.Module):
context = torch.bmm(attention_weights.unsqueeze(1), inputs).squeeze(1)
return context
def preprocess_inputs(self, inputs):
def preprocess_inputs(self, inputs): # pylint: disable=no-self-use
return None
def init_states(self, inputs):

View File

@ -1,15 +1,17 @@
import torch
from torch import nn
from .normalization import TemporalBatchNorm1d
class ZeroTemporalPad(nn.ZeroPad2d):
class ZeroTemporalPad(nn.Module):
"""Pad sequences to equal lentgh in the temporal dimension"""
def __init__(self, kernel_size, dilation):
super().__init__()
total_pad = (dilation * (kernel_size - 1))
begin = total_pad // 2
end = total_pad - begin
super(ZeroTemporalPad, self).__init__((0, 0, begin, end))
self.pad_layer = nn.ZeroPad2d((0, 0, begin, end))
def forward(self, x):
return self.pad_layer(x)
class ConvBN(nn.Module):

View File

@ -116,6 +116,8 @@ class Encoder(nn.Module):
hidden_channels,
kernel_size=5,
num_layers=3 + num_layers)
else:
raise ValueError(" [!] Unkown encoder type.")
# final projection layers
self.proj_m = nn.Conv1d(hidden_channels, out_channels, 1)

View File

@ -241,6 +241,7 @@ class GuidedAttentionLoss(torch.nn.Module):
class Huber(nn.Module):
# pylint: disable=R0201
def forward(self, x, y, length=None):
"""
Shapes:

View File

@ -1,11 +1,5 @@
import torch
from torch import nn
from torch.nn import functional as F
from TTS.tts.layers.glow_tts.transformer import Transformer
from TTS.tts.layers.glow_tts.glow import ConvLayerNorm
from TTS.tts.utils.generic_utils import sequence_mask
from TTS.tts.layers.generic.res_conv_bn import ResidualConvBNBlock, ConvBNBlock
from TTS.tts.layers.generic.res_conv_bn import ConvBNBlock, ResidualConvBNBlock
class Decoder(nn.Module):
@ -35,7 +29,8 @@ class Decoder(nn.Module):
nn.Conv1d(hidden_channels, out_channels, 1),
)
def forward(self, x, x_mask, g=None):
def forward(self, x, x_mask, g=None): # pylint: disable=unused-argument
# TODO: implement multi-speaker
o = self.decoder(x, x_mask)
o = self.post_conv(o) + x
return self.post_net(o)
return self.post_net(o)

View File

@ -17,6 +17,7 @@ class PositionalEncoding(nn.Module):
"""
def __init__(self, dim, dropout=0.0, max_len=5000):
super().__init__()
if dim % 2 != 0:
raise ValueError("Cannot use sin/cos positional encoding with "
"odd dim (got dim={:d})".format(dim))
@ -27,7 +28,6 @@ class PositionalEncoding(nn.Module):
pe[:, 0::2] = torch.sin(position.float() * div_term)
pe[:, 1::2] = torch.cos(position.float() * div_term)
pe = pe.unsqueeze(0).transpose(1, 2)
super(PositionalEncoding, self).__init__()
self.register_buffer('pe', pe)
if dropout > 0:
self.dropout = nn.Dropout(p=dropout)
@ -125,7 +125,7 @@ class Encoder(nn.Module):
num_layers=3,
dropout_p=0.5)
# text encoder
self.encoder = Transformer(hidden_channels, **encoder_params)
self.encoder = Transformer(hidden_channels, **encoder_params) # pylint: disable=unexpected-keyword-arg
elif encoder_type.lower() == 'residual_conv_bn':
self.pre = nn.Sequential(
nn.Conv1d(hidden_channels, hidden_channels, 1), nn.ReLU())
@ -139,7 +139,8 @@ class Encoder(nn.Module):
self.post_bn = nn.BatchNorm1d(hidden_channels)
self.post_conv2 = nn.Conv1d(hidden_channels, out_channels, 1)
def forward(self, x, x_mask, g=None):
def forward(self, x, x_mask, g=None): # pylint: disable=unused-argument
# TODO: implement multi-speaker
if self.encoder_type == 'transformer':
o = self.pre(x, x_mask)
else:

View File

@ -8,6 +8,7 @@ from TTS.tts.layers.glow_tts.monotonic_align import generate_path
class SpeedySpeech(nn.Module):
# pylint: disable=dangerous-default-value
def __init__(
self,
num_chars,
@ -40,7 +41,8 @@ class SpeedySpeech(nn.Module):
decoder_residual_conv_bn_params)
self.duration_predictor = DurationPredictor(hidden_channels)
def expand_encoder_outputs(self, en, dr, x_mask, y_mask):
@staticmethod
def expand_encoder_outputs(en, dr, x_mask, y_mask):
attn_mask = torch.unsqueeze(x_mask, -1) * torch.unsqueeze(y_mask, 2)
attn = generate_path(dr, attn_mask.squeeze(1)).to(en.dtype)
o_en_ex = torch.matmul(
@ -54,10 +56,8 @@ class SpeedySpeech(nn.Module):
o_dr = torch.round(o_dr)
return o_dr
def forward(self, x, x_lengths, y_lengths, dr, g=None):
"""
docstring
"""
def forward(self, x, x_lengths, y_lengths, dr, g=None): # pylint: disable=unused-argument
# TODO: multi-speaker
# [B, T, C]
x_emb = self.emb(x)
# [B, C, T]
@ -88,7 +88,8 @@ class SpeedySpeech(nn.Module):
return o_de, o_dr_log.squeeze(1), attn.transpose(1, 2)
def inference(self, x, x_lengths, g=None):
def inference(self, x, x_lengths, g=None): # pylint: disable=unused-argument
# TODO: multi-speaker
# pad input to prevent dropping the last word
x = torch.nn.functional.pad(x, pad=(0, 5), mode='constant', value=0)

View File

@ -107,7 +107,6 @@ def setup_model(num_chars, num_speakers, c, speaker_embedding_dim=None):
hidden_channels_ffn=768,
hidden_channels_dp=256,
out_channels=c.audio['num_mels'],
kernel_size=3,
num_heads=2,
num_layers_enc=6,
encoder_type=c.encoder_type,

View File

@ -86,7 +86,7 @@
"warmup_steps": 4000, // Noam decay steps to increase the learning rate from 0 to "lr"
"seq_len_norm": false, // Normalize eash sample loss with its length to alleviate imbalanced datasets. Use it if your dataset is small or has skewed distribution of sequence lengths.
"encoder_type": "gatedconv",
"encoder_type": "transformer",
// TENSORBOARD and LOGGING
"print_step": 25, // Number of steps to log training on console.

View File

@ -0,0 +1,149 @@
{
"model": "speedy_speech",
"run_name": "test_sample_dataset_run",
"run_description": "sample dataset test run",
// AUDIO PARAMETERS
"audio":{
// stft parameters
"fft_size": 1024, // number of stft frequency levels. Size of the linear spectrogram frame.
"win_length": 1024, // stft window length in samples.
"hop_length": 256, // stft window hop-length in samples.
"frame_length_ms": null, // stft window length in ms. If null, 'win_length' is used.
"frame_shift_ms": null, // stft window hop-length in ms. If null, 'hop_length' is used.
// Audio processing parameters
"sample_rate": 22050, // DATASET-RELATED: wav sample-rate.
"preemphasis": 0.0, // pre-emphasis to reduce spec noise and make it more structured. If 0.0, no pre-emphasis.
"ref_level_db": 20, // reference level db, theoretically 20db is the sound of air.
// Silence trimming
"do_trim_silence": true,// enable trimming of silence from audio as you load it. LJspeech (true), TWEB (false), Nancy (true)
"trim_db": 60, // threshold for trimming silence. Set this according to your dataset.
// Griffin-Lim
"power": 1.5, // value to sharpen wav signals after GL algorithm.
"griffin_lim_iters": 60,// #griffin-lim iterations. 30-60 is a good range. Larger the value, slower the generation.
// MelSpectrogram parameters
"num_mels": 80, // size of the mel spec frame.
"mel_fmin": 50.0, // minimum freq level for mel-spec. ~50 for male and ~95 for female voices. Tune for dataset!!
"mel_fmax": 7600.0, // maximum freq level for mel-spec. Tune for dataset!!
"spec_gain": 1,
// Normalization parameters
"signal_norm": true, // normalize spec values. Mean-Var normalization if 'stats_path' is defined otherwise range normalization defined by the other params.
"min_level_db": -100, // lower bound for normalization
"symmetric_norm": true, // move normalization to range [-1, 1]
"max_norm": 4.0, // scale normalization to range [-max_norm, max_norm] or [0, max_norm]
"clip_norm": true, // clip normalized values into the range.
"stats_path": "/home/erogol/Data/LJSpeech-1.1/scale_stats.npy" // DO NOT USE WITH MULTI_SPEAKER MODEL. scaler stats file computed by 'compute_statistics.py'. If it is defined, mean-std based normalization is used and other normalization params are ignored
},
// VOCABULARY PARAMETERS
// if custom character set is not defined,
// default set in symbols.py is used
// "characters":{
// "pad": "_",
// "eos": "&",
// "bos": "*",
// "characters": "ABCDEFGHIJKLMNOPQRSTUVWXYZÇÃÀÁÂÊÉÍÓÔÕÚÛabcdefghijklmnopqrstuvwxyzçãàáâêéíóôõúû!(),-.:;? ",
// "punctuations":"!'(),-.:;? ",
// "phonemes":"iyɨʉɯuɪʏʊeøɘəɵɤoɛœɜɞʌɔæɐaɶɑɒᵻʘɓǀɗǃʄǂɠǁʛpbtdʈɖcɟkɡʔɴŋɲɳnɱmʙrʀⱱɾɽɸβfvθðszʃʒʂʐçʝxɣχʁħʕhɦɬɮʋɹɻjɰlɭʎʟˈˌːˑʍwɥʜʢʡɕʑɺɧɚ˞ɫ'̃' "
// },
"add_blank": false, // if true add a new token after each token of the sentence. This increases the size of the input sequence, but has considerably improved the prosody of the GlowTTS model.
// DISTRIBUTED TRAINING
"distributed":{
"backend": "nccl",
"url": "tcp:\/\/localhost:54321"
},
"reinit_layers": [], // give a list of layer names to restore from the given checkpoint. If not defined, it reloads all heuristically matching layers.
// MODEL PARAMETERS
"positional_encoding": true,
"encoder_type": "residual_conv_bn",
"encoder_params":{
"kernel_size": 4,
"dilations": [1, 2, 4, 1, 2, 4, 1, 2, 4, 1, 2, 4, 1],
"num_conv_blocks": 2,
"num_res_blocks": 13
},
"decoder_residual_conv_bn_params":{
"kernel_size": 4,
"dilations": [1, 2, 4, 8, 1, 2, 4, 8, 1, 2, 4, 8, 1, 2, 4, 8, 1],
"num_conv_blocks": 2,
"num_res_blocks": 17
},
// TRAINING
"batch_size":64, // Batch size for training. Values lower than 32 might make attention hard to learn. It is overwritten by 'gradual_training'.
"eval_batch_size":32,
"r": 1, // Number of decoder frames to predict per iteration. Set the initial values if gradual training is enabled.
"loss_masking": true, // enable / disable loss masking against the sequence padding.
// LOSS PARAMETERS
"ssim_alpha": 1,
"l1_alpha": 1,
"huber_alpha": 1,
// VALIDATION
"run_eval": true,
"test_delay_epochs": -1, //Until attention is aligned, testing only wastes computation time.
"test_sentences_file": null, // set a file to load sentences to be used for testing. If it is null then we use default english sentences.
// OPTIMIZER
"noam_schedule": true, // use noam warmup and lr schedule.
"grad_clip": 1.0, // upper limit for gradients for clipping.
"epochs": 1, // total number of epochs to train.
"lr": 0.002, // Initial learning rate. If Noam decay is active, maximum learning rate.
"warmup_steps": 4000, // Noam decay steps to increase the learning rate from 0 to "lr"
// TENSORBOARD and LOGGING
"print_step": 1, // Number of steps to log training on console.
"tb_plot_step": 100, // Number of steps to plot TB training figures.
"print_eval": false, // If True, it prints intermediate loss values in evaluation.
"save_step": 5000, // Number of training steps expected to save training stats and checkpoints.
"checkpoint": true, // If true, it saves checkpoints per "save_step"
"tb_model_param_stats": false, // If true, plots param stats per layer on tensorboard. Might be memory consuming, but good for debugging.
"mixed_precision": false,
// DATA LOADING
"text_cleaner": "english_cleaners",
"enable_eos_bos_chars": false, // enable/disable beginning of sentence and end of sentence chars.
"num_loader_workers": 0, // number of training data loader processes. Don't set it too big. 4-8 are good values.
"num_val_loader_workers": 0, // number of evaluation data loader processes.
"batch_group_size": 0, //Number of batches to shuffle after bucketing.
"min_seq_len": 2, // DATASET-RELATED: minimum text length to use in training
"max_seq_len": 300, // DATASET-RELATED: maximum text length
"compute_f0": false, // compute f0 values in data-loader
"compute_input_seq_cache": false, // if true, text sequences are computed before starting training. If phonemes are enabled, they are also computed at this stage.
// PATHS
"output_path": "tests/train_outputs/",
// PHONEMES
"phoneme_cache_path": "tests/train_outputs/phoneme_cache/", // phoneme computation is slow, therefore, it caches results in the given folder.
"use_phonemes": true, // use phonemes instead of raw characters. It is suggested for better pronunciation.
"phoneme_language": "en-us", // depending on your target language, pick one from https://github.com/bootphon/phonemizer#languages
// MULTI-SPEAKER and GST
"use_speaker_embedding": false, // use speaker embedding to enable multi-speaker learning.
"use_external_speaker_embedding_file": false, // if true, forces the model to use external embedding per sample instead of nn.embeddings, that is, it supports external embeddings such as those used at: https://arxiv.org/abs/1806.04558
"external_speaker_embedding_file": "/home/erogol/Data/libritts/speakers.json", // if not null and use_external_speaker_embedding_file is true, it is used to load a specific embedding file and thus uses these embeddings instead of nn.embeddings, that is, it supports external embeddings such as those used at: https://arxiv.org/abs/1806.04558
// DATASETS
"datasets": // List of datasets. They are all merged and they get different speaker_ids.
[
{
"name": "ljspeech",
"path": "tests/data/ljspeech/",
"meta_file_train": "metadata.csv",
"meta_file_val": "metadata.csv",
"meta_file_attn_mask": "tests/data/ljspeech/metadata_attn_mask.txt"
}
]
}

View File

@ -44,10 +44,9 @@ class GlowTTSTrainTest(unittest.TestCase):
# model to train
model = GlowTts(num_chars=32,
hidden_channels=128,
filter_channels=32,
filter_channels_dp=32,
hidden_channels_ffn=32,
hidden_channels_dp=32,
out_channels=80,
kernel_size=3,
num_heads=2,
num_layers_enc=6,
dropout_p=0.1,
@ -72,10 +71,9 @@ class GlowTTSTrainTest(unittest.TestCase):
# reference model to compare model weights
model_ref = GlowTts(num_chars=32,
hidden_channels=128,
filter_channels=32,
filter_channels_dp=32,
hidden_channels_ffn=32,
hidden_channels_dp=32,
out_channels=80,
kernel_size=3,
num_heads=2,
num_layers_enc=6,
dropout_p=0.1,

View File

@ -0,0 +1,98 @@
import torch
from TTS.tts.layers.speedy_speech.encoder import Encoder
from TTS.tts.layers.speedy_speech.decoder import Decoder
from TTS.tts.layers.speedy_speech.duration_predictor import DurationPredictor
from TTS.tts.utils.generic_utils import sequence_mask
from TTS.tts.models.speedy_speech import SpeedySpeech
# Pick the execution target once for every test in this module:
# the first CUDA device when CUDA is available, otherwise the CPU.
use_cuda = torch.cuda.is_available()
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
def test_encoder():
    """Build both encoder variants and verify the shape of their output.

    A random tensor of shape (8, 14, 37) and a mask derived from random
    lengths via ``sequence_mask`` are passed first through a
    ``residual_conv_bn`` encoder and then through a ``transformer`` encoder.
    Each variant must return a tensor of shape [8, 11, 37].
    """
    dummy_in = torch.rand(8, 14, 37).to(device)
    lengths = torch.randint(31, 37, (8, )).long().to(device)
    # force the last item to cover the whole last axis of the input
    lengths[-1] = 37
    mask = sequence_mask(lengths, dummy_in.size(2))
    mask = torch.unsqueeze(mask, 1).to(device)

    transformer_params = {
        'hidden_channels_ffn': 768,
        'num_heads': 2,
        "kernel_size": 3,
        "dropout_p": 0.1,
        "num_layers": 6,
        "rel_attn_window_size": 4,
        "input_length": None
    }

    # residual bn conv encoder first, transformer encoder second
    variants = (
        {'encoder_type': 'residual_conv_bn'},
        {'encoder_type': 'transformer', 'encoder_params': transformer_params},
    )
    for variant_kwargs in variants:
        encoder = Encoder(out_channels=11,
                          hidden_channels=14,
                          **variant_kwargs).to(device)
        result = encoder(dummy_in, mask)
        assert list(result.shape) == [8, 11, 37]
def test_decoder():
    """Run the decoder on a random (8, 128, 37) input and check the output shape.

    The decoder is created with ``out_channels=11`` and ``hidden_channels=128``
    and must return a tensor of shape [8, 11, 37].
    """
    dummy_in = torch.rand(8, 128, 37).to(device)
    lengths = torch.randint(31, 37, (8, )).long().to(device)
    # force the last item to cover the whole last axis of the input
    lengths[-1] = 37
    mask = sequence_mask(lengths, dummy_in.size(2))
    mask = torch.unsqueeze(mask, 1).to(device)

    decoder = Decoder(out_channels=11, hidden_channels=128).to(device)
    result = decoder(dummy_in, mask)

    expected_shape = [8, 11, 37]
    assert list(result.shape) == expected_shape
def test_duration_predictor():
    """Run the duration predictor on a random (8, 128, 27) input.

    The predictor is created with ``hidden_channels=128`` and must return a
    tensor of shape [8, 1, 27].
    """
    dummy_in = torch.rand(8, 128, 27).to(device)
    lengths = torch.randint(20, 27, (8, )).long().to(device)
    # force the last item to cover the whole last axis of the input
    lengths[-1] = 27
    mask = sequence_mask(lengths, dummy_in.size(2))
    mask = torch.unsqueeze(mask, 1).to(device)

    predictor = DurationPredictor(hidden_channels=128).to(device)
    result = predictor(dummy_in, mask)

    expected_shape = [8, 1, 27]
    assert list(result.shape) == expected_shape
def test_speedy_speech():
    """Run a SpeedySpeech forward pass on random inputs and check output shapes.

    Random token ids, input lengths and per-token durations are generated,
    the durations are rescaled so the largest per-item total equals the
    decoder length, and the three tensors returned by the model are checked
    against the expected shapes.
    """
    vocab_size = 7
    batch_size = 8
    enc_steps = 37
    dec_steps = 74

    token_ids = torch.randint(0, vocab_size, (batch_size, enc_steps)).long().to(device)
    token_lengths = torch.randint(31, enc_steps, (batch_size, )).long().to(device)
    token_lengths[-1] = enc_steps

    # set durations. max total duration should be equal to dec_steps
    durations = torch.randint(1, 4, (batch_size, enc_steps))
    scale = (dec_steps / durations.sum(1)).unsqueeze(1)
    durations = (durations * scale).to(torch.long).to(device)
    longest_total = durations.sum(1).max()
    if dec_steps > longest_total:
        # truncation to long can leave every total short of dec_steps;
        # top up the first position of each item by the remaining gap
        durations[:, 0] += dec_steps - longest_total
    out_lengths = durations.sum(1)

    model = SpeedySpeech(vocab_size, out_channels=80, hidden_channels=128)
    if use_cuda:
        model.cuda()

    # forward pass
    o_de, o_dr, attn = model(token_ids, token_lengths, out_lengths, durations)

    assert list(o_de.shape) == [batch_size, 80, dec_steps], f"{list(o_de.shape)}"
    assert list(attn.shape) == [batch_size, dec_steps, enc_steps]
    assert list(o_dr.shape) == [batch_size, enc_steps]