Merge pull request #279 from mozilla/dev

merging dev branch
Eren Gölge 2019-10-24 14:40:55 +02:00 committed by GitHub
commit 50088cbf3b
17 changed files with 866 additions and 764 deletions

View File

@@ -4,13 +4,13 @@ yes | apt-get install ffmpeg
yes | apt-get install espeak
yes | apt-get install tmux
yes | apt-get install zsh
-# pip3 install https://download.pytorch.org/whl/cu100/torch-1.1.0-cp37-cp37m-linux_x86_64.whl
+pip3 install https://download.pytorch.org/whl/cu100/torch-1.3.0%2Bcu100-cp36-cp36m-linux_x86_64.whl
# wget https://www.dropbox.com/s/m8waow6b3ydpf6h/MozillaDataset.tar.gz?dl=0 -O /data/rw/home/mozilla.tar
wget https://www.dropbox.com/s/wqn5v3wkktw9lmo/install.sh?dl=0 -O install.sh
sudo sh install.sh
pip install pytorch==1.3.0+cu100
python3 setup.py develop
# cp -R ${USER_DIR}/GermanData ../tmp/
# python3 distribute.py --config_path config.json --data_path /data/ro/shared/data/keithito/LJSpeech-1.1/
# cp -R ${USER_DIR}/Mozilla_22050 ../tmp/
# python3 distribute.py --config_path config_tacotron_gst.json --data_path ../tmp/Mozilla_22050/
# python3 distribute.py --config_path config.json --data_path /data/rw/home/LibriTTS/train-clean-360
# python3 distribute.py --config_path config.json
while true; do sleep 1000000; done

View File

@@ -1,6 +1,6 @@
{
    "run_name": "ljspeech",
-   "run_description": "gradual training with prenet frame size 1 + no maxout for cbhg + symmetric norm.",
+   "run_description": "Tacotron ljspeech release training",
    "audio":{
        // Audio processing parameters
@@ -55,20 +55,16 @@
    "batch_size": 32,       // Batch size for training. Lower values than 32 might cause hard to learn attention. It is overwritten by 'gradual_training'.
    "eval_batch_size":16,
    "r": 7,                 // Number of decoder frames to predict per iteration. Set the initial values if gradual training is enabled.
-   "gradual_training": [[0, 7, 32], [10000, 5, 32], [50000, 3, 32], [130000, 2, 16], [290000, 1, 8]], // ONLY TACOTRON - set gradual training steps [first_step, r, batch_size]. If it is null, gradual training is disabled.
+   "gradual_training": [[0, 7, 32], [1, 5, 32], [50000, 3, 32], [130000, 2, 16], [290000, 1, 8]], // ONLY TACOTRON - set gradual training steps [first_step, r, batch_size]. If it is null, gradual training is disabled.
    "wd": 0.000001,         // Weight decay weight.
    "checkpoint": true,     // If true, it saves checkpoints per "save_step"
-   "save_step": 10000,     // Number of training steps expected to save traning stats and checkpoints.
+   "save_step": 10000,     // Number of training steps expected to save traninpg stats and checkpoints.
    "print_step": 25,       // Number of steps to log traning on console.
    "batch_group_size": 0,  //Number of batches to shuffle after bucketing.
    "run_eval": true,
    "test_delay_epochs": 5, //Until attention is aligned, testing only wastes computation time.
    "test_sentences_file": null, // set a file to load sentences to be used for testing. If it is null then we use default english sentences.
-   "data_path": "/home/erogol/Data/LJSpeech-1.1/", // DATASET-RELATED: can overwritten from command argument
-   "meta_file_train": "metadata_train.csv", // DATASET-RELATED: metafile for training dataloader.
-   "meta_file_val": "metadata_val.csv", // DATASET-RELATED: metafile for evaluation dataloader.
-   "dataset": "ljspeech", // DATASET-RELATED: one of TTS.dataset.preprocessors depending on your target dataset. Use "tts_cache" for pre-computed dataset by extract_features.py
    "min_seq_len": 6,       // DATASET-RELATED: minimum text length to use in training
    "max_seq_len": 150,     // DATASET-RELATED: maximum text length
    "output_path": "../keep/", // DATASET-RELATED: output path for all training outputs.
@@ -79,6 +75,18 @@
    "phoneme_language": "en-us", // depending on your target language, pick one from https://github.com/bootphon/phonemizer#languages
    "text_cleaner": "phoneme_cleaners",
    "use_speaker_embedding": false, // use speaker embedding to enable multi-speaker learning.
-   "style_wav_for_test": null // path to style wav file to be used in TacotronGST inference.
+   "style_wav_for_test": null, // path to style wav file to be used in TacotronGST inference.
+   "use_gst": false, // TACOTRON ONLY: use global style tokens
+   "datasets": // List of datasets. They all merged and they get different speaker_ids.
+       [
+           {
+               "name": "ljspeech",
+               "path": "/data/ro/shared/data/keithito/LJSpeech-1.1/",
+               "meta_file_train": "metadata_train.csv",
+               "meta_file_val": "metadata_val.csv"
+           }
+       ]
}
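Note: the config above replaces the flat data_path/meta_file_* fields with a "datasets" list and moves the second gradual-training milestone from step 10000 to step 1. As a rough illustration of how a [first_step, r, batch_size] schedule of this shape is typically resolved against the global step (the helper name below is hypothetical, not part of this PR):

    # Hypothetical helper: pick the last schedule entry whose first_step has been reached.
    def resolve_gradual_training(schedule, global_step):
        r, batch_size = schedule[0][1], schedule[0][2]
        for first_step, new_r, new_batch_size in schedule:
            if global_step >= first_step:
                r, batch_size = new_r, new_batch_size
        return r, batch_size

    schedule = [[0, 7, 32], [1, 5, 32], [50000, 3, 32], [130000, 2, 16], [290000, 1, 8]]
    print(resolve_gradual_training(schedule, 60000))  # -> (3, 32)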

View File

@@ -2,6 +2,27 @@ import os
from glob import glob
import re
import sys
+from TTS.utils.generic_utils import split_dataset
+
+
+def load_meta_data(datasets):
+    meta_data_train_all = []
+    meta_data_eval_all = []
+    for dataset in datasets:
+        name = dataset['name']
+        root_path = dataset['path']
+        meta_file_train = dataset['meta_file_train']
+        meta_file_val = dataset['meta_file_val']
+        preprocessor = get_preprocessor_by_name(name)
+        meta_data_train = preprocessor(root_path, meta_file_train)
+        if meta_file_val is None:
+            meta_data_eval, meta_data_train = split_dataset(meta_data_train)
+        else:
+            meta_data_eval = preprocessor(root_path, meta_file_val)
+        meta_data_train_all += meta_data_train
+        meta_data_eval_all += meta_data_eval
+    return meta_data_train_all, meta_data_eval_all
+

def get_preprocessor_by_name(name):
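Note: a minimal sketch of how the new load_meta_data() is meant to be driven by the "datasets" list from config.json; the module path and dataset path below are assumptions/placeholders, and setting meta_file_val to None exercises the split_dataset() fallback shown above:

    from TTS.datasets.preprocess import load_meta_data  # module path assumed

    datasets = [{
        "name": "ljspeech",
        "path": "/data/ro/shared/data/keithito/LJSpeech-1.1/",  # placeholder path
        "meta_file_train": "metadata_train.csv",
        "meta_file_val": None,  # None -> split_dataset() carves an eval set out of the training metadata
    }]
    meta_data_train, meta_data_eval = load_meta_data(datasets)
    print(len(meta_data_train), len(meta_data_eval))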

View File

@@ -1,3 +1,5 @@
+import numpy as np
+import torch
from torch import nn
from torch.nn import functional
from TTS.utils.generic_utils import sequence_mask
@@ -53,3 +55,18 @@ class MSELossMasked(nn.Module):
            x * mask, target * mask, reduction="sum")
        loss = loss / mask.sum()
        return loss
+
+
+class AttentionEntropyLoss(nn.Module):
+    # pylint: disable=R0201
+    def forward(self, align):
+        """
+        Forces attention to be more decisive by penalizing
+        soft attention weights
+
+        TODO: arguments
+        TODO: unit_test
+        """
+        entropy = torch.distributions.Categorical(probs=align).entropy()
+        loss = (entropy / np.log(align.shape[1])).mean()
+        return loss
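Note: AttentionEntropyLoss penalizes flat attention by taking the entropy of each alignment row and scaling it by log(align.shape[1]). A minimal smoke test with dummy tensors, assuming the class above is in scope:

    import torch

    # dummy alignments: batch x decoder_steps x encoder_steps, rows sum to 1
    align = torch.softmax(torch.randn(4, 50, 120), dim=-1)
    criterion = AttentionEntropyLoss()
    loss = criterion(align)  # mean row entropy divided by log(align.shape[1])
    print(loss.item())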

View File

@@ -273,7 +273,7 @@ class Decoder(nn.Module):
    def __init__(self, in_features, memory_dim, r, memory_size, attn_windowing,
                 attn_norm, prenet_type, prenet_dropout, forward_attn,
                 trans_agent, forward_attn_mask, location_attn,
-                separate_stopnet):
+                separate_stopnet, speaker_embedding_dim):
        super(Decoder, self).__init__()
        self.r_init = r
        self.r = r
@@ -285,8 +285,9 @@ class Decoder(nn.Module):
        self.separate_stopnet = separate_stopnet
        self.query_dim = 256
        # memory -> |Prenet| -> processed_memory
+        prenet_dim = memory_dim * self.memory_size + speaker_embedding_dim if self.use_memory_queue else memory_dim + speaker_embedding_dim
        self.prenet = Prenet(
-            memory_dim * self.memory_size if self.use_memory_queue else memory_dim,
+            prenet_dim,
            prenet_type,
            prenet_dropout,
            out_features=[256, 128])
@@ -339,13 +340,13 @@ class Decoder(nn.Module):
        T = inputs.size(1)
        # go frame as zeros matrix
        if self.use_memory_queue:
-            self.memory_input = torch.zeros(B, self.memory_dim * self.memory_size, device=inputs.device)
+            self.memory_input = torch.zeros(1, device=inputs.device).repeat(B, self.memory_dim * self.memory_size)
        else:
-            self.memory_input = torch.zeros(B, self.memory_dim, device=inputs.device)
+            self.memory_input = torch.zeros(1, device=inputs.device).repeat(B, self.memory_dim)
        # decoder states
-        self.attention_rnn_hidden = torch.zeros(B, 256, device=inputs.device)
+        self.attention_rnn_hidden = torch.zeros(1, device=inputs.device).repeat(B, 256)
        self.decoder_rnn_hiddens = [
-            torch.zeros(B, 256, device=inputs.device)
+            torch.zeros(1, device=inputs.device).repeat(B, 256)
            for idx in range(len(self.decoder_rnns))
        ]
        self.context_vec = inputs.data.new(B, self.in_features).zero_()
@@ -405,9 +406,9 @@ class Decoder(nn.Module):
            self.memory_input = new_memory[:, :self.memory_size * self.memory_dim]
        else:
            # use only the last frame prediction
-            self.memory_input = new_memory[:, :self.memory_dim]
+            self.memory_input = new_memory[:, self.memory_dim * (self.r - 1):]

-    def forward(self, inputs, memory, mask):
+    def forward(self, inputs, memory, mask, speaker_embeddings=None):
        """
        Args:
            inputs: Encoder outputs.
@@ -432,6 +433,8 @@ class Decoder(nn.Module):
            if t > 0:
                new_memory = memory[t - 1]
                self._update_memory_input(new_memory)
+            if speaker_embeddings is not None:
+                self.memory_input = torch.cat([self.memory_input, speaker_embeddings], dim=-1)
            output, stop_token, attention = self.decode(inputs, mask)
            outputs += [output]
            attentions += [attention]
@@ -440,13 +443,15 @@ class Decoder(nn.Module):
        return self._parse_outputs(outputs, attentions, stop_tokens)

-    def inference(self, inputs):
+    def inference(self, inputs, speaker_embeddings=None):
        """
        Args:
-            inputs: Encoder outputs.
+            inputs: encoder outputs.
+            speaker_embeddings: speaker vectors.

        Shapes:
            - inputs: batch x time x encoder_out_dim
+            - speaker_embeddings: batch x embed_dim
        """
        outputs = []
        attentions = []
@@ -459,6 +464,8 @@ class Decoder(nn.Module):
            if t > 0:
                new_memory = outputs[-1]
                self._update_memory_input(new_memory)
+            if speaker_embeddings is not None:
+                self.memory_input = torch.cat([self.memory_input, speaker_embeddings], dim=-1)
            output, stop_token, attention = self.decode(inputs, None)
            stop_token = torch.sigmoid(stop_token.data)
            outputs += [output]
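Note: with the new speaker_embedding_dim argument, the Tacotron decoder prenet now expects the projected speaker vector to be appended to the mel memory at every step. A shape-only sketch of the arithmetic; the concrete numbers are illustrative, with the 80-dim projection matching the proj_speaker_dim introduced in the Tacotron model diff further down:

    memory_dim, memory_size, speaker_embedding_dim = 80, 5, 80
    use_memory_queue = True

    prenet_dim = (memory_dim * memory_size + speaker_embedding_dim
                  if use_memory_queue else memory_dim + speaker_embedding_dim)
    # 480: the width torch.cat([self.memory_input, speaker_embeddings], dim=-1) must produce
    print(prenet_dim)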

View File

@@ -10,8 +10,10 @@ class ConvBNBlock(nn.Module):
        super(ConvBNBlock, self).__init__()
        assert (kernel_size - 1) % 2 == 0
        padding = (kernel_size - 1) // 2
-        conv1d = nn.Conv1d(
-            in_channels, out_channels, kernel_size, padding=padding)
+        conv1d = nn.Conv1d(in_channels,
+                           out_channels,
+                           kernel_size,
+                           padding=padding)
        norm = nn.BatchNorm1d(out_channels)
        dropout = nn.Dropout(p=0.5)
        if nonlinear == 'relu':
@@ -52,8 +54,7 @@ class Encoder(nn.Module):
            convolutions.append(
                ConvBNBlock(in_features, in_features, 5, 'relu'))
        self.convolutions = nn.Sequential(*convolutions)
-        self.lstm = nn.LSTM(
-            in_features,
+        self.lstm = nn.LSTM(in_features,
                            int(in_features / 2),
                            num_layers=1,
                            batch_first=True,
@@ -64,8 +65,9 @@ class Encoder(nn.Module):
        x = self.convolutions(x)
        x = x.transpose(1, 2)
        input_lengths = input_lengths.cpu().numpy()
-        x = nn.utils.rnn.pack_padded_sequence(
-            x, input_lengths, batch_first=True)
+        x = nn.utils.rnn.pack_padded_sequence(x,
+                                              input_lengths,
+                                              batch_first=True)
        self.lstm.flatten_parameters()
        outputs, _ = self.lstm(x)
        outputs, _ = nn.utils.rnn.pad_packed_sequence(
@@ -101,6 +103,7 @@ class Decoder(nn.Module):
                 forward_attn_mask, location_attn, separate_stopnet):
        super(Decoder, self).__init__()
        self.mel_channels = inputs_dim
+        self.r_init = r
        self.r = r
        self.encoder_embedding_dim = in_features
        self.separate_stopnet = separate_stopnet
@@ -111,10 +114,11 @@ class Decoder(nn.Module):
        self.gate_threshold = 0.5
        self.p_attention_dropout = 0.1
        self.p_decoder_dropout = 0.1
-        self.prenet = Prenet(self.mel_channels * r, prenet_type,
-                             prenet_dropout,
-                             [self.prenet_dim, self.prenet_dim], bias=False)
+        self.prenet = Prenet(self.mel_channels,
+                             prenet_type,
+                             prenet_dropout,
+                             [self.prenet_dim, self.prenet_dim],
+                             bias=False)
        self.attention_rnn = nn.LSTMCell(self.prenet_dim + in_features,
                                         self.query_dim)
@@ -135,51 +139,46 @@ class Decoder(nn.Module):
                                      self.decoder_rnn_dim, 1)
        self.linear_projection = Linear(self.decoder_rnn_dim + in_features,
-                                        self.mel_channels * r)
+                                        self.mel_channels * self.r_init)
        self.stopnet = nn.Sequential(
            nn.Dropout(0.1),
-            Linear(
-                self.decoder_rnn_dim + self.mel_channels * r,
+            Linear(self.decoder_rnn_dim + self.mel_channels * self.r_init,
                   1,
                   bias=True,
                   init_gain='sigmoid'))
-        self.attention_rnn_init = nn.Embedding(1, self.query_dim)
-        self.go_frame_init = nn.Embedding(1, self.mel_channels * r)
-        self.decoder_rnn_inits = nn.Embedding(1, self.decoder_rnn_dim)
        self.memory_truncated = None

+    def set_r(self, new_r):
+        self.r = new_r
+
    def get_go_frame(self, inputs):
        B = inputs.size(0)
-        memory = self.go_frame_init(inputs.data.new_zeros(B).long())
+        memory = torch.zeros(1, device=inputs.device).repeat(
+            B, self.mel_channels * self.r)
        return memory

    def _init_states(self, inputs, mask, keep_states=False):
        B = inputs.size(0)
        # T = inputs.size(1)
        if not keep_states:
-            self.query = self.attention_rnn_init(
-                inputs.data.new_zeros(B).long())
-            self.attention_rnn_cell_state = Variable(
-                inputs.data.new(B, self.query_dim).zero_())
-            self.decoder_hidden = self.decoder_rnn_inits(
-                inputs.data.new_zeros(B).long())
-            self.decoder_cell = Variable(
-                inputs.data.new(B, self.decoder_rnn_dim).zero_())
-            self.context = Variable(
-                inputs.data.new(B, self.encoder_embedding_dim).zero_())
+            self.query = torch.zeros(1, device=inputs.device).repeat(
+                B, self.query_dim)
+            self.attention_rnn_cell_state = torch.zeros(
+                1, device=inputs.device).repeat(B, self.query_dim)
+            self.decoder_hidden = torch.zeros(1, device=inputs.device).repeat(
+                B, self.decoder_rnn_dim)
+            self.decoder_cell = torch.zeros(1, device=inputs.device).repeat(
+                B, self.decoder_rnn_dim)
+            self.context = torch.zeros(1, device=inputs.device).repeat(
+                B, self.encoder_embedding_dim)
        self.inputs = inputs
        self.processed_inputs = self.attention.inputs_layer(inputs)
        self.mask = mask

    def _reshape_memory(self, memories):
-        memories = memories.view(
-            memories.size(0), int(memories.size(1) / self.r), -1)
+        memories = memories.view(memories.size(0),
+                                 int(memories.size(1) / self.r), -1)
        memories = memories.transpose(0, 1)
        return memories
@@ -192,14 +191,20 @@ class Decoder(nn.Module):
        outputs = outputs.transpose(1, 2)
        return outputs, stop_tokens, alignments

+    def _update_memory(self, memory):
+        if len(memory.shape) == 2:
+            return memory[:, self.mel_channels * (self.r - 1):]
+        return memory[:, :, self.mel_channels * (self.r - 1):]
+
    def decode(self, memory):
        query_input = torch.cat((memory, self.context), -1)
        self.query, self.attention_rnn_cell_state = self.attention_rnn(
            query_input, (self.query, self.attention_rnn_cell_state))
-        self.query = F.dropout(
-            self.query, self.p_attention_dropout, self.training)
+        self.query = F.dropout(self.query, self.p_attention_dropout,
+                               self.training)
        self.attention_rnn_cell_state = F.dropout(
-            self.attention_rnn_cell_state, self.p_attention_dropout, self.training)
+            self.attention_rnn_cell_state, self.p_attention_dropout,
+            self.training)
        self.context = self.attention(self.query, self.inputs,
                                      self.processed_inputs, self.mask)
@@ -223,13 +228,14 @@ class Decoder(nn.Module):
            stop_token = self.stopnet(stopnet_input.detach())
        else:
            stop_token = self.stopnet(stopnet_input)
+        decoder_output = decoder_output[:, :self.r * self.mel_channels]
        return decoder_output, stop_token, self.attention.attention_weights

    def forward(self, inputs, memories, mask):
        memory = self.get_go_frame(inputs).unsqueeze(0)
        memories = self._reshape_memory(memories)
        memories = torch.cat((memory, memories), dim=0)
-        memories = self.prenet(memories)
+        memories = self.prenet(self._update_memory(memories))

        self._init_states(inputs, mask=mask)
        self.attention.init_states(inputs)
@@ -249,6 +255,8 @@ class Decoder(nn.Module):
    def inference(self, inputs):
        memory = self.get_go_frame(inputs)
+        memory = self._update_memory(memory)
+
        self._init_states(inputs, mask=None)
        self.attention.init_win_idx()
@@ -256,7 +264,6 @@ class Decoder(nn.Module):
        outputs, stop_tokens, alignments, t = [], [], [], 0
        stop_flags = [True, False, False]
-        stop_count = 0
        while True:
            memory = self.prenet(memory)
            mel_output, stop_token, alignment = self.decode(memory)
@@ -270,14 +277,12 @@ class Decoder(nn.Module):
                              and t > inputs.shape[1])
            stop_flags[2] = t > inputs.shape[1] * 2
            if all(stop_flags):
-                stop_count += 1
-                if stop_count > 20:
-                    break
-            elif len(outputs) == self.max_decoder_steps:
+                break
+            if len(outputs) == self.max_decoder_steps:
                print(" | > Decoder stopped with 'max_decoder_steps")
                break

-            memory = mel_output
+            memory = self._update_memory(mel_output)
            t += 1

        outputs, stop_tokens, alignments = self._parse_outputs(
@@ -299,7 +304,6 @@ class Decoder(nn.Module):
        self.attention.init_states(inputs)
        outputs, stop_tokens, alignments, t = [], [], [], 0
        stop_flags = [True, False, False]
-        stop_count = 0
        while True:
            memory = self.prenet(self.memory_truncated)
            mel_output, stop_token, alignment = self.decode(memory)
@@ -313,10 +317,8 @@ class Decoder(nn.Module):
                              and t > inputs.shape[1])
            stop_flags[2] = t > inputs.shape[1] * 2
            if all(stop_flags):
-                stop_count += 1
-                if stop_count > 20:
-                    break
-            elif len(outputs) == self.max_decoder_steps:
+                break
+            if len(outputs) == self.max_decoder_steps:
                print(" | > Decoder stopped with 'max_decoder_steps")
                break
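Note: the Tacotron2 decoder now always projects to mel_channels * r_init, adds set_r() so r can change at runtime (e.g. under gradual training), slices the projection in decode(), and keeps only the last predicted frame in _update_memory(). A toy shape walk-through of that bookkeeping, with made-up sizes:

    import torch

    mel_channels, r_init, r = 80, 7, 5            # r can be lowered at runtime via set_r()
    decoder_output = torch.randn(2, mel_channels * r_init)

    # decode(): keep only the first r frame groups of the fixed-size projection
    decoder_output = decoder_output[:, :r * mel_channels]      # (2, 400)

    # _update_memory(): feed only the last predicted frame back through the prenet
    next_memory = decoder_output[:, mel_channels * (r - 1):]   # (2, 80)
    print(decoder_output.shape, next_memory.shape)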

View File

@@ -1,7 +1,9 @@
# coding: utf-8
+import torch
from torch import nn
from TTS.layers.tacotron import Encoder, Decoder, PostCBHG
from TTS.utils.generic_utils import sequence_mask
+from TTS.layers.gst_layers import GST


class Tacotron(nn.Module):
@@ -13,6 +15,7 @@ class Tacotron(nn.Module):
                 mel_dim=80,
                 memory_size=5,
                 attn_win=False,
+                gst=False,
                 attn_norm="sigmoid",
                 prenet_type="original",
                 prenet_dropout=True,
@@ -25,55 +28,117 @@ class Tacotron(nn.Module):
        self.r = r
        self.mel_dim = mel_dim
        self.linear_dim = linear_dim
+        self.gst = gst
+        self.num_speakers = num_speakers
        self.embedding = nn.Embedding(num_chars, 256)
        self.embedding.weight.data.normal_(0, 0.3)
+        decoder_dim = 512 if num_speakers > 1 else 256
+        encoder_dim = 512 if num_speakers > 1 else 256
+        proj_speaker_dim = 80 if num_speakers > 1 else 0
+        # boilerplate model
+        self.encoder = Encoder(encoder_dim)
+        self.decoder = Decoder(decoder_dim, mel_dim, r, memory_size, attn_win,
+                               attn_norm, prenet_type, prenet_dropout,
+                               forward_attn, trans_agent, forward_attn_mask,
+                               location_attn, separate_stopnet,
+                               proj_speaker_dim)
+        self.postnet = PostCBHG(mel_dim)
+        self.last_linear = nn.Linear(self.postnet.cbhg.gru_features * 2,
+                                     linear_dim)
+        # speaker embedding layers
        if num_speakers > 1:
            self.speaker_embedding = nn.Embedding(num_speakers, 256)
            self.speaker_embedding.weight.data.normal_(0, 0.3)
-        self.encoder = Encoder(256)
-        self.decoder = Decoder(256, mel_dim, r, memory_size, attn_win,
-                               attn_norm, prenet_type, prenet_dropout,
-                               forward_attn, trans_agent, forward_attn_mask,
-                               location_attn, separate_stopnet)
-        self.postnet = PostCBHG(mel_dim)
-        self.last_linear = nn.Linear(self.postnet.cbhg.gru_features * 2, linear_dim)
+            self.speaker_project_mel = nn.Sequential(
+                nn.Linear(256, proj_speaker_dim), nn.Tanh())
+            self.speaker_embeddings = None
+            self.speaker_embeddings_projected = None
+        # global style token layers
+        if self.gst:
+            gst_embedding_dim = 256
+            self.gst_layer = GST(num_mel=80,
+                                 num_heads=4,
+                                 num_style_tokens=10,
+                                 embedding_dim=gst_embedding_dim)
+
+    def _init_states(self):
+        self.speaker_embeddings = None
+        self.speaker_embeddings_projected = None
+
+    def compute_speaker_embedding(self, speaker_ids):
+        if hasattr(self, "speaker_embedding") and speaker_ids is None:
+            raise RuntimeError(
+                " [!] Model has speaker embedding layer but speaker_id is not provided"
+            )
+        if hasattr(self, "speaker_embedding") and speaker_ids is not None:
+            self.speaker_embeddings = self._compute_speaker_embedding(
+                speaker_ids)
+            self.speaker_embeddings_projected = self.speaker_project_mel(
+                self.speaker_embeddings).squeeze(1)
+
+    def compute_gst(self, inputs, mel_specs):
+        gst_outputs = self.gst_layer(mel_specs)
+        inputs = self._add_speaker_embedding(inputs, gst_outputs)
+        return inputs

    def forward(self, characters, text_lengths, mel_specs, speaker_ids=None):
        B = characters.size(0)
        mask = sequence_mask(text_lengths).to(characters.device)
        inputs = self.embedding(characters)
+        self._init_states()
+        self.compute_speaker_embedding(speaker_ids)
+        if self.num_speakers > 1:
+            inputs = self._concat_speaker_embedding(inputs,
+                                                    self.speaker_embeddings)
        encoder_outputs = self.encoder(inputs)
-        encoder_outputs = self._add_speaker_embedding(encoder_outputs,
-                                                      speaker_ids)
+        if self.gst:
+            encoder_outputs = self.compute_gst(encoder_outputs, mel_specs)
+        if self.num_speakers > 1:
+            encoder_outputs = self._concat_speaker_embedding(
+                encoder_outputs, self.speaker_embeddings)
        mel_outputs, alignments, stop_tokens = self.decoder(
-            encoder_outputs, mel_specs, mask)
+            encoder_outputs, mel_specs, mask,
+            self.speaker_embeddings_projected)
        mel_outputs = mel_outputs.view(B, -1, self.mel_dim)
        linear_outputs = self.postnet(mel_outputs)
        linear_outputs = self.last_linear(linear_outputs)
        return mel_outputs, linear_outputs, alignments, stop_tokens

-    def inference(self, characters, speaker_ids=None):
+    def inference(self, characters, speaker_ids=None, style_mel=None):
        B = characters.size(0)
        inputs = self.embedding(characters)
+        self._init_states()
+        self.compute_speaker_embedding(speaker_ids)
+        if self.num_speakers > 1:
+            inputs = self._concat_speaker_embedding(inputs,
+                                                    self.speaker_embeddings)
        encoder_outputs = self.encoder(inputs)
-        encoder_outputs = self._add_speaker_embedding(encoder_outputs,
-                                                      speaker_ids)
+        if self.gst and style_mel is not None:
+            encoder_outputs = self.compute_gst(encoder_outputs, style_mel)
+        if self.num_speakers > 1:
+            encoder_outputs = self._concat_speaker_embedding(
+                encoder_outputs, self.speaker_embeddings)
        mel_outputs, alignments, stop_tokens = self.decoder.inference(
-            encoder_outputs)
+            encoder_outputs, self.speaker_embeddings_projected)
        mel_outputs = mel_outputs.view(B, -1, self.mel_dim)
        linear_outputs = self.postnet(mel_outputs)
        linear_outputs = self.last_linear(linear_outputs)
        return mel_outputs, linear_outputs, alignments, stop_tokens

-    def _add_speaker_embedding(self, encoder_outputs, speaker_ids):
-        if hasattr(self, "speaker_embedding") and speaker_ids is None:
-            raise RuntimeError(" [!] Model has speaker embedding layer but speaker_id is not provided")
-        if hasattr(self, "speaker_embedding") and speaker_ids is not None:
+    def _compute_speaker_embedding(self, speaker_ids):
        speaker_embeddings = self.speaker_embedding(speaker_ids)
-        speaker_embeddings.unsqueeze_(1)
-        speaker_embeddings = speaker_embeddings.expand(encoder_outputs.size(0),
-                                                       encoder_outputs.size(1),
-                                                       -1)
-        encoder_outputs = encoder_outputs + speaker_embeddings
-        return encoder_outputs
+        return speaker_embeddings.unsqueeze_(1)
+
+    @staticmethod
+    def _add_speaker_embedding(outputs, speaker_embeddings):
+        speaker_embeddings_ = speaker_embeddings.expand(
+            outputs.size(0), outputs.size(1), -1)
+        outputs = outputs + speaker_embeddings_
+        return outputs
+
+    @staticmethod
+    def _concat_speaker_embedding(outputs, speaker_embeddings):
+        speaker_embeddings_ = speaker_embeddings.expand(
+            outputs.size(0), outputs.size(1), -1)
+        outputs = torch.cat([outputs, speaker_embeddings_], dim=-1)
+        return outputs
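Note: the refactored Tacotron keeps two speaker-conditioning helpers: _add_speaker_embedding() sums a broadcast speaker (or GST) vector into the encoder outputs, while the new _concat_speaker_embedding() concatenates it, which is why the encoder/decoder widths grow to 512 in the multi-speaker case. A shape-only sketch with toy tensors:

    import torch

    outputs = torch.randn(2, 37, 256)             # batch x time x embedding dim
    speaker_embeddings = torch.randn(2, 1, 256)   # shape produced by _compute_speaker_embedding()

    expanded = speaker_embeddings.expand(outputs.size(0), outputs.size(1), -1)
    summed = outputs + expanded                      # (2, 37, 256), the path used by compute_gst()
    concat = torch.cat([outputs, expanded], dim=-1)  # (2, 37, 512), feeds the 512-wide encoder/decoder
    print(summed.shape, concat.shape)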

View File

@@ -1,87 +0,0 @@
# coding: utf-8
from torch import nn
from TTS.layers.tacotron import Encoder, Decoder, PostCBHG
from TTS.layers.gst_layers import GST
from TTS.utils.generic_utils import sequence_mask
class TacotronGST(nn.Module):
def __init__(self,
num_chars,
num_speakers,
r=5,
linear_dim=1025,
mel_dim=80,
memory_size=5,
attn_win=False,
attn_norm="sigmoid",
prenet_type="original",
prenet_dropout=True,
forward_attn=False,
trans_agent=False,
forward_attn_mask=False,
location_attn=True,
separate_stopnet=True):
super(TacotronGST, self).__init__()
self.r = r
self.mel_dim = mel_dim
self.linear_dim = linear_dim
self.embedding = nn.Embedding(num_chars, 256)
self.embedding.weight.data.normal_(0, 0.3)
if num_speakers > 1:
self.speaker_embedding = nn.Embedding(num_speakers, 256)
self.speaker_embedding.weight.data.normal_(0, 0.3)
self.encoder = Encoder(256)
self.gst = GST(num_mel=80, num_heads=4, num_style_tokens=10, embedding_dim=256)
self.decoder = Decoder(256, mel_dim, r, memory_size, attn_win,
attn_norm, prenet_type, prenet_dropout,
forward_attn, trans_agent, forward_attn_mask,
location_attn, separate_stopnet)
self.postnet = PostCBHG(mel_dim)
self.last_linear = nn.Linear(self.postnet.cbhg.gru_features * 2, linear_dim)
def forward(self, characters, text_lengths, mel_specs, speaker_ids=None):
B = characters.size(0)
mask = sequence_mask(text_lengths).to(characters.device)
inputs = self.embedding(characters)
encoder_outputs = self.encoder(inputs)
encoder_outputs = self._add_speaker_embedding(encoder_outputs,
speaker_ids)
gst_outputs = self.gst(mel_specs)
gst_outputs = gst_outputs.expand(-1, encoder_outputs.size(1), -1)
encoder_outputs = encoder_outputs + gst_outputs
mel_outputs, alignments, stop_tokens = self.decoder(
encoder_outputs, mel_specs, mask)
mel_outputs = mel_outputs.view(B, -1, self.mel_dim)
linear_outputs = self.postnet(mel_outputs)
linear_outputs = self.last_linear(linear_outputs)
return mel_outputs, linear_outputs, alignments, stop_tokens
def inference(self, characters, speaker_ids=None, style_mel=None):
B = characters.size(0)
inputs = self.embedding(characters)
encoder_outputs = self.encoder(inputs)
encoder_outputs = self._add_speaker_embedding(encoder_outputs,
speaker_ids)
if style_mel is not None:
gst_outputs = self.gst(style_mel)
gst_outputs = gst_outputs.expand(-1, encoder_outputs.size(1), -1)
encoder_outputs = encoder_outputs + gst_outputs
mel_outputs, alignments, stop_tokens = self.decoder.inference(
encoder_outputs)
mel_outputs = mel_outputs.view(B, -1, self.mel_dim)
linear_outputs = self.postnet(mel_outputs)
linear_outputs = self.last_linear(linear_outputs)
return mel_outputs, linear_outputs, alignments, stop_tokens
def _add_speaker_embedding(self, encoder_outputs, speaker_ids):
if hasattr(self, "speaker_embedding") and speaker_ids is not None:
speaker_embeddings = self.speaker_embedding(speaker_ids)
speaker_embeddings.unsqueeze_(1)
speaker_embeddings = speaker_embeddings.expand(encoder_outputs.size(0),
encoder_outputs.size(1),
-1)
encoder_outputs = encoder_outputs + speaker_embeddings
return encoder_outputs

View File

@@ -19,7 +19,7 @@
},
{
"cell_type": "code",
-"execution_count": 1,
+"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
@@ -29,28 +29,11 @@
},
{
"cell_type": "code",
-"execution_count": 2,
+"execution_count": null,
"metadata": {
"scrolled": true
},
-"outputs": [
-{
-"name": "stdout",
-"output_type": "stream",
-"text": [
-"Populating the interactive namespace from numpy and matplotlib\n"
-]
-},
-{
-"name": "stderr",
-"output_type": "stream",
-"text": [
-"/home/erogol/miniconda3/lib/python3.7/site-packages/IPython/core/magics/pylab.py:160: UserWarning: pylab import has clobbered these variables: ['plt']\n",
-"`%matplotlib` prevents importing * from pylab and numpy\n",
-" \"\\n`%matplotlib` prevents importing * from pylab and numpy\"\n"
-]
-}
-],
+"outputs": [],
"source": [ "source": [
"%load_ext autoreload\n", "%load_ext autoreload\n",
"%autoreload 2\n", "%autoreload 2\n",
@ -59,6 +42,7 @@
"import io\n", "import io\n",
"import torch \n", "import torch \n",
"import time\n", "import time\n",
"import json\n",
"import numpy as np\n", "import numpy as np\n",
"from collections import OrderedDict\n", "from collections import OrderedDict\n",
"from matplotlib import pylab as plt\n", "from matplotlib import pylab as plt\n",
@ -86,23 +70,25 @@
"from IPython.display import Audio\n", "from IPython.display import Audio\n",
"\n", "\n",
"import os\n", "import os\n",
"os.environ['CUDA_VISIBLE_DEVICES']='1'\n", "os.environ['CUDA_VISIBLE_DEVICES']='1'"
"os.environ['OMP_NUM_THREADS']='1'\n"
] ]
},
{
"cell_type": "code",
-"execution_count": 3,
+"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
-"def tts(model, text, CONFIG, use_cuda, ap, use_gl, speaker_id=None, figures=True):\n",
+"def tts(model, text, CONFIG, use_cuda, ap, use_gl, figures=True):\n",
"    t_1 = time.time()\n",
-"    waveform, alignment, mel_spec, mel_postnet_spec, stop_tokens = synthesis(model, text, CONFIG, use_cuda, ap, truncated=False, speaker_id=speaker_id, enable_eos_bos_chars=CONFIG.enable_eos_bos_chars)\n",
+"    waveform, alignment, mel_spec, mel_postnet_spec, stop_tokens = synthesis(model, text, CONFIG, use_cuda, ap, speaker_id, False, CONFIG.enable_eos_bos_chars)\n",
"    if CONFIG.model == \"Tacotron\" and not use_gl:\n",
+"        # coorect the normalization differences b/w TTS and the Vocoder.\n",
"        mel_postnet_spec = ap.out_linear_to_mel(mel_postnet_spec.T).T\n",
+"        mel_postnet_spec = ap._denormalize(mel_postnet_spec)\n",
+"        mel_postnet_spec = ap_vocoder._normalize(mel_postnet_spec)\n",
"    if not use_gl:\n",
-"        waveform = wavernn.generate(torch.FloatTensor(mel_postnet_spec.T).unsqueeze(0).cuda(), batched=batched_wavernn, target=11000, overlap=550)\n",
+"        waveform = wavernn.generate(torch.FloatTensor(mel_postnet_spec.T).unsqueeze(0).cuda(), batched=batched_wavernn, target=8000, overlap=400)\n",
"\n",
"    print(\" > Run-time: {}\".format(time.time() - t_1))\n",
"    if figures: \n",
@@ -117,31 +103,18 @@
},
{
"cell_type": "code",
-"execution_count": 9,
+"execution_count": null,
"metadata": {},
-"outputs": [
+"outputs": [],
{
"ename": "FileNotFoundError",
"evalue": "[Errno 2] No such file or directory: '/media/erogol/data_ssd/Data/models/wavernn/mozilla/mozilla-May24-4763/config.json'",
"output_type": "error",
"traceback": [
"\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
"\u001b[0;31mFileNotFoundError\u001b[0m Traceback (most recent call last)",
"\u001b[0;32m<ipython-input-9-3306702a6bbc>\u001b[0m in \u001b[0;36m<module>\u001b[0;34m\u001b[0m\n\u001b[1;32m 7\u001b[0m \u001b[0mVOCODER_MODEL_PATH\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0;34m\"/media/erogol/data_ssd/Data/models/wavernn/mozilla/mozilla-May24-4763/model_checkpoints/best_model.pth.tar\"\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 8\u001b[0m \u001b[0mVOCODER_CONFIG_PATH\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0;34m\"/media/erogol/data_ssd/Data/models/wavernn/mozilla/mozilla-May24-4763/config.json\"\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m----> 9\u001b[0;31m \u001b[0mVOCODER_CONFIG\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mload_config\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mVOCODER_CONFIG_PATH\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 10\u001b[0m \u001b[0muse_cuda\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0;32mFalse\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 11\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n",
"\u001b[0;32m~/projects/TTS/tts_namespace/TTS/utils/generic_utils.py\u001b[0m in \u001b[0;36mload_config\u001b[0;34m(config_path)\u001b[0m\n\u001b[1;32m 20\u001b[0m \u001b[0;32mdef\u001b[0m \u001b[0mload_config\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mconfig_path\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 21\u001b[0m \u001b[0mconfig\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mAttrDict\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m---> 22\u001b[0;31m \u001b[0;32mwith\u001b[0m \u001b[0mopen\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mconfig_path\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m\"r\"\u001b[0m\u001b[0;34m)\u001b[0m \u001b[0;32mas\u001b[0m \u001b[0mf\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 23\u001b[0m \u001b[0minput_str\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mf\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mread\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 24\u001b[0m \u001b[0minput_str\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mre\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0msub\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34mr'\\\\\\n'\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m''\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0minput_str\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
"\u001b[0;31mFileNotFoundError\u001b[0m: [Errno 2] No such file or directory: '/media/erogol/data_ssd/Data/models/wavernn/mozilla/mozilla-May24-4763/config.json'"
]
}
],
"source": [ "source": [
"# Set constants\n", "# Set constants\n",
"ROOT_PATH = '/media/erogol/data_ssd/Models/libri_tts/5049/'\n", "ROOT_PATH = '/media/erogol/data_ssd/Models/libri_tts/5099/'\n",
"MODEL_PATH = ROOT_PATH + 'best_model.pth.tar'\n", "MODEL_PATH = ROOT_PATH + '/best_model.pth.tar'\n",
"CONFIG_PATH = ROOT_PATH + '/config.json'\n", "CONFIG_PATH = ROOT_PATH + '/config.json'\n",
"OUT_FOLDER = \"/home/erogol/Dropbox/AudioSamples/benchmark_samples/\"\n", "OUT_FOLDER = '/home/erogol/Dropbox/AudioSamples/benchmark_samples/'\n",
"CONFIG = load_config(CONFIG_PATH)\n", "CONFIG = load_config(CONFIG_PATH)\n",
"VOCODER_MODEL_PATH = \"/media/erogol/data_ssd/Data/models/wavernn/mozilla/mozilla-May24-4763/model_checkpoints/best_model.pth.tar\"\n", "VOCODER_MODEL_PATH = \"/media/erogol/data_ssd/Models/wavernn/ljspeech/mold_ljspeech_best_model/checkpoint_433000.pth.tar\"\n",
"VOCODER_CONFIG_PATH = \"/media/erogol/data_ssd/Data/models/wavernn/mozilla/mozilla-May24-4763/config.json\"\n", "VOCODER_CONFIG_PATH = \"/media/erogol/data_ssd/Models/wavernn/ljspeech/mold_ljspeech_best_model/config.json\"\n",
"VOCODER_CONFIG = load_config(VOCODER_CONFIG_PATH)\n", "VOCODER_CONFIG = load_config(VOCODER_CONFIG_PATH)\n",
"use_cuda = False\n", "use_cuda = False\n",
"\n", "\n",
@ -149,10 +122,12 @@
"# CONFIG.windowing = False\n", "# CONFIG.windowing = False\n",
"# CONFIG.prenet_dropout = False\n", "# CONFIG.prenet_dropout = False\n",
"# CONFIG.separate_stopnet = True\n", "# CONFIG.separate_stopnet = True\n",
"CONFIG.use_forward_attn = True\n",
"# CONFIG.forward_attn_mask = True\n",
"# CONFIG.stopnet = True\n", "# CONFIG.stopnet = True\n",
"\n", "\n",
"# Set the vocoder\n", "# Set the vocoder\n",
"use_gl = True # use GL if True\n", "use_gl = False # use GL if True\n",
"batched_wavernn = True # use batched wavernn inference if True" "batched_wavernn = True # use batched wavernn inference if True"
] ]
}, },
@@ -165,9 +140,17 @@
"# LOAD TTS MODEL\n",
"from utils.text.symbols import symbols, phonemes\n",
"\n",
+"# multi speaker \n",
+"if CONFIG.use_speaker_embedding:\n",
+"    speakers = json.load(open(f\"{ROOT_PATH}/speakers.json\", 'r'))\n",
+"    speakers_idx_to_id = {v: k for k, v in speakers.items()}\n",
+"else:\n",
+"    speakers = []\n",
+"    speaker_id = None\n",
+"\n",
"# load the model\n",
"num_chars = len(phonemes) if CONFIG.use_phonemes else len(symbols)\n",
-"model = setup_model(num_chars, CONFIG)\n",
+"model = setup_model(num_chars, len(speakers), CONFIG)\n",
"\n",
"# load the audio processor\n",
"ap = AudioProcessor(**CONFIG.audio) \n",
@@ -184,7 +167,12 @@
"if use_cuda:\n",
"    model.cuda()\n",
"model.eval()\n",
-"print(cp['step'])"
+"print(cp['step'])\n",
+"print(cp['r'])\n",
+"\n",
+"# set model stepsize\n",
+"if 'r' in cp:\n",
+"    model.decoder.set_r(cp['r'])"
]
},
{
@@ -196,25 +184,28 @@
"# LOAD WAVERNN\n",
"if use_gl == False:\n",
"    from WaveRNN.models.wavernn import Model\n",
+"    from WaveRNN.utils.audio import AudioProcessor as AudioProcessorVocoder\n",
"    bits = 10\n",
-"\n",
+"    ap_vocoder = AudioProcessorVocoder(**VOCODER_CONFIG.audio) \n",
"    wavernn = Model(\n",
"            rnn_dims=512,\n",
"            fc_dims=512,\n",
-"            mode=\"mold\",\n",
-"            pad=2,\n",
-"            upsample_factors=VOCODER_CONFIG.upsample_factors, # set this depending on dataset\n",
+"            mode=VOCODER_CONFIG.mode,\n",
+"            mulaw=VOCODER_CONFIG.mulaw,\n",
+"            pad=VOCODER_CONFIG.pad,\n",
+"            upsample_factors=VOCODER_CONFIG.upsample_factors,\n",
"            feat_dims=VOCODER_CONFIG.audio[\"num_mels\"],\n",
"            compute_dims=128,\n",
"            res_out_dims=128,\n",
"            res_blocks=10,\n",
-"            hop_length=ap.hop_length,\n",
-"            sample_rate=ap.sample_rate,\n",
+"            hop_length=ap_vocoder.hop_length,\n",
+"            sample_rate=ap_vocoder.sample_rate,\n",
+"            use_upsample_net = True,\n",
+"            use_aux_net = True\n",
"        ).cuda()\n",
"\n",
-"\n",
"    check = torch.load(VOCODER_MODEL_PATH)\n",
-"    wavernn.load_state_dict(check['model'])\n",
+"    wavernn.load_state_dict(check['model'], strict=False)\n",
"    if use_cuda:\n",
"        wavernn.cuda()\n",
"    wavernn.eval();\n",
@@ -230,111 +221,67 @@
},
{
"cell_type": "code",
-"execution_count": 5,
+"execution_count": null,
"metadata": {},
-"outputs": [
+"outputs": [],
{
"ename": "NameError",
"evalue": "name 'model' is not defined",
"output_type": "error",
"traceback": [
"\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
"\u001b[0;31mNameError\u001b[0m Traceback (most recent call last)",
"\u001b[0;32m<ipython-input-5-e285d5bde9fb>\u001b[0m in \u001b[0;36m<module>\u001b[0;34m\u001b[0m\n\u001b[0;32m----> 1\u001b[0;31m \u001b[0mmodel\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0meval\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 2\u001b[0m \u001b[0mmodel\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mdecoder\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mmax_decoder_steps\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0;36m2000\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 3\u001b[0m \u001b[0mspeaker_id\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0;36m0\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 4\u001b[0m \u001b[0msentence\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0;34m\"Bill got in the habit of asking himself “Is that thought true?” And if he wasnt absolutely certain it was, he just let it go.\"\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 5\u001b[0m \u001b[0malign\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mspec\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mstop_tokens\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mwav\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mtts\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mmodel\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0msentence\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mCONFIG\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0muse_cuda\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0map\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mspeaker_id\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mspeaker_id\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0muse_gl\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0muse_gl\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mfigures\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;32mTrue\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
"\u001b[0;31mNameError\u001b[0m: name 'model' is not defined"
]
}
],
"source": [ "source": [
"model.eval()\n", "model.eval()\n",
"model.decoder.max_decoder_steps = 2000\n", "model.decoder.max_decoder_steps = 2000\n",
"speaker_id = 0\n", "speaker_id = None\n",
"sentence = \"Bill got in the habit of asking himself “Is that thought true?” And if he wasnt absolutely certain it was, he just let it go.\"\n", "sentence = \"Bill got in the habit of asking himself “Is that thought true?” and if he wasnt absolutely certain it was, he just let it go.\"\n",
"align, spec, stop_tokens, wav = tts(model, sentence, CONFIG, use_cuda, ap, speaker_id=speaker_id, use_gl=use_gl, figures=True)" "align, spec, stop_tokens, wav = tts(model, sentence, CONFIG, use_cuda, ap, use_gl=use_gl, figures=True)"
]
},
{
"cell_type": "code",
"execution_count": 6,
"metadata": {
"scrolled": true
},
"outputs": [
{
"ename": "NameError",
"evalue": "name 'model' is not defined",
"output_type": "error",
"traceback": [
"\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
"\u001b[0;31mNameError\u001b[0m Traceback (most recent call last)",
"\u001b[0;32m<ipython-input-6-621056ffa667>\u001b[0m in \u001b[0;36m<module>\u001b[0;34m\u001b[0m\n\u001b[1;32m 1\u001b[0m \u001b[0msentence\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0;34m\"Be a voice, not an echo.\"\u001b[0m \u001b[0;31m# 'echo' is not in training set.\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m----> 2\u001b[0;31m \u001b[0malign\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mspec\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mstop_tokens\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mwav\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mtts\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mmodel\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0msentence\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mCONFIG\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0muse_cuda\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0map\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mspeaker_id\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mspeaker_id\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0muse_gl\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0muse_gl\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mfigures\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;32mTrue\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m",
"\u001b[0;31mNameError\u001b[0m: name 'model' is not defined"
]
}
],
"source": [
"sentence = \"Be a voice, not an echo.\" # 'echo' is not in training set. \n",
"align, spec, stop_tokens, wav = tts(model, sentence, CONFIG, use_cuda, ap, speaker_id=speaker_id, use_gl=use_gl, figures=True)"
]
},
{
"cell_type": "code",
"execution_count": 7,
"metadata": {},
"outputs": [
{
"ename": "NameError",
"evalue": "name 'model' is not defined",
"output_type": "error",
"traceback": [
"\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
"\u001b[0;31mNameError\u001b[0m Traceback (most recent call last)",
"\u001b[0;32m<ipython-input-7-26967668a1a1>\u001b[0m in \u001b[0;36m<module>\u001b[0;34m\u001b[0m\n\u001b[1;32m 1\u001b[0m \u001b[0msentence\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0;34m\"The human voice is the most perfect instrument of all.\"\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m----> 2\u001b[0;31m \u001b[0malign\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mspec\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mstop_tokens\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mwav\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mtts\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mmodel\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0msentence\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mCONFIG\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0muse_cuda\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0map\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mspeaker_id\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mspeaker_id\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0muse_gl\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0muse_gl\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mfigures\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;32mTrue\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m",
"\u001b[0;31mNameError\u001b[0m: name 'model' is not defined"
]
}
],
"source": [
"sentence = \"The human voice is the most perfect instrument of all.\"\n",
"align, spec, stop_tokens, wav = tts(model, sentence, CONFIG, use_cuda, ap, speaker_id=speaker_id, use_gl=use_gl, figures=True)"
]
},
{
"cell_type": "code",
"execution_count": 8,
"metadata": {},
"outputs": [
{
"ename": "NameError",
"evalue": "name 'model' is not defined",
"output_type": "error",
"traceback": [
"\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
"\u001b[0;31mNameError\u001b[0m Traceback (most recent call last)",
"\u001b[0;32m<ipython-input-8-28cb5023e353>\u001b[0m in \u001b[0;36m<module>\u001b[0;34m\u001b[0m\n\u001b[1;32m 1\u001b[0m \u001b[0msentence\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0;34m\"I'm sorry Dave. I'm afraid I can't do that.\"\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m----> 2\u001b[0;31m \u001b[0malign\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mspec\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mstop_tokens\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mwav\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mtts\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mmodel\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0msentence\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mCONFIG\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0muse_cuda\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0map\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mspeaker_id\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mspeaker_id\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0muse_gl\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0muse_gl\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mfigures\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;32mTrue\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m",
"\u001b[0;31mNameError\u001b[0m: name 'model' is not defined"
]
}
],
"source": [
"sentence = \"I'm sorry Dave. I'm afraid I can't do that.\"\n",
"align, spec, stop_tokens, wav = tts(model, sentence, CONFIG, use_cuda, ap, speaker_id=speaker_id, use_gl=use_gl, figures=True)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": { "metadata": {},
"collapsed": true, "outputs": [],
"jupyter": { "source": [
"outputs_hidden": true "model.eval()\n",
"model.decoder.max_decoder_steps = 2000\n",
"sentence = \"Bill got in the habit of asking himself “Is that thought true?” and if he wasnt absolutely certain it was, he just let it go.\"\n",
"align, spec, stop_tokens, wav = tts(model, sentence, CONFIG, use_cuda, ap, use_gl=use_gl, figures=True)"
]
}, },
"scrolled": true {
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"sentence = \"Be a voice, not an echo.\" # 'echo' is not in training set. \n",
"align, spec, stop_tokens, wav = tts(model, sentence, CONFIG, use_cuda, ap, use_gl=use_gl, figures=True)"
]
}, },
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"sentence = \"The human voice is the most perfect instrument of all.\"\n",
"align, spec, stop_tokens, wav = tts(model, sentence, CONFIG, use_cuda, ap, use_gl=use_gl, figures=True)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"sentence = \"I'm sorry Dave. I'm afraid I can't do that.\"\n",
"align, spec, stop_tokens, wav = tts(model, sentence, CONFIG, use_cuda, ap, use_gl=use_gl, figures=True)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [], "outputs": [],
"source": [ "source": [
"sentence = \"This cake is great. It's so delicious and moist.\"\n", "sentence = \"This cake is great. It's so delicious and moist.\"\n",
"align, spec, stop_tokens, wav = tts(model, sentence, CONFIG, use_cuda, ap, speaker_id=speaker_id, use_gl=use_gl, figures=True)" "align, spec, stop_tokens, wav = tts(model, sentence, CONFIG, use_cuda, ap, use_gl=use_gl, figures=True)"
] ]
}, },
{ {
@@ -347,76 +294,51 @@
{
"cell_type": "code",
"execution_count": null,
-"metadata": {
-"collapsed": true,
-"jupyter": {
-"outputs_hidden": true
-}
-},
+"metadata": {},
"outputs": [],
"source": [
"sentence = \"Generative adversarial network or variational auto-encoder.\"\n",
-"align, spec, stop_tokens, wav = tts(model, sentence, CONFIG, use_cuda, ap, speaker_id=speaker_id, use_gl=use_gl, figures=True)"
+"align, spec, stop_tokens, wav = tts(model, sentence, CONFIG, use_cuda, ap, use_gl=use_gl, figures=True)"
]
},
{
"cell_type": "code",
"execution_count": null,
-"metadata": {
-"collapsed": true,
-"jupyter": {
-"outputs_hidden": true
-}
-},
+"metadata": {},
"outputs": [],
"source": [
"sentence = \"Scientists at the CERN laboratory say they have discovered a new particle.\"\n",
-"align, spec, stop_tokens, wav = tts(model, sentence, CONFIG, use_cuda, ap, speaker_id=speaker_id, use_gl=use_gl, figures=True)"
+"align, spec, stop_tokens, wav = tts(model, sentence, CONFIG, use_cuda, ap, use_gl=use_gl, figures=True)"
]
},
{
"cell_type": "code",
"execution_count": null,
-"metadata": {
-"collapsed": true,
-"jupyter": {
-"outputs_hidden": true
-}
-},
+"metadata": {},
"outputs": [],
"source": [
"sentence = \"Heres a way to measure the acute emotional intelligence that has never gone out of style.\"\n",
-"align, spec, stop_tokens, wav = tts(model, sentence, CONFIG, use_cuda, ap, speaker_id=speaker_id, use_gl=use_gl, figures=True)"
+"align, spec, stop_tokens, wav = tts(model, sentence, CONFIG, use_cuda, ap, use_gl=use_gl, figures=True)"
]
},
{
"cell_type": "code",
"execution_count": null,
-"metadata": {
-"collapsed": true,
-"jupyter": {
-"outputs_hidden": true
-}
-},
+"metadata": {},
"outputs": [],
"source": [
"sentence = \"President Trump met with other leaders at the Group of 20 conference.\"\n",
-"align, spec, stop_tokens, wav = tts(model, sentence, CONFIG, use_cuda, ap, speaker_id=speaker_id, use_gl=use_gl, figures=True)"
+"align, spec, stop_tokens, wav = tts(model, sentence, CONFIG, use_cuda, ap, use_gl=use_gl, figures=True)"
]
},
{
"cell_type": "code",
"execution_count": null,
-"metadata": {
-"collapsed": true,
-"jupyter": {
-"outputs_hidden": true
-}
-},
+"metadata": {},
"outputs": [],
"source": [
"sentence = \"The buses aren't the problem, they actually provide a solution.\"\n",
-"align, spec, stop_tokens, wav = tts(model, sentence, CONFIG, use_cuda, ap, speaker_id=speaker_id, use_gl=use_gl, figures=True)"
+"align, spec, stop_tokens, wav = tts(model, sentence, CONFIG, use_cuda, ap, use_gl=use_gl, figures=True)"
]
},
{
@ -429,136 +351,91 @@
{ {
"cell_type": "code", "cell_type": "code",
"execution_count": null, "execution_count": null,
"metadata": { "metadata": {},
"collapsed": true,
"jupyter": {
"outputs_hidden": true
}
},
"outputs": [], "outputs": [],
"source": [ "source": [
"sentence = \"Generative adversarial network or variational auto-encoder.\"\n", "sentence = \"Generative adversarial network or variational auto-encoder.\"\n",
"align, spec, stop_tokens, wav = tts(model, sentence, CONFIG, use_cuda, ap, speaker_id=speaker_id, use_gl=use_gl, figures=True)" "align, spec, stop_tokens, wav = tts(model, sentence, CONFIG, use_cuda, ap, use_gl=use_gl, figures=True)"
] ]
}, },
{ {
"cell_type": "code", "cell_type": "code",
"execution_count": null, "execution_count": null,
"metadata": { "metadata": {},
"collapsed": true,
"jupyter": {
"outputs_hidden": true
}
},
"outputs": [], "outputs": [],
"source": [ "source": [
"sentence = \"Basilar membrane and otolaryngology are not auto-correlations.\"\n", "sentence = \"Basilar membrane and otolaryngology are not auto-correlations.\"\n",
"align, spec, stop_tokens, wav = tts(model, sentence, CONFIG, use_cuda, ap, speaker_id=speaker_id, use_gl=use_gl, figures=True)" "align, spec, stop_tokens, wav = tts(model, sentence, CONFIG, use_cuda, ap, use_gl=use_gl, figures=True)"
] ]
}, },
{ {
"cell_type": "code", "cell_type": "code",
"execution_count": null, "execution_count": null,
"metadata": { "metadata": {},
"collapsed": true,
"jupyter": {
"outputs_hidden": true
}
},
"outputs": [], "outputs": [],
"source": [ "source": [
"sentence = \" He has read the whole thing.\"\n", "sentence = \" He has read the whole thing.\"\n",
"align, spec, stop_tokens, wav = tts(model, sentence, CONFIG, use_cuda, ap, speaker_id=speaker_id, use_gl=use_gl, figures=True)" "align, spec, stop_tokens, wav = tts(model, sentence, CONFIG, use_cuda, ap, use_gl=use_gl, figures=True)"
] ]
}, },
{ {
"cell_type": "code", "cell_type": "code",
"execution_count": null, "execution_count": null,
"metadata": { "metadata": {},
"collapsed": true,
"jupyter": {
"outputs_hidden": true
}
},
"outputs": [], "outputs": [],
"source": [ "source": [
"sentence = \"He reads books.\"\n", "sentence = \"He reads books.\"\n",
"align, spec, stop_tokens, wav = tts(model, sentence, CONFIG, use_cuda, ap, speaker_id=speaker_id, use_gl=use_gl, figures=True)" "align, spec, stop_tokens, wav = tts(model, sentence, CONFIG, use_cuda, ap, use_gl=use_gl, figures=True)"
] ]
}, },
{ {
"cell_type": "code", "cell_type": "code",
"execution_count": null, "execution_count": null,
"metadata": { "metadata": {},
"collapsed": true,
"jupyter": {
"outputs_hidden": true
}
},
"outputs": [], "outputs": [],
"source": [ "source": [
"sentence = \"Thisss isrealy awhsome.\"\n", "sentence = \"Thisss isrealy awhsome.\"\n",
"align, spec, stop_tokens, wav = tts(model, sentence, CONFIG, use_cuda, ap, speaker_id=speaker_id, use_gl=use_gl, figures=True)" "align, spec, stop_tokens, wav = tts(model, sentence, CONFIG, use_cuda, ap, use_gl=use_gl, figures=True)"
] ]
}, },
{ {
"cell_type": "code", "cell_type": "code",
"execution_count": null, "execution_count": null,
"metadata": { "metadata": {},
"collapsed": true,
"jupyter": {
"outputs_hidden": true
}
},
"outputs": [], "outputs": [],
"source": [ "source": [
"sentence = \"This is your internet browser, Firefox.\"\n", "sentence = \"This is your internet browser, Firefox.\"\n",
"align, spec, stop_tokens, wav = tts(model, sentence, CONFIG, use_cuda, ap, speaker_id=speaker_id, use_gl=use_gl, figures=True)" "align, spec, stop_tokens, wav = tts(model, sentence, CONFIG, use_cuda, ap, use_gl=use_gl, figures=True)"
] ]
}, },
{ {
"cell_type": "code", "cell_type": "code",
"execution_count": null, "execution_count": null,
"metadata": { "metadata": {},
"collapsed": true,
"jupyter": {
"outputs_hidden": true
}
},
"outputs": [], "outputs": [],
"source": [ "source": [
"sentence = \"This is your internet browser Firefox.\"\n", "sentence = \"This is your internet browser Firefox.\"\n",
"align, spec, stop_tokens, wav = tts(model, sentence, CONFIG, use_cuda, ap, speaker_id=speaker_id, use_gl=use_gl, figures=True)" "align, spec, stop_tokens, wav = tts(model, sentence, CONFIG, use_cuda, ap, use_gl=use_gl, figures=True)"
] ]
}, },
{ {
"cell_type": "code", "cell_type": "code",
"execution_count": null, "execution_count": null,
"metadata": { "metadata": {},
"collapsed": true,
"jupyter": {
"outputs_hidden": true
}
},
"outputs": [], "outputs": [],
"source": [ "source": [
"sentence = \"The quick brown fox jumps over the lazy dog.\"\n", "sentence = \"The quick brown fox jumps over the lazy dog.\"\n",
"align, spec, stop_tokens, wav = tts(model, sentence, CONFIG, use_cuda, ap, speaker_id=speaker_id, use_gl=use_gl, figures=True)" "align, spec, stop_tokens, wav = tts(model, sentence, CONFIG, use_cuda, ap, use_gl=use_gl, figures=True)"
] ]
}, },
{ {
"cell_type": "code", "cell_type": "code",
"execution_count": null, "execution_count": null,
"metadata": { "metadata": {},
"collapsed": true,
"jupyter": {
"outputs_hidden": true
}
},
"outputs": [], "outputs": [],
"source": [ "source": [
"sentence = \"Does the quick brown fox jump over the lazy dog?\"\n", "sentence = \"Does the quick brown fox jump over the lazy dog?\"\n",
"align, spec, stop_tokens, wav = tts(model, sentence, CONFIG, use_cuda, ap, speaker_id=speaker_id, use_gl=use_gl, figures=True)" "align, spec, stop_tokens, wav = tts(model, sentence, CONFIG, use_cuda, ap, use_gl=use_gl, figures=True)"
] ]
}, },
{ {
@ -568,7 +445,7 @@
"outputs": [], "outputs": [],
"source": [ "source": [
"sentence = \"Eren, how are you?\"\n", "sentence = \"Eren, how are you?\"\n",
"align, spec, stop_tokens, wav = tts(model, sentence, CONFIG, use_cuda, ap, speaker_id=speaker_id, use_gl=use_gl, figures=True)" "align, spec, stop_tokens, wav = tts(model, sentence, CONFIG, use_cuda, ap, use_gl=use_gl, figures=True)"
] ]
}, },
{ {
@ -581,107 +458,62 @@
{ {
"cell_type": "code", "cell_type": "code",
"execution_count": null, "execution_count": null,
"metadata": { "metadata": {},
"collapsed": true,
"jupyter": {
"outputs_hidden": true
}
},
"outputs": [], "outputs": [],
"source": [ "source": [
"sentence = \"Encouraged, he started with a minute a day.\"\n", "sentence = \"Encouraged, he started with a minute a day.\"\n",
"align, spec, stop_tokens, wav = tts(model, sentence, CONFIG, use_cuda, ap, speaker_id=speaker_id, use_gl=use_gl, figures=True)" "align, spec, stop_tokens, wav = tts(model, sentence, CONFIG, use_cuda, ap, use_gl=use_gl, figures=True)"
] ]
}, },
{ {
"cell_type": "code", "cell_type": "code",
"execution_count": null, "execution_count": null,
"metadata": { "metadata": {},
"collapsed": true,
"jupyter": {
"outputs_hidden": true
}
},
"outputs": [], "outputs": [],
"source": [ "source": [
"sentence = \"His meditation consisted of “body scanning” which involved focusing his mind and energy on each section of the body from head to toe .\"\n", "sentence = \"His meditation consisted of “body scanning” which involved focusing his mind and energy on each section of the body from head to toe .\"\n",
"align, spec, stop_tokens, wav = tts(model, sentence, CONFIG, use_cuda, ap, speaker_id=speaker_id, use_gl=use_gl, figures=True)" "align, spec, stop_tokens, wav = tts(model, sentence, CONFIG, use_cuda, ap, use_gl=use_gl, figures=True)"
] ]
}, },
{ {
"cell_type": "code", "cell_type": "code",
"execution_count": null, "execution_count": null,
"metadata": { "metadata": {},
"collapsed": true,
"jupyter": {
"outputs_hidden": true
}
},
"outputs": [], "outputs": [],
"source": [ "source": [
"sentence = \"Recent research at Harvard has shown meditating for as little as 8 weeks can actually increase the grey matter in the parts of the brain responsible for emotional regulation and learning . \"\n", "sentence = \"Recent research at Harvard has shown meditating for as little as 8 weeks can actually increase the grey matter in the parts of the brain responsible for emotional regulation and learning . \"\n",
"align, spec, stop_tokens, wav = tts(model, sentence, CONFIG, use_cuda, ap, speaker_id=speaker_id, use_gl=use_gl, figures=True)" "align, spec, stop_tokens, wav = tts(model, sentence, CONFIG, use_cuda, ap, use_gl=use_gl, figures=True)"
] ]
}, },
{ {
"cell_type": "code", "cell_type": "code",
"execution_count": null, "execution_count": null,
"metadata": { "metadata": {},
"collapsed": true,
"jupyter": {
"outputs_hidden": true
}
},
"outputs": [], "outputs": [],
"source": [ "source": [
"sentence = \"If he decided to watch TV he really watched it.\"\n", "sentence = \"If he decided to watch TV he really watched it.\"\n",
"align, spec, stop_tokens, wav = tts(model, sentence, CONFIG, use_cuda, ap, speaker_id=speaker_id, use_gl=use_gl, figures=True)" "align, spec, stop_tokens, wav = tts(model, sentence, CONFIG, use_cuda, ap, use_gl=use_gl, figures=True)"
] ]
}, },
{ {
"cell_type": "code", "cell_type": "code",
"execution_count": null, "execution_count": null,
"metadata": { "metadata": {},
"collapsed": true,
"jupyter": {
"outputs_hidden": true
},
"scrolled": true
},
"outputs": [], "outputs": [],
"source": [ "source": [
"sentence = \"Often we try to bring about change through sheer effort and we put all of our energy into a new initiative .\"\n", "sentence = \"Often we try to bring about change through sheer effort and we put all of our energy into a new initiative .\"\n",
"align, spec, stop_tokens, wav = tts(model, sentence, CONFIG, use_cuda, ap, speaker_id=speaker_id, use_gl=use_gl, figures=True)" "align, spec, stop_tokens, wav = tts(model, sentence, CONFIG, use_cuda, ap, use_gl=use_gl, figures=True)"
] ]
}, },
{ {
"cell_type": "code", "cell_type": "code",
"execution_count": null, "execution_count": null,
"metadata": { "metadata": {},
"collapsed": true,
"jupyter": {
"outputs_hidden": true
}
},
"outputs": [], "outputs": [],
"source": [ "source": [
"# for twb dataset\n", "# for twb dataset\n",
"sentence = \"In our preparation for Easter, God in his providence offers us each year the season of Lent as a sacramental sign of our conversion.\"\n", "sentence = \"In our preparation for Easter, God in his providence offers us each year the season of Lent as a sacramental sign of our conversion.\"\n",
"align, spec, stop_tokens, wav = tts(model, sentence, CONFIG, use_cuda, ap, speaker_id=speaker_id, use_gl=use_gl, figures=True)" "align, spec, stop_tokens, wav = tts(model, sentence, CONFIG, use_cuda, ap, use_gl=use_gl, figures=True)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": true,
"jupyter": {
"outputs_hidden": true
}
},
"outputs": [],
"source": [
"# !zip benchmark_samples/samples.zip benchmark_samples/*"
] ]
} }
], ],


@ -2,6 +2,7 @@ import os
import time import time
import argparse import argparse
import torch import torch
import json
import string import string
from TTS.utils.synthesis import synthesis from TTS.utils.synthesis import synthesis
@ -16,22 +17,28 @@ def tts(model,
VC, VC,
text, text,
ap, ap,
ap_vocoder,
use_cuda, use_cuda,
batched_vocoder, batched_vocoder,
speaker_id=None,
figures=False): figures=False):
t_1 = time.time() t_1 = time.time()
use_vocoder_model = vocoder_model is not None use_vocoder_model = vocoder_model is not None
waveform, alignment, decoder_outputs, postnet_output, stop_tokens = synthesis( waveform, alignment, _, postnet_output, stop_tokens = synthesis(
model, text, C, use_cuda, ap, False, C.enable_eos_bos_chars) model, text, C, use_cuda, ap, speaker_id, False,
C.enable_eos_bos_chars)
if C.model == "Tacotron" and use_vocoder_model: if C.model == "Tacotron" and use_vocoder_model:
postnet_output = ap.out_linear_to_mel(postnet_output.T).T postnet_output = ap.out_linear_to_mel(postnet_output.T).T
# correct if there is a scale difference between the two models
postnet_output = ap._denormalize(postnet_output)
postnet_output = ap_vocoder._normalize(postnet_output)
if use_vocoder_model: if use_vocoder_model:
vocoder_input = torch.FloatTensor(postnet_output.T).unsqueeze(0) vocoder_input = torch.FloatTensor(postnet_output.T).unsqueeze(0)
waveform = vocoder_model.generate( waveform = vocoder_model.generate(
vocoder_input.cuda() if use_cuda else vocoder_input, vocoder_input.cuda() if use_cuda else vocoder_input,
batched=batched_vocoder, batched=batched_vocoder,
target=11000, target=8000,
overlap=550) overlap=400)
print(" > Run-time: {}".format(time.time() - t_1)) print(" > Run-time: {}".format(time.time() - t_1))
return alignment, postnet_output, stop_tokens, waveform return alignment, postnet_output, stop_tokens, waveform
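
The `_denormalize`/`_normalize` hop added above exists because the Tacotron checkpoint and the WaveRNN vocoder may have been trained with different spectrogram normalization settings, so the mel values have to be mapped back to dB with the TTS model's AudioProcessor before being re-normalized with the vocoder's. A toy sketch of the idea only; the helpers below are illustrative and are not the repo's AudioProcessor API:

import numpy as np

def normalize(spec_db, min_db, max_norm):       # dB -> [0, max_norm]
    return np.clip((spec_db - min_db) / -min_db, 0, 1) * max_norm

def denormalize(spec_norm, min_db, max_norm):   # [0, max_norm] -> dB
    return (spec_norm / max_norm) * -min_db + min_db

# A -30 dB bin normalized for the TTS model (range [0, 4]) ...
spec_tts = normalize(np.full((80, 10), -30.0), min_db=-100.0, max_norm=4.0)
# ... must go back to dB before being renormalized for the vocoder (range [0, 1]).
spec_voc = normalize(denormalize(spec_tts, -100.0, 4.0), min_db=-100.0, max_norm=1.0)
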
@ -39,13 +46,10 @@ def tts(model,
if __name__ == "__main__": if __name__ == "__main__":
parser = argparse.ArgumentParser() parser = argparse.ArgumentParser()
parser.add_argument( parser.add_argument('text', type=str, help='Text to generate speech.')
'text', type=str, help='Text to generate speech.') parser.add_argument('config_path',
parser.add_argument(
'config_path',
type=str, type=str,
help='Path to model config file.' help='Path to model config file.')
)
parser.add_argument( parser.add_argument(
'model_path', 'model_path',
type=str, type=str,
@ -56,8 +60,10 @@ if __name__ == "__main__":
type=str, type=str,
help='Path to save final wav file.', help='Path to save final wav file.',
) )
parser.add_argument( parser.add_argument('--use_cuda',
'--use_cuda', type=bool, help='Run model on CUDA.', default=False) type=bool,
help='Run model on CUDA.',
default=False)
parser.add_argument( parser.add_argument(
'--vocoder_path', '--vocoder_path',
type=str, type=str,
@ -65,8 +71,7 @@ if __name__ == "__main__":
'Path to vocoder model file. If it is not defined, model uses GL as vocoder. Please make sure that you installed vocoder library before (WaveRNN).', 'Path to vocoder model file. If it is not defined, model uses GL as vocoder. Please make sure that you installed vocoder library before (WaveRNN).',
default="", default="",
) )
parser.add_argument( parser.add_argument('--vocoder_config_path',
'--vocoder_config_path',
type=str, type=str,
help='Path to vocoder model config file.', help='Path to vocoder model config file.',
default="") default="")
@ -75,12 +80,15 @@ if __name__ == "__main__":
type=bool, type=bool,
help="If True, vocoder model uses faster batch processing.", help="If True, vocoder model uses faster batch processing.",
default=True) default=True)
parser.add_argument( parser.add_argument('--speakers_json',
'--speakers_json',
type=str, type=str,
help="JSON file for multi-speaker model.", help="JSON file for multi-speaker model.",
default="" default="")
) parser.add_argument(
'--speaker_id',
type=int,
help="target speaker_id if the model is multi-speaker.",
default=None)
args = parser.parse_args() args = parser.parse_args()
if args.vocoder_path != "": if args.vocoder_path != "":
@ -109,13 +117,14 @@ if __name__ == "__main__":
model.eval() model.eval()
if args.use_cuda: if args.use_cuda:
model.cuda() model.cuda()
model.decoder.set_r(cp['r'])
# load vocoder model # load vocoder model
if args.vocoder_path != "": if args.vocoder_path != "":
VC = load_config(args.vocoder_config_path) VC = load_config(args.vocoder_config_path)
ap_vocoder = AudioProcessor(**VC.audio)
bits = 10 bits = 10
vocoder_model = VocoderModel( vocoder_model = VocoderModel(rnn_dims=512,
rnn_dims=512,
fc_dims=512, fc_dims=512,
mode=VC.mode, mode=VC.mode,
mulaw=VC.mulaw, mulaw=VC.mulaw,
@ -127,7 +136,8 @@ if __name__ == "__main__":
res_blocks=10, res_blocks=10,
hop_length=ap.hop_length, hop_length=ap.hop_length,
sample_rate=ap.sample_rate, sample_rate=ap.sample_rate,
) use_aux_net=True,
use_upsample_net=True)
check = torch.load(args.vocoder_path) check = torch.load(args.vocoder_path)
vocoder_model.load_state_dict(check['model']) vocoder_model.load_state_dict(check['model'])
@ -137,23 +147,26 @@ if __name__ == "__main__":
else: else:
vocoder_model = None vocoder_model = None
VC = None VC = None
ap_vocoder = None
# synthesize voice # synthesize voice
print(" > Text: {}".format(args.text)) print(" > Text: {}".format(args.text))
_, _, _, wav = tts( _, _, _, wav = tts(model,
model,
vocoder_model, vocoder_model,
C, C,
VC, VC,
args.text, args.text,
ap, ap,
ap_vocoder,
args.use_cuda, args.use_cuda,
args.batched_vocoder, args.batched_vocoder,
speaker_id=args.speaker_id,
figures=False) figures=False)
# save the results # save the results
file_name = args.text.replace(" ", "_") file_name = args.text.replace(" ", "_")
file_name = file_name.translate(str.maketrans('', '', string.punctuation.replace('_', '')))+'.wav' file_name = file_name.translate(
str.maketrans('', '', string.punctuation.replace('_', ''))) + '.wav'
out_path = os.path.join(args.out_path, file_name) out_path = os.path.join(args.out_path, file_name)
print(" > Saving output to {}".format(out_path)) print(" > Saving output to {}".format(out_path))
ap.save_wav(wav, out_path) ap.save_wav(wav, out_path)
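
With the new --speaker_id argument, a multi-speaker checkpoint can be synthesized from the command line roughly as below. This is a hypothetical invocation: the script name and file paths are placeholders, and Griffin-Lim is used because --vocoder_path is left empty (as the help text above notes).

# Hypothetical usage of the updated CLI; paths and the speaker index are placeholders.
python3 synthesize.py "Hello from speaker three." config.json checkpoint_260000.pth.tar out/ --speaker_id 3
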


@ -54,7 +54,8 @@ class DecoderTests(unittest.TestCase):
trans_agent=True, trans_agent=True,
forward_attn_mask=True, forward_attn_mask=True,
location_attn=True, location_attn=True,
separate_stopnet=True) separate_stopnet=True,
speaker_embedding_dim=0)
dummy_input = T.rand(4, 8, 256) dummy_input = T.rand(4, 8, 256)
dummy_memory = T.rand(4, 2, 80) dummy_memory = T.rand(4, 2, 80)
@ -66,6 +67,35 @@ class DecoderTests(unittest.TestCase):
assert output.shape[2] == 80 * 2, "size not {}".format(output.shape[2]) assert output.shape[2] == 80 * 2, "size not {}".format(output.shape[2])
assert stop_tokens.shape[0] == 4 assert stop_tokens.shape[0] == 4
@staticmethod
def test_in_out_multispeaker():
layer = Decoder(
in_features=256,
memory_dim=80,
r=2,
memory_size=4,
attn_windowing=False,
attn_norm="sigmoid",
prenet_type='original',
prenet_dropout=True,
forward_attn=True,
trans_agent=True,
forward_attn_mask=True,
location_attn=True,
separate_stopnet=True,
speaker_embedding_dim=80)
dummy_input = T.rand(4, 8, 256)
dummy_memory = T.rand(4, 2, 80)
dummy_embed = T.rand(4, 80)
output, alignment, stop_tokens = layer(
dummy_input, dummy_memory, mask=None, speaker_embeddings=dummy_embed)
assert output.shape[0] == 4
assert output.shape[1] == 1, "size not {}".format(output.shape[1])
assert output.shape[2] == 80 * 2, "size not {}".format(output.shape[2])
assert stop_tokens.shape[0] == 4
class EncoderTests(unittest.TestCase): class EncoderTests(unittest.TestCase):
def test_in_out(self): def test_in_out(self):


@ -25,8 +25,9 @@ def count_parameters(model):
class TacotronTrainTest(unittest.TestCase): class TacotronTrainTest(unittest.TestCase):
def test_train_step(self): @staticmethod
input = torch.randint(0, 24, (8, 128)).long().to(device) def test_train_step():
input_dummy = torch.randint(0, 24, (8, 128)).long().to(device)
input_lengths = torch.randint(100, 129, (8, )).long().to(device) input_lengths = torch.randint(100, 129, (8, )).long().to(device)
input_lengths[-1] = 128 input_lengths[-1] = 128
mel_spec = torch.rand(8, 30, c.audio['num_mels']).to(device) mel_spec = torch.rand(8, 30, c.audio['num_mels']).to(device)
@ -38,7 +39,7 @@ class TacotronTrainTest(unittest.TestCase):
for idx in mel_lengths: for idx in mel_lengths:
stop_targets[:, int(idx.item()):, 0] = 1.0 stop_targets[:, int(idx.item()):, 0] = 1.0
stop_targets = stop_targets.view(input.shape[0], stop_targets = stop_targets.view(input_dummy.shape[0],
stop_targets.size(1) // c.r, -1) stop_targets.size(1) // c.r, -1)
stop_targets = (stop_targets.sum(2) > stop_targets = (stop_targets.sum(2) >
0.0).unsqueeze(2).float().squeeze() 0.0).unsqueeze(2).float().squeeze()
@ -51,9 +52,11 @@ class TacotronTrainTest(unittest.TestCase):
linear_dim=c.audio['num_freq'], linear_dim=c.audio['num_freq'],
mel_dim=c.audio['num_mels'], mel_dim=c.audio['num_mels'],
r=c.r, r=c.r,
memory_size=c.memory_size).to(device) #FIXME: missing num_speakers parameter to Tacotron ctor memory_size=c.memory_size
).to(device) #FIXME: missing num_speakers parameter to Tacotron ctor
model.train() model.train()
print(" > Num parameters for Tacotron model:%s"%(count_parameters(model))) print(" > Num parameters for Tacotron model:%s" %
(count_parameters(model)))
model_ref = copy.deepcopy(model) model_ref = copy.deepcopy(model)
count = 0 count = 0
for param, param_ref in zip(model.parameters(), for param, param_ref in zip(model.parameters(),
@ -63,7 +66,7 @@ class TacotronTrainTest(unittest.TestCase):
optimizer = optim.Adam(model.parameters(), lr=c.lr) optimizer = optim.Adam(model.parameters(), lr=c.lr)
for _ in range(5): for _ in range(5):
mel_out, linear_out, align, stop_tokens = model.forward( mel_out, linear_out, align, stop_tokens = model.forward(
input, input_lengths, mel_spec, speaker_ids) input_dummy, input_lengths, mel_spec, speaker_ids)
optimizer.zero_grad() optimizer.zero_grad()
loss = criterion(mel_out, mel_spec, mel_lengths) loss = criterion(mel_out, mel_spec, mel_lengths)
stop_loss = criterion_st(stop_tokens, stop_targets) stop_loss = criterion_st(stop_tokens, stop_targets)
@ -81,3 +84,66 @@ class TacotronTrainTest(unittest.TestCase):
), "param {} with shape {} not updated!! \n{}\n{}".format( ), "param {} with shape {} not updated!! \n{}\n{}".format(
count, param.shape, param, param_ref) count, param.shape, param, param_ref)
count += 1 count += 1
class TacotronGSTTrainTest(unittest.TestCase):
@staticmethod
def test_train_step():
input_dummy = torch.randint(0, 24, (8, 128)).long().to(device)
input_lengths = torch.randint(100, 129, (8, )).long().to(device)
input_lengths[-1] = 128
mel_spec = torch.rand(8, 120, c.audio['num_mels']).to(device)
linear_spec = torch.rand(8, 120, c.audio['num_freq']).to(device)
mel_lengths = torch.randint(20, 120, (8, )).long().to(device)
stop_targets = torch.zeros(8, 120, 1).float().to(device)
speaker_ids = torch.randint(0, 5, (8, )).long().to(device)
for idx in mel_lengths:
stop_targets[:, int(idx.item()):, 0] = 1.0
stop_targets = stop_targets.view(input_dummy.shape[0],
stop_targets.size(1) // c.r, -1)
stop_targets = (stop_targets.sum(2) >
0.0).unsqueeze(2).float().squeeze()
criterion = L1LossMasked().to(device)
criterion_st = nn.BCEWithLogitsLoss().to(device)
model = Tacotron(
num_chars=32,
num_speakers=5,
gst=True,
linear_dim=c.audio['num_freq'],
mel_dim=c.audio['num_mels'],
r=c.r,
memory_size=c.memory_size
).to(device) #FIXME: missing num_speakers parameter to Tacotron ctor
model.train()
print(model)
print(" > Num parameters for Tacotron GST model:%s" %
(count_parameters(model)))
model_ref = copy.deepcopy(model)
count = 0
for param, param_ref in zip(model.parameters(),
model_ref.parameters()):
assert (param - param_ref).sum() == 0, param
count += 1
optimizer = optim.Adam(model.parameters(), lr=c.lr)
for _ in range(10):
mel_out, linear_out, align, stop_tokens = model.forward(
input_dummy, input_lengths, mel_spec, speaker_ids)
optimizer.zero_grad()
loss = criterion(mel_out, mel_spec, mel_lengths)
stop_loss = criterion_st(stop_tokens, stop_targets)
loss = loss + criterion(linear_out, linear_spec,
mel_lengths) + stop_loss
loss.backward()
optimizer.step()
# check parameter changes
count = 0
for param, param_ref in zip(model.parameters(),
model_ref.parameters()):
# ignore pre-highway layer since it works conditionally
assert (param != param_ref).any(
), "param {} with shape {} not updated!! \n{}\n{}".format(
count, param.shape, param, param_ref)
count += 1

train.py

@ -15,21 +15,21 @@ from distribute import (DistributedSampler, apply_gradient_allreduce,
init_distributed, reduce_tensor) init_distributed, reduce_tensor)
from TTS.layers.losses import L1LossMasked, MSELossMasked from TTS.layers.losses import L1LossMasked, MSELossMasked
from TTS.utils.audio import AudioProcessor from TTS.utils.audio import AudioProcessor
from TTS.utils.generic_utils import (NoamLR, check_update, count_parameters, from TTS.utils.generic_utils import (
create_experiment_folder, get_git_branch, NoamLR, check_update, count_parameters, create_experiment_folder,
load_config, remove_experiment_folder, get_git_branch, load_config, remove_experiment_folder, save_best_model,
save_best_model, save_checkpoint, weight_decay, save_checkpoint, adam_weight_decay, set_init_dict, copy_config_file,
set_init_dict, copy_config_file, setup_model, setup_model, gradual_training_scheduler, KeepAverage,
split_dataset, gradual_training_scheduler) set_weight_decay)
from TTS.utils.logger import Logger from TTS.utils.logger import Logger
from TTS.utils.speakers import load_speaker_mapping, save_speaker_mapping, \ from TTS.utils.speakers import load_speaker_mapping, save_speaker_mapping, \
get_speakers get_speakers
from TTS.utils.synthesis import synthesis from TTS.utils.synthesis import synthesis
from TTS.utils.text.symbols import phonemes, symbols from TTS.utils.text.symbols import phonemes, symbols
from TTS.utils.visual import plot_alignment, plot_spectrogram from TTS.utils.visual import plot_alignment, plot_spectrogram
from TTS.datasets.preprocess import get_preprocessor_by_name from TTS.datasets.preprocess import load_meta_data
from TTS.utils.radam import RAdam from TTS.utils.radam import RAdam
from TTS.utils.measures import alignment_diagonal_score
torch.backends.cudnn.enabled = True torch.backends.cudnn.enabled = True
torch.backends.cudnn.benchmark = False torch.backends.cudnn.benchmark = False
@ -41,18 +41,6 @@ print(" > Number of GPUs: ", num_gpus)
def setup_loader(ap, is_val=False, verbose=False): def setup_loader(ap, is_val=False, verbose=False):
global meta_data_train
global meta_data_eval
if "meta_data_train" not in globals():
if c.meta_file_train is not None:
meta_data_train = get_preprocessor_by_name(c.dataset)(c.data_path, c.meta_file_train)
else:
meta_data_train = get_preprocessor_by_name(c.dataset)(c.data_path)
if "meta_data_eval" not in globals() and c.run_eval:
if c.meta_file_val is not None:
meta_data_eval = get_preprocessor_by_name(c.dataset)(c.data_path, c.meta_file_val)
else:
meta_data_eval, meta_data_train = split_dataset(meta_data_train)
if is_val and not c.run_eval: if is_val and not c.run_eval:
loader = None loader = None
else: else:
@ -61,7 +49,8 @@ def setup_loader(ap, is_val=False, verbose=False):
c.text_cleaner, c.text_cleaner,
meta_data=meta_data_eval if is_val else meta_data_train, meta_data=meta_data_eval if is_val else meta_data_train,
ap=ap, ap=ap,
batch_group_size=0 if is_val else c.batch_group_size * c.batch_size, batch_group_size=0 if is_val else c.batch_group_size *
c.batch_size,
min_seq_len=c.min_seq_len, min_seq_len=c.min_seq_len,
max_seq_len=c.max_seq_len, max_seq_len=c.max_seq_len,
phoneme_cache_path=c.phoneme_cache_path, phoneme_cache_path=c.phoneme_cache_path,
@ -90,14 +79,21 @@ def train(model, criterion, criterion_st, optimizer, optimizer_st, scheduler,
speaker_mapping = load_speaker_mapping(OUT_PATH) speaker_mapping = load_speaker_mapping(OUT_PATH)
model.train() model.train()
epoch_time = 0 epoch_time = 0
avg_postnet_loss = 0 train_values = {
avg_decoder_loss = 0 'avg_postnet_loss': 0,
avg_stop_loss = 0 'avg_decoder_loss': 0,
avg_step_time = 0 'avg_stop_loss': 0,
avg_loader_time = 0 'avg_align_score': 0,
'avg_step_time': 0,
'avg_loader_time': 0,
'avg_alignment_score': 0
}
keep_avg = KeepAverage()
keep_avg.add_values(train_values)
print("\n > Epoch {}/{}".format(epoch, c.epochs), flush=True) print("\n > Epoch {}/{}".format(epoch, c.epochs), flush=True)
if use_cuda: if use_cuda:
batch_n_iter = int(len(data_loader.dataset) / (c.batch_size * num_gpus)) batch_n_iter = int(
len(data_loader.dataset) / (c.batch_size * num_gpus))
else: else:
batch_n_iter = int(len(data_loader.dataset) / c.batch_size) batch_n_iter = int(len(data_loader.dataset) / c.batch_size)
end_time = time.time() end_time = time.time()
@ -108,7 +104,8 @@ def train(model, criterion, criterion_st, optimizer, optimizer_st, scheduler,
text_input = data[0] text_input = data[0]
text_lengths = data[1] text_lengths = data[1]
speaker_names = data[2] speaker_names = data[2]
linear_input = data[3] if c.model in ["Tacotron", "TacotronGST"] else None linear_input = data[3] if c.model in ["Tacotron", "TacotronGST"
] else None
mel_input = data[4] mel_input = data[4]
mel_lengths = data[5] mel_lengths = data[5]
stop_targets = data[6] stop_targets = data[6]
@ -117,8 +114,9 @@ def train(model, criterion, criterion_st, optimizer, optimizer_st, scheduler,
loader_time = time.time() - end_time loader_time = time.time() - end_time
if c.use_speaker_embedding: if c.use_speaker_embedding:
speaker_ids = [speaker_mapping[speaker_name] speaker_ids = [
for speaker_name in speaker_names] speaker_mapping[speaker_name] for speaker_name in speaker_names
]
speaker_ids = torch.LongTensor(speaker_ids) speaker_ids = torch.LongTensor(speaker_ids)
else: else:
speaker_ids = None speaker_ids = None
@ -126,7 +124,8 @@ def train(model, criterion, criterion_st, optimizer, optimizer_st, scheduler,
# set stop targets view, we predict a single stop token per r frames prediction # set stop targets view, we predict a single stop token per r frames prediction
stop_targets = stop_targets.view(text_input.shape[0], stop_targets = stop_targets.view(text_input.shape[0],
stop_targets.size(1) // c.r, -1) stop_targets.size(1) // c.r, -1)
stop_targets = (stop_targets.sum(2) > 0.0).unsqueeze(2).float().squeeze(2) stop_targets = (stop_targets.sum(2) >
0.0).unsqueeze(2).float().squeeze(2)
global_step += 1 global_step += 1
@ -143,7 +142,9 @@ def train(model, criterion, criterion_st, optimizer, optimizer_st, scheduler,
text_lengths = text_lengths.cuda(non_blocking=True) text_lengths = text_lengths.cuda(non_blocking=True)
mel_input = mel_input.cuda(non_blocking=True) mel_input = mel_input.cuda(non_blocking=True)
mel_lengths = mel_lengths.cuda(non_blocking=True) mel_lengths = mel_lengths.cuda(non_blocking=True)
linear_input = linear_input.cuda(non_blocking=True) if c.model in ["Tacotron", "TacotronGST"] else None linear_input = linear_input.cuda(
non_blocking=True) if c.model in ["Tacotron", "TacotronGST"
] else None
stop_targets = stop_targets.cuda(non_blocking=True) stop_targets = stop_targets.cuda(non_blocking=True)
if speaker_ids is not None: if speaker_ids is not None:
speaker_ids = speaker_ids.cuda(non_blocking=True) speaker_ids = speaker_ids.cuda(non_blocking=True)
@ -153,13 +154,16 @@ def train(model, criterion, criterion_st, optimizer, optimizer_st, scheduler,
text_input, text_lengths, mel_input, speaker_ids=speaker_ids) text_input, text_lengths, mel_input, speaker_ids=speaker_ids)
# loss computation # loss computation
stop_loss = criterion_st(stop_tokens, stop_targets) if c.stopnet else torch.zeros(1) stop_loss = criterion_st(stop_tokens,
stop_targets) if c.stopnet else torch.zeros(1)
if c.loss_masking: if c.loss_masking:
decoder_loss = criterion(decoder_output, mel_input, mel_lengths) decoder_loss = criterion(decoder_output, mel_input, mel_lengths)
if c.model in ["Tacotron", "TacotronGST"]: if c.model in ["Tacotron", "TacotronGST"]:
postnet_loss = criterion(postnet_output, linear_input, mel_lengths) postnet_loss = criterion(postnet_output, linear_input,
mel_lengths)
else: else:
postnet_loss = criterion(postnet_output, mel_input, mel_lengths) postnet_loss = criterion(postnet_output, mel_input,
mel_lengths)
else: else:
decoder_loss = criterion(decoder_output, mel_input) decoder_loss = criterion(decoder_output, mel_input)
if c.model in ["Tacotron", "TacotronGST"]: if c.model in ["Tacotron", "TacotronGST"]:
@ -171,14 +175,18 @@ def train(model, criterion, criterion_st, optimizer, optimizer_st, scheduler,
loss += stop_loss loss += stop_loss
loss.backward() loss.backward()
optimizer, current_lr = weight_decay(optimizer, c.wd) optimizer, current_lr = adam_weight_decay(optimizer)
grad_norm, _ = check_update(model, c.grad_clip) grad_norm, _ = check_update(model, c.grad_clip)
optimizer.step() optimizer.step()
# compute alignment score
align_score = alignment_diagonal_score(alignments)
keep_avg.update_value('avg_align_score', align_score)
# backpass and check the grad norm for stop loss # backpass and check the grad norm for stop loss
if c.separate_stopnet: if c.separate_stopnet:
stop_loss.backward() stop_loss.backward()
optimizer_st, _ = weight_decay(optimizer_st, c.wd) optimizer_st, _ = adam_weight_decay(optimizer_st)
grad_norm_st, _ = check_update(model.decoder.stopnet, 1.0) grad_norm_st, _ = check_update(model.decoder.stopnet, 1.0)
optimizer_st.step() optimizer_st.step()
else: else:
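
`alignment_diagonal_score` is the new metric logged above, imported from `TTS.utils.measures` at the top of this file; its implementation is not part of this diff. As a rough stand-in for intuition only, it can be thought of as the batch-averaged mean of the per-decoder-step maximum attention weight, which approaches 1.0 when the alignment is sharp and (for monotonic TTS attention) diagonal:

import torch

def alignment_diagonal_score_sketch(alignments):
    """Illustrative stand-in, not the repo's implementation.
    alignments: [batch, decoder_steps, encoder_steps] attention weights."""
    return alignments.max(dim=2)[0].mean(dim=1).mean(dim=0).item()

# A perfectly sharp alignment scores 1.0, a uniform one scores 1/encoder_steps.
sharp = torch.eye(8).unsqueeze(0)          # [1, 8, 8]
blurry = torch.full((1, 8, 8), 1.0 / 8)
assert alignment_diagonal_score_sketch(sharp) == 1.0
assert abs(alignment_diagonal_score_sketch(blurry) - 1.0 / 8) < 1e-6
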
@ -189,14 +197,14 @@ def train(model, criterion, criterion_st, optimizer, optimizer_st, scheduler,
if global_step % c.print_step == 0: if global_step % c.print_step == 0:
print( print(
" | > Step:{}/{} GlobalStep:{} TotalLoss:{:.5f} PostnetLoss:{:.5f} " " | > Step:{}/{} GlobalStep:{} PostnetLoss:{:.5f} "
"DecoderLoss:{:.5f} StopLoss:{:.5f} GradNorm:{:.5f} " "DecoderLoss:{:.5f} StopLoss:{:.5f} AlignScore:{:.4f} GradNorm:{:.5f} "
"GradNormST:{:.5f} AvgTextLen:{:.1f} AvgSpecLen:{:.1f} StepTime:{:.2f} " "GradNormST:{:.5f} AvgTextLen:{:.1f} AvgSpecLen:{:.1f} StepTime:{:.2f} "
"LoaderTime:{:.2f} LR:{:.6f}".format( "LoaderTime:{:.2f} LR:{:.6f}".format(
num_iter, batch_n_iter, global_step, loss.item(), num_iter, batch_n_iter, global_step, postnet_loss.item(),
postnet_loss.item(), decoder_loss.item(), stop_loss.item(), decoder_loss.item(), stop_loss.item(), align_score,
grad_norm, grad_norm_st, avg_text_length, avg_spec_length, step_time, grad_norm, grad_norm_st, avg_text_length, avg_spec_length,
loader_time, current_lr), step_time, loader_time, current_lr),
flush=True) flush=True)
# aggregate losses from processes # aggregate losses from processes
@ -204,24 +212,36 @@ def train(model, criterion, criterion_st, optimizer, optimizer_st, scheduler,
postnet_loss = reduce_tensor(postnet_loss.data, num_gpus) postnet_loss = reduce_tensor(postnet_loss.data, num_gpus)
decoder_loss = reduce_tensor(decoder_loss.data, num_gpus) decoder_loss = reduce_tensor(decoder_loss.data, num_gpus)
loss = reduce_tensor(loss.data, num_gpus) loss = reduce_tensor(loss.data, num_gpus)
stop_loss = reduce_tensor(stop_loss.data, num_gpus) if c.stopnet else stop_loss stop_loss = reduce_tensor(stop_loss.data,
num_gpus) if c.stopnet else stop_loss
if args.rank == 0: if args.rank == 0:
avg_postnet_loss += float(postnet_loss.item()) update_train_values = {
avg_decoder_loss += float(decoder_loss.item()) 'avg_postnet_loss':
avg_stop_loss += stop_loss if isinstance(stop_loss, float) else float(stop_loss.item()) float(postnet_loss.item()),
avg_step_time += step_time 'avg_decoder_loss':
avg_loader_time += loader_time float(decoder_loss.item()),
'avg_stop_loss':
stop_loss
if isinstance(stop_loss, float) else float(stop_loss.item()),
'avg_step_time':
step_time,
'avg_loader_time':
loader_time
}
keep_avg.update_values(update_train_values)
# Plot Training Iter Stats # Plot Training Iter Stats
# reduce TB load # reduce TB load
if global_step % 10 == 0: if global_step % 10 == 0:
iter_stats = {"loss_posnet": postnet_loss.item(), iter_stats = {
"loss_posnet": postnet_loss.item(),
"loss_decoder": decoder_loss.item(), "loss_decoder": decoder_loss.item(),
"lr": current_lr, "lr": current_lr,
"grad_norm": grad_norm, "grad_norm": grad_norm,
"grad_norm_st": grad_norm_st, "grad_norm_st": grad_norm_st,
"step_time": step_time} "step_time": step_time
}
tb_logger.tb_train_iter_stats(global_step, iter_stats) tb_logger.tb_train_iter_stats(global_step, iter_stats)
if global_step % c.save_step == 0: if global_step % c.save_step == 0:
@ -233,7 +253,9 @@ def train(model, criterion, criterion_st, optimizer, optimizer_st, scheduler,
# Diagnostic visualizations # Diagnostic visualizations
const_spec = postnet_output[0].data.cpu().numpy() const_spec = postnet_output[0].data.cpu().numpy()
gt_spec = linear_input[0].data.cpu().numpy() if c.model in ["Tacotron", "TacotronGST"] else mel_input[0].data.cpu().numpy() gt_spec = linear_input[0].data.cpu().numpy() if c.model in [
"Tacotron", "TacotronGST"
] else mel_input[0].data.cpu().numpy()
align_img = alignments[0].data.cpu().numpy() align_img = alignments[0].data.cpu().numpy()
figures = { figures = {
@ -253,35 +275,31 @@ def train(model, criterion, criterion_st, optimizer, optimizer_st, scheduler,
c.audio["sample_rate"]) c.audio["sample_rate"])
end_time = time.time() end_time = time.time()
avg_postnet_loss /= (num_iter + 1)
avg_decoder_loss /= (num_iter + 1)
avg_stop_loss /= (num_iter + 1)
avg_total_loss = avg_decoder_loss + avg_postnet_loss + avg_stop_loss
avg_step_time /= (num_iter + 1)
avg_loader_time /= (num_iter + 1)
# print epoch stats # print epoch stats
print( print(" | > EPOCH END -- GlobalStep:{} AvgTotalLoss:{:.5f} "
" | > EPOCH END -- GlobalStep:{} AvgTotalLoss:{:.5f} "
"AvgPostnetLoss:{:.5f} AvgDecoderLoss:{:.5f} " "AvgPostnetLoss:{:.5f} AvgDecoderLoss:{:.5f} "
"AvgStopLoss:{:.5f} EpochTime:{:.2f} " "AvgStopLoss:{:.5f} EpochTime:{:.2f} "
"AvgStepTime:{:.2f} AvgLoaderTime:{:.2f}".format(global_step, avg_total_loss, "AvgStepTime:{:.2f} AvgLoaderTime:{:.2f}".format(
avg_postnet_loss, avg_decoder_loss, global_step, keep_avg['avg_postnet_loss'],
avg_stop_loss, epoch_time, avg_step_time, keep_avg['avg_decoder_loss'], keep_avg['avg_stop_loss'],
avg_loader_time), keep_avg['avg_align_score'], epoch_time,
keep_avg['avg_step_time'], keep_avg['avg_loader_time']),
flush=True) flush=True)
# Plot Epoch Stats # Plot Epoch Stats
if args.rank == 0: if args.rank == 0:
# Plot Training Epoch Stats # Plot Training Epoch Stats
epoch_stats = {"loss_postnet": avg_postnet_loss, epoch_stats = {
"loss_decoder": avg_decoder_loss, "loss_postnet": keep_avg['avg_postnet_loss'],
"stop_loss": avg_stop_loss, "loss_decoder": keep_avg['avg_decoder_loss'],
"epoch_time": epoch_time} "stop_loss": keep_avg['avg_stop_loss'],
"alignment_score": keep_avg['avg_align_score'],
"epoch_time": epoch_time
}
tb_logger.tb_train_epoch_stats(global_step, epoch_stats) tb_logger.tb_train_epoch_stats(global_step, epoch_stats)
if c.tb_model_param_stats: if c.tb_model_param_stats:
tb_logger.tb_model_weights(model, global_step) tb_logger.tb_model_weights(model, global_step)
return avg_postnet_loss, global_step return keep_avg['avg_postnet_loss'], global_step
def evaluate(model, criterion, criterion_st, ap, global_step, epoch): def evaluate(model, criterion, criterion_st, ap, global_step, epoch):
@ -290,9 +308,14 @@ def evaluate(model, criterion, criterion_st, ap, global_step, epoch):
speaker_mapping = load_speaker_mapping(OUT_PATH) speaker_mapping = load_speaker_mapping(OUT_PATH)
model.eval() model.eval()
epoch_time = 0 epoch_time = 0
avg_postnet_loss = 0 eval_values_dict = {
avg_decoder_loss = 0 'avg_postnet_loss': 0,
avg_stop_loss = 0 'avg_decoder_loss': 0,
'avg_stop_loss': 0,
'avg_align_score': 0
}
keep_avg = KeepAverage()
keep_avg.add_values(eval_values_dict)
print("\n > Validation") print("\n > Validation")
if c.test_sentences_file is None: if c.test_sentences_file is None:
test_sentences = [ test_sentences = [
@ -313,14 +336,18 @@ def evaluate(model, criterion, criterion_st, ap, global_step, epoch):
text_input = data[0] text_input = data[0]
text_lengths = data[1] text_lengths = data[1]
speaker_names = data[2] speaker_names = data[2]
linear_input = data[3] if c.model in ["Tacotron", "TacotronGST"] else None linear_input = data[3] if c.model in [
"Tacotron", "TacotronGST"
] else None
mel_input = data[4] mel_input = data[4]
mel_lengths = data[5] mel_lengths = data[5]
stop_targets = data[6] stop_targets = data[6]
if c.use_speaker_embedding: if c.use_speaker_embedding:
speaker_ids = [speaker_mapping[speaker_name] speaker_ids = [
for speaker_name in speaker_names] speaker_mapping[speaker_name]
for speaker_name in speaker_names
]
speaker_ids = torch.LongTensor(speaker_ids) speaker_ids = torch.LongTensor(speaker_ids)
else: else:
speaker_ids = None speaker_ids = None
@ -329,14 +356,17 @@ def evaluate(model, criterion, criterion_st, ap, global_step, epoch):
stop_targets = stop_targets.view(text_input.shape[0], stop_targets = stop_targets.view(text_input.shape[0],
stop_targets.size(1) // c.r, stop_targets.size(1) // c.r,
-1) -1)
stop_targets = (stop_targets.sum(2) > 0.0).unsqueeze(2).float().squeeze(2) stop_targets = (stop_targets.sum(2) >
0.0).unsqueeze(2).float().squeeze(2)
# dispatch data to GPU # dispatch data to GPU
if use_cuda: if use_cuda:
text_input = text_input.cuda() text_input = text_input.cuda()
mel_input = mel_input.cuda() mel_input = mel_input.cuda()
mel_lengths = mel_lengths.cuda() mel_lengths = mel_lengths.cuda()
linear_input = linear_input.cuda() if c.model in ["Tacotron", "TacotronGST"] else None linear_input = linear_input.cuda() if c.model in [
"Tacotron", "TacotronGST"
] else None
stop_targets = stop_targets.cuda() stop_targets = stop_targets.cuda()
if speaker_ids is not None: if speaker_ids is not None:
speaker_ids = speaker_ids.cuda() speaker_ids = speaker_ids.cuda()
@ -347,13 +377,17 @@ def evaluate(model, criterion, criterion_st, ap, global_step, epoch):
speaker_ids=speaker_ids) speaker_ids=speaker_ids)
# loss computation # loss computation
stop_loss = criterion_st(stop_tokens, stop_targets) if c.stopnet else torch.zeros(1) stop_loss = criterion_st(
stop_tokens, stop_targets) if c.stopnet else torch.zeros(1)
if c.loss_masking: if c.loss_masking:
decoder_loss = criterion(decoder_output, mel_input, mel_lengths) decoder_loss = criterion(decoder_output, mel_input,
mel_lengths)
if c.model in ["Tacotron", "TacotronGST"]: if c.model in ["Tacotron", "TacotronGST"]:
postnet_loss = criterion(postnet_output, linear_input, mel_lengths) postnet_loss = criterion(postnet_output, linear_input,
mel_lengths)
else: else:
postnet_loss = criterion(postnet_output, mel_input, mel_lengths) postnet_loss = criterion(postnet_output, mel_input,
mel_lengths)
else: else:
decoder_loss = criterion(decoder_output, mel_input) decoder_loss = criterion(decoder_output, mel_input)
if c.model in ["Tacotron", "TacotronGST"]: if c.model in ["Tacotron", "TacotronGST"]:
@ -365,14 +399,9 @@ def evaluate(model, criterion, criterion_st, ap, global_step, epoch):
step_time = time.time() - start_time step_time = time.time() - start_time
epoch_time += step_time epoch_time += step_time
if num_iter % c.print_step == 0: # compute alignment score
print( align_score = alignment_diagonal_score(alignments)
" | > TotalLoss: {:.5f} PostnetLoss: {:.5f} DecoderLoss:{:.5f} " keep_avg.update_value('avg_align_score', align_score)
"StopLoss: {:.5f} ".format(loss.item(),
postnet_loss.item(),
decoder_loss.item(),
stop_loss.item()),
flush=True)
# aggregate losses from processes # aggregate losses from processes
if num_gpus > 1: if num_gpus > 1:
@ -381,15 +410,34 @@ def evaluate(model, criterion, criterion_st, ap, global_step, epoch):
if c.stopnet: if c.stopnet:
stop_loss = reduce_tensor(stop_loss.data, num_gpus) stop_loss = reduce_tensor(stop_loss.data, num_gpus)
avg_postnet_loss += float(postnet_loss.item()) keep_avg.update_values({
avg_decoder_loss += float(decoder_loss.item()) 'avg_postnet_loss':
avg_stop_loss += stop_loss.item() float(postnet_loss.item()),
'avg_decoder_loss':
float(decoder_loss.item()),
'avg_stop_loss':
float(stop_loss.item())
})
if num_iter % c.print_step == 0:
print(
" | > TotalLoss: {:.5f} PostnetLoss: {:.5f} - {:.5f} DecoderLoss:{:.5f} - {:.5f} "
"StopLoss: {:.5f} - {:.5f} AlignScore: {:.4f} : {:.4f}"
.format(loss.item(), postnet_loss.item(),
keep_avg['avg_postnet_loss'],
decoder_loss.item(),
keep_avg['avg_decoder_loss'], stop_loss.item(),
keep_avg['avg_stop_loss'], align_score,
keep_avg['avg_align_score']),
flush=True)
if args.rank == 0: if args.rank == 0:
# Diagnostic visualizations # Diagnostic visualizations
idx = np.random.randint(mel_input.shape[0]) idx = np.random.randint(mel_input.shape[0])
const_spec = postnet_output[idx].data.cpu().numpy() const_spec = postnet_output[idx].data.cpu().numpy()
gt_spec = linear_input[idx].data.cpu().numpy() if c.model in ["Tacotron", "TacotronGST"] else mel_input[idx].data.cpu().numpy() gt_spec = linear_input[idx].data.cpu().numpy() if c.model in [
"Tacotron", "TacotronGST"
] else mel_input[idx].data.cpu().numpy()
align_img = alignments[idx].data.cpu().numpy() align_img = alignments[idx].data.cpu().numpy()
eval_figures = { eval_figures = {
@ -404,17 +452,15 @@ def evaluate(model, criterion, criterion_st, ap, global_step, epoch):
eval_audio = ap.inv_spectrogram(const_spec.T) eval_audio = ap.inv_spectrogram(const_spec.T)
else: else:
eval_audio = ap.inv_mel_spectrogram(const_spec.T) eval_audio = ap.inv_mel_spectrogram(const_spec.T)
tb_logger.tb_eval_audios(global_step, {"ValAudio": eval_audio}, c.audio["sample_rate"]) tb_logger.tb_eval_audios(global_step, {"ValAudio": eval_audio},
c.audio["sample_rate"])
# compute average losses
avg_postnet_loss /= (num_iter + 1)
avg_decoder_loss /= (num_iter + 1)
avg_stop_loss /= (num_iter + 1)
# Plot Validation Stats # Plot Validation Stats
epoch_stats = {"loss_postnet": avg_postnet_loss, epoch_stats = {
"loss_decoder": avg_decoder_loss, "loss_postnet": keep_avg['avg_postnet_loss'],
"stop_loss": avg_stop_loss} "loss_decoder": keep_avg['avg_decoder_loss'],
"stop_loss": keep_avg['avg_stop_loss']
}
tb_logger.tb_eval_stats(global_step, epoch_stats) tb_logger.tb_eval_stats(global_step, epoch_stats)
if args.rank == 0 and epoch > c.test_delay_epochs: if args.rank == 0 and epoch > c.test_delay_epochs:
@ -427,7 +473,11 @@ def evaluate(model, criterion, criterion_st, ap, global_step, epoch):
for idx, test_sentence in enumerate(test_sentences): for idx, test_sentence in enumerate(test_sentences):
try: try:
wav, alignment, decoder_output, postnet_output, stop_tokens = synthesis( wav, alignment, decoder_output, postnet_output, stop_tokens = synthesis(
model, test_sentence, c, use_cuda, ap, model,
test_sentence,
c,
use_cuda,
ap,
speaker_id=speaker_id, speaker_id=speaker_id,
style_wav=style_wav) style_wav=style_wav)
file_path = os.path.join(AUDIO_PATH, str(global_step)) file_path = os.path.join(AUDIO_PATH, str(global_step))
@ -436,18 +486,22 @@ def evaluate(model, criterion, criterion_st, ap, global_step, epoch):
"TestSentence_{}.wav".format(idx)) "TestSentence_{}.wav".format(idx))
ap.save_wav(wav, file_path) ap.save_wav(wav, file_path)
test_audios['{}-audio'.format(idx)] = wav test_audios['{}-audio'.format(idx)] = wav
test_figures['{}-prediction'.format(idx)] = plot_spectrogram(postnet_output, ap) test_figures['{}-prediction'.format(idx)] = plot_spectrogram(
test_figures['{}-alignment'.format(idx)] = plot_alignment(alignment) postnet_output, ap)
test_figures['{}-alignment'.format(idx)] = plot_alignment(
alignment)
except: except:
print(" !! Error creating Test Sentence -", idx) print(" !! Error creating Test Sentence -", idx)
traceback.print_exc() traceback.print_exc()
tb_logger.tb_test_audios(global_step, test_audios, c.audio['sample_rate']) tb_logger.tb_test_audios(global_step, test_audios,
c.audio['sample_rate'])
tb_logger.tb_test_figures(global_step, test_figures) tb_logger.tb_test_figures(global_step, test_figures)
return avg_postnet_loss return keep_avg['avg_postnet_loss']
# FIXME: move args definition/parsing inside of main? # FIXME: move args definition/parsing inside of main?
def main(args): # pylint: disable=redefined-outer-name def main(args): # pylint: disable=redefined-outer-name
global meta_data_train, meta_data_eval
# Audio processor # Audio processor
ap = AudioProcessor(**c.audio) ap = AudioProcessor(**c.audio)
@ -457,8 +511,12 @@ def main(args): #pylint: disable=redefined-outer-name
c.distributed["backend"], c.distributed["url"]) c.distributed["backend"], c.distributed["url"])
num_chars = len(phonemes) if c.use_phonemes else len(symbols) num_chars = len(phonemes) if c.use_phonemes else len(symbols)
# load data instances
meta_data_train, meta_data_eval = load_meta_data(c.datasets)
# parse speakers
if c.use_speaker_embedding: if c.use_speaker_embedding:
speakers = get_speakers(c.data_path, c.meta_file_train, c.dataset) speakers = get_speakers(meta_data_train)
if args.restore_path: if args.restore_path:
prev_out_path = os.path.dirname(args.restore_path) prev_out_path = os.path.dirname(args.restore_path)
speaker_mapping = load_speaker_mapping(prev_out_path) speaker_mapping = load_speaker_mapping(prev_out_path)
@ -467,8 +525,7 @@ def main(args): #pylint: disable=redefined-outer-name
"introduce new speakers to " \ "introduce new speakers to " \
"a previously trained model." "a previously trained model."
else: else:
speaker_mapping = {name: i speaker_mapping = {name: i for i, name in enumerate(speakers)}
for i, name in enumerate(speakers)}
save_speaker_mapping(OUT_PATH, speaker_mapping) save_speaker_mapping(OUT_PATH, speaker_mapping)
num_speakers = len(speaker_mapping) num_speakers = len(speaker_mapping)
print("Training with {} speakers: {}".format(num_speakers, print("Training with {} speakers: {}".format(num_speakers,
@ -480,18 +537,23 @@ def main(args): #pylint: disable=redefined-outer-name
print(" | > Num output units : {}".format(ap.num_freq), flush=True) print(" | > Num output units : {}".format(ap.num_freq), flush=True)
optimizer = RAdam(model.parameters(), lr=c.lr, weight_decay=0) params = set_weight_decay(model, c.wd)
optimizer = RAdam(params, lr=c.lr, weight_decay=0)
if c.stopnet and c.separate_stopnet: if c.stopnet and c.separate_stopnet:
optimizer_st = RAdam( optimizer_st = RAdam(model.decoder.stopnet.parameters(),
model.decoder.stopnet.parameters(), lr=c.lr, weight_decay=0) lr=c.lr,
weight_decay=0)
else: else:
optimizer_st = None optimizer_st = None
if c.loss_masking: if c.loss_masking:
criterion = L1LossMasked() if c.model in ["Tacotron", "TacotronGST"] else MSELossMasked() criterion = L1LossMasked() if c.model in ["Tacotron", "TacotronGST"
] else MSELossMasked()
else: else:
criterion = nn.L1Loss() if c.model in ["Tacotron", "TacotronGST"] else nn.MSELoss() criterion = nn.L1Loss() if c.model in ["Tacotron", "TacotronGST"
criterion_st = nn.BCEWithLogitsLoss() if c.stopnet else None ] else nn.MSELoss()
criterion_st = nn.BCEWithLogitsLoss(
pos_weight=torch.tensor(20.0)) if c.stopnet else None
if args.restore_path: if args.restore_path:
checkpoint = torch.load(args.restore_path) checkpoint = torch.load(args.restore_path)
@ -510,8 +572,8 @@ def main(args): #pylint: disable=redefined-outer-name
del model_dict del model_dict
for group in optimizer.param_groups: for group in optimizer.param_groups:
group['lr'] = c.lr group['lr'] = c.lr
print( print(" > Model restored from step %d" % checkpoint['step'],
" > Model restored from step %d" % checkpoint['step'], flush=True) flush=True)
args.restore_step = checkpoint['step'] args.restore_step = checkpoint['step']
else: else:
args.restore_step = 0 args.restore_step = 0
@ -527,8 +589,7 @@ def main(args): #pylint: disable=redefined-outer-name
model = apply_gradient_allreduce(model) model = apply_gradient_allreduce(model)
if c.lr_decay: if c.lr_decay:
scheduler = NoamLR( scheduler = NoamLR(optimizer,
optimizer,
warmup_steps=c.warmup_steps, warmup_steps=c.warmup_steps,
last_epoch=args.restore_step - 1) last_epoch=args.restore_step - 1)
else: else:
@ -550,11 +611,11 @@ def main(args): #pylint: disable=redefined-outer-name
print(" > Number of outputs per iteration:", model.decoder.r) print(" > Number of outputs per iteration:", model.decoder.r)
train_loss, global_step = train(model, criterion, criterion_st, train_loss, global_step = train(model, criterion, criterion_st,
optimizer, optimizer_st, scheduler, optimizer, optimizer_st, scheduler, ap,
ap, global_step, epoch) global_step, epoch)
val_loss = evaluate(model, criterion, criterion_st, ap, global_step, epoch) val_loss = evaluate(model, criterion, criterion_st, ap, global_step,
print( epoch)
" | > Training Loss: {:.5f} Validation Loss: {:.5f}".format( print(" | > Training Loss: {:.5f} Validation Loss: {:.5f}".format(
train_loss, val_loss), train_loss, val_loss),
flush=True) flush=True)
target_loss = train_loss target_loss = train_loss
@ -576,8 +637,7 @@ if __name__ == '__main__':
type=str, type=str,
help='Path to config file for training.', help='Path to config file for training.',
) )
parser.add_argument( parser.add_argument('--debug',
'--debug',
type=bool, type=bool,
default=True, default=True,
help='Do not verify commit integrity to run training.') help='Do not verify commit integrity to run training.')
@ -586,17 +646,14 @@ if __name__ == '__main__':
type=str, type=str,
default='', default='',
help='Defines the data path. It overwrites config.json.') help='Defines the data path. It overwrites config.json.')
parser.add_argument( parser.add_argument('--output_path',
'--output_path',
type=str, type=str,
help='path for training outputs.', help='path for training outputs.',
default='') default='')
parser.add_argument( parser.add_argument('--output_folder',
'--output_folder',
type=str, type=str,
default='', default='',
help='folder name for training outputs.' help='folder name for training outputs.')
)
# DISTRIBUTED # DISTRIBUTED
parser.add_argument( parser.add_argument(
@ -604,8 +661,7 @@ if __name__ == '__main__':
type=int, type=int,
default=0, default=0,
help='DISTRIBUTED: process rank for distributed training.') help='DISTRIBUTED: process rank for distributed training.')
parser.add_argument( parser.add_argument('--group_id',
'--group_id',
type=str, type=str,
default="", default="",
help='DISTRIBUTED: process group id.') help='DISTRIBUTED: process group id.')
@ -635,7 +691,8 @@ if __name__ == '__main__':
if args.restore_path: if args.restore_path:
new_fields["restore_path"] = args.restore_path new_fields["restore_path"] = args.restore_path
new_fields["github_branch"] = get_git_branch() new_fields["github_branch"] = get_git_branch()
copy_config_file(args.config_path, os.path.join(OUT_PATH, 'config.json'), new_fields) copy_config_file(args.config_path,
os.path.join(OUT_PATH, 'config.json'), new_fields)
os.chmod(AUDIO_PATH, 0o775) os.chmod(AUDIO_PATH, 0o775)
os.chmod(OUT_PATH, 0o775) os.chmod(OUT_PATH, 0o775)


@ -24,6 +24,7 @@ class AudioProcessor(object):
clip_norm=True, clip_norm=True,
griffin_lim_iters=None, griffin_lim_iters=None,
do_trim_silence=False, do_trim_silence=False,
sound_norm=False,
**_): **_):
print(" > Setting up Audio Processor...") print(" > Setting up Audio Processor...")
@ -45,6 +46,7 @@ class AudioProcessor(object):
self.max_norm = 1.0 if max_norm is None else float(max_norm) self.max_norm = 1.0 if max_norm is None else float(max_norm)
self.clip_norm = clip_norm self.clip_norm = clip_norm
self.do_trim_silence = do_trim_silence self.do_trim_silence = do_trim_silence
self.sound_norm = sound_norm
self.n_fft, self.hop_length, self.win_length = self._stft_parameters() self.n_fft, self.hop_length, self.win_length = self._stft_parameters()
members = vars(self) members = vars(self)
for key, value in members.items(): for key, value in members.items():
@ -210,11 +212,11 @@ class AudioProcessor(object):
return len(wav) return len(wav)
def trim_silence(self, wav): def trim_silence(self, wav):
""" Trim silent parts with a threshold and 0.1 sec margin """ """ Trim silent parts with a threshold and 0.01 sec margin """
margin = int(self.sample_rate * 0.1) margin = int(self.sample_rate * 0.01)
wav = wav[margin:-margin] wav = wav[margin:-margin]
return librosa.effects.trim( return librosa.effects.trim(
wav, top_db=40, frame_length=1024, hop_length=256)[0] wav, top_db=60, frame_length=self.win_length, hop_length=self.hop_length)[0]
@staticmethod @staticmethod
def mulaw_encode(wav, qc): def mulaw_encode(wav, qc):
@ -243,6 +245,8 @@ class AudioProcessor(object):
except ValueError: except ValueError:
print(f' [!] File cannot be trimmed for silence - {filename}') print(f' [!] File cannot be trimmed for silence - {filename}')
assert self.sample_rate == sr, "%s vs %s"%(self.sample_rate, sr) assert self.sample_rate == sr, "%s vs %s"%(self.sample_rate, sr)
if self.sound_norm:
x = x / x.max() * 0.9
return x return x
@staticmethod @staticmethod


@@ -31,7 +31,8 @@ def load_config(config_path):
def get_git_branch():
    try:
        out = subprocess.check_output(["git", "branch"]).decode("utf8")
-        current = next(line for line in out.split("\n") if line.startswith("*"))
+        current = next(line for line in out.split("\n")
+                       if line.startswith("*"))
        current.replace("* ", "")
    except subprocess.CalledProcessError:
        current = "inside_docker"
@@ -47,8 +48,8 @@ def get_commit_hash():
    # raise RuntimeError(
    #     " !! Commit before training to get the commit hash.")
    try:
-        commit = subprocess.check_output(['git', 'rev-parse', '--short',
-                                          'HEAD']).decode().strip()
+        commit = subprocess.check_output(
+            ['git', 'rev-parse', '--short', 'HEAD']).decode().strip()
        # Not copying .git folder into docker container
    except subprocess.CalledProcessError:
        commit = "0000000"
@@ -168,16 +169,42 @@ def lr_decay(init_lr, global_step, warmup_steps):
    return lr


-def weight_decay(optimizer, wd):
+def adam_weight_decay(optimizer):
    """
    Custom weight decay operation, not effecting grad values.
    """
    for group in optimizer.param_groups:
        for param in group['params']:
            current_lr = group['lr']
-            param.data = param.data.add(-wd * group['lr'], param.data)
+            weight_decay = group['weight_decay']
+            param.data = param.data.add(-weight_decay * group['lr'],
+                                        param.data)
    return optimizer, current_lr


+# pylint: disable=dangerous-default-value
+def set_weight_decay(model, weight_decay, skip_list={"decoder.attention.v", "rnn", "lstm", "gru", "embedding"}):
+    """
+    Skip biases, BatchNorm parameters, rnns.
+    and attention projection layer v
+    """
+    decay = []
+    no_decay = []
+    for name, param in model.named_parameters():
+        if not param.requires_grad:
+            continue
+        if len(param.shape) == 1 or any([skip_name in name for skip_name in skip_list]):
+            no_decay.append(param)
+        else:
+            decay.append(param)
+    return [{
+        'params': no_decay,
+        'weight_decay': 0.
+    }, {
+        'params': decay,
+        'weight_decay': weight_decay
+    }]
+
+
class NoamLR(torch.optim.lr_scheduler._LRScheduler):
    def __init__(self, optimizer, warmup_steps=0.1, last_epoch=-1):
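A rough usage sketch for the two helpers above on a toy model (the model, hyper-parameters and optimizer choice are placeholders; train.py may wire these up differently):

    import torch

    class ToyModel(torch.nn.Module):
        def __init__(self):
            super().__init__()
            self.embedding = torch.nn.Embedding(10, 8)
            self.rnn = torch.nn.GRU(8, 8, batch_first=True)
            self.linear = torch.nn.Linear(8, 4)

        def forward(self, idx):
            out, _ = self.rnn(self.embedding(idx))
            return self.linear(out)

    model = ToyModel()
    # biases, 'embedding' and 'rnn' parameters land in the zero-decay group,
    # only linear.weight keeps weight decay
    groups = set_weight_decay(model, weight_decay=1e-6)
    optimizer = torch.optim.Adam(groups, lr=1e-4)

    idx = torch.randint(0, 10, (2, 5))
    loss = model(idx).pow(2).mean()
    loss.backward()
    optimizer.step()
    # decoupled decay applied after the update, scaled by each group's
    # 'weight_decay' and learning rate
    optimizer, current_lr = adam_weight_decay(optimizer)
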
@@ -187,8 +214,8 @@ class NoamLR(torch.optim.lr_scheduler._LRScheduler):
    def get_lr(self):
        step = max(self.last_epoch, 1)
        return [
-            base_lr * self.warmup_steps**0.5 * min(
-                step * self.warmup_steps**-1.5, step**-0.5)
+            base_lr * self.warmup_steps**0.5 *
+            min(step * self.warmup_steps**-1.5, step**-0.5)
            for base_lr in self.base_lrs
        ]
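The reformatted expression is the usual Noam schedule: the learning rate grows linearly up to the base lr at step == warmup_steps, then decays proportionally to step**-0.5. A quick sketch of attaching it to an optimizer (the warmup value and per-iteration stepping are assumptions for the example):

    import torch

    params = [torch.nn.Parameter(torch.zeros(1))]
    optimizer = torch.optim.Adam(params, lr=1e-3)
    scheduler = NoamLR(optimizer, warmup_steps=4000)

    for step in range(1, 8001):
        optimizer.step()
        scheduler.step()   # one scheduler step per training iteration
    print(scheduler.get_lr()[0])  # ~7.1e-4 at step 8000, past the 1e-3 peak
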
@@ -243,8 +270,8 @@ def set_init_dict(model_dict, checkpoint, c):
    }
    # 4. overwrite entries in the existing state dict
    model_dict.update(pretrained_dict)
-    print(" | > {} / {} layers are restored.".format(
-        len(pretrained_dict), len(model_dict)))
+    print(" | > {} / {} layers are restored.".format(len(pretrained_dict),
+                                                     len(model_dict)))
    return model_dict
@@ -252,13 +279,13 @@ def setup_model(num_chars, num_speakers, c):
    print(" > Using model: {}".format(c.model))
    MyModel = importlib.import_module('TTS.models.' + c.model.lower())
    MyModel = getattr(MyModel, c.model)
-    if c.model.lower() in ["tacotron", "tacotrongst"]:
-        model = MyModel(
-            num_chars=num_chars,
+    if c.model.lower() in "tacotron":
+        model = MyModel(num_chars=num_chars,
            num_speakers=num_speakers,
            r=c.r,
            linear_dim=1025,
            mel_dim=80,
+            gst=c.use_gst,
            memory_size=c.memory_size,
            attn_win=c.windowing,
            attn_norm=c.attention_norm,
@@ -270,8 +297,7 @@ def setup_model(num_chars, num_speakers, c):
            location_attn=c.location_attn,
            separate_stopnet=c.separate_stopnet)
    elif c.model.lower() == "tacotron2":
-        model = MyModel(
-            num_chars=num_chars,
+        model = MyModel(num_chars=num_chars,
            num_speakers=num_speakers,
            r=c.r,
            attn_win=c.windowing,
@@ -290,7 +316,8 @@ def split_dataset(items):
    is_multi_speaker = False
    speakers = [item[-1] for item in items]
    is_multi_speaker = len(set(speakers)) > 1
-    eval_split_size = 500 if 500 < len(items) * 0.01 else int(len(items) * 0.01)
+    eval_split_size = 500 if len(items) * 0.01 > 500 else int(
+        len(items) * 0.01)
    np.random.seed(0)
    np.random.shuffle(items)
    if is_multi_speaker:
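Put differently, the evaluation split is 1% of the dataset, capped at 500 items; the counts below are illustrative:

    def eval_split_size(n_items):
        # same expression as above, written out as a helper
        return 500 if n_items * 0.01 > 500 else int(n_items * 0.01)

    print(eval_split_size(13100))   # 131 (an LJSpeech-sized corpus)
    print(eval_split_size(80000))   # 500 (capped)
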
@@ -314,3 +341,34 @@ def gradual_training_scheduler(global_step, config):
        if global_step >= values[0]:
            new_values = values
    return new_values[1], new_values[2]
+
+
+class KeepAverage():
+    def __init__(self):
+        self.avg_values = {}
+        self.iters = {}
+
+    def __getitem__(self, key):
+        return self.avg_values[key]
+
+    def add_value(self, name, init_val=0, init_iter=0):
+        self.avg_values[name] = init_val
+        self.iters[name] = init_iter
+
+    def update_value(self, name, value, weighted_avg=False):
+        if weighted_avg:
+            self.avg_values[name] = 0.99 * self.avg_values[name] + 0.01 * value
+            self.iters[name] += 1
+        else:
+            self.avg_values[name] = self.avg_values[name] * \
+                self.iters[name] + value
+            self.iters[name] += 1
+            self.avg_values[name] /= self.iters[name]
+
+    def add_values(self, name_dict):
+        for key, value in name_dict.items():
+            self.add_value(key, init_val=value)
+
+    def update_values(self, value_dict):
+        for key, value in value_dict.items():
+            self.update_value(key, value)
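A short sketch of how the new KeepAverage tracker behaves; the metric names here are made up for the example:

    avgs = KeepAverage()
    avgs.add_values({'avg_loss': 0, 'avg_align_score': 0})

    for loss, align_score in [(1.0, 0.2), (0.5, 0.4), (0.3, 0.6)]:
        avgs.update_values({'avg_loss': loss, 'avg_align_score': align_score})

    print(avgs['avg_loss'])         # ~0.6, the running mean of the three losses
    print(avgs['avg_align_score'])  # ~0.4
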

utils/measures.py Normal file (+11 lines)
View File

@@ -0,0 +1,11 @@
+def alignment_diagonal_score(alignments):
+    """
+    Compute how diagonal alignment predictions are. It is useful
+    to measure the alignment consistency of a model
+    Args:
+        alignments (torch.Tensor): batch of alignments.
+    Shape:
+        alignments : batch x decoder_steps x encoder_steps
+    """
+    return alignments.max(dim=1)[0].mean(dim=1).mean(dim=0).item()
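A quick sanity check of the new measure with toy tensors: a perfectly sharp, one-hot alignment scores 1.0, a uniform one scores 1/encoder_steps (the import path assumes the usual TTS package layout):

    import torch
    from TTS.utils.measures import alignment_diagonal_score

    # batch x decoder_steps x encoder_steps
    sharp = torch.eye(4).unsqueeze(0)        # each decoder step attends one input
    uniform = torch.full((1, 4, 4), 0.25)    # attention spread evenly

    print(alignment_diagonal_score(sharp))    # 1.0
    print(alignment_diagonal_score(uniform))  # 0.25
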

View File

@@ -25,9 +25,7 @@ def save_speaker_mapping(out_path, speaker_mapping):
        json.dump(speaker_mapping, f, indent=4)


-def get_speakers(data_root, meta_file, dataset_type):
+def get_speakers(items):
    """Returns a sorted, unique list of speakers in a given dataset."""
-    preprocessor = get_preprocessor_by_name(dataset_type)
-    items = preprocessor(data_root, meta_file)
    speakers = {e[2] for e in items}
    return sorted(speakers)
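The refactored helper now takes already-loaded metadata items instead of re-running the preprocessor. Assuming the (text, wav_path, speaker_name) item layout implied by the e[2] access above:

    items = [
        ("hello world", "wavs/a_0001.wav", "speaker_a"),
        ("how are you", "wavs/a_0002.wav", "speaker_a"),
        ("good morning", "wavs/b_0001.wav", "speaker_b"),
    ]
    print(get_speakers(items))  # ['speaker_a', 'speaker_b']
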