Merge pull request #279 from mozilla/dev

merging dev branch
Eren Gölge 2019-10-24 14:40:55 +02:00 committed by GitHub
commit 50088cbf3b
17 changed files with 866 additions and 764 deletions

View File

@ -4,13 +4,13 @@ yes | apt-get install ffmpeg
yes | apt-get install espeak
yes | apt-get install tmux
yes | apt-get install zsh
# pip3 install https://download.pytorch.org/whl/cu100/torch-1.1.0-cp37-cp37m-linux_x86_64.whl
# wget https://www.dropbox.com/s/m8waow6b3ydpf6h/MozillaDataset.tar.gz?dl=0 -O /data/rw/home/mozilla.tar
wget https://www.dropbox.com/s/wqn5v3wkktw9lmo/install.sh?dl=0 -O install.sh
pip3 install https://download.pytorch.org/whl/cu100/torch-1.3.0%2Bcu100-cp36-cp36m-linux_x86_64.whl
sudo sh install.sh
pip3 install torch==1.3.0+cu100 -f https://download.pytorch.org/whl/torch_stable.html
python3 setup.py develop
# cp -R ${USER_DIR}/GermanData ../tmp/
# python3 distribute.py --config_path config.json --data_path /data/ro/shared/data/keithito/LJSpeech-1.1/
# cp -R ${USER_DIR}/Mozilla_22050 ../tmp/
# python3 distribute.py --config_path config_tacotron_gst.json --data_path ../tmp/Mozilla_22050/
# python3 distribute.py --config_path config.json --data_path /data/rw/home/LibriTTS/train-clean-360
# python3 distribute.py --config_path config.json
while true; do sleep 1000000; done

View File

@ -1,6 +1,6 @@
{
"run_name": "ljspeech",
"run_description": "gradual training with prenet frame size 1 + no maxout for cbhg + symmetric norm.",
"run_description": "Tacotron ljspeech release training",
"audio":{
// Audio processing parameters
@ -55,20 +55,16 @@
"batch_size": 32, // Batch size for training. Lower values than 32 might cause hard to learn attention. It is overwritten by 'gradual_training'.
"eval_batch_size":16,
"r": 7, // Number of decoder frames to predict per iteration. Set the initial values if gradual training is enabled.
"gradual_training": [[0, 7, 32], [10000, 5, 32], [50000, 3, 32], [130000, 2, 16], [290000, 1, 8]], // ONLY TACOTRON - set gradual training steps [first_step, r, batch_size]. If it is null, gradual training is disabled.
"gradual_training": [[0, 7, 32], [1, 5, 32], [50000, 3, 32], [130000, 2, 16], [290000, 1, 8]], // ONLY TACOTRON - set gradual training steps [first_step, r, batch_size]. If it is null, gradual training is disabled.
"wd": 0.000001, // Weight decay weight.
"checkpoint": true, // If true, it saves checkpoints per "save_step"
"save_step": 10000, // Number of training steps expected to save traning stats and checkpoints.
"save_step": 10000, // Number of training steps expected to save traninpg stats and checkpoints.
"print_step": 25, // Number of steps to log traning on console.
"batch_group_size": 0, //Number of batches to shuffle after bucketing.
"run_eval": true,
"test_delay_epochs": 5, //Until attention is aligned, testing only wastes computation time.
"test_sentences_file": null, // set a file to load sentences to be used for testing. If it is null then we use default english sentences.
"data_path": "/home/erogol/Data/LJSpeech-1.1/", // DATASET-RELATED: can overwritten from command argument
"meta_file_train": "metadata_train.csv", // DATASET-RELATED: metafile for training dataloader.
"meta_file_val": "metadata_val.csv", // DATASET-RELATED: metafile for evaluation dataloader.
"dataset": "ljspeech", // DATASET-RELATED: one of TTS.dataset.preprocessors depending on your target dataset. Use "tts_cache" for pre-computed dataset by extract_features.py
"min_seq_len": 6, // DATASET-RELATED: minimum text length to use in training
"max_seq_len": 150, // DATASET-RELATED: maximum text length
"output_path": "../keep/", // DATASET-RELATED: output path for all training outputs.
@ -79,6 +75,18 @@
"phoneme_language": "en-us", // depending on your target language, pick one from https://github.com/bootphon/phonemizer#languages
"text_cleaner": "phoneme_cleaners",
"use_speaker_embedding": false, // use speaker embedding to enable multi-speaker learning.
"style_wav_for_test": null // path to style wav file to be used in TacotronGST inference.
"style_wav_for_test": null, // path to style wav file to be used in TacotronGST inference.
"use_gst": false, // TACOTRON ONLY: use global style tokens
"datasets": // List of datasets. They all merged and they get different speaker_ids.
[
{
"name": "ljspeech",
"path": "/data/ro/shared/data/keithito/LJSpeech-1.1/",
"meta_file_train": "metadata_train.csv",
"meta_file_val": "metadata_val.csv"
}
]
}
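
For reference, a minimal sketch of how a trainer can consume the "gradual_training" schedule above: take the last [first_step, r, batch_size] entry whose first_step has been reached. The function name and the config access here are illustrative, not necessarily the exact API in train.py.

def gradual_training_scheduler(global_step, gradual_training):
    # gradual_training: list of [first_step, r, batch_size] entries,
    # e.g. [[0, 7, 32], [1, 5, 32], [50000, 3, 32], ...]
    new_values = gradual_training[0]
    for values in gradual_training:
        if global_step >= values[0]:
            new_values = values
    return new_values[1], new_values[2]

# At step 60000 the schedule above yields r=3, batch_size=32:
# r, batch_size = gradual_training_scheduler(60000, config["gradual_training"])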

View File

@ -2,6 +2,27 @@ import os
from glob import glob
import re
import sys
from TTS.utils.generic_utils import split_dataset
def load_meta_data(datasets):
meta_data_train_all = []
meta_data_eval_all = []
for dataset in datasets:
name = dataset['name']
root_path = dataset['path']
meta_file_train = dataset['meta_file_train']
meta_file_val = dataset['meta_file_val']
preprocessor = get_preprocessor_by_name(name)
meta_data_train = preprocessor(root_path, meta_file_train)
if meta_file_val is None:
meta_data_eval, meta_data_train = split_dataset(meta_data_train)
else:
meta_data_eval = preprocessor(root_path, meta_file_val)
meta_data_train_all += meta_data_train
meta_data_eval_all += meta_data_eval
return meta_data_train_all, meta_data_eval_all
def get_preprocessor_by_name(name):
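
A hedged usage sketch for the new load_meta_data API, wired to the "datasets" list from config.json above (the path and the None-valued meta_file_val are illustrative):

datasets = [{
    "name": "ljspeech",
    "path": "/data/ro/shared/data/keithito/LJSpeech-1.1/",
    "meta_file_train": "metadata_train.csv",
    "meta_file_val": None,  # None triggers split_dataset() on the train set
}]
meta_data_train, meta_data_eval = load_meta_data(datasets)
# Items from every dataset are concatenated into single train/eval lists.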

View File

@ -1,3 +1,5 @@
import numpy as np
import torch
from torch import nn
from torch.nn import functional
from TTS.utils.generic_utils import sequence_mask
@ -53,3 +55,18 @@ class MSELossMasked(nn.Module):
x * mask, target * mask, reduction="sum")
loss = loss / mask.sum()
return loss
class AttentionEntropyLoss(nn.Module):
# pylint: disable=R0201
def forward(self, align):
"""
Forces attention to be more decisive by penalizing
soft attention weights
TODO: arguments
TODO: unit_test
"""
entropy = torch.distributions.Categorical(probs=align).entropy()
loss = (entropy / np.log(align.shape[1])).mean()
return loss
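
As a quick, illustrative sanity check of the normalized-entropy penalty (assuming align holds per-step attention weights over align.shape[1] positions): a uniform alignment gives the maximum loss of 1.0, while a sharply peaked one is close to 0.

import numpy as np
import torch

align = torch.full((4, 8), 1.0 / 8)  # maximally soft attention
entropy = torch.distributions.Categorical(probs=align).entropy()
print((entropy / np.log(align.shape[1])).mean())  # tensor(1.)

align = torch.tensor([[0.993] + [0.001] * 7] * 4)  # nearly one-hot
entropy = torch.distributions.Categorical(probs=align).entropy()
print((entropy / np.log(align.shape[1])).mean())  # ~0.027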

View File

@ -273,7 +273,7 @@ class Decoder(nn.Module):
def __init__(self, in_features, memory_dim, r, memory_size, attn_windowing,
attn_norm, prenet_type, prenet_dropout, forward_attn,
trans_agent, forward_attn_mask, location_attn,
separate_stopnet):
separate_stopnet, speaker_embedding_dim):
super(Decoder, self).__init__()
self.r_init = r
self.r = r
@ -285,8 +285,9 @@ class Decoder(nn.Module):
self.separate_stopnet = separate_stopnet
self.query_dim = 256
# memory -> |Prenet| -> processed_memory
prenet_dim = memory_dim * self.memory_size + speaker_embedding_dim if self.use_memory_queue else memory_dim + speaker_embedding_dim
self.prenet = Prenet(
memory_dim * self.memory_size if self.use_memory_queue else memory_dim,
prenet_dim,
prenet_type,
prenet_dropout,
out_features=[256, 128])
@ -339,13 +340,13 @@ class Decoder(nn.Module):
T = inputs.size(1)
# go frame as zeros matrix
if self.use_memory_queue:
self.memory_input = torch.zeros(B, self.memory_dim * self.memory_size, device=inputs.device)
self.memory_input = torch.zeros(1, device=inputs.device).repeat(B, self.memory_dim * self.memory_size)
else:
self.memory_input = torch.zeros(B, self.memory_dim, device=inputs.device)
self.memory_input = torch.zeros(1, device=inputs.device).repeat(B, self.memory_dim)
# decoder states
self.attention_rnn_hidden = torch.zeros(B, 256, device=inputs.device)
self.attention_rnn_hidden = torch.zeros(1, device=inputs.device).repeat(B, 256)
self.decoder_rnn_hiddens = [
torch.zeros(B, 256, device=inputs.device)
torch.zeros(1, device=inputs.device).repeat(B, 256)
for idx in range(len(self.decoder_rnns))
]
self.context_vec = inputs.data.new(B, self.in_features).zero_()
@ -405,9 +406,9 @@ class Decoder(nn.Module):
self.memory_input = new_memory[:, :self.memory_size * self.memory_dim]
else:
# use only the last frame prediction
self.memory_input = new_memory[:, :self.memory_dim]
self.memory_input = new_memory[:, self.memory_dim * (self.r - 1):]
def forward(self, inputs, memory, mask):
def forward(self, inputs, memory, mask, speaker_embeddings=None):
"""
Args:
inputs: Encoder outputs.
@ -432,6 +433,8 @@ class Decoder(nn.Module):
if t > 0:
new_memory = memory[t - 1]
self._update_memory_input(new_memory)
if speaker_embeddings is not None:
self.memory_input = torch.cat([self.memory_input, speaker_embeddings], dim=-1)
output, stop_token, attention = self.decode(inputs, mask)
outputs += [output]
attentions += [attention]
@ -440,13 +443,15 @@ class Decoder(nn.Module):
return self._parse_outputs(outputs, attentions, stop_tokens)
def inference(self, inputs):
def inference(self, inputs, speaker_embeddings=None):
"""
Args:
inputs: Encoder outputs.
inputs: encoder outputs.
speaker_embeddings: speaker vectors.
Shapes:
- inputs: batch x time x encoder_out_dim
- speaker_embeddings: batch x embed_dim
"""
outputs = []
attentions = []
@ -459,6 +464,8 @@ class Decoder(nn.Module):
if t > 0:
new_memory = outputs[-1]
self._update_memory_input(new_memory)
if speaker_embeddings is not None:
self.memory_input = torch.cat([self.memory_input, speaker_embeddings], dim=-1)
output, stop_token, attention = self.decode(inputs, None)
stop_token = torch.sigmoid(stop_token.data)
outputs += [output]
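
The concatenation above widens the prenet input by speaker_embedding_dim, which is what the new prenet_dim expression earlier in this file accounts for. A shape sketch with illustrative numbers:

import torch

B, memory_dim, memory_size, spk_dim = 4, 80, 4, 80
memory_input = torch.zeros(B, memory_dim * memory_size)      # [4, 320]
speaker_embeddings = torch.rand(B, spk_dim)                  # [4, 80]
prenet_in = torch.cat([memory_input, speaker_embeddings], dim=-1)
assert prenet_in.shape == (B, memory_dim * memory_size + spk_dim)  # [4, 400]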

View File

@ -10,8 +10,10 @@ class ConvBNBlock(nn.Module):
super(ConvBNBlock, self).__init__()
assert (kernel_size - 1) % 2 == 0
padding = (kernel_size - 1) // 2
conv1d = nn.Conv1d(
in_channels, out_channels, kernel_size, padding=padding)
conv1d = nn.Conv1d(in_channels,
out_channels,
kernel_size,
padding=padding)
norm = nn.BatchNorm1d(out_channels)
dropout = nn.Dropout(p=0.5)
if nonlinear == 'relu':
@ -52,8 +54,7 @@ class Encoder(nn.Module):
convolutions.append(
ConvBNBlock(in_features, in_features, 5, 'relu'))
self.convolutions = nn.Sequential(*convolutions)
self.lstm = nn.LSTM(
in_features,
self.lstm = nn.LSTM(in_features,
int(in_features / 2),
num_layers=1,
batch_first=True,
@ -64,8 +65,9 @@ class Encoder(nn.Module):
x = self.convolutions(x)
x = x.transpose(1, 2)
input_lengths = input_lengths.cpu().numpy()
x = nn.utils.rnn.pack_padded_sequence(
x, input_lengths, batch_first=True)
x = nn.utils.rnn.pack_padded_sequence(x,
input_lengths,
batch_first=True)
self.lstm.flatten_parameters()
outputs, _ = self.lstm(x)
outputs, _ = nn.utils.rnn.pad_packed_sequence(
@ -101,6 +103,7 @@ class Decoder(nn.Module):
forward_attn_mask, location_attn, separate_stopnet):
super(Decoder, self).__init__()
self.mel_channels = inputs_dim
self.r_init = r
self.r = r
self.encoder_embedding_dim = in_features
self.separate_stopnet = separate_stopnet
@ -111,10 +114,11 @@ class Decoder(nn.Module):
self.gate_threshold = 0.5
self.p_attention_dropout = 0.1
self.p_decoder_dropout = 0.1
self.prenet = Prenet(self.mel_channels * r, prenet_type,
self.prenet = Prenet(self.mel_channels,
prenet_type,
prenet_dropout,
[self.prenet_dim, self.prenet_dim], bias=False)
[self.prenet_dim, self.prenet_dim],
bias=False)
self.attention_rnn = nn.LSTMCell(self.prenet_dim + in_features,
self.query_dim)
@ -135,51 +139,46 @@ class Decoder(nn.Module):
self.decoder_rnn_dim, 1)
self.linear_projection = Linear(self.decoder_rnn_dim + in_features,
self.mel_channels * r)
self.mel_channels * self.r_init)
self.stopnet = nn.Sequential(
nn.Dropout(0.1),
Linear(
self.decoder_rnn_dim + self.mel_channels * r,
Linear(self.decoder_rnn_dim + self.mel_channels * self.r_init,
1,
bias=True,
init_gain='sigmoid'))
self.attention_rnn_init = nn.Embedding(1, self.query_dim)
self.go_frame_init = nn.Embedding(1, self.mel_channels * r)
self.decoder_rnn_inits = nn.Embedding(1, self.decoder_rnn_dim)
self.memory_truncated = None
def set_r(self, new_r):
self.r = new_r
def get_go_frame(self, inputs):
B = inputs.size(0)
memory = self.go_frame_init(inputs.data.new_zeros(B).long())
memory = torch.zeros(1, device=inputs.device).repeat(B,
self.mel_channels * self.r)
return memory
def _init_states(self, inputs, mask, keep_states=False):
B = inputs.size(0)
# T = inputs.size(1)
if not keep_states:
self.query = self.attention_rnn_init(
inputs.data.new_zeros(B).long())
self.attention_rnn_cell_state = Variable(
inputs.data.new(B, self.query_dim).zero_())
self.decoder_hidden = self.decoder_rnn_inits(
inputs.data.new_zeros(B).long())
self.decoder_cell = Variable(
inputs.data.new(B, self.decoder_rnn_dim).zero_())
self.context = Variable(
inputs.data.new(B, self.encoder_embedding_dim).zero_())
self.query = torch.zeros(1, device=inputs.device).repeat(
B, self.query_dim)
self.attention_rnn_cell_state = torch.zeros(
1, device=inputs.device).repeat(B, self.query_dim)
self.decoder_hidden = torch.zeros(1, device=inputs.device).repeat(
B, self.decoder_rnn_dim)
self.decoder_cell = torch.zeros(1, device=inputs.device).repeat(
B, self.decoder_rnn_dim)
self.context = torch.zeros(1, device=inputs.device).repeat(
B, self.encoder_embedding_dim)
self.inputs = inputs
self.processed_inputs = self.attention.inputs_layer(inputs)
self.mask = mask
def _reshape_memory(self, memories):
memories = memories.view(
memories.size(0), int(memories.size(1) / self.r), -1)
memories = memories.view(memories.size(0),
int(memories.size(1) / self.r), -1)
memories = memories.transpose(0, 1)
return memories
@ -192,14 +191,20 @@ class Decoder(nn.Module):
outputs = outputs.transpose(1, 2)
return outputs, stop_tokens, alignments
def _update_memory(self, memory):
if len(memory.shape) == 2:
return memory[:, self.mel_channels * (self.r - 1):]
return memory[:, :, self.mel_channels * (self.r - 1):]
def decode(self, memory):
query_input = torch.cat((memory, self.context), -1)
self.query, self.attention_rnn_cell_state = self.attention_rnn(
query_input, (self.query, self.attention_rnn_cell_state))
self.query = F.dropout(
self.query, self.p_attention_dropout, self.training)
self.query = F.dropout(self.query, self.p_attention_dropout,
self.training)
self.attention_rnn_cell_state = F.dropout(
self.attention_rnn_cell_state, self.p_attention_dropout, self.training)
self.attention_rnn_cell_state, self.p_attention_dropout,
self.training)
self.context = self.attention(self.query, self.inputs,
self.processed_inputs, self.mask)
@ -223,13 +228,14 @@ class Decoder(nn.Module):
stop_token = self.stopnet(stopnet_input.detach())
else:
stop_token = self.stopnet(stopnet_input)
decoder_output = decoder_output[:, :self.r * self.mel_channels]
return decoder_output, stop_token, self.attention.attention_weights
def forward(self, inputs, memories, mask):
memory = self.get_go_frame(inputs).unsqueeze(0)
memories = self._reshape_memory(memories)
memories = torch.cat((memory, memories), dim=0)
memories = self.prenet(memories)
memories = self.prenet(self._update_memory(memories))
self._init_states(inputs, mask=mask)
self.attention.init_states(inputs)
@ -249,6 +255,8 @@ class Decoder(nn.Module):
def inference(self, inputs):
memory = self.get_go_frame(inputs)
memory = self._update_memory(memory)
self._init_states(inputs, mask=None)
self.attention.init_win_idx()
@ -256,7 +264,6 @@ class Decoder(nn.Module):
outputs, stop_tokens, alignments, t = [], [], [], 0
stop_flags = [True, False, False]
stop_count = 0
while True:
memory = self.prenet(memory)
mel_output, stop_token, alignment = self.decode(memory)
@ -270,14 +277,12 @@ class Decoder(nn.Module):
and t > inputs.shape[1])
stop_flags[2] = t > inputs.shape[1] * 2
if all(stop_flags):
stop_count += 1
if stop_count > 20:
break
elif len(outputs) == self.max_decoder_steps:
if len(outputs) == self.max_decoder_steps:
print(" | > Decoder stopped with 'max_decoder_steps")
break
memory = mel_output
memory = self._update_memory(mel_output)
t += 1
outputs, stop_tokens, alignments = self._parse_outputs(
@ -299,7 +304,6 @@ class Decoder(nn.Module):
self.attention.init_states(inputs)
outputs, stop_tokens, alignments, t = [], [], [], 0
stop_flags = [True, False, False]
stop_count = 0
while True:
memory = self.prenet(self.memory_truncated)
mel_output, stop_token, alignment = self.decode(memory)
@ -313,10 +317,8 @@ class Decoder(nn.Module):
and t > inputs.shape[1])
stop_flags[2] = t > inputs.shape[1] * 2
if all(stop_flags):
stop_count += 1
if stop_count > 20:
break
elif len(outputs) == self.max_decoder_steps:
if len(outputs) == self.max_decoder_steps:
print(" | > Decoder stopped with 'max_decoder_steps")
break
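
Taken together with set_r, these changes let gradual training shrink r at run time without resizing any layer: the linear projection always emits r_init frames, decode() slices off the first r of them, and _update_memory feeds back only the last predicted frame. A sketch with illustrative numbers:

import torch

B, mel_channels, r_init, r = 4, 80, 7, 2
decoder_output = torch.rand(B, mel_channels * r_init)  # always r_init frames
decoder_output = decoder_output[:, :r * mel_channels]  # keep the current r
memory = decoder_output[:, mel_channels * (r - 1):]    # last frame only
assert memory.shape == (B, mel_channels)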

View File

@ -1,7 +1,9 @@
# coding: utf-8
import torch
from torch import nn
from TTS.layers.tacotron import Encoder, Decoder, PostCBHG
from TTS.utils.generic_utils import sequence_mask
from TTS.layers.gst_layers import GST
class Tacotron(nn.Module):
@ -13,6 +15,7 @@ class Tacotron(nn.Module):
mel_dim=80,
memory_size=5,
attn_win=False,
gst=False,
attn_norm="sigmoid",
prenet_type="original",
prenet_dropout=True,
@ -25,55 +28,117 @@ class Tacotron(nn.Module):
self.r = r
self.mel_dim = mel_dim
self.linear_dim = linear_dim
self.gst = gst
self.num_speakers = num_speakers
self.embedding = nn.Embedding(num_chars, 256)
self.embedding.weight.data.normal_(0, 0.3)
decoder_dim = 512 if num_speakers > 1 else 256
encoder_dim = 512 if num_speakers > 1 else 256
proj_speaker_dim = 80 if num_speakers > 1 else 0
# boilerplate model
self.encoder = Encoder(encoder_dim)
self.decoder = Decoder(decoder_dim, mel_dim, r, memory_size, attn_win,
attn_norm, prenet_type, prenet_dropout,
forward_attn, trans_agent, forward_attn_mask,
location_attn, separate_stopnet,
proj_speaker_dim)
self.postnet = PostCBHG(mel_dim)
self.last_linear = nn.Linear(self.postnet.cbhg.gru_features * 2,
linear_dim)
# speaker embedding layers
if num_speakers > 1:
self.speaker_embedding = nn.Embedding(num_speakers, 256)
self.speaker_embedding.weight.data.normal_(0, 0.3)
self.encoder = Encoder(256)
self.decoder = Decoder(256, mel_dim, r, memory_size, attn_win,
attn_norm, prenet_type, prenet_dropout,
forward_attn, trans_agent, forward_attn_mask,
location_attn, separate_stopnet)
self.postnet = PostCBHG(mel_dim)
self.last_linear = nn.Linear(self.postnet.cbhg.gru_features * 2, linear_dim)
self.speaker_project_mel = nn.Sequential(
nn.Linear(256, proj_speaker_dim), nn.Tanh())
self.speaker_embeddings = None
self.speaker_embeddings_projected = None
# global style token layers
if self.gst:
gst_embedding_dim = 256
self.gst_layer = GST(num_mel=80,
num_heads=4,
num_style_tokens=10,
embedding_dim=gst_embedding_dim)
def _init_states(self):
self.speaker_embeddings = None
self.speaker_embeddings_projected = None
def compute_speaker_embedding(self, speaker_ids):
if hasattr(self, "speaker_embedding") and speaker_ids is None:
raise RuntimeError(
" [!] Model has speaker embedding layer but speaker_id is not provided"
)
if hasattr(self, "speaker_embedding") and speaker_ids is not None:
self.speaker_embeddings = self._compute_speaker_embedding(
speaker_ids)
self.speaker_embeddings_projected = self.speaker_project_mel(
self.speaker_embeddings).squeeze(1)
def compute_gst(self, inputs, mel_specs):
gst_outputs = self.gst_layer(mel_specs)
inputs = self._add_speaker_embedding(inputs, gst_outputs)
return inputs
def forward(self, characters, text_lengths, mel_specs, speaker_ids=None):
B = characters.size(0)
mask = sequence_mask(text_lengths).to(characters.device)
inputs = self.embedding(characters)
self._init_states()
self.compute_speaker_embedding(speaker_ids)
if self.num_speakers > 1:
inputs = self._concat_speaker_embedding(inputs,
self.speaker_embeddings)
encoder_outputs = self.encoder(inputs)
encoder_outputs = self._add_speaker_embedding(encoder_outputs,
speaker_ids)
if self.gst:
encoder_outputs = self.compute_gst(encoder_outputs, mel_specs)
if self.num_speakers > 1:
encoder_outputs = self._concat_speaker_embedding(
encoder_outputs, self.speaker_embeddings)
mel_outputs, alignments, stop_tokens = self.decoder(
encoder_outputs, mel_specs, mask)
encoder_outputs, mel_specs, mask,
self.speaker_embeddings_projected)
mel_outputs = mel_outputs.view(B, -1, self.mel_dim)
linear_outputs = self.postnet(mel_outputs)
linear_outputs = self.last_linear(linear_outputs)
return mel_outputs, linear_outputs, alignments, stop_tokens
def inference(self, characters, speaker_ids=None):
def inference(self, characters, speaker_ids=None, style_mel=None):
B = characters.size(0)
inputs = self.embedding(characters)
self._init_states()
self.compute_speaker_embedding(speaker_ids)
if self.num_speakers > 1:
inputs = self._concat_speaker_embedding(inputs,
self.speaker_embeddings)
encoder_outputs = self.encoder(inputs)
encoder_outputs = self._add_speaker_embedding(encoder_outputs,
speaker_ids)
if self.gst and style_mel is not None:
encoder_outputs = self.compute_gst(encoder_outputs, style_mel)
if self.num_speakers > 1:
encoder_outputs = self._concat_speaker_embedding(
encoder_outputs, self.speaker_embeddings)
mel_outputs, alignments, stop_tokens = self.decoder.inference(
encoder_outputs)
encoder_outputs, self.speaker_embeddings_projected)
mel_outputs = mel_outputs.view(B, -1, self.mel_dim)
linear_outputs = self.postnet(mel_outputs)
linear_outputs = self.last_linear(linear_outputs)
return mel_outputs, linear_outputs, alignments, stop_tokens
def _add_speaker_embedding(self, encoder_outputs, speaker_ids):
if hasattr(self, "speaker_embedding") and speaker_ids is None:
raise RuntimeError(" [!] Model has speaker embedding layer but speaker_id is not provided")
if hasattr(self, "speaker_embedding") and speaker_ids is not None:
def _compute_speaker_embedding(self, speaker_ids):
speaker_embeddings = self.speaker_embedding(speaker_ids)
return speaker_embeddings.unsqueeze_(1)
speaker_embeddings.unsqueeze_(1)
speaker_embeddings = speaker_embeddings.expand(encoder_outputs.size(0),
encoder_outputs.size(1),
-1)
encoder_outputs = encoder_outputs + speaker_embeddings
return encoder_outputs
@staticmethod
def _add_speaker_embedding(outputs, speaker_embeddings):
speaker_embeddings_ = speaker_embeddings.expand(
outputs.size(0), outputs.size(1), -1)
outputs = outputs + speaker_embeddings_
return outputs
@staticmethod
def _concat_speaker_embedding(outputs, speaker_embeddings):
speaker_embeddings_ = speaker_embeddings.expand(
outputs.size(0), outputs.size(1), -1)
outputs = torch.cat([outputs, speaker_embeddings_], dim=-1)
return outputs
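
The two static helpers differ only in how the broadcast speaker vector is combined: _add_speaker_embedding sums it into the features (as compute_gst does with style-token outputs), while _concat_speaker_embedding widens them, which is why encoder_dim and decoder_dim grow to 512 when num_speakers > 1. Illustrative shapes:

import torch

B, T, D = 4, 10, 256
outputs = torch.rand(B, T, D)
speaker_embeddings = torch.rand(B, 1, D)  # from _compute_speaker_embedding
expanded = speaker_embeddings.expand(B, T, -1)
added = outputs + expanded                       # [4, 10, 256], dims unchanged
concat = torch.cat([outputs, expanded], dim=-1)  # [4, 10, 512]
assert concat.shape == (B, T, 2 * D)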

View File

@ -1,87 +0,0 @@
# coding: utf-8
from torch import nn
from TTS.layers.tacotron import Encoder, Decoder, PostCBHG
from TTS.layers.gst_layers import GST
from TTS.utils.generic_utils import sequence_mask
class TacotronGST(nn.Module):
def __init__(self,
num_chars,
num_speakers,
r=5,
linear_dim=1025,
mel_dim=80,
memory_size=5,
attn_win=False,
attn_norm="sigmoid",
prenet_type="original",
prenet_dropout=True,
forward_attn=False,
trans_agent=False,
forward_attn_mask=False,
location_attn=True,
separate_stopnet=True):
super(TacotronGST, self).__init__()
self.r = r
self.mel_dim = mel_dim
self.linear_dim = linear_dim
self.embedding = nn.Embedding(num_chars, 256)
self.embedding.weight.data.normal_(0, 0.3)
if num_speakers > 1:
self.speaker_embedding = nn.Embedding(num_speakers, 256)
self.speaker_embedding.weight.data.normal_(0, 0.3)
self.encoder = Encoder(256)
self.gst = GST(num_mel=80, num_heads=4, num_style_tokens=10, embedding_dim=256)
self.decoder = Decoder(256, mel_dim, r, memory_size, attn_win,
attn_norm, prenet_type, prenet_dropout,
forward_attn, trans_agent, forward_attn_mask,
location_attn, separate_stopnet)
self.postnet = PostCBHG(mel_dim)
self.last_linear = nn.Linear(self.postnet.cbhg.gru_features * 2, linear_dim)
def forward(self, characters, text_lengths, mel_specs, speaker_ids=None):
B = characters.size(0)
mask = sequence_mask(text_lengths).to(characters.device)
inputs = self.embedding(characters)
encoder_outputs = self.encoder(inputs)
encoder_outputs = self._add_speaker_embedding(encoder_outputs,
speaker_ids)
gst_outputs = self.gst(mel_specs)
gst_outputs = gst_outputs.expand(-1, encoder_outputs.size(1), -1)
encoder_outputs = encoder_outputs + gst_outputs
mel_outputs, alignments, stop_tokens = self.decoder(
encoder_outputs, mel_specs, mask)
mel_outputs = mel_outputs.view(B, -1, self.mel_dim)
linear_outputs = self.postnet(mel_outputs)
linear_outputs = self.last_linear(linear_outputs)
return mel_outputs, linear_outputs, alignments, stop_tokens
def inference(self, characters, speaker_ids=None, style_mel=None):
B = characters.size(0)
inputs = self.embedding(characters)
encoder_outputs = self.encoder(inputs)
encoder_outputs = self._add_speaker_embedding(encoder_outputs,
speaker_ids)
if style_mel is not None:
gst_outputs = self.gst(style_mel)
gst_outputs = gst_outputs.expand(-1, encoder_outputs.size(1), -1)
encoder_outputs = encoder_outputs + gst_outputs
mel_outputs, alignments, stop_tokens = self.decoder.inference(
encoder_outputs)
mel_outputs = mel_outputs.view(B, -1, self.mel_dim)
linear_outputs = self.postnet(mel_outputs)
linear_outputs = self.last_linear(linear_outputs)
return mel_outputs, linear_outputs, alignments, stop_tokens
def _add_speaker_embedding(self, encoder_outputs, speaker_ids):
if hasattr(self, "speaker_embedding") and speaker_ids is not None:
speaker_embeddings = self.speaker_embedding(speaker_ids)
speaker_embeddings.unsqueeze_(1)
speaker_embeddings = speaker_embeddings.expand(encoder_outputs.size(0),
encoder_outputs.size(1),
-1)
encoder_outputs = encoder_outputs + speaker_embeddings
return encoder_outputs

View File

@ -19,7 +19,7 @@
},
{
"cell_type": "code",
"execution_count": 1,
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
@ -29,28 +29,11 @@
},
{
"cell_type": "code",
"execution_count": 2,
"execution_count": null,
"metadata": {
"scrolled": true
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Populating the interactive namespace from numpy and matplotlib\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"/home/erogol/miniconda3/lib/python3.7/site-packages/IPython/core/magics/pylab.py:160: UserWarning: pylab import has clobbered these variables: ['plt']\n",
"`%matplotlib` prevents importing * from pylab and numpy\n",
" \"\\n`%matplotlib` prevents importing * from pylab and numpy\"\n"
]
}
],
"outputs": [],
"source": [
"%load_ext autoreload\n",
"%autoreload 2\n",
@ -59,6 +42,7 @@
"import io\n",
"import torch \n",
"import time\n",
"import json\n",
"import numpy as np\n",
"from collections import OrderedDict\n",
"from matplotlib import pylab as plt\n",
@ -86,23 +70,25 @@
"from IPython.display import Audio\n",
"\n",
"import os\n",
"os.environ['CUDA_VISIBLE_DEVICES']='1'\n",
"os.environ['OMP_NUM_THREADS']='1'\n"
"os.environ['CUDA_VISIBLE_DEVICES']='1'"
]
},
{
"cell_type": "code",
"execution_count": 3,
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"def tts(model, text, CONFIG, use_cuda, ap, use_gl, speaker_id=None, figures=True):\n",
"def tts(model, text, CONFIG, use_cuda, ap, use_gl, figures=True):\n",
" t_1 = time.time()\n",
" waveform, alignment, mel_spec, mel_postnet_spec, stop_tokens = synthesis(model, text, CONFIG, use_cuda, ap, truncated=False, speaker_id=speaker_id, enable_eos_bos_chars=CONFIG.enable_eos_bos_chars)\n",
" waveform, alignment, mel_spec, mel_postnet_spec, stop_tokens = synthesis(model, text, CONFIG, use_cuda, ap, speaker_id, False, CONFIG.enable_eos_bos_chars)\n",
" if CONFIG.model == \"Tacotron\" and not use_gl:\n",
" # coorect the normalization differences b/w TTS and the Vocoder.\n",
" mel_postnet_spec = ap.out_linear_to_mel(mel_postnet_spec.T).T\n",
" mel_postnet_spec = ap._denormalize(mel_postnet_spec)\n",
" mel_postnet_spec = ap_vocoder._normalize(mel_postnet_spec)\n",
" if not use_gl:\n",
" waveform = wavernn.generate(torch.FloatTensor(mel_postnet_spec.T).unsqueeze(0).cuda(), batched=batched_wavernn, target=11000, overlap=550)\n",
" waveform = wavernn.generate(torch.FloatTensor(mel_postnet_spec.T).unsqueeze(0).cuda(), batched=batched_wavernn, target=8000, overlap=400)\n",
"\n",
" print(\" > Run-time: {}\".format(time.time() - t_1))\n",
" if figures: \n",
@ -117,31 +103,18 @@
},
{
"cell_type": "code",
"execution_count": 9,
"execution_count": null,
"metadata": {},
"outputs": [
{
"ename": "FileNotFoundError",
"evalue": "[Errno 2] No such file or directory: '/media/erogol/data_ssd/Data/models/wavernn/mozilla/mozilla-May24-4763/config.json'",
"output_type": "error",
"traceback": [
"\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
"\u001b[0;31mFileNotFoundError\u001b[0m Traceback (most recent call last)",
"\u001b[0;32m<ipython-input-9-3306702a6bbc>\u001b[0m in \u001b[0;36m<module>\u001b[0;34m\u001b[0m\n\u001b[1;32m 7\u001b[0m \u001b[0mVOCODER_MODEL_PATH\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0;34m\"/media/erogol/data_ssd/Data/models/wavernn/mozilla/mozilla-May24-4763/model_checkpoints/best_model.pth.tar\"\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 8\u001b[0m \u001b[0mVOCODER_CONFIG_PATH\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0;34m\"/media/erogol/data_ssd/Data/models/wavernn/mozilla/mozilla-May24-4763/config.json\"\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m----> 9\u001b[0;31m \u001b[0mVOCODER_CONFIG\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mload_config\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mVOCODER_CONFIG_PATH\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 10\u001b[0m \u001b[0muse_cuda\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0;32mFalse\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 11\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n",
"\u001b[0;32m~/projects/TTS/tts_namespace/TTS/utils/generic_utils.py\u001b[0m in \u001b[0;36mload_config\u001b[0;34m(config_path)\u001b[0m\n\u001b[1;32m 20\u001b[0m \u001b[0;32mdef\u001b[0m \u001b[0mload_config\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mconfig_path\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 21\u001b[0m \u001b[0mconfig\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mAttrDict\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m---> 22\u001b[0;31m \u001b[0;32mwith\u001b[0m \u001b[0mopen\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mconfig_path\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m\"r\"\u001b[0m\u001b[0;34m)\u001b[0m \u001b[0;32mas\u001b[0m \u001b[0mf\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 23\u001b[0m \u001b[0minput_str\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mf\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mread\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 24\u001b[0m \u001b[0minput_str\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mre\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0msub\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34mr'\\\\\\n'\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m''\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0minput_str\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
"\u001b[0;31mFileNotFoundError\u001b[0m: [Errno 2] No such file or directory: '/media/erogol/data_ssd/Data/models/wavernn/mozilla/mozilla-May24-4763/config.json'"
]
}
],
"outputs": [],
"source": [
"# Set constants\n",
"ROOT_PATH = '/media/erogol/data_ssd/Models/libri_tts/5049/'\n",
"MODEL_PATH = ROOT_PATH + 'best_model.pth.tar'\n",
"ROOT_PATH = '/media/erogol/data_ssd/Models/libri_tts/5099/'\n",
"MODEL_PATH = ROOT_PATH + '/best_model.pth.tar'\n",
"CONFIG_PATH = ROOT_PATH + '/config.json'\n",
"OUT_FOLDER = \"/home/erogol/Dropbox/AudioSamples/benchmark_samples/\"\n",
"OUT_FOLDER = '/home/erogol/Dropbox/AudioSamples/benchmark_samples/'\n",
"CONFIG = load_config(CONFIG_PATH)\n",
"VOCODER_MODEL_PATH = \"/media/erogol/data_ssd/Data/models/wavernn/mozilla/mozilla-May24-4763/model_checkpoints/best_model.pth.tar\"\n",
"VOCODER_CONFIG_PATH = \"/media/erogol/data_ssd/Data/models/wavernn/mozilla/mozilla-May24-4763/config.json\"\n",
"VOCODER_MODEL_PATH = \"/media/erogol/data_ssd/Models/wavernn/ljspeech/mold_ljspeech_best_model/checkpoint_433000.pth.tar\"\n",
"VOCODER_CONFIG_PATH = \"/media/erogol/data_ssd/Models/wavernn/ljspeech/mold_ljspeech_best_model/config.json\"\n",
"VOCODER_CONFIG = load_config(VOCODER_CONFIG_PATH)\n",
"use_cuda = False\n",
"\n",
@ -149,10 +122,12 @@
"# CONFIG.windowing = False\n",
"# CONFIG.prenet_dropout = False\n",
"# CONFIG.separate_stopnet = True\n",
"CONFIG.use_forward_attn = True\n",
"# CONFIG.forward_attn_mask = True\n",
"# CONFIG.stopnet = True\n",
"\n",
"# Set the vocoder\n",
"use_gl = True # use GL if True\n",
"use_gl = False # use GL if True\n",
"batched_wavernn = True # use batched wavernn inference if True"
]
},
@ -165,9 +140,17 @@
"# LOAD TTS MODEL\n",
"from utils.text.symbols import symbols, phonemes\n",
"\n",
"# multi speaker \n",
"if CONFIG.use_speaker_embedding:\n",
" speakers = json.load(open(f\"{ROOT_PATH}/speakers.json\", 'r'))\n",
" speakers_idx_to_id = {v: k for k, v in speakers.items()}\n",
"else:\n",
" speakers = []\n",
" speaker_id = None\n",
"\n",
"# load the model\n",
"num_chars = len(phonemes) if CONFIG.use_phonemes else len(symbols)\n",
"model = setup_model(num_chars, CONFIG)\n",
"model = setup_model(num_chars, len(speakers), CONFIG)\n",
"\n",
"# load the audio processor\n",
"ap = AudioProcessor(**CONFIG.audio) \n",
@ -184,7 +167,12 @@
"if use_cuda:\n",
" model.cuda()\n",
"model.eval()\n",
"print(cp['step'])"
"print(cp['step'])\n",
"print(cp['r'])\n",
"\n",
"# set model stepsize\n",
"if 'r' in cp:\n",
" model.decoder.set_r(cp['r'])"
]
},
{
@ -196,25 +184,28 @@
"# LOAD WAVERNN\n",
"if use_gl == False:\n",
" from WaveRNN.models.wavernn import Model\n",
" from WaveRNN.utils.audio import AudioProcessor as AudioProcessorVocoder\n",
" bits = 10\n",
"\n",
" ap_vocoder = AudioProcessorVocoder(**VOCODER_CONFIG.audio) \n",
" wavernn = Model(\n",
" rnn_dims=512,\n",
" fc_dims=512,\n",
" mode=\"mold\",\n",
" pad=2,\n",
" upsample_factors=VOCODER_CONFIG.upsample_factors, # set this depending on dataset\n",
" mode=VOCODER_CONFIG.mode,\n",
" mulaw=VOCODER_CONFIG.mulaw,\n",
" pad=VOCODER_CONFIG.pad,\n",
" upsample_factors=VOCODER_CONFIG.upsample_factors,\n",
" feat_dims=VOCODER_CONFIG.audio[\"num_mels\"],\n",
" compute_dims=128,\n",
" res_out_dims=128,\n",
" res_blocks=10,\n",
" hop_length=ap.hop_length,\n",
" sample_rate=ap.sample_rate,\n",
" hop_length=ap_vocoder.hop_length,\n",
" sample_rate=ap_vocoder.sample_rate,\n",
" use_upsample_net = True,\n",
" use_aux_net = True\n",
" ).cuda()\n",
"\n",
"\n",
" check = torch.load(VOCODER_MODEL_PATH)\n",
" wavernn.load_state_dict(check['model'])\n",
" wavernn.load_state_dict(check['model'], strict=False)\n",
" if use_cuda:\n",
" wavernn.cuda()\n",
" wavernn.eval();\n",
@ -230,111 +221,67 @@
},
{
"cell_type": "code",
"execution_count": 5,
"execution_count": null,
"metadata": {},
"outputs": [
{
"ename": "NameError",
"evalue": "name 'model' is not defined",
"output_type": "error",
"traceback": [
"\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
"\u001b[0;31mNameError\u001b[0m Traceback (most recent call last)",
"\u001b[0;32m<ipython-input-5-e285d5bde9fb>\u001b[0m in \u001b[0;36m<module>\u001b[0;34m\u001b[0m\n\u001b[0;32m----> 1\u001b[0;31m \u001b[0mmodel\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0meval\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 2\u001b[0m \u001b[0mmodel\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mdecoder\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mmax_decoder_steps\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0;36m2000\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 3\u001b[0m \u001b[0mspeaker_id\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0;36m0\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 4\u001b[0m \u001b[0msentence\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0;34m\"Bill got in the habit of asking himself “Is that thought true?” And if he wasnt absolutely certain it was, he just let it go.\"\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 5\u001b[0m \u001b[0malign\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mspec\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mstop_tokens\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mwav\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mtts\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mmodel\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0msentence\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mCONFIG\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0muse_cuda\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0map\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mspeaker_id\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mspeaker_id\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0muse_gl\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0muse_gl\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mfigures\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;32mTrue\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
"\u001b[0;31mNameError\u001b[0m: name 'model' is not defined"
]
}
],
"outputs": [],
"source": [
"model.eval()\n",
"model.decoder.max_decoder_steps = 2000\n",
"speaker_id = 0\n",
"sentence = \"Bill got in the habit of asking himself “Is that thought true?” And if he wasnt absolutely certain it was, he just let it go.\"\n",
"align, spec, stop_tokens, wav = tts(model, sentence, CONFIG, use_cuda, ap, speaker_id=speaker_id, use_gl=use_gl, figures=True)"
]
},
{
"cell_type": "code",
"execution_count": 6,
"metadata": {
"scrolled": true
},
"outputs": [
{
"ename": "NameError",
"evalue": "name 'model' is not defined",
"output_type": "error",
"traceback": [
"\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
"\u001b[0;31mNameError\u001b[0m Traceback (most recent call last)",
"\u001b[0;32m<ipython-input-6-621056ffa667>\u001b[0m in \u001b[0;36m<module>\u001b[0;34m\u001b[0m\n\u001b[1;32m 1\u001b[0m \u001b[0msentence\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0;34m\"Be a voice, not an echo.\"\u001b[0m \u001b[0;31m# 'echo' is not in training set.\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m----> 2\u001b[0;31m \u001b[0malign\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mspec\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mstop_tokens\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mwav\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mtts\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mmodel\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0msentence\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mCONFIG\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0muse_cuda\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0map\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mspeaker_id\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mspeaker_id\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0muse_gl\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0muse_gl\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mfigures\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;32mTrue\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m",
"\u001b[0;31mNameError\u001b[0m: name 'model' is not defined"
]
}
],
"source": [
"sentence = \"Be a voice, not an echo.\" # 'echo' is not in training set. \n",
"align, spec, stop_tokens, wav = tts(model, sentence, CONFIG, use_cuda, ap, speaker_id=speaker_id, use_gl=use_gl, figures=True)"
]
},
{
"cell_type": "code",
"execution_count": 7,
"metadata": {},
"outputs": [
{
"ename": "NameError",
"evalue": "name 'model' is not defined",
"output_type": "error",
"traceback": [
"\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
"\u001b[0;31mNameError\u001b[0m Traceback (most recent call last)",
"\u001b[0;32m<ipython-input-7-26967668a1a1>\u001b[0m in \u001b[0;36m<module>\u001b[0;34m\u001b[0m\n\u001b[1;32m 1\u001b[0m \u001b[0msentence\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0;34m\"The human voice is the most perfect instrument of all.\"\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m----> 2\u001b[0;31m \u001b[0malign\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mspec\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mstop_tokens\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mwav\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mtts\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mmodel\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0msentence\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mCONFIG\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0muse_cuda\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0map\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mspeaker_id\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mspeaker_id\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0muse_gl\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0muse_gl\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mfigures\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;32mTrue\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m",
"\u001b[0;31mNameError\u001b[0m: name 'model' is not defined"
]
}
],
"source": [
"sentence = \"The human voice is the most perfect instrument of all.\"\n",
"align, spec, stop_tokens, wav = tts(model, sentence, CONFIG, use_cuda, ap, speaker_id=speaker_id, use_gl=use_gl, figures=True)"
]
},
{
"cell_type": "code",
"execution_count": 8,
"metadata": {},
"outputs": [
{
"ename": "NameError",
"evalue": "name 'model' is not defined",
"output_type": "error",
"traceback": [
"\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
"\u001b[0;31mNameError\u001b[0m Traceback (most recent call last)",
"\u001b[0;32m<ipython-input-8-28cb5023e353>\u001b[0m in \u001b[0;36m<module>\u001b[0;34m\u001b[0m\n\u001b[1;32m 1\u001b[0m \u001b[0msentence\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0;34m\"I'm sorry Dave. I'm afraid I can't do that.\"\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m----> 2\u001b[0;31m \u001b[0malign\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mspec\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mstop_tokens\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mwav\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mtts\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mmodel\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0msentence\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mCONFIG\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0muse_cuda\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0map\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mspeaker_id\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mspeaker_id\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0muse_gl\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0muse_gl\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mfigures\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;32mTrue\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m",
"\u001b[0;31mNameError\u001b[0m: name 'model' is not defined"
]
}
],
"source": [
"sentence = \"I'm sorry Dave. I'm afraid I can't do that.\"\n",
"align, spec, stop_tokens, wav = tts(model, sentence, CONFIG, use_cuda, ap, speaker_id=speaker_id, use_gl=use_gl, figures=True)"
"speaker_id = None\n",
"sentence = \"Bill got in the habit of asking himself “Is that thought true?” and if he wasnt absolutely certain it was, he just let it go.\"\n",
"align, spec, stop_tokens, wav = tts(model, sentence, CONFIG, use_cuda, ap, use_gl=use_gl, figures=True)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": true,
"jupyter": {
"outputs_hidden": true
"metadata": {},
"outputs": [],
"source": [
"model.eval()\n",
"model.decoder.max_decoder_steps = 2000\n",
"sentence = \"Bill got in the habit of asking himself “Is that thought true?” and if he wasnt absolutely certain it was, he just let it go.\"\n",
"align, spec, stop_tokens, wav = tts(model, sentence, CONFIG, use_cuda, ap, use_gl=use_gl, figures=True)"
]
},
"scrolled": true
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"sentence = \"Be a voice, not an echo.\" # 'echo' is not in training set. \n",
"align, spec, stop_tokens, wav = tts(model, sentence, CONFIG, use_cuda, ap, use_gl=use_gl, figures=True)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"sentence = \"The human voice is the most perfect instrument of all.\"\n",
"align, spec, stop_tokens, wav = tts(model, sentence, CONFIG, use_cuda, ap, use_gl=use_gl, figures=True)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"sentence = \"I'm sorry Dave. I'm afraid I can't do that.\"\n",
"align, spec, stop_tokens, wav = tts(model, sentence, CONFIG, use_cuda, ap, use_gl=use_gl, figures=True)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"sentence = \"This cake is great. It's so delicious and moist.\"\n",
"align, spec, stop_tokens, wav = tts(model, sentence, CONFIG, use_cuda, ap, speaker_id=speaker_id, use_gl=use_gl, figures=True)"
"align, spec, stop_tokens, wav = tts(model, sentence, CONFIG, use_cuda, ap, use_gl=use_gl, figures=True)"
]
},
{
@ -347,76 +294,51 @@
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": true,
"jupyter": {
"outputs_hidden": true
}
},
"metadata": {},
"outputs": [],
"source": [
"sentence = \"Generative adversarial network or variational auto-encoder.\"\n",
"align, spec, stop_tokens, wav = tts(model, sentence, CONFIG, use_cuda, ap, speaker_id=speaker_id, use_gl=use_gl, figures=True)"
"align, spec, stop_tokens, wav = tts(model, sentence, CONFIG, use_cuda, ap, use_gl=use_gl, figures=True)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": true,
"jupyter": {
"outputs_hidden": true
}
},
"metadata": {},
"outputs": [],
"source": [
"sentence = \"Scientists at the CERN laboratory say they have discovered a new particle.\"\n",
"align, spec, stop_tokens, wav = tts(model, sentence, CONFIG, use_cuda, ap, speaker_id=speaker_id, use_gl=use_gl, figures=True)"
"align, spec, stop_tokens, wav = tts(model, sentence, CONFIG, use_cuda, ap, use_gl=use_gl, figures=True)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": true,
"jupyter": {
"outputs_hidden": true
}
},
"metadata": {},
"outputs": [],
"source": [
"sentence = \"Heres a way to measure the acute emotional intelligence that has never gone out of style.\"\n",
"align, spec, stop_tokens, wav = tts(model, sentence, CONFIG, use_cuda, ap, speaker_id=speaker_id, use_gl=use_gl, figures=True)"
"align, spec, stop_tokens, wav = tts(model, sentence, CONFIG, use_cuda, ap, use_gl=use_gl, figures=True)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": true,
"jupyter": {
"outputs_hidden": true
}
},
"metadata": {},
"outputs": [],
"source": [
"sentence = \"President Trump met with other leaders at the Group of 20 conference.\"\n",
"align, spec, stop_tokens, wav = tts(model, sentence, CONFIG, use_cuda, ap, speaker_id=speaker_id, use_gl=use_gl, figures=True)"
"align, spec, stop_tokens, wav = tts(model, sentence, CONFIG, use_cuda, ap, use_gl=use_gl, figures=True)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": true,
"jupyter": {
"outputs_hidden": true
}
},
"metadata": {},
"outputs": [],
"source": [
"sentence = \"The buses aren't the problem, they actually provide a solution.\"\n",
"align, spec, stop_tokens, wav = tts(model, sentence, CONFIG, use_cuda, ap, speaker_id=speaker_id, use_gl=use_gl, figures=True)"
"align, spec, stop_tokens, wav = tts(model, sentence, CONFIG, use_cuda, ap, use_gl=use_gl, figures=True)"
]
},
{
@ -429,136 +351,91 @@
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": true,
"jupyter": {
"outputs_hidden": true
}
},
"metadata": {},
"outputs": [],
"source": [
"sentence = \"Generative adversarial network or variational auto-encoder.\"\n",
"align, spec, stop_tokens, wav = tts(model, sentence, CONFIG, use_cuda, ap, speaker_id=speaker_id, use_gl=use_gl, figures=True)"
"align, spec, stop_tokens, wav = tts(model, sentence, CONFIG, use_cuda, ap, use_gl=use_gl, figures=True)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": true,
"jupyter": {
"outputs_hidden": true
}
},
"metadata": {},
"outputs": [],
"source": [
"sentence = \"Basilar membrane and otolaryngology are not auto-correlations.\"\n",
"align, spec, stop_tokens, wav = tts(model, sentence, CONFIG, use_cuda, ap, speaker_id=speaker_id, use_gl=use_gl, figures=True)"
"align, spec, stop_tokens, wav = tts(model, sentence, CONFIG, use_cuda, ap, use_gl=use_gl, figures=True)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": true,
"jupyter": {
"outputs_hidden": true
}
},
"metadata": {},
"outputs": [],
"source": [
"sentence = \" He has read the whole thing.\"\n",
"align, spec, stop_tokens, wav = tts(model, sentence, CONFIG, use_cuda, ap, speaker_id=speaker_id, use_gl=use_gl, figures=True)"
"align, spec, stop_tokens, wav = tts(model, sentence, CONFIG, use_cuda, ap, use_gl=use_gl, figures=True)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": true,
"jupyter": {
"outputs_hidden": true
}
},
"metadata": {},
"outputs": [],
"source": [
"sentence = \"He reads books.\"\n",
"align, spec, stop_tokens, wav = tts(model, sentence, CONFIG, use_cuda, ap, speaker_id=speaker_id, use_gl=use_gl, figures=True)"
"align, spec, stop_tokens, wav = tts(model, sentence, CONFIG, use_cuda, ap, use_gl=use_gl, figures=True)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": true,
"jupyter": {
"outputs_hidden": true
}
},
"metadata": {},
"outputs": [],
"source": [
"sentence = \"Thisss isrealy awhsome.\"\n",
"align, spec, stop_tokens, wav = tts(model, sentence, CONFIG, use_cuda, ap, speaker_id=speaker_id, use_gl=use_gl, figures=True)"
"align, spec, stop_tokens, wav = tts(model, sentence, CONFIG, use_cuda, ap, use_gl=use_gl, figures=True)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": true,
"jupyter": {
"outputs_hidden": true
}
},
"metadata": {},
"outputs": [],
"source": [
"sentence = \"This is your internet browser, Firefox.\"\n",
"align, spec, stop_tokens, wav = tts(model, sentence, CONFIG, use_cuda, ap, speaker_id=speaker_id, use_gl=use_gl, figures=True)"
"align, spec, stop_tokens, wav = tts(model, sentence, CONFIG, use_cuda, ap, use_gl=use_gl, figures=True)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": true,
"jupyter": {
"outputs_hidden": true
}
},
"metadata": {},
"outputs": [],
"source": [
"sentence = \"This is your internet browser Firefox.\"\n",
"align, spec, stop_tokens, wav = tts(model, sentence, CONFIG, use_cuda, ap, speaker_id=speaker_id, use_gl=use_gl, figures=True)"
"align, spec, stop_tokens, wav = tts(model, sentence, CONFIG, use_cuda, ap, use_gl=use_gl, figures=True)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": true,
"jupyter": {
"outputs_hidden": true
}
},
"metadata": {},
"outputs": [],
"source": [
"sentence = \"The quick brown fox jumps over the lazy dog.\"\n",
"align, spec, stop_tokens, wav = tts(model, sentence, CONFIG, use_cuda, ap, speaker_id=speaker_id, use_gl=use_gl, figures=True)"
"align, spec, stop_tokens, wav = tts(model, sentence, CONFIG, use_cuda, ap, use_gl=use_gl, figures=True)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": true,
"jupyter": {
"outputs_hidden": true
}
},
"metadata": {},
"outputs": [],
"source": [
"sentence = \"Does the quick brown fox jump over the lazy dog?\"\n",
"align, spec, stop_tokens, wav = tts(model, sentence, CONFIG, use_cuda, ap, speaker_id=speaker_id, use_gl=use_gl, figures=True)"
"align, spec, stop_tokens, wav = tts(model, sentence, CONFIG, use_cuda, ap, use_gl=use_gl, figures=True)"
]
},
{
@ -568,7 +445,7 @@
"outputs": [],
"source": [
"sentence = \"Eren, how are you?\"\n",
"align, spec, stop_tokens, wav = tts(model, sentence, CONFIG, use_cuda, ap, speaker_id=speaker_id, use_gl=use_gl, figures=True)"
"align, spec, stop_tokens, wav = tts(model, sentence, CONFIG, use_cuda, ap, use_gl=use_gl, figures=True)"
]
},
{
@ -581,107 +458,62 @@
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": true,
"jupyter": {
"outputs_hidden": true
}
},
"metadata": {},
"outputs": [],
"source": [
"sentence = \"Encouraged, he started with a minute a day.\"\n",
"align, spec, stop_tokens, wav = tts(model, sentence, CONFIG, use_cuda, ap, speaker_id=speaker_id, use_gl=use_gl, figures=True)"
"align, spec, stop_tokens, wav = tts(model, sentence, CONFIG, use_cuda, ap, use_gl=use_gl, figures=True)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": true,
"jupyter": {
"outputs_hidden": true
}
},
"metadata": {},
"outputs": [],
"source": [
"sentence = \"His meditation consisted of “body scanning” which involved focusing his mind and energy on each section of the body from head to toe .\"\n",
"align, spec, stop_tokens, wav = tts(model, sentence, CONFIG, use_cuda, ap, speaker_id=speaker_id, use_gl=use_gl, figures=True)"
"align, spec, stop_tokens, wav = tts(model, sentence, CONFIG, use_cuda, ap, use_gl=use_gl, figures=True)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": true,
"jupyter": {
"outputs_hidden": true
}
},
"metadata": {},
"outputs": [],
"source": [
"sentence = \"Recent research at Harvard has shown meditating for as little as 8 weeks can actually increase the grey matter in the parts of the brain responsible for emotional regulation and learning . \"\n",
"align, spec, stop_tokens, wav = tts(model, sentence, CONFIG, use_cuda, ap, speaker_id=speaker_id, use_gl=use_gl, figures=True)"
"align, spec, stop_tokens, wav = tts(model, sentence, CONFIG, use_cuda, ap, use_gl=use_gl, figures=True)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": true,
"jupyter": {
"outputs_hidden": true
}
},
"metadata": {},
"outputs": [],
"source": [
"sentence = \"If he decided to watch TV he really watched it.\"\n",
"align, spec, stop_tokens, wav = tts(model, sentence, CONFIG, use_cuda, ap, speaker_id=speaker_id, use_gl=use_gl, figures=True)"
"align, spec, stop_tokens, wav = tts(model, sentence, CONFIG, use_cuda, ap, use_gl=use_gl, figures=True)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": true,
"jupyter": {
"outputs_hidden": true
},
"scrolled": true
},
"metadata": {},
"outputs": [],
"source": [
"sentence = \"Often we try to bring about change through sheer effort and we put all of our energy into a new initiative .\"\n",
"align, spec, stop_tokens, wav = tts(model, sentence, CONFIG, use_cuda, ap, speaker_id=speaker_id, use_gl=use_gl, figures=True)"
"align, spec, stop_tokens, wav = tts(model, sentence, CONFIG, use_cuda, ap, use_gl=use_gl, figures=True)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": true,
"jupyter": {
"outputs_hidden": true
}
},
"metadata": {},
"outputs": [],
"source": [
"# for twb dataset\n",
"sentence = \"In our preparation for Easter, God in his providence offers us each year the season of Lent as a sacramental sign of our conversion.\"\n",
"align, spec, stop_tokens, wav = tts(model, sentence, CONFIG, use_cuda, ap, speaker_id=speaker_id, use_gl=use_gl, figures=True)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": true,
"jupyter": {
"outputs_hidden": true
}
},
"outputs": [],
"source": [
"# !zip benchmark_samples/samples.zip benchmark_samples/*"
"align, spec, stop_tokens, wav = tts(model, sentence, CONFIG, use_cuda, ap, use_gl=use_gl, figures=True)"
]
}
],

View File

@ -2,6 +2,7 @@ import os
import time
import argparse
import torch
import json
import string
from TTS.utils.synthesis import synthesis
@ -16,22 +17,28 @@ def tts(model,
VC,
text,
ap,
ap_vocoder,
use_cuda,
batched_vocoder,
speaker_id=None,
figures=False):
t_1 = time.time()
use_vocoder_model = vocoder_model is not None
waveform, alignment, decoder_outputs, postnet_output, stop_tokens = synthesis(
model, text, C, use_cuda, ap, False, C.enable_eos_bos_chars)
waveform, alignment, _, postnet_output, stop_tokens = synthesis(
model, text, C, use_cuda, ap, speaker_id, False,
C.enable_eos_bos_chars)
if C.model == "Tacotron" and use_vocoder_model:
postnet_output = ap.out_linear_to_mel(postnet_output.T).T
# correct if there is a scale difference b/w two models
postnet_output = ap._denormalize(postnet_output)
postnet_output = ap_vocoder._normalize(postnet_output)
if use_vocoder_model:
vocoder_input = torch.FloatTensor(postnet_output.T).unsqueeze(0)
waveform = vocoder_model.generate(
vocoder_input.cuda() if use_cuda else vocoder_input,
batched=batched_vocoder,
target=11000,
overlap=550)
target=8000,
overlap=400)
print(" > Run-time: {}".format(time.time() - t_1))
return alignment, postnet_output, stop_tokens, waveform
@ -39,13 +46,10 @@ def tts(model,
if __name__ == "__main__":
parser = argparse.ArgumentParser()
parser.add_argument(
'text', type=str, help='Text to generate speech.')
parser.add_argument(
'config_path',
parser.add_argument('text', type=str, help='Text to generate speech.')
parser.add_argument('config_path',
type=str,
help='Path to model config file.'
)
help='Path to model config file.')
parser.add_argument(
'model_path',
type=str,
@ -56,8 +60,10 @@ if __name__ == "__main__":
type=str,
help='Path to save final wav file.',
)
parser.add_argument(
'--use_cuda', type=bool, help='Run model on CUDA.', default=False)
parser.add_argument('--use_cuda',
type=bool,
help='Run model on CUDA.',
default=False)
parser.add_argument(
'--vocoder_path',
type=str,
@ -65,8 +71,7 @@ if __name__ == "__main__":
'Path to the vocoder model file. If it is not defined, the model uses Griffin-Lim (GL) as the vocoder. Please make sure the vocoder library (WaveRNN) is installed beforehand.',
default="",
)
parser.add_argument(
'--vocoder_config_path',
parser.add_argument('--vocoder_config_path',
type=str,
help='Path to vocoder model config file.',
default="")
@ -75,12 +80,15 @@ if __name__ == "__main__":
type=bool,
help="If True, vocoder model uses faster batch processing.",
default=True)
parser.add_argument(
'--speakers_json',
parser.add_argument('--speakers_json',
type=str,
help="JSON file for multi-speaker model.",
default=""
)
default="")
parser.add_argument(
'--speaker_id',
type=int,
help="target speaker_id if the model is multi-speaker.",
default=None)
args = parser.parse_args()
if args.vocoder_path != "":
@ -109,13 +117,14 @@ if __name__ == "__main__":
model.eval()
if args.use_cuda:
model.cuda()
model.decoder.set_r(cp['r'])
# load vocoder model
if args.vocoder_path != "":
VC = load_config(args.vocoder_config_path)
ap_vocoder = AudioProcessor(**VC.audio)
bits = 10
vocoder_model = VocoderModel(
rnn_dims=512,
vocoder_model = VocoderModel(rnn_dims=512,
fc_dims=512,
mode=VC.mode,
mulaw=VC.mulaw,
@ -127,7 +136,8 @@ if __name__ == "__main__":
res_blocks=10,
hop_length=ap.hop_length,
sample_rate=ap.sample_rate,
)
use_aux_net=True,
use_upsample_net=True)
check = torch.load(args.vocoder_path)
vocoder_model.load_state_dict(check['model'])
@ -137,23 +147,26 @@ if __name__ == "__main__":
else:
vocoder_model = None
VC = None
ap_vocoder = None
# synthesize voice
print(" > Text: {}".format(args.text))
_, _, _, wav = tts(
model,
_, _, _, wav = tts(model,
vocoder_model,
C,
VC,
args.text,
ap,
ap_vocoder,
args.use_cuda,
args.batched_vocoder,
speaker_id=args.speaker_id,
figures=False)
# save the results
file_name = args.text.replace(" ", "_")
file_name = file_name.translate(str.maketrans('', '', string.punctuation.replace('_', '')))+'.wav'
file_name = file_name.translate(
str.maketrans('', '', string.punctuation.replace('_', ''))) + '.wav'
out_path = os.path.join(args.out_path, file_name)
print(" > Saving output to {}".format(out_path))
ap.save_wav(wav, out_path)
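
With the new --speaker_id flag, an example invocation might look like this (the checkpoint and output paths are placeholders):

python3 synthesize.py "Hello world." config.json checkpoint.pth.tar out/ --speaker_id 0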

View File

@ -54,7 +54,8 @@ class DecoderTests(unittest.TestCase):
trans_agent=True,
forward_attn_mask=True,
location_attn=True,
separate_stopnet=True)
separate_stopnet=True,
speaker_embedding_dim=0)
dummy_input = T.rand(4, 8, 256)
dummy_memory = T.rand(4, 2, 80)
@ -66,6 +67,35 @@ class DecoderTests(unittest.TestCase):
assert output.shape[2] == 80 * 2, "size not {}".format(output.shape[2])
assert stop_tokens.shape[0] == 4
@staticmethod
def test_in_out_multispeaker():
layer = Decoder(
in_features=256,
memory_dim=80,
r=2,
memory_size=4,
attn_windowing=False,
attn_norm="sigmoid",
prenet_type='original',
prenet_dropout=True,
forward_attn=True,
trans_agent=True,
forward_attn_mask=True,
location_attn=True,
separate_stopnet=True,
speaker_embedding_dim=80)
dummy_input = T.rand(4, 8, 256)
dummy_memory = T.rand(4, 2, 80)
dummy_embed = T.rand(4, 80)
output, alignment, stop_tokens = layer(
dummy_input, dummy_memory, mask=None, speaker_embeddings=dummy_embed)
assert output.shape[0] == 4
assert output.shape[1] == 1, "size not {}".format(output.shape[1])
assert output.shape[2] == 80 * 2, "size not {}".format(output.shape[2])
assert stop_tokens.shape[0] == 4
class EncoderTests(unittest.TestCase):
def test_in_out(self):

View File

@ -25,8 +25,9 @@ def count_parameters(model):
class TacotronTrainTest(unittest.TestCase):
def test_train_step(self):
input = torch.randint(0, 24, (8, 128)).long().to(device)
@staticmethod
def test_train_step():
input_dummy = torch.randint(0, 24, (8, 128)).long().to(device)
input_lengths = torch.randint(100, 129, (8, )).long().to(device)
input_lengths[-1] = 128
mel_spec = torch.rand(8, 30, c.audio['num_mels']).to(device)
@ -38,7 +39,7 @@ class TacotronTrainTest(unittest.TestCase):
for idx in mel_lengths:
stop_targets[:, int(idx.item()):, 0] = 1.0
stop_targets = stop_targets.view(input.shape[0],
stop_targets = stop_targets.view(input_dummy.shape[0],
stop_targets.size(1) // c.r, -1)
stop_targets = (stop_targets.sum(2) >
0.0).unsqueeze(2).float().squeeze()
@ -51,9 +52,11 @@ class TacotronTrainTest(unittest.TestCase):
linear_dim=c.audio['num_freq'],
mel_dim=c.audio['num_mels'],
r=c.r,
memory_size=c.memory_size).to(device) #FIXME: missing num_speakers parameter to Tacotron ctor
memory_size=c.memory_size
).to(device) #FIXME: missing num_speakers parameter to Tacotron ctor
model.train()
print(" > Num parameters for Tacotron model:%s"%(count_parameters(model)))
print(" > Num parameters for Tacotron model:%s" %
(count_parameters(model)))
model_ref = copy.deepcopy(model)
count = 0
for param, param_ref in zip(model.parameters(),
@ -63,7 +66,7 @@ class TacotronTrainTest(unittest.TestCase):
optimizer = optim.Adam(model.parameters(), lr=c.lr)
for _ in range(5):
mel_out, linear_out, align, stop_tokens = model.forward(
input, input_lengths, mel_spec, speaker_ids)
input_dummy, input_lengths, mel_spec, speaker_ids)
optimizer.zero_grad()
loss = criterion(mel_out, mel_spec, mel_lengths)
stop_loss = criterion_st(stop_tokens, stop_targets)
@ -81,3 +84,66 @@ class TacotronTrainTest(unittest.TestCase):
), "param {} with shape {} not updated!! \n{}\n{}".format(
count, param.shape, param, param_ref)
count += 1
class TacotronGSTTrainTest(unittest.TestCase):
@staticmethod
def test_train_step():
input_dummy = torch.randint(0, 24, (8, 128)).long().to(device)
input_lengths = torch.randint(100, 129, (8, )).long().to(device)
input_lengths[-1] = 128
mel_spec = torch.rand(8, 120, c.audio['num_mels']).to(device)
linear_spec = torch.rand(8, 120, c.audio['num_freq']).to(device)
mel_lengths = torch.randint(20, 120, (8, )).long().to(device)
stop_targets = torch.zeros(8, 120, 1).float().to(device)
speaker_ids = torch.randint(0, 5, (8, )).long().to(device)
for idx in mel_lengths:
stop_targets[:, int(idx.item()):, 0] = 1.0
stop_targets = stop_targets.view(input_dummy.shape[0],
stop_targets.size(1) // c.r, -1)
stop_targets = (stop_targets.sum(2) >
0.0).unsqueeze(2).float().squeeze()
criterion = L1LossMasked().to(device)
criterion_st = nn.BCEWithLogitsLoss().to(device)
model = Tacotron(
num_chars=32,
num_speakers=5,
gst=True,
linear_dim=c.audio['num_freq'],
mel_dim=c.audio['num_mels'],
r=c.r,
memory_size=c.memory_size
).to(device)
model.train()
print(model)
print(" > Num parameters for Tacotron GST model:%s" %
(count_parameters(model)))
model_ref = copy.deepcopy(model)
count = 0
for param, param_ref in zip(model.parameters(),
model_ref.parameters()):
assert (param - param_ref).sum() == 0, param
count += 1
optimizer = optim.Adam(model.parameters(), lr=c.lr)
for _ in range(10):
mel_out, linear_out, align, stop_tokens = model.forward(
input_dummy, input_lengths, mel_spec, speaker_ids)
optimizer.zero_grad()
loss = criterion(mel_out, mel_spec, mel_lengths)
stop_loss = criterion_st(stop_tokens, stop_targets)
loss = loss + criterion(linear_out, linear_spec,
mel_lengths) + stop_loss
loss.backward()
optimizer.step()
# check parameter changes
count = 0
for param, param_ref in zip(model.parameters(),
model_ref.parameters()):
# ignore the pre-highway layer since it is applied conditionally
assert (param != param_ref).any(
), "param {} with shape {} not updated!! \n{}\n{}".format(
count, param.shape, param, param_ref)
count += 1

331
train.py
View File

@ -15,21 +15,21 @@ from distribute import (DistributedSampler, apply_gradient_allreduce,
init_distributed, reduce_tensor)
from TTS.layers.losses import L1LossMasked, MSELossMasked
from TTS.utils.audio import AudioProcessor
from TTS.utils.generic_utils import (NoamLR, check_update, count_parameters,
create_experiment_folder, get_git_branch,
load_config, remove_experiment_folder,
save_best_model, save_checkpoint, weight_decay,
set_init_dict, copy_config_file, setup_model,
split_dataset, gradual_training_scheduler)
from TTS.utils.generic_utils import (
NoamLR, check_update, count_parameters, create_experiment_folder,
get_git_branch, load_config, remove_experiment_folder, save_best_model,
save_checkpoint, adam_weight_decay, set_init_dict, copy_config_file,
setup_model, gradual_training_scheduler, KeepAverage,
set_weight_decay)
from TTS.utils.logger import Logger
from TTS.utils.speakers import load_speaker_mapping, save_speaker_mapping, \
get_speakers
from TTS.utils.synthesis import synthesis
from TTS.utils.text.symbols import phonemes, symbols
from TTS.utils.visual import plot_alignment, plot_spectrogram
from TTS.datasets.preprocess import get_preprocessor_by_name
from TTS.datasets.preprocess import load_meta_data
from TTS.utils.radam import RAdam
from TTS.utils.measures import alignment_diagonal_score
torch.backends.cudnn.enabled = True
torch.backends.cudnn.benchmark = False
@ -41,18 +41,6 @@ print(" > Number of GPUs: ", num_gpus)
def setup_loader(ap, is_val=False, verbose=False):
global meta_data_train
global meta_data_eval
if "meta_data_train" not in globals():
if c.meta_file_train is not None:
meta_data_train = get_preprocessor_by_name(c.dataset)(c.data_path, c.meta_file_train)
else:
meta_data_train = get_preprocessor_by_name(c.dataset)(c.data_path)
if "meta_data_eval" not in globals() and c.run_eval:
if c.meta_file_val is not None:
meta_data_eval = get_preprocessor_by_name(c.dataset)(c.data_path, c.meta_file_val)
else:
meta_data_eval, meta_data_train = split_dataset(meta_data_train)
if is_val and not c.run_eval:
loader = None
else:
@ -61,7 +49,8 @@ def setup_loader(ap, is_val=False, verbose=False):
c.text_cleaner,
meta_data=meta_data_eval if is_val else meta_data_train,
ap=ap,
batch_group_size=0 if is_val else c.batch_group_size * c.batch_size,
batch_group_size=0 if is_val else c.batch_group_size *
c.batch_size,
min_seq_len=c.min_seq_len,
max_seq_len=c.max_seq_len,
phoneme_cache_path=c.phoneme_cache_path,
@ -90,14 +79,21 @@ def train(model, criterion, criterion_st, optimizer, optimizer_st, scheduler,
speaker_mapping = load_speaker_mapping(OUT_PATH)
model.train()
epoch_time = 0
avg_postnet_loss = 0
avg_decoder_loss = 0
avg_stop_loss = 0
avg_step_time = 0
avg_loader_time = 0
train_values = {
'avg_postnet_loss': 0,
'avg_decoder_loss': 0,
'avg_stop_loss': 0,
'avg_align_score': 0,
'avg_step_time': 0,
'avg_loader_time': 0,
'avg_alignment_score': 0
}
keep_avg = KeepAverage()
keep_avg.add_values(train_values)
print("\n > Epoch {}/{}".format(epoch, c.epochs), flush=True)
if use_cuda:
batch_n_iter = int(len(data_loader.dataset) / (c.batch_size * num_gpus))
batch_n_iter = int(
len(data_loader.dataset) / (c.batch_size * num_gpus))
else:
batch_n_iter = int(len(data_loader.dataset) / c.batch_size)
end_time = time.time()
@ -108,7 +104,8 @@ def train(model, criterion, criterion_st, optimizer, optimizer_st, scheduler,
text_input = data[0]
text_lengths = data[1]
speaker_names = data[2]
linear_input = data[3] if c.model in ["Tacotron", "TacotronGST"] else None
linear_input = data[3] if c.model in ["Tacotron", "TacotronGST"
] else None
mel_input = data[4]
mel_lengths = data[5]
stop_targets = data[6]
@ -117,8 +114,9 @@ def train(model, criterion, criterion_st, optimizer, optimizer_st, scheduler,
loader_time = time.time() - end_time
if c.use_speaker_embedding:
speaker_ids = [speaker_mapping[speaker_name]
for speaker_name in speaker_names]
speaker_ids = [
speaker_mapping[speaker_name] for speaker_name in speaker_names
]
speaker_ids = torch.LongTensor(speaker_ids)
else:
speaker_ids = None
@ -126,7 +124,8 @@ def train(model, criterion, criterion_st, optimizer, optimizer_st, scheduler,
# reshape stop targets: a single stop token is predicted per r frames
stop_targets = stop_targets.view(text_input.shape[0],
stop_targets.size(1) // c.r, -1)
stop_targets = (stop_targets.sum(2) > 0.0).unsqueeze(2).float().squeeze(2)
stop_targets = (stop_targets.sum(2) >
0.0).unsqueeze(2).float().squeeze(2)
global_step += 1
@ -143,7 +142,9 @@ def train(model, criterion, criterion_st, optimizer, optimizer_st, scheduler,
text_lengths = text_lengths.cuda(non_blocking=True)
mel_input = mel_input.cuda(non_blocking=True)
mel_lengths = mel_lengths.cuda(non_blocking=True)
linear_input = linear_input.cuda(non_blocking=True) if c.model in ["Tacotron", "TacotronGST"] else None
linear_input = linear_input.cuda(
non_blocking=True) if c.model in ["Tacotron", "TacotronGST"
] else None
stop_targets = stop_targets.cuda(non_blocking=True)
if speaker_ids is not None:
speaker_ids = speaker_ids.cuda(non_blocking=True)
@ -153,13 +154,16 @@ def train(model, criterion, criterion_st, optimizer, optimizer_st, scheduler,
text_input, text_lengths, mel_input, speaker_ids=speaker_ids)
# loss computation
stop_loss = criterion_st(stop_tokens, stop_targets) if c.stopnet else torch.zeros(1)
stop_loss = criterion_st(stop_tokens,
stop_targets) if c.stopnet else torch.zeros(1)
if c.loss_masking:
decoder_loss = criterion(decoder_output, mel_input, mel_lengths)
if c.model in ["Tacotron", "TacotronGST"]:
postnet_loss = criterion(postnet_output, linear_input, mel_lengths)
postnet_loss = criterion(postnet_output, linear_input,
mel_lengths)
else:
postnet_loss = criterion(postnet_output, mel_input, mel_lengths)
postnet_loss = criterion(postnet_output, mel_input,
mel_lengths)
else:
decoder_loss = criterion(decoder_output, mel_input)
if c.model in ["Tacotron", "TacotronGST"]:
@ -171,14 +175,18 @@ def train(model, criterion, criterion_st, optimizer, optimizer_st, scheduler,
loss += stop_loss
loss.backward()
optimizer, current_lr = weight_decay(optimizer, c.wd)
optimizer, current_lr = adam_weight_decay(optimizer)
grad_norm, _ = check_update(model, c.grad_clip)
optimizer.step()
# compute alignment score
align_score = alignment_diagonal_score(alignments)
keep_avg.update_value('avg_align_score', align_score)
# backpass and check the grad norm for stop loss
if c.separate_stopnet:
stop_loss.backward()
optimizer_st, _ = weight_decay(optimizer_st, c.wd)
optimizer_st, _ = adam_weight_decay(optimizer_st)
grad_norm_st, _ = check_update(model.decoder.stopnet, 1.0)
optimizer_st.step()
else:
@ -189,14 +197,14 @@ def train(model, criterion, criterion_st, optimizer, optimizer_st, scheduler,
if global_step % c.print_step == 0:
print(
" | > Step:{}/{} GlobalStep:{} TotalLoss:{:.5f} PostnetLoss:{:.5f} "
"DecoderLoss:{:.5f} StopLoss:{:.5f} GradNorm:{:.5f} "
" | > Step:{}/{} GlobalStep:{} PostnetLoss:{:.5f} "
"DecoderLoss:{:.5f} StopLoss:{:.5f} AlignScore:{:.4f} GradNorm:{:.5f} "
"GradNormST:{:.5f} AvgTextLen:{:.1f} AvgSpecLen:{:.1f} StepTime:{:.2f} "
"LoaderTime:{:.2f} LR:{:.6f}".format(
num_iter, batch_n_iter, global_step, loss.item(),
postnet_loss.item(), decoder_loss.item(), stop_loss.item(),
grad_norm, grad_norm_st, avg_text_length, avg_spec_length, step_time,
loader_time, current_lr),
num_iter, batch_n_iter, global_step, postnet_loss.item(),
decoder_loss.item(), stop_loss.item(), align_score,
grad_norm, grad_norm_st, avg_text_length, avg_spec_length,
step_time, loader_time, current_lr),
flush=True)
# aggregate losses from processes
@ -204,24 +212,36 @@ def train(model, criterion, criterion_st, optimizer, optimizer_st, scheduler,
postnet_loss = reduce_tensor(postnet_loss.data, num_gpus)
decoder_loss = reduce_tensor(decoder_loss.data, num_gpus)
loss = reduce_tensor(loss.data, num_gpus)
stop_loss = reduce_tensor(stop_loss.data, num_gpus) if c.stopnet else stop_loss
stop_loss = reduce_tensor(stop_loss.data,
num_gpus) if c.stopnet else stop_loss
if args.rank == 0:
avg_postnet_loss += float(postnet_loss.item())
avg_decoder_loss += float(decoder_loss.item())
avg_stop_loss += stop_loss if isinstance(stop_loss, float) else float(stop_loss.item())
avg_step_time += step_time
avg_loader_time += loader_time
update_train_values = {
'avg_postnet_loss':
float(postnet_loss.item()),
'avg_decoder_loss':
float(decoder_loss.item()),
'avg_stop_loss':
stop_loss
if isinstance(stop_loss, float) else float(stop_loss.item()),
'avg_step_time':
step_time,
'avg_loader_time':
loader_time
}
keep_avg.update_values(update_train_values)
# Plot Training Iter Stats
# reduce TB load
if global_step % 10 == 0:
iter_stats = {"loss_posnet": postnet_loss.item(),
iter_stats = {
"loss_posnet": postnet_loss.item(),
"loss_decoder": decoder_loss.item(),
"lr": current_lr,
"grad_norm": grad_norm,
"grad_norm_st": grad_norm_st,
"step_time": step_time}
"step_time": step_time
}
tb_logger.tb_train_iter_stats(global_step, iter_stats)
if global_step % c.save_step == 0:
@ -233,7 +253,9 @@ def train(model, criterion, criterion_st, optimizer, optimizer_st, scheduler,
# Diagnostic visualizations
const_spec = postnet_output[0].data.cpu().numpy()
gt_spec = linear_input[0].data.cpu().numpy() if c.model in ["Tacotron", "TacotronGST"] else mel_input[0].data.cpu().numpy()
gt_spec = linear_input[0].data.cpu().numpy() if c.model in [
"Tacotron", "TacotronGST"
] else mel_input[0].data.cpu().numpy()
align_img = alignments[0].data.cpu().numpy()
figures = {
@ -253,35 +275,31 @@ def train(model, criterion, criterion_st, optimizer, optimizer_st, scheduler,
c.audio["sample_rate"])
end_time = time.time()
avg_postnet_loss /= (num_iter + 1)
avg_decoder_loss /= (num_iter + 1)
avg_stop_loss /= (num_iter + 1)
avg_total_loss = avg_decoder_loss + avg_postnet_loss + avg_stop_loss
avg_step_time /= (num_iter + 1)
avg_loader_time /= (num_iter + 1)
# print epoch stats
print(
" | > EPOCH END -- GlobalStep:{} AvgTotalLoss:{:.5f} "
"AvgPostnetLoss:{:.5f} AvgDecoderLoss:{:.5f} "
"AvgStopLoss:{:.5f} EpochTime:{:.2f} "
"AvgStepTime:{:.2f} AvgLoaderTime:{:.2f}".format(global_step, avg_total_loss,
avg_postnet_loss, avg_decoder_loss,
avg_stop_loss, epoch_time, avg_step_time,
avg_loader_time),
print(" | > EPOCH END -- GlobalStep:{} AvgPostnetLoss:{:.5f} "
"AvgDecoderLoss:{:.5f} AvgStopLoss:{:.5f} "
"AvgAlignScore:{:.4f} EpochTime:{:.2f} "
"AvgStepTime:{:.2f} AvgLoaderTime:{:.2f}".format(
global_step, keep_avg['avg_postnet_loss'],
keep_avg['avg_decoder_loss'], keep_avg['avg_stop_loss'],
keep_avg['avg_align_score'], epoch_time,
keep_avg['avg_step_time'], keep_avg['avg_loader_time']),
flush=True)
# Plot Epoch Stats
if args.rank == 0:
# Plot Training Epoch Stats
epoch_stats = {"loss_postnet": avg_postnet_loss,
"loss_decoder": avg_decoder_loss,
"stop_loss": avg_stop_loss,
"epoch_time": epoch_time}
epoch_stats = {
"loss_postnet": keep_avg['avg_postnet_loss'],
"loss_decoder": keep_avg['avg_decoder_loss'],
"stop_loss": keep_avg['avg_stop_loss'],
"alignment_score": keep_avg['avg_align_score'],
"epoch_time": epoch_time
}
tb_logger.tb_train_epoch_stats(global_step, epoch_stats)
if c.tb_model_param_stats:
tb_logger.tb_model_weights(model, global_step)
return avg_postnet_loss, global_step
return keep_avg['avg_postnet_loss'], global_step
def evaluate(model, criterion, criterion_st, ap, global_step, epoch):
@ -290,9 +308,14 @@ def evaluate(model, criterion, criterion_st, ap, global_step, epoch):
speaker_mapping = load_speaker_mapping(OUT_PATH)
model.eval()
epoch_time = 0
avg_postnet_loss = 0
avg_decoder_loss = 0
avg_stop_loss = 0
eval_values_dict = {
'avg_postnet_loss': 0,
'avg_decoder_loss': 0,
'avg_stop_loss': 0,
'avg_align_score': 0
}
keep_avg = KeepAverage()
keep_avg.add_values(eval_values_dict)
print("\n > Validation")
if c.test_sentences_file is None:
test_sentences = [
@ -313,14 +336,18 @@ def evaluate(model, criterion, criterion_st, ap, global_step, epoch):
text_input = data[0]
text_lengths = data[1]
speaker_names = data[2]
linear_input = data[3] if c.model in ["Tacotron", "TacotronGST"] else None
linear_input = data[3] if c.model in [
"Tacotron", "TacotronGST"
] else None
mel_input = data[4]
mel_lengths = data[5]
stop_targets = data[6]
if c.use_speaker_embedding:
speaker_ids = [speaker_mapping[speaker_name]
for speaker_name in speaker_names]
speaker_ids = [
speaker_mapping[speaker_name]
for speaker_name in speaker_names
]
speaker_ids = torch.LongTensor(speaker_ids)
else:
speaker_ids = None
@ -329,14 +356,17 @@ def evaluate(model, criterion, criterion_st, ap, global_step, epoch):
stop_targets = stop_targets.view(text_input.shape[0],
stop_targets.size(1) // c.r,
-1)
stop_targets = (stop_targets.sum(2) > 0.0).unsqueeze(2).float().squeeze(2)
stop_targets = (stop_targets.sum(2) >
0.0).unsqueeze(2).float().squeeze(2)
# dispatch data to GPU
if use_cuda:
text_input = text_input.cuda()
mel_input = mel_input.cuda()
mel_lengths = mel_lengths.cuda()
linear_input = linear_input.cuda() if c.model in ["Tacotron", "TacotronGST"] else None
linear_input = linear_input.cuda() if c.model in [
"Tacotron", "TacotronGST"
] else None
stop_targets = stop_targets.cuda()
if speaker_ids is not None:
speaker_ids = speaker_ids.cuda()
@ -347,13 +377,17 @@ def evaluate(model, criterion, criterion_st, ap, global_step, epoch):
speaker_ids=speaker_ids)
# loss computation
stop_loss = criterion_st(stop_tokens, stop_targets) if c.stopnet else torch.zeros(1)
stop_loss = criterion_st(
stop_tokens, stop_targets) if c.stopnet else torch.zeros(1)
if c.loss_masking:
decoder_loss = criterion(decoder_output, mel_input, mel_lengths)
decoder_loss = criterion(decoder_output, mel_input,
mel_lengths)
if c.model in ["Tacotron", "TacotronGST"]:
postnet_loss = criterion(postnet_output, linear_input, mel_lengths)
postnet_loss = criterion(postnet_output, linear_input,
mel_lengths)
else:
postnet_loss = criterion(postnet_output, mel_input, mel_lengths)
postnet_loss = criterion(postnet_output, mel_input,
mel_lengths)
else:
decoder_loss = criterion(decoder_output, mel_input)
if c.model in ["Tacotron", "TacotronGST"]:
@ -365,14 +399,9 @@ def evaluate(model, criterion, criterion_st, ap, global_step, epoch):
step_time = time.time() - start_time
epoch_time += step_time
if num_iter % c.print_step == 0:
print(
" | > TotalLoss: {:.5f} PostnetLoss: {:.5f} DecoderLoss:{:.5f} "
"StopLoss: {:.5f} ".format(loss.item(),
postnet_loss.item(),
decoder_loss.item(),
stop_loss.item()),
flush=True)
# compute alignment score
align_score = alignment_diagonal_score(alignments)
keep_avg.update_value('avg_align_score', align_score)
# aggregate losses from processes
if num_gpus > 1:
@ -381,15 +410,34 @@ def evaluate(model, criterion, criterion_st, ap, global_step, epoch):
if c.stopnet:
stop_loss = reduce_tensor(stop_loss.data, num_gpus)
avg_postnet_loss += float(postnet_loss.item())
avg_decoder_loss += float(decoder_loss.item())
avg_stop_loss += stop_loss.item()
keep_avg.update_values({
'avg_postnet_loss':
float(postnet_loss.item()),
'avg_decoder_loss':
float(decoder_loss.item()),
'avg_stop_loss':
float(stop_loss.item())
})
if num_iter % c.print_step == 0:
print(
" | > TotalLoss: {:.5f} PostnetLoss: {:.5f} - {:.5f} DecoderLoss:{:.5f} - {:.5f} "
"StopLoss: {:.5f} - {:.5f} AlignScore: {:.4f} : {:.4f}"
.format(loss.item(), postnet_loss.item(),
keep_avg['avg_postnet_loss'],
decoder_loss.item(),
keep_avg['avg_decoder_loss'], stop_loss.item(),
keep_avg['avg_stop_loss'], align_score,
keep_avg['avg_align_score']),
flush=True)
if args.rank == 0:
# Diagnostic visualizations
idx = np.random.randint(mel_input.shape[0])
const_spec = postnet_output[idx].data.cpu().numpy()
gt_spec = linear_input[idx].data.cpu().numpy() if c.model in ["Tacotron", "TacotronGST"] else mel_input[idx].data.cpu().numpy()
gt_spec = linear_input[idx].data.cpu().numpy() if c.model in [
"Tacotron", "TacotronGST"
] else mel_input[idx].data.cpu().numpy()
align_img = alignments[idx].data.cpu().numpy()
eval_figures = {
@ -404,17 +452,15 @@ def evaluate(model, criterion, criterion_st, ap, global_step, epoch):
eval_audio = ap.inv_spectrogram(const_spec.T)
else:
eval_audio = ap.inv_mel_spectrogram(const_spec.T)
tb_logger.tb_eval_audios(global_step, {"ValAudio": eval_audio}, c.audio["sample_rate"])
# compute average losses
avg_postnet_loss /= (num_iter + 1)
avg_decoder_loss /= (num_iter + 1)
avg_stop_loss /= (num_iter + 1)
tb_logger.tb_eval_audios(global_step, {"ValAudio": eval_audio},
c.audio["sample_rate"])
# Plot Validation Stats
epoch_stats = {"loss_postnet": avg_postnet_loss,
"loss_decoder": avg_decoder_loss,
"stop_loss": avg_stop_loss}
epoch_stats = {
"loss_postnet": keep_avg['avg_postnet_loss'],
"loss_decoder": keep_avg['avg_decoder_loss'],
"stop_loss": keep_avg['avg_stop_loss']
}
tb_logger.tb_eval_stats(global_step, epoch_stats)
if args.rank == 0 and epoch > c.test_delay_epochs:
@ -427,7 +473,11 @@ def evaluate(model, criterion, criterion_st, ap, global_step, epoch):
for idx, test_sentence in enumerate(test_sentences):
try:
wav, alignment, decoder_output, postnet_output, stop_tokens = synthesis(
model, test_sentence, c, use_cuda, ap,
model,
test_sentence,
c,
use_cuda,
ap,
speaker_id=speaker_id,
style_wav=style_wav)
file_path = os.path.join(AUDIO_PATH, str(global_step))
@ -436,18 +486,22 @@ def evaluate(model, criterion, criterion_st, ap, global_step, epoch):
"TestSentence_{}.wav".format(idx))
ap.save_wav(wav, file_path)
test_audios['{}-audio'.format(idx)] = wav
test_figures['{}-prediction'.format(idx)] = plot_spectrogram(postnet_output, ap)
test_figures['{}-alignment'.format(idx)] = plot_alignment(alignment)
test_figures['{}-prediction'.format(idx)] = plot_spectrogram(
postnet_output, ap)
test_figures['{}-alignment'.format(idx)] = plot_alignment(
alignment)
except Exception:
print(" !! Error creating Test Sentence -", idx)
traceback.print_exc()
tb_logger.tb_test_audios(global_step, test_audios, c.audio['sample_rate'])
tb_logger.tb_test_audios(global_step, test_audios,
c.audio['sample_rate'])
tb_logger.tb_test_figures(global_step, test_figures)
return avg_postnet_loss
return keep_avg['avg_postnet_loss']
# FIXME: move args definition/parsing inside of main?
def main(args): # pylint: disable=redefined-outer-name
global meta_data_train, meta_data_eval
# Audio processor
ap = AudioProcessor(**c.audio)
@ -457,8 +511,12 @@ def main(args): #pylint: disable=redefined-outer-name
c.distributed["backend"], c.distributed["url"])
num_chars = len(phonemes) if c.use_phonemes else len(symbols)
# load data instances
meta_data_train, meta_data_eval = load_meta_data(c.datasets)
# parse speakers
if c.use_speaker_embedding:
speakers = get_speakers(c.data_path, c.meta_file_train, c.dataset)
speakers = get_speakers(meta_data_train)
if args.restore_path:
prev_out_path = os.path.dirname(args.restore_path)
speaker_mapping = load_speaker_mapping(prev_out_path)
@ -467,8 +525,7 @@ def main(args): #pylint: disable=redefined-outer-name
"introduce new speakers to " \
"a previously trained model."
else:
speaker_mapping = {name: i
for i, name in enumerate(speakers)}
speaker_mapping = {name: i for i, name in enumerate(speakers)}
save_speaker_mapping(OUT_PATH, speaker_mapping)
num_speakers = len(speaker_mapping)
print("Training with {} speakers: {}".format(num_speakers,
@ -480,18 +537,23 @@ def main(args): #pylint: disable=redefined-outer-name
print(" | > Num output units : {}".format(ap.num_freq), flush=True)
optimizer = RAdam(model.parameters(), lr=c.lr, weight_decay=0)
params = set_weight_decay(model, c.wd)
optimizer = RAdam(params, lr=c.lr, weight_decay=0)
if c.stopnet and c.separate_stopnet:
optimizer_st = RAdam(
model.decoder.stopnet.parameters(), lr=c.lr, weight_decay=0)
optimizer_st = RAdam(model.decoder.stopnet.parameters(),
lr=c.lr,
weight_decay=0)
else:
optimizer_st = None
if c.loss_masking:
criterion = L1LossMasked() if c.model in ["Tacotron", "TacotronGST"] else MSELossMasked()
criterion = L1LossMasked() if c.model in ["Tacotron", "TacotronGST"
] else MSELossMasked()
else:
criterion = nn.L1Loss() if c.model in ["Tacotron", "TacotronGST"] else nn.MSELoss()
criterion_st = nn.BCEWithLogitsLoss() if c.stopnet else None
criterion = nn.L1Loss() if c.model in ["Tacotron", "TacotronGST"
] else nn.MSELoss()
criterion_st = nn.BCEWithLogitsLoss(
pos_weight=torch.tensor(20.0)) if c.stopnet else None
if args.restore_path:
checkpoint = torch.load(args.restore_path)
@ -510,8 +572,8 @@ def main(args): #pylint: disable=redefined-outer-name
del model_dict
for group in optimizer.param_groups:
group['lr'] = c.lr
print(
" > Model restored from step %d" % checkpoint['step'], flush=True)
print(" > Model restored from step %d" % checkpoint['step'],
flush=True)
args.restore_step = checkpoint['step']
else:
args.restore_step = 0
@ -527,8 +589,7 @@ def main(args): #pylint: disable=redefined-outer-name
model = apply_gradient_allreduce(model)
if c.lr_decay:
scheduler = NoamLR(
optimizer,
scheduler = NoamLR(optimizer,
warmup_steps=c.warmup_steps,
last_epoch=args.restore_step - 1)
else:
@ -550,11 +611,11 @@ def main(args): #pylint: disable=redefined-outer-name
print(" > Number of outputs per iteration:", model.decoder.r)
train_loss, global_step = train(model, criterion, criterion_st,
optimizer, optimizer_st, scheduler,
ap, global_step, epoch)
val_loss = evaluate(model, criterion, criterion_st, ap, global_step, epoch)
print(
" | > Training Loss: {:.5f} Validation Loss: {:.5f}".format(
optimizer, optimizer_st, scheduler, ap,
global_step, epoch)
val_loss = evaluate(model, criterion, criterion_st, ap, global_step,
epoch)
print(" | > Training Loss: {:.5f} Validation Loss: {:.5f}".format(
train_loss, val_loss),
flush=True)
target_loss = train_loss
@ -576,8 +637,7 @@ if __name__ == '__main__':
type=str,
help='Path to config file for training.',
)
parser.add_argument(
'--debug',
parser.add_argument('--debug',
type=bool,
default=True,
help='Do not verify commit integrity to run training.')
@ -586,17 +646,14 @@ if __name__ == '__main__':
type=str,
default='',
help='Defines the data path. It overwrites config.json.')
parser.add_argument(
'--output_path',
parser.add_argument('--output_path',
type=str,
help='path for training outputs.',
default='')
parser.add_argument(
'--output_folder',
parser.add_argument('--output_folder',
type=str,
default='',
help='folder name for training outputs.'
)
help='folder name for training outputs.')
# DISTRIBUTED
parser.add_argument(
@ -604,8 +661,7 @@ if __name__ == '__main__':
type=int,
default=0,
help='DISTRIBUTED: process rank for distributed training.')
parser.add_argument(
'--group_id',
parser.add_argument('--group_id',
type=str,
default="",
help='DISTRIBUTED: process group id.')
@ -635,7 +691,8 @@ if __name__ == '__main__':
if args.restore_path:
new_fields["restore_path"] = args.restore_path
new_fields["github_branch"] = get_git_branch()
copy_config_file(args.config_path, os.path.join(OUT_PATH, 'config.json'), new_fields)
copy_config_file(args.config_path,
os.path.join(OUT_PATH, 'config.json'), new_fields)
os.chmod(AUDIO_PATH, 0o775)
os.chmod(OUT_PATH, 0o775)

View File

@ -24,6 +24,7 @@ class AudioProcessor(object):
clip_norm=True,
griffin_lim_iters=None,
do_trim_silence=False,
sound_norm=False,
**_):
print(" > Setting up Audio Processor...")
@ -45,6 +46,7 @@ class AudioProcessor(object):
self.max_norm = 1.0 if max_norm is None else float(max_norm)
self.clip_norm = clip_norm
self.do_trim_silence = do_trim_silence
self.sound_norm = sound_norm
self.n_fft, self.hop_length, self.win_length = self._stft_parameters()
members = vars(self)
for key, value in members.items():
@ -210,11 +212,11 @@ class AudioProcessor(object):
return len(wav)
def trim_silence(self, wav):
""" Trim silent parts with a threshold and 0.1 sec margin """
margin = int(self.sample_rate * 0.1)
""" Trim silent parts with a threshold and 0.01 sec margin """
margin = int(self.sample_rate * 0.01)
wav = wav[margin:-margin]
return librosa.effects.trim(
wav, top_db=40, frame_length=1024, hop_length=256)[0]
wav, top_db=60, frame_length=self.win_length, hop_length=self.hop_length)[0]
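
For intuition, a hedged standalone example of the underlying librosa call (the file name is a placeholder): trim keeps the span between the first and last frames whose energy is within top_db of the peak.

import librosa

wav, sr = librosa.load("sample.wav", sr=22050)
trimmed, _ = librosa.effects.trim(wav, top_db=60, frame_length=1024, hop_length=256)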
@staticmethod
def mulaw_encode(wav, qc):
@ -243,6 +245,8 @@ class AudioProcessor(object):
except ValueError:
print(f' [!] File cannot be trimmed for silence - {filename}')
assert self.sample_rate == sr, "%s vs %s"%(self.sample_rate, sr)
if self.sound_norm:
x = x / x.max() * 0.9
return x
@staticmethod

View File

@ -31,7 +31,8 @@ def load_config(config_path):
def get_git_branch():
try:
out = subprocess.check_output(["git", "branch"]).decode("utf8")
current = next(line for line in out.split("\n") if line.startswith("*"))
current = next(line for line in out.split("\n")
if line.startswith("*"))
current.replace("* ", "")
except subprocess.CalledProcessError:
current = "inside_docker"
@ -47,8 +48,8 @@ def get_commit_hash():
# raise RuntimeError(
# " !! Commit before training to get the commit hash.")
try:
commit = subprocess.check_output(['git', 'rev-parse', '--short',
'HEAD']).decode().strip()
commit = subprocess.check_output(
['git', 'rev-parse', '--short', 'HEAD']).decode().strip()
# Not copying .git folder into docker container
except subprocess.CalledProcessError:
commit = "0000000"
@ -168,16 +169,42 @@ def lr_decay(init_lr, global_step, warmup_steps):
return lr
def weight_decay(optimizer, wd):
def adam_weight_decay(optimizer):
"""
Custom weight decay operation that does not affect gradient values.
"""
for group in optimizer.param_groups:
for param in group['params']:
current_lr = group['lr']
param.data = param.data.add(-wd * group['lr'], param.data)
weight_decay = group['weight_decay']
param.data = param.data.add(-weight_decay * group['lr'],
param.data)
return optimizer, current_lr
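
For context, a sketch of the call order used in train() above: the decay is applied to the weights directly, after backprop and before the optimizer step.

loss.backward()
optimizer, current_lr = adam_weight_decay(optimizer)  # w <- w - lr * weight_decay * w
grad_norm, _ = check_update(model, c.grad_clip)
optimizer.step()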
# pylint: disable=dangerous-default-value
def set_weight_decay(model, weight_decay, skip_list={"decoder.attention.v", "rnn", "lstm", "gru", "embedding"}):
"""
Skip weight decay for biases, BatchNorm parameters, RNNs,
and the attention projection layer v.
"""
decay = []
no_decay = []
for name, param in model.named_parameters():
if not param.requires_grad:
continue
if len(param.shape) == 1 or any([skip_name in name for skip_name in skip_list]):
no_decay.append(param)
else:
decay.append(param)
return [{
'params': no_decay,
'weight_decay': 0.
}, {
'params': decay,
'weight_decay': weight_decay
}]
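
A minimal usage sketch, mirroring main() above: the returned parameter groups carry their own weight_decay values, so the optimizer's global weight_decay is set to 0.

params = set_weight_decay(model, c.wd)  # two groups: no-decay and decay
optimizer = RAdam(params, lr=c.lr, weight_decay=0)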
class NoamLR(torch.optim.lr_scheduler._LRScheduler):
def __init__(self, optimizer, warmup_steps=0.1, last_epoch=-1):
@ -187,8 +214,8 @@ class NoamLR(torch.optim.lr_scheduler._LRScheduler):
def get_lr(self):
step = max(self.last_epoch, 1)
return [
base_lr * self.warmup_steps**0.5 * min(
step * self.warmup_steps**-1.5, step**-0.5)
base_lr * self.warmup_steps**0.5 *
min(step * self.warmup_steps**-1.5, step**-0.5)
for base_lr in self.base_lrs
]
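
In closed form the schedule is base_lr * warmup_steps**0.5 * min(step * warmup_steps**-1.5, step**-0.5): it rises roughly linearly, peaks at step == warmup_steps, then decays as 1/sqrt(step). A quick sanity check with illustrative values:

warmup = 4000
def noam_lr(step, base_lr=1e-3):
    return base_lr * warmup**0.5 * min(step * warmup**-1.5, step**-0.5)
assert noam_lr(4000) > noam_lr(1000) and noam_lr(4000) > noam_lr(40000)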
@ -243,8 +270,8 @@ def set_init_dict(model_dict, checkpoint, c):
}
# 4. overwrite entries in the existing state dict
model_dict.update(pretrained_dict)
print(" | > {} / {} layers are restored.".format(
len(pretrained_dict), len(model_dict)))
print(" | > {} / {} layers are restored.".format(len(pretrained_dict),
len(model_dict)))
return model_dict
@ -252,13 +279,13 @@ def setup_model(num_chars, num_speakers, c):
print(" > Using model: {}".format(c.model))
MyModel = importlib.import_module('TTS.models.' + c.model.lower())
MyModel = getattr(MyModel, c.model)
if c.model.lower() in ["tacotron", "tacotrongst"]:
model = MyModel(
num_chars=num_chars,
if c.model.lower() in "tacotron":
model = MyModel(num_chars=num_chars,
num_speakers=num_speakers,
r=c.r,
linear_dim=1025,
mel_dim=80,
gst=c.use_gst,
memory_size=c.memory_size,
attn_win=c.windowing,
attn_norm=c.attention_norm,
@ -270,8 +297,7 @@ def setup_model(num_chars, num_speakers, c):
location_attn=c.location_attn,
separate_stopnet=c.separate_stopnet)
elif c.model.lower() == "tacotron2":
model = MyModel(
num_chars=num_chars,
model = MyModel(num_chars=num_chars,
num_speakers=num_speakers,
r=c.r,
attn_win=c.windowing,
@ -290,7 +316,8 @@ def split_dataset(items):
is_multi_speaker = False
speakers = [item[-1] for item in items]
is_multi_speaker = len(set(speakers)) > 1
eval_split_size = 500 if 500 < len(items) * 0.01 else int(len(items) * 0.01)
eval_split_size = 500 if len(items) * 0.01 > 500 else int(
len(items) * 0.01)
np.random.seed(0)
np.random.shuffle(items)
if is_multi_speaker:
@ -314,3 +341,34 @@ def gradual_training_scheduler(global_step, config):
if global_step >= values[0]:
new_values = values
return new_values[1], new_values[2]
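
A hedged example of how a schedule resolves (the [first_step, r, batch_size] triplets follow the gradual_training config format; the values below are illustrative):

# assuming c.gradual_training = [[0, 7, 32], [10000, 5, 32], [50000, 3, 32]]
r, batch_size = gradual_training_scheduler(20000, c)  # last milestone passed is 10000 -> (5, 32)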
class KeepAverage():
def __init__(self):
self.avg_values = {}
self.iters = {}
def __getitem__(self, key):
return self.avg_values[key]
def add_value(self, name, init_val=0, init_iter=0):
self.avg_values[name] = init_val
self.iters[name] = init_iter
def update_value(self, name, value, weighted_avg=False):
if weighted_avg:
self.avg_values[name] = 0.99 * self.avg_values[name] + 0.01 * value
self.iters[name] += 1
else:
self.avg_values[name] = self.avg_values[name] * \
self.iters[name] + value
self.iters[name] += 1
self.avg_values[name] /= self.iters[name]
def add_values(self, name_dict):
for key, value in name_dict.items():
self.add_value(key, init_val=value)
def update_values(self, value_dict):
for key, value in value_dict.items():
self.update_value(key, value)
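
A short usage sketch of the running averages, as wired up in train() and evaluate():

keep_avg = KeepAverage()
keep_avg.add_values({'avg_postnet_loss': 0, 'avg_step_time': 0})
keep_avg.update_values({'avg_postnet_loss': 0.5, 'avg_step_time': 0.1})
keep_avg.update_values({'avg_postnet_loss': 0.3, 'avg_step_time': 0.3})
print(keep_avg['avg_postnet_loss'])  # 0.4, the mean of the two updates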

11
utils/measures.py Normal file
View File

@ -0,0 +1,11 @@
def alignment_diagonal_score(alignments):
"""
Compute how diagonal the alignment predictions are. It is useful
for measuring the alignment consistency of a model.
Args:
alignments (torch.Tensor): batch of alignments.
Shape:
alignments : batch x decoder_steps x encoder_steps
"""
return alignments.max(dim=1)[0].mean(dim=1).mean(dim=0).item()
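
A hedged sanity check: a one-hot (perfectly focused) alignment scores 1.0, while a uniform one scores 1/encoder_steps.

import torch
perfect = torch.eye(5).unsqueeze(0)       # batch x decoder_steps x encoder_steps
uniform = torch.full((1, 5, 5), 0.2)
print(alignment_diagonal_score(perfect))  # 1.0
print(alignment_diagonal_score(uniform))  # 0.2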

View File

@ -25,9 +25,7 @@ def save_speaker_mapping(out_path, speaker_mapping):
json.dump(speaker_mapping, f, indent=4)
def get_speakers(data_root, meta_file, dataset_type):
def get_speakers(items):
"""Returns a sorted, unique list of speakers in a given dataset."""
preprocessor = get_preprocessor_by_name(dataset_type)
items = preprocessor(data_root, meta_file)
speakers = {e[2] for e in items}
return sorted(speakers)
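
A hedged example (the item layout [text, wav_path, speaker_name] follows the dataset preprocessors; the names below are made up):

items = [['hello', 'a.wav', 'spk1'], ['world', 'b.wav', 'spk0'], ['again', 'c.wav', 'spk0']]
print(get_speakers(items))  # ['spk0', 'spk1']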