mirror of https://github.com/coqui-ai/TTS.git
commit 50088cbf3b

.compute (8 changed lines)
@ -4,13 +4,13 @@ yes | apt-get install ffmpeg
yes | apt-get install espeak
yes | apt-get install tmux
yes | apt-get install zsh
# pip3 install https://download.pytorch.org/whl/cu100/torch-1.1.0-cp37-cp37m-linux_x86_64.whl
# wget https://www.dropbox.com/s/m8waow6b3ydpf6h/MozillaDataset.tar.gz?dl=0 -O /data/rw/home/mozilla.tar
wget https://www.dropbox.com/s/wqn5v3wkktw9lmo/install.sh?dl=0 -O install.sh
pip3 install https://download.pytorch.org/whl/cu100/torch-1.3.0%2Bcu100-cp36-cp36m-linux_x86_64.whl
sudo sh install.sh
pip install pytorch==1.3.0+cu100
python3 setup.py develop
# cp -R ${USER_DIR}/GermanData ../tmp/
# python3 distribute.py --config_path config.json --data_path /data/ro/shared/data/keithito/LJSpeech-1.1/
# cp -R ${USER_DIR}/Mozilla_22050 ../tmp/
# python3 distribute.py --config_path config_tacotron_gst.json --data_path ../tmp/Mozilla_22050/
# python3 distribute.py --config_path config.json --data_path /data/rw/home/LibriTTS/train-clean-360
# python3 distribute.py --config_path config.json
while true; do sleep 1000000; done

config.json (24 changed lines)
@ -1,6 +1,6 @@
{
"run_name": "ljspeech",
"run_description": "gradual training with prenet frame size 1 + no maxout for cbhg + symmetric norm.",
"run_description": "Tacotron ljspeech release training",

"audio":{
// Audio processing parameters

@ -55,20 +55,16 @@
"batch_size": 32, // Batch size for training. Lower values than 32 might cause hard to learn attention. It is overwritten by 'gradual_training'.
"eval_batch_size":16,
"r": 7, // Number of decoder frames to predict per iteration. Set the initial values if gradual training is enabled.
"gradual_training": [[0, 7, 32], [10000, 5, 32], [50000, 3, 32], [130000, 2, 16], [290000, 1, 8]], // ONLY TACOTRON - set gradual training steps [first_step, r, batch_size]. If it is null, gradual training is disabled.
"gradual_training": [[0, 7, 32], [1, 5, 32], [50000, 3, 32], [130000, 2, 16], [290000, 1, 8]], // ONLY TACOTRON - set gradual training steps [first_step, r, batch_size]. If it is null, gradual training is disabled.
"wd": 0.000001, // Weight decay weight.
"checkpoint": true, // If true, it saves checkpoints per "save_step"
"save_step": 10000, // Number of training steps expected to save traning stats and checkpoints.
"save_step": 10000, // Number of training steps expected to save traninpg stats and checkpoints.
"print_step": 25, // Number of steps to log training on console.
"batch_group_size": 0, //Number of batches to shuffle after bucketing.

"run_eval": true,
"test_delay_epochs": 5, //Until attention is aligned, testing only wastes computation time.
"test_sentences_file": null, // set a file to load sentences to be used for testing. If it is null then we use default english sentences.
"data_path": "/home/erogol/Data/LJSpeech-1.1/", // DATASET-RELATED: can be overwritten from command argument
"meta_file_train": "metadata_train.csv", // DATASET-RELATED: metafile for training dataloader.
"meta_file_val": "metadata_val.csv", // DATASET-RELATED: metafile for evaluation dataloader.
"dataset": "ljspeech", // DATASET-RELATED: one of TTS.dataset.preprocessors depending on your target dataset. Use "tts_cache" for pre-computed dataset by extract_features.py
"min_seq_len": 6, // DATASET-RELATED: minimum text length to use in training
"max_seq_len": 150, // DATASET-RELATED: maximum text length
"output_path": "../keep/", // DATASET-RELATED: output path for all training outputs.

@ -79,6 +75,18 @@
"phoneme_language": "en-us", // depending on your target language, pick one from https://github.com/bootphon/phonemizer#languages
"text_cleaner": "phoneme_cleaners",
"use_speaker_embedding": false, // use speaker embedding to enable multi-speaker learning.
"style_wav_for_test": null // path to style wav file to be used in TacotronGST inference.
"style_wav_for_test": null, // path to style wav file to be used in TacotronGST inference.
"use_gst": false, // TACOTRON ONLY: use global style tokens

"datasets": // List of datasets. They are all merged and get different speaker_ids.
[
{
"name": "ljspeech",
"path": "/data/ro/shared/data/keithito/LJSpeech-1.1/",
"meta_file_train": "metadata_train.csv",
"meta_file_val": "metadata_val.csv"
}
]

}

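The "gradual_training" entry above is a schedule of [first_step, r, batch_size] triplets: once training passes a triplet's first_step, that triplet's r and batch_size become active. A minimal sketch of resolving the active pair for a given global step (the helper name and signature are illustrative, not part of this commit):

# Illustrative helper: pick the last schedule entry whose first_step
# has been reached by the current global step.
def gradual_training_scheduler(global_step, schedule):
    new_values = schedule[0]
    for values in schedule:
        if global_step >= values[0]:
            new_values = values
    return new_values[1], new_values[2]

schedule = [[0, 7, 32], [1, 5, 32], [50000, 3, 32], [130000, 2, 16], [290000, 1, 8]]
r, batch_size = gradual_training_scheduler(20000, schedule)  # -> (5, 32)
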
@ -2,6 +2,27 @@ import os
from glob import glob
import re
import sys
from TTS.utils.generic_utils import split_dataset


def load_meta_data(datasets):
    meta_data_train_all = []
    meta_data_eval_all = []
    for dataset in datasets:
        name = dataset['name']
        root_path = dataset['path']
        meta_file_train = dataset['meta_file_train']
        meta_file_val = dataset['meta_file_val']
        preprocessor = get_preprocessor_by_name(name)

        meta_data_train = preprocessor(root_path, meta_file_train)
        if meta_file_val is None:
            meta_data_eval, meta_data_train = split_dataset(meta_data_train)
        else:
            meta_data_eval = preprocessor(root_path, meta_file_val)
        meta_data_train_all += meta_data_train
        meta_data_eval_all += meta_data_eval
    return meta_data_train_all, meta_data_eval_all


def get_preprocessor_by_name(name):

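A short usage sketch of the new load_meta_data(), fed the "datasets" list from config.json above; the import path is an assumption based on the functions in this hunk, and the print is only there to show what comes back:

# Sketch: each entry names a preprocessor and its metafiles; when
# meta_file_val is None, the eval split is carved out with split_dataset().
from TTS.datasets.preprocess import load_meta_data  # assumed module path

datasets = [{
    "name": "ljspeech",
    "path": "/data/ro/shared/data/keithito/LJSpeech-1.1/",
    "meta_file_train": "metadata_train.csv",
    "meta_file_val": "metadata_val.csv",
}]
meta_data_train, meta_data_eval = load_meta_data(datasets)
print(len(meta_data_train), len(meta_data_eval))
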
@ -1,3 +1,5 @@
import numpy as np
import torch
from torch import nn
from torch.nn import functional
from TTS.utils.generic_utils import sequence_mask

@ -53,3 +55,18 @@ class MSELossMasked(nn.Module):
            x * mask, target * mask, reduction="sum")
        loss = loss / mask.sum()
        return loss


class AttentionEntropyLoss(nn.Module):
    # pylint: disable=R0201
    def forward(self, align):
        """
        Forces attention to be more decisive by penalizing
        soft attention weights

        TODO: arguments
        TODO: unit_test
        """
        entropy = torch.distributions.Categorical(probs=align).entropy()
        loss = (entropy / np.log(align.shape[1])).mean()
        return loss

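AttentionEntropyLoss divides the per-step entropy by log(N), so a perfectly uniform alignment over N positions scores 1.0 and a one-hot alignment scores 0; soft, smeared attention is what gets penalized. A small illustration of that behaviour, using a [batch, N] alignment for simplicity (the shapes in training are larger):

import numpy as np
import torch

def attention_entropy(align):
    # Same computation as AttentionEntropyLoss.forward() above.
    entropy = torch.distributions.Categorical(probs=align).entropy()
    return (entropy / np.log(align.shape[1])).mean()

N = 8
uniform = torch.full((1, N), 1.0 / N)                   # maximally soft attention
peaked = torch.tensor([[0.965] + [0.005] * (N - 1)])    # nearly one-hot attention
print(attention_entropy(uniform))  # ~1.0
print(attention_entropy(peaked))   # ~0.1
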
@ -273,7 +273,7 @@ class Decoder(nn.Module):
    def __init__(self, in_features, memory_dim, r, memory_size, attn_windowing,
                 attn_norm, prenet_type, prenet_dropout, forward_attn,
                 trans_agent, forward_attn_mask, location_attn,
                 separate_stopnet):
                 separate_stopnet, speaker_embedding_dim):
        super(Decoder, self).__init__()
        self.r_init = r
        self.r = r

@ -285,8 +285,9 @@ class Decoder(nn.Module):
        self.separate_stopnet = separate_stopnet
        self.query_dim = 256
        # memory -> |Prenet| -> processed_memory
        prenet_dim = memory_dim * self.memory_size + speaker_embedding_dim if self.use_memory_queue else memory_dim + speaker_embedding_dim
        self.prenet = Prenet(
            memory_dim * self.memory_size if self.use_memory_queue else memory_dim,
            prenet_dim,
            prenet_type,
            prenet_dropout,
            out_features=[256, 128])

@ -339,13 +340,13 @@ class Decoder(nn.Module):
        T = inputs.size(1)
        # go frame as zeros matrix
        if self.use_memory_queue:
            self.memory_input = torch.zeros(B, self.memory_dim * self.memory_size, device=inputs.device)
            self.memory_input = torch.zeros(1, device=inputs.device).repeat(B, self.memory_dim * self.memory_size)
        else:
            self.memory_input = torch.zeros(B, self.memory_dim, device=inputs.device)
            self.memory_input = torch.zeros(1, device=inputs.device).repeat(B, self.memory_dim)
        # decoder states
        self.attention_rnn_hidden = torch.zeros(B, 256, device=inputs.device)
        self.attention_rnn_hidden = torch.zeros(1, device=inputs.device).repeat(B, 256)
        self.decoder_rnn_hiddens = [
            torch.zeros(B, 256, device=inputs.device)
            torch.zeros(1, device=inputs.device).repeat(B, 256)
            for idx in range(len(self.decoder_rnns))
        ]
        self.context_vec = inputs.data.new(B, self.in_features).zero_()

@ -405,9 +406,9 @@ class Decoder(nn.Module):
            self.memory_input = new_memory[:, :self.memory_size * self.memory_dim]
        else:
            # use only the last frame prediction
            self.memory_input = new_memory[:, :self.memory_dim]
            self.memory_input = new_memory[:, self.memory_dim * (self.r - 1):]

    def forward(self, inputs, memory, mask):
    def forward(self, inputs, memory, mask, speaker_embeddings=None):
        """
        Args:
            inputs: Encoder outputs.

@ -432,6 +433,8 @@ class Decoder(nn.Module):
            if t > 0:
                new_memory = memory[t - 1]
                self._update_memory_input(new_memory)
            if speaker_embeddings is not None:
                self.memory_input = torch.cat([self.memory_input, speaker_embeddings], dim=-1)
            output, stop_token, attention = self.decode(inputs, mask)
            outputs += [output]
            attentions += [attention]

@ -440,13 +443,15 @@ class Decoder(nn.Module):

        return self._parse_outputs(outputs, attentions, stop_tokens)

    def inference(self, inputs):
    def inference(self, inputs, speaker_embeddings=None):
        """
        Args:
            inputs: Encoder outputs.
            inputs: encoder outputs.
            speaker_embeddings: speaker vectors.

        Shapes:
            - inputs: batch x time x encoder_out_dim
            - speaker_embeddings: batch x embed_dim
        """
        outputs = []
        attentions = []

@ -459,6 +464,8 @@ class Decoder(nn.Module):
            if t > 0:
                new_memory = outputs[-1]
                self._update_memory_input(new_memory)
            if speaker_embeddings is not None:
                self.memory_input = torch.cat([self.memory_input, speaker_embeddings], dim=-1)
            output, stop_token, attention = self.decode(inputs, None)
            stop_token = torch.sigmoid(stop_token.data)
            outputs += [output]

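The new speaker_embedding_dim argument exists so a per-utterance speaker vector can be concatenated onto the prenet input at every decoder step, in both forward() and inference(); that is also why prenet_dim grows by speaker_embedding_dim in __init__. A small shape sketch of that concatenation (sizes are illustrative; 80 matches the proj_speaker_dim used later in models/tacotron.py):

import torch

# memory_input is the fed-back frame memory, speaker_embeddings is one
# projected vector per utterance, repeated at every decoder step.
B, memory_dim, speaker_embedding_dim = 2, 80, 80
memory_input = torch.zeros(B, memory_dim)
speaker_embeddings = torch.zeros(B, speaker_embedding_dim)
prenet_in = torch.cat([memory_input, speaker_embeddings], dim=-1)
print(prenet_in.shape)  # torch.Size([2, 160]) == memory_dim + speaker_embedding_dim
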
@ -10,8 +10,10 @@ class ConvBNBlock(nn.Module):
        super(ConvBNBlock, self).__init__()
        assert (kernel_size - 1) % 2 == 0
        padding = (kernel_size - 1) // 2
        conv1d = nn.Conv1d(
            in_channels, out_channels, kernel_size, padding=padding)
        conv1d = nn.Conv1d(in_channels,
                           out_channels,
                           kernel_size,
                           padding=padding)
        norm = nn.BatchNorm1d(out_channels)
        dropout = nn.Dropout(p=0.5)
        if nonlinear == 'relu':

@ -52,8 +54,7 @@ class Encoder(nn.Module):
            convolutions.append(
                ConvBNBlock(in_features, in_features, 5, 'relu'))
        self.convolutions = nn.Sequential(*convolutions)
        self.lstm = nn.LSTM(
            in_features,
        self.lstm = nn.LSTM(in_features,
                            int(in_features / 2),
                            num_layers=1,
                            batch_first=True,

@ -64,8 +65,9 @@ class Encoder(nn.Module):
        x = self.convolutions(x)
        x = x.transpose(1, 2)
        input_lengths = input_lengths.cpu().numpy()
        x = nn.utils.rnn.pack_padded_sequence(
            x, input_lengths, batch_first=True)
        x = nn.utils.rnn.pack_padded_sequence(x,
                                              input_lengths,
                                              batch_first=True)
        self.lstm.flatten_parameters()
        outputs, _ = self.lstm(x)
        outputs, _ = nn.utils.rnn.pad_packed_sequence(

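Encoder.forward() packs the padded batch before the LSTM so padded frames are skipped, then unpads the result; the lengths are moved to numpy first because pack_padded_sequence wants CPU lengths. A standalone sketch of the same pattern with made-up sizes (this is not the repo's Encoder, just the pack/unpack idiom):

import torch
from torch import nn

lstm = nn.LSTM(16, 8, num_layers=1, batch_first=True, bidirectional=True)
x = torch.randn(3, 10, 16)                 # [batch, max_time, features]
input_lengths = torch.tensor([10, 7, 4])   # sorted descending, as the loader provides
packed = nn.utils.rnn.pack_padded_sequence(x, input_lengths.cpu().numpy(),
                                           batch_first=True)
outputs, _ = lstm(packed)
outputs, _ = nn.utils.rnn.pad_packed_sequence(outputs, batch_first=True)
print(outputs.shape)                       # torch.Size([3, 10, 16]); 8 per direction
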
@ -101,6 +103,7 @@ class Decoder(nn.Module):
|
|||
forward_attn_mask, location_attn, separate_stopnet):
|
||||
super(Decoder, self).__init__()
|
||||
self.mel_channels = inputs_dim
|
||||
self.r_init = r
|
||||
self.r = r
|
||||
self.encoder_embedding_dim = in_features
|
||||
self.separate_stopnet = separate_stopnet
|
||||
|
@ -111,10 +114,11 @@ class Decoder(nn.Module):
|
|||
self.gate_threshold = 0.5
|
||||
self.p_attention_dropout = 0.1
|
||||
self.p_decoder_dropout = 0.1
|
||||
|
||||
self.prenet = Prenet(self.mel_channels * r, prenet_type,
|
||||
self.prenet = Prenet(self.mel_channels,
|
||||
prenet_type,
|
||||
prenet_dropout,
|
||||
[self.prenet_dim, self.prenet_dim], bias=False)
|
||||
[self.prenet_dim, self.prenet_dim],
|
||||
bias=False)
|
||||
|
||||
self.attention_rnn = nn.LSTMCell(self.prenet_dim + in_features,
|
||||
self.query_dim)
|
||||
|
@ -135,51 +139,46 @@ class Decoder(nn.Module):
|
|||
self.decoder_rnn_dim, 1)
|
||||
|
||||
self.linear_projection = Linear(self.decoder_rnn_dim + in_features,
|
||||
self.mel_channels * r)
|
||||
self.mel_channels * self.r_init)
|
||||
|
||||
self.stopnet = nn.Sequential(
|
||||
nn.Dropout(0.1),
|
||||
Linear(
|
||||
self.decoder_rnn_dim + self.mel_channels * r,
|
||||
Linear(self.decoder_rnn_dim + self.mel_channels * self.r_init,
|
||||
1,
|
||||
bias=True,
|
||||
init_gain='sigmoid'))
|
||||
|
||||
self.attention_rnn_init = nn.Embedding(1, self.query_dim)
|
||||
self.go_frame_init = nn.Embedding(1, self.mel_channels * r)
|
||||
self.decoder_rnn_inits = nn.Embedding(1, self.decoder_rnn_dim)
|
||||
self.memory_truncated = None
|
||||
|
||||
def set_r(self, new_r):
|
||||
self.r = new_r
|
||||
|
||||
def get_go_frame(self, inputs):
|
||||
B = inputs.size(0)
|
||||
memory = self.go_frame_init(inputs.data.new_zeros(B).long())
|
||||
memory = torch.zeros(1, device=inputs.device).repeat(B,
|
||||
self.mel_channels * self.r)
|
||||
return memory
|
||||
|
||||
def _init_states(self, inputs, mask, keep_states=False):
|
||||
B = inputs.size(0)
|
||||
# T = inputs.size(1)
|
||||
|
||||
if not keep_states:
|
||||
self.query = self.attention_rnn_init(
|
||||
inputs.data.new_zeros(B).long())
|
||||
self.attention_rnn_cell_state = Variable(
|
||||
inputs.data.new(B, self.query_dim).zero_())
|
||||
|
||||
self.decoder_hidden = self.decoder_rnn_inits(
|
||||
inputs.data.new_zeros(B).long())
|
||||
self.decoder_cell = Variable(
|
||||
inputs.data.new(B, self.decoder_rnn_dim).zero_())
|
||||
|
||||
self.context = Variable(
|
||||
inputs.data.new(B, self.encoder_embedding_dim).zero_())
|
||||
|
||||
self.query = torch.zeros(1, device=inputs.device).repeat(
|
||||
B, self.query_dim)
|
||||
self.attention_rnn_cell_state = torch.zeros(
|
||||
1, device=inputs.device).repeat(B, self.query_dim)
|
||||
self.decoder_hidden = torch.zeros(1, device=inputs.device).repeat(
|
||||
B, self.decoder_rnn_dim)
|
||||
self.decoder_cell = torch.zeros(1, device=inputs.device).repeat(
|
||||
B, self.decoder_rnn_dim)
|
||||
self.context = torch.zeros(1, device=inputs.device).repeat(
|
||||
B, self.encoder_embedding_dim)
|
||||
self.inputs = inputs
|
||||
self.processed_inputs = self.attention.inputs_layer(inputs)
|
||||
self.mask = mask
|
||||
|
||||
def _reshape_memory(self, memories):
|
||||
memories = memories.view(
|
||||
memories.size(0), int(memories.size(1) / self.r), -1)
|
||||
memories = memories.view(memories.size(0),
|
||||
int(memories.size(1) / self.r), -1)
|
||||
memories = memories.transpose(0, 1)
|
||||
return memories
|
||||
|
||||
|
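The Tacotron2 decoder now sizes its output projection and stopnet with r_init while set_r() lets gradual training shrink r on the fly; decode() keeps only the first r frames of each prediction and _update_memory() feeds back just the last of them. A small shape sketch of that slicing (sizes are illustrative):

import torch

mel_channels, r_init, r, B = 80, 7, 5, 2
decoder_output = torch.randn(B, mel_channels * r_init)        # layers always emit r_init frames
decoder_output = decoder_output[:, :r * mel_channels]         # decode(): keep the first r frames
next_memory = decoder_output[:, mel_channels * (r - 1):]      # _update_memory(): last frame only
print(next_memory.shape)  # torch.Size([2, 80])
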
@ -192,14 +191,20 @@ class Decoder(nn.Module):
|
|||
outputs = outputs.transpose(1, 2)
|
||||
return outputs, stop_tokens, alignments
|
||||
|
||||
def _update_memory(self, memory):
|
||||
if len(memory.shape) == 2:
|
||||
return memory[:, self.mel_channels * (self.r - 1):]
|
||||
return memory[:, :, self.mel_channels * (self.r - 1):]
|
||||
|
||||
def decode(self, memory):
|
||||
query_input = torch.cat((memory, self.context), -1)
|
||||
self.query, self.attention_rnn_cell_state = self.attention_rnn(
|
||||
query_input, (self.query, self.attention_rnn_cell_state))
|
||||
self.query = F.dropout(
|
||||
self.query, self.p_attention_dropout, self.training)
|
||||
self.query = F.dropout(self.query, self.p_attention_dropout,
|
||||
self.training)
|
||||
self.attention_rnn_cell_state = F.dropout(
|
||||
self.attention_rnn_cell_state, self.p_attention_dropout, self.training)
|
||||
self.attention_rnn_cell_state, self.p_attention_dropout,
|
||||
self.training)
|
||||
|
||||
self.context = self.attention(self.query, self.inputs,
|
||||
self.processed_inputs, self.mask)
|
||||
|
@ -223,13 +228,14 @@ class Decoder(nn.Module):
|
|||
stop_token = self.stopnet(stopnet_input.detach())
|
||||
else:
|
||||
stop_token = self.stopnet(stopnet_input)
|
||||
decoder_output = decoder_output[:, :self.r * self.mel_channels]
|
||||
return decoder_output, stop_token, self.attention.attention_weights
|
||||
|
||||
def forward(self, inputs, memories, mask):
|
||||
memory = self.get_go_frame(inputs).unsqueeze(0)
|
||||
memories = self._reshape_memory(memories)
|
||||
memories = torch.cat((memory, memories), dim=0)
|
||||
memories = self.prenet(memories)
|
||||
memories = self.prenet(self._update_memory(memories))
|
||||
|
||||
self._init_states(inputs, mask=mask)
|
||||
self.attention.init_states(inputs)
|
||||
|
@ -249,6 +255,8 @@ class Decoder(nn.Module):
|
|||
|
||||
def inference(self, inputs):
|
||||
memory = self.get_go_frame(inputs)
|
||||
memory = self._update_memory(memory)
|
||||
|
||||
self._init_states(inputs, mask=None)
|
||||
|
||||
self.attention.init_win_idx()
|
||||
|
@ -256,7 +264,6 @@ class Decoder(nn.Module):
|
|||
|
||||
outputs, stop_tokens, alignments, t = [], [], [], 0
|
||||
stop_flags = [True, False, False]
|
||||
stop_count = 0
|
||||
while True:
|
||||
memory = self.prenet(memory)
|
||||
mel_output, stop_token, alignment = self.decode(memory)
|
||||
|
@ -270,14 +277,12 @@ class Decoder(nn.Module):
|
|||
and t > inputs.shape[1])
|
||||
stop_flags[2] = t > inputs.shape[1] * 2
|
||||
if all(stop_flags):
|
||||
stop_count += 1
|
||||
if stop_count > 20:
|
||||
break
|
||||
elif len(outputs) == self.max_decoder_steps:
|
||||
if len(outputs) == self.max_decoder_steps:
|
||||
print(" | > Decoder stopped with 'max_decoder_steps")
|
||||
break
|
||||
|
||||
memory = mel_output
|
||||
memory = self._update_memory(mel_output)
|
||||
t += 1
|
||||
|
||||
outputs, stop_tokens, alignments = self._parse_outputs(
|
||||
|
@ -299,7 +304,6 @@ class Decoder(nn.Module):
|
|||
self.attention.init_states(inputs)
|
||||
outputs, stop_tokens, alignments, t = [], [], [], 0
|
||||
stop_flags = [True, False, False]
|
||||
stop_count = 0
|
||||
while True:
|
||||
memory = self.prenet(self.memory_truncated)
|
||||
mel_output, stop_token, alignment = self.decode(memory)
|
||||
|
@ -313,10 +317,8 @@ class Decoder(nn.Module):
|
|||
and t > inputs.shape[1])
|
||||
stop_flags[2] = t > inputs.shape[1] * 2
|
||||
if all(stop_flags):
|
||||
stop_count += 1
|
||||
if stop_count > 20:
|
||||
break
|
||||
elif len(outputs) == self.max_decoder_steps:
|
||||
if len(outputs) == self.max_decoder_steps:
|
||||
print(" | > Decoder stopped with 'max_decoder_steps")
|
||||
break
|
||||
|
||||
|
|
|
@ -1,7 +1,9 @@
|
|||
# coding: utf-8
|
||||
import torch
|
||||
from torch import nn
|
||||
from TTS.layers.tacotron import Encoder, Decoder, PostCBHG
|
||||
from TTS.utils.generic_utils import sequence_mask
|
||||
from TTS.layers.gst_layers import GST
|
||||
|
||||
|
||||
class Tacotron(nn.Module):
|
||||
|
@ -13,6 +15,7 @@ class Tacotron(nn.Module):
|
|||
mel_dim=80,
|
||||
memory_size=5,
|
||||
attn_win=False,
|
||||
gst=False,
|
||||
attn_norm="sigmoid",
|
||||
prenet_type="original",
|
||||
prenet_dropout=True,
|
||||
|
@ -25,55 +28,117 @@ class Tacotron(nn.Module):
|
|||
self.r = r
|
||||
self.mel_dim = mel_dim
|
||||
self.linear_dim = linear_dim
|
||||
self.gst = gst
|
||||
self.num_speakers = num_speakers
|
||||
self.embedding = nn.Embedding(num_chars, 256)
|
||||
self.embedding.weight.data.normal_(0, 0.3)
|
||||
decoder_dim = 512 if num_speakers > 1 else 256
|
||||
encoder_dim = 512 if num_speakers > 1 else 256
|
||||
proj_speaker_dim = 80 if num_speakers > 1 else 0
|
||||
# boilerplate model
|
||||
self.encoder = Encoder(encoder_dim)
|
||||
self.decoder = Decoder(decoder_dim, mel_dim, r, memory_size, attn_win,
|
||||
attn_norm, prenet_type, prenet_dropout,
|
||||
forward_attn, trans_agent, forward_attn_mask,
|
||||
location_attn, separate_stopnet,
|
||||
proj_speaker_dim)
|
||||
self.postnet = PostCBHG(mel_dim)
|
||||
self.last_linear = nn.Linear(self.postnet.cbhg.gru_features * 2,
|
||||
linear_dim)
|
||||
# speaker embedding layers
|
||||
if num_speakers > 1:
|
||||
self.speaker_embedding = nn.Embedding(num_speakers, 256)
|
||||
self.speaker_embedding.weight.data.normal_(0, 0.3)
|
||||
self.encoder = Encoder(256)
|
||||
self.decoder = Decoder(256, mel_dim, r, memory_size, attn_win,
|
||||
attn_norm, prenet_type, prenet_dropout,
|
||||
forward_attn, trans_agent, forward_attn_mask,
|
||||
location_attn, separate_stopnet)
|
||||
self.postnet = PostCBHG(mel_dim)
|
||||
self.last_linear = nn.Linear(self.postnet.cbhg.gru_features * 2, linear_dim)
|
||||
self.speaker_project_mel = nn.Sequential(
|
||||
nn.Linear(256, proj_speaker_dim), nn.Tanh())
|
||||
self.speaker_embeddings = None
|
||||
self.speaker_embeddings_projected = None
|
||||
# global style token layers
|
||||
if self.gst:
|
||||
gst_embedding_dim = 256
|
||||
self.gst_layer = GST(num_mel=80,
|
||||
num_heads=4,
|
||||
num_style_tokens=10,
|
||||
embedding_dim=gst_embedding_dim)
|
||||
|
||||
def _init_states(self):
|
||||
self.speaker_embeddings = None
|
||||
self.speaker_embeddings_projected = None
|
||||
|
||||
def compute_speaker_embedding(self, speaker_ids):
|
||||
if hasattr(self, "speaker_embedding") and speaker_ids is None:
|
||||
raise RuntimeError(
|
||||
" [!] Model has speaker embedding layer but speaker_id is not provided"
|
||||
)
|
||||
if hasattr(self, "speaker_embedding") and speaker_ids is not None:
|
||||
self.speaker_embeddings = self._compute_speaker_embedding(
|
||||
speaker_ids)
|
||||
self.speaker_embeddings_projected = self.speaker_project_mel(
|
||||
self.speaker_embeddings).squeeze(1)
|
||||
|
||||
def compute_gst(self, inputs, mel_specs):
|
||||
gst_outputs = self.gst_layer(mel_specs)
|
||||
inputs = self._add_speaker_embedding(inputs, gst_outputs)
|
||||
return inputs
|
||||
|
||||
def forward(self, characters, text_lengths, mel_specs, speaker_ids=None):
|
||||
B = characters.size(0)
|
||||
mask = sequence_mask(text_lengths).to(characters.device)
|
||||
inputs = self.embedding(characters)
|
||||
self._init_states()
|
||||
self.compute_speaker_embedding(speaker_ids)
|
||||
if self.num_speakers > 1:
|
||||
inputs = self._concat_speaker_embedding(inputs,
|
||||
self.speaker_embeddings)
|
||||
encoder_outputs = self.encoder(inputs)
|
||||
encoder_outputs = self._add_speaker_embedding(encoder_outputs,
|
||||
speaker_ids)
|
||||
if self.gst:
|
||||
encoder_outputs = self.compute_gst(encoder_outputs, mel_specs)
|
||||
if self.num_speakers > 1:
|
||||
encoder_outputs = self._concat_speaker_embedding(
|
||||
encoder_outputs, self.speaker_embeddings)
|
||||
mel_outputs, alignments, stop_tokens = self.decoder(
|
||||
encoder_outputs, mel_specs, mask)
|
||||
encoder_outputs, mel_specs, mask,
|
||||
self.speaker_embeddings_projected)
|
||||
mel_outputs = mel_outputs.view(B, -1, self.mel_dim)
|
||||
linear_outputs = self.postnet(mel_outputs)
|
||||
linear_outputs = self.last_linear(linear_outputs)
|
||||
return mel_outputs, linear_outputs, alignments, stop_tokens
|
||||
|
||||
def inference(self, characters, speaker_ids=None):
|
||||
def inference(self, characters, speaker_ids=None, style_mel=None):
|
||||
B = characters.size(0)
|
||||
inputs = self.embedding(characters)
|
||||
self._init_states()
|
||||
self.compute_speaker_embedding(speaker_ids)
|
||||
if self.num_speakers > 1:
|
||||
inputs = self._concat_speaker_embedding(inputs,
|
||||
self.speaker_embeddings)
|
||||
encoder_outputs = self.encoder(inputs)
|
||||
encoder_outputs = self._add_speaker_embedding(encoder_outputs,
|
||||
speaker_ids)
|
||||
if self.gst and style_mel is not None:
|
||||
encoder_outputs = self.compute_gst(encoder_outputs, style_mel)
|
||||
if self.num_speakers > 1:
|
||||
encoder_outputs = self._concat_speaker_embedding(
|
||||
encoder_outputs, self.speaker_embeddings)
|
||||
mel_outputs, alignments, stop_tokens = self.decoder.inference(
|
||||
encoder_outputs)
|
||||
encoder_outputs, self.speaker_embeddings_projected)
|
||||
mel_outputs = mel_outputs.view(B, -1, self.mel_dim)
|
||||
linear_outputs = self.postnet(mel_outputs)
|
||||
linear_outputs = self.last_linear(linear_outputs)
|
||||
return mel_outputs, linear_outputs, alignments, stop_tokens
|
||||
|
||||
def _add_speaker_embedding(self, encoder_outputs, speaker_ids):
|
||||
if hasattr(self, "speaker_embedding") and speaker_ids is None:
|
||||
raise RuntimeError(" [!] Model has speaker embedding layer but speaker_id is not provided")
|
||||
if hasattr(self, "speaker_embedding") and speaker_ids is not None:
|
||||
def _compute_speaker_embedding(self, speaker_ids):
|
||||
speaker_embeddings = self.speaker_embedding(speaker_ids)
|
||||
return speaker_embeddings.unsqueeze_(1)
|
||||
|
||||
speaker_embeddings.unsqueeze_(1)
|
||||
speaker_embeddings = speaker_embeddings.expand(encoder_outputs.size(0),
|
||||
encoder_outputs.size(1),
|
||||
-1)
|
||||
encoder_outputs = encoder_outputs + speaker_embeddings
|
||||
return encoder_outputs
|
||||
@staticmethod
|
||||
def _add_speaker_embedding(outputs, speaker_embeddings):
|
||||
speaker_embeddings_ = speaker_embeddings.expand(
|
||||
outputs.size(0), outputs.size(1), -1)
|
||||
outputs = outputs + speaker_embeddings_
|
||||
return outputs
|
||||
|
||||
@staticmethod
|
||||
def _concat_speaker_embedding(outputs, speaker_embeddings):
|
||||
speaker_embeddings_ = speaker_embeddings.expand(
|
||||
outputs.size(0), outputs.size(1), -1)
|
||||
outputs = torch.cat([outputs, speaker_embeddings_], dim=-1)
|
||||
return outputs
|
||||
|
|
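Pulling the refactored pieces together, a hedged usage sketch of the multi-speaker Tacotron defined above; the import path and tensor sizes are assumptions for illustration, while the constructor and forward() signatures follow this diff:

import torch
from TTS.models.tacotron import Tacotron  # assumed module path

# Dummy batch: 2 utterances, 50-character inputs, 140 mel frames (divisible by r=7).
model = Tacotron(num_chars=61, num_speakers=4, r=7)
characters = torch.randint(0, 61, (2, 50))
text_lengths = torch.tensor([50, 42])
mel_specs = torch.randn(2, 140, 80)
speaker_ids = torch.tensor([0, 3])
mel_out, linear_out, alignments, stop_tokens = model(
    characters, text_lengths, mel_specs, speaker_ids)
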
|
@ -1,87 +0,0 @@
|
|||
# coding: utf-8
|
||||
from torch import nn
|
||||
from TTS.layers.tacotron import Encoder, Decoder, PostCBHG
|
||||
from TTS.layers.gst_layers import GST
|
||||
from TTS.utils.generic_utils import sequence_mask
|
||||
|
||||
|
||||
class TacotronGST(nn.Module):
|
||||
def __init__(self,
|
||||
num_chars,
|
||||
num_speakers,
|
||||
r=5,
|
||||
linear_dim=1025,
|
||||
mel_dim=80,
|
||||
memory_size=5,
|
||||
attn_win=False,
|
||||
attn_norm="sigmoid",
|
||||
prenet_type="original",
|
||||
prenet_dropout=True,
|
||||
forward_attn=False,
|
||||
trans_agent=False,
|
||||
forward_attn_mask=False,
|
||||
location_attn=True,
|
||||
separate_stopnet=True):
|
||||
super(TacotronGST, self).__init__()
|
||||
self.r = r
|
||||
self.mel_dim = mel_dim
|
||||
self.linear_dim = linear_dim
|
||||
self.embedding = nn.Embedding(num_chars, 256)
|
||||
self.embedding.weight.data.normal_(0, 0.3)
|
||||
if num_speakers > 1:
|
||||
self.speaker_embedding = nn.Embedding(num_speakers, 256)
|
||||
self.speaker_embedding.weight.data.normal_(0, 0.3)
|
||||
self.encoder = Encoder(256)
|
||||
self.gst = GST(num_mel=80, num_heads=4, num_style_tokens=10, embedding_dim=256)
|
||||
self.decoder = Decoder(256, mel_dim, r, memory_size, attn_win,
|
||||
attn_norm, prenet_type, prenet_dropout,
|
||||
forward_attn, trans_agent, forward_attn_mask,
|
||||
location_attn, separate_stopnet)
|
||||
self.postnet = PostCBHG(mel_dim)
|
||||
self.last_linear = nn.Linear(self.postnet.cbhg.gru_features * 2, linear_dim)
|
||||
|
||||
|
||||
def forward(self, characters, text_lengths, mel_specs, speaker_ids=None):
|
||||
B = characters.size(0)
|
||||
mask = sequence_mask(text_lengths).to(characters.device)
|
||||
inputs = self.embedding(characters)
|
||||
encoder_outputs = self.encoder(inputs)
|
||||
encoder_outputs = self._add_speaker_embedding(encoder_outputs,
|
||||
speaker_ids)
|
||||
gst_outputs = self.gst(mel_specs)
|
||||
gst_outputs = gst_outputs.expand(-1, encoder_outputs.size(1), -1)
|
||||
encoder_outputs = encoder_outputs + gst_outputs
|
||||
mel_outputs, alignments, stop_tokens = self.decoder(
|
||||
encoder_outputs, mel_specs, mask)
|
||||
mel_outputs = mel_outputs.view(B, -1, self.mel_dim)
|
||||
linear_outputs = self.postnet(mel_outputs)
|
||||
linear_outputs = self.last_linear(linear_outputs)
|
||||
return mel_outputs, linear_outputs, alignments, stop_tokens
|
||||
|
||||
def inference(self, characters, speaker_ids=None, style_mel=None):
|
||||
B = characters.size(0)
|
||||
inputs = self.embedding(characters)
|
||||
encoder_outputs = self.encoder(inputs)
|
||||
encoder_outputs = self._add_speaker_embedding(encoder_outputs,
|
||||
speaker_ids)
|
||||
if style_mel is not None:
|
||||
gst_outputs = self.gst(style_mel)
|
||||
gst_outputs = gst_outputs.expand(-1, encoder_outputs.size(1), -1)
|
||||
encoder_outputs = encoder_outputs + gst_outputs
|
||||
mel_outputs, alignments, stop_tokens = self.decoder.inference(
|
||||
encoder_outputs)
|
||||
mel_outputs = mel_outputs.view(B, -1, self.mel_dim)
|
||||
linear_outputs = self.postnet(mel_outputs)
|
||||
linear_outputs = self.last_linear(linear_outputs)
|
||||
return mel_outputs, linear_outputs, alignments, stop_tokens
|
||||
|
||||
def _add_speaker_embedding(self, encoder_outputs, speaker_ids):
|
||||
if hasattr(self, "speaker_embedding") and speaker_ids is not None:
|
||||
speaker_embeddings = self.speaker_embedding(speaker_ids)
|
||||
|
||||
speaker_embeddings.unsqueeze_(1)
|
||||
speaker_embeddings = speaker_embeddings.expand(encoder_outputs.size(0),
|
||||
encoder_outputs.size(1),
|
||||
-1)
|
||||
encoder_outputs = encoder_outputs + speaker_embeddings
|
||||
return encoder_outputs
|
|
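With the separate TacotronGST model deleted above, the same behaviour is reached through Tacotron itself: gst=True builds the GST layer and inference() accepts an optional style_mel. A hedged sketch; the import path and the dummy style_mel shape are assumptions, and in practice the reference mel would come from the wav named by "style_wav_for_test" in config.json:

import torch
from TTS.models.tacotron import Tacotron  # assumed module path

model = Tacotron(num_chars=61, num_speakers=1, gst=True)
model.eval()
characters = torch.randint(0, 61, (1, 30))
style_mel = torch.randn(1, 100, 80)   # [batch, frames, num_mels], illustrative only
with torch.no_grad():
    mel_out, linear_out, alignments, stop_tokens = model.inference(
        characters, speaker_ids=None, style_mel=style_mel)
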
@ -19,7 +19,7 @@
|
|||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 1,
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
|
@ -29,28 +29,11 @@
|
|||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 2,
|
||||
"execution_count": null,
|
||||
"metadata": {
|
||||
"scrolled": true
|
||||
},
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"Populating the interactive namespace from numpy and matplotlib\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"name": "stderr",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"/home/erogol/miniconda3/lib/python3.7/site-packages/IPython/core/magics/pylab.py:160: UserWarning: pylab import has clobbered these variables: ['plt']\n",
|
||||
"`%matplotlib` prevents importing * from pylab and numpy\n",
|
||||
" \"\\n`%matplotlib` prevents importing * from pylab and numpy\"\n"
|
||||
]
|
||||
}
|
||||
],
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"%load_ext autoreload\n",
|
||||
"%autoreload 2\n",
|
||||
|
@ -59,6 +42,7 @@
|
|||
"import io\n",
|
||||
"import torch \n",
|
||||
"import time\n",
|
||||
"import json\n",
|
||||
"import numpy as np\n",
|
||||
"from collections import OrderedDict\n",
|
||||
"from matplotlib import pylab as plt\n",
|
||||
|
@ -86,23 +70,25 @@
|
|||
"from IPython.display import Audio\n",
|
||||
"\n",
|
||||
"import os\n",
|
||||
"os.environ['CUDA_VISIBLE_DEVICES']='1'\n",
|
||||
"os.environ['OMP_NUM_THREADS']='1'\n"
|
||||
"os.environ['CUDA_VISIBLE_DEVICES']='1'"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 3,
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"def tts(model, text, CONFIG, use_cuda, ap, use_gl, speaker_id=None, figures=True):\n",
|
||||
"def tts(model, text, CONFIG, use_cuda, ap, use_gl, figures=True):\n",
|
||||
" t_1 = time.time()\n",
|
||||
" waveform, alignment, mel_spec, mel_postnet_spec, stop_tokens = synthesis(model, text, CONFIG, use_cuda, ap, truncated=False, speaker_id=speaker_id, enable_eos_bos_chars=CONFIG.enable_eos_bos_chars)\n",
|
||||
" waveform, alignment, mel_spec, mel_postnet_spec, stop_tokens = synthesis(model, text, CONFIG, use_cuda, ap, speaker_id, False, CONFIG.enable_eos_bos_chars)\n",
|
||||
" if CONFIG.model == \"Tacotron\" and not use_gl:\n",
|
||||
" # coorect the normalization differences b/w TTS and the Vocoder.\n",
|
||||
" mel_postnet_spec = ap.out_linear_to_mel(mel_postnet_spec.T).T\n",
|
||||
" mel_postnet_spec = ap._denormalize(mel_postnet_spec)\n",
|
||||
" mel_postnet_spec = ap_vocoder._normalize(mel_postnet_spec)\n",
|
||||
" if not use_gl:\n",
|
||||
" waveform = wavernn.generate(torch.FloatTensor(mel_postnet_spec.T).unsqueeze(0).cuda(), batched=batched_wavernn, target=11000, overlap=550)\n",
|
||||
" waveform = wavernn.generate(torch.FloatTensor(mel_postnet_spec.T).unsqueeze(0).cuda(), batched=batched_wavernn, target=8000, overlap=400)\n",
|
||||
"\n",
|
||||
" print(\" > Run-time: {}\".format(time.time() - t_1))\n",
|
||||
" if figures: \n",
|
||||
|
@ -117,31 +103,18 @@
|
|||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 9,
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"ename": "FileNotFoundError",
|
||||
"evalue": "[Errno 2] No such file or directory: '/media/erogol/data_ssd/Data/models/wavernn/mozilla/mozilla-May24-4763/config.json'",
|
||||
"output_type": "error",
|
||||
"traceback": [
|
||||
"\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
|
||||
"\u001b[0;31mFileNotFoundError\u001b[0m Traceback (most recent call last)",
|
||||
"\u001b[0;32m<ipython-input-9-3306702a6bbc>\u001b[0m in \u001b[0;36m<module>\u001b[0;34m\u001b[0m\n\u001b[1;32m 7\u001b[0m \u001b[0mVOCODER_MODEL_PATH\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0;34m\"/media/erogol/data_ssd/Data/models/wavernn/mozilla/mozilla-May24-4763/model_checkpoints/best_model.pth.tar\"\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 8\u001b[0m \u001b[0mVOCODER_CONFIG_PATH\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0;34m\"/media/erogol/data_ssd/Data/models/wavernn/mozilla/mozilla-May24-4763/config.json\"\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m----> 9\u001b[0;31m \u001b[0mVOCODER_CONFIG\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mload_config\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mVOCODER_CONFIG_PATH\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 10\u001b[0m \u001b[0muse_cuda\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0;32mFalse\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 11\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n",
|
||||
"\u001b[0;32m~/projects/TTS/tts_namespace/TTS/utils/generic_utils.py\u001b[0m in \u001b[0;36mload_config\u001b[0;34m(config_path)\u001b[0m\n\u001b[1;32m 20\u001b[0m \u001b[0;32mdef\u001b[0m \u001b[0mload_config\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mconfig_path\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 21\u001b[0m \u001b[0mconfig\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mAttrDict\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m---> 22\u001b[0;31m \u001b[0;32mwith\u001b[0m \u001b[0mopen\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mconfig_path\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m\"r\"\u001b[0m\u001b[0;34m)\u001b[0m \u001b[0;32mas\u001b[0m \u001b[0mf\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 23\u001b[0m \u001b[0minput_str\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mf\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mread\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 24\u001b[0m \u001b[0minput_str\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mre\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0msub\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34mr'\\\\\\n'\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m''\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0minput_str\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
|
||||
"\u001b[0;31mFileNotFoundError\u001b[0m: [Errno 2] No such file or directory: '/media/erogol/data_ssd/Data/models/wavernn/mozilla/mozilla-May24-4763/config.json'"
|
||||
]
|
||||
}
|
||||
],
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"# Set constants\n",
|
||||
"ROOT_PATH = '/media/erogol/data_ssd/Models/libri_tts/5049/'\n",
|
||||
"MODEL_PATH = ROOT_PATH + 'best_model.pth.tar'\n",
|
||||
"ROOT_PATH = '/media/erogol/data_ssd/Models/libri_tts/5099/'\n",
|
||||
"MODEL_PATH = ROOT_PATH + '/best_model.pth.tar'\n",
|
||||
"CONFIG_PATH = ROOT_PATH + '/config.json'\n",
|
||||
"OUT_FOLDER = \"/home/erogol/Dropbox/AudioSamples/benchmark_samples/\"\n",
|
||||
"OUT_FOLDER = '/home/erogol/Dropbox/AudioSamples/benchmark_samples/'\n",
|
||||
"CONFIG = load_config(CONFIG_PATH)\n",
|
||||
"VOCODER_MODEL_PATH = \"/media/erogol/data_ssd/Data/models/wavernn/mozilla/mozilla-May24-4763/model_checkpoints/best_model.pth.tar\"\n",
|
||||
"VOCODER_CONFIG_PATH = \"/media/erogol/data_ssd/Data/models/wavernn/mozilla/mozilla-May24-4763/config.json\"\n",
|
||||
"VOCODER_MODEL_PATH = \"/media/erogol/data_ssd/Models/wavernn/ljspeech/mold_ljspeech_best_model/checkpoint_433000.pth.tar\"\n",
|
||||
"VOCODER_CONFIG_PATH = \"/media/erogol/data_ssd/Models/wavernn/ljspeech/mold_ljspeech_best_model/config.json\"\n",
|
||||
"VOCODER_CONFIG = load_config(VOCODER_CONFIG_PATH)\n",
|
||||
"use_cuda = False\n",
|
||||
"\n",
|
||||
|
@ -149,10 +122,12 @@
|
|||
"# CONFIG.windowing = False\n",
|
||||
"# CONFIG.prenet_dropout = False\n",
|
||||
"# CONFIG.separate_stopnet = True\n",
|
||||
"CONFIG.use_forward_attn = True\n",
|
||||
"# CONFIG.forward_attn_mask = True\n",
|
||||
"# CONFIG.stopnet = True\n",
|
||||
"\n",
|
||||
"# Set the vocoder\n",
|
||||
"use_gl = True # use GL if True\n",
|
||||
"use_gl = False # use GL if True\n",
|
||||
"batched_wavernn = True # use batched wavernn inference if True"
|
||||
]
|
||||
},
|
||||
|
@ -165,9 +140,17 @@
|
|||
"# LOAD TTS MODEL\n",
|
||||
"from utils.text.symbols import symbols, phonemes\n",
|
||||
"\n",
|
||||
"# multi speaker \n",
|
||||
"if CONFIG.use_speaker_embedding:\n",
|
||||
" speakers = json.load(open(f\"{ROOT_PATH}/speakers.json\", 'r'))\n",
|
||||
" speakers_idx_to_id = {v: k for k, v in speakers.items()}\n",
|
||||
"else:\n",
|
||||
" speakers = []\n",
|
||||
" speaker_id = None\n",
|
||||
"\n",
|
||||
"# load the model\n",
|
||||
"num_chars = len(phonemes) if CONFIG.use_phonemes else len(symbols)\n",
|
||||
"model = setup_model(num_chars, CONFIG)\n",
|
||||
"model = setup_model(num_chars, len(speakers), CONFIG)\n",
|
||||
"\n",
|
||||
"# load the audio processor\n",
|
||||
"ap = AudioProcessor(**CONFIG.audio) \n",
|
||||
|
@ -184,7 +167,12 @@
|
|||
"if use_cuda:\n",
|
||||
" model.cuda()\n",
|
||||
"model.eval()\n",
|
||||
"print(cp['step'])"
|
||||
"print(cp['step'])\n",
|
||||
"print(cp['r'])\n",
|
||||
"\n",
|
||||
"# set model stepsize\n",
|
||||
"if 'r' in cp:\n",
|
||||
" model.decoder.set_r(cp['r'])"
|
||||
]
|
||||
},
|
||||
{
|
||||
|
@ -196,25 +184,28 @@
|
|||
"# LOAD WAVERNN\n",
|
||||
"if use_gl == False:\n",
|
||||
" from WaveRNN.models.wavernn import Model\n",
|
||||
" from WaveRNN.utils.audio import AudioProcessor as AudioProcessorVocoder\n",
|
||||
" bits = 10\n",
|
||||
"\n",
|
||||
" ap_vocoder = AudioProcessorVocoder(**VOCODER_CONFIG.audio) \n",
|
||||
" wavernn = Model(\n",
|
||||
" rnn_dims=512,\n",
|
||||
" fc_dims=512,\n",
|
||||
" mode=\"mold\",\n",
|
||||
" pad=2,\n",
|
||||
" upsample_factors=VOCODER_CONFIG.upsample_factors, # set this depending on dataset\n",
|
||||
" mode=VOCODER_CONFIG.mode,\n",
|
||||
" mulaw=VOCODER_CONFIG.mulaw,\n",
|
||||
" pad=VOCODER_CONFIG.pad,\n",
|
||||
" upsample_factors=VOCODER_CONFIG.upsample_factors,\n",
|
||||
" feat_dims=VOCODER_CONFIG.audio[\"num_mels\"],\n",
|
||||
" compute_dims=128,\n",
|
||||
" res_out_dims=128,\n",
|
||||
" res_blocks=10,\n",
|
||||
" hop_length=ap.hop_length,\n",
|
||||
" sample_rate=ap.sample_rate,\n",
|
||||
" hop_length=ap_vocoder.hop_length,\n",
|
||||
" sample_rate=ap_vocoder.sample_rate,\n",
|
||||
" use_upsample_net = True,\n",
|
||||
" use_aux_net = True\n",
|
||||
" ).cuda()\n",
|
||||
"\n",
|
||||
"\n",
|
||||
" check = torch.load(VOCODER_MODEL_PATH)\n",
|
||||
" wavernn.load_state_dict(check['model'])\n",
|
||||
" wavernn.load_state_dict(check['model'], strict=False)\n",
|
||||
" if use_cuda:\n",
|
||||
" wavernn.cuda()\n",
|
||||
" wavernn.eval();\n",
|
||||
|
@ -230,111 +221,67 @@
|
|||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 5,
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"ename": "NameError",
|
||||
"evalue": "name 'model' is not defined",
|
||||
"output_type": "error",
|
||||
"traceback": [
|
||||
"\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
|
||||
"\u001b[0;31mNameError\u001b[0m Traceback (most recent call last)",
|
||||
"\u001b[0;32m<ipython-input-5-e285d5bde9fb>\u001b[0m in \u001b[0;36m<module>\u001b[0;34m\u001b[0m\n\u001b[0;32m----> 1\u001b[0;31m \u001b[0mmodel\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0meval\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 2\u001b[0m \u001b[0mmodel\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mdecoder\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mmax_decoder_steps\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0;36m2000\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 3\u001b[0m \u001b[0mspeaker_id\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0;36m0\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 4\u001b[0m \u001b[0msentence\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0;34m\"Bill got in the habit of asking himself “Is that thought true?” And if he wasn’t absolutely certain it was, he just let it go.\"\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 5\u001b[0m \u001b[0malign\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mspec\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mstop_tokens\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mwav\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mtts\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mmodel\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0msentence\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mCONFIG\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0muse_cuda\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0map\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mspeaker_id\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mspeaker_id\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0muse_gl\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0muse_gl\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mfigures\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;32mTrue\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
|
||||
"\u001b[0;31mNameError\u001b[0m: name 'model' is not defined"
|
||||
]
|
||||
}
|
||||
],
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"model.eval()\n",
|
||||
"model.decoder.max_decoder_steps = 2000\n",
|
||||
"speaker_id = 0\n",
|
||||
"sentence = \"Bill got in the habit of asking himself “Is that thought true?” And if he wasn’t absolutely certain it was, he just let it go.\"\n",
|
||||
"align, spec, stop_tokens, wav = tts(model, sentence, CONFIG, use_cuda, ap, speaker_id=speaker_id, use_gl=use_gl, figures=True)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 6,
|
||||
"metadata": {
|
||||
"scrolled": true
|
||||
},
|
||||
"outputs": [
|
||||
{
|
||||
"ename": "NameError",
|
||||
"evalue": "name 'model' is not defined",
|
||||
"output_type": "error",
|
||||
"traceback": [
|
||||
"\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
|
||||
"\u001b[0;31mNameError\u001b[0m Traceback (most recent call last)",
|
||||
"\u001b[0;32m<ipython-input-6-621056ffa667>\u001b[0m in \u001b[0;36m<module>\u001b[0;34m\u001b[0m\n\u001b[1;32m 1\u001b[0m \u001b[0msentence\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0;34m\"Be a voice, not an echo.\"\u001b[0m \u001b[0;31m# 'echo' is not in training set.\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m----> 2\u001b[0;31m \u001b[0malign\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mspec\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mstop_tokens\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mwav\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mtts\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mmodel\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0msentence\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mCONFIG\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0muse_cuda\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0map\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mspeaker_id\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mspeaker_id\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0muse_gl\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0muse_gl\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mfigures\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;32mTrue\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m",
|
||||
"\u001b[0;31mNameError\u001b[0m: name 'model' is not defined"
|
||||
]
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"sentence = \"Be a voice, not an echo.\" # 'echo' is not in training set. \n",
|
||||
"align, spec, stop_tokens, wav = tts(model, sentence, CONFIG, use_cuda, ap, speaker_id=speaker_id, use_gl=use_gl, figures=True)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 7,
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"ename": "NameError",
|
||||
"evalue": "name 'model' is not defined",
|
||||
"output_type": "error",
|
||||
"traceback": [
|
||||
"\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
|
||||
"\u001b[0;31mNameError\u001b[0m Traceback (most recent call last)",
|
||||
"\u001b[0;32m<ipython-input-7-26967668a1a1>\u001b[0m in \u001b[0;36m<module>\u001b[0;34m\u001b[0m\n\u001b[1;32m 1\u001b[0m \u001b[0msentence\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0;34m\"The human voice is the most perfect instrument of all.\"\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m----> 2\u001b[0;31m \u001b[0malign\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mspec\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mstop_tokens\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mwav\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mtts\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mmodel\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0msentence\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mCONFIG\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0muse_cuda\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0map\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mspeaker_id\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mspeaker_id\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0muse_gl\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0muse_gl\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mfigures\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;32mTrue\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m",
|
||||
"\u001b[0;31mNameError\u001b[0m: name 'model' is not defined"
|
||||
]
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"sentence = \"The human voice is the most perfect instrument of all.\"\n",
|
||||
"align, spec, stop_tokens, wav = tts(model, sentence, CONFIG, use_cuda, ap, speaker_id=speaker_id, use_gl=use_gl, figures=True)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 8,
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"ename": "NameError",
|
||||
"evalue": "name 'model' is not defined",
|
||||
"output_type": "error",
|
||||
"traceback": [
|
||||
"\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
|
||||
"\u001b[0;31mNameError\u001b[0m Traceback (most recent call last)",
|
||||
"\u001b[0;32m<ipython-input-8-28cb5023e353>\u001b[0m in \u001b[0;36m<module>\u001b[0;34m\u001b[0m\n\u001b[1;32m 1\u001b[0m \u001b[0msentence\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0;34m\"I'm sorry Dave. I'm afraid I can't do that.\"\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m----> 2\u001b[0;31m \u001b[0malign\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mspec\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mstop_tokens\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mwav\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mtts\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mmodel\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0msentence\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mCONFIG\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0muse_cuda\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0map\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mspeaker_id\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mspeaker_id\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0muse_gl\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0muse_gl\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mfigures\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;32mTrue\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m",
|
||||
"\u001b[0;31mNameError\u001b[0m: name 'model' is not defined"
|
||||
]
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"sentence = \"I'm sorry Dave. I'm afraid I can't do that.\"\n",
|
||||
"align, spec, stop_tokens, wav = tts(model, sentence, CONFIG, use_cuda, ap, speaker_id=speaker_id, use_gl=use_gl, figures=True)"
|
||||
"speaker_id = None\n",
|
||||
"sentence = \"Bill got in the habit of asking himself “Is that thought true?” and if he wasn’t absolutely certain it was, he just let it go.\"\n",
|
||||
"align, spec, stop_tokens, wav = tts(model, sentence, CONFIG, use_cuda, ap, use_gl=use_gl, figures=True)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {
|
||||
"collapsed": true,
|
||||
"jupyter": {
|
||||
"outputs_hidden": true
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"model.eval()\n",
|
||||
"model.decoder.max_decoder_steps = 2000\n",
|
||||
"sentence = \"Bill got in the habit of asking himself “Is that thought true?” and if he wasn’t absolutely certain it was, he just let it go.\"\n",
|
||||
"align, spec, stop_tokens, wav = tts(model, sentence, CONFIG, use_cuda, ap, use_gl=use_gl, figures=True)"
|
||||
]
|
||||
},
|
||||
"scrolled": true
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"sentence = \"Be a voice, not an echo.\" # 'echo' is not in training set. \n",
|
||||
"align, spec, stop_tokens, wav = tts(model, sentence, CONFIG, use_cuda, ap, use_gl=use_gl, figures=True)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"sentence = \"The human voice is the most perfect instrument of all.\"\n",
|
||||
"align, spec, stop_tokens, wav = tts(model, sentence, CONFIG, use_cuda, ap, use_gl=use_gl, figures=True)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"sentence = \"I'm sorry Dave. I'm afraid I can't do that.\"\n",
|
||||
"align, spec, stop_tokens, wav = tts(model, sentence, CONFIG, use_cuda, ap, use_gl=use_gl, figures=True)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"sentence = \"This cake is great. It's so delicious and moist.\"\n",
|
||||
"align, spec, stop_tokens, wav = tts(model, sentence, CONFIG, use_cuda, ap, speaker_id=speaker_id, use_gl=use_gl, figures=True)"
|
||||
"align, spec, stop_tokens, wav = tts(model, sentence, CONFIG, use_cuda, ap, use_gl=use_gl, figures=True)"
|
||||
]
|
||||
},
|
||||
{
|
||||
|
@ -347,76 +294,51 @@
|
|||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {
|
||||
"collapsed": true,
|
||||
"jupyter": {
|
||||
"outputs_hidden": true
|
||||
}
|
||||
},
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"sentence = \"Generative adversarial network or variational auto-encoder.\"\n",
|
||||
"align, spec, stop_tokens, wav = tts(model, sentence, CONFIG, use_cuda, ap, speaker_id=speaker_id, use_gl=use_gl, figures=True)"
|
||||
"align, spec, stop_tokens, wav = tts(model, sentence, CONFIG, use_cuda, ap, use_gl=use_gl, figures=True)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {
|
||||
"collapsed": true,
|
||||
"jupyter": {
|
||||
"outputs_hidden": true
|
||||
}
|
||||
},
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"sentence = \"Scientists at the CERN laboratory say they have discovered a new particle.\"\n",
|
||||
"align, spec, stop_tokens, wav = tts(model, sentence, CONFIG, use_cuda, ap, speaker_id=speaker_id, use_gl=use_gl, figures=True)"
|
||||
"align, spec, stop_tokens, wav = tts(model, sentence, CONFIG, use_cuda, ap, use_gl=use_gl, figures=True)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {
|
||||
"collapsed": true,
|
||||
"jupyter": {
|
||||
"outputs_hidden": true
|
||||
}
|
||||
},
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"sentence = \"Here’s a way to measure the acute emotional intelligence that has never gone out of style.\"\n",
|
||||
"align, spec, stop_tokens, wav = tts(model, sentence, CONFIG, use_cuda, ap, speaker_id=speaker_id, use_gl=use_gl, figures=True)"
|
||||
"align, spec, stop_tokens, wav = tts(model, sentence, CONFIG, use_cuda, ap, use_gl=use_gl, figures=True)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {
|
||||
"collapsed": true,
|
||||
"jupyter": {
|
||||
"outputs_hidden": true
|
||||
}
|
||||
},
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"sentence = \"President Trump met with other leaders at the Group of 20 conference.\"\n",
|
||||
"align, spec, stop_tokens, wav = tts(model, sentence, CONFIG, use_cuda, ap, speaker_id=speaker_id, use_gl=use_gl, figures=True)"
|
||||
"align, spec, stop_tokens, wav = tts(model, sentence, CONFIG, use_cuda, ap, use_gl=use_gl, figures=True)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {
|
||||
"collapsed": true,
|
||||
"jupyter": {
|
||||
"outputs_hidden": true
|
||||
}
|
||||
},
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"sentence = \"The buses aren't the problem, they actually provide a solution.\"\n",
|
||||
"align, spec, stop_tokens, wav = tts(model, sentence, CONFIG, use_cuda, ap, speaker_id=speaker_id, use_gl=use_gl, figures=True)"
|
||||
"align, spec, stop_tokens, wav = tts(model, sentence, CONFIG, use_cuda, ap, use_gl=use_gl, figures=True)"
|
||||
]
|
||||
},
|
||||
{
|
||||
|
@ -429,136 +351,91 @@
|
|||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {
|
||||
"collapsed": true,
|
||||
"jupyter": {
|
||||
"outputs_hidden": true
|
||||
}
|
||||
},
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"sentence = \"Generative adversarial network or variational auto-encoder.\"\n",
|
||||
"align, spec, stop_tokens, wav = tts(model, sentence, CONFIG, use_cuda, ap, speaker_id=speaker_id, use_gl=use_gl, figures=True)"
|
||||
"align, spec, stop_tokens, wav = tts(model, sentence, CONFIG, use_cuda, ap, use_gl=use_gl, figures=True)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {
|
||||
"collapsed": true,
|
||||
"jupyter": {
|
||||
"outputs_hidden": true
|
||||
}
|
||||
},
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"sentence = \"Basilar membrane and otolaryngology are not auto-correlations.\"\n",
|
||||
"align, spec, stop_tokens, wav = tts(model, sentence, CONFIG, use_cuda, ap, speaker_id=speaker_id, use_gl=use_gl, figures=True)"
|
||||
"align, spec, stop_tokens, wav = tts(model, sentence, CONFIG, use_cuda, ap, use_gl=use_gl, figures=True)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {
|
||||
"collapsed": true,
|
||||
"jupyter": {
|
||||
"outputs_hidden": true
|
||||
}
|
||||
},
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"sentence = \" He has read the whole thing.\"\n",
|
||||
"align, spec, stop_tokens, wav = tts(model, sentence, CONFIG, use_cuda, ap, speaker_id=speaker_id, use_gl=use_gl, figures=True)"
|
||||
"align, spec, stop_tokens, wav = tts(model, sentence, CONFIG, use_cuda, ap, use_gl=use_gl, figures=True)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {
|
||||
"collapsed": true,
|
||||
"jupyter": {
|
||||
"outputs_hidden": true
|
||||
}
|
||||
},
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"sentence = \"He reads books.\"\n",
|
||||
"align, spec, stop_tokens, wav = tts(model, sentence, CONFIG, use_cuda, ap, speaker_id=speaker_id, use_gl=use_gl, figures=True)"
|
||||
"align, spec, stop_tokens, wav = tts(model, sentence, CONFIG, use_cuda, ap, use_gl=use_gl, figures=True)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {
|
||||
"collapsed": true,
|
||||
"jupyter": {
|
||||
"outputs_hidden": true
|
||||
}
|
||||
},
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"sentence = \"Thisss isrealy awhsome.\"\n",
|
||||
"align, spec, stop_tokens, wav = tts(model, sentence, CONFIG, use_cuda, ap, speaker_id=speaker_id, use_gl=use_gl, figures=True)"
|
||||
"align, spec, stop_tokens, wav = tts(model, sentence, CONFIG, use_cuda, ap, use_gl=use_gl, figures=True)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {
|
||||
"collapsed": true,
|
||||
"jupyter": {
|
||||
"outputs_hidden": true
|
||||
}
|
||||
},
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"sentence = \"This is your internet browser, Firefox.\"\n",
|
||||
"align, spec, stop_tokens, wav = tts(model, sentence, CONFIG, use_cuda, ap, speaker_id=speaker_id, use_gl=use_gl, figures=True)"
|
||||
"align, spec, stop_tokens, wav = tts(model, sentence, CONFIG, use_cuda, ap, use_gl=use_gl, figures=True)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {
|
||||
"collapsed": true,
|
||||
"jupyter": {
|
||||
"outputs_hidden": true
|
||||
}
|
||||
},
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"sentence = \"This is your internet browser Firefox.\"\n",
|
||||
"align, spec, stop_tokens, wav = tts(model, sentence, CONFIG, use_cuda, ap, speaker_id=speaker_id, use_gl=use_gl, figures=True)"
|
||||
"align, spec, stop_tokens, wav = tts(model, sentence, CONFIG, use_cuda, ap, use_gl=use_gl, figures=True)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {
|
||||
"collapsed": true,
|
||||
"jupyter": {
|
||||
"outputs_hidden": true
|
||||
}
|
||||
},
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"sentence = \"The quick brown fox jumps over the lazy dog.\"\n",
|
||||
"align, spec, stop_tokens, wav = tts(model, sentence, CONFIG, use_cuda, ap, speaker_id=speaker_id, use_gl=use_gl, figures=True)"
|
||||
"align, spec, stop_tokens, wav = tts(model, sentence, CONFIG, use_cuda, ap, use_gl=use_gl, figures=True)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {
|
||||
"collapsed": true,
|
||||
"jupyter": {
|
||||
"outputs_hidden": true
|
||||
}
|
||||
},
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"sentence = \"Does the quick brown fox jump over the lazy dog?\"\n",
|
||||
"align, spec, stop_tokens, wav = tts(model, sentence, CONFIG, use_cuda, ap, speaker_id=speaker_id, use_gl=use_gl, figures=True)"
|
||||
"align, spec, stop_tokens, wav = tts(model, sentence, CONFIG, use_cuda, ap, use_gl=use_gl, figures=True)"
|
||||
]
|
||||
},
|
||||
{
|
||||
|
@ -568,7 +445,7 @@
|
|||
"outputs": [],
|
||||
"source": [
|
||||
"sentence = \"Eren, how are you?\"\n",
|
||||
"align, spec, stop_tokens, wav = tts(model, sentence, CONFIG, use_cuda, ap, speaker_id=speaker_id, use_gl=use_gl, figures=True)"
|
||||
"align, spec, stop_tokens, wav = tts(model, sentence, CONFIG, use_cuda, ap, use_gl=use_gl, figures=True)"
|
||||
]
|
||||
},
|
||||
{
|
||||
|
@ -581,107 +458,62 @@
|
|||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {
|
||||
"collapsed": true,
|
||||
"jupyter": {
|
||||
"outputs_hidden": true
|
||||
}
|
||||
},
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"sentence = \"Encouraged, he started with a minute a day.\"\n",
|
||||
"align, spec, stop_tokens, wav = tts(model, sentence, CONFIG, use_cuda, ap, speaker_id=speaker_id, use_gl=use_gl, figures=True)"
|
||||
"align, spec, stop_tokens, wav = tts(model, sentence, CONFIG, use_cuda, ap, use_gl=use_gl, figures=True)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {
|
||||
"collapsed": true,
|
||||
"jupyter": {
|
||||
"outputs_hidden": true
|
||||
}
|
||||
},
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"sentence = \"His meditation consisted of “body scanning” which involved focusing his mind and energy on each section of the body from head to toe .\"\n",
|
||||
"align, spec, stop_tokens, wav = tts(model, sentence, CONFIG, use_cuda, ap, speaker_id=speaker_id, use_gl=use_gl, figures=True)"
|
||||
"align, spec, stop_tokens, wav = tts(model, sentence, CONFIG, use_cuda, ap, use_gl=use_gl, figures=True)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {
|
||||
"collapsed": true,
|
||||
"jupyter": {
|
||||
"outputs_hidden": true
|
||||
}
|
||||
},
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"sentence = \"Recent research at Harvard has shown meditating for as little as 8 weeks can actually increase the grey matter in the parts of the brain responsible for emotional regulation and learning . \"\n",
|
||||
"align, spec, stop_tokens, wav = tts(model, sentence, CONFIG, use_cuda, ap, speaker_id=speaker_id, use_gl=use_gl, figures=True)"
|
||||
"align, spec, stop_tokens, wav = tts(model, sentence, CONFIG, use_cuda, ap, use_gl=use_gl, figures=True)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {
|
||||
"collapsed": true,
|
||||
"jupyter": {
|
||||
"outputs_hidden": true
|
||||
}
|
||||
},
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"sentence = \"If he decided to watch TV he really watched it.\"\n",
|
||||
"align, spec, stop_tokens, wav = tts(model, sentence, CONFIG, use_cuda, ap, speaker_id=speaker_id, use_gl=use_gl, figures=True)"
|
||||
"align, spec, stop_tokens, wav = tts(model, sentence, CONFIG, use_cuda, ap, use_gl=use_gl, figures=True)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {
|
||||
"collapsed": true,
|
||||
"jupyter": {
|
||||
"outputs_hidden": true
|
||||
},
|
||||
"scrolled": true
|
||||
},
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"sentence = \"Often we try to bring about change through sheer effort and we put all of our energy into a new initiative .\"\n",
|
||||
"align, spec, stop_tokens, wav = tts(model, sentence, CONFIG, use_cuda, ap, speaker_id=speaker_id, use_gl=use_gl, figures=True)"
|
||||
"align, spec, stop_tokens, wav = tts(model, sentence, CONFIG, use_cuda, ap, use_gl=use_gl, figures=True)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {
|
||||
"collapsed": true,
|
||||
"jupyter": {
|
||||
"outputs_hidden": true
|
||||
}
|
||||
},
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"# for twb dataset\n",
|
||||
"sentence = \"In our preparation for Easter, God in his providence offers us each year the season of Lent as a sacramental sign of our conversion.\"\n",
|
||||
"align, spec, stop_tokens, wav = tts(model, sentence, CONFIG, use_cuda, ap, speaker_id=speaker_id, use_gl=use_gl, figures=True)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {
|
||||
"collapsed": true,
|
||||
"jupyter": {
|
||||
"outputs_hidden": true
|
||||
}
|
||||
},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"# !zip benchmark_samples/samples.zip benchmark_samples/*"
|
||||
"align, spec, stop_tokens, wav = tts(model, sentence, CONFIG, use_cuda, ap, use_gl=use_gl, figures=True)"
|
||||
]
|
||||
}
|
||||
],
|
||||
|
|
|
@ -2,6 +2,7 @@ import os
|
|||
import time
|
||||
import argparse
|
||||
import torch
|
||||
import json
|
||||
import string
|
||||
|
||||
from TTS.utils.synthesis import synthesis
|
||||
|
@ -16,22 +17,28 @@ def tts(model,
|
|||
VC,
|
||||
text,
|
||||
ap,
|
||||
ap_vocoder,
|
||||
use_cuda,
|
||||
batched_vocoder,
|
||||
speaker_id=None,
|
||||
figures=False):
|
||||
t_1 = time.time()
|
||||
use_vocoder_model = vocoder_model is not None
|
||||
waveform, alignment, decoder_outputs, postnet_output, stop_tokens = synthesis(
|
||||
model, text, C, use_cuda, ap, False, C.enable_eos_bos_chars)
|
||||
waveform, alignment, _, postnet_output, stop_tokens = synthesis(
|
||||
model, text, C, use_cuda, ap, speaker_id, False,
|
||||
C.enable_eos_bos_chars)
|
||||
if C.model == "Tacotron" and use_vocoder_model:
|
||||
postnet_output = ap.out_linear_to_mel(postnet_output.T).T
|
||||
# correct for any scale difference between the two models
|
||||
postnet_output = ap._denormalize(postnet_output)
|
||||
postnet_output = ap_vocoder._normalize(postnet_output)
|
||||
if use_vocoder_model:
|
||||
vocoder_input = torch.FloatTensor(postnet_output.T).unsqueeze(0)
|
||||
waveform = vocoder_model.generate(
|
||||
vocoder_input.cuda() if use_cuda else vocoder_input,
|
||||
batched=batched_vocoder,
|
||||
target=11000,
|
||||
overlap=550)
|
||||
target=8000,
|
||||
overlap=400)
|
||||
print(" > Run-time: {}".format(time.time() - t_1))
|
||||
return alignment, postnet_output, stop_tokens, waveform
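For reference, a minimal usage sketch of this command-line tts helper follows. It assumes a trained model, its config C, and an AudioProcessor ap have already been loaded as in the __main__ block further down; passing None for the vocoder arguments falls back to Griffin-Lim.
# Minimal usage sketch (assumption: `model`, `C` and `ap` are already loaded
# as in the __main__ block below; no neural vocoder, so Griffin-Lim is used).
align, spec, stop_tokens, wav = tts(
    model,            # trained Tacotron model
    None,             # vocoder_model=None -> use Griffin-Lim
    C,                # TTS config
    None,             # vocoder config (unused without a vocoder)
    "Hello world.",   # text to synthesize
    ap,               # AudioProcessor built from C.audio
    None,             # vocoder AudioProcessor (unused)
    False,            # use_cuda
    False)            # batched_vocoder
ap.save_wav(wav, "hello_world.wav")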
|
||||
|
||||
|
@ -39,13 +46,10 @@ def tts(model,
|
|||
if __name__ == "__main__":
|
||||
|
||||
parser = argparse.ArgumentParser()
|
||||
parser.add_argument(
|
||||
'text', type=str, help='Text to generate speech.')
|
||||
parser.add_argument(
|
||||
'config_path',
|
||||
parser.add_argument('text', type=str, help='Text to generate speech.')
|
||||
parser.add_argument('config_path',
|
||||
type=str,
|
||||
help='Path to model config file.'
|
||||
)
|
||||
help='Path to model config file.')
|
||||
parser.add_argument(
|
||||
'model_path',
|
||||
type=str,
|
||||
|
@ -56,8 +60,10 @@ if __name__ == "__main__":
|
|||
type=str,
|
||||
help='Path to save final wav file.',
|
||||
)
|
||||
parser.add_argument(
|
||||
'--use_cuda', type=bool, help='Run model on CUDA.', default=False)
|
||||
parser.add_argument('--use_cuda',
|
||||
type=bool,
|
||||
help='Run model on CUDA.',
|
||||
default=False)
|
||||
parser.add_argument(
|
||||
'--vocoder_path',
|
||||
type=str,
|
||||
|
@ -65,8 +71,7 @@ if __name__ == "__main__":
|
|||
'Path to vocoder model file. If it is not defined, the model uses Griffin-Lim as the vocoder. Please make sure the vocoder library (WaveRNN) is installed beforehand.',
|
||||
default="",
|
||||
)
|
||||
parser.add_argument(
|
||||
'--vocoder_config_path',
|
||||
parser.add_argument('--vocoder_config_path',
|
||||
type=str,
|
||||
help='Path to vocoder model config file.',
|
||||
default="")
|
||||
|
@ -75,12 +80,15 @@ if __name__ == "__main__":
|
|||
type=bool,
|
||||
help="If True, vocoder model uses faster batch processing.",
|
||||
default=True)
|
||||
parser.add_argument(
|
||||
'--speakers_json',
|
||||
parser.add_argument('--speakers_json',
|
||||
type=str,
|
||||
help="JSON file for multi-speaker model.",
|
||||
default=""
|
||||
)
|
||||
default="")
|
||||
parser.add_argument(
|
||||
'--speaker_id',
|
||||
type=int,
|
||||
help="target speaker_id if the model is multi-speaker.",
|
||||
default=None)
|
||||
args = parser.parse_args()
|
||||
|
||||
if args.vocoder_path != "":
|
||||
|
@ -109,13 +117,14 @@ if __name__ == "__main__":
|
|||
model.eval()
|
||||
if args.use_cuda:
|
||||
model.cuda()
|
||||
model.decoder.set_r(cp['r'])
|
||||
|
||||
# load vocoder model
|
||||
if args.vocoder_path != "":
|
||||
VC = load_config(args.vocoder_config_path)
|
||||
ap_vocoder = AudioProcessor(**VC.audio)
|
||||
bits = 10
|
||||
vocoder_model = VocoderModel(
|
||||
rnn_dims=512,
|
||||
vocoder_model = VocoderModel(rnn_dims=512,
|
||||
fc_dims=512,
|
||||
mode=VC.mode,
|
||||
mulaw=VC.mulaw,
|
||||
|
@ -127,7 +136,8 @@ if __name__ == "__main__":
|
|||
res_blocks=10,
|
||||
hop_length=ap.hop_length,
|
||||
sample_rate=ap.sample_rate,
|
||||
)
|
||||
use_aux_net=True,
|
||||
use_upsample_net=True)
|
||||
|
||||
check = torch.load(args.vocoder_path)
|
||||
vocoder_model.load_state_dict(check['model'])
|
||||
|
@ -137,23 +147,26 @@ if __name__ == "__main__":
|
|||
else:
|
||||
vocoder_model = None
|
||||
VC = None
|
||||
ap_vocoder = None
|
||||
|
||||
# synthesize voice
|
||||
print(" > Text: {}".format(args.text))
|
||||
_, _, _, wav = tts(
|
||||
model,
|
||||
_, _, _, wav = tts(model,
|
||||
vocoder_model,
|
||||
C,
|
||||
VC,
|
||||
args.text,
|
||||
ap,
|
||||
ap_vocoder,
|
||||
args.use_cuda,
|
||||
args.batched_vocoder,
|
||||
speaker_id=args.speaker_id,
|
||||
figures=False)
|
||||
|
||||
# save the results
|
||||
file_name = args.text.replace(" ", "_")
|
||||
file_name = file_name.translate(str.maketrans('', '', string.punctuation.replace('_', '')))+'.wav'
|
||||
file_name = file_name.translate(
|
||||
str.maketrans('', '', string.punctuation.replace('_', ''))) + '.wav'
|
||||
out_path = os.path.join(args.out_path, file_name)
|
||||
print(" > Saving output to {}".format(out_path))
|
||||
ap.save_wav(wav, out_path)
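A hedged example invocation of this script; the script and checkpoint names below are assumptions for illustration, only the output-name behaviour is taken from the code above.
# Example shell invocation (script name and checkpoint path are assumptions):
#   python3 synthesize.py "Hello world." config.json best_model.pth.tar ./output/
# This writes ./output/Hello_world.wav with Griffin-Lim, since no --vocoder_path is given.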
|
||||
|
|
|
@ -54,7 +54,8 @@ class DecoderTests(unittest.TestCase):
|
|||
trans_agent=True,
|
||||
forward_attn_mask=True,
|
||||
location_attn=True,
|
||||
separate_stopnet=True)
|
||||
separate_stopnet=True,
|
||||
speaker_embedding_dim=0)
|
||||
dummy_input = T.rand(4, 8, 256)
|
||||
dummy_memory = T.rand(4, 2, 80)
|
||||
|
||||
|
@ -66,6 +67,35 @@ class DecoderTests(unittest.TestCase):
|
|||
assert output.shape[2] == 80 * 2, "size not {}".format(output.shape[2])
|
||||
assert stop_tokens.shape[0] == 4
|
||||
|
||||
@staticmethod
|
||||
def test_in_out_multispeaker():
|
||||
layer = Decoder(
|
||||
in_features=256,
|
||||
memory_dim=80,
|
||||
r=2,
|
||||
memory_size=4,
|
||||
attn_windowing=False,
|
||||
attn_norm="sigmoid",
|
||||
prenet_type='original',
|
||||
prenet_dropout=True,
|
||||
forward_attn=True,
|
||||
trans_agent=True,
|
||||
forward_attn_mask=True,
|
||||
location_attn=True,
|
||||
separate_stopnet=True,
|
||||
speaker_embedding_dim=80)
|
||||
dummy_input = T.rand(4, 8, 256)
|
||||
dummy_memory = T.rand(4, 2, 80)
|
||||
dummy_embed = T.rand(4, 80)
|
||||
|
||||
output, alignment, stop_tokens = layer(
|
||||
dummy_input, dummy_memory, mask=None, speaker_embeddings=dummy_embed)
|
||||
|
||||
assert output.shape[0] == 4
|
||||
assert output.shape[1] == 1, "size not {}".format(output.shape[1])
|
||||
assert output.shape[2] == 80 * 2, "size not {}".format(output.shape[2])
|
||||
assert stop_tokens.shape[0] == 4
|
||||
|
||||
|
||||
class EncoderTests(unittest.TestCase):
|
||||
def test_in_out(self):
|
||||
|
|
|
@ -25,8 +25,9 @@ def count_parameters(model):
|
|||
|
||||
|
||||
class TacotronTrainTest(unittest.TestCase):
|
||||
def test_train_step(self):
|
||||
input = torch.randint(0, 24, (8, 128)).long().to(device)
|
||||
@staticmethod
|
||||
def test_train_step():
|
||||
input_dummy = torch.randint(0, 24, (8, 128)).long().to(device)
|
||||
input_lengths = torch.randint(100, 129, (8, )).long().to(device)
|
||||
input_lengths[-1] = 128
|
||||
mel_spec = torch.rand(8, 30, c.audio['num_mels']).to(device)
|
||||
|
@ -38,7 +39,7 @@ class TacotronTrainTest(unittest.TestCase):
|
|||
for idx in mel_lengths:
|
||||
stop_targets[:, int(idx.item()):, 0] = 1.0
|
||||
|
||||
stop_targets = stop_targets.view(input.shape[0],
|
||||
stop_targets = stop_targets.view(input_dummy.shape[0],
|
||||
stop_targets.size(1) // c.r, -1)
|
||||
stop_targets = (stop_targets.sum(2) >
|
||||
0.0).unsqueeze(2).float().squeeze()
|
||||
|
@ -51,9 +52,11 @@ class TacotronTrainTest(unittest.TestCase):
|
|||
linear_dim=c.audio['num_freq'],
|
||||
mel_dim=c.audio['num_mels'],
|
||||
r=c.r,
|
||||
memory_size=c.memory_size).to(device) #FIXME: missing num_speakers parameter to Tacotron ctor
|
||||
memory_size=c.memory_size
|
||||
).to(device) #FIXME: missing num_speakers parameter to Tacotron ctor
|
||||
model.train()
|
||||
print(" > Num parameters for Tacotron model:%s"%(count_parameters(model)))
|
||||
print(" > Num parameters for Tacotron model:%s" %
|
||||
(count_parameters(model)))
|
||||
model_ref = copy.deepcopy(model)
|
||||
count = 0
|
||||
for param, param_ref in zip(model.parameters(),
|
||||
|
@ -63,7 +66,7 @@ class TacotronTrainTest(unittest.TestCase):
|
|||
optimizer = optim.Adam(model.parameters(), lr=c.lr)
|
||||
for _ in range(5):
|
||||
mel_out, linear_out, align, stop_tokens = model.forward(
|
||||
input, input_lengths, mel_spec, speaker_ids)
|
||||
input_dummy, input_lengths, mel_spec, speaker_ids)
|
||||
optimizer.zero_grad()
|
||||
loss = criterion(mel_out, mel_spec, mel_lengths)
|
||||
stop_loss = criterion_st(stop_tokens, stop_targets)
|
||||
|
@ -81,3 +84,66 @@ class TacotronTrainTest(unittest.TestCase):
|
|||
), "param {} with shape {} not updated!! \n{}\n{}".format(
|
||||
count, param.shape, param, param_ref)
|
||||
count += 1
|
||||
|
||||
|
||||
class TacotronGSTTrainTest(unittest.TestCase):
|
||||
@staticmethod
|
||||
def test_train_step():
|
||||
input_dummy = torch.randint(0, 24, (8, 128)).long().to(device)
|
||||
input_lengths = torch.randint(100, 129, (8, )).long().to(device)
|
||||
input_lengths[-1] = 128
|
||||
mel_spec = torch.rand(8, 120, c.audio['num_mels']).to(device)
|
||||
linear_spec = torch.rand(8, 120, c.audio['num_freq']).to(device)
|
||||
mel_lengths = torch.randint(20, 120, (8, )).long().to(device)
|
||||
stop_targets = torch.zeros(8, 120, 1).float().to(device)
|
||||
speaker_ids = torch.randint(0, 5, (8, )).long().to(device)
|
||||
|
||||
for idx in mel_lengths:
|
||||
stop_targets[:, int(idx.item()):, 0] = 1.0
|
||||
|
||||
stop_targets = stop_targets.view(input_dummy.shape[0],
|
||||
stop_targets.size(1) // c.r, -1)
|
||||
stop_targets = (stop_targets.sum(2) >
|
||||
0.0).unsqueeze(2).float().squeeze()
|
||||
|
||||
criterion = L1LossMasked().to(device)
|
||||
criterion_st = nn.BCEWithLogitsLoss().to(device)
|
||||
model = Tacotron(
|
||||
num_chars=32,
|
||||
num_speakers=5,
|
||||
gst=True,
|
||||
linear_dim=c.audio['num_freq'],
|
||||
mel_dim=c.audio['num_mels'],
|
||||
r=c.r,
|
||||
memory_size=c.memory_size
|
||||
).to(device) #FIXME: missing num_speakers parameter to Tacotron ctor
|
||||
model.train()
|
||||
print(model)
|
||||
print(" > Num parameters for Tacotron GST model:%s" %
|
||||
(count_parameters(model)))
|
||||
model_ref = copy.deepcopy(model)
|
||||
count = 0
|
||||
for param, param_ref in zip(model.parameters(),
|
||||
model_ref.parameters()):
|
||||
assert (param - param_ref).sum() == 0, param
|
||||
count += 1
|
||||
optimizer = optim.Adam(model.parameters(), lr=c.lr)
|
||||
for _ in range(10):
|
||||
mel_out, linear_out, align, stop_tokens = model.forward(
|
||||
input_dummy, input_lengths, mel_spec, speaker_ids)
|
||||
optimizer.zero_grad()
|
||||
loss = criterion(mel_out, mel_spec, mel_lengths)
|
||||
stop_loss = criterion_st(stop_tokens, stop_targets)
|
||||
loss = loss + criterion(linear_out, linear_spec,
|
||||
mel_lengths) + stop_loss
|
||||
loss.backward()
|
||||
optimizer.step()
|
||||
# check parameter changes
|
||||
count = 0
|
||||
for param, param_ref in zip(model.parameters(),
|
||||
model_ref.parameters()):
|
||||
# ignore pre-highway layer since it works conditionally
|
||||
assert (param != param_ref).any(
|
||||
), "param {} with shape {} not updated!! \n{}\n{}".format(
|
||||
count, param.shape, param, param_ref)
|
||||
count += 1
|
||||
|
|
331
train.py
331
train.py
|
@ -15,21 +15,21 @@ from distribute import (DistributedSampler, apply_gradient_allreduce,
|
|||
init_distributed, reduce_tensor)
|
||||
from TTS.layers.losses import L1LossMasked, MSELossMasked
|
||||
from TTS.utils.audio import AudioProcessor
|
||||
from TTS.utils.generic_utils import (NoamLR, check_update, count_parameters,
|
||||
create_experiment_folder, get_git_branch,
|
||||
load_config, remove_experiment_folder,
|
||||
save_best_model, save_checkpoint, weight_decay,
|
||||
set_init_dict, copy_config_file, setup_model,
|
||||
split_dataset, gradual_training_scheduler)
|
||||
from TTS.utils.generic_utils import (
|
||||
NoamLR, check_update, count_parameters, create_experiment_folder,
|
||||
get_git_branch, load_config, remove_experiment_folder, save_best_model,
|
||||
save_checkpoint, adam_weight_decay, set_init_dict, copy_config_file,
|
||||
setup_model, gradual_training_scheduler, KeepAverage,
|
||||
set_weight_decay)
|
||||
from TTS.utils.logger import Logger
|
||||
from TTS.utils.speakers import load_speaker_mapping, save_speaker_mapping, \
|
||||
get_speakers
|
||||
from TTS.utils.synthesis import synthesis
|
||||
from TTS.utils.text.symbols import phonemes, symbols
|
||||
from TTS.utils.visual import plot_alignment, plot_spectrogram
|
||||
from TTS.datasets.preprocess import get_preprocessor_by_name
|
||||
from TTS.datasets.preprocess import load_meta_data
|
||||
from TTS.utils.radam import RAdam
|
||||
|
||||
from TTS.utils.measures import alignment_diagonal_score
|
||||
|
||||
torch.backends.cudnn.enabled = True
|
||||
torch.backends.cudnn.benchmark = False
|
||||
|
@ -41,18 +41,6 @@ print(" > Number of GPUs: ", num_gpus)
|
|||
|
||||
|
||||
def setup_loader(ap, is_val=False, verbose=False):
|
||||
global meta_data_train
|
||||
global meta_data_eval
|
||||
if "meta_data_train" not in globals():
|
||||
if c.meta_file_train is not None:
|
||||
meta_data_train = get_preprocessor_by_name(c.dataset)(c.data_path, c.meta_file_train)
|
||||
else:
|
||||
meta_data_train = get_preprocessor_by_name(c.dataset)(c.data_path)
|
||||
if "meta_data_eval" not in globals() and c.run_eval:
|
||||
if c.meta_file_val is not None:
|
||||
meta_data_eval = get_preprocessor_by_name(c.dataset)(c.data_path, c.meta_file_val)
|
||||
else:
|
||||
meta_data_eval, meta_data_train = split_dataset(meta_data_train)
|
||||
if is_val and not c.run_eval:
|
||||
loader = None
|
||||
else:
|
||||
|
@ -61,7 +49,8 @@ def setup_loader(ap, is_val=False, verbose=False):
|
|||
c.text_cleaner,
|
||||
meta_data=meta_data_eval if is_val else meta_data_train,
|
||||
ap=ap,
|
||||
batch_group_size=0 if is_val else c.batch_group_size * c.batch_size,
|
||||
batch_group_size=0 if is_val else c.batch_group_size *
|
||||
c.batch_size,
|
||||
min_seq_len=c.min_seq_len,
|
||||
max_seq_len=c.max_seq_len,
|
||||
phoneme_cache_path=c.phoneme_cache_path,
|
||||
|
@ -90,14 +79,21 @@ def train(model, criterion, criterion_st, optimizer, optimizer_st, scheduler,
|
|||
speaker_mapping = load_speaker_mapping(OUT_PATH)
|
||||
model.train()
|
||||
epoch_time = 0
|
||||
avg_postnet_loss = 0
|
||||
avg_decoder_loss = 0
|
||||
avg_stop_loss = 0
|
||||
avg_step_time = 0
|
||||
avg_loader_time = 0
|
||||
train_values = {
|
||||
'avg_postnet_loss': 0,
|
||||
'avg_decoder_loss': 0,
|
||||
'avg_stop_loss': 0,
|
||||
'avg_align_score': 0,
|
||||
'avg_step_time': 0,
|
||||
'avg_loader_time': 0,
|
||||
'avg_alignment_score': 0
|
||||
}
|
||||
keep_avg = KeepAverage()
|
||||
keep_avg.add_values(train_values)
|
||||
print("\n > Epoch {}/{}".format(epoch, c.epochs), flush=True)
|
||||
if use_cuda:
|
||||
batch_n_iter = int(len(data_loader.dataset) / (c.batch_size * num_gpus))
|
||||
batch_n_iter = int(
|
||||
len(data_loader.dataset) / (c.batch_size * num_gpus))
|
||||
else:
|
||||
batch_n_iter = int(len(data_loader.dataset) / c.batch_size)
|
||||
end_time = time.time()
|
||||
|
@ -108,7 +104,8 @@ def train(model, criterion, criterion_st, optimizer, optimizer_st, scheduler,
|
|||
text_input = data[0]
|
||||
text_lengths = data[1]
|
||||
speaker_names = data[2]
|
||||
linear_input = data[3] if c.model in ["Tacotron", "TacotronGST"] else None
|
||||
linear_input = data[3] if c.model in ["Tacotron", "TacotronGST"
|
||||
] else None
|
||||
mel_input = data[4]
|
||||
mel_lengths = data[5]
|
||||
stop_targets = data[6]
|
||||
|
@ -117,8 +114,9 @@ def train(model, criterion, criterion_st, optimizer, optimizer_st, scheduler,
|
|||
loader_time = time.time() - end_time
|
||||
|
||||
if c.use_speaker_embedding:
|
||||
speaker_ids = [speaker_mapping[speaker_name]
|
||||
for speaker_name in speaker_names]
|
||||
speaker_ids = [
|
||||
speaker_mapping[speaker_name] for speaker_name in speaker_names
|
||||
]
|
||||
speaker_ids = torch.LongTensor(speaker_ids)
|
||||
else:
|
||||
speaker_ids = None
|
||||
|
@ -126,7 +124,8 @@ def train(model, criterion, criterion_st, optimizer, optimizer_st, scheduler,
|
|||
# reshape stop targets; we predict a single stop token per r predicted frames
|
||||
stop_targets = stop_targets.view(text_input.shape[0],
|
||||
stop_targets.size(1) // c.r, -1)
|
||||
stop_targets = (stop_targets.sum(2) > 0.0).unsqueeze(2).float().squeeze(2)
|
||||
stop_targets = (stop_targets.sum(2) >
|
||||
0.0).unsqueeze(2).float().squeeze(2)
|
||||
|
||||
global_step += 1
|
||||
|
||||
|
@ -143,7 +142,9 @@ def train(model, criterion, criterion_st, optimizer, optimizer_st, scheduler,
|
|||
text_lengths = text_lengths.cuda(non_blocking=True)
|
||||
mel_input = mel_input.cuda(non_blocking=True)
|
||||
mel_lengths = mel_lengths.cuda(non_blocking=True)
|
||||
linear_input = linear_input.cuda(non_blocking=True) if c.model in ["Tacotron", "TacotronGST"] else None
|
||||
linear_input = linear_input.cuda(
|
||||
non_blocking=True) if c.model in ["Tacotron", "TacotronGST"
|
||||
] else None
|
||||
stop_targets = stop_targets.cuda(non_blocking=True)
|
||||
if speaker_ids is not None:
|
||||
speaker_ids = speaker_ids.cuda(non_blocking=True)
|
||||
|
@ -153,13 +154,16 @@ def train(model, criterion, criterion_st, optimizer, optimizer_st, scheduler,
|
|||
text_input, text_lengths, mel_input, speaker_ids=speaker_ids)
|
||||
|
||||
# loss computation
|
||||
stop_loss = criterion_st(stop_tokens, stop_targets) if c.stopnet else torch.zeros(1)
|
||||
stop_loss = criterion_st(stop_tokens,
|
||||
stop_targets) if c.stopnet else torch.zeros(1)
|
||||
if c.loss_masking:
|
||||
decoder_loss = criterion(decoder_output, mel_input, mel_lengths)
|
||||
if c.model in ["Tacotron", "TacotronGST"]:
|
||||
postnet_loss = criterion(postnet_output, linear_input, mel_lengths)
|
||||
postnet_loss = criterion(postnet_output, linear_input,
|
||||
mel_lengths)
|
||||
else:
|
||||
postnet_loss = criterion(postnet_output, mel_input, mel_lengths)
|
||||
postnet_loss = criterion(postnet_output, mel_input,
|
||||
mel_lengths)
|
||||
else:
|
||||
decoder_loss = criterion(decoder_output, mel_input)
|
||||
if c.model in ["Tacotron", "TacotronGST"]:
|
||||
|
@ -171,14 +175,18 @@ def train(model, criterion, criterion_st, optimizer, optimizer_st, scheduler,
|
|||
loss += stop_loss
|
||||
|
||||
loss.backward()
|
||||
optimizer, current_lr = weight_decay(optimizer, c.wd)
|
||||
optimizer, current_lr = adam_weight_decay(optimizer)
|
||||
grad_norm, _ = check_update(model, c.grad_clip)
|
||||
optimizer.step()
|
||||
|
||||
# compute alignment score
|
||||
align_score = alignment_diagonal_score(alignments)
|
||||
keep_avg.update_value('avg_align_score', align_score)
|
||||
|
||||
# backpass and check the grad norm for stop loss
|
||||
if c.separate_stopnet:
|
||||
stop_loss.backward()
|
||||
optimizer_st, _ = weight_decay(optimizer_st, c.wd)
|
||||
optimizer_st, _ = adam_weight_decay(optimizer_st)
|
||||
grad_norm_st, _ = check_update(model.decoder.stopnet, 1.0)
|
||||
optimizer_st.step()
|
||||
else:
|
||||
|
@ -189,14 +197,14 @@ def train(model, criterion, criterion_st, optimizer, optimizer_st, scheduler,
|
|||
|
||||
if global_step % c.print_step == 0:
|
||||
print(
|
||||
" | > Step:{}/{} GlobalStep:{} TotalLoss:{:.5f} PostnetLoss:{:.5f} "
|
||||
"DecoderLoss:{:.5f} StopLoss:{:.5f} GradNorm:{:.5f} "
|
||||
" | > Step:{}/{} GlobalStep:{} PostnetLoss:{:.5f} "
|
||||
"DecoderLoss:{:.5f} StopLoss:{:.5f} AlignScore:{:.4f} GradNorm:{:.5f} "
|
||||
"GradNormST:{:.5f} AvgTextLen:{:.1f} AvgSpecLen:{:.1f} StepTime:{:.2f} "
|
||||
"LoaderTime:{:.2f} LR:{:.6f}".format(
|
||||
num_iter, batch_n_iter, global_step, loss.item(),
|
||||
postnet_loss.item(), decoder_loss.item(), stop_loss.item(),
|
||||
grad_norm, grad_norm_st, avg_text_length, avg_spec_length, step_time,
|
||||
loader_time, current_lr),
|
||||
num_iter, batch_n_iter, global_step, postnet_loss.item(),
|
||||
decoder_loss.item(), stop_loss.item(), align_score,
|
||||
grad_norm, grad_norm_st, avg_text_length, avg_spec_length,
|
||||
step_time, loader_time, current_lr),
|
||||
flush=True)
|
||||
|
||||
# aggregate losses from processes
|
||||
|
@ -204,24 +212,36 @@ def train(model, criterion, criterion_st, optimizer, optimizer_st, scheduler,
|
|||
postnet_loss = reduce_tensor(postnet_loss.data, num_gpus)
|
||||
decoder_loss = reduce_tensor(decoder_loss.data, num_gpus)
|
||||
loss = reduce_tensor(loss.data, num_gpus)
|
||||
stop_loss = reduce_tensor(stop_loss.data, num_gpus) if c.stopnet else stop_loss
|
||||
stop_loss = reduce_tensor(stop_loss.data,
|
||||
num_gpus) if c.stopnet else stop_loss
|
||||
|
||||
if args.rank == 0:
|
||||
avg_postnet_loss += float(postnet_loss.item())
|
||||
avg_decoder_loss += float(decoder_loss.item())
|
||||
avg_stop_loss += stop_loss if isinstance(stop_loss, float) else float(stop_loss.item())
|
||||
avg_step_time += step_time
|
||||
avg_loader_time += loader_time
|
||||
update_train_values = {
|
||||
'avg_postnet_loss':
|
||||
float(postnet_loss.item()),
|
||||
'avg_decoder_loss':
|
||||
float(decoder_loss.item()),
|
||||
'avg_stop_loss':
|
||||
stop_loss
|
||||
if isinstance(stop_loss, float) else float(stop_loss.item()),
|
||||
'avg_step_time':
|
||||
step_time,
|
||||
'avg_loader_time':
|
||||
loader_time
|
||||
}
|
||||
keep_avg.update_values(update_train_values)
|
||||
|
||||
# Plot Training Iter Stats
|
||||
# reduce TB load
|
||||
if global_step % 10 == 0:
|
||||
iter_stats = {"loss_posnet": postnet_loss.item(),
|
||||
iter_stats = {
|
||||
"loss_posnet": postnet_loss.item(),
|
||||
"loss_decoder": decoder_loss.item(),
|
||||
"lr": current_lr,
|
||||
"grad_norm": grad_norm,
|
||||
"grad_norm_st": grad_norm_st,
|
||||
"step_time": step_time}
|
||||
"step_time": step_time
|
||||
}
|
||||
tb_logger.tb_train_iter_stats(global_step, iter_stats)
|
||||
|
||||
if global_step % c.save_step == 0:
|
||||
|
@ -233,7 +253,9 @@ def train(model, criterion, criterion_st, optimizer, optimizer_st, scheduler,
|
|||
|
||||
# Diagnostic visualizations
|
||||
const_spec = postnet_output[0].data.cpu().numpy()
|
||||
gt_spec = linear_input[0].data.cpu().numpy() if c.model in ["Tacotron", "TacotronGST"] else mel_input[0].data.cpu().numpy()
|
||||
gt_spec = linear_input[0].data.cpu().numpy() if c.model in [
|
||||
"Tacotron", "TacotronGST"
|
||||
] else mel_input[0].data.cpu().numpy()
|
||||
align_img = alignments[0].data.cpu().numpy()
|
||||
|
||||
figures = {
|
||||
|
@ -253,35 +275,31 @@ def train(model, criterion, criterion_st, optimizer, optimizer_st, scheduler,
|
|||
c.audio["sample_rate"])
|
||||
end_time = time.time()
|
||||
|
||||
avg_postnet_loss /= (num_iter + 1)
|
||||
avg_decoder_loss /= (num_iter + 1)
|
||||
avg_stop_loss /= (num_iter + 1)
|
||||
avg_total_loss = avg_decoder_loss + avg_postnet_loss + avg_stop_loss
|
||||
avg_step_time /= (num_iter + 1)
|
||||
avg_loader_time /= (num_iter + 1)
|
||||
|
||||
# print epoch stats
|
||||
print(
|
||||
" | > EPOCH END -- GlobalStep:{} AvgTotalLoss:{:.5f} "
|
||||
print(" | > EPOCH END -- GlobalStep:{} AvgTotalLoss:{:.5f} "
|
||||
"AvgPostnetLoss:{:.5f} AvgDecoderLoss:{:.5f} "
|
||||
"AvgStopLoss:{:.5f} EpochTime:{:.2f} "
|
||||
"AvgStepTime:{:.2f} AvgLoaderTime:{:.2f}".format(global_step, avg_total_loss,
|
||||
avg_postnet_loss, avg_decoder_loss,
|
||||
avg_stop_loss, epoch_time, avg_step_time,
|
||||
avg_loader_time),
|
||||
"AvgStepTime:{:.2f} AvgLoaderTime:{:.2f}".format(
|
||||
global_step, keep_avg['avg_postnet_loss'],
|
||||
keep_avg['avg_decoder_loss'], keep_avg['avg_stop_loss'],
|
||||
keep_avg['avg_align_score'], epoch_time,
|
||||
keep_avg['avg_step_time'], keep_avg['avg_loader_time']),
|
||||
flush=True)
|
||||
|
||||
# Plot Epoch Stats
|
||||
if args.rank == 0:
|
||||
# Plot Training Epoch Stats
|
||||
epoch_stats = {"loss_postnet": avg_postnet_loss,
|
||||
"loss_decoder": avg_decoder_loss,
|
||||
"stop_loss": avg_stop_loss,
|
||||
"epoch_time": epoch_time}
|
||||
epoch_stats = {
|
||||
"loss_postnet": keep_avg['avg_postnet_loss'],
|
||||
"loss_decoder": keep_avg['avg_decoder_loss'],
|
||||
"stop_loss": keep_avg['avg_stop_loss'],
|
||||
"alignment_score": keep_avg['avg_align_score'],
|
||||
"epoch_time": epoch_time
|
||||
}
|
||||
tb_logger.tb_train_epoch_stats(global_step, epoch_stats)
|
||||
if c.tb_model_param_stats:
|
||||
tb_logger.tb_model_weights(model, global_step)
|
||||
return avg_postnet_loss, global_step
|
||||
return keep_avg['avg_postnet_loss'], global_step
|
||||
|
||||
|
||||
def evaluate(model, criterion, criterion_st, ap, global_step, epoch):
|
||||
|
@ -290,9 +308,14 @@ def evaluate(model, criterion, criterion_st, ap, global_step, epoch):
|
|||
speaker_mapping = load_speaker_mapping(OUT_PATH)
|
||||
model.eval()
|
||||
epoch_time = 0
|
||||
avg_postnet_loss = 0
|
||||
avg_decoder_loss = 0
|
||||
avg_stop_loss = 0
|
||||
eval_values_dict = {
|
||||
'avg_postnet_loss': 0,
|
||||
'avg_decoder_loss': 0,
|
||||
'avg_stop_loss': 0,
|
||||
'avg_align_score': 0
|
||||
}
|
||||
keep_avg = KeepAverage()
|
||||
keep_avg.add_values(eval_values_dict)
|
||||
print("\n > Validation")
|
||||
if c.test_sentences_file is None:
|
||||
test_sentences = [
|
||||
|
@ -313,14 +336,18 @@ def evaluate(model, criterion, criterion_st, ap, global_step, epoch):
|
|||
text_input = data[0]
|
||||
text_lengths = data[1]
|
||||
speaker_names = data[2]
|
||||
linear_input = data[3] if c.model in ["Tacotron", "TacotronGST"] else None
|
||||
linear_input = data[3] if c.model in [
|
||||
"Tacotron", "TacotronGST"
|
||||
] else None
|
||||
mel_input = data[4]
|
||||
mel_lengths = data[5]
|
||||
stop_targets = data[6]
|
||||
|
||||
if c.use_speaker_embedding:
|
||||
speaker_ids = [speaker_mapping[speaker_name]
|
||||
for speaker_name in speaker_names]
|
||||
speaker_ids = [
|
||||
speaker_mapping[speaker_name]
|
||||
for speaker_name in speaker_names
|
||||
]
|
||||
speaker_ids = torch.LongTensor(speaker_ids)
|
||||
else:
|
||||
speaker_ids = None
|
||||
|
@ -329,14 +356,17 @@ def evaluate(model, criterion, criterion_st, ap, global_step, epoch):
|
|||
stop_targets = stop_targets.view(text_input.shape[0],
|
||||
stop_targets.size(1) // c.r,
|
||||
-1)
|
||||
stop_targets = (stop_targets.sum(2) > 0.0).unsqueeze(2).float().squeeze(2)
|
||||
stop_targets = (stop_targets.sum(2) >
|
||||
0.0).unsqueeze(2).float().squeeze(2)
|
||||
|
||||
# dispatch data to GPU
|
||||
if use_cuda:
|
||||
text_input = text_input.cuda()
|
||||
mel_input = mel_input.cuda()
|
||||
mel_lengths = mel_lengths.cuda()
|
||||
linear_input = linear_input.cuda() if c.model in ["Tacotron", "TacotronGST"] else None
|
||||
linear_input = linear_input.cuda() if c.model in [
|
||||
"Tacotron", "TacotronGST"
|
||||
] else None
|
||||
stop_targets = stop_targets.cuda()
|
||||
if speaker_ids is not None:
|
||||
speaker_ids = speaker_ids.cuda()
|
||||
|
@ -347,13 +377,17 @@ def evaluate(model, criterion, criterion_st, ap, global_step, epoch):
|
|||
speaker_ids=speaker_ids)
|
||||
|
||||
# loss computation
|
||||
stop_loss = criterion_st(stop_tokens, stop_targets) if c.stopnet else torch.zeros(1)
|
||||
stop_loss = criterion_st(
|
||||
stop_tokens, stop_targets) if c.stopnet else torch.zeros(1)
|
||||
if c.loss_masking:
|
||||
decoder_loss = criterion(decoder_output, mel_input, mel_lengths)
|
||||
decoder_loss = criterion(decoder_output, mel_input,
|
||||
mel_lengths)
|
||||
if c.model in ["Tacotron", "TacotronGST"]:
|
||||
postnet_loss = criterion(postnet_output, linear_input, mel_lengths)
|
||||
postnet_loss = criterion(postnet_output, linear_input,
|
||||
mel_lengths)
|
||||
else:
|
||||
postnet_loss = criterion(postnet_output, mel_input, mel_lengths)
|
||||
postnet_loss = criterion(postnet_output, mel_input,
|
||||
mel_lengths)
|
||||
else:
|
||||
decoder_loss = criterion(decoder_output, mel_input)
|
||||
if c.model in ["Tacotron", "TacotronGST"]:
|
||||
|
@ -365,14 +399,9 @@ def evaluate(model, criterion, criterion_st, ap, global_step, epoch):
|
|||
step_time = time.time() - start_time
|
||||
epoch_time += step_time
|
||||
|
||||
if num_iter % c.print_step == 0:
|
||||
print(
|
||||
" | > TotalLoss: {:.5f} PostnetLoss: {:.5f} DecoderLoss:{:.5f} "
|
||||
"StopLoss: {:.5f} ".format(loss.item(),
|
||||
postnet_loss.item(),
|
||||
decoder_loss.item(),
|
||||
stop_loss.item()),
|
||||
flush=True)
|
||||
# compute alignment score
|
||||
align_score = alignment_diagonal_score(alignments)
|
||||
keep_avg.update_value('avg_align_score', align_score)
|
||||
|
||||
# aggregate losses from processes
|
||||
if num_gpus > 1:
|
||||
|
@ -381,15 +410,34 @@ def evaluate(model, criterion, criterion_st, ap, global_step, epoch):
|
|||
if c.stopnet:
|
||||
stop_loss = reduce_tensor(stop_loss.data, num_gpus)
|
||||
|
||||
avg_postnet_loss += float(postnet_loss.item())
|
||||
avg_decoder_loss += float(decoder_loss.item())
|
||||
avg_stop_loss += stop_loss.item()
|
||||
keep_avg.update_values({
|
||||
'avg_postnet_loss':
|
||||
float(postnet_loss.item()),
|
||||
'avg_decoder_loss':
|
||||
float(decoder_loss.item()),
|
||||
'avg_stop_loss':
|
||||
float(stop_loss.item())
|
||||
})
|
||||
|
||||
if num_iter % c.print_step == 0:
|
||||
print(
|
||||
" | > TotalLoss: {:.5f} PostnetLoss: {:.5f} - {:.5f} DecoderLoss:{:.5f} - {:.5f} "
|
||||
"StopLoss: {:.5f} - {:.5f} AlignScore: {:.4f} : {:.4f}"
|
||||
.format(loss.item(), postnet_loss.item(),
|
||||
keep_avg['avg_postnet_loss'],
|
||||
decoder_loss.item(),
|
||||
keep_avg['avg_decoder_loss'], stop_loss.item(),
|
||||
keep_avg['avg_stop_loss'], align_score,
|
||||
keep_avg['avg_align_score']),
|
||||
flush=True)
|
||||
|
||||
if args.rank == 0:
|
||||
# Diagnostic visualizations
|
||||
idx = np.random.randint(mel_input.shape[0])
|
||||
const_spec = postnet_output[idx].data.cpu().numpy()
|
||||
gt_spec = linear_input[idx].data.cpu().numpy() if c.model in ["Tacotron", "TacotronGST"] else mel_input[idx].data.cpu().numpy()
|
||||
gt_spec = linear_input[idx].data.cpu().numpy() if c.model in [
|
||||
"Tacotron", "TacotronGST"
|
||||
] else mel_input[idx].data.cpu().numpy()
|
||||
align_img = alignments[idx].data.cpu().numpy()
|
||||
|
||||
eval_figures = {
|
||||
|
@ -404,17 +452,15 @@ def evaluate(model, criterion, criterion_st, ap, global_step, epoch):
|
|||
eval_audio = ap.inv_spectrogram(const_spec.T)
|
||||
else:
|
||||
eval_audio = ap.inv_mel_spectrogram(const_spec.T)
|
||||
tb_logger.tb_eval_audios(global_step, {"ValAudio": eval_audio}, c.audio["sample_rate"])
|
||||
|
||||
# compute average losses
|
||||
avg_postnet_loss /= (num_iter + 1)
|
||||
avg_decoder_loss /= (num_iter + 1)
|
||||
avg_stop_loss /= (num_iter + 1)
|
||||
tb_logger.tb_eval_audios(global_step, {"ValAudio": eval_audio},
|
||||
c.audio["sample_rate"])
|
||||
|
||||
# Plot Validation Stats
|
||||
epoch_stats = {"loss_postnet": avg_postnet_loss,
|
||||
"loss_decoder": avg_decoder_loss,
|
||||
"stop_loss": avg_stop_loss}
|
||||
epoch_stats = {
|
||||
"loss_postnet": keep_avg['avg_postnet_loss'],
|
||||
"loss_decoder": keep_avg['avg_decoder_loss'],
|
||||
"stop_loss": keep_avg['avg_stop_loss']
|
||||
}
|
||||
tb_logger.tb_eval_stats(global_step, epoch_stats)
|
||||
|
||||
if args.rank == 0 and epoch > c.test_delay_epochs:
|
||||
|
@ -427,7 +473,11 @@ def evaluate(model, criterion, criterion_st, ap, global_step, epoch):
|
|||
for idx, test_sentence in enumerate(test_sentences):
|
||||
try:
|
||||
wav, alignment, decoder_output, postnet_output, stop_tokens = synthesis(
|
||||
model, test_sentence, c, use_cuda, ap,
|
||||
model,
|
||||
test_sentence,
|
||||
c,
|
||||
use_cuda,
|
||||
ap,
|
||||
speaker_id=speaker_id,
|
||||
style_wav=style_wav)
|
||||
file_path = os.path.join(AUDIO_PATH, str(global_step))
|
||||
|
@ -436,18 +486,22 @@ def evaluate(model, criterion, criterion_st, ap, global_step, epoch):
|
|||
"TestSentence_{}.wav".format(idx))
|
||||
ap.save_wav(wav, file_path)
|
||||
test_audios['{}-audio'.format(idx)] = wav
|
||||
test_figures['{}-prediction'.format(idx)] = plot_spectrogram(postnet_output, ap)
|
||||
test_figures['{}-alignment'.format(idx)] = plot_alignment(alignment)
|
||||
test_figures['{}-prediction'.format(idx)] = plot_spectrogram(
|
||||
postnet_output, ap)
|
||||
test_figures['{}-alignment'.format(idx)] = plot_alignment(
|
||||
alignment)
|
||||
except:
|
||||
print(" !! Error creating Test Sentence -", idx)
|
||||
traceback.print_exc()
|
||||
tb_logger.tb_test_audios(global_step, test_audios, c.audio['sample_rate'])
|
||||
tb_logger.tb_test_audios(global_step, test_audios,
|
||||
c.audio['sample_rate'])
|
||||
tb_logger.tb_test_figures(global_step, test_figures)
|
||||
return avg_postnet_loss
|
||||
return keep_avg['avg_postnet_loss']
|
||||
|
||||
|
||||
# FIXME: move args definition/parsing inside of main?
|
||||
def main(args): # pylint: disable=redefined-outer-name
|
||||
global meta_data_train, meta_data_eval
|
||||
# Audio processor
|
||||
ap = AudioProcessor(**c.audio)
|
||||
|
||||
|
@ -457,8 +511,12 @@ def main(args): #pylint: disable=redefined-outer-name
|
|||
c.distributed["backend"], c.distributed["url"])
|
||||
num_chars = len(phonemes) if c.use_phonemes else len(symbols)
|
||||
|
||||
# load data instances
|
||||
meta_data_train, meta_data_eval = load_meta_data(c.datasets)
|
||||
|
||||
# parse speakers
|
||||
if c.use_speaker_embedding:
|
||||
speakers = get_speakers(c.data_path, c.meta_file_train, c.dataset)
|
||||
speakers = get_speakers(meta_data_train)
|
||||
if args.restore_path:
|
||||
prev_out_path = os.path.dirname(args.restore_path)
|
||||
speaker_mapping = load_speaker_mapping(prev_out_path)
|
||||
|
@ -467,8 +525,7 @@ def main(args): #pylint: disable=redefined-outer-name
|
|||
"introduce new speakers to " \
|
||||
"a previously trained model."
|
||||
else:
|
||||
speaker_mapping = {name: i
|
||||
for i, name in enumerate(speakers)}
|
||||
speaker_mapping = {name: i for i, name in enumerate(speakers)}
|
||||
save_speaker_mapping(OUT_PATH, speaker_mapping)
|
||||
num_speakers = len(speaker_mapping)
|
||||
print("Training with {} speakers: {}".format(num_speakers,
|
||||
|
@ -480,18 +537,23 @@ def main(args): #pylint: disable=redefined-outer-name
|
|||
|
||||
print(" | > Num output units : {}".format(ap.num_freq), flush=True)
|
||||
|
||||
optimizer = RAdam(model.parameters(), lr=c.lr, weight_decay=0)
|
||||
params = set_weight_decay(model, c.wd)
|
||||
optimizer = RAdam(params, lr=c.lr, weight_decay=0)
|
||||
if c.stopnet and c.separate_stopnet:
|
||||
optimizer_st = RAdam(
|
||||
model.decoder.stopnet.parameters(), lr=c.lr, weight_decay=0)
|
||||
optimizer_st = RAdam(model.decoder.stopnet.parameters(),
|
||||
lr=c.lr,
|
||||
weight_decay=0)
|
||||
else:
|
||||
optimizer_st = None
|
||||
|
||||
if c.loss_masking:
|
||||
criterion = L1LossMasked() if c.model in ["Tacotron", "TacotronGST"] else MSELossMasked()
|
||||
criterion = L1LossMasked() if c.model in ["Tacotron", "TacotronGST"
|
||||
] else MSELossMasked()
|
||||
else:
|
||||
criterion = nn.L1Loss() if c.model in ["Tacotron", "TacotronGST"] else nn.MSELoss()
|
||||
criterion_st = nn.BCEWithLogitsLoss() if c.stopnet else None
|
||||
criterion = nn.L1Loss() if c.model in ["Tacotron", "TacotronGST"
|
||||
] else nn.MSELoss()
|
||||
criterion_st = nn.BCEWithLogitsLoss(
|
||||
pos_weight=torch.tensor(20.0)) if c.stopnet else None
|
||||
|
||||
if args.restore_path:
|
||||
checkpoint = torch.load(args.restore_path)
|
||||
|
@ -510,8 +572,8 @@ def main(args): #pylint: disable=redefined-outer-name
|
|||
del model_dict
|
||||
for group in optimizer.param_groups:
|
||||
group['lr'] = c.lr
|
||||
print(
|
||||
" > Model restored from step %d" % checkpoint['step'], flush=True)
|
||||
print(" > Model restored from step %d" % checkpoint['step'],
|
||||
flush=True)
|
||||
args.restore_step = checkpoint['step']
|
||||
else:
|
||||
args.restore_step = 0
|
||||
|
@ -527,8 +589,7 @@ def main(args): #pylint: disable=redefined-outer-name
|
|||
model = apply_gradient_allreduce(model)
|
||||
|
||||
if c.lr_decay:
|
||||
scheduler = NoamLR(
|
||||
optimizer,
|
||||
scheduler = NoamLR(optimizer,
|
||||
warmup_steps=c.warmup_steps,
|
||||
last_epoch=args.restore_step - 1)
|
||||
else:
|
||||
|
@ -550,11 +611,11 @@ def main(args): #pylint: disable=redefined-outer-name
|
|||
print(" > Number of outputs per iteration:", model.decoder.r)
|
||||
|
||||
train_loss, global_step = train(model, criterion, criterion_st,
|
||||
optimizer, optimizer_st, scheduler,
|
||||
ap, global_step, epoch)
|
||||
val_loss = evaluate(model, criterion, criterion_st, ap, global_step, epoch)
|
||||
print(
|
||||
" | > Training Loss: {:.5f} Validation Loss: {:.5f}".format(
|
||||
optimizer, optimizer_st, scheduler, ap,
|
||||
global_step, epoch)
|
||||
val_loss = evaluate(model, criterion, criterion_st, ap, global_step,
|
||||
epoch)
|
||||
print(" | > Training Loss: {:.5f} Validation Loss: {:.5f}".format(
|
||||
train_loss, val_loss),
|
||||
flush=True)
|
||||
target_loss = train_loss
|
||||
|
@ -576,8 +637,7 @@ if __name__ == '__main__':
|
|||
type=str,
|
||||
help='Path to config file for training.',
|
||||
)
|
||||
parser.add_argument(
|
||||
'--debug',
|
||||
parser.add_argument('--debug',
|
||||
type=bool,
|
||||
default=True,
|
||||
help='Do not verify commit integrity to run training.')
|
||||
|
@ -586,17 +646,14 @@ if __name__ == '__main__':
|
|||
type=str,
|
||||
default='',
|
||||
help='Defines the data path. It overwrites config.json.')
|
||||
parser.add_argument(
|
||||
'--output_path',
|
||||
parser.add_argument('--output_path',
|
||||
type=str,
|
||||
help='path for training outputs.',
|
||||
default='')
|
||||
parser.add_argument(
|
||||
'--output_folder',
|
||||
parser.add_argument('--output_folder',
|
||||
type=str,
|
||||
default='',
|
||||
help='folder name for training outputs.'
|
||||
)
|
||||
help='folder name for training outputs.')
|
||||
|
||||
# DISTRUBUTED
|
||||
parser.add_argument(
|
||||
|
@ -604,8 +661,7 @@ if __name__ == '__main__':
|
|||
type=int,
|
||||
default=0,
|
||||
help='DISTRIBUTED: process rank for distributed training.')
|
||||
parser.add_argument(
|
||||
'--group_id',
|
||||
parser.add_argument('--group_id',
|
||||
type=str,
|
||||
default="",
|
||||
help='DISTRIBUTED: process group id.')
|
||||
|
@ -635,7 +691,8 @@ if __name__ == '__main__':
|
|||
if args.restore_path:
|
||||
new_fields["restore_path"] = args.restore_path
|
||||
new_fields["github_branch"] = get_git_branch()
|
||||
copy_config_file(args.config_path, os.path.join(OUT_PATH, 'config.json'), new_fields)
|
||||
copy_config_file(args.config_path,
|
||||
os.path.join(OUT_PATH, 'config.json'), new_fields)
|
||||
os.chmod(AUDIO_PATH, 0o775)
|
||||
os.chmod(OUT_PATH, 0o775)
|
||||
|
||||
|
|
|
@ -24,6 +24,7 @@ class AudioProcessor(object):
|
|||
clip_norm=True,
|
||||
griffin_lim_iters=None,
|
||||
do_trim_silence=False,
|
||||
sound_norm=False,
|
||||
**_):
|
||||
|
||||
print(" > Setting up Audio Processor...")
|
||||
|
@ -45,6 +46,7 @@ class AudioProcessor(object):
|
|||
self.max_norm = 1.0 if max_norm is None else float(max_norm)
|
||||
self.clip_norm = clip_norm
|
||||
self.do_trim_silence = do_trim_silence
|
||||
self.sound_norm = sound_norm
|
||||
self.n_fft, self.hop_length, self.win_length = self._stft_parameters()
|
||||
members = vars(self)
|
||||
for key, value in members.items():
|
||||
|
@ -210,11 +212,11 @@ class AudioProcessor(object):
|
|||
return len(wav)
|
||||
|
||||
def trim_silence(self, wav):
|
||||
""" Trim silent parts with a threshold and 0.1 sec margin """
|
||||
margin = int(self.sample_rate * 0.1)
|
||||
""" Trim silent parts with a threshold and 0.01 sec margin """
|
||||
margin = int(self.sample_rate * 0.01)
|
||||
wav = wav[margin:-margin]
|
||||
return librosa.effects.trim(
|
||||
wav, top_db=40, frame_length=1024, hop_length=256)[0]
|
||||
wav, top_db=60, frame_length=self.win_length, hop_length=self.hop_length)[0]
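To illustrate what the updated trim_silence does, here is a small standalone sketch; the sample rate and STFT sizes are assumed typical values, not taken from a specific config.
import librosa
import numpy as np

# Assumed settings for illustration only.
sample_rate, win_length, hop_length = 22050, 1024, 256
wav = np.random.randn(sample_rate)            # stand-in for a loaded waveform

margin = int(sample_rate * 0.01)              # drop ~10 ms from each end first
trimmed = librosa.effects.trim(wav[margin:-margin],
                               top_db=60,      # anything 60 dB below peak counts as silence
                               frame_length=win_length,
                               hop_length=hop_length)[0]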
|
||||
|
||||
@staticmethod
|
||||
def mulaw_encode(wav, qc):
|
||||
|
@ -243,6 +245,8 @@ class AudioProcessor(object):
|
|||
except ValueError:
|
||||
print(f' [!] File cannot be trimmed for silence - {filename}')
|
||||
assert self.sample_rate == sr, "%s vs %s"%(self.sample_rate, sr)
|
||||
if self.sound_norm:
|
||||
x = x / x.max() * 0.9
|
||||
return x
|
||||
|
||||
@staticmethod
|
||||
|
|
|
@ -31,7 +31,8 @@ def load_config(config_path):
|
|||
def get_git_branch():
|
||||
try:
|
||||
out = subprocess.check_output(["git", "branch"]).decode("utf8")
|
||||
current = next(line for line in out.split("\n") if line.startswith("*"))
|
||||
current = next(line for line in out.split("\n")
|
||||
if line.startswith("*"))
|
||||
current = current.replace("* ", "")
|
||||
except subprocess.CalledProcessError:
|
||||
current = "inside_docker"
|
||||
|
@ -47,8 +48,8 @@ def get_commit_hash():
|
|||
# raise RuntimeError(
|
||||
# " !! Commit before training to get the commit hash.")
|
||||
try:
|
||||
commit = subprocess.check_output(['git', 'rev-parse', '--short',
|
||||
'HEAD']).decode().strip()
|
||||
commit = subprocess.check_output(
|
||||
['git', 'rev-parse', '--short', 'HEAD']).decode().strip()
|
||||
# Not copying .git folder into docker container
|
||||
except subprocess.CalledProcessError:
|
||||
commit = "0000000"
|
||||
|
@ -168,16 +169,42 @@ def lr_decay(init_lr, global_step, warmup_steps):
|
|||
return lr
|
||||
|
||||
|
||||
def weight_decay(optimizer, wd):
|
||||
def adam_weight_decay(optimizer):
|
||||
"""
|
||||
Custom weight decay operation, not affecting grad values.
|
||||
"""
|
||||
for group in optimizer.param_groups:
|
||||
for param in group['params']:
|
||||
current_lr = group['lr']
|
||||
param.data = param.data.add(-wd * group['lr'], param.data)
|
||||
weight_decay = group['weight_decay']
|
||||
param.data = param.data.add(-weight_decay * group['lr'],
|
||||
param.data)
|
||||
return optimizer, current_lr
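As used in the training loop above, the decay is applied to the parameters themselves after back-propagation and before the optimizer step, so the gradients stay untouched. A minimal per-step sketch (the loss and optimizer names are placeholders):
# Sketch of the per-step usage in train.py (names are placeholders).
loss.backward()
optimizer, current_lr = adam_weight_decay(optimizer)  # shrink weights in place, grads untouched
grad_norm, _ = check_update(model, c.grad_clip)       # clip gradients as in train.py
optimizer.step()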
|
||||
|
||||
# pylint: disable=dangerous-default-value
|
||||
def set_weight_decay(model, weight_decay, skip_list={"decoder.attention.v", "rnn", "lstm", "gru", "embedding"}):
|
||||
"""
|
||||
Skip weight decay for biases, BatchNorm parameters, RNNs,
|
||||
and the attention projection layer v.
|
||||
"""
|
||||
decay = []
|
||||
no_decay = []
|
||||
for name, param in model.named_parameters():
|
||||
if not param.requires_grad:
|
||||
continue
|
||||
|
||||
if len(param.shape) == 1 or any([skip_name in name for skip_name in skip_list]):
|
||||
no_decay.append(param)
|
||||
else:
|
||||
decay.append(param)
|
||||
return [{
|
||||
'params': no_decay,
|
||||
'weight_decay': 0.
|
||||
}, {
|
||||
'params': decay,
|
||||
'weight_decay': weight_decay
|
||||
}]
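A short sketch of how these parameter groups are consumed, mirroring main() in train.py:
# Mirrors main() in train.py: decay is configured per parameter group,
# so the optimizer itself is created with weight_decay=0.
params = set_weight_decay(model, c.wd)
optimizer = RAdam(params, lr=c.lr, weight_decay=0)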
|
||||
|
||||
|
||||
class NoamLR(torch.optim.lr_scheduler._LRScheduler):
|
||||
def __init__(self, optimizer, warmup_steps=0.1, last_epoch=-1):
|
||||
|
@ -187,8 +214,8 @@ class NoamLR(torch.optim.lr_scheduler._LRScheduler):
|
|||
def get_lr(self):
|
||||
step = max(self.last_epoch, 1)
|
||||
return [
|
||||
base_lr * self.warmup_steps**0.5 * min(
|
||||
step * self.warmup_steps**-1.5, step**-0.5)
|
||||
base_lr * self.warmup_steps**0.5 *
|
||||
min(step * self.warmup_steps**-1.5, step**-0.5)
|
||||
for base_lr in self.base_lrs
|
||||
]
|
||||
|
||||
|
@ -243,8 +270,8 @@ def set_init_dict(model_dict, checkpoint, c):
|
|||
}
|
||||
# 4. overwrite entries in the existing state dict
|
||||
model_dict.update(pretrained_dict)
|
||||
print(" | > {} / {} layers are restored.".format(
|
||||
len(pretrained_dict), len(model_dict)))
|
||||
print(" | > {} / {} layers are restored.".format(len(pretrained_dict),
|
||||
len(model_dict)))
|
||||
return model_dict
|
||||
|
||||
|
||||
|
@ -252,13 +279,13 @@ def setup_model(num_chars, num_speakers, c):
|
|||
print(" > Using model: {}".format(c.model))
|
||||
MyModel = importlib.import_module('TTS.models.' + c.model.lower())
|
||||
MyModel = getattr(MyModel, c.model)
|
||||
if c.model.lower() in ["tacotron", "tacotrongst"]:
|
||||
model = MyModel(
|
||||
num_chars=num_chars,
|
||||
if c.model.lower() in "tacotron":
|
||||
model = MyModel(num_chars=num_chars,
|
||||
num_speakers=num_speakers,
|
||||
r=c.r,
|
||||
linear_dim=1025,
|
||||
mel_dim=80,
|
||||
gst=c.use_gst,
|
||||
memory_size=c.memory_size,
|
||||
attn_win=c.windowing,
|
||||
attn_norm=c.attention_norm,
|
||||
|
@ -270,8 +297,7 @@ def setup_model(num_chars, num_speakers, c):
|
|||
location_attn=c.location_attn,
|
||||
separate_stopnet=c.separate_stopnet)
|
||||
elif c.model.lower() == "tacotron2":
|
||||
model = MyModel(
|
||||
num_chars=num_chars,
|
||||
model = MyModel(num_chars=num_chars,
|
||||
num_speakers=num_speakers,
|
||||
r=c.r,
|
||||
attn_win=c.windowing,
|
||||
|
@ -290,7 +316,8 @@ def split_dataset(items):
|
|||
is_multi_speaker = False
|
||||
speakers = [item[-1] for item in items]
|
||||
is_multi_speaker = len(set(speakers)) > 1
|
||||
eval_split_size = 500 if 500 < len(items) * 0.01 else int(len(items) * 0.01)
|
||||
eval_split_size = 500 if len(items) * 0.01 > 500 else int(
|
||||
len(items) * 0.01)
|
||||
np.random.seed(0)
|
||||
np.random.shuffle(items)
|
||||
if is_multi_speaker:
|
||||
|
@ -314,3 +341,34 @@ def gradual_training_scheduler(global_step, config):
|
|||
if global_step >= values[0]:
|
||||
new_values = values
|
||||
return new_values[1], new_values[2]
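A hedged sketch of how this scheduler is consumed; it assumes config.gradual_training holds [first_step, r, batch_size] triples, and the last entry whose first_step has been reached is the one applied.
# Sketch only; the config attribute and values below are assumptions.
class _Cfg:                       # stand-in for the loaded config object
    gradual_training = [[0, 7, 32], [10000, 5, 32], [50000, 3, 32]]

r, batch_size = gradual_training_scheduler(64000, _Cfg())
# -> (3, 32): the entry starting at step 50000 is the latest one already reached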
|
||||
|
||||
|
||||
class KeepAverage():
|
||||
def __init__(self):
|
||||
self.avg_values = {}
|
||||
self.iters = {}
|
||||
|
||||
def __getitem__(self, key):
|
||||
return self.avg_values[key]
|
||||
|
||||
def add_value(self, name, init_val=0, init_iter=0):
|
||||
self.avg_values[name] = init_val
|
||||
self.iters[name] = init_iter
|
||||
|
||||
def update_value(self, name, value, weighted_avg=False):
|
||||
if weighted_avg:
|
||||
self.avg_values[name] = 0.99 * self.avg_values[name] + 0.01 * value
|
||||
self.iters[name] += 1
|
||||
else:
|
||||
self.avg_values[name] = self.avg_values[name] * \
|
||||
self.iters[name] + value
|
||||
self.iters[name] += 1
|
||||
self.avg_values[name] /= self.iters[name]
|
||||
|
||||
def add_values(self, name_dict):
|
||||
for key, value in name_dict.items():
|
||||
self.add_value(key, init_val=value)
|
||||
|
||||
def update_values(self, value_dict):
|
||||
for key, value in value_dict.items():
|
||||
self.update_value(key, value)
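A small usage sketch matching how the training and evaluation loops above use this class:
# Usage sketch matching the train/eval loops above.
keep_avg = KeepAverage()
keep_avg.add_values({'avg_postnet_loss': 0, 'avg_decoder_loss': 0})
keep_avg.update_values({'avg_postnet_loss': 0.5, 'avg_decoder_loss': 0.8})
keep_avg.update_values({'avg_postnet_loss': 0.3, 'avg_decoder_loss': 0.6})
print(keep_avg['avg_postnet_loss'])   # 0.4, the running mean of the two updates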
|
||||
|
|
|
@ -0,0 +1,11 @@
|
|||
|
||||
def alignment_diagonal_score(alignments):
|
||||
"""
|
||||
Compute how diagonal the alignment predictions are. It is useful
|
||||
to measure the alignment consistency of a model
|
||||
Args:
|
||||
alignments (torch.Tensor): batch of alignments.
|
||||
Shape:
|
||||
alignments : batch x decoder_steps x encoder_steps
|
||||
"""
|
||||
return alignments.max(dim=1)[0].mean(dim=1).mean(dim=0).item()
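In other words, for each encoder step the maximum attention weight it ever receives is taken, and those maxima are averaged over the input and the batch; values near 1.0 indicate sharp, well-covered attention. A tiny sanity check:
import torch

# Sanity check with the assumed shape batch x decoder_steps x encoder_steps.
perfect = torch.eye(4).unsqueeze(0)       # one-to-one diagonal attention
uniform = torch.full((1, 4, 4), 0.25)     # attention spread evenly everywhere
print(alignment_diagonal_score(perfect))  # 1.0
print(alignment_diagonal_score(uniform))  # 0.25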
|
|
@ -25,9 +25,7 @@ def save_speaker_mapping(out_path, speaker_mapping):
|
|||
json.dump(speaker_mapping, f, indent=4)
|
||||
|
||||
|
||||
def get_speakers(data_root, meta_file, dataset_type):
|
||||
def get_speakers(items):
|
||||
"""Returns a sorted, unique list of speakers in a given dataset."""
|
||||
preprocessor = get_preprocessor_by_name(dataset_type)
|
||||
items = preprocessor(data_root, meta_file)
|
||||
speakers = {e[2] for e in items}
|
||||
return sorted(speakers)
|
||||
|
|