Merge pull request #489 from mozilla/multi-speaker

Multi speaker
Eren Gölge 2020-08-10 14:57:43 +02:00 committed by GitHub
commit 3f34829b78
35 changed files with 28773 additions and 318 deletions

View File

@ -18,9 +18,9 @@ from mozilla_voice_tts.utils.io import load_config
from mozilla_voice_tts.vocoder.utils.generic_utils import setup_generator
def tts(model, vocoder_model, text, CONFIG, use_cuda, ap, use_gl, speaker_id):
def tts(model, vocoder_model, text, CONFIG, use_cuda, ap, use_gl, speaker_fileid, speaker_embedding=None, gst_style=None):
t_1 = time.time()
waveform, _, _, mel_postnet_spec, _, _ = synthesis(model, text, CONFIG, use_cuda, ap, speaker_id, None, False, CONFIG.enable_eos_bos_chars, use_gl)
waveform, _, _, mel_postnet_spec, _, _ = synthesis(model, text, CONFIG, use_cuda, ap, speaker_fileid, gst_style, False, CONFIG.enable_eos_bos_chars, use_gl, speaker_embedding=speaker_embedding)
if CONFIG.model == "Tacotron" and not use_gl:
mel_postnet_spec = ap.out_linear_to_mel(mel_postnet_spec.T).T
if not use_gl:
@ -80,10 +80,15 @@ if __name__ == "__main__":
help="JSON file for multi-speaker model.",
default="")
parser.add_argument(
'--speaker_id',
type=int,
help="target speaker_id if the model is multi-speaker.",
'--speaker_fileid',
type=str,
help="if CONFIG.use_external_speaker_embedding_file is true, name of speaker embedding reference file present in speakers.json, else target speaker_fileid if the model is multi-speaker.",
default=None)
parser.add_argument(
'--gst_style',
help="Wav path file for GST stylereference.",
default=None)
args = parser.parse_args()
# load the config
@ -97,16 +102,24 @@ if __name__ == "__main__":
if 'characters' in C.keys():
symbols, phonemes = make_symbols(**C.characters)
speaker_embedding = None
speaker_embedding_dim = None
num_speakers = 0
# load speakers
if args.speakers_json != '':
speakers = json.load(open(args.speakers_json, 'r'))
num_speakers = len(speakers)
else:
num_speakers = 0
speaker_mapping = json.load(open(args.speakers_json, 'r'))
num_speakers = len(speaker_mapping)
if C.use_external_speaker_embedding_file:
if args.speaker_fileid is not None:
speaker_embedding = speaker_mapping[args.speaker_fileid]['embedding']
else: # if speaker_fileid is not specified, use the first sample in speakers.json
speaker_embedding = speaker_mapping[list(speaker_mapping.keys())[0]]['embedding']
speaker_embedding_dim = len(speaker_embedding)
# load the model
num_chars = len(phonemes) if C.use_phonemes else len(symbols)
model = setup_model(num_chars, num_speakers, C)
model = setup_model(num_chars, num_speakers, C, speaker_embedding_dim)
cp = torch.load(args.model_path, map_location=torch.device('cpu'))
model.load_state_dict(cp['model'])
model.eval()
@ -130,7 +143,27 @@ if __name__ == "__main__":
# synthesize voice
use_griffin_lim = args.vocoder_path == ""
print(" > Text: {}".format(args.text))
wav = tts(model, vocoder_model, args.text, C, args.use_cuda, ap, use_griffin_lim, args.speaker_id)
if not C.use_external_speaker_embedding_file:
if args.speaker_fileid.isdigit():
args.speaker_fileid = int(args.speaker_fileid)
else:
args.speaker_fileid = None
else:
args.speaker_fileid = None
if args.gst_style is None:
gst_style = C.gst['gst_style_input']
else:
# check whether gst_style is a JSON dict; if so, parse it, otherwise treat it as a wav path
try:
gst_style = json.loads(args.gst_style)
if max(map(int, gst_style.keys())) >= C.gst['gst_style_tokens']:
raise RuntimeError("The highest value of the gst_style dictionary key must be less than the number of GST Tokens, \n Highest dictionary key value: {} \n Number of GST tokens: {}".format(max(map(int, gst_style.keys())), C.gst['gst_style_tokens']))
except ValueError:
gst_style = args.gst_style
wav = tts(model, vocoder_model, args.text, C, args.use_cuda, ap, use_griffin_lim, args.speaker_fileid, speaker_embedding=speaker_embedding, gst_style=gst_style)
# save the results
file_name = args.text.replace(" ", "_")
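
To make the new --gst_style handling above easier to follow, here is a minimal standalone sketch of the same parsing logic; the helper name parse_gst_style is hypothetical and only mirrors the hunk above.

import json

def parse_gst_style(gst_style_arg, num_gst_tokens):
    """Hypothetical helper mirroring the hunk above: a JSON dict string such as
    '{"0": 0.15, "5": -0.15}' maps style-token indices to weights; any other
    string is treated as the path to a reference wav."""
    try:
        style = json.loads(gst_style_arg)
        if max(map(int, style.keys())) >= num_gst_tokens:
            raise RuntimeError("gst_style token index exceeds the number of GST tokens")
        return style
    except ValueError:
        return gst_style_arg  # not valid JSON, assume it is a wav path

print(parse_gst_style('{"0": 0.15, "5": -0.15}', 10))  # {'0': 0.15, '5': -0.15}
print(parse_gst_style('reference.wav', 10))            # 'reference.wav'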

View File

@ -10,21 +10,21 @@ import traceback
import torch
from torch.utils.data import DataLoader
from mozilla_voice_tts.generic_utils import count_parameters
from mozilla_voice_tts.speaker_encoder.dataset import MyDataset
from mozilla_voice_tts.speaker_encoder.generic_utils import save_best_model
from mozilla_voice_tts.speaker_encoder.loss import GE2ELoss
from mozilla_voice_tts.speaker_encoder.losses import GE2ELoss, AngleProtoLoss
from mozilla_voice_tts.speaker_encoder.model import SpeakerEncoder
from mozilla_voice_tts.speaker_encoder.visual import plot_embeddings
from mozilla_voice_tts.tts.datasets.preprocess import load_meta_data
from mozilla_voice_tts.tts.utils.audio import AudioProcessor
from mozilla_voice_tts.tts.utils.generic_utils import (
create_experiment_folder, get_git_branch, remove_experiment_folder,
set_init_dict)
from mozilla_voice_tts.tts.utils.io import copy_config_file, load_config
from mozilla_voice_tts.tts.utils.radam import RAdam
from mozilla_voice_tts.tts.utils.tensorboard_logger import TensorboardLogger
from mozilla_voice_tts.tts.utils.training import NoamLR, check_update
from mozilla_voice_tts.utils.audio import AudioProcessor
from mozilla_voice_tts.utils.generic_utils import count_parameters
from mozilla_voice_tts.utils.radam import RAdam
from mozilla_voice_tts.utils.tensorboard_logger import TensorboardLogger
from mozilla_voice_tts.utils.training import NoamLR, check_update
torch.backends.cudnn.enabled = True
torch.backends.cudnn.benchmark = True
@ -100,7 +100,7 @@ def train(model, criterion, optimizer, scheduler, ap, global_step):
if global_step % c.steps_plot_stats == 0:
# Plot Training Epoch Stats
train_stats = {
"GE2Eloss": avg_loss,
"loss": avg_loss,
"lr": current_lr,
"grad_norm": grad_norm,
"step_time": step_time
@ -135,12 +135,18 @@ def main(args): # pylint: disable=redefined-outer-name
global meta_data_eval
ap = AudioProcessor(**c.audio)
model = SpeakerEncoder(input_dim=40,
proj_dim=128,
lstm_dim=384,
num_lstm_layers=3)
model = SpeakerEncoder(input_dim=c.model['input_dim'],
proj_dim=c.model['proj_dim'],
lstm_dim=c.model['lstm_dim'],
num_lstm_layers=c.model['num_lstm_layers'])
optimizer = RAdam(model.parameters(), lr=c.lr)
criterion = GE2ELoss(loss_method='softmax')
if c.loss == "ge2e":
criterion = GE2ELoss(loss_method='softmax')
elif c.loss == "angleproto":
criterion = AngleProtoLoss()
else:
raise Exception("The %s not is a loss supported" % c.loss)
if args.restore_path:
checkpoint = torch.load(args.restore_path)
@ -242,7 +248,7 @@ if __name__ == '__main__':
new_fields)
LOG_DIR = OUT_PATH
tb_logger = TensorboardLogger(LOG_DIR)
tb_logger = TensorboardLogger(LOG_DIR, model_name='Speaker_Encoder')
try:
main(args)

View File

@ -49,7 +49,7 @@ from mozilla_voice_tts.utils.training import (NoamLR, adam_weight_decay,
use_cuda, num_gpus = setup_torch_training_env(True, False)
def setup_loader(ap, r, is_val=False, verbose=False):
def setup_loader(ap, r, is_val=False, verbose=False, speaker_mapping=None):
if is_val and not c.run_eval:
loader = None
else:
@ -68,7 +68,8 @@ def setup_loader(ap, r, is_val=False, verbose=False):
use_phonemes=c.use_phonemes,
phoneme_language=c.phoneme_language,
enable_eos_bos=c.enable_eos_bos_chars,
verbose=verbose)
verbose=verbose,
speaker_mapping=speaker_mapping if c.use_speaker_embedding and c.use_external_speaker_embedding_file else None)
sampler = DistributedSampler(dataset) if num_gpus > 1 else None
loader = DataLoader(
dataset,
@ -82,9 +83,8 @@ def setup_loader(ap, r, is_val=False, verbose=False):
pin_memory=False)
return loader
def format_data(data):
if c.use_speaker_embedding:
def format_data(data, speaker_mapping=None):
if speaker_mapping is None and c.use_speaker_embedding and not c.use_external_speaker_embedding_file:
speaker_mapping = load_speaker_mapping(OUT_PATH)
# setup input data
@ -99,13 +99,20 @@ def format_data(data):
avg_spec_length = torch.mean(mel_lengths.float())
if c.use_speaker_embedding:
speaker_ids = [
speaker_mapping[speaker_name] for speaker_name in speaker_names
]
speaker_ids = torch.LongTensor(speaker_ids)
if c.use_external_speaker_embedding_file:
speaker_embeddings = data[8]
speaker_ids = None
else:
speaker_ids = [
speaker_mapping[speaker_name] for speaker_name in speaker_names
]
speaker_ids = torch.LongTensor(speaker_ids)
speaker_embeddings = None
else:
speaker_embeddings = None
speaker_ids = None
# set stop targets view, we predict a single stop token per iteration.
stop_targets = stop_targets.view(text_input.shape[0],
stop_targets.size(1) // c.r, -1)
@ -122,13 +129,16 @@ def format_data(data):
stop_targets = stop_targets.cuda(non_blocking=True)
if speaker_ids is not None:
speaker_ids = speaker_ids.cuda(non_blocking=True)
return text_input, text_lengths, mel_input, mel_lengths, linear_input, stop_targets, speaker_ids, avg_text_length, avg_spec_length
if speaker_embeddings is not None:
speaker_embeddings = speaker_embeddings.cuda(non_blocking=True)
return text_input, text_lengths, mel_input, mel_lengths, linear_input, stop_targets, speaker_ids, speaker_embeddings, avg_text_length, avg_spec_length
def train(model, criterion, optimizer, optimizer_st, scheduler,
ap, global_step, epoch, amp):
ap, global_step, epoch, amp, speaker_mapping=None):
data_loader = setup_loader(ap, model.decoder.r, is_val=False,
verbose=(epoch == 0))
verbose=(epoch == 0), speaker_mapping=speaker_mapping)
model.train()
epoch_time = 0
keep_avg = KeepAverage()
@ -143,7 +153,7 @@ def train(model, criterion, optimizer, optimizer_st, scheduler,
start_time = time.time()
# format data
text_input, text_lengths, mel_input, mel_lengths, linear_input, stop_targets, speaker_ids, avg_text_length, avg_spec_length = format_data(data)
text_input, text_lengths, mel_input, mel_lengths, linear_input, stop_targets, speaker_ids, speaker_embeddings, avg_text_length, avg_spec_length = format_data(data, speaker_mapping)
loader_time = time.time() - end_time
global_step += 1
@ -158,10 +168,10 @@ def train(model, criterion, optimizer, optimizer_st, scheduler,
# forward pass model
if c.bidirectional_decoder or c.double_decoder_consistency:
decoder_output, postnet_output, alignments, stop_tokens, decoder_backward_output, alignments_backward = model(
text_input, text_lengths, mel_input, mel_lengths, speaker_ids=speaker_ids)
text_input, text_lengths, mel_input, mel_lengths, speaker_ids=speaker_ids, speaker_embeddings=speaker_embeddings)
else:
decoder_output, postnet_output, alignments, stop_tokens = model(
text_input, text_lengths, mel_input, mel_lengths, speaker_ids=speaker_ids)
text_input, text_lengths, mel_input, mel_lengths, speaker_ids=speaker_ids, speaker_embeddings=speaker_embeddings)
decoder_backward_output = None
alignments_backward = None
@ -312,8 +322,8 @@ def train(model, criterion, optimizer, optimizer_st, scheduler,
@torch.no_grad()
def evaluate(model, criterion, ap, global_step, epoch):
data_loader = setup_loader(ap, model.decoder.r, is_val=True)
def evaluate(model, criterion, ap, global_step, epoch, speaker_mapping=None):
data_loader = setup_loader(ap, model.decoder.r, is_val=True, speaker_mapping=speaker_mapping)
model.eval()
epoch_time = 0
keep_avg = KeepAverage()
@ -323,16 +333,16 @@ def evaluate(model, criterion, ap, global_step, epoch):
start_time = time.time()
# format data
text_input, text_lengths, mel_input, mel_lengths, linear_input, stop_targets, speaker_ids, _, _ = format_data(data)
text_input, text_lengths, mel_input, mel_lengths, linear_input, stop_targets, speaker_ids, speaker_embeddings, _, _ = format_data(data, speaker_mapping)
assert mel_input.shape[1] % model.decoder.r == 0
# forward pass model
if c.bidirectional_decoder or c.double_decoder_consistency:
decoder_output, postnet_output, alignments, stop_tokens, decoder_backward_output, alignments_backward = model(
text_input, text_lengths, mel_input, speaker_ids=speaker_ids)
text_input, text_lengths, mel_input, speaker_ids=speaker_ids, speaker_embeddings=speaker_embeddings)
else:
decoder_output, postnet_output, alignments, stop_tokens = model(
text_input, text_lengths, mel_input, speaker_ids=speaker_ids)
text_input, text_lengths, mel_input, speaker_ids=speaker_ids, speaker_embeddings=speaker_embeddings)
decoder_backward_output = None
alignments_backward = None
@ -494,22 +504,41 @@ def main(args): # pylint: disable=redefined-outer-name
if c.use_speaker_embedding:
speakers = get_speakers(meta_data_train)
if args.restore_path:
prev_out_path = os.path.dirname(args.restore_path)
speaker_mapping = load_speaker_mapping(prev_out_path)
assert all([speaker in speaker_mapping
for speaker in speakers]), "As of now you, you cannot " \
"introduce new speakers to " \
"a previously trained model."
else:
if c.use_external_speaker_embedding_file: # restoring a checkpoint and using an external embedding file
prev_out_path = os.path.dirname(args.restore_path)
speaker_mapping = load_speaker_mapping(prev_out_path)
if not speaker_mapping:
print("WARNING: speakers.json was not found in restore_path, trying to use CONFIG.external_speaker_embedding_file")
speaker_mapping = load_speaker_mapping(c.external_speaker_embedding_file)
if not speaker_mapping:
raise RuntimeError("You must copy the file speakers.json to restore_path, or set a valid file in CONFIG.external_speaker_embedding_file")
speaker_embedding_dim = len(speaker_mapping[list(speaker_mapping.keys())[0]]['embedding'])
elif not c.use_external_speaker_embedding_file: # restoring a checkpoint without an external embedding file
prev_out_path = os.path.dirname(args.restore_path)
speaker_mapping = load_speaker_mapping(prev_out_path)
speaker_embedding_dim = None
assert all([speaker in speaker_mapping
for speaker in speakers]), "As of now you, you cannot " \
"introduce new speakers to " \
"a previously trained model."
elif c.use_external_speaker_embedding_file and c.external_speaker_embedding_file: # starting a new training run with an external embedding file
speaker_mapping = load_speaker_mapping(c.external_speaker_embedding_file)
speaker_embedding_dim = len(speaker_mapping[list(speaker_mapping.keys())[0]]['embedding'])
elif c.use_external_speaker_embedding_file and not c.external_speaker_embedding_file: # starting a new training run with use_external_speaker_embedding_file set but no file given
raise RuntimeError("use_external_speaker_embedding_file is True, so you need to pass an external speaker embedding file. Run the GE2E-Speaker_Encoder-ExtractSpeakerEmbeddings-by-sample.ipynb or AngularPrototypical-Speaker_Encoder-ExtractSpeakerEmbeddings-by-sample.ipynb notebook in the notebooks/ folder.")
else: # starting a new training run without an external embedding file
speaker_mapping = {name: i for i, name in enumerate(speakers)}
speaker_embedding_dim = None
save_speaker_mapping(OUT_PATH, speaker_mapping)
num_speakers = len(speaker_mapping)
print("Training with {} speakers: {}".format(num_speakers,
", ".join(speakers)))
else:
num_speakers = 0
speaker_embedding_dim = None
speaker_mapping = None
model = setup_model(num_chars, num_speakers, c)
model = setup_model(num_chars, num_speakers, c, speaker_embedding_dim)
params = set_weight_decay(model, c.wd)
optimizer = RAdam(params, lr=c.lr, weight_decay=0)
@ -544,6 +573,8 @@ def main(args): # pylint: disable=redefined-outer-name
print(" > Partial model initialization.")
model_dict = model.state_dict()
model_dict = set_init_dict(model_dict, checkpoint['model'], c)
# torch.save(model_dict, os.path.join(OUT_PATH, 'state_dict.pt'))
# print("State Dict saved for debug in: ", os.path.join(OUT_PATH, 'state_dict.pt'))
model.load_state_dict(model_dict)
del model_dict
@ -592,7 +623,7 @@ def main(args): # pylint: disable=redefined-outer-name
print("\n > Number of output frames:", model.decoder.r)
train_avg_loss_dict, global_step = train(model, criterion, optimizer,
optimizer_st, scheduler, ap,
global_step, epoch, amp)
global_step, epoch, amp, speaker_mapping)
eval_avg_loss_dict = evaluate(model, criterion, ap, global_step, epoch)
c_logger.print_epoch_end(epoch, eval_avg_loss_dict)
target_loss = train_avg_loss_dict['avg_postnet_loss']
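
The restore branches above read per-sample speaker embeddings from a speakers.json produced by the speaker encoder notebooks. Below is a minimal, hedged sketch of the layout those lookups assume; the file name, speaker names and embedding values are illustrative, and only the 'embedding' field is read by the code above (the 'name' field is an assumption).

import json

# Illustrative layout assumed by load_speaker_mapping and the [...]["embedding"] lookups above.
# Keys are per-sample wav file names; embeddings here are 3-d for brevity (real ones are e.g. 256-d).
speaker_mapping = {
    "p225_001.wav": {"name": "p225", "embedding": [0.01, -0.23, 0.11]},
    "p226_001.wav": {"name": "p226", "embedding": [0.02, -0.19, 0.09]},
}
with open("speakers.json", "w") as f:
    json.dump(speaker_mapping, f, indent=2)

# the training script infers the embedding size from the first entry:
speaker_embedding_dim = len(speaker_mapping[list(speaker_mapping.keys())[0]]["embedding"])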

View File

@ -1,26 +1,33 @@
{
"run_name": "libritts_360-half",
"run_description": "train speaker encoder for libritts 360",
"audio": {
"run_name": "Model compatible to CorentinJ/Real-Time-Voice-Cloning",
"run_description": "train speaker encoder with voxceleb1, voxceleb2 and libriSpeech ",
"audio":{
// Audio processing parameters
"num_mels": 40, // size of the mel spec frame.
"num_freq": 1025, // number of stft frequency levels. Size of the linear spectogram frame.
"sample_rate": 16000, // DATASET-RELATED: wav sample-rate. If different than the original data, it is resampled.
"frame_length_ms": 50, // stft window length in ms.
"frame_shift_ms": 12.5, // stft window hop-lengh in ms.
"preemphasis": 0.98, // pre-emphasis to reduce spec noise and make it more structured. If 0.0, no -pre-emphasis.
"min_level_db": -100, // normalization range
"ref_level_db": 20, // reference level db, theoretically 20db is the sound of air.
"num_mels": 40, // size of the mel spec frame.
"fft_size": 400, // number of stft frequency levels. Size of the linear spectogram frame.
"sample_rate": 16000, // DATASET-RELATED: wav sample-rate. If different than the original data, it is resampled.
"win_length": 400, // stft window length in ms.
"hop_length": 160, // stft window hop-lengh in ms.
"frame_length_ms": null, // stft window length in ms.If null, 'win_length' is used.
"frame_shift_ms": null, // stft window hop-lengh in ms. If null, 'hop_length' is used.
"preemphasis": 0.98, // pre-emphasis to reduce spec noise and make it more structured. If 0.0, no -pre-emphasis.
"min_level_db": -100, // normalization range
"ref_level_db": 20, // reference level db, theoretically 20db is the sound of air.
"power": 1.5, // value to sharpen wav signals after GL algorithm.
"griffin_lim_iters": 60,// #griffin-lim iterations. 30-60 is a good range. Larger the value, slower the generation.
// Normalization parameters
"signal_norm": true, // normalize the spec values in range [0, 1]
"signal_norm": true, // normalize the spec values in range [0, 1]
"symmetric_norm": true, // move normalization to range [-1, 1]
"max_norm": 4, // scale normalization to range [-max_norm, max_norm] or [0, max_norm]
"clip_norm": true, // clip normalized values into the range.
"mel_fmin": 0.0, // minimum freq level for mel-spec. ~50 for male and ~95 for female voices. Tune for dataset!!
"mel_fmax": 8000.0, // maximum freq level for mel-spec. Tune for dataset!!
"do_trim_silence": false // enable trimming of slience of audio as you load it. LJspeech (false), TWEB (false), Nancy (true)
"max_norm": 4.0, // scale normalization to range [-max_norm, max_norm] or [0, max_norm]
"clip_norm": true, // clip normalized values into the range.
"mel_fmin": 0.0, // minimum freq level for mel-spec. ~50 for male and ~95 for female voices. Tune for dataset!!
"mel_fmax": 8000.0, // maximum freq level for mel-spec. Tune for dataset!!
"do_trim_silence": false, // enable trimming of slience of audio as you load it. LJspeech (false), TWEB (false), Nancy (true)
"trim_db": 60 // threshold for timming silence. Set this according to your dataset.
},
"reinit_layers": [],
"loss": "ge2e", // "ge2e" to use Generalized End-to-End loss and "angleproto" to use Angular Prototypical loss (new SOTA)
"grad_clip": 3.0, // upper limit for gradients for clipping.
"epochs": 1000, // total number of epochs to train.
"lr": 0.0001, // Initial learning rate. If Noam decay is active, maximum learning rate.
@ -29,29 +36,24 @@
"tb_model_param_stats": false, // true, plots param stats per layer on tensorboard. Might be memory consuming, but good for debugging.
"steps_plot_stats": 10, // number of steps to plot embeddings.
"num_speakers_in_batch": 32, // Batch size for training. Lower values than 32 might cause hard to learn attention. It is overwritten by 'gradual_training'.
"num_loader_workers": 4, // number of training data loader processes. Don't set it too big. 4-8 are good values.
"wd": 0.000001, // Weight decay weight.
"checkpoint": true, // If true, it saves checkpoints per "save_step"
"save_step": 1000, // Number of training steps expected to save traning stats and checkpoints.
"print_step": 1, // Number of steps to log traning on console.
"output_path": "/media/erogol/data_ssd/Models/libri_tts/speaker_encoder/", // DATASET-RELATED: output path for all training outputs.
"num_loader_workers": 0, // number of training data loader processes. Don't set it too big. 4-8 are good values.
"output_path": "../../checkpoints/voxceleb_librispeech/speaker_encoder/", // DATASET-RELATED: output path for all training outputs.
"model": {
"input_dim": 40,
"proj_dim": 128,
"lstm_dim": 384,
"num_lstm_layers": 3
"proj_dim": 256,
"lstm_dim": 256,
"num_lstm_layers": 3,
"use_lstm_with_projection": false
},
"datasets":
[
{
"name": "libri_tts",
"path": "/home/erogol/Data/Libri-TTS/train-clean-360/",
"meta_file_train": null,
"meta_file_val": null
},
{
"name": "libri_tts",
"path": "/home/erogol/Data/Libri-TTS/train-clean-100/",
"name": "vctk",
"path": "../../../datasets/VCTK-Corpus-removed-silence/",
"meta_file_train": null,
"meta_file_val": null
}

View File

@ -31,7 +31,7 @@ class MyDataset(Dataset):
print(f" | > Num speakers: {len(self.speakers)}")
def load_wav(self, filename):
audio = self.ap.load_wav(filename)
audio = self.ap.load_wav(filename, sr=self.ap.sample_rate)
return audio
def load_data(self, idx):

View File

@ -15,7 +15,7 @@ def save_checkpoint(model, optimizer, model_loss, out_path,
'optimizer': optimizer.state_dict() if optimizer is not None else None,
'step': current_step,
'epoch': epoch,
'GE2Eloss': model_loss,
'loss': model_loss,
'date': datetime.date.today().strftime("%B %d, %Y"),
}
torch.save(state, checkpoint_path)
@ -29,7 +29,7 @@ def save_best_model(model, optimizer, model_loss, best_loss, out_path,
'model': new_state_dict,
'optimizer': optimizer.state_dict(),
'step': current_step,
'GE2Eloss': model_loss,
'loss': model_loss,
'date': datetime.date.today().strftime("%B %d, %Y"),
}
best_loss = model_loss

View File

@ -1,7 +1,7 @@
import torch
import torch.nn as nn
import torch.nn.functional as F
import numpy as np
# adapted from https://github.com/cvqluu/GE2E-Loss
class GE2ELoss(nn.Module):
@ -23,6 +23,8 @@ class GE2ELoss(nn.Module):
self.b = nn.Parameter(torch.tensor(init_b))
self.loss_method = loss_method
print(' > Initialised Generalized End-to-End loss')
assert self.loss_method in ["softmax", "contrast"]
if self.loss_method == "softmax":
@ -119,3 +121,40 @@ class GE2ELoss(nn.Module):
cos_sim_matrix = self.w * cos_sim_matrix + self.b
L = self.embed_loss(dvecs, cos_sim_matrix)
return L.mean()
# adapted from https://github.com/clovaai/voxceleb_trainer/blob/master/loss/angleproto.py
class AngleProtoLoss(nn.Module):
"""
Implementation of the Angular Prototypical loss defined in https://arxiv.org/abs/2003.11982
Accepts an input of size (N, M, D)
where N is the number of speakers in the batch,
M is the number of utterances per speaker,
and D is the dimensionality of the embedding vector
Args:
- init_w (float): defines the initial value of w
- init_b (float): defines the initial value of b
"""
def __init__(self, init_w=10.0, init_b=-5.0):
super(AngleProtoLoss, self).__init__()
# pylint: disable=E1102
self.w = nn.Parameter(torch.tensor(init_w))
# pylint: disable=E1102
self.b = nn.Parameter(torch.tensor(init_b))
self.criterion = torch.nn.CrossEntropyLoss()
print(' > Initialised Angular Prototypical loss')
def forward(self, x):
"""
Calculates the AngleProto loss for an input of dimensions (num_speakers, num_utts_per_speaker, dvec_feats)
"""
out_anchor = torch.mean(x[:, 1:, :], 1)
out_positive = x[:, 0, :]
num_speakers = out_anchor.size()[0]
cos_sim_matrix = F.cosine_similarity(out_positive.unsqueeze(-1).expand(-1, -1, num_speakers), out_anchor.unsqueeze(-1).expand(-1, -1, num_speakers).transpose(0, 2))
self.w.data.clamp_(min=1e-6)  # clamp the learned scale in place; a bare torch.clamp call would be a no-op
cos_sim_matrix = cos_sim_matrix * self.w + self.b
label = torch.from_numpy(np.asarray(range(0, num_speakers))).to(cos_sim_matrix.device)
L = self.criterion(cos_sim_matrix, label)
return L
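
A short, hedged sketch of driving the new loss modules with the (N, M, D) batch convention from the docstring; the import path follows the train_encoder.py hunk above and the shapes are illustrative.

import torch
import torch.nn.functional as F
from mozilla_voice_tts.speaker_encoder.losses import GE2ELoss, AngleProtoLoss

# toy batch: 4 speakers, 5 utterances per speaker, 256-d d-vectors
dvecs = F.normalize(torch.randn(4, 5, 256), p=2, dim=-1)

criterion = AngleProtoLoss()           # or GE2ELoss(loss_method='softmax')
loss = criterion(dvecs)                # scalar tensor
loss.backward()                        # gradients reach the loss's learnable w and b
print(loss.item())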

View File

@ -16,15 +16,33 @@ class LSTMWithProjection(nn.Module):
o, (_, _) = self.lstm(x)
return self.linear(o)
class LSTMWithoutProjection(nn.Module):
def __init__(self, input_dim, lstm_dim, proj_dim, num_lstm_layers):
super().__init__()
self.lstm = nn.LSTM(input_size=input_dim,
hidden_size=lstm_dim,
num_layers=num_lstm_layers,
batch_first=True)
self.linear = nn.Linear(lstm_dim, proj_dim, bias=True)
self.relu = nn.ReLU()
def forward(self, x):
_, (hidden, _) = self.lstm(x)
return self.relu(self.linear(hidden[-1]))
class SpeakerEncoder(nn.Module):
def __init__(self, input_dim, proj_dim=256, lstm_dim=768, num_lstm_layers=3):
def __init__(self, input_dim, proj_dim=256, lstm_dim=768, num_lstm_layers=3, use_lstm_with_projection=True):
super().__init__()
self.use_lstm_with_projection = use_lstm_with_projection
layers = []
layers.append(LSTMWithProjection(input_dim, lstm_dim, proj_dim))
for _ in range(num_lstm_layers - 1):
layers.append(LSTMWithProjection(proj_dim, lstm_dim, proj_dim))
self.layers = nn.Sequential(*layers)
# choose the LSTM layer variant
if use_lstm_with_projection:
layers.append(LSTMWithProjection(input_dim, lstm_dim, proj_dim))
for _ in range(num_lstm_layers - 1):
layers.append(LSTMWithProjection(proj_dim, lstm_dim, proj_dim))
self.layers = nn.Sequential(*layers)
else:
self.layers = LSTMWithoutProjection(input_dim, lstm_dim, proj_dim, num_lstm_layers)
self._init_layers()
def _init_layers(self):
@ -37,12 +55,18 @@ class SpeakerEncoder(nn.Module):
def forward(self, x):
# TODO: implement state passing for lstms
d = self.layers(x)
d = torch.nn.functional.normalize(d[:, -1], p=2, dim=1)
if self.use_lstm_with_projection:
d = torch.nn.functional.normalize(d[:, -1], p=2, dim=1)
else:
d = torch.nn.functional.normalize(d, p=2, dim=1)
return d
def inference(self, x):
d = self.layers.forward(x)
d = torch.nn.functional.normalize(d[:, -1], p=2, dim=1)
if self.use_lstm_with_projection:
d = torch.nn.functional.normalize(d[:, -1], p=2, dim=1)
else:
d = torch.nn.functional.normalize(d, p=2, dim=1)
return d
def compute_embedding(self, x, num_frames=160, overlap=0.5):

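A hedged sketch contrasting the two encoder variants enabled above; the constructor arguments follow the config hunks earlier in this commit and the input shapes are illustrative.

import torch
from mozilla_voice_tts.speaker_encoder.model import SpeakerEncoder

x = torch.randn(8, 160, 40)  # batch of 8 utterances, 160 frames, 40-d mel features

# original behaviour: a stack of LSTMWithProjection layers
enc_proj = SpeakerEncoder(input_dim=40, proj_dim=128, lstm_dim=384, num_lstm_layers=3)
# new option: plain LSTM plus linear head (matches the CorentinJ-compatible config above)
enc_plain = SpeakerEncoder(input_dim=40, proj_dim=256, lstm_dim=256,
                           num_lstm_layers=3, use_lstm_with_projection=False)

print(enc_proj(x).shape)   # torch.Size([8, 128]), L2-normalized embeddings
print(enc_plain(x).shape)  # torch.Size([8, 256]), L2-normalized embeddings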
View File

@ -123,28 +123,37 @@
"max_seq_len": 153, // DATASET-RELATED: maximum text length
// PATHS
"output_path": "/home/erogol/Models/LJSpeech/",
"output_path": "../../Mozilla-TTS/vctk-test/",
// PHONEMES
"phoneme_cache_path": "/media/erogol/data_ssd2/mozilla_us_phonemes_3", // phoneme computation is slow, therefore, it caches results in the given folder.
"phoneme_cache_path": "../../Mozilla-TTS/vctk-test/", // phoneme computation is slow, therefore, it caches results in the given folder.
"use_phonemes": true, // use phonemes instead of raw characters. It is suggested for better pronounciation.
"phoneme_language": "en-us", // depending on your target language, pick one from https://github.com/bootphon/phonemizer#languages
// MULTI-SPEAKER and GST
"use_speaker_embedding": false, // use speaker embedding to enable multi-speaker learning.
"style_wav_for_test": null, // path to style wav file to be used in TacotronGST inference.
"use_gst": false, // TACOTRON ONLY: use global style tokens
"use_speaker_embedding": true, // use speaker embedding to enable multi-speaker learning.
"use_external_speaker_embedding_file": false, // if true, forces the model to use external embedding per sample instead of nn.embeddings, that is, it supports external embeddings such as those used at: https://arxiv.org/abs /1806.04558
"external_speaker_embedding_file": "../../speakers-vctk-en.json", // if not null and use_external_speaker_embedding_file is true, it is used to load a specific embedding file and thus uses these embeddings instead of nn.embeddings, that is, it supports external embeddings such as those used at: https://arxiv.org/abs /1806.04558
"use_gst": true, // use global style tokens
"gst": { // gst parameter if gst is enabled
"gst_style_input": null, // Condition the style input either on a
// -> wave file [path to wave] or
// -> dictionary using the style tokens {'token1': 'value', 'token2': 'value'} example {"0": 0.15, "1": 0.15, "5": -0.15}
// with the dictionary being len(dict) <= len(gst_style_tokens).
"gst_embedding_dim": 512,
"gst_num_heads": 4,
"gst_style_tokens": 10
},
// DATASETS
"datasets": // List of datasets. They all merged and they get different speaker_ids.
[
{
"name": "ljspeech",
"path": "/home/erogol/Data/LJSpeech-1.1/",
"meta_file_train": "metadata.csv",
"name": "vctk",
"path": "../../../datasets/VCTK-Corpus-removed-silence/",
"meta_file_train": ["p225", "p234", "p238", "p245", "p248", "p261", "p294", "p302", "p326", "p335", "p347"], // for vtck if list, ignore speakers id in list for train, its useful for test cloning with new speakers
"meta_file_val": null
}
]
}

View File

@ -24,6 +24,7 @@ class MyDataset(Dataset):
phoneme_cache_path=None,
phoneme_language="en-us",
enable_eos_bos=False,
speaker_mapping=None,
verbose=False):
"""
Args:
@ -58,6 +59,7 @@ class MyDataset(Dataset):
self.phoneme_cache_path = phoneme_cache_path
self.phoneme_language = phoneme_language
self.enable_eos_bos = enable_eos_bos
self.speaker_mapping = speaker_mapping
self.verbose = verbose
if use_phonemes and not os.path.isdir(phoneme_cache_path):
os.makedirs(phoneme_cache_path, exist_ok=True)
@ -127,7 +129,8 @@ class MyDataset(Dataset):
'text': text,
'wav': wav,
'item_idx': self.items[idx][1],
'speaker_name': speaker_name
'speaker_name': speaker_name,
'wav_file_name': os.path.basename(wav_file)
}
return sample
@ -191,9 +194,15 @@ class MyDataset(Dataset):
batch[idx]['item_idx'] for idx in ids_sorted_decreasing
]
text = [batch[idx]['text'] for idx in ids_sorted_decreasing]
speaker_name = [batch[idx]['speaker_name']
for idx in ids_sorted_decreasing]
# get speaker embeddings
if self.speaker_mapping is not None:
wav_files_names = [batch[idx]['wav_file_name'] for idx in ids_sorted_decreasing]
speaker_embedding = [self.speaker_mapping[w]['embedding'] for w in wav_files_names]
else:
speaker_embedding = None
# compute features
mel = [self.ap.melspectrogram(w).astype('float32') for w in wav]
@ -224,6 +233,9 @@ class MyDataset(Dataset):
mel_lengths = torch.LongTensor(mel_lengths)
stop_targets = torch.FloatTensor(stop_targets)
if speaker_embedding is not None:
speaker_embedding = torch.FloatTensor(speaker_embedding)
# compute linear spectrogram
if self.compute_linear_spec:
linear = [self.ap.spectrogram(w).astype('float32') for w in wav]
@ -234,7 +246,7 @@ class MyDataset(Dataset):
else:
linear = None
return text, text_lenghts, speaker_name, linear, mel, mel_lengths, \
stop_targets, item_idxs
stop_targets, item_idxs, speaker_embedding
raise TypeError(("batch must contain tensors, numbers, dicts or lists;\
found {}".format(type(batch[0]))))

View File

@ -93,9 +93,10 @@ def mozilla_de(root_path, meta_file):
def mailabs(root_path, meta_files=None):
"""Normalizes M-AI-Labs meta data files to TTS format"""
speaker_regex = re.compile("by_book/(male|female)/(?P<speaker_name>[^/]+)/")
speaker_regex = re.compile(
"by_book/(male|female)/(?P<speaker_name>[^/]+)/")
if meta_files is None:
csv_files = glob(root_path+"/**/metadata.csv", recursive=True)
csv_files = glob(root_path + "/**/metadata.csv", recursive=True)
else:
csv_files = meta_files
# meta_files = [f.strip() for f in meta_files.split(",")]
@ -115,12 +116,15 @@ def mailabs(root_path, meta_files=None):
if meta_files is None:
wav_file = os.path.join(folder, 'wavs', cols[0] + '.wav')
else:
wav_file = os.path.join(root_path, folder.replace("metadata.csv", ""), 'wavs', cols[0] + '.wav')
wav_file = os.path.join(root_path,
folder.replace("metadata.csv", ""),
'wavs', cols[0] + '.wav')
if os.path.isfile(wav_file):
text = cols[1].strip()
items.append([text, wav_file, speaker_name])
else:
raise RuntimeError("> File %s does not exist!"%(wav_file))
raise RuntimeError("> File %s does not exist!" %
(wav_file))
return items
@ -185,7 +189,8 @@ def libri_tts(root_path, meta_files=None):
text = cols[1]
items.append([text, wav_file, speaker_name])
for item in items:
assert os.path.exists(item[1]), f" [!] wav files don't exist - {item[1]}"
assert os.path.exists(
item[1]), f" [!] wav files don't exist - {item[1]}"
return items
@ -197,7 +202,8 @@ def custom_turkish(root_path, meta_file):
with open(txt_file, 'r', encoding='utf-8') as ttf:
for line in ttf:
cols = line.split('|')
wav_file = os.path.join(root_path, 'wavs', cols[0].strip() + '.wav')
wav_file = os.path.join(root_path, 'wavs',
cols[0].strip() + '.wav')
if not os.path.exists(wav_file):
skipped_files.append(wav_file)
continue
@ -205,3 +211,44 @@ def custom_turkish(root_path, meta_file):
items.append([text, wav_file, speaker_name])
print(f" [!] {len(skipped_files)} files skipped. They don't exist...")
return items
# ToDo: add the dataset link when the dataset is released publicly
def brspeech(root_path, meta_file):
'''BRSpeech 3.0 beta'''
txt_file = os.path.join(root_path, meta_file)
items = []
with open(txt_file, 'r') as ttf:
for line in ttf:
if line.startswith("wav_filename"):
continue
cols = line.split('|')
#print(cols)
wav_file = os.path.join(root_path, cols[0])
text = cols[2]
speaker_name = cols[3]
items.append([text, wav_file, speaker_name])
return items
def vctk(root_path, meta_files=None, wavs_path='wav48'):
"""homepages.inf.ed.ac.uk/jyamagis/release/VCTK-Corpus.tar.gz"""
test_speakers = meta_files
items = []
meta_files = glob(f"{os.path.join(root_path,'txt')}/**/*.txt",
recursive=True)
for meta_file in meta_files:
_, speaker_id, txt_file = os.path.relpath(meta_file,
root_path).split(os.sep)
file_id = txt_file.split('.')[0]
if isinstance(test_speakers,
list): # if a list is given, skip these speaker ids
if speaker_id in test_speakers:
continue
with open(meta_file) as file_text:
text = file_text.readlines()[0]
wav_file = os.path.join(root_path, wavs_path, speaker_id,
file_id + '.wav')
items.append([text, wav_file, speaker_id])
return items
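
A hedged usage sketch of the new vctk loader; the corpus path is illustrative and the held-out speaker list mirrors meta_file_train in the sample config above.

from mozilla_voice_tts.tts.datasets.preprocess import vctk

items = vctk("../../../datasets/VCTK-Corpus-removed-silence/",
             meta_files=["p225", "p234", "p238"])  # these speakers are skipped for training
text, wav_file, speaker_id = items[0]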

View File

@ -96,7 +96,7 @@ class StyleTokenLayer(nn.Module):
self.key_dim = embedding_dim // num_heads
self.style_tokens = nn.Parameter(
torch.FloatTensor(num_style_tokens, self.key_dim))
nn.init.orthogonal_(self.style_tokens)
nn.init.normal_(self.style_tokens, mean=0, std=0.5)
self.attention = MultiHeadAttention(
query_dim=self.query_dim,
key_dim=self.key_dim,

View File

@ -291,7 +291,7 @@ class Decoder(nn.Module):
def __init__(self, in_channels, frame_channels, r, memory_size, attn_type, attn_windowing,
attn_norm, prenet_type, prenet_dropout, forward_attn,
trans_agent, forward_attn_mask, location_attn, attn_K,
separate_stopnet, speaker_embedding_dim):
separate_stopnet):
super(Decoder, self).__init__()
self.r_init = r
self.r = r
@ -303,7 +303,7 @@ class Decoder(nn.Module):
self.separate_stopnet = separate_stopnet
self.query_dim = 256
# memory -> |Prenet| -> processed_memory
prenet_dim = frame_channels * self.memory_size + speaker_embedding_dim if self.use_memory_queue else frame_channels + speaker_embedding_dim
prenet_dim = frame_channels * self.memory_size if self.use_memory_queue else frame_channels
self.prenet = Prenet(
prenet_dim,
prenet_type,
@ -429,7 +429,7 @@ class Decoder(nn.Module):
# assert new_memory.shape[-1] == self.r * self.frame_channels
self.memory_input = new_memory[:, self.frame_channels * (self.r - 1):]
def forward(self, inputs, memory, mask, speaker_embeddings=None):
def forward(self, inputs, memory, mask):
"""
Args:
inputs: Encoder outputs.
@ -454,8 +454,7 @@ class Decoder(nn.Module):
if t > 0:
new_memory = memory[t - 1]
self._update_memory_input(new_memory)
if speaker_embeddings is not None:
self.memory_input = torch.cat([self.memory_input, speaker_embeddings], dim=-1)
output, stop_token, attention = self.decode(inputs, mask)
outputs += [output]
attentions += [attention]
@ -463,15 +462,12 @@ class Decoder(nn.Module):
t += 1
return self._parse_outputs(outputs, attentions, stop_tokens)
def inference(self, inputs, speaker_embeddings=None):
def inference(self, inputs):
"""
Args:
inputs: encoder outputs.
speaker_embeddings: speaker vectors.
Shapes:
- inputs: (B, T, D_out_enc)
- speaker_embeddings: (B, D_embed)
- inputs: batch x time x encoder_out_dim
"""
outputs = []
attentions = []
@ -484,8 +480,6 @@ class Decoder(nn.Module):
if t > 0:
new_memory = outputs[-1]
self._update_memory_input(new_memory)
if speaker_embeddings is not None:
self.memory_input = torch.cat([self.memory_input, speaker_embeddings], dim=-1)
output, stop_token, attention = self.decode(inputs, None)
stop_token = torch.sigmoid(stop_token.data)
outputs += [output]

View File

@ -141,14 +141,12 @@ class Decoder(nn.Module):
location_attn (bool): if true, use location sensitive attention.
attn_K (int): number of attention heads for GravesAttention.
separate_stopnet (bool): if true, detach stopnet input to prevent gradient flow.
speaker_embedding_dim (int): size of speaker embedding vector, for multi-speaker training.
"""
# Pylint gets confused by PyTorch conventions here
#pylint: disable=attribute-defined-outside-init
def __init__(self, in_channels, frame_channels, r, attn_type, attn_win, attn_norm,
prenet_type, prenet_dropout, forward_attn, trans_agent,
forward_attn_mask, location_attn, attn_K, separate_stopnet,
speaker_embedding_dim):
forward_attn_mask, location_attn, attn_K, separate_stopnet):
super(Decoder, self).__init__()
self.frame_channels = frame_channels
self.r_init = r
@ -157,7 +155,6 @@ class Decoder(nn.Module):
self.separate_stopnet = separate_stopnet
self.max_decoder_steps = 1000
self.stop_threshold = 0.5
self.speaker_embedding_dim = speaker_embedding_dim
# model dimensions
self.query_dim = 1024
@ -300,7 +297,7 @@ class Decoder(nn.Module):
decoder_output = decoder_output[:, :self.r * self.frame_channels]
return decoder_output, self.attention.attention_weights, stop_token
def forward(self, inputs, memories, mask, speaker_embeddings=None):
def forward(self, inputs, memories, mask):
r"""Train Decoder with teacher forcing.
Args:
inputs: Encoder outputs.
@ -318,8 +315,6 @@ class Decoder(nn.Module):
memories = self._reshape_memory(memories)
memories = torch.cat((memory, memories), dim=0)
memories = self._update_memory(memories)
if speaker_embeddings is not None:
memories = torch.cat([memories, speaker_embeddings], dim=-1)
memories = self.prenet(memories)
self._init_states(inputs, mask=mask)
@ -337,16 +332,14 @@ class Decoder(nn.Module):
outputs, stop_tokens, alignments)
return outputs, alignments, stop_tokens
def inference(self, inputs, speaker_embeddings=None):
def inference(self, inputs):
r"""Decoder inference without teacher forcing and use
Stopnet to stop decoder.
Args:
inputs: Encoder outputs.
speaker_embeddings: speaker embedding vectors.
Shapes:
- inputs: (B, T, D_out_enc)
- speaker_embeddings: (B, D_embed)
- outputs: (B, T_mel, D_mel)
- alignments: (B, T_in, T_out)
- stop_tokens: (B, T_out)
@ -360,8 +353,6 @@ class Decoder(nn.Module):
outputs, stop_tokens, alignments, t = [], [], [], 0
while True:
memory = self.prenet(memory)
if speaker_embeddings is not None:
memory = torch.cat([memory, speaker_embeddings], dim=-1)
decoder_output, alignment, stop_token = self.decode(memory)
stop_token = torch.sigmoid(stop_token.data)
outputs += [decoder_output.squeeze(1)]

View File

@ -28,7 +28,13 @@ class Tacotron(TacotronAbstract):
bidirectional_decoder=False,
double_decoder_consistency=False,
ddc_r=None,
encoder_in_features=256,
decoder_in_features=256,
speaker_embedding_dim=None,
gst=False,
gst_embedding_dim=256,
gst_num_heads=4,
gst_style_tokens=10,
memory_size=5):
super(Tacotron,
self).__init__(num_chars, num_speakers, r, postnet_output_dim,
@ -37,37 +43,41 @@ class Tacotron(TacotronAbstract):
forward_attn, trans_agent, forward_attn_mask,
location_attn, attn_K, separate_stopnet,
bidirectional_decoder, double_decoder_consistency,
ddc_r, gst)
decoder_in_features = 512 if num_speakers > 1 else 256
encoder_in_features = 512 if num_speakers > 1 else 256
speaker_embedding_dim = 256
proj_speaker_dim = 80 if num_speakers > 1 else 0
# base model layers
ddc_r, encoder_in_features, decoder_in_features,
speaker_embedding_dim, gst, gst_embedding_dim,
gst_num_heads, gst_style_tokens)
# speaker embedding layers
if self.num_speakers > 1:
if not self.embeddings_per_sample:
speaker_embedding_dim = 256
self.speaker_embedding = nn.Embedding(self.num_speakers, speaker_embedding_dim)
self.speaker_embedding.weight.data.normal_(0, 0.3)
# speaker and gst embeddings are concatenated with the decoder input
if self.num_speakers > 1:
self.decoder_in_features += speaker_embedding_dim # add speaker embedding dim
# embedding layer
self.embedding = nn.Embedding(num_chars, 256, padding_idx=0)
self.embedding.weight.data.normal_(0, 0.3)
self.encoder = Encoder(encoder_in_features)
self.decoder = Decoder(decoder_in_features, decoder_output_dim, r,
# base model layers
self.encoder = Encoder(self.encoder_in_features)
self.decoder = Decoder(self.decoder_in_features, decoder_output_dim, r,
memory_size, attn_type, attn_win, attn_norm,
prenet_type, prenet_dropout, forward_attn,
trans_agent, forward_attn_mask, location_attn,
attn_K, separate_stopnet, proj_speaker_dim)
attn_K, separate_stopnet)
self.postnet = PostCBHG(decoder_output_dim)
self.last_linear = nn.Linear(self.postnet.cbhg.gru_features * 2,
postnet_output_dim)
# speaker embedding layers
if num_speakers > 1:
self.speaker_embedding = nn.Embedding(num_speakers, speaker_embedding_dim)
self.speaker_embedding.weight.data.normal_(0, 0.3)
self.speaker_project_mel = nn.Sequential(
nn.Linear(speaker_embedding_dim, proj_speaker_dim), nn.Tanh())
self.speaker_embeddings = None
self.speaker_embeddings_projected = None
# global style token layers
if self.gst:
gst_embedding_dim = 256
self.gst_layer = GST(num_mel=80,
num_heads=4,
num_style_tokens=10,
num_heads=gst_num_heads,
num_style_tokens=gst_style_tokens,
embedding_dim=gst_embedding_dim)
# backward pass decoder
if self.bidirectional_decoder:
@ -75,13 +85,12 @@ class Tacotron(TacotronAbstract):
# setup DDC
if self.double_decoder_consistency:
self.coarse_decoder = Decoder(
decoder_in_features, decoder_output_dim, ddc_r, memory_size,
self.decoder_in_features, decoder_output_dim, ddc_r, memory_size,
attn_type, attn_win, attn_norm, prenet_type, prenet_dropout,
forward_attn, trans_agent, forward_attn_mask, location_attn,
attn_K, separate_stopnet, proj_speaker_dim)
attn_K, separate_stopnet)
def forward(self, characters, text_lengths, mel_specs, mel_lengths=None, speaker_ids=None):
def forward(self, characters, text_lengths, mel_specs, mel_lengths=None, speaker_ids=None, speaker_embeddings=None):
"""
Shapes:
- characters: B x T_in
@ -89,17 +98,9 @@ class Tacotron(TacotronAbstract):
- mel_specs: B x T_out x D
- speaker_ids: B x 1
"""
self._init_states()
input_mask, output_mask = self.compute_masks(text_lengths, mel_lengths)
# B x T_in x embed_dim
inputs = self.embedding(characters)
# B x speaker_embed_dim
if speaker_ids is not None:
self.compute_speaker_embedding(speaker_ids)
if self.num_speakers > 1:
# B x T_in x embed_dim + speaker_embed_dim
inputs = self._concat_speaker_embedding(inputs,
self.speaker_embeddings)
# B x T_in x encoder_in_features
encoder_outputs = self.encoder(inputs)
# sequence masking
@ -108,15 +109,20 @@ class Tacotron(TacotronAbstract):
if self.gst:
# B x gst_dim
encoder_outputs = self.compute_gst(encoder_outputs, mel_specs)
# speaker embedding
if self.num_speakers > 1:
encoder_outputs = self._concat_speaker_embedding(
encoder_outputs, self.speaker_embeddings)
if not self.embeddings_per_sample:
# B x 1 x speaker_embed_dim
speaker_embeddings = self.speaker_embedding(speaker_ids)[:, None]
else:
# B x 1 x speaker_embed_dim
speaker_embeddings = torch.unsqueeze(speaker_embeddings, 1)
encoder_outputs = self._concat_speaker_embedding(encoder_outputs, speaker_embeddings)
# decoder_outputs: B x decoder_in_features x T_out
# alignments: B x T_in x encoder_in_features
# stop_tokens: B x T_in
decoder_outputs, alignments, stop_tokens = self.decoder(
encoder_outputs, mel_specs, input_mask,
self.speaker_embeddings_projected)
encoder_outputs, mel_specs, input_mask)
# sequence masking
if output_mask is not None:
decoder_outputs = decoder_outputs * output_mask.unsqueeze(1).expand_as(decoder_outputs)
@ -138,22 +144,22 @@ class Tacotron(TacotronAbstract):
return decoder_outputs, postnet_outputs, alignments, stop_tokens
@torch.no_grad()
def inference(self, characters, speaker_ids=None, style_mel=None):
def inference(self, characters, speaker_ids=None, style_mel=None, speaker_embeddings=None):
inputs = self.embedding(characters)
self._init_states()
if speaker_ids is not None:
self.compute_speaker_embedding(speaker_ids)
if self.num_speakers > 1:
inputs = self._concat_speaker_embedding(inputs,
self.speaker_embeddings)
encoder_outputs = self.encoder(inputs)
if self.gst and style_mel is not None:
if self.gst:
# B x gst_dim
encoder_outputs = self.compute_gst(encoder_outputs, style_mel)
if self.num_speakers > 1:
encoder_outputs = self._concat_speaker_embedding(
encoder_outputs, self.speaker_embeddings)
if not self.embeddings_per_sample:
# B x 1 x speaker_embed_dim
speaker_embeddings = self.speaker_embedding(speaker_ids)[:, None]
else:
# B x 1 x speaker_embed_dim
speaker_embeddings = torch.unsqueeze(speaker_embeddings, 1)
encoder_outputs = self._concat_speaker_embedding(encoder_outputs, speaker_embeddings)
decoder_outputs, alignments, stop_tokens = self.decoder.inference(
encoder_outputs, self.speaker_embeddings_projected)
encoder_outputs)
postnet_outputs = self.postnet(decoder_outputs)
postnet_outputs = self.last_linear(postnet_outputs)
decoder_outputs = decoder_outputs.transpose(1, 2)

View File

@ -5,7 +5,6 @@ from mozilla_voice_tts.tts.layers.gst_layers import GST
from mozilla_voice_tts.tts.layers.tacotron2 import Decoder, Encoder, Postnet
from mozilla_voice_tts.tts.models.tacotron_abstract import TacotronAbstract
# TODO: match function arguments with tacotron
class Tacotron2(TacotronAbstract):
def __init__(self,
@ -28,7 +27,13 @@ class Tacotron2(TacotronAbstract):
bidirectional_decoder=False,
double_decoder_consistency=False,
ddc_r=None,
gst=False):
encoder_in_features=512,
decoder_in_features=512,
speaker_embedding_dim=None,
gst=False,
gst_embedding_dim=512,
gst_num_heads=4,
gst_style_tokens=10):
super(Tacotron2,
self).__init__(num_chars, num_speakers, r, postnet_output_dim,
decoder_output_dim, attn_type, attn_win,
@ -36,38 +41,48 @@ class Tacotron2(TacotronAbstract):
forward_attn, trans_agent, forward_attn_mask,
location_attn, attn_K, separate_stopnet,
bidirectional_decoder, double_decoder_consistency,
ddc_r, gst)
decoder_in_features = 512 if num_speakers > 1 else 512
encoder_in_features = 512 if num_speakers > 1 else 512
proj_speaker_dim = 80 if num_speakers > 1 else 0
# base layers
ddc_r, encoder_in_features, decoder_in_features,
speaker_embedding_dim, gst, gst_embedding_dim,
gst_num_heads, gst_style_tokens)
# speaker embedding layer
if self.num_speakers > 1:
if not self.embeddings_per_sample:
speaker_embedding_dim = 512
self.speaker_embedding = nn.Embedding(self.num_speakers, speaker_embedding_dim)
self.speaker_embedding.weight.data.normal_(0, 0.3)
# speaker and gst embeddings are concatenated with the decoder input
if self.num_speakers > 1:
self.decoder_in_features += speaker_embedding_dim # add speaker embedding dim
# embedding layer
self.embedding = nn.Embedding(num_chars, 512, padding_idx=0)
if num_speakers > 1:
self.speaker_embedding = nn.Embedding(num_speakers, 512)
self.speaker_embedding.weight.data.normal_(0, 0.3)
self.encoder = Encoder(encoder_in_features)
self.decoder = Decoder(decoder_in_features, self.decoder_output_dim, r, attn_type, attn_win,
# base model layers
self.encoder = Encoder(self.encoder_in_features)
self.decoder = Decoder(self.decoder_in_features, self.decoder_output_dim, r, attn_type, attn_win,
attn_norm, prenet_type, prenet_dropout,
forward_attn, trans_agent, forward_attn_mask,
location_attn, attn_K, separate_stopnet, proj_speaker_dim)
location_attn, attn_K, separate_stopnet)
self.postnet = Postnet(self.postnet_output_dim)
# global style token layers
if self.gst:
gst_embedding_dim = encoder_in_features
self.gst_layer = GST(num_mel=80,
num_heads=4,
num_style_tokens=10,
embedding_dim=gst_embedding_dim)
num_heads=self.gst_num_heads,
num_style_tokens=self.gst_style_tokens,
embedding_dim=self.gst_embedding_dim)
# backward pass decoder
if self.bidirectional_decoder:
self._init_backward_decoder()
# setup DDC
if self.double_decoder_consistency:
self.coarse_decoder = Decoder(
decoder_in_features, self.decoder_output_dim, ddc_r, attn_type,
self.decoder_in_features, self.decoder_output_dim, ddc_r, attn_type,
attn_win, attn_norm, prenet_type, prenet_dropout, forward_attn,
trans_agent, forward_attn_mask, location_attn, attn_K,
separate_stopnet, proj_speaker_dim)
separate_stopnet)
@staticmethod
def shape_outputs(mel_outputs, mel_outputs_postnet, alignments):
@ -75,8 +90,7 @@ class Tacotron2(TacotronAbstract):
mel_outputs_postnet = mel_outputs_postnet.transpose(1, 2)
return mel_outputs, mel_outputs_postnet, alignments
def forward(self, text, text_lengths, mel_specs=None, mel_lengths=None, speaker_ids=None):
self._init_states()
def forward(self, text, text_lengths, mel_specs=None, mel_lengths=None, speaker_ids=None, speaker_embeddings=None):
# compute mask for padding
# B x T_in_max (boolean)
input_mask, output_mask = self.compute_masks(text_lengths, mel_lengths)
@ -84,20 +98,22 @@ class Tacotron2(TacotronAbstract):
embedded_inputs = self.embedding(text).transpose(1, 2)
# B x T_in_max x D_en
encoder_outputs = self.encoder(embedded_inputs, text_lengths)
# adding speaker embedding to encoder output
# TODO: multi-speaker
# B x speaker_embed_dim
if speaker_ids is not None:
self.compute_speaker_embedding(speaker_ids)
if self.num_speakers > 1:
# B x T_in x embed_dim + speaker_embed_dim
encoder_outputs = self._add_speaker_embedding(encoder_outputs,
self.speaker_embeddings)
encoder_outputs = encoder_outputs * input_mask.unsqueeze(2).expand_as(encoder_outputs)
# global style token
if self.gst:
# B x gst_dim
encoder_outputs = self.compute_gst(encoder_outputs, mel_specs)
if self.num_speakers > 1:
if not self.embeddings_per_sample:
# B x 1 x speaker_embed_dim
speaker_embeddings = self.speaker_embedding(speaker_ids)[:, None]
else:
# B x 1 x speaker_embed_dim
speaker_embeddings = torch.unsqueeze(speaker_embeddings, 1)
encoder_outputs = self._concat_speaker_embedding(encoder_outputs, speaker_embeddings)
encoder_outputs = encoder_outputs * input_mask.unsqueeze(2).expand_as(encoder_outputs)
# B x mel_dim x T_out -- B x T_out//r x T_in -- B x T_out//r
decoder_outputs, alignments, stop_tokens = self.decoder(
encoder_outputs, mel_specs, input_mask)
@ -122,14 +138,19 @@ class Tacotron2(TacotronAbstract):
return decoder_outputs, postnet_outputs, alignments, stop_tokens
@torch.no_grad()
def inference(self, text, speaker_ids=None):
def inference(self, text, speaker_ids=None, style_mel=None, speaker_embeddings=None):
embedded_inputs = self.embedding(text).transpose(1, 2)
encoder_outputs = self.encoder.inference(embedded_inputs)
if speaker_ids is not None:
self.compute_speaker_embedding(speaker_ids)
if self.gst:
# B x gst_dim
encoder_outputs = self.compute_gst(encoder_outputs, style_mel)
if self.num_speakers > 1:
encoder_outputs = self._add_speaker_embedding(encoder_outputs,
self.speaker_embeddings)
if not self.embeddings_per_sample:
speaker_embeddings = self.speaker_embedding(speaker_ids)[:, None]
encoder_outputs = self._concat_speaker_embedding(encoder_outputs, speaker_embeddings)
decoder_outputs, alignments, stop_tokens = self.decoder.inference(
encoder_outputs)
postnet_outputs = self.postnet(decoder_outputs)
@ -138,14 +159,22 @@ class Tacotron2(TacotronAbstract):
decoder_outputs, postnet_outputs, alignments)
return decoder_outputs, postnet_outputs, alignments, stop_tokens
def inference_truncated(self, text, speaker_ids=None):
def inference_truncated(self, text, speaker_ids=None, style_mel=None, speaker_embeddings=None):
"""
Preserve model states for continuous inference
"""
embedded_inputs = self.embedding(text).transpose(1, 2)
encoder_outputs = self.encoder.inference_truncated(embedded_inputs)
encoder_outputs = self._add_speaker_embedding(encoder_outputs,
speaker_ids)
if self.gst:
# B x gst_dim
encoder_outputs = self.compute_gst(encoder_outputs, style_mel)
if self.num_speakers > 1:
if not self.embeddings_per_sample:
speaker_embeddings = self.speaker_embedding(speaker_ids)[:, None]
encoder_outputs = self._concat_speaker_embedding(encoder_outputs, speaker_embeddings)
mel_outputs, alignments, stop_tokens = self.decoder.inference_truncated(
encoder_outputs)
mel_outputs_postnet = self.postnet(mel_outputs)
@ -153,17 +182,3 @@ class Tacotron2(TacotronAbstract):
mel_outputs, mel_outputs_postnet, alignments = self.shape_outputs(
mel_outputs, mel_outputs_postnet, alignments)
return mel_outputs, mel_outputs_postnet, alignments, stop_tokens
def _speaker_embedding_pass(self, encoder_outputs, speaker_ids):
# TODO: multi-speaker
# if hasattr(self, "speaker_embedding") and speaker_ids is None:
# raise RuntimeError(" [!] Model has speaker embedding layer but speaker_id is not provided")
# if hasattr(self, "speaker_embedding") and speaker_ids is not None:
# speaker_embeddings = speaker_embeddings.expand(encoder_outputs.size(0),
# encoder_outputs.size(1),
# -1)
# encoder_outputs = encoder_outputs + speaker_embeddings
# return encoder_outputs
pass

View File

@ -28,7 +28,13 @@ class TacotronAbstract(ABC, nn.Module):
bidirectional_decoder=False,
double_decoder_consistency=False,
ddc_r=None,
gst=False):
encoder_in_features=512,
decoder_in_features=512,
speaker_embedding_dim=None,
gst=False,
gst_embedding_dim=512,
gst_num_heads=4,
gst_style_tokens=10):
""" Abstract Tacotron class """
super().__init__()
self.num_chars = num_chars
@ -36,6 +42,9 @@ class TacotronAbstract(ABC, nn.Module):
self.decoder_output_dim = decoder_output_dim
self.postnet_output_dim = postnet_output_dim
self.gst = gst
self.gst_embedding_dim = gst_embedding_dim
self.gst_num_heads = gst_num_heads
self.gst_style_tokens = gst_style_tokens
self.num_speakers = num_speakers
self.bidirectional_decoder = bidirectional_decoder
self.double_decoder_consistency = double_decoder_consistency
@ -51,6 +60,9 @@ class TacotronAbstract(ABC, nn.Module):
self.location_attn = location_attn
self.attn_K = attn_K
self.separate_stopnet = separate_stopnet
self.encoder_in_features = encoder_in_features
self.decoder_in_features = decoder_in_features
self.speaker_embedding_dim = speaker_embedding_dim
# layers
self.embedding = None
@ -58,8 +70,17 @@ class TacotronAbstract(ABC, nn.Module):
self.decoder = None
self.postnet = None
# multispeaker
if self.speaker_embedding_dim is None:
# if speaker_embedding_dim is None, use nn.Embedding with the default speaker_embedding_dim
self.embeddings_per_sample = False
else:
# if speaker_embedding_dim is not None, use an external speaker embedding per sample
self.embeddings_per_sample = True
# global style token
if self.gst:
self.decoder_in_features += gst_embedding_dim # add gst embedding dim
self.gst_layer = None
# model states
@ -158,11 +179,22 @@ class TacotronAbstract(ABC, nn.Module):
self.speaker_embeddings_projected = self.speaker_project_mel(
self.speaker_embeddings).squeeze(1)
def compute_gst(self, inputs, mel_specs):
def compute_gst(self, inputs, style_input):
""" Compute global style token """
# pylint: disable=not-callable
gst_outputs = self.gst_layer(mel_specs)
inputs = self._add_speaker_embedding(inputs, gst_outputs)
device = inputs.device
if isinstance(style_input, dict):
query = torch.zeros(1, 1, self.gst_embedding_dim//2).to(device)
_GST = torch.tanh(self.gst_layer.style_token_layer.style_tokens)
gst_outputs = torch.zeros(1, 1, self.gst_embedding_dim).to(device)
for k_token, v_amplifier in style_input.items():
key = _GST[int(k_token)].unsqueeze(0).expand(1, -1, -1)
gst_outputs_att = self.gst_layer.style_token_layer.attention(query, key)
gst_outputs = gst_outputs + gst_outputs_att * v_amplifier
elif style_input is None:
gst_outputs = torch.zeros(1, 1, self.gst_embedding_dim).to(device)
else:
gst_outputs = self.gst_layer(style_input) # pylint: disable=not-callable
inputs = self._concat_speaker_embedding(inputs, gst_outputs)
return inputs
@staticmethod

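To make the new dictionary branch of compute_gst concrete, here is a hedged standalone sketch that drives a GST layer directly with token weights, mirroring the loop above; the constructor arguments follow the sample config (512-d embedding, 4 heads, 10 tokens) and the layer internals (style_token_layer, attention) are taken from the hunks in this commit.

import torch
from mozilla_voice_tts.tts.layers.gst_layers import GST

gst_layer = GST(num_mel=80, num_heads=4, num_style_tokens=10, embedding_dim=512)
style_input = {"0": 0.15, "5": -0.15}  # token index -> amplification, as in gst_style_input

query = torch.zeros(1, 1, 512 // 2)                            # dummy query
tokens = torch.tanh(gst_layer.style_token_layer.style_tokens)  # raw style tokens
gst_outputs = torch.zeros(1, 1, 512)
for k_token, v_amplifier in style_input.items():
    key = tokens[int(k_token)].unsqueeze(0).expand(1, -1, -1)
    gst_outputs = gst_outputs + gst_layer.style_token_layer.attention(query, key) * v_amplifier
print(gst_outputs.shape)  # torch.Size([1, 1, 512]), concatenated to the encoder outputs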
View File

@ -44,7 +44,7 @@ def sequence_mask(sequence_length, max_len=None):
return seq_range_expand < seq_length_expand
def setup_model(num_chars, num_speakers, c):
def setup_model(num_chars, num_speakers, c, speaker_embedding_dim=None):
print(" > Using model: {}".format(c.model))
MyModel = importlib.import_module('mozilla_voice_tts.tts.models.' + c.model.lower())
MyModel = getattr(MyModel, c.model)
@ -55,6 +55,9 @@ def setup_model(num_chars, num_speakers, c):
postnet_output_dim=int(c.audio['fft_size'] / 2 + 1),
decoder_output_dim=c.audio['num_mels'],
gst=c.use_gst,
gst_embedding_dim=c.gst['gst_embedding_dim'],
gst_num_heads=c.gst['gst_num_heads'],
gst_style_tokens=c.gst['gst_style_tokens'],
memory_size=c.memory_size,
attn_type=c.attention_type,
attn_win=c.windowing,
@ -69,7 +72,8 @@ def setup_model(num_chars, num_speakers, c):
separate_stopnet=c.separate_stopnet,
bidirectional_decoder=c.bidirectional_decoder,
double_decoder_consistency=c.double_decoder_consistency,
ddc_r=c.ddc_r)
ddc_r=c.ddc_r,
speaker_embedding_dim=speaker_embedding_dim)
elif c.model.lower() == "tacotron2":
model = MyModel(num_chars=num_chars,
num_speakers=num_speakers,
@ -77,6 +81,9 @@ def setup_model(num_chars, num_speakers, c):
postnet_output_dim=c.audio['num_mels'],
decoder_output_dim=c.audio['num_mels'],
gst=c.use_gst,
gst_embedding_dim=c.gst['gst_embedding_dim'],
gst_num_heads=c.gst['gst_num_heads'],
gst_style_tokens=c.gst['gst_style_tokens'],
attn_type=c.attention_type,
attn_win=c.windowing,
attn_norm=c.attention_norm,
@ -90,9 +97,11 @@ def setup_model(num_chars, num_speakers, c):
separate_stopnet=c.separate_stopnet,
bidirectional_decoder=c.bidirectional_decoder,
double_decoder_consistency=c.double_decoder_consistency,
ddc_r=c.ddc_r)
ddc_r=c.ddc_r,
speaker_embedding_dim=speaker_embedding_dim)
return model
class KeepAverage():
def __init__(self):
self.avg_values = {}
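A minimal call sketch for the extended `setup_model` signature; the paths, the `num_chars` value, and the import paths below are assumptions for illustration.

```python
import json

from mozilla_voice_tts.tts.utils.generic_utils import setup_model
from mozilla_voice_tts.utils.io import load_config

c = load_config("config.json")   # hypothetical config path
num_chars = 130                  # normally len(phonemes) or len(symbols)

num_speakers = 0
speaker_embedding_dim = None     # None -> the model falls back to nn.Embedding
if c.use_external_speaker_embedding_file:
    speaker_mapping = json.load(open("speakers.json"))
    num_speakers = len(speaker_mapping)
    first_entry = next(iter(speaker_mapping.values()))
    speaker_embedding_dim = len(first_entry["embedding"])

model = setup_model(num_chars, num_speakers, c, speaker_embedding_dim)
```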
@ -168,7 +177,7 @@ def check_config(c):
check_argument('clip_norm', c['audio'], restricted=True, val_type=bool)
check_argument('mel_fmin', c['audio'], restricted=True, val_type=float, min_val=0.0, max_val=1000)
check_argument('mel_fmax', c['audio'], restricted=True, val_type=float, min_val=500.0)
check_argument('spec_gain', c['audio'], restricted=True, val_type=float, min_val=1, max_val=100)
check_argument('spec_gain', c['audio'], restricted=True, val_type=[int, float], min_val=1, max_val=100)
check_argument('do_trim_silence', c['audio'], restricted=True, val_type=bool)
check_argument('trim_db', c['audio'], restricted=True, val_type=int)
@ -239,15 +248,21 @@ def check_config(c):
# paths
check_argument('output_path', c, restricted=True, val_type=str)
# multi-speaker gst
# multi-speaker and gst
check_argument('use_speaker_embedding', c, restricted=True, val_type=bool)
check_argument('style_wav_for_test', c, restricted=True, val_type=str)
check_argument('use_external_speaker_embedding_file', c, restricted=True, val_type=bool)
check_argument('external_speaker_embedding_file', c, restricted=True, val_type=str)
check_argument('use_gst', c, restricted=True, val_type=bool)
check_argument('gst', c, restricted=True, val_type=dict)
check_argument('gst_style_input', c['gst'], restricted=True, val_type=[str, dict])
check_argument('gst_embedding_dim', c['gst'], restricted=True, val_type=int, min_val=0, max_val=1000)
check_argument('gst_num_heads', c['gst'], restricted=True, val_type=int, min_val=2, max_val=10)
check_argument('gst_style_tokens', c['gst'], restricted=True, val_type=int, min_val=1, max_val=1000)
# datasets - checking only the first entry
check_argument('datasets', c, restricted=True, val_type=list)
for dataset_entry in c['datasets']:
check_argument('name', dataset_entry, restricted=True, val_type=str)
check_argument('path', dataset_entry, restricted=True, val_type=str)
check_argument('meta_file_train', dataset_entry, restricted=True, val_type=str)
check_argument('meta_file_train', dataset_entry, restricted=True, val_type=[str, list])
check_argument('meta_file_val', dataset_entry, restricted=True, val_type=str)

View File

@ -10,12 +10,15 @@ def make_speakers_json_path(out_path):
def load_speaker_mapping(out_path):
"""Loads speaker mapping if already present."""
try:
with open(make_speakers_json_path(out_path)) as f:
if os.path.splitext(out_path)[1] == '.json':
json_file = out_path
else:
json_file = make_speakers_json_path(out_path)
with open(json_file) as f:
return json.load(f)
except FileNotFoundError:
return {}
def save_speaker_mapping(out_path, speaker_mapping):
"""Saves speaker mapping if not yet present."""
speakers_json_path = make_speakers_json_path(out_path)
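With this change `load_speaker_mapping` accepts either an output directory (it then looks for `speakers.json` inside) or a direct path to a .json file; the paths below are made up.

```python
# both forms resolve to the same mapping; paths are hypothetical
mapping = load_speaker_mapping("checkpoints/my-run/")               # directory -> <dir>/speakers.json
mapping = load_speaker_mapping("checkpoints/my-run/speakers.json")  # explicit .json path
```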

View File

@ -37,23 +37,25 @@ def numpy_to_tf(np_array, dtype):
return tensor
def compute_style_mel(style_wav, ap):
style_mel = ap.melspectrogram(
ap.load_wav(style_wav)).expand_dims(0)
def compute_style_mel(style_wav, ap, cuda=False):
style_mel = torch.FloatTensor(ap.melspectrogram(
ap.load_wav(style_wav, sr=ap.sample_rate))).unsqueeze(0)
if cuda:
return style_mel.cuda()
return style_mel
def run_model_torch(model, inputs, CONFIG, truncated, speaker_id=None, style_mel=None):
def run_model_torch(model, inputs, CONFIG, truncated, speaker_id=None, style_mel=None, speaker_embeddings=None):
if CONFIG.use_gst:
decoder_output, postnet_output, alignments, stop_tokens = model.inference(
inputs, style_mel=style_mel, speaker_ids=speaker_id)
inputs, style_mel=style_mel, speaker_ids=speaker_id, speaker_embeddings=speaker_embeddings)
else:
if truncated:
decoder_output, postnet_output, alignments, stop_tokens = model.inference_truncated(
inputs, speaker_ids=speaker_id)
inputs, speaker_ids=speaker_id, speaker_embeddings=speaker_embeddings)
else:
decoder_output, postnet_output, alignments, stop_tokens = model.inference(
inputs, speaker_ids=speaker_id)
inputs, speaker_ids=speaker_id, speaker_embeddings=speaker_embeddings)
return decoder_output, postnet_output, alignments, stop_tokens
@ -129,13 +131,24 @@ def inv_spectrogram(postnet_output, ap, CONFIG):
return wav
def id_to_torch(speaker_id):
def id_to_torch(speaker_id, cuda=False):
if speaker_id is not None:
speaker_id = np.asarray(speaker_id)
speaker_id = torch.from_numpy(speaker_id).unsqueeze(0)
if cuda:
return speaker_id.cuda()
return speaker_id
def embedding_to_torch(speaker_embedding, cuda=False):
if speaker_embedding is not None:
speaker_embedding = np.asarray(speaker_embedding)
speaker_embedding = torch.from_numpy(speaker_embedding).unsqueeze(0).type(torch.FloatTensor)
if cuda:
return speaker_embedding.cuda()
return speaker_embedding
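A short hedged example of the two helpers above; the `speakers.json` key is hypothetical and `speaker_mapping` is assumed to be the loaded mapping.

```python
embedding = speaker_mapping["p244_023.wav"]["embedding"]       # hypothetical key
speaker_embedding = embedding_to_torch(embedding, cuda=False)  # -> FloatTensor of shape (1, dim)
speaker_id = id_to_torch(3, cuda=False)                        # -> LongTensor of shape (1,)
```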
# TODO: perform GL with pytorch for batching
def apply_griffin_lim(inputs, input_lens, CONFIG, ap):
'''Apply griffin-lim to each sample iterating through the first dimension.
@ -165,6 +178,7 @@ def synthesis(model,
enable_eos_bos_chars=False, #pylint: disable=unused-argument
use_griffin_lim=False,
do_trim_silence=False,
speaker_embedding=None,
backend='torch'):
"""Synthesize voice for the given text.
@ -185,14 +199,23 @@ def synthesis(model,
"""
# GST processing
style_mel = None
if CONFIG.model == "TacotronGST" and style_wav is not None:
style_mel = compute_style_mel(style_wav, ap)
if CONFIG.use_gst and style_wav is not None:
if isinstance(style_wav, dict):
style_mel = style_wav
else:
style_mel = compute_style_mel(style_wav, ap, cuda=use_cuda)
# preprocess the given text
inputs = text_to_seqvec(text, CONFIG)
# pass tensors to backend
if backend == 'torch':
speaker_id = id_to_torch(speaker_id)
style_mel = numpy_to_torch(style_mel, torch.float, cuda=use_cuda)
if speaker_id is not None:
speaker_id = id_to_torch(speaker_id, cuda=use_cuda)
if speaker_embedding is not None:
speaker_embedding = embedding_to_torch(speaker_embedding, cuda=use_cuda)
if not isinstance(style_mel, dict):
style_mel = numpy_to_torch(style_mel, torch.float, cuda=use_cuda)
inputs = numpy_to_torch(inputs, torch.long, cuda=use_cuda)
inputs = inputs.unsqueeze(0)
elif backend == 'tf':
@ -207,7 +230,7 @@ def synthesis(model,
# synthesize voice
if backend == 'torch':
decoder_output, postnet_output, alignments, stop_tokens = run_model_torch(
model, inputs, CONFIG, truncated, speaker_id, style_mel)
model, inputs, CONFIG, truncated, speaker_id, style_mel, speaker_embeddings=speaker_embedding)
postnet_output, decoder_output, alignment, stop_tokens = parse_outputs_torch(
postnet_output, decoder_output, alignments, stop_tokens)
elif backend == 'tf':
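Putting the pieces together, a hedged call sketch for the extended `synthesis` signature; `model`, `C`, `ap` and `speaker_embedding` are assumed to be prepared as in the notebooks added by this PR.

```python
gst_style = {"0": 0.3, "1": -0.1}   # manual token weights; a reference wav path also works
outputs = synthesis(model,
                    "Hello there.",
                    C,
                    use_cuda=False,
                    ap=ap,
                    speaker_id=None,
                    style_wav=gst_style,
                    truncated=False,
                    enable_eos_bos_chars=C.enable_eos_bos_chars,
                    use_griffin_lim=True,
                    speaker_embedding=speaker_embedding)
waveform = outputs[0]               # the waveform is the first returned value
```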

View File

@ -67,15 +67,16 @@ def remove_aux_symbols(text):
text = re.sub(r'[\<\>\(\)\[\]\"]+', '', text)
return text
def replace_symbols(text):
def replace_symbols(text, lang='en'):
text = text.replace(';', ',')
text = text.replace('-', ' ')
text = text.replace(':', ',')
text = text.replace('&', 'and')
text = text.replace(':', ' ')
if lang == 'en':
text = text.replace('&', 'and')
elif lang == 'pt':
text = text.replace('&', ' e ')
return text
def basic_cleaners(text):
'''Basic pipeline that lowercases and collapses whitespace without transliteration.'''
text = lowercase(text)
@ -91,6 +92,13 @@ def transliteration_cleaners(text):
return text
def basic_german_cleaners(text):
'''Pipeline for German text'''
text = lowercase(text)
text = collapse_whitespace(text)
return text
# TODO: elaborate it
def basic_turkish_cleaners(text):
'''Pipeline for Turkish text'''
@ -99,7 +107,6 @@ def basic_turkish_cleaners(text):
text = collapse_whitespace(text)
return text
def english_cleaners(text):
'''Pipeline for English text, including number and abbreviation expansion.'''
text = convert_to_ascii(text)
@ -111,6 +118,14 @@ def english_cleaners(text):
text = collapse_whitespace(text)
return text
def portuguese_cleaners(text):
'''Basic pipeline for Portuguese text. There is no need to expand abbreviations and
numbers; the phonemizer already does that.'''
text = lowercase(text)
text = replace_symbols(text, lang='pt')
text = remove_aux_symbols(text)
text = collapse_whitespace(text)
return text
def phoneme_cleaners(text):
'''Pipeline for phonemes mode, including number and abbreviation expansion.'''
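A quick illustration of the language-aware symbol replacement and the new Portuguese pipeline; the outputs in the comments are what the code above should produce (the doubled spaces left by replace_symbols are removed later by collapse_whitespace).

```python
replace_symbols("cão & gato", lang='pt')   # -> 'cão  e  gato'
replace_symbols("cat & dog", lang='en')    # -> 'cat and dog'
portuguese_cleaners("Olá & bem-vindo")     # -> 'olá e bem vindo'
```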

View File

@ -146,5 +146,11 @@ def check_argument(name, c, enum_list=None, max_val=None, min_val=None, restrict
assert c[name] >= min_val, f' [!] {name} is smaller than min value {min_val}'
if enum_list:
assert c[name].lower() in enum_list, f' [!] {name} is not a valid value'
if val_type:
if isinstance(val_type, list):
is_valid = False
for typ in val_type:
if isinstance(c[name], typ):
is_valid = True
assert is_valid or c[name] is None, f' [!] {name} has wrong type - {type(c[name])} vs {val_type}'
elif val_type:
assert isinstance(c[name], val_type) or c[name] is None, f' [!] {name} has wrong type - {type(c[name])} vs {val_type}'
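For completeness, a couple of illustrative calls against the list-aware type check; the config dict here is made up.

```python
c = {'spec_gain': 20, 'meta_file_train': ['metadata_a.csv', 'metadata_b.csv']}
check_argument('spec_gain', c, restricted=True, val_type=[int, float], min_val=1, max_val=100)
check_argument('meta_file_train', c, restricted=True, val_type=[str, list])
```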

View File

@ -0,0 +1,163 @@
{
"cells": [
{
"cell_type": "markdown",
"metadata": {},
"source": [
"This is a noteboook used to generate the speaker embeddings with the AngleProto speaker encoder model for multi-speaker training.\n",
"\n",
"Before running this script please DON'T FORGET: \n",
"- to set file paths.\n",
"- to download related model files from TTS.\n",
"- download or clone related repos, linked below.\n",
"- setup the repositories. ```python setup.py install```\n",
"- to checkout right commit versions (given next to the model) of TTS.\n",
"- to set the right paths in the cell below.\n",
"\n",
"Repository:\n",
"- TTS: https://github.com/mozilla/TTS"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"%load_ext autoreload\n",
"%autoreload 2\n",
"import os\n",
"import importlib\n",
"import random\n",
"import librosa\n",
"import torch\n",
"\n",
"import numpy as np\n",
"from tqdm import tqdm\n",
"from TTS.tts.utils.speakers import save_speaker_mapping, load_speaker_mapping\n",
"\n",
"# you may need to change this depending on your system\n",
"os.environ['CUDA_VISIBLE_DEVICES']='0'\n",
"\n",
"\n",
"from TTS.tts.utils.speakers import save_speaker_mapping, load_speaker_mapping\n",
"from TTS.utils.audio import AudioProcessor\n",
"from TTS.utils.io import load_config"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"You should also adjust all the path constants to point at the relevant locations for you locally"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"MODEL_RUN_PATH = \"../../Mozilla-TTS/checkpoints/libritts_100+360-angleproto-June-06-2020_04+12PM-9c04d1f/\"\n",
"MODEL_PATH = MODEL_RUN_PATH + \"best_model.pth.tar\"\n",
"CONFIG_PATH = MODEL_RUN_PATH + \"config.json\"\n",
"\n",
"\n",
"DATASETS_NAME = ['vctk'] # list the datasets\n",
"DATASETS_PATH = ['../../../datasets/VCTK/']\n",
"DATASETS_METAFILE = ['']\n",
"\n",
"USE_CUDA = True"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"#Preprocess dataset\n",
"meta_data = []\n",
"for i in range(len(DATASETS_NAME)):\n",
" preprocessor = importlib.import_module('TTS.tts.datasets.preprocess')\n",
" preprocessor = getattr(preprocessor, DATASETS_NAME[i].lower())\n",
" meta_data += preprocessor(DATASETS_PATH[i],DATASETS_METAFILE[i])\n",
" \n",
"meta_data= list(meta_data)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"c = load_config(CONFIG_PATH)\n",
"ap = AudioProcessor(**c['audio'])\n",
"\n",
"model = SpeakerEncoder(**c.model)\n",
"model.load_state_dict(torch.load(MODEL_PATH)['model'])\n",
"model.eval()\n",
"if USE_CUDA:\n",
" model.cuda()\n",
"\n",
"embeddings_dict = {}\n",
"len_meta_data= len(meta_data)\n",
"\n",
"for i in tqdm(range(len_meta_data)):\n",
" _, wav_file, speaker_id = meta_data[i]\n",
" wav_file_name = os.path.basename(wav_file)\n",
" mel_spec = ap.melspectrogram(ap.load_wav(wav_file)).T\n",
" mel_spec = torch.FloatTensor(mel_spec[None, :, :])\n",
" if USE_CUDA:\n",
" mel_spec = mel_spec.cuda()\n",
" embedd = model.compute_embedding(mel_spec).cpu().detach().numpy().reshape(-1)\n",
" embeddings_dict[wav_file_name] = [embedd,speaker_id]\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# create and export speakers.json\n",
"speaker_mapping = {sample: {'name': embeddings_dict[sample][1], 'embedding':embeddings_dict[sample][0].reshape(-1).tolist()} for i, sample in enumerate(embeddings_dict.keys())}\n",
"save_speaker_mapping(MODEL_RUN_PATH, speaker_mapping)\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"#test load integrity\n",
"speaker_mapping_load = load_speaker_mapping(MODEL_RUN_PATH)\n",
"assert speaker_mapping == speaker_mapping_load\n",
"print(\"The file speakers.json has been exported to \",MODEL_RUN_PATH, ' with ', len(embeddings_dict.keys()), ' speakers')"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.7.6"
}
},
"nbformat": 4,
"nbformat_minor": 4
}
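The notebook above ends by writing speakers.json; a hypothetical excerpt of its structure is shown below (file names, speaker names and values are invented, and real embeddings contain a few hundred floats).

```python
speakers_json_excerpt = {
    "p244_023.wav": {"name": "p244", "embedding": [0.0132, -0.0418]},  # embedding truncated
    "p244_024.wav": {"name": "p244", "embedding": [0.0129, -0.0433]},
}
```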

View File

@ -0,0 +1,637 @@
{
"nbformat": 4,
"nbformat_minor": 0,
"metadata": {
"colab": {
"name": "Demo-Mozilla-TTS-MultiSpeaker-jia-et-al-2018.ipynb",
"provenance": [],
"collapsed_sections": [
"vnV-FigfvsS2",
"hkvv7gRcx4WV",
"QJ6VgT2a4vHW"
]
},
"kernelspec": {
"name": "python3",
"display_name": "Python 3"
},
"accelerator": "GPU"
},
"cells": [
{
"cell_type": "markdown",
"metadata": {
"id": "yZK6UdwSFnOO",
"colab_type": "text"
},
"source": [
"# **Download and install Mozilla TTS**"
]
},
{
"cell_type": "code",
"metadata": {
"id": "yvb0pX3WY6MN",
"colab_type": "code",
"colab": {}
},
"source": [
"import os \n",
"!git clone https://github.com/Edresson/TTS -b dev-gst-embeddings"
],
"execution_count": null,
"outputs": []
},
{
"cell_type": "code",
"metadata": {
"id": "iB9nl2UEG3SY",
"colab_type": "code",
"colab": {}
},
"source": [
"!apt-get install espeak\n",
"os.chdir('TTS')\n",
"!pip install -r requirements.txt\n",
"!python setup.py develop\n",
"os.chdir('..')"
],
"execution_count": null,
"outputs": []
},
{
"cell_type": "markdown",
"metadata": {
"id": "w6Krn8k1inC_",
"colab_type": "text"
},
"source": [
"\n",
"\n",
"**Download Checkpoint**\n",
"\n",
"\n"
]
},
{
"cell_type": "code",
"metadata": {
"id": "PiYHf3lKhi9z",
"colab_type": "code",
"colab": {}
},
"source": [
"!wget -c -q --show-progress -O ./TTS-checkpoint.zip https://github.com/Edresson/TTS/releases/download/v1.0.0/Checkpoints-TTS-MultiSpeaker-Jia-et-al-2018.zip\n",
"!unzip ./TTS-checkpoint.zip\n"
],
"execution_count": null,
"outputs": []
},
{
"cell_type": "markdown",
"metadata": {
"id": "MpYNgqrZcJKn",
"colab_type": "text"
},
"source": [
"**Utils Functions**"
]
},
{
"cell_type": "code",
"metadata": {
"id": "4KZA4b_CbMqx",
"colab_type": "code",
"colab": {}
},
"source": [
"%load_ext autoreload\n",
"%autoreload 2\n",
"import argparse\n",
"import json\n",
"# pylint: disable=redefined-outer-name, unused-argument\n",
"import os\n",
"import string\n",
"import time\n",
"import sys\n",
"import numpy as np\n",
"\n",
"TTS_PATH = \"../content/TTS\"\n",
"# add libraries into environment\n",
"sys.path.append(TTS_PATH) # set this if TTS is not installed globally\n",
"\n",
"import torch\n",
"\n",
"from TTS.tts.utils.generic_utils import setup_model\n",
"from TTS.tts.utils.synthesis import synthesis\n",
"from TTS.tts.utils.text.symbols import make_symbols, phonemes, symbols\n",
"from TTS.utils.audio import AudioProcessor\n",
"from TTS.utils.io import load_config\n",
"from TTS.vocoder.utils.generic_utils import setup_generator\n",
"\n",
"\n",
"def tts(model, vocoder_model, text, CONFIG, use_cuda, ap, use_gl, speaker_fileid, speaker_embedding=None):\n",
" t_1 = time.time()\n",
" waveform, _, _, mel_postnet_spec, _, _ = synthesis(model, text, CONFIG, use_cuda, ap, speaker_fileid, None, False, CONFIG.enable_eos_bos_chars, use_gl, speaker_embedding=speaker_embedding)\n",
" if CONFIG.model == \"Tacotron\" and not use_gl:\n",
" mel_postnet_spec = ap.out_linear_to_mel(mel_postnet_spec.T).T\n",
" if not use_gl:\n",
" waveform = vocoder_model.inference(torch.FloatTensor(mel_postnet_spec.T).unsqueeze(0))\n",
" if use_cuda and not use_gl:\n",
" waveform = waveform.cpu()\n",
" if not use_gl:\n",
" waveform = waveform.numpy()\n",
" waveform = waveform.squeeze()\n",
" rtf = (time.time() - t_1) / (len(waveform) / ap.sample_rate)\n",
" tps = (time.time() - t_1) / len(waveform)\n",
" print(\" > Run-time: {}\".format(time.time() - t_1))\n",
" print(\" > Real-time factor: {}\".format(rtf))\n",
" print(\" > Time per step: {}\".format(tps))\n",
" return waveform\n",
"\n"
],
"execution_count": null,
"outputs": []
},
{
"cell_type": "markdown",
"metadata": {
"id": "ENA2OumIVeMA",
"colab_type": "text"
},
"source": [
"# **Vars definitions**\n"
]
},
{
"cell_type": "code",
"metadata": {
"id": "jPD0d_XpVXmY",
"colab_type": "code",
"colab": {}
},
"source": [
"TEXT = ''\n",
"OUT_PATH = 'tests-audios/'\n",
"# create output path\n",
"os.makedirs(OUT_PATH, exist_ok=True)\n",
"\n",
"SPEAKER_FILEID = None # if None use the first embedding from speakers.json\n",
"\n",
"# model vars \n",
"MODEL_PATH = 'best_model.pth.tar'\n",
"CONFIG_PATH = 'config.json'\n",
"SPEAKER_JSON = 'speakers.json'\n",
"\n",
"# vocoder vars\n",
"VOCODER_PATH = ''\n",
"VOCODER_CONFIG_PATH = ''\n",
"\n",
"USE_CUDA = True"
],
"execution_count": null,
"outputs": []
},
{
"cell_type": "markdown",
"metadata": {
"id": "dV6cXXlfi72r",
"colab_type": "text"
},
"source": [
"# **Restore TTS Model**"
]
},
{
"cell_type": "code",
"metadata": {
"id": "x1WgLFauWUPe",
"colab_type": "code",
"colab": {}
},
"source": [
"# load the config\n",
"C = load_config(CONFIG_PATH)\n",
"C.forward_attn_mask = True\n",
"\n",
"# load the audio processor\n",
"ap = AudioProcessor(**C.audio)\n",
"\n",
"# if the vocabulary was passed, replace the default\n",
"if 'characters' in C.keys():\n",
" symbols, phonemes = make_symbols(**C.characters)\n",
"\n",
"speaker_embedding = None\n",
"speaker_embedding_dim = None\n",
"num_speakers = 0\n",
"# load speakers\n",
"if SPEAKER_JSON != '':\n",
" speaker_mapping = json.load(open(SPEAKER_JSON, 'r'))\n",
" num_speakers = len(speaker_mapping)\n",
" if C.use_external_speaker_embedding_file:\n",
" if SPEAKER_FILEID is not None:\n",
" speaker_embedding = speaker_mapping[SPEAKER_FILEID]['embedding']\n",
" else: # if speaker_fileid is not specificated use the first sample in speakers.json\n",
" choise_speaker = list(speaker_mapping.keys())[0]\n",
" print(\" Speaker: \",choise_speaker.split('_')[0],'was chosen automatically', \"(this speaker seen in training)\")\n",
" speaker_embedding = speaker_mapping[choise_speaker]['embedding']\n",
" speaker_embedding_dim = len(speaker_embedding)\n",
"\n",
"# load the model\n",
"num_chars = len(phonemes) if C.use_phonemes else len(symbols)\n",
"model = setup_model(num_chars, num_speakers, C, speaker_embedding_dim)\n",
"cp = torch.load(MODEL_PATH, map_location=torch.device('cpu'))\n",
"model.load_state_dict(cp['model'])\n",
"model.eval()\n",
"\n",
"if USE_CUDA:\n",
" model.cuda()\n",
"\n",
"model.decoder.set_r(cp['r'])\n",
"\n",
"# load vocoder model\n",
"if VOCODER_PATH!= \"\":\n",
" VC = load_config(VOCODER_CONFIG_PATH)\n",
" vocoder_model = setup_generator(VC)\n",
" vocoder_model.load_state_dict(torch.load(VOCODER_PATH, map_location=\"cpu\")[\"model\"])\n",
" vocoder_model.remove_weight_norm()\n",
" if USE_CUDA:\n",
" vocoder_model.cuda()\n",
" vocoder_model.eval()\n",
"else:\n",
" vocoder_model = None\n",
" VC = None\n",
"\n",
"# synthesize voice\n",
"use_griffin_lim = VOCODER_PATH== \"\"\n",
"\n",
"if not C.use_external_speaker_embedding_file:\n",
" if SPEAKER_FILEID.isdigit():\n",
" SPEAKER_FILEID = int(SPEAKER_FILEID)\n",
" else:\n",
" SPEAKER_FILEID = None\n",
"else:\n",
" SPEAKER_FILEID = None\n"
],
"execution_count": null,
"outputs": []
},
{
"cell_type": "markdown",
"metadata": {
"id": "tNvVEoE30qY6",
"colab_type": "text"
},
"source": [
"Synthesize sentence with Speaker\n",
"\n",
"> Stop running the cell to leave!\n",
"\n"
]
},
{
"cell_type": "code",
"metadata": {
"id": "2o8fXkVSyXOa",
"colab_type": "code",
"colab": {}
},
"source": [
"import IPython\n",
"from IPython.display import Audio\n",
"print(\"Synthesize sentence with Speaker: \",choise_speaker.split('_')[0], \"(this speaker seen in training)\")\n",
"while True:\n",
" TEXT = input(\"Enter sentence: \")\n",
" print(\" > Text: {}\".format(TEXT))\n",
" wav = tts(model, vocoder_model, TEXT, C, USE_CUDA, ap, use_griffin_lim, SPEAKER_FILEID, speaker_embedding=speaker_embedding)\n",
" IPython.display.display(Audio(wav, rate=ap.sample_rate))\n",
" # save the results\n",
" file_name = TEXT.replace(\" \", \"_\")\n",
" file_name = file_name.translate(\n",
" str.maketrans('', '', string.punctuation.replace('_', ''))) + '.wav'\n",
" out_path = os.path.join(OUT_PATH, file_name)\n",
" print(\" > Saving output to {}\".format(out_path))\n",
" ap.save_wav(wav, out_path)"
],
"execution_count": null,
"outputs": []
},
{
"cell_type": "markdown",
"metadata": {
"id": "vnV-FigfvsS2",
"colab_type": "text"
},
"source": [
"# **Select Speaker**\n",
"\n"
]
},
{
"cell_type": "code",
"metadata": {
"id": "RuCGOnJ_fgDV",
"colab_type": "code",
"colab": {}
},
"source": [
"\n",
"# VCTK speakers not seen in training (new speakers)\n",
"VCTK_test_Speakers = [\"p225\", \"p234\", \"p238\", \"p245\", \"p248\", \"p261\", \"p294\", \"p302\", \"p326\", \"p335\", \"p347\"]\n",
"\n",
"# VCTK speakers seen in training\n",
"VCTK_train_Speakers = ['p244', 'p300', 'p303', 'p273', 'p292', 'p252', 'p254', 'p269', 'p345', 'p274', 'p363', 'p285', 'p351', 'p361', 'p295', 'p266', 'p307', 'p230', 'p339', 'p253', 'p310', 'p241', 'p256', 'p323', 'p237', 'p229', 'p298', 'p336', 'p276', 'p305', 'p255', 'p278', 'p299', 'p265', 'p267', 'p280', 'p260', 'p272', 'p262', 'p334', 'p283', 'p247', 'p246', 'p374', 'p297', 'p249', 'p250', 'p304', 'p240', 'p236', 'p312', 'p286', 'p263', 'p258', 'p313', 'p376', 'p279', 'p340', 'p362', 'p284', 'p231', 'p308', 'p277', 'p275', 'p333', 'p314', 'p330', 'p264', 'p226', 'p288', 'p343', 'p239', 'p232', 'p268', 'p270', 'p329', 'p227', 'p271', 'p228', 'p311', 'p301', 'p293', 'p364', 'p251', 'p317', 'p360', 'p281', 'p243', 'p287', 'p233', 'p259', 'p316', 'p257', 'p282', 'p306', 'p341', 'p318']\n",
"\n",
"\n",
"num_samples_speaker = 2 # In theory the more samples of the speaker the more similar to the real voice it will be!\n"
],
"execution_count": null,
"outputs": []
},
{
"cell_type": "markdown",
"metadata": {
"id": "hkvv7gRcx4WV",
"colab_type": "text"
},
"source": [
"## **Example select a VCTK seen speaker in training**"
]
},
{
"cell_type": "code",
"metadata": {
"id": "BviNMI9UyCYz",
"colab_type": "code",
"colab": {}
},
"source": [
"# get embedding\n",
"Speaker_choise = VCTK_train_Speakers[0] # choise one of training speakers\n",
"# load speakers\n",
"if SPEAKER_JSON != '':\n",
" speaker_mapping = json.load(open(SPEAKER_JSON, 'r'))\n",
" if C.use_external_speaker_embedding_file:\n",
" speaker_embeddings = []\n",
" for key in list(speaker_mapping.keys()):\n",
" if Speaker_choise in key:\n",
" if len(speaker_embeddings) < num_samples_speaker:\n",
" speaker_embeddings.append(speaker_mapping[key]['embedding'])\n",
" # takes the average of the embedings samples of the announcers\n",
" speaker_embedding = np.mean(np.array(speaker_embeddings), axis=0).tolist()\n",
" "
],
"execution_count": null,
"outputs": []
},
{
"cell_type": "code",
"metadata": {
"id": "5e5_XnLsx3jg",
"colab_type": "code",
"colab": {}
},
"source": [
"import IPython\n",
"from IPython.display import Audio\n",
"print(\"Synthesize sentence with Speaker: \",Speaker_choise.split('_')[0], \"(this speaker seen in training)\")\n",
"while True:\n",
" TEXT = input(\"Enter sentence: \")\n",
" print(\" > Text: {}\".format(TEXT))\n",
" wav = tts(model, vocoder_model, TEXT, C, USE_CUDA, ap, use_griffin_lim, SPEAKER_FILEID, speaker_embedding=speaker_embedding)\n",
" IPython.display.display(Audio(wav, rate=ap.sample_rate))\n",
" # save the results\n",
" file_name = TEXT.replace(\" \", \"_\")\n",
" file_name = file_name.translate(\n",
" str.maketrans('', '', string.punctuation.replace('_', ''))) + '.wav'\n",
" out_path = os.path.join(OUT_PATH, file_name)\n",
" print(\" > Saving output to {}\".format(out_path))\n",
" ap.save_wav(wav, out_path)"
],
"execution_count": null,
"outputs": []
},
{
"cell_type": "markdown",
"metadata": {
"colab_type": "text",
"id": "QJ6VgT2a4vHW"
},
"source": [
"## **Example select a VCTK not seen speaker in training (new Speakers)**\n",
"\n",
"\n",
"> Fitting new Speakers :)\n",
"\n"
]
},
{
"cell_type": "code",
"metadata": {
"colab_type": "code",
"id": "SZS57ZK-4vHa",
"colab": {}
},
"source": [
"# get embedding\n",
"Speaker_choise = VCTK_test_Speakers[0] # choise one of training speakers\n",
"# load speakers\n",
"if SPEAKER_JSON != '':\n",
" speaker_mapping = json.load(open(SPEAKER_JSON, 'r'))\n",
" if C.use_external_speaker_embedding_file:\n",
" speaker_embeddings = []\n",
" for key in list(speaker_mapping.keys()):\n",
" if Speaker_choise in key:\n",
" if len(speaker_embeddings) < num_samples_speaker:\n",
" speaker_embeddings.append(speaker_mapping[key]['embedding'])\n",
" # takes the average of the embedings samples of the announcers\n",
" speaker_embedding = np.mean(np.array(speaker_embeddings), axis=0).tolist()\n",
" "
],
"execution_count": null,
"outputs": []
},
{
"cell_type": "code",
"metadata": {
"colab_type": "code",
"id": "bbs85vzz4vHo",
"colab": {}
},
"source": [
"import IPython\n",
"from IPython.display import Audio\n",
"print(\"Synthesize sentence with Speaker: \",Speaker_choise.split('_')[0], \"(this speaker not seen in training (new speaker))\")\n",
"while True:\n",
" TEXT = input(\"Enter sentence: \")\n",
" print(\" > Text: {}\".format(TEXT))\n",
" wav = tts(model, vocoder_model, TEXT, C, USE_CUDA, ap, use_griffin_lim, SPEAKER_FILEID, speaker_embedding=speaker_embedding)\n",
" IPython.display.display(Audio(wav, rate=ap.sample_rate))\n",
" # save the results\n",
" file_name = TEXT.replace(\" \", \"_\")\n",
" file_name = file_name.translate(\n",
" str.maketrans('', '', string.punctuation.replace('_', ''))) + '.wav'\n",
" out_path = os.path.join(OUT_PATH, file_name)\n",
" print(\" > Saving output to {}\".format(out_path))\n",
" ap.save_wav(wav, out_path)"
],
"execution_count": null,
"outputs": []
},
{
"cell_type": "markdown",
"metadata": {
"colab_type": "text",
"id": "LEE6mQLh5Who"
},
"source": [
"# **Example Synthesizing with your own voice :)**\n",
"\n"
]
},
{
"cell_type": "markdown",
"metadata": {
"id": "La70gSB65nrs",
"colab_type": "text"
},
"source": [
" Download and load GE2E Speaker Encoder "
]
},
{
"cell_type": "code",
"metadata": {
"id": "r0IEFZ0B5vQg",
"colab_type": "code",
"colab": {}
},
"source": [
"!wget -c -q --show-progress -O ./SpeakerEncoder-checkpoint.zip https://github.com/Edresson/TTS/releases/download/v1.0.0/GE2E-SpeakerEncoder-iter25k.zip\n",
"!unzip ./SpeakerEncoder-checkpoint.zip"
],
"execution_count": null,
"outputs": []
},
{
"cell_type": "code",
"metadata": {
"id": "jEH8HCTh5mF6",
"colab_type": "code",
"colab": {}
},
"source": [
"SE_MODEL_RUN_PATH = \"GE2E-SpeakerEncoder/\"\n",
"SE_MODEL_PATH = os.path.join(SE_MODEL_RUN_PATH, \"best_model.pth.tar\")\n",
"SE_CONFIG_PATH =os.path.join(SE_MODEL_RUN_PATH, \"config.json\")\n",
"USE_CUDA = True"
],
"execution_count": null,
"outputs": []
},
{
"cell_type": "code",
"metadata": {
"id": "tOwkfQqT6-Qo",
"colab_type": "code",
"colab": {}
},
"source": [
"from TTS.utils.audio import AudioProcessor\n",
"from TTS.speaker_encoder.model import SpeakerEncoder\n",
"se_config = load_config(SE_CONFIG_PATH)\n",
"se_ap = AudioProcessor(**se_config['audio'])\n",
"\n",
"se_model = SpeakerEncoder(**se_config.model)\n",
"se_model.load_state_dict(torch.load(SE_MODEL_PATH)['model'])\n",
"se_model.eval()\n",
"if USE_CUDA:\n",
" se_model.cuda()"
],
"execution_count": null,
"outputs": []
},
{
"cell_type": "markdown",
"metadata": {
"id": "0TLlbUFG8O36",
"colab_type": "text"
},
"source": [
"Upload a wav audio file in your voice.\n",
"\n",
"\n",
"> We recommend files longer than 3 seconds, the bigger the file the closer to your voice :)\n",
"\n"
]
},
{
"cell_type": "code",
"metadata": {
"id": "_FWwHPjJ8NXl",
"colab_type": "code",
"colab": {}
},
"source": [
"from google.colab import files\n",
"file_list = files.upload()"
],
"execution_count": null,
"outputs": []
},
{
"cell_type": "code",
"metadata": {
"id": "WWOf6sgbBbGY",
"colab_type": "code",
"colab": {}
},
"source": [
"# extract embedding from wav files\n",
"speaker_embeddings = []\n",
"for name in file_list.keys():\n",
" if '.wav' in name:\n",
" mel_spec = se_ap.melspectrogram(se_ap.load_wav(name, sr=se_ap.sample_rate)).T\n",
" mel_spec = torch.FloatTensor(mel_spec[None, :, :])\n",
" if USE_CUDA:\n",
" mel_spec = mel_spec.cuda()\n",
" embedd = se_model.compute_embedding(mel_spec).cpu().detach().numpy().reshape(-1)\n",
" speaker_embeddings.append(embedd)\n",
" else:\n",
" print(\" You need upload Wav files, others files is not supported !!\")\n",
"\n",
"# takes the average of the embedings samples of the announcers\n",
"speaker_embedding = np.mean(np.array(speaker_embeddings), axis=0).tolist()"
],
"execution_count": null,
"outputs": []
},
{
"cell_type": "code",
"metadata": {
"colab_type": "code",
"id": "xmItcGac5WiG",
"colab": {}
},
"source": [
"import IPython\n",
"from IPython.display import Audio\n",
"print(\"Synthesize sentence with New Speaker using files: \",file_list.keys(), \"(this speaker not seen in training (new speaker))\")\n",
"while True:\n",
" TEXT = input(\"Enter sentence: \")\n",
" print(\" > Text: {}\".format(TEXT))\n",
" wav = tts(model, vocoder_model, TEXT, C, USE_CUDA, ap, use_griffin_lim, SPEAKER_FILEID, speaker_embedding=speaker_embedding)\n",
" IPython.display.display(Audio(wav, rate=ap.sample_rate))\n",
" # save the results\n",
" file_name = TEXT.replace(\" \", \"_\")\n",
" file_name = file_name.translate(\n",
" str.maketrans('', '', string.punctuation.replace('_', ''))) + '.wav'\n",
" out_path = os.path.join(OUT_PATH, file_name)\n",
" print(\" > Saving output to {}\".format(out_path))\n",
" ap.save_wav(wav, out_path)"
],
"execution_count": null,
"outputs": []
}
]
}

View File

@ -0,0 +1,834 @@
{
"nbformat": 4,
"nbformat_minor": 0,
"metadata": {
"colab": {
"name": "Demo-Mozilla-TTS-MultiSpeaker-jia-et-al-2018-With-GST.ipynb",
"provenance": [],
"collapsed_sections": [
"yZK6UdwSFnOO",
"ENA2OumIVeMA",
"dV6cXXlfi72r",
"vnV-FigfvsS2",
"g_G_HweN04W-",
"LEE6mQLh5Who"
],
"toc_visible": true
},
"kernelspec": {
"name": "python3",
"display_name": "Python 3"
},
"accelerator": "GPU"
},
"cells": [
{
"cell_type": "markdown",
"metadata": {
"id": "yZK6UdwSFnOO",
"colab_type": "text"
},
"source": [
"# **Download and install Mozilla TTS**"
]
},
{
"cell_type": "code",
"metadata": {
"id": "yvb0pX3WY6MN",
"colab_type": "code",
"colab": {}
},
"source": [
"import os \n",
"!git clone https://github.com/Edresson/TTS -b dev-gst-embeddings"
],
"execution_count": null,
"outputs": []
},
{
"cell_type": "code",
"metadata": {
"id": "iB9nl2UEG3SY",
"colab_type": "code",
"colab": {}
},
"source": [
"!apt-get install espeak\n",
"os.chdir('TTS')\n",
"!pip install -r requirements.txt\n",
"!python setup.py develop\n",
"os.chdir('..')"
],
"execution_count": null,
"outputs": []
},
{
"cell_type": "markdown",
"metadata": {
"id": "w6Krn8k1inC_",
"colab_type": "text"
},
"source": [
"\n",
"\n",
"**Download Checkpoint**\n",
"\n",
"\n"
]
},
{
"cell_type": "code",
"metadata": {
"id": "PiYHf3lKhi9z",
"colab_type": "code",
"colab": {}
},
"source": [
"!wget -c -q --show-progress -O ./TTS-checkpoint.zip https://github.com/Edresson/TTS/releases/download/v1.0.0/Checkpoints-TTS-MultiSpeaker-Jia-et-al-2018-with-GST.zip\n",
"!unzip ./TTS-checkpoint.zip\n",
"\n",
"# Download gst style example\n",
"!wget https://github.com/Edresson/TTS/releases/download/v1.0.0/gst-style-example.wav"
],
"execution_count": null,
"outputs": []
},
{
"cell_type": "markdown",
"metadata": {
"id": "MpYNgqrZcJKn",
"colab_type": "text"
},
"source": [
"**Utils Functions**"
]
},
{
"cell_type": "code",
"metadata": {
"id": "4KZA4b_CbMqx",
"colab_type": "code",
"colab": {}
},
"source": [
"%load_ext autoreload\n",
"%autoreload 2\n",
"import argparse\n",
"import json\n",
"# pylint: disable=redefined-outer-name, unused-argument\n",
"import os\n",
"import string\n",
"import time\n",
"import sys\n",
"import numpy as np\n",
"\n",
"TTS_PATH = \"../content/TTS\"\n",
"# add libraries into environment\n",
"sys.path.append(TTS_PATH) # set this if TTS is not installed globally\n",
"\n",
"import torch\n",
"\n",
"from TTS.tts.utils.generic_utils import setup_model\n",
"from TTS.tts.utils.synthesis import synthesis\n",
"from TTS.tts.utils.text.symbols import make_symbols, phonemes, symbols\n",
"from TTS.utils.audio import AudioProcessor\n",
"from TTS.utils.io import load_config\n",
"from TTS.vocoder.utils.generic_utils import setup_generator\n",
"\n",
"\n",
"def tts(model, vocoder_model, text, CONFIG, use_cuda, ap, use_gl, speaker_fileid, speaker_embedding=None, gst_style=None):\n",
" t_1 = time.time()\n",
" waveform, _, _, mel_postnet_spec, _, _ = synthesis(model, text, CONFIG, use_cuda, ap, speaker_fileid, gst_style, False, CONFIG.enable_eos_bos_chars, use_gl, speaker_embedding=speaker_embedding)\n",
" if CONFIG.model == \"Tacotron\" and not use_gl:\n",
" mel_postnet_spec = ap.out_linear_to_mel(mel_postnet_spec.T).T\n",
" if not use_gl:\n",
" waveform = vocoder_model.inference(torch.FloatTensor(mel_postnet_spec.T).unsqueeze(0))\n",
" if use_cuda and not use_gl:\n",
" waveform = waveform.cpu()\n",
" if not use_gl:\n",
" waveform = waveform.numpy()\n",
" waveform = waveform.squeeze()\n",
" rtf = (time.time() - t_1) / (len(waveform) / ap.sample_rate)\n",
" tps = (time.time() - t_1) / len(waveform)\n",
" print(\" > Run-time: {}\".format(time.time() - t_1))\n",
" print(\" > Real-time factor: {}\".format(rtf))\n",
" print(\" > Time per step: {}\".format(tps))\n",
" return waveform\n",
"\n"
],
"execution_count": null,
"outputs": []
},
{
"cell_type": "markdown",
"metadata": {
"id": "ENA2OumIVeMA",
"colab_type": "text"
},
"source": [
"# **Vars definitions**\n"
]
},
{
"cell_type": "code",
"metadata": {
"id": "jPD0d_XpVXmY",
"colab_type": "code",
"colab": {}
},
"source": [
"TEXT = ''\n",
"OUT_PATH = 'tests-audios/'\n",
"# create output path\n",
"os.makedirs(OUT_PATH, exist_ok=True)\n",
"\n",
"SPEAKER_FILEID = None # if None use the first embedding from speakers.json\n",
"\n",
"# model vars \n",
"MODEL_PATH = 'best_model.pth.tar'\n",
"CONFIG_PATH = 'config.json'\n",
"SPEAKER_JSON = 'speakers.json'\n",
"\n",
"# vocoder vars\n",
"VOCODER_PATH = ''\n",
"VOCODER_CONFIG_PATH = ''\n",
"\n",
"USE_CUDA = True"
],
"execution_count": null,
"outputs": []
},
{
"cell_type": "markdown",
"metadata": {
"id": "dV6cXXlfi72r",
"colab_type": "text"
},
"source": [
"# **Restore TTS Model**"
]
},
{
"cell_type": "code",
"metadata": {
"id": "x1WgLFauWUPe",
"colab_type": "code",
"colab": {}
},
"source": [
"# load the config\n",
"C = load_config(CONFIG_PATH)\n",
"C.forward_attn_mask = True\n",
"\n",
"# load the audio processor\n",
"ap = AudioProcessor(**C.audio)\n",
"\n",
"# if the vocabulary was passed, replace the default\n",
"if 'characters' in C.keys():\n",
" symbols, phonemes = make_symbols(**C.characters)\n",
"\n",
"speaker_embedding = None\n",
"speaker_embedding_dim = None\n",
"num_speakers = 0\n",
"# load speakers\n",
"if SPEAKER_JSON != '':\n",
" speaker_mapping = json.load(open(SPEAKER_JSON, 'r'))\n",
" num_speakers = len(speaker_mapping)\n",
" if C.use_external_speaker_embedding_file:\n",
" if SPEAKER_FILEID is not None:\n",
" speaker_embedding = speaker_mapping[SPEAKER_FILEID]['embedding']\n",
" else: # if speaker_fileid is not specificated use the first sample in speakers.json\n",
" choise_speaker = list(speaker_mapping.keys())[0]\n",
" print(\" Speaker: \",choise_speaker.split('_')[0],'was chosen automatically', \"(this speaker seen in training)\")\n",
" speaker_embedding = speaker_mapping[choise_speaker]['embedding']\n",
" speaker_embedding_dim = len(speaker_embedding)\n",
"\n",
"# load the model\n",
"num_chars = len(phonemes) if C.use_phonemes else len(symbols)\n",
"model = setup_model(num_chars, num_speakers, C, speaker_embedding_dim)\n",
"cp = torch.load(MODEL_PATH, map_location=torch.device('cpu'))\n",
"model.load_state_dict(cp['model'])\n",
"model.eval()\n",
"\n",
"if USE_CUDA:\n",
" model.cuda()\n",
"\n",
"model.decoder.set_r(cp['r'])\n",
"\n",
"# load vocoder model\n",
"if VOCODER_PATH!= \"\":\n",
" VC = load_config(VOCODER_CONFIG_PATH)\n",
" vocoder_model = setup_generator(VC)\n",
" vocoder_model.load_state_dict(torch.load(VOCODER_PATH, map_location=\"cpu\")[\"model\"])\n",
" vocoder_model.remove_weight_norm()\n",
" if USE_CUDA:\n",
" vocoder_model.cuda()\n",
" vocoder_model.eval()\n",
"else:\n",
" vocoder_model = None\n",
" VC = None\n",
"\n",
"# synthesize voice\n",
"use_griffin_lim = VOCODER_PATH== \"\"\n",
"\n",
"if not C.use_external_speaker_embedding_file:\n",
" if SPEAKER_FILEID.isdigit():\n",
" SPEAKER_FILEID = int(SPEAKER_FILEID)\n",
" else:\n",
" SPEAKER_FILEID = None\n",
"else:\n",
" SPEAKER_FILEID = None\n"
],
"execution_count": null,
"outputs": []
},
{
"cell_type": "markdown",
"metadata": {
"id": "tNvVEoE30qY6",
"colab_type": "text"
},
"source": [
"Synthesize sentence with Speaker\n",
"\n",
"> Stop running the cell to leave!\n",
"\n"
]
},
{
"cell_type": "code",
"metadata": {
"id": "2o8fXkVSyXOa",
"colab_type": "code",
"colab": {}
},
"source": [
"import IPython\n",
"from IPython.display import Audio\n",
"print(\"Synthesize sentence with Speaker: \",choise_speaker.split('_')[0], \"(this speaker seen in training)\")\n",
"gst_style = 'gst-style-example.wav'\n",
"while True:\n",
" TEXT = input(\"Enter sentence: \")\n",
" print(\" > Text: {}\".format(TEXT))\n",
" wav = tts(model, vocoder_model, TEXT, C, USE_CUDA, ap, use_griffin_lim, SPEAKER_FILEID, speaker_embedding=speaker_embedding, gst_style=gst_style)\n",
" IPython.display.display(Audio(wav, rate=ap.sample_rate))\n",
" # save the results\n",
" file_name = TEXT.replace(\" \", \"_\")\n",
" file_name = file_name.translate(\n",
" str.maketrans('', '', string.punctuation.replace('_', ''))) + '.wav'\n",
" out_path = os.path.join(OUT_PATH, file_name)\n",
" print(\" > Saving output to {}\".format(out_path))\n",
" ap.save_wav(wav, out_path)"
],
"execution_count": null,
"outputs": []
},
{
"cell_type": "markdown",
"metadata": {
"id": "vnV-FigfvsS2",
"colab_type": "text"
},
"source": [
"# **Select Speaker**\n",
"\n"
]
},
{
"cell_type": "code",
"metadata": {
"id": "RuCGOnJ_fgDV",
"colab_type": "code",
"colab": {}
},
"source": [
"\n",
"# VCTK speakers not seen in training (new speakers)\n",
"VCTK_test_Speakers = [\"p225\", \"p234\", \"p238\", \"p245\", \"p248\", \"p261\", \"p294\", \"p302\", \"p326\", \"p335\", \"p347\"]\n",
"\n",
"# VCTK speakers seen in training\n",
"VCTK_train_Speakers = ['p244', 'p300', 'p303', 'p273', 'p292', 'p252', 'p254', 'p269', 'p345', 'p274', 'p363', 'p285', 'p351', 'p361', 'p295', 'p266', 'p307', 'p230', 'p339', 'p253', 'p310', 'p241', 'p256', 'p323', 'p237', 'p229', 'p298', 'p336', 'p276', 'p305', 'p255', 'p278', 'p299', 'p265', 'p267', 'p280', 'p260', 'p272', 'p262', 'p334', 'p283', 'p247', 'p246', 'p374', 'p297', 'p249', 'p250', 'p304', 'p240', 'p236', 'p312', 'p286', 'p263', 'p258', 'p313', 'p376', 'p279', 'p340', 'p362', 'p284', 'p231', 'p308', 'p277', 'p275', 'p333', 'p314', 'p330', 'p264', 'p226', 'p288', 'p343', 'p239', 'p232', 'p268', 'p270', 'p329', 'p227', 'p271', 'p228', 'p311', 'p301', 'p293', 'p364', 'p251', 'p317', 'p360', 'p281', 'p243', 'p287', 'p233', 'p259', 'p316', 'p257', 'p282', 'p306', 'p341', 'p318']\n",
"\n",
"\n",
"num_samples_speaker = 2 # In theory the more samples of the speaker the more similar to the real voice it will be!\n"
],
"execution_count": null,
"outputs": []
},
{
"cell_type": "markdown",
"metadata": {
"id": "hkvv7gRcx4WV",
"colab_type": "text"
},
"source": [
"## **Example select a VCTK seen speaker in training**"
]
},
{
"cell_type": "code",
"metadata": {
"id": "BviNMI9UyCYz",
"colab_type": "code",
"colab": {}
},
"source": [
"# get embedding\n",
"Speaker_choise = VCTK_train_Speakers[0] # choise one of training speakers\n",
"# load speakers\n",
"if SPEAKER_JSON != '':\n",
" speaker_mapping = json.load(open(SPEAKER_JSON, 'r'))\n",
" if C.use_external_speaker_embedding_file:\n",
" speaker_embeddings = []\n",
" for key in list(speaker_mapping.keys()):\n",
" if Speaker_choise in key:\n",
" if len(speaker_embeddings) < num_samples_speaker:\n",
" speaker_embeddings.append(speaker_mapping[key]['embedding'])\n",
" # takes the average of the embedings samples of the announcers\n",
" speaker_embedding = np.mean(np.array(speaker_embeddings), axis=0).tolist()\n",
" "
],
"execution_count": null,
"outputs": []
},
{
"cell_type": "code",
"metadata": {
"id": "5e5_XnLsx3jg",
"colab_type": "code",
"colab": {}
},
"source": [
"import IPython\n",
"from IPython.display import Audio\n",
"print(\"Synthesize sentence with Speaker: \",Speaker_choise.split('_')[0], \"(this speaker seen in training)\")\n",
"gst_style = 'gst-style-example.wav'\n",
"while True:\n",
" TEXT = input(\"Enter sentence: \")\n",
" print(\" > Text: {}\".format(TEXT))\n",
" wav = tts(model, vocoder_model, TEXT, C, USE_CUDA, ap, use_griffin_lim, SPEAKER_FILEID, speaker_embedding=speaker_embedding, gst_style=gst_style)\n",
" IPython.display.display(Audio(wav, rate=ap.sample_rate))\n",
" # save the results\n",
" file_name = TEXT.replace(\" \", \"_\")\n",
" file_name = file_name.translate(\n",
" str.maketrans('', '', string.punctuation.replace('_', ''))) + '.wav'\n",
" out_path = os.path.join(OUT_PATH, file_name)\n",
" print(\" > Saving output to {}\".format(out_path))\n",
" ap.save_wav(wav, out_path)"
],
"execution_count": null,
"outputs": []
},
{
"cell_type": "markdown",
"metadata": {
"colab_type": "text",
"id": "QJ6VgT2a4vHW"
},
"source": [
"## **Example select a VCTK not seen speaker in training (new Speakers)**\n",
"\n",
"\n",
"> Fitting new Speakers :)\n",
"\n"
]
},
{
"cell_type": "code",
"metadata": {
"colab_type": "code",
"id": "SZS57ZK-4vHa",
"colab": {}
},
"source": [
"# get embedding\n",
"Speaker_choise = VCTK_test_Speakers[0] # choise one of training speakers\n",
"# load speakers\n",
"if SPEAKER_JSON != '':\n",
" speaker_mapping = json.load(open(SPEAKER_JSON, 'r'))\n",
" if C.use_external_speaker_embedding_file:\n",
" speaker_embeddings = []\n",
" for key in list(speaker_mapping.keys()):\n",
" if Speaker_choise in key:\n",
" if len(speaker_embeddings) < num_samples_speaker:\n",
" speaker_embeddings.append(speaker_mapping[key]['embedding'])\n",
" # takes the average of the embedings samples of the announcers\n",
" speaker_embedding = np.mean(np.array(speaker_embeddings), axis=0).tolist()\n",
" "
],
"execution_count": null,
"outputs": []
},
{
"cell_type": "code",
"metadata": {
"colab_type": "code",
"id": "bbs85vzz4vHo",
"colab": {}
},
"source": [
"import IPython\n",
"from IPython.display import Audio\n",
"print(\"Synthesize sentence with Speaker: \",Speaker_choise.split('_')[0], \"(this speaker not seen in training (new speaker))\")\n",
"gst_style = 'gst-style-example.wav'\n",
"while True:\n",
" TEXT = input(\"Enter sentence: \")\n",
" print(\" > Text: {}\".format(TEXT))\n",
" wav = tts(model, vocoder_model, TEXT, C, USE_CUDA, ap, use_griffin_lim, SPEAKER_FILEID, speaker_embedding=speaker_embedding, gst_style=gst_style)\n",
" IPython.display.display(Audio(wav, rate=ap.sample_rate))\n",
" # save the results\n",
" file_name = TEXT.replace(\" \", \"_\")\n",
" file_name = file_name.translate(\n",
" str.maketrans('', '', string.punctuation.replace('_', ''))) + '.wav'\n",
" out_path = os.path.join(OUT_PATH, file_name)\n",
" print(\" > Saving output to {}\".format(out_path))\n",
" ap.save_wav(wav, out_path)"
],
"execution_count": null,
"outputs": []
},
{
"cell_type": "markdown",
"metadata": {
"id": "g_G_HweN04W-",
"colab_type": "text"
},
"source": [
"# **Changing GST tokens manually (without wav reference)**"
]
},
{
"cell_type": "markdown",
"metadata": {
"id": "jyFP5syW2bjt",
"colab_type": "text"
},
"source": [
"You can define tokens manually, this way you can increase/decrease the function of a given GST token. For example a token is responsible for the length of the speaker's pauses, if you increase the value of that token you will have longer pauses and if you decrease it you will have shorter pauses."
]
},
{
"cell_type": "code",
"metadata": {
"id": "SpwjDjCM2a3Y",
"colab_type": "code",
"colab": {}
},
"source": [
"# set gst tokens, in this model we have 5 tokens\n",
"gst_style = {\"0\": 0, \"1\": 0, \"3\": 0, \"4\": 0}"
],
"execution_count": null,
"outputs": []
},
{
"cell_type": "code",
"metadata": {
"id": "qWChMbI_0z5X",
"colab_type": "code",
"colab": {}
},
"source": [
"import IPython\n",
"from IPython.display import Audio\n",
"print(\"Synthesize sentence with Speaker: \",Speaker_choise.split('_')[0], \"(this speaker not seen in training (new speaker))\")\n",
"TEXT = input(\"Enter sentence: \")\n",
"print(\" > Text: {}\".format(TEXT))\n",
"wav = tts(model, vocoder_model, TEXT, C, USE_CUDA, ap, use_griffin_lim, SPEAKER_FILEID, speaker_embedding=speaker_embedding, gst_style=gst_style)\n",
"IPython.display.display(Audio(wav, rate=ap.sample_rate))\n",
"# save the results\n",
"file_name = TEXT.replace(\" \", \"_\")\n",
"file_name = file_name.translate(\n",
" str.maketrans('', '', string.punctuation.replace('_', ''))) + '.wav'\n",
"out_path = os.path.join(OUT_PATH, file_name)\n",
"print(\" > Saving output to {}\".format(out_path))\n",
"ap.save_wav(wav, out_path)"
],
"execution_count": null,
"outputs": []
},
{
"cell_type": "code",
"metadata": {
"id": "uFjUi9xQ3mG3",
"colab_type": "code",
"colab": {}
},
"source": [
"gst_style = {\"0\": 0.9, \"1\": 0, \"3\": 0, \"4\": 0}\n",
"print(\"Synthesize sentence with Speaker: \",Speaker_choise.split('_')[0], \"(this speaker not seen in training (new speaker))\")\n",
"TEXT = input(\"Enter sentence: \")\n",
"print(\" > Text: {}\".format(TEXT))\n",
"wav = tts(model, vocoder_model, TEXT, C, USE_CUDA, ap, use_griffin_lim, SPEAKER_FILEID, speaker_embedding=speaker_embedding, gst_style=gst_style)\n",
"IPython.display.display(Audio(wav, rate=ap.sample_rate))\n",
"# save the results\n",
"file_name = TEXT.replace(\" \", \"_\")\n",
"file_name = file_name.translate(\n",
" str.maketrans('', '', string.punctuation.replace('_', ''))) + '.wav'\n",
"out_path = os.path.join(OUT_PATH, file_name)\n",
"print(\" > Saving output to {}\".format(out_path))\n",
"ap.save_wav(wav, out_path)"
],
"execution_count": null,
"outputs": []
},
{
"cell_type": "code",
"metadata": {
"id": "Uw0d6gWg4L27",
"colab_type": "code",
"colab": {}
},
"source": [
"gst_style = {\"0\": -0.9, \"1\": 0, \"3\": 0, \"4\": 0}\n",
"print(\"Synthesize sentence with Speaker: \",Speaker_choise.split('_')[0], \"(this speaker not seen in training (new speaker))\")\n",
"TEXT = input(\"Enter sentence: \")\n",
"print(\" > Text: {}\".format(TEXT))\n",
"wav = tts(model, vocoder_model, TEXT, C, USE_CUDA, ap, use_griffin_lim, SPEAKER_FILEID, speaker_embedding=speaker_embedding, gst_style=gst_style)\n",
"IPython.display.display(Audio(wav, rate=ap.sample_rate))\n",
"# save the results\n",
"file_name = TEXT.replace(\" \", \"_\")\n",
"file_name = file_name.translate(\n",
" str.maketrans('', '', string.punctuation.replace('_', ''))) + '.wav'\n",
"out_path = os.path.join(OUT_PATH, file_name)\n",
"print(\" > Saving output to {}\".format(out_path))\n",
"ap.save_wav(wav, out_path)"
],
"execution_count": null,
"outputs": []
},
{
"cell_type": "code",
"metadata": {
"id": "V9izw4-54-Tl",
"colab_type": "code",
"colab": {}
},
"source": [
"gst_style = {\"0\": 0, \"1\": 0.9, \"3\": 0, \"4\": 0}\n",
"print(\"Synthesize sentence with Speaker: \",Speaker_choise.split('_')[0], \"(this speaker not seen in training (new speaker))\")\n",
"TEXT = input(\"Enter sentence: \")\n",
"print(\" > Text: {}\".format(TEXT))\n",
"wav = tts(model, vocoder_model, TEXT, C, USE_CUDA, ap, use_griffin_lim, SPEAKER_FILEID, speaker_embedding=speaker_embedding, gst_style=gst_style)\n",
"IPython.display.display(Audio(wav, rate=ap.sample_rate))\n",
"# save the results\n",
"file_name = TEXT.replace(\" \", \"_\")\n",
"file_name = file_name.translate(\n",
" str.maketrans('', '', string.punctuation.replace('_', ''))) + '.wav'\n",
"out_path = os.path.join(OUT_PATH, file_name)\n",
"print(\" > Saving output to {}\".format(out_path))\n",
"ap.save_wav(wav, out_path)"
],
"execution_count": null,
"outputs": []
},
{
"cell_type": "markdown",
"metadata": {
"colab_type": "text",
"id": "LEE6mQLh5Who"
},
"source": [
"# **Example Synthesizing with your own voice :)**\n",
"\n"
]
},
{
"cell_type": "markdown",
"metadata": {
"id": "La70gSB65nrs",
"colab_type": "text"
},
"source": [
" Download and load GE2E Speaker Encoder "
]
},
{
"cell_type": "code",
"metadata": {
"id": "r0IEFZ0B5vQg",
"colab_type": "code",
"colab": {}
},
"source": [
"!wget -c -q --show-progress -O ./SpeakerEncoder-checkpoint.zip https://github.com/Edresson/TTS/releases/download/v1.0.0/GE2E-SpeakerEncoder-iter25k.zip\n",
"!unzip ./SpeakerEncoder-checkpoint.zip"
],
"execution_count": null,
"outputs": []
},
{
"cell_type": "code",
"metadata": {
"id": "jEH8HCTh5mF6",
"colab_type": "code",
"colab": {}
},
"source": [
"SE_MODEL_RUN_PATH = \"GE2E-SpeakerEncoder/\"\n",
"SE_MODEL_PATH = os.path.join(SE_MODEL_RUN_PATH, \"best_model.pth.tar\")\n",
"SE_CONFIG_PATH =os.path.join(SE_MODEL_RUN_PATH, \"config.json\")\n",
"USE_CUDA = True"
],
"execution_count": null,
"outputs": []
},
{
"cell_type": "code",
"metadata": {
"id": "tOwkfQqT6-Qo",
"colab_type": "code",
"colab": {}
},
"source": [
"from TTS.utils.audio import AudioProcessor\n",
"from TTS.speaker_encoder.model import SpeakerEncoder\n",
"se_config = load_config(SE_CONFIG_PATH)\n",
"se_ap = AudioProcessor(**se_config['audio'])\n",
"\n",
"se_model = SpeakerEncoder(**se_config.model)\n",
"se_model.load_state_dict(torch.load(SE_MODEL_PATH)['model'])\n",
"se_model.eval()\n",
"if USE_CUDA:\n",
" se_model.cuda()"
],
"execution_count": null,
"outputs": []
},
{
"cell_type": "markdown",
"metadata": {
"id": "0TLlbUFG8O36",
"colab_type": "text"
},
"source": [
"Upload one or more wav audio files in your voice.\n",
"\n",
"\n",
"> We recommend files longer than 3 seconds, the bigger the file the closer to your voice :)\n",
"\n"
]
},
{
"cell_type": "code",
"metadata": {
"id": "_FWwHPjJ8NXl",
"colab_type": "code",
"colab": {}
},
"source": [
"# select one or more wav files\n",
"from google.colab import files\n",
"file_list = files.upload()"
],
"execution_count": null,
"outputs": []
},
{
"cell_type": "code",
"metadata": {
"id": "WWOf6sgbBbGY",
"colab_type": "code",
"colab": {}
},
"source": [
"# extract embedding from wav files\n",
"speaker_embeddings = []\n",
"for name in file_list.keys():\n",
" if '.wav' in name:\n",
" mel_spec = se_ap.melspectrogram(se_ap.load_wav(name, sr=se_ap.sample_rate)).T\n",
" mel_spec = torch.FloatTensor(mel_spec[None, :, :])\n",
" if USE_CUDA:\n",
" mel_spec = mel_spec.cuda()\n",
" embedd = se_model.compute_embedding(mel_spec).cpu().detach().numpy().reshape(-1)\n",
" speaker_embeddings.append(embedd)\n",
" else:\n",
" print(\"You need upload Wav files, others files is not supported !!\")\n",
"\n",
"# takes the average of the embedings samples of the announcers\n",
"speaker_embedding = np.mean(np.array(speaker_embeddings), axis=0).tolist()"
],
"execution_count": null,
"outputs": []
},
{
"cell_type": "code",
"metadata": {
"id": "AQ7eP31d9yzq",
"colab_type": "code",
"colab": {}
},
"source": [
"import IPython\n",
"from IPython.display import Audio\n",
"print(\"Synthesize sentence with New Speaker using files: \",file_list.keys(), \"(this speaker not seen in training (new speaker))\")\n",
"gst_style = {\"0\": 0, \"1\": 0.0, \"3\": 0, \"4\": 0}\n",
"gst_style = 'gst-style-example.wav'\n",
"TEXT = input(\"Enter sentence: \")\n",
"print(\" > Text: {}\".format(TEXT))\n",
"wav = tts(model, vocoder_model, TEXT, C, USE_CUDA, ap, use_griffin_lim, SPEAKER_FILEID, speaker_embedding=speaker_embedding, gst_style=gst_style)\n",
"IPython.display.display(Audio(wav, rate=ap.sample_rate))\n",
"# save the results\n",
"file_name = TEXT.replace(\" \", \"_\")\n",
"file_name = file_name.translate(\n",
" str.maketrans('', '', string.punctuation.replace('_', ''))) + '.wav'\n",
"out_path = os.path.join(OUT_PATH, file_name)\n",
"print(\" > Saving output to {}\".format(out_path))\n",
"ap.save_wav(wav, out_path)"
],
"execution_count": null,
"outputs": []
},
{
"cell_type": "markdown",
"metadata": {
"id": "11i10yE1-LMJ",
"colab_type": "text"
},
"source": [
"Uploading your own GST reference wav file"
]
},
{
"cell_type": "code",
"metadata": {
"id": "eKohSQG1-KkT",
"colab_type": "code",
"colab": {}
},
"source": [
"# select one wav file for GST reference\n",
"from google.colab import files\n",
"file_list = files.upload()\n"
],
"execution_count": null,
"outputs": []
},
{
"cell_type": "code",
"metadata": {
"colab_type": "code",
"id": "xmItcGac5WiG",
"colab": {}
},
"source": [
"print(\"Synthesize sentence with New Speaker using files: \",file_list.keys(), \"(this speaker not seen in training (new speaker))\")\n",
"gst_style = list(file_list.keys())[0]\n",
"TEXT = input(\"Enter sentence: \")\n",
"print(\" > Text: {}\".format(TEXT))\n",
"wav = tts(model, vocoder_model, TEXT, C, USE_CUDA, ap, use_griffin_lim, SPEAKER_FILEID, speaker_embedding=speaker_embedding, gst_style=gst_style)\n",
"IPython.display.display(Audio(wav, rate=ap.sample_rate))\n",
"# save the results\n",
"file_name = TEXT.replace(\" \", \"_\")\n",
"file_name = file_name.translate(\n",
" str.maketrans('', '', string.punctuation.replace('_', ''))) + '.wav'\n",
"out_path = os.path.join(OUT_PATH, file_name)\n",
"print(\" > Saving output to {}\".format(out_path))\n",
"ap.save_wav(wav, out_path)"
],
"execution_count": null,
"outputs": []
}
]
}

File diff suppressed because it is too large

View File

@ -0,0 +1,163 @@
{
"cells": [
{
"cell_type": "markdown",
"metadata": {},
"source": [
"This is a noteboook used to generate the speaker embeddings with the GE2E speaker encoder model for multi-speaker training.\n",
"\n",
"Before running this script please DON'T FORGET: \n",
"- to set file paths.\n",
"- to download related model files from TTS.\n",
"- download or clone related repos, linked below.\n",
"- setup the repositories. ```python setup.py install```\n",
"- to checkout right commit versions (given next to the model) of TTS.\n",
"- to set the right paths in the cell below.\n",
"\n",
"Repository:\n",
"- TTS: https://github.com/mozilla/TTS"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"%load_ext autoreload\n",
"%autoreload 2\n",
"import os\n",
"import importlib\n",
"import random\n",
"import librosa\n",
"import torch\n",
"\n",
"import numpy as np\n",
"from tqdm import tqdm\n",
"from TTS.tts.utils.speakers import save_speaker_mapping, load_speaker_mapping\n",
"\n",
"# you may need to change this depending on your system\n",
"os.environ['CUDA_VISIBLE_DEVICES']='0'\n",
"\n",
"\n",
"from TTS.tts.utils.speakers import save_speaker_mapping, load_speaker_mapping\n",
"from TTS.utils.audio import AudioProcessor\n",
"from TTS.utils.io import load_config"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"You should also adjust all the path constants to point at the relevant locations for you locally"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"MODEL_RUN_PATH = \"../../Mozilla-TTS/checkpoints/libritts_360-half-September-28-2019_10+46AM-8565c50-20200323T115637Z-001/\"\n",
"MODEL_PATH = MODEL_RUN_PATH + \"best_model.pth.tar\"\n",
"CONFIG_PATH = MODEL_RUN_PATH + \"config.json\"\n",
"\n",
"\n",
"DATASETS_NAME = ['vctk'] # list the datasets\n",
"DATASETS_PATH = ['../../../datasets/VCTK/']\n",
"DATASETS_METAFILE = ['']\n",
"\n",
"USE_CUDA = True"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"#Preprocess dataset\n",
"meta_data = []\n",
"for i in range(len(DATASETS_NAME)):\n",
" preprocessor = importlib.import_module('TTS.datasets.preprocess')\n",
" preprocessor = getattr(preprocessor, DATASETS_NAME[i].lower())\n",
" meta_data += preprocessor(DATASETS_PATH[i],DATASETS_METAFILE[i])\n",
" \n",
"meta_data= list(meta_data)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"c = load_config(CONFIG_PATH)\n",
"ap = AudioProcessor(**c['audio'])\n",
"\n",
"model = SpeakerEncoder(**c.model)\n",
"model.load_state_dict(torch.load(MODEL_PATH)['model'])\n",
"model.eval()\n",
"if USE_CUDA:\n",
" model.cuda()\n",
"\n",
"embeddings_dict = {}\n",
"len_meta_data= len(meta_data)\n",
"\n",
"for i in tqdm(range(len_meta_data)):\n",
" _, wav_file, speaker_id = meta_data[i]\n",
" wav_file_name = os.path.basename(wav_file)\n",
" mel_spec = ap.melspectrogram(ap.load_wav(wav_file)).T\n",
" mel_spec = torch.FloatTensor(mel_spec[None, :, :])\n",
" if USE_CUDA:\n",
" mel_spec = mel_spec.cuda()\n",
" embedd = model.compute_embedding(mel_spec).cpu().detach().numpy().reshape(-1)\n",
" embeddings_dict[wav_file_name] = [embedd,speaker_id]\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# create and export speakers.json\n",
"speaker_mapping = {sample: {'name': embeddings_dict[sample][1], 'embedding':embeddings_dict[sample][0].reshape(-1).tolist()} for i, sample in enumerate(embeddings_dict.keys())}\n",
"save_speaker_mapping(MODEL_RUN_PATH, speaker_mapping)\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"#test load integrity\n",
"speaker_mapping_load = load_speaker_mapping(MODEL_RUN_PATH)\n",
"assert speaker_mapping == speaker_mapping_load\n",
"print(\"The file speakers.json has been exported to \",MODEL_RUN_PATH, ' with ', len(embeddings_dict.keys()), ' speakers')"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.7.6"
}
},
"nbformat": 4,
"nbformat_minor": 4
}
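For reference, the exported `speakers.json` maps each wav file name to the speaker name and the GE2E d-vector computed above, and it can be inspected with the helpers already imported in this notebook. A minimal sketch (printed values depend entirely on your dataset and encoder):

```python
# Quick inspection of the exported mapping.
mapping = load_speaker_mapping(MODEL_RUN_PATH)   # reads the speakers.json written above
first = next(iter(mapping))
print(first)                                     # an utterance file name
print(mapping[first]['name'])                    # the speaker id recorded for that utterance
print(len(mapping[first]['embedding']))          # dimensionality of the d-vector
```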

View File

@ -2,7 +2,7 @@
"audio":{
"audio_processor": "audio", // to use dictate different audio processors, if available.
"num_mels": 80, // size of the mel spec frame.
"num_freq": 513, // number of stft frequency levels. Size of the linear spectogram frame.
"fft_size": 1024, // number of stft frequency levels. Size of the linear spectogram frame.
"sample_rate": 22050, // wav sample-rate. If different than the original data, it is resampled.
"frame_length_ms": null, // stft window length in ms.
"frame_shift_ms": null, // stft window hop-lengh in ms.
@ -51,5 +51,18 @@
"output_path": "result",
"min_seq_len": 0,
"max_seq_len": 300,
"log_dir": "tests/outputs/"
}
"log_dir": "tests/outputs/",
// MULTI-SPEAKER and GST
"use_speaker_embedding": false, // use speaker embedding to enable multi-speaker learning.
"use_gst": true, // use global style tokens
"gst": { // gst parameter if gst is enabled
"gst_style_input": null, // Condition the style input either on a
// -> wave file [path to wave] or
// -> dictionary using the style tokens {'token1': 'value', 'token2': 'value'} example {"0": 0.15, "1": 0.15, "5": -0.15}
// with the dictionary being len(dict) <= len(gst_style_tokens).
"gst_embedding_dim": 512,
"gst_num_heads": 4,
"gst_style_tokens": 10
}
}
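To make the two accepted forms of `gst_style_input` concrete, here is a small illustration as Python dicts (the wav path is hypothetical and the token weights are the example values from the comment above, not recommendations):

```python
# Style conditioned on a reference wav file ...
gst_from_wav = {
    "gst_style_input": "/path/to/style_reference.wav",  # hypothetical path
    "gst_embedding_dim": 512,
    "gst_num_heads": 4,
    "gst_style_tokens": 10,
}

# ... or on explicit token weights (at most 'gst_style_tokens' entries, keys are token indices).
gst_from_tokens = {
    "gst_style_input": {"0": 0.15, "1": 0.15, "5": -0.15},
    "gst_embedding_dim": 512,
    "gst_num_heads": 4,
    "gst_style_tokens": 10,
}
```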

View File

@ -1,3 +1,4 @@
<<<<<<< HEAD:tests/inputs/test_train_config.json
{
"model": "Tacotron2",
"run_name": "test_sample_dataset_run",
@ -150,3 +151,161 @@
}
=======
{
"model": "Tacotron2",
"run_name": "ljspeech-ddc-bn",
"run_description": "tacotron2 with ddc and batch-normalization",
// AUDIO PARAMETERS
"audio":{
// stft parameters
"fft_size": 1024, // number of stft frequency levels. Size of the linear spectogram frame.
"win_length": 1024, // stft window length in ms.
"hop_length": 256, // stft window hop-lengh in ms.
"frame_length_ms": null, // stft window length in ms.If null, 'win_length' is used.
"frame_shift_ms": null, // stft window hop-lengh in ms. If null, 'hop_length' is used.
// Audio processing parameters
"sample_rate": 22050, // DATASET-RELATED: wav sample-rate.
"preemphasis": 0.0, // pre-emphasis to reduce spec noise and make it more structured. If 0.0, no -pre-emphasis.
"ref_level_db": 20, // reference level db, theoretically 20db is the sound of air.
// Silence trimming
"do_trim_silence": true,// enable trimming of slience of audio as you load it. LJspeech (true), TWEB (false), Nancy (true)
"trim_db": 60, // threshold for timming silence. Set this according to your dataset.
// Griffin-Lim
"power": 1.5, // value to sharpen wav signals after GL algorithm.
"griffin_lim_iters": 60,// #griffin-lim iterations. 30-60 is a good range. Larger the value, slower the generation.
// MelSpectrogram parameters
"num_mels": 80, // size of the mel spec frame.
"mel_fmin": 0.0, // minimum freq level for mel-spec. ~50 for male and ~95 for female voices. Tune for dataset!!
"mel_fmax": 8000.0, // maximum freq level for mel-spec. Tune for dataset!!
"spec_gain": 20,
// Normalization parameters
"signal_norm": true, // normalize spec values. Mean-Var normalization if 'stats_path' is defined otherwise range normalization defined by the other params.
"min_level_db": -100, // lower bound for normalization
"symmetric_norm": true, // move normalization to range [-1, 1]
"max_norm": 4.0, // scale normalization to range [-max_norm, max_norm] or [0, max_norm]
"clip_norm": true, // clip normalized values into the range.
"stats_path": null // DO NOT USE WITH MULTI_SPEAKER MODEL. scaler stats file computed by 'compute_statistics.py'. If it is defined, mean-std based notmalization is used and other normalization params are ignored
},
// VOCABULARY PARAMETERS
// if custom character set is not defined,
// default set in symbols.py is used
// "characters":{
// "pad": "_",
// "eos": "~",
// "bos": "^",
// "characters": "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz!'(),-.:;? ",
// "punctuations":"!'(),-.:;? ",
// "phonemes":"iyɨʉɯuɪʏʊeøɘəɵɤoɛœɜɞʌɔæɐaɶɑɒᵻʘɓǀɗǃʄǂɠǁʛpbtdʈɖcɟkɡʔɴŋɲɳnɱmʙrʀⱱɾɽɸβfvθðszʃʒʂʐçʝxɣχʁħʕhɦɬɮʋɹɻjɰlɭʎʟˈˌːˑʍwɥʜʢʡɕʑɺɧɚ˞ɫ"
// },
// DISTRIBUTED TRAINING
"distributed":{
"backend": "nccl",
"url": "tcp:\/\/localhost:54321"
},
"reinit_layers": [], // give a list of layer names to restore from the given checkpoint. If not defined, it reloads all heuristically matching layers.
// TRAINING
"batch_size": 32, // Batch size for training. Lower values than 32 might cause hard to learn attention. It is overwritten by 'gradual_training'.
"eval_batch_size":16,
"r": 7, // Number of decoder frames to predict per iteration. Set the initial values if gradual training is enabled.
"gradual_training": [[0, 7, 64], [1, 5, 64], [50000, 3, 32], [130000, 2, 32], [290000, 1, 32]], //set gradual training steps [first_step, r, batch_size]. If it is null, gradual training is disabled. For Tacotron, you might need to reduce the 'batch_size' as you proceeed.
"loss_masking": true, // enable / disable loss masking against the sequence padding.
"ga_alpha": 10.0, // weight for guided attention loss. If > 0, guided attention is enabled.
// VALIDATION
"run_eval": true,
"test_delay_epochs": 10, //Until attention is aligned, testing only wastes computation time.
"test_sentences_file": null, // set a file to load sentences to be used for testing. If it is null then we use default english sentences.
// OPTIMIZER
"noam_schedule": false, // use noam warmup and lr schedule.
"grad_clip": 1.0, // upper limit for gradients for clipping.
"epochs": 1000, // total number of epochs to train.
"lr": 0.0001, // Initial learning rate. If Noam decay is active, maximum learning rate.
"wd": 0.000001, // Weight decay weight.
"warmup_steps": 4000, // Noam decay steps to increase the learning rate from 0 to "lr"
"seq_len_norm": false, // Normalize eash sample loss with its length to alleviate imbalanced datasets. Use it if your dataset is small or has skewed distribution of sequence lengths.
// TACOTRON PRENET
"memory_size": -1, // ONLY TACOTRON - size of the memory queue used fro storing last decoder predictions for auto-regression. If < 0, memory queue is disabled and decoder only uses the last prediction frame.
"prenet_type": "bn", // "original" or "bn".
"prenet_dropout": false, // enable/disable dropout at prenet.
// TACOTRON ATTENTION
"attention_type": "original", // 'original' or 'graves'
"attention_heads": 4, // number of attention heads (only for 'graves')
"attention_norm": "sigmoid", // softmax or sigmoid.
"windowing": false, // Enables attention windowing. Used only in eval mode.
"use_forward_attn": false, // if it uses forward attention. In general, it aligns faster.
"forward_attn_mask": false, // Additional masking forcing monotonicity only in eval mode.
"transition_agent": false, // enable/disable transition agent of forward attention.
"location_attn": true, // enable_disable location sensitive attention. It is enabled for TACOTRON by default.
"bidirectional_decoder": false, // use https://arxiv.org/abs/1907.09006. Use it, if attention does not work well with your dataset.
"double_decoder_consistency": true, // use DDC explained here https://erogol.com/solving-attention-problems-of-tts-models-with-double-decoder-consistency-draft/
"ddc_r": 7, // reduction rate for coarse decoder.
// STOPNET
"stopnet": true, // Train stopnet predicting the end of synthesis.
"separate_stopnet": true, // Train stopnet seperately if 'stopnet==true'. It prevents stopnet loss to influence the rest of the model. It causes a better model, but it trains SLOWER.
// TENSORBOARD and LOGGING
"print_step": 25, // Number of steps to log training on console.
"tb_plot_step:": 100, // Number of steps to plot TB training figures.
"print_eval": false, // If True, it prints intermediate loss values in evalulation.
"save_step": 10000, // Number of training steps expected to save traninpg stats and checkpoints.
"checkpoint": true, // If true, it saves checkpoints per "save_step"
"tb_model_param_stats": false, // true, plots param stats per layer on tensorboard. Might be memory consuming, but good for debugging.
// DATA LOADING
"text_cleaner": "phoneme_cleaners",
"enable_eos_bos_chars": false, // enable/disable beginning of sentence and end of sentence chars.
"num_loader_workers": 4, // number of training data loader processes. Don't set it too big. 4-8 are good values.
"num_val_loader_workers": 4, // number of evaluation data loader processes.
"batch_group_size": 0, //Number of batches to shuffle after bucketing.
"min_seq_len": 6, // DATASET-RELATED: minimum text length to use in training
"max_seq_len": 153, // DATASET-RELATED: maximum text length
// PATHS
"output_path": "/home/erogol/Models/LJSpeech/",
// PHONEMES
"phoneme_cache_path": "/media/erogol/data_ssd2/mozilla_us_phonemes_3", // phoneme computation is slow, therefore, it caches results in the given folder.
"use_phonemes": true, // use phonemes instead of raw characters. It is suggested for better pronounciation.
"phoneme_language": "en-us", // depending on your target language, pick one from https://github.com/bootphon/phonemizer#languages
// MULTI-SPEAKER and GST
"use_speaker_embedding": false, // use speaker embedding to enable multi-speaker learning.
"use_gst": true, // use global style tokens
"gst": { // gst parameter if gst is enabled
"gst_style_input": null, // Condition the style input either on a
// -> wave file [path to wave] or
// -> dictionary using the style tokens {'token1': 'value', 'token2': 'value'} example {"0": 0.15, "1": 0.15, "5": -0.15}
// with the dictionary being len(dict) <= len(gst_style_tokens).
"gst_embedding_dim": 512,
"gst_num_heads": 4,
"gst_style_tokens": 10
},
// DATASETS
"datasets": // List of datasets. They all merged and they get different speaker_ids.
[
{
"name": "ljspeech",
"path": "/home/erogol/Data/LJSpeech-1.1/",
"meta_file_train": "metadata.csv",
"meta_file_val": null
}
]
}
>>>>>>> Added support for Tacotron2 GST + abbility to condition style input with wav or tokens:config.json

View File

@ -83,6 +83,20 @@
"use_phonemes": false, // use phonemes instead of raw characters. It is suggested for better pronounciation.
"phoneme_language": "en-us", // depending on your target language, pick one from https://github.com/bootphon/phonemizer#languages
"text_cleaner": "phoneme_cleaners",
"use_speaker_embedding": false // whether to use additional embeddings for separate speakers
"use_speaker_embedding": false, // whether to use additional embeddings for separate speakers
// MULTI-SPEAKER and GST
"use_speaker_embedding": false, // use speaker embedding to enable multi-speaker learning.
"use_gst": true, // use global style tokens
"gst": { // gst parameter if gst is enabled
"gst_style_input": null, // Condition the style input either on a
// -> wave file [path to wave] or
// -> dictionary using the style tokens {'token1': 'value', 'token2': 'value'} example {"0": 0.15, "1": 0.15, "5": -0.15}
// with the dictionary being len(dict) <= len(gst_style_tokens).
"gst_embedding_dim": 512,
"gst_num_heads": 4,
"gst_style_tokens": 10
}
}

View File

@ -4,7 +4,7 @@ import unittest
import torch as T
from tests import get_tests_input_path
from mozilla_voice_tts.speaker_encoder.loss import GE2ELoss
from mozilla_voice_tts.speaker_encoder.losses import GE2ELoss, AngleProtoLoss
from mozilla_voice_tts.speaker_encoder.model import SpeakerEncoder
from mozilla_voice_tts.utils.io import load_config
@ -59,6 +59,7 @@ class GE2ELossTests(unittest.TestCase):
dummy_input = T.ones(4, 5, 64) # num_speaker x num_utterance x dim
loss = GE2ELoss(loss_method="softmax")
output = loss.forward(dummy_input)
assert output.item() >= 0.0
# check speaker loss with orthogonal d-vectors
dummy_input = T.empty(3, 64)
dummy_input = T.nn.init.orthogonal(dummy_input)
@ -73,6 +74,34 @@ class GE2ELossTests(unittest.TestCase):
output = loss.forward(dummy_input)
assert output.item() < 0.005
class AngleProtoLossTests(unittest.TestCase):
# pylint: disable=R0201
def test_in_out(self):
# check random input
dummy_input = T.rand(4, 5, 64) # num_speaker x num_utterance x dim
loss = AngleProtoLoss()
output = loss.forward(dummy_input)
assert output.item() >= 0.0
# check all zeros
dummy_input = T.ones(4, 5, 64) # num_speaker x num_utterance x dim
loss = AngleProtoLoss()
output = loss.forward(dummy_input)
assert output.item() >= 0.0
# check speaker loss with orthogonal d-vectors
dummy_input = T.empty(3, 64)
dummy_input = T.nn.init.orthogonal(dummy_input)
dummy_input = T.cat(
[
dummy_input[0].repeat(5, 1, 1).transpose(0, 1),
dummy_input[1].repeat(5, 1, 1).transpose(0, 1),
dummy_input[2].repeat(5, 1, 1).transpose(0, 1),
]
) # num_speaker x num_utterance x dim
loss = AngleProtoLoss()
output = loss.forward(dummy_input)
assert output.item() < 0.005
# class LoaderTest(unittest.TestCase):
# def test_output(self):
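The tests above drive the new losses directly on random embedding batches. A minimal sketch of how one of them could sit in a speaker-encoder training step (the random tensor stands in for an encoder output; this is not code from the PR):

```python
import torch
from mozilla_voice_tts.speaker_encoder.losses import GE2ELoss

criterion = GE2ELoss(loss_method="softmax")
# num_speakers x num_utterances x dim, as in the tests above
embeddings = torch.rand(4, 5, 64, requires_grad=True)
loss = criterion(embeddings)
loss.backward()          # gradients flow back into the (stand-in) encoder output
print(loss.item())
```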

View File

@ -58,8 +58,7 @@ class DecoderTests(unittest.TestCase):
trans_agent=True,
forward_attn_mask=True,
location_attn=True,
separate_stopnet=True,
speaker_embedding_dim=0)
separate_stopnet=True)
dummy_input = T.rand(4, 8, 256)
dummy_memory = T.rand(4, 2, 80)
@ -71,38 +70,6 @@ class DecoderTests(unittest.TestCase):
assert output.shape[2] == 2, "size not {}".format(output.shape[2])
assert stop_tokens.shape[0] == 4
@staticmethod
def test_in_out_multispeaker():
layer = Decoder(
in_channels=256,
frame_channels=80,
r=2,
memory_size=4,
attn_windowing=False,
attn_norm="sigmoid",
attn_K=5,
attn_type="graves",
prenet_type='original',
prenet_dropout=True,
forward_attn=True,
trans_agent=True,
forward_attn_mask=True,
location_attn=True,
separate_stopnet=True,
speaker_embedding_dim=80)
dummy_input = T.rand(4, 8, 256)
dummy_memory = T.rand(4, 2, 80)
dummy_embed = T.rand(4, 80)
output, alignment, stop_tokens = layer(
dummy_input, dummy_memory, mask=None, speaker_embeddings=dummy_embed)
assert output.shape[0] == 4
assert output.shape[1] == 80, "size not {}".format(output.shape[1])
assert output.shape[2] == 2, "size not {}".format(output.shape[2])
assert stop_tokens.shape[0] == 4
class EncoderTests(unittest.TestCase):
def test_in_out(self): #pylint: disable=no-self-use
layer = Encoder(128)

View File

@ -9,6 +9,7 @@ from torch import nn, optim
from mozilla_voice_tts.tts.layers.losses import MSELossMasked
from mozilla_voice_tts.tts.models.tacotron2 import Tacotron2
from mozilla_voice_tts.utils.io import load_config
from mozilla_voice_tts.utils.audio import AudioProcessor
#pylint: disable=unused-variable
@ -18,6 +19,9 @@ device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
c = load_config(os.path.join(get_tests_input_path(), 'test_config.json'))
ap = AudioProcessor(**c.audio)
WAV_FILE = os.path.join(get_tests_input_path(), "example_1.wav")
class TacotronTrainTest(unittest.TestCase):
def test_train_step(self): # pylint: disable=no-self-use
@ -70,3 +74,167 @@ class TacotronTrainTest(unittest.TestCase):
), "param {} with shape {} not updated!! \n{}\n{}".format(
count, param.shape, param, param_ref)
count += 1
class MultiSpeakerTacotronTrainTest(unittest.TestCase):
@staticmethod
def test_train_step():
input_dummy = torch.randint(0, 24, (8, 128)).long().to(device)
input_lengths = torch.randint(100, 128, (8, )).long().to(device)
input_lengths = torch.sort(input_lengths, descending=True)[0]
mel_spec = torch.rand(8, 30, c.audio['num_mels']).to(device)
mel_postnet_spec = torch.rand(8, 30, c.audio['num_mels']).to(device)
mel_lengths = torch.randint(20, 30, (8, )).long().to(device)
mel_lengths[0] = 30
stop_targets = torch.zeros(8, 30, 1).float().to(device)
speaker_embeddings = torch.rand(8, 55).to(device)
for idx in mel_lengths:
stop_targets[:, int(idx.item()):, 0] = 1.0
stop_targets = stop_targets.view(input_dummy.shape[0],
stop_targets.size(1) // c.r, -1)
stop_targets = (stop_targets.sum(2) > 0.0).unsqueeze(2).float().squeeze()
criterion = MSELossMasked(seq_len_norm=False).to(device)
criterion_st = nn.BCEWithLogitsLoss().to(device)
model = Tacotron2(num_chars=24, r=c.r, num_speakers=5, speaker_embedding_dim=55).to(device)
model.train()
model_ref = copy.deepcopy(model)
count = 0
for param, param_ref in zip(model.parameters(),
model_ref.parameters()):
assert (param - param_ref).sum() == 0, param
count += 1
optimizer = optim.Adam(model.parameters(), lr=c.lr)
for i in range(5):
mel_out, mel_postnet_out, align, stop_tokens = model.forward(
input_dummy, input_lengths, mel_spec, mel_lengths, speaker_embeddings=speaker_embeddings)
assert torch.sigmoid(stop_tokens).data.max() <= 1.0
assert torch.sigmoid(stop_tokens).data.min() >= 0.0
optimizer.zero_grad()
loss = criterion(mel_out, mel_spec, mel_lengths)
stop_loss = criterion_st(stop_tokens, stop_targets)
loss = loss + criterion(mel_postnet_out, mel_postnet_spec, mel_lengths) + stop_loss
loss.backward()
optimizer.step()
# check parameter changes
count = 0
for param, param_ref in zip(model.parameters(),
model_ref.parameters()):
# ignore the pre-highway layer since it is used conditionally
# if count not in [145, 59]:
assert (param != param_ref).any(
), "param {} with shape {} not updated!! \n{}\n{}".format(
count, param.shape, param, param_ref)
count += 1
class TacotronGSTTrainTest(unittest.TestCase):
#pylint: disable=no-self-use
def test_train_step(self):
# with random gst mel style
input_dummy = torch.randint(0, 24, (8, 128)).long().to(device)
input_lengths = torch.randint(100, 128, (8, )).long().to(device)
input_lengths = torch.sort(input_lengths, descending=True)[0]
mel_spec = torch.rand(8, 30, c.audio['num_mels']).to(device)
mel_postnet_spec = torch.rand(8, 30, c.audio['num_mels']).to(device)
mel_lengths = torch.randint(20, 30, (8, )).long().to(device)
mel_lengths[0] = 30
stop_targets = torch.zeros(8, 30, 1).float().to(device)
speaker_ids = torch.randint(0, 5, (8, )).long().to(device)
for idx in mel_lengths:
stop_targets[:, int(idx.item()):, 0] = 1.0
stop_targets = stop_targets.view(input_dummy.shape[0],
stop_targets.size(1) // c.r, -1)
stop_targets = (stop_targets.sum(2) > 0.0).unsqueeze(2).float().squeeze()
criterion = MSELossMasked(seq_len_norm=False).to(device)
criterion_st = nn.BCEWithLogitsLoss().to(device)
model = Tacotron2(num_chars=24, r=c.r, num_speakers=5, gst=True, gst_embedding_dim=c.gst['gst_embedding_dim'], gst_num_heads=c.gst['gst_num_heads'], gst_style_tokens=c.gst['gst_style_tokens']).to(device)
model.train()
model_ref = copy.deepcopy(model)
count = 0
for param, param_ref in zip(model.parameters(), model_ref.parameters()):
assert (param - param_ref).sum() == 0, param
count += 1
optimizer = optim.Adam(model.parameters(), lr=c.lr)
for i in range(10):
mel_out, mel_postnet_out, align, stop_tokens = model.forward(
input_dummy, input_lengths, mel_spec, mel_lengths, speaker_ids)
assert torch.sigmoid(stop_tokens).data.max() <= 1.0
assert torch.sigmoid(stop_tokens).data.min() >= 0.0
optimizer.zero_grad()
loss = criterion(mel_out, mel_spec, mel_lengths)
stop_loss = criterion_st(stop_tokens, stop_targets)
loss = loss + criterion(mel_postnet_out, mel_postnet_spec, mel_lengths) + stop_loss
loss.backward()
optimizer.step()
# check parameter changes
count = 0
for name_param, param_ref in zip(model.named_parameters(), model_ref.parameters()):
# ignore the pre-highway layer since it is used conditionally
# if count not in [145, 59]:
name, param = name_param
if name == 'gst_layer.encoder.recurrence.weight_hh_l0':
#print(param.grad)
continue
assert (param != param_ref).any(
), "param {} {} with shape {} not updated!! \n{}\n{}".format(
name, count, param.shape, param, param_ref)
count += 1
# with file gst style
mel_spec = torch.FloatTensor(ap.melspectrogram(ap.load_wav(WAV_FILE)))[:, :30].unsqueeze(0).transpose(1, 2).to(device)
mel_spec = mel_spec.repeat(8, 1, 1)
input_dummy = torch.randint(0, 24, (8, 128)).long().to(device)
input_lengths = torch.randint(100, 128, (8, )).long().to(device)
input_lengths = torch.sort(input_lengths, descending=True)[0]
mel_postnet_spec = torch.rand(8, 30, c.audio['num_mels']).to(device)
mel_lengths = torch.randint(20, 30, (8, )).long().to(device)
mel_lengths[0] = 30
stop_targets = torch.zeros(8, 30, 1).float().to(device)
speaker_ids = torch.randint(0, 5, (8, )).long().to(device)
for idx in mel_lengths:
stop_targets[:, int(idx.item()):, 0] = 1.0
stop_targets = stop_targets.view(input_dummy.shape[0],
stop_targets.size(1) // c.r, -1)
stop_targets = (stop_targets.sum(2) > 0.0).unsqueeze(2).float().squeeze()
criterion = MSELossMasked(seq_len_norm=False).to(device)
criterion_st = nn.BCEWithLogitsLoss().to(device)
model = Tacotron2(num_chars=24, r=c.r, num_speakers=5, gst=True, gst_embedding_dim=c.gst['gst_embedding_dim'], gst_num_heads=c.gst['gst_num_heads'], gst_style_tokens=c.gst['gst_style_tokens']).to(device)
model.train()
model_ref = copy.deepcopy(model)
count = 0
for param, param_ref in zip(model.parameters(), model_ref.parameters()):
assert (param - param_ref).sum() == 0, param
count += 1
optimizer = optim.Adam(model.parameters(), lr=c.lr)
for i in range(10):
mel_out, mel_postnet_out, align, stop_tokens = model.forward(
input_dummy, input_lengths, mel_spec, mel_lengths, speaker_ids)
assert torch.sigmoid(stop_tokens).data.max() <= 1.0
assert torch.sigmoid(stop_tokens).data.min() >= 0.0
optimizer.zero_grad()
loss = criterion(mel_out, mel_spec, mel_lengths)
stop_loss = criterion_st(stop_tokens, stop_targets)
loss = loss + criterion(mel_postnet_out, mel_postnet_spec, mel_lengths) + stop_loss
loss.backward()
optimizer.step()
# check parameter changes
count = 0
for name_param, param_ref in zip(model.named_parameters(), model_ref.parameters()):
# ignore the pre-highway layer since it is used conditionally
# if count not in [145, 59]:
name, param = name_param
if name == 'gst_layer.encoder.recurrence.weight_hh_l0':
#print(param.grad)
continue
assert (param != param_ref).any(
), "param {} {} with shape {} not updated!! \n{}\n{}".format(
name, count, param.shape, param, param_ref)
count += 1

View File

@ -9,6 +9,7 @@ from torch import nn, optim
from mozilla_voice_tts.tts.layers.losses import L1LossMasked
from mozilla_voice_tts.tts.models.tacotron import Tacotron
from mozilla_voice_tts.utils.io import load_config
from mozilla_voice_tts.utils.audio import AudioProcessor
#pylint: disable=unused-variable
@ -18,6 +19,9 @@ device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
c = load_config(os.path.join(get_tests_input_path(), 'test_config.json'))
ap = AudioProcessor(**c.audio)
WAV_FILE = os.path.join(get_tests_input_path(), "example_1.wav")
def count_parameters(model):
r"""Count number of trainable parameters in a network"""
@ -31,7 +35,7 @@ class TacotronTrainTest(unittest.TestCase):
input_lengths = torch.randint(100, 129, (8, )).long().to(device)
input_lengths[-1] = 128
mel_spec = torch.rand(8, 30, c.audio['num_mels']).to(device)
linear_spec = torch.rand(8, 30, c.audio['num_freq']).to(device)
linear_spec = torch.rand(8, 30, c.audio['fft_size']).to(device)
mel_lengths = torch.randint(20, 30, (8, )).long().to(device)
stop_targets = torch.zeros(8, 30, 1).float().to(device)
speaker_ids = torch.randint(0, 5, (8, )).long().to(device)
@ -49,7 +53,7 @@ class TacotronTrainTest(unittest.TestCase):
model = Tacotron(
num_chars=32,
num_speakers=5,
postnet_output_dim=c.audio['num_freq'],
postnet_output_dim=c.audio['fft_size'],
decoder_output_dim=c.audio['num_mels'],
r=c.r,
memory_size=c.memory_size
@ -85,15 +89,78 @@ class TacotronTrainTest(unittest.TestCase):
count, param.shape, param, param_ref)
count += 1
class TacotronGSTTrainTest(unittest.TestCase):
class MultiSpeakerTacotronTrainTest(unittest.TestCase):
@staticmethod
def test_train_step():
input_dummy = torch.randint(0, 24, (8, 128)).long().to(device)
input_lengths = torch.randint(100, 129, (8, )).long().to(device)
input_lengths[-1] = 128
mel_spec = torch.rand(8, 30, c.audio['num_mels']).to(device)
linear_spec = torch.rand(8, 30, c.audio['fft_size']).to(device)
mel_lengths = torch.randint(20, 30, (8, )).long().to(device)
stop_targets = torch.zeros(8, 30, 1).float().to(device)
speaker_embeddings = torch.rand(8, 55).to(device)
for idx in mel_lengths:
stop_targets[:, int(idx.item()):, 0] = 1.0
stop_targets = stop_targets.view(input_dummy.shape[0],
stop_targets.size(1) // c.r, -1)
stop_targets = (stop_targets.sum(2) >
0.0).unsqueeze(2).float().squeeze()
criterion = L1LossMasked(seq_len_norm=False).to(device)
criterion_st = nn.BCEWithLogitsLoss().to(device)
model = Tacotron(
num_chars=32,
num_speakers=5,
postnet_output_dim=c.audio['fft_size'],
decoder_output_dim=c.audio['num_mels'],
r=c.r,
memory_size=c.memory_size,
speaker_embedding_dim=55,
).to(device) #FIXME: missing num_speakers parameter to Tacotron ctor
model.train()
print(" > Num parameters for Tacotron model:%s" %
(count_parameters(model)))
model_ref = copy.deepcopy(model)
count = 0
for param, param_ref in zip(model.parameters(),
model_ref.parameters()):
assert (param - param_ref).sum() == 0, param
count += 1
optimizer = optim.Adam(model.parameters(), lr=c.lr)
for _ in range(5):
mel_out, linear_out, align, stop_tokens = model.forward(
input_dummy, input_lengths, mel_spec, mel_lengths,
speaker_embeddings=speaker_embeddings)
optimizer.zero_grad()
loss = criterion(mel_out, mel_spec, mel_lengths)
stop_loss = criterion_st(stop_tokens, stop_targets)
loss = loss + criterion(linear_out, linear_spec,
mel_lengths) + stop_loss
loss.backward()
optimizer.step()
# check parameter changes
count = 0
for param, param_ref in zip(model.parameters(),
model_ref.parameters()):
# ignore the pre-highway layer since it is used conditionally
# if count not in [145, 59]:
assert (param != param_ref).any(
), "param {} with shape {} not updated!! \n{}\n{}".format(
count, param.shape, param, param_ref)
count += 1
class TacotronGSTTrainTest(unittest.TestCase):
@staticmethod
def test_train_step():
# with random gst mel style
input_dummy = torch.randint(0, 24, (8, 128)).long().to(device)
input_lengths = torch.randint(100, 129, (8, )).long().to(device)
input_lengths[-1] = 128
mel_spec = torch.rand(8, 120, c.audio['num_mels']).to(device)
linear_spec = torch.rand(8, 120, c.audio['num_freq']).to(device)
linear_spec = torch.rand(8, 120, c.audio['fft_size']).to(device)
mel_lengths = torch.randint(20, 120, (8, )).long().to(device)
mel_lengths[-1] = 120
stop_targets = torch.zeros(8, 120, 1).float().to(device)
@ -113,13 +180,82 @@ class TacotronGSTTrainTest(unittest.TestCase):
num_chars=32,
num_speakers=5,
gst=True,
postnet_output_dim=c.audio['num_freq'],
gst_embedding_dim=c.gst['gst_embedding_dim'],
gst_num_heads=c.gst['gst_num_heads'],
gst_style_tokens=c.gst['gst_style_tokens'],
postnet_output_dim=c.audio['fft_size'],
decoder_output_dim=c.audio['num_mels'],
r=c.r,
memory_size=c.memory_size
).to(device) #FIXME: missing num_speakers parameter to Tacotron ctor
model.train()
print(model)
# print(model)
print(" > Num parameters for Tacotron GST model:%s" %
(count_parameters(model)))
model_ref = copy.deepcopy(model)
count = 0
for param, param_ref in zip(model.parameters(),
model_ref.parameters()):
assert (param - param_ref).sum() == 0, param
count += 1
optimizer = optim.Adam(model.parameters(), lr=c.lr)
for _ in range(10):
mel_out, linear_out, align, stop_tokens = model.forward(
input_dummy, input_lengths, mel_spec, mel_lengths, speaker_ids)
optimizer.zero_grad()
loss = criterion(mel_out, mel_spec, mel_lengths)
stop_loss = criterion_st(stop_tokens, stop_targets)
loss = loss + criterion(linear_out, linear_spec,
mel_lengths) + stop_loss
loss.backward()
optimizer.step()
# check parameter changes
count = 0
for param, param_ref in zip(model.parameters(),
model_ref.parameters()):
# ignore the pre-highway layer since it is used conditionally
assert (param != param_ref).any(
), "param {} with shape {} not updated!! \n{}\n{}".format(
count, param.shape, param, param_ref)
count += 1
# with file gst style
mel_spec = torch.FloatTensor(ap.melspectrogram(ap.load_wav(WAV_FILE)))[:, :120].unsqueeze(0).transpose(1, 2).to(device)
mel_spec = mel_spec.repeat(8, 1, 1)
input_dummy = torch.randint(0, 24, (8, 128)).long().to(device)
input_lengths = torch.randint(100, 129, (8, )).long().to(device)
input_lengths[-1] = 128
linear_spec = torch.rand(8, mel_spec.size(1), c.audio['fft_size']).to(device)
mel_lengths = torch.randint(20, mel_spec.size(1), (8, )).long().to(device)
mel_lengths[-1] = mel_spec.size(1)
stop_targets = torch.zeros(8, mel_spec.size(1), 1).float().to(device)
speaker_ids = torch.randint(0, 5, (8, )).long().to(device)
for idx in mel_lengths:
stop_targets[:, int(idx.item()):, 0] = 1.0
stop_targets = stop_targets.view(input_dummy.shape[0],
stop_targets.size(1) // c.r, -1)
stop_targets = (stop_targets.sum(2) >
0.0).unsqueeze(2).float().squeeze()
criterion = L1LossMasked(seq_len_norm=False).to(device)
criterion_st = nn.BCEWithLogitsLoss().to(device)
model = Tacotron(
num_chars=32,
num_speakers=5,
gst=True,
gst_embedding_dim=c.gst['gst_embedding_dim'],
gst_num_heads=c.gst['gst_num_heads'],
gst_style_tokens=c.gst['gst_style_tokens'],
postnet_output_dim=c.audio['fft_size'],
decoder_output_dim=c.audio['num_mels'],
r=c.r,
memory_size=c.memory_size
).to(device) #FIXME: missing num_speakers parameter to Tacotron ctor
model.train()
# print(model)
print(" > Num parameters for Tacotron GST model:%s" %
(count_parameters(model)))
model_ref = copy.deepcopy(model)

374
utils/generic_utils.py Normal file
View File

@ -0,0 +1,374 @@
import os
import glob
import torch
import shutil
import datetime
import subprocess
import importlib
import numpy as np
from collections import Counter
def get_git_branch():
try:
out = subprocess.check_output(["git", "branch"]).decode("utf8")
current = next(line for line in out.split("\n")
if line.startswith("*"))
current.replace("* ", "")
except subprocess.CalledProcessError:
current = "inside_docker"
return current
def get_commit_hash():
"""https://stackoverflow.com/questions/14989858/get-the-current-git-hash-in-a-python-script"""
# try:
# subprocess.check_output(['git', 'diff-index', '--quiet',
# 'HEAD']) # Verify client is clean
# except:
# raise RuntimeError(
# " !! Commit before training to get the commit hash.")
try:
commit = subprocess.check_output(
['git', 'rev-parse', '--short', 'HEAD']).decode().strip()
# Not copying .git folder into docker container
except subprocess.CalledProcessError:
commit = "0000000"
print(' > Git Hash: {}'.format(commit))
return commit
def create_experiment_folder(root_path, model_name, debug):
""" Create a folder with the current date and time """
date_str = datetime.datetime.now().strftime("%B-%d-%Y_%I+%M%p")
if debug:
commit_hash = 'debug'
else:
commit_hash = get_commit_hash()
output_folder = os.path.join(
root_path, model_name + '-' + date_str + '-' + commit_hash)
os.makedirs(output_folder, exist_ok=True)
print(" > Experiment folder: {}".format(output_folder))
return output_folder
def remove_experiment_folder(experiment_path):
"""Check folder if there is a checkpoint, otherwise remove the folder"""
checkpoint_files = glob.glob(experiment_path + "/*.pth.tar")
if not checkpoint_files:
if os.path.exists(experiment_path):
shutil.rmtree(experiment_path, ignore_errors=True)
print(" ! Run is removed from {}".format(experiment_path))
else:
print(" ! Run is kept in {}".format(experiment_path))
def count_parameters(model):
r"""Count number of trainable parameters in a network"""
return sum(p.numel() for p in model.parameters() if p.requires_grad)
def split_dataset(items):
is_multi_speaker = False
speakers = [item[-1] for item in items]
is_multi_speaker = len(set(speakers)) > 1
eval_split_size = 500 if len(items) * 0.01 > 500 else int(
len(items) * 0.01)
assert eval_split_size > 0, " [!] You do not have enough samples to train. You need at least 100 samples."
np.random.seed(0)
np.random.shuffle(items)
if is_multi_speaker:
items_eval = []
# most stupid code ever -- Fix it !
while len(items_eval) < eval_split_size:
speakers = [item[-1] for item in items]
speaker_counter = Counter(speakers)
item_idx = np.random.randint(0, len(items))
if speaker_counter[items[item_idx][-1]] > 1:
items_eval.append(items[item_idx])
del items[item_idx]
return items_eval, items
return items[:eval_split_size], items[eval_split_size:]
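# Note on split_dataset above: it returns (eval_items, train_items); for multi-speaker data an
# utterance is only moved into the eval split if its speaker still has other utterances left for training.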
# from https://gist.github.com/jihunchoi/f1434a77df9db1bb337417854b398df1
def sequence_mask(sequence_length, max_len=None):
if max_len is None:
max_len = sequence_length.data.max()
batch_size = sequence_length.size(0)
seq_range = torch.arange(0, max_len).long()
seq_range_expand = seq_range.unsqueeze(0).expand(batch_size, max_len)
if sequence_length.is_cuda:
seq_range_expand = seq_range_expand.to(sequence_length.device)
seq_length_expand = (
sequence_length.unsqueeze(1).expand_as(seq_range_expand))
# B x T_max
return seq_range_expand < seq_length_expand
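# Worked example: sequence_mask(torch.tensor([2, 4]), max_len=4) returns
#   [[True, True, False, False],
#    [True, True, True,  True ]]
# (dtype is bool or uint8 depending on the torch version)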
def set_init_dict(model_dict, checkpoint_state, c):
# Partial initialization: if there is a mismatch between a new and an old layer, it is skipped.
for k, v in checkpoint_state.items():
if k not in model_dict:
print(" | > Layer missing in the model definition: {}".format(k))
# 1. filter out unnecessary keys
pretrained_dict = {
k: v
for k, v in checkpoint_state.items() if k in model_dict
}
# 2. filter out different size layers
pretrained_dict = {
k: v
for k, v in pretrained_dict.items()
if v.numel() == model_dict[k].numel()
}
# 3. skip reinit layers
if c.reinit_layers is not None:
for reinit_layer_name in c.reinit_layers:
pretrained_dict = {
k: v
for k, v in pretrained_dict.items()
if reinit_layer_name not in k
}
# 4. overwrite entries in the existing state dict
model_dict.update(pretrained_dict)
print(" | > {} / {} layers are restored.".format(len(pretrained_dict),
len(model_dict)))
return model_dict
def setup_model(num_chars, num_speakers, c):
print(" > Using model: {}".format(c.model))
MyModel = importlib.import_module('TTS.models.' + c.model.lower())
MyModel = getattr(MyModel, c.model)
if c.model.lower() in "tacotron":
model = MyModel(num_chars=num_chars,
num_speakers=num_speakers,
r=c.r,
postnet_output_dim=int(c.audio['fft_size'] / 2 + 1),
decoder_output_dim=c.audio['num_mels'],
gst=c.use_gst,
gst_embedding_dim=c.gst['gst_embedding_dim'],
gst_num_heads=c.gst['gst_num_heads'],
gst_style_tokens=c.gst['gst_style_tokens'],
memory_size=c.memory_size,
attn_type=c.attention_type,
attn_win=c.windowing,
attn_norm=c.attention_norm,
prenet_type=c.prenet_type,
prenet_dropout=c.prenet_dropout,
forward_attn=c.use_forward_attn,
trans_agent=c.transition_agent,
forward_attn_mask=c.forward_attn_mask,
location_attn=c.location_attn,
attn_K=c.attention_heads,
separate_stopnet=c.separate_stopnet,
bidirectional_decoder=c.bidirectional_decoder,
double_decoder_consistency=c.double_decoder_consistency,
ddc_r=c.ddc_r)
elif c.model.lower() == "tacotron2":
model = MyModel(num_chars=num_chars,
num_speakers=num_speakers,
r=c.r,
postnet_output_dim=c.audio['num_mels'],
decoder_output_dim=c.audio['num_mels'],
gst=c.use_gst,
gst_embedding_dim=c.gst['gst_embedding_dim'],
gst_num_heads=c.gst['gst_num_heads'],
gst_style_tokens=c.gst['gst_style_tokens'],
attn_type=c.attention_type,
attn_win=c.windowing,
attn_norm=c.attention_norm,
prenet_type=c.prenet_type,
prenet_dropout=c.prenet_dropout,
forward_attn=c.use_forward_attn,
trans_agent=c.transition_agent,
forward_attn_mask=c.forward_attn_mask,
location_attn=c.location_attn,
attn_K=c.attention_heads,
separate_stopnet=c.separate_stopnet,
bidirectional_decoder=c.bidirectional_decoder,
double_decoder_consistency=c.double_decoder_consistency,
ddc_r=c.ddc_r)
return model
class KeepAverage():
def __init__(self):
self.avg_values = {}
self.iters = {}
def __getitem__(self, key):
return self.avg_values[key]
def items(self):
return self.avg_values.items()
def add_value(self, name, init_val=0, init_iter=0):
self.avg_values[name] = init_val
self.iters[name] = init_iter
def update_value(self, name, value, weighted_avg=False):
if name not in self.avg_values:
# add value if not exist before
self.add_value(name, init_val=value)
else:
# else update existing value
if weighted_avg:
self.avg_values[name] = 0.99 * self.avg_values[name] + 0.01 * value
self.iters[name] += 1
else:
self.avg_values[name] = self.avg_values[name] * \
self.iters[name] + value
self.iters[name] += 1
self.avg_values[name] /= self.iters[name]
def add_values(self, name_dict):
for key, value in name_dict.items():
self.add_value(key, init_val=value)
def update_values(self, value_dict):
for key, value in value_dict.items():
self.update_value(key, value)
def _check_argument(name, c, enum_list=None, max_val=None, min_val=None, restricted=False, val_type=None, alternative=None):
if alternative in c.keys() and c[alternative] is not None:
return
if restricted:
assert name in c.keys(), f' [!] {name} not defined in config.json'
if name in c.keys():
if max_val:
assert c[name] <= max_val, f' [!] {name} is larger than max value {max_val}'
if min_val:
assert c[name] >= min_val, f' [!] {name} is smaller than min value {min_val}'
if enum_list:
assert c[name].lower() in enum_list, f' [!] {name} is not a valid value'
if val_type:
assert isinstance(c[name], val_type) or c[name] is None, f' [!] {name} has wrong type - {type(c[name])} vs {val_type}'
def check_config(c):
_check_argument('model', c, enum_list=['tacotron', 'tacotron2'], restricted=True, val_type=str)
_check_argument('run_name', c, restricted=True, val_type=str)
_check_argument('run_description', c, val_type=str)
# AUDIO
_check_argument('audio', c, restricted=True, val_type=dict)
# audio processing parameters
_check_argument('num_mels', c['audio'], restricted=True, val_type=int, min_val=10, max_val=2056)
_check_argument('fft_size', c['audio'], restricted=True, val_type=int, min_val=128, max_val=4058)
_check_argument('sample_rate', c['audio'], restricted=True, val_type=int, min_val=512, max_val=100000)
_check_argument('frame_length_ms', c['audio'], restricted=True, val_type=float, min_val=10, max_val=1000, alternative='win_length')
_check_argument('frame_shift_ms', c['audio'], restricted=True, val_type=float, min_val=1, max_val=1000, alternative='hop_length')
_check_argument('preemphasis', c['audio'], restricted=True, val_type=float, min_val=0, max_val=1)
_check_argument('min_level_db', c['audio'], restricted=True, val_type=int, min_val=-1000, max_val=10)
_check_argument('ref_level_db', c['audio'], restricted=True, val_type=int, min_val=0, max_val=1000)
_check_argument('power', c['audio'], restricted=True, val_type=float, min_val=1, max_val=5)
_check_argument('griffin_lim_iters', c['audio'], restricted=True, val_type=int, min_val=10, max_val=1000)
# vocabulary parameters
_check_argument('characters', c, restricted=False, val_type=dict)
_check_argument('pad', c['characters'] if 'characters' in c.keys() else {}, restricted='characters' in c.keys(), val_type=str)
_check_argument('eos', c['characters'] if 'characters' in c.keys() else {}, restricted='characters' in c.keys(), val_type=str)
_check_argument('bos', c['characters'] if 'characters' in c.keys() else {}, restricted='characters' in c.keys(), val_type=str)
_check_argument('characters', c['characters'] if 'characters' in c.keys() else {}, restricted='characters' in c.keys(), val_type=str)
_check_argument('phonemes', c['characters'] if 'characters' in c.keys() else {}, restricted='characters' in c.keys(), val_type=str)
_check_argument('punctuations', c['characters'] if 'characters' in c.keys() else {}, restricted='characters' in c.keys(), val_type=str)
# normalization parameters
_check_argument('signal_norm', c['audio'], restricted=True, val_type=bool)
_check_argument('symmetric_norm', c['audio'], restricted=True, val_type=bool)
_check_argument('max_norm', c['audio'], restricted=True, val_type=float, min_val=0.1, max_val=1000)
_check_argument('clip_norm', c['audio'], restricted=True, val_type=bool)
_check_argument('mel_fmin', c['audio'], restricted=True, val_type=float, min_val=0.0, max_val=1000)
_check_argument('mel_fmax', c['audio'], restricted=True, val_type=float, min_val=500.0)
_check_argument('spec_gain', c['audio'], restricted=True, val_type=float, min_val=1, max_val=100)
_check_argument('do_trim_silence', c['audio'], restricted=True, val_type=bool)
_check_argument('trim_db', c['audio'], restricted=True, val_type=int)
# training parameters
_check_argument('batch_size', c, restricted=True, val_type=int, min_val=1)
_check_argument('eval_batch_size', c, restricted=True, val_type=int, min_val=1)
_check_argument('r', c, restricted=True, val_type=int, min_val=1)
_check_argument('gradual_training', c, restricted=False, val_type=list)
_check_argument('loss_masking', c, restricted=True, val_type=bool)
# _check_argument('grad_accum', c, restricted=True, val_type=int, min_val=1, max_val=100)
# validation parameters
_check_argument('run_eval', c, restricted=True, val_type=bool)
_check_argument('test_delay_epochs', c, restricted=True, val_type=int, min_val=0)
_check_argument('test_sentences_file', c, restricted=False, val_type=str)
# optimizer
_check_argument('noam_schedule', c, restricted=False, val_type=bool)
_check_argument('grad_clip', c, restricted=True, val_type=float, min_val=0.0)
_check_argument('epochs', c, restricted=True, val_type=int, min_val=1)
_check_argument('lr', c, restricted=True, val_type=float, min_val=0)
_check_argument('wd', c, restricted=True, val_type=float, min_val=0)
_check_argument('warmup_steps', c, restricted=True, val_type=int, min_val=0)
_check_argument('seq_len_norm', c, restricted=True, val_type=bool)
# tacotron prenet
_check_argument('memory_size', c, restricted=True, val_type=int, min_val=-1)
_check_argument('prenet_type', c, restricted=True, val_type=str, enum_list=['original', 'bn'])
_check_argument('prenet_dropout', c, restricted=True, val_type=bool)
# attention
_check_argument('attention_type', c, restricted=True, val_type=str, enum_list=['graves', 'original'])
_check_argument('attention_heads', c, restricted=True, val_type=int)
_check_argument('attention_norm', c, restricted=True, val_type=str, enum_list=['sigmoid', 'softmax'])
_check_argument('windowing', c, restricted=True, val_type=bool)
_check_argument('use_forward_attn', c, restricted=True, val_type=bool)
_check_argument('forward_attn_mask', c, restricted=True, val_type=bool)
_check_argument('transition_agent', c, restricted=True, val_type=bool)
_check_argument('location_attn', c, restricted=True, val_type=bool)
_check_argument('bidirectional_decoder', c, restricted=True, val_type=bool)
_check_argument('double_decoder_consistency', c, restricted=True, val_type=bool)
_check_argument('ddc_r', c, restricted='double_decoder_consistency' in c.keys(), min_val=1, max_val=7, val_type=int)
# stopnet
_check_argument('stopnet', c, restricted=True, val_type=bool)
_check_argument('separate_stopnet', c, restricted=True, val_type=bool)
# tensorboard
_check_argument('print_step', c, restricted=True, val_type=int, min_val=1)
_check_argument('tb_plot_step', c, restricted=True, val_type=int, min_val=1)
_check_argument('save_step', c, restricted=True, val_type=int, min_val=1)
_check_argument('checkpoint', c, restricted=True, val_type=bool)
_check_argument('tb_model_param_stats', c, restricted=True, val_type=bool)
# dataloading
# pylint: disable=import-outside-toplevel
from TTS.utils.text import cleaners
_check_argument('text_cleaner', c, restricted=True, val_type=str, enum_list=dir(cleaners))
_check_argument('enable_eos_bos_chars', c, restricted=True, val_type=bool)
_check_argument('num_loader_workers', c, restricted=True, val_type=int, min_val=0)
_check_argument('num_val_loader_workers', c, restricted=True, val_type=int, min_val=0)
_check_argument('batch_group_size', c, restricted=True, val_type=int, min_val=0)
_check_argument('min_seq_len', c, restricted=True, val_type=int, min_val=0)
_check_argument('max_seq_len', c, restricted=True, val_type=int, min_val=10)
# paths
_check_argument('output_path', c, restricted=True, val_type=str)
# multi-speaker
_check_argument('use_speaker_embedding', c, restricted=True, val_type=bool)
# GST
_check_argument('use_gst', c, restricted=True, val_type=bool)
_check_argument('gst', c, restricted=True, val_type=dict)
_check_argument('gst_style_input', c['gst'], restricted=True, val_type=str)
_check_argument('gst_embedding_dim', c['gst'], restricted=True, val_type=int, min_val=1)
_check_argument('gst_num_heads', c['gst'], restricted=True, val_type=int, min_val=1)
_check_argument('gst_style_tokens', c['gst'], restricted=True, val_type=int, min_val=1)
# datasets - checking only the first entry
_check_argument('datasets', c, restricted=True, val_type=list)
for dataset_entry in c['datasets']:
_check_argument('name', dataset_entry, restricted=True, val_type=str)
_check_argument('path', dataset_entry, restricted=True, val_type=str)
_check_argument('meta_file_train', dataset_entry, restricted=True, val_type=str)
_check_argument('meta_file_val', dataset_entry, restricted=True, val_type=str)
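# Usage sketch (assuming this module is importable as TTS.utils.generic_utils and the config
# is loaded with TTS.utils.io.load_config; values below are illustrative):
#
#   c = load_config('config.json')
#   check_config(c)                                         # raises AssertionError on bad or missing fields
#   model = setup_model(num_chars=129, num_speakers=0, c=c)
#   print(count_parameters(model), 'trainable parameters')
#
#   stats = KeepAverage()
#   stats.update_values({'loss': 1.23, 'stop_loss': 0.45})  # keeps a running average per key
#   print(stats['loss'])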