mirror of https://github.com/coqui-ai/TTS.git
commit 2344379cb8

@@ -1,5 +1,5 @@
# Number of days of inactivity before an issue becomes stale
daysUntilStale: 60
daysUntilStale: 30
# Number of days of inactivity before a stale issue is closed
daysUntilClose: 7
# Issues with these labels will never be considered stale
@@ -12,8 +12,7 @@ staleLabel: wontfix
markComment: >
  This issue has been automatically marked as stale because it has not had
  recent activity. It will be closed if no further activity occurs. Thank you
  for your contributions. You might also look our discourse page for further help.
  https://discourse.mozilla.org/c/tts
  for your contributions. You might also look our discussion channels.
# Comment to post when closing a stale issue. Set to `false` to disable
closeComment: false
@@ -35,7 +35,8 @@ jobs:
- name: Install dependencies
  run: |
    sudo apt update
    sudo apt install espeak-ng git
    sudo apt install -y espeak-ng git
    sudo apt install -y python3-wheel gcc
- name: Upgrade pip
  # so we can take advantage of pyproject.toml build-dependency support
  run: python3 -m pip install --upgrade pip
@@ -45,7 +46,7 @@ jobs:
    python3 setup.py egg_info
- name: Lint check
  run: |
    cardboardlinter -n auto
    cardboardlinter
- name: Unit tests
  run: nosetests tests --nocapture --processes=0 --process-timeout=20 --process-restartworker
- name: Test scripts
@@ -57,3 +58,5 @@ jobs:
    ./tests/test_vocoder_wavegrad_train.sh
    ./tests/test_vocoder_wavernn_train.sh
    ./tests/test_speedy_speech_train.sh
    ./tests/test_resample.sh
    ./tests/test_compute_statistics.sh
@@ -130,4 +130,6 @@ TODO.txt
data/*
notebooks/data/*
TTS/tts/layers/glow_tts/monotonic_align/core.c
temp_build/*
.vscode-upload.json
temp_build/*
recipes/*
@@ -74,12 +74,14 @@ Underlined "TTS*" and "Judy*" are 🐸TTS models
- Tacotron2: [paper](https://arxiv.org/abs/1712.05884)
- Glow-TTS: [paper](https://arxiv.org/abs/2005.11129)
- Speedy-Speech: [paper](https://arxiv.org/abs/2008.03802)
- Align-TTS: [paper](https://arxiv.org/abs/2003.01950)

### Attention Methods
- Guided Attention: [paper](https://arxiv.org/abs/1710.08969)
- Forward Backward Decoding: [paper](https://arxiv.org/abs/1907.09006)
- Graves Attention: [paper](https://arxiv.org/abs/1907.09006)
- Double Decoder Consistency: [blog](https://erogol.com/solving-attention-problems-of-tts-models-with-double-decoder-consistency/)
- Dynamic Convolutional Attention: [paper](https://arxiv.org/pdf/1910.10288.pdf)

### Speaker Encoder
- GE2E: [paper](https://arxiv.org/abs/1710.10467)
@@ -174,7 +176,7 @@ Run a tts and a vocoder model from the released model list. (Simply copy and pas
tts --text "Text for TTS" \
    --model_name "<type>/<language>/<dataset>/<model_name>" \
    --vocoder_name "<type>/<language>/<dataset>/<model_name>" \
    --out_path folder/to/save/output/
    --out_path folder/to/save/output.wav
```

Run your own TTS model (Using Griffin-Lim Vocoder)
@@ -182,7 +184,7 @@ Run your own TTS model (Using Griffin-Lim Vocoder)
tts --text "Text for TTS" \
    --model_path path/to/model.pth.tar \
    --config_path path/to/config.json \
    --out_path output/path/speech.wav
    --out_path folder/to/save/output.wav
```

Run your own TTS and Vocoder models
@@ -190,7 +192,7 @@ Run your own TTS and Vocoder models
tts --text "Text for TTS" \
    --model_path path/to/config.json \
    --config_path path/to/model.pth.tar \
    --out_path output/path/speech.wav \
    --out_path folder/to/save/output.wav \
    --vocoder_path path/to/vocoder.pth.tar \
    --vocoder_config_path path/to/vocoder_config.json
```
@@ -263,7 +265,6 @@ cardboardlinter --refspec master
Feel free to ping us at any step you need help using our communication channels.
[Here](https://github.com/firstcontributions/first-contributions) is a good resource for complete beginners.


### Acknowledgement
- https://github.com/keithito/tacotron (Dataset pre-processing)
- https://github.com/r9y9/tacotron_pytorch (Initial Tacotron architecture)
@@ -94,6 +94,16 @@
"contact": "egolge@coqui.com"
}
}
},
"de":{
"thorsten":{
"tacotron2-DCA":{
"github_rls_url": "https://github.com/coqui-ai/TTS/releases/download/v0.0.11/tts_models--de--thorsten--tacotron2-DCA.zip",
"default_vocoder": "vocoder_models/de/thorsten/wavegrad",
"author": "@thorstenMueller",
"commit": "unknown"
}
}
}
},
"vocoder_models":{
@@ -141,6 +151,15 @@
"commit": "unknown"
}
}
},
"de":{
"thorsten":{
"wavegrad":{
"github_rls_url": "https://github.com/coqui-ai/TTS/releases/download/v0.0.11/vocoder_models--de--thorsten--wavegrad.zip",
"author": "@thorstenMueller",
"commit": "unknown"
}
}
}
}
}
@@ -1,11 +1,7 @@
# %%
# %%
import argparse
from difflib import SequenceMatcher
import os
import sys
# %%
# print variable match
from pprint import pprint

import numpy as np
@@ -0,0 +1,68 @@
import argparse
import glob
import os
import librosa
from distutils.dir_util import copy_tree
from argparse import RawTextHelpFormatter
from multiprocessing import Pool
from tqdm import tqdm

def resample_file(func_args):
    filename, output_sr = func_args
    y, sr = librosa.load(filename, sr=output_sr)
    librosa.output.write_wav(filename, y, sr)

if __name__ == '__main__':

    parser = argparse.ArgumentParser(
        description='''Resample a folder recursively with librosa
Can be used in place or create a copy of the folder as an output.\n\n
Example run:
    python TTS/bin/resample.py
        --input_dir /root/LJSpeech-1.1/
        --output_sr 22050
        --output_dir /root/resampled_LJSpeech-1.1/
        --n_jobs 24
''',
        formatter_class=RawTextHelpFormatter)

    parser.add_argument('--input_dir',
                        type=str,
                        default=None,
                        required=True,
                        help='Path of the folder containing the audio files to resample')

    parser.add_argument('--output_sr',
                        type=int,
                        default=22050,
                        required=False,
                        help='Sample rate to which the audio files should be resampled')

    parser.add_argument('--output_dir',
                        type=str,
                        default=None,
                        required=False,
                        help='Path of the destination folder. If not defined, the operation is done in place')

    parser.add_argument('--n_jobs',
                        type=int,
                        default=None,
                        help='Number of threads to use, by default it uses all cores')

    args = parser.parse_args()

    if args.output_dir:
        print('Recursively copying the input folder...')
        copy_tree(args.input_dir, args.output_dir)
        args.input_dir = args.output_dir

    print('Resampling the audio files...')
    audio_files = glob.glob(os.path.join(args.input_dir, '**/*.wav'), recursive=True)
    print(f'Found {len(audio_files)} files...')
    audio_files = list(zip(audio_files, len(audio_files)*[args.output_sr]))
    with Pool(processes=args.n_jobs) as p:
        with tqdm(total=len(audio_files)) as pbar:
            for i, _ in enumerate(p.imap_unordered(resample_file, audio_files)):
                pbar.update()

    print('Done !')
@@ -2,9 +2,7 @@
# -*- coding: utf-8 -*-

import argparse
import os
import sys
import string
from argparse import RawTextHelpFormatter
# pylint: disable=redefined-outer-name, unused-argument
from pathlib import Path
@@ -103,8 +101,8 @@ def main():
    parser.add_argument(
        '--out_path',
        type=str,
        default=Path(__file__).resolve().parent,
        help='Path to save final wav file. Wav file will be named as the given text.',
        default='tts_output.wav',
        help='Output wav file path.',
    )
    parser.add_argument(
        '--use_cuda',
@@ -218,12 +216,8 @@ def main():
    wav = synthesizer.tts(args.text)

    # save the results
    file_name = args.text.replace(" ", "_")[0:20]
    file_name = file_name.translate(
        str.maketrans('', '', string.punctuation.replace('_', ''))) + '.wav'
    out_path = os.path.join(args.out_path, file_name)
    print(" > Saving output to {}".format(out_path))
    synthesizer.save_wav(wav, out_path,)
    print(" > Saving output to {}".format(args.out_path))
    synthesizer.save_wav(wav, args.out_path)


if __name__ == "__main__":
@ -0,0 +1,625 @@
|
|||
#!/usr/bin/env python3
|
||||
# -*- coding: utf-8 -*-
|
||||
|
||||
import os
|
||||
import sys
|
||||
import time
|
||||
import traceback
|
||||
from random import randrange
|
||||
|
||||
import numpy as np
|
||||
import torch
|
||||
from torch.nn.parallel import DistributedDataParallel as DDP_th
|
||||
from torch.utils.data import DataLoader
|
||||
from torch.utils.data.distributed import DistributedSampler
|
||||
from TTS.tts.datasets.preprocess import load_meta_data
|
||||
from TTS.tts.datasets.TTSDataset import MyDataset
|
||||
from TTS.tts.layers.losses import AlignTTSLoss
|
||||
from TTS.tts.utils.generic_utils import setup_model
|
||||
from TTS.tts.utils.io import save_best_model, save_checkpoint
|
||||
from TTS.tts.utils.measures import alignment_diagonal_score
|
||||
from TTS.tts.utils.speakers import parse_speakers
|
||||
from TTS.tts.utils.synthesis import synthesis
|
||||
from TTS.tts.utils.text.symbols import make_symbols, phonemes, symbols
|
||||
from TTS.tts.utils.visual import plot_alignment, plot_spectrogram
|
||||
from TTS.utils.arguments import parse_arguments, process_args
|
||||
from TTS.utils.audio import AudioProcessor
|
||||
from TTS.utils.distribute import init_distributed, reduce_tensor
|
||||
from TTS.utils.generic_utils import (KeepAverage, count_parameters,
|
||||
remove_experiment_folder, set_init_dict)
|
||||
from TTS.utils.radam import RAdam
|
||||
from TTS.utils.training import NoamLR, setup_torch_training_env
|
||||
|
||||
if __name__ == '__main__':
|
||||
use_cuda, num_gpus = setup_torch_training_env(True, False)
|
||||
# torch.autograd.set_detect_anomaly(True)
|
||||
|
||||
def setup_loader(ap, r, is_val=False, verbose=False):
|
||||
if is_val and not c.run_eval:
|
||||
loader = None
|
||||
else:
|
||||
dataset = MyDataset(
|
||||
r,
|
||||
c.text_cleaner,
|
||||
compute_linear_spec=False,
|
||||
meta_data=meta_data_eval if is_val else meta_data_train,
|
||||
ap=ap,
|
||||
tp=c.characters if 'characters' in c.keys() else None,
|
||||
add_blank=c['add_blank'] if 'add_blank' in c.keys() else False,
|
||||
batch_group_size=0 if is_val else c.batch_group_size *
|
||||
c.batch_size,
|
||||
min_seq_len=c.min_seq_len,
|
||||
max_seq_len=c.max_seq_len,
|
||||
phoneme_cache_path=c.phoneme_cache_path,
|
||||
use_phonemes=c.use_phonemes,
|
||||
phoneme_language=c.phoneme_language,
|
||||
enable_eos_bos=c.enable_eos_bos_chars,
|
||||
use_noise_augment=not is_val,
|
||||
verbose=verbose,
|
||||
speaker_mapping=speaker_mapping if c.use_speaker_embedding
|
||||
and c.use_external_speaker_embedding_file else None)
|
||||
|
||||
if c.use_phonemes and c.compute_input_seq_cache:
|
||||
# precompute phonemes to have a better estimate of sequence lengths.
|
||||
dataset.compute_input_seq(c.num_loader_workers)
|
||||
dataset.sort_items()
|
||||
|
||||
sampler = DistributedSampler(dataset) if num_gpus > 1 else None
|
||||
loader = DataLoader(
|
||||
dataset,
|
||||
batch_size=c.eval_batch_size if is_val else c.batch_size,
|
||||
shuffle=False,
|
||||
collate_fn=dataset.collate_fn,
|
||||
drop_last=False,
|
||||
sampler=sampler,
|
||||
num_workers=c.num_val_loader_workers
|
||||
if is_val else c.num_loader_workers,
|
||||
pin_memory=False)
|
||||
return loader
|
||||
|
||||
def format_data(data):
|
||||
# setup input data
|
||||
text_input = data[0]
|
||||
text_lengths = data[1]
|
||||
speaker_names = data[2]
|
||||
mel_input = data[4].permute(0, 2, 1) # B x D x T
|
||||
mel_lengths = data[5]
|
||||
item_idx = data[7]
|
||||
avg_text_length = torch.mean(text_lengths.float())
|
||||
avg_spec_length = torch.mean(mel_lengths.float())
|
||||
|
||||
if c.use_speaker_embedding:
|
||||
if c.use_external_speaker_embedding_file:
|
||||
# return precomputed embedding vector
|
||||
speaker_c = data[8]
|
||||
else:
|
||||
# return speaker_id to be used by an embedding layer
|
||||
speaker_c = [
|
||||
speaker_mapping[speaker_name]
|
||||
for speaker_name in speaker_names
|
||||
]
|
||||
speaker_c = torch.LongTensor(speaker_c)
|
||||
else:
|
||||
speaker_c = None
|
||||
# dispatch data to GPU
|
||||
if use_cuda:
|
||||
text_input = text_input.cuda(non_blocking=True)
|
||||
text_lengths = text_lengths.cuda(non_blocking=True)
|
||||
mel_input = mel_input.cuda(non_blocking=True)
|
||||
mel_lengths = mel_lengths.cuda(non_blocking=True)
|
||||
if speaker_c is not None:
|
||||
speaker_c = speaker_c.cuda(non_blocking=True)
|
||||
return text_input, text_lengths, mel_input, mel_lengths, speaker_c,\
|
||||
avg_text_length, avg_spec_length, item_idx
|
||||
|
||||
def train(data_loader, model, criterion, optimizer, scheduler, ap,
|
||||
global_step, epoch, training_phase):
|
||||
|
||||
model.train()
|
||||
epoch_time = 0
|
||||
keep_avg = KeepAverage()
|
||||
if use_cuda:
|
||||
batch_n_iter = int(
|
||||
len(data_loader.dataset) / (c.batch_size * num_gpus))
|
||||
else:
|
||||
batch_n_iter = int(len(data_loader.dataset) / c.batch_size)
|
||||
end_time = time.time()
|
||||
c_logger.print_train_start()
|
||||
scaler = torch.cuda.amp.GradScaler() if c.mixed_precision else None
|
||||
for num_iter, data in enumerate(data_loader):
|
||||
start_time = time.time()
|
||||
|
||||
# format data
|
||||
text_input, text_lengths, mel_targets, mel_lengths, speaker_c,\
|
||||
avg_text_length, avg_spec_length, _ = format_data(data)
|
||||
|
||||
loader_time = time.time() - end_time
|
||||
|
||||
global_step += 1
|
||||
optimizer.zero_grad()
|
||||
|
||||
# forward pass model
|
||||
with torch.cuda.amp.autocast(enabled=c.mixed_precision):
|
||||
decoder_output, dur_output, dur_mas_output, alignments, _, _, logp = model.forward(
|
||||
text_input,
|
||||
text_lengths,
|
||||
mel_targets,
|
||||
mel_lengths,
|
||||
g=speaker_c,
|
||||
phase=training_phase)
|
||||
|
||||
# compute loss
|
||||
loss_dict = criterion(logp,
|
||||
decoder_output,
|
||||
mel_targets,
|
||||
mel_lengths,
|
||||
dur_output,
|
||||
dur_mas_output,
|
||||
text_lengths,
|
||||
global_step,
|
||||
phase=training_phase)
|
||||
|
||||
# backward pass with loss scaling
|
||||
if c.mixed_precision:
|
||||
scaler.scale(loss_dict['loss']).backward()
|
||||
scaler.unscale_(optimizer)
|
||||
grad_norm = torch.nn.utils.clip_grad_norm_(
|
||||
model.parameters(), c.grad_clip)
|
||||
scaler.step(optimizer)
|
||||
scaler.update()
|
||||
else:
|
||||
loss_dict['loss'].backward()
|
||||
grad_norm = torch.nn.utils.clip_grad_norm_(
|
||||
model.parameters(), c.grad_clip)
|
||||
optimizer.step()
|
||||
|
||||
# setup lr
|
||||
if c.noam_schedule:
|
||||
scheduler.step()
|
||||
|
||||
# current_lr
|
||||
current_lr = optimizer.param_groups[0]['lr']
|
||||
|
||||
# compute alignment error (the lower the better )
|
||||
align_error = 1 - alignment_diagonal_score(alignments, binary=True)
|
||||
loss_dict['align_error'] = align_error
|
||||
|
||||
step_time = time.time() - start_time
|
||||
epoch_time += step_time
|
||||
|
||||
# aggregate losses from processes
|
||||
if num_gpus > 1:
|
||||
loss_dict['loss_l1'] = reduce_tensor(loss_dict['loss_l1'].data,
|
||||
num_gpus)
|
||||
loss_dict['loss_ssim'] = reduce_tensor(
|
||||
loss_dict['loss_ssim'].data, num_gpus)
|
||||
loss_dict['loss_dur'] = reduce_tensor(
|
||||
loss_dict['loss_dur'].data, num_gpus)
|
||||
loss_dict['loss'] = reduce_tensor(loss_dict['loss'].data,
|
||||
num_gpus)
|
||||
|
||||
# detach loss values
|
||||
loss_dict_new = dict()
|
||||
for key, value in loss_dict.items():
|
||||
if isinstance(value, (int, float)):
|
||||
loss_dict_new[key] = value
|
||||
else:
|
||||
loss_dict_new[key] = value.item()
|
||||
loss_dict = loss_dict_new
|
||||
|
||||
# update avg stats
|
||||
update_train_values = dict()
|
||||
for key, value in loss_dict.items():
|
||||
update_train_values['avg_' + key] = value
|
||||
update_train_values['avg_loader_time'] = loader_time
|
||||
update_train_values['avg_step_time'] = step_time
|
||||
keep_avg.update_values(update_train_values)
|
||||
|
||||
# print training progress
|
||||
if global_step % c.print_step == 0:
|
||||
log_dict = {
|
||||
"avg_spec_length": [avg_spec_length,
|
||||
1], # value, precision
|
||||
"avg_text_length": [avg_text_length, 1],
|
||||
"step_time": [step_time, 4],
|
||||
"loader_time": [loader_time, 2],
|
||||
"current_lr": current_lr,
|
||||
}
|
||||
c_logger.print_train_step(batch_n_iter, num_iter, global_step,
|
||||
log_dict, loss_dict,
|
||||
keep_avg.avg_values)
|
||||
|
||||
if args.rank == 0:
|
||||
# Plot Training Iter Stats
|
||||
# reduce TB load
|
||||
if global_step % c.tb_plot_step == 0:
|
||||
iter_stats = {
|
||||
"lr": current_lr,
|
||||
"grad_norm": grad_norm,
|
||||
"step_time": step_time
|
||||
}
|
||||
iter_stats.update(loss_dict)
|
||||
tb_logger.tb_train_iter_stats(global_step, iter_stats)
|
||||
|
||||
if global_step % c.save_step == 0:
|
||||
if c.checkpoint:
|
||||
# save model
|
||||
save_checkpoint(model,
|
||||
optimizer,
|
||||
global_step,
|
||||
epoch,
|
||||
1,
|
||||
OUT_PATH,
|
||||
model_characters,
|
||||
model_loss=loss_dict['loss'])
|
||||
|
||||
# wait all kernels to be completed
|
||||
torch.cuda.synchronize()
|
||||
|
||||
# Diagnostic visualizations
|
||||
if decoder_output is not None:
|
||||
idx = np.random.randint(mel_targets.shape[0])
|
||||
pred_spec = decoder_output[idx].detach().data.cpu(
|
||||
).numpy().T
|
||||
gt_spec = mel_targets[idx].data.cpu().numpy().T
|
||||
align_img = alignments[idx].data.cpu()
|
||||
|
||||
figures = {
|
||||
"prediction": plot_spectrogram(pred_spec, ap),
|
||||
"ground_truth": plot_spectrogram(gt_spec, ap),
|
||||
"alignment": plot_alignment(align_img),
|
||||
}
|
||||
|
||||
tb_logger.tb_train_figures(global_step, figures)
|
||||
|
||||
# Sample audio
|
||||
train_audio = ap.inv_melspectrogram(pred_spec.T)
|
||||
tb_logger.tb_train_audios(global_step,
|
||||
{'TrainAudio': train_audio},
|
||||
c.audio["sample_rate"])
|
||||
end_time = time.time()
|
||||
|
||||
# print epoch stats
|
||||
c_logger.print_train_epoch_end(global_step, epoch, epoch_time,
|
||||
keep_avg)
|
||||
|
||||
# Plot Epoch Stats
|
||||
if args.rank == 0:
|
||||
epoch_stats = {"epoch_time": epoch_time}
|
||||
epoch_stats.update(keep_avg.avg_values)
|
||||
tb_logger.tb_train_epoch_stats(global_step, epoch_stats)
|
||||
if c.tb_model_param_stats:
|
||||
tb_logger.tb_model_weights(model, global_step)
|
||||
return keep_avg.avg_values, global_step
|
||||
|
||||
@torch.no_grad()
|
||||
def evaluate(data_loader, model, criterion, ap, global_step, epoch,
|
||||
training_phase):
|
||||
model.eval()
|
||||
epoch_time = 0
|
||||
keep_avg = KeepAverage()
|
||||
c_logger.print_eval_start()
|
||||
if data_loader is not None:
|
||||
for num_iter, data in enumerate(data_loader):
|
||||
start_time = time.time()
|
||||
|
||||
# format data
|
||||
text_input, text_lengths, mel_targets, mel_lengths, speaker_c,\
|
||||
_, _, _ = format_data(data)
|
||||
|
||||
# forward pass model
|
||||
with torch.cuda.amp.autocast(enabled=c.mixed_precision):
|
||||
decoder_output, dur_output, dur_mas_output, alignments, _, _, logp = model.forward(
|
||||
text_input,
|
||||
text_lengths,
|
||||
mel_targets,
|
||||
mel_lengths,
|
||||
g=speaker_c,
|
||||
phase=training_phase)
|
||||
|
||||
# compute loss
|
||||
loss_dict = criterion(logp,
|
||||
decoder_output,
|
||||
mel_targets,
|
||||
mel_lengths,
|
||||
dur_output,
|
||||
dur_mas_output,
|
||||
text_lengths,
|
||||
global_step,
|
||||
phase=training_phase)
|
||||
|
||||
|
||||
# step time
|
||||
step_time = time.time() - start_time
|
||||
epoch_time += step_time
|
||||
|
||||
# compute alignment score
|
||||
align_error = 1 - alignment_diagonal_score(alignments,
|
||||
binary=True)
|
||||
loss_dict['align_error'] = align_error
|
||||
|
||||
# aggregate losses from processes
|
||||
if num_gpus > 1:
|
||||
loss_dict['loss_l1'] = reduce_tensor(
|
||||
loss_dict['loss_l1'].data, num_gpus)
|
||||
loss_dict['loss_ssim'] = reduce_tensor(
|
||||
loss_dict['loss_ssim'].data, num_gpus)
|
||||
loss_dict['loss_dur'] = reduce_tensor(
|
||||
loss_dict['loss_dur'].data, num_gpus)
|
||||
loss_dict['loss'] = reduce_tensor(loss_dict['loss'].data,
|
||||
num_gpus)
|
||||
|
||||
# detach loss values
|
||||
loss_dict_new = dict()
|
||||
for key, value in loss_dict.items():
|
||||
if isinstance(value, (int, float)):
|
||||
loss_dict_new[key] = value
|
||||
else:
|
||||
loss_dict_new[key] = value.item()
|
||||
loss_dict = loss_dict_new
|
||||
|
||||
# update avg stats
|
||||
update_train_values = dict()
|
||||
for key, value in loss_dict.items():
|
||||
update_train_values['avg_' + key] = value
|
||||
keep_avg.update_values(update_train_values)
|
||||
|
||||
if c.print_eval:
|
||||
c_logger.print_eval_step(num_iter, loss_dict,
|
||||
keep_avg.avg_values)
|
||||
|
||||
if args.rank == 0:
|
||||
# Diagnostic visualizations
|
||||
idx = np.random.randint(mel_targets.shape[0])
|
||||
pred_spec = decoder_output[idx].detach().data.cpu().numpy().T
|
||||
gt_spec = mel_targets[idx].data.cpu().numpy().T
|
||||
align_img = alignments[idx].data.cpu()
|
||||
|
||||
eval_figures = {
|
||||
"prediction": plot_spectrogram(pred_spec,
|
||||
ap,
|
||||
output_fig=False),
|
||||
"ground_truth": plot_spectrogram(gt_spec,
|
||||
ap,
|
||||
output_fig=False),
|
||||
"alignment": plot_alignment(align_img, output_fig=False)
|
||||
}
|
||||
|
||||
# Sample audio
|
||||
eval_audio = ap.inv_melspectrogram(pred_spec.T)
|
||||
tb_logger.tb_eval_audios(global_step, {"ValAudio": eval_audio},
|
||||
c.audio["sample_rate"])
|
||||
|
||||
# Plot Validation Stats
|
||||
tb_logger.tb_eval_stats(global_step, keep_avg.avg_values)
|
||||
tb_logger.tb_eval_figures(global_step, eval_figures)
|
||||
|
||||
if args.rank == 0 and epoch >= c.test_delay_epochs:
|
||||
if c.test_sentences_file is None:
|
||||
test_sentences = [
|
||||
"It took me quite a long time to develop a voice, and now that I have it I'm not going to be silent.",
|
||||
"Be a voice, not an echo.",
|
||||
"I'm sorry Dave. I'm afraid I can't do that.",
|
||||
"This cake is great. It's so delicious and moist.",
|
||||
"Prior to November 22, 1963."
|
||||
]
|
||||
else:
|
||||
with open(c.test_sentences_file, "r") as f:
|
||||
test_sentences = [s.strip() for s in f.readlines()]
|
||||
|
||||
# test sentences
|
||||
test_audios = {}
|
||||
test_figures = {}
|
||||
print(" | > Synthesizing test sentences")
|
||||
if c.use_speaker_embedding:
|
||||
if c.use_external_speaker_embedding_file:
|
||||
speaker_embedding = speaker_mapping[list(
|
||||
speaker_mapping.keys())[randrange(
|
||||
len(speaker_mapping) - 1)]]['embedding']
|
||||
speaker_id = None
|
||||
else:
|
||||
speaker_id = 0
|
||||
speaker_embedding = None
|
||||
else:
|
||||
speaker_id = None
|
||||
speaker_embedding = None
|
||||
|
||||
style_wav = c.get("style_wav_for_test")
|
||||
for idx, test_sentence in enumerate(test_sentences):
|
||||
try:
|
||||
wav, alignment, _, postnet_output, _, _ = synthesis(
|
||||
model,
|
||||
test_sentence,
|
||||
c,
|
||||
use_cuda,
|
||||
ap,
|
||||
speaker_id=speaker_id,
|
||||
speaker_embedding=speaker_embedding,
|
||||
style_wav=style_wav,
|
||||
truncated=False,
|
||||
enable_eos_bos_chars=c.enable_eos_bos_chars, #pylint: disable=unused-argument
|
||||
use_griffin_lim=True,
|
||||
do_trim_silence=False)
|
||||
|
||||
file_path = os.path.join(AUDIO_PATH, str(global_step))
|
||||
os.makedirs(file_path, exist_ok=True)
|
||||
file_path = os.path.join(file_path,
|
||||
"TestSentence_{}.wav".format(idx))
|
||||
ap.save_wav(wav, file_path)
|
||||
test_audios['{}-audio'.format(idx)] = wav
|
||||
test_figures['{}-prediction'.format(
|
||||
idx)] = plot_spectrogram(postnet_output, ap)
|
||||
test_figures['{}-alignment'.format(idx)] = plot_alignment(
|
||||
alignment)
|
||||
except: #pylint: disable=bare-except
|
||||
print(" !! Error creating Test Sentence -", idx)
|
||||
traceback.print_exc()
|
||||
tb_logger.tb_test_audios(global_step, test_audios,
|
||||
c.audio['sample_rate'])
|
||||
tb_logger.tb_test_figures(global_step, test_figures)
|
||||
return keep_avg.avg_values
|
||||
|
||||
def main(args): # pylint: disable=redefined-outer-name
|
||||
# pylint: disable=global-variable-undefined
|
||||
global meta_data_train, meta_data_eval, symbols, phonemes, model_characters, speaker_mapping
|
||||
# Audio processor
|
||||
ap = AudioProcessor(**c.audio)
|
||||
if 'characters' in c.keys():
|
||||
symbols, phonemes = make_symbols(**c.characters)
|
||||
|
||||
# DISTRIBUTED
|
||||
if num_gpus > 1:
|
||||
init_distributed(args.rank, num_gpus, args.group_id,
|
||||
c.distributed["backend"], c.distributed["url"])
|
||||
|
||||
# set model characters
|
||||
model_characters = phonemes if c.use_phonemes else symbols
|
||||
num_chars = len(model_characters)
|
||||
|
||||
# load data instances
|
||||
meta_data_train, meta_data_eval = load_meta_data(c.datasets,
|
||||
eval_split=True)
|
||||
|
||||
# set the portion of the data used for training if set in config.json
|
||||
if 'train_portion' in c.keys():
|
||||
meta_data_train = meta_data_train[:int(
|
||||
len(meta_data_train) * c.train_portion)]
|
||||
if 'eval_portion' in c.keys():
|
||||
meta_data_eval = meta_data_eval[:int(
|
||||
len(meta_data_eval) * c.eval_portion)]
|
||||
|
||||
# parse speakers
|
||||
num_speakers, speaker_embedding_dim, speaker_mapping = parse_speakers(
|
||||
c, args, meta_data_train, OUT_PATH)
|
||||
|
||||
# setup model
|
||||
model = setup_model(num_chars,
|
||||
num_speakers,
|
||||
c,
|
||||
speaker_embedding_dim=speaker_embedding_dim)
|
||||
optimizer = RAdam(model.parameters(),
|
||||
lr=c.lr,
|
||||
weight_decay=0,
|
||||
betas=(0.9, 0.98),
|
||||
eps=1e-9)
|
||||
criterion = AlignTTSLoss(c)
|
||||
|
||||
if args.restore_path:
|
||||
print(
|
||||
f" > Restoring from {os.path.basename(args.restore_path)} ...")
|
||||
checkpoint = torch.load(args.restore_path, map_location='cpu')
|
||||
try:
|
||||
# TODO: fix optimizer init, model.cuda() needs to be called before
|
||||
# optimizer restore
|
||||
optimizer.load_state_dict(checkpoint['optimizer'])
|
||||
if c.reinit_layers:
|
||||
raise RuntimeError
|
||||
model.load_state_dict(checkpoint['model'])
|
||||
except: #pylint: disable=bare-except
|
||||
print(" > Partial model initialization.")
|
||||
model_dict = model.state_dict()
|
||||
model_dict = set_init_dict(model_dict, checkpoint['model'], c)
|
||||
model.load_state_dict(model_dict)
|
||||
del model_dict
|
||||
|
||||
for group in optimizer.param_groups:
|
||||
group['initial_lr'] = c.lr
|
||||
print(" > Model restored from step %d" % checkpoint['step'],
|
||||
flush=True)
|
||||
args.restore_step = checkpoint['step']
|
||||
else:
|
||||
args.restore_step = 0
|
||||
|
||||
if use_cuda:
|
||||
model.cuda()
|
||||
criterion.cuda()
|
||||
|
||||
# DISTRIBUTED
|
||||
if num_gpus > 1:
|
||||
model = DDP_th(model, device_ids=[args.rank])
|
||||
|
||||
if c.noam_schedule:
|
||||
scheduler = NoamLR(optimizer,
|
||||
warmup_steps=c.warmup_steps,
|
||||
last_epoch=args.restore_step - 1)
|
||||
else:
|
||||
scheduler = None
|
||||
|
||||
num_params = count_parameters(model)
|
||||
print("\n > Model has {} parameters".format(num_params), flush=True)
|
||||
|
||||
if args.restore_step == 0 or not args.best_path:
|
||||
best_loss = float('inf')
|
||||
print(" > Starting with inf best loss.")
|
||||
else:
|
||||
print(" > Restoring best loss from "
|
||||
f"{os.path.basename(args.best_path)} ...")
|
||||
best_loss = torch.load(args.best_path,
|
||||
map_location='cpu')['model_loss']
|
||||
print(f" > Starting with loaded last best loss {best_loss}.")
|
||||
keep_all_best = c.get('keep_all_best', False)
|
||||
keep_after = c.get('keep_after', 10000) # void if keep_all_best False
|
||||
|
||||
# define dataloaders
|
||||
train_loader = setup_loader(ap, 1, is_val=False, verbose=True)
|
||||
eval_loader = setup_loader(ap, 1, is_val=True, verbose=True)
|
||||
|
||||
global_step = args.restore_step
|
||||
|
||||
def set_phase():
|
||||
"""Set AlignTTS training phase"""
|
||||
if isinstance(c.phase_start_steps, list):
|
||||
vals = [i < global_step for i in c.phase_start_steps]
|
||||
if not True in vals:
|
||||
phase = 0
|
||||
else:
|
||||
phase = len(c.phase_start_steps) - [
|
||||
i < global_step for i in c.phase_start_steps
|
||||
][::-1].index(True) - 1
|
||||
else:
|
||||
phase = None
|
||||
return phase
|
||||
|
||||
for epoch in range(0, c.epochs):
|
||||
cur_phase = set_phase()
|
||||
print(f"\n > Current AlignTTS phase: {cur_phase}")
|
||||
c_logger.print_epoch_start(epoch, c.epochs)
|
||||
train_avg_loss_dict, global_step = train(train_loader, model,
|
||||
criterion, optimizer,
|
||||
scheduler, ap,
|
||||
global_step, epoch,
|
||||
cur_phase)
|
||||
eval_avg_loss_dict = evaluate(eval_loader, model, criterion, ap,
|
||||
global_step, epoch, cur_phase)
|
||||
c_logger.print_epoch_end(epoch, eval_avg_loss_dict)
|
||||
target_loss = train_avg_loss_dict['avg_loss']
|
||||
if c.run_eval:
|
||||
target_loss = eval_avg_loss_dict['avg_loss']
|
||||
best_loss = save_best_model(target_loss,
|
||||
best_loss,
|
||||
model,
|
||||
optimizer,
|
||||
global_step,
|
||||
epoch,
|
||||
1,
|
||||
OUT_PATH,
|
||||
model_characters,
|
||||
keep_all_best=keep_all_best,
|
||||
keep_after=keep_after)
|
||||
|
||||
args = parse_arguments(sys.argv)
|
||||
c, OUT_PATH, AUDIO_PATH, c_logger, tb_logger = process_args(
|
||||
args, model_class='tts')
|
||||
|
||||
try:
|
||||
main(args)
|
||||
except KeyboardInterrupt:
|
||||
remove_experiment_folder(OUT_PATH)
|
||||
try:
|
||||
sys.exit(0)
|
||||
except SystemExit:
|
||||
os._exit(0) # pylint: disable=protected-access
|
||||
except Exception: # pylint: disable=broad-except
|
||||
remove_experiment_folder(OUT_PATH)
|
||||
traceback.print_exc()
|
||||
sys.exit(1)
|
|
@@ -580,7 +580,7 @@ def main(args): # pylint: disable=redefined-outer-name
if __name__ == '__main__':
    args = parse_arguments(sys.argv)
    c, OUT_PATH, AUDIO_PATH, c_logger, tb_logger = process_args(
        args, model_type='glow_tts')
        args, model_class='tts')

    try:
        main(args)
@@ -540,7 +540,7 @@ def main(args): # pylint: disable=redefined-outer-name
if __name__ == '__main__':
    args = parse_arguments(sys.argv)
    c, OUT_PATH, AUDIO_PATH, c_logger, tb_logger = process_args(
        args, model_type='tts')
        args, model_class='tts')

    try:
        main(args)
@@ -85,7 +85,7 @@ def format_data(data):
    text_input = data[0]
    text_lengths = data[1]
    speaker_names = data[2]
    linear_input = data[3] if c.model in ["Tacotron"] else None
    linear_input = data[3] if c.model.lower() in ["tacotron"] else None
    mel_input = data[4]
    mel_lengths = data[5]
    stop_targets = data[6]
@@ -658,7 +658,7 @@ def main(args): # pylint: disable=redefined-outer-name
if __name__ == '__main__':
    args = parse_arguments(sys.argv)
    c, OUT_PATH, AUDIO_PATH, c_logger, tb_logger = process_args(
        args, model_type='tacotron')
        args, model_class='tts')

    try:
        main(args)
@@ -1,4 +1,5 @@
#!/usr/bin/env python3
# TODO: mixed precision training
"""Trains GAN based vocoder model."""

import os
@@ -590,7 +591,7 @@ def main(args): # pylint: disable=redefined-outer-name
if __name__ == '__main__':
    args = parse_arguments(sys.argv)
    c, OUT_PATH, AUDIO_PATH, c_logger, tb_logger = process_args(
        args, model_type='gan')
        args, model_class='vocoder')

    try:
        main(args)
@@ -436,7 +436,7 @@ def main(args): # pylint: disable=redefined-outer-name
if __name__ == '__main__':
    args = parse_arguments(sys.argv)
    c, OUT_PATH, AUDIO_PATH, c_logger, tb_logger = process_args(
        args, model_type='wavegrad')
        args, model_class='vocoder')

    try:
        main(args)
@@ -460,7 +460,7 @@ def main(args): # pylint: disable=redefined-outer-name
if __name__ == "__main__":
    args = parse_arguments(sys.argv)
    c, OUT_PATH, AUDIO_PATH, c_logger, tb_logger = process_args(
        args, model_type='wavernn')
        args, model_class='vocoder')

    try:
        main(args)
@@ -1,7 +1,7 @@
import torch
import torch.nn as nn
import torch.nn.functional as F
import numpy as np


# adapted from https://github.com/cvqluu/GE2E-Loss
class GE2ELoss(nn.Module):
@@ -155,6 +155,6 @@ class AngleProtoLoss(nn.Module):
        cos_sim_matrix = F.cosine_similarity(out_positive.unsqueeze(-1).expand(-1, -1, num_speakers), out_anchor.unsqueeze(-1).expand(-1, -1, num_speakers).transpose(0, 2))
        torch.clamp(self.w, 1e-6)
        cos_sim_matrix = cos_sim_matrix * self.w + self.b
        label = torch.from_numpy(np.asarray(range(0, num_speakers))).to(cos_sim_matrix.device)
        label = torch.arange(num_speakers).to(cos_sim_matrix.device)
        L = self.criterion(cos_sim_matrix, label)
        return L
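The label change above swaps a NumPy round-trip for `torch.arange`; a standalone sketch (illustrative only, `num_speakers=4` is an arbitrary value) checking that the two constructions are equivalent:

```python
# Quick equivalence check for the old and new label constructions (illustrative).
import numpy as np
import torch

num_speakers = 4
old_label = torch.from_numpy(np.asarray(range(0, num_speakers)))
new_label = torch.arange(num_speakers)
assert (old_label == new_label).all()  # both give tensor([0, 1, 2, 3])
```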
@ -1,12 +1,12 @@
|
|||
import os
|
||||
from glob import glob
|
||||
import re
|
||||
import sys
|
||||
import xml.etree.ElementTree as ET
|
||||
from glob import glob
|
||||
from pathlib import Path
|
||||
from typing import List
|
||||
|
||||
from tqdm import tqdm
|
||||
|
||||
from TTS.tts.utils.generic_utils import split_dataset
|
||||
|
||||
####################
|
||||
|
@ -35,7 +35,7 @@ def load_meta_data(datasets, eval_split=True):
|
|||
meta_data_eval_all += meta_data_eval
|
||||
meta_data_train_all += meta_data_train
|
||||
# load attention masks for duration predictor training
|
||||
if 'meta_file_attn_mask' in dataset:
|
||||
if 'meta_file_attn_mask' in dataset and dataset['meta_file_attn_mask'] is not None:
|
||||
meta_data = dict(load_attention_mask_meta_data(dataset['meta_file_attn_mask']))
|
||||
for idx, ins in enumerate(meta_data_train_all):
|
||||
attn_file = meta_data[ins[1]].strip()
|
||||
|
@ -159,7 +159,7 @@ def ljspeech(root_path, meta_file):
|
|||
txt_file = os.path.join(root_path, meta_file)
|
||||
items = []
|
||||
speaker_name = "ljspeech"
|
||||
with open(txt_file, 'r') as ttf:
|
||||
with open(txt_file, 'r', encoding="utf-8") as ttf:
|
||||
for line in ttf:
|
||||
cols = line.split('|')
|
||||
wav_file = os.path.join(root_path, 'wavs', cols[0] + '.wav')
|
||||
|
@ -168,13 +168,30 @@ def ljspeech(root_path, meta_file):
|
|||
return items
|
||||
|
||||
|
||||
def sam_accenture(root_path, meta_file):
|
||||
"""Normalizes the sam-accenture meta data file to TTS format
|
||||
https://github.com/Sam-Accenture-Non-Binary-Voice/non-binary-voice-files"""
|
||||
xml_file = os.path.join(root_path, 'voice_over_recordings', meta_file)
|
||||
xml_root = ET.parse(xml_file).getroot()
|
||||
items = []
|
||||
speaker_name = "sam_accenture"
|
||||
for item in xml_root.findall('./fileid'):
|
||||
text = item.text
|
||||
wav_file = os.path.join(root_path, 'vo_voice_quality_transformation', item.get('id')+'.wav')
|
||||
if not os.path.exists(wav_file):
|
||||
print(f' [!] {wav_file} in metafile does not exist. Skipping...')
|
||||
continue
|
||||
items.append([text, wav_file, speaker_name])
|
||||
return items
|
||||
|
||||
|
||||
def ruslan(root_path, meta_file):
|
||||
"""Normalizes the RUSLAN meta data file to TTS format
|
||||
https://ruslan-corpus.github.io/"""
|
||||
txt_file = os.path.join(root_path, meta_file)
|
||||
items = []
|
||||
speaker_name = "ljspeech"
|
||||
with open(txt_file, 'r') as ttf:
|
||||
with open(txt_file, 'r', encoding="utf-8") as ttf:
|
||||
for line in ttf:
|
||||
cols = line.split('|')
|
||||
wav_file = os.path.join(root_path, 'RUSLAN', cols[0] + '.wav')
|
||||
|
|
|
@@ -0,0 +1,20 @@
from torch import nn
from TTS.tts.layers.generic.transformer import FFTransformerBlock
from TTS.tts.layers.generic.pos_encoding import PositionalEncoding


class DurationPredictor(nn.Module):
    def __init__(self, num_chars, hidden_channels, hidden_channels_ffn, num_heads):
        super().__init__()
        self.embed = nn.Embedding(num_chars, hidden_channels)
        self.pos_enc = PositionalEncoding(hidden_channels, dropout_p=0.1)
        self.FFT = FFTransformerBlock(hidden_channels, num_heads, hidden_channels_ffn, 2, 0.1)
        self.out_layer = nn.Conv1d(hidden_channels, 1, 1)

    def forward(self, text, text_lengths):
        # B, L -> B, L
        emb = self.embed(text)
        emb = self.pos_enc(emb.transpose(1, 2))
        x = self.FFT(emb, text_lengths)
        x = self.out_layer(x).squeeze(-1)
        return x
@@ -0,0 +1,29 @@
from torch import nn


class MDNBlock(nn.Module):
    """Mixture of Density Network implementation
    https://arxiv.org/pdf/2003.01950.pdf
    """
    def __init__(self, in_channels, out_channels):
        super().__init__()
        self.out_channels = out_channels
        self.conv1 = nn.Conv1d(in_channels, in_channels, 1)
        self.norm = nn.LayerNorm(in_channels)
        self.relu = nn.ReLU()
        self.dropout = nn.Dropout(0.1)
        self.conv2 = nn.Conv1d(in_channels, out_channels, 1)

    def forward(self, x):
        o = self.conv1(x)
        o = o.transpose(1, 2)
        o = self.norm(o)
        o = o.transpose(1, 2)
        o = self.relu(o)
        o = self.dropout(o)
        mu_sigma = self.conv2(o)
        # TODO: check this sigmoid
        # mu = torch.sigmoid(mu_sigma[:, :self.out_channels//2, :])
        mu = mu_sigma[:, :self.out_channels//2, :]
        log_sigma = mu_sigma[:, self.out_channels//2:, :]
        return mu, log_sigma
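A brief usage sketch for `MDNBlock` (not part of the diff; the channel sizes are assumptions, e.g. an 80-band mel target): the block maps encoder features of shape `[B, C, T]` to `mu` and `log_sigma`, each with `out_channels // 2` channels.

```python
# Illustrative shapes only; MDNBlock is the module defined above.
import torch

mdn = MDNBlock(in_channels=256, out_channels=2 * 80)  # 80 mel bands -> mu + log_sigma
x = torch.randn(4, 256, 120)                          # [B, C, T] encoder features
mu, log_sigma = mdn(x)                                # each [B, 80, T]
```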
@ -3,6 +3,7 @@ from torch import nn
|
|||
from TTS.tts.layers.generic.res_conv_bn import Conv1dBNBlock, ResidualConv1dBNBlock, Conv1dBN
|
||||
from TTS.tts.layers.generic.wavenet import WNBlocks
|
||||
from TTS.tts.layers.glow_tts.transformer import RelativePositionTransformer
|
||||
from TTS.tts.layers.generic.transformer import FFTransformerBlock
|
||||
|
||||
|
||||
class WaveNetDecoder(nn.Module):
|
||||
|
@ -89,6 +90,37 @@ class RelativePositionTransformerDecoder(nn.Module):
|
|||
return o
|
||||
|
||||
|
||||
class FFTransformerDecoder(nn.Module):
|
||||
"""Decoder with FeedForwardTransformer.
|
||||
|
||||
Default params
|
||||
params={
|
||||
'hidden_channels_ffn': 1024,
|
||||
'num_heads': 2,
|
||||
"dropout_p": 0.1,
|
||||
"num_layers": 6,
|
||||
}
|
||||
|
||||
Args:
|
||||
in_channels (int): number of input channels.
|
||||
out_channels (int): number of output channels.
|
||||
hidden_channels (int): number of hidden channels including Transformer layers.
|
||||
params (dict): dictionary for residual convolutional blocks.
|
||||
"""
|
||||
def __init__(self, in_channels, out_channels, params):
|
||||
|
||||
super().__init__()
|
||||
self.transformer_block = FFTransformerBlock(in_channels, **params)
|
||||
self.postnet = nn.Conv1d(in_channels, out_channels, 1)
|
||||
|
||||
def forward(self, x, x_mask=None, g=None): # pylint: disable=unused-argument
|
||||
# TODO: handle multi-speaker
|
||||
x_mask = 1 if x_mask is None else x_mask
|
||||
o = self.transformer_block(x) * x_mask
|
||||
o = self.postnet(o)* x_mask
|
||||
return o
|
||||
|
||||
|
||||
class ResidualConv1dBNDecoder(nn.Module):
|
||||
"""Residual Convolutional Decoder as in the original Speedy Speech paper
|
||||
|
||||
|
@ -159,24 +191,26 @@ class Decoder(nn.Module):
|
|||
c_in_channels=0):
|
||||
super().__init__()
|
||||
|
||||
if decoder_type == 'transformer':
|
||||
if decoder_type.lower() == "relative_position_transformer":
|
||||
self.decoder = RelativePositionTransformerDecoder(
|
||||
in_channels=in_hidden_channels,
|
||||
out_channels=out_channels,
|
||||
hidden_channels=in_hidden_channels,
|
||||
params=decoder_params)
|
||||
elif decoder_type == 'residual_conv_bn':
|
||||
elif decoder_type.lower() == 'residual_conv_bn':
|
||||
self.decoder = ResidualConv1dBNDecoder(
|
||||
in_channels=in_hidden_channels,
|
||||
out_channels=out_channels,
|
||||
hidden_channels=in_hidden_channels,
|
||||
params=decoder_params)
|
||||
elif decoder_type == 'wavenet':
|
||||
elif decoder_type.lower() == 'wavenet':
|
||||
self.decoder = WaveNetDecoder(in_channels=in_hidden_channels,
|
||||
out_channels=out_channels,
|
||||
hidden_channels=in_hidden_channels,
|
||||
c_in_channels=c_in_channels,
|
||||
params=decoder_params)
|
||||
elif decoder_type.lower() == 'fftransformer':
|
||||
self.decoder = FFTransformerDecoder(in_hidden_channels, out_channels, decoder_params)
|
||||
else:
|
||||
raise ValueError(f'[!] Unknown decoder type - {decoder_type}')
|
||||
|
|
@ -1,62 +1,8 @@
|
|||
import math
|
||||
import torch
|
||||
from torch import nn
|
||||
|
||||
from TTS.tts.layers.glow_tts.transformer import RelativePositionTransformer
|
||||
from TTS.tts.layers.generic.res_conv_bn import ResidualConv1dBNBlock
|
||||
|
||||
|
||||
|
||||
class PositionalEncoding(nn.Module):
|
||||
"""Sinusoidal positional encoding for non-recurrent neural networks.
|
||||
Implementation based on "Attention Is All You Need"
|
||||
Args:
|
||||
channels (int): embedding size
|
||||
dropout (float): dropout parameter
|
||||
"""
|
||||
def __init__(self, channels, dropout=0.0, max_len=5000):
|
||||
super().__init__()
|
||||
if channels % 2 != 0:
|
||||
raise ValueError(
|
||||
"Cannot use sin/cos positional encoding with "
|
||||
"odd channels (got channels={:d})".format(channels))
|
||||
pe = torch.zeros(max_len, channels)
|
||||
position = torch.arange(0, max_len).unsqueeze(1)
|
||||
div_term = torch.exp((torch.arange(0, channels, 2, dtype=torch.float) *
|
||||
-(math.log(10000.0) / channels)))
|
||||
pe[:, 0::2] = torch.sin(position.float() * div_term)
|
||||
pe[:, 1::2] = torch.cos(position.float() * div_term)
|
||||
pe = pe.unsqueeze(0).transpose(1, 2)
|
||||
self.register_buffer('pe', pe)
|
||||
if dropout > 0:
|
||||
self.dropout = nn.Dropout(p=dropout)
|
||||
self.channels = channels
|
||||
|
||||
def forward(self, x, mask=None, first_idx=None, last_idx=None):
|
||||
"""
|
||||
Shapes:
|
||||
x: [B, C, T]
|
||||
mask: [B, 1, T]
|
||||
first_idx: int
|
||||
last_idx: int
|
||||
"""
|
||||
|
||||
x = x * math.sqrt(self.channels)
|
||||
if first_idx is None:
|
||||
if self.pe.size(2) < x.size(2):
|
||||
raise RuntimeError(
|
||||
f"Sequence is {x.size(2)} but PositionalEncoding is"
|
||||
f" limited to {self.pe.size(2)}. See max_len argument.")
|
||||
if mask is not None:
|
||||
pos_enc = (self.pe[:, :, :x.size(2)] * mask)
|
||||
else:
|
||||
pos_enc = self.pe[:, :, :x.size(2)]
|
||||
x = x + pos_enc
|
||||
else:
|
||||
x = x + self.pe[:, :, first_idx:last_idx]
|
||||
if hasattr(self, 'dropout'):
|
||||
x = self.dropout(x)
|
||||
return x
|
||||
from TTS.tts.layers.generic.transformer import FFTransformerBlock
|
||||
|
||||
|
||||
class RelativePositionTransformerEncoder(nn.Module):
|
||||
|
@ -138,26 +84,36 @@ class Encoder(nn.Module):
|
|||
c_in_channels (int): number of channels for conditional input.
|
||||
|
||||
Note:
|
||||
Default encoder_params...
|
||||
Default encoder_params to be set in config.json...
|
||||
|
||||
for 'transformer'
|
||||
encoder_params={
|
||||
'hidden_channels_ffn': 128,
|
||||
'num_heads': 2,
|
||||
"kernel_size": 3,
|
||||
"dropout_p": 0.1,
|
||||
"num_layers": 6,
|
||||
"rel_attn_window_size": 4,
|
||||
"input_length": None
|
||||
},
|
||||
```python
|
||||
# for 'relative_position_transformer'
|
||||
encoder_params={
|
||||
'hidden_channels_ffn': 128,
|
||||
'num_heads': 2,
|
||||
"kernel_size": 3,
|
||||
"dropout_p": 0.1,
|
||||
"num_layers": 6,
|
||||
"rel_attn_window_size": 4,
|
||||
"input_length": None
|
||||
},
|
||||
|
||||
for 'residual_conv_bn'
|
||||
encoder_params = {
|
||||
"kernel_size": 4,
|
||||
"dilations": 4 * [1, 2, 4] + [1],
|
||||
"num_conv_blocks": 2,
|
||||
"num_res_blocks": 13
|
||||
}
|
||||
# for 'residual_conv_bn'
|
||||
encoder_params = {
|
||||
"kernel_size": 4,
|
||||
"dilations": 4 * [1, 2, 4] + [1],
|
||||
"num_conv_blocks": 2,
|
||||
"num_res_blocks": 13
|
||||
}
|
||||
|
||||
# for 'fftransformer'
|
||||
encoder_params = {
|
||||
"hidden_channels_ffn": 1024 ,
|
||||
"num_heads": 2,
|
||||
"num_layers": 6,
|
||||
"dropout_p": 0.1
|
||||
}
|
||||
```
|
||||
"""
|
||||
def __init__(
|
||||
self,
|
||||
|
@ -179,7 +135,7 @@ class Encoder(nn.Module):
|
|||
self.c_in_channels = c_in_channels
|
||||
|
||||
# init encoder
|
||||
if encoder_type.lower() == "transformer":
|
||||
if encoder_type.lower() == "relative_position_transformer":
|
||||
# text encoder
|
||||
self.encoder = RelativePositionTransformerEncoder(
|
||||
in_hidden_channels, out_channels, in_hidden_channels,
|
||||
|
@ -189,11 +145,13 @@ class Encoder(nn.Module):
|
|||
out_channels,
|
||||
in_hidden_channels,
|
||||
encoder_params)
|
||||
elif encoder_type.lower() == 'fftransformer':
|
||||
assert in_hidden_channels == out_channels, \
|
||||
"[!] must be `in_channels` == `out_channels` when encoder type is 'fftransformer'"
|
||||
self.encoder = FFTransformerBlock(in_hidden_channels, **encoder_params) # pylint: disable=unexpected-keyword-arg
|
||||
else:
|
||||
raise NotImplementedError(' [!] unknown encoder type.')
|
||||
|
||||
# final projection layers
|
||||
|
||||
|
||||
def forward(self, x, x_mask, g=None): # pylint: disable=unused-argument
|
||||
"""
|
|
@@ -0,0 +1,56 @@
import torch
import math

from torch import nn


class PositionalEncoding(nn.Module):
    """Sinusoidal positional encoding for non-recurrent neural networks.
    Implementation based on "Attention Is All You Need"
    Args:
        channels (int): embedding size
        dropout (float): dropout parameter
    """
    def __init__(self, channels, dropout_p=0.0, max_len=5000):
        super().__init__()
        if channels % 2 != 0:
            raise ValueError(
                "Cannot use sin/cos positional encoding with "
                "odd channels (got channels={:d})".format(channels))
        pe = torch.zeros(max_len, channels)
        position = torch.arange(0, max_len).unsqueeze(1)
        div_term = torch.pow(10000,
                             torch.arange(0, channels, 2).float() / channels)
        pe[:, 0::2] = torch.sin(position.float() * div_term)
        pe[:, 1::2] = torch.cos(position.float() * div_term)
        pe = pe.unsqueeze(0).transpose(1, 2)
        self.register_buffer('pe', pe)
        if dropout_p > 0:
            self.dropout = nn.Dropout(p=dropout_p)
        self.channels = channels

    def forward(self, x, mask=None, first_idx=None, last_idx=None):
        """
        Shapes:
            x: [B, C, T]
            mask: [B, 1, T]
            first_idx: int
            last_idx: int
        """

        x = x * math.sqrt(self.channels)
        if first_idx is None:
            if self.pe.size(2) < x.size(2):
                raise RuntimeError(
                    f"Sequence is {x.size(2)} but PositionalEncoding is"
                    f" limited to {self.pe.size(2)}. See max_len argument.")
            if mask is not None:
                pos_enc = (self.pe[:, :, :x.size(2)] * mask)
            else:
                pos_enc = self.pe[:, :, :x.size(2)]
            x = x + pos_enc
        else:
            x = x + self.pe[:, :, first_idx:last_idx]
        if hasattr(self, 'dropout'):
            x = self.dropout(x)
        return x
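A short usage sketch for the shape convention documented above (values are arbitrary, not from the diff):

```python
# Illustrative values; PositionalEncoding is the module defined above.
import torch

pos_enc = PositionalEncoding(channels=256, dropout_p=0.1)
x = torch.randn(8, 256, 100)   # [B, C, T]
y = pos_enc(x)                 # [B, C, T]: x * sqrt(C) plus the sinusoidal table
assert y.shape == x.shape
```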
@@ -0,0 +1,74 @@
import torch
import torch.nn as nn
import torch.nn.functional as F


class FFTransformer(nn.Module):
    def __init__(self,
                 in_out_channels,
                 num_heads,
                 hidden_channels_ffn=1024,
                 kernel_size_fft=3,
                 dropout_p=0.1):
        super().__init__()
        self.self_attn = nn.MultiheadAttention(in_out_channels,
                                               num_heads,
                                               dropout=dropout_p)

        padding = (kernel_size_fft - 1) // 2
        self.conv1 = nn.Conv1d(in_out_channels, hidden_channels_ffn, kernel_size=kernel_size_fft, padding=padding)
        self.conv2 = nn.Conv1d(hidden_channels_ffn, in_out_channels, kernel_size=kernel_size_fft, padding=padding)

        self.norm1 = nn.LayerNorm(in_out_channels)
        self.norm2 = nn.LayerNorm(in_out_channels)

        self.dropout = nn.Dropout(dropout_p)

    def forward(self, src, src_mask=None, src_key_padding_mask=None):
        """😦 ugly looking with all the transposing """
        src = src.permute(2, 0, 1)
        src2, enc_align = self.self_attn(src,
                                         src,
                                         src,
                                         attn_mask=src_mask,
                                         key_padding_mask=src_key_padding_mask)
        src = self.norm1(src + src2)
        # T x B x D -> B x D x T
        src = src.permute(1, 2, 0)
        src2 = self.conv2(F.relu(self.conv1(src)))
        src2 = self.dropout(src2)
        src = src + src2
        src = src.transpose(1, 2)
        src = self.norm2(src)
        src = src.transpose(1, 2)
        return src, enc_align


class FFTransformerBlock(nn.Module):
    def __init__(self, in_out_channels, num_heads, hidden_channels_ffn,
                 num_layers, dropout_p):
        super().__init__()
        self.fft_layers = nn.ModuleList([
            FFTransformer(in_out_channels=in_out_channels,
                          num_heads=num_heads,
                          hidden_channels_ffn=hidden_channels_ffn,
                          dropout_p=dropout_p) for _ in range(num_layers)
        ])

    def forward(self, x, mask=None, g=None):  # pylint: disable=unused-argument
        """
        TODO: handle multi-speaker
        Shapes:
            x: [B, C, T]
            mask: [B, 1, T] or [B, T]
        """
        if mask is not None and mask.ndim == 3:
            mask = mask.squeeze(1)
            # mask is negated, torch uses 1s and 0s reversely.
            mask = ~mask.bool()
        alignments = []
        for layer in self.fft_layers:
            x, align = layer(x, src_key_padding_mask=mask)
            alignments.append(align.unsqueeze(1))
        alignments = torch.cat(alignments, 1)
        return x
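A minimal usage sketch for `FFTransformerBlock` (not part of the diff; sizes are arbitrary, and `in_out_channels` must be divisible by `num_heads`):

```python
# Illustrative values; FFTransformerBlock is the module defined above.
import torch

block = FFTransformerBlock(in_out_channels=256, num_heads=2,
                           hidden_channels_ffn=1024, num_layers=6, dropout_p=0.1)
x = torch.randn(4, 256, 50)    # [B, C, T]
mask = torch.ones(4, 1, 50)    # [B, 1, T], 1 marks valid frames
y = block(x, mask)             # [B, C, T]
```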
@@ -91,6 +91,7 @@ class WN(torch.nn.Module):
    def forward(self, x, x_mask=None, g=None, **kwargs): # pylint: disable=unused-argument
        output = torch.zeros_like(x)
        n_channels_tensor = torch.IntTensor([self.hidden_channels])
        x_mask = 1.0 if x_mask is None else x_mask
        if g is not None:
            g = self.cond_layer(g)
        for i in range(self.num_layers):
@@ -163,7 +164,7 @@ class WNBlocks(nn.Module):
                weight_norm=weight_norm)
            self.wn_blocks.append(layer)

    def forward(self, x, x_mask, g=None):
    def forward(self, x, x_mask=None, g=None):
        o = x
        for layer in self.wn_blocks:
            o = layer(o, x_mask, g)
@@ -23,7 +23,6 @@ def generate_path(duration, mask):
        mask: [b, t_x, t_y]
    """
    device = duration.device

    b, t_x, t_y = mask.shape
    cum_duration = torch.cumsum(duration, 1)
    path = torch.zeros(b, t_x, t_y, dtype=mask.dtype).to(device=device)
@ -297,6 +297,11 @@ class TacotronLoss(torch.nn.Module):
|
|||
stopnet_output, stopnet_target, output_lens, decoder_b_output,
|
||||
alignments, alignment_lens, alignments_backwards, input_lens):
|
||||
|
||||
|
||||
# decoder outputs linear or mel spectrograms for Tacotron and Tacotron2
|
||||
# the target should be set accordingly
|
||||
postnet_target = linear_input if self.config.model.lower() in ["tacotron"] else mel_input
|
||||
|
||||
return_dict = {}
|
||||
# remove lengths if no masking is applied
|
||||
if not self.config.loss_masking:
|
||||
|
@ -307,20 +312,13 @@ class TacotronLoss(torch.nn.Module):
|
|||
decoder_loss = self.criterion(decoder_output, mel_input,
|
||||
output_lens)
|
||||
if self.postnet_alpha > 0:
|
||||
if self.config.model in ["Tacotron", "TacotronGST"]:
|
||||
postnet_loss = self.criterion(postnet_output, linear_input,
|
||||
output_lens)
|
||||
else:
|
||||
postnet_loss = self.criterion(postnet_output, mel_input,
|
||||
output_lens)
|
||||
postnet_loss = self.criterion(postnet_output, postnet_target,
|
||||
output_lens)
|
||||
else:
|
||||
if self.decoder_alpha > 0:
|
||||
decoder_loss = self.criterion(decoder_output, mel_input)
|
||||
if self.postnet_alpha > 0:
|
||||
if self.config.model in ["Tacotron", "TacotronGST"]:
|
||||
postnet_loss = self.criterion(postnet_output, linear_input)
|
||||
else:
|
||||
postnet_loss = self.criterion(postnet_output, mel_input)
|
||||
postnet_loss = self.criterion(postnet_output, postnet_target)
|
||||
loss = self.decoder_alpha * decoder_loss + self.postnet_alpha * postnet_loss
|
||||
return_dict['decoder_loss'] = decoder_loss
|
||||
return_dict['postnet_loss'] = postnet_loss
|
||||
|
@ -373,7 +371,7 @@ class TacotronLoss(torch.nn.Module):
|
|||
|
||||
# postnet differential spectral loss
|
||||
if self.config.postnet_diff_spec_alpha > 0:
|
||||
postnet_diff_spec_loss = self.criterion_diff_spec(postnet_output, mel_input, output_lens)
|
||||
postnet_diff_spec_loss = self.criterion_diff_spec(postnet_output, postnet_target, output_lens)
|
||||
loss += postnet_diff_spec_loss * self.postnet_diff_spec_alpha
|
||||
return_dict['postnet_diff_spec_loss'] = postnet_diff_spec_loss
|
||||
|
||||
|
@ -385,7 +383,7 @@ class TacotronLoss(torch.nn.Module):
|
|||
|
||||
# postnet ssim loss
|
||||
if self.config.postnet_ssim_alpha > 0:
|
||||
postnet_ssim_loss = self.criterion_ssim(postnet_output, mel_input, output_lens)
|
||||
postnet_ssim_loss = self.criterion_ssim(postnet_output, postnet_target, output_lens)
|
||||
loss += postnet_ssim_loss * self.postnet_ssim_alpha
|
||||
return_dict['postnet_ssim_loss'] = postnet_ssim_loss
|
||||
|
||||
|
@ -442,5 +440,117 @@ class SpeedySpeechLoss(nn.Module):
|
|||
l1_loss = self.l1(decoder_output, decoder_target, decoder_output_lens)
|
||||
ssim_loss = self.ssim(decoder_output, decoder_target, decoder_output_lens)
|
||||
huber_loss = self.huber(dur_output, dur_target, input_lens)
|
||||
loss = l1_loss + ssim_loss + huber_loss
|
||||
loss = self.l1_alpha * l1_loss + self.ssim_alpha * ssim_loss + self.huber_alpha * huber_loss
|
||||
return {'loss': loss, 'loss_l1': l1_loss, 'loss_ssim': ssim_loss, 'loss_dur': huber_loss}
|
||||
|
||||
|
||||
def mse_loss_custom(x, y):
|
||||
"""MSE loss using the torch back-end without reduction.
|
||||
It uses less VRAM than the raw code"""
|
||||
expanded_x, expanded_y = torch.broadcast_tensors(x, y)
|
||||
return torch._C._nn.mse_loss(expanded_x, expanded_y, 0) # pylint: disable=protected-access, c-extension-no-member
|
||||
|
||||
|
||||
class MDNLoss(nn.Module):
|
||||
"""Mixture of Density Network Loss as described in https://arxiv.org/pdf/2003.01950.pdf.
|
||||
"""
|
||||
|
||||
def forward(self, logp, text_lengths, mel_lengths): # pylint: disable=no-self-use
|
||||
'''
|
||||
Shapes:
|
||||
mu: [B, D, T]
|
||||
log_sigma: [B, D, T]
|
||||
mel_spec: [B, D, T]
|
||||
'''
|
||||
B, T_seq, T_mel = logp.shape
|
||||
log_alpha = logp.new_ones(B, T_seq, T_mel)*(-1e4)
|
||||
log_alpha[:, 0, 0] = logp[:, 0, 0]
|
||||
for t in range(1, T_mel):
|
||||
prev_step = torch.cat([log_alpha[:, :, t-1:t], functional.pad(log_alpha[:, :, t-1:t],
|
||||
(0, 0, 1, -1), value=-1e4)], dim=-1)
|
||||
log_alpha[:, :, t] = torch.logsumexp(prev_step + 1e-4, dim=-1) + logp[:, :, t]
|
||||
alpha_last = log_alpha[torch.arange(B), text_lengths-1, mel_lengths-1]
|
||||
mdn_loss = -alpha_last.mean() / T_seq
|
||||
return mdn_loss#, log_prob_matrix
|
||||
|
||||
|
||||
class AlignTTSLoss(nn.Module):
|
||||
"""Modified AlignTTS Loss.
|
||||
Computes following losses
|
||||
- L1 and SSIM losses from output spectrograms.
|
||||
- Huber loss for duration predictor.
|
||||
- MDNLoss for Mixture of Density Network.
|
||||
|
||||
All the losses are aggregated by a weighted sum with the loss alphas.
|
||||
Alphas can be scheduled based on number of steps.
|
||||
|
||||
Args:
|
||||
c (dict): TTS model configuration.
|
||||
"""
|
||||
def __init__(self, c):
|
||||
super().__init__()
|
||||
self.mdn_loss = MDNLoss()
|
||||
self.spec_loss = MSELossMasked(False)
|
||||
self.ssim = SSIMLoss()
|
||||
self.dur_loss = MSELossMasked(False)
|
||||
|
||||
self.ssim_alpha = c.ssim_alpha
|
||||
self.dur_loss_alpha = c.dur_loss_alpha
|
||||
self.spec_loss_alpha = c.spec_loss_alpha
|
||||
self.mdn_alpha = c.mdn_alpha
|
||||
|
||||
def forward(self, logp, decoder_output, decoder_target, decoder_output_lens, dur_output, dur_target,
|
||||
input_lens, step, phase):
|
||||
ssim_alpha, dur_loss_alpha, spec_loss_alpha, mdn_alpha = self.set_alphas(
|
||||
step)
|
||||
spec_loss, ssim_loss, dur_loss, mdn_loss = 0, 0, 0, 0
|
||||
if phase == 0:
|
||||
mdn_loss = self.mdn_loss(logp, input_lens, decoder_output_lens)
|
||||
elif phase == 1:
|
||||
spec_loss = self.spec_loss(decoder_output, decoder_target, decoder_output_lens)
|
||||
ssim_loss = self.ssim(decoder_output, decoder_target, decoder_output_lens)
|
||||
elif phase == 2:
|
||||
mdn_loss = self.mdn_loss(logp, input_lens, decoder_output_lens)
|
||||
spec_loss = self.spec_loss(decoder_output, decoder_target, decoder_output_lens)
|
||||
ssim_loss = self.ssim(decoder_output, decoder_target, decoder_output_lens)
|
||||
elif phase == 3:
|
||||
dur_loss = self.dur_loss(dur_output.unsqueeze(2), dur_target.unsqueeze(2), input_lens)
|
||||
else:
|
||||
mdn_loss = self.mdn_loss(logp, input_lens, decoder_output_lens)
|
||||
spec_loss = self.spec_loss(decoder_output, decoder_target, decoder_output_lens)
|
||||
ssim_loss = self.ssim(decoder_output, decoder_target, decoder_output_lens)
|
||||
dur_loss = self.dur_loss(dur_output.unsqueeze(2), dur_target.unsqueeze(2), input_lens)
|
||||
loss = spec_loss_alpha * spec_loss + ssim_alpha * ssim_loss + dur_loss_alpha * dur_loss + mdn_alpha * mdn_loss
|
||||
return {'loss': loss, 'loss_l1': spec_loss, 'loss_ssim': ssim_loss, 'loss_dur': dur_loss, 'mdn_loss': mdn_loss}
|
||||
|
||||
@staticmethod
|
||||
def _set_alpha(step, alpha_settings):
|
||||
'''Set the loss alpha w.r.t. the number of training steps.
|
||||
If no schedule is set, return the given constant value.
|
||||
|
||||
Example:
|
||||
Setting an alpha schedule.
|
||||
if ```alpha_settings``` is ```[[0, 1], [10000, 0.1]]``` then ```return_alpha == 1``` until 10k steps, then set to 0.1.
|
||||
if ```alpha_settings``` is a constant value then ```return_alpha``` is set to that constant.
|
||||
|
||||
Args:
|
||||
step (int): number of training steps.
|
||||
alpha_settings (int or list): constant alpha value or a list defining the schedule as explained above.
|
||||
'''
|
||||
return_alpha = None
|
||||
if isinstance(alpha_settings, list):
|
||||
for key, alpha in alpha_settings:
|
||||
if key < step:
|
||||
return_alpha = alpha
|
||||
elif isinstance(alpha_settings, (float, int)):
|
||||
return_alpha = alpha_settings
|
||||
return return_alpha
|
||||
|
||||
def set_alphas(self, step):
|
||||
'''Set the alpha values for all the loss functions
|
||||
'''
|
||||
ssim_alpha = self._set_alpha(step, self.ssim_alpha)
|
||||
dur_loss_alpha = self._set_alpha(step, self.dur_loss_alpha)
|
||||
spec_loss_alpha = self._set_alpha(step, self.spec_loss_alpha)
|
||||
mdn_alpha = self._set_alpha(step, self.mdn_alpha)
|
||||
return ssim_alpha, dur_loss_alpha, spec_loss_alpha, mdn_alpha
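A small sketch (illustrative, not part of the diff) of how the schedule format described in `_set_alpha` resolves:

schedule = [[0, 1], [10000, 0.1]]
assert AlignTTSLoss._set_alpha(500, schedule) == 1      # before 10k steps
assert AlignTTSLoss._set_alpha(20000, schedule) == 0.1  # after 10k steps
assert AlignTTSLoss._set_alpha(20000, 0.5) == 0.5       # constant alphas pass through unchanged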
|
||||
|
|
|
@ -2,7 +2,7 @@ import torch
|
|||
from torch import nn
|
||||
from torch.nn import functional as F
|
||||
|
||||
from TTS.tts.layers.common_layers import Linear
|
||||
from TTS.tts.layers.tacotron.common_layers import Linear
|
||||
from scipy.stats import betabinom
|
||||
|
||||
|
|
@ -0,0 +1,323 @@
|
|||
import torch
|
||||
import torch.nn as nn
|
||||
from TTS.tts.layers.generic.pos_encoding import PositionalEncoding
|
||||
from TTS.tts.layers.feed_forward.duration_predictor import DurationPredictor
|
||||
from TTS.tts.layers.glow_tts.monotonic_align import generate_path, maximum_path
|
||||
from TTS.tts.utils.generic_utils import sequence_mask
|
||||
from TTS.tts.layers.align_tts.mdn import MDNBlock
|
||||
from TTS.tts.layers.feed_forward.encoder import Encoder
|
||||
from TTS.tts.layers.feed_forward.decoder import Decoder
|
||||
|
||||
|
||||
class AlignTTS(nn.Module):
|
||||
"""AlignTTS with modified duration predictor.
|
||||
https://arxiv.org/pdf/2003.01950.pdf
|
||||
|
||||
Encoder -> DurationPredictor -> Decoder
|
||||
|
||||
AlignTTS's Abstract - Targeting at both high efficiency and performance, we propose AlignTTS to predict the
|
||||
mel-spectrum in parallel. AlignTTS is based on a Feed-Forward Transformer which generates mel-spectrum from a
|
||||
sequence of characters, and the duration of each character is determined by a duration predictor. Instead of
|
||||
adopting the attention mechanism in Transformer TTS to align text to mel-spectrum, the alignment loss is presented
|
||||
to consider all possible alignments in training by use of dynamic programming. Experiments on the LJSpeech dataset
|
||||
show that our model achieves not only state-of-the-art performance which outperforms Transformer TTS by 0.03 in mean
|
||||
opinion score (MOS), but also a high efficiency which is more than 50 times faster than real-time.
|
||||
|
||||
Note:
|
||||
The original model uses a separate character embedding layer for the duration predictor. However, it causes the
|
||||
duration predictor to overfit and prevents learning higher level interactions among characters. Therefore,
|
||||
we predict durations based on encoder outputs which have higher level information about input characters. This
|
||||
enables training without phases as in the original paper.
|
||||
|
||||
The original model uses Transformers in the encoder and decoder layers. However, here you can set the architecture
|
||||
differently based on your requirements using ```encoder_type``` and ```decoder_type``` parameters.
|
||||
|
||||
Args:
|
||||
num_chars (int):
|
||||
number of unique input characters.
|
||||
out_channels (int):
|
||||
number of output tensor channels. It is equal to the expected spectrogram size.
|
||||
hidden_channels (int):
|
||||
number of channels in all the model layers.
|
||||
hidden_channels_ffn (int):
|
||||
number of channels in transformer's conv layers.
|
||||
hidden_channels_dp (int):
|
||||
number of channels in duration predictor network.
|
||||
num_heads (int):
|
||||
number of attention heads in transformer networks.
|
||||
num_transformer_layers (int):
|
||||
number of layers in encoder and decoder transformer blocks.
|
||||
dropout_p (float):
|
||||
dropout rate in transformer layers.
|
||||
length_scale (int, optional):
|
||||
coefficient to set the speech speed. Predicted durations are multiplied by it, so >1 gives slower and <1 gives faster speech. Defaults to 1.
|
||||
num_speakers (int, optional):
|
||||
number of speakers for multi-speaker training. Defaults to 0.
|
||||
external_c (bool, optional):
|
||||
enable external speaker embeddings. Defaults to False.
|
||||
c_in_channels (int, optional):
|
||||
number of channels in speaker embedding vectors. Defaults to 0.
|
||||
"""
|
||||
|
||||
# pylint: disable=dangerous-default-value
|
||||
|
||||
def __init__(
|
||||
self,
|
||||
num_chars,
|
||||
out_channels,
|
||||
hidden_channels=256,
|
||||
hidden_channels_dp=256,
|
||||
encoder_type='fftransformer',
|
||||
encoder_params={
|
||||
'hidden_channels_ffn': 1024,
|
||||
'num_heads': 2,
|
||||
'num_layers': 6,
|
||||
'dropout_p': 0.1
|
||||
},
|
||||
decoder_type='fftransformer',
|
||||
decoder_params={
|
||||
'hidden_channels_ffn': 1024,
|
||||
'num_heads': 2,
|
||||
'num_layers': 6,
|
||||
'dropout_p': 0.1
|
||||
},
|
||||
length_scale=1,
|
||||
num_speakers=0,
|
||||
external_c=False,
|
||||
c_in_channels=0):
|
||||
|
||||
super().__init__()
|
||||
self.length_scale = float(length_scale) if isinstance(
|
||||
length_scale, int) else length_scale
|
||||
self.emb = nn.Embedding(num_chars, hidden_channels)
|
||||
self.pos_encoder = PositionalEncoding(hidden_channels)
|
||||
self.encoder = Encoder(hidden_channels, hidden_channels, encoder_type,
|
||||
encoder_params, c_in_channels)
|
||||
self.decoder = Decoder(out_channels, hidden_channels, decoder_type,
|
||||
decoder_params)
|
||||
self.duration_predictor = DurationPredictor(hidden_channels_dp)
|
||||
|
||||
self.mod_layer = nn.Conv1d(hidden_channels, hidden_channels, 1)
|
||||
self.mdn_block = MDNBlock(hidden_channels, 2 * out_channels)
|
||||
|
||||
if num_speakers > 1 and not external_c:
|
||||
# speaker embedding layer
|
||||
self.emb_g = nn.Embedding(num_speakers, c_in_channels)
|
||||
nn.init.uniform_(self.emb_g.weight, -0.1, 0.1)
|
||||
|
||||
if c_in_channels > 0 and c_in_channels != hidden_channels:
|
||||
self.proj_g = nn.Conv1d(c_in_channels, hidden_channels, 1)
|
||||
|
||||
@staticmethod
|
||||
def compute_log_probs(mu, log_sigma, y):
|
||||
# pylint: disable=protected-access, c-extension-no-member
|
||||
y = y.transpose(1, 2).unsqueeze(1) # [B, 1, T1, D]
|
||||
mu = mu.transpose(1, 2).unsqueeze(2) # [B, T2, 1, D]
|
||||
log_sigma = log_sigma.transpose(1, 2).unsqueeze(2) # [B, T2, 1, D]
|
||||
expanded_y, expanded_mu = torch.broadcast_tensors(y, mu)
|
||||
exponential = -0.5 * torch.mean(torch._C._nn.mse_loss(
|
||||
expanded_y, expanded_mu, 0) / torch.pow(log_sigma.exp(), 2),
|
||||
dim=-1) # B, L, T
|
||||
logp = exponential - 0.5 * log_sigma.mean(dim=-1)
|
||||
return logp
|
||||
|
||||
def compute_align_path(self, mu, log_sigma, y, x_mask, y_mask):
|
||||
# find the max alignment path
|
||||
attn_mask = torch.unsqueeze(x_mask, -1) * torch.unsqueeze(y_mask, 2)
|
||||
log_p = self.compute_log_probs(mu, log_sigma, y)
|
||||
# [B, T_en, T_dec]
|
||||
attn = maximum_path(log_p, attn_mask.squeeze(1)).unsqueeze(1)
|
||||
dr_mas = torch.sum(attn, -1)
|
||||
return dr_mas.squeeze(1), log_p
|
||||
|
||||
@staticmethod
|
||||
def convert_dr_to_align(dr, x_mask, y_mask):
|
||||
attn_mask = torch.unsqueeze(x_mask, -1) * torch.unsqueeze(y_mask, 2)
|
||||
attn = generate_path(dr, attn_mask.squeeze(1)).to(dr.dtype)
|
||||
return attn
|
||||
|
||||
def expand_encoder_outputs(self, en, dr, x_mask, y_mask):
|
||||
"""Generate attention alignment map from durations and
|
||||
expand encoder outputs
|
||||
|
||||
Example:
|
||||
encoder output: [a,b,c,d]
|
||||
durations: [1, 3, 2, 1]
|
||||
|
||||
expanded: [a, b, b, b, c, c, d]
|
||||
attention map: [[0, 0, 0, 0, 0, 0, 1],
|
||||
[0, 0, 0, 0, 1, 1, 0],
|
||||
[0, 1, 1, 1, 0, 0, 0],
|
||||
[1, 0, 0, 0, 0, 0, 0]]
|
||||
"""
|
||||
attn = self.convert_dr_to_align(dr, x_mask, y_mask)
|
||||
o_en_ex = torch.matmul(
|
||||
attn.squeeze(1).transpose(1, 2), en.transpose(1,
|
||||
2)).transpose(1, 2)
|
||||
return o_en_ex, attn
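As an aside, the expansion illustrated in the docstring above can be reproduced with plain torch by repeating each encoder frame by its integer duration (illustrative sketch, not part of the diff):

import torch

en = torch.randn(1, 256, 4)                        # [B, C, T_en] encoder outputs
dr = torch.tensor([1, 3, 2, 1])                    # integer durations per input token
o_en_ex = torch.repeat_interleave(en, dr, dim=2)   # [B, C, 7], same expansion as the attention map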
|
||||
|
||||
def format_durations(self, o_dr_log, x_mask):
|
||||
o_dr = (torch.exp(o_dr_log) - 1) * x_mask * self.length_scale
|
||||
o_dr[o_dr < 1] = 1.0
|
||||
o_dr = torch.round(o_dr)
|
||||
return o_dr
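A quick worked example of the conversion above (not part of the diff): predicted log-durations are exponentiated, scaled, floored at one frame, and rounded.

import torch

o_dr_log = torch.tensor([[0.0, 1.2, 2.0]])             # predictor output, log domain
x_mask = torch.ones_like(o_dr_log)
o_dr = (torch.exp(o_dr_log) - 1) * x_mask * 1.0        # length_scale = 1.0
o_dr[o_dr < 1] = 1.0                                    # every token gets at least one frame
o_dr = torch.round(o_dr)                                # tensor([[1., 2., 6.]])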
|
||||
|
||||
@staticmethod
|
||||
def _concat_speaker_embedding(o_en, g):
|
||||
g_exp = g.expand(-1, -1, o_en.size(-1)) # [B, C, T_en]
|
||||
o_en = torch.cat([o_en, g_exp], 1)
|
||||
return o_en
|
||||
|
||||
def _sum_speaker_embedding(self, x, g):
|
||||
# project g to decoder dim.
|
||||
if hasattr(self, 'proj_g'):
|
||||
g = self.proj_g(g)
|
||||
return x + g
|
||||
|
||||
def _forward_encoder(self, x, x_lengths, g=None):
|
||||
if hasattr(self, 'emb_g'):
|
||||
g = nn.functional.normalize(self.emb_g(g)) # [B, C, 1]
|
||||
|
||||
if g is not None:
|
||||
g = g.unsqueeze(-1)
|
||||
|
||||
# [B, T, C]
|
||||
x_emb = self.emb(x)
|
||||
# [B, C, T]
|
||||
x_emb = torch.transpose(x_emb, 1, -1)
|
||||
|
||||
# compute sequence masks
|
||||
x_mask = torch.unsqueeze(sequence_mask(x_lengths, x.shape[1]),
|
||||
1).to(x.dtype)
|
||||
|
||||
# encoder pass
|
||||
o_en = self.encoder(x_emb, x_mask)
|
||||
|
||||
# speaker conditioning for duration predictor
|
||||
if g is not None:
|
||||
o_en_dp = self._concat_speaker_embedding(o_en, g)
|
||||
else:
|
||||
o_en_dp = o_en
|
||||
return o_en, o_en_dp, x_mask, g
|
||||
|
||||
def _forward_decoder(self, o_en, o_en_dp, dr, x_mask, y_lengths, g):
|
||||
y_mask = torch.unsqueeze(sequence_mask(y_lengths, None),
|
||||
1).to(o_en_dp.dtype)
|
||||
# expand o_en with durations
|
||||
o_en_ex, attn = self.expand_encoder_outputs(o_en, dr, x_mask, y_mask)
|
||||
# positional encoding
|
||||
if hasattr(self, 'pos_encoder'):
|
||||
o_en_ex = self.pos_encoder(o_en_ex, y_mask)
|
||||
# speaker embedding
|
||||
if g is not None:
|
||||
o_en_ex = self._sum_speaker_embedding(o_en_ex, g)
|
||||
# decoder pass
|
||||
o_de = self.decoder(o_en_ex, y_mask, g=g)
|
||||
return o_de, attn.transpose(1, 2)
|
||||
|
||||
def _forward_mdn(self, o_en, y, y_lengths, x_mask):
|
||||
# MAS potentials and alignment
|
||||
mu, log_sigma = self.mdn_block(o_en)
|
||||
y_mask = torch.unsqueeze(sequence_mask(y_lengths, None),
|
||||
1).to(o_en.dtype)
|
||||
dr_mas, logp = self.compute_align_path(mu, log_sigma, y, x_mask,
|
||||
y_mask)
|
||||
return dr_mas, mu, log_sigma, logp
|
||||
|
||||
def forward(self, x, x_lengths, y, y_lengths, phase=None, g=None): # pylint: disable=unused-argument
|
||||
"""
|
||||
Shapes:
|
||||
x: [B, T_max]
|
||||
x_lengths: [B]
|
||||
y_lengths: [B]
|
||||
dr: [B, T_max]
|
||||
g: [B, C]
|
||||
"""
|
||||
o_de, o_dr_log, dr_mas_log, attn, mu, log_sigma, logp = None, None, None, None, None, None, None
|
||||
if phase == 0:
|
||||
# train encoder and MDN
|
||||
o_en, o_en_dp, x_mask, g = self._forward_encoder(x, x_lengths, g)
|
||||
dr_mas, mu, log_sigma, logp = self._forward_mdn(
|
||||
o_en, y, y_lengths, x_mask)
|
||||
y_mask = torch.unsqueeze(sequence_mask(y_lengths, None),
|
||||
1).to(o_en_dp.dtype)
|
||||
attn = self.convert_dr_to_align(dr_mas, x_mask, y_mask)
|
||||
elif phase == 1:
|
||||
# train decoder
|
||||
o_en, o_en_dp, x_mask, g = self._forward_encoder(x, x_lengths, g)
|
||||
dr_mas, _, _, _ = self._forward_mdn(o_en, y, y_lengths, x_mask)
|
||||
o_de, attn = self._forward_decoder(o_en.detach(),
|
||||
o_en_dp.detach(),
|
||||
dr_mas.detach(),
|
||||
x_mask,
|
||||
y_lengths,
|
||||
g=g)
|
||||
elif phase == 2:
|
||||
# train the whole except duration predictor
|
||||
o_en, o_en_dp, x_mask, g = self._forward_encoder(x, x_lengths, g)
|
||||
dr_mas, mu, log_sigma, logp = self._forward_mdn(
|
||||
o_en, y, y_lengths, x_mask)
|
||||
o_de, attn = self._forward_decoder(o_en,
|
||||
o_en_dp,
|
||||
dr_mas,
|
||||
x_mask,
|
||||
y_lengths,
|
||||
g=g)
|
||||
elif phase == 3:
|
||||
# train duration predictor
|
||||
o_en, o_en_dp, x_mask, g = self._forward_encoder(x, x_lengths, g)
|
||||
o_dr_log = self.duration_predictor(o_en_dp.detach(), x_mask)
|
||||
dr_mas, mu, log_sigma, logp = self._forward_mdn(
|
||||
o_en, y, y_lengths, x_mask)
|
||||
o_de, attn = self._forward_decoder(o_en,
|
||||
o_en_dp,
|
||||
dr_mas,
|
||||
x_mask,
|
||||
y_lengths,
|
||||
g=g)
|
||||
o_dr_log = o_dr_log.squeeze(1)
|
||||
else:
|
||||
o_en, o_en_dp, x_mask, g = self._forward_encoder(x, x_lengths, g)
|
||||
o_dr_log = self.duration_predictor(o_en_dp.detach(), x_mask)
|
||||
dr_mas, mu, log_sigma, logp = self._forward_mdn(
|
||||
o_en, y, y_lengths, x_mask)
|
||||
o_de, attn = self._forward_decoder(o_en,
|
||||
o_en_dp,
|
||||
dr_mas,
|
||||
x_mask,
|
||||
y_lengths,
|
||||
g=g)
|
||||
o_dr_log = o_dr_log.squeeze(1)
|
||||
dr_mas_log = torch.log(dr_mas + 1).squeeze(1)
|
||||
return o_de, o_dr_log, dr_mas_log, attn, mu, log_sigma, logp
|
||||
|
||||
@torch.no_grad()
|
||||
def inference(self, x, x_lengths, g=None): # pylint: disable=unused-argument
|
||||
"""
|
||||
Shapes:
|
||||
x: [B, T_max]
|
||||
x_lengths: [B]
|
||||
g: [B, C]
|
||||
"""
|
||||
# pad input to prevent dropping the last word
|
||||
# x = torch.nn.functional.pad(x, pad=(0, 5), mode='constant', value=0)
|
||||
o_en, o_en_dp, x_mask, g = self._forward_encoder(x, x_lengths, g)
|
||||
# o_dr_log = self.duration_predictor(x, x_mask)
|
||||
o_dr_log = self.duration_predictor(o_en_dp, x_mask)
|
||||
# duration predictor pass
|
||||
o_dr = self.format_durations(o_dr_log, x_mask).squeeze(1)
|
||||
y_lengths = o_dr.sum(1)
|
||||
o_de, attn = self._forward_decoder(o_en,
|
||||
o_en_dp,
|
||||
o_dr,
|
||||
x_mask,
|
||||
y_lengths,
|
||||
g=g)
|
||||
return o_de, attn
|
||||
|
||||
def load_checkpoint(self, config, checkpoint_path, eval=False): # pylint: disable=unused-argument, redefined-builtin
|
||||
state = torch.load(checkpoint_path, map_location=torch.device('cpu'))
|
||||
self.load_state_dict(state['model'])
|
||||
if eval:
|
||||
self.eval()
|
||||
assert not self.training
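The forward pass above switches behaviour on the `phase` argument. As an illustrative sketch only (the helper name is an assumption; it mirrors the `phase_start_steps` entry in the AlignTTS test config further below), a training loop could resolve the phase like this:

def get_phase(global_step, phase_start_steps):
    # illustrative: pick the last phase whose start step has been reached
    phase = 0
    for idx, start_step in enumerate(phase_start_steps):
        if global_step >= start_step:
            phase = idx
    return phase

# with "phase_start_steps": [0, 40000, 80000, 160000, 170000]:
# step 50000 -> phase 1 (decoder only), step 200000 -> phase 4 (train everything)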
|
|
@ -9,7 +9,7 @@ from TTS.tts.utils.generic_utils import sequence_mask
|
|||
from TTS.tts.layers.glow_tts.monotonic_align import maximum_path, generate_path
|
||||
|
||||
|
||||
class GlowTts(nn.Module):
|
||||
class GlowTTS(nn.Module):
|
||||
"""Glow TTS models from https://arxiv.org/abs/2005.11129
|
||||
|
||||
Args:
|
||||
|
|
|
@ -1,8 +1,9 @@
|
|||
import torch
|
||||
from torch import nn
|
||||
from TTS.tts.layers.speedy_speech.decoder import Decoder
|
||||
from TTS.tts.layers.speedy_speech.duration_predictor import DurationPredictor
|
||||
from TTS.tts.layers.speedy_speech.encoder import Encoder, PositionalEncoding
|
||||
from TTS.tts.layers.feed_forward.decoder import Decoder
|
||||
from TTS.tts.layers.feed_forward.duration_predictor import DurationPredictor
|
||||
from TTS.tts.layers.feed_forward.encoder import Encoder
|
||||
from TTS.tts.layers.generic.pos_encoding import PositionalEncoding
|
||||
from TTS.tts.utils.generic_utils import sequence_mask
|
||||
from TTS.tts.layers.glow_tts.monotonic_align import generate_path
|
||||
|
||||
|
|
|
@ -2,8 +2,8 @@
|
|||
import torch
|
||||
from torch import nn
|
||||
|
||||
from TTS.tts.layers.gst_layers import GST
|
||||
from TTS.tts.layers.tacotron import Decoder, Encoder, PostCBHG
|
||||
from TTS.tts.layers.tacotron.gst_layers import GST
|
||||
from TTS.tts.layers.tacotron.tacotron import Decoder, Encoder, PostCBHG
|
||||
from TTS.tts.models.tacotron_abstract import TacotronAbstract
|
||||
|
||||
|
||||
|
|
|
@ -1,8 +1,8 @@
|
|||
import torch
|
||||
from torch import nn
|
||||
|
||||
from TTS.tts.layers.gst_layers import GST
|
||||
from TTS.tts.layers.tacotron2 import Decoder, Encoder, Postnet
|
||||
from TTS.tts.layers.tacotron.gst_layers import GST
|
||||
from TTS.tts.layers.tacotron.tacotron2 import Decoder, Encoder, Postnet
|
||||
from TTS.tts.models.tacotron_abstract import TacotronAbstract
|
||||
|
||||
# TODO: match function arguments with tacotron
|
||||
|
@ -17,7 +17,7 @@ class Tacotron2(TacotronAbstract):
|
|||
r (int): initial model reduction rate.
|
||||
postnet_output_dim (int, optional): postnet output channels. Defaults to 80.
|
||||
decoder_output_dim (int, optional): decoder output channels. Defaults to 80.
|
||||
attn_type (str, optional): attention type. Check ```TTS.tts.layers.common_layers.init_attn```. Defaults to 'original'.
|
||||
attn_type (str, optional): attention type. Check ```TTS.tts.layers.tacotron.common_layers.init_attn```. Defaults to 'original'.
|
||||
attn_win (bool, optional): enable/disable attention windowing.
|
||||
It is especially useful at inference to keep the attention alignment diagonal. Defaults to False.
|
||||
attn_norm (str, optional): Attention normalization method. "sigmoid" or "softmax". Defaults to "softmax".
|
||||
|
|
|
@ -149,8 +149,7 @@ class TacotronAbstract(ABC, nn.Module):
|
|||
def _backward_pass(self, mel_specs, encoder_outputs, mask):
|
||||
""" Run backwards decoder """
|
||||
decoder_outputs_b, alignments_b, _ = self.decoder_backward(
|
||||
encoder_outputs, torch.flip(mel_specs, dims=(1,)), mask,
|
||||
self.speaker_embeddings_projected)
|
||||
encoder_outputs, torch.flip(mel_specs, dims=(1,)), mask)
|
||||
decoder_outputs_b = decoder_outputs_b.transpose(1, 2).contiguous()
|
||||
return decoder_outputs_b, alignments_b
|
||||
|
||||
|
|
|
@ -1,7 +1,7 @@
|
|||
import tensorflow as tf
|
||||
from tensorflow import keras
|
||||
from TTS.tts.tf.utils.tf_utils import shape_list
|
||||
from TTS.tts.tf.layers.common_layers import Prenet, Attention
|
||||
from TTS.tts.tf.layers.tacotron.common_layers import Prenet, Attention
|
||||
|
||||
|
||||
# NOTE: linter has a problem with the current TF release
|
|
@ -1,7 +1,7 @@
|
|||
import tensorflow as tf
|
||||
from tensorflow import keras
|
||||
|
||||
from TTS.tts.tf.layers.tacotron2 import Encoder, Decoder, Postnet
|
||||
from TTS.tts.tf.layers.tacotron.tacotron2 import Encoder, Decoder, Postnet
|
||||
from TTS.tts.tf.utils.tf_utils import shape_list
|
||||
|
||||
|
||||
|
|
|
@ -41,7 +41,9 @@ def sequence_mask(sequence_length, max_len=None):
|
|||
|
||||
def to_camel(text):
|
||||
text = text.capitalize()
|
||||
return re.sub(r'(?!^)_([a-zA-Z])', lambda m: m.group(1).upper(), text)
|
||||
text = re.sub(r'(?!^)_([a-zA-Z])', lambda m: m.group(1).upper(), text)
|
||||
text = text.replace('Tts', 'TTS')
|
||||
return text
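For reference, a short sketch (not part of the diff) of what the updated helper now produces for model names used in this repo:

# to_camel("align_tts")     -> "AlignTTS"
# to_camel("speedy_speech") -> "SpeedySpeech"
# to_camel("glow_tts")      -> "GlowTTS"
# to_camel("tacotron2")     -> "Tacotron2"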
|
||||
|
||||
|
||||
def setup_model(num_chars, num_speakers, c, speaker_embedding_dim=None):
|
||||
|
@ -132,13 +134,23 @@ def setup_model(num_chars, num_speakers, c, speaker_embedding_dim=None):
|
|||
decoder_type=c['decoder_type'],
|
||||
decoder_params=c['decoder_params'],
|
||||
c_in_channels=0)
|
||||
elif c.model.lower() == "align_tts":
|
||||
model = MyModel(num_chars=num_chars + getattr(c, "add_blank", False),
|
||||
out_channels=c.audio['num_mels'],
|
||||
hidden_channels=c['hidden_channels'],
|
||||
hidden_channels_dp=c['hidden_channels_dp'],
|
||||
encoder_type=c['encoder_type'],
|
||||
encoder_params=c['encoder_params'],
|
||||
decoder_type=c['decoder_type'],
|
||||
decoder_params=c['decoder_params'],
|
||||
c_in_channels=0)
|
||||
return model
|
||||
|
||||
def is_tacotron(c):
|
||||
return not c['model'] in ['speedy_speech', 'glow_tts']
|
||||
return 'tacotron' in c['model'].lower()
|
||||
|
||||
def check_config_tts(c):
|
||||
check_argument('model', c, enum_list=['tacotron', 'tacotron2', 'glow_tts', 'speedy_speech'], restricted=True, val_type=str)
|
||||
check_argument('model', c, enum_list=['tacotron', 'tacotron2', 'glow_tts', 'speedy_speech', 'align_tts'], restricted=True, val_type=str)
|
||||
check_argument('run_name', c, restricted=True, val_type=str)
|
||||
check_argument('run_description', c, val_type=str)
|
||||
|
||||
|
@ -195,7 +207,7 @@ def check_config_tts(c):
|
|||
check_argument('decoder_ssim_alpha', c, restricted=True, val_type=float, min_val=0)
|
||||
check_argument('postnet_ssim_alpha', c, restricted=True, val_type=float, min_val=0)
|
||||
check_argument('ga_alpha', c, restricted=True, val_type=float, min_val=0)
|
||||
if c['model'].lower == "speedy_speech":
|
||||
if c['model'].lower() in ["speedy_speech", "align_tts"]:
|
||||
check_argument('ssim_alpha', c, restricted=True, val_type=float, min_val=0)
|
||||
check_argument('l1_alpha', c, restricted=True, val_type=float, min_val=0)
|
||||
check_argument('huber_alpha', c, restricted=True, val_type=float, min_val=0)
|
||||
|
@ -239,7 +251,7 @@ def check_config_tts(c):
|
|||
check_argument('separate_stopnet', c, restricted=is_tacotron(c), val_type=bool)
|
||||
|
||||
# Model Parameters for non-tacotron models
|
||||
if c['model'].lower == "speedy_speech":
|
||||
if c['model'].lower() in ["speedy_speech", "align_tts"]:
|
||||
check_argument('positional_encoding', c, restricted=True, val_type=type)
|
||||
check_argument('encoder_type', c, restricted=True, val_type=str)
|
||||
check_argument('encoder_params', c, restricted=True, val_type=dict)
|
||||
|
|
|
@ -77,7 +77,7 @@ def run_model_torch(model, inputs, CONFIG, truncated, speaker_id=None, style_mel
|
|||
# these only belong to tacotron models.
|
||||
decoder_output = None
|
||||
stop_tokens = None
|
||||
elif 'speedy_speech' in CONFIG.model.lower():
|
||||
elif CONFIG.model.lower() in ['speedy_speech', 'align_tts']:
|
||||
inputs_lengths = torch.tensor(inputs.shape[1:2]).to(inputs.device) # pylint: disable=not-callable
|
||||
if hasattr(model, 'module'):
|
||||
# distributed model
|
||||
|
@ -88,6 +88,8 @@ def run_model_torch(model, inputs, CONFIG, truncated, speaker_id=None, style_mel
|
|||
# these only belong to tacotron models.
|
||||
decoder_output = None
|
||||
stop_tokens = None
|
||||
else:
|
||||
raise ValueError('[!] Unknown model name.')
|
||||
return decoder_output, postnet_output, alignments, stop_tokens
|
||||
|
||||
|
||||
|
|
|
@ -24,7 +24,7 @@ abbreviations_en = [(re.compile('\\b%s\\.' % x[0], re.IGNORECASE), x[1])
|
|||
]]
|
||||
|
||||
# List of (regular expression, replacement) pairs for abbreviations in french:
|
||||
abbreviations_fr = [(re.compile('\\b%s\\.?' % x[0], re.IGNORECASE), x[1])
|
||||
abbreviations_fr = [(re.compile('\\b%s\\.' % x[0], re.IGNORECASE), x[1])
|
||||
for x in [
|
||||
('M', 'monsieur'),
|
||||
('Mlle', 'mademoiselle'),
|
||||
|
@ -58,4 +58,9 @@ abbreviations_fr = [(re.compile('\\b%s\\.?' % x[0], re.IGNORECASE), x[1])
|
|||
('ex', 'exemple'),
|
||||
('excl', 'exclusivement'),
|
||||
('boul', 'boulevard'),
|
||||
]] + [(re.compile('\\b%s' % x[0]), x[1]) for x in [
|
||||
('Mlle', 'mademoiselle'),
|
||||
('Mlles', 'mesdemoiselles'),
|
||||
('Mme', 'Madame'),
|
||||
('Mmes', 'Mesdames'),
|
||||
]]
|
||||
|
|
|
@ -94,6 +94,7 @@ def basic_turkish_cleaners(text):
|
|||
text = collapse_whitespace(text)
|
||||
return text
|
||||
|
||||
|
||||
def english_cleaners(text):
|
||||
'''Pipeline for English text, including number and abbreviation expansion.'''
|
||||
text = convert_to_ascii(text)
|
||||
|
@ -106,15 +107,17 @@ def english_cleaners(text):
|
|||
text = collapse_whitespace(text)
|
||||
return text
|
||||
|
||||
|
||||
def french_cleaners(text):
|
||||
'''Pipeline for French text. There is no need to expand numbers, phonemizer already does that'''
|
||||
text = lowercase(text)
|
||||
text = expand_abbreviations(text, lang='fr')
|
||||
text = lowercase(text)
|
||||
text = replace_symbols(text, lang='fr')
|
||||
text = remove_aux_symbols(text)
|
||||
text = collapse_whitespace(text)
|
||||
return text
|
||||
|
||||
|
||||
def portuguese_cleaners(text):
|
||||
'''Basic pipeline for Portuguese text. There is no need to expand abbreviation and
|
||||
numbers, phonemizer already does that'''
|
||||
|
@ -124,13 +127,13 @@ def portuguese_cleaners(text):
|
|||
text = collapse_whitespace(text)
|
||||
return text
|
||||
|
||||
|
||||
def chinese_mandarin_cleaners(text: str) -> str:
|
||||
'''Basic pipeline for chinese'''
|
||||
text = replace_numbers_to_characters_in_text(text)
|
||||
return text
|
||||
|
||||
|
||||
|
||||
def phoneme_cleaners(text):
|
||||
'''Pipeline for phonemes mode, including number and abbreviation expansion.'''
|
||||
text = expand_numbers(text)
|
||||
|
|
|
@ -7,7 +7,7 @@ import glob
|
|||
import os
|
||||
import re
|
||||
|
||||
from TTS.tts.utils.generic_utils import check_config_tts
|
||||
import torch
|
||||
from TTS.tts.utils.text.symbols import parse_symbols
|
||||
from TTS.utils.console_logger import ConsoleLogger
|
||||
from TTS.utils.generic_utils import create_experiment_folder, get_git_branch
|
||||
|
@ -104,7 +104,7 @@ def get_last_checkpoint(path):
|
|||
key_file_names = [fn for fn in file_names if key in fn]
|
||||
if last_model is None and len(key_file_names) > 0:
|
||||
last_model = max(key_file_names, key=os.path.getctime)
|
||||
last_model_num = os.path.getctime(last_model)
|
||||
last_model_num = torch.load(last_model)['step']
|
||||
|
||||
if last_model is not None:
|
||||
last_models[key] = last_model
|
||||
|
@ -125,19 +125,13 @@ def get_last_checkpoint(path):
|
|||
return last_models['checkpoint'], last_models['best_model']
|
||||
|
||||
|
||||
def process_args(args, model_type):
|
||||
"""Process parsed comand line arguments.
|
||||
def process_args(args, model_class):
|
||||
"""Process parsed comand line arguments based on model class (tts or vocoder).
|
||||
|
||||
Args:
|
||||
args (argparse.Namespace or dict like): Parsed input arguments.
|
||||
model_type (str): Model type used to check config parameters and setup
|
||||
the TensorBoard logger. One of:
|
||||
- tacotron
|
||||
- glow_tts
|
||||
- speedy_speech
|
||||
- gan
|
||||
- wavegrad
|
||||
- wavernn
|
||||
the TensorBoard logger. One of ['tts', 'vocoder'].
|
||||
|
||||
Raises:
|
||||
ValueError: If `model_type` is not one of implemented choices.
|
||||
|
@ -160,23 +154,9 @@ def process_args(args, model_type):
|
|||
|
||||
# setup output paths and read configs
|
||||
c = load_config(args.config_path)
|
||||
if model_type in "tacotron glow_tts speedy_speech":
|
||||
model_class = "TTS"
|
||||
elif model_type in "gan wavegrad wavernn":
|
||||
model_class = "VOCODER"
|
||||
else:
|
||||
raise ValueError("model type {model_type} not recognized!")
|
||||
|
||||
if model_class == "TTS":
|
||||
check_config_tts(c)
|
||||
elif model_class == "VOCODER":
|
||||
print("Vocoder config checker not implemented, skipping ...")
|
||||
else:
|
||||
raise ValueError(f"model type {model_type} not recognized!")
|
||||
|
||||
_ = os.path.dirname(os.path.realpath(__file__))
|
||||
|
||||
if model_type in "tacotron wavegrad wavernn" and c.mixed_precision:
|
||||
if 'mixed_precision' in c and c.mixed_precision:
|
||||
print(" > Mixed precision mode is ON")
|
||||
|
||||
out_path = args.continue_path
|
||||
|
@ -198,7 +178,7 @@ def process_args(args, model_type):
|
|||
# if model characters are not set in the config file
|
||||
# save the default set to the config file for future
|
||||
# compatibility.
|
||||
if model_class == 'TTS' and 'characters' not in c:
|
||||
if model_class == 'tts' and 'characters' not in c:
|
||||
used_characters = parse_symbols()
|
||||
new_fields['characters'] = used_characters
|
||||
copy_model_files(c, args.config_path,
|
||||
|
@ -208,7 +188,7 @@ def process_args(args, model_type):
|
|||
|
||||
log_path = out_path
|
||||
|
||||
tb_logger = TensorboardLogger(log_path, model_name=model_class)
|
||||
tb_logger = TensorboardLogger(log_path, model_name=model_class.upper())
|
||||
|
||||
# write model desc to tensorboard
|
||||
tb_logger.tb_add_text("model-description", c["run_description"], 0)
|
||||
|
|
|
@ -15,6 +15,8 @@ def get_git_branch():
|
|||
current.replace("* ", "")
|
||||
except subprocess.CalledProcessError:
|
||||
current = "inside_docker"
|
||||
except FileNotFoundError:
|
||||
current = "unknown"
|
||||
return current
|
||||
|
||||
|
||||
|
@ -30,7 +32,7 @@ def get_commit_hash():
|
|||
commit = subprocess.check_output(
|
||||
['git', 'rev-parse', '--short', 'HEAD']).decode().strip()
|
||||
# Not copying .git folder into docker container
|
||||
except subprocess.CalledProcessError:
|
||||
except (subprocess.CalledProcessError, FileNotFoundError):
|
||||
commit = "0000000"
|
||||
print(' > Git Hash: {}'.format(commit))
|
||||
return commit
|
||||
|
|
|
@ -69,7 +69,7 @@ def copy_model_files(c, config_file, out_path, new_fields):
|
|||
else:
|
||||
new_line = '"{}":{},\n'.format(key, json.dumps(value, ensure_ascii=False))
|
||||
config_lines.insert(1, new_line)
|
||||
config_out_file = open(copy_config_path, "w")
|
||||
config_out_file = open(copy_config_path, "w", encoding="utf-8")
|
||||
config_out_file.writelines(config_lines)
|
||||
config_out_file.close()
|
||||
# copy model stats file if available
|
||||
|
|
|
@ -43,7 +43,7 @@ class ModelManager(object):
|
|||
Args:
|
||||
file_path (str): path to .models.json.
|
||||
"""
|
||||
with open(file_path) as json_file:
|
||||
with open(file_path, "r", encoding="utf-8") as json_file:
|
||||
self.models_dict = json.load(json_file)
|
||||
|
||||
def list_langs(self):
|
||||
|
|
|
@ -12,7 +12,6 @@ from TTS.vocoder.utils.generic_utils import setup_generator, interpolate_vocoder
|
|||
# pylint: disable=unused-wildcard-import
|
||||
# pylint: disable=wildcard-import
|
||||
from TTS.tts.utils.synthesis import synthesis, trim_silence
|
||||
|
||||
from TTS.tts.utils.text import make_symbols, phonemes, symbols
|
||||
|
||||
|
||||
|
|
|
@ -1,4 +1,4 @@
|
|||
dependencies = ['torch', 'gdown', 'pysbd', 'phonemizer', 'unidecode'] # apt install espeak-ng
|
||||
dependencies = ['torch', 'gdown', 'pysbd', 'phonemizer', 'unidecode', 'pypinyin'] # apt install espeak-ng
|
||||
import torch
|
||||
|
||||
from TTS.utils.synthesizer import Synthesizer
|
||||
|
@ -9,7 +9,7 @@ def tts(model_name='tts_models/en/ljspeech/tacotron2-DCA', vocoder_name=None, us
|
|||
"""TTS entry point for PyTorch Hub that provides a Synthesizer object to synthesize speech from a give text.
|
||||
|
||||
Example:
|
||||
>>> synthesizer = torch.hub.load('mozilla/TTS', 'tts', source='github')
|
||||
>>> synthesizer = torch.hub.load('coqui-ai/TTS', 'tts', source='github')
|
||||
>>> wavs = synthesizer.tts("This is a test! This is also a test!!")
|
||||
wavs - is a list of values of the synthesized speech.
|
||||
|
||||
|
@ -33,5 +33,5 @@ def tts(model_name='tts_models/en/ljspeech/tacotron2-DCA', vocoder_name=None, us
|
|||
|
||||
|
||||
if __name__ == '__main__':
|
||||
synthesizer = torch.hub.load('mozilla/TTS:hub_conf', 'tts', source='github')
|
||||
synthesizer = torch.hub.load('coqui-ai/TTS:hub_conf', 'tts', source='github')
|
||||
synthesizer.tts("This is a test!")
|
||||
|
|
File diff suppressed because one or more lines are too long
|
@ -393,7 +393,7 @@
|
|||
"name": "python",
|
||||
"nbconvert_exporter": "python",
|
||||
"pygments_lexer": "ipython3",
|
||||
"version": "3.7.7"
|
||||
"version": "3.8.5"
|
||||
}
|
||||
},
|
||||
"nbformat": 4,
|
||||
|
|
|
@ -4,7 +4,7 @@
|
|||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"This notebook computes the average SNR a given Voice Dataset. If the SNR is too low, that might reduce the performance or prevent model to learn.\n",
|
||||
"This notebook computes the average SNR a given Voice Dataset. If the SNR is too low, that might reduce the performance or prevent model to learn. SNR paper can be seen here: https://www.cs.cmu.edu/~robust/Papers/KimSternIS08.pdf\n",
|
||||
"\n",
|
||||
"To use this notebook, you need:\n",
|
||||
"- WADA SNR estimation: http://www.cs.cmu.edu/~robust/archive/algorithms/WADA_SNR_IS_2008/\n",
|
||||
|
@ -18,12 +18,7 @@
|
|||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {
|
||||
"collapsed": true,
|
||||
"jupyter": {
|
||||
"outputs_hidden": true
|
||||
}
|
||||
},
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"import os, sys\n",
|
||||
|
@ -42,12 +37,7 @@
|
|||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {
|
||||
"collapsed": true,
|
||||
"jupyter": {
|
||||
"outputs_hidden": true
|
||||
}
|
||||
},
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"# Set the meta parameters\n",
|
||||
|
@ -60,10 +50,7 @@
|
|||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {
|
||||
"collapsed": true,
|
||||
"jupyter": {
|
||||
"outputs_hidden": true
|
||||
}
|
||||
"tags": []
|
||||
},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
|
@ -89,12 +76,7 @@
|
|||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {
|
||||
"collapsed": true,
|
||||
"jupyter": {
|
||||
"outputs_hidden": true
|
||||
}
|
||||
},
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"wav_file = \"/home/erogol/Data/LJSpeech-1.1/wavs/LJ001-0001.wav\"\n",
|
||||
|
@ -136,7 +118,7 @@
|
|||
"snrs = [tup[0] for tup in file_snrs]\n",
|
||||
"\n",
|
||||
"error_idxs = np.where(np.isnan(snrs) == True)[0]\n",
|
||||
"error_files = [file_names[idx] for idx in error_idxs]\n",
|
||||
"error_files = [wav_files[idx] for idx in error_idxs]\n",
|
||||
"\n",
|
||||
"file_snrs = [i for j, i in enumerate(file_snrs) if j not in error_idxs]\n",
|
||||
"file_names = [tup[1] for tup in file_snrs]\n",
|
||||
|
@ -150,12 +132,7 @@
|
|||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {
|
||||
"collapsed": true,
|
||||
"jupyter": {
|
||||
"outputs_hidden": true
|
||||
}
|
||||
},
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"def output_snr_with_audio(idx):\n",
|
||||
|
@ -205,12 +182,7 @@
|
|||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {
|
||||
"collapsed": true,
|
||||
"jupyter": {
|
||||
"outputs_hidden": true
|
||||
}
|
||||
},
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": []
|
||||
}
|
||||
|
@ -231,9 +203,9 @@
|
|||
"name": "python",
|
||||
"nbconvert_exporter": "python",
|
||||
"pygments_lexer": "ipython3",
|
||||
"version": "3.7.4"
|
||||
"version": "3.8.5"
|
||||
}
|
||||
},
|
||||
"nbformat": 4,
|
||||
"nbformat_minor": 4
|
||||
}
|
||||
}
|
File diff suppressed because one or more lines are too long
|
@ -243,9 +243,9 @@
|
|||
"name": "python",
|
||||
"nbconvert_exporter": "python",
|
||||
"pygments_lexer": "ipython3",
|
||||
"version": "3.6.9-final"
|
||||
"version": "3.8.5"
|
||||
}
|
||||
},
|
||||
"nbformat": 4,
|
||||
"nbformat_minor": 4
|
||||
}
|
||||
}
|
||||
|
|
|
@ -22,6 +22,6 @@ nose==1.3.7
|
|||
cardboardlint==1.3.0
|
||||
pylint==2.5.3
|
||||
gdown
|
||||
umap-learn
|
||||
umap-learn==0.4.6
|
||||
cython
|
||||
pyyaml
|
|
@ -1,18 +1,20 @@
|
|||
set -e
|
||||
TF_CPP_MIN_LOG_LEVEL=3
|
||||
|
||||
# tests
|
||||
# # tests
|
||||
nosetests tests -x &&\
|
||||
|
||||
# runtime tests
|
||||
./tests/test_demo_server.sh && \
|
||||
./tests/test_resample.sh && \
|
||||
./tests/test_tacotron_train.sh && \
|
||||
./tests/test_glow-tts_train.sh && \
|
||||
./tests/test_vocoder_gan_train.sh && \
|
||||
./tests/test_vocoder_wavernn_train.sh && \
|
||||
./tests/test_vocoder_wavegrad_train.sh && \
|
||||
./tests/test_speedy_speech_train.sh && \
|
||||
./tests/test_aligntts_train.sh && \
|
||||
./tests/test_compute_statistics.sh && \
|
||||
|
||||
# linter check
|
||||
cardboardlinter --refspec main
|
||||
cardboardlinter --refspec main
|
||||
|
|
2
setup.py
|
@ -19,7 +19,7 @@ if LooseVersion(sys.version) < LooseVersion("3.6") or LooseVersion(sys.version)
|
|||
)
|
||||
|
||||
|
||||
version = '0.0.10.3'
|
||||
version = '0.0.11'
|
||||
cwd = os.path.dirname(os.path.abspath(__file__))
|
||||
|
||||
class build_py(setuptools.command.build_py.build_py): # pylint: disable=too-many-ancestors
|
||||
|
|
|
@ -0,0 +1,157 @@
|
|||
{
|
||||
"model": "align_tts",
|
||||
"run_name": "test_sample_dataset_run",
|
||||
"run_description": "sample dataset test run",
|
||||
|
||||
// AUDIO PARAMETERS
|
||||
"audio":{
|
||||
// stft parameters
|
||||
"fft_size": 1024, // number of stft frequency levels. Size of the linear spectogram frame.
|
||||
"win_length": 1024, // stft window length in ms.
|
||||
"hop_length": 256, // stft window hop-lengh in ms.
|
||||
"frame_length_ms": null, // stft window length in ms.If null, 'win_length' is used.
|
||||
"frame_shift_ms": null, // stft window hop-lengh in ms. If null, 'hop_length' is used.
|
||||
|
||||
// Audio processing parameters
|
||||
"sample_rate": 22050, // DATASET-RELATED: wav sample-rate.
|
||||
"preemphasis": 0.0, // pre-emphasis to reduce spec noise and make it more structured. If 0.0, no -pre-emphasis.
|
||||
"ref_level_db": 20, // reference level db, theoretically 20db is the sound of air.
|
||||
|
||||
// Silence trimming
|
||||
"do_trim_silence": true,// enable trimming of slience of audio as you load it. LJspeech (true), TWEB (false), Nancy (true)
|
||||
"trim_db": 60, // threshold for timming silence. Set this according to your dataset.
|
||||
|
||||
// Griffin-Lim
|
||||
"power": 1.5, // value to sharpen wav signals after GL algorithm.
|
||||
"griffin_lim_iters": 60,// #griffin-lim iterations. 30-60 is a good range. Larger the value, slower the generation.
|
||||
|
||||
// MelSpectrogram parameters
|
||||
"num_mels": 80, // size of the mel spec frame.
|
||||
"mel_fmin": 50.0, // minimum freq level for mel-spec. ~50 for male and ~95 for female voices. Tune for dataset!!
|
||||
"mel_fmax": 7600.0, // maximum freq level for mel-spec. Tune for dataset!!
|
||||
"spec_gain": 1,
|
||||
|
||||
// Normalization parameters
|
||||
"signal_norm": true, // normalize spec values. Mean-Var normalization if 'stats_path' is defined otherwise range normalization defined by the other params.
|
||||
"min_level_db": -100, // lower bound for normalization
|
||||
"symmetric_norm": true, // move normalization to range [-1, 1]
|
||||
"max_norm": 4.0, // scale normalization to range [-max_norm, max_norm] or [0, max_norm]
|
||||
"clip_norm": true, // clip normalized values into the range.
|
||||
"stats_path": null // DO NOT USE WITH MULTI_SPEAKER MODEL. scaler stats file computed by 'compute_statistics.py'. If it is defined, mean-std based notmalization is used and other normalization params are ignored
|
||||
},
|
||||
|
||||
// VOCABULARY PARAMETERS
|
||||
// if custom character set is not defined,
|
||||
// default set in symbols.py is used
|
||||
// "characters":{
|
||||
// "pad": "_",
|
||||
// "eos": "&",
|
||||
// "bos": "*",
|
||||
// "characters": "ABCDEFGHIJKLMNOPQRSTUVWXYZÇÃÀÁÂÊÉÍÓÔÕÚÛabcdefghijklmnopqrstuvwxyzçãàáâêéíóôõúû!(),-.:;? ",
|
||||
// "punctuations":"!'(),-.:;? ",
|
||||
// "phonemes":"iyɨʉɯuɪʏʊeøɘəɵɤoɛœɜɞʌɔæɐaɶɑɒᵻʘɓǀɗǃʄǂɠǁʛpbtdʈɖcɟkɡqɢʔɴŋɲɳnɱmʙrʀⱱɾɽɸβfvθðszʃʒʂʐçʝxɣχʁħʕhɦɬɮʋɹɻjɰlɭʎʟˈˌːˑʍwɥʜʢʡɕʑɺɧɚ˞ɫ'̃' "
|
||||
// },
|
||||
|
||||
"add_blank": false, // if true add a new token after each token of the sentence. This increases the size of the input sequence, but has considerably improved the prosody of the GlowTTS model.
|
||||
|
||||
// DISTRIBUTED TRAINING
|
||||
"distributed":{
|
||||
"backend": "nccl",
|
||||
"url": "tcp:\/\/localhost:54321"
|
||||
},
|
||||
|
||||
"reinit_layers": [], // give a list of layer names to restore from the given checkpoint. If not defined, it reloads all heuristically matching layers.
|
||||
|
||||
// MODEL PARAMETERS
|
||||
"positional_encoding": true,
|
||||
"hidden_channels": 256,
|
||||
"encoder_type": "fftransformer",
|
||||
"encoder_params":{
|
||||
"hidden_channels_ffn": 1024 ,
|
||||
"num_heads": 2,
|
||||
"num_layers": 6,
|
||||
"dropout_p": 0.1
|
||||
},
|
||||
"decoder_type": "fftransformer",
|
||||
"decoder_params":{
|
||||
"hidden_channels_ffn": 1024 ,
|
||||
"num_heads": 2,
|
||||
"num_layers": 6,
|
||||
"dropout_p": 0.1
|
||||
},
|
||||
|
||||
|
||||
// TRAINING
|
||||
"batch_size":2, // Batch size for training. Lower values than 32 might cause hard to learn attention. It is overwritten by 'gradual_training'.
|
||||
"eval_batch_size":1,
|
||||
"r": 1, // Number of decoder frames to predict per iteration. Set the initial values if gradual training is enabled.
|
||||
"loss_masking": true, // enable / disable loss masking against the sequence padding.
|
||||
"phase_start_steps": [0, 40000, 80000, 160000, 170000],
|
||||
|
||||
|
||||
// LOSS PARAMETERS
|
||||
"ssim_alpha": 1,
|
||||
"spec_loss_alpha": 1,
|
||||
"dur_loss_alpha": 1,
|
||||
"mdn_alpha": 1,
|
||||
|
||||
// VALIDATION
|
||||
"run_eval": true,
|
||||
"test_delay_epochs": -1, //Until attention is aligned, testing only wastes computation time.
|
||||
"test_sentences_file": null, // set a file to load sentences to be used for testing. If it is null then we use default english sentences.
|
||||
|
||||
// OPTIMIZER
|
||||
"noam_schedule": true, // use noam warmup and lr schedule.
|
||||
"grad_clip": 1.0, // upper limit for gradients for clipping.
|
||||
"epochs": 1, // total number of epochs to train.
|
||||
"lr": 0.002, // Initial learning rate. If Noam decay is active, maximum learning rate.
|
||||
"warmup_steps": 4000, // Noam decay steps to increase the learning rate from 0 to "lr"
|
||||
|
||||
// TENSORBOARD and LOGGING
|
||||
"print_step": 1, // Number of steps to log training on console.
|
||||
"tb_plot_step": 100, // Number of steps to plot TB training figures.
|
||||
"print_eval": false, // If True, it prints intermediate loss values in evalulation.
|
||||
"save_step": 5000, // Number of training steps expected to save traninpg stats and checkpoints.
|
||||
"checkpoint": true, // If true, it saves checkpoints per "save_step"
|
||||
"keep_all_best": true, // If true, keeps all best_models after keep_after steps
|
||||
"keep_after": 10000, // Global step after which to keep best models if keep_all_best is true
|
||||
"tb_model_param_stats": false, // true, plots param stats per layer on tensorboard. Might be memory consuming, but good for debugging.:set n
|
||||
"mixed_precision": false,
|
||||
|
||||
// DATA LOADING
|
||||
"text_cleaner": "english_cleaners",
|
||||
"enable_eos_bos_chars": false, // enable/disable beginning of sentence and end of sentence chars.
|
||||
"num_loader_workers": 0, // number of training data loader processes. Don't set it too big. 4-8 are good values.
|
||||
"num_val_loader_workers": 0, // number of evaluation data loader processes.
|
||||
"batch_group_size": 0, //Number of batches to shuffle after bucketing.
|
||||
"min_seq_len": 2, // DATASET-RELATED: minimum text length to use in training
|
||||
"max_seq_len": 300, // DATASET-RELATED: maximum text length
|
||||
"compute_f0": false, // compute f0 values in data-loader
|
||||
"compute_input_seq_cache": false, // if true, text sequences are computed before starting training. If phonemes are enabled, they are also computed at this stage.
|
||||
|
||||
// PATHS
|
||||
"output_path": "tests/train_outputs/",
|
||||
|
||||
// PHONEMES
|
||||
"phoneme_cache_path": "tests/train_outputs/phoneme_cache/", // phoneme computation is slow, therefore, it caches results in the given folder.
|
||||
"use_phonemes": false, // use phonemes instead of raw characters. It is suggested for better pronoun[ciation.
|
||||
"phoneme_language": "en-us", // depending on your target language, pick one from https://github.com/bootphon/phonemizer#languages
|
||||
|
||||
// MULTI-SPEAKER and GST
|
||||
"use_speaker_embedding": false, // use speaker embedding to enable multi-speaker learning.
|
||||
"use_external_speaker_embedding_file": false, // if true, forces the model to use external embedding per sample instead of nn.embeddings, that is, it supports external embeddings such as those used at: https://arxiv.org/abs /1806.04558
|
||||
"external_speaker_embedding_file": "/home/erogol/Data/libritts/speakers.json", // if not null and use_external_speaker_embedding_file is true, it is used to load a specific embedding file and thus uses these embeddings instead of nn.embeddings, that is, it supports external embeddings such as those used at: https://arxiv.org/abs /1806.04558
|
||||
|
||||
|
||||
// DATASETS
|
||||
"datasets": // List of datasets. They all merged and they get different speaker_ids.
|
||||
[
|
||||
{
|
||||
"name": "ljspeech",
|
||||
"path": "tests/data/ljspeech/",
|
||||
"meta_file_train": "metadata.csv",
|
||||
"meta_file_val": "metadata.csv",
|
||||
"meta_file_attn_mask": null
|
||||
}
|
||||
]
|
||||
}
|
|
@ -0,0 +1,177 @@
|
|||
{
|
||||
"model": "Tacotron",
|
||||
"run_name": "test_sample_dataset_run",
|
||||
"run_description": "sample dataset test run",
|
||||
|
||||
// AUDIO PARAMETERS
|
||||
"audio":{
|
||||
// stft parameters
|
||||
"fft_size": 1024, // number of stft frequency levels. Size of the linear spectogram frame.
|
||||
"win_length": 1024, // stft window length in ms.
|
||||
"hop_length": 256, // stft window hop-lengh in ms.
|
||||
"frame_length_ms": null, // stft window length in ms.If null, 'win_length' is used.
|
||||
"frame_shift_ms": null, // stft window hop-lengh in ms. If null, 'hop_length' is used.
|
||||
|
||||
// Audio processing parameters
|
||||
"sample_rate": 22050, // DATASET-RELATED: wav sample-rate.
|
||||
"preemphasis": 0.0, // pre-emphasis to reduce spec noise and make it more structured. If 0.0, no -pre-emphasis.
|
||||
"ref_level_db": 20, // reference level db, theoretically 20db is the sound of air.
|
||||
|
||||
// Silence trimming
|
||||
"do_trim_silence": true,// enable trimming of slience of audio as you load it. LJspeech (true), TWEB (false), Nancy (true)
|
||||
"trim_db": 60, // threshold for timming silence. Set this according to your dataset.
|
||||
|
||||
// Griffin-Lim
|
||||
"power": 1.5, // value to sharpen wav signals after GL algorithm.
|
||||
"griffin_lim_iters": 60,// #griffin-lim iterations. 30-60 is a good range. Larger the value, slower the generation.
|
||||
|
||||
// MelSpectrogram parameters
|
||||
"num_mels": 80, // size of the mel spec frame.
|
||||
"mel_fmin": 0.0, // minimum freq level for mel-spec. ~50 for male and ~95 for female voices. Tune for dataset!!
|
||||
"mel_fmax": 8000.0, // maximum freq level for mel-spec. Tune for dataset!!
|
||||
"spec_gain": 20.0,
|
||||
|
||||
// Normalization parameters
|
||||
"signal_norm": true, // normalize spec values. Mean-Var normalization if 'stats_path' is defined otherwise range normalization defined by the other params.
|
||||
"min_level_db": -100, // lower bound for normalization
|
||||
"symmetric_norm": true, // move normalization to range [-1, 1]
|
||||
"max_norm": 4.0, // scale normalization to range [-max_norm, max_norm] or [0, max_norm]
|
||||
"clip_norm": true, // clip normalized values into the range.
|
||||
"stats_path": null // DO NOT USE WITH MULTI_SPEAKER MODEL. scaler stats file computed by 'compute_statistics.py'. If it is defined, mean-std based notmalization is used and other normalization params are ignored
|
||||
},
|
||||
|
||||
// VOCABULARY PARAMETERS
|
||||
// if custom character set is not defined,
|
||||
// default set in symbols.py is used
|
||||
// "characters":{
|
||||
// "pad": "_",
|
||||
// "eos": "~",
|
||||
// "bos": "^",
|
||||
// "characters": "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz!'(),-.:;? ",
|
||||
// "punctuations":"!'(),-.:;? ",
|
||||
// "phonemes":"iyɨʉɯuɪʏʊeøɘəɵɤoɛœɜɞʌɔæɐaɶɑɒᵻʘɓǀɗǃʄǂɠǁʛpbtdʈɖcɟkɡqɢʔɴŋɲɳnɱmʙrʀⱱɾɽɸβfvθðszʃʒʂʐçʝxɣχʁħʕhɦɬɮʋɹɻjɰlɭʎʟˈˌːˑʍwɥʜʢʡɕʑɺɧɚ˞ɫ"
|
||||
// },
|
||||
|
||||
// DISTRIBUTED TRAINING
|
||||
"distributed":{
|
||||
"backend": "nccl",
|
||||
"url": "tcp:\/\/localhost:54321"
|
||||
},
|
||||
|
||||
"reinit_layers": [], // give a list of layer names to restore from the given checkpoint. If not defined, it reloads all heuristically matching layers.
|
||||
|
||||
// TRAINING
|
||||
"batch_size": 1, // Batch size for training. Lower values than 32 might cause hard to learn attention. It is overwritten by 'gradual_training'.
|
||||
"eval_batch_size":1,
|
||||
"r": 7, // Number of decoder frames to predict per iteration. Set the initial values if gradual training is enabled.
|
||||
"gradual_training": [[0, 7, 4]], //set gradual training steps [first_step, r, batch_size]. If it is null, gradual training is disabled. For Tacotron, you might need to reduce the 'batch_size' as you proceeed.
|
||||
"loss_masking": true, // enable / disable loss masking against the sequence padding.
|
||||
"ga_alpha": 10.0, // weight for guided attention loss. If > 0, guided attention is enabled.
|
||||
"mixed_precision": false,
|
||||
|
||||
// VALIDATION
|
||||
"run_eval": true,
|
||||
"test_delay_epochs": 0, //Until attention is aligned, testing only wastes computation time.
|
||||
"test_sentences_file": null, // set a file to load sentences to be used for testing. If it is null then we use default english sentences.
|
||||
|
||||
// LOSS SETTINGS
|
||||
"loss_masking": true, // enable / disable loss masking against the sequence padding.
|
||||
"decoder_loss_alpha": 0.5, // original decoder loss weight. If > 0, it is enabled
|
||||
"postnet_loss_alpha": 0.25, // original postnet loss weight. If > 0, it is enabled
|
||||
"postnet_diff_spec_alpha": 0.25, // differential spectral loss weight. If > 0, it is enabled
|
||||
"decoder_diff_spec_alpha": 0.25, // differential spectral loss weight. If > 0, it is enabled
|
||||
"decoder_ssim_alpha": 0.5, // decoder ssim loss weight. If > 0, it is enabled
|
||||
"postnet_ssim_alpha": 0.25, // postnet ssim loss weight. If > 0, it is enabled
|
||||
"ga_alpha": 5.0, // weight for guided attention loss. If > 0, guided attention is enabled.
|
||||
"stopnet_pos_weight": 15.0, // pos class weight for stopnet loss since there are way more negative samples than positive samples.
|
||||
|
||||
// OPTIMIZER
|
||||
"noam_schedule": false, // use noam warmup and lr schedule.
|
||||
"grad_clip": 1.0, // upper limit for gradients for clipping.
|
||||
"epochs": 1, // total number of epochs to train.
|
||||
"lr": 0.0001, // Initial learning rate. If Noam decay is active, maximum learning rate.
|
||||
"wd": 0.000001, // Weight decay weight.
|
||||
"warmup_steps": 4000, // Noam decay steps to increase the learning rate from 0 to "lr"
|
||||
"seq_len_norm": false, // Normalize eash sample loss with its length to alleviate imbalanced datasets. Use it if your dataset is small or has skewed distribution of sequence lengths.
|
||||
|
||||
// TACOTRON PRENET
|
||||
"memory_size": -1, // ONLY TACOTRON - size of the memory queue used fro storing last decoder predictions for auto-regression. If < 0, memory queue is disabled and decoder only uses the last prediction frame.
|
||||
"prenet_type": "bn", // "original" or "bn".
|
||||
"prenet_dropout": false, // enable/disable dropout at prenet.
|
||||
|
||||
// TACOTRON ATTENTION
|
||||
"attention_type": "original", // 'original' , 'graves', 'dynamic_convolution'
|
||||
"attention_heads": 4, // number of attention heads (only for 'graves')
|
||||
"attention_norm": "sigmoid", // softmax or sigmoid.
|
||||
"windowing": false, // Enables attention windowing. Used only in eval mode.
|
||||
"use_forward_attn": false, // if it uses forward attention. In general, it aligns faster.
|
||||
"forward_attn_mask": false, // Additional masking forcing monotonicity only in eval mode.
|
||||
"transition_agent": false, // enable/disable transition agent of forward attention.
|
||||
"location_attn": true, // enable_disable location sensitive attention. It is enabled for TACOTRON by default.
|
||||
"bidirectional_decoder": true, // use https://arxiv.org/abs/1907.09006. Use it, if attention does not work well with your dataset.
|
||||
"double_decoder_consistency": false, // use DDC explained here https://erogol.com/solving-attention-problems-of-tts-models-with-double-decoder-consistency-draft/
|
||||
"ddc_r": 7, // reduction rate for coarse decoder.
|
||||
|
||||
// STOPNET
|
||||
"stopnet": true, // Train stopnet predicting the end of synthesis.
|
||||
"separate_stopnet": true, // Train stopnet seperately if 'stopnet==true'. It prevents stopnet loss to influence the rest of the model. It causes a better model, but it trains SLOWER.
|
||||
|
||||
// TENSORBOARD and LOGGING
|
||||
"print_step": 1, // Number of steps to log training on console.
|
||||
"tb_plot_step": 100, // Number of steps to plot TB training figures.
|
||||
"print_eval": false, // If True, it prints intermediate loss values in evalulation.
|
||||
"save_step": 10000, // Number of training steps expected to save traninpg stats and checkpoints.
|
||||
"checkpoint": true, // If true, it saves checkpoints per "save_step"
|
||||
"keep_all_best": true, // If true, keeps all best_models after keep_after steps
|
||||
"keep_after": 10000, // Global step after which to keep best models if keep_all_best is true
|
||||
"tb_model_param_stats": false, // true, plots param stats per layer on tensorboard. Might be memory consuming, but good for debugging.
|
||||
|
||||
// DATA LOADING
|
||||
"text_cleaner": "phoneme_cleaners",
|
||||
"enable_eos_bos_chars": false, // enable/disable beginning of sentence and end of sentence chars.
|
||||
"num_loader_workers": 0, // number of training data loader processes. Don't set it too big. 4-8 are good values.
|
||||
"num_val_loader_workers": 0, // number of evaluation data loader processes.
|
||||
"batch_group_size": 0, //Number of batches to shuffle after bucketing.
|
||||
"min_seq_len": 6, // DATASET-RELATED: minimum text length to use in training
|
||||
"max_seq_len": 153, // DATASET-RELATED: maximum text length
|
||||
"compute_input_seq_cache": true,
|
||||
|
||||
// PATHS
|
||||
"output_path": "tests/train_outputs/",
|
||||
|
||||
// PHONEMES
|
||||
"phoneme_cache_path": "tests/train_outputs/phoneme_cache/", // phoneme computation is slow, therefore, it caches results in the given folder.
|
||||
"use_phonemes": true, // use phonemes instead of raw characters. It is suggested for better pronounciation.
|
||||
"phoneme_language": "en-us", // depending on your target language, pick one from https://github.com/bootphon/phonemizer#languages
|
||||
|
||||
// MULTI-SPEAKER and GST
|
||||
"use_external_speaker_embedding_file": false,
|
||||
"external_speaker_embedding_file": null,
|
||||
"use_speaker_embedding": false, // use speaker embedding to enable multi-speaker learning.
|
||||
"use_gst": true, // use global style tokens
|
||||
"gst": { // gst parameter if gst is enabled
|
||||
"gst_style_input": null, // Condition the style input either on a
|
||||
// -> wave file [path to wave] or
|
||||
// -> dictionary using the style tokens {'token1': 'value', 'token2': 'value'} example {"0": 0.15, "1": 0.15, "5": -0.15}
|
||||
// with the dictionary being len(dict) == len(gst_style_tokens).
|
||||
"gst_use_speaker_embedding": true, // if true pass speaker embedding in attention input GST.
|
||||
"gst_embedding_dim": 512,
|
||||
"gst_num_heads": 4,
|
||||
"gst_style_tokens": 10
|
||||
},
|
||||
|
||||
// DATASETS
|
||||
"train_portion": 0.1, // dataset portion used for training. It is mainly for internal experiments.
|
||||
"eval_portion": 0.1, // dataset portion used for training. It is mainly for internal experiments.
|
||||
"datasets": // List of datasets. They all merged and they get different speaker_ids.
|
||||
[
|
||||
{
|
||||
"name": "ljspeech",
|
||||
"path": "tests/data/ljspeech/",
|
||||
"meta_file_train": "metadata.csv",
|
||||
"meta_file_val": "metadata.csv"
|
||||
}
|
||||
]
|
||||
|
||||
}
|
||||
|
|
@ -0,0 +1,177 @@
{
    "model": "Tacotron",
    "run_name": "test_sample_dataset_run",
    "run_description": "sample dataset test run",

    // AUDIO PARAMETERS
    "audio": {
        // stft parameters
        "fft_size": 1024, // number of stft frequency levels. Size of the linear spectrogram frame.
        "win_length": 1024, // stft window length in samples.
        "hop_length": 256, // stft window hop length in samples.
        "frame_length_ms": null, // stft window length in ms. If null, 'win_length' is used.
        "frame_shift_ms": null, // stft window hop length in ms. If null, 'hop_length' is used.

        // Audio processing parameters
        "sample_rate": 22050, // DATASET-RELATED: wav sample rate.
        "preemphasis": 0.0, // pre-emphasis to reduce spec noise and make it more structured. If 0.0, no pre-emphasis.
        "ref_level_db": 20, // reference level db, theoretically 20 db is the sound of air.

        // Silence trimming
        "do_trim_silence": true, // enable trimming of silence as audio is loaded. LJSpeech (true), TWEB (false), Nancy (true)
        "trim_db": 60, // threshold for trimming silence. Set this according to your dataset.

        // Griffin-Lim
        "power": 1.5, // value to sharpen wav signals after the GL algorithm.
        "griffin_lim_iters": 60, // number of Griffin-Lim iterations. 30-60 is a good range. The larger the value, the slower the generation.

        // MelSpectrogram parameters
        "num_mels": 80, // size of the mel spec frame.
        "mel_fmin": 0.0, // minimum freq level for mel-spec. ~50 for male and ~95 for female voices. Tune for dataset!!
        "mel_fmax": 8000.0, // maximum freq level for mel-spec. Tune for dataset!!
        "spec_gain": 20.0,

        // Normalization parameters
        "signal_norm": true, // normalize spec values. Mean-Var normalization if 'stats_path' is defined, otherwise range normalization defined by the other params.
        "min_level_db": -100, // lower bound for normalization
        "symmetric_norm": true, // move normalization to range [-1, 1]
        "max_norm": 4.0, // scale normalization to range [-max_norm, max_norm] or [0, max_norm]
        "clip_norm": true, // clip normalized values into the range.
        "stats_path": null // DO NOT USE WITH MULTI_SPEAKER MODEL. scaler stats file computed by 'compute_statistics.py'. If it is defined, mean-std based normalization is used and other normalization params are ignored.
    },

    // VOCABULARY PARAMETERS
    // if custom character set is not defined,
    // default set in symbols.py is used
    // "characters":{
    //     "pad": "_",
    //     "eos": "~",
    //     "bos": "^",
    //     "characters": "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz!'(),-.:;? ",
    //     "punctuations":"!'(),-.:;? ",
    //     "phonemes":"iyɨʉɯuɪʏʊeøɘəɵɤoɛœɜɞʌɔæɐaɶɑɒᵻʘɓǀɗǃʄǂɠǁʛpbtdʈɖcɟkɡqɢʔɴŋɲɳnɱmʙrʀⱱɾɽɸβfvθðszʃʒʂʐçʝxɣχʁħʕhɦɬɮʋɹɻjɰlɭʎʟˈˌːˑʍwɥʜʢʡɕʑɺɧɚ˞ɫ"
    // },

    // DISTRIBUTED TRAINING
    "distributed": {
        "backend": "nccl",
        "url": "tcp:\/\/localhost:54321"
    },

    "reinit_layers": [], // give a list of layer names to restore from the given checkpoint. If not defined, it reloads all heuristically matching layers.

    // TRAINING
    "batch_size": 1, // Batch size for training. Lower values than 32 might cause hard to learn attention. It is overwritten by 'gradual_training'.
    "eval_batch_size": 1,
    "r": 7, // Number of decoder frames to predict per iteration. Set the initial values if gradual training is enabled.
    "gradual_training": [[0, 7, 4]], // set gradual training steps [first_step, r, batch_size]. If it is null, gradual training is disabled. For Tacotron, you might need to reduce the 'batch_size' as you proceed.
    "loss_masking": true, // enable / disable loss masking against the sequence padding.
    "ga_alpha": 10.0, // weight for guided attention loss. If > 0, guided attention is enabled.
    "mixed_precision": false,

    // VALIDATION
    "run_eval": true,
    "test_delay_epochs": 0, // Until attention is aligned, testing only wastes computation time.
    "test_sentences_file": null, // set a file to load sentences to be used for testing. If it is null, the default English sentences are used.

    // LOSS SETTINGS
    "loss_masking": true, // enable / disable loss masking against the sequence padding.
    "decoder_loss_alpha": 0.5, // original decoder loss weight. If > 0, it is enabled
    "postnet_loss_alpha": 0.25, // original postnet loss weight. If > 0, it is enabled
    "postnet_diff_spec_alpha": 0.25, // differential spectral loss weight. If > 0, it is enabled
    "decoder_diff_spec_alpha": 0.25, // differential spectral loss weight. If > 0, it is enabled
    "decoder_ssim_alpha": 0.5, // decoder ssim loss weight. If > 0, it is enabled
    "postnet_ssim_alpha": 0.25, // postnet ssim loss weight. If > 0, it is enabled
    "ga_alpha": 5.0, // weight for guided attention loss. If > 0, guided attention is enabled.
    "stopnet_pos_weight": 15.0, // pos class weight for stopnet loss since there are way more negative samples than positive samples.

    // OPTIMIZER
    "noam_schedule": false, // use noam warmup and lr schedule.
    "grad_clip": 1.0, // upper limit for gradients for clipping.
    "epochs": 1, // total number of epochs to train.
    "lr": 0.0001, // Initial learning rate. If Noam decay is active, maximum learning rate.
    "wd": 0.000001, // Weight decay weight.
    "warmup_steps": 4000, // Noam decay steps to increase the learning rate from 0 to "lr"
    "seq_len_norm": false, // Normalize each sample loss with its length to alleviate imbalanced datasets. Use it if your dataset is small or has a skewed distribution of sequence lengths.

    // TACOTRON PRENET
    "memory_size": -1, // ONLY TACOTRON - size of the memory queue used for storing the last decoder predictions for auto-regression. If < 0, the memory queue is disabled and the decoder only uses the last prediction frame.
    "prenet_type": "bn", // "original" or "bn".
    "prenet_dropout": false, // enable/disable dropout at prenet.

    // TACOTRON ATTENTION
    "attention_type": "original", // 'original', 'graves', 'dynamic_convolution'
    "attention_heads": 4, // number of attention heads (only for 'graves')
    "attention_norm": "sigmoid", // softmax or sigmoid.
    "windowing": false, // Enables attention windowing. Used only in eval mode.
    "use_forward_attn": false, // if it uses forward attention. In general, it aligns faster.
    "forward_attn_mask": false, // Additional masking forcing monotonicity only in eval mode.
    "transition_agent": false, // enable/disable transition agent of forward attention.
    "location_attn": true, // enable/disable location sensitive attention. It is enabled for TACOTRON by default.
    "bidirectional_decoder": false, // use https://arxiv.org/abs/1907.09006. Use it, if attention does not work well with your dataset.
    "double_decoder_consistency": true, // use DDC explained here https://erogol.com/solving-attention-problems-of-tts-models-with-double-decoder-consistency-draft/
    "ddc_r": 7, // reduction rate for the coarse decoder.

    // STOPNET
    "stopnet": true, // Train stopnet predicting the end of synthesis.
    "separate_stopnet": true, // Train stopnet separately if 'stopnet==true'. It prevents the stopnet loss from influencing the rest of the model. It gives a better model, but it trains SLOWER.

    // TENSORBOARD and LOGGING
    "print_step": 1, // Number of steps to log training on console.
    "tb_plot_step": 100, // Number of steps to plot TB training figures.
    "print_eval": false, // If True, it prints intermediate loss values in evaluation.
    "save_step": 10000, // Number of training steps between saving training stats and checkpoints.
    "checkpoint": true, // If true, it saves checkpoints per "save_step".
    "keep_all_best": true, // If true, keeps all best models after "keep_after" steps.
    "keep_after": 10000, // Global step after which to keep best models if "keep_all_best" is true.
    "tb_model_param_stats": false, // If true, plots param stats per layer on tensorboard. Might be memory consuming, but good for debugging.

    // DATA LOADING
    "text_cleaner": "phoneme_cleaners",
    "enable_eos_bos_chars": false, // enable/disable beginning of sentence and end of sentence chars.
    "num_loader_workers": 0, // number of training data loader processes. Don't set it too big. 4-8 are good values.
    "num_val_loader_workers": 0, // number of evaluation data loader processes.
    "batch_group_size": 0, // Number of batches to shuffle after bucketing.
    "min_seq_len": 6, // DATASET-RELATED: minimum text length to use in training.
    "max_seq_len": 153, // DATASET-RELATED: maximum text length.
    "compute_input_seq_cache": true,

    // PATHS
    "output_path": "tests/train_outputs/",

    // PHONEMES
    "phoneme_cache_path": "tests/train_outputs/phoneme_cache/", // phoneme computation is slow, therefore it caches results in the given folder.
    "use_phonemes": true, // use phonemes instead of raw characters. It is suggested for better pronunciation.
    "phoneme_language": "en-us", // depending on your target language, pick one from https://github.com/bootphon/phonemizer#languages

    // MULTI-SPEAKER and GST
    "use_external_speaker_embedding_file": false,
    "external_speaker_embedding_file": null,
    "use_speaker_embedding": false, // use speaker embedding to enable multi-speaker learning.
    "use_gst": true, // use global style tokens
    "gst": { // gst parameters, used if gst is enabled
        "gst_style_input": null, // Condition the style input either on a
                                 // -> wave file [path to wave] or
                                 // -> dictionary using the style tokens {'token1': 'value', 'token2': 'value'}, e.g. {"0": 0.15, "1": 0.15, "5": -0.15},
                                 // with the dictionary being len(dict) == len(gst_style_tokens).
        "gst_use_speaker_embedding": true, // if true, pass the speaker embedding to the GST attention input.
        "gst_embedding_dim": 512,
        "gst_num_heads": 4,
        "gst_style_tokens": 10
    },

    // DATASETS
    "train_portion": 0.1, // dataset portion used for training. It is mainly for internal experiments.
    "eval_portion": 0.1, // dataset portion used for evaluation. It is mainly for internal experiments.
    "datasets": // List of datasets. They are all merged and they get different speaker_ids.
        [
            {
                "name": "ljspeech",
                "path": "tests/data/ljspeech/",
                "meta_file_train": "metadata.csv",
                "meta_file_val": "metadata.csv"
            }
        ]

}

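A minimal sketch of how a test config like the one above is typically consumed (not part of the diff; the config path below is illustrative, and the snippet only uses the load_config and AudioProcessor helpers already imported elsewhere in these tests):

    from TTS.utils.io import load_config
    from TTS.utils.audio import AudioProcessor

    # load the commented-JSON config shown above (illustrative path)
    c = load_config("tests/inputs/test_tacotron_config.json")
    # build the audio front-end from the "audio" block (fft_size, hop_length, sample_rate, ...)
    ap = AudioProcessor(**c.audio)
    print(c.model, c.batch_size, ap.sample_rate)
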
@ -0,0 +1,13 @@
#!/usr/bin/env bash
set -xe
BASEDIR=$(dirname "$0")
echo "$BASEDIR"
# run training
CUDA_VISIBLE_DEVICES="" python TTS/bin/train_align_tts.py --config_path $BASEDIR/inputs/test_align_tts.json
# find the training folder
LATEST_FOLDER=$(ls $BASEDIR/train_outputs/| sort | tail -1)
echo $LATEST_FOLDER
# continue the previous training
CUDA_VISIBLE_DEVICES="" python TTS/bin/train_align_tts.py --continue_path $BASEDIR/train_outputs/$LATEST_FOLDER
# remove all the outputs
rm -rf $BASEDIR/train_outputs/

@ -0,0 +1,106 @@
import torch

from TTS.tts.layers.feed_forward.decoder import Decoder
from TTS.tts.layers.feed_forward.encoder import Encoder
from TTS.tts.utils.generic_utils import sequence_mask

device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")


def test_encoder():
    input_dummy = torch.rand(8, 14, 37).to(device)
    input_lengths = torch.randint(31, 37, (8, )).long().to(device)
    input_lengths[-1] = 37
    input_mask = torch.unsqueeze(
        sequence_mask(input_lengths, input_dummy.size(2)), 1).to(device)
    # relative positional transformer encoder
    layer = Encoder(out_channels=11,
                    in_hidden_channels=14,
                    encoder_type='relative_position_transformer',
                    encoder_params={
                        'hidden_channels_ffn': 768,
                        'num_heads': 2,
                        "kernel_size": 3,
                        "dropout_p": 0.1,
                        "num_layers": 6,
                        "rel_attn_window_size": 4,
                        "input_length": None
                    }).to(device)
    output = layer(input_dummy, input_mask)
    assert list(output.shape) == [8, 11, 37]
    # residual conv bn encoder
    layer = Encoder(out_channels=11,
                    in_hidden_channels=14,
                    encoder_type='residual_conv_bn',
                    encoder_params={
                        "kernel_size": 4,
                        "dilations": 4 * [1, 2, 4] + [1],
                        "num_conv_blocks": 2,
                        "num_res_blocks": 13
                    }).to(device)
    output = layer(input_dummy, input_mask)
    assert list(output.shape) == [8, 11, 37]
    # FFTransformer encoder
    layer = Encoder(out_channels=14,
                    in_hidden_channels=14,
                    encoder_type='fftransformer',
                    encoder_params={
                        "hidden_channels_ffn": 31,
                        "num_heads": 2,
                        "num_layers": 2,
                        "dropout_p": 0.1
                    }).to(device)
    output = layer(input_dummy, input_mask)
    assert list(output.shape) == [8, 14, 37]


def test_decoder():
    input_dummy = torch.rand(8, 128, 37).to(device)
    input_lengths = torch.randint(31, 37, (8, )).long().to(device)
    input_lengths[-1] = 37

    input_mask = torch.unsqueeze(
        sequence_mask(input_lengths, input_dummy.size(2)), 1).to(device)
    # residual bn conv decoder
    layer = Decoder(out_channels=11, in_hidden_channels=128).to(device)
    output = layer(input_dummy, input_mask)
    assert list(output.shape) == [8, 11, 37]
    # relative positional transformer decoder
    layer = Decoder(out_channels=11,
                    in_hidden_channels=128,
                    decoder_type='relative_position_transformer',
                    decoder_params={
                        'hidden_channels_ffn': 128,
                        'num_heads': 2,
                        "kernel_size": 3,
                        "dropout_p": 0.1,
                        "num_layers": 8,
                        "rel_attn_window_size": 4,
                        "input_length": None
                    }).to(device)
    output = layer(input_dummy, input_mask)
    assert list(output.shape) == [8, 11, 37]
    # wavenet decoder
    layer = Decoder(out_channels=11,
                    in_hidden_channels=128,
                    decoder_type='wavenet',
                    decoder_params={
                        "num_blocks": 12,
                        "hidden_channels": 192,
                        "kernel_size": 5,
                        "dilation_rate": 1,
                        "num_layers": 4,
                        "dropout_p": 0.05
                    }).to(device)
    output = layer(input_dummy, input_mask)
    assert list(output.shape) == [8, 11, 37]
    # FFTransformer decoder
    layer = Decoder(out_channels=11,
                    in_hidden_channels=128,
                    decoder_type='fftransformer',
                    decoder_params={
                        'hidden_channels_ffn': 31,
                        'num_heads': 2,
                        "dropout_p": 0.1,
                        "num_layers": 2,
                    }).to(device)
    output = layer(input_dummy, input_mask)
    assert list(output.shape) == [8, 11, 37]

@ -7,7 +7,7 @@ from tests import get_tests_input_path
from torch import optim

from TTS.tts.layers.losses import GlowTTSLoss
from TTS.tts.models.glow_tts import GlowTts
from TTS.tts.models.glow_tts import GlowTTS
from TTS.utils.io import load_config
from TTS.utils.audio import AudioProcessor

@ -35,14 +35,13 @@ class GlowTTSTrainTest(unittest.TestCase):
        input_lengths = torch.randint(100, 129, (8, )).long().to(device)
        input_lengths[-1] = 128
        mel_spec = torch.rand(8, c.audio['num_mels'], 30).to(device)
        linear_spec = torch.rand(8, 30, c.audio['fft_size']).to(device)
        mel_lengths = torch.randint(20, 30, (8, )).long().to(device)
        speaker_ids = torch.randint(0, 5, (8, )).long().to(device)

        criterion = criterion = GlowTTSLoss()
        criterion = GlowTTSLoss()

        # model to train
        model = GlowTts(
        model = GlowTTS(
            num_chars=32,
            hidden_channels_enc=48,
            hidden_channels_dec=48,

@ -60,7 +59,7 @@ class GlowTTSTrainTest(unittest.TestCase):
            use_encoder_prenet=True,
            num_flow_blocks_dec=12,
            kernel_size_dec=5,
            dilation_rate=5,
            dilation_rate=1,
            num_block_layers=4,
            dropout_p_dec=0.,
            num_speakers=0,

@ -71,7 +70,7 @@ class GlowTTSTrainTest(unittest.TestCase):
            mean_only=False).to(device)

        # reference model to compare model weights
        model_ref = GlowTts(
        model_ref = GlowTTS(
            num_chars=32,
            hidden_channels_enc=48,
            hidden_channels_dec=48,

@ -89,7 +88,7 @@ class GlowTTSTrainTest(unittest.TestCase):
            use_encoder_prenet=True,
            num_flow_blocks_dec=12,
            kernel_size_dec=5,
            dilation_rate=5,
            dilation_rate=1,
            num_block_layers=4,
            dropout_p_dec=0.,
            num_speakers=0,

@ -112,11 +111,11 @@ class GlowTTSTrainTest(unittest.TestCase):
            assert (param - param_ref).sum() == 0, param
            count += 1

        optimizer = optim.Adam(model.parameters(), lr=c.lr)
        optimizer = optim.Adam(model.parameters(), lr=0.001)
        for _ in range(5):
            optimizer.zero_grad()
            z, logdet, y_mean, y_log_scale, alignments, o_dur_log, o_total_dur = model.forward(
                input_dummy, input_lengths, mel_spec, mel_lengths, None)
            optimizer.zero_grad()
            loss_dict = criterion(z, y_mean, y_log_scale, logdet, mel_lengths,
                                  o_dur_log, o_total_dur, input_lengths)
            loss = loss_dict['loss']

@ -1,7 +1,7 @@
import unittest
import torch as T

from TTS.tts.layers.tacotron import Prenet, CBHG, Decoder, Encoder
from TTS.tts.layers.tacotron.tacotron import Prenet, CBHG, Decoder, Encoder
from TTS.tts.layers.losses import L1LossMasked, SSIMLoss
from TTS.tts.utils.generic_utils import sequence_mask

@ -1,20 +1,20 @@
#!/usr/bin/env python3
import os
import shutil
import glob
from tests import get_tests_output_path
from TTS.utils.manage import ModelManager
# #!/usr/bin/env python3
# import os
# import shutil
# import glob
# from tests import get_tests_output_path
# from TTS.utils.manage import ModelManager


def test_if_all_models_available():
    """Check if all the models are downloadable."""
    print(" > Checking the availability of all the models under the ModelManager.")
    manager = ModelManager(output_prefix=get_tests_output_path())
    model_names = manager.list_models()
    for model_name in model_names:
        manager.download_model(model_name)
        print(f" | > OK: {model_name}")
# def test_if_all_models_available():
#     """Check if all the models are downloadable."""
#     print(" > Checking the availability of all the models under the ModelManager.")
#     manager = ModelManager(output_prefix=get_tests_output_path())
#     model_names = manager.list_models()
#     for model_name in model_names:
#         manager.download_model(model_name)
#         print(f" | > OK: {model_name}")

    folders = glob.glob(os.path.join(manager.output_prefix, '*'))
    assert len(folders) == len(model_names)
    shutil.rmtree(manager.output_prefix)
# folders = glob.glob(os.path.join(manager.output_prefix, '*'))
# assert len(folders) == len(model_names)
# shutil.rmtree(manager.output_prefix)

@ -0,0 +1,16 @@
#!/usr/bin/env bash
set -xe
BASEDIR=$(dirname "$0")
TARGET_SR=16000
echo "$BASEDIR"
# run the resample script
python TTS/bin/resample.py --input_dir $BASEDIR/data/ljspeech --output_dir $BASEDIR/outputs/resample_tests --output_sr $TARGET_SR
# check the sample rate of the output
OUT_SR=$( (echo "import librosa" ; echo "y, sr = librosa.load('"$BASEDIR"/outputs/resample_tests/wavs/LJ001-0012.wav', sr=None)" ; echo "print(sr)") | python )
OUT_SR=$(($OUT_SR + 0))
if [[ $OUT_SR -ne $TARGET_SR ]]; then
    echo "Mismatch between target and output sample rates"
    exit 1
fi
# clean up
rm -rf $BASEDIR/outputs/resample_tests

@ -1,8 +1,5 @@
import torch

from TTS.tts.layers.speedy_speech.encoder import Encoder
from TTS.tts.layers.speedy_speech.decoder import Decoder
from TTS.tts.layers.speedy_speech.duration_predictor import DurationPredictor
from TTS.tts.layers.feed_forward.duration_predictor import DurationPredictor
from TTS.tts.utils.generic_utils import sequence_mask
from TTS.tts.models.speedy_speech import SpeedySpeech

@ -11,84 +8,6 @@ use_cuda = torch.cuda.is_available()
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")


def test_encoder():
    input_dummy = torch.rand(8, 14, 37).to(device)
    input_lengths = torch.randint(31, 37, (8, )).long().to(device)
    input_lengths[-1] = 37
    input_mask = torch.unsqueeze(
        sequence_mask(input_lengths, input_dummy.size(2)), 1).to(device)

    # residual bn conv encoder
    layer = Encoder(out_channels=11,
                    in_hidden_channels=14,
                    encoder_type='residual_conv_bn').to(device)
    output = layer(input_dummy, input_mask)
    assert list(output.shape) == [8, 11, 37]

    # transformer encoder
    layer = Encoder(out_channels=11,
                    in_hidden_channels=14,
                    encoder_type='transformer',
                    encoder_params={
                        'hidden_channels_ffn': 768,
                        'num_heads': 2,
                        "kernel_size": 3,
                        "dropout_p": 0.1,
                        "num_layers": 6,
                        "rel_attn_window_size": 4,
                        "input_length": None
                    }).to(device)
    output = layer(input_dummy, input_mask)
    assert list(output.shape) == [8, 11, 37]


def test_decoder():
    input_dummy = torch.rand(8, 128, 37).to(device)
    input_lengths = torch.randint(31, 37, (8, )).long().to(device)
    input_lengths[-1] = 37

    input_mask = torch.unsqueeze(
        sequence_mask(input_lengths, input_dummy.size(2)), 1).to(device)

    # residual bn conv decoder
    layer = Decoder(out_channels=11, in_hidden_channels=128).to(device)
    output = layer(input_dummy, input_mask)
    assert list(output.shape) == [8, 11, 37]

    # transformer decoder
    layer = Decoder(out_channels=11,
                    in_hidden_channels=128,
                    decoder_type='transformer',
                    decoder_params={
                        'hidden_channels_ffn': 128,
                        'num_heads': 2,
                        "kernel_size": 3,
                        "dropout_p": 0.1,
                        "num_layers": 8,
                        "rel_attn_window_size": 4,
                        "input_length": None
                    }).to(device)
    output = layer(input_dummy, input_mask)
    assert list(output.shape) == [8, 11, 37]


    # wavenet decoder
    layer = Decoder(out_channels=11,
                    in_hidden_channels=128,
                    decoder_type='wavenet',
                    decoder_params={
                        "num_blocks": 12,
                        "hidden_channels": 192,
                        "kernel_size": 5,
                        "dilation_rate": 1,
                        "num_layers": 4,
                        "dropout_p": 0.05
                    }).to(device)
    output = layer(input_dummy, input_mask)
    assert list(output.shape) == [8, 11, 37]



def test_duration_predictor():
    input_dummy = torch.rand(8, 128, 27).to(device)
    input_lengths = torch.randint(20, 27, (8, )).long().to(device)

@ -2,8 +2,9 @@
set -xe
BASEDIR=$(dirname "$0")
echo "$BASEDIR"

# run training
CUDA_VISIBLE_DEVICES="" python TTS/bin/train_tacotron.py --config_path $BASEDIR/inputs/test_train_config.json
CUDA_VISIBLE_DEVICES="" python TTS/bin/train_tacotron.py --config_path $BASEDIR/inputs/test_tacotron_config.json
# find the training folder
LATEST_FOLDER=$(ls $BASEDIR/train_outputs/| sort | tail -1)
echo $LATEST_FOLDER

@ -11,3 +12,25 @@ echo $LATEST_FOLDER
CUDA_VISIBLE_DEVICES="" python TTS/bin/train_tacotron.py --continue_path $BASEDIR/train_outputs/$LATEST_FOLDER
# remove all the outputs
rm -rf $BASEDIR/train_outputs/

# run Tacotron bi-directional decoder
CUDA_VISIBLE_DEVICES="" python TTS/bin/train_tacotron.py --config_path $BASEDIR/inputs/test_tacotron_bd_config.json
# find the training folder
LATEST_FOLDER=$(ls $BASEDIR/train_outputs/| sort | tail -1)
echo $LATEST_FOLDER
# continue the previous training
CUDA_VISIBLE_DEVICES="" python TTS/bin/train_tacotron.py --continue_path $BASEDIR/train_outputs/$LATEST_FOLDER
# remove all the outputs
rm -rf $BASEDIR/train_outputs/

# Tacotron2
# run training
CUDA_VISIBLE_DEVICES="" python TTS/bin/train_tacotron.py --config_path $BASEDIR/inputs/test_tacotron2_config.json
# find the training folder
LATEST_FOLDER=$(ls $BASEDIR/train_outputs/| sort | tail -1)
echo $LATEST_FOLDER
# continue the previous training
CUDA_VISIBLE_DEVICES="" python TTS/bin/train_tacotron.py --continue_path $BASEDIR/train_outputs/$LATEST_FOLDER
# remove all the outputs
rm -rf $BASEDIR/train_outputs/

@ -4,7 +4,7 @@ import torch
import soundfile as sf
from librosa.core import load

from tests import get_tests_path, get_tests_input_path
from tests import get_tests_path, get_tests_input_path, get_tests_output_path
from TTS.vocoder.layers.pqmf import PQMF


@ -24,4 +24,5 @@ def test_pqmf():
    print(w2_.max())
    print(w2_.min())
    print(w2_.mean())
    sf.write('pqmf_output.wav', w2_.flatten().detach(), sr)
    sf.write(os.path.join(get_tests_output_path(), 'pqmf_output.wav'),
             w2_.flatten().detach(), sr)

@ -4,7 +4,7 @@ import tensorflow as tf
import soundfile as sf
from librosa.core import load

from tests import get_tests_path, get_tests_input_path
from tests import get_tests_path, get_tests_input_path, get_tests_output_path
from TTS.vocoder.tf.layers.pqmf import PQMF


@ -25,4 +25,5 @@ def test_pqmf():
    print(w2_.max())
    print(w2_.min())
    print(w2_.mean())
    sf.write('tf_pqmf_output.wav', w2_.flatten(), sr)
    sf.write(os.path.join(get_tests_output_path(), 'tf_pqmf_output.wav'),
             w2_.flatten(), sr)
