Merge branch 'dev'

pull/10/head
root 2021-01-13 10:09:07 +00:00
commit d320ccf6c6
102 changed files with 17285 additions and 1028 deletions
.circleci/config.yml

@ -0,0 +1,57 @@
version: 2
workflows:
version: 2
test:
jobs:
- test-3.6
- test-3.7
- test-3.8
jobs:
test-3.6: &test-template
docker:
- image: circleci/python:3.6
resource_class: large
working_directory: ~/repo
steps:
- checkout
- run: |
sudo apt update
sudo apt install espeak git
# so we can take advantage of pyproject.toml build-dependency support
- run: python3 -m pip install --upgrade pip
- run: python3 -m pip install numpy Cython
- run: |
python3 setup.py egg_info
python3 -m pip install -e .
- run: |
python3 -m pip install --quiet --upgrade cardboardlint pylint
cardboardlinter --refspec ${CIRCLE_BRANCH} -n auto
- run: nosetests tests --nocapture
- run: |
./tests/test_server_package.sh
./tests/test_glow-tts_train.sh
./tests/test_tacotron_train.sh
./tests/test_vocoder_gan_train.sh
./tests/test_vocoder_wavegrad_train.sh
./tests/test_vocoder_wavernn_train.sh
./tests/test_speedy_speech_train.sh
test-3.7:
<<: *test-template
docker:
- image: circleci/python:3.7
test-3.8:
<<: *test-template
docker:
- image: circleci/python:3.8

.travis.yml

@ -1,32 +0,0 @@
language: python
git:
quiet: true
before_install:
- sudo apt-get update
- sudo apt-get -y install espeak
- python -m pip install --upgrade pip
- pip install six==1.12.0
- pip install --upgrade cython
matrix:
include:
- name: "Lint check"
python: "3.6"
install: pip install --quiet --upgrade cardboardlint pylint
env: TEST_SUITE="lint"
- name: "Unit tests"
python: "3.6"
install:
- python setup.py egg_info
- pip install -e .
env: TEST_SUITE="unittest"
- name: "Unit tests"
python: "3.6"
install:
- python setup.py egg_info
- pip install -e .
env: TEST_SUITE="testscripts"
script: ./.travis/script

.travis/script

@ -1,22 +0,0 @@
#!/bin/bash
set -ex
git remote set-branches --add origin $TRAVIS_BRANCH
git fetch
if [[ ( "$TRAVIS_PULL_REQUEST" != "false" ) && ( "$TEST_SUITE" == "lint" ) ]]; then
# Run cardboardlinter, in case of pull requests
cardboardlinter --refspec origin/$TRAVIS_BRANCH -n auto
fi
if [[ "$TEST_SUITE" == "unittest" ]]; then
nosetests tests --nocapture
./tests/test_server_package.sh
fi
if [[ "$TEST_SUITE" == "testscripts" ]]; then
# test model training scripts
./tests/test_tts_train.sh
./tests/test_vocoder_gan_train.sh
./tests/test_vocoder_wavernn_train.sh
fi

README.md

@ -1,32 +1,63 @@
<p align="center"><img src="https://user-images.githubusercontent.com/1402048/52643646-c2102980-2edd-11e9-8c37-b72f3c89a640.png" data-canonical-src="![TTS banner](https://user-images.githubusercontent.com/1402048/52643646-c2102980-2edd-11e9-8c37-b72f3c89a640.png =250x250)
" width="320" height="95" /></p>
<img src="https://user-images.githubusercontent.com/1402048/104139991-3fd15e00-53af-11eb-8640-3a78a64641dd.png" data-canonical-src="![TTS banner](https://user-images.githubusercontent.com/1402048/104139991-3fd15e00-53af-11eb-8640-3a78a64641dd.png =250x250)
" width="256" height="256" align="right" />
<br/>
# TTS: Text-to-Speech for all.
<p align='center'>
<img src='https://circleci.com/gh/mozilla/TTS/tree/dev.svg?style=svg' alt="mozilla"/>
<a href='https://discourse.mozilla.org/c/tts'><img src="https://img.shields.io/badge/discourse-online-green.svg"/></a>
<a href='https://opensource.org/licenses/MPL-2.0'> <img src="https://img.shields.io/badge/License-MPL%202.0-brightgreen.svg"/></a>
</p>
TTS is a library for advanced Text-to-Speech generation. It's built on the latest research and designed to achieve the best trade-off between ease of training, speed and quality.
TTS comes with [pretrained models](https://github.com/mozilla/TTS/wiki/Released-Models), tools for measuring dataset quality, and is already used in **20+ languages** for products and research projects.
<br/>
[![CircleCI](<https://circleci.com/gh/mozilla/TTS/tree/dev.svg?style=svg>)]()
[![License](<https://img.shields.io/badge/License-MPL%202.0-brightgreen.svg>)](https://opensource.org/licenses/MPL-2.0)
TTS is a deep learning based Text2Speech project, low in cost and high in quality.
:loudspeaker: [English Voice Samples](https://erogol.github.io/ddc-samples/) and [SoundCloud playlist](https://soundcloud.com/user-565970875/pocket-article-wavernn-and-tacotron2)
English Voice Samples: https://erogol.github.io/ddc-samples/
:man_cook: [TTS training recipes](https://github.com/erogol/TTS_recipes)
TTS training recipes: https://github.com/erogol/TTS_recipes
:page_facing_up: [Text-to-Speech paper collection](https://github.com/erogol/TTS-papers)
TTS paper collection: https://github.com/erogol/TTS-papers
## 💬 Where to ask questions
Please use our dedicated channels for questions and discussion. Help is much more valuable if it's shared publicly, so that more people can benefit from it.
[![](https://sourcerer.io/fame/erogol/erogol/TTS/images/0)](https://sourcerer.io/fame/erogol/erogol/TTS/links/0)[![](https://sourcerer.io/fame/erogol/erogol/TTS/images/1)](https://sourcerer.io/fame/erogol/erogol/TTS/links/1)[![](https://sourcerer.io/fame/erogol/erogol/TTS/images/2)](https://sourcerer.io/fame/erogol/erogol/TTS/links/2)[![](https://sourcerer.io/fame/erogol/erogol/TTS/images/3)](https://sourcerer.io/fame/erogol/erogol/TTS/links/3)[![](https://sourcerer.io/fame/erogol/erogol/TTS/images/4)](https://sourcerer.io/fame/erogol/erogol/TTS/links/4)[![](https://sourcerer.io/fame/erogol/erogol/TTS/images/5)](https://sourcerer.io/fame/erogol/erogol/TTS/links/5)[![](https://sourcerer.io/fame/erogol/erogol/TTS/images/6)](https://sourcerer.io/fame/erogol/erogol/TTS/links/6)[![](https://sourcerer.io/fame/erogol/erogol/TTS/images/7)](https://sourcerer.io/fame/erogol/erogol/TTS/links/7)
| Type | Platforms |
| ------------------------------- | --------------------------------------- |
| 🚨 **Bug Reports** | [GitHub Issue Tracker] |
| ❔ **FAQ** | [TTS/Wiki](https://github.com/mozilla/TTS/wiki/FAQ) |
| 🎁 **Feature Requests & Ideas** | [GitHub Issue Tracker] |
| 👩‍💻 **Usage Questions** | [Discourse Forum] |
| 🗯 **General Discussion** | [Discourse Forum] and [Matrix Channel] |
## TTS Performance
[github issue tracker]: https://github.com/mozilla/tts/issues
[discourse forum]: https://discourse.mozilla.org/c/tts/
[matrix channel]: https://matrix.to/#/!KTePhNahjgiVumkqca:matrix.org?via=matrix.org
[Tutorials and Examples]: https://github.com/mozilla/TTS/wiki/TTS-Notebooks-and-Tutorials
## 🔗 Links and Resources
| Type | Links |
| ------------------------------- | --------------------------------------- |
| 👩🏾‍🏫 **Tutorials and Examples** | [TTS/Wiki](https://github.com/mozilla/TTS/wiki/TTS-Notebooks-and-Tutorials) |
| 🤖 **Released Models** | [TTS/Wiki](https://github.com/mozilla/TTS/wiki/Released-Models)|
| 💻 **Docker Image** | [Repository by @synesthesiam](https://github.com/synesthesiam/docker-mozillatts)|
## 🥇 TTS Performance
<p align="center"><img src="https://discourse-prod-uploads-81679984178418.s3.dualstack.us-west-2.amazonaws.com/optimized/3X/6/4/6428f980e9ec751c248e591460895f7881aec0c6_2_1035x591.png" width="800" /></p>
"Mozilla*" and "Judy*" are our models.
[Details...](https://github.com/mozilla/TTS/wiki/Mean-Opinion-Score-Results)
## Provided Models and Methods
Text-to-Spectrogram:
- Tacotron: [paper](https://arxiv.org/abs/1703.10135)
@ -52,6 +83,8 @@ Vocoders:
You can also help us implement more models. Some TTS related work can be found [here](https://github.com/erogol/TTS-papers).
## Features
- High performance Deep Learning models for Text2Speech tasks.
- Text2Spec models (Tacotron, Tacotron2).
@ -68,26 +101,39 @@ You can also help us implement more models. Some TTS related work can be found [
- Notebooks for extensive model benchmarking.
- Modular (but not too much) code base enabling easy testing for new ideas.
## Main Requirements and Installation
We highly recommend using [miniconda](https://conda.io/miniconda.html) for easier installation.
* python>=3.6
* pytorch>=1.5.0
* tensorflow>=2.3
* librosa
* tensorboard
* tensorboardX
* matplotlib
* unidecode
## Implemented Models
### Text-to-Spectrogram
- Tacotron: [paper](https://arxiv.org/abs/1703.10135)
- Tacotron2: [paper](https://arxiv.org/abs/1712.05884)
- Glow-TTS: [paper](https://arxiv.org/abs/2005.11129)
- Speedy-Speech: [paper](https://arxiv.org/abs/2008.03802)
Install TTS using ```setup.py```. It installs all of the requirements automatically and makes TTS available to your Python environment as an ordinary Python module.
### Attention Methods
- Guided Attention: [paper](https://arxiv.org/abs/1710.08969)
- Forward Backward Decoding: [paper](https://arxiv.org/abs/1907.09006)
- Graves Attention: [paper](https://arxiv.org/abs/1907.09006)
- Double Decoder Consistency: [blog](https://erogol.com/solving-attention-problems-of-tts-models-with-double-decoder-consistency/)
```python setup.py develop```
### Speaker Encoder
- GE2E: [paper](https://arxiv.org/abs/1710.10467)
- Angular Loss: [paper](https://arxiv.org/pdf/2003.11982.pdf)
Or you can use ```requirements.txt``` to install the requirements only.
### Vocoders
- MelGAN: [paper](https://arxiv.org/abs/1910.06711)
- MultiBandMelGAN: [paper](https://arxiv.org/abs/2005.05106)
- ParallelWaveGAN: [paper](https://arxiv.org/abs/1910.11480)
- GAN-TTS discriminators: [paper](https://arxiv.org/abs/1909.11646)
- WaveRNN: [origin](https://github.com/fatchord/WaveRNN/)
- WaveGrad: [paper](https://arxiv.org/abs/2009.00713)
```pip install -r requirements.txt```
You can also help us implement more models. Some TTS related work can be found [here](https://github.com/erogol/TTS-papers).
### Directory Structure
## Install TTS
TTS supports **python >= 3.6**.
Run ```python setup.py install```, or ```python setup.py develop``` if you want to keep the installation tied to your working directory.
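After installation, a quick sanity check that TTS is importable as an ordinary Python module might look like the sketch below (the config path is only an example; point ```load_config``` at any training config in your checkout):

```python
# Minimal post-install check (sketch): load a training config and build the audio processor.
from TTS.utils.io import load_config
from TTS.utils.audio import AudioProcessor

c = load_config("TTS/tts/configs/config.json")  # example path, adjust to your checkout
ap = AudioProcessor(**c.audio)                  # audio settings come straight from the config
print("Sample rate:", ap.sample_rate)
```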
## Directory Structure
```
|- notebooks/ (Jupyter Notebooks for model evaluation, parameter selection and data analysis.)
|- utils/ (common utilities.)
@ -108,12 +154,6 @@ Or you can use ```requirements.txt``` to install the requirements only.
|- (same)
```
### Docker
A Docker image created by [@synesthesiam](https://github.com/synesthesiam) is shared in a separate [repository](https://github.com/synesthesiam/docker-mozillatts) with the latest LJSpeech models.
## Released Models
Please visit [our wiki.](https://github.com/mozilla/TTS/wiki/Released-Models)
## Sample Model Output
Below you can see the output of a Tacotron model after 16K iterations with batch size 32, trained on the LJSpeech dataset.
@ -123,8 +163,11 @@ Audio examples: [soundcloud](https://soundcloud.com/user-565970875/pocket-articl
<img src="images/example_model_output.png?raw=true" alt="example_output" width="400"/>
## [TTS Tutorials and Notebooks](https://github.com/mozilla/TTS/wiki/TTS-Notebooks-and-Tutorials)
## Datasets and Data-Loading
TTS provides a generic data loader that is easy to use with your custom dataset.
You just need to write a simple function to format the dataset. Check ```datasets/preprocess.py``` to see some examples.
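For instance, a custom formatter could look like the sketch below (an illustration, not an existing preprocessor; it assumes a pipe-separated ```metadata.csv``` with ```<wav name>|<transcript>``` lines and returns ```[text, wav_path, speaker_name]``` items like the LJSpeech loader):

```python
import os

def my_dataset(root_path, meta_file):
    """Format a custom dataset for the generic TTS data loader (sketch)."""
    items = []
    with open(os.path.join(root_path, meta_file), "r", encoding="utf-8") as f:
        for line in f:
            wav_name, text = line.strip().split("|")[:2]
            wav_path = os.path.join(root_path, "wavs", wav_name + ".wav")
            items.append([text, wav_path, "my_speaker"])  # same item layout as the bundled loaders
    return items
```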
@ -139,7 +182,7 @@ Some of the public datasets that we successfully applied TTS:
- [LibriTTS](https://openslr.org/60/)
- [Spanish](https://drive.google.com/file/d/1Sm_zyBo67XHkiFhcRSQ4YaHPYM0slO_e/view?usp=sharing) - thx! @carlfm01
## Training and Fine-tuning LJ-Speech
## Example: Training and Fine-tuning LJ-Speech Dataset
Here you can find a [Colab](https://gist.github.com/erogol/97516ad65b44dbddb8cd694953187c5b) notebook for a hands-on example of training LJSpeech, or you can follow the guideline below manually.
To start with, split ```metadata.csv``` into train and validation subsets, named ```metadata_train.csv``` and ```metadata_val.csv```. Note that for text-to-speech, validation performance can be misleading: the loss value does not directly measure voice quality to the human ear, nor does it measure the attention module's performance. Therefore, running the model with new sentences and listening to the results is the best way to go.
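If you prefer a small script over splitting the file by hand, a shuffled split could be done as sketched below (not part of the repository; pick a validation size that suits your dataset):

```python
# Sketch: shuffle metadata.csv and hold out a fixed number of lines for validation.
import random

N_VAL = 100  # number of validation samples, adjust to your dataset size

with open("metadata.csv", "r", encoding="utf-8") as f:
    lines = f.readlines()

random.seed(0)  # keep the split reproducible
random.shuffle(lines)

with open("metadata_val.csv", "w", encoding="utf-8") as f:
    f.writelines(lines[:N_VAL])
with open("metadata_train.csv", "w", encoding="utf-8") as f:
    f.writelines(lines[N_VAL:])
```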
@ -189,11 +232,8 @@ If you like to use TTS to try a new idea and like to share your experiments with
(If you have an idea for better collaboration, let us know)
- Create a new branch.
- Open an issue pointing your branch.
- Explain your experiment.
- Share your results as you proceed. (Tensorboard log files, audio results, visuals etc.)
- Use the LJSpeech dataset (for English) if you'd like to compare results with the released models. (It is the most open, scalable dataset for quick experimentation.)
## [Contact/Getting Help](https://github.com/mozilla/TTS/wiki/Contact-and-Getting-Help)
- Explain your idea and experiment.
- Share your results regularly. (Tensorboard log files, audio results, visuals etc.)
## Major TODOs
- [x] Implement the model.
@ -205,17 +245,6 @@ If you like to use TTS to try a new idea and like to share your experiments with
- [x] Multi-speaker embedding.
- [x] Model optimization (model export, model pruning etc.)
<!--## References
- [Efficient Neural Audio Synthesis](https://arxiv.org/pdf/1802.08435.pdf)
- [Attention-Based models for speech recognition](https://arxiv.org/pdf/1506.07503.pdf)
- [Generating Sequences With Recurrent Neural Networks](https://arxiv.org/pdf/1308.0850.pdf)
- [Char2Wav: End-to-End Speech Synthesis](https://openreview.net/pdf?id=B1VWyySKx)
- [VoiceLoop: Voice Fitting and Synthesis via a Phonological Loop](https://arxiv.org/pdf/1707.06588.pdf)
- [WaveRNN](https://arxiv.org/pdf/1802.08435.pdf)
- [Faster WaveNet](https://arxiv.org/abs/1611.09482)
- [Parallel WaveNet](https://arxiv.org/abs/1711.10433)
-->
### Acknowledgement
- https://github.com/keithito/tacotron (Dataset pre-processing)
- https://github.com/r9y9/tacotron_pytorch (Initial Tacotron architecture)

TTS/bin/compute_attention_masks.py

@ -0,0 +1,166 @@
import argparse
import importlib
import os
import numpy as np
import torch
from torch.utils.data import DataLoader
from tqdm import tqdm
from argparse import RawTextHelpFormatter
from TTS.tts.datasets.TTSDataset import MyDataset
from TTS.tts.utils.generic_utils import setup_model
from TTS.tts.utils.io import load_checkpoint
from TTS.tts.utils.text.symbols import make_symbols, phonemes, symbols
from TTS.utils.audio import AudioProcessor
from TTS.utils.io import load_config
if __name__ == '__main__':
parser = argparse.ArgumentParser(
description='''Extract attention masks from trained Tacotron/Tacotron2 models.
These masks can be used for different purposes including training a TTS model with a Duration Predictor.\n\n'''
'''Each attention mask is written to the same path as the input wav file with ".npy" file extension.
(e.g. path/bla.wav (wav file) --> path/bla.npy (attention mask))\n'''
'''
Example run:
CUDA_VISIBLE_DEVICES="0" python TTS/bin/compute_attention_masks.py
--model_path /data/rw/home/Models/ljspeech-dcattn-December-14-2020_11+10AM-9d0e8c7/checkpoint_200000.pth.tar
--config_path /data/rw/home/Models/ljspeech-dcattn-December-14-2020_11+10AM-9d0e8c7/config.json
--dataset_metafile /root/LJSpeech-1.1/metadata.csv
--data_path /root/LJSpeech-1.1/
--batch_size 32
--dataset ljspeech
--use_cuda True
''',
formatter_class=RawTextHelpFormatter
)
parser.add_argument('--model_path',
type=str,
required=True,
help='Path to Tacotron/Tacotron2 model file ')
parser.add_argument(
'--config_path',
type=str,
required=True,
help='Path to Tacotron/Tacotron2 config file.',
)
parser.add_argument('--dataset',
type=str,
default='',
required=True,
help='Target dataset processor name from TTS.tts.datasets.preprocess.')
parser.add_argument(
'--dataset_metafile',
type=str,
default='',
required=True,
help='Dataset metafile including file paths with transcripts.')
parser.add_argument(
'--data_path',
type=str,
default='',
help='Defines the data path. It overwrites config.json.')
parser.add_argument('--use_cuda',
type=bool,
default=False,
help="enable/disable cuda.")
parser.add_argument(
'--batch_size',
default=16,
type=int,
help='Batch size for the model. Use batch_size=1 if you have no CUDA.')
args = parser.parse_args()
C = load_config(args.config_path)
ap = AudioProcessor(**C.audio)
# if the vocabulary was passed, replace the default
if 'characters' in C.keys():
symbols, phonemes = make_symbols(**C.characters)
# load the model
num_chars = len(phonemes) if C.use_phonemes else len(symbols)
# TODO: handle multi-speaker
model = setup_model(num_chars, num_speakers=0, c=C)
model, _ = load_checkpoint(model, args.model_path, None, args.use_cuda)
model.eval()
# data loader
preprocessor = importlib.import_module('TTS.tts.datasets.preprocess')
preprocessor = getattr(preprocessor, args.dataset)
meta_data = preprocessor(args.data_path, args.dataset_metafile)
dataset = MyDataset(model.decoder.r,
C.text_cleaner,
compute_linear_spec=False,
ap=ap,
meta_data=meta_data,
tp=C.characters if 'characters' in C.keys() else None,
add_blank=C['add_blank'] if 'add_blank' in C.keys() else False,
use_phonemes=C.use_phonemes,
phoneme_cache_path=C.phoneme_cache_path,
phoneme_language=C.phoneme_language,
enable_eos_bos=C.enable_eos_bos_chars)
dataset.sort_items()
loader = DataLoader(dataset,
batch_size=args.batch_size,
num_workers=4,
collate_fn=dataset.collate_fn,
shuffle=False,
drop_last=False)
# compute attentions
file_paths = []
with torch.no_grad():
for data in tqdm(loader):
# setup input data
text_input = data[0]
text_lengths = data[1]
linear_input = data[3]
mel_input = data[4]
mel_lengths = data[5]
stop_targets = data[6]
item_idxs = data[7]
# dispatch data to GPU
if args.use_cuda:
text_input = text_input.cuda()
text_lengths = text_lengths.cuda()
mel_input = mel_input.cuda()
mel_lengths = mel_lengths.cuda()
mel_outputs, postnet_outputs, alignments, stop_tokens = model.forward(
text_input, text_lengths, mel_input)
alignments = alignments.detach()
for idx, alignment in enumerate(alignments):
item_idx = item_idxs[idx]
# interpolate if r > 1
alignment = torch.nn.functional.interpolate(
alignment.transpose(0, 1).unsqueeze(0),
size=None,
scale_factor=model.decoder.r,
mode='nearest',
align_corners=None,
recompute_scale_factor=None).squeeze(0).transpose(0, 1)
# remove paddings
alignment = alignment[:mel_lengths[idx], :text_lengths[idx]].cpu().numpy()
# set file paths
wav_file_name = os.path.basename(item_idx)
align_file_name = os.path.splitext(wav_file_name)[0] + '.npy'
file_path = item_idx.replace(wav_file_name, align_file_name)
# save output
file_paths.append([item_idx, file_path])
np.save(file_path, alignment)
# output metafile
metafile = os.path.join(args.data_path, "metadata_attn_mask.txt")
with open(metafile, "w") as f:
for p in file_paths:
f.write(f"{p[0]}|{p[1]}\n")
print(f" >> Metafile created: {metafile}")


@ -1,65 +0,0 @@
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
import os
import sys
import pathlib
import time
import subprocess
import argparse
import torch
def main():
"""
Call train_tts.py as a new process per GPU and pass the command line arguments.
"""
parser = argparse.ArgumentParser()
parser.add_argument(
'--continue_path',
type=str,
help='Training output folder used to continue a previous training run. If it is used, "config_path" is ignored.',
default='',
required='--config_path' not in sys.argv)
parser.add_argument(
'--restore_path',
type=str,
help='Model file to be restored. Use it to fine-tune a model.',
default='')
parser.add_argument(
'--config_path',
type=str,
help='Path to config file for training.',
required='--continue_path' not in sys.argv
)
args = parser.parse_args()
num_gpus = torch.cuda.device_count()
group_id = time.strftime("%Y_%m_%d-%H%M%S")
# set arguments for train.py
folder_path = pathlib.Path(__file__).parent.absolute()
command = [os.path.join(folder_path, 'train_tts.py')]
command.append('--continue_path={}'.format(args.continue_path))
command.append('--restore_path={}'.format(args.restore_path))
command.append('--config_path={}'.format(args.config_path))
command.append('--group_id=group_{}'.format(group_id))
command.append('')
# run processes
processes = []
for i in range(num_gpus):
my_env = os.environ.copy()
my_env["PYTHON_EGG_CACHE"] = "/tmp/tmp{}".format(i)
command[-1] = '--rank={}'.format(i)
stdout = None if i == 0 else open(os.devnull, 'w')
p = subprocess.Popen(['python3'] + command, stdout=stdout, env=my_env)
processes.append(p)
print(command)
for p in processes:
p.wait()
if __name__ == '__main__':
main()


@ -9,6 +9,7 @@ import string
import time
import torch
import numpy as np
from TTS.tts.utils.generic_utils import setup_model, is_tacotron
from TTS.tts.utils.synthesis import synthesis
@ -21,10 +22,31 @@ from TTS.vocoder.utils.generic_utils import setup_generator
def tts(model, vocoder_model, text, CONFIG, use_cuda, ap, use_gl, speaker_fileid, speaker_embedding=None, gst_style=None):
t_1 = time.time()
waveform, _, _, mel_postnet_spec, _, _ = synthesis(model, text, CONFIG, use_cuda, ap, speaker_fileid, gst_style, False, CONFIG.enable_eos_bos_chars, use_gl, speaker_embedding=speaker_embedding)
# grab the spectrogram (thanks to the folks on the Mozilla Discourse forum for the code snippet)
if args.save_spectogram:
spec_file_name = args.text.replace(" ", "_")[0:10]
spec_file_name = spec_file_name.translate(
str.maketrans('', '', string.punctuation.replace('_', ''))) + '.npy'
spec_file_name = os.path.join(args.out_path, spec_file_name)
spectrogram = torch.FloatTensor(mel_postnet_spec.T)
spectrogram = spectrogram.unsqueeze(0)
np.save(spec_file_name, spectrogram)
print(" > Saving raw spectogram to " + spec_file_name)
if CONFIG.model == "Tacotron" and not use_gl:
mel_postnet_spec = ap.out_linear_to_mel(mel_postnet_spec.T).T
if not use_gl:
waveform = vocoder_model.inference(torch.FloatTensor(mel_postnet_spec.T).unsqueeze(0))
# Use the default linear noise schedule below if you have not computed one with tune_wavegrad
beta = np.linspace(1e-6, 0.01, 50)
vocoder_model.compute_noise_level(beta)
# Alternatively, load the noise schedule from the npy file produced by tune_wavegrad:
# beta = np.load("output-tune-wavegrad.npy", allow_pickle=True).item()
# vocoder_model.compute_noise_level(beta['beta'])
device_type = "cuda" if use_cuda else "cpu"
waveform = vocoder_model.inference(torch.FloatTensor(mel_postnet_spec.T).to(device_type).unsqueeze(0))
if use_cuda and not use_gl:
waveform = waveform.cpu()
if not use_gl:
@ -88,6 +110,11 @@ if __name__ == "__main__":
'--gst_style',
help="Wav path file for GST stylereference.",
default=None)
parser.add_argument(
'--save_spectogram',
type=bool,
help="If true save raw spectogram for further (vocoder) processing in out_path.",
default=False)
args = parser.parse_args()
@ -170,7 +197,7 @@ if __name__ == "__main__":
wav = tts(model, vocoder_model, args.text, C, args.use_cuda, ap, use_griffin_lim, args.speaker_fileid, speaker_embedding=speaker_embedding, gst_style=gst_style)
# save the results
file_name = args.text.replace(" ", "_")
file_name = args.text.replace(" ", "_")[0:10]
file_name = file_name.translate(
str.maketrans('', '', string.punctuation.replace('_', ''))) + '.wav'
out_path = os.path.join(args.out_path, file_name)


@ -13,15 +13,14 @@ from TTS.speaker_encoder.dataset import MyDataset
from TTS.speaker_encoder.losses import AngleProtoLoss, GE2ELoss
from TTS.speaker_encoder.model import SpeakerEncoder
from TTS.speaker_encoder.utils.generic_utils import \
check_config_speaker_encoder
check_config_speaker_encoder, save_best_model
from TTS.speaker_encoder.utils.visual import plot_embeddings
from TTS.tts.datasets.preprocess import load_meta_data
from TTS.tts.utils.io import save_best_model
from TTS.utils.audio import AudioProcessor
from TTS.utils.generic_utils import (count_parameters,
create_experiment_folder, get_git_branch,
remove_experiment_folder, set_init_dict)
from TTS.utils.io import copy_config_file, load_config
from TTS.utils.io import copy_model_files, load_config
from TTS.utils.radam import RAdam
from TTS.utils.tensorboard_logger import TensorboardLogger
from TTS.utils.training import NoamLR, check_update
@ -255,7 +254,7 @@ if __name__ == '__main__':
if args.restore_path:
new_fields["restore_path"] = args.restore_path
new_fields["github_branch"] = get_git_branch()
copy_config_file(args.config_path, os.path.join(OUT_PATH, 'config.json'),
copy_model_files(c, args.config_path, OUT_PATH,
new_fields)
LOG_DIR = OUT_PATH


@ -7,41 +7,37 @@ import os
import sys
import time
import traceback
from random import randrange
import torch
from random import randrange
# DISTRIBUTED
from torch.nn.parallel import DistributedDataParallel as DDP_th
from torch.utils.data import DataLoader
from torch.utils.data.distributed import DistributedSampler
from TTS.tts.datasets.preprocess import load_meta_data
from TTS.tts.datasets.TTSDataset import MyDataset
from TTS.tts.layers.losses import GlowTTSLoss
from TTS.tts.utils.generic_utils import setup_model, check_config_tts
from TTS.tts.utils.generic_utils import check_config_tts, setup_model
from TTS.tts.utils.io import save_best_model, save_checkpoint
from TTS.tts.utils.measures import alignment_diagonal_score
from TTS.tts.utils.speakers import parse_speakers, load_speaker_mapping
from TTS.tts.utils.speakers import parse_speakers
from TTS.tts.utils.synthesis import synthesis
from TTS.tts.utils.text.symbols import make_symbols, phonemes, symbols
from TTS.tts.utils.visual import plot_alignment, plot_spectrogram
from TTS.utils.audio import AudioProcessor
from TTS.utils.console_logger import ConsoleLogger
from TTS.utils.distribute import init_distributed, reduce_tensor
from TTS.utils.generic_utils import (KeepAverage, count_parameters,
create_experiment_folder, get_git_branch,
remove_experiment_folder, set_init_dict)
from TTS.utils.io import copy_config_file, load_config
from TTS.utils.io import copy_model_files, load_config
from TTS.utils.radam import RAdam
from TTS.utils.tensorboard_logger import TensorboardLogger
from TTS.utils.training import (NoamLR, check_update,
setup_torch_training_env)
# DISTRIBUTED
from torch.nn.parallel import DistributedDataParallel as DDP_th
from torch.utils.data.distributed import DistributedSampler
from TTS.utils.distribute import init_distributed, reduce_tensor
from TTS.utils.training import NoamLR, setup_torch_training_env
use_cuda, num_gpus = setup_torch_training_env(True, False)
def setup_loader(ap, r, is_val=False, verbose=False, speaker_mapping=None):
def setup_loader(ap, r, is_val=False, verbose=False):
if is_val and not c.run_eval:
loader = None
else:
@ -61,8 +57,15 @@ def setup_loader(ap, r, is_val=False, verbose=False, speaker_mapping=None):
use_phonemes=c.use_phonemes,
phoneme_language=c.phoneme_language,
enable_eos_bos=c.enable_eos_bos_chars,
use_noise_augment=c['use_noise_augment'] and not is_val,
verbose=verbose,
speaker_mapping=speaker_mapping if c.use_speaker_embedding and c.use_external_speaker_embedding_file else None)
if c.use_phonemes and c.compute_input_seq_cache:
# precompute phonemes to have a better estimate of sequence lengths.
dataset.compute_input_seq(c.num_loader_workers)
dataset.sort_items()
sampler = DistributedSampler(dataset) if num_gpus > 1 else None
loader = DataLoader(
dataset,
@ -78,29 +81,29 @@ def setup_loader(ap, r, is_val=False, verbose=False, speaker_mapping=None):
def format_data(data):
if c.use_speaker_embedding:
speaker_mapping = load_speaker_mapping(OUT_PATH)
# setup input data
text_input = data[0]
text_lengths = data[1]
speaker_names = data[2]
mel_input = data[4].permute(0, 2, 1) # B x D x T
mel_lengths = data[5]
attn_mask = data[8]
item_idx = data[7]
attn_mask = data[9]
avg_text_length = torch.mean(text_lengths.float())
avg_spec_length = torch.mean(mel_lengths.float())
if c.use_speaker_embedding:
if c.use_external_speaker_embedding_file:
speaker_ids = data[8]
# return precomputed embedding vector
speaker_c = data[8]
else:
speaker_ids = [
# return speaker_id to be used by an embedding layer
speaker_c = [
speaker_mapping[speaker_name] for speaker_name in speaker_names
]
speaker_ids = torch.LongTensor(speaker_ids)
speaker_c = torch.LongTensor(speaker_c)
else:
speaker_ids = None
speaker_c = None
# dispatch data to GPU
if use_cuda:
@ -108,15 +111,15 @@ def format_data(data):
text_lengths = text_lengths.cuda(non_blocking=True)
mel_input = mel_input.cuda(non_blocking=True)
mel_lengths = mel_lengths.cuda(non_blocking=True)
if speaker_ids is not None:
speaker_ids = speaker_ids.cuda(non_blocking=True)
if speaker_c is not None:
speaker_c = speaker_c.cuda(non_blocking=True)
if attn_mask is not None:
attn_mask = attn_mask.cuda(non_blocking=True)
return text_input, text_lengths, mel_input, mel_lengths, speaker_ids,\
avg_text_length, avg_spec_length, attn_mask
return text_input, text_lengths, mel_input, mel_lengths, speaker_c,\
avg_text_length, avg_spec_length, attn_mask, item_idx
def data_depended_init(model, ap, speaker_mapping=None):
def data_depended_init(data_loader, model, ap):
"""Data depended initialization for activation normalization."""
if hasattr(model, 'module'):
for f in model.module.decoder.flows:
@ -127,20 +130,22 @@ def data_depended_init(model, ap, speaker_mapping=None):
if getattr(f, "set_ddi", False):
f.set_ddi(True)
data_loader = setup_loader(ap, 1, is_val=False, speaker_mapping=speaker_mapping)
model.train()
print(" > Data depended initialization ... ")
num_iter = 0
with torch.no_grad():
for _, data in enumerate(data_loader):
# format data
text_input, text_lengths, mel_input, mel_lengths, speaker_ids,\
_, _, attn_mask = format_data(data)
text_input, text_lengths, mel_input, mel_lengths, speaker_embed,\
_, _, attn_mask, item_idx = format_data(data)
# forward pass model
_ = model.forward(
text_input, text_lengths, mel_input, mel_lengths, attn_mask, g=speaker_ids)
break
text_input, text_lengths, mel_input, mel_lengths, attn_mask, g=speaker_embed)
if num_iter == c.data_dep_init_iter:
break
num_iter += 1
if hasattr(model, 'module'):
for f in model.module.decoder.flows:
@ -153,10 +158,9 @@ def data_depended_init(model, ap, speaker_mapping=None):
return model
def train(model, criterion, optimizer, scheduler,
ap, global_step, epoch, speaker_mapping=None):
data_loader = setup_loader(ap, 1, is_val=False,
verbose=(epoch == 0), speaker_mapping=speaker_mapping)
def train(data_loader, model, criterion, optimizer, scheduler,
ap, global_step, epoch):
model.train()
epoch_time = 0
keep_avg = KeepAverage()
@ -172,8 +176,8 @@ def train(model, criterion, optimizer, scheduler,
start_time = time.time()
# format data
text_input, text_lengths, mel_input, mel_lengths, speaker_ids,\
avg_text_length, avg_spec_length, attn_mask = format_data(data)
text_input, text_lengths, mel_input, mel_lengths, speaker_c,\
avg_text_length, avg_spec_length, attn_mask, item_idx = format_data(data)
loader_time = time.time() - end_time
@ -183,7 +187,7 @@ def train(model, criterion, optimizer, scheduler,
# forward pass model
with torch.cuda.amp.autocast(enabled=c.mixed_precision):
z, logdet, y_mean, y_log_scale, alignments, o_dur_log, o_total_dur = model.forward(
text_input, text_lengths, mel_input, mel_lengths, attn_mask, g=speaker_ids)
text_input, text_lengths, mel_input, mel_lengths, attn_mask, g=speaker_c)
# compute loss
loss_dict = criterion(z, y_mean, y_log_scale, logdet, mel_lengths,
@ -203,10 +207,6 @@ def train(model, criterion, optimizer, scheduler,
c.grad_clip)
optimizer.step()
grad_norm, _ = check_update(model, c.grad_clip, ignore_stopnet=True)
optimizer.step()
# setup lr
if c.noam_schedule:
scheduler.step()
@ -215,7 +215,7 @@ def train(model, criterion, optimizer, scheduler,
current_lr = optimizer.param_groups[0]['lr']
# compute alignment error (the lower, the better)
align_error = 1 - alignment_diagonal_score(alignments)
align_error = 1 - alignment_diagonal_score(alignments, binary=True)
loss_dict['align_error'] = align_error
step_time = time.time() - start_time
@ -274,10 +274,18 @@ def train(model, criterion, optimizer, scheduler,
save_checkpoint(model, optimizer, global_step, epoch, 1, OUT_PATH,
model_loss=loss_dict['loss'])
# wait for all CUDA kernels to complete
torch.cuda.synchronize()
# Diagnostic visualizations
# direct pass on model for spec predictions
target_speaker = None if speaker_ids is None else speaker_ids[:1]
spec_pred, *_ = model.inference(text_input[:1], text_lengths[:1], g=target_speaker)
target_speaker = None if speaker_c is None else speaker_c[:1]
if hasattr(model, 'module'):
spec_pred, *_ = model.module.inference(text_input[:1], text_lengths[:1], g=target_speaker)
else:
spec_pred, *_ = model.inference(text_input[:1], text_lengths[:1], g=target_speaker)
spec_pred = spec_pred.permute(0, 2, 1)
gt_spec = mel_input.permute(0, 2, 1)
const_spec = spec_pred[0].data.cpu().numpy()
@ -313,8 +321,7 @@ def train(model, criterion, optimizer, scheduler,
@torch.no_grad()
def evaluate(model, criterion, ap, global_step, epoch, speaker_mapping):
data_loader = setup_loader(ap, 1, is_val=True, speaker_mapping=speaker_mapping)
def evaluate(data_loader, model, criterion, ap, global_step, epoch):
model.eval()
epoch_time = 0
keep_avg = KeepAverage()
@ -324,12 +331,12 @@ def evaluate(model, criterion, ap, global_step, epoch, speaker_mapping):
start_time = time.time()
# format data
text_input, text_lengths, mel_input, mel_lengths, speaker_ids,\
_, _, attn_mask = format_data(data)
text_input, text_lengths, mel_input, mel_lengths, speaker_c,\
_, _, attn_mask, item_idx = format_data(data)
# forward pass model
z, logdet, y_mean, y_log_scale, alignments, o_dur_log, o_total_dur = model.forward(
text_input, text_lengths, mel_input, mel_lengths, attn_mask, g=speaker_ids)
text_input, text_lengths, mel_input, mel_lengths, attn_mask, g=speaker_c)
# compute loss
loss_dict = criterion(z, y_mean, y_log_scale, logdet, mel_lengths,
@ -370,7 +377,7 @@ def evaluate(model, criterion, ap, global_step, epoch, speaker_mapping):
if args.rank == 0:
# Diagnostic visualizations
# direct pass on model for spec predictions
target_speaker = None if speaker_ids is None else speaker_ids[:1]
target_speaker = None if speaker_c is None else speaker_c[:1]
if hasattr(model, 'module'):
spec_pred, *_ = model.module.inference(text_input[:1], text_lengths[:1], g=target_speaker)
else:
@ -464,7 +471,7 @@ def evaluate(model, criterion, ap, global_step, epoch, speaker_mapping):
# FIXME: move args definition/parsing inside of main?
def main(args): # pylint: disable=redefined-outer-name
# pylint: disable=global-variable-undefined
global meta_data_train, meta_data_eval, symbols, phonemes
global meta_data_train, meta_data_eval, symbols, phonemes, speaker_mapping
# Audio processor
ap = AudioProcessor(**c.audio)
if 'characters' in c.keys():
@ -538,14 +545,18 @@ def main(args): # pylint: disable=redefined-outer-name
if 'best_loss' not in locals():
best_loss = float('inf')
# define dataloaders
train_loader = setup_loader(ap, 1, is_val=False, verbose=True)
eval_loader = setup_loader(ap, 1, is_val=True, verbose=True)
global_step = args.restore_step
model = data_depended_init(model, ap, speaker_mapping)
model = data_depended_init(train_loader, model, ap)
for epoch in range(0, c.epochs):
c_logger.print_epoch_start(epoch, c.epochs)
train_avg_loss_dict, global_step = train(model, criterion, optimizer,
train_avg_loss_dict, global_step = train(train_loader, model, criterion, optimizer,
scheduler, ap, global_step,
epoch, speaker_mapping)
eval_avg_loss_dict = evaluate(model, criterion, ap, global_step, epoch, speaker_mapping=speaker_mapping)
epoch)
eval_avg_loss_dict = evaluate(eval_loader, model, criterion, ap, global_step, epoch)
c_logger.print_epoch_end(epoch, eval_avg_loss_dict)
target_loss = train_avg_loss_dict['avg_loss']
if c.run_eval:
@ -621,8 +632,8 @@ if __name__ == '__main__':
if args.restore_path:
new_fields["restore_path"] = args.restore_path
new_fields["github_branch"] = get_git_branch()
copy_config_file(args.config_path,
os.path.join(OUT_PATH, 'config.json'), new_fields)
copy_model_files(c, args.config_path,
OUT_PATH, new_fields)
os.chmod(AUDIO_PATH, 0o775)
os.chmod(OUT_PATH, 0o775)


@ -0,0 +1,618 @@
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
import argparse
import glob
import os
import sys
import time
import traceback
import numpy as np
from random import randrange
import torch
# DISTRIBUTED
from torch.nn.parallel import DistributedDataParallel as DDP_th
from torch.utils.data import DataLoader
from torch.utils.data.distributed import DistributedSampler
from TTS.tts.datasets.preprocess import load_meta_data
from TTS.tts.datasets.TTSDataset import MyDataset
from TTS.tts.layers.losses import SpeedySpeechLoss
from TTS.tts.utils.generic_utils import check_config_tts, setup_model
from TTS.tts.utils.io import save_best_model, save_checkpoint
from TTS.tts.utils.measures import alignment_diagonal_score
from TTS.tts.utils.speakers import parse_speakers
from TTS.tts.utils.synthesis import synthesis
from TTS.tts.utils.text.symbols import make_symbols, phonemes, symbols
from TTS.tts.utils.visual import plot_alignment, plot_spectrogram
from TTS.utils.audio import AudioProcessor
from TTS.utils.console_logger import ConsoleLogger
from TTS.utils.distribute import init_distributed, reduce_tensor
from TTS.utils.generic_utils import (KeepAverage, count_parameters,
create_experiment_folder, get_git_branch,
remove_experiment_folder, set_init_dict)
from TTS.utils.io import copy_model_files, load_config
from TTS.utils.radam import RAdam
from TTS.utils.tensorboard_logger import TensorboardLogger
from TTS.utils.training import NoamLR, setup_torch_training_env
use_cuda, num_gpus = setup_torch_training_env(True, False)
def setup_loader(ap, r, is_val=False, verbose=False):
if is_val and not c.run_eval:
loader = None
else:
dataset = MyDataset(
r,
c.text_cleaner,
compute_linear_spec=False,
meta_data=meta_data_eval if is_val else meta_data_train,
ap=ap,
tp=c.characters if 'characters' in c.keys() else None,
add_blank=c['add_blank'] if 'add_blank' in c.keys() else False,
batch_group_size=0 if is_val else c.batch_group_size *
c.batch_size,
min_seq_len=c.min_seq_len,
max_seq_len=c.max_seq_len,
phoneme_cache_path=c.phoneme_cache_path,
use_phonemes=c.use_phonemes,
phoneme_language=c.phoneme_language,
enable_eos_bos=c.enable_eos_bos_chars,
use_noise_augment=not is_val,
verbose=verbose,
speaker_mapping=speaker_mapping if c.use_speaker_embedding and c.use_external_speaker_embedding_file else None)
if c.use_phonemes and c.compute_input_seq_cache:
# precompute phonemes to have a better estimate of sequence lengths.
dataset.compute_input_seq(c.num_loader_workers)
dataset.sort_items()
sampler = DistributedSampler(dataset) if num_gpus > 1 else None
loader = DataLoader(
dataset,
batch_size=c.eval_batch_size if is_val else c.batch_size,
shuffle=False,
collate_fn=dataset.collate_fn,
drop_last=False,
sampler=sampler,
num_workers=c.num_val_loader_workers
if is_val else c.num_loader_workers,
pin_memory=False)
return loader
def format_data(data):
# setup input data
text_input = data[0]
text_lengths = data[1]
speaker_names = data[2]
mel_input = data[4].permute(0, 2, 1) # B x D x T
mel_lengths = data[5]
item_idx = data[7]
attn_mask = data[9]
avg_text_length = torch.mean(text_lengths.float())
avg_spec_length = torch.mean(mel_lengths.float())
if c.use_speaker_embedding:
if c.use_external_speaker_embedding_file:
# return precomputed embedding vector
speaker_c = data[8]
else:
# return speaker_id to be used by an embedding layer
speaker_c = [
speaker_mapping[speaker_name] for speaker_name in speaker_names
]
speaker_c = torch.LongTensor(speaker_c)
else:
speaker_c = None
# compute durations from attention mask
durations = torch.zeros(attn_mask.shape[0], attn_mask.shape[2])
for idx, am in enumerate(attn_mask):
# compute raw durations
c_idxs = am[:, :text_lengths[idx], :mel_lengths[idx]].max(1)[1]
# c_idxs, counts = torch.unique_consecutive(c_idxs, return_counts=True)
c_idxs, counts = torch.unique(c_idxs, return_counts=True)
dur = torch.ones([text_lengths[idx]]).to(counts.dtype)
dur[c_idxs] = counts
# smooth the durations and set any 0 duration to 1
# by cutting off from the largest duration indices.
extra_frames = dur.sum() - mel_lengths[idx]
largest_idxs = torch.argsort(-dur)[:extra_frames]
dur[largest_idxs] -= 1
assert dur.sum() == mel_lengths[idx], f" [!] total duration {dur.sum()} vs spectrogram length {mel_lengths[idx]}"
durations[idx, :text_lengths[idx]] = dur
# dispatch data to GPU
if use_cuda:
text_input = text_input.cuda(non_blocking=True)
text_lengths = text_lengths.cuda(non_blocking=True)
mel_input = mel_input.cuda(non_blocking=True)
mel_lengths = mel_lengths.cuda(non_blocking=True)
if speaker_c is not None:
speaker_c = speaker_c.cuda(non_blocking=True)
attn_mask = attn_mask.cuda(non_blocking=True)
durations = durations.cuda(non_blocking=True)
return text_input, text_lengths, mel_input, mel_lengths, speaker_c,\
avg_text_length, avg_spec_length, attn_mask, durations, item_idx
def train(data_loader, model, criterion, optimizer, scheduler,
ap, global_step, epoch):
model.train()
epoch_time = 0
keep_avg = KeepAverage()
if use_cuda:
batch_n_iter = int(
len(data_loader.dataset) / (c.batch_size * num_gpus))
else:
batch_n_iter = int(len(data_loader.dataset) / c.batch_size)
end_time = time.time()
c_logger.print_train_start()
scaler = torch.cuda.amp.GradScaler() if c.mixed_precision else None
for num_iter, data in enumerate(data_loader):
start_time = time.time()
# format data
text_input, text_lengths, mel_targets, mel_lengths, speaker_c,\
avg_text_length, avg_spec_length, _, dur_target, _ = format_data(data)
loader_time = time.time() - end_time
global_step += 1
optimizer.zero_grad()
# forward pass model
with torch.cuda.amp.autocast(enabled=c.mixed_precision):
decoder_output, dur_output, alignments = model.forward(
text_input, text_lengths, mel_lengths, dur_target, g=speaker_c)
# compute loss
loss_dict = criterion(decoder_output, mel_targets, mel_lengths, dur_output, torch.log(1 + dur_target), text_lengths)
# backward pass with loss scaling
if c.mixed_precision:
scaler.scale(loss_dict['loss']).backward()
scaler.unscale_(optimizer)
grad_norm = torch.nn.utils.clip_grad_norm_(model.parameters(),
c.grad_clip)
scaler.step(optimizer)
scaler.update()
else:
loss_dict['loss'].backward()
grad_norm = torch.nn.utils.clip_grad_norm_(model.parameters(),
c.grad_clip)
optimizer.step()
# setup lr
if c.noam_schedule:
scheduler.step()
# current_lr
current_lr = optimizer.param_groups[0]['lr']
# compute alignment error (the lower, the better)
align_error = 1 - alignment_diagonal_score(alignments, binary=True)
loss_dict['align_error'] = align_error
step_time = time.time() - start_time
epoch_time += step_time
# aggregate losses from processes
if num_gpus > 1:
loss_dict['loss_l1'] = reduce_tensor(loss_dict['loss_l1'].data, num_gpus)
loss_dict['loss_ssim'] = reduce_tensor(loss_dict['loss_ssim'].data, num_gpus)
loss_dict['loss_dur'] = reduce_tensor(loss_dict['loss_dur'].data, num_gpus)
loss_dict['loss'] = reduce_tensor(loss_dict['loss'].data, num_gpus)
# detach loss values
loss_dict_new = dict()
for key, value in loss_dict.items():
if isinstance(value, (int, float)):
loss_dict_new[key] = value
else:
loss_dict_new[key] = value.item()
loss_dict = loss_dict_new
# update avg stats
update_train_values = dict()
for key, value in loss_dict.items():
update_train_values['avg_' + key] = value
update_train_values['avg_loader_time'] = loader_time
update_train_values['avg_step_time'] = step_time
keep_avg.update_values(update_train_values)
# print training progress
if global_step % c.print_step == 0:
log_dict = {
"avg_spec_length": [avg_spec_length, 1], # value, precision
"avg_text_length": [avg_text_length, 1],
"step_time": [step_time, 4],
"loader_time": [loader_time, 2],
"current_lr": current_lr,
}
c_logger.print_train_step(batch_n_iter, num_iter, global_step,
log_dict, loss_dict, keep_avg.avg_values)
if args.rank == 0:
# Plot Training Iter Stats
# reduce TB load
if global_step % c.tb_plot_step == 0:
iter_stats = {
"lr": current_lr,
"grad_norm": grad_norm,
"step_time": step_time
}
iter_stats.update(loss_dict)
tb_logger.tb_train_iter_stats(global_step, iter_stats)
if global_step % c.save_step == 0:
if c.checkpoint:
# save model
save_checkpoint(model, optimizer, global_step, epoch, 1, OUT_PATH,
model_loss=loss_dict['loss'])
# wait for all CUDA kernels to complete
torch.cuda.synchronize()
# Diagnostic visualizations
idx = np.random.randint(mel_targets.shape[0])
pred_spec = decoder_output[idx].detach().data.cpu().numpy().T
gt_spec = mel_targets[idx].data.cpu().numpy().T
align_img = alignments[idx].data.cpu()
figures = {
"prediction": plot_spectrogram(pred_spec, ap),
"ground_truth": plot_spectrogram(gt_spec, ap),
"alignment": plot_alignment(align_img),
}
tb_logger.tb_train_figures(global_step, figures)
# Sample audio
train_audio = ap.inv_melspectrogram(pred_spec.T)
tb_logger.tb_train_audios(global_step,
{'TrainAudio': train_audio},
c.audio["sample_rate"])
end_time = time.time()
# print epoch stats
c_logger.print_train_epoch_end(global_step, epoch, epoch_time, keep_avg)
# Plot Epoch Stats
if args.rank == 0:
epoch_stats = {"epoch_time": epoch_time}
epoch_stats.update(keep_avg.avg_values)
tb_logger.tb_train_epoch_stats(global_step, epoch_stats)
if c.tb_model_param_stats:
tb_logger.tb_model_weights(model, global_step)
return keep_avg.avg_values, global_step
@torch.no_grad()
def evaluate(data_loader, model, criterion, ap, global_step, epoch):
model.eval()
epoch_time = 0
keep_avg = KeepAverage()
c_logger.print_eval_start()
if data_loader is not None:
for num_iter, data in enumerate(data_loader):
start_time = time.time()
# format data
text_input, text_lengths, mel_targets, mel_lengths, speaker_c,\
_, _, _, dur_target, _ = format_data(data)
# forward pass model
with torch.cuda.amp.autocast(enabled=c.mixed_precision):
decoder_output, dur_output, alignments = model.forward(
text_input, text_lengths, mel_lengths, dur_target, g=speaker_c)
# compute loss
loss_dict = criterion(decoder_output, mel_targets, mel_lengths, dur_output, torch.log(1 + dur_target), text_lengths)
# step time
step_time = time.time() - start_time
epoch_time += step_time
# compute alignment score
align_error = 1 - alignment_diagonal_score(alignments, binary=True)
loss_dict['align_error'] = align_error
# aggregate losses from processes
if num_gpus > 1:
loss_dict['loss_l1'] = reduce_tensor(loss_dict['loss_l1'].data, num_gpus)
loss_dict['loss_ssim'] = reduce_tensor(loss_dict['loss_ssim'].data, num_gpus)
loss_dict['loss_dur'] = reduce_tensor(loss_dict['loss_dur'].data, num_gpus)
loss_dict['loss'] = reduce_tensor(loss_dict['loss'].data, num_gpus)
# detach loss values
loss_dict_new = dict()
for key, value in loss_dict.items():
if isinstance(value, (int, float)):
loss_dict_new[key] = value
else:
loss_dict_new[key] = value.item()
loss_dict = loss_dict_new
# update avg stats
update_train_values = dict()
for key, value in loss_dict.items():
update_train_values['avg_' + key] = value
keep_avg.update_values(update_train_values)
if c.print_eval:
c_logger.print_eval_step(num_iter, loss_dict, keep_avg.avg_values)
if args.rank == 0:
# Diagnostic visualizations
idx = np.random.randint(mel_targets.shape[0])
pred_spec = decoder_output[idx].detach().data.cpu().numpy().T
gt_spec = mel_targets[idx].data.cpu().numpy().T
align_img = alignments[idx].data.cpu()
eval_figures = {
"prediction": plot_spectrogram(pred_spec, ap, output_fig=False),
"ground_truth": plot_spectrogram(gt_spec, ap, output_fig=False),
"alignment": plot_alignment(align_img, output_fig=False)
}
# Sample audio
eval_audio = ap.inv_melspectrogram(pred_spec.T)
tb_logger.tb_eval_audios(global_step, {"ValAudio": eval_audio},
c.audio["sample_rate"])
# Plot Validation Stats
tb_logger.tb_eval_stats(global_step, keep_avg.avg_values)
tb_logger.tb_eval_figures(global_step, eval_figures)
if args.rank == 0 and epoch >= c.test_delay_epochs:
if c.test_sentences_file is None:
test_sentences = [
"It took me quite a long time to develop a voice, and now that I have it I'm not going to be silent.",
"Be a voice, not an echo.",
"I'm sorry Dave. I'm afraid I can't do that.",
"This cake is great. It's so delicious and moist.",
"Prior to November 22, 1963."
]
else:
with open(c.test_sentences_file, "r") as f:
test_sentences = [s.strip() for s in f.readlines()]
# test sentences
test_audios = {}
test_figures = {}
print(" | > Synthesizing test sentences")
if c.use_speaker_embedding:
if c.use_external_speaker_embedding_file:
speaker_embedding = speaker_mapping[list(speaker_mapping.keys())[randrange(len(speaker_mapping)-1)]]['embedding']
speaker_id = None
else:
speaker_id = 0
speaker_embedding = None
else:
speaker_id = None
speaker_embedding = None
style_wav = c.get("style_wav_for_test")
for idx, test_sentence in enumerate(test_sentences):
try:
wav, alignment, _, postnet_output, _, _ = synthesis(
model,
test_sentence,
c,
use_cuda,
ap,
speaker_id=speaker_id,
speaker_embedding=speaker_embedding,
style_wav=style_wav,
truncated=False,
enable_eos_bos_chars=c.enable_eos_bos_chars, #pylint: disable=unused-argument
use_griffin_lim=True,
do_trim_silence=False)
file_path = os.path.join(AUDIO_PATH, str(global_step))
os.makedirs(file_path, exist_ok=True)
file_path = os.path.join(file_path,
"TestSentence_{}.wav".format(idx))
ap.save_wav(wav, file_path)
test_audios['{}-audio'.format(idx)] = wav
test_figures['{}-prediction'.format(idx)] = plot_spectrogram(
postnet_output, ap)
test_figures['{}-alignment'.format(idx)] = plot_alignment(
alignment)
except: #pylint: disable=bare-except
print(" !! Error creating Test Sentence -", idx)
traceback.print_exc()
tb_logger.tb_test_audios(global_step, test_audios,
c.audio['sample_rate'])
tb_logger.tb_test_figures(global_step, test_figures)
return keep_avg.avg_values
# FIXME: move args definition/parsing inside of main?
def main(args): # pylint: disable=redefined-outer-name
# pylint: disable=global-variable-undefined
global meta_data_train, meta_data_eval, symbols, phonemes, speaker_mapping
# Audio processor
ap = AudioProcessor(**c.audio)
if 'characters' in c.keys():
symbols, phonemes = make_symbols(**c.characters)
# DISTRIBUTED
if num_gpus > 1:
init_distributed(args.rank, num_gpus, args.group_id,
c.distributed["backend"], c.distributed["url"])
num_chars = len(phonemes) if c.use_phonemes else len(symbols)
# load data instances
meta_data_train, meta_data_eval = load_meta_data(c.datasets, eval_split=True)
# set the portion of the data used for training if set in config.json
if 'train_portion' in c.keys():
meta_data_train = meta_data_train[:int(len(meta_data_train) * c.train_portion)]
if 'eval_portion' in c.keys():
meta_data_eval = meta_data_eval[:int(len(meta_data_eval) * c.eval_portion)]
# parse speakers
num_speakers, speaker_embedding_dim, speaker_mapping = parse_speakers(c, args, meta_data_train, OUT_PATH)
# setup model
model = setup_model(num_chars, num_speakers, c, speaker_embedding_dim=speaker_embedding_dim)
optimizer = RAdam(model.parameters(), lr=c.lr, weight_decay=0, betas=(0.9, 0.98), eps=1e-9)
criterion = SpeedySpeechLoss(c)
if args.restore_path:
checkpoint = torch.load(args.restore_path, map_location='cpu')
try:
# TODO: fix optimizer init, model.cuda() needs to be called before
# optimizer restore
optimizer.load_state_dict(checkpoint['optimizer'])
if c.reinit_layers:
raise RuntimeError
model.load_state_dict(checkpoint['model'])
except: #pylint: disable=bare-except
print(" > Partial model initialization.")
model_dict = model.state_dict()
model_dict = set_init_dict(model_dict, checkpoint['model'], c)
model.load_state_dict(model_dict)
del model_dict
for group in optimizer.param_groups:
group['initial_lr'] = c.lr
print(" > Model restored from step %d" % checkpoint['step'],
flush=True)
args.restore_step = checkpoint['step']
else:
args.restore_step = 0
if use_cuda:
model.cuda()
criterion.cuda()
# DISTRIBUTED
if num_gpus > 1:
model = DDP_th(model, device_ids=[args.rank])
if c.noam_schedule:
scheduler = NoamLR(optimizer,
warmup_steps=c.warmup_steps,
last_epoch=args.restore_step - 1)
else:
scheduler = None
num_params = count_parameters(model)
print("\n > Model has {} parameters".format(num_params), flush=True)
if 'best_loss' not in locals():
best_loss = float('inf')
# define dataloaders
train_loader = setup_loader(ap, 1, is_val=False, verbose=True)
eval_loader = setup_loader(ap, 1, is_val=True, verbose=True)
global_step = args.restore_step
for epoch in range(0, c.epochs):
c_logger.print_epoch_start(epoch, c.epochs)
train_avg_loss_dict, global_step = train(train_loader, model, criterion, optimizer,
scheduler, ap, global_step,
epoch)
eval_avg_loss_dict = evaluate(eval_loader, model, criterion, ap, global_step, epoch)
c_logger.print_epoch_end(epoch, eval_avg_loss_dict)
target_loss = train_avg_loss_dict['avg_loss']
if c.run_eval:
target_loss = eval_avg_loss_dict['avg_loss']
best_loss = save_best_model(target_loss, best_loss, model, optimizer, global_step, epoch, c.r,
OUT_PATH)
if __name__ == '__main__':
parser = argparse.ArgumentParser()
parser.add_argument(
'--continue_path',
type=str,
help='Training output folder used to continue a previous training run. If it is used, "config_path" is ignored.',
default='',
required='--config_path' not in sys.argv)
parser.add_argument(
'--restore_path',
type=str,
help='Model file to be restored. Use it to fine-tune a model.',
default='')
parser.add_argument(
'--config_path',
type=str,
help='Path to config file for training.',
required='--continue_path' not in sys.argv
)
parser.add_argument('--debug',
type=bool,
default=False,
help='Do not verify commit integrity to run training.')
# DISTRIBUTED
parser.add_argument(
'--rank',
type=int,
default=0,
help='DISTRIBUTED: process rank for distributed training.')
parser.add_argument('--group_id',
type=str,
default="",
help='DISTRIBUTED: process group id.')
args = parser.parse_args()
if args.continue_path != '':
args.output_path = args.continue_path
args.config_path = os.path.join(args.continue_path, 'config.json')
list_of_files = glob.glob(args.continue_path + "/*.pth.tar") # collect all checkpoint files in the output folder
latest_model_file = max(list_of_files, key=os.path.getctime)
args.restore_path = latest_model_file
print(f" > Training continues for {args.restore_path}")
# setup output paths and read configs
c = load_config(args.config_path)
# check_config(c)
check_config_tts(c)
_ = os.path.dirname(os.path.realpath(__file__))
if c.mixed_precision:
print(" > Mixed precision enabled.")
OUT_PATH = args.continue_path
if args.continue_path == '':
OUT_PATH = create_experiment_folder(c.output_path, c.run_name, args.debug)
AUDIO_PATH = os.path.join(OUT_PATH, 'test_audios')
c_logger = ConsoleLogger()
if args.rank == 0:
os.makedirs(AUDIO_PATH, exist_ok=True)
new_fields = {}
if args.restore_path:
new_fields["restore_path"] = args.restore_path
new_fields["github_branch"] = get_git_branch()
copy_model_files(c, args.config_path, OUT_PATH, new_fields)
os.chmod(AUDIO_PATH, 0o775)
os.chmod(OUT_PATH, 0o775)
LOG_DIR = OUT_PATH
tb_logger = TensorboardLogger(LOG_DIR, model_name='TTS')
# write model desc to tensorboard
tb_logger.tb_add_text('model-description', c['run_description'], 0)
try:
main(args)
except KeyboardInterrupt:
remove_experiment_folder(OUT_PATH)
try:
sys.exit(0)
except SystemExit:
os._exit(0) # pylint: disable=protected-access
except Exception: # pylint: disable=broad-except
remove_experiment_folder(OUT_PATH)
traceback.print_exc()
sys.exit(1)


@ -18,7 +18,7 @@ from TTS.tts.layers.losses import TacotronLoss
from TTS.tts.utils.generic_utils import check_config_tts, setup_model
from TTS.tts.utils.io import save_best_model, save_checkpoint
from TTS.tts.utils.measures import alignment_diagonal_score
from TTS.tts.utils.speakers import load_speaker_mapping, parse_speakers
from TTS.tts.utils.speakers import parse_speakers
from TTS.tts.utils.synthesis import synthesis
from TTS.tts.utils.text.symbols import make_symbols, phonemes, symbols
from TTS.tts.utils.visual import plot_alignment, plot_spectrogram
@ -29,7 +29,7 @@ from TTS.utils.distribute import (DistributedSampler, apply_gradient_allreduce,
from TTS.utils.generic_utils import (KeepAverage, count_parameters,
create_experiment_folder, get_git_branch,
remove_experiment_folder, set_init_dict)
from TTS.utils.io import copy_config_file, load_config
from TTS.utils.io import copy_model_files, load_config
from TTS.utils.radam import RAdam
from TTS.utils.tensorboard_logger import TensorboardLogger
from TTS.utils.training import (NoamLR, adam_weight_decay, check_update,
@ -39,28 +39,35 @@ from TTS.utils.training import (NoamLR, adam_weight_decay, check_update,
use_cuda, num_gpus = setup_torch_training_env(True, False)
def setup_loader(ap, r, is_val=False, verbose=False, speaker_mapping=None):
def setup_loader(ap, r, is_val=False, verbose=False, dataset=None):
if is_val and not c.run_eval:
loader = None
else:
dataset = MyDataset(
r,
c.text_cleaner,
compute_linear_spec=c.model.lower() == 'tacotron',
meta_data=meta_data_eval if is_val else meta_data_train,
ap=ap,
tp=c.characters if 'characters' in c.keys() else None,
add_blank=c['add_blank'] if 'add_blank' in c.keys() else False,
batch_group_size=0 if is_val else c.batch_group_size *
c.batch_size,
min_seq_len=c.min_seq_len,
max_seq_len=c.max_seq_len,
phoneme_cache_path=c.phoneme_cache_path,
use_phonemes=c.use_phonemes,
phoneme_language=c.phoneme_language,
enable_eos_bos=c.enable_eos_bos_chars,
verbose=verbose,
speaker_mapping=speaker_mapping if c.use_speaker_embedding and c.use_external_speaker_embedding_file else None)
if dataset is None:
dataset = MyDataset(
r,
c.text_cleaner,
compute_linear_spec=c.model.lower() == 'tacotron',
meta_data=meta_data_eval if is_val else meta_data_train,
ap=ap,
tp=c.characters if 'characters' in c.keys() else None,
add_blank=c['add_blank'] if 'add_blank' in c.keys() else False,
batch_group_size=0 if is_val else c.batch_group_size *
c.batch_size,
min_seq_len=c.min_seq_len,
max_seq_len=c.max_seq_len,
phoneme_cache_path=c.phoneme_cache_path,
use_phonemes=c.use_phonemes,
phoneme_language=c.phoneme_language,
enable_eos_bos=c.enable_eos_bos_chars,
verbose=verbose,
speaker_mapping=speaker_mapping if c.use_speaker_embedding and c.use_external_speaker_embedding_file else None)
if c.use_phonemes and c.compute_input_seq_cache:
# precompute phonemes to have a better estimate of sequence lengths.
dataset.compute_input_seq(c.num_loader_workers)
dataset.sort_items()
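# sort items here so that bucketing by length reflects the (possibly precomputed) input sequences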
sampler = DistributedSampler(dataset) if num_gpus > 1 else None
loader = DataLoader(
dataset,
@ -74,10 +81,7 @@ def setup_loader(ap, r, is_val=False, verbose=False, speaker_mapping=None):
pin_memory=False)
return loader
def format_data(data, speaker_mapping=None):
if speaker_mapping is None and c.use_speaker_embedding and not c.use_external_speaker_embedding_file:
speaker_mapping = load_speaker_mapping(OUT_PATH)
def format_data(data):
# setup input data
text_input = data[0]
text_lengths = data[1]
@ -126,10 +130,8 @@ def format_data(data, speaker_mapping=None):
return text_input, text_lengths, mel_input, mel_lengths, linear_input, stop_targets, speaker_ids, speaker_embeddings, max_text_length, max_spec_length
def train(model, criterion, optimizer, optimizer_st, scheduler,
ap, global_step, epoch, scaler, scaler_st, speaker_mapping=None):
data_loader = setup_loader(ap, model.decoder.r, is_val=False,
verbose=(epoch == 0), speaker_mapping=speaker_mapping)
def train(data_loader, model, criterion, optimizer, optimizer_st, scheduler,
ap, global_step, epoch, scaler, scaler_st):
model.train()
epoch_time = 0
keep_avg = KeepAverage()
@ -144,7 +146,7 @@ def train(model, criterion, optimizer, optimizer_st, scheduler,
start_time = time.time()
# format data
text_input, text_lengths, mel_input, mel_lengths, linear_input, stop_targets, speaker_ids, speaker_embeddings, max_text_length, max_spec_length = format_data(data, speaker_mapping)
text_input, text_lengths, mel_input, mel_lengths, linear_input, stop_targets, speaker_ids, speaker_embeddings, max_text_length, max_spec_length = format_data(data)
loader_time = time.time() - end_time
global_step += 1
@ -327,8 +329,7 @@ def train(model, criterion, optimizer, optimizer_st, scheduler,
@torch.no_grad()
def evaluate(model, criterion, ap, global_step, epoch, speaker_mapping=None):
data_loader = setup_loader(ap, model.decoder.r, is_val=True, speaker_mapping=speaker_mapping)
def evaluate(data_loader, model, criterion, ap, global_step, epoch):
model.eval()
epoch_time = 0
keep_avg = KeepAverage()
@ -338,7 +339,7 @@ def evaluate(model, criterion, ap, global_step, epoch, speaker_mapping=None):
start_time = time.time()
# format data
text_input, text_lengths, mel_input, mel_lengths, linear_input, stop_targets, speaker_ids, speaker_embeddings, _, _ = format_data(data, speaker_mapping)
text_input, text_lengths, mel_input, mel_lengths, linear_input, stop_targets, speaker_ids, speaker_embeddings, _, _ = format_data(data)
assert mel_input.shape[1] % model.decoder.r == 0
# forward pass model
@ -493,7 +494,7 @@ def evaluate(model, criterion, ap, global_step, epoch, speaker_mapping=None):
# FIXME: move args definition/parsing inside of main?
def main(args): # pylint: disable=redefined-outer-name
# pylint: disable=global-variable-undefined
global meta_data_train, meta_data_eval, symbols, phonemes
global meta_data_train, meta_data_eval, symbols, phonemes, speaker_mapping
# Audio processor
ap = AudioProcessor(**c.audio)
if 'characters' in c.keys():
@ -586,6 +587,13 @@ def main(args): # pylint: disable=redefined-outer-name
if 'best_loss' not in locals():
best_loss = float('inf')
# define data loaders
train_loader = setup_loader(ap,
model.decoder.r,
is_val=False,
verbose=True)
eval_loader = setup_loader(ap, model.decoder.r, is_val=True)
global_step = args.restore_step
for epoch in range(0, c.epochs):
c_logger.print_epoch_start(epoch, c.epochs)
@ -596,17 +604,40 @@ def main(args): # pylint: disable=redefined-outer-name
model.decoder.set_r(r)
if c.bidirectional_decoder:
model.decoder_backward.set_r(r)
train_loader.dataset.outputs_per_step = r
eval_loader.dataset.outputs_per_step = r
train_loader = setup_loader(ap,
model.decoder.r,
is_val=False,
dataset=train_loader.dataset)
eval_loader = setup_loader(ap,
model.decoder.r,
is_val=True,
dataset=eval_loader.dataset)
print("\n > Number of output frames:", model.decoder.r)
train_avg_loss_dict, global_step = train(model, criterion, optimizer,
# train one epoch
train_avg_loss_dict, global_step = train(train_loader, model,
criterion, optimizer,
optimizer_st, scheduler, ap,
global_step, epoch, scaler, scaler_st, speaker_mapping)
eval_avg_loss_dict = evaluate(model, criterion, ap, global_step, epoch, speaker_mapping)
global_step, epoch, scaler,
scaler_st)
# eval one epoch
eval_avg_loss_dict = evaluate(eval_loader, model, criterion, ap,
global_step, epoch)
c_logger.print_epoch_end(epoch, eval_avg_loss_dict)
target_loss = train_avg_loss_dict['avg_postnet_loss']
if c.run_eval:
target_loss = eval_avg_loss_dict['avg_postnet_loss']
best_loss = save_best_model(target_loss, best_loss, model, optimizer, global_step, epoch, c.r,
OUT_PATH, scaler=scaler.state_dict() if c.mixed_precision else None)
best_loss = save_best_model(
target_loss,
best_loss,
model,
optimizer,
global_step,
epoch,
c.r,
OUT_PATH,
scaler=scaler.state_dict() if c.mixed_precision else None)
if __name__ == '__main__':
@ -675,8 +706,8 @@ if __name__ == '__main__':
if args.restore_path:
new_fields["restore_path"] = args.restore_path
new_fields["github_branch"] = get_git_branch()
copy_config_file(args.config_path,
os.path.join(OUT_PATH, 'config.json'), new_fields)
copy_model_files(c, args.config_path,
OUT_PATH, new_fields)
os.chmod(AUDIO_PATH, 0o775)
os.chmod(OUT_PATH, 0o775)


@ -13,7 +13,7 @@ from TTS.utils.console_logger import ConsoleLogger
from TTS.utils.generic_utils import (KeepAverage, count_parameters,
create_experiment_folder, get_git_branch,
remove_experiment_folder, set_init_dict)
from TTS.utils.io import copy_config_file, load_config
from TTS.utils.io import copy_model_files, load_config
from TTS.utils.radam import RAdam
from TTS.utils.tensorboard_logger import TensorboardLogger
from TTS.utils.training import setup_torch_training_env
@ -639,8 +639,8 @@ if __name__ == '__main__':
if args.restore_path:
new_fields["restore_path"] = args.restore_path
new_fields["github_branch"] = get_git_branch()
copy_config_file(args.config_path,
os.path.join(OUT_PATH, 'config.json'), new_fields)
copy_model_files(c, args.config_path,
OUT_PATH, new_fields)
os.chmod(AUDIO_PATH, 0o775)
os.chmod(OUT_PATH, 0o775)


@ -18,7 +18,7 @@ from TTS.utils.distribute import init_distributed
from TTS.utils.generic_utils import (KeepAverage, count_parameters,
create_experiment_folder, get_git_branch,
remove_experiment_folder, set_init_dict)
from TTS.utils.io import copy_config_file, load_config
from TTS.utils.io import copy_model_files, load_config
from TTS.utils.tensorboard_logger import TensorboardLogger
from TTS.utils.training import setup_torch_training_env
from TTS.vocoder.datasets.preprocess import load_wav_data, load_wav_feat_data
@ -486,8 +486,8 @@ if __name__ == '__main__':
if args.restore_path:
new_fields["restore_path"] = args.restore_path
new_fields["github_branch"] = get_git_branch()
copy_config_file(args.config_path,
os.path.join(OUT_PATH, 'config.json'), new_fields)
copy_model_files(c, args.config_path,
OUT_PATH, new_fields)
os.chmod(AUDIO_PATH, 0o775)
os.chmod(OUT_PATH, 0o775)


@ -14,7 +14,7 @@ from torch.utils.data import DataLoader
from TTS.tts.utils.visual import plot_spectrogram
from TTS.utils.audio import AudioProcessor
from TTS.utils.radam import RAdam
from TTS.utils.io import copy_config_file, load_config
from TTS.utils.io import copy_model_files, load_config
from TTS.utils.training import setup_torch_training_env
from TTS.utils.console_logger import ConsoleLogger
from TTS.utils.tensorboard_logger import TensorboardLogger
@ -513,8 +513,8 @@ if __name__ == "__main__":
if args.restore_path:
new_fields["restore_path"] = args.restore_path
new_fields["github_branch"] = get_git_branch()
copy_config_file(
args.config_path, os.path.join(OUT_PATH, "config.json"), new_fields
copy_model_files(
c, args.config_path, OUT_PATH, new_fields
)
os.chmod(AUDIO_PATH, 0o775)
os.chmod(OUT_PATH, 0o775)


@ -8,7 +8,7 @@
<meta name="description" content="">
<meta name="author" content="">
<title>Mozilla - Text2Speech engine</title>
<title>TTS engine</title>
<!-- Bootstrap core CSS -->
<link href="https://stackpath.bootstrapcdn.com/bootstrap/4.1.1/css/bootstrap.min.css"
@ -57,7 +57,6 @@
<div class="row">
<div class="col-lg-12 text-center">
<img class="mt-5" src="https://user-images.githubusercontent.com/1402048/52643646-c2102980-2edd-11e9-8c37-b72f3c89a640.png" alt=></img>
<h1 class="mt-5">Mozilla TTS</h1>
<ul class="list-unstyled">
</ul>
<input id="text" placeholder="Type here..." size=45 type="text" name="text">


@ -99,7 +99,7 @@
"prenet_dropout": false, // enable/disable dropout at prenet.
// TACOTRON ATTENTION
"attention_type": "original", // 'original' or 'graves'
"attention_type": "original", // 'original' , 'graves', 'dynamic_convolution'
"attention_heads": 4, // number of attention heads (only for 'graves')
"attention_norm": "sigmoid", // softmax or sigmoid.
"windowing": false, // Enables attention windowing. Used only in eval mode.
@ -131,6 +131,8 @@
"batch_group_size": 4, //Number of batches to shuffle after bucketing.
"min_seq_len": 6, // DATASET-RELATED: minimum text length to use in training
"max_seq_len": 153, // DATASET-RELATED: maximum text length
"compute_input_seq_cache": false, // if true, text sequences are computed before starting training. If phonemes are enabled, they are also computed at this stage.
"use_noise_augment": true,
// PATHS
"output_path": "/home/erogol/Models/LJSpeech/",


@ -105,6 +105,7 @@
"min_seq_len": 3, // DATASET-RELATED: minimum text length to use in training
"max_seq_len": 500, // DATASET-RELATED: maximum text length
"compute_f0": false, // compute f0 values in data-loader
"compute_input_seq_cache": false, // if true, text sequences are computed before starting training. If phonemes are enabled, they are also computed at this stage.
// PATHS
"output_path": "/home/erogol/Models/LJSpeech/",


@ -1,7 +1,7 @@
{
"model": "glow_tts",
"run_name": "glow-tts-tdsep-conv",
"run_description": "glow-tts model training with time-depth separable conv encoder.",
"run_name": "glow-tts-residual_bn_conv",
"run_description": "glow-tts model training with residual BN conv.",
// AUDIO PARAMETERS
"audio":{
@ -28,15 +28,15 @@
"num_mels": 80, // size of the mel spec frame.
"mel_fmin": 50.0, // minimum freq level for mel-spec. ~50 for male and ~95 for female voices. Tune for dataset!!
"mel_fmax": 7600.0, // maximum freq level for mel-spec. Tune for dataset!!
"spec_gain": 1.0, // scaler value appplied after log transform of spectrogram.
"spec_gain": 1.0, // scaler value appplied after log transform of spectrogram.00
// Normalization parameters
"signal_norm": true, // normalize spec values. Mean-Var normalization if 'stats_path' is defined otherwise range normalization defined by the other params.
"signal_norm": false, // normalize spec values. Mean-Var normalization if 'stats_path' is defined otherwise range normalization defined by the other params.
"min_level_db": -100, // lower bound for normalization
"symmetric_norm": true, // move normalization to range [-1, 1]
"max_norm": 1.0, // scale normalization to range [-max_norm, max_norm] or [0, max_norm]
"clip_norm": true, // clip normalized values into the range.
"stats_path": "/home/erogol/Data/LJSpeech-1.1/scale_stats.npy" // DO NOT USE WITH MULTI_SPEAKER MODEL. scaler stats file computed by 'compute_statistics.py'. If it is defined, mean-std based notmalization is used and other normalization params are ignored
"stats_path": null // DO NOT USE WITH MULTI_SPEAKER MODEL. scaler stats file computed by 'compute_statistics.py'. If it is defined, mean-std based notmalization is used and other normalization params are ignored
},
// VOCABULARY PARAMETERS
@ -62,13 +62,28 @@
"reinit_layers": [], // give a list of layer names to restore from the given checkpoint. If not defined, it reloads all heuristically matching layers.
// MODEL PARAMETERS
"use_mas": false, // use Monotonic Alignment Search if true. Otherwise use pre-computed attention alignments.
// "use_mas": false, // use Monotonic Alignment Search if true. Otherwise use pre-computed attention alignments.
"hidden_channels_encoder": 192,
"hidden_channels_decoder": 192,
"hidden_channels_duration_predictor": 256,
"use_encoder_prenet": true,
"encoder_type": "rel_pos_transformer",
"encoder_params": {
"kernel_size":3,
"dropout_p": 0.1,
"num_layers": 6,
"num_heads": 2,
"hidden_channels_ffn": 768,
"input_length": null
},
// TRAINING
"batch_size": 32, // Batch size for training. Lower values than 32 might cause hard to learn attention. It is overwritten by 'gradual_training'.
"eval_batch_size":16,
"r": 1, // Number of decoder frames to predict per iteration. Set the initial values if gradual training is enabled.
"loss_masking": true, // enable / disable loss masking against the sequence padding.
"mixed_precision": true,
"data_dep_init_iter": 10,
// VALIDATION
"run_eval": true,
@ -84,8 +99,6 @@
"warmup_steps": 4000, // Noam decay steps to increase the learning rate from 0 to "lr"
"seq_len_norm": false, // Normalize eash sample loss with its length to alleviate imbalanced datasets. Use it if your dataset is small or has skewed distribution of sequence lengths.
"encoder_type": "time-depth-separable",
// TENSORBOARD and LOGGING
"print_step": 25, // Number of steps to log training on console.
"tb_plot_step": 100, // Number of steps to plot TB training figures.
@ -93,7 +106,6 @@
"save_step": 5000, // Number of training steps expected to save traninpg stats and checkpoints.
"checkpoint": true, // If true, it saves checkpoints per "save_step"
"tb_model_param_stats": false, // true, plots param stats per layer on tensorboard. Might be memory consuming, but good for debugging.
"apex_amp_level": null,
// DATA LOADING
"text_cleaner": "phoneme_cleaners",
@ -104,6 +116,8 @@
"min_seq_len": 3, // DATASET-RELATED: minimum text length to use in training
"max_seq_len": 500, // DATASET-RELATED: maximum text length
"compute_f0": false, // compute f0 values in data-loader
"use_noise_augment": true, //add a random noise to audio signal for augmentation at training .
"compute_input_seq_cache": true,
// PATHS
"output_path": "/home/erogol/Models/LJSpeech/",
@ -115,6 +129,7 @@
// MULTI-SPEAKER and GST
"use_speaker_embedding": false, // use speaker embedding to enable multi-speaker learning.
"use_external_speaker_embedding_file": false,
"style_wav_for_test": null, // path to style wav file to be used in TacotronGST inference.
"use_gst": false, // TACOTRON ONLY: use global style tokens


@ -0,0 +1,171 @@
{
"model": "Tacotron2",
"run_name": "ljspeech-dcattn",
"run_description": "tacotron2 with dynamic convolution attention.",
// AUDIO PARAMETERS
"audio":{
// stft parameters
"fft_size": 1024, // number of stft frequency levels. Size of the linear spectogram frame.
"win_length": 1024, // stft window length in ms.
"hop_length": 256, // stft window hop-lengh in ms.
"frame_length_ms": null, // stft window length in ms.If null, 'win_length' is used.
"frame_shift_ms": null, // stft window hop-lengh in ms. If null, 'hop_length' is used.
// Audio processing parameters
"sample_rate": 22050, // DATASET-RELATED: wav sample-rate.
"preemphasis": 0.0, // pre-emphasis to reduce spec noise and make it more structured. If 0.0, no -pre-emphasis.
"ref_level_db": 20, // reference level db, theoretically 20db is the sound of air.
// Silence trimming
"do_trim_silence": true,// enable trimming of slience of audio as you load it. LJspeech (true), TWEB (false), Nancy (true)
"trim_db": 60, // threshold for timming silence. Set this according to your dataset.
// Griffin-Lim
"power": 1.5, // value to sharpen wav signals after GL algorithm.
"griffin_lim_iters": 60,// #griffin-lim iterations. 30-60 is a good range. Larger the value, slower the generation.
// MelSpectrogram parameters
"num_mels": 80, // size of the mel spec frame.
"mel_fmin": 50.0, // minimum freq level for mel-spec. ~50 for male and ~95 for female voices. Tune for dataset!!
"mel_fmax": 7600.0, // maximum freq level for mel-spec. Tune for dataset!!
"spec_gain": 1,
// Normalization parameters
"signal_norm": true, // normalize spec values. Mean-Var normalization if 'stats_path' is defined otherwise range normalization defined by the other params.
"min_level_db": -100, // lower bound for normalization
"symmetric_norm": true, // move normalization to range [-1, 1]
"max_norm": 4.0, // scale normalization to range [-max_norm, max_norm] or [0, max_norm]
"clip_norm": true, // clip normalized values into the range.
"stats_path": "/home/erogol/Data/LJSpeech-1.1/scale_stats.npy" // DO NOT USE WITH MULTI_SPEAKER MODEL. scaler stats file computed by 'compute_statistics.py'. If it is defined, mean-std based notmalization is used and other normalization params are ignored
},
// VOCABULARY PARAMETERS
// if custom character set is not defined,
// default set in symbols.py is used
// "characters":{
// "pad": "_",
// "eos": "~",
// "bos": "^",
// "characters": "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz!'(),-.:;? ",
// "punctuations":"!'(),-.:;? ",
// "phonemes":"iyɨʉɯuɪʏʊeøɘəɵɤoɛœɜɞʌɔæɐaɶɑɒᵻʘɓǀɗǃʄǂɠǁʛpbtdʈɖcɟkɡʔɴŋɲɳnɱmʙrʀⱱɾɽɸβfvθðszʃʒʂʐçʝxɣχʁħʕhɦɬɮʋɹɻjɰlɭʎʟˈˌːˑʍwɥʜʢʡɕʑɺɧɚ˞ɫ"
// },
// DISTRIBUTED TRAINING
"distributed":{
"backend": "nccl",
"url": "tcp:\/\/localhost:54321"
},
"reinit_layers": [], // give a list of layer names to restore from the given checkpoint. If not defined, it reloads all heuristically matching layers.
// TRAINING
"batch_size": 32, // Batch size for training. Lower values than 32 might cause hard to learn attention. It is overwritten by 'gradual_training'.
"eval_batch_size":16,
"r": 7, // Number of decoder frames to predict per iteration. Set the initial values if gradual training is enabled.
"gradual_training": [[0, 7, 64], [1, 5, 64], [50000, 3, 32], [130000, 2, 32], [290000, 1, 32]], //set gradual training steps [first_step, r, batch_size]. If it is null, gradual training is disabled. For Tacotron, you might need to reduce the 'batch_size' as you proceeed.
"mixed_precision": true, // level of optimization with NVIDIA's apex feature for automatic mixed FP16/FP32 precision (AMP), NOTE: currently only O1 is supported, and use "O1" to activate.
// LOSS SETTINGS
"loss_masking": true, // enable / disable loss masking against the sequence padding.
"decoder_loss_alpha": 0.5, // original decoder loss weight. If > 0, it is enabled
"postnet_loss_alpha": 0.25, // original postnet loss weight. If > 0, it is enabled
"postnet_diff_spec_alpha": 0.25, // differential spectral loss weight. If > 0, it is enabled
"decoder_diff_spec_alpha": 0.25, // differential spectral loss weight. If > 0, it is enabled
"decoder_ssim_alpha": 0.5, // decoder ssim loss weight. If > 0, it is enabled
"postnet_ssim_alpha": 0.25, // postnet ssim loss weight. If > 0, it is enabled
"ga_alpha": 0.0, // weight for guided attention loss. If > 0, guided attention is enabled.
"stopnet_pos_weight": 15.0, // pos class weight for stopnet loss since there are way more negative samples than positive samples.
// VALIDATION
"run_eval": true,
"test_delay_epochs": 10, //Until attention is aligned, testing only wastes computation time.
"test_sentences_file": null, // set a file to load sentences to be used for testing. If it is null then we use default english sentences.
// OPTIMIZER
"noam_schedule": false, // use noam warmup and lr schedule.
"grad_clip": 1.0, // upper limit for gradients for clipping.
"epochs": 1000, // total number of epochs to train.
"lr": 0.0001, // Initial learning rate. If Noam decay is active, maximum learning rate.
"wd": 0.000001, // Weight decay weight.
"warmup_steps": 4000, // Noam decay steps to increase the learning rate from 0 to "lr"
"seq_len_norm": false, // Normalize eash sample loss with its length to alleviate imbalanced datasets. Use it if your dataset is small or has skewed distribution of sequence lengths.
// TACOTRON PRENET
"memory_size": -1, // ONLY TACOTRON - size of the memory queue used fro storing last decoder predictions for auto-regression. If < 0, memory queue is disabled and decoder only uses the last prediction frame.
"prenet_type": "original", // "original" or "bn".
"prenet_dropout": false, // enable/disable dropout at prenet.
// TACOTRON ATTENTION
"attention_type": "dynamic_convolution", // 'original' , 'graves', 'dynamic_convolution'
"attention_heads": 4, // number of attention heads (only for 'graves')
"attention_norm": "softmax", // softmax or sigmoid.
"windowing": false, // Enables attention windowing. Used only in eval mode.
"use_forward_attn": false, // if it uses forward attention. In general, it aligns faster.
"forward_attn_mask": false, // Additional masking forcing monotonicity only in eval mode.
"transition_agent": false, // enable/disable transition agent of forward attention.
"location_attn": true, // enable_disable location sensitive attention. It is enabled for TACOTRON by default.
"bidirectional_decoder": false, // use https://arxiv.org/abs/1907.09006. Use it, if attention does not work well with your dataset.
"double_decoder_consistency": false, // use DDC explained here https://erogol.com/solving-attention-problems-of-tts-models-with-double-decoder-consistency-draft/
"ddc_r": 7, // reduction rate for coarse decoder.
// STOPNET
"stopnet": true, // Train stopnet predicting the end of synthesis.
"separate_stopnet": true, // Train stopnet seperately if 'stopnet==true'. It prevents stopnet loss to influence the rest of the model. It causes a better model, but it trains SLOWER.
// TENSORBOARD and LOGGING
"print_step": 25, // Number of steps to log training on console.
"tb_plot_step": 100, // Number of steps to plot TB training figures.
"print_eval": false, // If True, it prints intermediate loss values in evalulation.
"save_step": 10000, // Number of training steps expected to save traninpg stats and checkpoints.
"checkpoint": true, // If true, it saves checkpoints per "save_step"
"tb_model_param_stats": false, // true, plots param stats per layer on tensorboard. Might be memory consuming, but good for debugging.
// DATA LOADING
"text_cleaner": "phoneme_cleaners",
"enable_eos_bos_chars": false, // enable/disable beginning of sentence and end of sentence chars.
"num_loader_workers": 4, // number of training data loader processes. Don't set it too big. 4-8 are good values.
"num_val_loader_workers": 4, // number of evaluation data loader processes.
"batch_group_size": 4, //Number of batches to shuffle after bucketing.
"min_seq_len": 6, // DATASET-RELATED: minimum text length to use in training
"max_seq_len": 153, // DATASET-RELATED: maximum text length
"compute_input_seq_cache": false, // if true, text sequences are computed before starting training. If phonemes are enabled, they are also computed at this stage.
// PATHS
"output_path": "/home/erogol/Models/LJSpeech/",
// PHONEMES
"phoneme_cache_path": "/home/erogol/Models/phoneme_cache/", // phoneme computation is slow, therefore, it caches results in the given folder.
"use_phonemes": true, // use phonemes instead of raw characters. It is suggested for better pronounciation.
"phoneme_language": "en-us", // depending on your target language, pick one from https://github.com/bootphon/phonemizer#languages
// MULTI-SPEAKER and GST
"use_speaker_embedding": false, // use speaker embedding to enable multi-speaker learning.
"use_gst": false, // use global style tokens
"use_external_speaker_embedding_file": false, // if true, forces the model to use external embedding per sample instead of nn.embeddings, that is, it supports external embeddings such as those used at: https://arxiv.org/abs /1806.04558
"external_speaker_embedding_file": "../../speakers-vctk-en.json", // if not null and use_external_speaker_embedding_file is true, it is used to load a specific embedding file and thus uses these embeddings instead of nn.embeddings, that is, it supports external embeddings such as those used at: https://arxiv.org/abs /1806.04558
"gst": { // gst parameter if gst is enabled
"gst_style_input": null, // Condition the style input either on a
// -> wave file [path to wave] or
// -> dictionary using the style tokens {'token1': 'value', 'token2': 'value'} example {"0": 0.15, "1": 0.15, "5": -0.15}
// with the dictionary being len(dict) <= len(gst_style_tokens).
"gst_embedding_dim": 512,
"gst_num_heads": 4,
"gst_style_tokens": 10,
"gst_use_speaker_embedding": false
},
// DATASETS
"datasets": // List of datasets. They all merged and they get different speaker_ids.
[
{
"name": "ljspeech",
"path": "/home/erogol/Data/LJSpeech-1.1/",
"meta_file_train": "metadata.csv", // for vtck if list, ignore speakers id in list for train, its useful for test cloning with new speakers
"meta_file_val": null
}
]
}
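A short, hypothetical helper (not the library's own scheduler) showing how a [first_step, r, batch_size] schedule like 'gradual_training' above resolves for a given global step:

def resolve_gradual_training(global_step, schedule):
    # keep the last entry whose first_step has already been reached
    r, batch_size = schedule[0][1], schedule[0][2]
    for first_step, new_r, new_bs in schedule:
        if global_step >= first_step:
            r, batch_size = new_r, new_bs
    return r, batch_size

# with the schedule above, step 60000 resolves to r=3, batch_size=32
resolve_gradual_training(60000, [[0, 7, 64], [1, 5, 64], [50000, 3, 32], [130000, 2, 32], [290000, 1, 32]])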


@ -0,0 +1,151 @@
{
"model": "speedy_speech",
"run_name": "speedy-speech-ljspeech",
"run_description": "speedy-speech model for LJSpeech dataset.",
// AUDIO PARAMETERS
"audio":{
// stft parameters
"fft_size": 1024, // number of stft frequency levels. Size of the linear spectogram frame.
"win_length": 1024, // stft window length in ms.
"hop_length": 256, // stft window hop-lengh in ms.
"frame_length_ms": null, // stft window length in ms.If null, 'win_length' is used.
"frame_shift_ms": null, // stft window hop-lengh in ms. If null, 'hop_length' is used.
// Audio processing parameters
"sample_rate": 22050, // DATASET-RELATED: wav sample-rate.
"preemphasis": 0.0, // pre-emphasis to reduce spec noise and make it more structured. If 0.0, no -pre-emphasis.
"ref_level_db": 20, // reference level db, theoretically 20db is the sound of air.
// Silence trimming
"do_trim_silence": true,// enable trimming of slience of audio as you load it. LJspeech (true), TWEB (false), Nancy (true)
"trim_db": 60, // threshold for timming silence. Set this according to your dataset.
// Griffin-Lim
"power": 1.5, // value to sharpen wav signals after GL algorithm.
"griffin_lim_iters": 60,// #griffin-lim iterations. 30-60 is a good range. Larger the value, slower the generation.
// MelSpectrogram parameters
"num_mels": 80, // size of the mel spec frame.
"mel_fmin": 50.0, // minimum freq level for mel-spec. ~50 for male and ~95 for female voices. Tune for dataset!!
"mel_fmax": 7600.0, // maximum freq level for mel-spec. Tune for dataset!!
"spec_gain": 1,
// Normalization parameters
"signal_norm": true, // normalize spec values. Mean-Var normalization if 'stats_path' is defined otherwise range normalization defined by the other params.
"min_level_db": -100, // lower bound for normalization
"symmetric_norm": true, // move normalization to range [-1, 1]
"max_norm": 4.0, // scale normalization to range [-max_norm, max_norm] or [0, max_norm]
"clip_norm": true, // clip normalized values into the range.
"stats_path": "/home/erogol/Data/LJSpeech-1.1/scale_stats.npy" // DO NOT USE WITH MULTI_SPEAKER MODEL. scaler stats file computed by 'compute_statistics.py'. If it is defined, mean-std based notmalization is used and other normalization params are ignored
},
// VOCABULARY PARAMETERS
// if custom character set is not defined,
// default set in symbols.py is used
// "characters":{
// "pad": "_",
// "eos": "&",
// "bos": "*",
// "characters": "ABCDEFGHIJKLMNOPQRSTUVWXYZÇÃÀÁÂÊÉÍÓÔÕÚÛabcdefghijklmnopqrstuvwxyzçãàáâêéíóôõúû!(),-.:;? ",
// "punctuations":"!'(),-.:;? ",
// "phonemes":"iyɨʉɯuɪʏʊeøɘəɵɤoɛœɜɞʌɔæɐaɶɑɒᵻʘɓǀɗǃʄǂɠǁʛpbtdʈɖcɟkɡʔɴŋɲɳnɱmʙrʀⱱɾɽɸβfvθðszʃʒʂʐçʝxɣχʁħʕhɦɬɮʋɹɻjɰlɭʎʟˈˌːˑʍwɥʜʢʡɕʑɺɧɚ˞ɫ'̃' "
// },
"add_blank": false, // if true add a new token after each token of the sentence. This increases the size of the input sequence, but has considerably improved the prosody of the GlowTTS model.
// DISTRIBUTED TRAINING
"distributed":{
"backend": "nccl",
"url": "tcp:\/\/localhost:54321"
},
"reinit_layers": [], // give a list of layer names to restore from the given checkpoint. If not defined, it reloads all heuristically matching layers.
// MODEL PARAMETERS
"positional_encoding": true,
"hidden_channels": 128, // defined globally all the hidden channels of the model - 128 default
"encoder_type": "residual_conv_bn",
"encoder_params":{
"kernel_size": 4,
"dilations": [1, 2, 4, 1, 2, 4, 1, 2, 4, 1, 2, 4, 1],
"num_conv_blocks": 2,
"num_res_blocks": 13
},
"decoder_type": "residual_conv_bn",
"decoder_params":{
"kernel_size": 4,
"dilations": [1, 2, 4, 8, 1, 2, 4, 8, 1, 2, 4, 8, 1, 2, 4, 8, 1],
"num_conv_blocks": 2,
"num_res_blocks": 17
},
// TRAINING
"batch_size":64, // Batch size for training. Lower values than 32 might cause hard to learn attention. It is overwritten by 'gradual_training'.
"eval_batch_size":32,
"r": 1, // Number of decoder frames to predict per iteration. Set the initial values if gradual training is enabled.
"loss_masking": true, // enable / disable loss masking against the sequence padding.
// LOSS PARAMETERS
"ssim_alpha": 1,
"l1_alpha": 1,
"huber_alpha": 1,
// VALIDATION
"run_eval": true,
"test_delay_epochs": -1, //Until attention is aligned, testing only wastes computation time.
"test_sentences_file": null, // set a file to load sentences to be used for testing. If it is null then we use default english sentences.
// OPTIMIZER
"noam_schedule": true, // use noam warmup and lr schedule.
"grad_clip": 1.0, // upper limit for gradients for clipping.
"epochs": 10000, // total number of epochs to train.
"lr": 0.002, // Initial learning rate. If Noam decay is active, maximum learning rate.
"warmup_steps": 4000, // Noam decay steps to increase the learning rate from 0 to "lr"
// TENSORBOARD and LOGGING
"print_step": 25, // Number of steps to log training on console.
"tb_plot_step": 100, // Number of steps to plot TB training figures.
"print_eval": false, // If True, it prints intermediate loss values in evalulation.
"save_step": 5000, // Number of training steps expected to save traninpg stats and checkpoints.
"checkpoint": true, // If true, it saves checkpoints per "save_step"
"tb_model_param_stats": false, // true, plots param stats per layer on tensorboard. Might be memory consuming, but good for debugging.:set n
"mixed_precision": false,
// DATA LOADING
"text_cleaner": "english_cleaners",
"enable_eos_bos_chars": false, // enable/disable beginning of sentence and end of sentence chars.
"num_loader_workers": 8, // number of training data loader processes. Don't set it too big. 4-8 are good values.
"num_val_loader_workers": 8, // number of evaluation data loader processes.
"batch_group_size": 4, //Number of batches to shuffle after bucketing.
"min_seq_len": 2, // DATASET-RELATED: minimum text length to use in training
"max_seq_len": 300, // DATASET-RELATED: maximum text length
"compute_f0": false, // compute f0 values in data-loader
"compute_input_seq_cache": false, // if true, text sequences are computed before starting training. If phonemes are enabled, they are also computed at this stage.
// PATHS
"output_path": "/home/erogol/Models/ljspeech/",
// PHONEMES
"phoneme_cache_path": "/home/erogol/Models/ljspeech_phonemes/", // phoneme computation is slow, therefore, it caches results in the given folder.
"use_phonemes": true, // use phonemes instead of raw characters. It is suggested for better pronoun[ciation.
"phoneme_language": "en-us", // depending on your target language, pick one from https://github.com/bootphon/phonemizer#languages
// MULTI-SPEAKER and GST
"use_speaker_embedding": false, // use speaker embedding to enable multi-speaker learning.
"use_external_speaker_embedding_file": false, // if true, forces the model to use external embedding per sample instead of nn.embeddings, that is, it supports external embeddings such as those used at: https://arxiv.org/abs /1806.04558
"external_speaker_embedding_file": "/home/erogol/Data/libritts/speakers.json", // if not null and use_external_speaker_embedding_file is true, it is used to load a specific embedding file and thus uses these embeddings instead of nn.embeddings, that is, it supports external embeddings such as those used at: https://arxiv.org/abs /1806.04558
// DATASETS
"datasets": // List of datasets. They all merged and they get different s$
[
{
"name": "ljspeech",
"path": "/home/erogol/Data/LJSpeech-1.1/",
"meta_file_train": "metadata.csv",
"meta_file_val": null,
"meta_file_attn_mask": "/home/erogol/Data/LJSpeech-1.1/metadata_attn_mask.txt" // created by bin/compute_attention_masks.py
}
]
}
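As a quick, hypothetical illustration of the 'add_blank' comment above (not the library's own implementation; the real blank id and exact placement depend on the character set):

def add_blank_tokens(sequence, blank_id=0):
    # insert a blank token after every input token, as described in the config comment
    out = []
    for token in sequence:
        out.extend([token, blank_id])
    return out

add_blank_tokens([5, 6, 7])  # -> [5, 0, 6, 0, 7, 0]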


@ -1,12 +1,16 @@
import os
import numpy as np
import collections
import torch
import os
import random
from torch.utils.data import Dataset
from multiprocessing import Manager, Pool
from TTS.tts.utils.text import text_to_sequence, phoneme_to_sequence, pad_with_eos_bos
from TTS.tts.utils.data import prepare_data, prepare_tensor, prepare_stop_target
import numpy as np
import torch
import tqdm
from torch.utils.data import Dataset
from TTS.tts.utils.data import (prepare_data, prepare_stop_target,
prepare_tensor)
from TTS.tts.utils.text import (pad_with_eos_bos, phoneme_to_sequence,
text_to_sequence)
class MyDataset(Dataset):
@ -26,6 +30,7 @@ class MyDataset(Dataset):
phoneme_language="en-us",
enable_eos_bos=False,
speaker_mapping=None,
use_noise_augment=False,
verbose=False):
"""
Args:
@ -44,6 +49,7 @@ class MyDataset(Dataset):
phoneme_language (str): one the languages from
https://github.com/bootphon/phonemizer#languages
enable_eos_bos (bool): enable end of sentence and beginning of sentences characters.
use_noise_augment (bool): enable adding random noise to wav for augmentation.
verbose (bool): print diagnostic information.
"""
self.batch_group_size = batch_group_size
@ -62,7 +68,9 @@ class MyDataset(Dataset):
self.phoneme_language = phoneme_language
self.enable_eos_bos = enable_eos_bos
self.speaker_mapping = speaker_mapping
self.use_noise_augment = use_noise_augment
self.verbose = verbose
self.input_seq_computed = False
if use_phonemes and not os.path.isdir(phoneme_cache_path):
os.makedirs(phoneme_cache_path, exist_ok=True)
if self.verbose:
@ -71,7 +79,6 @@ class MyDataset(Dataset):
if use_phonemes:
print(" | > phoneme language: {}".format(phoneme_language))
print(" | > Number of instances : {}".format(len(self.items)))
self.sort_items()
def load_wav(self, filename):
audio = self.ap.load_wav(filename)
@ -82,35 +89,40 @@ class MyDataset(Dataset):
data = np.load(filename).astype('float32')
return data
def _generate_and_cache_phoneme_sequence(self, text, cache_path):
@staticmethod
def _generate_and_cache_phoneme_sequence(text, cache_path, cleaners, language, tp, add_blank):
"""generate a phoneme sequence from text.
since the usage is for subsequent caching, we never add bos and
eos chars here. Instead, we add those dynamically later, based on the
config option."""
phonemes = phoneme_to_sequence(text, [self.cleaners],
language=self.phoneme_language,
phonemes = phoneme_to_sequence(text, [cleaners],
language=language,
enable_eos_bos=False,
tp=self.tp, add_blank=self.add_blank)
tp=tp, add_blank=add_blank)
phonemes = np.asarray(phonemes, dtype=np.int32)
np.save(cache_path, phonemes)
return phonemes
def _load_or_generate_phoneme_sequence(self, wav_file, text):
@staticmethod
def _load_or_generate_phoneme_sequence(wav_file, text, phoneme_cache_path, enable_eos_bos, cleaners, language, tp, add_blank):
file_name = os.path.splitext(os.path.basename(wav_file))[0]
cache_path = os.path.join(self.phoneme_cache_path,
file_name + '_phoneme.npy')
# different names for normal phonemes and with blank chars.
file_name_ext = '_blanked_phoneme.npy' if add_blank else '_phoneme.npy'
cache_path = os.path.join(phoneme_cache_path,
file_name + file_name_ext)
try:
phonemes = np.load(cache_path)
except FileNotFoundError:
phonemes = self._generate_and_cache_phoneme_sequence(
text, cache_path)
phonemes = MyDataset._generate_and_cache_phoneme_sequence(
text, cache_path, cleaners, language, tp, add_blank)
except (ValueError, IOError):
print(" > ERROR: failed loading phonemes for {}. "
print(" [!] failed loading phonemes for {}. "
"Recomputing.".format(wav_file))
phonemes = self._generate_and_cache_phoneme_sequence(
text, cache_path)
if self.enable_eos_bos:
phonemes = pad_with_eos_bos(phonemes, tp=self.tp)
phonemes = MyDataset._generate_and_cache_phoneme_sequence(
text, cache_path, cleaners, language, tp, add_blank)
if enable_eos_bos:
phonemes = pad_with_eos_bos(phonemes, tp=tp)
phonemes = np.asarray(phonemes, dtype=np.int32)
return phonemes
@ -125,11 +137,17 @@ class MyDataset(Dataset):
wav = np.asarray(self.load_wav(wav_file), dtype=np.float32)
if self.use_phonemes:
text = self._load_or_generate_phoneme_sequence(wav_file, text)
else:
text = np.asarray(text_to_sequence(text, [self.cleaners],
tp=self.tp, add_blank=self.add_blank),
# apply noise for augmentation
if self.use_noise_augment:
wav = wav + (1.0 / 32768.0) * np.random.rand(*wav.shape)
if not self.input_seq_computed:
if self.use_phonemes:
text = self._load_or_generate_phoneme_sequence(wav_file, text, self.phoneme_cache_path, self.enable_eos_bos, self.cleaners, self.phoneme_language, self.tp, self.add_blank)
else:
text = np.asarray(text_to_sequence(text, [self.cleaners],
tp=self.tp, add_blank=self.add_blank),
dtype=np.int32)
assert text.size > 0, self.items[idx][1]
@ -138,6 +156,12 @@ class MyDataset(Dataset):
if "attn_file" in locals():
attn = np.load(attn_file)
if len(text) > self.max_seq_len:
# return a different sample if the phonemized
# text is longer than the threshold
# TODO: find a better fix
return self.load_data(100)
sample = {
'text': text,
'wav': wav,
@ -148,6 +172,41 @@ class MyDataset(Dataset):
}
return sample
@staticmethod
def _phoneme_worker(args):
item = args[0]
func_args = args[1]
text, wav_file, *_ = item
phonemes = MyDataset._load_or_generate_phoneme_sequence(wav_file, text, *func_args)
return phonemes
def compute_input_seq(self, num_workers=0):
"""compute input sequences separately. Call it before
passing dataset to data loader."""
if not self.use_phonemes:
if self.verbose:
print(" | > Computing input sequences ...")
for idx, item in enumerate(tqdm.tqdm(self.items)):
text, *_ = item
sequence = np.asarray(text_to_sequence(text, [self.cleaners],
tp=self.tp, add_blank=self.add_blank),
dtype=np.int32)
self.items[idx][0] = sequence
else:
func_args = [self.phoneme_cache_path, self.enable_eos_bos, self.cleaners, self.phoneme_language, self.tp, self.add_blank]
if self.verbose:
print(" | > Computing phonemes ...")
if num_workers == 0:
for idx, item in enumerate(tqdm.tqdm(self.items)):
phonemes = self._phoneme_worker([item, func_args])
self.items[idx][0] = phonemes
else:
with Pool(num_workers) as p:
phonemes = list(tqdm.tqdm(p.imap(MyDataset._phoneme_worker, [[item, func_args] for item in self.items]), total=len(self.items)))
for idx, p in enumerate(phonemes):
self.items[idx][0] = p
def sort_items(self):
r"""Sort instances based on text length in ascending order"""
lengths = np.array([len(ins[0]) for ins in self.items])


@ -8,6 +8,9 @@ from tqdm import tqdm
from TTS.tts.utils.generic_utils import split_dataset
####################
# UTILITIES
####################
def load_meta_data(datasets, eval_split=True):
meta_data_train_all = []
@ -17,9 +20,12 @@ def load_meta_data(datasets, eval_split=True):
root_path = dataset['path']
meta_file_train = dataset['meta_file_train']
meta_file_val = dataset['meta_file_val']
# setup the right data processor
preprocessor = get_preprocessor_by_name(name)
# load train set
meta_data_train = preprocessor(root_path, meta_file_train)
print(f" | > Found {len(meta_data_train)} files in {Path(root_path).resolve()}")
# load evaluation split if set
if eval_split:
if meta_file_val is None:
meta_data_eval, meta_data_train = split_dataset(meta_data_train)
@ -27,15 +33,41 @@ def load_meta_data(datasets, eval_split=True):
meta_data_eval = preprocessor(root_path, meta_file_val)
meta_data_eval_all += meta_data_eval
meta_data_train_all += meta_data_train
# load attention masks for duration predictor training
if 'meta_file_attn_mask' in dataset:
meta_data = dict(load_attention_mask_meta_data(dataset['meta_file_attn_mask']))
for idx, ins in enumerate(meta_data_train_all):
attn_file = meta_data[ins[1]].strip()
meta_data_train_all[idx].append(attn_file)
if meta_data_eval_all is not None:
for idx, ins in enumerate(meta_data_eval_all):
attn_file = meta_data[ins[1]].strip()
meta_data_eval_all[idx].append(attn_file)
return meta_data_train_all, meta_data_eval_all
def load_attention_mask_meta_data(metafile_path):
"""Load meta data file created by compute_attention_masks.py"""
with open(metafile_path, 'r') as f:
lines = f.readlines()
meta_data = []
for line in lines:
wav_file, attn_file = line.split('|')
meta_data.append([wav_file, attn_file])
return meta_data
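A hedged usage sketch for the loader above; the file path and its contents are illustrative, but follow the per-line 'wav_path|attn_path' format this function expects:

# metadata_attn_mask.txt (illustrative contents):
#   /data/LJSpeech-1.1/wavs/LJ001-0001.wav|/data/LJSpeech-1.1/attn/LJ001-0001.npy
#   /data/LJSpeech-1.1/wavs/LJ001-0002.wav|/data/LJSpeech-1.1/attn/LJ001-0002.npy
pairs = load_attention_mask_meta_data("/data/LJSpeech-1.1/metadata_attn_mask.txt")
wav_file, attn_file = pairs[0]  # attn_file still carries the trailing newline; load_meta_data strips it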
def get_preprocessor_by_name(name):
"""Returns the respective preprocessing function."""
thismodule = sys.modules[__name__]
return getattr(thismodule, name.lower())
########################
# DATASETS
########################
def tweb(root_path, meta_file):
"""Normalize TWEB dataset.
https://www.kaggle.com/bryanpark/the-world-english-bible-speech-dataset
@ -52,19 +84,6 @@ def tweb(root_path, meta_file):
return items
# def kusal(root_path, meta_file):
# txt_file = os.path.join(root_path, meta_file)
# texts = []
# wavs = []
# with open(txt_file, "r", encoding="utf8") as f:
# frames = [
# line.split('\t') for line in f
# if line.split('\t')[0] in self.wav_files_dict.keys()
# ]
# # TODO: code the rest
# return {'text': texts, 'wavs': wavs}
def mozilla(root_path, meta_file):
"""Normalizes Mozilla meta data files to TTS format"""
txt_file = os.path.join(root_path, meta_file)


@ -0,0 +1,482 @@
import torch
from torch import nn
from torch.nn import functional as F
from TTS.tts.layers.common_layers import Linear
from scipy.stats import betabinom
class LocationLayer(nn.Module):
"""Layers for Location Sensitive Attention
Args:
attention_dim (int): number of channels in the input tensor.
attention_n_filters (int, optional): number of filters in convolution. Defaults to 32.
attention_kernel_size (int, optional): kernel size of convolution filter. Defaults to 31.
"""
def __init__(self,
attention_dim,
attention_n_filters=32,
attention_kernel_size=31):
super(LocationLayer, self).__init__()
self.location_conv1d = nn.Conv1d(
in_channels=2,
out_channels=attention_n_filters,
kernel_size=attention_kernel_size,
stride=1,
padding=(attention_kernel_size - 1) // 2,
bias=False)
self.location_dense = Linear(
attention_n_filters, attention_dim, bias=False, init_gain='tanh')
def forward(self, attention_cat):
"""
Shapes:
attention_cat: [B, 2, C]
"""
processed_attention = self.location_conv1d(attention_cat)
processed_attention = self.location_dense(
processed_attention.transpose(1, 2))
return processed_attention
class GravesAttention(nn.Module):
"""Graves Attention as is ref1 with updates from ref2.
ref1: https://arxiv.org/abs/1910.10288
ref2: https://arxiv.org/pdf/1906.01083.pdf
Args:
query_dim (int): number of channels in query tensor.
K (int): number of Gaussian heads to be used for computing attention.
"""
COEF = 0.3989422917366028 # numpy.sqrt(1/(2*numpy.pi))
def __init__(self, query_dim, K):
super(GravesAttention, self).__init__()
self._mask_value = 1e-8
self.K = K
# self.attention_alignment = 0.05
self.eps = 1e-5
self.J = None
self.N_a = nn.Sequential(
nn.Linear(query_dim, query_dim, bias=True),
nn.ReLU(),
nn.Linear(query_dim, 3*K, bias=True))
self.attention_weights = None
self.mu_prev = None
self.init_layers()
def init_layers(self):
torch.nn.init.constant_(self.N_a[2].bias[(2*self.K):(3*self.K)], 1.) # bias mean
torch.nn.init.constant_(self.N_a[2].bias[self.K:(2*self.K)], 10) # bias std
def init_states(self, inputs):
if self.J is None or inputs.shape[1]+1 > self.J.shape[-1]:
self.J = torch.arange(0, inputs.shape[1]+2.0).to(inputs.device) + 0.5
self.attention_weights = torch.zeros(inputs.shape[0], inputs.shape[1]).to(inputs.device)
self.mu_prev = torch.zeros(inputs.shape[0], self.K).to(inputs.device)
# pylint: disable=R0201
# pylint: disable=unused-argument
def preprocess_inputs(self, inputs):
return None
def forward(self, query, inputs, processed_inputs, mask):
"""
Shapes:
query: [B, C_attention_rnn]
inputs: [B, T_in, C_encoder]
processed_inputs: place_holder
mask: [B, T_in]
"""
gbk_t = self.N_a(query)
gbk_t = gbk_t.view(gbk_t.size(0), -1, self.K)
# attention model parameters
# each B x K
g_t = gbk_t[:, 0, :]
b_t = gbk_t[:, 1, :]
k_t = gbk_t[:, 2, :]
# dropout to decorrelate attention heads
g_t = torch.nn.functional.dropout(g_t, p=0.5, training=self.training)
# attention GMM parameters
sig_t = torch.nn.functional.softplus(b_t) + self.eps
mu_t = self.mu_prev + torch.nn.functional.softplus(k_t)
g_t = torch.softmax(g_t, dim=-1) + self.eps
j = self.J[:inputs.size(1)+1]
# attention weights
phi_t = g_t.unsqueeze(-1) * (1 / (1 + torch.sigmoid((mu_t.unsqueeze(-1) - j) / sig_t.unsqueeze(-1))))
# discretize attention weights
alpha_t = torch.sum(phi_t, 1)
alpha_t = alpha_t[:, 1:] - alpha_t[:, :-1]
alpha_t[alpha_t == 0] = 1e-8
# apply masking
if mask is not None:
alpha_t.data.masked_fill_(~mask, self._mask_value)
context = torch.bmm(alpha_t.unsqueeze(1), inputs).squeeze(1)
self.attention_weights = alpha_t
self.mu_prev = mu_t
return context
class OriginalAttention(nn.Module):
"""Bahdanau Attention with various optional modifications. Proposed below.
- Location sensitive attnetion: https://arxiv.org/abs/1712.05884
- Forward Attention: https://arxiv.org/abs/1807.06736 + state masking at inference
- Using sigmoid instead of softmax normalization
- Attention windowing at inference time
Note:
Location Sensitive Attention is an attention mechanism that extends the additive attention mechanism
to use cumulative attention weights from previous decoder time steps as an additional feature.
Forward attention considers only the alignment paths that satisfy the monotonic condition at each
decoder timestep. The modified attention probabilities at each timestep are computed recursively
using a forward algorithm.
Transition agent for forward attention is further proposed, which helps the attention mechanism
to make decisions whether to move forward or stay at each decoder timestep.
Attention windowing applies a sliding window to the time steps of the input tensor, centered at the last
time step with the largest attention weight. It is especially useful at inference to keep the attention
alignment diagonal.
Args:
query_dim (int): number of channels in the query tensor.
embedding_dim (int): number of channels in the value tensor. In general, the value tensor is the output of the encoder layer.
attention_dim (int): number of channels of the inner attention layers.
location_attention (bool): enable/disable location sensitive attention.
attention_location_n_filters (int): number of location attention filters.
attention_location_kernel_size (int): filter size of location attention convolution layer.
windowing (int): window size for attention windowing. if it is 5, for computing the attention, it only considers the time steps [(t-5), ..., (t+5)] of the input.
norm (str): normalization method applied to the attention weights. 'softmax' or 'sigmoid'
forward_attn (bool): enable/disable forward attention.
trans_agent (bool): enable/disable transition agent in the forward attention.
forward_attn_mask (int): enable/disable an explicit masking in forward attention. It is especially useful at inference time.
"""
# Pylint gets confused by PyTorch conventions here
#pylint: disable=attribute-defined-outside-init
def __init__(self, query_dim, embedding_dim, attention_dim,
location_attention, attention_location_n_filters,
attention_location_kernel_size, windowing, norm, forward_attn,
trans_agent, forward_attn_mask):
super(OriginalAttention, self).__init__()
self.query_layer = Linear(
query_dim, attention_dim, bias=False, init_gain='tanh')
self.inputs_layer = Linear(
embedding_dim, attention_dim, bias=False, init_gain='tanh')
self.v = Linear(attention_dim, 1, bias=True)
if trans_agent:
self.ta = nn.Linear(
query_dim + embedding_dim, 1, bias=True)
if location_attention:
self.location_layer = LocationLayer(
attention_dim,
attention_location_n_filters,
attention_location_kernel_size,
)
self._mask_value = -float("inf")
self.windowing = windowing
self.win_idx = None
self.norm = norm
self.forward_attn = forward_attn
self.trans_agent = trans_agent
self.forward_attn_mask = forward_attn_mask
self.location_attention = location_attention
def init_win_idx(self):
self.win_idx = -1
self.win_back = 2
self.win_front = 6
def init_forward_attn(self, inputs):
B = inputs.shape[0]
T = inputs.shape[1]
self.alpha = torch.cat(
[torch.ones([B, 1]),
torch.zeros([B, T])[:, :-1] + 1e-7], dim=1).to(inputs.device)
self.u = (0.5 * torch.ones([B, 1])).to(inputs.device)
def init_location_attention(self, inputs):
B = inputs.size(0)
T = inputs.size(1)
self.attention_weights_cum = torch.zeros([B, T], device=inputs.device)
def init_states(self, inputs):
B = inputs.size(0)
T = inputs.size(1)
self.attention_weights = torch.zeros([B, T], device=inputs.device)
if self.location_attention:
self.init_location_attention(inputs)
if self.forward_attn:
self.init_forward_attn(inputs)
if self.windowing:
self.init_win_idx()
def preprocess_inputs(self, inputs):
return self.inputs_layer(inputs)
def update_location_attention(self, alignments):
self.attention_weights_cum += alignments
def get_location_attention(self, query, processed_inputs):
attention_cat = torch.cat((self.attention_weights.unsqueeze(1),
self.attention_weights_cum.unsqueeze(1)),
dim=1)
processed_query = self.query_layer(query.unsqueeze(1))
processed_attention_weights = self.location_layer(attention_cat)
energies = self.v(
torch.tanh(processed_query + processed_attention_weights +
processed_inputs))
energies = energies.squeeze(-1)
return energies, processed_query
def get_attention(self, query, processed_inputs):
processed_query = self.query_layer(query.unsqueeze(1))
energies = self.v(torch.tanh(processed_query + processed_inputs))
energies = energies.squeeze(-1)
return energies, processed_query
def apply_windowing(self, attention, inputs):
back_win = self.win_idx - self.win_back
front_win = self.win_idx + self.win_front
if back_win > 0:
attention[:, :back_win] = -float("inf")
if front_win < inputs.shape[1]:
attention[:, front_win:] = -float("inf")
# at the very first step (win_idx == -1) bias the window toward
# position 0; this does not hurt otherwise.
if self.win_idx == -1:
attention[:, 0] = attention.max()
# Update the window
self.win_idx = torch.argmax(attention, 1).long()[0].item()
return attention
def apply_forward_attention(self, alignment):
# forward attention
fwd_shifted_alpha = F.pad(
self.alpha[:, :-1].clone().to(alignment.device), (1, 0, 0, 0))
# compute transition potentials
alpha = ((1 - self.u) * self.alpha
+ self.u * fwd_shifted_alpha
+ 1e-8) * alignment
# force incremental alignment
if not self.training and self.forward_attn_mask:
_, n = fwd_shifted_alpha.max(1)
val, _ = alpha.max(1)
for b in range(alignment.shape[0]):
alpha[b, n[b] + 3:] = 0
alpha[b, :(
n[b] - 1
)] = 0 # ignore all previous states to prevent repetition.
alpha[b,
(n[b] - 2
)] = 0.01 * val[b] # smoothing factor for the prev step
# renormalize attention weights
alpha = alpha / alpha.sum(dim=1, keepdim=True)
return alpha
def forward(self, query, inputs, processed_inputs, mask):
"""
shapes:
query: [B, C_attn_rnn]
inputs: [B, T_en, D_en]
processed_inputs: [B, T_en, D_attn]
mask: [B, T_en]
"""
if self.location_attention:
attention, _ = self.get_location_attention(
query, processed_inputs)
else:
attention, _ = self.get_attention(
query, processed_inputs)
# apply masking
if mask is not None:
attention.data.masked_fill_(~mask, self._mask_value)
# apply windowing - only in eval mode
if not self.training and self.windowing:
attention = self.apply_windowing(attention, inputs)
# normalize attention values
if self.norm == "softmax":
alignment = torch.softmax(attention, dim=-1)
elif self.norm == "sigmoid":
alignment = torch.sigmoid(attention) / torch.sigmoid(
attention).sum(
dim=1, keepdim=True)
else:
raise ValueError("Unknown value for attention norm type")
if self.location_attention:
self.update_location_attention(alignment)
# apply forward attention if enabled
if self.forward_attn:
alignment = self.apply_forward_attention(alignment)
self.alpha = alignment
context = torch.bmm(alignment.unsqueeze(1), inputs)
context = context.squeeze(1)
self.attention_weights = alignment
# compute transition agent
if self.forward_attn and self.trans_agent:
ta_input = torch.cat([context, query.squeeze(1)], dim=-1)
self.u = torch.sigmoid(self.ta(ta_input))
return context
class MonotonicDynamicConvolutionAttention(nn.Module):
"""Dynamic convolution attention from
https://arxiv.org/pdf/1910.10288.pdf
query -> linear -> tanh -> linear ->|
| mask values
v | |
atten_w(t-1) -|-> conv1d_dynamic -> linear -|-> tanh -> + -> softmax -> * -> * -> context
|-> conv1d_static -> linear -| |
|-> conv1d_prior -> log ----------------|
query: attention rnn output.
Note:
Dynamic convolution attention is a variant of location sensitive attention that computes its
convolution filters dynamically from the previous attention scores, together with a set of
constraints that keep the attention alignment diagonal.
Args:
query_dim (int): number of channels in the query tensor.
embedding_dim (int): number of channels in the value tensor.
static_filter_dim (int): number of channels in the convolution layer computing the static filters.
static_kernel_size (int): kernel size for the convolution layer computing the static filters.
dynamic_filter_dim (int): number of channels in the convolution layer computing the dynamic filters.
dynamic_kernel_size (int): kernel size for the convolution layer computing the dynamic filters.
prior_filter_len (int, optional): length of the beta-binomial prior filter applied over the previous attention weights. Defaults to 11 from the paper.
alpha (float, optional): alpha parameter of the beta-binomial prior. Defaults to 0.1 from the paper.
beta (float, optional): beta parameter of the beta-binomial prior. Defaults to 0.9 from the paper.
"""
def __init__(
self,
query_dim,
embedding_dim, # pylint: disable=unused-argument
attention_dim,
static_filter_dim,
static_kernel_size,
dynamic_filter_dim,
dynamic_kernel_size,
prior_filter_len=11,
alpha=0.1,
beta=0.9,
):
super().__init__()
self._mask_value = 1e-8
self.dynamic_filter_dim = dynamic_filter_dim
self.dynamic_kernel_size = dynamic_kernel_size
self.prior_filter_len = prior_filter_len
self.attention_weights = None
# setup key and query layers
self.query_layer = nn.Linear(query_dim, attention_dim)
self.key_layer = nn.Linear(
attention_dim, dynamic_filter_dim * dynamic_kernel_size, bias=False
)
self.static_filter_conv = nn.Conv1d(
1,
static_filter_dim,
static_kernel_size,
padding=(static_kernel_size - 1) // 2,
bias=False,
)
self.static_filter_layer = nn.Linear(static_filter_dim, attention_dim, bias=False)
self.dynamic_filter_layer = nn.Linear(dynamic_filter_dim, attention_dim)
self.v = nn.Linear(attention_dim, 1, bias=False)
prior = betabinom.pmf(range(prior_filter_len), prior_filter_len - 1,
alpha, beta)
self.register_buffer("prior", torch.FloatTensor(prior).flip(0))
# pylint: disable=unused-argument
def forward(self, query, inputs, processed_inputs, mask):
"""
query: [B, C_attn_rnn]
inputs: [B, T_en, D_en]
processed_inputs: place holder.
mask: [B, T_en]
"""
# compute prior filters
prior_filter = F.conv1d(
F.pad(self.attention_weights.unsqueeze(1),
(self.prior_filter_len - 1, 0)), self.prior.view(1, 1, -1))
prior_filter = torch.log(prior_filter.clamp_min_(1e-6)).squeeze(1)
G = self.key_layer(torch.tanh(self.query_layer(query)))
# compute dynamic filters
dynamic_filter = F.conv1d(
self.attention_weights.unsqueeze(0),
G.view(-1, 1, self.dynamic_kernel_size),
padding=(self.dynamic_kernel_size - 1) // 2,
groups=query.size(0),
)
dynamic_filter = dynamic_filter.view(query.size(0), self.dynamic_filter_dim, -1).transpose(1, 2)
# compute static filters
static_filter = self.static_filter_conv(self.attention_weights.unsqueeze(1)).transpose(1, 2)
alignment = self.v(
torch.tanh(
self.static_filter_layer(static_filter) +
self.dynamic_filter_layer(dynamic_filter))).squeeze(-1) + prior_filter
# compute attention weights
attention_weights = F.softmax(alignment, dim=-1)
# apply masking
if mask is not None:
attention_weights.data.masked_fill_(~mask, self._mask_value)
self.attention_weights = attention_weights
# compute context
context = torch.bmm(attention_weights.unsqueeze(1), inputs).squeeze(1)
return context
def preprocess_inputs(self, inputs): # pylint: disable=no-self-use
return None
def init_states(self, inputs):
B = inputs.size(0)
T = inputs.size(1)
self.attention_weights = torch.zeros([B, T], device=inputs.device)
self.attention_weights[:, 0] = 1.
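For orientation, a minimal usage sketch of the module above; batch size, sequence length and channel sizes here are illustrative only:
import torch
attn = MonotonicDynamicConvolutionAttention(query_dim=1024, embedding_dim=512, attention_dim=128,
static_filter_dim=8, static_kernel_size=21,
dynamic_filter_dim=8, dynamic_kernel_size=21)
inputs = torch.rand(2, 50, 512)              # encoder outputs [B, T_en, D_en]
query = torch.rand(2, 1024)                  # attention rnn output [B, C_attn_rnn]
mask = torch.ones(2, 50, dtype=torch.bool)   # [B, T_en]
attn.init_states(inputs)                     # puts all attention mass on the first encoder step
context = attn(query, inputs, attn.preprocess_inputs(inputs), mask)  # [B, D_en]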
def init_attn(attn_type, query_dim, embedding_dim, attention_dim,
location_attention, attention_location_n_filters,
attention_location_kernel_size, windowing, norm, forward_attn,
trans_agent, forward_attn_mask, attn_K):
if attn_type == "original":
return OriginalAttention(query_dim, embedding_dim, attention_dim,
location_attention,
attention_location_n_filters,
attention_location_kernel_size, windowing,
norm, forward_attn, trans_agent,
forward_attn_mask)
if attn_type == "graves":
return GravesAttention(query_dim, attn_K)
if attn_type == "dynamic_convolution":
return MonotonicDynamicConvolutionAttention(query_dim,
embedding_dim,
attention_dim,
static_filter_dim=8,
static_kernel_size=21,
dynamic_filter_dim=8,
dynamic_kernel_size=21,
prior_filter_len=11,
alpha=0.1,
beta=0.9)
raise RuntimeError(
f" [!] Given attention type '{attn_type}' does not exist.")


@ -4,6 +4,14 @@ from torch.nn import functional as F
class Linear(nn.Module):
"""Linear layer with a specific initialization.
Args:
in_features (int): number of channels in the input tensor.
out_features (int): number of channels in the output tensor.
bias (bool, optional): enable/disable bias in the layer. Defaults to True.
init_gain (str, optional): method to compute the gain in the weight initialization based on the nonlinear activation used afterwards. Defaults to 'linear'.
"""
def __init__(self,
in_features,
out_features,
@ -24,6 +32,16 @@ class Linear(nn.Module):
class LinearBN(nn.Module):
"""Linear layer with Batch Normalization.
x -> linear -> BN -> o
Args:
in_features (int): number of channels in the input tensor.
out_features (int): number of channels in the output tensor.
bias (bool, optional): enable/disable bias in the linear layer. Defaults to True.
init_gain (str, optional): method to set the gain for weight initialization. Defaults to 'linear'.
"""
def __init__(self,
in_features,
out_features,
@ -41,6 +59,10 @@ class LinearBN(nn.Module):
gain=torch.nn.init.calculate_gain(init_gain))
def forward(self, x):
"""
Shapes:
x: [T, B, C] or [B, C]
"""
out = self.linear_layer(x)
if len(out.shape) == 3:
out = out.permute(1, 2, 0)
@ -51,6 +73,29 @@ class LinearBN(nn.Module):
class Prenet(nn.Module):
"""Tacotron specific Prenet with an optional Batch Normalization.
Note:
Prenet with BN improves the model performance significantly, especially
if it is enabled after a diagonal attention alignment has been learned with the original
prenet. However, if the target dataset is of high quality, it also works from
the start. It is also suggested to disable dropout if BN is in use.
prenet_type == "original"
x -> [linear -> ReLU -> Dropout]xN -> o
prenet_type == "bn"
x -> [linear -> BN -> ReLU -> Dropout]xN -> o
Args:
in_features (int): number of channels in the input tensor and the inner layers.
prenet_type (str, optional): prenet type "original" or "bn". Defaults to "original".
prenet_dropout (bool, optional): enable/disable dropout. Defaults to True.
out_features (list, optional): List of output channels for each prenet block.
It also defines number of the prenet blocks based on the length of argument list.
Defaults to [256, 256].
bias (bool, optional): enable/disable bias in prenet linear layers. Defaults to True.
"""
# pylint: disable=dangerous-default-value
def __init__(self,
in_features,
@ -79,311 +124,4 @@ class Prenet(nn.Module):
x = F.dropout(F.relu(linear(x)), p=0.5, training=self.training)
else:
x = F.relu(linear(x))
return x
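A minimal usage sketch of the Prenet above, covering both variants; the keyword arguments follow the docstring and the sizes are illustrative:
import torch
prenet = Prenet(in_features=80, prenet_type="original", prenet_dropout=True, out_features=[256, 256])
x = torch.rand(2, 80)     # e.g. the previous mel frame [B, C]
o = prenet(x)             # [B, 256]
prenet_bn = Prenet(in_features=80, prenet_type="bn", prenet_dropout=False, out_features=[256, 256])
o_bn = prenet_bn(x)       # [B, 256]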
####################
# ATTENTION MODULES
####################
class LocationLayer(nn.Module):
def __init__(self,
attention_dim,
attention_n_filters=32,
attention_kernel_size=31):
super(LocationLayer, self).__init__()
self.location_conv1d = nn.Conv1d(
in_channels=2,
out_channels=attention_n_filters,
kernel_size=attention_kernel_size,
stride=1,
padding=(attention_kernel_size - 1) // 2,
bias=False)
self.location_dense = Linear(
attention_n_filters, attention_dim, bias=False, init_gain='tanh')
def forward(self, attention_cat):
processed_attention = self.location_conv1d(attention_cat)
processed_attention = self.location_dense(
processed_attention.transpose(1, 2))
return processed_attention
class GravesAttention(nn.Module):
""" Discretized Graves attention:
- https://arxiv.org/abs/1910.10288
- https://arxiv.org/pdf/1906.01083.pdf
"""
COEF = 0.3989422917366028 # numpy.sqrt(1/(2*numpy.pi))
def __init__(self, query_dim, K):
super(GravesAttention, self).__init__()
self._mask_value = 1e-8
self.K = K
# self.attention_alignment = 0.05
self.eps = 1e-5
self.J = None
self.N_a = nn.Sequential(
nn.Linear(query_dim, query_dim, bias=True),
nn.ReLU(),
nn.Linear(query_dim, 3*K, bias=True))
self.attention_weights = None
self.mu_prev = None
self.init_layers()
def init_layers(self):
torch.nn.init.constant_(self.N_a[2].bias[(2*self.K):(3*self.K)], 1.) # bias mean
torch.nn.init.constant_(self.N_a[2].bias[self.K:(2*self.K)], 10) # bias std
def init_states(self, inputs):
if self.J is None or inputs.shape[1]+1 > self.J.shape[-1]:
self.J = torch.arange(0, inputs.shape[1]+2.0).to(inputs.device) + 0.5
self.attention_weights = torch.zeros(inputs.shape[0], inputs.shape[1]).to(inputs.device)
self.mu_prev = torch.zeros(inputs.shape[0], self.K).to(inputs.device)
# pylint: disable=R0201
# pylint: disable=unused-argument
def preprocess_inputs(self, inputs):
return None
def forward(self, query, inputs, processed_inputs, mask):
"""
shapes:
query: B x D_attention_rnn
inputs: B x T_in x D_encoder
processed_inputs: place_holder
mask: B x T_in
"""
gbk_t = self.N_a(query)
gbk_t = gbk_t.view(gbk_t.size(0), -1, self.K)
# attention model parameters
# each B x K
g_t = gbk_t[:, 0, :]
b_t = gbk_t[:, 1, :]
k_t = gbk_t[:, 2, :]
# dropout to decorrelate attention heads
g_t = torch.nn.functional.dropout(g_t, p=0.5, training=self.training)
# attention GMM parameters
sig_t = torch.nn.functional.softplus(b_t) + self.eps
mu_t = self.mu_prev + torch.nn.functional.softplus(k_t)
g_t = torch.softmax(g_t, dim=-1) + self.eps
j = self.J[:inputs.size(1)+1]
# attention weights
phi_t = g_t.unsqueeze(-1) * (1 / (1 + torch.sigmoid((mu_t.unsqueeze(-1) - j) / sig_t.unsqueeze(-1))))
# discretize attention weights
alpha_t = torch.sum(phi_t, 1)
alpha_t = alpha_t[:, 1:] - alpha_t[:, :-1]
alpha_t[alpha_t == 0] = 1e-8
# apply masking
if mask is not None:
alpha_t.data.masked_fill_(~mask, self._mask_value)
context = torch.bmm(alpha_t.unsqueeze(1), inputs).squeeze(1)
self.attention_weights = alpha_t
self.mu_prev = mu_t
return context
class OriginalAttention(nn.Module):
"""Following the methods proposed here:
- https://arxiv.org/abs/1712.05884
- https://arxiv.org/abs/1807.06736 + state masking at inference
- Using sigmoid instead of softmax normalization
- Attention windowing at inference time
"""
# Pylint gets confused by PyTorch conventions here
#pylint: disable=attribute-defined-outside-init
def __init__(self, query_dim, embedding_dim, attention_dim,
location_attention, attention_location_n_filters,
attention_location_kernel_size, windowing, norm, forward_attn,
trans_agent, forward_attn_mask):
super(OriginalAttention, self).__init__()
self.query_layer = Linear(
query_dim, attention_dim, bias=False, init_gain='tanh')
self.inputs_layer = Linear(
embedding_dim, attention_dim, bias=False, init_gain='tanh')
self.v = Linear(attention_dim, 1, bias=True)
if trans_agent:
self.ta = nn.Linear(
query_dim + embedding_dim, 1, bias=True)
if location_attention:
self.location_layer = LocationLayer(
attention_dim,
attention_location_n_filters,
attention_location_kernel_size,
)
self._mask_value = -float("inf")
self.windowing = windowing
self.win_idx = None
self.norm = norm
self.forward_attn = forward_attn
self.trans_agent = trans_agent
self.forward_attn_mask = forward_attn_mask
self.location_attention = location_attention
def init_win_idx(self):
self.win_idx = -1
self.win_back = 2
self.win_front = 6
def init_forward_attn(self, inputs):
B = inputs.shape[0]
T = inputs.shape[1]
self.alpha = torch.cat(
[torch.ones([B, 1]),
torch.zeros([B, T])[:, :-1] + 1e-7], dim=1).to(inputs.device)
self.u = (0.5 * torch.ones([B, 1])).to(inputs.device)
def init_location_attention(self, inputs):
B = inputs.size(0)
T = inputs.size(1)
self.attention_weights_cum = torch.zeros([B, T], device=inputs.device)
def init_states(self, inputs):
B = inputs.size(0)
T = inputs.size(1)
self.attention_weights = torch.zeros([B, T], device=inputs.device)
if self.location_attention:
self.init_location_attention(inputs)
if self.forward_attn:
self.init_forward_attn(inputs)
if self.windowing:
self.init_win_idx()
def preprocess_inputs(self, inputs):
return self.inputs_layer(inputs)
def update_location_attention(self, alignments):
self.attention_weights_cum += alignments
def get_location_attention(self, query, processed_inputs):
attention_cat = torch.cat((self.attention_weights.unsqueeze(1),
self.attention_weights_cum.unsqueeze(1)),
dim=1)
processed_query = self.query_layer(query.unsqueeze(1))
processed_attention_weights = self.location_layer(attention_cat)
energies = self.v(
torch.tanh(processed_query + processed_attention_weights +
processed_inputs))
energies = energies.squeeze(-1)
return energies, processed_query
def get_attention(self, query, processed_inputs):
processed_query = self.query_layer(query.unsqueeze(1))
energies = self.v(torch.tanh(processed_query + processed_inputs))
energies = energies.squeeze(-1)
return energies, processed_query
def apply_windowing(self, attention, inputs):
back_win = self.win_idx - self.win_back
front_win = self.win_idx + self.win_front
if back_win > 0:
attention[:, :back_win] = -float("inf")
if front_win < inputs.shape[1]:
attention[:, front_win:] = -float("inf")
# this is a trick to solve a special problem.
# but it does not hurt.
if self.win_idx == -1:
attention[:, 0] = attention.max()
# Update the window
self.win_idx = torch.argmax(attention, 1).long()[0].item()
return attention
def apply_forward_attention(self, alignment):
# forward attention
fwd_shifted_alpha = F.pad(
self.alpha[:, :-1].clone().to(alignment.device), (1, 0, 0, 0))
# compute transition potentials
alpha = ((1 - self.u) * self.alpha
+ self.u * fwd_shifted_alpha
+ 1e-8) * alignment
# force incremental alignment
if not self.training and self.forward_attn_mask:
_, n = fwd_shifted_alpha.max(1)
val, _ = alpha.max(1)
for b in range(alignment.shape[0]):
alpha[b, n[b] + 3:] = 0
alpha[b, :(
n[b] - 1
)] = 0 # ignore all previous states to prevent repetition.
alpha[b,
(n[b] - 2
)] = 0.01 * val[b] # smoothing factor for the prev step
# renormalize attention weights
alpha = alpha / alpha.sum(dim=1, keepdim=True)
return alpha
def forward(self, query, inputs, processed_inputs, mask):
"""
shapes:
query: B x D_attn_rnn
inputs: B x T_en x D_en
processed_inputs:: B x T_en x D_attn
mask: B x T_en
"""
if self.location_attention:
attention, _ = self.get_location_attention(
query, processed_inputs)
else:
attention, _ = self.get_attention(
query, processed_inputs)
# apply masking
if mask is not None:
attention.data.masked_fill_(~mask, self._mask_value)
# apply windowing - only in eval mode
if not self.training and self.windowing:
attention = self.apply_windowing(attention, inputs)
# normalize attention values
if self.norm == "softmax":
alignment = torch.softmax(attention, dim=-1)
elif self.norm == "sigmoid":
alignment = torch.sigmoid(attention) / torch.sigmoid(
attention).sum(
dim=1, keepdim=True)
else:
raise ValueError("Unknown value for attention norm type")
if self.location_attention:
self.update_location_attention(alignment)
# apply forward attention if enabled
if self.forward_attn:
alignment = self.apply_forward_attention(alignment)
self.alpha = alignment
context = torch.bmm(alignment.unsqueeze(1), inputs)
context = context.squeeze(1)
self.attention_weights = alignment
# compute transition agent
if self.forward_attn and self.trans_agent:
ta_input = torch.cat([context, query.squeeze(1)], dim=-1)
self.u = torch.sigmoid(self.ta(ta_input))
return context
def init_attn(attn_type, query_dim, embedding_dim, attention_dim,
location_attention, attention_location_n_filters,
attention_location_kernel_size, windowing, norm, forward_attn,
trans_agent, forward_attn_mask, attn_K):
if attn_type == "original":
return OriginalAttention(query_dim, embedding_dim, attention_dim,
location_attention,
attention_location_n_filters,
attention_location_kernel_size, windowing,
norm, forward_attn, trans_agent,
forward_attn_mask)
if attn_type == "graves":
return GravesAttention(query_dim, attn_K)
raise RuntimeError(
f" [!] Given attention type '{attn_type}' does not exist.")
return x


@ -0,0 +1,118 @@
from torch import nn
class ZeroTemporalPad(nn.Module):
"""Pad sequences to equal lentgh in the temporal dimension"""
def __init__(self, kernel_size, dilation):
super().__init__()
total_pad = (dilation * (kernel_size - 1))
begin = total_pad // 2
end = total_pad - begin
self.pad_layer = nn.ZeroPad2d((0, 0, begin, end))
def forward(self, x):
return self.pad_layer(x)
class Conv1dBN(nn.Module):
"""1d convolutional with batch norm.
conv1d -> relu -> BN blocks.
Note:
Batch normalization is applied after ReLU, following the original implementation.
Args:
in_channels (int): number of input channels.
out_channels (int): number of output channels.
kernel_size (int): kernel size for convolutional filters.
dilation (int): dilation for convolution layers.
"""
def __init__(self, in_channels, out_channels, kernel_size, dilation):
super().__init__()
padding = (dilation * (kernel_size - 1))
pad_s = padding // 2
pad_e = padding - pad_s
self.conv1d = nn.Conv1d(in_channels, out_channels, kernel_size, dilation=dilation)
self.pad = nn.ZeroPad2d((pad_s, pad_e, 0, 0)) # uneven left and right padding
self.norm = nn.BatchNorm1d(out_channels)
def forward(self, x):
o = self.conv1d(x)
o = self.pad(o)
o = nn.functional.relu(o)
o = self.norm(o)
return o
class Conv1dBNBlock(nn.Module):
"""1d convolutional block with batch norm. It is a set of conv1d -> relu -> BN blocks.
Args:
in_channels (int): number of input channels.
out_channels (int): number of output channels.
hidden_channels (int): number of inner convolution channels.
kernel_size (int): kernel size for convolutional filters.
dilation (int): dilation for convolution layers.
num_conv_blocks (int, optional): number of convolutional blocks. Defaults to 2.
"""
def __init__(self, in_channels, out_channels, hidden_channels, kernel_size, dilation, num_conv_blocks=2):
super().__init__()
self.conv_bn_blocks = []
for idx in range(num_conv_blocks):
layer = Conv1dBN(in_channels if idx == 0 else hidden_channels,
out_channels if idx == (num_conv_blocks - 1) else hidden_channels,
kernel_size,
dilation)
self.conv_bn_blocks.append(layer)
self.conv_bn_blocks = nn.Sequential(*self.conv_bn_blocks)
def forward(self, x):
"""
Shapes:
x: (B, D, T)
"""
return self.conv_bn_blocks(x)
class ResidualConv1dBNBlock(nn.Module):
"""Residual Convolutional Blocks with BN
Each block has 'num_conv_block' conv layers and 'num_res_blocks' such blocks are connected
with residual connections.
conv_block = (conv1d -> relu -> bn) x 'num_conv_blocks'
residuak_conv_block = (x -> conv_block -> + ->) x 'num_res_blocks'
' - - - - - - - - - ^
Args:
in_channels (int): number of input channels.
out_channels (int): number of output channels.
hidden_channels (int): number of inner convolution channels.
kernel_size (int): kernel size for convolutional filters.
dilations (list): dilations for each convolution layer.
num_res_blocks (int, optional): number of residual blocks. Defaults to 13.
num_conv_blocks (int, optional): number of convolutional blocks in each residual block. Defaults to 2.
"""
def __init__(self, in_channels, out_channels, hidden_channels, kernel_size, dilations, num_res_blocks=13, num_conv_blocks=2):
super().__init__()
assert len(dilations) == num_res_blocks
self.res_blocks = nn.ModuleList()
for idx, dilation in enumerate(dilations):
block = Conv1dBNBlock(in_channels if idx==0 else hidden_channels,
out_channels if (idx + 1) == len(dilations) else hidden_channels,
hidden_channels,
kernel_size,
dilation,
num_conv_blocks)
self.res_blocks.append(block)
def forward(self, x, x_mask=None):
if x_mask is None:
x_mask = 1.0
o = x * x_mask
for block in self.res_blocks:
res = o
o = block(o)
o = o + res
if x_mask is not None:
o = o * x_mask
return o
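A minimal usage sketch of the residual block above; sizes are illustrative, and the dilation list length must equal num_res_blocks:
import torch
block = ResidualConv1dBNBlock(in_channels=128, out_channels=128, hidden_channels=128,
kernel_size=4, dilations=4 * [1, 2, 4] + [1],
num_res_blocks=13, num_conv_blocks=2)
x = torch.rand(8, 128, 50)      # [B, D, T]
x_mask = torch.ones(8, 1, 50)   # [B, 1, T]
o = block(x, x_mask)            # [B, 128, 50]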


@ -0,0 +1,170 @@
import torch
from torch import nn
@torch.jit.script
def fused_add_tanh_sigmoid_multiply(input_a, input_b, n_channels):
n_channels_int = n_channels[0]
in_act = input_a + input_b
t_act = torch.tanh(in_act[:, :n_channels_int, :])
s_act = torch.sigmoid(in_act[:, n_channels_int:, :])
acts = t_act * s_act
return acts
class WN(torch.nn.Module):
"""Wavenet layers with weight norm and no input conditioning.
|-----------------------------------------------------------------------------|
| |-> tanh -| |
res -|- conv1d(dilation) -> dropout -> + -| * -> conv1d1x1 -> split -|- + -> res
g -------------------------------------| |-> sigmoid -| |
o --------------------------------------------------------------------------- + --------- o
Args:
in_channels (int): number of input channels.
hidden_channels (int): number of hidden channels.
kernel_size (int): filter kernel size for the first conv layer.
dilation_rate (int): rate to increase the dilation per layer.
If it is 2, the dilations are 1, 2, 4, 8 for the next 4 layers.
num_layers (int): number of wavenet layers.
c_in_channels (int): number of channels of conditioning input.
dropout_p (float): dropout rate.
weight_norm (bool): enable/disable weight norm for convolution layers.
"""
def __init__(self,
in_channels,
hidden_channels,
kernel_size,
dilation_rate,
num_layers,
c_in_channels=0,
dropout_p=0,
weight_norm=True):
super().__init__()
assert kernel_size % 2 == 1
assert hidden_channels % 2 == 0
self.in_channels = in_channels
self.hidden_channels = hidden_channels
self.kernel_size = kernel_size
self.dilation_rate = dilation_rate
self.num_layers = num_layers
self.c_in_channels = c_in_channels
self.dropout_p = dropout_p
self.in_layers = torch.nn.ModuleList()
self.res_skip_layers = torch.nn.ModuleList()
self.dropout = nn.Dropout(dropout_p)
# init conditioning layer
if c_in_channels > 0:
cond_layer = torch.nn.Conv1d(c_in_channels,
2 * hidden_channels * num_layers, 1)
self.cond_layer = torch.nn.utils.weight_norm(cond_layer,
name='weight')
# intermediate layers
for i in range(num_layers):
dilation = dilation_rate**i
padding = int((kernel_size * dilation - dilation) / 2)
in_layer = torch.nn.Conv1d(hidden_channels,
2 * hidden_channels,
kernel_size,
dilation=dilation,
padding=padding)
in_layer = torch.nn.utils.weight_norm(in_layer, name='weight')
self.in_layers.append(in_layer)
if i < num_layers - 1:
res_skip_channels = 2 * hidden_channels
else:
res_skip_channels = hidden_channels
res_skip_layer = torch.nn.Conv1d(hidden_channels,
res_skip_channels, 1)
res_skip_layer = torch.nn.utils.weight_norm(res_skip_layer,
name='weight')
self.res_skip_layers.append(res_skip_layer)
# setup weight norm
if not weight_norm:
self.remove_weight_norm()
def forward(self, x, x_mask=None, g=None, **kwargs): # pylint: disable=unused-argument
output = torch.zeros_like(x)
n_channels_tensor = torch.IntTensor([self.hidden_channels])
if g is not None:
g = self.cond_layer(g)
for i in range(self.num_layers):
x_in = self.in_layers[i](x)
x_in = self.dropout(x_in)
if g is not None:
cond_offset = i * 2 * self.hidden_channels
g_l = g[:, cond_offset:cond_offset + 2 * self.hidden_channels, :]
else:
g_l = torch.zeros_like(x_in)
acts = fused_add_tanh_sigmoid_multiply(x_in, g_l,
n_channels_tensor)
res_skip_acts = self.res_skip_layers[i](acts)
if i < self.num_layers - 1:
x = (x + res_skip_acts[:, :self.hidden_channels, :]) * x_mask
output = output + res_skip_acts[:, self.hidden_channels:, :]
else:
output = output + res_skip_acts
return output * x_mask
def remove_weight_norm(self):
if self.c_in_channels != 0:
torch.nn.utils.remove_weight_norm(self.cond_layer)
for l in self.in_layers:
torch.nn.utils.remove_weight_norm(l)
for l in self.res_skip_layers:
torch.nn.utils.remove_weight_norm(l)
class WNBlocks(nn.Module):
"""Wavenet blocks.
Note: The dilation resets to 1 at the start of each block and increases within the block
according to the dilation rate.
Args:
in_channels (int): number of input channels.
hidden_channels (int): number of hidden channels.
kernel_size (int): filter kernel size for the first conv layer.
dilation_rate (int): rate to increase the dilation per layer.
If it is 2, the dilations are 1, 2, 4, 8 for the next 4 layers.
num_blocks (int): number of wavenet blocks.
num_layers (int): number of wavenet layers.
c_in_channels (int): number of channels of conditioning input.
dropout_p (float): dropout rate.
weight_norm (bool): enable/disable weight norm for convolution layers.
"""
def __init__(self,
in_channels,
hidden_channels,
kernel_size,
dilation_rate,
num_blocks,
num_layers,
c_in_channels=0,
dropout_p=0,
weight_norm=True):
super().__init__()
self.wn_blocks = nn.ModuleList()
for idx in range(num_blocks):
layer = WN(in_channels=in_channels if idx == 0 else hidden_channels,
hidden_channels=hidden_channels,
kernel_size=kernel_size,
dilation_rate=dilation_rate,
num_layers=num_layers,
c_in_channels=c_in_channels,
dropout_p=dropout_p,
weight_norm=weight_norm)
self.wn_blocks.append(layer)
def forward(self, x, x_mask, g=None):
o = x
for layer in self.wn_blocks:
o = layer(o, x_mask, g)
return o
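A minimal sketch of running the blocks above with a global conditioning vector. Note that the input already has hidden_channels channels, since the first WN conv layer is built with hidden_channels inputs; sizes are illustrative:
import torch
wn = WNBlocks(in_channels=192, hidden_channels=192, kernel_size=5, dilation_rate=2,
num_blocks=2, num_layers=4, c_in_channels=64, dropout_p=0.05)
x = torch.rand(2, 192, 100)     # [B, C, T]
x_mask = torch.ones(2, 1, 100)  # [B, 1, T]
g = torch.rand(2, 64, 1)        # conditioning vector [B, C_g, 1]
o = wn(x, x_mask, g)            # [B, 192, T]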


@ -2,10 +2,17 @@ import torch
from torch import nn
from TTS.tts.layers.glow_tts.glow import InvConvNear, CouplingBlock
from TTS.tts.layers.glow_tts.normalization import ActNorm
from TTS.tts.layers.generic.normalization import ActNorm
def squeeze(x, x_mask=None, num_sqz=2):
"""GlowTTS squeeze operation
Increase number of channels and reduce number of time steps
by the same factor.
Note:
each 's' is an n-dimensional vector.
[s1,s2,s3,s4,s5,s6] --> [[s1, s3, s5], [s2, s4, s6]]"""
b, c, t = x.size()
t = (t // num_sqz) * num_sqz
@ -23,6 +30,11 @@ def squeeze(x, x_mask=None, num_sqz=2):
def unsqueeze(x, x_mask=None, num_sqz=2):
"""GlowTTS unsqueeze operation
Note:
each 's' is an n-dimensional vector.
[[s1, s3, s5], [s2, s4, s6]] --> [s1, s2, s3, s4, s5, s6] """
b, c, t = x.size()
x_unsqz = x.view(b, num_sqz, c // num_sqz, t)
@ -40,7 +52,19 @@ def unsqueeze(x, x_mask=None, num_sqz=2):
class Decoder(nn.Module):
"""Stack of Glow Modules"""
"""Stack of Glow Decoder Modules.
Squeeze -> ActNorm -> InvertibleConv1x1 -> AffineCoupling -> Unsqueeze
Args:
in_channels (int): channels of input tensor.
hidden_channels (int): hidden decoder channels.
kernel_size (int): Coupling block kernel size. (Wavenet filter kernel size.)
dilation_rate (int): rate to increase dilation by each layer in a decoder block.
num_flow_blocks (int): number of decoder blocks.
num_coupling_layers (int): number coupling layers. (number of wavenet layers.)
dropout_p (float): wavenet dropout rate.
sigmoid_scale (bool): enable/disable sigmoid scaling in coupling layer.
"""
def __init__(self,
in_channels,
hidden_channels,
@ -50,7 +74,7 @@ class Decoder(nn.Module):
num_coupling_layers,
dropout_p=0.,
num_splits=4,
num_sqz=2,
num_squeeze=2,
sigmoid_scale=False,
c_in_channels=0):
super().__init__()
@ -63,18 +87,18 @@ class Decoder(nn.Module):
self.num_coupling_layers = num_coupling_layers
self.dropout_p = dropout_p
self.num_splits = num_splits
self.num_sqz = num_sqz
self.num_squeeze = num_squeeze
self.sigmoid_scale = sigmoid_scale
self.c_in_channels = c_in_channels
self.flows = nn.ModuleList()
for _ in range(num_flow_blocks):
self.flows.append(ActNorm(channels=in_channels * num_sqz))
self.flows.append(ActNorm(channels=in_channels * num_squeeze))
self.flows.append(
InvConvNear(channels=in_channels * num_sqz,
InvConvNear(channels=in_channels * num_squeeze,
num_splits=num_splits))
self.flows.append(
CouplingBlock(in_channels * num_sqz,
CouplingBlock(in_channels * num_squeeze,
hidden_channels,
kernel_size=kernel_size,
dilation_rate=dilation_rate,
@ -91,16 +115,16 @@ class Decoder(nn.Module):
flows = reversed(self.flows)
logdet_tot = None
if self.num_sqz > 1:
x, x_mask = squeeze(x, x_mask, self.num_sqz)
if self.num_squeeze > 1:
x, x_mask = squeeze(x, x_mask, self.num_squeeze)
for f in flows:
if not reverse:
x, logdet = f(x, x_mask, g=g, reverse=reverse)
logdet_tot += logdet
else:
x, logdet = f(x, x_mask, g=g, reverse=reverse)
if self.num_sqz > 1:
x, x_mask = unsqueeze(x, x_mask, self.num_sqz)
if self.num_squeeze > 1:
x, x_mask = unsqueeze(x, x_mask, self.num_squeeze)
return x, logdet_tot
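A minimal sketch of running the flow decoder above in both directions. The keyword arguments follow the docstring; the forward(x, x_mask, g=None, reverse=False) call signature is assumed here since it is not fully shown in this excerpt, and the sizes are illustrative:
import torch
dec = Decoder(in_channels=80, hidden_channels=192, kernel_size=5, dilation_rate=1,
num_flow_blocks=12, num_coupling_layers=4)
x = torch.rand(2, 80, 100)               # mel frames [B, C, T]
x_mask = torch.ones(2, 1, 100)           # [B, 1, T]
z, logdet = dec(x, x_mask)               # forward pass: latent z and total log-determinant
x_hat, _ = dec(z, x_mask, reverse=True)  # inverse pass maps the latent back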
def store_inverse(self):


@ -1,33 +1,50 @@
import torch
from torch import nn
from .normalization import LayerNorm
from ..generic.normalization import LayerNorm
class DurationPredictor(nn.Module):
def __init__(self, in_channels, filter_channels, kernel_size, dropout_p):
"""Glow-TTS duration prediction model.
[2 x (conv1d_kxk -> relu -> layer_norm -> dropout)] -> conv1d_1x1 -> durs
Args:
in_channels (int): number of input channels.
hidden_channels (int): number of hidden channels of the inner conv layers.
kernel_size (int): kernel size of the conv layers.
dropout_p (float): dropout rate.
"""
def __init__(self, in_channels, hidden_channels, kernel_size, dropout_p):
super().__init__()
# class arguments
self.in_channels = in_channels
self.filter_channels = filter_channels
self.filter_channels = hidden_channels
self.kernel_size = kernel_size
self.dropout_p = dropout_p
# layers
self.drop = nn.Dropout(dropout_p)
self.conv_1 = nn.Conv1d(in_channels,
filter_channels,
hidden_channels,
kernel_size,
padding=kernel_size // 2)
self.norm_1 = LayerNorm(filter_channels)
self.conv_2 = nn.Conv1d(filter_channels,
filter_channels,
self.norm_1 = LayerNorm(hidden_channels)
self.conv_2 = nn.Conv1d(hidden_channels,
hidden_channels,
kernel_size,
padding=kernel_size // 2)
self.norm_2 = LayerNorm(filter_channels)
self.norm_2 = LayerNorm(hidden_channels)
# output layer
self.proj = nn.Conv1d(filter_channels, 1, 1)
self.proj = nn.Conv1d(hidden_channels, 1, 1)
def forward(self, x, x_mask):
"""
Shapes:
x: [B, C, T]
x_mask: [B, 1, T]
Returns:
[B, 1, T]: predicted log durations.
"""
x = self.conv_1(x * x_mask)
x = torch.relu(x)
x = self.norm_1(x)


@ -2,25 +2,30 @@ import math
import torch
from torch import nn
from TTS.tts.layers.glow_tts.transformer import Transformer
from TTS.tts.layers.glow_tts.gated_conv import GatedConvBlock
from TTS.tts.layers.glow_tts.transformer import RelativePositionTransformer
from TTS.tts.layers.generic.gated_conv import GatedConvBlock
from TTS.tts.utils.generic_utils import sequence_mask
from TTS.tts.layers.glow_tts.glow import ConvLayerNorm
from TTS.tts.layers.glow_tts.glow import ResidualConv1dLayerNormBlock
from TTS.tts.layers.glow_tts.duration_predictor import DurationPredictor
from TTS.tts.layers.glow_tts.time_depth_sep_conv import TimeDepthSeparableConvBlock
from TTS.tts.layers.generic.time_depth_sep_conv import TimeDepthSeparableConvBlock
from TTS.tts.layers.generic.res_conv_bn import ResidualConv1dBNBlock
class Encoder(nn.Module):
"""Glow-TTS encoder module. It uses Transformer with Relative Pos.Encoding
as in the original paper or GatedConvBlock as a faster alternative.
"""Glow-TTS encoder module.
embedding -> <prenet> -> encoder_module -> <postnet> --> proj_mean
|
|-> proj_var
|
|-> concat -> duration_predictor
speaker_embed
Args:
num_chars (int): number of characters.
out_channels (int): number of output channels.
hidden_channels (int): encoder's embedding size.
filter_channels (int): transformer's feed-forward channels.
num_head (int): number of attention heads in transformer.
num_layers (int): number of transformer encoder stack.
hidden_channels_ffn (int): transformer's feed-forward channels.
kernel_size (int): kernel size for conv layers and duration predictor.
dropout_p (float): dropout rate for any dropout layer.
mean_only (bool): if True, output only mean values and use constant std.
@ -29,20 +34,49 @@ class Encoder(nn.Module):
Shapes:
- input: (B, T, C)
Notes:
Suggested encoder parameters for each encoder type (a usage sketch follows the __init__ definition below):
for encoder_type == 'rel_pos_transformer'
encoder_params={
'kernel_size':3,
'dropout_p': 0.1,
'num_layers': 6,
'num_heads': 2,
'hidden_channels_ffn': 768, # 4 times the hidden_channels
'input_length': None
}
for encoder_type == 'gated_conv'
encoder_params={
'kernel_size':5,
'dropout_p': 0.1,
'num_layers': 9,
}
for encoder_type == 'residual_conv_bn'
encoder_params={
"kernel_size": 4,
"dilations": [1, 2, 4, 1, 2, 4, 1, 2, 4, 1, 2, 4, 1],
"num_conv_blocks": 2,
"num_res_blocks": 13
}
for encoder_type == 'time_depth_separable'
encoder_params={
"kernel_size": 5,
'num_layers': 9,
}
"""
def __init__(self,
num_chars,
out_channels,
hidden_channels,
filter_channels,
filter_channels_dp,
hidden_channels_dp,
encoder_type,
num_heads,
num_layers,
kernel_size,
dropout_p,
rel_attn_window_size=None,
input_length=None,
encoder_params,
dropout_p_dp=0.1,
mean_only=False,
use_prenet=True,
c_in_channels=0):
@ -51,12 +85,8 @@ class Encoder(nn.Module):
self.num_chars = num_chars
self.out_channels = out_channels
self.hidden_channels = hidden_channels
self.filter_channels = filter_channels
self.filter_channels_dp = filter_channels_dp
self.num_heads = num_heads
self.num_layers = num_layers
self.kernel_size = kernel_size
self.dropout_p = dropout_p
self.hidden_channels_dp = hidden_channels_dp
self.dropout_p_dp = dropout_p_dp
self.mean_only = mean_only
self.use_prenet = use_prenet
self.c_in_channels = c_in_channels
@ -64,35 +94,37 @@ class Encoder(nn.Module):
# embedding layer
self.emb = nn.Embedding(num_chars, hidden_channels)
nn.init.normal_(self.emb.weight, 0.0, hidden_channels**-0.5)
# init encoder
if encoder_type.lower() == "transformer":
# optional convolutional prenet
# init encoder module
if encoder_type.lower() == "rel_pos_transformer":
if use_prenet:
self.pre = ConvLayerNorm(hidden_channels,
self.prenet = ResidualConv1dLayerNormBlock(hidden_channels,
hidden_channels,
hidden_channels,
kernel_size=5,
num_layers=3,
dropout_p=0.5)
# text encoder
self.encoder = Transformer(
hidden_channels,
filter_channels,
num_heads,
num_layers,
kernel_size=kernel_size,
dropout_p=dropout_p,
rel_attn_window_size=rel_attn_window_size,
input_length=input_length)
elif encoder_type.lower() == 'gatedconv':
self.encoder = GatedConvBlock(hidden_channels,
kernel_size=5,
dropout_p=dropout_p,
num_layers=3 + num_layers)
elif encoder_type.lower() == 'time-depth-separable':
# optional convolutional prenet
self.encoder = RelativePositionTransformer(hidden_channels,
hidden_channels,
hidden_channels,
**encoder_params)
elif encoder_type.lower() == 'gated_conv':
self.encoder = GatedConvBlock(hidden_channels, **encoder_params)
elif encoder_type.lower() == 'residual_conv_bn':
if use_prenet:
self.pre = ConvLayerNorm(hidden_channels,
self.prenet = nn.Sequential(
nn.Conv1d(hidden_channels, hidden_channels, 1),
nn.ReLU()
)
self.encoder = ResidualConv1dBNBlock(hidden_channels,
hidden_channels,
hidden_channels,
**encoder_params)
self.postnet = nn.Sequential(
nn.Conv1d(self.hidden_channels, self.hidden_channels, 1),
nn.BatchNorm1d(self.hidden_channels))
elif encoder_type.lower() == 'time_depth_separable':
if use_prenet:
self.prenet = ResidualConv1dLayerNormBlock(hidden_channels,
hidden_channels,
hidden_channels,
kernel_size=5,
@ -101,8 +133,9 @@ class Encoder(nn.Module):
self.encoder = TimeDepthSeparableConvBlock(hidden_channels,
hidden_channels,
hidden_channels,
kernel_size=5,
num_layers=3 + num_layers)
**encoder_params)
else:
raise ValueError(" [!] Unkown encoder type.")
# final projection layers
self.proj_m = nn.Conv1d(hidden_channels, out_channels, 1)
@ -110,10 +143,16 @@ class Encoder(nn.Module):
self.proj_s = nn.Conv1d(hidden_channels, out_channels, 1)
# duration predictor
self.duration_predictor = DurationPredictor(
hidden_channels + c_in_channels, filter_channels_dp, kernel_size,
dropout_p)
hidden_channels + c_in_channels, hidden_channels_dp, 3,
dropout_p_dp)
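A minimal instantiation sketch using the 'rel_pos_transformer' parameters suggested in the docstring above. Sizes are illustrative, and the unpacking of the forward outputs is left generic since forward() is only partially shown in this excerpt:
import torch
enc = Encoder(num_chars=120, out_channels=80, hidden_channels=192, hidden_channels_dp=256,
encoder_type='rel_pos_transformer',
encoder_params={'kernel_size': 3, 'dropout_p': 0.1, 'num_layers': 6,
'num_heads': 2, 'hidden_channels_ffn': 768, 'input_length': None},
dropout_p_dp=0.1, mean_only=False, use_prenet=True, c_in_channels=0)
x = torch.randint(0, 120, (2, 37))   # token ids [B, T]
x_lengths = torch.tensor([37, 30])
outputs = enc(x, x_lengths)          # see forward() below for the returned tensors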
def forward(self, x, x_lengths, g=None):
"""
Shapes:
x: [B, C, T]
x_lengths: [B]
g (optional): [B, 1, T]
"""
# embedding layer
# [B, T, D]
x = self.emb(x) * math.sqrt(self.hidden_channels)
@ -122,12 +161,14 @@ class Encoder(nn.Module):
# compute input sequence mask
x_mask = torch.unsqueeze(sequence_mask(x_lengths, x.size(2)),
1).to(x.dtype)
# pre-conv layers
if self.encoder_type in ['transformer', 'time-depth-separable']:
if self.use_prenet:
x = self.pre(x, x_mask)
# prenet
if hasattr(self, 'prenet') and self.use_prenet:
x = self.prenet(x, x_mask)
# encoder
x = self.encoder(x, x_mask)
# postnet
if hasattr(self, 'postnet'):
x = self.postnet(x) * x_mask
# set duration predictor input
if g is not None:
g_exp = g.expand(-1, -1, x.size(-1))


@ -1,13 +1,28 @@
import torch
from torch import nn
from torch.nn import functional as F
from TTS.tts.layers.generic.wavenet import WN
from .normalization import LayerNorm
from ..generic.normalization import LayerNorm
class ConvLayerNorm(nn.Module):
class ResidualConv1dLayerNormBlock(nn.Module):
def __init__(self, in_channels, hidden_channels, out_channels, kernel_size,
num_layers, dropout_p):
"""Conv1d with Layer Normalization and residual connection as in GlowTTS paper.
https://arxiv.org/pdf/1811.00002.pdf
x |-> conv1d -> layer_norm -> relu -> dropout -> + -> o
|---------------> conv1d_1x1 -----------------------|
Args:
in_channels (int): number of input tensor channels.
hidden_channels (int): number of inner layer channels.
out_channels (int): number of output tensor channels.
kernel_size (int): kernel size of conv1d filter.
num_layers (int): number of blocks.
dropout_p (float): dropout rate for each block.
"""
super().__init__()
self.in_channels = in_channels
self.hidden_channels = hidden_channels
@ -21,16 +36,9 @@ class ConvLayerNorm(nn.Module):
self.conv_layers = nn.ModuleList()
self.norm_layers = nn.ModuleList()
self.conv_layers.append(
nn.Conv1d(in_channels,
hidden_channels,
kernel_size,
padding=kernel_size // 2))
self.norm_layers.append(LayerNorm(hidden_channels))
for _ in range(num_layers - 1):
for idx in range(num_layers):
self.conv_layers.append(
nn.Conv1d(hidden_channels,
nn.Conv1d(in_channels if idx == 0 else hidden_channels,
hidden_channels,
kernel_size,
padding=kernel_size // 2))
@ -50,105 +58,20 @@ class ConvLayerNorm(nn.Module):
return x * x_mask
@torch.jit.script
def fused_add_tanh_sigmoid_multiply(input_a, input_b, n_channels):
n_channels_int = n_channels[0]
in_act = input_a + input_b
t_act = torch.tanh(in_act[:, :n_channels_int, :])
s_act = torch.sigmoid(in_act[:, n_channels_int:, :])
acts = t_act * s_act
return acts
class WN(torch.nn.Module):
def __init__(self,
in_channels,
hidden_channels,
kernel_size,
dilation_rate,
num_layers,
c_in_channels=0,
dropout_p=0):
super().__init__()
assert kernel_size % 2 == 1
assert hidden_channels % 2 == 0
self.in_channels = in_channels
self.hidden_channels = hidden_channels
self.kernel_size = kernel_size
self.dilation_rate = dilation_rate
self.num_layers = num_layers
self.c_in_channels = c_in_channels
self.dropout_p = dropout_p
self.in_layers = torch.nn.ModuleList()
self.res_skip_layers = torch.nn.ModuleList()
self.dropout = nn.Dropout(dropout_p)
if c_in_channels != 0:
cond_layer = torch.nn.Conv1d(c_in_channels,
2 * hidden_channels * num_layers, 1)
self.cond_layer = torch.nn.utils.weight_norm(cond_layer,
name='weight')
for i in range(num_layers):
dilation = dilation_rate**i
padding = int((kernel_size * dilation - dilation) / 2)
in_layer = torch.nn.Conv1d(hidden_channels,
2 * hidden_channels,
kernel_size,
dilation=dilation,
padding=padding)
in_layer = torch.nn.utils.weight_norm(in_layer, name='weight')
self.in_layers.append(in_layer)
if i < num_layers - 1:
res_skip_channels = 2 * hidden_channels
else:
res_skip_channels = hidden_channels
res_skip_layer = torch.nn.Conv1d(hidden_channels,
res_skip_channels, 1)
res_skip_layer = torch.nn.utils.weight_norm(res_skip_layer,
name='weight')
self.res_skip_layers.append(res_skip_layer)
def forward(self, x, x_mask=None, g=None, **kwargs): # pylint: disable=unused-argument
output = torch.zeros_like(x)
n_channels_tensor = torch.IntTensor([self.hidden_channels])
if g is not None:
g = self.cond_layer(g)
for i in range(self.num_layers):
x_in = self.in_layers[i](x)
x_in = self.dropout(x_in)
if g is not None:
cond_offset = i * 2 * self.hidden_channels
g_l = g[:,
cond_offset:cond_offset + 2 * self.hidden_channels, :]
else:
g_l = torch.zeros_like(x_in)
acts = fused_add_tanh_sigmoid_multiply(x_in, g_l,
n_channels_tensor)
res_skip_acts = self.res_skip_layers[i](acts)
if i < self.num_layers - 1:
x = (x + res_skip_acts[:, :self.hidden_channels, :]) * x_mask
output = output + res_skip_acts[:, self.hidden_channels:, :]
else:
output = output + res_skip_acts
return output * x_mask
def remove_weight_norm(self):
if self.c_in_channels != 0:
torch.nn.utils.remove_weight_norm(self.cond_layer)
for l in self.in_layers:
torch.nn.utils.remove_weight_norm(l)
for l in self.res_skip_layers:
torch.nn.utils.remove_weight_norm(l)
class InvConvNear(nn.Module):
"""Invertible Convolution with input splitting as in GlowTTS paper.
https://arxiv.org/pdf/1811.00002.pdf
Args:
channels (int): input and output channels.
num_splits (int): number of splits, also H and W of conv layer.
no_jacobian (bool): enable/disable jacobian computations.
Note:
Split the input into groups of size self.num_splits and
perform 1x1 convolution separately. Cast 1x1 conv operation
to 2d by reshaping the input for efficiency.
"""
def __init__(self, channels, num_splits=4, no_jacobian=False, **kwargs): # pylint: disable=unused-argument
super().__init__()
assert num_splits % 2 == 0
@ -164,9 +87,10 @@ class InvConvNear(nn.Module):
self.weight = nn.Parameter(w_init)
def forward(self, x, x_mask=None, reverse=False, **kwargs): # pylint: disable=unused-argument
"""Split the input into groups of size self.num_splits and
perform 1x1 convolution separately. Cast 1x1 conv operation
to 2d by reshaping the input for efficienty.
"""
Shapes:
x: B x C x T
x_mask: B x 1 x T
"""
b, c, t = x.size()
@ -209,6 +133,25 @@ class InvConvNear(nn.Module):
class CouplingBlock(nn.Module):
"""Glow Affine Coupling block as in GlowTTS paper.
https://arxiv.org/pdf/1811.00002.pdf
x --> x0 -> conv1d -> wavenet -> conv1d --> t, s -> concat(s*x1 + t, x0) -> o
'-> x1 - - - - - - - - - - - - - - - - - - - - - - - - - ^
Args:
in_channels (int): number of input tensor channels.
hidden_channels (int): number of hidden channels.
kernel_size (int): WaveNet filter kernel size.
dilation_rate (int): rate to increase dilation by each layer in a decoder block.
num_layers (int): number of WaveNet layers.
c_in_channels (int): number of conditioning input channels.
dropout_p (int): wavenet dropout rate.
sigmoid_scale (bool): enable/disable sigmoid scaling for output scale.
Note:
It does not use conditional inputs differently from WaveGlow.
"""
def __init__(self,
in_channels,
hidden_channels,
@ -227,21 +170,28 @@ class CouplingBlock(nn.Module):
self.c_in_channels = c_in_channels
self.dropout_p = dropout_p
self.sigmoid_scale = sigmoid_scale
# input layer
start = torch.nn.Conv1d(in_channels // 2, hidden_channels, 1)
start = torch.nn.utils.weight_norm(start)
self.start = start
# output layer
# Initializing last layer to 0 makes the affine coupling layers
# do nothing at first. This helps with training stability
end = torch.nn.Conv1d(hidden_channels, in_channels, 1)
end.weight.data.zero_()
end.bias.data.zero_()
self.end = end
# coupling layers
self.wn = WN(in_channels, hidden_channels, kernel_size, dilation_rate,
num_layers, c_in_channels, dropout_p)
def forward(self, x, x_mask=None, reverse=False, g=None, **kwargs): # pylint: disable=unused-argument
"""
Shapes:
x: B x C x T
x_mask: B x 1 x T
g: B x C x 1
"""
if x_mask is None:
x_mask = 1
x_0, x_1 = x[:, :self.in_channels // 2], x[:, self.in_channels // 2:]
@ -251,17 +201,17 @@ class CouplingBlock(nn.Module):
out = self.end(x)
z_0 = x_0
m = out[:, :self.in_channels // 2, :]
logs = out[:, self.in_channels // 2:, :]
t = out[:, :self.in_channels // 2, :]
s = out[:, self.in_channels // 2:, :]
if self.sigmoid_scale:
logs = torch.log(1e-6 + torch.sigmoid(logs + 2))
s = torch.log(1e-6 + torch.sigmoid(s + 2))
if reverse:
z_1 = (x_1 - m) * torch.exp(-logs) * x_mask
z_1 = (x_1 - t) * torch.exp(-s) * x_mask
logdet = None
else:
z_1 = (m + torch.exp(logs) * x_1) * x_mask
logdet = torch.sum(logs * x_mask, [1, 2])
z_1 = (t + torch.exp(s) * x_1) * x_mask
logdet = torch.sum(s * x_mask, [1, 2])
z = torch.cat([z_0, z_1], 1)
return z, logdet
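A small sketch of the coupling block's invertibility; sizes are illustrative, and at initialization the zeroed output layer makes the transform the identity on x_1:
import torch
cb = CouplingBlock(in_channels=160, hidden_channels=192, kernel_size=5,
dilation_rate=1, num_layers=4)
x = torch.rand(2, 160, 50)               # [B, C, T], C must be even
x_mask = torch.ones(2, 1, 50)            # [B, 1, T]
z, logdet = cb(x, x_mask)                # forward: z_1 = t + exp(s) * x_1
x_hat, _ = cb(z, x_mask, reverse=True)   # inverse recovers x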


@ -7,8 +7,46 @@ from TTS.tts.layers.glow_tts.glow import LayerNorm
class RelativePositionMultiHeadAttention(nn.Module):
"""Implementation of Relative Position Encoding based on
"""Multi-head attention with Relative Positional embedding.
https://arxiv.org/pdf/1809.04281.pdf
It learns positional embeddings for a window of neighbours. For keys and values,
it learns a different set of embeddings. Key embeddings are aggregated with the attention
scores and value embeddings are aggregated with the output.
Note:
Example with relative attention window size 2
input = [a, b, c, d, e]
rel_attn_embeddings = [e(t-2), e(t-1), e(t+1), e(t+2)]
So it learns 4 embedding vectors (in total 8) separately for key and value vectors.
Considering the input c
e(t-2) corresponds to c -> a
e(t-1) corresponds to c -> b
e(t+1) corresponds to c -> d
e(t+2) corresponds to c -> e
These embeddings are shared among different time steps, so the inputs a, b, d and e also use
the same embeddings.
Embeddings are ignored when the relative window runs out of range for the first and the last
n items.
Args:
channels (int): input and inner layer channels.
out_channels (int): output channels.
num_heads (int): number of attention heads.
rel_attn_window_size (int, optional): relative attention window size.
If 4, the next and previous 4 time steps are attended for each time step.
If left at the default, relative encoding is disabled and this is a regular transformer.
Defaults to None.
heads_share (bool, optional): share relative position embeddings across attention heads. Defaults to True.
dropout_p (float, optional): dropout rate. Defaults to 0.
input_length (int, optional): input length for positional encoding. Defaults to None.
proximal_bias (bool, optional): enable/disable proximal bias as in the paper. Defaults to False.
proximal_init (bool, optional): enable/disable proximal init as in the paper
(initialize key and query layer weights identically). Defaults to False.
"""
def __init__(self,
channels,
@ -20,6 +58,7 @@ class RelativePositionMultiHeadAttention(nn.Module):
input_length=None,
proximal_bias=False,
proximal_init=False):
super().__init__()
assert channels % num_heads == 0, " [!] channels should be divisible by num_heads."
# class attributes
@ -81,7 +120,7 @@ class RelativePositionMultiHeadAttention(nn.Module):
# compute raw attention scores
scores = torch.matmul(query, key.transpose(-2, -1)) / math.sqrt(
self.k_channels)
# relative positional encoding
# relative positional encoding for scores
if self.rel_attn_window_size is not None:
assert t_s == t_t, "Relative attention is only available for self-attention."
# get relative key embeddings
@ -225,27 +264,35 @@ class RelativePositionMultiHeadAttention(nn.Module):
return diff.unsqueeze(0).unsqueeze(0)
class FFN(nn.Module):
class FeedForwardNetwork(nn.Module):
"""Feed Forward Inner layers for Transformer.
Args:
in_channels (int): input tensor channels.
out_channels (int): output tensor channels.
hidden_channels (int): inner layers hidden channels.
kernel_size (int): conv1d filter kernel size.
dropout_p (float, optional): dropout rate. Defaults to 0.
"""
def __init__(self,
in_channels,
out_channels,
filter_channels,
hidden_channels,
kernel_size,
dropout_p=0.,
activation=None):
dropout_p=0.):
super().__init__()
self.in_channels = in_channels
self.out_channels = out_channels
self.filter_channels = filter_channels
self.hidden_channels = hidden_channels
self.kernel_size = kernel_size
self.dropout_p = dropout_p
self.activation = activation
self.conv_1 = nn.Conv1d(in_channels,
filter_channels,
hidden_channels,
kernel_size,
padding=kernel_size // 2)
self.conv_2 = nn.Conv1d(filter_channels,
self.conv_2 = nn.Conv1d(hidden_channels,
out_channels,
kernel_size,
padding=kernel_size // 2)
@ -253,19 +300,36 @@ class FFN(nn.Module):
def forward(self, x, x_mask):
x = self.conv_1(x * x_mask)
if self.activation == "gelu":
x = x * torch.sigmoid(1.702 * x)
else:
x = torch.relu(x)
x = torch.relu(x)
x = self.dropout(x)
x = self.conv_2(x * x_mask)
return x * x_mask
class Transformer(nn.Module):
class RelativePositionTransformer(nn.Module):
"""Transformer with Relative Potional Encoding.
https://arxiv.org/abs/1803.02155
Args:
in_channels (int): number of channels of the input tensor.
out_channels (int): number of channels of the output tensor.
hidden_channels (int): model hidden channels.
hidden_channels_ffn (int): hidden channels of FeedForwardNetwork.
num_heads (int): number of attention heads.
num_layers (int): number of transformer layers.
kernel_size (int, optional): kernel size of feed-forward inner layers. Defaults to 1.
dropout_p (float, optional): dropout rate for self-attention and feed-forward inner layers. Defaults to 0.
rel_attn_window_size (int, optional): relative attention window size.
If 4, the next and previous 4 time steps are attended for each time step.
If left at the default, relative encoding is disabled and this is a regular transformer.
Defaults to None.
input_length (int, optional): input length to limit position encoding. Defaults to None.
"""
def __init__(self,
in_channels,
out_channels,
hidden_channels,
filter_channels,
hidden_channels_ffn,
num_heads,
num_layers,
kernel_size=1,
@ -274,7 +338,7 @@ class Transformer(nn.Module):
input_length=None):
super().__init__()
self.hidden_channels = hidden_channels
self.filter_channels = filter_channels
self.hidden_channels_ffn = hidden_channels_ffn
self.num_heads = num_heads
self.num_layers = num_layers
self.kernel_size = kernel_size
@ -286,25 +350,38 @@ class Transformer(nn.Module):
self.norm_layers_1 = nn.ModuleList()
self.ffn_layers = nn.ModuleList()
self.norm_layers_2 = nn.ModuleList()
for _ in range(self.num_layers):
for idx in range(self.num_layers):
self.attn_layers.append(
RelativePositionMultiHeadAttention(
hidden_channels,
hidden_channels if idx != 0 else in_channels,
hidden_channels,
num_heads,
rel_attn_window_size=rel_attn_window_size,
dropout_p=dropout_p,
input_length=input_length))
self.norm_layers_1.append(LayerNorm(hidden_channels))
if hidden_channels != out_channels and (idx + 1) == self.num_layers:
self.proj = nn.Conv1d(hidden_channels, out_channels, 1)
self.ffn_layers.append(
FFN(hidden_channels,
hidden_channels,
filter_channels,
FeedForwardNetwork(hidden_channels,
hidden_channels if (idx + 1) != self.num_layers else out_channels,
hidden_channels_ffn,
kernel_size,
dropout_p=dropout_p))
self.norm_layers_2.append(LayerNorm(hidden_channels))
self.norm_layers_2.append(
LayerNorm(hidden_channels if (
idx + 1) != self.num_layers else out_channels))
def forward(self, x, x_mask):
"""
Shapes:
x: [B, C, T]
x_mask: [B, 1, T]
"""
attn_mask = x_mask.unsqueeze(2) * x_mask.unsqueeze(-1)
for i in range(self.num_layers):
x = x * x_mask
@ -314,6 +391,10 @@ class Transformer(nn.Module):
y = self.ffn_layers[i](x, x_mask)
y = self.dropout(y)
if (i + 1) == self.num_layers and hasattr(self, 'proj'):
x = self.proj(x)
x = self.norm_layers_2[i](x + y)
x = x * x_mask
return x
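A minimal usage sketch of the transformer above (sizes are illustrative):
import torch
model = RelativePositionTransformer(in_channels=192, out_channels=80, hidden_channels=192,
hidden_channels_ffn=768, num_heads=2, num_layers=6,
kernel_size=3, dropout_p=0.1, rel_attn_window_size=4)
x = torch.rand(2, 192, 50)      # [B, C, T]
x_mask = torch.ones(2, 1, 50)   # [B, 1, T]
o = model(x, x_mask)            # [B, 80, T]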


@ -240,6 +240,24 @@ class GuidedAttentionLoss(torch.nn.Module):
return out_masks.unsqueeze(-1) & in_masks.unsqueeze(-2)
class Huber(nn.Module):
# pylint: disable=R0201
def forward(self, x, y, length=None):
"""
Shapes:
x: B x T
y: B x T
length: B
"""
mask = sequence_mask(sequence_length=length, max_len=y.size(1)).float()
return torch.nn.functional.smooth_l1_loss(
x * mask, y * mask, reduction='sum') / mask.sum()
########################
# MODEL LOSS LAYERS
########################
class TacotronLoss(torch.nn.Module):
"""Collection of Tacotron set-up based on provided config."""
def __init__(self, c, stopnet_pos_weight=10, ga_sigma=0.4):
@ -403,8 +421,27 @@ class GlowTTSLoss(torch.nn.Module):
return_dict['log_mle'] = log_mle
return_dict['loss_dur'] = loss_dur
# check if any loss is NaN
for key, loss in return_dict.items():
if torch.isnan(loss):
raise RuntimeError(f" [!] NaN loss with {key}.")
return return_dict
class SpeedySpeechLoss(nn.Module):
def __init__(self, c):
super().__init__()
self.l1 = L1LossMasked(False)
self.ssim = SSIMLoss()
self.huber = Huber()
self.ssim_alpha = c.ssim_alpha
self.huber_alpha = c.huber_alpha
self.l1_alpha = c.l1_alpha
def forward(self, decoder_output, decoder_target, decoder_output_lens, dur_output, dur_target, input_lens):
l1_loss = self.l1(decoder_output, decoder_target, decoder_output_lens)
ssim_loss = self.ssim(decoder_output, decoder_target, decoder_output_lens)
huber_loss = self.huber(dur_output, dur_target, input_lens)
loss = self.l1_alpha * l1_loss + self.ssim_alpha * ssim_loss + self.huber_alpha * huber_loss
return {'loss': loss, 'loss_l1': l1_loss, 'loss_ssim': ssim_loss, 'loss_dur': huber_loss}
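A quick sketch of driving the combined loss above with dummy tensors. The config object is stood in by a SimpleNamespace carrying the three alpha weights; shapes follow the masked L1/SSIM and Huber losses defined earlier in this file and the sizes are illustrative:
import torch
from types import SimpleNamespace
c = SimpleNamespace(l1_alpha=1.0, ssim_alpha=1.0, huber_alpha=1.0)  # stand-in for the training config
criterion = SpeedySpeechLoss(c)
decoder_output = torch.rand(2, 120, 80)   # predicted mels [B, T_de, C]
decoder_target = torch.rand(2, 120, 80)
decoder_output_lens = torch.tensor([120, 100])
dur_output = torch.rand(2, 33)            # durations [B, T_en]
dur_target = torch.rand(2, 33)
input_lens = torch.tensor([33, 21])
losses = criterion(decoder_output, decoder_target, decoder_output_lens,
dur_output, dur_target, input_lens)
# losses['loss'], losses['loss_l1'], losses['loss_ssim'], losses['loss_dur']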


@ -0,0 +1,192 @@
import torch
from torch import nn
from TTS.tts.layers.generic.res_conv_bn import Conv1dBNBlock, ResidualConv1dBNBlock, Conv1dBN
from TTS.tts.layers.generic.wavenet import WNBlocks
from TTS.tts.layers.glow_tts.transformer import RelativePositionTransformer
class WaveNetDecoder(nn.Module):
"""WaveNet based decoder with a prenet and a postnet.
prenet: conv1d_1x1
postnet: 3 x [conv1d_1x1 -> relu] -> conv1d_1x1
TODO: Integrate speaker conditioning vector.
Note:
default wavenet parameters;
params = {
"num_blocks": 12,
"hidden_channels":192,
"kernel_size": 5,
"dilation_rate": 1,
"num_layers": 4,
"dropout_p": 0.05
}
Args:
in_channels (int): number of input channels.
out_channels (int): number of output channels.
hidden_channels (int): number of hidden channels for prenet and postnet.
params (dict): dictionary for residual convolutional blocks.
"""
def __init__(self, in_channels, out_channels, hidden_channels, c_in_channels, params):
super().__init__()
# prenet
self.prenet = torch.nn.Conv1d(in_channels, params['hidden_channels'], 1)
# wavenet layers
self.wn = WNBlocks(params['hidden_channels'], c_in_channels=c_in_channels, **params)
# postnet
self.postnet = [
torch.nn.Conv1d(params['hidden_channels'], hidden_channels, 1),
torch.nn.ReLU(),
torch.nn.Conv1d(hidden_channels, hidden_channels, 1),
torch.nn.ReLU(),
torch.nn.Conv1d(hidden_channels, hidden_channels, 1),
torch.nn.ReLU(),
torch.nn.Conv1d(hidden_channels, out_channels, 1),
]
self.postnet = nn.Sequential(*self.postnet)
def forward(self, x, x_mask=None, g=None):
x = self.prenet(x) * x_mask
x = self.wn(x, x_mask, g)
o = self.postnet(x) * x_mask
return o
class RelativePositionTransformerDecoder(nn.Module):
"""Decoder with Relative Positional Transformer.
Note:
Default params
params={
'hidden_channels_ffn': 128,
'num_heads': 2,
"kernel_size": 3,
"dropout_p": 0.1,
"num_layers": 8,
"rel_attn_window_size": 4,
"input_length": None
}
Args:
in_channels (int): number of input channels.
out_channels (int): number of output channels.
hidden_channels (int): number of hidden channels including Transformer layers.
params (dict): dictionary for residual convolutional blocks.
"""
def __init__(self, in_channels, out_channels, hidden_channels, params):
super().__init__()
self.prenet = Conv1dBN(in_channels, hidden_channels, 1, 1)
self.rel_pos_transformer = RelativePositionTransformer(
in_channels, out_channels, hidden_channels, **params)
def forward(self, x, x_mask=None, g=None): # pylint: disable=unused-argument
o = self.prenet(x) * x_mask
o = self.rel_pos_transformer(o, x_mask)
return o
class ResidualConv1dBNDecoder(nn.Module):
"""Residual Convolutional Decoder as in the original Speedy Speech paper
TODO: Integrate speaker conditioning vector.
Note:
Default params
params = {
"kernel_size": 4,
"dilations": 4 * [1, 2, 4, 8] + [1],
"num_conv_blocks": 2,
"num_res_blocks": 17
}
Args:
in_channels (int): number of input channels.
out_channels (int): number of output channels.
hidden_channels (int): number of hidden channels including ResidualConv1dBNBlock layers.
params (dict): dictionary for residual convolutional blocks.
"""
def __init__(self, in_channels, out_channels, hidden_channels, params):
super().__init__()
self.res_conv_block = ResidualConv1dBNBlock(in_channels,
hidden_channels,
hidden_channels, **params)
self.post_conv = nn.Conv1d(hidden_channels, hidden_channels, 1)
self.postnet = nn.Sequential(
Conv1dBNBlock(hidden_channels,
hidden_channels,
hidden_channels,
params['kernel_size'],
1,
num_conv_blocks=2),
nn.Conv1d(hidden_channels, out_channels, 1),
)
def forward(self, x, x_mask=None, g=None): # pylint: disable=unused-argument
o = self.res_conv_block(x, x_mask)
o = self.post_conv(o) + x
return self.postnet(o) * x_mask
class Decoder(nn.Module):
"""Decodes the expanded phoneme encoding into spectrograms
Args:
out_channels (int): number of output channels.
in_hidden_channels (int): input and hidden channels. Model keeps the input channels for the intermediate layers.
decoder_type (str): decoder layer type. One of 'transformer', 'residual_conv_bn' or 'wavenet'. Defaults to 'residual_conv_bn'.
decoder_params (dict): model parameters for specified decoder type.
c_in_channels (int): number of channels for conditional input.
Shapes:
- input: (B, C, T)
"""
# pylint: disable=dangerous-default-value
def __init__(
self,
out_channels,
in_hidden_channels,
decoder_type='residual_conv_bn',
decoder_params={
"kernel_size": 4,
"dilations": 4 * [1, 2, 4, 8] + [1],
"num_conv_blocks": 2,
"num_res_blocks": 17
},
c_in_channels=0):
super().__init__()
if decoder_type == 'transformer':
self.decoder = RelativePositionTransformerDecoder(
in_channels=in_hidden_channels,
out_channels=out_channels,
hidden_channels=in_hidden_channels,
params=decoder_params)
elif decoder_type == 'residual_conv_bn':
self.decoder = ResidualConv1dBNDecoder(
in_channels=in_hidden_channels,
out_channels=out_channels,
hidden_channels=in_hidden_channels,
params=decoder_params)
elif decoder_type == 'wavenet':
self.decoder = WaveNetDecoder(in_channels=in_hidden_channels,
out_channels=out_channels,
hidden_channels=in_hidden_channels,
c_in_channels=c_in_channels,
params=decoder_params)
else:
raise ValueError(f'[!] Unknown decoder type - {decoder_type}')
def forward(self, x, x_mask, g=None): # pylint: disable=unused-argument
"""
Args:
x: [B, C, T]
x_mask: [B, 1, T]
g: [B, C_g, 1]
"""
# TODO: implement multi-speaker
o = self.decoder(x, x_mask, g)
return o
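A minimal sketch of the decoder factory above using the default 'residual_conv_bn' configuration (sizes are illustrative):
import torch
decoder = Decoder(out_channels=80, in_hidden_channels=128)  # defaults to 'residual_conv_bn'
x = torch.rand(2, 128, 120)      # expanded encoder outputs [B, C, T_de]
x_mask = torch.ones(2, 1, 120)   # [B, 1, T_de]
o = decoder(x, x_mask)           # [B, 80, T_de]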


@ -0,0 +1,39 @@
from torch import nn
from TTS.tts.layers.generic.res_conv_bn import Conv1dBN
class DurationPredictor(nn.Module):
"""Speedy Speech duration predictor model.
Predicts phoneme durations from encoder outputs.
Note:
Outputs are interpreted as log(durations).
To get actual durations, apply an exp transformation (see the usage sketch after this class).
conv_BN_4x1 -> conv_BN_3x1 -> conv_BN_1x1 -> conv_1x1
Args:
hidden_channels (int): number of channels in the inner layers.
"""
def __init__(self, hidden_channels):
super().__init__()
self.layers = nn.ModuleList([
Conv1dBN(hidden_channels, hidden_channels, 4, 1),
Conv1dBN(hidden_channels, hidden_channels, 3, 1),
Conv1dBN(hidden_channels, hidden_channels, 1, 1),
nn.Conv1d(hidden_channels, 1, 1)
])
def forward(self, x, x_mask):
"""
Shapes:
x: [B, C, T]
x_mask: [B, 1, T]
"""
o = x
for layer in self.layers:
o = layer(o) * x_mask
return o
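A short sketch of the duration predictor above, including one simple way to map the predicted log-durations back to frame counts; the exact rounding/clamping used during training may differ, and the sizes are illustrative:
import torch
dp = DurationPredictor(hidden_channels=128)
o_en = torch.rand(2, 128, 33)    # encoder outputs [B, C, T_en]
x_mask = torch.ones(2, 1, 33)    # [B, 1, T_en]
o_dr_log = dp(o_en, x_mask)      # log durations [B, 1, T_en]
durations = torch.round(torch.exp(o_dr_log)) * x_mask  # back to the linear scale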


@ -0,0 +1,209 @@
import math
import torch
from torch import nn
from TTS.tts.layers.glow_tts.transformer import RelativePositionTransformer
from TTS.tts.layers.generic.res_conv_bn import ResidualConv1dBNBlock
class PositionalEncoding(nn.Module):
"""Sinusoidal positional encoding for non-recurrent neural networks.
Implementation based on "Attention Is All You Need"
Args:
channels (int): embedding size
dropout (float): dropout parameter
"""
def __init__(self, channels, dropout=0.0, max_len=5000):
super().__init__()
if channels % 2 != 0:
raise ValueError(
"Cannot use sin/cos positional encoding with "
"odd channels (got channels={:d})".format(channels))
pe = torch.zeros(max_len, channels)
position = torch.arange(0, max_len).unsqueeze(1)
div_term = torch.exp((torch.arange(0, channels, 2, dtype=torch.float) *
-(math.log(10000.0) / channels)))
pe[:, 0::2] = torch.sin(position.float() * div_term)
pe[:, 1::2] = torch.cos(position.float() * div_term)
pe = pe.unsqueeze(0).transpose(1, 2)
self.register_buffer('pe', pe)
if dropout > 0:
self.dropout = nn.Dropout(p=dropout)
self.channels = channels
def forward(self, x, mask=None, first_idx=None, last_idx=None):
"""
Shapes:
x: [B, C, T]
mask: [B, 1, T]
first_idx: int
last_idx: int
"""
x = x * math.sqrt(self.channels)
if first_idx is None:
if self.pe.size(2) < x.size(2):
raise RuntimeError(
f"Sequence is {x.size(2)} but PositionalEncoding is"
f" limited to {self.pe.size(2)}. See max_len argument.")
if mask is not None:
pos_enc = (self.pe[:, :, :x.size(2)] * mask)
else:
pos_enc = self.pe[:, :, :x.size(2)]
x = x + pos_enc
else:
x = x + self.pe[:, :, first_idx:last_idx]
if hasattr(self, 'dropout'):
x = self.dropout(x)
return x
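A quick sketch of the module in isolation; channels must be even for the sin/cos interleaving and the sizes are made up:
import torch
from TTS.tts.layers.speedy_speech.encoder import PositionalEncoding

x = torch.rand(8, 128, 37)    # [B, C, T]
mask = torch.ones(8, 1, 37)   # [B, 1, T]

pos_enc = PositionalEncoding(channels=128, dropout=0.1)
y = pos_enc(x, mask=mask)     # same shape: x * sqrt(C) plus the masked sin/cos table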
class RelativePositionTransformerEncoder(nn.Module):
"""Speedy speech encoder built on Transformer with Relative Position encoding.
TODO: Integrate speaker conditioning vector.
Args:
in_channels (int): number of input channels.
out_channels (int): number of output channels.
hidden_channels (int): number of hidden channels
params (dict): dictionary for residual convolutional blocks.
"""
def __init__(self, in_channels, out_channels, hidden_channels, params):
super().__init__()
self.prenet = ResidualConv1dBNBlock(in_channels,
hidden_channels,
hidden_channels,
kernel_size=5,
num_res_blocks=3,
num_conv_blocks=1,
dilations=[1, 1, 1]
)
self.rel_pos_transformer = RelativePositionTransformer(
hidden_channels, out_channels, hidden_channels, **params)
def forward(self, x, x_mask=None, g=None): # pylint: disable=unused-argument
if x_mask is None:
x_mask = 1
o = self.prenet(x) * x_mask
o = self.rel_pos_transformer(o, x_mask)
return o
class ResidualConv1dBNEncoder(nn.Module):
"""Residual Convolutional Encoder as in the original Speedy Speech paper
TODO: Integrate speaker conditioning vector.
Args:
in_channels (int): number of input channels.
out_channels (int): number of output channels.
hidden_channels (int): number of hidden channels
params (dict): dictionary for residual convolutional blocks.
"""
def __init__(self, in_channels, out_channels, hidden_channels, params):
super().__init__()
self.prenet = nn.Sequential(
nn.Conv1d(in_channels, hidden_channels, 1),
nn.ReLU())
self.res_conv_block = ResidualConv1dBNBlock(hidden_channels,
hidden_channels,
hidden_channels, **params)
self.postnet = nn.Sequential(*[
nn.Conv1d(hidden_channels, hidden_channels, 1),
nn.ReLU(),
nn.BatchNorm1d(hidden_channels),
nn.Conv1d(hidden_channels, out_channels, 1)
])
def forward(self, x, x_mask=None, g=None): # pylint: disable=unused-argument
if x_mask is None:
x_mask = 1
o = self.prenet(x) * x_mask
o = self.res_conv_block(o, x_mask)
o = self.postnet(o + x) * x_mask
return o * x_mask
class Encoder(nn.Module):
# pylint: disable=dangerous-default-value
"""Factory class for the Speedy Speech encoder. It enables different encoder types internally.
Args:
num_chars (int): number of characters.
out_channels (int): number of output channels.
in_hidden_channels (int): input and hidden channels. Model keeps the input channels for the intermediate layers.
encoder_type (str): encoder layer types. 'transformer' or 'residual_conv_bn'. Default 'residual_conv_bn'.
encoder_params (dict): model parameters for specified encoder type.
c_in_channels (int): number of channels for conditional input.
Note:
Default encoder_params...
for 'transformer'
encoder_params={
'hidden_channels_ffn': 128,
'num_heads': 2,
"kernel_size": 3,
"dropout_p": 0.1,
"num_layers": 6,
"rel_attn_window_size": 4,
"input_length": None
},
for 'residual_conv_bn'
encoder_params = {
"kernel_size": 4,
"dilations": 4 * [1, 2, 4] + [1],
"num_conv_blocks": 2,
"num_res_blocks": 13
}
"""
def __init__(
self,
in_hidden_channels,
out_channels,
encoder_type='residual_conv_bn',
encoder_params={
"kernel_size": 4,
"dilations": 4 * [1, 2, 4] + [1],
"num_conv_blocks": 2,
"num_res_blocks": 13
},
c_in_channels=0):
super().__init__()
self.out_channels = out_channels
self.in_channels = in_hidden_channels
self.hidden_channels = in_hidden_channels
self.encoder_type = encoder_type
self.c_in_channels = c_in_channels
# init encoder
if encoder_type.lower() == "transformer":
# text encoder
self.encoder = RelativePositionTransformerEncoder(in_hidden_channels,
out_channels,
in_hidden_channels,
encoder_params) # pylint: disable=unexpected-keyword-arg
elif encoder_type.lower() == 'residual_conv_bn':
self.encoder = ResidualConv1dBNEncoder(in_hidden_channels,
out_channels,
in_hidden_channels,
encoder_params)
else:
raise NotImplementedError(' [!] unknown encoder type.')
# final projection layers
def forward(self, x, x_mask, g=None): # pylint: disable=unused-argument
"""
Shapes:
x: [B, C, T]
x_mask: [B, 1, T]
g: [B, C, 1]
"""
o = self.encoder(x, x_mask)
return o * x_mask
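The 'residual_conv_bn' path is exercised by the layer tests at the end of this commit; below is a sketch of the 'transformer' path using the default parameters listed in the docstring above (tensor sizes are made up):
import torch
from TTS.tts.layers.speedy_speech.encoder import Encoder

x_emb = torch.rand(8, 128, 37)   # character embeddings [B, C, T]
x_mask = torch.ones(8, 1, 37)

encoder = Encoder(in_hidden_channels=128,
                  out_channels=128,
                  encoder_type='transformer',
                  encoder_params={
                      'hidden_channels_ffn': 128,
                      'num_heads': 2,
                      'kernel_size': 3,
                      'dropout_p': 0.1,
                      'num_layers': 6,
                      'rel_attn_window_size': 4,
                      'input_length': None
                  })
o = encoder(x_emb, x_mask)       # -> [8, 128, 37]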

View File

@ -1,7 +1,8 @@
# coding: utf-8
import torch
from torch import nn
from .common_layers import Prenet, init_attn
from .common_layers import Prenet
from .attentions import init_attn
class BatchNormConv1d(nn.Module):

View File

@ -1,7 +1,8 @@
import torch
from torch import nn
from torch.nn import functional as F
from .common_layers import init_attn, Prenet, Linear
from .common_layers import Prenet, Linear
from .attentions import init_attn
# NOTE: linter has a problem with the current TF release
#pylint: disable=no-value-for-parameter

View File

@ -10,46 +10,59 @@ from TTS.tts.layers.glow_tts.monotonic_align import maximum_path, generate_path
class GlowTts(nn.Module):
"""Glow TTS models from https://arxiv.org/abs/2005.11129"""
"""Glow TTS models from https://arxiv.org/abs/2005.11129
Args:
num_chars (int): number of embedding characters.
hidden_channels_enc (int): number of embedding and encoder channels.
hidden_channels_dec (int): number of decoder channels.
use_encoder_prenet (bool): enable/disable prenet for encoder. Prenet modules are hard-coded for each alternative encoder.
hidden_channels_dp (int): number of duration predictor channels.
out_channels (int): number of output channels. It should be equal to the number of spectrogram filters.
num_flow_blocks_dec (int): number of decoder blocks.
kernel_size_dec (int): decoder kernel size.
dilation_rate (int): rate to increase dilation by each layer in a decoder block.
num_block_layers (int): number of decoder layers in each decoder block.
dropout_p_dec (float): dropout rate for decoder.
num_speakers (int): number of speakers to define the size of the speaker embedding layer.
c_in_channels (int): number of speaker embedding channels. It is set to 512 if embeddings are learned.
num_splits (int): number of split levels in the invertible conv1x1 operation.
num_squeeze (int): number of squeeze levels. When squeezing, the number of channels increases and the number of time steps is reduced by the factor 'num_squeeze'.
sigmoid_scale (bool): enable/disable sigmoid scaling in decoder.
mean_only (bool): if True, encoder only computes mean value and uses constant variance for each time step.
encoder_type (str): encoder module type.
encoder_params (dict): encoder module parameters.
external_speaker_embedding_dim (int): channels of external speaker embedding vectors.
"""
def __init__(self,
num_chars,
hidden_channels,
filter_channels,
filter_channels_dp,
hidden_channels_enc,
hidden_channels_dec,
use_encoder_prenet,
hidden_channels_dp,
out_channels,
kernel_size=3,
num_heads=2,
num_layers_enc=6,
dropout_p=0.1,
num_flow_blocks_dec=12,
kernel_size_dec=5,
dilation_rate=5,
num_block_layers=4,
dropout_p_dec=0.,
dropout_p_dp=0.1,
dropout_p_dec=0.05,
num_speakers=0,
c_in_channels=0,
num_splits=4,
num_sqz=1,
num_squeeze=1,
sigmoid_scale=False,
rel_attn_window_size=None,
input_length=None,
mean_only=False,
hidden_channels_enc=None,
hidden_channels_dec=None,
use_encoder_prenet=False,
encoder_type="transformer",
encoder_params=None,
external_speaker_embedding_dim=None):
super().__init__()
self.num_chars = num_chars
self.hidden_channels = hidden_channels
self.filter_channels = filter_channels
self.filter_channels_dp = filter_channels_dp
self.hidden_channels_dp = hidden_channels_dp
self.hidden_channels_enc = hidden_channels_enc
self.hidden_channels_dec = hidden_channels_dec
self.out_channels = out_channels
self.kernel_size = kernel_size
self.num_heads = num_heads
self.num_layers_enc = num_layers_enc
self.dropout_p = dropout_p
self.num_flow_blocks_dec = num_flow_blocks_dec
self.kernel_size_dec = kernel_size_dec
self.dilation_rate = dilation_rate
@ -58,16 +71,14 @@ class GlowTts(nn.Module):
self.num_speakers = num_speakers
self.c_in_channels = c_in_channels
self.num_splits = num_splits
self.num_sqz = num_sqz
self.num_squeeze = num_squeeze
self.sigmoid_scale = sigmoid_scale
self.rel_attn_window_size = rel_attn_window_size
self.input_length = input_length
self.mean_only = mean_only
self.hidden_channels_enc = hidden_channels_enc
self.hidden_channels_dec = hidden_channels_dec
self.use_encoder_prenet = use_encoder_prenet
self.noise_scale = 0.66
self.length_scale = 1.
# model constants.
self.noise_scale = 0.33 # defines the noise variance applied to the random z vector at inference.
self.length_scale = 1. # scaler for the duration predictor. The larger it is, the slower the speech.
self.external_speaker_embedding_dim = external_speaker_embedding_dim
# if is a multispeaker and c_in_channels is 0, set to 256
@ -79,31 +90,29 @@ class GlowTts(nn.Module):
self.encoder = Encoder(num_chars,
out_channels=out_channels,
hidden_channels=hidden_channels,
filter_channels=filter_channels,
filter_channels_dp=filter_channels_dp,
hidden_channels=hidden_channels_enc,
hidden_channels_dp=hidden_channels_dp,
encoder_type=encoder_type,
num_heads=num_heads,
num_layers=num_layers_enc,
kernel_size=kernel_size,
dropout_p=dropout_p,
encoder_params=encoder_params,
mean_only=mean_only,
use_prenet=use_encoder_prenet,
dropout_p_dp=dropout_p_dp,
c_in_channels=self.c_in_channels)
self.decoder = Decoder(out_channels,
hidden_channels_dec or hidden_channels,
hidden_channels_dec,
kernel_size_dec,
dilation_rate,
num_flow_blocks_dec,
num_block_layers,
dropout_p=dropout_p_dec,
num_splits=num_splits,
num_sqz=num_sqz,
num_squeeze=num_squeeze,
sigmoid_scale=sigmoid_scale,
c_in_channels=self.c_in_channels)
if num_speakers > 1 and not external_speaker_embedding_dim:
# speaker embedding layer
self.emb_g = nn.Embedding(num_speakers, self.c_in_channels)
nn.init.uniform_(self.emb_g.weight, -0.1, 0.1)
@ -122,11 +131,12 @@ class GlowTts(nn.Module):
def forward(self, x, x_lengths, y=None, y_lengths=None, attn=None, g=None):
"""
Shapes:
x: B x T
x_lenghts: B
y: B x C x T
y_lengths: B
Shapes:
x: [B, T]
x_lengths: B
y: [B, C, T]
y_lengths: B
g: [B, C] or B
"""
y_max_length = y.size(2)
# norm speaker embeddings
@ -134,13 +144,13 @@ class GlowTts(nn.Module):
if self.external_speaker_embedding_dim:
g = F.normalize(g).unsqueeze(-1)
else:
g = F.normalize(self.emb_g(g)).unsqueeze(-1)# [b, h]
g = F.normalize(self.emb_g(g)).unsqueeze(-1)# [b, h, 1]
# embedding pass
o_mean, o_log_scale, o_dur_log, x_mask = self.encoder(x,
x_lengths,
g=g)
# format feature vectors and feature vector lengths
# drop residual frames wrt num_squeeze and set y_lengths.
y, y_lengths, y_max_length, attn = self.preprocess(
y, y_lengths, y_max_length, None)
# create masks
@ -170,7 +180,6 @@ class GlowTts(nn.Module):
@torch.no_grad()
def inference(self, x, x_lengths, g=None):
if g is not None:
if self.external_speaker_embedding_dim:
g = F.normalize(g).unsqueeze(-1)
@ -195,6 +204,7 @@ class GlowTts(nn.Module):
attn_mask.squeeze(1)).unsqueeze(1)
y_mean, y_log_scale, o_attn_dur = self.compute_outputs(
attn, o_mean, o_log_scale, x_mask)
z = (y_mean + torch.exp(y_log_scale) * torch.randn_like(y_mean) *
self.noise_scale) * y_mask
# decoder pass
@ -204,11 +214,11 @@ class GlowTts(nn.Module):
def preprocess(self, y, y_lengths, y_max_length, attn=None):
if y_max_length is not None:
y_max_length = (y_max_length // self.num_sqz) * self.num_sqz
y_max_length = (y_max_length // self.num_squeeze) * self.num_squeeze
y = y[:, :, :y_max_length]
if attn is not None:
attn = attn[:, :, :, :y_max_length]
y_lengths = (y_lengths // self.num_sqz) * self.num_sqz
y_lengths = (y_lengths // self.num_squeeze) * self.num_squeeze
return y, y_lengths, y_max_length, attn
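A tiny worked sketch of the frame trimming done by preprocess above; the numbers are made up, and num_squeeze=2 matches the value used in setup_model later in this commit:
import torch

num_squeeze = 2
y_lengths = torch.tensor([101, 88, 73])
y_max_length = 101

y_max_length = (y_max_length // num_squeeze) * num_squeeze  # 100
y_lengths = (y_lengths // num_squeeze) * num_squeeze        # tensor([100, 88, 72])
# y[:, :, :y_max_length] then drops the residual frames so the decoder can
# squeeze the time axis by a factor of num_squeeze without a remainder.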
def store_inverse(self):

View File

@ -0,0 +1,192 @@
import torch
from torch import nn
from TTS.tts.layers.speedy_speech.decoder import Decoder
from TTS.tts.layers.speedy_speech.duration_predictor import DurationPredictor
from TTS.tts.layers.speedy_speech.encoder import Encoder, PositionalEncoding
from TTS.tts.utils.generic_utils import sequence_mask
from TTS.tts.layers.glow_tts.monotonic_align import generate_path
class SpeedySpeech(nn.Module):
"""Speedy Speech model
https://arxiv.org/abs/2008.03802
Encoder -> DurationPredictor -> Decoder
This model is able to achieve a reasonable performance with only
~3M model parameters and convolutional layers.
This model requires precomputed phoneme durations to train the duration predictor. At inference
it only uses the duration predictor to compute durations and then expands the encoder outputs accordingly.
Args:
num_chars (int): number of unique input characters
out_channels (int): number of output tensor channels. It is equal to the expected spectrogram size.
hidden_channels (int): number of channels in all the model layers.
positional_encoding (bool, optional): enable/disable Positional encoding on encoder outputs. Defaults to True.
length_scale (int, optional): multiplier for the predicted durations, setting the speech speed; values >1 slow the speech down, values <1 speed it up. Defaults to 1.
encoder_type (str, optional): set the encoder type. Defaults to 'residual_conv_bn'.
encoder_params (dict, optional): set encoder parameters depending on 'encoder_type'. Defaults to { "kernel_size": 4, "dilations": 4 * [1, 2, 4] + [1], "num_conv_blocks": 2, "num_res_blocks": 13 }.
decoder_type (str, optional): decoder type. Defaults to 'residual_conv_bn'.
decoder_params (dict, optional): set decoder parameters depending on 'decoder_type'. Defaults to { "kernel_size": 4, "dilations": 4 * [1, 2, 4, 8] + [1], "num_conv_blocks": 2, "num_res_blocks": 17 }.
num_speakers (int, optional): number of speakers for multi-speaker training. Defaults to 0.
external_c (bool, optional): enable external speaker embeddings. Defaults to False.
c_in_channels (int, optional): number of channels in speaker embedding vectors. Defaults to 0.
"""
# pylint: disable=dangerous-default-value
def __init__(
self,
num_chars,
out_channels,
hidden_channels,
positional_encoding=True,
length_scale=1,
encoder_type='residual_conv_bn',
encoder_params={
"kernel_size": 4,
"dilations": 4 * [1, 2, 4] + [1],
"num_conv_blocks": 2,
"num_res_blocks": 13
},
decoder_type='residual_conv_bn',
decoder_params={
"kernel_size": 4,
"dilations": 4 * [1, 2, 4, 8] + [1],
"num_conv_blocks": 2,
"num_res_blocks": 17
},
num_speakers=0,
external_c=False,
c_in_channels=0):
super().__init__()
self.length_scale = float(length_scale) if isinstance(length_scale, int) else length_scale
self.emb = nn.Embedding(num_chars, hidden_channels)
self.encoder = Encoder(hidden_channels, hidden_channels, encoder_type,
encoder_params, c_in_channels)
if positional_encoding:
self.pos_encoder = PositionalEncoding(hidden_channels)
self.decoder = Decoder(out_channels, hidden_channels,
decoder_type, decoder_params)
self.duration_predictor = DurationPredictor(hidden_channels + c_in_channels)
if num_speakers > 1 and not external_c:
# speaker embedding layer
self.emb_g = nn.Embedding(num_speakers, c_in_channels)
nn.init.uniform_(self.emb_g.weight, -0.1, 0.1)
if c_in_channels > 0 and c_in_channels != hidden_channels:
self.proj_g = nn.Conv1d(c_in_channels, hidden_channels, 1)
@staticmethod
def expand_encoder_outputs(en, dr, x_mask, y_mask):
"""Generate attention alignment map from durations and
expand encoder outputs
Example:
encoder output: [a,b,c,d]
durations: [1, 3, 2, 1]
expanded: [a, b, b, b, c, c, d]
attention map: [[0, 0, 0, 0, 0, 0, 1],
[0, 0, 0, 0, 1, 1, 0],
[0, 1, 1, 1, 0, 0, 0],
[1, 0, 0, 0, 0, 0, 0]]
"""
attn_mask = torch.unsqueeze(x_mask, -1) * torch.unsqueeze(y_mask, 2)
attn = generate_path(dr, attn_mask.squeeze(1)).to(en.dtype)
o_en_ex = torch.matmul(
attn.squeeze(1).transpose(1, 2), en.transpose(1,
2)).transpose(1, 2)
return o_en_ex, attn
def format_durations(self, o_dr_log, x_mask):
o_dr = (torch.exp(o_dr_log) - 1) * x_mask * self.length_scale
o_dr[o_dr < 1] = 1.0
o_dr = torch.round(o_dr)
return o_dr
@staticmethod
def _concat_speaker_embedding(o_en, g):
g_exp = g.expand(-1, -1, o_en.size(-1)) # [B, C, T_en]
o_en = torch.cat([o_en, g_exp], 1)
return o_en
def _sum_speaker_embedding(self, x, g):
# project g to decoder dim.
if hasattr(self, 'proj_g'):
g = self.proj_g(g)
return x + g
def _forward_encoder(self, x, x_lengths, g=None):
if hasattr(self, 'emb_g'):
g = nn.functional.normalize(self.emb_g(g)) # [B, C, 1]
if g is not None:
g = g.unsqueeze(-1)
# [B, T, C]
x_emb = self.emb(x)
# [B, C, T]
x_emb = torch.transpose(x_emb, 1, -1)
# compute sequence masks
x_mask = torch.unsqueeze(sequence_mask(x_lengths, x.shape[1]),
1).to(x.dtype)
# encoder pass
o_en = self.encoder(x_emb, x_mask)
# speaker conditioning for duration predictor
if g is not None:
o_en_dp = self._concat_speaker_embedding(o_en, g)
else:
o_en_dp = o_en
return o_en, o_en_dp, x_mask, g
def _forward_decoder(self, o_en, o_en_dp, dr, x_mask, y_lengths, g):
y_mask = torch.unsqueeze(sequence_mask(y_lengths, None),
1).to(o_en_dp.dtype)
# expand o_en with durations
o_en_ex, attn = self.expand_encoder_outputs(o_en, dr, x_mask, y_mask)
# positional encoding
if hasattr(self, 'pos_encoder'):
o_en_ex = self.pos_encoder(o_en_ex, y_mask)
# speaker embedding
if g is not None:
o_en_ex = self._sum_speaker_embedding(o_en_ex, g)
# decoder pass
o_de = self.decoder(o_en_ex, y_mask, g=g)
return o_de, attn.transpose(1, 2)
def forward(self, x, x_lengths, y_lengths, dr, g=None): # pylint: disable=unused-argument
"""
Shapes:
x: [B, T_max]
x_lengths: [B]
y_lengths: [B]
dr: [B, T_max]
g: [B, C]
"""
o_en, o_en_dp, x_mask, g = self._forward_encoder(x, x_lengths, g)
o_dr_log = self.duration_predictor(o_en_dp.detach(), x_mask)
o_de, attn = self._forward_decoder(o_en, o_en_dp, dr, x_mask, y_lengths, g=g)
return o_de, o_dr_log.squeeze(1), attn
def inference(self, x, x_lengths, g=None): # pylint: disable=unused-argument
"""
Shapes:
x: [B, T_max]
x_lengths: [B]
g: [B, C]
"""
# pad input to prevent dropping the last word
x = torch.nn.functional.pad(x, pad=(0, 5), mode='constant', value=0)
o_en, o_en_dp, x_mask, g = self._forward_encoder(x, x_lengths, g)
# duration predictor pass
o_dr_log = self.duration_predictor(o_en_dp.detach(), x_mask)
o_dr = self.format_durations(o_dr_log, x_mask).squeeze(1)
y_lengths = o_dr.sum(1)
o_de, attn = self._forward_decoder(o_en, o_en_dp, o_dr, x_mask, y_lengths, g=g)
return o_de, attn
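A hedged end-to-end sketch of inference with this model; the vocabulary size, channel sizes and token ids below are arbitrary illustration values:
import torch
from TTS.tts.models.speedy_speech import SpeedySpeech

model = SpeedySpeech(num_chars=32, out_channels=80, hidden_channels=128)
model.eval()

token_ids = torch.randint(0, 32, (1, 41))  # [B, T_max] phoneme/character ids
token_lengths = torch.tensor([41])         # [B]

with torch.no_grad():
    mel, attn = model.inference(token_ids, token_lengths)
# mel:  [1, 80, T_de] spectrogram frames, where T_de is the sum of the predicted durations
# attn: hard alignment between input tokens and output frames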

View File

@ -8,6 +8,45 @@ from TTS.tts.models.tacotron_abstract import TacotronAbstract
class Tacotron(TacotronAbstract):
"""Tacotron as in https://arxiv.org/abs/1703.10135
It's an autoregressive encoder-attention-decoder-postnet architecture.
Args:
num_chars (int): number of input characters to define the size of embedding layer.
num_speakers (int): number of speakers in the dataset. >1 enables multi-speaker training and model learns speaker embeddings.
r (int): initial model reduction rate.
postnet_output_dim (int, optional): postnet output channels. Defaults to 80.
decoder_output_dim (int, optional): decoder output channels. Defaults to 80.
attn_type (str, optional): attention type. Check ```TTS.tts.layers.attentions.init_attn```. Defaults to 'original'.
attn_win (bool, optional): enable/disable attention windowing.
It is especially useful at inference to keep the attention alignment diagonal. Defaults to False.
attn_norm (str, optional): Attention normalization method. "sigmoid" or "softmax". Defaults to "softmax".
prenet_type (str, optional): prenet type for the decoder. Defaults to "original".
prenet_dropout (bool, optional): prenet dropout rate. Defaults to True.
forward_attn (bool, optional): enable/disable forward attention.
It is only valid if ```attn_type``` is ```original```. Defaults to False.
trans_agent (bool, optional): enable/disable transition agent in forward attention. Defaults to False.
forward_attn_mask (bool, optional): enable/disable extra masking over forward attention. Defaults to False.
location_attn (bool, optional): enable/disable location sensitive attention.
It is only valid if ```attn_type``` is ```original```. Defaults to True.
attn_K (int, optional): Number of attention heads for GMM attention. Defaults to 5.
separate_stopnet (bool, optional): enable/disable training the stopnet separately, so that no gradient
flows from the stopnet back into the rest of the model. Defaults to True.
bidirectional_decoder (bool, optional): enable/disable bidirectional decoding. Defaults to False.
double_decoder_consistency (bool, optional): enable/disable double decoder consistency. Defaults to False.
ddc_r (int, optional): reduction rate for the coarse decoder of double decoder consistency. Defaults to None.
encoder_in_features (int, optional): input channels for the encoder. Defaults to 512.
decoder_in_features (int, optional): input channels for the decoder. Defaults to 512.
speaker_embedding_dim (int, optional): external speaker conditioning vector channels. Defaults to None.
gst (bool, optional): enable/disable global style token learning. Defaults to False.
gst_embedding_dim (int, optional): size of channels for GST vectors. Defaults to 512.
gst_num_heads (int, optional): number of attention heads for GST. Defaults to 4.
gst_style_tokens (int, optional): number of GST tokens. Defaults to 10.
gst_use_speaker_embedding (bool, optional): enable/disable inputting the speaker embedding to GST. Defaults to False.
memory_size (int, optional): size of the history queue fed to the prenet. Model feeds the last ```memory_size```
output frames to the prenet.
"""
def __init__(self,
num_chars,
num_speakers,
@ -95,10 +134,12 @@ class Tacotron(TacotronAbstract):
def forward(self, characters, text_lengths, mel_specs, mel_lengths=None, speaker_ids=None, speaker_embeddings=None):
"""
Shapes:
- characters: B x T_in
- text_lengths: B
- mel_specs: B x T_out x D
- speaker_ids: B x 1
characters: [B, T_in]
text_lengths: [B]
mel_specs: [B, T_out, C]
mel_lengths: [B]
speaker_ids: [B, 1]
speaker_embeddings: [B, C]
"""
input_mask, output_mask = self.compute_masks(text_lengths, mel_lengths)
# B x T_in x embed_dim

View File

@ -7,6 +7,43 @@ from TTS.tts.models.tacotron_abstract import TacotronAbstract
# TODO: match function arguments with tacotron
class Tacotron2(TacotronAbstract):
"""Tacotron2 as in https://arxiv.org/abs/1712.05884
It's an autoregressive encoder-attention-decoder-postnet architecture.
Args:
num_chars (int): number of input characters to define the size of embedding layer.
num_speakers (int): number of speakers in the dataset. >1 enables multi-speaker training and model learns speaker embeddings.
r (int): initial model reduction rate.
postnet_output_dim (int, optional): postnet output channels. Defaults to 80.
decoder_output_dim (int, optional): decoder output channels. Defaults to 80.
attn_type (str, optional): attention type. Check ```TTS.tts.layers.common_layers.init_attn```. Defaults to 'original'.
attn_win (bool, optional): enable/disable attention windowing.
It is especially useful at inference to keep the attention alignment diagonal. Defaults to False.
attn_norm (str, optional): Attention normalization method. "sigmoid" or "softmax". Defaults to "softmax".
prenet_type (str, optional): prenet type for the decoder. Defaults to "original".
prenet_dropout (bool, optional): prenet dropout rate. Defaults to True.
forward_attn (bool, optional): enable/disable forward attention.
It is only valid if ```attn_type``` is ```original```. Defaults to False.
trans_agent (bool, optional): enable/disable transition agent in forward attention. Defaults to False.
forward_attn_mask (bool, optional): enable/disable extra masking over forward attention. Defaults to False.
location_attn (bool, optional): enable/disable location sensitive attention.
It is only valid if ```attn_type``` is ```original```. Defaults to True.
attn_K (int, optional): Number of attention heads for GMM attention. Defaults to 5.
separate_stopnet (bool, optional): enable/disable training the stopnet separately, so that no gradient
flows from the stopnet back into the rest of the model. Defaults to True.
bidirectional_decoder (bool, optional): enable/disable bidirectional decoding. Defaults to False.
double_decoder_consistency (bool, optional): enable/disable double decoder consistency. Defaults to False.
ddc_r (int, optional): reduction rate for the coarse decoder of double decoder consistency. Defaults to None.
encoder_in_features (int, optional): input channels for the encoder. Defaults to 512.
decoder_in_features (int, optional): input channels for the decoder. Defaults to 512.
speaker_embedding_dim (int, optional): external speaker conditioning vector channels. Defaults to None.
gst (bool, optional): enable/disable global style token learning. Defaults to False.
gst_embedding_dim (int, optional): size of channels for GST vectors. Defaults to 512.
gst_num_heads (int, optional): number of attention heads for GST. Defaults to 4.
gst_style_tokens (int, optional): number of GST tokens. Defaults to 10.
gst_use_speaker_embedding (bool, optional): enable/disable inputting the speaker embedding to GST. Defaults to False.
"""
def __init__(self,
num_chars,
num_speakers,
@ -93,6 +130,15 @@ class Tacotron2(TacotronAbstract):
return mel_outputs, mel_outputs_postnet, alignments
def forward(self, text, text_lengths, mel_specs=None, mel_lengths=None, speaker_ids=None, speaker_embeddings=None):
"""
Shapes:
text: [B, T_in]
text_lengths: [B]
mel_specs: [B, T_out, C]
mel_lengths: [B]
speaker_ids: [B, 1]
speaker_embeddings: [B, C]
"""
# compute mask for padding
# B x T_in_max (boolean)
input_mask, output_mask = self.compute_masks(text_lengths, mel_lengths)

View File

@ -2,7 +2,7 @@ import tensorflow as tf
from tensorflow import keras
from TTS.tts.tf.utils.tf_utils import shape_list
from TTS.tts.tf.layers.common_layers import Prenet, Attention
# from tensorflow_addons.seq2seq import AttentionWrapper
# NOTE: linter has a problem with the current TF release
#pylint: disable=no-value-for-parameter

View File

@ -103,15 +103,13 @@ def setup_model(num_chars, num_speakers, c, speaker_embedding_dim=None):
speaker_embedding_dim=speaker_embedding_dim)
elif c.model.lower() == "glow_tts":
model = MyModel(num_chars=num_chars + getattr(c, "add_blank", False),
hidden_channels=192,
filter_channels=768,
filter_channels_dp=256,
out_channels=80,
kernel_size=3,
num_heads=2,
num_layers_enc=6,
hidden_channels_enc=c['hidden_channels_encoder'],
hidden_channels_dec=c['hidden_channels_decoder'],
hidden_channels_dp=c['hidden_channels_duration_predictor'],
out_channels=c.audio['num_mels'],
encoder_type=c.encoder_type,
dropout_p=0.1,
encoder_params=c.encoder_params,
use_encoder_prenet=c["use_encoder_prenet"],
num_flow_blocks_dec=12,
kernel_size_dec=5,
dilation_rate=1,
@ -120,20 +118,27 @@ def setup_model(num_chars, num_speakers, c, speaker_embedding_dim=None):
num_speakers=num_speakers,
c_in_channels=0,
num_splits=4,
num_sqz=2,
num_squeeze=2,
sigmoid_scale=False,
mean_only=True,
hidden_channels_enc=192,
hidden_channels_dec=192,
use_encoder_prenet=True,
external_speaker_embedding_dim=speaker_embedding_dim)
elif c.model.lower() == "speedy_speech":
model = MyModel(num_chars=num_chars + getattr(c, "add_blank", False),
out_channels=c.audio['num_mels'],
hidden_channels=c['hidden_channels'],
positional_encoding=c['positional_encoding'],
encoder_type=c['encoder_type'],
encoder_params=c['encoder_params'],
decoder_type=c['decoder_type'],
decoder_params=c['decoder_params'],
c_in_channels=0)
return model
def is_tacotron(c):
return False if 'glow_tts' in c['model'] else True
return False if c['model'] in ['speedy_speech', 'glow_tts'] else True
def check_config_tts(c):
check_argument('model', c, enum_list=['tacotron', 'tacotron2', 'glow_tts'], restricted=True, val_type=str)
check_argument('model', c, enum_list=['tacotron', 'tacotron2', 'glow_tts', 'speedy_speech'], restricted=True, val_type=str)
check_argument('run_name', c, restricted=True, val_type=str)
check_argument('run_description', c, val_type=str)
@ -177,7 +182,7 @@ def check_config_tts(c):
check_argument('eval_batch_size', c, restricted=True, val_type=int, min_val=1)
check_argument('r', c, restricted=True, val_type=int, min_val=1)
check_argument('gradual_training', c, restricted=False, val_type=list)
check_argument('apex_amp_level', c, restricted=False, val_type=str)
check_argument('mixed_precision', c, restricted=False, val_type=bool)
# check_argument('grad_accum', c, restricted=True, val_type=int, min_val=1, max_val=100)
# loss parameters
@ -190,6 +195,10 @@ def check_config_tts(c):
check_argument('decoder_ssim_alpha', c, restricted=True, val_type=float, min_val=0)
check_argument('postnet_ssim_alpha', c, restricted=True, val_type=float, min_val=0)
check_argument('ga_alpha', c, restricted=True, val_type=float, min_val=0)
if c['model'].lower() == "speedy_speech":
check_argument('ssim_alpha', c, restricted=True, val_type=float, min_val=0)
check_argument('l1_alpha', c, restricted=True, val_type=float, min_val=0)
check_argument('huber_alpha', c, restricted=True, val_type=float, min_val=0)
# validation parameters
check_argument('run_eval', c, restricted=True, val_type=bool)
@ -201,9 +210,9 @@ def check_config_tts(c):
check_argument('grad_clip', c, restricted=True, val_type=float, min_val=0.0)
check_argument('epochs', c, restricted=True, val_type=int, min_val=1)
check_argument('lr', c, restricted=True, val_type=float, min_val=0)
check_argument('wd', c, restricted=True, val_type=float, min_val=0)
check_argument('wd', c, restricted=is_tacotron(c), val_type=float, min_val=0)
check_argument('warmup_steps', c, restricted=True, val_type=int, min_val=0)
check_argument('seq_len_norm', c, restricted=True, val_type=bool)
check_argument('seq_len_norm', c, restricted=is_tacotron(c), val_type=bool)
# tacotron prenet
check_argument('memory_size', c, restricted=is_tacotron(c), val_type=int, min_val=-1)
@ -211,7 +220,7 @@ def check_config_tts(c):
check_argument('prenet_dropout', c, restricted=is_tacotron(c), val_type=bool)
# attention
check_argument('attention_type', c, restricted=is_tacotron(c), val_type=str, enum_list=['graves', 'original'])
check_argument('attention_type', c, restricted=is_tacotron(c), val_type=str, enum_list=['graves', 'original', 'dynamic_convolution'])
check_argument('attention_heads', c, restricted=is_tacotron(c), val_type=int)
check_argument('attention_norm', c, restricted=is_tacotron(c), val_type=str, enum_list=['sigmoid', 'softmax'])
check_argument('windowing', c, restricted=is_tacotron(c), val_type=bool)
@ -224,9 +233,17 @@ def check_config_tts(c):
check_argument('double_decoder_consistency', c, restricted=is_tacotron(c), val_type=bool)
check_argument('ddc_r', c, restricted='double_decoder_consistency' in c.keys(), min_val=1, max_val=7, val_type=int)
# stopnet
check_argument('stopnet', c, restricted=is_tacotron(c), val_type=bool)
check_argument('separate_stopnet', c, restricted=is_tacotron(c), val_type=bool)
if c['model'].lower() in ['tacotron', 'tacotron2']:
# stopnet
check_argument('stopnet', c, restricted=is_tacotron(c), val_type=bool)
check_argument('separate_stopnet', c, restricted=is_tacotron(c), val_type=bool)
# Model Parameters for non-tacotron models
if c['model'].lower() == "speedy_speech":
check_argument('positional_encoding', c, restricted=True, val_type=type)
check_argument('encoder_type', c, restricted=True, val_type=str)
check_argument('encoder_params', c, restricted=True, val_type=dict)
check_argument('decoder_residual_conv_bn_params', c, restricted=True, val_type=dict)
# GlowTTS parameters
check_argument('encoder_type', c, restricted=not is_tacotron(c), val_type=str)
@ -248,6 +265,7 @@ def check_config_tts(c):
check_argument('batch_group_size', c, restricted=True, val_type=int, min_val=0)
check_argument('min_seq_len', c, restricted=True, val_type=int, min_val=0)
check_argument('max_seq_len', c, restricted=True, val_type=int, min_val=10)
check_argument('compute_input_seq_cache', c, restricted=True, val_type=bool)
# paths
check_argument('output_path', c, restricted=True, val_type=str)
@ -256,8 +274,8 @@ def check_config_tts(c):
check_argument('use_speaker_embedding', c, restricted=True, val_type=bool)
check_argument('use_external_speaker_embedding_file', c, restricted=c['use_speaker_embedding'], val_type=bool)
check_argument('external_speaker_embedding_file', c, restricted=c['use_external_speaker_embedding_file'], val_type=str)
check_argument('use_gst', c, restricted=is_tacotron(c), val_type=bool)
if c['model'].lower() in ['tacotron', 'tacotron2'] and c['use_gst']:
check_argument('use_gst', c, restricted=is_tacotron(c), val_type=bool)
check_argument('gst', c, restricted=is_tacotron(c), val_type=dict)
check_argument('gst_style_input', c['gst'], restricted=is_tacotron(c), val_type=[str, dict])
check_argument('gst_embedding_dim', c['gst'], restricted=is_tacotron(c), val_type=int, min_val=0, max_val=1000)

View File

@ -8,6 +8,17 @@ from TTS.utils.io import RenamingUnpickler
def load_checkpoint(model, checkpoint_path, amp=None, use_cuda=False):
"""Load ```TTS.tts.models``` checkpoints.
Args:
model (TTS.tts.models): model object to load the weights for.
checkpoint_path (string): checkpoint file path.
amp (apex.amp, optional): Apex amp object to load apex-related state vars. Defaults to None.
use_cuda (bool, optional): load model to GPU if True. Defaults to False.
Returns:
[type]: [description]
"""
try:
state = torch.load(checkpoint_path, map_location=torch.device('cpu'))
except ModuleNotFoundError:
@ -26,6 +37,17 @@ def load_checkpoint(model, checkpoint_path, amp=None, use_cuda=False):
def save_model(model, optimizer, current_step, epoch, r, output_path, amp_state_dict=None, **kwargs):
"""Save ```TTS.tts.models``` states with extra fields.
Args:
model (TTS.tts.models.Model): models object to be saved.
optimizer (torch.optim.optimizers.Optimizer): model optimizer used for training.
current_step (int): current number of training steps.
epoch (int): current number of training epochs.
r (int): model reduction rate for Tacotron models.
output_path (str): output path to save the model file.
amp_state_dict (state_dict, optional): Apex.amp state dict if Apex is enabled. Defaults to None.
"""
if hasattr(model, 'module'):
model_state = model.module.state_dict()
else:
@ -45,6 +67,16 @@ def save_model(model, optimizer, current_step, epoch, r, output_path, amp_state_
def save_checkpoint(model, optimizer, current_step, epoch, r, output_folder, **kwargs):
"""Save model checkpoint, intended for saving checkpoints at training.
Args:
model (TTS.tts.models.Model): models object to be saved.
optimizer (torch.optim.optimizers.Optimizer): model optimizer used for training.
current_step (int): current number of training steps.
epoch (int): current number of training epochs.
r (int): model reduction rate for Tacotron models.
output_folder (str): output folder to save the model file to.
"""
file_name = 'checkpoint_{}.pth.tar'.format(current_step)
checkpoint_path = os.path.join(output_folder, file_name)
print(" > CHECKPOINT : {}".format(checkpoint_path))
@ -52,6 +84,23 @@ def save_checkpoint(model, optimizer, current_step, epoch, r, output_folder, **k
def save_best_model(target_loss, best_loss, model, optimizer, current_step, epoch, r, output_folder, **kwargs):
"""Save model checkpoint, intended for saving the best model after each epoch.
It compares the current model loss with the best loss so far and saves the
model if the current loss is better.
Args:
target_loss (float): current model loss.
best_loss (float): best loss so far.
model (TTS.tts.models.Model): models object to be saved.
optimizer (torch.optim.optimizers.Optimizer): model optimizer used for training.
current_step (int): current number of training steps.
epoch (int): current number of training epochs.
r (int): model reduction rate for Tacotron models.
output_folder (str): output folder to save the model file to.
Returns:
float: updated current best loss.
"""
if target_loss < best_loss:
file_name = 'best_model.pth.tar'
checkpoint_path = os.path.join(output_folder, file_name)
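A short sketch of the checkpointing API documented above; model, optimizer, eval_loss, best_loss and the output folder are assumed to exist in the calling training script:
from TTS.tts.utils.io import save_checkpoint, save_best_model, load_checkpoint

save_checkpoint(model, optimizer, current_step=1000, epoch=3, r=2,
                output_folder="runs/exp1")

best_loss = save_best_model(eval_loss, best_loss, model, optimizer,
                            current_step=1000, epoch=3, r=2,
                            output_folder="runs/exp1")

# restore the weights in place later; the file name follows 'checkpoint_{step}.pth.tar'
load_checkpoint(model, "runs/exp1/checkpoint_1000.pth.tar", use_cuda=True)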

View File

@ -63,7 +63,7 @@ def parse_speakers(c, args, meta_data_train, OUT_PATH):
speaker_embedding_dim = None
save_speaker_mapping(OUT_PATH, speaker_mapping)
num_speakers = len(speaker_mapping)
print("Training with {} speakers: {}".format(len(speakers),
print(" > Training with {} speakers: {}".format(len(speakers),
", ".join(speakers)))
else:
num_speakers = 0

View File

@ -62,7 +62,22 @@ def run_model_torch(model, inputs, CONFIG, truncated, speaker_id=None, style_mel
inputs, speaker_ids=speaker_id, speaker_embeddings=speaker_embeddings)
elif 'glow' in CONFIG.model.lower():
inputs_lengths = torch.tensor(inputs.shape[1:2]).to(inputs.device) # pylint: disable=not-callable
postnet_output, _, _, _, alignments, _, _ = model.inference(inputs, inputs_lengths, g=speaker_id if speaker_id else speaker_embeddings)
if hasattr(model, 'module'):
# distributed model
postnet_output, _, _, _, alignments, _, _ = model.module.inference(inputs, inputs_lengths, g=speaker_id if speaker_id is not None else speaker_embeddings)
else:
postnet_output, _, _, _, alignments, _, _ = model.inference(inputs, inputs_lengths, g=speaker_id if speaker_id is not None else speaker_embeddings)
postnet_output = postnet_output.permute(0, 2, 1)
# these only belong to tacotron models.
decoder_output = None
stop_tokens = None
elif 'speedy_speech' in CONFIG.model.lower():
inputs_lengths = torch.tensor(inputs.shape[1:2]).to(inputs.device) # pylint: disable=not-callable
if hasattr(model, 'module'):
# distributed model
postnet_output, alignments= model.module.inference(inputs, inputs_lengths, g=speaker_id if speaker_id is not None else speaker_embeddings)
else:
postnet_output, alignments= model.inference(inputs, inputs_lengths, g=speaker_id if speaker_id is not None else speaker_embeddings)
postnet_output = postnet_output.permute(0, 2, 1)
# these only belong to tacotron models.
decoder_output = None
@ -145,7 +160,8 @@ def inv_spectrogram(postnet_output, ap, CONFIG):
def id_to_torch(speaker_id, cuda=False):
if speaker_id is not None:
speaker_id = np.asarray(speaker_id)
speaker_id = torch.from_numpy(speaker_id).unsqueeze(0)
# TODO: test this for tacotron models
speaker_id = torch.from_numpy(speaker_id)
if cuda:
return speaker_id.cuda()
return speaker_id

View File

@ -14,6 +14,7 @@ import re
from unidecode import unidecode
from .number_norm import normalize_numbers
from .abbreviations import abbreviations_en, abbreviations_fr
from .time import expand_time_english
# Regular expression matching whitespace:
_whitespace_re = re.compile(r'\s+')
@ -95,6 +96,7 @@ def english_cleaners(text):
'''Pipeline for English text, including number and abbreviation expansion.'''
text = convert_to_ascii(text)
text = lowercase(text)
text = expand_time_english(text)
text = expand_numbers(text)
text = expand_abbreviations(text)
text = replace_symbols(text)
@ -122,8 +124,8 @@ def portuguese_cleaners(text):
def phoneme_cleaners(text):
'''Pipeline for phonemes mode, including number and abbreviation expansion.'''
text = convert_to_ascii(text)
text = expand_numbers(text)
text = convert_to_ascii(text)
text = expand_abbreviations(text)
text = replace_symbols(text)
text = remove_aux_symbols(text)

View File

@ -2,14 +2,14 @@
import inflect
import re
from typing import Dict
_inflect = inflect.engine()
_comma_number_re = re.compile(r'([0-9][0-9\,]+[0-9])')
_decimal_number_re = re.compile(r'([0-9]+\.[0-9]+)')
_pounds_re = re.compile(r'£([0-9\,]*[0-9]+)')
_dollars_re = re.compile(r'\$([0-9\.\,]*[0-9]+)')
_currency_re = re.compile(r'(£|\$|¥)([0-9\,\.]*[0-9]+)')
_ordinal_re = re.compile(r'[0-9]+(st|nd|rd|th)')
_number_re = re.compile(r'[0-9]+')
_number_re = re.compile(r'-?[0-9]+')
def _remove_commas(m):
@ -20,24 +20,54 @@ def _expand_decimal_point(m):
return m.group(1).replace('.', ' point ')
def _expand_dollars(m):
match = m.group(1)
parts = match.split('.')
def __expand_currency(value: str, inflection: Dict[float, str]) -> str:
parts = value.replace(",", "").split('.')
if len(parts) > 2:
return match + ' dollars' # Unexpected format
dollars = int(parts[0]) if parts[0] else 0
cents = int(parts[1]) if len(parts) > 1 and parts[1] else 0
if dollars and cents:
dollar_unit = 'dollar' if dollars == 1 else 'dollars'
cent_unit = 'cent' if cents == 1 else 'cents'
return '%s %s, %s %s' % (dollars, dollar_unit, cents, cent_unit)
if dollars:
dollar_unit = 'dollar' if dollars == 1 else 'dollars'
return '%s %s' % (dollars, dollar_unit)
if cents:
cent_unit = 'cent' if cents == 1 else 'cents'
return '%s %s' % (cents, cent_unit)
return 'zero dollars'
return f"{value} {inflection[2]}" # Unexpected format
text = []
integer = int(parts[0]) if parts[0] else 0
if integer > 0:
integer_unit = inflection.get(integer, inflection[2])
text.append(f"{integer} {integer_unit}")
fraction = int(parts[1]) if len(parts) > 1 and parts[1] else 0
if fraction > 0:
fraction_unit = inflection.get(fraction/100, inflection[0.02])
text.append(f"{fraction} {fraction_unit}")
if len(text) == 0:
return f"zero {inflection[2]}"
return " ".join(text)
def _expand_currency(m: "re.Match") -> str:
currencies = {
"$": {
0.01: "cent",
0.02: "cents",
1: "dollar",
2: "dollars",
},
"€": {
0.01: "cent",
0.02: "cents",
1: "euro",
2: "euros",
},
"£": {
0.01: "penny",
0.02: "pence",
1: "pound sterling",
2: "pounds sterling",
},
"¥": {
# TODO rin
0.02: "sen",
2: "yen",
}
}
unit = m.group(1)
currency = currencies[unit]
value = m.group(2)
return __expand_currency(value, currency)
def _expand_ordinal(m):
@ -62,8 +92,7 @@ def _expand_number(m):
def normalize_numbers(text):
text = re.sub(_comma_number_re, _remove_commas, text)
text = re.sub(_pounds_re, r'\1 pounds', text)
text = re.sub(_dollars_re, _expand_dollars, text)
text = re.sub(_currency_re, _expand_currency, text)
text = re.sub(_decimal_number_re, _expand_decimal_point, text)
text = re.sub(_ordinal_re, _expand_ordinal, text)
text = re.sub(_number_re, _expand_number, text)
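A rough trace of the new currency path (the wording follows the inflections defined above, then the remaining number rules spell out the digits):
from TTS.tts.utils.text.number_norm import normalize_numbers

normalize_numbers("$1.50")  # -> "one dollar fifty cents"
normalize_numbers("$0.00")  # -> "zero dollars"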

View File

@ -0,0 +1,44 @@
import re
import inflect
_inflect = inflect.engine()
_time_re = re.compile(r"""\b
((0?[0-9])|(1[0-1])|(1[2-9])|(2[0-3])) # hours
:
([0-5][0-9]) # minutes
\s*(a\\.m\\.|am|pm|p\\.m\\.|a\\.m|p\\.m)? # am/pm
\b""",
re.IGNORECASE | re.X)
def _expand_num(n: int) -> str:
return _inflect.number_to_words(n)
def _expand_time_english(match: "re.Match") -> str:
hour = int(match.group(1))
past_noon = hour >= 12
time = []
if hour > 12:
hour -= 12
elif hour == 0:
hour = 12
past_noon = True
time.append(_expand_num(hour))
minute = int(match.group(6))
if minute > 0:
if minute < 10:
time.append("oh")
time.append(_expand_num(minute))
am_pm = match.group(7)
if am_pm is None:
time.append("p m" if past_noon else "a m")
else:
time.extend(list(am_pm.replace(".", "")))
return " ".join(time)
def expand_time_english(text: str) -> str:
return re.sub(_time_re, _expand_time_english, text)
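Worked examples of the new rule; hour and minute words come from inflect, and a missing am/pm suffix is inferred from the hour:
from TTS.tts.utils.text.time import expand_time_english

expand_time_english("The alarm is set for 9:05 am.")
# -> "The alarm is set for nine oh five a m."
expand_time_english("Dinner at 14:30.")
# -> "Dinner at two thirty p m."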

View File

@ -17,6 +17,8 @@ def plot_alignment(alignment,
alignment_ = alignment.detach().cpu().numpy().squeeze()
else:
alignment_ = alignment
alignment_ = alignment_.astype(
np.float32) if alignment_.dtype == np.float16 else alignment_
fig, ax = plt.subplots(figsize=fig_size)
im = ax.imshow(alignment_.T,
aspect='auto',

View File

@ -11,6 +11,7 @@ from TTS.tts.utils.data import StandardScaler
class AudioProcessor(object):
def __init__(self,
sample_rate=None,
resample=False,
num_mels=None,
min_level_db=None,
frame_shift_ms=None,
@ -39,6 +40,7 @@ class AudioProcessor(object):
print(" > Setting up Audio Processor...")
# setup class attributed
self.sample_rate = sample_rate
self.resample = resample
self.num_mels = num_mels
self.min_level_db = min_level_db or 0
self.frame_shift_ms = frame_shift_ms
@ -321,7 +323,9 @@ class AudioProcessor(object):
### save and load ###
def load_wav(self, filename, sr=None):
if sr is None:
if self.resample:
x, sr = librosa.load(filename, sr=self.sample_rate)
elif sr is None:
x, sr = sf.read(filename)
assert self.sample_rate == sr, "%s vs %s"%(self.sample_rate, sr)
else:
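A sketch of the new resample flag; in the training scripts the processor is normally built from the audio section of a config, and the config path below is hypothetical:
from TTS.utils.audio import AudioProcessor
from TTS.utils.io import load_config

c = load_config("tests/inputs/test_speedy_speech.json")  # hypothetical config path
ap = AudioProcessor(resample=True, **c.audio)

# with resample=True, load_wav() goes through librosa and resamples the clip to
# the configured sample_rate instead of asserting that the file already matches it.
wav = ap.load_wav("/path/to/clip.wav")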

View File

@ -3,6 +3,7 @@ import re
import json
import yaml
import pickle as pickle_tts
from shutil import copyfile
class RenamingUnpickler(pickle_tts.Unpickler):
@ -44,16 +45,19 @@ def load_config(config_path: str) -> AttrDict:
return config
def copy_config_file(config_file, out_path, new_fields):
"""Copy config.json to training folder and add
def copy_model_files(c, config_file, out_path, new_fields):
"""Copy config.json and other model files to training folder and add
new fields.
Args:
c (dict): model config from config.json.
config_file (str): path to config file.
out_path (str): output path to copy the file.
new_fields (dict): new fileds to be added or edited
in the config file.
"""
# copy config.json
copy_config_path = os.path.join(out_path, 'config.json')
config_lines = open(config_file, "r").readlines()
# add extra information fields
for key, value in new_fields.items():
@ -62,6 +66,10 @@ def copy_config_file(config_file, out_path, new_fields):
else:
new_line = '"{}":{},\n'.format(key, value)
config_lines.insert(1, new_line)
config_out_file = open(out_path, "w")
config_out_file = open(copy_config_path, "w")
config_out_file.writelines(config_lines)
config_out_file.close()
# copy model stats file if available
if c.audio['stats_path'] is not None:
copy_stats_path = os.path.join(out_path, 'scale_stats.npy')
copyfile(c.audio['stats_path'], copy_stats_path)

View File

@ -105,8 +105,8 @@ class Wavegrad(nn.Module):
self.noise_level = self.noise_level.to(y_0)
if len(y_0.shape) == 3:
y_0 = y_0.squeeze(1)
s = torch.randint(1, self.num_steps + 1, [y_0.shape[0]])
l_a, l_b = self.noise_level[s-1], self.noise_level[s]
s = torch.randint(0, self.num_steps - 1, [y_0.shape[0]])
l_a, l_b = self.noise_level[s], self.noise_level[s+1]
noise_scale = l_a + torch.rand(y_0.shape[0]).to(y_0) * (l_b - l_a)
noise_scale = noise_scale.unsqueeze(1)
noise = torch.randn_like(y_0)

View File

@ -6,11 +6,12 @@ nosetests tests -x &&\
# runtime tests
./tests/test_server_package.sh && \
./tests/test_tts_train.sh && \
./tests/test_tacotron_train.sh && \
./tests/test_glow-tts_train.sh && \
./tests/test_vocoder_gan_train.sh && \
./tests/test_vocoder_wavernn_train.sh && \
./tests/test_vocoder_wavegrad_train.sh && \
./tests/test_speedy_speech_train.sh && \
# linter check
cardboardlinter --refspec master

View File

@ -33,7 +33,7 @@ args, unknown_args = parser.parse_known_args()
# Remove our arguments from argv so that setuptools doesn't see them
sys.argv = [sys.argv[0]] + unknown_args
version = '0.0.6'
version = '0.0.8'
# Adapted from https://github.com/pytorch/pytorch
cwd = os.path.dirname(os.path.abspath(__file__))

File diff suppressed because it is too large

Binary files not shown.

View File

@ -70,6 +70,7 @@
"eval_batch_size":1,
"r": 1, // Number of decoder frames to predict per iteration. Set the initial values if gradual training is enabled.
"loss_masking": true, // enable / disable loss masking against the sequence padding.
"data_dep_init_iter": 1,
// VALIDATION
"run_eval": true,
@ -85,7 +86,19 @@
"warmup_steps": 4000, // Noam decay steps to increase the learning rate from 0 to "lr"
"seq_len_norm": false, // Normalize each sample loss with its length to alleviate imbalanced datasets. Use it if your dataset is small or has a skewed distribution of sequence lengths.
"encoder_type": "gatedconv",
"hidden_channels_encoder": 192,
"hidden_channels_decoder": 192,
"hidden_channels_duration_predictor": 256,
"use_encoder_prenet": true,
"encoder_type": "rel_pos_transformer",
"encoder_params": {
"kernel_size":3,
"dropout_p": 0.1,
"num_layers": 6,
"num_heads": 2,
"hidden_channels_ffn": 768,
"input_length": null
},
// TENSORBOARD and LOGGING
"print_step": 25, // Number of steps to log training on console.
@ -105,6 +118,8 @@
"min_seq_len": 3, // DATASET-RELATED: minimum text length to use in training
"max_seq_len": 500, // DATASET-RELATED: maximum text length
"compute_f0": false, // compute f0 values in data-loader
"compute_input_seq_cache": true,
"use_noise_augment": true,
// PATHS
"output_path": "tests/train_outputs/",

View File

@ -0,0 +1,153 @@
{
"model": "speedy_speech",
"run_name": "test_sample_dataset_run",
"run_description": "sample dataset test run",
// AUDIO PARAMETERS
"audio":{
// stft parameters
"fft_size": 1024, // number of stft frequency levels. Size of the linear spectrogram frame.
"win_length": 1024, // stft window length (in samples).
"hop_length": 256, // stft window hop length (in samples).
"frame_length_ms": null, // stft window length in ms. If null, 'win_length' is used.
"frame_shift_ms": null, // stft window hop length in ms. If null, 'hop_length' is used.
// Audio processing parameters
"sample_rate": 22050, // DATASET-RELATED: wav sample-rate.
"preemphasis": 0.0, // pre-emphasis to reduce spec noise and make it more structured. If 0.0, no pre-emphasis is applied.
"ref_level_db": 20, // reference level db, theoretically 20db is the sound of air.
// Silence trimming
"do_trim_silence": true,// enable trimming of silence in audio as you load it. LJspeech (true), TWEB (false), Nancy (true)
"trim_db": 60, // threshold for trimming silence. Set this according to your dataset.
// Griffin-Lim
"power": 1.5, // value to sharpen wav signals after GL algorithm.
"griffin_lim_iters": 60,// #griffin-lim iterations. 30-60 is a good range. The larger the value, the slower the generation.
// MelSpectrogram parameters
"num_mels": 80, // size of the mel spec frame.
"mel_fmin": 50.0, // minimum freq level for mel-spec. ~50 for male and ~95 for female voices. Tune for dataset!!
"mel_fmax": 7600.0, // maximum freq level for mel-spec. Tune for dataset!!
"spec_gain": 1,
// Normalization parameters
"signal_norm": true, // normalize spec values. Mean-Var normalization if 'stats_path' is defined otherwise range normalization defined by the other params.
"min_level_db": -100, // lower bound for normalization
"symmetric_norm": true, // move normalization to range [-1, 1]
"max_norm": 4.0, // scale normalization to range [-max_norm, max_norm] or [0, max_norm]
"clip_norm": true, // clip normalized values into the range.
"stats_path": null // DO NOT USE WITH MULTI_SPEAKER MODEL. scaler stats file computed by 'compute_statistics.py'. If it is defined, mean-std based normalization is used and other normalization params are ignored
},
// VOCABULARY PARAMETERS
// if custom character set is not defined,
// default set in symbols.py is used
// "characters":{
// "pad": "_",
// "eos": "&",
// "bos": "*",
// "characters": "ABCDEFGHIJKLMNOPQRSTUVWXYZÇÃÀÁÂÊÉÍÓÔÕÚÛabcdefghijklmnopqrstuvwxyzçãàáâêéíóôõúû!(),-.:;? ",
// "punctuations":"!'(),-.:;? ",
// "phonemes":"iyɨʉɯuɪʏʊeøɘəɵɤoɛœɜɞʌɔæɐaɶɑɒᵻʘɓǀɗǃʄǂɠǁʛpbtdʈɖcɟkɡʔɴŋɲɳnɱmʙrʀⱱɾɽɸβfvθðszʃʒʂʐçʝxɣχʁħʕhɦɬɮʋɹɻjɰlɭʎʟˈˌːˑʍwɥʜʢʡɕʑɺɧɚ˞ɫ'̃' "
// },
"add_blank": false, // if true add a new token after each token of the sentence. This increases the size of the input sequence, but has considerably improved the prosody of the GlowTTS model.
// DISTRIBUTED TRAINING
"distributed":{
"backend": "nccl",
"url": "tcp:\/\/localhost:54321"
},
"reinit_layers": [], // give a list of layer names to restore from the given checkpoint. If not defined, it reloads all heuristically matching layers.
// MODEL PARAMETERS
"positional_encoding": true,
"hidden_channels": 128,
"encoder_type": "residual_conv_bn",
"encoder_params":{
"kernel_size": 4,
"dilations": [1, 2, 4, 1, 2, 4, 1, 2, 4, 1, 2, 4, 1],
"num_conv_blocks": 2,
"num_res_blocks": 13
},
"decoder_type": "residual_conv_bn",
"decoder_params":{
"kernel_size": 4,
"dilations": [1, 2, 4, 8, 1, 2, 4, 8, 1, 2, 4, 8, 1, 2, 4, 8, 1],
"num_conv_blocks": 2,
"num_res_blocks": 17
},
// TRAINING
"batch_size":64, // Batch size for training. Values lower than 32 might make attention hard to learn. It is overwritten by 'gradual_training'.
"eval_batch_size":32,
"r": 1, // Number of decoder frames to predict per iteration. Set the initial values if gradual training is enabled.
"loss_masking": true, // enable / disable loss masking against the sequence padding.
// LOSS PARAMETERS
"ssim_alpha": 1,
"l1_alpha": 1,
"huber_alpha": 1,
// VALIDATION
"run_eval": true,
"test_delay_epochs": -1, //Until attention is aligned, testing only wastes computation time.
"test_sentences_file": null, // set a file to load sentences to be used for testing. If it is null then we use default english sentences.
// OPTIMIZER
"noam_schedule": true, // use noam warmup and lr schedule.
"grad_clip": 1.0, // upper limit for gradients for clipping.
"epochs": 1, // total number of epochs to train.
"lr": 0.002, // Initial learning rate. If Noam decay is active, maximum learning rate.
"warmup_steps": 4000, // Noam decay steps to increase the learning rate from 0 to "lr"
// TENSORBOARD and LOGGING
"print_step": 1, // Number of steps to log training on console.
"tb_plot_step": 100, // Number of steps to plot TB training figures.
"print_eval": false, // If True, it prints intermediate loss values in evaluation.
"save_step": 5000, // Number of training steps expected to save training stats and checkpoints.
"checkpoint": true, // If true, it saves checkpoints per "save_step"
"tb_model_param_stats": false, // If true, plots param stats per layer on tensorboard. Might be memory consuming, but good for debugging.
"mixed_precision": false,
// DATA LOADING
"text_cleaner": "english_cleaners",
"enable_eos_bos_chars": false, // enable/disable beginning of sentence and end of sentence chars.
"num_loader_workers": 0, // number of training data loader processes. Don't set it too big. 4-8 are good values.
"num_val_loader_workers": 0, // number of evaluation data loader processes.
"batch_group_size": 0, //Number of batches to shuffle after bucketing.
"min_seq_len": 2, // DATASET-RELATED: minimum text length to use in training
"max_seq_len": 300, // DATASET-RELATED: maximum text length
"compute_f0": false, // compute f0 values in data-loader
"compute_input_seq_cache": false, // if true, text sequences are computed before starting training. If phonemes are enabled, they are also computed at this stage.
// PATHS
"output_path": "tests/train_outputs/",
// PHONEMES
"phoneme_cache_path": "tests/train_outputs/phoneme_cache/", // phoneme computation is slow, therefore, it caches results in the given folder.
"use_phonemes": true, // use phonemes instead of raw characters. It is suggested for better pronunciation.
"phoneme_language": "en-us", // depending on your target language, pick one from https://github.com/bootphon/phonemizer#languages
// MULTI-SPEAKER and GST
"use_speaker_embedding": false, // use speaker embedding to enable multi-speaker learning.
"use_external_speaker_embedding_file": false, // if true, forces the model to use external embedding per sample instead of nn.embeddings, that is, it supports external embeddings such as those used at: https://arxiv.org/abs/1806.04558
"external_speaker_embedding_file": "/home/erogol/Data/libritts/speakers.json", // if not null and use_external_speaker_embedding_file is true, it is used to load a specific embedding file and thus uses these embeddings instead of nn.embeddings, that is, it supports external embeddings such as those used at: https://arxiv.org/abs/1806.04558
// DATASETS
"datasets": // List of datasets. They are all merged and their speakers get different speaker_ids.
[
{
"name": "ljspeech",
"path": "tests/data/ljspeech/",
"meta_file_train": "metadata.csv",
"meta_file_val": "metadata.csv",
"meta_file_attn_mask": "tests/data/ljspeech/metadata_attn_mask.txt"
}
]
}

View File

@ -100,7 +100,7 @@
"prenet_dropout": false, // enable/disable dropout at prenet.
// TACOTRON ATTENTION
"attention_type": "original", // 'original' or 'graves'
"attention_type": "original", // 'original' , 'graves', 'dynamic_convolution'
"attention_heads": 4, // number of attention heads (only for 'graves')
"attention_norm": "sigmoid", // softmax or sigmoid.
"windowing": false, // Enables attention windowing. Used only in eval mode.
@ -132,6 +132,7 @@
"batch_group_size": 0, //Number of batches to shuffle after bucketing.
"min_seq_len": 6, // DATASET-RELATED: minimum text length to use in training
"max_seq_len": 153, // DATASET-RELATED: maximum text length
"compute_input_seq_cache": true,
// PATHS
"output_path": "tests/train_outputs/",


@ -42,60 +42,62 @@ class GlowTTSTrainTest(unittest.TestCase):
criterion = GlowTTSLoss()
# model to train
model = GlowTts(num_chars=32,
hidden_channels=128,
filter_channels=32,
filter_channels_dp=32,
out_channels=80,
kernel_size=3,
num_heads=2,
num_layers_enc=6,
dropout_p=0.1,
num_flow_blocks_dec=12,
kernel_size_dec=5,
dilation_rate=5,
num_block_layers=4,
dropout_p_dec=0.,
num_speakers=0,
c_in_channels=0,
num_splits=4,
num_sqz=1,
sigmoid_scale=False,
rel_attn_window_size=None,
input_length=None,
mean_only=False,
hidden_channels_enc=None,
hidden_channels_dec=None,
use_encoder_prenet=False,
encoder_type="transformer").to(device)
model = GlowTts(
num_chars=32,
hidden_channels_enc=128,
hidden_channels_dec=128,
hidden_channels_dp=32,
out_channels=80,
encoder_type='rel_pos_transformer',
encoder_params={
'kernel_size': 3,
'dropout_p': 0.1,
'num_layers': 6,
'num_heads': 2,
'hidden_channels_ffn': 768, # 4 times the hidden_channels
'input_length': None
},
use_encoder_prenet=True,
num_flow_blocks_dec=12,
kernel_size_dec=5,
dilation_rate=5,
num_block_layers=4,
dropout_p_dec=0.,
num_speakers=0,
c_in_channels=0,
num_splits=4,
num_squeeze=1,
sigmoid_scale=False,
mean_only=False).to(device)
# reference model to compare model weights
model_ref = GlowTts(num_chars=32,
hidden_channels=128,
filter_channels=32,
filter_channels_dp=32,
out_channels=80,
kernel_size=3,
num_heads=2,
num_layers_enc=6,
dropout_p=0.1,
num_flow_blocks_dec=12,
kernel_size_dec=5,
dilation_rate=5,
num_block_layers=4,
dropout_p_dec=0.,
num_speakers=0,
c_in_channels=0,
num_splits=4,
num_sqz=1,
sigmoid_scale=False,
rel_attn_window_size=None,
input_length=None,
mean_only=False,
hidden_channels_enc=None,
hidden_channels_dec=None,
use_encoder_prenet=False,
encoder_type="transformer").to(device)
model_ref = GlowTts(
num_chars=32,
hidden_channels_enc=128,
hidden_channels_dec=128,
hidden_channels_dp=32,
out_channels=80,
encoder_type='rel_pos_transformer',
encoder_params={
'kernel_size': 3,
'dropout_p': 0.1,
'num_layers': 6,
'num_heads': 2,
'hidden_channels_ffn': 768, # 4 times the hidden_channels
'input_length': None
},
use_encoder_prenet=True,
num_flow_blocks_dec=12,
kernel_size_dec=5,
dilation_rate=5,
num_block_layers=4,
dropout_p_dec=0.,
num_speakers=0,
c_in_channels=0,
num_splits=4,
num_squeeze=1,
sigmoid_scale=False,
mean_only=False).to(device)
model.train()
print(" > Num parameters for GlowTTS model:%s" %

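For reference, a standalone sketch that instantiates GlowTts with the new-style arguments shown in the updated test and counts trainable parameters, matching the "Num parameters" print above. The import path is assumed by analogy with the SpeedySpeech test below (TTS.tts.models.*):

# Import path assumed by analogy with the SpeedySpeech test below.
from TTS.tts.models.glow_tts import GlowTts

# Mirror of the updated constructor call in the test above.
model = GlowTts(
    num_chars=32,
    hidden_channels_enc=128,
    hidden_channels_dec=128,
    hidden_channels_dp=32,
    out_channels=80,
    encoder_type='rel_pos_transformer',
    encoder_params={
        'kernel_size': 3,
        'dropout_p': 0.1,
        'num_layers': 6,
        'num_heads': 2,
        'hidden_channels_ffn': 768,
        'input_length': None
    },
    use_encoder_prenet=True,
    num_flow_blocks_dec=12,
    kernel_size_dec=5,
    dilation_rate=5,
    num_block_layers=4,
    dropout_p_dec=0.,
    num_speakers=0,
    c_in_channels=0,
    num_splits=4,
    num_squeeze=1,
    sigmoid_scale=False,
    mean_only=False)

# Generic PyTorch parameter count, corresponding to the "Num parameters" print in the test.
num_params = sum(p.numel() for p in model.parameters() if p.requires_grad)
print(f" > Num parameters for GlowTTS model: {num_params}")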

@ -0,0 +1,168 @@
import torch
from TTS.tts.layers.speedy_speech.encoder import Encoder
from TTS.tts.layers.speedy_speech.decoder import Decoder
from TTS.tts.layers.speedy_speech.duration_predictor import DurationPredictor
from TTS.tts.utils.generic_utils import sequence_mask
from TTS.tts.models.speedy_speech import SpeedySpeech
use_cuda = torch.cuda.is_available()
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
def test_encoder():
input_dummy = torch.rand(8, 14, 37).to(device)
input_lengths = torch.randint(31, 37, (8, )).long().to(device)
input_lengths[-1] = 37
input_mask = torch.unsqueeze(
sequence_mask(input_lengths, input_dummy.size(2)), 1).to(device)
# residual bn conv encoder
layer = Encoder(out_channels=11,
in_hidden_channels=14,
encoder_type='residual_conv_bn').to(device)
output = layer(input_dummy, input_mask)
assert list(output.shape) == [8, 11, 37]
# transformer encoder
layer = Encoder(out_channels=11,
in_hidden_channels=14,
encoder_type='transformer',
encoder_params={
'hidden_channels_ffn': 768,
'num_heads': 2,
"kernel_size": 3,
"dropout_p": 0.1,
"num_layers": 6,
"rel_attn_window_size": 4,
"input_length": None
}).to(device)
output = layer(input_dummy, input_mask)
assert list(output.shape) == [8, 11, 37]
def test_decoder():
input_dummy = torch.rand(8, 128, 37).to(device)
input_lengths = torch.randint(31, 37, (8, )).long().to(device)
input_lengths[-1] = 37
input_mask = torch.unsqueeze(
sequence_mask(input_lengths, input_dummy.size(2)), 1).to(device)
# residual bn conv decoder
layer = Decoder(out_channels=11, in_hidden_channels=128).to(device)
output = layer(input_dummy, input_mask)
assert list(output.shape) == [8, 11, 37]
# transformer decoder
layer = Decoder(out_channels=11,
in_hidden_channels=128,
decoder_type='transformer',
decoder_params={
'hidden_channels_ffn': 128,
'num_heads': 2,
"kernel_size": 3,
"dropout_p": 0.1,
"num_layers": 8,
"rel_attn_window_size": 4,
"input_length": None
}).to(device)
output = layer(input_dummy, input_mask)
assert list(output.shape) == [8, 11, 37]
# wavenet decoder
layer = Decoder(out_channels=11,
in_hidden_channels=128,
decoder_type='wavenet',
decoder_params={
"num_blocks": 12,
"hidden_channels": 192,
"kernel_size": 5,
"dilation_rate": 1,
"num_layers": 4,
"dropout_p": 0.05
}).to(device)
output = layer(input_dummy, input_mask)
assert list(output.shape) == [8, 11, 37]
def test_duration_predictor():
input_dummy = torch.rand(8, 128, 27).to(device)
input_lengths = torch.randint(20, 27, (8, )).long().to(device)
input_lengths[-1] = 27
x_mask = torch.unsqueeze(sequence_mask(input_lengths, input_dummy.size(2)),
1).to(device)
layer = DurationPredictor(hidden_channels=128).to(device)
output = layer(input_dummy, x_mask)
assert list(output.shape) == [8, 1, 27]
def test_speedy_speech():
num_chars = 7
B = 8
T_en = 37
T_de = 74
x_dummy = torch.randint(0, 7, (B, T_en)).long().to(device)
x_lengths = torch.randint(31, T_en, (B, )).long().to(device)
x_lengths[-1] = T_en
# set durations. max total duration should be equal to T_de
durations = torch.randint(1, 4, (B, T_en))
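# rescale the random durations so their total roughly matches T_de (the integer cast loses a few frames),
# then pad the first token's duration below so the total hits T_de exactly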
durations = durations * (T_de / durations.sum(1)).unsqueeze(1)
durations = durations.to(torch.long).to(device)
max_dur = durations.sum(1).max()
durations[:, 0] += T_de - max_dur if T_de > max_dur else 0
y_lengths = durations.sum(1)
model = SpeedySpeech(num_chars, out_channels=80, hidden_channels=128)
if use_cuda:
model.cuda()
# forward pass
o_de, o_dr, attn = model(x_dummy, x_lengths, y_lengths, durations)
assert list(o_de.shape) == [B, 80, T_de], f"{list(o_de.shape)}"
assert list(attn.shape) == [B, T_de, T_en]
assert list(o_dr.shape) == [B, T_en]
# with speaker embedding
model = SpeedySpeech(num_chars,
out_channels=80,
hidden_channels=128,
num_speakers=10,
c_in_channels=256).to(device)
o_de, o_dr, attn = model.forward(x_dummy,
                                 x_lengths,
                                 y_lengths,
                                 durations,
                                 g=torch.randint(0, 10, (B,)).to(device))
assert list(o_de.shape) == [B, 80, T_de], f"{list(o_de.shape)}"
assert list(attn.shape) == [B, T_de, T_en]
assert list(o_dr.shape) == [B, T_en]
# with speaker external embedding
model = SpeedySpeech(num_chars,
out_channels=80,
hidden_channels=128,
num_speakers=10,
external_c=True,
c_in_channels=256).to(device)
o_de, o_dr, attn = model.forward(x_dummy,
                                 x_lengths,
                                 y_lengths,
                                 durations,
                                 g=torch.rand((B, 256)).to(device))
assert list(o_de.shape) == [B, 80, T_de], f"{list(o_de.shape)}"
assert list(attn.shape) == [B, T_de, T_en]
assert list(o_dr.shape) == [B, T_en]
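
For concreteness, a tiny worked example of the duration rescaling used in test_speedy_speech above (numbers are illustrative, and the guard for the overshoot case is omitted):

import torch

T_de = 10
durations = torch.tensor([[1, 3, 2, 1]])                        # raw durations, sum = 7
durations = durations * (T_de / durations.sum(1)).unsqueeze(1)  # scale toward T_de: [[1.43, 4.29, 2.86, 1.43]]
durations = durations.to(torch.long)                            # truncate -> [[1, 4, 2, 1]], sum = 8
durations[:, 0] += T_de - durations.sum(1).max()                # pad first token -> [[3, 4, 2, 1]], sum = 10
assert int(durations.sum()) == T_de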


@ -3,11 +3,11 @@ set -xe
BASEDIR=$(dirname "$0")
echo "$BASEDIR"
# run training
CUDA_VISIBLE_DEVICES="" python TTS/bin/train_tacotron.py --config_path $BASEDIR/inputs/test_train_config.json
CUDA_VISIBLE_DEVICES="" python TTS/bin/train_speedy_speech.py --config_path $BASEDIR/inputs/test_speedy_speech.json
# find the training folder
LATEST_FOLDER=$(ls $BASEDIR/train_outputs/| sort | tail -1)
echo $LATEST_FOLDER
# continue the previous training
CUDA_VISIBLE_DEVICES="" python TTS/bin/train_tacotron.py --continue_path $BASEDIR/train_outputs/$LATEST_FOLDER
CUDA_VISIBLE_DEVICES="" python TTS/bin/train_speedy_speech.py --continue_path $BASEDIR/train_outputs/$LATEST_FOLDER
# remove all the outputs
rm -rf $BASEDIR/train_outputs/


@ -1,14 +1,13 @@
#!/usr/bin/env bash
set -xe
BASEDIR=$(dirname "$0")
echo "$BASEDIR"
# run training
CUDA_VISIBLE_DEVICES="" python TTS/bin/train_tts.py --config_path $BASEDIR/inputs/test_train_config.json
CUDA_VISIBLE_DEVICES="" python TTS/bin/train_tacotron.py --config_path $BASEDIR/inputs/test_train_config.json
# find the training folder
LATEST_FOLDER=$(ls $BASEDIR/train_outputs/| sort | tail -1)
echo $LATEST_FOLDER
# continue the previous training
CUDA_VISIBLE_DEVICES="" python TTS/bin/train_tts.py --continue_path $BASEDIR/train_outputs/$LATEST_FOLDER
CUDA_VISIBLE_DEVICES="" python TTS/bin/train_tacotron.py --continue_path $BASEDIR/train_outputs/$LATEST_FOLDER
# remove all the outputs
rm -rf $BASEDIR/train_outputs/

Some files were not shown because too many files have changed in this diff.