Mirror of https://github.com/coqui-ai/TTS.git

Merge branch 'dev'
commit d320ccf6c6

Changed paths include: .circleci, .travis, TTS/server/templates, TTS/tts/datasets, TTS/tts/tf/layers, TTS/vocoder/models, tests/data/ljspeech/wavs

.circleci/config.yml (new file)
@@ -0,0 +1,57 @@
version: 2
workflows:
  version: 2
  test:
    jobs:
      - test-3.6
      - test-3.7
      - test-3.8

executor: ubuntu-latest

on:
  push:
  pull_request:
    types: [opened, synchronize, reopened]

jobs:
  test-3.6: &test-template
    docker:
      - image: circleci/python:3.6
    resource_class: large
    working_directory: ~/repo
    steps:
      - checkout
      - run: |
          sudo apt update
          sudo apt install espeak git
      # so we can take advantage of pyproject.toml build-dependency support
      - run: python3 -m pip install --upgrade pip
      - run: python3 -m pip install numpy Cython
      - run: |
          python3 setup.py egg_info
          python3 -m pip install -e .
      - run: |
          python3 -m pip install --quiet --upgrade cardboardlint pylint
          cardboardlinter --refspec ${CIRCLE_BRANCH} -n auto
      - run: nosetests tests --nocapture
      - run: |
          ./tests/test_server_package.sh
          ./tests/test_glow-tts_train.sh
          ./tests/test_server_package.sh
          ./tests/test_tacotron_train.sh
          ./tests/test_vocoder_gan_train.sh
          ./tests/test_vocoder_wavegrad_train.sh
          ./tests/test_vocoder_wavernn_train.sh
          ./tests/test_speedy_speech_train.sh

  test-3.7:
    <<: *test-template
    docker:
      - image: circleci/python:3.7

  test-3.8:
    <<: *test-template
    docker:
      - image: circleci/python:3.8

.travis.yml (deleted, 32 lines)
@@ -1,32 +0,0 @@
language: python

git:
  quiet: true

before_install:
  - sudo apt-get update
  - sudo apt-get -y install espeak
  - python -m pip install --upgrade pip
  - pip install six==1.12.0
  - pip install --upgrade cython

matrix:
  include:
    - name: "Lint check"
      python: "3.6"
      install: pip install --quiet --upgrade cardboardlint pylint
      env: TEST_SUITE="lint"
    - name: "Unit tests"
      python: "3.6"
      install:
        - python setup.py egg_info
        - pip install -e .
      env: TEST_SUITE="unittest"
    - name: "Unit tests"
      python: "3.6"
      install:
        - python setup.py egg_info
        - pip install -e .
      env: TEST_SUITE="testscripts"

script: ./.travis/script

.travis/script (deleted, 22 lines)
@@ -1,22 +0,0 @@
#!/bin/bash
set -ex

git remote set-branches --add origin $TRAVIS_BRANCH
git fetch

if [[ ( "$TRAVIS_PULL_REQUEST" != "false" ) && ( "$TEST_SUITE" == "lint" ) ]]; then
    # Run cardboardlinter, in case of pull requests
    cardboardlinter --refspec origin/$TRAVIS_BRANCH -n auto
fi

if [[ "$TEST_SUITE" == "unittest" ]]; then
    nosetests tests --nocapture
    ./tests/test_server_package.sh
fi

if [[ "$TEST_SUITE" == "testscripts" ]]; then
    # test model training scripts
    ./tests/test_tts_train.sh
    ./tests/test_vocoder_gan_train.sh
    ./tests/test_vocoder_wavernn_train.sh
fi

README.md (123 changed lines; removed and added lines, including leftover merge-conflict markers, appear in diff order)
@@ -1,32 +1,63 @@
<p align="center"><img src="https://user-images.githubusercontent.com/1402048/52643646-c2102980-2edd-11e9-8c37-b72f3c89a640.png" width="320" height="95" /></p>
<img src="https://user-images.githubusercontent.com/1402048/104139991-3fd15e00-53af-11eb-8640-3a78a64641dd.png" width="256" height="256" align="right" />

<br/>

# TTS: Text-to-Speech for all.

<<<<<<< HEAD
<p align='center'>
<img src='https://circleci.com/gh/mozilla/TTS/tree/dev.svg?style=svg' alt="mozilla"/>
<a href='https://discourse.mozilla.org/c/tts'><img src="https://img.shields.io/badge/discourse-online-green.svg"/></a>
<a href='https://opensource.org/licenses/MPL-2.0'> <img src="https://img.shields.io/badge/License-MPL%202.0-brightgreen.svg"/></a>
</p>
=======
TTS is a library for advanced Text-to-Speech generation. It's built on the latest research and designed to achieve the best trade-off among ease of training, speed, and quality.
TTS comes with [pretrained models](https://github.com/mozilla/TTS/wiki/Released-Models) and tools for measuring dataset quality, and it is already used in **20+ languages** for products and research projects.
>>>>>>> dev

<br/>

[]()
[](https://opensource.org/licenses/MPL-2.0)

<<<<<<< HEAD
TTS is a deep learning based Text2Speech project, low in cost and high in quality.
=======
:loudspeaker: [English Voice Samples](https://erogol.github.io/ddc-samples/) and [SoundCloud playlist](https://soundcloud.com/user-565970875/pocket-article-wavernn-and-tacotron2)

English Voice Samples: https://erogol.github.io/ddc-samples/
:man_cook: [TTS training recipes](https://github.com/erogol/TTS_recipes)
>>>>>>> dev

TTS training recipes: https://github.com/erogol/TTS_recipes
:page_facing_up: [Text-to-Speech paper collection](https://github.com/erogol/TTS-papers)

TTS paper collection: https://github.com/erogol/TTS-papers

## 💬 Where to ask questions
Please use our dedicated channels for questions and discussion. Help is much more valuable if it's shared publicly, so that more people can benefit from it.

[](https://sourcerer.io/fame/erogol/erogol/TTS/links/0)[](https://sourcerer.io/fame/erogol/erogol/TTS/links/1)[](https://sourcerer.io/fame/erogol/erogol/TTS/links/2)[](https://sourcerer.io/fame/erogol/erogol/TTS/links/3)[](https://sourcerer.io/fame/erogol/erogol/TTS/links/4)[](https://sourcerer.io/fame/erogol/erogol/TTS/links/5)[](https://sourcerer.io/fame/erogol/erogol/TTS/links/6)[](https://sourcerer.io/fame/erogol/erogol/TTS/links/7)

| Type                            | Platforms                              |
| ------------------------------- | -------------------------------------- |
| 🚨 **Bug Reports**              | [GitHub Issue Tracker]                 |
| ❔ **FAQ**                      | [TTS/Wiki](https://github.com/mozilla/TTS/wiki/FAQ) |
| 🎁 **Feature Requests & Ideas** | [GitHub Issue Tracker]                 |
| 👩‍💻 **Usage Questions**          | [Discourse Forum]                      |
| 🗯 **General Discussion**       | [Discourse Forum] and [Matrix Channel] |

## TTS Performance

[github issue tracker]: https://github.com/mozilla/tts/issues
[discourse forum]: https://discourse.mozilla.org/c/tts/
[matrix channel]: https://matrix.to/#/!KTePhNahjgiVumkqca:matrix.org?via=matrix.org
[Tutorials and Examples]: https://github.com/mozilla/TTS/wiki/TTS-Notebooks-and-Tutorials

## 🔗 Links and Resources
| Type                            | Links                                   |
| ------------------------------- | --------------------------------------- |
| 👩🏾‍🏫 **Tutorials and Examples**  | [TTS/Wiki](https://github.com/mozilla/TTS/wiki/TTS-Notebooks-and-Tutorials) |
| 🤖 **Released Models**          | [TTS/Wiki](https://github.com/mozilla/TTS/wiki/Released-Models)|
| 💻 **Docker Image**             | [Repository by @synesthesiam](https://github.com/synesthesiam/docker-mozillatts)|

## 🥇 TTS Performance
<p align="center"><img src="https://discourse-prod-uploads-81679984178418.s3.dualstack.us-west-2.amazonaws.com/optimized/3X/6/4/6428f980e9ec751c248e591460895f7881aec0c6_2_1035x591.png" width="800" /></p>

"Mozilla*" and "Judy*" are our models.
[Details...](https://github.com/mozilla/TTS/wiki/Mean-Opinion-Score-Results)

<<<<<<< HEAD
## Provided Models and Methods
Text-to-Spectrogram:
- Tacotron: [paper](https://arxiv.org/abs/1703.10135)
@@ -52,6 +83,8 @@ Vocoders:

You can also help us implement more models. Some TTS related work can be found [here](https://github.com/erogol/TTS-papers).

=======
>>>>>>> dev
## Features
- High performance Deep Learning models for Text2Speech tasks.
- Text2Spec models (Tacotron, Tacotron2).
@@ -68,26 +101,39 @@ You can also help us implement more models. Some TTS related work can be found [here](https://github.com/erogol/TTS-papers).
- Notebooks for extensive model benchmarking.
- Modular (but not too much) code base enabling easy testing for new ideas.

## Main Requirements and Installation
We highly recommend using [miniconda](https://conda.io/miniconda.html) for easier installation.
* python>=3.6
* pytorch>=1.5.0
* tensorflow>=2.3
* librosa
* tensorboard
* tensorboardX
* matplotlib
* unidecode

## Implemented Models
### Text-to-Spectrogram
- Tacotron: [paper](https://arxiv.org/abs/1703.10135)
- Tacotron2: [paper](https://arxiv.org/abs/1712.05884)
- Glow-TTS: [paper](https://arxiv.org/abs/2005.11129)
- Speedy-Speech: [paper](https://arxiv.org/abs/2008.03802)

Install TTS using ```setup.py```. It installs all of the requirements automatically and makes TTS available to your Python environment as an ordinary Python module.

### Attention Methods
- Guided Attention: [paper](https://arxiv.org/abs/1710.08969)
- Forward Backward Decoding: [paper](https://arxiv.org/abs/1907.09006)
- Graves Attention: [paper](https://arxiv.org/abs/1907.09006)
- Double Decoder Consistency: [blog](https://erogol.com/solving-attention-problems-of-tts-models-with-double-decoder-consistency/)

```python setup.py develop```

### Speaker Encoder
- GE2E: [paper](https://arxiv.org/abs/1710.10467)
- Angular Loss: [paper](https://arxiv.org/pdf/2003.11982.pdf)

Or you can use ```requirements.txt``` to install the requirements only.

### Vocoders
- MelGAN: [paper](https://arxiv.org/abs/1710.10467)
- MultiBandMelGAN: [paper](https://arxiv.org/abs/2005.05106)
- ParallelWaveGAN: [paper](https://arxiv.org/abs/1910.11480)
- GAN-TTS discriminators: [paper](https://arxiv.org/abs/1909.11646)
- WaveRNN: [origin](https://github.com/fatchord/WaveRNN/)
- WaveGrad: [paper](https://arxiv.org/abs/2009.00713)

```pip install -r requirements.txt```

You can also help us implement more models. Some TTS related work can be found [here](https://github.com/erogol/TTS-papers).

### Directory Structure
## Install TTS
TTS supports **python >= 3.6**.

Run ```python setup.py install``` or ```python setup.py develop``` to keep your installation in your working directory.

## Directory Structure
```
|- notebooks/ (Jupyter Notebooks for model evaluation, parameter selection and data analysis.)
|- utils/ (common utilities.)
@@ -108,12 +154,6 @@ Or you can use ```requirements.txt``` to install the requirements only.
    |- (same)
```

### Docker
A docker image is created by [@synesthesiam](https://github.com/synesthesiam) and shared in a separate [repository](https://github.com/synesthesiam/docker-mozillatts) with the latest LJSpeech models.

## Release Models
Please visit [our wiki.](https://github.com/mozilla/TTS/wiki/Released-Models)

## Sample Model Output
Below you can see the Tacotron model state after 16K iterations with batch size 32 on the LJSpeech dataset.

@@ -123,8 +163,11 @@ Audio examples: [soundcloud](https://soundcloud.com/user-565970875/pocket-article-wavernn-and-tacotron2)

<img src="images/example_model_output.png?raw=true" alt="example_output" width="400"/>

<<<<<<< HEAD
## [TTS Tutorials and Notebooks](https://github.com/mozilla/TTS/wiki/TTS-Notebooks-and-Tutorials)

=======
>>>>>>> dev
## Datasets and Data-Loading
TTS provides a generic data loader that is easy to use with your custom dataset.
You just need to write a simple function to format the dataset; check ```datasets/preprocess.py``` for some examples, and see the sketch below.
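As a rough illustration, a minimal formatter might look like the sketch below. The ```my_dataset``` name, the ```wavs/``` folder, and the ```wav_id|transcript``` metafile layout are hypothetical; the returned ```[text, wav_path, speaker_name]``` items follow the pattern used by the existing formatters in ```datasets/preprocess.py```.

```python
import os

def my_dataset(root_path, meta_file):
    """Hypothetical formatter: parse 'wav_id|transcript' lines into TTS items."""
    items = []
    speaker_name = "my_speaker"  # single-speaker dataset assumed
    with open(os.path.join(root_path, meta_file), "r", encoding="utf-8") as f:
        for line in f:
            cols = line.strip().split("|")
            wav_file = os.path.join(root_path, "wavs", cols[0] + ".wav")
            text = cols[1]
            items.append([text, wav_file, speaker_name])
    return items
```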
@@ -139,7 +182,7 @@ Some of the public datasets that we successfully applied TTS:
- [LibriTTS](https://openslr.org/60/)
- [Spanish](https://drive.google.com/file/d/1Sm_zyBo67XHkiFhcRSQ4YaHPYM0slO_e/view?usp=sharing) - thx! @carlfm01

## Training and Fine-tuning LJ-Speech
## Example: Training and Fine-tuning the LJ-Speech Dataset
Here you can find a [Colab](https://gist.github.com/erogol/97516ad65b44dbddb8cd694953187c5b) notebook for a hands-on example of training LJSpeech. Or you can manually follow the guide below.

To start with, split ```metadata.csv``` into train and validation subsets, ```metadata_train.csv``` and ```metadata_val.csv``` respectively. Note that for text-to-speech, validation performance might be misleading, since the loss value does not directly measure the voice quality to the human ear and it also does not measure the attention module performance. Therefore, running the model with new sentences and listening to the results is the best way to go.
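A minimal splitting sketch is shown below; the 90/10 ratio and the fixed random seed are arbitrary choices for illustration, not project defaults.

```python
import random

# Split LJSpeech's metadata.csv into metadata_train.csv / metadata_val.csv.
with open("metadata.csv", "r", encoding="utf-8") as f:
    lines = f.readlines()

random.seed(0)                 # make the split reproducible
random.shuffle(lines)
n_val = int(len(lines) * 0.1)  # hold out ~10% for validation

with open("metadata_val.csv", "w", encoding="utf-8") as f:
    f.writelines(lines[:n_val])
with open("metadata_train.csv", "w", encoding="utf-8") as f:
    f.writelines(lines[n_val:])
```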
@@ -189,11 +232,8 @@ If you like to use TTS to try a new idea and like to share your experiments with
(If you have an idea for better collaboration, let us know)
- Create a new branch.
- Open an issue pointing to your branch.
- Explain your experiment.
- Share your results as you proceed. (Tensorboard log files, audio results, visuals etc.)
- Use the LJSpeech dataset (for English) if you like to compare results with the released models. (It is the most open, scalable dataset for quick experimentation)

## [Contact/Getting Help](https://github.com/mozilla/TTS/wiki/Contact-and-Getting-Help)
- Explain your idea and experiment.
- Share your results regularly. (Tensorboard log files, audio results, visuals etc.)

## Major TODOs
- [x] Implement the model.
@@ -205,17 +245,6 @@ If you like to use TTS to try a new idea and like to share your experiments with
- [x] Multi-speaker embedding.
- [x] Model optimization (model export, model pruning etc.)

<!--## References
- [Efficient Neural Audio Synthesis](https://arxiv.org/pdf/1802.08435.pdf)
- [Attention-Based models for speech recognition](https://arxiv.org/pdf/1506.07503.pdf)
- [Generating Sequences With Recurrent Neural Networks](https://arxiv.org/pdf/1308.0850.pdf)
- [Char2Wav: End-to-End Speech Synthesis](https://openreview.net/pdf?id=B1VWyySKx)
- [VoiceLoop: Voice Fitting and Synthesis via a Phonological Loop](https://arxiv.org/pdf/1707.06588.pdf)
- [WaveRNN](https://arxiv.org/pdf/1802.08435.pdf)
- [Faster WaveNet](https://arxiv.org/abs/1611.09482)
- [Parallel WaveNet](https://arxiv.org/abs/1711.10433)
-->

### Acknowledgement
- https://github.com/keithito/tacotron (Dataset pre-processing)
- https://github.com/r9y9/tacotron_pytorch (Initial Tacotron architecture)

TTS/bin/compute_attention_masks.py (new file, 166 lines)
@@ -0,0 +1,166 @@
import argparse
import importlib
import os

import numpy as np
import torch
from torch.utils.data import DataLoader
from tqdm import tqdm
from argparse import RawTextHelpFormatter
from TTS.tts.datasets.TTSDataset import MyDataset
from TTS.tts.utils.generic_utils import setup_model
from TTS.tts.utils.io import load_checkpoint
from TTS.tts.utils.text.symbols import make_symbols, phonemes, symbols
from TTS.utils.audio import AudioProcessor
from TTS.utils.io import load_config


if __name__ == '__main__':
    parser = argparse.ArgumentParser(
        description='''Extract attention masks from trained Tacotron/Tacotron2 models.
These masks can be used for different purposes including training a TTS model with a Duration Predictor.\n\n'''

        '''Each attention mask is written to the same path as the input wav file with ".npy" file extension.
(e.g. path/bla.wav (wav file) --> path/bla.npy (attention mask))\n'''

        '''
Example run:
    CUDA_VISIBLE_DEVICE="0" python TTS/bin/compute_attention_masks.py
        --model_path /data/rw/home/Models/ljspeech-dcattn-December-14-2020_11+10AM-9d0e8c7/checkpoint_200000.pth.tar
        --config_path /data/rw/home/Models/ljspeech-dcattn-December-14-2020_11+10AM-9d0e8c7/config.json
        --dataset_metafile /root/LJSpeech-1.1/metadata.csv
        --data_path /root/LJSpeech-1.1/
        --batch_size 32
        --dataset ljspeech
        --use_cuda True
''',
        formatter_class=RawTextHelpFormatter)
    parser.add_argument('--model_path',
                        type=str,
                        required=True,
                        help='Path to Tacotron/Tacotron2 model file.')
    parser.add_argument(
        '--config_path',
        type=str,
        required=True,
        help='Path to Tacotron/Tacotron2 config file.',
    )
    parser.add_argument('--dataset',
                        type=str,
                        default='',
                        required=True,
                        help='Target dataset processor name from TTS.tts.dataset.preprocess.')

    parser.add_argument(
        '--dataset_metafile',
        type=str,
        default='',
        required=True,
        help='Dataset metafile including file paths with transcripts.')
    parser.add_argument(
        '--data_path',
        type=str,
        default='',
        help='Defines the data path. It overwrites config.json.')
    parser.add_argument('--use_cuda',
                        type=bool,
                        default=False,
                        help="enable/disable cuda.")

    parser.add_argument(
        '--batch_size',
        default=16,
        type=int,
        help='Batch size for the model. Use batch_size=1 if you have no CUDA.')
    args = parser.parse_args()

    C = load_config(args.config_path)
    ap = AudioProcessor(**C.audio)

    # if the vocabulary was passed, replace the default
    if 'characters' in C.keys():
        symbols, phonemes = make_symbols(**C.characters)

    # load the model
    num_chars = len(phonemes) if C.use_phonemes else len(symbols)
    # TODO: handle multi-speaker
    model = setup_model(num_chars, num_speakers=0, c=C)
    model, _ = load_checkpoint(model, args.model_path, None, args.use_cuda)
    model.eval()

    # data loader
    preprocessor = importlib.import_module('TTS.tts.datasets.preprocess')
    preprocessor = getattr(preprocessor, args.dataset)
    meta_data = preprocessor(args.data_path, args.dataset_metafile)
    dataset = MyDataset(model.decoder.r,
                        C.text_cleaner,
                        compute_linear_spec=False,
                        ap=ap,
                        meta_data=meta_data,
                        tp=C.characters if 'characters' in C.keys() else None,
                        add_blank=C['add_blank'] if 'add_blank' in C.keys() else False,
                        use_phonemes=C.use_phonemes,
                        phoneme_cache_path=C.phoneme_cache_path,
                        phoneme_language=C.phoneme_language,
                        enable_eos_bos=C.enable_eos_bos_chars)

    dataset.sort_items()
    loader = DataLoader(dataset,
                        batch_size=args.batch_size,
                        num_workers=4,
                        collate_fn=dataset.collate_fn,
                        shuffle=False,
                        drop_last=False)

    # compute attentions
    file_paths = []
    with torch.no_grad():
        for data in tqdm(loader):
            # setup input data
            text_input = data[0]
            text_lengths = data[1]
            linear_input = data[3]
            mel_input = data[4]
            mel_lengths = data[5]
            stop_targets = data[6]
            item_idxs = data[7]

            # dispatch data to GPU
            if args.use_cuda:
                text_input = text_input.cuda()
                text_lengths = text_lengths.cuda()
                mel_input = mel_input.cuda()
                mel_lengths = mel_lengths.cuda()

            mel_outputs, postnet_outputs, alignments, stop_tokens = model.forward(
                text_input, text_lengths, mel_input)

            alignments = alignments.detach()
            for idx, alignment in enumerate(alignments):
                item_idx = item_idxs[idx]
                # interpolate if r > 1
                alignment = torch.nn.functional.interpolate(
                    alignment.transpose(0, 1).unsqueeze(0),
                    size=None,
                    scale_factor=model.decoder.r,
                    mode='nearest',
                    align_corners=None,
                    recompute_scale_factor=None).squeeze(0).transpose(0, 1)
                # remove paddings
                alignment = alignment[:mel_lengths[idx], :text_lengths[idx]].cpu().numpy()
                # set file paths
                wav_file_name = os.path.basename(item_idx)
                align_file_name = os.path.splitext(wav_file_name)[0] + '.npy'
                file_path = item_idx.replace(wav_file_name, align_file_name)
                # save output
                file_paths.append([item_idx, file_path])
                np.save(file_path, alignment)

        # output metafile
        metafile = os.path.join(args.data_path, "metadata_attn_mask.txt")

        with open(metafile, "w") as f:
            for p in file_paths:
                f.write(f"{p[0]}|{p[1]}\n")
        print(f" >> Metafile created: {metafile}")

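For context, a minimal sketch of consuming this script's output is shown below. It only assumes the ```metadata_attn_mask.txt``` format written above (one ```wav_path|npy_path``` pair per line) and that each ```.npy``` file holds a ```[mel_frames, text_length]``` alignment matrix; the duration estimate at the end is purely illustrative.

```python
import numpy as np

# Read the metafile produced by compute_attention_masks.py and load one mask.
with open("metadata_attn_mask.txt", "r", encoding="utf-8") as f:
    pairs = [line.strip().split("|") for line in f if line.strip()]

wav_path, mask_path = pairs[0]
alignment = np.load(mask_path)   # shape: [mel_frames, text_length]
print(wav_path, alignment.shape)

# Rough per-character duration estimate: count how many mel frames
# attend most strongly to each input character (illustrative only).
frame_to_char = alignment.argmax(axis=1)
durations = np.bincount(frame_to_char, minlength=alignment.shape[1])
```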
Deleted file (65 lines): multi-GPU training launcher that re-runs train_tts.py once per GPU
@@ -1,65 +0,0 @@
#!/usr/bin/env python3
# -*- coding: utf-8 -*-

import os
import sys
import pathlib
import time
import subprocess
import argparse
import torch


def main():
    """
    Call train.py as a new process and pass command arguments
    """
    parser = argparse.ArgumentParser()
    parser.add_argument(
        '--continue_path',
        type=str,
        help='Training output folder to continue training. Use to continue a training. If it is used, "config_path" is ignored.',
        default='',
        required='--config_path' not in sys.argv)
    parser.add_argument(
        '--restore_path',
        type=str,
        help='Model file to be restored. Use to finetune a model.',
        default='')
    parser.add_argument(
        '--config_path',
        type=str,
        help='Path to config file for training.',
        required='--continue_path' not in sys.argv)
    args = parser.parse_args()

    num_gpus = torch.cuda.device_count()
    group_id = time.strftime("%Y_%m_%d-%H%M%S")

    # set arguments for train.py
    folder_path = pathlib.Path(__file__).parent.absolute()
    command = [os.path.join(folder_path, 'train_tts.py')]
    command.append('--continue_path={}'.format(args.continue_path))
    command.append('--restore_path={}'.format(args.restore_path))
    command.append('--config_path={}'.format(args.config_path))
    command.append('--group_id=group_{}'.format(group_id))
    command.append('')

    # run processes
    processes = []
    for i in range(num_gpus):
        my_env = os.environ.copy()
        my_env["PYTHON_EGG_CACHE"] = "/tmp/tmp{}".format(i)
        command[-1] = '--rank={}'.format(i)
        stdout = None if i == 0 else open(os.devnull, 'w')
        p = subprocess.Popen(['python3'] + command, stdout=stdout, env=my_env)
        processes.append(p)
        print(command)

    for p in processes:
        p.wait()


if __name__ == '__main__':
    main()

Modified file: synthesis CLI script (removed and added lines appear in diff order)
@@ -9,6 +9,7 @@ import string
import time

import torch
import numpy as np

from TTS.tts.utils.generic_utils import setup_model, is_tacotron
from TTS.tts.utils.synthesis import synthesis

@@ -21,10 +22,31 @@ from TTS.vocoder.utils.generic_utils import setup_generator
def tts(model, vocoder_model, text, CONFIG, use_cuda, ap, use_gl, speaker_fileid, speaker_embedding=None, gst_style=None):
    t_1 = time.time()
    waveform, _, _, mel_postnet_spec, _, _ = synthesis(model, text, CONFIG, use_cuda, ap, speaker_fileid, gst_style, False, CONFIG.enable_eos_bos_chars, use_gl, speaker_embedding=speaker_embedding)

    # grab the spectrogram (thanks to the folks at Mozilla Discourse for the code snippet)
    if args.save_spectogram:
        spec_file_name = args.text.replace(" ", "_")[0:10]
        spec_file_name = spec_file_name.translate(
            str.maketrans('', '', string.punctuation.replace('_', ''))) + '.npy'
        spec_file_name = os.path.join(args.out_path, spec_file_name)
        spectrogram = torch.FloatTensor(mel_postnet_spec.T)
        spectrogram = spectrogram.unsqueeze(0)
        np.save(spec_file_name, spectrogram)
        print(" > Saving raw spectogram to " + spec_file_name)

    if CONFIG.model == "Tacotron" and not use_gl:
        mel_postnet_spec = ap.out_linear_to_mel(mel_postnet_spec.T).T
    if not use_gl:
        waveform = vocoder_model.inference(torch.FloatTensor(mel_postnet_spec.T).unsqueeze(0))
        # Use if the noise schedule was not computed with tune_wavegrad
        beta = np.linspace(1e-6, 0.01, 50)
        vocoder_model.compute_noise_level(beta)

        # Use the alternative below when using an output npy file from tune_wavegrad
        # beta = np.load("output-tune-wavegrad.npy", allow_pickle=True).item()
        # vocoder_model.compute_noise_level(beta['beta'])

        device_type = "cuda" if use_cuda else "cpu"
        waveform = vocoder_model.inference(torch.FloatTensor(mel_postnet_spec.T).to(device_type).unsqueeze(0))
    if use_cuda and not use_gl:
        waveform = waveform.cpu()
    if not use_gl:

@@ -88,6 +110,11 @@ if __name__ == "__main__":
        '--gst_style',
        help="Wav path file for GST style reference.",
        default=None)
    parser.add_argument(
        '--save_spectogram',
        type=bool,
        help="If true, save the raw spectrogram for further (vocoder) processing in out_path.",
        default=False)

    args = parser.parse_args()

@@ -170,7 +197,7 @@ if __name__ == "__main__":
    wav = tts(model, vocoder_model, args.text, C, args.use_cuda, ap, use_griffin_lim, args.speaker_fileid, speaker_embedding=speaker_embedding, gst_style=gst_style)

    # save the results
    file_name = args.text.replace(" ", "_")
    file_name = args.text.replace(" ", "_")[0:10]
    file_name = file_name.translate(
        str.maketrans('', '', string.punctuation.replace('_', ''))) + '.wav'
    out_path = os.path.join(args.out_path, file_name)

Modified file: speaker encoder training script (removed and added lines appear in diff order)
@@ -13,15 +13,14 @@ from TTS.speaker_encoder.dataset import MyDataset
from TTS.speaker_encoder.losses import AngleProtoLoss, GE2ELoss
from TTS.speaker_encoder.model import SpeakerEncoder
from TTS.speaker_encoder.utils.generic_utils import \
    check_config_speaker_encoder
    check_config_speaker_encoder, save_best_model
from TTS.speaker_encoder.utils.visual import plot_embeddings
from TTS.tts.datasets.preprocess import load_meta_data
from TTS.tts.utils.io import save_best_model
from TTS.utils.audio import AudioProcessor
from TTS.utils.generic_utils import (count_parameters,
                                     create_experiment_folder, get_git_branch,
                                     remove_experiment_folder, set_init_dict)
from TTS.utils.io import copy_config_file, load_config
from TTS.utils.io import copy_model_files, load_config
from TTS.utils.radam import RAdam
from TTS.utils.tensorboard_logger import TensorboardLogger
from TTS.utils.training import NoamLR, check_update

@@ -255,7 +254,7 @@ if __name__ == '__main__':
    if args.restore_path:
        new_fields["restore_path"] = args.restore_path
    new_fields["github_branch"] = get_git_branch()
    copy_config_file(args.config_path, os.path.join(OUT_PATH, 'config.json'),
    copy_model_files(c, args.config_path, OUT_PATH,
                     new_fields)

    LOG_DIR = OUT_PATH

Modified file: Glow-TTS training script (removed and added lines appear in diff order)
@@ -7,41 +7,37 @@ import os
import sys
import time
import traceback
from random import randrange

import torch
from random import randrange
# DISTRIBUTED
from torch.nn.parallel import DistributedDataParallel as DDP_th
from torch.utils.data import DataLoader

from torch.utils.data.distributed import DistributedSampler
from TTS.tts.datasets.preprocess import load_meta_data
from TTS.tts.datasets.TTSDataset import MyDataset
from TTS.tts.layers.losses import GlowTTSLoss
from TTS.tts.utils.generic_utils import setup_model, check_config_tts
from TTS.tts.utils.generic_utils import check_config_tts, setup_model
from TTS.tts.utils.io import save_best_model, save_checkpoint
from TTS.tts.utils.measures import alignment_diagonal_score
from TTS.tts.utils.speakers import parse_speakers, load_speaker_mapping
from TTS.tts.utils.speakers import parse_speakers
from TTS.tts.utils.synthesis import synthesis
from TTS.tts.utils.text.symbols import make_symbols, phonemes, symbols
from TTS.tts.utils.visual import plot_alignment, plot_spectrogram
from TTS.utils.audio import AudioProcessor
from TTS.utils.console_logger import ConsoleLogger
from TTS.utils.distribute import init_distributed, reduce_tensor
from TTS.utils.generic_utils import (KeepAverage, count_parameters,
                                     create_experiment_folder, get_git_branch,
                                     remove_experiment_folder, set_init_dict)
from TTS.utils.io import copy_config_file, load_config
from TTS.utils.io import copy_model_files, load_config
from TTS.utils.radam import RAdam
from TTS.utils.tensorboard_logger import TensorboardLogger
from TTS.utils.training import (NoamLR, check_update,
                                setup_torch_training_env)

# DISTRIBUTED
from torch.nn.parallel import DistributedDataParallel as DDP_th
from torch.utils.data.distributed import DistributedSampler
from TTS.utils.distribute import init_distributed, reduce_tensor

from TTS.utils.training import NoamLR, setup_torch_training_env

use_cuda, num_gpus = setup_torch_training_env(True, False)

def setup_loader(ap, r, is_val=False, verbose=False, speaker_mapping=None):
def setup_loader(ap, r, is_val=False, verbose=False):
    if is_val and not c.run_eval:
        loader = None
    else:

@@ -61,8 +57,15 @@ def setup_loader(ap, r, is_val=False, verbose=False, speaker_mapping=None):
            use_phonemes=c.use_phonemes,
            phoneme_language=c.phoneme_language,
            enable_eos_bos=c.enable_eos_bos_chars,
            use_noise_augment=c['use_noise_augment'] and not is_val,
            verbose=verbose,
            speaker_mapping=speaker_mapping if c.use_speaker_embedding and c.use_external_speaker_embedding_file else None)

        if c.use_phonemes and c.compute_input_seq_cache:
            # precompute phonemes to have a better estimate of sequence lengths.
            dataset.compute_input_seq(c.num_loader_workers)
        dataset.sort_items()

        sampler = DistributedSampler(dataset) if num_gpus > 1 else None
        loader = DataLoader(
            dataset,

@@ -78,29 +81,29 @@ def setup_loader(ap, r, is_val=False, verbose=False, speaker_mapping=None):


def format_data(data):
    if c.use_speaker_embedding:
        speaker_mapping = load_speaker_mapping(OUT_PATH)

    # setup input data
    text_input = data[0]
    text_lengths = data[1]
    speaker_names = data[2]
    mel_input = data[4].permute(0, 2, 1)  # B x D x T
    mel_lengths = data[5]
    attn_mask = data[8]
    item_idx = data[7]
    attn_mask = data[9]
    avg_text_length = torch.mean(text_lengths.float())
    avg_spec_length = torch.mean(mel_lengths.float())

    if c.use_speaker_embedding:
        if c.use_external_speaker_embedding_file:
            speaker_ids = data[8]
            # return precomputed embedding vector
            speaker_c = data[8]
        else:
            speaker_ids = [
            # return speaker_id to be used by an embedding layer
            speaker_c = [
                speaker_mapping[speaker_name] for speaker_name in speaker_names
            ]
            speaker_ids = torch.LongTensor(speaker_ids)
            speaker_c = torch.LongTensor(speaker_c)
    else:
        speaker_ids = None
        speaker_c = None

    # dispatch data to GPU
    if use_cuda:

@@ -108,15 +111,15 @@ def format_data(data):
        text_lengths = text_lengths.cuda(non_blocking=True)
        mel_input = mel_input.cuda(non_blocking=True)
        mel_lengths = mel_lengths.cuda(non_blocking=True)
        if speaker_ids is not None:
            speaker_ids = speaker_ids.cuda(non_blocking=True)
        if speaker_c is not None:
            speaker_c = speaker_c.cuda(non_blocking=True)
        if attn_mask is not None:
            attn_mask = attn_mask.cuda(non_blocking=True)
    return text_input, text_lengths, mel_input, mel_lengths, speaker_ids,\
        avg_text_length, avg_spec_length, attn_mask
    return text_input, text_lengths, mel_input, mel_lengths, speaker_c,\
        avg_text_length, avg_spec_length, attn_mask, item_idx


def data_depended_init(model, ap, speaker_mapping=None):
def data_depended_init(data_loader, model, ap):
    """Data depended initialization for activation normalization."""
    if hasattr(model, 'module'):
        for f in model.module.decoder.flows:

@@ -127,20 +130,22 @@ def data_depended_init(model, ap, speaker_mapping=None):
            if getattr(f, "set_ddi", False):
                f.set_ddi(True)

    data_loader = setup_loader(ap, 1, is_val=False, speaker_mapping=speaker_mapping)
    model.train()
    print(" > Data depended initialization ... ")
    num_iter = 0
    with torch.no_grad():
        for _, data in enumerate(data_loader):

            # format data
            text_input, text_lengths, mel_input, mel_lengths, speaker_ids,\
                _, _, attn_mask = format_data(data)
            text_input, text_lengths, mel_input, mel_lengths, spekaer_embed,\
                _, _, attn_mask, item_idx = format_data(data)

            # forward pass model
            _ = model.forward(
                text_input, text_lengths, mel_input, mel_lengths, attn_mask, g=speaker_ids)
            break
                text_input, text_lengths, mel_input, mel_lengths, attn_mask, g=spekaer_embed)
            if num_iter == c.data_dep_init_iter:
                break
            num_iter += 1

    if hasattr(model, 'module'):
        for f in model.module.decoder.flows:

@@ -153,10 +158,9 @@ def data_depended_init(model, ap, speaker_mapping=None):
    return model


def train(model, criterion, optimizer, scheduler,
          ap, global_step, epoch, speaker_mapping=None):
    data_loader = setup_loader(ap, 1, is_val=False,
                               verbose=(epoch == 0), speaker_mapping=speaker_mapping)
def train(data_loader, model, criterion, optimizer, scheduler,
          ap, global_step, epoch):

    model.train()
    epoch_time = 0
    keep_avg = KeepAverage()

@@ -172,8 +176,8 @@ def train(data_loader, model, criterion, optimizer, scheduler,
        start_time = time.time()

        # format data
        text_input, text_lengths, mel_input, mel_lengths, speaker_ids,\
            avg_text_length, avg_spec_length, attn_mask = format_data(data)
        text_input, text_lengths, mel_input, mel_lengths, speaker_c,\
            avg_text_length, avg_spec_length, attn_mask, item_idx = format_data(data)

        loader_time = time.time() - end_time

@@ -183,7 +187,7 @@ def train(data_loader, model, criterion, optimizer, scheduler,
        # forward pass model
        with torch.cuda.amp.autocast(enabled=c.mixed_precision):
            z, logdet, y_mean, y_log_scale, alignments, o_dur_log, o_total_dur = model.forward(
                text_input, text_lengths, mel_input, mel_lengths, attn_mask, g=speaker_ids)
                text_input, text_lengths, mel_input, mel_lengths, attn_mask, g=speaker_c)

            # compute loss
            loss_dict = criterion(z, y_mean, y_log_scale, logdet, mel_lengths,

@@ -203,10 +207,6 @@ def train(data_loader, model, criterion, optimizer, scheduler,
                                                       c.grad_clip)
            optimizer.step()


        grad_norm, _ = check_update(model, c.grad_clip, ignore_stopnet=True)
        optimizer.step()

        # setup lr
        if c.noam_schedule:
            scheduler.step()

@@ -215,7 +215,7 @@ def train(data_loader, model, criterion, optimizer, scheduler,
        current_lr = optimizer.param_groups[0]['lr']

        # compute alignment error (the lower the better)
        align_error = 1 - alignment_diagonal_score(alignments)
        align_error = 1 - alignment_diagonal_score(alignments, binary=True)
        loss_dict['align_error'] = align_error

        step_time = time.time() - start_time

@@ -274,10 +274,18 @@ def train(data_loader, model, criterion, optimizer, scheduler,
                save_checkpoint(model, optimizer, global_step, epoch, 1, OUT_PATH,
                                model_loss=loss_dict['loss'])

            # wait all kernels to be completed
            torch.cuda.synchronize()

            # Diagnostic visualizations
            # direct pass on model for spec predictions
            target_speaker = None if speaker_ids is None else speaker_ids[:1]
            spec_pred, *_ = model.inference(text_input[:1], text_lengths[:1], g=target_speaker)
            target_speaker = None if speaker_c is None else speaker_c[:1]

            if hasattr(model, 'module'):
                spec_pred, *_ = model.module.inference(text_input[:1], text_lengths[:1], g=target_speaker)
            else:
                spec_pred, *_ = model.inference(text_input[:1], text_lengths[:1], g=target_speaker)

            spec_pred = spec_pred.permute(0, 2, 1)
            gt_spec = mel_input.permute(0, 2, 1)
            const_spec = spec_pred[0].data.cpu().numpy()

@@ -313,8 +321,7 @@ def train(data_loader, model, criterion, optimizer, scheduler,


@torch.no_grad()
def evaluate(model, criterion, ap, global_step, epoch, speaker_mapping):
    data_loader = setup_loader(ap, 1, is_val=True, speaker_mapping=speaker_mapping)
def evaluate(data_loader, model, criterion, ap, global_step, epoch):
    model.eval()
    epoch_time = 0
    keep_avg = KeepAverage()

@@ -324,12 +331,12 @@ def evaluate(data_loader, model, criterion, ap, global_step, epoch):
            start_time = time.time()

            # format data
            text_input, text_lengths, mel_input, mel_lengths, speaker_ids,\
                _, _, attn_mask = format_data(data)
            text_input, text_lengths, mel_input, mel_lengths, speaker_c,\
                _, _, attn_mask, item_idx = format_data(data)

            # forward pass model
            z, logdet, y_mean, y_log_scale, alignments, o_dur_log, o_total_dur = model.forward(
                text_input, text_lengths, mel_input, mel_lengths, attn_mask, g=speaker_ids)
                text_input, text_lengths, mel_input, mel_lengths, attn_mask, g=speaker_c)

            # compute loss
            loss_dict = criterion(z, y_mean, y_log_scale, logdet, mel_lengths,

@@ -370,7 +377,7 @@ def evaluate(data_loader, model, criterion, ap, global_step, epoch):
    if args.rank == 0:
        # Diagnostic visualizations
        # direct pass on model for spec predictions
        target_speaker = None if speaker_ids is None else speaker_ids[:1]
        target_speaker = None if speaker_c is None else speaker_c[:1]
        if hasattr(model, 'module'):
            spec_pred, *_ = model.module.inference(text_input[:1], text_lengths[:1], g=target_speaker)
        else:

@@ -464,7 +471,7 @@ def evaluate(data_loader, model, criterion, ap, global_step, epoch):
# FIXME: move args definition/parsing inside of main?
def main(args):  # pylint: disable=redefined-outer-name
    # pylint: disable=global-variable-undefined
    global meta_data_train, meta_data_eval, symbols, phonemes
    global meta_data_train, meta_data_eval, symbols, phonemes, speaker_mapping
    # Audio processor
    ap = AudioProcessor(**c.audio)
    if 'characters' in c.keys():

@@ -538,14 +545,18 @@ def main(args):  # pylint: disable=redefined-outer-name
    if 'best_loss' not in locals():
        best_loss = float('inf')

    # define dataloaders
    train_loader = setup_loader(ap, 1, is_val=False, verbose=True)
    eval_loader = setup_loader(ap, 1, is_val=True, verbose=True)

    global_step = args.restore_step
    model = data_depended_init(model, ap, speaker_mapping)
    model = data_depended_init(train_loader, model, ap)
    for epoch in range(0, c.epochs):
        c_logger.print_epoch_start(epoch, c.epochs)
        train_avg_loss_dict, global_step = train(model, criterion, optimizer,
        train_avg_loss_dict, global_step = train(train_loader, model, criterion, optimizer,
                                                 scheduler, ap, global_step,
                                                 epoch, speaker_mapping)
                                                 epoch)
        eval_avg_loss_dict = evaluate(model, criterion, ap, global_step, epoch, speaker_mapping=speaker_mapping)
        eval_avg_loss_dict = evaluate(eval_loader, model, criterion, ap, global_step, epoch)
        c_logger.print_epoch_end(epoch, eval_avg_loss_dict)
        target_loss = train_avg_loss_dict['avg_loss']
        if c.run_eval:

@@ -621,8 +632,8 @@ if __name__ == '__main__':
    if args.restore_path:
        new_fields["restore_path"] = args.restore_path
    new_fields["github_branch"] = get_git_branch()
    copy_config_file(args.config_path,
                     os.path.join(OUT_PATH, 'config.json'), new_fields)
    copy_model_files(c, args.config_path,
                     OUT_PATH, new_fields)
    os.chmod(AUDIO_PATH, 0o775)
    os.chmod(OUT_PATH, 0o775)

@ -0,0 +1,618 @@
|
|||
#!/usr/bin/env python3
|
||||
# -*- coding: utf-8 -*-
|
||||
|
||||
import argparse
|
||||
import glob
|
||||
import os
|
||||
import sys
|
||||
import time
|
||||
import traceback
|
||||
import numpy as np
|
||||
from random import randrange
|
||||
|
||||
import torch
|
||||
# DISTRIBUTED
|
||||
from torch.nn.parallel import DistributedDataParallel as DDP_th
|
||||
from torch.utils.data import DataLoader
|
||||
from torch.utils.data.distributed import DistributedSampler
|
||||
from TTS.tts.datasets.preprocess import load_meta_data
|
||||
from TTS.tts.datasets.TTSDataset import MyDataset
|
||||
from TTS.tts.layers.losses import SpeedySpeechLoss
|
||||
from TTS.tts.utils.generic_utils import check_config_tts, setup_model
|
||||
from TTS.tts.utils.io import save_best_model, save_checkpoint
|
||||
from TTS.tts.utils.measures import alignment_diagonal_score
|
||||
from TTS.tts.utils.speakers import parse_speakers
|
||||
from TTS.tts.utils.synthesis import synthesis
|
||||
from TTS.tts.utils.text.symbols import make_symbols, phonemes, symbols
|
||||
from TTS.tts.utils.visual import plot_alignment, plot_spectrogram
|
||||
from TTS.utils.audio import AudioProcessor
|
||||
from TTS.utils.console_logger import ConsoleLogger
|
||||
from TTS.utils.distribute import init_distributed, reduce_tensor
|
||||
from TTS.utils.generic_utils import (KeepAverage, count_parameters,
|
||||
create_experiment_folder, get_git_branch,
|
||||
remove_experiment_folder, set_init_dict)
|
||||
from TTS.utils.io import copy_model_files, load_config
|
||||
from TTS.utils.radam import RAdam
|
||||
from TTS.utils.tensorboard_logger import TensorboardLogger
|
||||
from TTS.utils.training import NoamLR, setup_torch_training_env
|
||||
|
||||
use_cuda, num_gpus = setup_torch_training_env(True, False)
|
||||
|
||||
|
||||
def setup_loader(ap, r, is_val=False, verbose=False):
|
||||
if is_val and not c.run_eval:
|
||||
loader = None
|
||||
else:
|
||||
dataset = MyDataset(
|
||||
r,
|
||||
c.text_cleaner,
|
||||
compute_linear_spec=False,
|
||||
meta_data=meta_data_eval if is_val else meta_data_train,
|
||||
ap=ap,
|
||||
tp=c.characters if 'characters' in c.keys() else None,
|
||||
add_blank=c['add_blank'] if 'add_blank' in c.keys() else False,
|
||||
batch_group_size=0 if is_val else c.batch_group_size *
|
||||
c.batch_size,
|
||||
min_seq_len=c.min_seq_len,
|
||||
max_seq_len=c.max_seq_len,
|
||||
phoneme_cache_path=c.phoneme_cache_path,
|
||||
use_phonemes=c.use_phonemes,
|
||||
phoneme_language=c.phoneme_language,
|
||||
enable_eos_bos=c.enable_eos_bos_chars,
|
||||
use_noise_augment=not is_val,
|
||||
verbose=verbose,
|
||||
speaker_mapping=speaker_mapping if c.use_speaker_embedding and c.use_external_speaker_embedding_file else None)
|
||||
|
||||
if c.use_phonemes and c.compute_input_seq_cache:
|
||||
# precompute phonemes to have a better estimate of sequence lengths.
|
||||
dataset.compute_input_seq(c.num_loader_workers)
|
||||
dataset.sort_items()
|
||||
|
||||
sampler = DistributedSampler(dataset) if num_gpus > 1 else None
|
||||
loader = DataLoader(
|
||||
dataset,
|
||||
batch_size=c.eval_batch_size if is_val else c.batch_size,
|
||||
shuffle=False,
|
||||
collate_fn=dataset.collate_fn,
|
||||
drop_last=False,
|
||||
sampler=sampler,
|
||||
num_workers=c.num_val_loader_workers
|
||||
if is_val else c.num_loader_workers,
|
||||
pin_memory=False)
|
||||
return loader
|
||||
|
||||
|
||||
def format_data(data):
|
||||
# setup input data
|
||||
text_input = data[0]
|
||||
text_lengths = data[1]
|
||||
speaker_names = data[2]
|
||||
mel_input = data[4].permute(0, 2, 1) # B x D x T
|
||||
mel_lengths = data[5]
|
||||
item_idx = data[7]
|
||||
attn_mask = data[9]
|
||||
avg_text_length = torch.mean(text_lengths.float())
|
||||
avg_spec_length = torch.mean(mel_lengths.float())
|
||||
|
||||
if c.use_speaker_embedding:
|
||||
if c.use_external_speaker_embedding_file:
|
||||
# return precomputed embedding vector
|
||||
speaker_c = data[8]
|
||||
else:
|
||||
# return speaker_id to be used by an embedding layer
|
||||
speaker_c = [
|
||||
speaker_mapping[speaker_name] for speaker_name in speaker_names
|
||||
]
|
||||
speaker_c = torch.LongTensor(speaker_c)
|
||||
else:
|
||||
speaker_c = None
|
||||
# compute durations from attention mask
|
||||
durations = torch.zeros(attn_mask.shape[0], attn_mask.shape[2])
|
||||
for idx, am in enumerate(attn_mask):
|
||||
# compute raw durations
|
||||
c_idxs = am[:, :text_lengths[idx], :mel_lengths[idx]].max(1)[1]
|
||||
# c_idxs, counts = torch.unique_consecutive(c_idxs, return_counts=True)
|
||||
c_idxs, counts = torch.unique(c_idxs, return_counts=True)
|
||||
dur = torch.ones([text_lengths[idx]]).to(counts.dtype)
|
||||
dur[c_idxs] = counts
|
||||
# smooth the durations and set any 0 duration to 1
|
||||
# by cutting off from the largest duration indeces.
|
||||
extra_frames = dur.sum() - mel_lengths[idx]
|
||||
largest_idxs = torch.argsort(-dur)[:extra_frames]
|
||||
dur[largest_idxs] -= 1
|
||||
assert dur.sum() == mel_lengths[idx], f" [!] total duration {dur.sum()} vs spectrogram length {mel_lengths[idx]}"
|
||||
durations[idx, :text_lengths[idx]] = dur
|
||||
# dispatch data to GPU
|
||||
if use_cuda:
|
||||
text_input = text_input.cuda(non_blocking=True)
|
||||
text_lengths = text_lengths.cuda(non_blocking=True)
|
||||
mel_input = mel_input.cuda(non_blocking=True)
|
||||
mel_lengths = mel_lengths.cuda(non_blocking=True)
|
||||
if speaker_c is not None:
|
||||
speaker_c = speaker_c.cuda(non_blocking=True)
|
||||
attn_mask = attn_mask.cuda(non_blocking=True)
|
||||
durations = durations.cuda(non_blocking=True)
|
||||
return text_input, text_lengths, mel_input, mel_lengths, speaker_c,\
|
||||
avg_text_length, avg_spec_length, attn_mask, durations, item_idx
|
||||
|
||||
|
||||
def train(data_loader, model, criterion, optimizer, scheduler,
|
||||
ap, global_step, epoch):
|
||||
|
||||
model.train()
|
||||
epoch_time = 0
|
||||
keep_avg = KeepAverage()
|
||||
if use_cuda:
|
||||
batch_n_iter = int(
|
||||
len(data_loader.dataset) / (c.batch_size * num_gpus))
|
||||
else:
|
||||
batch_n_iter = int(len(data_loader.dataset) / c.batch_size)
|
||||
end_time = time.time()
|
||||
c_logger.print_train_start()
|
||||
scaler = torch.cuda.amp.GradScaler() if c.mixed_precision else None
|
||||
for num_iter, data in enumerate(data_loader):
|
||||
start_time = time.time()
|
||||
|
||||
# format data
|
||||
text_input, text_lengths, mel_targets, mel_lengths, speaker_c,\
|
||||
avg_text_length, avg_spec_length, _, dur_target, _ = format_data(data)
|
||||
|
||||
loader_time = time.time() - end_time
|
||||
|
||||
global_step += 1
|
||||
optimizer.zero_grad()
|
||||
|
||||
# forward pass model
|
||||
with torch.cuda.amp.autocast(enabled=c.mixed_precision):
|
||||
decoder_output, dur_output, alignments = model.forward(
|
||||
text_input, text_lengths, mel_lengths, dur_target, g=speaker_c)
|
||||
|
||||
# compute loss
|
||||
loss_dict = criterion(decoder_output, mel_targets, mel_lengths, dur_output, torch.log(1 + dur_target), text_lengths)
|
||||
|
||||
# backward pass with loss scaling
|
||||
if c.mixed_precision:
|
||||
scaler.scale(loss_dict['loss']).backward()
|
||||
scaler.unscale_(optimizer)
|
||||
grad_norm = torch.nn.utils.clip_grad_norm_(model.parameters(),
|
||||
c.grad_clip)
|
||||
scaler.step(optimizer)
|
||||
scaler.update()
|
||||
else:
|
||||
loss_dict['loss'].backward()
|
||||
grad_norm = torch.nn.utils.clip_grad_norm_(model.parameters(),
|
||||
c.grad_clip)
|
||||
optimizer.step()
|
||||
|
||||
# setup lr
|
||||
if c.noam_schedule:
|
||||
scheduler.step()
|
||||
|
||||
# current_lr
|
||||
current_lr = optimizer.param_groups[0]['lr']
|
||||
|
||||
# compute alignment error (the lower the better )
|
||||
align_error = 1 - alignment_diagonal_score(alignments, binary=True)
|
||||
loss_dict['align_error'] = align_error
|
||||
|
||||
step_time = time.time() - start_time
|
||||
epoch_time += step_time
|
||||
|
||||
# aggregate losses from processes
|
||||
if num_gpus > 1:
|
||||
loss_dict['loss_l1'] = reduce_tensor(loss_dict['loss_l1'].data, num_gpus)
|
||||
loss_dict['loss_ssim'] = reduce_tensor(loss_dict['loss_ssim'].data, num_gpus)
|
||||
loss_dict['loss_dur'] = reduce_tensor(loss_dict['loss_dur'].data, num_gpus)
|
||||
loss_dict['loss'] = reduce_tensor(loss_dict['loss'] .data, num_gpus)
|
||||
|
||||
# detach loss values
|
||||
loss_dict_new = dict()
|
||||
for key, value in loss_dict.items():
|
||||
if isinstance(value, (int, float)):
|
||||
loss_dict_new[key] = value
|
||||
else:
|
||||
loss_dict_new[key] = value.item()
|
||||
loss_dict = loss_dict_new
|
||||
|
||||
# update avg stats
|
||||
update_train_values = dict()
|
||||
for key, value in loss_dict.items():
|
||||
update_train_values['avg_' + key] = value
|
||||
update_train_values['avg_loader_time'] = loader_time
|
||||
update_train_values['avg_step_time'] = step_time
|
||||
keep_avg.update_values(update_train_values)
|
||||
|
||||
# print training progress
|
||||
if global_step % c.print_step == 0:
|
||||
log_dict = {
|
||||
|
||||
"avg_spec_length": [avg_spec_length, 1], # value, precision
|
||||
"avg_text_length": [avg_text_length, 1],
|
||||
"step_time": [step_time, 4],
|
||||
"loader_time": [loader_time, 2],
|
||||
"current_lr": current_lr,
|
||||
}
|
||||
c_logger.print_train_step(batch_n_iter, num_iter, global_step,
|
||||
log_dict, loss_dict, keep_avg.avg_values)
|
||||
|
||||
if args.rank == 0:
|
||||
# Plot Training Iter Stats
|
||||
# reduce TB load
|
||||
if global_step % c.tb_plot_step == 0:
|
||||
iter_stats = {
|
||||
"lr": current_lr,
|
||||
"grad_norm": grad_norm,
|
||||
"step_time": step_time
|
||||
}
|
||||
iter_stats.update(loss_dict)
|
||||
tb_logger.tb_train_iter_stats(global_step, iter_stats)
|
||||
|
||||
if global_step % c.save_step == 0:
|
||||
if c.checkpoint:
|
||||
# save model
|
||||
save_checkpoint(model, optimizer, global_step, epoch, 1, OUT_PATH,
|
||||
model_loss=loss_dict['loss'])
|
||||
|
||||
# wait all kernels to be completed
|
||||
torch.cuda.synchronize()
|
||||
|
||||
# Diagnostic visualizations
|
||||
idx = np.random.randint(mel_targets.shape[0])
|
||||
pred_spec = decoder_output[idx].detach().data.cpu().numpy().T
|
||||
gt_spec = mel_targets[idx].data.cpu().numpy().T
|
||||
align_img = alignments[idx].data.cpu()
|
||||
|
||||
figures = {
|
||||
"prediction": plot_spectrogram(pred_spec, ap),
|
||||
"ground_truth": plot_spectrogram(gt_spec, ap),
|
||||
"alignment": plot_alignment(align_img),
|
||||
}
|
||||
|
||||
tb_logger.tb_train_figures(global_step, figures)
|
||||
|
||||
# Sample audio
|
||||
train_audio = ap.inv_melspectrogram(pred_spec.T)
|
||||
tb_logger.tb_train_audios(global_step,
|
||||
{'TrainAudio': train_audio},
|
||||
c.audio["sample_rate"])
|
||||
end_time = time.time()
|
||||
|
||||
# print epoch stats
|
||||
c_logger.print_train_epoch_end(global_step, epoch, epoch_time, keep_avg)
|
||||
|
||||
# Plot Epoch Stats
|
||||
if args.rank == 0:
|
||||
epoch_stats = {"epoch_time": epoch_time}
|
||||
epoch_stats.update(keep_avg.avg_values)
|
||||
tb_logger.tb_train_epoch_stats(global_step, epoch_stats)
|
||||
if c.tb_model_param_stats:
|
||||
tb_logger.tb_model_weights(model, global_step)
|
||||
return keep_avg.avg_values, global_step
|
||||
|
||||
|
||||
@torch.no_grad()
|
||||
def evaluate(data_loader, model, criterion, ap, global_step, epoch):
|
||||
model.eval()
|
||||
epoch_time = 0
|
||||
keep_avg = KeepAverage()
|
||||
c_logger.print_eval_start()
|
||||
if data_loader is not None:
|
||||
for num_iter, data in enumerate(data_loader):
|
||||
start_time = time.time()
|
||||
|
||||
# format data
|
||||
text_input, text_lengths, mel_targets, mel_lengths, speaker_c,\
|
||||
_, _, _, dur_target, _ = format_data(data)
|
||||
|
||||
# forward pass model
|
||||
with torch.cuda.amp.autocast(enabled=c.mixed_precision):
|
||||
decoder_output, dur_output, alignments = model.forward(
|
||||
text_input, text_lengths, mel_lengths, dur_target, g=speaker_c)
|
||||
|
||||
# compute loss
|
||||
loss_dict = criterion(decoder_output, mel_targets, mel_lengths, dur_output, torch.log(1 + dur_target), text_lengths)
|
||||
|
||||
# step time
|
||||
step_time = time.time() - start_time
|
||||
epoch_time += step_time
|
||||
|
||||
# compute alignment score
|
||||
align_error = 1 - alignment_diagonal_score(alignments, binary=True)
|
||||
loss_dict['align_error'] = align_error
|
||||
|
||||
# aggregate losses from processes
|
||||
if num_gpus > 1:
|
||||
loss_dict['loss_l1'] = reduce_tensor(loss_dict['loss_l1'].data, num_gpus)
|
||||
loss_dict['loss_ssim'] = reduce_tensor(loss_dict['loss_ssim'].data, num_gpus)
|
||||
loss_dict['loss_dur'] = reduce_tensor(loss_dict['loss_dur'].data, num_gpus)
|
||||
loss_dict['loss'] = reduce_tensor(loss_dict['loss'] .data, num_gpus)
|
||||
|
||||
# detach loss values
|
||||
loss_dict_new = dict()
|
||||
for key, value in loss_dict.items():
|
||||
if isinstance(value, (int, float)):
|
||||
loss_dict_new[key] = value
|
||||
else:
|
||||
loss_dict_new[key] = value.item()
|
||||
loss_dict = loss_dict_new
|
||||
|
||||
# update avg stats
|
||||
update_train_values = dict()
|
||||
for key, value in loss_dict.items():
|
||||
update_train_values['avg_' + key] = value
|
||||
keep_avg.update_values(update_train_values)
|
||||
|
||||
if c.print_eval:
|
||||
c_logger.print_eval_step(num_iter, loss_dict, keep_avg.avg_values)
|
||||
|
||||
if args.rank == 0:
|
||||
# Diagnostic visualizations
|
||||
idx = np.random.randint(mel_targets.shape[0])
|
||||
pred_spec = decoder_output[idx].detach().data.cpu().numpy().T
|
||||
gt_spec = mel_targets[idx].data.cpu().numpy().T
|
||||
align_img = alignments[idx].data.cpu()
|
||||
|
||||
eval_figures = {
|
||||
"prediction": plot_spectrogram(pred_spec, ap, output_fig=False),
|
||||
"ground_truth": plot_spectrogram(gt_spec, ap, output_fig=False),
|
||||
"alignment": plot_alignment(align_img, output_fig=False)
|
||||
}
|
||||
|
||||
# Sample audio
|
||||
eval_audio = ap.inv_melspectrogram(pred_spec.T)
|
||||
tb_logger.tb_eval_audios(global_step, {"ValAudio": eval_audio},
|
||||
c.audio["sample_rate"])
|
||||
|
||||
# Plot Validation Stats
|
||||
tb_logger.tb_eval_stats(global_step, keep_avg.avg_values)
|
||||
tb_logger.tb_eval_figures(global_step, eval_figures)
|
||||
|
||||
if args.rank == 0 and epoch >= c.test_delay_epochs:
|
||||
if c.test_sentences_file is None:
|
||||
test_sentences = [
|
||||
"It took me quite a long time to develop a voice, and now that I have it I'm not going to be silent.",
|
||||
"Be a voice, not an echo.",
|
||||
"I'm sorry Dave. I'm afraid I can't do that.",
|
||||
"This cake is great. It's so delicious and moist.",
|
||||
"Prior to November 22, 1963."
|
||||
]
|
||||
else:
|
||||
with open(c.test_sentences_file, "r") as f:
|
||||
test_sentences = [s.strip() for s in f.readlines()]
|
||||
|
||||
# test sentences
|
||||
test_audios = {}
|
||||
test_figures = {}
|
||||
print(" | > Synthesizing test sentences")
|
||||
if c.use_speaker_embedding:
|
||||
if c.use_external_speaker_embedding_file:
|
||||
speaker_embedding = speaker_mapping[list(speaker_mapping.keys())[randrange(len(speaker_mapping)-1)]]['embedding']
|
||||
speaker_id = None
|
||||
else:
|
||||
speaker_id = 0
|
||||
speaker_embedding = None
|
||||
else:
|
||||
speaker_id = None
|
||||
speaker_embedding = None
|
||||
|
||||
style_wav = c.get("style_wav_for_test")
|
||||
for idx, test_sentence in enumerate(test_sentences):
|
||||
try:
|
||||
wav, alignment, _, postnet_output, _, _ = synthesis(
|
||||
model,
|
||||
test_sentence,
|
||||
c,
|
||||
use_cuda,
|
||||
ap,
|
||||
speaker_id=speaker_id,
|
||||
speaker_embedding=speaker_embedding,
|
||||
style_wav=style_wav,
|
||||
truncated=False,
|
||||
enable_eos_bos_chars=c.enable_eos_bos_chars, #pylint: disable=unused-argument
|
||||
use_griffin_lim=True,
|
||||
do_trim_silence=False)
|
||||
|
||||
file_path = os.path.join(AUDIO_PATH, str(global_step))
|
||||
os.makedirs(file_path, exist_ok=True)
|
||||
file_path = os.path.join(file_path,
|
||||
"TestSentence_{}.wav".format(idx))
|
||||
ap.save_wav(wav, file_path)
|
||||
test_audios['{}-audio'.format(idx)] = wav
|
||||
test_figures['{}-prediction'.format(idx)] = plot_spectrogram(
|
||||
postnet_output, ap)
|
||||
test_figures['{}-alignment'.format(idx)] = plot_alignment(
|
||||
alignment)
|
||||
except: #pylint: disable=bare-except
|
||||
print(" !! Error creating Test Sentence -", idx)
|
||||
traceback.print_exc()
|
||||
tb_logger.tb_test_audios(global_step, test_audios,
|
||||
c.audio['sample_rate'])
|
||||
tb_logger.tb_test_figures(global_step, test_figures)
|
||||
return keep_avg.avg_values
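# --- Illustrative aside (not part of this script) ---------------------------
# The align_error logged above is 1 - alignment_diagonal_score(...). A rough,
# hedged sketch of what such a score measures, assuming alignments shaped
# [batch, decoder_steps, encoder_steps]: take the strongest attention weight
# per decoder step and average it, so a sharp monotonic alignment scores ~1.
def diagonal_score_sketch(alignments):
    return alignments.max(dim=-1)[0].mean().item()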
|
||||
|
||||
|
||||
# FIXME: move args definition/parsing inside of main?
|
||||
def main(args): # pylint: disable=redefined-outer-name
|
||||
# pylint: disable=global-variable-undefined
|
||||
global meta_data_train, meta_data_eval, symbols, phonemes, speaker_mapping
|
||||
# Audio processor
|
||||
ap = AudioProcessor(**c.audio)
|
||||
if 'characters' in c.keys():
|
||||
symbols, phonemes = make_symbols(**c.characters)
|
||||
|
||||
# DISTRIBUTED
|
||||
if num_gpus > 1:
|
||||
init_distributed(args.rank, num_gpus, args.group_id,
|
||||
c.distributed["backend"], c.distributed["url"])
|
||||
num_chars = len(phonemes) if c.use_phonemes else len(symbols)
|
||||
|
||||
# load data instances
|
||||
meta_data_train, meta_data_eval = load_meta_data(c.datasets, eval_split=True)
|
||||
|
||||
# set the portion of the data used for training if set in config.json
|
||||
if 'train_portion' in c.keys():
|
||||
meta_data_train = meta_data_train[:int(len(meta_data_train) * c.train_portion)]
|
||||
if 'eval_portion' in c.keys():
|
||||
meta_data_eval = meta_data_eval[:int(len(meta_data_eval) * c.eval_portion)]
|
||||
|
||||
# parse speakers
|
||||
num_speakers, speaker_embedding_dim, speaker_mapping = parse_speakers(c, args, meta_data_train, OUT_PATH)
|
||||
|
||||
# setup model
|
||||
model = setup_model(num_chars, num_speakers, c, speaker_embedding_dim=speaker_embedding_dim)
|
||||
optimizer = RAdam(model.parameters(), lr=c.lr, weight_decay=0, betas=(0.9, 0.98), eps=1e-9)
|
||||
criterion = SpeedySpeechLoss(c)
|
||||
|
||||
if args.restore_path:
|
||||
checkpoint = torch.load(args.restore_path, map_location='cpu')
|
||||
try:
|
||||
# TODO: fix optimizer init, model.cuda() needs to be called before
|
||||
# optimizer restore
|
||||
optimizer.load_state_dict(checkpoint['optimizer'])
|
||||
if c.reinit_layers:
|
||||
raise RuntimeError
|
||||
model.load_state_dict(checkpoint['model'])
|
||||
except: #pylint: disable=bare-except
|
||||
print(" > Partial model initialization.")
|
||||
model_dict = model.state_dict()
|
||||
model_dict = set_init_dict(model_dict, checkpoint['model'], c)
|
||||
model.load_state_dict(model_dict)
|
||||
del model_dict
|
||||
|
||||
for group in optimizer.param_groups:
|
||||
group['initial_lr'] = c.lr
|
||||
print(" > Model restored from step %d" % checkpoint['step'],
|
||||
flush=True)
|
||||
args.restore_step = checkpoint['step']
|
||||
else:
|
||||
args.restore_step = 0
|
||||
|
||||
if use_cuda:
|
||||
model.cuda()
|
||||
criterion.cuda()
|
||||
|
||||
# DISTRIBUTED
|
||||
if num_gpus > 1:
|
||||
model = DDP_th(model, device_ids=[args.rank])
|
||||
|
||||
if c.noam_schedule:
|
||||
scheduler = NoamLR(optimizer,
|
||||
warmup_steps=c.warmup_steps,
|
||||
last_epoch=args.restore_step - 1)
|
||||
else:
|
||||
scheduler = None
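# --- Illustrative aside (not part of this script) ---------------------------
# NoamLR above applies the standard Noam warmup: the learning rate grows
# linearly for `warmup_steps` updates and then decays with 1/sqrt(step).
# A minimal sketch of that schedule (formula only, not the repository's class):
def noam_lr_sketch(base_lr, step, warmup_steps=4000):
    step = max(step, 1)
    return base_lr * warmup_steps**0.5 * min(step * warmup_steps**-1.5, step**-0.5)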
|
||||
|
||||
num_params = count_parameters(model)
|
||||
print("\n > Model has {} parameters".format(num_params), flush=True)
|
||||
|
||||
if 'best_loss' not in locals():
|
||||
best_loss = float('inf')
|
||||
|
||||
# define dataloaders
|
||||
train_loader = setup_loader(ap, 1, is_val=False, verbose=True)
|
||||
eval_loader = setup_loader(ap, 1, is_val=True, verbose=True)
|
||||
|
||||
global_step = args.restore_step
|
||||
for epoch in range(0, c.epochs):
|
||||
c_logger.print_epoch_start(epoch, c.epochs)
|
||||
train_avg_loss_dict, global_step = train(train_loader, model, criterion, optimizer,
|
||||
scheduler, ap, global_step,
|
||||
epoch)
|
||||
eval_avg_loss_dict = evaluate(eval_loader , model, criterion, ap, global_step, epoch)
|
||||
c_logger.print_epoch_end(epoch, eval_avg_loss_dict)
|
||||
target_loss = train_avg_loss_dict['avg_loss']
|
||||
if c.run_eval:
|
||||
target_loss = eval_avg_loss_dict['avg_loss']
|
||||
best_loss = save_best_model(target_loss, best_loss, model, optimizer, global_step, epoch, c.r,
|
||||
OUT_PATH)
|
||||
|
||||
|
||||
if __name__ == '__main__':
|
||||
parser = argparse.ArgumentParser()
|
||||
parser.add_argument(
|
||||
'--continue_path',
|
||||
type=str,
|
||||
help='Training output folder to continue a previous training run. If set, "config_path" is ignored.',
|
||||
default='',
|
||||
required='--config_path' not in sys.argv)
|
||||
parser.add_argument(
|
||||
'--restore_path',
|
||||
type=str,
|
||||
help='Model file to be restored. Use to finetune a model.',
|
||||
default='')
|
||||
parser.add_argument(
|
||||
'--config_path',
|
||||
type=str,
|
||||
help='Path to config file for training.',
|
||||
required='--continue_path' not in sys.argv
|
||||
)
|
||||
parser.add_argument('--debug',
|
||||
type=bool,
|
||||
default=False,
|
||||
help='Do not verify commit integrity to run training.')
|
||||
|
||||
# DISTRIBUTED
|
||||
parser.add_argument(
|
||||
'--rank',
|
||||
type=int,
|
||||
default=0,
|
||||
help='DISTRIBUTED: process rank for distributed training.')
|
||||
parser.add_argument('--group_id',
|
||||
type=str,
|
||||
default="",
|
||||
help='DISTRIBUTED: process group id.')
|
||||
args = parser.parse_args()
|
||||
|
||||
if args.continue_path != '':
|
||||
args.output_path = args.continue_path
|
||||
args.config_path = os.path.join(args.continue_path, 'config.json')
|
||||
list_of_files = glob.glob(args.continue_path + "/*.pth.tar")  # collect all saved checkpoints in the run folder
|
||||
latest_model_file = max(list_of_files, key=os.path.getctime)
|
||||
args.restore_path = latest_model_file
|
||||
print(f" > Training continues for {args.restore_path}")
|
||||
|
||||
# setup output paths and read configs
|
||||
c = load_config(args.config_path)
|
||||
# check_config(c)
|
||||
check_config_tts(c)
|
||||
_ = os.path.dirname(os.path.realpath(__file__))
|
||||
|
||||
if c.mixed_precision:
|
||||
print(" > Mixed precision enabled.")
|
||||
|
||||
OUT_PATH = args.continue_path
|
||||
if args.continue_path == '':
|
||||
OUT_PATH = create_experiment_folder(c.output_path, c.run_name, args.debug)
|
||||
|
||||
AUDIO_PATH = os.path.join(OUT_PATH, 'test_audios')
|
||||
|
||||
c_logger = ConsoleLogger()
|
||||
|
||||
if args.rank == 0:
|
||||
os.makedirs(AUDIO_PATH, exist_ok=True)
|
||||
new_fields = {}
|
||||
if args.restore_path:
|
||||
new_fields["restore_path"] = args.restore_path
|
||||
new_fields["github_branch"] = get_git_branch()
|
||||
copy_model_files(c, args.config_path, OUT_PATH, new_fields)
|
||||
os.chmod(AUDIO_PATH, 0o775)
|
||||
os.chmod(OUT_PATH, 0o775)
|
||||
|
||||
LOG_DIR = OUT_PATH
|
||||
tb_logger = TensorboardLogger(LOG_DIR, model_name='TTS')
|
||||
|
||||
# write model desc to tensorboard
|
||||
tb_logger.tb_add_text('model-description', c['run_description'], 0)
|
||||
|
||||
try:
|
||||
main(args)
|
||||
except KeyboardInterrupt:
|
||||
remove_experiment_folder(OUT_PATH)
|
||||
try:
|
||||
sys.exit(0)
|
||||
except SystemExit:
|
||||
os._exit(0) # pylint: disable=protected-access
|
||||
except Exception: # pylint: disable=broad-except
|
||||
remove_experiment_folder(OUT_PATH)
|
||||
traceback.print_exc()
|
||||
sys.exit(1)
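# --- Illustrative aside (not part of this script) ---------------------------
# The criterion call above receives torch.log(1 + dur_target): training the
# duration predictor in the log domain keeps long tokens from dominating the
# loss. A minimal, hedged sketch of such a masked log-duration L1 term
# (names are ours, not the repository's):
import torch

def duration_loss_sketch(dur_pred_log, dur_target, text_lengths):
    # dur_pred_log: [B, T] predicted log-durations, dur_target: [B, T] frame counts
    mask = torch.arange(dur_target.shape[1], device=dur_target.device)[None, :] < text_lengths[:, None]
    target_log = torch.log(1 + dur_target.float())
    return torch.nn.functional.l1_loss(dur_pred_log[mask], target_log[mask])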
|
|
@ -18,7 +18,7 @@ from TTS.tts.layers.losses import TacotronLoss
|
|||
from TTS.tts.utils.generic_utils import check_config_tts, setup_model
|
||||
from TTS.tts.utils.io import save_best_model, save_checkpoint
|
||||
from TTS.tts.utils.measures import alignment_diagonal_score
|
||||
from TTS.tts.utils.speakers import load_speaker_mapping, parse_speakers
|
||||
from TTS.tts.utils.speakers import parse_speakers
|
||||
from TTS.tts.utils.synthesis import synthesis
|
||||
from TTS.tts.utils.text.symbols import make_symbols, phonemes, symbols
|
||||
from TTS.tts.utils.visual import plot_alignment, plot_spectrogram
|
||||
|
@ -29,7 +29,7 @@ from TTS.utils.distribute import (DistributedSampler, apply_gradient_allreduce,
|
|||
from TTS.utils.generic_utils import (KeepAverage, count_parameters,
|
||||
create_experiment_folder, get_git_branch,
|
||||
remove_experiment_folder, set_init_dict)
|
||||
from TTS.utils.io import copy_config_file, load_config
|
||||
from TTS.utils.io import copy_model_files, load_config
|
||||
from TTS.utils.radam import RAdam
|
||||
from TTS.utils.tensorboard_logger import TensorboardLogger
|
||||
from TTS.utils.training import (NoamLR, adam_weight_decay, check_update,
|
||||
|
@ -39,28 +39,35 @@ from TTS.utils.training import (NoamLR, adam_weight_decay, check_update,
|
|||
use_cuda, num_gpus = setup_torch_training_env(True, False)
|
||||
|
||||
|
||||
def setup_loader(ap, r, is_val=False, verbose=False, speaker_mapping=None):
|
||||
def setup_loader(ap, r, is_val=False, verbose=False, dataset=None):
|
||||
if is_val and not c.run_eval:
|
||||
loader = None
|
||||
else:
|
||||
dataset = MyDataset(
|
||||
r,
|
||||
c.text_cleaner,
|
||||
compute_linear_spec=c.model.lower() == 'tacotron',
|
||||
meta_data=meta_data_eval if is_val else meta_data_train,
|
||||
ap=ap,
|
||||
tp=c.characters if 'characters' in c.keys() else None,
|
||||
add_blank=c['add_blank'] if 'add_blank' in c.keys() else False,
|
||||
batch_group_size=0 if is_val else c.batch_group_size *
|
||||
c.batch_size,
|
||||
min_seq_len=c.min_seq_len,
|
||||
max_seq_len=c.max_seq_len,
|
||||
phoneme_cache_path=c.phoneme_cache_path,
|
||||
use_phonemes=c.use_phonemes,
|
||||
phoneme_language=c.phoneme_language,
|
||||
enable_eos_bos=c.enable_eos_bos_chars,
|
||||
verbose=verbose,
|
||||
speaker_mapping=speaker_mapping if c.use_speaker_embedding and c.use_external_speaker_embedding_file else None)
|
||||
if dataset is None:
|
||||
dataset = MyDataset(
|
||||
r,
|
||||
c.text_cleaner,
|
||||
compute_linear_spec=c.model.lower() == 'tacotron',
|
||||
meta_data=meta_data_eval if is_val else meta_data_train,
|
||||
ap=ap,
|
||||
tp=c.characters if 'characters' in c.keys() else None,
|
||||
add_blank=c['add_blank'] if 'add_blank' in c.keys() else False,
|
||||
batch_group_size=0 if is_val else c.batch_group_size *
|
||||
c.batch_size,
|
||||
min_seq_len=c.min_seq_len,
|
||||
max_seq_len=c.max_seq_len,
|
||||
phoneme_cache_path=c.phoneme_cache_path,
|
||||
use_phonemes=c.use_phonemes,
|
||||
phoneme_language=c.phoneme_language,
|
||||
enable_eos_bos=c.enable_eos_bos_chars,
|
||||
verbose=verbose,
|
||||
speaker_mapping=speaker_mapping if c.use_speaker_embedding and c.use_external_speaker_embedding_file else None)
|
||||
|
||||
if c.use_phonemes and c.compute_input_seq_cache:
|
||||
# precompute phonemes to have a better estimate of sequence lengths.
|
||||
dataset.compute_input_seq(c.num_loader_workers)
|
||||
dataset.sort_items()
|
||||
|
||||
sampler = DistributedSampler(dataset) if num_gpus > 1 else None
|
||||
loader = DataLoader(
|
||||
dataset,
|
||||
|
@ -74,10 +81,7 @@ def setup_loader(ap, r, is_val=False, verbose=False, speaker_mapping=None):
|
|||
pin_memory=False)
|
||||
return loader
|
||||
|
||||
def format_data(data, speaker_mapping=None):
|
||||
if speaker_mapping is None and c.use_speaker_embedding and not c.use_external_speaker_embedding_file:
|
||||
speaker_mapping = load_speaker_mapping(OUT_PATH)
|
||||
|
||||
def format_data(data):
|
||||
# setup input data
|
||||
text_input = data[0]
|
||||
text_lengths = data[1]
|
||||
|
@ -126,10 +130,8 @@ def format_data(data, speaker_mapping=None):
|
|||
return text_input, text_lengths, mel_input, mel_lengths, linear_input, stop_targets, speaker_ids, speaker_embeddings, max_text_length, max_spec_length
|
||||
|
||||
|
||||
def train(model, criterion, optimizer, optimizer_st, scheduler,
|
||||
ap, global_step, epoch, scaler, scaler_st, speaker_mapping=None):
|
||||
data_loader = setup_loader(ap, model.decoder.r, is_val=False,
|
||||
verbose=(epoch == 0), speaker_mapping=speaker_mapping)
|
||||
def train(data_loader, model, criterion, optimizer, optimizer_st, scheduler,
|
||||
ap, global_step, epoch, scaler, scaler_st):
|
||||
model.train()
|
||||
epoch_time = 0
|
||||
keep_avg = KeepAverage()
|
||||
|
@ -144,7 +146,7 @@ def train(model, criterion, optimizer, optimizer_st, scheduler,
|
|||
start_time = time.time()
|
||||
|
||||
# format data
|
||||
text_input, text_lengths, mel_input, mel_lengths, linear_input, stop_targets, speaker_ids, speaker_embeddings, max_text_length, max_spec_length = format_data(data, speaker_mapping)
|
||||
text_input, text_lengths, mel_input, mel_lengths, linear_input, stop_targets, speaker_ids, speaker_embeddings, max_text_length, max_spec_length = format_data(data)
|
||||
loader_time = time.time() - end_time
|
||||
|
||||
global_step += 1
|
||||
|
@ -327,8 +329,7 @@ def train(model, criterion, optimizer, optimizer_st, scheduler,
|
|||
|
||||
|
||||
@torch.no_grad()
|
||||
def evaluate(model, criterion, ap, global_step, epoch, speaker_mapping=None):
|
||||
data_loader = setup_loader(ap, model.decoder.r, is_val=True, speaker_mapping=speaker_mapping)
|
||||
def evaluate(data_loader, model, criterion, ap, global_step, epoch):
|
||||
model.eval()
|
||||
epoch_time = 0
|
||||
keep_avg = KeepAverage()
|
||||
|
@ -338,7 +339,7 @@ def evaluate(model, criterion, ap, global_step, epoch, speaker_mapping=None):
|
|||
start_time = time.time()
|
||||
|
||||
# format data
|
||||
text_input, text_lengths, mel_input, mel_lengths, linear_input, stop_targets, speaker_ids, speaker_embeddings, _, _ = format_data(data, speaker_mapping)
|
||||
text_input, text_lengths, mel_input, mel_lengths, linear_input, stop_targets, speaker_ids, speaker_embeddings, _, _ = format_data(data)
|
||||
assert mel_input.shape[1] % model.decoder.r == 0
|
||||
|
||||
# forward pass model
|
||||
|
@ -493,7 +494,7 @@ def evaluate(model, criterion, ap, global_step, epoch, speaker_mapping=None):
|
|||
# FIXME: move args definition/parsing inside of main?
|
||||
def main(args): # pylint: disable=redefined-outer-name
|
||||
# pylint: disable=global-variable-undefined
|
||||
global meta_data_train, meta_data_eval, symbols, phonemes
|
||||
global meta_data_train, meta_data_eval, symbols, phonemes, speaker_mapping
|
||||
# Audio processor
|
||||
ap = AudioProcessor(**c.audio)
|
||||
if 'characters' in c.keys():
|
||||
|
@ -586,6 +587,13 @@ def main(args): # pylint: disable=redefined-outer-name
|
|||
if 'best_loss' not in locals():
|
||||
best_loss = float('inf')
|
||||
|
||||
# define data loaders
|
||||
train_loader = setup_loader(ap,
|
||||
model.decoder.r,
|
||||
is_val=False,
|
||||
verbose=True)
|
||||
eval_loader = setup_loader(ap, model.decoder.r, is_val=True)
|
||||
|
||||
global_step = args.restore_step
|
||||
for epoch in range(0, c.epochs):
|
||||
c_logger.print_epoch_start(epoch, c.epochs)
|
||||
|
@ -596,17 +604,40 @@ def main(args): # pylint: disable=redefined-outer-name
|
|||
model.decoder.set_r(r)
|
||||
if c.bidirectional_decoder:
|
||||
model.decoder_backward.set_r(r)
|
||||
train_loader.dataset.outputs_per_step = r
|
||||
eval_loader.dataset.outputs_per_step = r
|
||||
train_loader = setup_loader(ap,
|
||||
model.decoder.r,
|
||||
is_val=False,
|
||||
dataset=train_loader.dataset)
|
||||
eval_loader = setup_loader(ap,
|
||||
model.decoder.r,
|
||||
is_val=True,
|
||||
dataset=eval_loader.dataset)
|
||||
print("\n > Number of output frames:", model.decoder.r)
|
||||
train_avg_loss_dict, global_step = train(model, criterion, optimizer,
|
||||
# train one epoch
|
||||
train_avg_loss_dict, global_step = train(train_loader, model,
|
||||
criterion, optimizer,
|
||||
optimizer_st, scheduler, ap,
|
||||
global_step, epoch, scaler, scaler_st, speaker_mapping)
|
||||
eval_avg_loss_dict = evaluate(model, criterion, ap, global_step, epoch, speaker_mapping)
|
||||
global_step, epoch, scaler,
|
||||
scaler_st)
|
||||
# eval one epoch
|
||||
eval_avg_loss_dict = evaluate(eval_loader, model, criterion, ap,
|
||||
global_step, epoch)
|
||||
c_logger.print_epoch_end(epoch, eval_avg_loss_dict)
|
||||
target_loss = train_avg_loss_dict['avg_postnet_loss']
|
||||
if c.run_eval:
|
||||
target_loss = eval_avg_loss_dict['avg_postnet_loss']
|
||||
best_loss = save_best_model(target_loss, best_loss, model, optimizer, global_step, epoch, c.r,
|
||||
OUT_PATH, scaler=scaler.state_dict() if c.mixed_precision else None)
|
||||
best_loss = save_best_model(
|
||||
target_loss,
|
||||
best_loss,
|
||||
model,
|
||||
optimizer,
|
||||
global_step,
|
||||
epoch,
|
||||
c.r,
|
||||
OUT_PATH,
|
||||
scaler=scaler.state_dict() if c.mixed_precision else None)
|
||||
|
||||
|
||||
if __name__ == '__main__':
|
||||
|
@ -675,8 +706,8 @@ if __name__ == '__main__':
|
|||
if args.restore_path:
|
||||
new_fields["restore_path"] = args.restore_path
|
||||
new_fields["github_branch"] = get_git_branch()
|
||||
copy_config_file(args.config_path,
|
||||
os.path.join(OUT_PATH, 'config.json'), new_fields)
|
||||
copy_model_files(c, args.config_path,
|
||||
OUT_PATH, new_fields)
|
||||
os.chmod(AUDIO_PATH, 0o775)
|
||||
os.chmod(OUT_PATH, 0o775)
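# --- Illustrative aside (not part of this script) ---------------------------
# copy_model_files replaces copy_config_file here so the run folder keeps the
# full config plus provenance fields (restore_path, github_branch). A
# hypothetical minimal version of that idea, not the actual TTS.utils.io code:
import json

def save_run_config_sketch(config_dict, out_path, new_fields):
    merged = dict(config_dict)
    merged.update(new_fields)  # e.g. {"restore_path": ..., "github_branch": ...}
    with open(f"{out_path}/config.json", "w", encoding="utf-8") as f:
        json.dump(merged, f, indent=4)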
|
||||
|
||||
|
|
|
@ -13,7 +13,7 @@ from TTS.utils.console_logger import ConsoleLogger
|
|||
from TTS.utils.generic_utils import (KeepAverage, count_parameters,
|
||||
create_experiment_folder, get_git_branch,
|
||||
remove_experiment_folder, set_init_dict)
|
||||
from TTS.utils.io import copy_config_file, load_config
|
||||
from TTS.utils.io import copy_model_files, load_config
|
||||
from TTS.utils.radam import RAdam
|
||||
from TTS.utils.tensorboard_logger import TensorboardLogger
|
||||
from TTS.utils.training import setup_torch_training_env
|
||||
|
@ -639,8 +639,8 @@ if __name__ == '__main__':
|
|||
if args.restore_path:
|
||||
new_fields["restore_path"] = args.restore_path
|
||||
new_fields["github_branch"] = get_git_branch()
|
||||
copy_config_file(args.config_path,
|
||||
os.path.join(OUT_PATH, 'config.json'), new_fields)
|
||||
copy_model_files(c, args.config_path,
|
||||
OUT_PATH, new_fields)
|
||||
os.chmod(AUDIO_PATH, 0o775)
|
||||
os.chmod(OUT_PATH, 0o775)
|
||||
|
||||
|
|
|
@ -18,7 +18,7 @@ from TTS.utils.distribute import init_distributed
|
|||
from TTS.utils.generic_utils import (KeepAverage, count_parameters,
|
||||
create_experiment_folder, get_git_branch,
|
||||
remove_experiment_folder, set_init_dict)
|
||||
from TTS.utils.io import copy_config_file, load_config
|
||||
from TTS.utils.io import copy_model_files, load_config
|
||||
from TTS.utils.tensorboard_logger import TensorboardLogger
|
||||
from TTS.utils.training import setup_torch_training_env
|
||||
from TTS.vocoder.datasets.preprocess import load_wav_data, load_wav_feat_data
|
||||
|
@ -486,8 +486,8 @@ if __name__ == '__main__':
|
|||
if args.restore_path:
|
||||
new_fields["restore_path"] = args.restore_path
|
||||
new_fields["github_branch"] = get_git_branch()
|
||||
copy_config_file(args.config_path,
|
||||
os.path.join(OUT_PATH, 'config.json'), new_fields)
|
||||
copy_model_files(c, args.config_path,
|
||||
OUT_PATH, new_fields)
|
||||
os.chmod(AUDIO_PATH, 0o775)
|
||||
os.chmod(OUT_PATH, 0o775)
|
||||
|
||||
|
|
|
@ -14,7 +14,7 @@ from torch.utils.data import DataLoader
|
|||
from TTS.tts.utils.visual import plot_spectrogram
|
||||
from TTS.utils.audio import AudioProcessor
|
||||
from TTS.utils.radam import RAdam
|
||||
from TTS.utils.io import copy_config_file, load_config
|
||||
from TTS.utils.io import copy_model_files, load_config
|
||||
from TTS.utils.training import setup_torch_training_env
|
||||
from TTS.utils.console_logger import ConsoleLogger
|
||||
from TTS.utils.tensorboard_logger import TensorboardLogger
|
||||
|
@ -513,8 +513,8 @@ if __name__ == "__main__":
|
|||
if args.restore_path:
|
||||
new_fields["restore_path"] = args.restore_path
|
||||
new_fields["github_branch"] = get_git_branch()
|
||||
copy_config_file(
|
||||
args.config_path, os.path.join(OUT_PATH, "config.json"), new_fields
|
||||
copy_model_files(
|
||||
c, args.config_path, OUT_PATH, new_fields
|
||||
)
|
||||
os.chmod(AUDIO_PATH, 0o775)
|
||||
os.chmod(OUT_PATH, 0o775)
|
||||
|
|
|
@ -8,7 +8,7 @@
|
|||
<meta name="description" content="">
|
||||
<meta name="author" content="">
|
||||
|
||||
<title>Mozilla - Text2Speech engine</title>
|
||||
<title>TTS engine</title>
|
||||
|
||||
<!-- Bootstrap core CSS -->
|
||||
<link href="https://stackpath.bootstrapcdn.com/bootstrap/4.1.1/css/bootstrap.min.css"
|
||||
|
@ -57,7 +57,6 @@
|
|||
<div class="row">
|
||||
<div class="col-lg-12 text-center">
|
||||
<img class="mt-5" src="https://user-images.githubusercontent.com/1402048/52643646-c2102980-2edd-11e9-8c37-b72f3c89a640.png" alt=></img>
|
||||
<h1 class="mt-5">Mozilla TTS</h1>
|
||||
<ul class="list-unstyled">
|
||||
</ul>
|
||||
<input id="text" placeholder="Type here..." size=45 type="text" name="text">
|
||||
|
|
|
@ -99,7 +99,7 @@
|
|||
"prenet_dropout": false, // enable/disable dropout at prenet.
|
||||
|
||||
// TACOTRON ATTENTION
|
||||
"attention_type": "original", // 'original' or 'graves'
|
||||
"attention_type": "original", // 'original' , 'graves', 'dynamic_convolution'
|
||||
"attention_heads": 4, // number of attention heads (only for 'graves')
|
||||
"attention_norm": "sigmoid", // softmax or sigmoid.
|
||||
"windowing": false, // Enables attention windowing. Used only in eval mode.
|
||||
|
@ -131,6 +131,8 @@
|
|||
"batch_group_size": 4, //Number of batches to shuffle after bucketing.
|
||||
"min_seq_len": 6, // DATASET-RELATED: minimum text length to use in training
|
||||
"max_seq_len": 153, // DATASET-RELATED: maximum text length
|
||||
"compute_input_seq_cache": false, // if true, text sequences are computed before starting training. If phonemes are enabled, they are also computed at this stage.
|
||||
"use_noise_augment": true,
|
||||
|
||||
// PATHS
|
||||
"output_path": "/home/erogol/Models/LJSpeech/",
|
||||
|
|
|
@ -105,6 +105,7 @@
|
|||
"min_seq_len": 3, // DATASET-RELATED: minimum text length to use in training
|
||||
"max_seq_len": 500, // DATASET-RELATED: maximum text length
|
||||
"compute_f0": false, // compute f0 values in data-loader
|
||||
"compute_input_seq_cache": false, // if true, text sequences are computed before starting training. If phonemes are enabled, they are also computed at this stage.
|
||||
|
||||
// PATHS
|
||||
"output_path": "/home/erogol/Models/LJSpeech/",
|
||||
|
|
|
@ -1,7 +1,7 @@
|
|||
{
|
||||
"model": "glow_tts",
|
||||
"run_name": "glow-tts-tdsep-conv",
|
||||
"run_description": "glow-tts model training with time-depth separable conv encoder.",
|
||||
"run_name": "glow-tts-residual_bn_conv",
|
||||
"run_description": "glow-tts model training with residual BN conv.",
|
||||
|
||||
// AUDIO PARAMETERS
|
||||
"audio":{
|
||||
|
@ -28,15 +28,15 @@
|
|||
"num_mels": 80, // size of the mel spec frame.
|
||||
"mel_fmin": 50.0, // minimum freq level for mel-spec. ~50 for male and ~95 for female voices. Tune for dataset!!
|
||||
"mel_fmax": 7600.0, // maximum freq level for mel-spec. Tune for dataset!!
|
||||
"spec_gain": 1.0, // scaler value appplied after log transform of spectrogram.
|
||||
"spec_gain": 1.0, // scaler value appplied after log transform of spectrogram.00
|
||||
|
||||
// Normalization parameters
|
||||
"signal_norm": true, // normalize spec values. Mean-Var normalization if 'stats_path' is defined otherwise range normalization defined by the other params.
|
||||
"signal_norm": false, // normalize spec values. Mean-Var normalization if 'stats_path' is defined otherwise range normalization defined by the other params.
|
||||
"min_level_db": -100, // lower bound for normalization
|
||||
"symmetric_norm": true, // move normalization to range [-1, 1]
|
||||
"max_norm": 1.0, // scale normalization to range [-max_norm, max_norm] or [0, max_norm]
|
||||
"clip_norm": true, // clip normalized values into the range.
|
||||
"stats_path": "/home/erogol/Data/LJSpeech-1.1/scale_stats.npy" // DO NOT USE WITH MULTI_SPEAKER MODEL. scaler stats file computed by 'compute_statistics.py'. If it is defined, mean-std based notmalization is used and other normalization params are ignored
|
||||
"stats_path": null // DO NOT USE WITH MULTI_SPEAKER MODEL. scaler stats file computed by 'compute_statistics.py'. If it is defined, mean-std based notmalization is used and other normalization params are ignored
|
||||
},
|
||||
|
||||
// VOCABULARY PARAMETERS
|
||||
|
@ -62,13 +62,28 @@
|
|||
"reinit_layers": [], // give a list of layer names to restore from the given checkpoint. If not defined, it reloads all heuristically matching layers.
|
||||
|
||||
// MODEL PARAMETERS
|
||||
"use_mas": false, // use Monotonic Alignment Search if true. Otherwise use pre-computed attention alignments.
|
||||
// "use_mas": false, // use Monotonic Alignment Search if true. Otherwise use pre-computed attention alignments.
|
||||
"hidden_channels_encoder": 192,
|
||||
"hidden_channels_decoder": 192,
|
||||
"hidden_channels_duration_predictor": 256,
|
||||
"use_encoder_prenet": true,
|
||||
"encoder_type": "rel_pos_transformer",
|
||||
"encoder_params": {
|
||||
"kernel_size":3,
|
||||
"dropout_p": 0.1,
|
||||
"num_layers": 6,
|
||||
"num_heads": 2,
|
||||
"hidden_channels_ffn": 768,
|
||||
"input_length": null
|
||||
},
|
||||
|
||||
// TRAINING
|
||||
"batch_size": 32, // Batch size for training. Lower values than 32 might cause hard to learn attention. It is overwritten by 'gradual_training'.
|
||||
"eval_batch_size":16,
|
||||
"r": 1, // Number of decoder frames to predict per iteration. Set the initial values if gradual training is enabled.
|
||||
"loss_masking": true, // enable / disable loss masking against the sequence padding.
|
||||
"mixed_precision": true,
|
||||
"data_dep_init_iter": 10,
|
||||
|
||||
// VALIDATION
|
||||
"run_eval": true,
|
||||
|
@ -84,8 +99,6 @@
|
|||
"warmup_steps": 4000, // Noam decay steps to increase the learning rate from 0 to "lr"
|
||||
"seq_len_norm": false, // Normalize eash sample loss with its length to alleviate imbalanced datasets. Use it if your dataset is small or has skewed distribution of sequence lengths.
|
||||
|
||||
"encoder_type": "time-depth-separable",
|
||||
|
||||
// TENSORBOARD and LOGGING
|
||||
"print_step": 25, // Number of steps to log training on console.
|
||||
"tb_plot_step": 100, // Number of steps to plot TB training figures.
|
||||
|
@ -93,7 +106,6 @@
|
|||
"save_step": 5000, // Number of training steps expected to save traninpg stats and checkpoints.
|
||||
"checkpoint": true, // If true, it saves checkpoints per "save_step"
|
||||
"tb_model_param_stats": false, // true, plots param stats per layer on tensorboard. Might be memory consuming, but good for debugging.
|
||||
"apex_amp_level": null,
|
||||
|
||||
// DATA LOADING
|
||||
"text_cleaner": "phoneme_cleaners",
|
||||
|
@ -104,6 +116,8 @@
|
|||
"min_seq_len": 3, // DATASET-RELATED: minimum text length to use in training
|
||||
"max_seq_len": 500, // DATASET-RELATED: maximum text length
|
||||
"compute_f0": false, // compute f0 values in data-loader
|
||||
"use_noise_augment": true, //add a random noise to audio signal for augmentation at training .
|
||||
"compute_input_seq_cache": true,
|
||||
|
||||
// PATHS
|
||||
"output_path": "/home/erogol/Models/LJSpeech/",
|
||||
|
@ -115,6 +129,7 @@
|
|||
|
||||
// MULTI-SPEAKER and GST
|
||||
"use_speaker_embedding": false, // use speaker embedding to enable multi-speaker learning.
|
||||
"use_external_speaker_embedding_file": false,
|
||||
"style_wav_for_test": null, // path to style wav file to be used in TacotronGST inference.
|
||||
"use_gst": false, // TACOTRON ONLY: use global style tokens
|
||||
|
|
@ -0,0 +1,171 @@
|
|||
{
|
||||
"model": "Tacotron2",
|
||||
"run_name": "ljspeech-dcattn",
|
||||
"run_description": "tacotron2 with dynamic convolution attention.",
|
||||
|
||||
// AUDIO PARAMETERS
|
||||
"audio":{
|
||||
// stft parameters
|
||||
"fft_size": 1024, // number of stft frequency levels. Size of the linear spectogram frame.
|
||||
"win_length": 1024, // stft window length in ms.
|
||||
"hop_length": 256, // stft window hop-lengh in ms.
|
||||
"frame_length_ms": null, // stft window length in ms.If null, 'win_length' is used.
|
||||
"frame_shift_ms": null, // stft window hop-lengh in ms. If null, 'hop_length' is used.
|
||||
|
||||
// Audio processing parameters
|
||||
"sample_rate": 22050, // DATASET-RELATED: wav sample-rate.
|
||||
"preemphasis": 0.0, // pre-emphasis to reduce spec noise and make it more structured. If 0.0, no -pre-emphasis.
|
||||
"ref_level_db": 20, // reference level db, theoretically 20db is the sound of air.
|
||||
|
||||
// Silence trimming
|
||||
"do_trim_silence": true,// enable trimming of slience of audio as you load it. LJspeech (true), TWEB (false), Nancy (true)
|
||||
"trim_db": 60, // threshold for timming silence. Set this according to your dataset.
|
||||
|
||||
// Griffin-Lim
|
||||
"power": 1.5, // value to sharpen wav signals after GL algorithm.
|
||||
"griffin_lim_iters": 60,// #griffin-lim iterations. 30-60 is a good range. Larger the value, slower the generation.
|
||||
|
||||
// MelSpectrogram parameters
|
||||
"num_mels": 80, // size of the mel spec frame.
|
||||
"mel_fmin": 50.0, // minimum freq level for mel-spec. ~50 for male and ~95 for female voices. Tune for dataset!!
|
||||
"mel_fmax": 7600.0, // maximum freq level for mel-spec. Tune for dataset!!
|
||||
"spec_gain": 1,
|
||||
|
||||
// Normalization parameters
|
||||
"signal_norm": true, // normalize spec values. Mean-Var normalization if 'stats_path' is defined otherwise range normalization defined by the other params.
|
||||
"min_level_db": -100, // lower bound for normalization
|
||||
"symmetric_norm": true, // move normalization to range [-1, 1]
|
||||
"max_norm": 4.0, // scale normalization to range [-max_norm, max_norm] or [0, max_norm]
|
||||
"clip_norm": true, // clip normalized values into the range.
|
||||
"stats_path": "/home/erogol/Data/LJSpeech-1.1/scale_stats.npy" // DO NOT USE WITH MULTI_SPEAKER MODEL. scaler stats file computed by 'compute_statistics.py'. If it is defined, mean-std based notmalization is used and other normalization params are ignored
|
||||
},
|
||||
|
||||
// VOCABULARY PARAMETERS
|
||||
// if custom character set is not defined,
|
||||
// default set in symbols.py is used
|
||||
// "characters":{
|
||||
// "pad": "_",
|
||||
// "eos": "~",
|
||||
// "bos": "^",
|
||||
// "characters": "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz!'(),-.:;? ",
|
||||
// "punctuations":"!'(),-.:;? ",
|
||||
// "phonemes":"iyɨʉɯuɪʏʊeøɘəɵɤoɛœɜɞʌɔæɐaɶɑɒᵻʘɓǀɗǃʄǂɠǁʛpbtdʈɖcɟkɡqɢʔɴŋɲɳnɱmʙrʀⱱɾɽɸβfvθðszʃʒʂʐçʝxɣχʁħʕhɦɬɮʋɹɻjɰlɭʎʟˈˌːˑʍwɥʜʢʡɕʑɺɧɚ˞ɫ"
|
||||
// },
|
||||
|
||||
// DISTRIBUTED TRAINING
|
||||
"distributed":{
|
||||
"backend": "nccl",
|
||||
"url": "tcp:\/\/localhost:54321"
|
||||
},
|
||||
|
||||
"reinit_layers": [], // give a list of layer names to restore from the given checkpoint. If not defined, it reloads all heuristically matching layers.
|
||||
|
||||
// TRAINING
|
||||
"batch_size": 32, // Batch size for training. Lower values than 32 might cause hard to learn attention. It is overwritten by 'gradual_training'.
|
||||
"eval_batch_size":16,
|
||||
"r": 7, // Number of decoder frames to predict per iteration. Set the initial values if gradual training is enabled.
|
||||
"gradual_training": [[0, 7, 64], [1, 5, 64], [50000, 3, 32], [130000, 2, 32], [290000, 1, 32]], //set gradual training steps [first_step, r, batch_size]. If it is null, gradual training is disabled. For Tacotron, you might need to reduce the 'batch_size' as you proceeed.
|
||||
"mixed_precision": true, // level of optimization with NVIDIA's apex feature for automatic mixed FP16/FP32 precision (AMP), NOTE: currently only O1 is supported, and use "O1" to activate.
|
||||
|
||||
// LOSS SETTINGS
|
||||
"loss_masking": true, // enable / disable loss masking against the sequence padding.
|
||||
"decoder_loss_alpha": 0.5, // original decoder loss weight. If > 0, it is enabled
|
||||
"postnet_loss_alpha": 0.25, // original postnet loss weight. If > 0, it is enabled
|
||||
"postnet_diff_spec_alpha": 0.25, // differential spectral loss weight. If > 0, it is enabled
|
||||
"decoder_diff_spec_alpha": 0.25, // differential spectral loss weight. If > 0, it is enabled
|
||||
"decoder_ssim_alpha": 0.5, // decoder ssim loss weight. If > 0, it is enabled
|
||||
"postnet_ssim_alpha": 0.25, // postnet ssim loss weight. If > 0, it is enabled
|
||||
"ga_alpha": 0.0, // weight for guided attention loss. If > 0, guided attention is enabled.
|
||||
"stopnet_pos_weight": 15.0, // pos class weight for stopnet loss since there are way more negative samples than positive samples.
|
||||
|
||||
|
||||
// VALIDATION
|
||||
"run_eval": true,
|
||||
"test_delay_epochs": 10, //Until attention is aligned, testing only wastes computation time.
|
||||
"test_sentences_file": null, // set a file to load sentences to be used for testing. If it is null then we use default english sentences.
|
||||
|
||||
// OPTIMIZER
|
||||
"noam_schedule": false, // use noam warmup and lr schedule.
|
||||
"grad_clip": 1.0, // upper limit for gradients for clipping.
|
||||
"epochs": 1000, // total number of epochs to train.
|
||||
"lr": 0.0001, // Initial learning rate. If Noam decay is active, maximum learning rate.
|
||||
"wd": 0.000001, // Weight decay weight.
|
||||
"warmup_steps": 4000, // Noam decay steps to increase the learning rate from 0 to "lr"
|
||||
"seq_len_norm": false, // Normalize eash sample loss with its length to alleviate imbalanced datasets. Use it if your dataset is small or has skewed distribution of sequence lengths.
|
||||
|
||||
// TACOTRON PRENET
|
||||
"memory_size": -1, // ONLY TACOTRON - size of the memory queue used fro storing last decoder predictions for auto-regression. If < 0, memory queue is disabled and decoder only uses the last prediction frame.
|
||||
"prenet_type": "original", // "original" or "bn".
|
||||
"prenet_dropout": false, // enable/disable dropout at prenet.
|
||||
|
||||
// TACOTRON ATTENTION
|
||||
"attention_type": "dynamic_convolution", // 'original' , 'graves', 'dynamic_convolution'
|
||||
"attention_heads": 4, // number of attention heads (only for 'graves')
|
||||
"attention_norm": "softmax", // softmax or sigmoid.
|
||||
"windowing": false, // Enables attention windowing. Used only in eval mode.
|
||||
"use_forward_attn": false, // if it uses forward attention. In general, it aligns faster.
|
||||
"forward_attn_mask": false, // Additional masking forcing monotonicity only in eval mode.
|
||||
"transition_agent": false, // enable/disable transition agent of forward attention.
|
||||
"location_attn": true, // enable_disable location sensitive attention. It is enabled for TACOTRON by default.
|
||||
"bidirectional_decoder": false, // use https://arxiv.org/abs/1907.09006. Use it, if attention does not work well with your dataset.
|
||||
"double_decoder_consistency": false, // use DDC explained here https://erogol.com/solving-attention-problems-of-tts-models-with-double-decoder-consistency-draft/
|
||||
"ddc_r": 7, // reduction rate for coarse decoder.
|
||||
|
||||
// STOPNET
|
||||
"stopnet": true, // Train stopnet predicting the end of synthesis.
|
||||
"separate_stopnet": true, // Train stopnet seperately if 'stopnet==true'. It prevents stopnet loss to influence the rest of the model. It causes a better model, but it trains SLOWER.
|
||||
|
||||
// TENSORBOARD and LOGGING
|
||||
"print_step": 25, // Number of steps to log training on console.
|
||||
"tb_plot_step": 100, // Number of steps to plot TB training figures.
|
||||
"print_eval": false, // If True, it prints intermediate loss values in evalulation.
|
||||
"save_step": 10000, // Number of training steps expected to save traninpg stats and checkpoints.
|
||||
"checkpoint": true, // If true, it saves checkpoints per "save_step"
|
||||
"tb_model_param_stats": false, // true, plots param stats per layer on tensorboard. Might be memory consuming, but good for debugging.
|
||||
|
||||
// DATA LOADING
|
||||
"text_cleaner": "phoneme_cleaners",
|
||||
"enable_eos_bos_chars": false, // enable/disable beginning of sentence and end of sentence chars.
|
||||
"num_loader_workers": 4, // number of training data loader processes. Don't set it too big. 4-8 are good values.
|
||||
"num_val_loader_workers": 4, // number of evaluation data loader processes.
|
||||
"batch_group_size": 4, //Number of batches to shuffle after bucketing.
|
||||
"min_seq_len": 6, // DATASET-RELATED: minimum text length to use in training
|
||||
"max_seq_len": 153, // DATASET-RELATED: maximum text length
|
||||
"compute_input_seq_cache": false, // if true, text sequences are computed before starting training. If phonemes are enabled, they are also computed at this stage.
|
||||
|
||||
// PATHS
|
||||
"output_path": "/home/erogol/Models/LJSpeech/",
|
||||
|
||||
// PHONEMES
|
||||
"phoneme_cache_path": "/home/erogol/Models/phoneme_cache/", // phoneme computation is slow, therefore, it caches results in the given folder.
|
||||
"use_phonemes": true, // use phonemes instead of raw characters. It is suggested for better pronounciation.
|
||||
"phoneme_language": "en-us", // depending on your target language, pick one from https://github.com/bootphon/phonemizer#languages
|
||||
|
||||
// MULTI-SPEAKER and GST
|
||||
"use_speaker_embedding": false, // use speaker embedding to enable multi-speaker learning.
|
||||
"use_gst": false, // use global style tokens
|
||||
"use_external_speaker_embedding_file": false, // if true, forces the model to use external embedding per sample instead of nn.embeddings, that is, it supports external embeddings such as those used at: https://arxiv.org/abs /1806.04558
|
||||
"external_speaker_embedding_file": "../../speakers-vctk-en.json", // if not null and use_external_speaker_embedding_file is true, it is used to load a specific embedding file and thus uses these embeddings instead of nn.embeddings, that is, it supports external embeddings such as those used at: https://arxiv.org/abs /1806.04558
|
||||
"gst": { // gst parameter if gst is enabled
|
||||
"gst_style_input": null, // Condition the style input either on a
|
||||
// -> wave file [path to wave] or
|
||||
// -> dictionary using the style tokens {'token1': 'value', 'token2': 'value'} example {"0": 0.15, "1": 0.15, "5": -0.15}
|
||||
// with the dictionary being len(dict) <= len(gst_style_tokens).
|
||||
"gst_embedding_dim": 512,
|
||||
"gst_num_heads": 4,
|
||||
"gst_style_tokens": 10,
|
||||
"gst_use_speaker_embedding": false
|
||||
},
|
||||
|
||||
// DATASETS
|
||||
"datasets": // List of datasets. They all merged and they get different speaker_ids.
|
||||
[
|
||||
{
|
||||
"name": "ljspeech",
|
||||
"path": "/home/erogol/Data/LJSpeech-1.1/",
|
||||
"meta_file_train": "metadata.csv", // for vtck if list, ignore speakers id in list for train, its useful for test cloning with new speakers
|
||||
"meta_file_val": null
|
||||
}
|
||||
]
|
||||
}
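The "gradual_training" entry in the config above encodes a schedule of [first_step, r, batch_size] triplets. A hedged sketch of how such a schedule is typically resolved at a given global step (function name is ours, not the repository's):

def gradual_values(global_step, schedule):
    # schedule: [[first_step, r, batch_size], ...], sorted by first_step
    r, batch_size = schedule[0][1], schedule[0][2]
    for first_step, new_r, new_bs in schedule:
        if global_step >= first_step:
            r, batch_size = new_r, new_bs
    return r, batch_size

# gradual_values(60000, [[0, 7, 64], [1, 5, 64], [50000, 3, 32]]) -> (3, 32)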
|
||||
|
|
@ -0,0 +1,151 @@
|
|||
{
|
||||
"model": "speedy_speech",
|
||||
"run_name": "speedy-speech-ljspeech",
|
||||
"run_description": "speedy-speech model for LJSpeech dataset.",
|
||||
|
||||
// AUDIO PARAMETERS
|
||||
"audio":{
|
||||
// stft parameters
|
||||
"fft_size": 1024, // number of stft frequency levels. Size of the linear spectogram frame.
|
||||
"win_length": 1024, // stft window length in ms.
|
||||
"hop_length": 256, // stft window hop-lengh in ms.
|
||||
"frame_length_ms": null, // stft window length in ms.If null, 'win_length' is used.
|
||||
"frame_shift_ms": null, // stft window hop-lengh in ms. If null, 'hop_length' is used.
|
||||
|
||||
// Audio processing parameters
|
||||
"sample_rate": 22050, // DATASET-RELATED: wav sample-rate.
|
||||
"preemphasis": 0.0, // pre-emphasis to reduce spec noise and make it more structured. If 0.0, no -pre-emphasis.
|
||||
"ref_level_db": 20, // reference level db, theoretically 20db is the sound of air.
|
||||
|
||||
// Silence trimming
|
||||
"do_trim_silence": true,// enable trimming of slience of audio as you load it. LJspeech (true), TWEB (false), Nancy (true)
|
||||
"trim_db": 60, // threshold for timming silence. Set this according to your dataset.
|
||||
|
||||
// Griffin-Lim
|
||||
"power": 1.5, // value to sharpen wav signals after GL algorithm.
|
||||
"griffin_lim_iters": 60,// #griffin-lim iterations. 30-60 is a good range. Larger the value, slower the generation.
|
||||
|
||||
// MelSpectrogram parameters
|
||||
"num_mels": 80, // size of the mel spec frame.
|
||||
"mel_fmin": 50.0, // minimum freq level for mel-spec. ~50 for male and ~95 for female voices. Tune for dataset!!
|
||||
"mel_fmax": 7600.0, // maximum freq level for mel-spec. Tune for dataset!!
|
||||
"spec_gain": 1,
|
||||
|
||||
// Normalization parameters
|
||||
"signal_norm": true, // normalize spec values. Mean-Var normalization if 'stats_path' is defined otherwise range normalization defined by the other params.
|
||||
"min_level_db": -100, // lower bound for normalization
|
||||
"symmetric_norm": true, // move normalization to range [-1, 1]
|
||||
"max_norm": 4.0, // scale normalization to range [-max_norm, max_norm] or [0, max_norm]
|
||||
"clip_norm": true, // clip normalized values into the range.
|
||||
"stats_path": "/home/erogol/Data/LJSpeech-1.1/scale_stats.npy" // DO NOT USE WITH MULTI_SPEAKER MODEL. scaler stats file computed by 'compute_statistics.py'. If it is defined, mean-std based notmalization is used and other normalization params are ignored
|
||||
},
|
||||
|
||||
// VOCABULARY PARAMETERS
|
||||
// if custom character set is not defined,
|
||||
// default set in symbols.py is used
|
||||
// "characters":{
|
||||
// "pad": "_",
|
||||
// "eos": "&",
|
||||
// "bos": "*",
|
||||
// "characters": "ABCDEFGHIJKLMNOPQRSTUVWXYZÇÃÀÁÂÊÉÍÓÔÕÚÛabcdefghijklmnopqrstuvwxyzçãàáâêéíóôõúû!(),-.:;? ",
|
||||
// "punctuations":"!'(),-.:;? ",
|
||||
// "phonemes":"iyɨʉɯuɪʏʊeøɘəɵɤoɛœɜɞʌɔæɐaɶɑɒᵻʘɓǀɗǃʄǂɠǁʛpbtdʈɖcɟkɡqɢʔɴŋɲɳnɱmʙrʀⱱɾɽɸβfvθðszʃʒʂʐçʝxɣχʁħʕhɦɬɮʋɹɻjɰlɭʎʟˈˌːˑʍwɥʜʢʡɕʑɺɧɚ˞ɫ'̃' "
|
||||
// },
|
||||
|
||||
"add_blank": false, // if true add a new token after each token of the sentence. This increases the size of the input sequence, but has considerably improved the prosody of the GlowTTS model.
|
||||
|
||||
// DISTRIBUTED TRAINING
|
||||
"distributed":{
|
||||
"backend": "nccl",
|
||||
"url": "tcp:\/\/localhost:54321"
|
||||
},
|
||||
|
||||
"reinit_layers": [], // give a list of layer names to restore from the given checkpoint. If not defined, it reloads all heuristically matching layers.
|
||||
|
||||
// MODEL PARAMETERS
|
||||
"positional_encoding": true,
|
||||
"hidden_channels": 128, // defined globally all the hidden channels of the model - 128 default
|
||||
"encoder_type": "residual_conv_bn",
|
||||
"encoder_params":{
|
||||
"kernel_size": 4,
|
||||
"dilations": [1, 2, 4, 1, 2, 4, 1, 2, 4, 1, 2, 4, 1],
|
||||
"num_conv_blocks": 2,
|
||||
"num_res_blocks": 13
|
||||
},
|
||||
"decoder_type": "residual_conv_bn",
|
||||
"decoder_params":{
|
||||
"kernel_size": 4,
|
||||
"dilations": [1, 2, 4, 8, 1, 2, 4, 8, 1, 2, 4, 8, 1, 2, 4, 8, 1],
|
||||
"num_conv_blocks": 2,
|
||||
"num_res_blocks": 17
|
||||
},
|
||||
|
||||
// TRAINING
|
||||
"batch_size":64, // Batch size for training. Lower values than 32 might cause hard to learn attention. It is overwritten by 'gradual_training'.
|
||||
"eval_batch_size":32,
|
||||
"r": 1, // Number of decoder frames to predict per iteration. Set the initial values if gradual training is enabled.
|
||||
"loss_masking": true, // enable / disable loss masking against the sequence padding.
|
||||
|
||||
// LOSS PARAMETERS
|
||||
"ssim_alpha": 1,
|
||||
"l1_alpha": 1,
|
||||
"huber_alpha": 1,
|
||||
|
||||
// VALIDATION
|
||||
"run_eval": true,
|
||||
"test_delay_epochs": -1, //Until attention is aligned, testing only wastes computation time.
|
||||
"test_sentences_file": null, // set a file to load sentences to be used for testing. If it is null then we use default english sentences.
|
||||
|
||||
// OPTIMIZER
|
||||
"noam_schedule": true, // use noam warmup and lr schedule.
|
||||
"grad_clip": 1.0, // upper limit for gradients for clipping.
|
||||
"epochs": 10000, // total number of epochs to train.
|
||||
"lr": 0.002, // Initial learning rate. If Noam decay is active, maximum learning rate.
|
||||
"warmup_steps": 4000, // Noam decay steps to increase the learning rate from 0 to "lr"
|
||||
|
||||
// TENSORBOARD and LOGGING
|
||||
"print_step": 25, // Number of steps to log training on console.
|
||||
"tb_plot_step": 100, // Number of steps to plot TB training figures.
|
||||
"print_eval": false, // If True, it prints intermediate loss values in evalulation.
|
||||
"save_step": 5000, // Number of training steps expected to save traninpg stats and checkpoints.
|
||||
"checkpoint": true, // If true, it saves checkpoints per "save_step"
|
||||
"tb_model_param_stats": false, // true, plots param stats per layer on tensorboard. Might be memory consuming, but good for debugging.:set n
|
||||
"mixed_precision": false,
|
||||
|
||||
// DATA LOADING
|
||||
"text_cleaner": "english_cleaners",
|
||||
"enable_eos_bos_chars": false, // enable/disable beginning of sentence and end of sentence chars.
|
||||
"num_loader_workers": 8, // number of training data loader processes. Don't set it too big. 4-8 are good values.
|
||||
"num_val_loader_workers": 8, // number of evaluation data loader processes.
|
||||
"batch_group_size": 4, //Number of batches to shuffle after bucketing.
|
||||
"min_seq_len": 2, // DATASET-RELATED: minimum text length to use in training
|
||||
"max_seq_len": 300, // DATASET-RELATED: maximum text length
|
||||
"compute_f0": false, // compute f0 values in data-loader
|
||||
"compute_input_seq_cache": false, // if true, text sequences are computed before starting training. If phonemes are enabled, they are also computed at this stage.
|
||||
|
||||
// PATHS
|
||||
"output_path": "/home/erogol/Models/ljspeech/",
|
||||
|
||||
// PHONEMES
|
||||
"phoneme_cache_path": "/home/erogol/Models/ljspeech_phonemes/", // phoneme computation is slow, therefore, it caches results in the given folder.
|
||||
"use_phonemes": true, // use phonemes instead of raw characters. It is suggested for better pronoun[ciation.
|
||||
"phoneme_language": "en-us", // depending on your target language, pick one from https://github.com/bootphon/phonemizer#languages
|
||||
|
||||
// MULTI-SPEAKER and GST
|
||||
"use_speaker_embedding": false, // use speaker embedding to enable multi-speaker learning.
|
||||
"use_external_speaker_embedding_file": false, // if true, forces the model to use external embedding per sample instead of nn.embeddings, that is, it supports external embeddings such as those used at: https://arxiv.org/abs /1806.04558
|
||||
"external_speaker_embedding_file": "/home/erogol/Data/libritts/speakers.json", // if not null and use_external_speaker_embedding_file is true, it is used to load a specific embedding file and thus uses these embeddings instead of nn.embeddings, that is, it supports external embeddings such as those used at: https://arxiv.org/abs /1806.04558
|
||||
|
||||
|
||||
// DATASETS
|
||||
"datasets": // List of datasets. They all merged and they get different s$
|
||||
[
|
||||
{
|
||||
"name": "ljspeech",
|
||||
"path": "/home/erogol/Data/LJSpeech-1.1/",
|
||||
"meta_file_train": "metadata.csv",
|
||||
"meta_file_val": null,
|
||||
"meta_file_attn_mask": "/home/erogol/Data/LJSpeech-1.1/metadata_attn_mask.txt" // created by bin/compute_attention_masks.py
|
||||
}
|
||||
]
|
||||
}
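The "add_blank" option in the config above intersperses a dedicated blank token between symbols, a GlowTTS-style trick that tends to improve prosody. A minimal, hedged sketch of that interspersion (names are our own, not the repository's):

def intersperse_blank(sequence, blank_id):
    result = [blank_id] * (2 * len(sequence) + 1)
    result[1::2] = sequence
    return result

# intersperse_blank([5, 9, 2], blank_id=0) -> [0, 5, 0, 9, 0, 2, 0]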
|
|
@ -1,12 +1,16 @@
|
|||
import os
|
||||
import numpy as np
|
||||
import collections
|
||||
import torch
|
||||
import os
|
||||
import random
|
||||
from torch.utils.data import Dataset
|
||||
from multiprocessing import Manager, Pool
|
||||
|
||||
from TTS.tts.utils.text import text_to_sequence, phoneme_to_sequence, pad_with_eos_bos
|
||||
from TTS.tts.utils.data import prepare_data, prepare_tensor, prepare_stop_target
|
||||
import numpy as np
|
||||
import torch
|
||||
import tqdm
|
||||
from torch.utils.data import Dataset
|
||||
from TTS.tts.utils.data import (prepare_data, prepare_stop_target,
|
||||
prepare_tensor)
|
||||
from TTS.tts.utils.text import (pad_with_eos_bos, phoneme_to_sequence,
|
||||
text_to_sequence)
|
||||
|
||||
|
||||
class MyDataset(Dataset):
|
||||
|
@ -26,6 +30,7 @@ class MyDataset(Dataset):
|
|||
phoneme_language="en-us",
|
||||
enable_eos_bos=False,
|
||||
speaker_mapping=None,
|
||||
use_noise_augment=False,
|
||||
verbose=False):
|
||||
"""
|
||||
Args:
|
||||
|
@ -44,6 +49,7 @@ class MyDataset(Dataset):
|
|||
phoneme_language (str): one the languages from
|
||||
https://github.com/bootphon/phonemizer#languages
|
||||
enable_eos_bos (bool): enable end of sentence and beginning of sentences characters.
|
||||
use_noise_augment (bool): enable adding random noise to wav for augmentation.
|
||||
verbose (bool): print diagnostic information.
|
||||
"""
|
||||
self.batch_group_size = batch_group_size
|
||||
|
@ -62,7 +68,9 @@ class MyDataset(Dataset):
|
|||
self.phoneme_language = phoneme_language
|
||||
self.enable_eos_bos = enable_eos_bos
|
||||
self.speaker_mapping = speaker_mapping
|
||||
self.use_noise_augment = use_noise_augment
|
||||
self.verbose = verbose
|
||||
self.input_seq_computed = False
|
||||
if use_phonemes and not os.path.isdir(phoneme_cache_path):
|
||||
os.makedirs(phoneme_cache_path, exist_ok=True)
|
||||
if self.verbose:
|
||||
|
@ -71,7 +79,6 @@ class MyDataset(Dataset):
|
|||
if use_phonemes:
|
||||
print(" | > phoneme language: {}".format(phoneme_language))
|
||||
print(" | > Number of instances : {}".format(len(self.items)))
|
||||
self.sort_items()
|
||||
|
||||
def load_wav(self, filename):
|
||||
audio = self.ap.load_wav(filename)
|
||||
|
@ -82,35 +89,40 @@ class MyDataset(Dataset):
|
|||
data = np.load(filename).astype('float32')
|
||||
return data
|
||||
|
||||
def _generate_and_cache_phoneme_sequence(self, text, cache_path):
|
||||
@staticmethod
|
||||
def _generate_and_cache_phoneme_sequence(text, cache_path, cleaners, language, tp, add_blank):
|
||||
"""generate a phoneme sequence from text.
|
||||
since the usage is for subsequent caching, we never add bos and
|
||||
eos chars here. Instead we add those dynamically later; based on the
|
||||
config option."""
|
||||
phonemes = phoneme_to_sequence(text, [self.cleaners],
|
||||
language=self.phoneme_language,
|
||||
phonemes = phoneme_to_sequence(text, [cleaners],
|
||||
language=language,
|
||||
enable_eos_bos=False,
|
||||
tp=self.tp, add_blank=self.add_blank)
|
||||
tp=tp, add_blank=add_blank)
|
||||
phonemes = np.asarray(phonemes, dtype=np.int32)
|
||||
np.save(cache_path, phonemes)
|
||||
return phonemes
|
||||
|
||||
def _load_or_generate_phoneme_sequence(self, wav_file, text):
|
||||
@staticmethod
|
||||
def _load_or_generate_phoneme_sequence(wav_file, text, phoneme_cache_path, enable_eos_bos, cleaners, language, tp, add_blank):
|
||||
file_name = os.path.splitext(os.path.basename(wav_file))[0]
|
||||
cache_path = os.path.join(self.phoneme_cache_path,
|
||||
file_name + '_phoneme.npy')
|
||||
|
||||
# different names for normal phonemes and with blank chars.
|
||||
file_name_ext = '_blanked_phoneme.npy' if add_blank else '_phoneme.npy'
|
||||
cache_path = os.path.join(phoneme_cache_path,
|
||||
file_name + file_name_ext)
|
||||
try:
|
||||
phonemes = np.load(cache_path)
|
||||
except FileNotFoundError:
|
||||
phonemes = self._generate_and_cache_phoneme_sequence(
|
||||
text, cache_path)
|
||||
phonemes = MyDataset._generate_and_cache_phoneme_sequence(
|
||||
text, cache_path, cleaners, language, tp, add_blank)
|
||||
except (ValueError, IOError):
|
||||
print(" > ERROR: failed loading phonemes for {}. "
|
||||
print(" [!] failed loading phonemes for {}. "
|
||||
"Recomputing.".format(wav_file))
|
||||
phonemes = self._generate_and_cache_phoneme_sequence(
|
||||
text, cache_path)
|
||||
if self.enable_eos_bos:
|
||||
phonemes = pad_with_eos_bos(phonemes, tp=self.tp)
|
||||
phonemes = MyDataset._generate_and_cache_phoneme_sequence(
|
||||
text, cache_path, cleaners, language, tp, add_blank)
|
||||
if enable_eos_bos:
|
||||
phonemes = pad_with_eos_bos(phonemes, tp=tp)
|
||||
phonemes = np.asarray(phonemes, dtype=np.int32)
|
||||
return phonemes
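# --- Illustrative aside (not part of this module) ----------------------------
# The method above implements a simple "load from cache or recompute and save"
# pattern keyed on the wav file name ('_blanked_phoneme.npy' when add_blank is
# set, '_phoneme.npy' otherwise). A generic, hedged sketch of the same pattern:
import os
import numpy as np

def cached_array_sketch(key_path, cache_dir, suffix, compute_fn):
    name = os.path.splitext(os.path.basename(key_path))[0]
    cache_path = os.path.join(cache_dir, name + suffix)
    try:
        return np.load(cache_path)
    except (FileNotFoundError, ValueError, IOError):
        data = np.asarray(compute_fn(), dtype=np.int32)
        np.save(cache_path, data)
        return data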
|
||||
|
||||
|
@ -125,11 +137,17 @@ class MyDataset(Dataset):
|
|||
|
||||
wav = np.asarray(self.load_wav(wav_file), dtype=np.float32)
|
||||
|
||||
if self.use_phonemes:
|
||||
text = self._load_or_generate_phoneme_sequence(wav_file, text)
|
||||
else:
|
||||
text = np.asarray(text_to_sequence(text, [self.cleaners],
|
||||
tp=self.tp, add_blank=self.add_blank),
|
||||
# apply noise for augmentation
|
||||
if self.use_noise_augment:
|
||||
wav = wav + (1.0 / 32768.0) * np.random.rand(*wav.shape)
|
||||
|
||||
if not self.input_seq_computed:
|
||||
if self.use_phonemes:
|
||||
text = self._load_or_generate_phoneme_sequence(wav_file, text, self.phoneme_cache_path, self.enable_eos_bos, self.cleaners, self.phoneme_language, self.tp, self.add_blank)
|
||||
|
||||
else:
|
||||
text = np.asarray(text_to_sequence(text, [self.cleaners],
|
||||
tp=self.tp, add_blank=self.add_blank),
|
||||
dtype=np.int32)
|
||||
|
||||
assert text.size > 0, self.items[idx][1]
|
||||
|
@ -138,6 +156,12 @@ class MyDataset(Dataset):
|
|||
if "attn_file" in locals():
|
||||
attn = np.load(attn_file)
|
||||
|
||||
if len(text) > self.max_seq_len:
|
||||
# return a different sample if the phonemized
|
||||
# text is longer than the threshold
|
||||
# TODO: find a better fix
|
||||
return self.load_data(100)
|
||||
|
||||
sample = {
|
||||
'text': text,
|
||||
'wav': wav,
|
||||
|
@ -148,6 +172,41 @@ class MyDataset(Dataset):
|
|||
}
|
||||
return sample
|
||||
|
||||
@staticmethod
|
||||
def _phoneme_worker(args):
|
||||
item = args[0]
|
||||
func_args = args[1]
|
||||
text, wav_file, *_ = item
|
||||
phonemes = MyDataset._load_or_generate_phoneme_sequence(wav_file, text, *func_args)
|
||||
return phonemes
|
||||
|
||||
def compute_input_seq(self, num_workers=0):
|
||||
"""compute input sequences separately. Call it before
|
||||
passing dataset to data loader."""
|
||||
if not self.use_phonemes:
|
||||
if self.verbose:
|
||||
print(" | > Computing input sequences ...")
|
||||
for idx, item in enumerate(tqdm.tqdm(self.items)):
|
||||
text, *_ = item
|
||||
sequence = np.asarray(text_to_sequence(text, [self.cleaners],
|
||||
tp=self.tp, add_blank=self.add_blank),
|
||||
dtype=np.int32)
|
||||
self.items[idx][0] = sequence
|
||||
|
||||
else:
|
||||
func_args = [self.phoneme_cache_path, self.enable_eos_bos, self.cleaners, self.phoneme_language, self.tp, self.add_blank]
|
||||
if self.verbose:
|
||||
print(" | > Computing phonemes ...")
|
||||
if num_workers == 0:
|
||||
for idx, item in enumerate(tqdm.tqdm(self.items)):
|
||||
phonemes = self._phoneme_worker([item, func_args])
|
||||
self.items[idx][0] = phonemes
|
||||
else:
|
||||
with Pool(num_workers) as p:
|
||||
phonemes = list(tqdm.tqdm(p.imap(MyDataset._phoneme_worker, [[item, func_args] for item in self.items]), total=len(self.items)))
|
||||
for idx, p in enumerate(phonemes):
|
||||
self.items[idx][0] = p
|
||||
|
||||
def sort_items(self):
|
||||
r"""Sort instances based on text length in ascending order"""
|
||||
lengths = np.array([len(ins[0]) for ins in self.items])
|
||||
|
|
|
@ -8,6 +8,9 @@ from tqdm import tqdm
|
|||
|
||||
from TTS.tts.utils.generic_utils import split_dataset
|
||||
|
||||
####################
|
||||
# UTILITIES
|
||||
####################
|
||||
|
||||
def load_meta_data(datasets, eval_split=True):
|
||||
meta_data_train_all = []
|
||||
|
@ -17,9 +20,12 @@ def load_meta_data(datasets, eval_split=True):
|
|||
root_path = dataset['path']
|
||||
meta_file_train = dataset['meta_file_train']
|
||||
meta_file_val = dataset['meta_file_val']
|
||||
# setup the right data processor
|
||||
preprocessor = get_preprocessor_by_name(name)
|
||||
# load train set
|
||||
meta_data_train = preprocessor(root_path, meta_file_train)
|
||||
print(f" | > Found {len(meta_data_train)} files in {Path(root_path).resolve()}")
|
||||
# load evaluation split if set
|
||||
if eval_split:
|
||||
if meta_file_val is None:
|
||||
meta_data_eval, meta_data_train = split_dataset(meta_data_train)
|
||||
|
@ -27,15 +33,41 @@ def load_meta_data(datasets, eval_split=True):
|
|||
meta_data_eval = preprocessor(root_path, meta_file_val)
|
||||
meta_data_eval_all += meta_data_eval
|
||||
meta_data_train_all += meta_data_train
|
||||
# load attention masks for duration predictor training
|
||||
if 'meta_file_attn_mask' in dataset:
|
||||
meta_data = dict(load_attention_mask_meta_data(dataset['meta_file_attn_mask']))
|
||||
for idx, ins in enumerate(meta_data_train_all):
|
||||
attn_file = meta_data[ins[1]].strip()
|
||||
meta_data_train_all[idx].append(attn_file)
|
||||
if meta_data_eval_all is not None:
|
||||
for idx, ins in enumerate(meta_data_eval_all):
|
||||
attn_file = meta_data[ins[1]].strip()
|
||||
meta_data_eval_all[idx].append(attn_file)
|
||||
return meta_data_train_all, meta_data_eval_all
|
||||
|
||||
|
||||
def load_attention_mask_meta_data(metafile_path):
|
||||
"""Load meta data file created by compute_attention_masks.py"""
|
||||
with open(metafile_path, 'r') as f:
|
||||
lines = f.readlines()
|
||||
|
||||
meta_data = []
|
||||
for line in lines:
|
||||
wav_file, attn_file = line.split('|')
|
||||
meta_data.append([wav_file, attn_file])
|
||||
return meta_data
|
||||
|
||||
|
||||
def get_preprocessor_by_name(name):
|
||||
"""Returns the respective preprocessing function."""
|
||||
thismodule = sys.modules[__name__]
|
||||
return getattr(thismodule, name.lower())
|
||||
|
||||
|
||||
########################
|
||||
# DATASETS
|
||||
########################
|
||||
|
||||
def tweb(root_path, meta_file):
|
||||
"""Normalize TWEB dataset.
|
||||
https://www.kaggle.com/bryanpark/the-world-english-bible-speech-dataset
|
||||
|
@ -52,19 +84,6 @@ def tweb(root_path, meta_file):
|
|||
return items
|
||||
|
||||
|
||||
# def kusal(root_path, meta_file):
|
||||
# txt_file = os.path.join(root_path, meta_file)
|
||||
# texts = []
|
||||
# wavs = []
|
||||
# with open(txt_file, "r", encoding="utf8") as f:
|
||||
# frames = [
|
||||
# line.split('\t') for line in f
|
||||
# if line.split('\t')[0] in self.wav_files_dict.keys()
|
||||
# ]
|
||||
# # TODO: code the rest
|
||||
# return {'text': texts, 'wavs': wavs}
|
||||
|
||||
|
||||
def mozilla(root_path, meta_file):
|
||||
"""Normalizes Mozilla meta data files to TTS format"""
|
||||
txt_file = os.path.join(root_path, meta_file)
|
||||
|
|
|
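For reference, a hedged sketch of how load_meta_data() consumes a dataset config as handled above. The dataset name and file paths are illustrative placeholders; the attention-mask meta file follows the "wav_file|attn_file" line format parsed by load_attention_mask_meta_data().

# Illustrative config only; names and paths are placeholders.
datasets = [{
    "name": "ljspeech",                          # resolved via get_preprocessor_by_name()
    "path": "/data/LJSpeech-1.1/",
    "meta_file_train": "metadata.csv",
    "meta_file_val": None,                       # None -> split_dataset() carves out an eval set
    "meta_file_attn_mask": "metadata_attn.txt",  # lines of "wav_file|attn_file"
}]

meta_train, meta_eval = load_meta_data(datasets, eval_split=True)
text, wav_file, *rest = meta_train[0]            # attn_file is appended when the mask meta is given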
@ -0,0 +1,482 @@
|
|||
import torch
|
||||
from torch import nn
|
||||
from torch.nn import functional as F
|
||||
|
||||
from TTS.tts.layers.common_layers import Linear
|
||||
from scipy.stats import betabinom
|
||||
|
||||
|
||||
class LocationLayer(nn.Module):
|
||||
"""Layers for Location Sensitive Attention
|
||||
|
||||
Args:
|
||||
attention_dim (int): number of channels in the input tensor.
|
||||
attention_n_filters (int, optional): number of filters in convolution. Defaults to 32.
|
||||
attention_kernel_size (int, optional): kernel size of convolution filter. Defaults to 31.
|
||||
"""
|
||||
def __init__(self,
|
||||
attention_dim,
|
||||
attention_n_filters=32,
|
||||
attention_kernel_size=31):
|
||||
super(LocationLayer, self).__init__()
|
||||
self.location_conv1d = nn.Conv1d(
|
||||
in_channels=2,
|
||||
out_channels=attention_n_filters,
|
||||
kernel_size=attention_kernel_size,
|
||||
stride=1,
|
||||
padding=(attention_kernel_size - 1) // 2,
|
||||
bias=False)
|
||||
self.location_dense = Linear(
|
||||
attention_n_filters, attention_dim, bias=False, init_gain='tanh')
|
||||
|
||||
def forward(self, attention_cat):
|
||||
"""
|
||||
Shapes:
|
||||
attention_cat: [B, 2, C]
|
||||
"""
|
||||
processed_attention = self.location_conv1d(attention_cat)
|
||||
processed_attention = self.location_dense(
|
||||
processed_attention.transpose(1, 2))
|
||||
return processed_attention
|
||||
|
||||
|
||||
class GravesAttention(nn.Module):
|
||||
"""Graves Attention as is ref1 with updates from ref2.
|
||||
ref1: https://arxiv.org/abs/1910.10288
|
||||
ref2: https://arxiv.org/pdf/1906.01083.pdf
|
||||
|
||||
Args:
|
||||
query_dim (int): number of channels in query tensor.
|
||||
K (int): number of Gaussian heads to be used for computing attention.
|
||||
"""
|
||||
COEF = 0.3989422917366028 # numpy.sqrt(1/(2*numpy.pi))
|
||||
|
||||
def __init__(self, query_dim, K):
|
||||
|
||||
super(GravesAttention, self).__init__()
|
||||
self._mask_value = 1e-8
|
||||
self.K = K
|
||||
# self.attention_alignment = 0.05
|
||||
self.eps = 1e-5
|
||||
self.J = None
|
||||
self.N_a = nn.Sequential(
|
||||
nn.Linear(query_dim, query_dim, bias=True),
|
||||
nn.ReLU(),
|
||||
nn.Linear(query_dim, 3*K, bias=True))
|
||||
self.attention_weights = None
|
||||
self.mu_prev = None
|
||||
self.init_layers()
|
||||
|
||||
def init_layers(self):
|
||||
torch.nn.init.constant_(self.N_a[2].bias[(2*self.K):(3*self.K)], 1.) # bias mean
|
||||
torch.nn.init.constant_(self.N_a[2].bias[self.K:(2*self.K)], 10) # bias std
|
||||
|
||||
def init_states(self, inputs):
|
||||
if self.J is None or inputs.shape[1]+1 > self.J.shape[-1]:
|
||||
self.J = torch.arange(0, inputs.shape[1]+2.0).to(inputs.device) + 0.5
|
||||
self.attention_weights = torch.zeros(inputs.shape[0], inputs.shape[1]).to(inputs.device)
|
||||
self.mu_prev = torch.zeros(inputs.shape[0], self.K).to(inputs.device)
|
||||
|
||||
# pylint: disable=R0201
|
||||
# pylint: disable=unused-argument
|
||||
def preprocess_inputs(self, inputs):
|
||||
return None
|
||||
|
||||
def forward(self, query, inputs, processed_inputs, mask):
|
||||
"""
|
||||
Shapes:
|
||||
query: [B, C_attention_rnn]
|
||||
inputs: [B, T_in, C_encoder]
|
||||
processed_inputs: place_holder
|
||||
mask: [B, T_in]
|
||||
"""
|
||||
gbk_t = self.N_a(query)
|
||||
gbk_t = gbk_t.view(gbk_t.size(0), -1, self.K)
|
||||
|
||||
# attention model parameters
|
||||
# each B x K
|
||||
g_t = gbk_t[:, 0, :]
|
||||
b_t = gbk_t[:, 1, :]
|
||||
k_t = gbk_t[:, 2, :]
|
||||
|
||||
# dropout to decorrelate attention heads
|
||||
g_t = torch.nn.functional.dropout(g_t, p=0.5, training=self.training)
|
||||
|
||||
# attention GMM parameters
|
||||
sig_t = torch.nn.functional.softplus(b_t) + self.eps
|
||||
|
||||
mu_t = self.mu_prev + torch.nn.functional.softplus(k_t)
|
||||
g_t = torch.softmax(g_t, dim=-1) + self.eps
|
||||
|
||||
j = self.J[:inputs.size(1)+1]
|
||||
|
||||
# attention weights
|
||||
phi_t = g_t.unsqueeze(-1) * (1 / (1 + torch.sigmoid((mu_t.unsqueeze(-1) - j) / sig_t.unsqueeze(-1))))
|
||||
|
||||
# discretize attention weights
|
||||
alpha_t = torch.sum(phi_t, 1)
|
||||
alpha_t = alpha_t[:, 1:] - alpha_t[:, :-1]
|
||||
alpha_t[alpha_t == 0] = 1e-8
|
||||
|
||||
# apply masking
|
||||
if mask is not None:
|
||||
alpha_t.data.masked_fill_(~mask, self._mask_value)
|
||||
|
||||
context = torch.bmm(alpha_t.unsqueeze(1), inputs).squeeze(1)
|
||||
self.attention_weights = alpha_t
|
||||
self.mu_prev = mu_t
|
||||
return context
|
||||
|
||||
|
||||
class OriginalAttention(nn.Module):
|
||||
"""Bahdanau Attention with various optional modifications. Proposed below.
|
||||
- Location sensitive attention: https://arxiv.org/abs/1712.05884
|
||||
- Forward Attention: https://arxiv.org/abs/1807.06736 + state masking at inference
|
||||
- Using sigmoid instead of softmax normalization
|
||||
- Attention windowing at inference time
|
||||
|
||||
Note:
|
||||
Location Sensitive Attention is an attention mechanism that extends the additive attention mechanism
|
||||
to use cumulative attention weights from previous decoder time steps as an additional feature.
|
||||
|
||||
Forward attention considers only the alignment paths that satisfy the monotonic condition at each
|
||||
decoder timestep. The modified attention probabilities at each timestep are computed recursively
|
||||
using a forward algorithm.
|
||||
|
||||
Transition agent for forward attention is further proposed, which helps the attention mechanism
|
||||
to make decisions whether to move forward or stay at each decoder timestep.
|
||||
|
||||
Attention windowing applies a sliding window over the time steps of the input tensor, centered at the last
|
||||
time step with the largest attention weight. It is especially useful at inference to keep the attention
|
||||
alignment diagonal.
|
||||
|
||||
|
||||
Args:
|
||||
query_dim (int): number of channels in the query tensor.
|
||||
embedding_dim (int): number of channels in the value tensor. In general, the value tensor is the output of the encoder layer.
|
||||
attention_dim (int): number of channels of the inner attention layers.
|
||||
location_attention (bool): enable/disable location sensitive attention.
|
||||
attention_location_n_filters (int): number of location attention filters.
|
||||
attention_location_kernel_size (int): filter size of location attention convolution layer.
|
||||
windowing (int): window size for attention windowing. If it is 5, the attention only considers the time steps [(t-5), ..., (t+5)] of the input when computing the weights.
|
||||
norm (str): normalization method applied to the attention weights. 'softmax' or 'sigmoid'
|
||||
forward_attn (bool): enable/disable forward attention.
|
||||
trans_agent (bool): enable/disable transition agent in the forward attention.
|
||||
forward_attn_mask (int): enable/disable an explicit masking in forward attention. It is especially useful to enable at inference time.
|
||||
"""
|
||||
# Pylint gets confused by PyTorch conventions here
|
||||
#pylint: disable=attribute-defined-outside-init
|
||||
def __init__(self, query_dim, embedding_dim, attention_dim,
|
||||
location_attention, attention_location_n_filters,
|
||||
attention_location_kernel_size, windowing, norm, forward_attn,
|
||||
trans_agent, forward_attn_mask):
|
||||
super(OriginalAttention, self).__init__()
|
||||
self.query_layer = Linear(
|
||||
query_dim, attention_dim, bias=False, init_gain='tanh')
|
||||
self.inputs_layer = Linear(
|
||||
embedding_dim, attention_dim, bias=False, init_gain='tanh')
|
||||
self.v = Linear(attention_dim, 1, bias=True)
|
||||
if trans_agent:
|
||||
self.ta = nn.Linear(
|
||||
query_dim + embedding_dim, 1, bias=True)
|
||||
if location_attention:
|
||||
self.location_layer = LocationLayer(
|
||||
attention_dim,
|
||||
attention_location_n_filters,
|
||||
attention_location_kernel_size,
|
||||
)
|
||||
self._mask_value = -float("inf")
|
||||
self.windowing = windowing
|
||||
self.win_idx = None
|
||||
self.norm = norm
|
||||
self.forward_attn = forward_attn
|
||||
self.trans_agent = trans_agent
|
||||
self.forward_attn_mask = forward_attn_mask
|
||||
self.location_attention = location_attention
|
||||
|
||||
def init_win_idx(self):
|
||||
self.win_idx = -1
|
||||
self.win_back = 2
|
||||
self.win_front = 6
|
||||
|
||||
def init_forward_attn(self, inputs):
|
||||
B = inputs.shape[0]
|
||||
T = inputs.shape[1]
|
||||
self.alpha = torch.cat(
|
||||
[torch.ones([B, 1]),
|
||||
torch.zeros([B, T])[:, :-1] + 1e-7], dim=1).to(inputs.device)
|
||||
self.u = (0.5 * torch.ones([B, 1])).to(inputs.device)
|
||||
|
||||
def init_location_attention(self, inputs):
|
||||
B = inputs.size(0)
|
||||
T = inputs.size(1)
|
||||
self.attention_weights_cum = torch.zeros([B, T], device=inputs.device)
|
||||
|
||||
def init_states(self, inputs):
|
||||
B = inputs.size(0)
|
||||
T = inputs.size(1)
|
||||
self.attention_weights = torch.zeros([B, T], device=inputs.device)
|
||||
if self.location_attention:
|
||||
self.init_location_attention(inputs)
|
||||
if self.forward_attn:
|
||||
self.init_forward_attn(inputs)
|
||||
if self.windowing:
|
||||
self.init_win_idx()
|
||||
|
||||
def preprocess_inputs(self, inputs):
|
||||
return self.inputs_layer(inputs)
|
||||
|
||||
def update_location_attention(self, alignments):
|
||||
self.attention_weights_cum += alignments
|
||||
|
||||
def get_location_attention(self, query, processed_inputs):
|
||||
attention_cat = torch.cat((self.attention_weights.unsqueeze(1),
|
||||
self.attention_weights_cum.unsqueeze(1)),
|
||||
dim=1)
|
||||
processed_query = self.query_layer(query.unsqueeze(1))
|
||||
processed_attention_weights = self.location_layer(attention_cat)
|
||||
energies = self.v(
|
||||
torch.tanh(processed_query + processed_attention_weights +
|
||||
processed_inputs))
|
||||
energies = energies.squeeze(-1)
|
||||
return energies, processed_query
|
||||
|
||||
def get_attention(self, query, processed_inputs):
|
||||
processed_query = self.query_layer(query.unsqueeze(1))
|
||||
energies = self.v(torch.tanh(processed_query + processed_inputs))
|
||||
energies = energies.squeeze(-1)
|
||||
return energies, processed_query
|
||||
|
||||
def apply_windowing(self, attention, inputs):
|
||||
back_win = self.win_idx - self.win_back
|
||||
front_win = self.win_idx + self.win_front
|
||||
if back_win > 0:
|
||||
attention[:, :back_win] = -float("inf")
|
||||
if front_win < inputs.shape[1]:
|
||||
attention[:, front_win:] = -float("inf")
|
||||
# this is a trick to solve a special problem.
|
||||
# but it does not hurt.
|
||||
if self.win_idx == -1:
|
||||
attention[:, 0] = attention.max()
|
||||
# Update the window
|
||||
self.win_idx = torch.argmax(attention, 1).long()[0].item()
|
||||
return attention
|
||||
|
||||
def apply_forward_attention(self, alignment):
|
||||
# forward attention
|
||||
fwd_shifted_alpha = F.pad(
|
||||
self.alpha[:, :-1].clone().to(alignment.device), (1, 0, 0, 0))
|
||||
# compute transition potentials
|
||||
alpha = ((1 - self.u) * self.alpha
|
||||
+ self.u * fwd_shifted_alpha
|
||||
+ 1e-8) * alignment
|
||||
# force incremental alignment
|
||||
if not self.training and self.forward_attn_mask:
|
||||
_, n = fwd_shifted_alpha.max(1)
|
||||
val, _ = alpha.max(1)
|
||||
for b in range(alignment.shape[0]):
|
||||
alpha[b, n[b] + 3:] = 0
|
||||
alpha[b, :(
|
||||
n[b] - 1
|
||||
)] = 0 # ignore all previous states to prevent repetition.
|
||||
alpha[b,
|
||||
(n[b] - 2
|
||||
)] = 0.01 * val[b] # smoothing factor for the prev step
|
||||
# renormalize attention weights
|
||||
alpha = alpha / alpha.sum(dim=1, keepdim=True)
|
||||
return alpha
|
||||
|
||||
def forward(self, query, inputs, processed_inputs, mask):
|
||||
"""
|
||||
shapes:
|
||||
query: [B, C_attn_rnn]
|
||||
inputs: [B, T_en, D_en]
|
||||
processed_inputs: [B, T_en, D_attn]
|
||||
mask: [B, T_en]
|
||||
"""
|
||||
if self.location_attention:
|
||||
attention, _ = self.get_location_attention(
|
||||
query, processed_inputs)
|
||||
else:
|
||||
attention, _ = self.get_attention(
|
||||
query, processed_inputs)
|
||||
# apply masking
|
||||
if mask is not None:
|
||||
attention.data.masked_fill_(~mask, self._mask_value)
|
||||
# apply windowing - only in eval mode
|
||||
if not self.training and self.windowing:
|
||||
attention = self.apply_windowing(attention, inputs)
|
||||
|
||||
# normalize attention values
|
||||
if self.norm == "softmax":
|
||||
alignment = torch.softmax(attention, dim=-1)
|
||||
elif self.norm == "sigmoid":
|
||||
alignment = torch.sigmoid(attention) / torch.sigmoid(
|
||||
attention).sum(
|
||||
dim=1, keepdim=True)
|
||||
else:
|
||||
raise ValueError("Unknown value for attention norm type")
|
||||
|
||||
if self.location_attention:
|
||||
self.update_location_attention(alignment)
|
||||
|
||||
# apply forward attention if enabled
|
||||
if self.forward_attn:
|
||||
alignment = self.apply_forward_attention(alignment)
|
||||
self.alpha = alignment
|
||||
|
||||
context = torch.bmm(alignment.unsqueeze(1), inputs)
|
||||
context = context.squeeze(1)
|
||||
self.attention_weights = alignment
|
||||
|
||||
# compute transition agent
|
||||
if self.forward_attn and self.trans_agent:
|
||||
ta_input = torch.cat([context, query.squeeze(1)], dim=-1)
|
||||
self.u = torch.sigmoid(self.ta(ta_input))
|
||||
return context
|
||||
|
||||
|
||||
class MonotonicDynamicConvolutionAttention(nn.Module):
|
||||
"""Dynamic convolution attention from
|
||||
https://arxiv.org/pdf/1910.10288.pdf
|
||||
|
||||
|
||||
query -> linear -> tanh -> linear ->|
                                    |                                  mask values
                                    v                                  |    |
atten_w(t-1) -|-> conv1d_dynamic -> linear -|-> tanh -> + -> softmax -> * -> * -> context
              |-> conv1d_static -> linear -|            |
              |-> conv1d_prior -> log ----------------|
|
||||
|
||||
query: attention rnn output.
|
||||
|
||||
Note:
|
||||
Dynamic convolution attention is a variant of location sensitive attention with
|
||||
dynamically computed convolution filters from the previous attention scores and a set of
|
||||
constraints to keep the attention alignment diagonal.
|
||||
|
||||
Args:
|
||||
query_dim (int): number of channels in the query tensor.
|
||||
embedding_dim (int): number of channels in the value tensor.
|
||||
static_filter_dim (int): number of channels in the convolution layer computing the static filters.
|
||||
static_kernel_size (int): kernel size for the convolution layer computing the static filters.
|
||||
dynamic_filter_dim (int): number of channels in the convolution layer computing the dynamic filters.
|
||||
dynamic_kernel_size (int): kernel size for the convolution layer computing the dynamic filters.
|
||||
prior_filter_len (int, optional): length of the prior filter. Defaults to 11, as in the paper.
|
||||
alpha (float, optional): alpha parameter of the beta-binomial prior. Defaults to 0.1, as in the paper.
|
||||
beta (float, optional): beta parameter of the beta-binomial prior. Defaults to 0.9, as in the paper.
|
||||
"""
|
||||
def __init__(
|
||||
self,
|
||||
query_dim,
|
||||
embedding_dim, # pylint: disable=unused-argument
|
||||
attention_dim,
|
||||
static_filter_dim,
|
||||
static_kernel_size,
|
||||
dynamic_filter_dim,
|
||||
dynamic_kernel_size,
|
||||
prior_filter_len=11,
|
||||
alpha=0.1,
|
||||
beta=0.9,
|
||||
):
|
||||
super().__init__()
|
||||
self._mask_value = 1e-8
|
||||
self.dynamic_filter_dim = dynamic_filter_dim
|
||||
self.dynamic_kernel_size = dynamic_kernel_size
|
||||
self.prior_filter_len = prior_filter_len
|
||||
self.attention_weights = None
|
||||
# setup key and query layers
|
||||
self.query_layer = nn.Linear(query_dim, attention_dim)
|
||||
self.key_layer = nn.Linear(
|
||||
attention_dim, dynamic_filter_dim * dynamic_kernel_size, bias=False
|
||||
)
|
||||
self.static_filter_conv = nn.Conv1d(
|
||||
1,
|
||||
static_filter_dim,
|
||||
static_kernel_size,
|
||||
padding=(static_kernel_size - 1) // 2,
|
||||
bias=False,
|
||||
)
|
||||
self.static_filter_layer = nn.Linear(static_filter_dim, attention_dim, bias=False)
|
||||
self.dynamic_filter_layer = nn.Linear(dynamic_filter_dim, attention_dim)
|
||||
self.v = nn.Linear(attention_dim, 1, bias=False)
|
||||
|
||||
prior = betabinom.pmf(range(prior_filter_len), prior_filter_len - 1,
|
||||
alpha, beta)
|
||||
self.register_buffer("prior", torch.FloatTensor(prior).flip(0))
|
||||
|
||||
# pylint: disable=unused-argument
|
||||
def forward(self, query, inputs, processed_inputs, mask):
|
||||
"""
|
||||
query: [B, C_attn_rnn]
|
||||
inputs: [B, T_en, D_en]
|
||||
processed_inputs: place holder.
|
||||
mask: [B, T_en]
|
||||
"""
|
||||
# compute prior filters
|
||||
prior_filter = F.conv1d(
|
||||
F.pad(self.attention_weights.unsqueeze(1),
|
||||
(self.prior_filter_len - 1, 0)), self.prior.view(1, 1, -1))
|
||||
prior_filter = torch.log(prior_filter.clamp_min_(1e-6)).squeeze(1)
|
||||
G = self.key_layer(torch.tanh(self.query_layer(query)))
|
||||
# compute dynamic filters
|
||||
dynamic_filter = F.conv1d(
|
||||
self.attention_weights.unsqueeze(0),
|
||||
G.view(-1, 1, self.dynamic_kernel_size),
|
||||
padding=(self.dynamic_kernel_size - 1) // 2,
|
||||
groups=query.size(0),
|
||||
)
|
||||
dynamic_filter = dynamic_filter.view(query.size(0), self.dynamic_filter_dim, -1).transpose(1, 2)
|
||||
# compute static filters
|
||||
static_filter = self.static_filter_conv(self.attention_weights.unsqueeze(1)).transpose(1, 2)
|
||||
alignment = self.v(
|
||||
torch.tanh(
|
||||
self.static_filter_layer(static_filter) +
|
||||
self.dynamic_filter_layer(dynamic_filter))).squeeze(-1) + prior_filter
|
||||
# compute attention weights
|
||||
attention_weights = F.softmax(alignment, dim=-1)
|
||||
# apply masking
|
||||
if mask is not None:
|
||||
attention_weights.data.masked_fill_(~mask, self._mask_value)
|
||||
self.attention_weights = attention_weights
|
||||
# compute context
|
||||
context = torch.bmm(attention_weights.unsqueeze(1), inputs).squeeze(1)
|
||||
return context
|
||||
|
||||
def preprocess_inputs(self, inputs): # pylint: disable=no-self-use
|
||||
return None
|
||||
|
||||
def init_states(self, inputs):
|
||||
B = inputs.size(0)
|
||||
T = inputs.size(1)
|
||||
self.attention_weights = torch.zeros([B, T], device=inputs.device)
|
||||
self.attention_weights[:, 0] = 1.
|
||||
|
||||
|
||||
def init_attn(attn_type, query_dim, embedding_dim, attention_dim,
|
||||
location_attention, attention_location_n_filters,
|
||||
attention_location_kernel_size, windowing, norm, forward_attn,
|
||||
trans_agent, forward_attn_mask, attn_K):
|
||||
if attn_type == "original":
|
||||
return OriginalAttention(query_dim, embedding_dim, attention_dim,
|
||||
location_attention,
|
||||
attention_location_n_filters,
|
||||
attention_location_kernel_size, windowing,
|
||||
norm, forward_attn, trans_agent,
|
||||
forward_attn_mask)
|
||||
if attn_type == "graves":
|
||||
return GravesAttention(query_dim, attn_K)
|
||||
if attn_type == "dynamic_convolution":
|
||||
return MonotonicDynamicConvolutionAttention(query_dim,
|
||||
embedding_dim,
|
||||
attention_dim,
|
||||
static_filter_dim=8,
|
||||
static_kernel_size=21,
|
||||
dynamic_filter_dim=8,
|
||||
dynamic_kernel_size=21,
|
||||
prior_filter_len=11,
|
||||
alpha=0.1,
|
||||
beta=0.9)
|
||||
|
||||
raise RuntimeError(
|
||||
" [!] Given Attention Type '{attn_type}' is not exist.")
|
|
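A short, hedged sketch of driving the attention modules in this file through init_attn(); tensor sizes are illustrative and the keyword values mirror the defaults shown above.

# Minimal sketch: build a dynamic-convolution attention layer and run one
# decoder step. Shapes follow the docstrings above; values are illustrative.
import torch

B, T_in, D_en, D_query = 2, 50, 512, 1024
attn = init_attn(attn_type="dynamic_convolution",
                 query_dim=D_query, embedding_dim=D_en, attention_dim=128,
                 location_attention=True, attention_location_n_filters=32,
                 attention_location_kernel_size=31, windowing=False,
                 norm="softmax", forward_attn=False, trans_agent=False,
                 forward_attn_mask=False, attn_K=5)

inputs = torch.rand(B, T_in, D_en)            # encoder outputs
mask = torch.ones(B, T_in, dtype=torch.bool)  # all time steps valid
attn.init_states(inputs)
processed = attn.preprocess_inputs(inputs)    # None for graves / dynamic_convolution
query = torch.rand(B, D_query)                # attention-RNN hidden state
context = attn(query, inputs, processed, mask)  # -> [B, D_en]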
@ -4,6 +4,14 @@ from torch.nn import functional as F
|
|||
|
||||
|
||||
class Linear(nn.Module):
|
||||
"""Linear layer with a specific initialization.
|
||||
|
||||
Args:
|
||||
in_features (int): number of channels in the input tensor.
|
||||
out_features (int): number of channels in the output tensor.
|
||||
bias (bool, optional): enable/disable bias in the layer. Defaults to True.
|
||||
init_gain (str, optional): method to compute the gain in the weight initialization based on the nonlinear activation used afterwards. Defaults to 'linear'.
|
||||
"""
|
||||
def __init__(self,
|
||||
in_features,
|
||||
out_features,
|
||||
|
@ -24,6 +32,16 @@ class Linear(nn.Module):
|
|||
|
||||
|
||||
class LinearBN(nn.Module):
|
||||
"""Linear layer with Batch Normalization.
|
||||
|
||||
x -> linear -> BN -> o
|
||||
|
||||
Args:
|
||||
in_features (int): number of channels in the input tensor.
|
||||
out_features (int ): number of channels in the output tensor.
|
||||
bias (bool, optional): enable/disable bias in the linear layer. Defaults to True.
|
||||
init_gain (str, optional): method to set the gain for weight initialization. Defaults to 'linear'.
|
||||
"""
|
||||
def __init__(self,
|
||||
in_features,
|
||||
out_features,
|
||||
|
@ -41,6 +59,10 @@ class LinearBN(nn.Module):
|
|||
gain=torch.nn.init.calculate_gain(init_gain))
|
||||
|
||||
def forward(self, x):
|
||||
"""
|
||||
Shapes:
|
||||
x: [T, B, C] or [B, C]
|
||||
"""
|
||||
out = self.linear_layer(x)
|
||||
if len(out.shape) == 3:
|
||||
out = out.permute(1, 2, 0)
|
||||
|
@ -51,6 +73,29 @@ class LinearBN(nn.Module):
|
|||
|
||||
|
||||
class Prenet(nn.Module):
|
||||
"""Tacotron specific Prenet with an optional Batch Normalization.
|
||||
|
||||
Note:
|
||||
Prenet with BN improves the model performance significantly especially
|
||||
if it is enabled after learning a diagonal attention alignment with the original
|
||||
prenet. However, if the target dataset is high quality then it also works from
|
||||
the start. It is also suggested to disable dropout if BN is in use.
|
||||
|
||||
prenet_type == "original"
|
||||
x -> [linear -> ReLU -> Dropout]xN -> o
|
||||
|
||||
prenet_type == "bn"
|
||||
x -> [linear -> BN -> ReLU -> Dropout]xN -> o
|
||||
|
||||
Args:
|
||||
in_features (int): number of channels in the input tensor and the inner layers.
|
||||
prenet_type (str, optional): prenet type "original" or "bn". Defaults to "original".
|
||||
prenet_dropout (bool, optional): enable/disable dropout. Defaults to True.
|
||||
out_features (list, optional): List of output channels for each prenet block.
|
||||
It also defines number of the prenet blocks based on the length of argument list.
|
||||
Defaults to [256, 256].
|
||||
bias (bool, optional): enable/disable bias in prenet linear layers. Defaults to True.
|
||||
"""
|
||||
# pylint: disable=dangerous-default-value
|
||||
def __init__(self,
|
||||
in_features,
|
||||
|
@ -79,311 +124,4 @@ class Prenet(nn.Module):
|
|||
x = F.dropout(F.relu(linear(x)), p=0.5, training=self.training)
|
||||
else:
|
||||
x = F.relu(linear(x))
|
||||
return x
|
||||
|
||||
|
||||
####################
|
||||
# ATTENTION MODULES
|
||||
####################
|
||||
|
||||
|
||||
class LocationLayer(nn.Module):
|
||||
def __init__(self,
|
||||
attention_dim,
|
||||
attention_n_filters=32,
|
||||
attention_kernel_size=31):
|
||||
super(LocationLayer, self).__init__()
|
||||
self.location_conv1d = nn.Conv1d(
|
||||
in_channels=2,
|
||||
out_channels=attention_n_filters,
|
||||
kernel_size=attention_kernel_size,
|
||||
stride=1,
|
||||
padding=(attention_kernel_size - 1) // 2,
|
||||
bias=False)
|
||||
self.location_dense = Linear(
|
||||
attention_n_filters, attention_dim, bias=False, init_gain='tanh')
|
||||
|
||||
def forward(self, attention_cat):
|
||||
processed_attention = self.location_conv1d(attention_cat)
|
||||
processed_attention = self.location_dense(
|
||||
processed_attention.transpose(1, 2))
|
||||
return processed_attention
|
||||
|
||||
|
||||
class GravesAttention(nn.Module):
|
||||
""" Discretized Graves attention:
|
||||
- https://arxiv.org/abs/1910.10288
|
||||
- https://arxiv.org/pdf/1906.01083.pdf
|
||||
"""
|
||||
COEF = 0.3989422917366028 # numpy.sqrt(1/(2*numpy.pi))
|
||||
|
||||
def __init__(self, query_dim, K):
|
||||
super(GravesAttention, self).__init__()
|
||||
self._mask_value = 1e-8
|
||||
self.K = K
|
||||
# self.attention_alignment = 0.05
|
||||
self.eps = 1e-5
|
||||
self.J = None
|
||||
self.N_a = nn.Sequential(
|
||||
nn.Linear(query_dim, query_dim, bias=True),
|
||||
nn.ReLU(),
|
||||
nn.Linear(query_dim, 3*K, bias=True))
|
||||
self.attention_weights = None
|
||||
self.mu_prev = None
|
||||
self.init_layers()
|
||||
|
||||
def init_layers(self):
|
||||
torch.nn.init.constant_(self.N_a[2].bias[(2*self.K):(3*self.K)], 1.) # bias mean
|
||||
torch.nn.init.constant_(self.N_a[2].bias[self.K:(2*self.K)], 10) # bias std
|
||||
|
||||
def init_states(self, inputs):
|
||||
if self.J is None or inputs.shape[1]+1 > self.J.shape[-1]:
|
||||
self.J = torch.arange(0, inputs.shape[1]+2.0).to(inputs.device) + 0.5
|
||||
self.attention_weights = torch.zeros(inputs.shape[0], inputs.shape[1]).to(inputs.device)
|
||||
self.mu_prev = torch.zeros(inputs.shape[0], self.K).to(inputs.device)
|
||||
|
||||
# pylint: disable=R0201
|
||||
# pylint: disable=unused-argument
|
||||
def preprocess_inputs(self, inputs):
|
||||
return None
|
||||
|
||||
def forward(self, query, inputs, processed_inputs, mask):
|
||||
"""
|
||||
shapes:
|
||||
query: B x D_attention_rnn
|
||||
inputs: B x T_in x D_encoder
|
||||
processed_inputs: place_holder
|
||||
mask: B x T_in
|
||||
"""
|
||||
gbk_t = self.N_a(query)
|
||||
gbk_t = gbk_t.view(gbk_t.size(0), -1, self.K)
|
||||
|
||||
# attention model parameters
|
||||
# each B x K
|
||||
g_t = gbk_t[:, 0, :]
|
||||
b_t = gbk_t[:, 1, :]
|
||||
k_t = gbk_t[:, 2, :]
|
||||
|
||||
# dropout to decorrelate attention heads
|
||||
g_t = torch.nn.functional.dropout(g_t, p=0.5, training=self.training)
|
||||
|
||||
# attention GMM parameters
|
||||
sig_t = torch.nn.functional.softplus(b_t) + self.eps
|
||||
|
||||
mu_t = self.mu_prev + torch.nn.functional.softplus(k_t)
|
||||
g_t = torch.softmax(g_t, dim=-1) + self.eps
|
||||
|
||||
j = self.J[:inputs.size(1)+1]
|
||||
|
||||
# attention weights
|
||||
phi_t = g_t.unsqueeze(-1) * (1 / (1 + torch.sigmoid((mu_t.unsqueeze(-1) - j) / sig_t.unsqueeze(-1))))
|
||||
|
||||
# discretize attention weights
|
||||
alpha_t = torch.sum(phi_t, 1)
|
||||
alpha_t = alpha_t[:, 1:] - alpha_t[:, :-1]
|
||||
alpha_t[alpha_t == 0] = 1e-8
|
||||
|
||||
# apply masking
|
||||
if mask is not None:
|
||||
alpha_t.data.masked_fill_(~mask, self._mask_value)
|
||||
|
||||
context = torch.bmm(alpha_t.unsqueeze(1), inputs).squeeze(1)
|
||||
self.attention_weights = alpha_t
|
||||
self.mu_prev = mu_t
|
||||
return context
|
||||
|
||||
|
||||
class OriginalAttention(nn.Module):
|
||||
"""Following the methods proposed here:
|
||||
- https://arxiv.org/abs/1712.05884
|
||||
- https://arxiv.org/abs/1807.06736 + state masking at inference
|
||||
- Using sigmoid instead of softmax normalization
|
||||
- Attention windowing at inference time
|
||||
"""
|
||||
# Pylint gets confused by PyTorch conventions here
|
||||
#pylint: disable=attribute-defined-outside-init
|
||||
def __init__(self, query_dim, embedding_dim, attention_dim,
|
||||
location_attention, attention_location_n_filters,
|
||||
attention_location_kernel_size, windowing, norm, forward_attn,
|
||||
trans_agent, forward_attn_mask):
|
||||
super(OriginalAttention, self).__init__()
|
||||
self.query_layer = Linear(
|
||||
query_dim, attention_dim, bias=False, init_gain='tanh')
|
||||
self.inputs_layer = Linear(
|
||||
embedding_dim, attention_dim, bias=False, init_gain='tanh')
|
||||
self.v = Linear(attention_dim, 1, bias=True)
|
||||
if trans_agent:
|
||||
self.ta = nn.Linear(
|
||||
query_dim + embedding_dim, 1, bias=True)
|
||||
if location_attention:
|
||||
self.location_layer = LocationLayer(
|
||||
attention_dim,
|
||||
attention_location_n_filters,
|
||||
attention_location_kernel_size,
|
||||
)
|
||||
self._mask_value = -float("inf")
|
||||
self.windowing = windowing
|
||||
self.win_idx = None
|
||||
self.norm = norm
|
||||
self.forward_attn = forward_attn
|
||||
self.trans_agent = trans_agent
|
||||
self.forward_attn_mask = forward_attn_mask
|
||||
self.location_attention = location_attention
|
||||
|
||||
def init_win_idx(self):
|
||||
self.win_idx = -1
|
||||
self.win_back = 2
|
||||
self.win_front = 6
|
||||
|
||||
def init_forward_attn(self, inputs):
|
||||
B = inputs.shape[0]
|
||||
T = inputs.shape[1]
|
||||
self.alpha = torch.cat(
|
||||
[torch.ones([B, 1]),
|
||||
torch.zeros([B, T])[:, :-1] + 1e-7], dim=1).to(inputs.device)
|
||||
self.u = (0.5 * torch.ones([B, 1])).to(inputs.device)
|
||||
|
||||
def init_location_attention(self, inputs):
|
||||
B = inputs.size(0)
|
||||
T = inputs.size(1)
|
||||
self.attention_weights_cum = torch.zeros([B, T], device=inputs.device)
|
||||
|
||||
def init_states(self, inputs):
|
||||
B = inputs.size(0)
|
||||
T = inputs.size(1)
|
||||
self.attention_weights = torch.zeros([B, T], device=inputs.device)
|
||||
if self.location_attention:
|
||||
self.init_location_attention(inputs)
|
||||
if self.forward_attn:
|
||||
self.init_forward_attn(inputs)
|
||||
if self.windowing:
|
||||
self.init_win_idx()
|
||||
|
||||
def preprocess_inputs(self, inputs):
|
||||
return self.inputs_layer(inputs)
|
||||
|
||||
def update_location_attention(self, alignments):
|
||||
self.attention_weights_cum += alignments
|
||||
|
||||
def get_location_attention(self, query, processed_inputs):
|
||||
attention_cat = torch.cat((self.attention_weights.unsqueeze(1),
|
||||
self.attention_weights_cum.unsqueeze(1)),
|
||||
dim=1)
|
||||
processed_query = self.query_layer(query.unsqueeze(1))
|
||||
processed_attention_weights = self.location_layer(attention_cat)
|
||||
energies = self.v(
|
||||
torch.tanh(processed_query + processed_attention_weights +
|
||||
processed_inputs))
|
||||
energies = energies.squeeze(-1)
|
||||
return energies, processed_query
|
||||
|
||||
def get_attention(self, query, processed_inputs):
|
||||
processed_query = self.query_layer(query.unsqueeze(1))
|
||||
energies = self.v(torch.tanh(processed_query + processed_inputs))
|
||||
energies = energies.squeeze(-1)
|
||||
return energies, processed_query
|
||||
|
||||
def apply_windowing(self, attention, inputs):
|
||||
back_win = self.win_idx - self.win_back
|
||||
front_win = self.win_idx + self.win_front
|
||||
if back_win > 0:
|
||||
attention[:, :back_win] = -float("inf")
|
||||
if front_win < inputs.shape[1]:
|
||||
attention[:, front_win:] = -float("inf")
|
||||
# this is a trick to solve a special problem.
|
||||
# but it does not hurt.
|
||||
if self.win_idx == -1:
|
||||
attention[:, 0] = attention.max()
|
||||
# Update the window
|
||||
self.win_idx = torch.argmax(attention, 1).long()[0].item()
|
||||
return attention
|
||||
|
||||
def apply_forward_attention(self, alignment):
|
||||
# forward attention
|
||||
fwd_shifted_alpha = F.pad(
|
||||
self.alpha[:, :-1].clone().to(alignment.device), (1, 0, 0, 0))
|
||||
# compute transition potentials
|
||||
alpha = ((1 - self.u) * self.alpha
|
||||
+ self.u * fwd_shifted_alpha
|
||||
+ 1e-8) * alignment
|
||||
# force incremental alignment
|
||||
if not self.training and self.forward_attn_mask:
|
||||
_, n = fwd_shifted_alpha.max(1)
|
||||
val, _ = alpha.max(1)
|
||||
for b in range(alignment.shape[0]):
|
||||
alpha[b, n[b] + 3:] = 0
|
||||
alpha[b, :(
|
||||
n[b] - 1
|
||||
)] = 0 # ignore all previous states to prevent repetition.
|
||||
alpha[b,
|
||||
(n[b] - 2
|
||||
)] = 0.01 * val[b] # smoothing factor for the prev step
|
||||
# renormalize attention weights
|
||||
alpha = alpha / alpha.sum(dim=1, keepdim=True)
|
||||
return alpha
|
||||
|
||||
def forward(self, query, inputs, processed_inputs, mask):
|
||||
"""
|
||||
shapes:
|
||||
query: B x D_attn_rnn
|
||||
inputs: B x T_en x D_en
|
||||
processed_inputs:: B x T_en x D_attn
|
||||
mask: B x T_en
|
||||
"""
|
||||
if self.location_attention:
|
||||
attention, _ = self.get_location_attention(
|
||||
query, processed_inputs)
|
||||
else:
|
||||
attention, _ = self.get_attention(
|
||||
query, processed_inputs)
|
||||
# apply masking
|
||||
if mask is not None:
|
||||
attention.data.masked_fill_(~mask, self._mask_value)
|
||||
# apply windowing - only in eval mode
|
||||
if not self.training and self.windowing:
|
||||
attention = self.apply_windowing(attention, inputs)
|
||||
|
||||
# normalize attention values
|
||||
if self.norm == "softmax":
|
||||
alignment = torch.softmax(attention, dim=-1)
|
||||
elif self.norm == "sigmoid":
|
||||
alignment = torch.sigmoid(attention) / torch.sigmoid(
|
||||
attention).sum(
|
||||
dim=1, keepdim=True)
|
||||
else:
|
||||
raise ValueError("Unknown value for attention norm type")
|
||||
|
||||
if self.location_attention:
|
||||
self.update_location_attention(alignment)
|
||||
|
||||
# apply forward attention if enabled
|
||||
if self.forward_attn:
|
||||
alignment = self.apply_forward_attention(alignment)
|
||||
self.alpha = alignment
|
||||
|
||||
context = torch.bmm(alignment.unsqueeze(1), inputs)
|
||||
context = context.squeeze(1)
|
||||
self.attention_weights = alignment
|
||||
|
||||
# compute transition agent
|
||||
if self.forward_attn and self.trans_agent:
|
||||
ta_input = torch.cat([context, query.squeeze(1)], dim=-1)
|
||||
self.u = torch.sigmoid(self.ta(ta_input))
|
||||
return context
|
||||
|
||||
|
||||
def init_attn(attn_type, query_dim, embedding_dim, attention_dim,
|
||||
location_attention, attention_location_n_filters,
|
||||
attention_location_kernel_size, windowing, norm, forward_attn,
|
||||
trans_agent, forward_attn_mask, attn_K):
|
||||
if attn_type == "original":
|
||||
return OriginalAttention(query_dim, embedding_dim, attention_dim,
|
||||
location_attention,
|
||||
attention_location_n_filters,
|
||||
attention_location_kernel_size, windowing,
|
||||
norm, forward_attn, trans_agent,
|
||||
forward_attn_mask)
|
||||
if attn_type == "graves":
|
||||
return GravesAttention(query_dim, attn_K)
|
||||
raise RuntimeError(
|
||||
" [!] Given Attention Type '{attn_type}' is not exist.")
|
||||
return x
|
|
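For context, a hedged usage sketch of the Prenet documented above; sizes are illustrative and the [B, T, C] layout of the decoder input is an assumption.

# Illustrative sketch of the Prenet block described above; argument names
# follow the docstring, the tensor layout is an assumption.
import torch

prenet = Prenet(in_features=80,            # e.g. one mel frame
                prenet_type="original",    # or "bn" for the BatchNorm variant
                prenet_dropout=True,
                out_features=[256, 256])
frames = torch.rand(4, 100, 80)            # [B, T, C]
out = prenet(frames)                       # -> [B, T, 256]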
@ -0,0 +1,118 @@
|
|||
from torch import nn
|
||||
|
||||
|
||||
class ZeroTemporalPad(nn.Module):
|
||||
"""Pad sequences to equal lentgh in the temporal dimension"""
|
||||
def __init__(self, kernel_size, dilation):
|
||||
super().__init__()
|
||||
total_pad = (dilation * (kernel_size - 1))
|
||||
begin = total_pad // 2
|
||||
end = total_pad - begin
|
||||
self.pad_layer = nn.ZeroPad2d((0, 0, begin, end))
|
||||
|
||||
def forward(self, x):
|
||||
return self.pad_layer(x)
|
||||
|
||||
|
||||
class Conv1dBN(nn.Module):
|
||||
"""1d convolutional with batch norm.
|
||||
conv1d -> relu -> BN blocks.
|
||||
|
||||
Note:
|
||||
Batch normalization is applied after ReLU, following the original implementation.
|
||||
|
||||
Args:
|
||||
in_channels (int): number of input channels.
|
||||
out_channels (int): number of output channels.
|
||||
kernel_size (int): kernel size for convolutional filters.
|
||||
dilation (int): dilation for convolution layers.
|
||||
"""
|
||||
def __init__(self, in_channels, out_channels, kernel_size, dilation):
|
||||
super().__init__()
|
||||
padding = (dilation * (kernel_size - 1))
|
||||
pad_s = padding // 2
|
||||
pad_e = padding - pad_s
|
||||
self.conv1d = nn.Conv1d(in_channels, out_channels, kernel_size, dilation=dilation)
|
||||
self.pad = nn.ZeroPad2d((pad_s, pad_e, 0, 0)) # uneven left and right padding
|
||||
self.norm = nn.BatchNorm1d(out_channels)
|
||||
|
||||
def forward(self, x):
|
||||
o = self.conv1d(x)
|
||||
o = self.pad(o)
|
||||
o = nn.functional.relu(o)
|
||||
o = self.norm(o)
|
||||
return o
|
||||
|
||||
|
||||
class Conv1dBNBlock(nn.Module):
|
||||
"""1d convolutional block with batch norm. It is a set of conv1d -> relu -> BN blocks.
|
||||
|
||||
Args:
|
||||
in_channels (int): number of input channels.
|
||||
out_channels (int): number of output channels.
|
||||
hidden_channels (int): number of inner convolution channels.
|
||||
kernel_size (int): kernel size for convolutional filters.
|
||||
dilation (int): dilation for convolution layers.
|
||||
num_conv_blocks (int, optional): number of convolutional blocks. Defaults to 2.
|
||||
"""
|
||||
def __init__(self, in_channels, out_channels, hidden_channels, kernel_size, dilation, num_conv_blocks=2):
|
||||
super().__init__()
|
||||
self.conv_bn_blocks = []
|
||||
for idx in range(num_conv_blocks):
|
||||
layer = Conv1dBN(in_channels if idx == 0 else hidden_channels,
|
||||
out_channels if idx == (num_conv_blocks - 1) else hidden_channels,
|
||||
kernel_size,
|
||||
dilation)
|
||||
self.conv_bn_blocks.append(layer)
|
||||
self.conv_bn_blocks = nn.Sequential(*self.conv_bn_blocks)
|
||||
|
||||
def forward(self, x):
|
||||
"""
|
||||
Shapes:
|
||||
x: (B, D, T)
|
||||
"""
|
||||
return self.conv_bn_blocks(x)
|
||||
|
||||
|
||||
class ResidualConv1dBNBlock(nn.Module):
|
||||
"""Residual Convolutional Blocks with BN
|
||||
Each block has 'num_conv_blocks' conv layers, and 'num_res_blocks' such blocks are connected
|
||||
with residual connections.
|
||||
|
||||
conv_block = (conv1d -> relu -> bn) x 'num_conv_blocks'
|
||||
residual_conv_block = (x -> conv_block -> + ->) x 'num_res_blocks'
|
||||
' - - - - - - - - - ^
|
||||
Args:
|
||||
in_channels (int): number of input channels.
|
||||
out_channels (int): number of output channels.
|
||||
hidden_channels (int): number of inner convolution channels.
|
||||
kernel_size (int): kernel size for convolutional filters.
|
||||
dilations (list): dilations for each convolution layer.
|
||||
num_res_blocks (int, optional): number of residual blocks. Defaults to 13.
|
||||
num_conv_blocks (int, optional): number of convolutional blocks in each residual block. Defaults to 2.
|
||||
"""
|
||||
def __init__(self, in_channels, out_channels, hidden_channels, kernel_size, dilations, num_res_blocks=13, num_conv_blocks=2):
|
||||
|
||||
super().__init__()
|
||||
assert len(dilations) == num_res_blocks
|
||||
self.res_blocks = nn.ModuleList()
|
||||
for idx, dilation in enumerate(dilations):
|
||||
block = Conv1dBNBlock(in_channels if idx==0 else hidden_channels,
|
||||
out_channels if (idx + 1) == len(dilations) else hidden_channels,
|
||||
hidden_channels,
|
||||
kernel_size,
|
||||
dilation,
|
||||
num_conv_blocks)
|
||||
self.res_blocks.append(block)
|
||||
|
||||
def forward(self, x, x_mask=None):
|
||||
if x_mask is None:
|
||||
x_mask = 1.0
|
||||
o = x * x_mask
|
||||
for block in self.res_blocks:
|
||||
res = o
|
||||
o = block(o)
|
||||
o = o + res
|
||||
if x_mask is not None:
|
||||
o = o * x_mask
|
||||
return o
|
|
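A hedged sketch of the residual conv-BN stack above, using the 13-block dilation schedule quoted later in this change set for the 'residual_conv_bn' encoder; channel sizes are illustrative.

# Sketch only: 13 residual blocks of 2 conv layers each, with the dilation
# schedule cycling through 1, 2, 4. Channel sizes are illustrative.
import torch

dilations = [1, 2, 4] * 4 + [1]            # 13 entries, matches num_res_blocks
block = ResidualConv1dBNBlock(in_channels=192, out_channels=192,
                              hidden_channels=192, kernel_size=4,
                              dilations=dilations,
                              num_res_blocks=13, num_conv_blocks=2)
x = torch.rand(8, 192, 120)                # [B, D, T]
x_mask = torch.ones(8, 1, 120)             # keep every frame
o = block(x, x_mask)                       # -> [8, 192, 120]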
@ -0,0 +1,170 @@
|
|||
import torch
|
||||
from torch import nn
|
||||
|
||||
|
||||
@torch.jit.script
|
||||
def fused_add_tanh_sigmoid_multiply(input_a, input_b, n_channels):
|
||||
n_channels_int = n_channels[0]
|
||||
in_act = input_a + input_b
|
||||
t_act = torch.tanh(in_act[:, :n_channels_int, :])
|
||||
s_act = torch.sigmoid(in_act[:, n_channels_int:, :])
|
||||
acts = t_act * s_act
|
||||
return acts
|
||||
|
||||
|
||||
class WN(torch.nn.Module):
|
||||
"""Wavenet layers with weight norm and no input conditioning.
|
||||
|
||||
       |-----------------------------------------------------------------------------|
       |                                    |-> tanh -|                               |
res -|- conv1d(dilation) -> dropout -> + -|            * -> conv1d1x1 -> split -|- + -> res
g -------------------------------------|   |-> sigmoid -|                       |
o --------------------------------------------------------------------------- + --------- o
|
||||
|
||||
Args:
|
||||
in_channels (int): number of input channels.
|
||||
hidden_channels (int): number of hidden channels.
|
||||
kernel_size (int): filter kernel size for the first conv layer.
|
||||
dilation_rate (int): dilation rate used to increase the dilation per layer.
|
||||
If it is 2, dilations are 1, 2, 4, 8 for the next 4 layers.
|
||||
num_layers (int): number of wavenet layers.
|
||||
c_in_channels (int): number of channels of conditioning input.
|
||||
dropout_p (float): dropout rate.
|
||||
weight_norm (bool): enable/disable weight norm for convolution layers.
|
||||
"""
|
||||
def __init__(self,
|
||||
in_channels,
|
||||
hidden_channels,
|
||||
kernel_size,
|
||||
dilation_rate,
|
||||
num_layers,
|
||||
c_in_channels=0,
|
||||
dropout_p=0,
|
||||
weight_norm=True):
|
||||
super().__init__()
|
||||
assert kernel_size % 2 == 1
|
||||
assert hidden_channels % 2 == 0
|
||||
self.in_channels = in_channels
|
||||
self.hidden_channels = hidden_channels
|
||||
self.kernel_size = kernel_size
|
||||
self.dilation_rate = dilation_rate
|
||||
self.num_layers = num_layers
|
||||
self.c_in_channels = c_in_channels
|
||||
self.dropout_p = dropout_p
|
||||
|
||||
self.in_layers = torch.nn.ModuleList()
|
||||
self.res_skip_layers = torch.nn.ModuleList()
|
||||
self.dropout = nn.Dropout(dropout_p)
|
||||
|
||||
# init conditioning layer
|
||||
if c_in_channels > 0:
|
||||
cond_layer = torch.nn.Conv1d(c_in_channels,
|
||||
2 * hidden_channels * num_layers, 1)
|
||||
self.cond_layer = torch.nn.utils.weight_norm(cond_layer,
|
||||
name='weight')
|
||||
# intermediate layers
|
||||
for i in range(num_layers):
|
||||
dilation = dilation_rate**i
|
||||
padding = int((kernel_size * dilation - dilation) / 2)
|
||||
in_layer = torch.nn.Conv1d(hidden_channels,
|
||||
2 * hidden_channels,
|
||||
kernel_size,
|
||||
dilation=dilation,
|
||||
padding=padding)
|
||||
in_layer = torch.nn.utils.weight_norm(in_layer, name='weight')
|
||||
self.in_layers.append(in_layer)
|
||||
|
||||
if i < num_layers - 1:
|
||||
res_skip_channels = 2 * hidden_channels
|
||||
else:
|
||||
res_skip_channels = hidden_channels
|
||||
|
||||
res_skip_layer = torch.nn.Conv1d(hidden_channels,
|
||||
res_skip_channels, 1)
|
||||
res_skip_layer = torch.nn.utils.weight_norm(res_skip_layer,
|
||||
name='weight')
|
||||
self.res_skip_layers.append(res_skip_layer)
|
||||
# setup weight norm
|
||||
if not weight_norm:
|
||||
self.remove_weight_norm()
|
||||
|
||||
def forward(self, x, x_mask=None, g=None, **kwargs): # pylint: disable=unused-argument
|
||||
output = torch.zeros_like(x)
|
||||
n_channels_tensor = torch.IntTensor([self.hidden_channels])
|
||||
if g is not None:
|
||||
g = self.cond_layer(g)
|
||||
for i in range(self.num_layers):
|
||||
x_in = self.in_layers[i](x)
|
||||
x_in = self.dropout(x_in)
|
||||
if g is not None:
|
||||
cond_offset = i * 2 * self.hidden_channels
|
||||
g_l = g[:, cond_offset:cond_offset + 2 * self.hidden_channels, :]
|
||||
else:
|
||||
g_l = torch.zeros_like(x_in)
|
||||
acts = fused_add_tanh_sigmoid_multiply(x_in, g_l,
|
||||
n_channels_tensor)
|
||||
res_skip_acts = self.res_skip_layers[i](acts)
|
||||
if i < self.num_layers - 1:
|
||||
x = (x + res_skip_acts[:, :self.hidden_channels, :]) * x_mask
|
||||
output = output + res_skip_acts[:, self.hidden_channels:, :]
|
||||
else:
|
||||
output = output + res_skip_acts
|
||||
return output * x_mask
|
||||
|
||||
def remove_weight_norm(self):
|
||||
if self.c_in_channels != 0:
|
||||
torch.nn.utils.remove_weight_norm(self.cond_layer)
|
||||
for l in self.in_layers:
|
||||
torch.nn.utils.remove_weight_norm(l)
|
||||
for l in self.res_skip_layers:
|
||||
torch.nn.utils.remove_weight_norm(l)
|
||||
|
||||
|
||||
class WNBlocks(nn.Module):
|
||||
"""Wavenet blocks.
|
||||
|
||||
Note: After each block the dilation resets to 1, and within each block it
increases according to the dilation rate.
|
||||
|
||||
Args:
|
||||
in_channels (int): number of input channels.
|
||||
hidden_channels (int): number of hidden channels.
|
||||
kernel_size (int): filter kernel size for the first conv layer.
|
||||
dilation_rate (int): dilation rate used to increase the dilation per layer.
|
||||
If it is 2, dilations are 1, 2, 4, 8 for the next 4 layers.
|
||||
num_blocks (int): number of wavenet blocks.
|
||||
num_layers (int): number of wavenet layers.
|
||||
c_in_channels (int): number of channels of conditioning input.
|
||||
dropout_p (float): dropout rate.
|
||||
weight_norm (bool): enable/disable weight norm for convolution layers.
|
||||
"""
|
||||
|
||||
def __init__(self,
|
||||
in_channels,
|
||||
hidden_channels,
|
||||
kernel_size,
|
||||
dilation_rate,
|
||||
num_blocks,
|
||||
num_layers,
|
||||
c_in_channels=0,
|
||||
dropout_p=0,
|
||||
weight_norm=True):
|
||||
|
||||
super().__init__()
|
||||
self.wn_blocks = nn.ModuleList()
|
||||
for idx in range(num_blocks):
|
||||
layer = WN(in_channels=in_channels if idx == 0 else hidden_channels,
|
||||
hidden_channels=hidden_channels,
|
||||
kernel_size=kernel_size,
|
||||
dilation_rate=dilation_rate,
|
||||
num_layers=num_layers,
|
||||
c_in_channels=c_in_channels,
|
||||
dropout_p=dropout_p,
|
||||
weight_norm=weight_norm)
|
||||
self.wn_blocks.append(layer)
|
||||
|
||||
def forward(self, x, x_mask, g=None):
|
||||
o = x
|
||||
for layer in self.wn_blocks:
|
||||
o = layer(o, x_mask, g)
|
||||
return o
|
|
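A hedged sketch of stacking the WN blocks above with a global conditioning tensor; channel sizes are illustrative, and in_channels is kept equal to hidden_channels to match the layer widths used inside WN.

# Sketch: two WN blocks of 4 layers each, conditioned on a global embedding.
# All sizes are illustrative; in_channels == hidden_channels by assumption.
import torch

wn = WNBlocks(in_channels=192, hidden_channels=192, kernel_size=5,
              dilation_rate=2, num_blocks=2, num_layers=4,
              c_in_channels=128, dropout_p=0.05)
x = torch.rand(4, 192, 200)                # [B, C, T]
x_mask = torch.ones(4, 1, 200)             # no padded frames
g = torch.rand(4, 128, 1)                  # global conditioning, broadcast over T
o = wn(x, x_mask, g)                       # -> [4, 192, 200]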
@ -2,10 +2,17 @@ import torch
|
|||
from torch import nn
|
||||
|
||||
from TTS.tts.layers.glow_tts.glow import InvConvNear, CouplingBlock
|
||||
from TTS.tts.layers.glow_tts.normalization import ActNorm
|
||||
from TTS.tts.layers.generic.normalization import ActNorm
|
||||
|
||||
|
||||
def squeeze(x, x_mask=None, num_sqz=2):
|
||||
"""GlowTTS squeeze operation
|
||||
Increase number of channels and reduce number of time steps
|
||||
by the same factor.
|
||||
|
||||
Note:
|
||||
each 's' is a n-dimensional vector.
|
||||
[s1,s2,s3,s4,s5,s6] --> [[s1, s3, s5], [s2, s4, s6]]"""
|
||||
b, c, t = x.size()
|
||||
|
||||
t = (t // num_sqz) * num_sqz
|
||||
|
@ -23,6 +30,11 @@ def squeeze(x, x_mask=None, num_sqz=2):
|
|||
|
||||
|
||||
def unsqueeze(x, x_mask=None, num_sqz=2):
|
||||
"""GlowTTS unsqueeze operation
|
||||
|
||||
Note:
|
||||
each 's' is an n-dimensional vector.
|
||||
[[s1, s3, s5], [s2, s4, s6]] --> [s1, s2, s3, s4, s5, s6] """
|
||||
b, c, t = x.size()
|
||||
|
||||
x_unsqz = x.view(b, num_sqz, c // num_sqz, t)
|
||||
|
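To make the interleaving concrete, a small hedged demonstration of the squeeze/unsqueeze round trip described in the docstrings above; it assumes both helpers return a (tensor, mask) pair, as the surrounding code suggests but the hunks do not fully show.

# Demonstration sketch: squeeze halves the time axis and doubles the channels,
# unsqueeze restores the original tensor (documented round-trip behaviour).
import torch

x = torch.rand(1, 80, 100)                         # [B, C, T]
x_mask = torch.ones(1, 1, 100)

x_sqz, mask_sqz = squeeze(x, x_mask, num_sqz=2)    # -> [1, 160, 50]
x_back, _ = unsqueeze(x_sqz, mask_sqz, num_sqz=2)  # -> [1, 80, 100]
assert torch.allclose(x, x_back)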
@ -40,7 +52,19 @@ def unsqueeze(x, x_mask=None, num_sqz=2):
|
|||
|
||||
|
||||
class Decoder(nn.Module):
|
||||
"""Stack of Glow Modules"""
|
||||
"""Stack of Glow Decoder Modules.
|
||||
Squeeze -> ActNorm -> InvertibleConv1x1 -> AffineCoupling -> Unsqueeze
|
||||
|
||||
Args:
|
||||
in_channels (int): channels of input tensor.
|
||||
hidden_channels (int): hidden decoder channels.
|
||||
kernel_size (int): Coupling block kernel size. (Wavenet filter kernel size.)
|
||||
dilation_rate (int): rate to increase dilation by each layer in a decoder block.
|
||||
num_flow_blocks (int): number of decoder blocks.
|
||||
num_coupling_layers (int): number of coupling layers. (number of wavenet layers.)
|
||||
dropout_p (float): wavenet dropout rate.
|
||||
sigmoid_scale (bool): enable/disable sigmoid scaling in coupling layer.
|
||||
"""
|
||||
def __init__(self,
|
||||
in_channels,
|
||||
hidden_channels,
|
||||
|
@ -50,7 +74,7 @@ class Decoder(nn.Module):
|
|||
num_coupling_layers,
|
||||
dropout_p=0.,
|
||||
num_splits=4,
|
||||
num_sqz=2,
|
||||
num_squeeze=2,
|
||||
sigmoid_scale=False,
|
||||
c_in_channels=0):
|
||||
super().__init__()
|
||||
|
@ -63,18 +87,18 @@ class Decoder(nn.Module):
|
|||
self.num_coupling_layers = num_coupling_layers
|
||||
self.dropout_p = dropout_p
|
||||
self.num_splits = num_splits
|
||||
self.num_sqz = num_sqz
|
||||
self.num_squeeze = num_squeeze
|
||||
self.sigmoid_scale = sigmoid_scale
|
||||
self.c_in_channels = c_in_channels
|
||||
|
||||
self.flows = nn.ModuleList()
|
||||
for _ in range(num_flow_blocks):
|
||||
self.flows.append(ActNorm(channels=in_channels * num_sqz))
|
||||
self.flows.append(ActNorm(channels=in_channels * num_squeeze))
|
||||
self.flows.append(
|
||||
InvConvNear(channels=in_channels * num_sqz,
|
||||
InvConvNear(channels=in_channels * num_squeeze,
|
||||
num_splits=num_splits))
|
||||
self.flows.append(
|
||||
CouplingBlock(in_channels * num_sqz,
|
||||
CouplingBlock(in_channels * num_squeeze,
|
||||
hidden_channels,
|
||||
kernel_size=kernel_size,
|
||||
dilation_rate=dilation_rate,
|
||||
|
@ -91,16 +115,16 @@ class Decoder(nn.Module):
|
|||
flows = reversed(self.flows)
|
||||
logdet_tot = None
|
||||
|
||||
if self.num_sqz > 1:
|
||||
x, x_mask = squeeze(x, x_mask, self.num_sqz)
|
||||
if self.num_squeeze > 1:
|
||||
x, x_mask = squeeze(x, x_mask, self.num_squeeze)
|
||||
for f in flows:
|
||||
if not reverse:
|
||||
x, logdet = f(x, x_mask, g=g, reverse=reverse)
|
||||
logdet_tot += logdet
|
||||
else:
|
||||
x, logdet = f(x, x_mask, g=g, reverse=reverse)
|
||||
if self.num_sqz > 1:
|
||||
x, x_mask = unsqueeze(x, x_mask, self.num_sqz)
|
||||
if self.num_squeeze > 1:
|
||||
x, x_mask = unsqueeze(x, x_mask, self.num_squeeze)
|
||||
return x, logdet_tot
|
||||
|
||||
def store_inverse(self):
|
||||
|
|
|
@ -1,33 +1,50 @@
|
|||
import torch
|
||||
from torch import nn
|
||||
|
||||
from .normalization import LayerNorm
|
||||
from ..generic.normalization import LayerNorm
|
||||
|
||||
|
||||
class DurationPredictor(nn.Module):
|
||||
def __init__(self, in_channels, filter_channels, kernel_size, dropout_p):
|
||||
"""Glow-TTS duration prediction model.
|
||||
[2 x (conv1d_kxk -> relu -> layer_norm -> dropout)] -> conv1d_1x1 -> durs
|
||||
|
||||
Args:
|
||||
in_channels (int): number of channels in the input tensor.
hidden_channels (int): number of channels in the inner conv layers.
kernel_size (int): kernel size of the conv layers.
dropout_p (float): dropout rate.
|
||||
"""
|
||||
def __init__(self, in_channels, hidden_channels, kernel_size, dropout_p):
|
||||
super().__init__()
|
||||
# class arguments
|
||||
self.in_channels = in_channels
|
||||
self.filter_channels = filter_channels
|
||||
self.filter_channels = hidden_channels
|
||||
self.kernel_size = kernel_size
|
||||
self.dropout_p = dropout_p
|
||||
# layers
|
||||
self.drop = nn.Dropout(dropout_p)
|
||||
self.conv_1 = nn.Conv1d(in_channels,
|
||||
filter_channels,
|
||||
hidden_channels,
|
||||
kernel_size,
|
||||
padding=kernel_size // 2)
|
||||
self.norm_1 = LayerNorm(filter_channels)
|
||||
self.conv_2 = nn.Conv1d(filter_channels,
|
||||
filter_channels,
|
||||
self.norm_1 = LayerNorm(hidden_channels)
|
||||
self.conv_2 = nn.Conv1d(hidden_channels,
|
||||
hidden_channels,
|
||||
kernel_size,
|
||||
padding=kernel_size // 2)
|
||||
self.norm_2 = LayerNorm(filter_channels)
|
||||
self.norm_2 = LayerNorm(hidden_channels)
|
||||
# output layer
|
||||
self.proj = nn.Conv1d(filter_channels, 1, 1)
|
||||
self.proj = nn.Conv1d(hidden_channels, 1, 1)
|
||||
|
||||
def forward(self, x, x_mask):
|
||||
"""
|
||||
Shapes:
|
||||
x: [B, C, T]
|
||||
x_mask: [B, 1, T]
|
||||
|
||||
Returns:
|
||||
torch.Tensor: predicted durations in log scale, [B, 1, T].
|
||||
"""
|
||||
x = self.conv_1(x * x_mask)
|
||||
x = torch.relu(x)
|
||||
x = self.norm_1(x)
|
||||
|
|
|
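A hedged sketch of calling the duration predictor above; channel sizes are illustrative, and interpreting the output as log durations follows the Glow-TTS convention rather than anything shown explicitly in this hunk.

# Sketch: predict durations for a batch of encoder outputs. Sizes are illustrative.
import torch

dp = DurationPredictor(in_channels=192, hidden_channels=256,
                       kernel_size=3, dropout_p=0.1)
x = torch.rand(2, 192, 60)                 # [B, C, T] encoder features
x_mask = torch.ones(2, 1, 60)              # [B, 1, T]
log_durs = dp(x, x_mask)                   # -> [B, 1, T] (assumed return shape)
durs = torch.exp(log_durs) * x_mask        # durations in frames (assumption)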
@ -2,25 +2,30 @@ import math
|
|||
import torch
|
||||
from torch import nn
|
||||
|
||||
from TTS.tts.layers.glow_tts.transformer import Transformer
|
||||
from TTS.tts.layers.glow_tts.gated_conv import GatedConvBlock
|
||||
from TTS.tts.layers.glow_tts.transformer import RelativePositionTransformer
|
||||
from TTS.tts.layers.generic.gated_conv import GatedConvBlock
|
||||
from TTS.tts.utils.generic_utils import sequence_mask
|
||||
from TTS.tts.layers.glow_tts.glow import ConvLayerNorm
|
||||
from TTS.tts.layers.glow_tts.glow import ResidualConv1dLayerNormBlock
|
||||
from TTS.tts.layers.glow_tts.duration_predictor import DurationPredictor
|
||||
from TTS.tts.layers.glow_tts.time_depth_sep_conv import TimeDepthSeparableConvBlock
|
||||
from TTS.tts.layers.generic.time_depth_sep_conv import TimeDepthSeparableConvBlock
|
||||
from TTS.tts.layers.generic.res_conv_bn import ResidualConv1dBNBlock
|
||||
|
||||
|
||||
class Encoder(nn.Module):
|
||||
"""Glow-TTS encoder module. It uses Transformer with Relative Pos.Encoding
|
||||
as in the original paper or GatedConvBlock as a faster alternative.
|
||||
"""Glow-TTS encoder module.
|
||||
|
||||
embedding -> <prenet> -> encoder_module -> <postnet> --> proj_mean
                                                     |
                                                     |-> proj_var
                                                     |
                                                     |-> concat -> duration_predictor
                                                            ↑
                                                      speaker_embed
|
||||
Args:
|
||||
num_chars (int): number of characters.
|
||||
out_channels (int): number of output channels.
|
||||
hidden_channels (int): encoder's embedding size.
|
||||
filter_channels (int): transformer's feed-forward channels.
|
||||
num_head (int): number of attention heads in transformer.
|
||||
num_layers (int): number of transformer encoder stack.
|
||||
hidden_channels_ffn (int): transformer's feed-forward channels.
|
||||
kernel_size (int): kernel size for conv layers and duration predictor.
|
||||
dropout_p (float): dropout rate for any dropout layer.
|
||||
mean_only (bool): if True, output only mean values and use constant std.
|
||||
|
@ -29,20 +34,49 @@ class Encoder(nn.Module):
|
|||
|
||||
Shapes:
|
||||
- input: (B, T, C)
|
||||
|
||||
Notes:
|
||||
suggested encoder params...
|
||||
|
||||
for encoder_type == 'rel_pos_transformer'
|
||||
encoder_params={
|
||||
'kernel_size':3,
|
||||
'dropout_p': 0.1,
|
||||
'num_layers': 6,
|
||||
'num_heads': 2,
|
||||
'hidden_channels_ffn': 768, # 4 times the hidden_channels
|
||||
'input_length': None
|
||||
}
|
||||
|
||||
for encoder_type == 'gated_conv'
|
||||
encoder_params={
|
||||
'kernel_size':5,
|
||||
'dropout_p': 0.1,
|
||||
'num_layers': 9,
|
||||
}
|
||||
|
||||
for encoder_type == 'residual_conv_bn'
|
||||
encoder_params={
|
||||
"kernel_size": 4,
|
||||
"dilations": [1, 2, 4, 1, 2, 4, 1, 2, 4, 1, 2, 4, 1],
|
||||
"num_conv_blocks": 2,
|
||||
"num_res_blocks": 13
|
||||
}
|
||||
|
||||
for encoder_type == 'time_depth_separable'
|
||||
encoder_params={
|
||||
"kernel_size": 5,
|
||||
'num_layers': 9,
|
||||
}
|
||||
"""
|
||||
def __init__(self,
|
||||
num_chars,
|
||||
out_channels,
|
||||
hidden_channels,
|
||||
filter_channels,
|
||||
filter_channels_dp,
|
||||
hidden_channels_dp,
|
||||
encoder_type,
|
||||
num_heads,
|
||||
num_layers,
|
||||
kernel_size,
|
||||
dropout_p,
|
||||
rel_attn_window_size=None,
|
||||
input_length=None,
|
||||
encoder_params,
|
||||
dropout_p_dp=0.1,
|
||||
mean_only=False,
|
||||
use_prenet=True,
|
||||
c_in_channels=0):
|
||||
|
@ -51,12 +85,8 @@ class Encoder(nn.Module):
|
|||
self.num_chars = num_chars
|
||||
self.out_channels = out_channels
|
||||
self.hidden_channels = hidden_channels
|
||||
self.filter_channels = filter_channels
|
||||
self.filter_channels_dp = filter_channels_dp
|
||||
self.num_heads = num_heads
|
||||
self.num_layers = num_layers
|
||||
self.kernel_size = kernel_size
|
||||
self.dropout_p = dropout_p
|
||||
self.hidden_channels_dp = hidden_channels_dp
|
||||
self.dropout_p_dp = dropout_p_dp
|
||||
self.mean_only = mean_only
|
||||
self.use_prenet = use_prenet
|
||||
self.c_in_channels = c_in_channels
|
||||
|
@ -64,35 +94,37 @@ class Encoder(nn.Module):
|
|||
# embedding layer
|
||||
self.emb = nn.Embedding(num_chars, hidden_channels)
|
||||
nn.init.normal_(self.emb.weight, 0.0, hidden_channels**-0.5)
|
||||
# init encoder
|
||||
if encoder_type.lower() == "transformer":
|
||||
# optional convolutional prenet
|
||||
# init encoder module
|
||||
if encoder_type.lower() == "rel_pos_transformer":
|
||||
if use_prenet:
|
||||
self.pre = ConvLayerNorm(hidden_channels,
|
||||
self.prenet = ResidualConv1dLayerNormBlock(hidden_channels,
|
||||
hidden_channels,
|
||||
hidden_channels,
|
||||
kernel_size=5,
|
||||
num_layers=3,
|
||||
dropout_p=0.5)
|
||||
# text encoder
|
||||
self.encoder = Transformer(
|
||||
hidden_channels,
|
||||
filter_channels,
|
||||
num_heads,
|
||||
num_layers,
|
||||
kernel_size=kernel_size,
|
||||
dropout_p=dropout_p,
|
||||
rel_attn_window_size=rel_attn_window_size,
|
||||
input_length=input_length)
|
||||
elif encoder_type.lower() == 'gatedconv':
|
||||
self.encoder = GatedConvBlock(hidden_channels,
|
||||
kernel_size=5,
|
||||
dropout_p=dropout_p,
|
||||
num_layers=3 + num_layers)
|
||||
elif encoder_type.lower() == 'time-depth-separable':
|
||||
# optional convolutional prenet
|
||||
self.encoder = RelativePositionTransformer(hidden_channels,
|
||||
hidden_channels,
|
||||
hidden_channels,
|
||||
**encoder_params)
|
||||
elif encoder_type.lower() == 'gated_conv':
|
||||
self.encoder = GatedConvBlock(hidden_channels, **encoder_params)
|
||||
elif encoder_type.lower() == 'residual_conv_bn':
|
||||
if use_prenet:
|
||||
self.pre = ConvLayerNorm(hidden_channels,
|
||||
self.prenet = nn.Sequential(
|
||||
nn.Conv1d(hidden_channels, hidden_channels, 1),
|
||||
nn.ReLU()
|
||||
)
|
||||
self.encoder = ResidualConv1dBNBlock(hidden_channels,
|
||||
hidden_channels,
|
||||
hidden_channels,
|
||||
**encoder_params)
|
||||
self.postnet = nn.Sequential(
|
||||
nn.Conv1d(self.hidden_channels, self.hidden_channels, 1),
|
||||
nn.BatchNorm1d(self.hidden_channels))
|
||||
elif encoder_type.lower() == 'time_depth_separable':
|
||||
if use_prenet:
|
||||
self.prenet = ResidualConv1dLayerNormBlock(hidden_channels,
|
||||
hidden_channels,
|
||||
hidden_channels,
|
||||
kernel_size=5,
|
||||
|
@ -101,8 +133,9 @@ class Encoder(nn.Module):
|
|||
self.encoder = TimeDepthSeparableConvBlock(hidden_channels,
|
||||
hidden_channels,
|
||||
hidden_channels,
|
||||
kernel_size=5,
|
||||
num_layers=3 + num_layers)
|
||||
**encoder_params)
|
||||
else:
|
||||
raise ValueError(" [!] Unkown encoder type.")
|
||||
|
||||
# final projection layers
|
||||
self.proj_m = nn.Conv1d(hidden_channels, out_channels, 1)
|
||||
|
@ -110,10 +143,16 @@ class Encoder(nn.Module):
|
|||
self.proj_s = nn.Conv1d(hidden_channels, out_channels, 1)
|
||||
# duration predictor
|
||||
self.duration_predictor = DurationPredictor(
|
||||
hidden_channels + c_in_channels, filter_channels_dp, kernel_size,
|
||||
dropout_p)
|
||||
hidden_channels + c_in_channels, hidden_channels_dp, 3,
|
||||
dropout_p_dp)
|
||||
|
||||
def forward(self, x, x_lengths, g=None):
|
||||
"""
|
||||
Shapes:
|
||||
x: [B, C, T]
|
||||
x_lengths: [B]
|
||||
g (optional): [B, 1, T]
|
||||
"""
|
||||
# embedding layer
|
||||
# [B ,T, D]
|
||||
x = self.emb(x) * math.sqrt(self.hidden_channels)
|
||||
|
@ -122,12 +161,14 @@ class Encoder(nn.Module):
|
|||
# compute input sequence mask
|
||||
x_mask = torch.unsqueeze(sequence_mask(x_lengths, x.size(2)),
|
||||
1).to(x.dtype)
|
||||
# pre-conv layers
|
||||
if self.encoder_type in ['transformer', 'time-depth-separable']:
|
||||
if self.use_prenet:
|
||||
x = self.pre(x, x_mask)
|
||||
# prenet
|
||||
if hasattr(self, 'prenet') and self.use_prenet:
|
||||
x = self.prenet(x, x_mask)
|
||||
# encoder
|
||||
x = self.encoder(x, x_mask)
|
||||
# postnet
|
||||
if hasattr(self, 'postnet'):
|
||||
x = self.postnet(x) * x_mask
|
||||
# set duration predictor input
|
||||
if g is not None:
|
||||
g_exp = g.expand(-1, -1, x.size(-1))
|
||||
|
|
|
@ -1,13 +1,28 @@
|
|||
import torch
|
||||
from torch import nn
|
||||
from torch.nn import functional as F
|
||||
from TTS.tts.layers.generic.wavenet import WN
|
||||
|
||||
from .normalization import LayerNorm
|
||||
from ..generic.normalization import LayerNorm
|
||||
|
||||
|
||||
class ConvLayerNorm(nn.Module):
|
||||
class ResidualConv1dLayerNormBlock(nn.Module):
|
||||
def __init__(self, in_channels, hidden_channels, out_channels, kernel_size,
|
||||
num_layers, dropout_p):
|
||||
"""Conv1d with Layer Normalization and residual connection as in GlowTTS paper.
|
||||
https://arxiv.org/pdf/1811.00002.pdf
|
||||
|
||||
x |-> conv1d -> layer_norm -> relu -> dropout -> + -> o
|
||||
|---------------> conv1d_1x1 -----------------------|
|
||||
|
||||
Args:
|
||||
in_channels (int): number of input tensor channels.
|
||||
hidden_channels (int): number of inner layer channels.
|
||||
out_channels (int): number of output tensor channels.
|
||||
kernel_size (int): kernel size of conv1d filter.
|
||||
num_layers (int): number of blocks.
|
||||
dropout_p (float): dropout rate for each block.
|
||||
"""
|
||||
super().__init__()
|
||||
self.in_channels = in_channels
|
||||
self.hidden_channels = hidden_channels
|
||||
|
@ -21,16 +36,9 @@ class ConvLayerNorm(nn.Module):
|
|||
self.conv_layers = nn.ModuleList()
|
||||
self.norm_layers = nn.ModuleList()
|
||||
|
||||
self.conv_layers.append(
|
||||
nn.Conv1d(in_channels,
|
||||
hidden_channels,
|
||||
kernel_size,
|
||||
padding=kernel_size // 2))
|
||||
self.norm_layers.append(LayerNorm(hidden_channels))
|
||||
|
||||
for _ in range(num_layers - 1):
|
||||
for idx in range(num_layers):
|
||||
self.conv_layers.append(
|
||||
nn.Conv1d(hidden_channels,
|
||||
nn.Conv1d(in_channels if idx == 0 else hidden_channels,
|
||||
hidden_channels,
|
||||
kernel_size,
|
||||
padding=kernel_size // 2))
|
||||
|
@ -50,105 +58,20 @@ class ConvLayerNorm(nn.Module):
|
|||
return x * x_mask
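For reference, a brief usage sketch of the renamed block. The import path and the call pattern are taken from the encoder diff above (the prenet uses the same hyper-parameters); the internals hidden by the truncated hunk are assumed to follow the residual pattern described in the docstring, so treat this as a sketch rather than a guaranteed API.

import torch
from TTS.tts.layers.glow_tts.glow import ResidualConv1dLayerNormBlock

# same hyper-parameters the Glow-TTS encoder uses for its prenet
block = ResidualConv1dLayerNormBlock(192, 192, 192,
                                     kernel_size=5, num_layers=3, dropout_p=0.5)
x = torch.randn(2, 192, 40)      # [B, C, T]
x_mask = torch.ones(2, 1, 40)    # [B, 1, T]
y = block(x, x_mask)             # masked output, same shape as the input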
|
||||
|
||||
|
||||
@torch.jit.script
|
||||
def fused_add_tanh_sigmoid_multiply(input_a, input_b, n_channels):
|
||||
n_channels_int = n_channels[0]
|
||||
in_act = input_a + input_b
|
||||
t_act = torch.tanh(in_act[:, :n_channels_int, :])
|
||||
s_act = torch.sigmoid(in_act[:, n_channels_int:, :])
|
||||
acts = t_act * s_act
|
||||
return acts
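A minimal shape check for the gated activation above, assuming the function as defined here is in scope (the WN stack that consumes it is now imported from TTS.tts.layers.generic.wavenet per the new import at the top of this file). The tensors are hypothetical and only illustrate that two 2*C-channel inputs are fused into a C-channel output.

import torch

B, C, T = 2, 8, 20
input_a = torch.randn(B, 2 * C, T)      # e.g. dilated conv output
input_b = torch.randn(B, 2 * C, T)      # e.g. conditioning projection
n_channels = torch.IntTensor([C])
acts = fused_add_tanh_sigmoid_multiply(input_a, input_b, n_channels)
assert acts.shape == (B, C, T)          # first half tanh-gated by the sigmoid of the second half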
|
||||
|
||||
|
||||
class WN(torch.nn.Module):
|
||||
def __init__(self,
|
||||
in_channels,
|
||||
hidden_channels,
|
||||
kernel_size,
|
||||
dilation_rate,
|
||||
num_layers,
|
||||
c_in_channels=0,
|
||||
dropout_p=0):
|
||||
super().__init__()
|
||||
assert kernel_size % 2 == 1
|
||||
assert hidden_channels % 2 == 0
|
||||
self.in_channels = in_channels
|
||||
self.hidden_channels = hidden_channels
|
||||
self.kernel_size = kernel_size
|
||||
self.dilation_rate = dilation_rate
|
||||
self.num_layers = num_layers
|
||||
self.c_in_channels = c_in_channels
|
||||
self.dropout_p = dropout_p
|
||||
|
||||
self.in_layers = torch.nn.ModuleList()
|
||||
self.res_skip_layers = torch.nn.ModuleList()
|
||||
self.dropout = nn.Dropout(dropout_p)
|
||||
|
||||
if c_in_channels != 0:
|
||||
cond_layer = torch.nn.Conv1d(c_in_channels,
|
||||
2 * hidden_channels * num_layers, 1)
|
||||
self.cond_layer = torch.nn.utils.weight_norm(cond_layer,
|
||||
name='weight')
|
||||
|
||||
for i in range(num_layers):
|
||||
dilation = dilation_rate**i
|
||||
padding = int((kernel_size * dilation - dilation) / 2)
|
||||
in_layer = torch.nn.Conv1d(hidden_channels,
|
||||
2 * hidden_channels,
|
||||
kernel_size,
|
||||
dilation=dilation,
|
||||
padding=padding)
|
||||
in_layer = torch.nn.utils.weight_norm(in_layer, name='weight')
|
||||
self.in_layers.append(in_layer)
|
||||
|
||||
if i < num_layers - 1:
|
||||
res_skip_channels = 2 * hidden_channels
|
||||
else:
|
||||
res_skip_channels = hidden_channels
|
||||
|
||||
res_skip_layer = torch.nn.Conv1d(hidden_channels,
|
||||
res_skip_channels, 1)
|
||||
res_skip_layer = torch.nn.utils.weight_norm(res_skip_layer,
|
||||
name='weight')
|
||||
self.res_skip_layers.append(res_skip_layer)
|
||||
|
||||
def forward(self, x, x_mask=None, g=None, **kwargs): # pylint: disable=unused-argument
|
||||
output = torch.zeros_like(x)
|
||||
n_channels_tensor = torch.IntTensor([self.hidden_channels])
|
||||
|
||||
if g is not None:
|
||||
g = self.cond_layer(g)
|
||||
|
||||
for i in range(self.num_layers):
|
||||
x_in = self.in_layers[i](x)
|
||||
x_in = self.dropout(x_in)
|
||||
if g is not None:
|
||||
cond_offset = i * 2 * self.hidden_channels
|
||||
g_l = g[:,
|
||||
cond_offset:cond_offset + 2 * self.hidden_channels, :]
|
||||
else:
|
||||
g_l = torch.zeros_like(x_in)
|
||||
|
||||
acts = fused_add_tanh_sigmoid_multiply(x_in, g_l,
|
||||
n_channels_tensor)
|
||||
|
||||
res_skip_acts = self.res_skip_layers[i](acts)
|
||||
if i < self.num_layers - 1:
|
||||
x = (x + res_skip_acts[:, :self.hidden_channels, :]) * x_mask
|
||||
output = output + res_skip_acts[:, self.hidden_channels:, :]
|
||||
else:
|
||||
output = output + res_skip_acts
|
||||
return output * x_mask
|
||||
|
||||
def remove_weight_norm(self):
|
||||
if self.c_in_channels != 0:
|
||||
torch.nn.utils.remove_weight_norm(self.cond_layer)
|
||||
for l in self.in_layers:
|
||||
torch.nn.utils.remove_weight_norm(l)
|
||||
for l in self.res_skip_layers:
|
||||
torch.nn.utils.remove_weight_norm(l)
|
||||
|
||||
class InvConvNear(nn.Module):
|
||||
"""Invertible Convolution with input splitting as in GlowTTS paper.
|
||||
https://arxiv.org/pdf/1811.00002.pdf
|
||||
|
||||
Args:
|
||||
channels (int): input and output channels.
|
||||
num_splits (int): number of splits, also H and W of conv layer.
|
||||
no_jacobian (bool): enable/disable jacobian computations.
|
||||
|
||||
Note:
|
||||
Split the input into groups of size self.num_splits and
|
||||
perform 1x1 convolution separately. Cast 1x1 conv operation
|
||||
to 2d by reshaping the input for efficiency.
|
||||
"""
|
||||
def __init__(self, channels, num_splits=4, no_jacobian=False, **kwargs): # pylint: disable=unused-argument
|
||||
super().__init__()
|
||||
assert num_splits % 2 == 0
|
||||
|
@ -164,9 +87,10 @@ class InvConvNear(nn.Module):
|
|||
self.weight = nn.Parameter(w_init)
|
||||
|
||||
def forward(self, x, x_mask=None, reverse=False, **kwargs): # pylint: disable=unused-argument
|
||||
"""Split the input into groups of size self.num_splits and
|
||||
perform 1x1 convolution separately. Cast 1x1 conv operation
|
||||
to 2d by reshaping the input for efficiency.
|
||||
"""
|
||||
Shapes:
|
||||
x: B x C x T
|
||||
x_mask: B x 1 x T
|
||||
"""
|
||||
|
||||
b, c, t = x.size()
|
||||
|
@ -209,6 +133,25 @@ class InvConvNear(nn.Module):
|
|||
|
||||
|
||||
class CouplingBlock(nn.Module):
|
||||
"""Glow Affine Coupling block as in GlowTTS paper.
|
||||
https://arxiv.org/pdf/1811.00002.pdf
|
||||
|
||||
x --> x0 -> conv1d -> wavenet -> conv1d --> t, s -> concat(s*x1 + t, x0) -> o
|
||||
'-> x1 - - - - - - - - - - - - - - - - - - - - - - - - - ^
|
||||
|
||||
Args:
|
||||
in_channels (int): number of input tensor channels.
|
||||
hidden_channels (int): number of hidden channels.
|
||||
kernel_size (int): WaveNet filter kernel size.
|
||||
dilation_rate (int): rate to increase dilation by each layer in a decoder block.
|
||||
num_layers (int): number of WaveNet layers.
|
||||
c_in_channels (int): number of conditioning input channels.
|
||||
dropout_p (int): wavenet dropout rate.
|
||||
sigmoid_scale (bool): enable/disable sigmoid scaling for output scale.
|
||||
|
||||
Note:
|
||||
It does not use conditional inputs differently from WaveGlow.
|
||||
"""
|
||||
def __init__(self,
|
||||
in_channels,
|
||||
hidden_channels,
|
||||
|
@ -227,21 +170,28 @@ class CouplingBlock(nn.Module):
|
|||
self.c_in_channels = c_in_channels
|
||||
self.dropout_p = dropout_p
|
||||
self.sigmoid_scale = sigmoid_scale
|
||||
|
||||
# input layer
|
||||
start = torch.nn.Conv1d(in_channels // 2, hidden_channels, 1)
|
||||
start = torch.nn.utils.weight_norm(start)
|
||||
self.start = start
|
||||
# output layer
|
||||
# Initializing last layer to 0 makes the affine coupling layers
|
||||
# do nothing at first. This helps with training stability
|
||||
end = torch.nn.Conv1d(hidden_channels, in_channels, 1)
|
||||
end.weight.data.zero_()
|
||||
end.bias.data.zero_()
|
||||
self.end = end
|
||||
|
||||
# coupling layers
|
||||
self.wn = WN(in_channels, hidden_channels, kernel_size, dilation_rate,
|
||||
num_layers, c_in_channels, dropout_p)
|
||||
|
||||
def forward(self, x, x_mask=None, reverse=False, g=None, **kwargs): # pylint: disable=unused-argument
|
||||
"""
|
||||
Shapes:
|
||||
x: B x C x T
|
||||
x_mask: B x 1 x T
|
||||
g: B x C x 1
|
||||
"""
|
||||
if x_mask is None:
|
||||
x_mask = 1
|
||||
x_0, x_1 = x[:, :self.in_channels // 2], x[:, self.in_channels // 2:]
|
||||
|
@ -251,17 +201,17 @@ class CouplingBlock(nn.Module):
|
|||
out = self.end(x)
|
||||
|
||||
z_0 = x_0
|
||||
m = out[:, :self.in_channels // 2, :]
|
||||
logs = out[:, self.in_channels // 2:, :]
|
||||
t = out[:, :self.in_channels // 2, :]
|
||||
s = out[:, self.in_channels // 2:, :]
|
||||
if self.sigmoid_scale:
|
||||
logs = torch.log(1e-6 + torch.sigmoid(logs + 2))
|
||||
s = torch.log(1e-6 + torch.sigmoid(s + 2))
|
||||
|
||||
if reverse:
|
||||
z_1 = (x_1 - m) * torch.exp(-logs) * x_mask
|
||||
z_1 = (x_1 - t) * torch.exp(-s) * x_mask
|
||||
logdet = None
|
||||
else:
|
||||
z_1 = (m + torch.exp(logs) * x_1) * x_mask
|
||||
logdet = torch.sum(logs * x_mask, [1, 2])
|
||||
z_1 = (t + torch.exp(s) * x_1) * x_mask
|
||||
logdet = torch.sum(s * x_mask, [1, 2])
|
||||
|
||||
z = torch.cat([z_0, z_1], 1)
|
||||
return z, logdet
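A standalone numeric sketch (toy tensors, not the CouplingBlock class itself) of the affine transform applied to the second half of the channels, showing that the reverse path recovers the input exactly and how the per-sample log-determinant is accumulated. The mask is omitted here for brevity.

import torch

x1 = torch.randn(2, 4, 10)                 # second half of the channels
t = torch.randn(2, 4, 10)                  # translation predicted by the WaveNet stack
s = torch.randn(2, 4, 10)                  # log-scale predicted by the WaveNet stack

z1 = t + torch.exp(s) * x1                 # forward direction
x1_rec = (z1 - t) * torch.exp(-s)          # reverse direction
assert torch.allclose(x1, x1_rec, atol=1e-4)

logdet = torch.sum(s, dim=[1, 2])          # log|det J| of the affine map, per sample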
|
||||
|
|
|
@ -7,8 +7,46 @@ from TTS.tts.layers.glow_tts.glow import LayerNorm
|
|||
|
||||
|
||||
class RelativePositionMultiHeadAttention(nn.Module):
|
||||
"""Implementation of Relative Position Encoding based on
|
||||
"""Multi-head attention with Relative Positional embedding.
|
||||
https://arxiv.org/pdf/1809.04281.pdf
|
||||
|
||||
It learns positional embeddings for a window of neighbours. For keys and values,
|
||||
it learns a different set of embeddings. Key embeddings are aggregated with the attention
|
||||
scores and value embeddings are aggregated with the output.
|
||||
|
||||
Note:
|
||||
Example with relative attention window size 2
|
||||
input = [a, b, c, d, e]
|
||||
rel_attn_embeddings = [e(t-2), e(t-1), e(t+1), e(t+2)]
|
||||
|
||||
So it learns 4 embedding vectors (in total 8) separately for key and value vectors.
|
||||
|
||||
Considering the input c
|
||||
e(t-2) corresponds to c -> a
|
||||
e(t-1) corresponds to c -> b
|
||||
e(t+1) corresponds to c -> d
|
||||
e(t+2) corresponds to c -> e
|
||||
|
||||
These embeddings are shared among different time steps. So inputs a, b, d and e also use
|
||||
the same embeddings.
|
||||
|
||||
Embeddings are ignored when the relative window is out of limit for the first and the last
|
||||
n items.
|
||||
|
||||
Args:
|
||||
channels (int): input and inner layer channels.
|
||||
out_channels (int): output channels.
|
||||
num_heads (int): number of attention heads.
|
||||
rel_attn_window_size (int, optional): relative attention window size.
|
||||
If 4, for each time step next and previous 4 time steps are attended.
|
||||
If None (the default), relative encoding is disabled and it is a regular transformer.
|
||||
Defaults to None.
|
||||
heads_share (bool, optional): whether the relative position embeddings are shared across attention heads. Defaults to True.
|
||||
dropout_p (float, optional): dropout rate. Defaults to 0..
|
||||
input_length (int, optional): input length for positional encoding. Defaults to None.
|
||||
proximal_bias (bool, optional): enable/disable proximal bias as in the paper. Defaults to False.
|
||||
proximal_init (bool, optional): enable/disable proximal init as in the paper.
|
||||
Init key and query layer weights the same. Defaults to False.
|
||||
"""
|
||||
def __init__(self,
|
||||
channels,
|
||||
|
@ -20,6 +58,7 @@ class RelativePositionMultiHeadAttention(nn.Module):
|
|||
input_length=None,
|
||||
proximal_bias=False,
|
||||
proximal_init=False):
|
||||
|
||||
super().__init__()
|
||||
assert channels % num_heads == 0, " [!] channels should be divisible by num_heads."
|
||||
# class attributes
|
||||
|
@ -81,7 +120,7 @@ class RelativePositionMultiHeadAttention(nn.Module):
|
|||
# compute raw attention scores
|
||||
scores = torch.matmul(query, key.transpose(-2, -1)) / math.sqrt(
|
||||
self.k_channels)
|
||||
# relative positional encoding
|
||||
# relative positional encoding for scores
|
||||
if self.rel_attn_window_size is not None:
|
||||
assert t_s == t_t, "Relative attention is only available for self-attention."
|
||||
# get relative key embeddings
|
||||
|
@ -225,27 +264,35 @@ class RelativePositionMultiHeadAttention(nn.Module):
|
|||
return diff.unsqueeze(0).unsqueeze(0)
|
||||
|
||||
|
||||
class FFN(nn.Module):
|
||||
class FeedForwardNetwork(nn.Module):
|
||||
"""Feed Forward Inner layers for Transformer.
|
||||
|
||||
Args:
|
||||
in_channels (int): input tensor channels.
|
||||
out_channels (int): output tensor channels.
|
||||
hidden_channels (int): inner layers hidden channels.
|
||||
kernel_size (int): conv1d filter kernel size.
|
||||
dropout_p (float, optional): dropout rate. Defaults to 0.
|
||||
"""
|
||||
def __init__(self,
|
||||
in_channels,
|
||||
out_channels,
|
||||
filter_channels,
|
||||
hidden_channels,
|
||||
kernel_size,
|
||||
dropout_p=0.,
|
||||
activation=None):
|
||||
dropout_p=0.):
|
||||
|
||||
super().__init__()
|
||||
self.in_channels = in_channels
|
||||
self.out_channels = out_channels
|
||||
self.filter_channels = filter_channels
|
||||
self.hidden_channels = hidden_channels
|
||||
self.kernel_size = kernel_size
|
||||
self.dropout_p = dropout_p
|
||||
self.activation = activation
|
||||
|
||||
self.conv_1 = nn.Conv1d(in_channels,
|
||||
filter_channels,
|
||||
hidden_channels,
|
||||
kernel_size,
|
||||
padding=kernel_size // 2)
|
||||
self.conv_2 = nn.Conv1d(filter_channels,
|
||||
self.conv_2 = nn.Conv1d(hidden_channels,
|
||||
out_channels,
|
||||
kernel_size,
|
||||
padding=kernel_size // 2)
|
||||
|
@ -253,19 +300,36 @@ class FFN(nn.Module):
|
|||
|
||||
def forward(self, x, x_mask):
|
||||
x = self.conv_1(x * x_mask)
|
||||
if self.activation == "gelu":
|
||||
x = x * torch.sigmoid(1.702 * x)
|
||||
else:
|
||||
x = torch.relu(x)
|
||||
x = torch.relu(x)
|
||||
x = self.dropout(x)
|
||||
x = self.conv_2(x * x_mask)
|
||||
return x * x_mask
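A short usage sketch of the renamed feed-forward block. The hyper-parameter values are illustrative; the import path assumes the class is exported from the same module as RelativePositionTransformer, and the dropout layer is assumed to be defined in the truncated part of __init__ since the forward pass above uses it.

import torch
from TTS.tts.layers.glow_tts.transformer import FeedForwardNetwork

ffn = FeedForwardNetwork(in_channels=192, out_channels=192,
                         hidden_channels=768, kernel_size=3, dropout_p=0.1)
x = torch.randn(2, 192, 50)        # [B, C, T]
x_mask = torch.ones(2, 1, 50)      # [B, 1, T]
y = ffn(x, x_mask)                 # conv1d -> relu -> dropout -> conv1d, masked
assert y.shape == (2, 192, 50)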
|
||||
|
||||
|
||||
class Transformer(nn.Module):
|
||||
class RelativePositionTransformer(nn.Module):
|
||||
"""Transformer with Relative Potional Encoding.
|
||||
https://arxiv.org/abs/1803.02155
|
||||
|
||||
Args:
|
||||
in_channels (int): number of channels of the input tensor.
|
||||
out_channels (int): number of channels of the output tensor.
|
||||
hidden_channels (int): model hidden channels.
|
||||
hidden_channels_ffn (int): hidden channels of FeedForwardNetwork.
|
||||
num_heads (int): number of attention heads.
|
||||
num_layers (int): number of transformer layers.
|
||||
kernel_size (int, optional): kernel size of feed-forward inner layers. Defaults to 1.
|
||||
dropout_p (float, optional): dropout rate for the self-attention and feed-forward inner layers. Defaults to 0.
|
||||
rel_attn_window_size (int, optional): relative attention window size.
|
||||
If 4, for each time step next and previous 4 time steps are attended.
|
||||
If None (the default), relative encoding is disabled and it is a regular transformer.
|
||||
Defaults to None.
|
||||
input_length (int, optional): input length to limit position encoding. Defaults to None.
|
||||
"""
|
||||
def __init__(self,
|
||||
in_channels,
|
||||
out_channels,
|
||||
hidden_channels,
|
||||
filter_channels,
|
||||
hidden_channels_ffn,
|
||||
num_heads,
|
||||
num_layers,
|
||||
kernel_size=1,
|
||||
|
@ -274,7 +338,7 @@ class Transformer(nn.Module):
|
|||
input_length=None):
|
||||
super().__init__()
|
||||
self.hidden_channels = hidden_channels
|
||||
self.filter_channels = filter_channels
|
||||
self.hidden_channels_ffn = hidden_channels_ffn
|
||||
self.num_heads = num_heads
|
||||
self.num_layers = num_layers
|
||||
self.kernel_size = kernel_size
|
||||
|
@ -286,25 +350,38 @@ class Transformer(nn.Module):
|
|||
self.norm_layers_1 = nn.ModuleList()
|
||||
self.ffn_layers = nn.ModuleList()
|
||||
self.norm_layers_2 = nn.ModuleList()
|
||||
for _ in range(self.num_layers):
|
||||
|
||||
for idx in range(self.num_layers):
|
||||
self.attn_layers.append(
|
||||
RelativePositionMultiHeadAttention(
|
||||
hidden_channels,
|
||||
hidden_channels if idx != 0 else in_channels,
|
||||
hidden_channels,
|
||||
num_heads,
|
||||
rel_attn_window_size=rel_attn_window_size,
|
||||
dropout_p=dropout_p,
|
||||
input_length=input_length))
|
||||
self.norm_layers_1.append(LayerNorm(hidden_channels))
|
||||
|
||||
if hidden_channels != out_channels and (idx + 1) == self.num_layers:
|
||||
self.proj = nn.Conv1d(hidden_channels, out_channels, 1)
|
||||
|
||||
self.ffn_layers.append(
|
||||
FFN(hidden_channels,
|
||||
hidden_channels,
|
||||
filter_channels,
|
||||
FeedForwardNetwork(hidden_channels,
|
||||
hidden_channels if (idx + 1) != self.num_layers else out_channels,
|
||||
hidden_channels_ffn,
|
||||
kernel_size,
|
||||
dropout_p=dropout_p))
|
||||
self.norm_layers_2.append(LayerNorm(hidden_channels))
|
||||
|
||||
self.norm_layers_2.append(
|
||||
LayerNorm(hidden_channels if (
|
||||
idx + 1) != self.num_layers else out_channels))
|
||||
|
||||
def forward(self, x, x_mask):
|
||||
"""
|
||||
Shapes:
|
||||
x: [B, C, T]
|
||||
x_mask: [B, 1, T]
|
||||
"""
|
||||
attn_mask = x_mask.unsqueeze(2) * x_mask.unsqueeze(-1)
|
||||
for i in range(self.num_layers):
|
||||
x = x * x_mask
|
||||
|
@ -314,6 +391,10 @@ class Transformer(nn.Module):
|
|||
|
||||
y = self.ffn_layers[i](x, x_mask)
|
||||
y = self.dropout(y)
|
||||
|
||||
if (i + 1) == self.num_layers and hasattr(self, 'proj'):
|
||||
x = self.proj(x)
|
||||
|
||||
x = self.norm_layers_2[i](x + y)
|
||||
x = x * x_mask
|
||||
return x
|
||||
|
|
|
@ -240,6 +240,24 @@ class GuidedAttentionLoss(torch.nn.Module):
|
|||
return out_masks.unsqueeze(-1) & in_masks.unsqueeze(-2)
|
||||
|
||||
|
||||
class Huber(nn.Module):
|
||||
# pylint: disable=R0201
|
||||
def forward(self, x, y, length=None):
|
||||
"""
|
||||
Shapes:
|
||||
x: B x T
|
||||
y: B x T
|
||||
length: B
|
||||
"""
|
||||
mask = sequence_mask(sequence_length=length, max_len=y.size(1)).float()
|
||||
return torch.nn.functional.smooth_l1_loss(
|
||||
x * mask, y * mask, reduction='sum') / mask.sum()
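A standalone sketch (toy values) of the masked reduction used above: padded positions are zeroed out and the summed loss is normalised by the number of valid steps rather than by the padded length.

import torch

x = torch.tensor([[0.2, 0.9, 0.0], [0.5, 0.0, 0.0]])      # predicted log-durations
y = torch.tensor([[0.0, 1.0, 0.0], [0.7, 0.0, 0.0]])      # target log-durations
mask = torch.tensor([[1.0, 1.0, 0.0], [1.0, 0.0, 0.0]])   # 2 and 1 valid steps
loss = torch.nn.functional.smooth_l1_loss(
    x * mask, y * mask, reduction='sum') / mask.sum()     # averaged over the 3 valid steps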
|
||||
|
||||
|
||||
########################
|
||||
# MODEL LOSS LAYERS
|
||||
########################
|
||||
|
||||
class TacotronLoss(torch.nn.Module):
|
||||
"""Collection of Tacotron set-up based on provided config."""
|
||||
def __init__(self, c, stopnet_pos_weight=10, ga_sigma=0.4):
|
||||
|
@ -403,8 +421,27 @@ class GlowTTSLoss(torch.nn.Module):
|
|||
return_dict['log_mle'] = log_mle
|
||||
return_dict['loss_dur'] = loss_dur
|
||||
|
||||
# check if any loss is NaN
|
||||
# check if any loss is NaN
|
||||
for key, loss in return_dict.items():
|
||||
if torch.isnan(loss):
|
||||
raise RuntimeError(f" [!] NaN loss with {key}.")
|
||||
return return_dict
|
||||
|
||||
|
||||
class SpeedySpeechLoss(nn.Module):
|
||||
def __init__(self, c):
|
||||
super().__init__()
|
||||
self.l1 = L1LossMasked(False)
|
||||
self.ssim = SSIMLoss()
|
||||
self.huber = Huber()
|
||||
|
||||
self.ssim_alpha = c.ssim_alpha
|
||||
self.huber_alpha = c.huber_alpha
|
||||
self.l1_alpha = c.l1_alpha
|
||||
|
||||
def forward(self, decoder_output, decoder_target, decoder_output_lens, dur_output, dur_target, input_lens):
|
||||
l1_loss = self.l1(decoder_output, decoder_target, decoder_output_lens)
|
||||
ssim_loss = self.ssim(decoder_output, decoder_target, decoder_output_lens)
|
||||
huber_loss = self.huber(dur_output, dur_target, input_lens)
|
||||
loss = l1_loss + ssim_loss + huber_loss
|
||||
return {'loss': loss, 'loss_l1': l1_loss, 'loss_ssim': ssim_loss, 'loss_dur': huber_loss}
|
||||
|
|
|
@ -0,0 +1,192 @@
|
|||
import torch
|
||||
from torch import nn
|
||||
from TTS.tts.layers.generic.res_conv_bn import Conv1dBNBlock, ResidualConv1dBNBlock, Conv1dBN
|
||||
from TTS.tts.layers.generic.wavenet import WNBlocks
|
||||
from TTS.tts.layers.glow_tts.transformer import RelativePositionTransformer
|
||||
|
||||
|
||||
class WaveNetDecoder(nn.Module):
|
||||
"""WaveNet based decoder with a prenet and a postnet.
|
||||
|
||||
prenet: conv1d_1x1
|
||||
postnet: 3 x [conv1d_1x1 -> relu] -> conv1d_1x1
|
||||
|
||||
TODO: Integrate speaker conditioning vector.
|
||||
|
||||
Note:
|
||||
default wavenet parameters;
|
||||
params = {
|
||||
"num_blocks": 12,
|
||||
"hidden_channels":192,
|
||||
"kernel_size": 5,
|
||||
"dilation_rate": 1,
|
||||
"num_layers": 4,
|
||||
"dropout_p": 0.05
|
||||
}
|
||||
|
||||
Args:
|
||||
in_channels (int): number of input channels.
|
||||
out_channels (int): number of output channels.
|
||||
hidden_channels (int): number of hidden channels for prenet and postnet.
|
||||
params (dict): dictionary for residual convolutional blocks.
|
||||
"""
|
||||
def __init__(self, in_channels, out_channels, hidden_channels, c_in_channels, params):
|
||||
super().__init__()
|
||||
# prenet
|
||||
self.prenet = torch.nn.Conv1d(in_channels, params['hidden_channels'], 1)
|
||||
# wavenet layers
|
||||
self.wn = WNBlocks(params['hidden_channels'], c_in_channels=c_in_channels, **params)
|
||||
# postnet
|
||||
self.postnet = [
|
||||
torch.nn.Conv1d(params['hidden_channels'], hidden_channels, 1),
|
||||
torch.nn.ReLU(),
|
||||
torch.nn.Conv1d(hidden_channels, hidden_channels, 1),
|
||||
torch.nn.ReLU(),
|
||||
torch.nn.Conv1d(hidden_channels, hidden_channels, 1),
|
||||
torch.nn.ReLU(),
|
||||
torch.nn.Conv1d(hidden_channels, out_channels, 1),
|
||||
]
|
||||
self.postnet = nn.Sequential(*self.postnet)
|
||||
|
||||
def forward(self, x, x_mask=None, g=None):
|
||||
x = self.prenet(x) * x_mask
|
||||
x = self.wn(x, x_mask, g)
|
||||
o = self.postnet(x) * x_mask
|
||||
return o
|
||||
|
||||
|
||||
class RelativePositionTransformerDecoder(nn.Module):
|
||||
"""Decoder with Relative Positional Transformer.
|
||||
|
||||
Note:
|
||||
Default params
|
||||
params={
|
||||
'hidden_channels_ffn': 128,
|
||||
'num_heads': 2,
|
||||
"kernel_size": 3,
|
||||
"dropout_p": 0.1,
|
||||
"num_layers": 8,
|
||||
"rel_attn_window_size": 4,
|
||||
"input_length": None
|
||||
}
|
||||
|
||||
Args:
|
||||
in_channels (int): number of input channels.
|
||||
out_channels (int): number of output channels.
|
||||
hidden_channels (int): number of hidden channels including Transformer layers.
|
||||
params (dict): dictionary for residual convolutional blocks.
|
||||
"""
|
||||
def __init__(self, in_channels, out_channels, hidden_channels, params):
|
||||
|
||||
super().__init__()
|
||||
self.prenet = Conv1dBN(in_channels, hidden_channels, 1, 1)
|
||||
self.rel_pos_transformer = RelativePositionTransformer(
|
||||
in_channels, out_channels, hidden_channels, **params)
|
||||
|
||||
def forward(self, x, x_mask=None, g=None): # pylint: disable=unused-argument
|
||||
o = self.prenet(x) * x_mask
|
||||
o = self.rel_pos_transformer(o, x_mask)
|
||||
return o
|
||||
|
||||
|
||||
class ResidualConv1dBNDecoder(nn.Module):
|
||||
"""Residual Convolutional Decoder as in the original Speedy Speech paper
|
||||
|
||||
TODO: Integrate speaker conditioning vector.
|
||||
|
||||
Note:
|
||||
Default params
|
||||
params = {
|
||||
"kernel_size": 4,
|
||||
"dilations": 4 * [1, 2, 4, 8] + [1],
|
||||
"num_conv_blocks": 2,
|
||||
"num_res_blocks": 17
|
||||
}
|
||||
|
||||
Args:
|
||||
in_channels (int): number of input channels.
|
||||
out_channels (int): number of output channels.
|
||||
hidden_channels (int): number of hidden channels including ResidualConv1dBNBlock layers.
|
||||
params (dict): dictionary for residual convolutional blocks.
|
||||
"""
|
||||
def __init__(self, in_channels, out_channels, hidden_channels, params):
|
||||
super().__init__()
|
||||
self.res_conv_block = ResidualConv1dBNBlock(in_channels,
|
||||
hidden_channels,
|
||||
hidden_channels, **params)
|
||||
self.post_conv = nn.Conv1d(hidden_channels, hidden_channels, 1)
|
||||
self.postnet = nn.Sequential(
|
||||
Conv1dBNBlock(hidden_channels,
|
||||
hidden_channels,
|
||||
hidden_channels,
|
||||
params['kernel_size'],
|
||||
1,
|
||||
num_conv_blocks=2),
|
||||
nn.Conv1d(hidden_channels, out_channels, 1),
|
||||
)
|
||||
|
||||
def forward(self, x, x_mask=None, g=None): # pylint: disable=unused-argument
|
||||
o = self.res_conv_block(x, x_mask)
|
||||
o = self.post_conv(o) + x
|
||||
return self.postnet(o) * x_mask
|
||||
|
||||
|
||||
class Decoder(nn.Module):
|
||||
"""Decodes the expanded phoneme encoding into spectrograms
|
||||
Args:
|
||||
out_channels (int): number of output channels.
|
||||
in_hidden_channels (int): input and hidden channels. Model keeps the input channels for the intermediate layers.
|
||||
decoder_type (str): decoder layer type. One of 'transformer', 'residual_conv_bn' or 'wavenet'. Default 'residual_conv_bn'.
|
||||
decoder_params (dict): model parameters for specified decoder type.
|
||||
c_in_channels (int): number of channels for conditional input.
|
||||
|
||||
Shapes:
|
||||
- input: (B, C, T)
|
||||
"""
|
||||
|
||||
# pylint: disable=dangerous-default-value
|
||||
def __init__(
|
||||
self,
|
||||
out_channels,
|
||||
in_hidden_channels,
|
||||
decoder_type='residual_conv_bn',
|
||||
decoder_params={
|
||||
"kernel_size": 4,
|
||||
"dilations": 4 * [1, 2, 4, 8] + [1],
|
||||
"num_conv_blocks": 2,
|
||||
"num_res_blocks": 17
|
||||
},
|
||||
c_in_channels=0):
|
||||
super().__init__()
|
||||
|
||||
if decoder_type == 'transformer':
|
||||
self.decoder = RelativePositionTransformerDecoder(
|
||||
in_channels=in_hidden_channels,
|
||||
out_channels=out_channels,
|
||||
hidden_channels=in_hidden_channels,
|
||||
params=decoder_params)
|
||||
elif decoder_type == 'residual_conv_bn':
|
||||
self.decoder = ResidualConv1dBNDecoder(
|
||||
in_channels=in_hidden_channels,
|
||||
out_channels=out_channels,
|
||||
hidden_channels=in_hidden_channels,
|
||||
params=decoder_params)
|
||||
elif decoder_type == 'wavenet':
|
||||
self.decoder = WaveNetDecoder(in_channels=in_hidden_channels,
|
||||
out_channels=out_channels,
|
||||
hidden_channels=in_hidden_channels,
|
||||
c_in_channels=c_in_channels,
|
||||
params=decoder_params)
|
||||
else:
|
||||
raise ValueError(f'[!] Unknown decoder type - {decoder_type}')
|
||||
|
||||
def forward(self, x, x_mask, g=None): # pylint: disable=unused-argument
|
||||
"""
|
||||
Args:
|
||||
x: [B, C, T]
|
||||
x_mask: [B, 1, T]
|
||||
g: [B, C_g, 1]
|
||||
"""
|
||||
# TODO: implement multi-speaker
|
||||
o = self.decoder(x, x_mask, g)
|
||||
return o
|
|
@ -0,0 +1,39 @@
|
|||
from torch import nn
|
||||
|
||||
from TTS.tts.layers.generic.res_conv_bn import Conv1dBN
|
||||
|
||||
|
||||
class DurationPredictor(nn.Module):
|
||||
"""Speedy Speech duration predictor model.
|
||||
Predicts phoneme durations from encoder outputs.
|
||||
|
||||
Note:
|
||||
Outputs are interpreted as log(durations). To get actual durations, apply the exponential transformation.
|
||||
|
||||
conv_BN_4x1 -> conv_BN_3x1 -> conv_BN_1x1 -> conv_1x1
|
||||
|
||||
Args:
|
||||
hidden_channels (int): number of channels in the inner layers.
|
||||
"""
|
||||
def __init__(self, hidden_channels):
|
||||
|
||||
super().__init__()
|
||||
|
||||
self.layers = nn.ModuleList([
|
||||
Conv1dBN(hidden_channels, hidden_channels, 4, 1),
|
||||
Conv1dBN(hidden_channels, hidden_channels, 3, 1),
|
||||
Conv1dBN(hidden_channels, hidden_channels, 1, 1),
|
||||
nn.Conv1d(hidden_channels, 1, 1)
|
||||
])
|
||||
|
||||
def forward(self, x, x_mask):
|
||||
"""
|
||||
Shapes:
|
||||
x: [B, C, T]
|
||||
x_mask: [B, 1, T]
|
||||
"""
|
||||
o = x
|
||||
for layer in self.layers:
|
||||
o = layer(o) * x_mask
|
||||
return o
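Since the predictor outputs log-durations, downstream code has to exp-transform and round them before expansion. A minimal sketch with hypothetical values, mirroring format_durations() in the SpeedySpeech model later in this diff:

import torch

o_dr_log = torch.tensor([[[0.1, 1.2, 0.7, 2.0]]])   # predictor output, [B, 1, T]
x_mask = torch.ones(1, 1, 4)
length_scale = 1.0                                   # >1 slows speech down, <1 speeds it up
o_dr = (torch.exp(o_dr_log) - 1) * x_mask * length_scale
o_dr[o_dr < 1] = 1.0                                 # every phoneme keeps at least one frame
o_dr = torch.round(o_dr)                             # integer frame counts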
|
|
@ -0,0 +1,209 @@
|
|||
import math
|
||||
import torch
|
||||
from torch import nn
|
||||
|
||||
from TTS.tts.layers.glow_tts.transformer import RelativePositionTransformer
|
||||
from TTS.tts.layers.generic.res_conv_bn import ResidualConv1dBNBlock
|
||||
|
||||
|
||||
|
||||
class PositionalEncoding(nn.Module):
|
||||
"""Sinusoidal positional encoding for non-recurrent neural networks.
|
||||
Implementation based on "Attention Is All You Need"
|
||||
Args:
|
||||
channels (int): embedding size
|
||||
dropout (float): dropout parameter
|
||||
"""
|
||||
def __init__(self, channels, dropout=0.0, max_len=5000):
|
||||
super().__init__()
|
||||
if channels % 2 != 0:
|
||||
raise ValueError(
|
||||
"Cannot use sin/cos positional encoding with "
|
||||
"odd channels (got channels={:d})".format(channels))
|
||||
pe = torch.zeros(max_len, channels)
|
||||
position = torch.arange(0, max_len).unsqueeze(1)
|
||||
div_term = torch.exp((torch.arange(0, channels, 2, dtype=torch.float) *
|
||||
-(math.log(10000.0) / channels)))
|
||||
pe[:, 0::2] = torch.sin(position.float() * div_term)
|
||||
pe[:, 1::2] = torch.cos(position.float() * div_term)
|
||||
pe = pe.unsqueeze(0).transpose(1, 2)
|
||||
self.register_buffer('pe', pe)
|
||||
if dropout > 0:
|
||||
self.dropout = nn.Dropout(p=dropout)
|
||||
self.channels = channels
|
||||
|
||||
def forward(self, x, mask=None, first_idx=None, last_idx=None):
|
||||
"""
|
||||
Shapes:
|
||||
x: [B, C, T]
|
||||
mask: [B, 1, T]
|
||||
first_idx: int
|
||||
last_idx: int
|
||||
"""
|
||||
|
||||
x = x * math.sqrt(self.channels)
|
||||
if first_idx is None:
|
||||
if self.pe.size(2) < x.size(2):
|
||||
raise RuntimeError(
|
||||
f"Sequence is {x.size(2)} but PositionalEncoding is"
|
||||
f" limited to {self.pe.size(2)}. See max_len argument.")
|
||||
if mask is not None:
|
||||
pos_enc = (self.pe[:, :, :x.size(2)] * mask)
|
||||
else:
|
||||
pos_enc = self.pe[:, :, :x.size(2)]
|
||||
x = x + pos_enc
|
||||
else:
|
||||
x = x + self.pe[:, :, first_idx:last_idx]
|
||||
if hasattr(self, 'dropout'):
|
||||
x = self.dropout(x)
|
||||
return x
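A brief usage sketch (the import path is taken from the SpeedySpeech model file later in this diff; the shapes are illustrative) adding the sinusoidal codes to channel-first features:

import torch
from TTS.tts.layers.speedy_speech.encoder import PositionalEncoding

pos_enc = PositionalEncoding(channels=128, dropout=0.1)
x = torch.randn(2, 128, 60)        # [B, C, T]
mask = torch.ones(2, 1, 60)        # [B, 1, T]
y = pos_enc(x, mask)               # scaled input plus masked positional codes
assert y.shape == x.shape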
|
||||
|
||||
|
||||
class RelativePositionTransformerEncoder(nn.Module):
|
||||
"""Speedy speech encoder built on Transformer with Relative Position encoding.
|
||||
|
||||
TODO: Integrate speaker conditioning vector.
|
||||
|
||||
Args:
|
||||
in_channels (int): number of input channels.
|
||||
out_channels (int): number of output channels.
|
||||
hidden_channels (int): number of hidden channels
|
||||
params (dict): dictionary for residual convolutional blocks.
|
||||
"""
|
||||
def __init__(self, in_channels, out_channels, hidden_channels, params):
|
||||
super().__init__()
|
||||
self.prenet = ResidualConv1dBNBlock(in_channels,
|
||||
hidden_channels,
|
||||
hidden_channels,
|
||||
kernel_size=5,
|
||||
num_res_blocks=3,
|
||||
num_conv_blocks=1,
|
||||
dilations=[1, 1, 1]
|
||||
)
|
||||
self.rel_pos_transformer = RelativePositionTransformer(
|
||||
hidden_channels, out_channels, hidden_channels, **params)
|
||||
|
||||
def forward(self, x, x_mask=None, g=None): # pylint: disable=unused-argument
|
||||
if x_mask is None:
|
||||
x_mask = 1
|
||||
o = self.prenet(x) * x_mask
|
||||
o = self.rel_pos_transformer(o, x_mask)
|
||||
return o
|
||||
|
||||
|
||||
class ResidualConv1dBNEncoder(nn.Module):
|
||||
"""Residual Convolutional Encoder as in the original Speedy Speech paper
|
||||
|
||||
TODO: Integrate speaker conditioning vector.
|
||||
|
||||
Args:
|
||||
in_channels (int): number of input channels.
|
||||
out_channels (int): number of output channels.
|
||||
hidden_channels (int): number of hidden channels
|
||||
params (dict): dictionary for residual convolutional blocks.
|
||||
"""
|
||||
def __init__(self, in_channels, out_channels, hidden_channels, params):
|
||||
super().__init__()
|
||||
self.prenet = nn.Sequential(
|
||||
nn.Conv1d(in_channels, hidden_channels, 1),
|
||||
nn.ReLU())
|
||||
self.res_conv_block = ResidualConv1dBNBlock(hidden_channels,
|
||||
hidden_channels,
|
||||
hidden_channels, **params)
|
||||
|
||||
self.postnet = nn.Sequential(*[
|
||||
nn.Conv1d(hidden_channels, hidden_channels, 1),
|
||||
nn.ReLU(),
|
||||
nn.BatchNorm1d(hidden_channels),
|
||||
nn.Conv1d(hidden_channels, out_channels, 1)
|
||||
])
|
||||
|
||||
def forward(self, x, x_mask=None, g=None): # pylint: disable=unused-argument
|
||||
if x_mask is None:
|
||||
x_mask = 1
|
||||
o = self.prenet(x) * x_mask
|
||||
o = self.res_conv_block(o, x_mask)
|
||||
o = self.postnet(o + x) * x_mask
|
||||
return o * x_mask
|
||||
|
||||
|
||||
class Encoder(nn.Module):
|
||||
# pylint: disable=dangerous-default-value
|
||||
"""Factory class for Speedy Speech encoder enables different encoder types internally.
|
||||
|
||||
Args:
|
||||
num_chars (int): number of characters.
|
||||
out_channels (int): number of output channels.
|
||||
in_hidden_channels (int): input and hidden channels. Model keeps the input channels for the intermediate layers.
|
||||
encoder_type (str): encoder layer type. One of 'transformer' or 'residual_conv_bn'. Default 'residual_conv_bn'.
|
||||
encoder_params (dict): model parameters for specified encoder type.
|
||||
c_in_channels (int): number of channels for conditional input.
|
||||
|
||||
Note:
|
||||
Default encoder_params...
|
||||
|
||||
for 'transformer'
|
||||
encoder_params={
|
||||
'hidden_channels_ffn': 128,
|
||||
'num_heads': 2,
|
||||
"kernel_size": 3,
|
||||
"dropout_p": 0.1,
|
||||
"num_layers": 6,
|
||||
"rel_attn_window_size": 4,
|
||||
"input_length": None
|
||||
},
|
||||
|
||||
for 'residual_conv_bn'
|
||||
encoder_params = {
|
||||
"kernel_size": 4,
|
||||
"dilations": 4 * [1, 2, 4] + [1],
|
||||
"num_conv_blocks": 2,
|
||||
"num_res_blocks": 13
|
||||
}
|
||||
"""
|
||||
def __init__(
|
||||
self,
|
||||
in_hidden_channels,
|
||||
out_channels,
|
||||
encoder_type='residual_conv_bn',
|
||||
encoder_params={
|
||||
"kernel_size": 4,
|
||||
"dilations": 4 * [1, 2, 4] + [1],
|
||||
"num_conv_blocks": 2,
|
||||
"num_res_blocks": 13
|
||||
},
|
||||
c_in_channels=0):
|
||||
super().__init__()
|
||||
self.out_channels = out_channels
|
||||
self.in_channels = in_hidden_channels
|
||||
self.hidden_channels = in_hidden_channels
|
||||
self.encoder_type = encoder_type
|
||||
self.c_in_channels = c_in_channels
|
||||
|
||||
# init encoder
|
||||
if encoder_type.lower() == "transformer":
|
||||
# text encoder
|
||||
self.encoder = RelativePositionTransformerEncoder(in_hidden_channels,
|
||||
out_channels,
|
||||
in_hidden_channels,
|
||||
encoder_params) # pylint: disable=unexpected-keyword-arg
|
||||
elif encoder_type.lower() == 'residual_conv_bn':
|
||||
self.encoder = ResidualConv1dBNEncoder(in_hidden_channels,
|
||||
out_channels,
|
||||
in_hidden_channels,
|
||||
encoder_params)
|
||||
else:
|
||||
raise NotImplementedError(' [!] unknown encoder type.')
|
||||
|
||||
# final projection layers
|
||||
|
||||
|
||||
def forward(self, x, x_mask, g=None): # pylint: disable=unused-argument
|
||||
"""
|
||||
Shapes:
|
||||
x: [B, C, T]
|
||||
x_mask: [B, 1, T]
|
||||
g: [B, C, 1]
|
||||
"""
|
||||
o = self.encoder(x, x_mask)
|
||||
return o * x_mask
|
|
@ -1,7 +1,8 @@
|
|||
# coding: utf-8
|
||||
import torch
|
||||
from torch import nn
|
||||
from .common_layers import Prenet, init_attn
|
||||
from .common_layers import Prenet
|
||||
from .attentions import init_attn
|
||||
|
||||
|
||||
class BatchNormConv1d(nn.Module):
|
||||
|
|
|
@ -1,7 +1,8 @@
|
|||
import torch
|
||||
from torch import nn
|
||||
from torch.nn import functional as F
|
||||
from .common_layers import init_attn, Prenet, Linear
|
||||
from .common_layers import Prenet, Linear
|
||||
from .attentions import init_attn
|
||||
|
||||
# NOTE: linter has a problem with the current TF release
|
||||
#pylint: disable=no-value-for-parameter
|
||||
|
|
|
@ -10,46 +10,59 @@ from TTS.tts.layers.glow_tts.monotonic_align import maximum_path, generate_path
|
|||
|
||||
|
||||
class GlowTts(nn.Module):
|
||||
"""Glow TTS models from https://arxiv.org/abs/2005.11129"""
|
||||
"""Glow TTS models from https://arxiv.org/abs/2005.11129
|
||||
|
||||
Args:
|
||||
num_chars (int): number of embedding characters.
|
||||
hidden_channels_enc (int): number of embedding and encoder channels.
|
||||
hidden_channels_dec (int): number of decoder channels.
|
||||
use_encoder_prenet (bool): enable/disable prenet for encoder. Prenet modules are hard-coded for each alternative encoder.
|
||||
hidden_channels_dp (int): number of duration predictor channels.
|
||||
out_channels (int): number of output channels. It should be equal to the number of spectrogram filters.
|
||||
num_flow_blocks_dec (int): number of decoder blocks.
|
||||
kernel_size_dec (int): decoder kernel size.
|
||||
dilation_rate (int): rate to increase dilation by each layer in a decoder block.
|
||||
num_block_layers (int): number of decoder layers in each decoder block.
|
||||
dropout_p_dec (float): dropout rate for decoder.
|
||||
num_speakers (int): number of speakers, used to set the size of the speaker embedding layer.
|
||||
c_in_channels (int): number of speaker embedding channels. It is set to 512 if embeddings are learned.
|
||||
num_splits (int): number of split levels in the invertible conv1x1 operation.
|
||||
num_squeeze (int): number of squeeze levels. When squeezing, the channel count increases and the number of time steps decreases by the factor 'num_squeeze'.
|
||||
sigmoid_scale (bool): enable/disable sigmoid scaling in decoder.
|
||||
mean_only (bool): if True, encoder only computes mean value and uses constant variance for each time step.
|
||||
encoder_type (str): encoder module type.
|
||||
encoder_params (dict): encoder module parameters.
|
||||
external_speaker_embedding_dim (int): channels of external speaker embedding vectors.
|
||||
"""
|
||||
def __init__(self,
|
||||
num_chars,
|
||||
hidden_channels,
|
||||
filter_channels,
|
||||
filter_channels_dp,
|
||||
hidden_channels_enc,
|
||||
hidden_channels_dec,
|
||||
use_encoder_prenet,
|
||||
hidden_channels_dp,
|
||||
out_channels,
|
||||
kernel_size=3,
|
||||
num_heads=2,
|
||||
num_layers_enc=6,
|
||||
dropout_p=0.1,
|
||||
num_flow_blocks_dec=12,
|
||||
kernel_size_dec=5,
|
||||
dilation_rate=5,
|
||||
num_block_layers=4,
|
||||
dropout_p_dec=0.,
|
||||
dropout_p_dp=0.1,
|
||||
dropout_p_dec=0.05,
|
||||
num_speakers=0,
|
||||
c_in_channels=0,
|
||||
num_splits=4,
|
||||
num_sqz=1,
|
||||
num_squeeze=1,
|
||||
sigmoid_scale=False,
|
||||
rel_attn_window_size=None,
|
||||
input_length=None,
|
||||
mean_only=False,
|
||||
hidden_channels_enc=None,
|
||||
hidden_channels_dec=None,
|
||||
use_encoder_prenet=False,
|
||||
encoder_type="transformer",
|
||||
encoder_params=None,
|
||||
external_speaker_embedding_dim=None):
|
||||
|
||||
super().__init__()
|
||||
self.num_chars = num_chars
|
||||
self.hidden_channels = hidden_channels
|
||||
self.filter_channels = filter_channels
|
||||
self.filter_channels_dp = filter_channels_dp
|
||||
self.hidden_channels_dp = hidden_channels_dp
|
||||
self.hidden_channels_enc = hidden_channels_enc
|
||||
self.hidden_channels_dec = hidden_channels_dec
|
||||
self.out_channels = out_channels
|
||||
self.kernel_size = kernel_size
|
||||
self.num_heads = num_heads
|
||||
self.num_layers_enc = num_layers_enc
|
||||
self.dropout_p = dropout_p
|
||||
self.num_flow_blocks_dec = num_flow_blocks_dec
|
||||
self.kernel_size_dec = kernel_size_dec
|
||||
self.dilation_rate = dilation_rate
|
||||
|
@ -58,16 +71,14 @@ class GlowTts(nn.Module):
|
|||
self.num_speakers = num_speakers
|
||||
self.c_in_channels = c_in_channels
|
||||
self.num_splits = num_splits
|
||||
self.num_sqz = num_sqz
|
||||
self.num_squeeze = num_squeeze
|
||||
self.sigmoid_scale = sigmoid_scale
|
||||
self.rel_attn_window_size = rel_attn_window_size
|
||||
self.input_length = input_length
|
||||
self.mean_only = mean_only
|
||||
self.hidden_channels_enc = hidden_channels_enc
|
||||
self.hidden_channels_dec = hidden_channels_dec
|
||||
self.use_encoder_prenet = use_encoder_prenet
|
||||
self.noise_scale = 0.66
|
||||
self.length_scale = 1.
|
||||
|
||||
# model constants.
|
||||
self.noise_scale = 0.33 # defines the noise variance applied to the random z vector at inference.
|
||||
self.length_scale = 1. # scaler for the duration predictor. The larger it is, the slower the speech.
|
||||
self.external_speaker_embedding_dim = external_speaker_embedding_dim
|
||||
|
||||
# if is a multispeaker and c_in_channels is 0, set to 256
|
||||
|
@ -79,31 +90,29 @@ class GlowTts(nn.Module):
|
|||
|
||||
self.encoder = Encoder(num_chars,
|
||||
out_channels=out_channels,
|
||||
hidden_channels=hidden_channels,
|
||||
filter_channels=filter_channels,
|
||||
filter_channels_dp=filter_channels_dp,
|
||||
hidden_channels=hidden_channels_enc,
|
||||
hidden_channels_dp=hidden_channels_dp,
|
||||
encoder_type=encoder_type,
|
||||
num_heads=num_heads,
|
||||
num_layers=num_layers_enc,
|
||||
kernel_size=kernel_size,
|
||||
dropout_p=dropout_p,
|
||||
encoder_params=encoder_params,
|
||||
mean_only=mean_only,
|
||||
use_prenet=use_encoder_prenet,
|
||||
dropout_p_dp=dropout_p_dp,
|
||||
c_in_channels=self.c_in_channels)
|
||||
|
||||
self.decoder = Decoder(out_channels,
|
||||
hidden_channels_dec or hidden_channels,
|
||||
hidden_channels_dec,
|
||||
kernel_size_dec,
|
||||
dilation_rate,
|
||||
num_flow_blocks_dec,
|
||||
num_block_layers,
|
||||
dropout_p=dropout_p_dec,
|
||||
num_splits=num_splits,
|
||||
num_sqz=num_sqz,
|
||||
num_squeeze=num_squeeze,
|
||||
sigmoid_scale=sigmoid_scale,
|
||||
c_in_channels=self.c_in_channels)
|
||||
|
||||
if num_speakers > 1 and not external_speaker_embedding_dim:
|
||||
# speaker embedding layer
|
||||
self.emb_g = nn.Embedding(num_speakers, self.c_in_channels)
|
||||
nn.init.uniform_(self.emb_g.weight, -0.1, 0.1)
|
||||
|
||||
|
@ -122,11 +131,12 @@ class GlowTts(nn.Module):
|
|||
|
||||
def forward(self, x, x_lengths, y=None, y_lengths=None, attn=None, g=None):
|
||||
"""
|
||||
Shapes:
|
||||
x: B x T
|
||||
x_lenghts: B
|
||||
y: B x C x T
|
||||
y_lengths: B
|
||||
Shapes:
|
||||
x: [B, T]
|
||||
x_lengths: B
|
||||
y: [B, C, T]
|
||||
y_lengths: B
|
||||
g: [B, C] or B
|
||||
"""
|
||||
y_max_length = y.size(2)
|
||||
# norm speaker embeddings
|
||||
|
@ -134,13 +144,13 @@ class GlowTts(nn.Module):
|
|||
if self.external_speaker_embedding_dim:
|
||||
g = F.normalize(g).unsqueeze(-1)
|
||||
else:
|
||||
g = F.normalize(self.emb_g(g)).unsqueeze(-1)# [b, h]
|
||||
g = F.normalize(self.emb_g(g)).unsqueeze(-1)# [b, h, 1]
|
||||
|
||||
# embedding pass
|
||||
o_mean, o_log_scale, o_dur_log, x_mask = self.encoder(x,
|
||||
x_lengths,
|
||||
g=g)
|
||||
# format feature vectors and feature vector lenghts
|
||||
# drop residual frames w.r.t. num_squeeze and set y_lengths.
|
||||
y, y_lengths, y_max_length, attn = self.preprocess(
|
||||
y, y_lengths, y_max_length, None)
|
||||
# create masks
|
||||
|
@ -170,7 +180,6 @@ class GlowTts(nn.Module):
|
|||
|
||||
@torch.no_grad()
|
||||
def inference(self, x, x_lengths, g=None):
|
||||
|
||||
if g is not None:
|
||||
if self.external_speaker_embedding_dim:
|
||||
g = F.normalize(g).unsqueeze(-1)
|
||||
|
@ -195,6 +204,7 @@ class GlowTts(nn.Module):
|
|||
attn_mask.squeeze(1)).unsqueeze(1)
|
||||
y_mean, y_log_scale, o_attn_dur = self.compute_outputs(
|
||||
attn, o_mean, o_log_scale, x_mask)
|
||||
|
||||
z = (y_mean + torch.exp(y_log_scale) * torch.randn_like(y_mean) *
|
||||
self.noise_scale) * y_mask
|
||||
# decoder pass
|
||||
|
@ -204,11 +214,11 @@ class GlowTts(nn.Module):
|
|||
|
||||
def preprocess(self, y, y_lengths, y_max_length, attn=None):
|
||||
if y_max_length is not None:
|
||||
y_max_length = (y_max_length // self.num_sqz) * self.num_sqz
|
||||
y_max_length = (y_max_length // self.num_squeeze) * self.num_squeeze
|
||||
y = y[:, :, :y_max_length]
|
||||
if attn is not None:
|
||||
attn = attn[:, :, :, :y_max_length]
|
||||
y_lengths = (y_lengths // self.num_sqz) * self.num_sqz
|
||||
y_lengths = (y_lengths // self.num_squeeze) * self.num_squeeze
|
||||
return y, y_lengths, y_max_length, attn
|
||||
|
||||
def store_inverse(self):
|
||||
|
|
|
@ -0,0 +1,192 @@
|
|||
import torch
|
||||
from torch import nn
|
||||
from TTS.tts.layers.speedy_speech.decoder import Decoder
|
||||
from TTS.tts.layers.speedy_speech.duration_predictor import DurationPredictor
|
||||
from TTS.tts.layers.speedy_speech.encoder import Encoder, PositionalEncoding
|
||||
from TTS.tts.utils.generic_utils import sequence_mask
|
||||
from TTS.tts.layers.glow_tts.monotonic_align import generate_path
|
||||
|
||||
|
||||
class SpeedySpeech(nn.Module):
|
||||
"""Speedy Speech model
|
||||
https://arxiv.org/abs/2008.03802
|
||||
|
||||
Encoder -> DurationPredictor -> Decoder
|
||||
|
||||
This model is able to achieve a reasonable performance with only
|
||||
~3M model parameters and convolutional layers.
|
||||
|
||||
This model requires precomputed phoneme durations to train a duration predictor. At inference
|
||||
it only uses the duration predictor to compute durations and expand the encoder outputs accordingly.
|
||||
|
||||
Args:
|
||||
num_chars (int): number of unique input characters.
|
||||
out_channels (int): number of output tensor channels. It is equal to the expected spectrogram size.
|
||||
hidden_channels (int): number of channels in all the model layers.
|
||||
positional_encoding (bool, optional): enable/disable Positional encoding on encoder outputs. Defaults to True.
|
||||
length_scale (int, optional): coefficient to scale the predicted durations and thus the speech speed. Values >1 slow the speech down, values <1 speed it up. Defaults to 1.
|
||||
encoder_type (str, optional): set the encoder type. Defaults to 'residual_conv_bn'.
|
||||
encoder_params (dict, optional): set encoder parameters depending on 'encoder_type'. Defaults to { "kernel_size": 4, "dilations": 4 * [1, 2, 4] + [1], "num_conv_blocks": 2, "num_res_blocks": 13 }.
|
||||
decoder_type (str, optional): decoder type. Defaults to 'residual_conv_bn'.
|
||||
decoder_params (dict, optional): set decoder parameters depending on 'decoder_type'. Defaults to { "kernel_size": 4, "dilations": 4 * [1, 2, 4, 8] + [1], "num_conv_blocks": 2, "num_res_blocks": 17 }.
|
||||
num_speakers (int, optional): number of speakers for multi-speaker training. Defaults to 0.
|
||||
external_c (bool, optional): enable external speaker embeddings. Defaults to False.
|
||||
c_in_channels (int, optional): number of channels in speaker embedding vectors. Defaults to 0.
|
||||
"""
|
||||
# pylint: disable=dangerous-default-value
|
||||
|
||||
def __init__(
|
||||
self,
|
||||
num_chars,
|
||||
out_channels,
|
||||
hidden_channels,
|
||||
positional_encoding=True,
|
||||
length_scale=1,
|
||||
encoder_type='residual_conv_bn',
|
||||
encoder_params={
|
||||
"kernel_size": 4,
|
||||
"dilations": 4 * [1, 2, 4] + [1],
|
||||
"num_conv_blocks": 2,
|
||||
"num_res_blocks": 13
|
||||
},
|
||||
decoder_type='residual_conv_bn',
|
||||
decoder_params={
|
||||
"kernel_size": 4,
|
||||
"dilations": 4 * [1, 2, 4, 8] + [1],
|
||||
"num_conv_blocks": 2,
|
||||
"num_res_blocks": 17
|
||||
},
|
||||
num_speakers=0,
|
||||
external_c=False,
|
||||
c_in_channels=0):
|
||||
|
||||
super().__init__()
|
||||
self.length_scale = float(length_scale) if isinstance(length_scale, int) else length_scale
|
||||
self.emb = nn.Embedding(num_chars, hidden_channels)
|
||||
self.encoder = Encoder(hidden_channels, hidden_channels, encoder_type,
|
||||
encoder_params, c_in_channels)
|
||||
if positional_encoding:
|
||||
self.pos_encoder = PositionalEncoding(hidden_channels)
|
||||
self.decoder = Decoder(out_channels, hidden_channels,
|
||||
decoder_type, decoder_params)
|
||||
self.duration_predictor = DurationPredictor(hidden_channels + c_in_channels)
|
||||
|
||||
if num_speakers > 1 and not external_c:
|
||||
# speaker embedding layer
|
||||
self.emb_g = nn.Embedding(num_speakers, c_in_channels)
|
||||
nn.init.uniform_(self.emb_g.weight, -0.1, 0.1)
|
||||
|
||||
if c_in_channels > 0 and c_in_channels != hidden_channels:
|
||||
self.proj_g = nn.Conv1d(c_in_channels, hidden_channels, 1)
|
||||
|
||||
@staticmethod
|
||||
def expand_encoder_outputs(en, dr, x_mask, y_mask):
|
||||
"""Generate attention alignment map from durations and
|
||||
expand encoder outputs
|
||||
|
||||
Example:
|
||||
encoder output: [a,b,c,d]
|
||||
durations: [1, 3, 2, 1]
|
||||
|
||||
expanded: [a, b, b, b, c, c, d]
|
||||
attention map: [[0, 0, 0, 0, 0, 0, 1],
|
||||
[0, 0, 0, 0, 1, 1, 0],
|
||||
[0, 1, 1, 1, 0, 0, 0],
|
||||
[1, 0, 0, 0, 0, 0, 0]]
|
||||
"""
|
||||
attn_mask = torch.unsqueeze(x_mask, -1) * torch.unsqueeze(y_mask, 2)
|
||||
attn = generate_path(dr, attn_mask.squeeze(1)).to(en.dtype)
|
||||
o_en_ex = torch.matmul(
|
||||
attn.squeeze(1).transpose(1, 2), en.transpose(1,
|
||||
2)).transpose(1, 2)
|
||||
return o_en_ex, attn
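A standalone sketch (toy tensors) reproducing the docstring example above; repeating each encoder frame by its duration is exactly what the duration-based attention map achieves in matrix form.

import torch

en = torch.tensor([[1., 2., 3., 4.]])     # encoder outputs a, b, c, d (one channel)
dr = torch.tensor([1, 3, 2, 1])           # durations
expanded = torch.repeat_interleave(en, dr, dim=1)
# tensor([[1., 2., 2., 2., 3., 3., 4.]])  ->  [a, b, b, b, c, c, d]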
|
||||
|
||||
def format_durations(self, o_dr_log, x_mask):
|
||||
o_dr = (torch.exp(o_dr_log) - 1) * x_mask * self.length_scale
|
||||
o_dr[o_dr < 1] = 1.0
|
||||
o_dr = torch.round(o_dr)
|
||||
return o_dr
|
||||
|
||||
@staticmethod
|
||||
def _concat_speaker_embedding(o_en, g):
|
||||
g_exp = g.expand(-1, -1, o_en.size(-1)) # [B, C, T_en]
|
||||
o_en = torch.cat([o_en, g_exp], 1)
|
||||
return o_en
|
||||
|
||||
def _sum_speaker_embedding(self, x, g):
|
||||
# project g to decoder dim.
|
||||
if hasattr(self, 'proj_g'):
|
||||
g = self.proj_g(g)
|
||||
return x + g
|
||||
|
||||
def _forward_encoder(self, x, x_lengths, g=None):
|
||||
if hasattr(self, 'emb_g'):
|
||||
g = nn.functional.normalize(self.emb_g(g)) # [B, C, 1]
|
||||
|
||||
if g is not None:
|
||||
g = g.unsqueeze(-1)
|
||||
|
||||
# [B, T, C]
|
||||
x_emb = self.emb(x)
|
||||
# [B, C, T]
|
||||
x_emb = torch.transpose(x_emb, 1, -1)
|
||||
|
||||
# compute sequence masks
|
||||
x_mask = torch.unsqueeze(sequence_mask(x_lengths, x.shape[1]),
|
||||
1).to(x.dtype)
|
||||
|
||||
# encoder pass
|
||||
o_en = self.encoder(x_emb, x_mask)
|
||||
|
||||
# speaker conditioning for duration predictor
|
||||
if g is not None:
|
||||
o_en_dp = self._concat_speaker_embedding(o_en, g)
|
||||
else:
|
||||
o_en_dp = o_en
|
||||
return o_en, o_en_dp, x_mask, g
|
||||
|
||||
def _forward_decoder(self, o_en, o_en_dp, dr, x_mask, y_lengths, g):
|
||||
y_mask = torch.unsqueeze(sequence_mask(y_lengths, None),
|
||||
1).to(o_en_dp.dtype)
|
||||
# expand o_en with durations
|
||||
o_en_ex, attn = self.expand_encoder_outputs(o_en, dr, x_mask, y_mask)
|
||||
# positional encoding
|
||||
if hasattr(self, 'pos_encoder'):
|
||||
o_en_ex = self.pos_encoder(o_en_ex, y_mask)
|
||||
# speaker embedding
|
||||
if g is not None:
|
||||
o_en_ex = self._sum_speaker_embedding(o_en_ex, g)
|
||||
# decoder pass
|
||||
o_de = self.decoder(o_en_ex, y_mask, g=g)
|
||||
return o_de, attn.transpose(1, 2)
|
||||
|
||||
def forward(self, x, x_lengths, y_lengths, dr, g=None): # pylint: disable=unused-argument
|
||||
"""
|
||||
Shapes:
|
||||
x: [B, T_max]
|
||||
x_lengths: [B]
|
||||
y_lengths: [B]
|
||||
dr: [B, T_max]
|
||||
g: [B, C]
|
||||
"""
|
||||
o_en, o_en_dp, x_mask, g = self._forward_encoder(x, x_lengths, g)
|
||||
o_dr_log = self.duration_predictor(o_en_dp.detach(), x_mask)
|
||||
o_de, attn= self._forward_decoder(o_en, o_en_dp, dr, x_mask, y_lengths, g=g)
|
||||
return o_de, o_dr_log.squeeze(1), attn
|
||||
|
||||
def inference(self, x, x_lengths, g=None): # pylint: disable=unused-argument
|
||||
"""
|
||||
Shapes:
|
||||
x: [B, T_max]
|
||||
x_lengths: [B]
|
||||
g: [B, C]
|
||||
"""
|
||||
# pad input to prevent dropping the last word
|
||||
x = torch.nn.functional.pad(x, pad=(0, 5), mode='constant', value=0)
|
||||
o_en, o_en_dp, x_mask, g = self._forward_encoder(x, x_lengths, g)
|
||||
# duration predictor pass
|
||||
o_dr_log = self.duration_predictor(o_en_dp.detach(), x_mask)
|
||||
o_dr = self.format_durations(o_dr_log, x_mask).squeeze(1)
|
||||
y_lengths = o_dr.sum(1)
|
||||
o_de, attn= self._forward_decoder(o_en, o_en_dp, o_dr, x_mask, y_lengths, g=g)
|
||||
return o_de, attn
|
|
@ -8,6 +8,45 @@ from TTS.tts.models.tacotron_abstract import TacotronAbstract
|
|||
|
||||
|
||||
class Tacotron(TacotronAbstract):
|
||||
"""Tacotron as in https://arxiv.org/abs/1703.10135
|
||||
|
||||
It's an autoregressive encoder-attention-decoder-postnet architecture.
|
||||
|
||||
Args:
|
||||
num_chars (int): number of input characters to define the size of embedding layer.
|
||||
num_speakers (int): number of speakers in the dataset. >1 enables multi-speaker training and model learns speaker embeddings.
|
||||
r (int): initial model reduction rate.
|
||||
postnet_output_dim (int, optional): postnet output channels. Defaults to 80.
|
||||
decoder_output_dim (int, optional): decoder output channels. Defaults to 80.
|
||||
attn_type (str, optional): attention type. Check ```TTS.tts.layers.attentions.init_attn```. Defaults to 'original'.
|
||||
attn_win (bool, optional): enable/disable attention windowing.
|
||||
It is especially useful at inference to keep the attention alignment diagonal. Defaults to False.
|
||||
attn_norm (str, optional): Attention normalization method. "sigmoid" or "softmax". Defaults to "softmax".
|
||||
prenet_type (str, optional): prenet type for the decoder. Defaults to "original".
|
||||
prenet_dropout (bool, optional): prenet dropout rate. Defaults to True.
|
||||
forward_attn (bool, optional): enable/disable forward attention.
|
||||
It is only valid if ```attn_type``` is ```original```. Defaults to False.
|
||||
trans_agent (bool, optional): enable/disable transition agent in forward attention. Defaults to False.
|
||||
forward_attn_mask (bool, optional): enable/disable extra masking over forward attention. Defaults to False.
|
||||
location_attn (bool, optional): enable/disable location sensitive attention.
|
||||
It is only valid if ```attn_type``` is ```original```. Defaults to True.
|
||||
attn_K (int, optional): Number of attention heads for GMM attention. Defaults to 5.
|
||||
separate_stopnet (bool, optional): enable/disable training the stopnet separately so that its gradients do not
|
||||
flow back to the rest of the model. Defaults to True.
|
||||
bidirectional_decoder (bool, optional): enable/disable bidirectional decoding. Defaults to False.
|
||||
double_decoder_consistency (bool, optional): enable/disable double decoder consistency. Defaults to False.
|
||||
ddc_r (int, optional): reduction rate for the coarse decoder of double decoder consistency. Defaults to None.
|
||||
encoder_in_features (int, optional): input channels for the encoder. Defaults to 512.
|
||||
decoder_in_features (int, optional): input channels for the decoder. Defaults to 512.
|
||||
speaker_embedding_dim (int, optional): external speaker conditioning vector channels. Defaults to None.
|
||||
gst (bool, optional): enable/disable global style token learning. Defaults to False.
|
||||
gst_embedding_dim (int, optional): size of channels for GST vectors. Defaults to 512.
|
||||
gst_num_heads (int, optional): number of attention heads for GST. Defaults to 4.
|
||||
gst_style_tokens (int, optional): number of GST tokens. Defaults to 10.
|
||||
gst_use_speaker_embedding (bool, optional): enable/disable feeding the speaker embedding to GST. Defaults to False.
|
||||
memory_size (int, optional): size of the history queue fed to the prenet. Model feeds the last ```memory_size```
|
||||
output frames to the prenet.
|
||||
"""
|
||||
def __init__(self,
|
||||
num_chars,
|
||||
num_speakers,
|
||||
|
@ -95,10 +134,12 @@ class Tacotron(TacotronAbstract):
|
|||
def forward(self, characters, text_lengths, mel_specs, mel_lengths=None, speaker_ids=None, speaker_embeddings=None):
|
||||
"""
|
||||
Shapes:
|
||||
- characters: B x T_in
|
||||
- text_lengths: B
|
||||
- mel_specs: B x T_out x D
|
||||
- speaker_ids: B x 1
|
||||
characters: [B, T_in]
|
||||
text_lengths: [B]
|
||||
mel_specs: [B, T_out, C]
|
||||
mel_lengths: [B]
|
||||
speaker_ids: [B, 1]
|
||||
speaker_embeddings: [B, C]
|
||||
"""
|
||||
input_mask, output_mask = self.compute_masks(text_lengths, mel_lengths)
|
||||
# B x T_in x embed_dim
|
||||
|
|
|
@ -7,6 +7,43 @@ from TTS.tts.models.tacotron_abstract import TacotronAbstract
|
|||
|
||||
# TODO: match function arguments with tacotron
|
||||
class Tacotron2(TacotronAbstract):
|
||||
"""Tacotron2 as in https://arxiv.org/abs/1712.05884
|
||||
|
||||
It's an autoregressive encoder-attention-decoder-postnet architecture.
|
||||
|
||||
Args:
|
||||
num_chars (int): number of input characters to define the size of embedding layer.
|
||||
num_speakers (int): number of speakers in the dataset. >1 enables multi-speaker training and model learns speaker embeddings.
|
||||
r (int): initial model reduction rate.
|
||||
postnet_output_dim (int, optional): postnet output channels. Defaults to 80.
|
||||
decoder_output_dim (int, optional): decoder output channels. Defaults to 80.
|
||||
attn_type (str, optional): attention type. Check ```TTS.tts.layers.common_layers.init_attn```. Defaults to 'original'.
|
||||
attn_win (bool, optional): enable/disable attention windowing.
|
||||
It is especially useful at inference to keep the attention alignment diagonal. Defaults to False.
|
||||
attn_norm (str, optional): Attention normalization method. "sigmoid" or "softmax". Defaults to "softmax".
|
||||
prenet_type (str, optional): prenet type for the decoder. Defaults to "original".
|
||||
prenet_dropout (bool, optional): prenet dropout rate. Defaults to True.
|
||||
forward_attn (bool, optional): enable/disable forward attention.
|
||||
It is only valid if ```attn_type``` is ```original```. Defaults to False.
|
||||
trans_agent (bool, optional): enable/disable transition agent in forward attention. Defaults to False.
|
||||
forward_attn_mask (bool, optional): enable/disable extra masking over forward attention. Defaults to False.
|
||||
location_attn (bool, optional): enable/disable location sensitive attention.
|
||||
It is only valid if ```attn_type``` is ```original```. Defaults to True.
|
||||
attn_K (int, optional): Number of attention heads for GMM attention. Defaults to 5.
|
||||
separate_stopnet (bool, optional): enable/disable training the stopnet separately so that its gradients do not
|
||||
flow back to the rest of the model. Defaults to True.
|
||||
bidirectional_decoder (bool, optional): enable/disable bidirectional decoding. Defaults to False.
|
||||
double_decoder_consistency (bool, optional): enable/disable double decoder consistency. Defaults to False.
|
||||
ddc_r (int, optional): reduction rate for the coarse decoder of double decoder consistency. Defaults to None.
|
||||
encoder_in_features (int, optional): input channels for the encoder. Defaults to 512.
|
||||
decoder_in_features (int, optional): input channels for the decoder. Defaults to 512.
|
||||
speaker_embedding_dim (int, optional): external speaker conditioning vector channels. Defaults to None.
|
||||
gst (bool, optional): enable/disable global style token learning. Defaults to False.
|
||||
gst_embedding_dim (int, optional): size of channels for GST vectors. Defaults to 512.
|
||||
gst_num_heads (int, optional): number of attention heads for GST. Defaults to 4.
|
||||
gst_style_tokens (int, optional): number of GST tokens. Defaults to 10.
|
||||
gst_use_speaker_embedding (bool, optional): enable/disable feeding the speaker embedding to GST. Defaults to False.
|
||||
"""
|
||||
def __init__(self,
|
||||
num_chars,
|
||||
num_speakers,
|
||||
|
@ -93,6 +130,15 @@ class Tacotron2(TacotronAbstract):
|
|||
return mel_outputs, mel_outputs_postnet, alignments
|
||||
|
||||
def forward(self, text, text_lengths, mel_specs=None, mel_lengths=None, speaker_ids=None, speaker_embeddings=None):
|
||||
"""
|
||||
Shapes:
|
||||
text: [B, T_in]
|
||||
text_lengths: [B]
|
||||
mel_specs: [B, T_out, C]
|
||||
mel_lengths: [B]
|
||||
speaker_ids: [B, 1]
|
||||
speaker_embeddings: [B, C]
|
||||
"""
|
||||
# compute mask for padding
|
||||
# B x T_in_max (boolean)
|
||||
input_mask, output_mask = self.compute_masks(text_lengths, mel_lengths)
|
||||
|
|
|
@ -2,7 +2,7 @@ import tensorflow as tf
|
|||
from tensorflow import keras
|
||||
from TTS.tts.tf.utils.tf_utils import shape_list
|
||||
from TTS.tts.tf.layers.common_layers import Prenet, Attention
|
||||
# from tensorflow_addons.seq2seq import AttentionWrapper
|
||||
|
||||
|
||||
# NOTE: linter has a problem with the current TF release
|
||||
#pylint: disable=no-value-for-parameter
|
||||
|
|
|
@ -103,15 +103,13 @@ def setup_model(num_chars, num_speakers, c, speaker_embedding_dim=None):
|
|||
speaker_embedding_dim=speaker_embedding_dim)
|
||||
elif c.model.lower() == "glow_tts":
|
||||
model = MyModel(num_chars=num_chars + getattr(c, "add_blank", False),
|
||||
hidden_channels=192,
|
||||
filter_channels=768,
|
||||
filter_channels_dp=256,
|
||||
out_channels=80,
|
||||
kernel_size=3,
|
||||
num_heads=2,
|
||||
num_layers_enc=6,
|
||||
hidden_channels_enc=c['hidden_channels_encoder'],
|
||||
hidden_channels_dec=c['hidden_channels_decoder'],
|
||||
hidden_channels_dp=c['hidden_channels_duration_predictor'],
|
||||
out_channels=c.audio['num_mels'],
|
||||
encoder_type=c.encoder_type,
|
||||
dropout_p=0.1,
|
||||
encoder_params=c.encoder_params,
|
||||
use_encoder_prenet=c["use_encoder_prenet"],
|
||||
num_flow_blocks_dec=12,
|
||||
kernel_size_dec=5,
|
||||
dilation_rate=1,
|
||||
|
@ -120,20 +118,27 @@ def setup_model(num_chars, num_speakers, c, speaker_embedding_dim=None):
|
|||
num_speakers=num_speakers,
|
||||
c_in_channels=0,
|
||||
num_splits=4,
|
||||
num_sqz=2,
|
||||
num_squeeze=2,
|
||||
sigmoid_scale=False,
|
||||
mean_only=True,
|
||||
hidden_channels_enc=192,
|
||||
hidden_channels_dec=192,
|
||||
use_encoder_prenet=True,
|
||||
external_speaker_embedding_dim=speaker_embedding_dim)
|
||||
elif c.model.lower() == "speedy_speech":
|
||||
model = MyModel(num_chars=num_chars + getattr(c, "add_blank", False),
|
||||
out_channels=c.audio['num_mels'],
|
||||
hidden_channels=c['hidden_channels'],
|
||||
positional_encoding=c['positional_encoding'],
|
||||
encoder_type=c['encoder_type'],
|
||||
encoder_params=c['encoder_params'],
|
||||
decoder_type=c['decoder_type'],
|
||||
decoder_params=c['decoder_params'],
|
||||
c_in_channels=0)
|
||||
return model
|
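For orientation, a hedged sketch of building the new model type from a config file; the import paths and the num_chars value are assumptions (num_chars normally comes from the symbol or phoneme set), while the config path is the test input added later in this diff:

from TTS.utils.io import load_config                  # module path assumed
from TTS.tts.utils.generic_utils import setup_model   # module path assumed

c = load_config("tests/inputs/test_speedy_speech.json")   # run from the repo root
model = setup_model(num_chars=129, num_speakers=0, c=c)   # 129 is a placeholder
print(model.__class__.__name__)                           # expected: SpeedySpeech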
||||
|
||||
def is_tacotron(c):
|
||||
return False if 'glow_tts' in c['model'] else True
|
||||
return False if c['model'] in ['speedy_speech', 'glow_tts'] else True
|
||||
|
||||
def check_config_tts(c):
|
||||
check_argument('model', c, enum_list=['tacotron', 'tacotron2', 'glow_tts'], restricted=True, val_type=str)
|
||||
check_argument('model', c, enum_list=['tacotron', 'tacotron2', 'glow_tts', 'speedy_speech'], restricted=True, val_type=str)
|
||||
check_argument('run_name', c, restricted=True, val_type=str)
|
||||
check_argument('run_description', c, val_type=str)
|
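For reference, a hedged usage sketch of these checks; it assumes check_argument is importable from the same module as check_config_tts and that a missing restricted key or a wrong type trips an assertion (both details are assumptions, not shown in this diff):

from TTS.tts.utils.generic_utils import check_argument  # import path assumed

c = {"model": "speedy_speech", "run_name": "demo", "run_description": "", "r": 1}
check_argument('model', c,
               enum_list=['tacotron', 'tacotron2', 'glow_tts', 'speedy_speech'],
               restricted=True, val_type=str)                      # present, str, in enum
check_argument('r', c, restricted=True, val_type=int, min_val=1)   # present, int, >= 1
check_argument('run_description', c, val_type=str)                 # optional key, only type-checked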
||||
|
||||
|
@ -177,7 +182,7 @@ def check_config_tts(c):
|
|||
check_argument('eval_batch_size', c, restricted=True, val_type=int, min_val=1)
|
||||
check_argument('r', c, restricted=True, val_type=int, min_val=1)
|
||||
check_argument('gradual_training', c, restricted=False, val_type=list)
|
||||
check_argument('apex_amp_level', c, restricted=False, val_type=str)
|
||||
check_argument('mixed_precision', c, restricted=False, val_type=bool)
|
||||
# check_argument('grad_accum', c, restricted=True, val_type=int, min_val=1, max_val=100)
|
||||
|
||||
# loss parameters
|
||||
|
@ -190,6 +195,10 @@ def check_config_tts(c):
|
|||
check_argument('decoder_ssim_alpha', c, restricted=True, val_type=float, min_val=0)
|
||||
check_argument('postnet_ssim_alpha', c, restricted=True, val_type=float, min_val=0)
|
||||
check_argument('ga_alpha', c, restricted=True, val_type=float, min_val=0)
|
||||
if c['model'].lower() == "speedy_speech":
|
||||
check_argument('ssim_alpha', c, restricted=True, val_type=float, min_val=0)
|
||||
check_argument('l1_alpha', c, restricted=True, val_type=float, min_val=0)
|
||||
check_argument('huber_alpha', c, restricted=True, val_type=float, min_val=0)
|
||||
|
||||
# validation parameters
|
||||
check_argument('run_eval', c, restricted=True, val_type=bool)
|
||||
|
@ -201,9 +210,9 @@ def check_config_tts(c):
|
|||
check_argument('grad_clip', c, restricted=True, val_type=float, min_val=0.0)
|
||||
check_argument('epochs', c, restricted=True, val_type=int, min_val=1)
|
||||
check_argument('lr', c, restricted=True, val_type=float, min_val=0)
|
||||
check_argument('wd', c, restricted=True, val_type=float, min_val=0)
|
||||
check_argument('wd', c, restricted=is_tacotron(c), val_type=float, min_val=0)
|
||||
check_argument('warmup_steps', c, restricted=True, val_type=int, min_val=0)
|
||||
check_argument('seq_len_norm', c, restricted=True, val_type=bool)
|
||||
check_argument('seq_len_norm', c, restricted=is_tacotron(c), val_type=bool)
|
||||
|
||||
# tacotron prenet
|
||||
check_argument('memory_size', c, restricted=is_tacotron(c), val_type=int, min_val=-1)
|
||||
|
@ -211,7 +220,7 @@ def check_config_tts(c):
|
|||
check_argument('prenet_dropout', c, restricted=is_tacotron(c), val_type=bool)
|
||||
|
||||
# attention
|
||||
check_argument('attention_type', c, restricted=is_tacotron(c), val_type=str, enum_list=['graves', 'original'])
|
||||
check_argument('attention_type', c, restricted=is_tacotron(c), val_type=str, enum_list=['graves', 'original', 'dynamic_convolution'])
|
||||
check_argument('attention_heads', c, restricted=is_tacotron(c), val_type=int)
|
||||
check_argument('attention_norm', c, restricted=is_tacotron(c), val_type=str, enum_list=['sigmoid', 'softmax'])
|
||||
check_argument('windowing', c, restricted=is_tacotron(c), val_type=bool)
|
||||
|
@ -224,9 +233,17 @@ def check_config_tts(c):
|
|||
check_argument('double_decoder_consistency', c, restricted=is_tacotron(c), val_type=bool)
|
||||
check_argument('ddc_r', c, restricted='double_decoder_consistency' in c.keys(), min_val=1, max_val=7, val_type=int)
|
||||
|
||||
# stopnet
|
||||
check_argument('stopnet', c, restricted=is_tacotron(c), val_type=bool)
|
||||
check_argument('separate_stopnet', c, restricted=is_tacotron(c), val_type=bool)
|
||||
if c['model'].lower() in ['tacotron', 'tacotron2']:
|
||||
# stopnet
|
||||
check_argument('stopnet', c, restricted=is_tacotron(c), val_type=bool)
|
||||
check_argument('separate_stopnet', c, restricted=is_tacotron(c), val_type=bool)
|
||||
|
||||
# Model Parameters for non-tacotron models
|
||||
if c['model'].lower() == "speedy_speech":
|
||||
check_argument('positional_encoding', c, restricted=True, val_type=bool)
|
||||
check_argument('encoder_type', c, restricted=True, val_type=str)
|
||||
check_argument('encoder_params', c, restricted=True, val_type=dict)
|
||||
check_argument('decoder_residual_conv_bn_params', c, restricted=True, val_type=dict)
|
||||
|
||||
# GlowTTS parameters
|
||||
check_argument('encoder_type', c, restricted=not is_tacotron(c), val_type=str)
|
||||
|
@ -248,6 +265,7 @@ def check_config_tts(c):
|
|||
check_argument('batch_group_size', c, restricted=True, val_type=int, min_val=0)
|
||||
check_argument('min_seq_len', c, restricted=True, val_type=int, min_val=0)
|
||||
check_argument('max_seq_len', c, restricted=True, val_type=int, min_val=10)
|
||||
check_argument('compute_input_seq_cache', c, restricted=True, val_type=bool)
|
||||
|
||||
# paths
|
||||
check_argument('output_path', c, restricted=True, val_type=str)
|
||||
|
@ -256,8 +274,8 @@ def check_config_tts(c):
|
|||
check_argument('use_speaker_embedding', c, restricted=True, val_type=bool)
|
||||
check_argument('use_external_speaker_embedding_file', c, restricted=c['use_speaker_embedding'], val_type=bool)
|
||||
check_argument('external_speaker_embedding_file', c, restricted=c['use_external_speaker_embedding_file'], val_type=str)
|
||||
check_argument('use_gst', c, restricted=is_tacotron(c), val_type=bool)
|
||||
if c['model'].lower() in ['tacotron', 'tacotron2'] and c['use_gst']:
|
||||
check_argument('use_gst', c, restricted=is_tacotron(c), val_type=bool)
|
||||
check_argument('gst', c, restricted=is_tacotron(c), val_type=dict)
|
||||
check_argument('gst_style_input', c['gst'], restricted=is_tacotron(c), val_type=[str, dict])
|
||||
check_argument('gst_embedding_dim', c['gst'], restricted=is_tacotron(c), val_type=int, min_val=0, max_val=1000)
|
||||
|
|
|
@ -8,6 +8,17 @@ from TTS.utils.io import RenamingUnpickler
|
|||
|
||||
|
||||
def load_checkpoint(model, checkpoint_path, amp=None, use_cuda=False):
|
||||
"""Load ```TTS.tts.models``` checkpoints.
|
||||
|
||||
Args:
|
||||
model (TTS.tts.models): model object to load the weights for.
|
||||
checkpoint_path (string): checkpoint file path.
|
||||
amp (apex.amp, optional): Apex amp object to load apex related state vars. Defaults to None.
|
||||
use_cuda (bool, optional): load model to GPU if True. Defaults to False.
|
||||
|
||||
Returns:
|
||||
[type]: [description]
|
||||
"""
|
||||
try:
|
||||
state = torch.load(checkpoint_path, map_location=torch.device('cpu'))
|
||||
except ModuleNotFoundError:
|
||||
|
@ -26,6 +37,17 @@ def load_checkpoint(model, checkpoint_path, amp=None, use_cuda=False):
|
|||
|
||||
|
||||
def save_model(model, optimizer, current_step, epoch, r, output_path, amp_state_dict=None, **kwargs):
|
||||
"""Save ```TTS.tts.models``` states with extra fields.
|
||||
|
||||
Args:
|
||||
model (TTS.tts.models.Model): models object to be saved.
|
||||
optimizer (torch.optim.optimizers.Optimizer): model optimizer used for training.
|
||||
current_step (int): current number of training steps.
|
||||
epoch (int): current number of training epochs.
|
||||
r (int): model reduction rate for Tacotron models.
|
||||
output_path (str): output path to save the model file.
|
||||
amp_state_dict (state_dict, optional): Apex.amp state dict if Apex is enabled. Defaults to None.
|
||||
"""
|
||||
if hasattr(model, 'module'):
|
||||
model_state = model.module.state_dict()
|
||||
else:
|
||||
|
@ -45,6 +67,16 @@ def save_model(model, optimizer, current_step, epoch, r, output_path, amp_state_
|
|||
|
||||
|
||||
def save_checkpoint(model, optimizer, current_step, epoch, r, output_folder, **kwargs):
|
||||
"""Save model checkpoint, intended for saving checkpoints at training.
|
||||
|
||||
Args:
|
||||
model (TTS.tts.models.Model): models object to be saved.
|
||||
optimizer (torch.optim.optimizers.Optimizer): model optimizer used for training.
|
||||
current_step (int): current number of training steps.
|
||||
epoch (int): current number of training epochs.
|
||||
r (int): model reduction rate for Tacotron models.
|
||||
output_path (str): output path to save the model file.
|
||||
"""
|
||||
file_name = 'checkpoint_{}.pth.tar'.format(current_step)
|
||||
checkpoint_path = os.path.join(output_folder, file_name)
|
||||
print(" > CHECKPOINT : {}".format(checkpoint_path))
|
||||
|
@ -52,6 +84,23 @@ def save_checkpoint(model, optimizer, current_step, epoch, r, output_folder, **k
|
|||
|
||||
|
||||
def save_best_model(target_loss, best_loss, model, optimizer, current_step, epoch, r, output_folder, **kwargs):
|
||||
"""Save model checkpoint, intended for saving the best model after each epoch.
|
||||
It compares the current model loss with the best loss so far and saves the
|
||||
model if the current loss is better.
|
||||
|
||||
Args:
|
||||
target_loss (float): current model loss.
|
||||
best_loss (float): best loss so far.
|
||||
model (TTS.tts.models.Model): models object to be saved.
|
||||
optimizer (torch.optim.optimizers.Optimizer): model optimizer used for training.
|
||||
current_step (int): current number of training steps.
|
||||
epoch (int): current number of training epochs.
|
||||
r (int): model reduction rate for Tacotron models.
|
||||
output_path (str): output path to save the model file.
|
||||
|
||||
Returns:
|
||||
float: updated current best loss.
|
||||
"""
|
||||
if target_loss < best_loss:
|
||||
file_name = 'best_model.pth.tar'
|
||||
checkpoint_path = os.path.join(output_folder, file_name)
|
||||
|
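A hedged usage sketch of these helpers with a dummy model and optimizer; the module path TTS.tts.utils.io is assumed, while the argument order and file names follow the code above:

import os
import tempfile
import torch
from TTS.tts.utils.io import save_checkpoint, save_best_model  # module path assumed

model = torch.nn.Linear(4, 4)                                   # stand-in for a TTS model
optimizer = torch.optim.SGD(model.parameters(), lr=0.01)
out_dir = tempfile.mkdtemp()

best_loss = float("inf")
for epoch, (step, eval_loss) in enumerate([(100, 1.00), (200, 0.80)]):
    save_checkpoint(model, optimizer, step, epoch, 1, out_dir)  # r=1
    best_loss = save_best_model(eval_loss, best_loss, model, optimizer,
                                step, epoch, 1, out_dir)

# expected files: checkpoint_100.pth.tar, checkpoint_200.pth.tar, best_model.pth.tar
print(sorted(os.listdir(out_dir)))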
|
|
@ -63,7 +63,7 @@ def parse_speakers(c, args, meta_data_train, OUT_PATH):
|
|||
speaker_embedding_dim = None
|
||||
save_speaker_mapping(OUT_PATH, speaker_mapping)
|
||||
num_speakers = len(speaker_mapping)
|
||||
print("Training with {} speakers: {}".format(len(speakers),
|
||||
print(" > Training with {} speakers: {}".format(len(speakers),
|
||||
", ".join(speakers)))
|
||||
else:
|
||||
num_speakers = 0
|
||||
|
|
|
@ -62,7 +62,22 @@ def run_model_torch(model, inputs, CONFIG, truncated, speaker_id=None, style_mel
|
|||
inputs, speaker_ids=speaker_id, speaker_embeddings=speaker_embeddings)
|
||||
elif 'glow' in CONFIG.model.lower():
|
||||
inputs_lengths = torch.tensor(inputs.shape[1:2]).to(inputs.device) # pylint: disable=not-callable
|
||||
postnet_output, _, _, _, alignments, _, _ = model.inference(inputs, inputs_lengths, g=speaker_id if speaker_id else speaker_embeddings)
|
||||
if hasattr(model, 'module'):
|
||||
# distributed model
|
||||
postnet_output, _, _, _, alignments, _, _ = model.module.inference(inputs, inputs_lengths, g=speaker_id if speaker_id is not None else speaker_embeddings)
|
||||
else:
|
||||
postnet_output, _, _, _, alignments, _, _ = model.inference(inputs, inputs_lengths, g=speaker_id if speaker_id is not None else speaker_embeddings)
|
||||
postnet_output = postnet_output.permute(0, 2, 1)
|
||||
# these only belong to tacotron models.
|
||||
decoder_output = None
|
||||
stop_tokens = None
|
||||
elif 'speedy_speech' in CONFIG.model.lower():
|
||||
inputs_lengths = torch.tensor(inputs.shape[1:2]).to(inputs.device) # pylint: disable=not-callable
|
||||
if hasattr(model, 'module'):
|
||||
# distributed model
|
||||
postnet_output, alignments = model.module.inference(inputs, inputs_lengths, g=speaker_id if speaker_id is not None else speaker_embeddings)
|
||||
else:
|
||||
postnet_output, alignments = model.inference(inputs, inputs_lengths, g=speaker_id if speaker_id is not None else speaker_embeddings)
|
||||
postnet_output = postnet_output.permute(0, 2, 1)
|
||||
# these only belong to tacotron models.
|
||||
decoder_output = None
|
||||
|
@ -145,7 +160,8 @@ def inv_spectrogram(postnet_output, ap, CONFIG):
|
|||
def id_to_torch(speaker_id, cuda=False):
|
||||
if speaker_id is not None:
|
||||
speaker_id = np.asarray(speaker_id)
|
||||
speaker_id = torch.from_numpy(speaker_id).unsqueeze(0)
|
||||
# TODO: test this for tacotron models
|
||||
speaker_id = torch.from_numpy(speaker_id)
|
||||
if cuda:
|
||||
return speaker_id.cuda()
|
||||
return speaker_id
|
||||
|
|
|
@ -14,6 +14,7 @@ import re
|
|||
from unidecode import unidecode
|
||||
from .number_norm import normalize_numbers
|
||||
from .abbreviations import abbreviations_en, abbreviations_fr
|
||||
from .time import expand_time_english
|
||||
|
||||
# Regular expression matching whitespace:
|
||||
_whitespace_re = re.compile(r'\s+')
|
||||
|
@ -95,6 +96,7 @@ def english_cleaners(text):
|
|||
'''Pipeline for English text, including number and abbreviation expansion.'''
|
||||
text = convert_to_ascii(text)
|
||||
text = lowercase(text)
|
||||
text = expand_time_english(text)
|
||||
text = expand_numbers(text)
|
||||
text = expand_abbreviations(text)
|
||||
text = replace_symbols(text)
|
||||
|
@ -122,8 +124,8 @@ def portuguese_cleaners(text):
|
|||
|
||||
def phoneme_cleaners(text):
|
||||
'''Pipeline for phonemes mode, including number and abbreviation expansion.'''
|
||||
text = convert_to_ascii(text)
|
||||
text = expand_numbers(text)
|
||||
text = convert_to_ascii(text)
|
||||
text = expand_abbreviations(text)
|
||||
text = replace_symbols(text)
|
||||
text = remove_aux_symbols(text)
|
||||
|
|
|
@ -2,14 +2,14 @@
|
|||
|
||||
import inflect
|
||||
import re
|
||||
from typing import Dict
|
||||
|
||||
_inflect = inflect.engine()
|
||||
_comma_number_re = re.compile(r'([0-9][0-9\,]+[0-9])')
|
||||
_decimal_number_re = re.compile(r'([0-9]+\.[0-9]+)')
|
||||
_pounds_re = re.compile(r'£([0-9\,]*[0-9]+)')
|
||||
_dollars_re = re.compile(r'\$([0-9\.\,]*[0-9]+)')
|
||||
_currency_re = re.compile(r'(£|\$|¥)([0-9\,\.]*[0-9]+)')
|
||||
_ordinal_re = re.compile(r'[0-9]+(st|nd|rd|th)')
|
||||
_number_re = re.compile(r'[0-9]+')
|
||||
_number_re = re.compile(r'-?[0-9]+')
|
||||
|
||||
|
||||
def _remove_commas(m):
|
||||
|
@ -20,24 +20,54 @@ def _expand_decimal_point(m):
|
|||
return m.group(1).replace('.', ' point ')
|
||||
|
||||
|
||||
def _expand_dollars(m):
|
||||
match = m.group(1)
|
||||
parts = match.split('.')
|
||||
def __expand_currency(value: str, inflection: Dict[float, str]) -> str:
|
||||
parts = value.replace(",", "").split('.')
|
||||
if len(parts) > 2:
|
||||
return match + ' dollars' # Unexpected format
|
||||
dollars = int(parts[0]) if parts[0] else 0
|
||||
cents = int(parts[1]) if len(parts) > 1 and parts[1] else 0
|
||||
if dollars and cents:
|
||||
dollar_unit = 'dollar' if dollars == 1 else 'dollars'
|
||||
cent_unit = 'cent' if cents == 1 else 'cents'
|
||||
return '%s %s, %s %s' % (dollars, dollar_unit, cents, cent_unit)
|
||||
if dollars:
|
||||
dollar_unit = 'dollar' if dollars == 1 else 'dollars'
|
||||
return '%s %s' % (dollars, dollar_unit)
|
||||
if cents:
|
||||
cent_unit = 'cent' if cents == 1 else 'cents'
|
||||
return '%s %s' % (cents, cent_unit)
|
||||
return 'zero dollars'
|
||||
return f"{value} {inflection[2]}" # Unexpected format
|
||||
text = []
|
||||
integer = int(parts[0]) if parts[0] else 0
|
||||
if integer > 0:
|
||||
integer_unit = inflection.get(integer, inflection[2])
|
||||
text.append(f"{integer} {integer_unit}")
|
||||
fraction = int(parts[1]) if len(parts) > 1 and parts[1] else 0
|
||||
if fraction > 0:
|
||||
fraction_unit = inflection.get(fraction/100, inflection[0.02])
|
||||
text.append(f"{fraction} {fraction_unit}")
|
||||
if len(text) == 0:
|
||||
return f"zero {inflection[2]}"
|
||||
return " ".join(text)
|
||||
|
||||
|
||||
def _expand_currency(m: "re.Match") -> str:
|
||||
currencies = {
|
||||
"$": {
|
||||
0.01: "cent",
|
||||
0.02: "cents",
|
||||
1: "dollar",
|
||||
2: "dollars",
|
||||
},
|
||||
"€": {
|
||||
0.01: "cent",
|
||||
0.02: "cents",
|
||||
1: "euro",
|
||||
2: "euros",
|
||||
},
|
||||
"£": {
|
||||
0.01: "penny",
|
||||
0.02: "pence",
|
||||
1: "pound sterling",
|
||||
2: "pounds sterling",
|
||||
},
|
||||
"¥": {
|
||||
# TODO rin
|
||||
0.02: "sen",
|
||||
2: "yen",
|
||||
}
|
||||
}
|
||||
unit = m.group(1)
|
||||
currency = currencies[unit]
|
||||
value = m.group(2)
|
||||
return __expand_currency(value, currency)
|
||||
|
||||
|
||||
def _expand_ordinal(m):
|
||||
|
@ -62,8 +92,7 @@ def _expand_number(m):
|
|||
|
||||
def normalize_numbers(text):
|
||||
text = re.sub(_comma_number_re, _remove_commas, text)
|
||||
text = re.sub(_pounds_re, r'\1 pounds', text)
|
||||
text = re.sub(_dollars_re, _expand_dollars, text)
|
||||
text = re.sub(_currency_re, _expand_currency, text)
|
||||
text = re.sub(_decimal_number_re, _expand_decimal_point, text)
|
||||
text = re.sub(_ordinal_re, _expand_ordinal, text)
|
||||
text = re.sub(_number_re, _expand_number, text)
|
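A hedged sketch of what the reworked normalization is expected to produce (the module path is inferred from the relative import in cleaners.py; the exact wording depends on inflect):

from TTS.tts.utils.text.number_norm import normalize_numbers  # module path assumed

print(normalize_numbers("I paid $12.50 and £3."))
# roughly: "I paid twelve dollars fifty cents and three pounds sterling."
print(normalize_numbers("It cost ¥1000."))
# roughly: "It cost one thousand yen."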
||||
|
|
|
@ -0,0 +1,44 @@
|
|||
import re
|
||||
import inflect
|
||||
|
||||
_inflect = inflect.engine()
|
||||
|
||||
_time_re = re.compile(r"""\b
|
||||
((0?[0-9])|(1[0-1])|(1[2-9])|(2[0-3])) # hours
|
||||
:
|
||||
([0-5][0-9]) # minutes
|
||||
\s*(a\.m\.|am|pm|p\.m\.|a\.m|p\.m)? # am/pm
|
||||
\b""",
|
||||
re.IGNORECASE | re.X)
|
||||
|
||||
|
||||
def _expand_num(n: int) -> str:
|
||||
return _inflect.number_to_words(n)
|
||||
|
||||
|
||||
def _expand_time_english(match: "re.Match") -> str:
|
||||
hour = int(match.group(1))
|
||||
past_noon = hour >= 12
|
||||
time = []
|
||||
if hour > 12:
|
||||
hour -= 12
|
||||
elif hour == 0:
|
||||
hour = 12
|
||||
past_noon = True
|
||||
time.append(_expand_num(hour))
|
||||
|
||||
minute = int(match.group(6))
|
||||
if minute > 0:
|
||||
if minute < 10:
|
||||
time.append("oh")
|
||||
time.append(_expand_num(minute))
|
||||
am_pm = match.group(7)
|
||||
if am_pm is None:
|
||||
time.append("p m" if past_noon else "a m")
|
||||
else:
|
||||
time.extend(list(am_pm.replace(".", "")))
|
||||
return " ".join(time)
|
||||
|
||||
|
||||
def expand_time_english(text: str) -> str:
|
||||
return re.sub(_time_re, _expand_time_english, text)
|
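A hedged sketch of the expected behavior of the new module (the module path is inferred from the cleaners.py import above; exact wording depends on inflect):

from TTS.tts.utils.text.time import expand_time_english  # module path assumed

print(expand_time_english("Wake me at 6:30 am"))  # roughly: "Wake me at six thirty a m"
print(expand_time_english("Lunch is at 12:05"))   # roughly: "Lunch is at twelve oh five p m"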
|
@ -17,6 +17,8 @@ def plot_alignment(alignment,
|
|||
alignment_ = alignment.detach().cpu().numpy().squeeze()
|
||||
else:
|
||||
alignment_ = alignment
|
||||
alignment_ = alignment_.astype(
|
||||
np.float32) if alignment_.dtype == np.float16 else alignment_
|
||||
fig, ax = plt.subplots(figsize=fig_size)
|
||||
im = ax.imshow(alignment_.T,
|
||||
aspect='auto',
|
||||
|
|
|
@ -11,6 +11,7 @@ from TTS.tts.utils.data import StandardScaler
|
|||
class AudioProcessor(object):
|
||||
def __init__(self,
|
||||
sample_rate=None,
|
||||
resample=False,
|
||||
num_mels=None,
|
||||
min_level_db=None,
|
||||
frame_shift_ms=None,
|
||||
|
@ -39,6 +40,7 @@ class AudioProcessor(object):
|
|||
print(" > Setting up Audio Processor...")
|
||||
# setup class attributes
|
||||
self.sample_rate = sample_rate
|
||||
self.resample = resample
|
||||
self.num_mels = num_mels
|
||||
self.min_level_db = min_level_db or 0
|
||||
self.frame_shift_ms = frame_shift_ms
|
||||
|
@ -321,7 +323,9 @@ class AudioProcessor(object):
|
|||
|
||||
### save and load ###
|
||||
def load_wav(self, filename, sr=None):
|
||||
if sr is None:
|
||||
if self.resample:
|
||||
x, sr = librosa.load(filename, sr=self.sample_rate)
|
||||
elif sr is None:
|
||||
x, sr = sf.read(filename)
|
||||
assert self.sample_rate == sr, "%s vs %s"%(self.sample_rate, sr)
|
||||
else:
|
||||
|
|
|
@ -3,6 +3,7 @@ import re
|
|||
import json
|
||||
import yaml
|
||||
import pickle as pickle_tts
|
||||
from shutil import copyfile
|
||||
|
||||
|
||||
class RenamingUnpickler(pickle_tts.Unpickler):
|
||||
|
@ -44,16 +45,19 @@ def load_config(config_path: str) -> AttrDict:
|
|||
return config
|
||||
|
||||
|
||||
def copy_config_file(config_file, out_path, new_fields):
|
||||
"""Copy config.json to training folder and add
|
||||
def copy_model_files(c, config_file, out_path, new_fields):
|
||||
"""Copy config.json and other model files to training folder and add
|
||||
new fields.
|
||||
|
||||
Args:
|
||||
c (dict): model config from config.json.
|
||||
config_file (str): path to config file.
|
||||
out_path (str): output path to copy the file.
|
||||
new_fields (dict): new fields to be added or edited
|
||||
in the config file.
|
||||
"""
|
||||
# copy config.json
|
||||
copy_config_path = os.path.join(out_path, 'config.json')
|
||||
config_lines = open(config_file, "r").readlines()
|
||||
# add extra information fields
|
||||
for key, value in new_fields.items():
|
||||
|
@ -62,6 +66,10 @@ def copy_config_file(config_file, out_path, new_fields):
|
|||
else:
|
||||
new_line = '"{}":{},\n'.format(key, value)
|
||||
config_lines.insert(1, new_line)
|
||||
config_out_file = open(out_path, "w")
|
||||
config_out_file = open(copy_config_path, "w")
|
||||
config_out_file.writelines(config_lines)
|
||||
config_out_file.close()
|
||||
# copy model stats file if available
|
||||
if c.audio['stats_path'] is not None:
|
||||
copy_stats_path = os.path.join(out_path, 'scale_stats.npy')
|
||||
copyfile(c.audio['stats_path'], copy_stats_path)
|
||||
|
|
|
@ -105,8 +105,8 @@ class Wavegrad(nn.Module):
|
|||
self.noise_level = self.noise_level.to(y_0)
|
||||
if len(y_0.shape) == 3:
|
||||
y_0 = y_0.squeeze(1)
|
||||
s = torch.randint(1, self.num_steps + 1, [y_0.shape[0]])
|
||||
l_a, l_b = self.noise_level[s-1], self.noise_level[s]
|
||||
s = torch.randint(0, self.num_steps - 1, [y_0.shape[0]])
|
||||
l_a, l_b = self.noise_level[s], self.noise_level[s+1]
|
||||
noise_scale = l_a + torch.rand(y_0.shape[0]).to(y_0) * (l_b - l_a)
|
||||
noise_scale = noise_scale.unsqueeze(1)
|
||||
noise = torch.randn_like(y_0)
|
||||
|
|
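To illustrate the corrected noise-level indexing above, a self-contained sketch with a stand-in schedule (the real Wavegrad schedule and tensor shapes differ):

import torch

num_steps = 1000
noise_level = torch.linspace(1.0, 1e-3, num_steps)  # stand-in schedule of length num_steps
batch = 4

# s stays in [0, num_steps - 2], so noise_level[s + 1] is always a valid index
s = torch.randint(0, num_steps - 1, [batch])
l_a, l_b = noise_level[s], noise_level[s + 1]
noise_scale = l_a + torch.rand(batch) * (l_b - l_a)  # uniform sample inside each segment
print(noise_scale.shape)                             # torch.Size([4])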
|
@ -6,11 +6,12 @@ nosetests tests -x &&\
|
|||
|
||||
# runtime tests
|
||||
./tests/test_server_package.sh && \
|
||||
./tests/test_tts_train.sh && \
|
||||
./tests/test_tacotron_train.sh && \
|
||||
./tests/test_glow-tts_train.sh && \
|
||||
./tests/test_vocoder_gan_train.sh && \
|
||||
./tests/test_vocoder_wavernn_train.sh && \
|
||||
./tests/test_vocoder_wavegrad_train.sh && \
|
||||
./tests/test_speedy_speech_train.sh && \
|
||||
|
||||
# linter check
|
||||
cardboardlinter --refspec master
|
setup.py
|
@ -33,7 +33,7 @@ args, unknown_args = parser.parse_known_args()
|
|||
# Remove our arguments from argv so that setuptools doesn't see them
|
||||
sys.argv = [sys.argv[0]] + unknown_args
|
||||
|
||||
version = '0.0.6'
|
||||
version = '0.0.8'
|
||||
|
||||
# Adapted from https://github.com/pytorch/pytorch
|
||||
cwd = os.path.dirname(os.path.abspath(__file__))
|
||||
|
|
File diff suppressed because it is too large
Binary files not shown.
|
@ -70,6 +70,7 @@
|
|||
"eval_batch_size":1,
|
||||
"r": 1, // Number of decoder frames to predict per iteration. Set the initial values if gradual training is enabled.
|
||||
"loss_masking": true, // enable / disable loss masking against the sequence padding.
|
||||
"data_dep_init_iter": 1,
|
||||
|
||||
// VALIDATION
|
||||
"run_eval": true,
|
||||
|
@ -85,7 +86,19 @@
|
|||
"warmup_steps": 4000, // Noam decay steps to increase the learning rate from 0 to "lr"
|
||||
"seq_len_norm": false, // Normalize eash sample loss with its length to alleviate imbalanced datasets. Use it if your dataset is small or has skewed distribution of sequence lengths.
|
||||
|
||||
"encoder_type": "gatedconv",
|
||||
"hidden_channels_encoder": 192,
|
||||
"hidden_channels_decoder": 192,
|
||||
"hidden_channels_duration_predictor": 256,
|
||||
"use_encoder_prenet": true,
|
||||
"encoder_type": "rel_pos_transformer",
|
||||
"encoder_params": {
|
||||
"kernel_size":3,
|
||||
"dropout_p": 0.1,
|
||||
"num_layers": 6,
|
||||
"num_heads": 2,
|
||||
"hidden_channels_ffn": 768,
|
||||
"input_length": null
|
||||
},
|
||||
|
||||
// TENSORBOARD and LOGGING
|
||||
"print_step": 25, // Number of steps to log training on console.
|
||||
|
@ -105,6 +118,8 @@
|
|||
"min_seq_len": 3, // DATASET-RELATED: minimum text length to use in training
|
||||
"max_seq_len": 500, // DATASET-RELATED: maximum text length
|
||||
"compute_f0": false, // compute f0 values in data-loader
|
||||
"compute_input_seq_cache": true,
|
||||
"use_noise_augment": true,
|
||||
|
||||
// PATHS
|
||||
"output_path": "tests/train_outputs/",
|
||||
|
|
|
@ -0,0 +1,153 @@
|
|||
{
|
||||
"model": "speedy_speech",
|
||||
"run_name": "test_sample_dataset_run",
|
||||
"run_description": "sample dataset test run",
|
||||
|
||||
// AUDIO PARAMETERS
|
||||
"audio":{
|
||||
// stft parameters
|
||||
"fft_size": 1024, // number of stft frequency levels. Size of the linear spectogram frame.
|
||||
"win_length": 1024, // stft window length in ms.
|
||||
"hop_length": 256, // stft window hop-lengh in ms.
|
||||
"frame_length_ms": null, // stft window length in ms.If null, 'win_length' is used.
|
||||
"frame_shift_ms": null, // stft window hop-lengh in ms. If null, 'hop_length' is used.
|
||||
|
||||
// Audio processing parameters
|
||||
"sample_rate": 22050, // DATASET-RELATED: wav sample-rate.
|
||||
"preemphasis": 0.0, // pre-emphasis to reduce spec noise and make it more structured. If 0.0, no -pre-emphasis.
|
||||
"ref_level_db": 20, // reference level db, theoretically 20db is the sound of air.
|
||||
|
||||
// Silence trimming
|
||||
"do_trim_silence": true,// enable trimming of slience of audio as you load it. LJspeech (true), TWEB (false), Nancy (true)
|
||||
"trim_db": 60, // threshold for timming silence. Set this according to your dataset.
|
||||
|
||||
// Griffin-Lim
|
||||
"power": 1.5, // value to sharpen wav signals after GL algorithm.
|
||||
"griffin_lim_iters": 60,// #griffin-lim iterations. 30-60 is a good range. Larger the value, slower the generation.
|
||||
|
||||
// MelSpectrogram parameters
|
||||
"num_mels": 80, // size of the mel spec frame.
|
||||
"mel_fmin": 50.0, // minimum freq level for mel-spec. ~50 for male and ~95 for female voices. Tune for dataset!!
|
||||
"mel_fmax": 7600.0, // maximum freq level for mel-spec. Tune for dataset!!
|
||||
"spec_gain": 1,
|
||||
|
||||
// Normalization parameters
|
||||
"signal_norm": true, // normalize spec values. Mean-Var normalization if 'stats_path' is defined otherwise range normalization defined by the other params.
|
||||
"min_level_db": -100, // lower bound for normalization
|
||||
"symmetric_norm": true, // move normalization to range [-1, 1]
|
||||
"max_norm": 4.0, // scale normalization to range [-max_norm, max_norm] or [0, max_norm]
|
||||
"clip_norm": true, // clip normalized values into the range.
|
||||
"stats_path": null // DO NOT USE WITH MULTI_SPEAKER MODEL. scaler stats file computed by 'compute_statistics.py'. If it is defined, mean-std based notmalization is used and other normalization params are ignored
|
||||
},
|
||||
|
||||
// VOCABULARY PARAMETERS
|
||||
// if custom character set is not defined,
|
||||
// default set in symbols.py is used
|
||||
// "characters":{
|
||||
// "pad": "_",
|
||||
// "eos": "&",
|
||||
// "bos": "*",
|
||||
// "characters": "ABCDEFGHIJKLMNOPQRSTUVWXYZÇÃÀÁÂÊÉÍÓÔÕÚÛabcdefghijklmnopqrstuvwxyzçãàáâêéíóôõúû!(),-.:;? ",
|
||||
// "punctuations":"!'(),-.:;? ",
|
||||
// "phonemes":"iyɨʉɯuɪʏʊeøɘəɵɤoɛœɜɞʌɔæɐaɶɑɒᵻʘɓǀɗǃʄǂɠǁʛpbtdʈɖcɟkɡqɢʔɴŋɲɳnɱmʙrʀⱱɾɽɸβfvθðszʃʒʂʐçʝxɣχʁħʕhɦɬɮʋɹɻjɰlɭʎʟˈˌːˑʍwɥʜʢʡɕʑɺɧɚ˞ɫ'̃' "
|
||||
// },
|
||||
|
||||
"add_blank": false, // if true add a new token after each token of the sentence. This increases the size of the input sequence, but has considerably improved the prosody of the GlowTTS model.
|
||||
|
||||
// DISTRIBUTED TRAINING
|
||||
"distributed":{
|
||||
"backend": "nccl",
|
||||
"url": "tcp:\/\/localhost:54321"
|
||||
},
|
||||
|
||||
"reinit_layers": [], // give a list of layer names to restore from the given checkpoint. If not defined, it reloads all heuristically matching layers.
|
||||
|
||||
// MODEL PARAMETERS
|
||||
"positional_encoding": true,
|
||||
"hidden_channels": 128,
|
||||
"encoder_type": "residual_conv_bn",
|
||||
"encoder_type": "residual_conv_bn",
|
||||
"encoder_params":{
|
||||
"kernel_size": 4,
|
||||
"dilations": [1, 2, 4, 1, 2, 4, 1, 2, 4, 1, 2, 4, 1],
|
||||
"num_conv_blocks": 2,
|
||||
"num_res_blocks": 13
|
||||
},
|
||||
"decoder_type": "residual_conv_bn",
|
||||
"decoder_params":{
|
||||
"kernel_size": 4,
|
||||
"dilations": [1, 2, 4, 8, 1, 2, 4, 8, 1, 2, 4, 8, 1, 2, 4, 8, 1],
|
||||
"num_conv_blocks": 2,
|
||||
"num_res_blocks": 17
|
||||
},
|
||||
|
||||
|
||||
// TRAINING
|
||||
"batch_size":64, // Batch size for training. Lower values than 32 might cause hard to learn attention. It is overwritten by 'gradual_training'.
|
||||
"eval_batch_size":32,
|
||||
"r": 1, // Number of decoder frames to predict per iteration. Set the initial values if gradual training is enabled.
|
||||
"loss_masking": true, // enable / disable loss masking against the sequence padding.
|
||||
|
||||
// LOSS PARAMETERS
|
||||
"ssim_alpha": 1,
|
||||
"l1_alpha": 1,
|
||||
"huber_alpha": 1,
|
||||
|
||||
// VALIDATION
|
||||
"run_eval": true,
|
||||
"test_delay_epochs": -1, //Until attention is aligned, testing only wastes computation time.
|
||||
"test_sentences_file": null, // set a file to load sentences to be used for testing. If it is null then we use default english sentences.
|
||||
|
||||
// OPTIMIZER
|
||||
"noam_schedule": true, // use noam warmup and lr schedule.
|
||||
"grad_clip": 1.0, // upper limit for gradients for clipping.
|
||||
"epochs": 1, // total number of epochs to train.
|
||||
"lr": 0.002, // Initial learning rate. If Noam decay is active, maximum learning rate.
|
||||
"warmup_steps": 4000, // Noam decay steps to increase the learning rate from 0 to "lr"
|
||||
|
||||
// TENSORBOARD and LOGGING
|
||||
"print_step": 1, // Number of steps to log training on console.
|
||||
"tb_plot_step": 100, // Number of steps to plot TB training figures.
|
||||
"print_eval": false, // If True, it prints intermediate loss values in evalulation.
|
||||
"save_step": 5000, // Number of training steps expected to save traninpg stats and checkpoints.
|
||||
"checkpoint": true, // If true, it saves checkpoints per "save_step"
|
||||
"tb_model_param_stats": false, // true, plots param stats per layer on tensorboard. Might be memory consuming, but good for debugging.:set n
|
||||
"mixed_precision": false,
|
||||
|
||||
// DATA LOADING
|
||||
"text_cleaner": "english_cleaners",
|
||||
"enable_eos_bos_chars": false, // enable/disable beginning of sentence and end of sentence chars.
|
||||
"num_loader_workers": 0, // number of training data loader processes. Don't set it too big. 4-8 are good values.
|
||||
"num_val_loader_workers": 0, // number of evaluation data loader processes.
|
||||
"batch_group_size": 0, //Number of batches to shuffle after bucketing.
|
||||
"min_seq_len": 2, // DATASET-RELATED: minimum text length to use in training
|
||||
"max_seq_len": 300, // DATASET-RELATED: maximum text length
|
||||
"compute_f0": false, // compute f0 values in data-loader
|
||||
"compute_input_seq_cache": false, // if true, text sequences are computed before starting training. If phonemes are enabled, they are also computed at this stage.
|
||||
|
||||
// PATHS
|
||||
"output_path": "tests/train_outputs/",
|
||||
|
||||
// PHONEMES
|
||||
"phoneme_cache_path": "tests/train_outputs/phoneme_cache/", // phoneme computation is slow, therefore, it caches results in the given folder.
|
||||
"use_phonemes": true, // use phonemes instead of raw characters. It is suggested for better pronoun[ciation.
|
||||
"phoneme_language": "en-us", // depending on your target language, pick one from https://github.com/bootphon/phonemizer#languages
|
||||
|
||||
// MULTI-SPEAKER and GST
|
||||
"use_speaker_embedding": false, // use speaker embedding to enable multi-speaker learning.
|
||||
"use_external_speaker_embedding_file": false, // if true, forces the model to use external embedding per sample instead of nn.embeddings, that is, it supports external embeddings such as those used at: https://arxiv.org/abs /1806.04558
|
||||
"external_speaker_embedding_file": "/home/erogol/Data/libritts/speakers.json", // if not null and use_external_speaker_embedding_file is true, it is used to load a specific embedding file and thus uses these embeddings instead of nn.embeddings, that is, it supports external embeddings such as those used at: https://arxiv.org/abs /1806.04558
|
||||
|
||||
|
||||
// DATASETS
|
||||
"datasets": // List of datasets. They all merged and they get different speaker_ids.
|
||||
[
|
||||
{
|
||||
"name": "ljspeech",
|
||||
"path": "tests/data/ljspeech/",
|
||||
"meta_file_train": "metadata.csv",
|
||||
"meta_file_val": "metadata.csv",
|
||||
"meta_file_attn_mask": "tests/data/ljspeech/metadata_attn_mask.txt"
|
||||
}
|
||||
]
|
||||
}
|
|
@ -100,7 +100,7 @@
|
|||
"prenet_dropout": false, // enable/disable dropout at prenet.
|
||||
|
||||
// TACOTRON ATTENTION
|
||||
"attention_type": "original", // 'original' or 'graves'
|
||||
"attention_type": "original", // 'original' , 'graves', 'dynamic_convolution'
|
||||
"attention_heads": 4, // number of attention heads (only for 'graves')
|
||||
"attention_norm": "sigmoid", // softmax or sigmoid.
|
||||
"windowing": false, // Enables attention windowing. Used only in eval mode.
|
||||
|
@ -132,6 +132,7 @@
|
|||
"batch_group_size": 0, //Number of batches to shuffle after bucketing.
|
||||
"min_seq_len": 6, // DATASET-RELATED: minimum text length to use in training
|
||||
"max_seq_len": 153, // DATASET-RELATED: maximum text length
|
||||
"compute_input_seq_cache": true,
|
||||
|
||||
// PATHS
|
||||
"output_path": "tests/train_outputs/",
|
||||
|
|
|
@ -42,60 +42,62 @@ class GlowTTSTrainTest(unittest.TestCase):
|
|||
criterion = GlowTTSLoss()
|
||||
|
||||
# model to train
|
||||
model = GlowTts(num_chars=32,
|
||||
hidden_channels=128,
|
||||
filter_channels=32,
|
||||
filter_channels_dp=32,
|
||||
out_channels=80,
|
||||
kernel_size=3,
|
||||
num_heads=2,
|
||||
num_layers_enc=6,
|
||||
dropout_p=0.1,
|
||||
num_flow_blocks_dec=12,
|
||||
kernel_size_dec=5,
|
||||
dilation_rate=5,
|
||||
num_block_layers=4,
|
||||
dropout_p_dec=0.,
|
||||
num_speakers=0,
|
||||
c_in_channels=0,
|
||||
num_splits=4,
|
||||
num_sqz=1,
|
||||
sigmoid_scale=False,
|
||||
rel_attn_window_size=None,
|
||||
input_length=None,
|
||||
mean_only=False,
|
||||
hidden_channels_enc=None,
|
||||
hidden_channels_dec=None,
|
||||
use_encoder_prenet=False,
|
||||
encoder_type="transformer").to(device)
|
||||
model = GlowTts(
|
||||
num_chars=32,
|
||||
hidden_channels_enc=128,
|
||||
hidden_channels_dec=128,
|
||||
hidden_channels_dp=32,
|
||||
out_channels=80,
|
||||
encoder_type='rel_pos_transformer',
|
||||
encoder_params={
|
||||
'kernel_size': 3,
|
||||
'dropout_p': 0.1,
|
||||
'num_layers': 6,
|
||||
'num_heads': 2,
|
||||
'hidden_channels_ffn': 768, # 4 times the hidden_channels
|
||||
'input_length': None
|
||||
},
|
||||
use_encoder_prenet=True,
|
||||
num_flow_blocks_dec=12,
|
||||
kernel_size_dec=5,
|
||||
dilation_rate=5,
|
||||
num_block_layers=4,
|
||||
dropout_p_dec=0.,
|
||||
num_speakers=0,
|
||||
c_in_channels=0,
|
||||
num_splits=4,
|
||||
num_squeeze=1,
|
||||
sigmoid_scale=False,
|
||||
mean_only=False).to(device)
|
||||
|
||||
# reference model to compare model weights
|
||||
model_ref = GlowTts(num_chars=32,
|
||||
hidden_channels=128,
|
||||
filter_channels=32,
|
||||
filter_channels_dp=32,
|
||||
out_channels=80,
|
||||
kernel_size=3,
|
||||
num_heads=2,
|
||||
num_layers_enc=6,
|
||||
dropout_p=0.1,
|
||||
num_flow_blocks_dec=12,
|
||||
kernel_size_dec=5,
|
||||
dilation_rate=5,
|
||||
num_block_layers=4,
|
||||
dropout_p_dec=0.,
|
||||
num_speakers=0,
|
||||
c_in_channels=0,
|
||||
num_splits=4,
|
||||
num_sqz=1,
|
||||
sigmoid_scale=False,
|
||||
rel_attn_window_size=None,
|
||||
input_length=None,
|
||||
mean_only=False,
|
||||
hidden_channels_enc=None,
|
||||
hidden_channels_dec=None,
|
||||
use_encoder_prenet=False,
|
||||
encoder_type="transformer").to(device)
|
||||
model_ref = GlowTts(
|
||||
num_chars=32,
|
||||
hidden_channels_enc=128,
|
||||
hidden_channels_dec=128,
|
||||
hidden_channels_dp=32,
|
||||
out_channels=80,
|
||||
encoder_type='rel_pos_transformer',
|
||||
encoder_params={
|
||||
'kernel_size': 3,
|
||||
'dropout_p': 0.1,
|
||||
'num_layers': 6,
|
||||
'num_heads': 2,
|
||||
'hidden_channels_ffn': 768, # 4 times the hidden_channels
|
||||
'input_length': None
|
||||
},
|
||||
use_encoder_prenet=True,
|
||||
num_flow_blocks_dec=12,
|
||||
kernel_size_dec=5,
|
||||
dilation_rate=5,
|
||||
num_block_layers=4,
|
||||
dropout_p_dec=0.,
|
||||
num_speakers=0,
|
||||
c_in_channels=0,
|
||||
num_splits=4,
|
||||
num_squeeze=1,
|
||||
sigmoid_scale=False,
|
||||
mean_only=False).to(device)
|
||||
|
||||
model.train()
|
||||
print(" > Num parameters for GlowTTS model:%s" %
|
||||
|
|
|
@ -0,0 +1,168 @@
|
|||
import torch
|
||||
|
||||
from TTS.tts.layers.speedy_speech.encoder import Encoder
|
||||
from TTS.tts.layers.speedy_speech.decoder import Decoder
|
||||
from TTS.tts.layers.speedy_speech.duration_predictor import DurationPredictor
|
||||
from TTS.tts.utils.generic_utils import sequence_mask
|
||||
from TTS.tts.models.speedy_speech import SpeedySpeech
|
||||
|
||||
|
||||
use_cuda = torch.cuda.is_available()
|
||||
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
|
||||
|
||||
|
||||
def test_encoder():
|
||||
input_dummy = torch.rand(8, 14, 37).to(device)
|
||||
input_lengths = torch.randint(31, 37, (8, )).long().to(device)
|
||||
input_lengths[-1] = 37
|
||||
input_mask = torch.unsqueeze(
|
||||
sequence_mask(input_lengths, input_dummy.size(2)), 1).to(device)
|
||||
|
||||
# residual bn conv encoder
|
||||
layer = Encoder(out_channels=11,
|
||||
in_hidden_channels=14,
|
||||
encoder_type='residual_conv_bn').to(device)
|
||||
output = layer(input_dummy, input_mask)
|
||||
assert list(output.shape) == [8, 11, 37]
|
||||
|
||||
# transformer encoder
|
||||
layer = Encoder(out_channels=11,
|
||||
in_hidden_channels=14,
|
||||
encoder_type='transformer',
|
||||
encoder_params={
|
||||
'hidden_channels_ffn': 768,
|
||||
'num_heads': 2,
|
||||
"kernel_size": 3,
|
||||
"dropout_p": 0.1,
|
||||
"num_layers": 6,
|
||||
"rel_attn_window_size": 4,
|
||||
"input_length": None
|
||||
}).to(device)
|
||||
output = layer(input_dummy, input_mask)
|
||||
assert list(output.shape) == [8, 11, 37]
|
||||
|
||||
|
||||
def test_decoder():
|
||||
input_dummy = torch.rand(8, 128, 37).to(device)
|
||||
input_lengths = torch.randint(31, 37, (8, )).long().to(device)
|
||||
input_lengths[-1] = 37
|
||||
|
||||
input_mask = torch.unsqueeze(
|
||||
sequence_mask(input_lengths, input_dummy.size(2)), 1).to(device)
|
||||
|
||||
# residual bn conv decoder
|
||||
layer = Decoder(out_channels=11, in_hidden_channels=128).to(device)
|
||||
output = layer(input_dummy, input_mask)
|
||||
assert list(output.shape) == [8, 11, 37]
|
||||
|
||||
# transformer decoder
|
||||
layer = Decoder(out_channels=11,
|
||||
in_hidden_channels=128,
|
||||
decoder_type='transformer',
|
||||
decoder_params={
|
||||
'hidden_channels_ffn': 128,
|
||||
'num_heads': 2,
|
||||
"kernel_size": 3,
|
||||
"dropout_p": 0.1,
|
||||
"num_layers": 8,
|
||||
"rel_attn_window_size": 4,
|
||||
"input_length": None
|
||||
}).to(device)
|
||||
output = layer(input_dummy, input_mask)
|
||||
assert list(output.shape) == [8, 11, 37]
|
||||
|
||||
|
||||
# wavenet decoder
|
||||
layer = Decoder(out_channels=11,
|
||||
in_hidden_channels=128,
|
||||
decoder_type='wavenet',
|
||||
decoder_params={
|
||||
"num_blocks": 12,
|
||||
"hidden_channels": 192,
|
||||
"kernel_size": 5,
|
||||
"dilation_rate": 1,
|
||||
"num_layers": 4,
|
||||
"dropout_p": 0.05
|
||||
}).to(device)
|
||||
output = layer(input_dummy, input_mask)
|
||||
assert list(output.shape) == [8, 11, 37]
|
||||
|
||||
|
||||
|
||||
def test_duration_predictor():
|
||||
input_dummy = torch.rand(8, 128, 27).to(device)
|
||||
input_lengths = torch.randint(20, 27, (8, )).long().to(device)
|
||||
input_lengths[-1] = 27
|
||||
|
||||
x_mask = torch.unsqueeze(sequence_mask(input_lengths, input_dummy.size(2)),
|
||||
1).to(device)
|
||||
|
||||
layer = DurationPredictor(hidden_channels=128).to(device)
|
||||
|
||||
output = layer(input_dummy, x_mask)
|
||||
assert list(output.shape) == [8, 1, 27]
|
||||
|
||||
|
||||
def test_speedy_speech():
|
||||
num_chars = 7
|
||||
B = 8
|
||||
T_en = 37
|
||||
T_de = 74
|
||||
|
||||
x_dummy = torch.randint(0, 7, (B, T_en)).long().to(device)
|
||||
x_lengths = torch.randint(31, T_en, (B, )).long().to(device)
|
||||
x_lengths[-1] = T_en
|
||||
|
||||
# set durations. max total duration should be equal to T_de
|
||||
durations = torch.randint(1, 4, (B, T_en))
|
||||
durations = durations * (T_de / durations.sum(1)).unsqueeze(1)
|
||||
durations = durations.to(torch.long).to(device)
|
||||
max_dur = durations.sum(1).max()
|
||||
durations[:, 0] += T_de - max_dur if T_de > max_dur else 0
|
||||
|
||||
y_lengths = durations.sum(1)
|
||||
|
||||
model = SpeedySpeech(num_chars, out_channels=80, hidden_channels=128)
|
||||
if use_cuda:
|
||||
model.cuda()
|
||||
|
||||
# forward pass
|
||||
o_de, o_dr, attn = model(x_dummy, x_lengths, y_lengths, durations)
|
||||
|
||||
assert list(o_de.shape) == [B, 80, T_de], f"{list(o_de.shape)}"
|
||||
assert list(attn.shape) == [B, T_de, T_en]
|
||||
assert list(o_dr.shape) == [B, T_en]
|
||||
|
||||
# with speaker embedding
|
||||
model = SpeedySpeech(num_chars,
|
||||
out_channels=80,
|
||||
hidden_channels=128,
|
||||
num_speakers=10,
|
||||
c_in_channels=256).to(device)
|
||||
model.forward(x_dummy,
|
||||
x_lengths,
|
||||
y_lengths,
|
||||
durations,
|
||||
g=torch.randint(0, 10, (B,)).to(device))
|
||||
|
||||
assert list(o_de.shape) == [B, 80, T_de], f"{list(o_de.shape)}"
|
||||
assert list(attn.shape) == [B, T_de, T_en]
|
||||
assert list(o_dr.shape) == [B, T_en]
|
||||
|
||||
|
||||
# with speaker external embedding
|
||||
model = SpeedySpeech(num_chars,
|
||||
out_channels=80,
|
||||
hidden_channels=128,
|
||||
num_speakers=10,
|
||||
external_c=True,
|
||||
c_in_channels=256).to(device)
|
||||
model.forward(x_dummy,
|
||||
x_lengths,
|
||||
y_lengths,
|
||||
durations,
|
||||
g=torch.rand((B,256)).to(device))
|
||||
|
||||
assert list(o_de.shape) == [B, 80, T_de], f"{list(o_de.shape)}"
|
||||
assert list(attn.shape) == [B, T_de, T_en]
|
||||
assert list(o_dr.shape) == [B, T_en]
|
|
@ -3,11 +3,11 @@ set -xe
|
|||
BASEDIR=$(dirname "$0")
|
||||
echo "$BASEDIR"
|
||||
# run training
|
||||
CUDA_VISIBLE_DEVICES="" python TTS/bin/train_tacotron.py --config_path $BASEDIR/inputs/test_train_config.json
|
||||
CUDA_VISIBLE_DEVICES="" python TTS/bin/train_speedy_speech.py --config_path $BASEDIR/inputs/test_speedy_speech.json
|
||||
# find the training folder
|
||||
LATEST_FOLDER=$(ls $BASEDIR/train_outputs/| sort | tail -1)
|
||||
echo $LATEST_FOLDER
|
||||
# continue the previous training
|
||||
CUDA_VISIBLE_DEVICES="" python TTS/bin/train_tacotron.py --continue_path $BASEDIR/train_outputs/$LATEST_FOLDER
|
||||
CUDA_VISIBLE_DEVICES="" python TTS/bin/train_speedy_speech.py --continue_path $BASEDIR/train_outputs/$LATEST_FOLDER
|
||||
# remove all the outputs
|
||||
rm -rf $BASEDIR/train_outputs/
|
|
@ -1,14 +1,13 @@
|
|||
#!/usr/bin/env bash
|
||||
|
||||
set -xe
|
||||
BASEDIR=$(dirname "$0")
|
||||
echo "$BASEDIR"
|
||||
# run training
|
||||
CUDA_VISIBLE_DEVICES="" python TTS/bin/train_tts.py --config_path $BASEDIR/inputs/test_train_config.json
|
||||
CUDA_VISIBLE_DEVICES="" python TTS/bin/train_tacotron.py --config_path $BASEDIR/inputs/test_train_config.json
|
||||
# find the training folder
|
||||
LATEST_FOLDER=$(ls $BASEDIR/train_outputs/| sort | tail -1)
|
||||
echo $LATEST_FOLDER
|
||||
# continue the previous training
|
||||
CUDA_VISIBLE_DEVICES="" python TTS/bin/train_tts.py --continue_path $BASEDIR/train_outputs/$LATEST_FOLDER
|
||||
CUDA_VISIBLE_DEVICES="" python TTS/bin/train_tacotron.py --continue_path $BASEDIR/train_outputs/$LATEST_FOLDER
|
||||
# remove all the outputs
|
||||
rm -rf $BASEDIR/train_outputs/
|
||||
|
|
Some files were not shown because too many files have changed in this diff.