mirror of https://github.com/coqui-ai/TTS.git
Merge branch 'dev-tacotron2'
commit
70929387c0
17
.compute
17
.compute
|
@ -1,7 +1,16 @@
|
|||
#!/bin/bash
|
||||
ls ${SHARED_DIR}/data/keithito
|
||||
pip3 install https://download.pytorch.org/whl/cu100/torch-1.0.1.post2-cp36-cp36m-linux_x86_64.whl
|
||||
yes | apt-get install sox
|
||||
yes | apt-get install ffmpeg
|
||||
yes | apt-get install espeak
|
||||
yes | apt-get install tmux
|
||||
yes | apt-get install zsh
|
||||
pip3 install https://download.pytorch.org/whl/cu100/torch-1.1.0-cp37-cp37m-linux_x86_64.whl
|
||||
# wget https://www.dropbox.com/s/m8waow6b3ydpf6h/MozillaDataset.tar.gz?dl=0 -O /data/rw/home/mozilla.tar
|
||||
wget https://www.dropbox.com/s/wqn5v3wkktw9lmo/install.sh?dl=0 -O install.sh
|
||||
sudo sh install.sh
|
||||
python3 setup.py develop
|
||||
# python3 distribute.py --config_path config_cluster.json --data_path ${SHARED_DIR}/data/keithito/LJSpeech-1.1/ --restore_path ${USER_DIR}/best_model.pth.tar
|
||||
python3 train.py --config_path config.json --data_path ${SHARED_DIR}/data/keithito/LJSpeech-1.1/ --output_path ../keep/ --restore_path ${USER_DIR}/best_model_by_tilman.pth.tar
|
||||
# cp -R ${USER_DIR}/Mozilla_22050 ../tmp/
|
||||
cp -R ${USER_DIR}/GermanData ../tmp/
|
||||
python3 distribute.py --config_path config_tacotron.json --data_path ../tmp/GermanData/karlsson/
|
||||
# python3 distribute.py --config_path config_cluster.json --data_path ${SHARED_DIR}/data/mozilla/Judy/
|
||||
while true; do sleep 1000000; done
|
||||
|
|
|
@ -1,3 +1,4 @@
|
|||
.idea/
|
||||
*.pyc
|
||||
.DS_Store
|
||||
./__init__.py
|
||||
|
|
82
README.md
82
README.md
|
@ -3,9 +3,14 @@
|
|||
|
||||
This project is a part of [Mozilla Common Voice](https://voice.mozilla.org/en). TTS aims to be a deep learning based Text2Speech engine, low in cost and high in quality. To begin with, you can hear a sample generated voice from [here](https://soundcloud.com/user-565970875/commonvoice-loc-sens-attn).
|
||||
|
||||
The model architecture is highly inspired by Tacotron: [A Fully End-to-End Text-To-Speech Synthesis Model](https://arxiv.org/abs/1703.10135). However, it has many important updates that make training faster and computationally very efficient. Feel free to experiment with new ideas and propose changes.
|
||||
TTS includes two different model implementations which are based on [Tacotron](https://arxiv.org/abs/1703.10135) and [Tacotron2](https://arxiv.org/abs/1712.05884). Tacotron is smaller, efficient and easier to train but Tacotron2 provides better results, especially when it is combined with a Neural vocoder. Therefore, choose depending on your project requirements.
|
||||
|
||||
You can find [here](http://www.erogol.com/text-speech-deep-learning-architectures/) a brief note about TTS architectures and their comparisons.
|
||||
If you are new, you can also find [here](http://www.erogol.com/text-speech-deep-learning-architectures/) a brief post about TTS architectures and their comparisons.
|
||||
|
||||
## TTS Performance
|
||||
<p align="center"><img src="https://user-images.githubusercontent.com/1402048/56998082-36d43500-6baa-11e9-8ca3-6c91d3a747bf.png"/></p>
|
||||
|
||||
[Details...](https://github.com/mozilla/TTS/issues/186)
|
||||
|
||||
## Requirements and Installation
|
||||
Highly recommended to use [miniconda](https://conda.io/miniconda.html) for easier installation.
|
||||
|
@ -38,41 +43,43 @@ Check out [here](https://mycroft.ai/blog/available-voices/#the-human-voice-is-th
|
|||
|
||||
| Models |Dataset | Commit | Audio Sample | Details |
|
||||
| ------------- |:------:|:-----------------:|:--------------|:--------|
|
||||
| [iter-62410](https://drive.google.com/open?id=1pjJNzENL3ZNps9n7k_ktGbpEl6YPIkcZ)|LJSpeech| [99d56f7](https://github.com/mozilla/TTS/tree/99d56f7e93ccd7567beb0af8fcbd4d24c48e59e9) | [link](https://soundcloud.com/user-565970875/99d56f7-iter62410 )|First model with plain Tacotron implementation.|
|
||||
| [iter-170K](https://drive.google.com/open?id=16L6JbPXj6MSlNUxEStNn28GiSzi4fu1j) |LJSpeech| [e00bc66](https://github.com/mozilla/TTS/tree/e00bc66) |[link](https://soundcloud.com/user-565970875/april-13-2018-07-06pm-e00bc66-iter170k)|More stable and longer trained model.|
|
||||
| [iter-270K](https://drive.google.com/drive/folders/1Q6BKeEkZyxSGsocK2p_mqgzLwlNvbHFJ?usp=sharing)|LJSpeech|[256ed63](https://github.com/mozilla/TTS/tree/256ed63)|[link](https://soundcloud.com/user-565970875/sets/samples-1650226)|Stop-Token prediction is added, to detect end of speech.|
|
||||
| [iter-120K](https://drive.google.com/open?id=1A5Hr6aSvfGgIiE20mBkpzyn3vvbR2APj) |LJSpeech| [bf7590](https://github.com/mozilla/TTS/tree/bf7590) | [link](https://soundcloud.com/user-565970875/sets/september-26-2018-bf7590) | Better for longer sentences |
|
||||
|[iter-108K](https://drive.google.com/open?id=1deQ2akq9cuyreda0DgZOiBdydkbgseWP)| TWEB | [2810d57](https://github.com/mozilla/TTS/tree/2810d57) | [link](https://soundcloud.com/user-565970875/tweb-example-108k-iters-2810d57) | https://github.com/mozilla/TTS/issues/22 |
|
||||
| Best: [iter-185K](https://drive.google.com/drive/folders/1GU8WGix98WrR3ayjoiirmmbLUZzwg4n0?usp=sharing) | LJSpeech | [db7f3d3](https://github.com/mozilla/TTS/tree/db7f3d3) | [link](https://soundcloud.com/user-565970875/sets/ljspeech-model-185k-iters-commit-db7f3d3) | [link](https://github.com/mozilla/TTS/issues/108) |
|
||||
| [Tacotron-iter-62410](https://drive.google.com/open?id=1pjJNzENL3ZNps9n7k_ktGbpEl6YPIkcZ)|LJSpeech| [99d56f7](https://github.com/mozilla/TTS/tree/99d56f7e93ccd7567beb0af8fcbd4d24c48e59e9) | [link](https://soundcloud.com/user-565970875/99d56f7-iter62410 )|First model with plain Tacotron implementation.|
|
||||
| [Tacotron-iter-170K](https://drive.google.com/open?id=16L6JbPXj6MSlNUxEStNn28GiSzi4fu1j) |LJSpeech| [e00bc66](https://github.com/mozilla/TTS/tree/e00bc66) |[link](https://soundcloud.com/user-565970875/april-13-2018-07-06pm-e00bc66-iter170k)|More stable and longer trained model.|
|
||||
| [Tacotron-iter-270K](https://drive.google.com/drive/folders/1Q6BKeEkZyxSGsocK2p_mqgzLwlNvbHFJ?usp=sharing)|LJSpeech|[256ed63](https://github.com/mozilla/TTS/tree/256ed63)|[link](https://soundcloud.com/user-565970875/sets/samples-1650226)|Stop-Token prediction is added, to detect end of speech.|
|
||||
| [Tacotron-iter-120K](https://drive.google.com/open?id=1A5Hr6aSvfGgIiE20mBkpzyn3vvbR2APj) |LJSpeech| [bf7590](https://github.com/mozilla/TTS/tree/bf7590) | [link](https://soundcloud.com/user-565970875/sets/september-26-2018-bf7590) | Better for longer sentences |
|
||||
|[Tacotron-iter-108K](https://drive.google.com/open?id=1deQ2akq9cuyreda0DgZOiBdydkbgseWP)| TWEB | [2810d57](https://github.com/mozilla/TTS/tree/2810d57) | [link](https://soundcloud.com/user-565970875/tweb-example-108k-iters-2810d57) | https://github.com/mozilla/TTS/issues/22 |
|
||||
|[Tacotron-iter-185K](https://drive.google.com/drive/folders/1GU8WGix98WrR3ayjoiirmmbLUZzwg4n0?usp=sharing) | LJSpeech | [db7f3d3](https://github.com/mozilla/TTS/tree/db7f3d3) | [link](https://soundcloud.com/user-565970875/sets/ljspeech-model-185k-iters-commit-db7f3d3) | [link](https://github.com/mozilla/TTS/issues/108) |
|
||||
|[Tacotron2-iter-260K](https://drive.google.com/open?id=1FJRjGDAqWIyZRX4CsppaIPEW8UWXCWzF)|LJSpeech|[824c091](https://github.com/mozilla/TTS/tree/824c091)|[soundcloud](https://soundcloud.com/user-565970875/ljspeech-logistic-wavernn)|[link](https://github.com/mozilla/TTS/issues/153)|
|
||||
|
||||
## Example Model Outputs
|
||||
Below you see model state after 16K iterations with batch-size 32.
|
||||
Below you see Tacotron model state after 16K iterations with batch-size 32 with LJSpeech dataset.
|
||||
|
||||
> "Recent research at Harvard has shown meditating for as little as 8 weeks can actually increase the grey matter in the parts of the brain responsible for emotional regulation and learning."
|
||||
|
||||
Audio output: [https://soundcloud.com/user-565970875/iter16k-f48c3b](https://soundcloud.com/user-565970875/iter16k-f48c3b)
|
||||
Audio examples: [https://soundcloud.com/user-565970875](https://soundcloud.com/user-565970875)
|
||||
|
||||
![example_model_output](images/example_model_output.png?raw=true)
|
||||
|
||||
## Runtime
|
||||
The most time-consuming part is the vocoder algorithm (Griffin-Lim) which runs on CPU. By setting its number of iterations, you might have faster execution with a small loss of quality. Some of the experimental values are below.
|
||||
The most time-consuming part is the vocoder algorithm (Griffin-Lim) which runs on CPU. By setting its number of iterations lower, you might have faster execution with a small loss of quality. Some of the experimental values are below.
|
||||
|
||||
Sentence: "It took me quite a long time to develop a voice, and now that I have it I'm not going to be silent."
|
||||
|
||||
Audio length is approximately 6 secs.
|
||||
|
||||
| Time (secs) | System | # GL iters |
|
||||
| ---- |:-------|:-----------|
|
||||
|2.00|GTX1080Ti|30|
|
||||
|3.01|GTX1080Ti|60|
|
||||
| Time (secs) | System | # GL iters | Model |
|
||||
| ---- |:-------|:-----------| ---- |
|
||||
|2.00|GTX1080Ti|30|Tacotron|
|
||||
|3.01|GTX1080Ti|60|Tacotron|
|
||||
|3.57|CPU|60|Tacotron|
|
||||
|5.27|GTX1080Ti|60|Tacotron2|
|
||||
|6.50|CPU|60|Tacotron2|
|
||||
|
||||
|
||||
## Datasets and Data-Loading
|
||||
TTS provides a generic dataloder easy to use for new datasets. You need to write an adaptor to format and that's all you need.Check ```datasets/preprocess.py``` to see example adaptors. After you wrote an adaptor, you need to set ```dataset``` field in ```config.json```. Do not forget other data related fields.
|
||||
TTS provides a generic dataloader that is easy to use for new datasets. You need to write a preprocessor function to integrate your own dataset. Check ```datasets/preprocess.py``` to see some examples. After writing the function, you need to set the ```dataset``` field in ```config.json```. Do not forget the other data-related fields too.
|
||||
|
||||
You can also use pre-computed features. In this case, compute features with ```extract_features.py``` and set ```dataset``` field as ```tts_cache```.
|
||||
|
||||
Example datasets, we successfully applied TTS, are linked below.
|
||||
Some of the open-source datasets to which we have successfully applied TTS are linked below.
|
||||
|
||||
- [LJ Speech](https://keithito.com/LJ-Speech-Dataset/)
|
||||
- [Nancy](http://www.cstr.ed.ac.uk/projects/blizzard/2011/lessac_blizzard2011/)
|
||||
|
@ -80,9 +87,9 @@ Example datasets, we successfully applied TTS, are linked below.
|
|||
- [M-AI-Labs](http://www.caito.de/2019/01/the-m-ailabs-speech-dataset/)
|
||||
|
||||
## Training and Fine-tuning LJ-Speech
|
||||
[Click Here](https://gist.github.com/erogol/97516ad65b44dbddb8cd694953187c5b) for hands-on **Notebook example**, training LJSpeech.
|
||||
Here you can find a [CoLab](https://gist.github.com/erogol/97516ad65b44dbddb8cd694953187c5b) notebook for a hands-on example, training LJSpeech. Or you can manually follow the guideline below.
|
||||
|
||||
Split ```metadata.csv``` into train and validation subsets respectively ```metadata_train.csv``` and ```metadata_val.csv```. Note that having a validation split does not work well as oppose to other ML problems since at the validation time model generates spectrogram slices without "Teacher-Forcing" and that leads misalignment between the ground-truth and the prediction. Therefore, validation loss does not really show the model performance. Rather, you might use all data for training and check the model performance by relying on human inspection.
|
||||
To start with, split ```metadata.csv``` into train and validation subsets, ```metadata_train.csv``` and ```metadata_val.csv``` respectively. Note that for text-to-speech, validation performance might be misleading since the loss value does not directly measure the voice quality to the human ear and it also does not measure the attention module performance. Therefore, running the model with new sentences and listening to the results is the best way to go.
|
||||
|
||||
```
|
||||
shuf metadata.csv > metadata_shuf.csv
|
||||
|
@ -90,7 +97,7 @@ head -n 12000 metadata_shuf.csv > metadata_train.csv
|
|||
tail -n 1100 metadata_shuf.csv > metadata_val.csv
|
||||
```
|
||||
|
||||
To train a new model, you need to define your own ```config.json``` file (check the example) and call with the command below.
|
||||
To train a new model, you need to define your own ```config.json``` file (check the example) and call with the command below. You also set the model architecture in ```config.json```.
|
||||
|
||||
```train.py --config_path config.json```
|
||||
|
||||
|
@ -106,14 +113,20 @@ Each run creates a new output folder and ```config.json``` is copied under this
|
|||
|
||||
In case of any error or intercepted execution, if there is no checkpoint yet under the output folder, the whole folder is going to be removed.
|
||||
|
||||
You can also enjoy Tensorboard, if you point the Tensorboard argument```--logdir``` to the experiment folder.
|
||||
You can also enjoy Tensorboard, if you point the Tensorboard argument ```--logdir``` to the experiment folder.
|
||||
|
||||
## Testing
|
||||
Best way to test your network is to use Notebooks under ```notebooks``` folder.
|
||||
|
||||
## What is new with TTS
|
||||
If you train TTS with LJSpeech dataset, you start to hear reasonable results after 12.5K iterations with batch size 32. This is the fastest training with character-based methods up to our knowledge. Out implementation is also quite robust against long sentences.
|
||||
## Contact/Getting Help
|
||||
- [Wiki](https://github.com/mozilla/TTS/wiki)
|
||||
|
||||
- [Discourse Forums](https://discourse.mozilla.org/c/tts) - If your question is not addressed in the Wiki, the Discourse Forums is the next place to look. They contain conversations on General Topics, Using TTS, and TTS Development.
|
||||
|
||||
- [Issues](https://github.com/mozilla/TTS/issues) - Finally, if all else fails, you can open an issue in our repo.
|
||||
|
||||
<!--## What is new with TTS
|
||||
If you train TTS with LJSpeech dataset, you start to hear reasonable results after 12.5K iterations with batch size 32. This is the fastest training with character-based methods to the best of our knowledge. Our implementation is also quite robust against long sentences.
|
||||
- Location sensitive attention ([ref](https://arxiv.org/pdf/1506.07503.pdf)). Attention is a vital part of text2speech models. Therefore, it is important to use an attention mechanism that suits the diagonal nature of the problem where the output strictly aligns with the text monotonically. Location sensitive attention performs better by looking into the previous alignment vectors and learns diagonal attention more easily. Yet, I believe there is a good space for research at this front to find a better solution.
|
||||
- Attention smoothing with sigmoid ([ref](https://arxiv.org/pdf/1506.07503.pdf)). Attention weights are computed by normalized sigmoid values instead of softmax for sharper values. That enables the model to pick multiple highly scored inputs for alignments while reducing the noise.
|
||||
- Weight decay ([ref](http://www.fast.ai/2018/07/02/adam-weight-decay/)). After a certain point of the training, you might observe the model over-fitting. That is, the model is able to pronounce words probably better but the speech quality gets lower and sometimes attention alignment gets disoriented.
|
||||
|
@ -122,29 +135,23 @@ If you train TTS with LJSpeech dataset, you start to hear reasonable results aft
|
|||
- Phoneme based training is enabled for easier learning and robust pronunciation. It also makes it easier to adapt TTS to most languages without worrying about language specific characters.
|
||||
- Configurable attention windowing at inference-time for robust alignment. It enforces network to only consider a certain window of encoder steps per iteration.
|
||||
- Detailed Tensorboard stats for activation, weight and gradient values per layer. It is useful to detect defects and compare networks.
|
||||
- Constant history window. Instead of using only the last frame of predictions, define a constant history queue. It enables training with gradually decreasing prediction frame (r=5 --> r=1) by only changing the last layer. For instance, you can train the model with r=5 and then fine-tune it with r=1 without any performance loss. It also solves well-known PreNet problem [#50](https://github.com/mozilla/TTS/issues/50).
|
||||
- Constant history window. Instead of using only the last frame of predictions, define a constant history queue. It enables training with gradually decreasing prediction frame (r=5 -> r=1) by only changing the last layer. For instance, you can train the model with r=5 and then fine-tune it with r=1 without any performance loss. It also solves well-known PreNet problem [#50](https://github.com/mozilla/TTS/issues/50).
|
||||
- Initialization of hidden decoder states with Embedding layers instead of zero initialization.
|
||||
|
||||
One common question is to ask why we don't use Tacotron2 architecture. According to our ablation experiments, nothing, except Location Sensitive Attention, improves the performance, given the increase in the model size.
|
||||
|
||||
Please feel free to offer new changes and pull things off. We are happy to discuss and make things better.
|
||||
|
||||
## Problems waiting to be solved.
|
||||
- Punctuations at the end of a sentence sometimes affect the pronunciation of the last word. Because punctuation sign is attended by the attention module, that forces the network to create a voice signal or at least modify the voice signal being generated for neighboring frames.
|
||||
- ~~Simpler stop-token prediction. Right now we use RNN to keep the history of the previous frames. However, we never tested, if something simpler would work as well.~~ Yet RNN based model gives more stable predictions.
|
||||
- Train for better mel-specs. Mel-spectrograms are not good enough to be fed Neural Vocoder. Easy solution to this problem is to train the model with r=1. However, in this case, model struggles to align the attention.
|
||||
- irregular words: "minute", "focus", "aren't" etc. Even though ~~it might be solved~~ (Use a better dataset like Nancy or train phonemes enabled.)
|
||||
-->
|
||||
|
||||
## Major TODOs
|
||||
- [x] Implement the model.
|
||||
- [x] Generate human-like speech on LJSpeech dataset.
|
||||
- [x] Generate human-like speech on a different dataset (Nancy) (TWEB).
|
||||
- [x] Train TTS with r=1 successfully.
|
||||
- [x] Enable process based distributed training. Similar [to] (https://github.com/fastai/imagenet-fast/).
|
||||
- [ ] Adapting Neural Vocoder. The most active work is [here] (https://github.com/erogol/WaveRNN)
|
||||
- [x] Enable process based distributed training. Similar to (https://github.com/fastai/imagenet-fast/).
|
||||
- [x] Adapting Neural Vocoder. TTS works with (https://github.com/erogol/WaveRNN)
|
||||
- [ ] Multi-speaker embedding.
|
||||
- [ ] Model optimization (model export, pruning etc.)
|
||||
|
||||
## References
|
||||
<!--## References
|
||||
- [Efficient Neural Audio Synthesis](https://arxiv.org/pdf/1802.08435.pdf)
|
||||
- [Attention-Based models for speech recognition](https://arxiv.org/pdf/1506.07503.pdf)
|
||||
- [Generating Sequences With Recurrent Neural Networks](https://arxiv.org/pdf/1308.0850.pdf)
|
||||
|
@ -153,6 +160,7 @@ Please feel free to offer new changes and pull things off. We are happy to discu
|
|||
- [WaveRNN](https://arxiv.org/pdf/1802.08435.pdf)
|
||||
- [Faster WaveNet](https://arxiv.org/abs/1611.09482)
|
||||
- [Parallel WaveNet](https://arxiv.org/abs/1711.10433)
|
||||
-->
|
||||
|
||||
### Precursor implementations
|
||||
- https://github.com/keithito/tacotron (Dataset and Test processing)
|
||||
|
|
64
config.json
64
config.json
|
@ -1,12 +1,12 @@
|
|||
{
|
||||
"model_name": "queue",
|
||||
"model_description": "Queue memory and change lower r incrementatlly",
|
||||
"run_name": "mozilla-no-loc-fattn-stopnet-sigmoid-loss_masking",
|
||||
"run_description": "using forward attention, with original prenet, loss masking,separate stopnet, sigmoid. Compare this with 4817. Pytorch DPP",
|
||||
|
||||
"audio":{
|
||||
// Audio processing parameters
|
||||
"num_mels": 80, // size of the mel spec frame.
|
||||
"num_freq": 1025, // number of stft frequency levels. Size of the linear spectrogram frame.
|
||||
"sample_rate": 22050, // wav sample-rate. If different than the original data, it is resampled.
|
||||
"sample_rate": 22050, // DATASET-RELATED: wav sample-rate. If different than the original data, it is resampled.
|
||||
"frame_length_ms": 50, // stft window length in ms.
|
||||
"frame_shift_ms": 12.5, // stft window hop-length in ms.
|
||||
"preemphasis": 0.98, // pre-emphasis to reduce spec noise and make it more structured. If 0.0, no -pre-emphasis.
|
||||
|
@ -19,8 +19,8 @@
|
|||
"symmetric_norm": false, // move normalization to range [-1, 1]
|
||||
"max_norm": 1, // scale normalization to range [-max_norm, max_norm] or [0, max_norm]
|
||||
"clip_norm": true, // clip normalized values into the range.
|
||||
"mel_fmin": null, // minimum freq level for mel-spec. ~50 for male and ~95 for female voices. Tune for dataset!!
|
||||
"mel_fmax": null, // maximum freq level for mel-spec. Tune for dataset!!
|
||||
"mel_fmin": 0.0, // minimum freq level for mel-spec. ~50 for male and ~95 for female voices. Tune for dataset!!
|
||||
"mel_fmax": 8000.0, // maximum freq level for mel-spec. Tune for dataset!!
|
||||
"do_trim_silence": true // enable trimming of silence from audio as you load it. LJspeech (false), TWEB (false), Nancy (true)
|
||||
},
|
||||
|
||||
|
@ -29,38 +29,52 @@
|
|||
"url": "tcp:\/\/localhost:54321"
|
||||
},
|
||||
|
||||
"embedding_size": 256, // Character embedding vector length. You don't need to change it in general.
|
||||
"text_cleaner": "phoneme_cleaners",
|
||||
"reinit_layers": [],
|
||||
|
||||
"model": "Tacotron2", // one of the model in models/
|
||||
"grad_clip": 1, // upper limit for gradients for clipping.
|
||||
"epochs": 1000, // total number of epochs to train.
|
||||
"lr": 0.0001, // Initial learning rate. If Noam decay is active, maximum learning rate.
|
||||
"lr_decay": false, // if true, Noam learning rate decaying is applied through training.
|
||||
"loss_weight": 0.0, // loss weight to emphasize lower frequencies. Lower frequencies are in general more important for speech signals.
|
||||
"warmup_steps": 4000, // Noam decay steps to increase the learning rate from 0 to "lr"
|
||||
"windowing": false, // Enables attention windowing. Used only in eval mode.
|
||||
"memory_size": 5, // memory queue size used to queue network predictions to feed autoregressive connection. Useful if r < 5.
|
||||
"memory_size": 5, // ONLY TACOTRON - memory queue size used to queue network predictions to feed autoregressive connection. Useful if r < 5.
|
||||
"attention_norm": "sigmoid", // softmax or sigmoid. Suggested to use softmax for Tacotron2 and sigmoid for Tacotron.
|
||||
"prenet_type": "original", // ONLY TACOTRON2 - "original" or "bn".
|
||||
"prenet_dropout": true, // ONLY TACOTRON2 - enable/disable dropout at prenet.
|
||||
"use_forward_attn": true, // ONLY TACOTRON2 - if it uses forward attention. In general, it aligns faster.
|
||||
"transition_agent": false, // ONLY TACOTRON2 - enable/disable transition agent of forward attention.
|
||||
"location_attn": false, // ONLY TACOTRON2 - enable_disable location sensitive attention. It is enabled for TACOTRON by default.
|
||||
"loss_masking": true, // enable / disable loss masking against the sequence padding.
|
||||
"enable_eos_bos_chars": false, // enable/disable beginning of sentence and end of sentence chars.
|
||||
"stopnet": true, // Train stopnet predicting the end of synthesis.
|
||||
"separate_stopnet": true, // Train stopnet separately if 'stopnet==true'. It prevents the stopnet loss from influencing the rest of the model. It yields a better model, but it trains SLOWER.
|
||||
"tb_model_param_stats": false, // true, plots param stats per layer on tensorboard. Might be memory consuming, but good for debugging.
|
||||
|
||||
"batch_size": 32, // Batch size for training. Lower values than 32 might cause hard to learn attention.
|
||||
"eval_batch_size":32,
|
||||
"r": 5, // Number of frames to predict for step.
|
||||
"eval_batch_size":16,
|
||||
"r": 1, // Number of frames to predict for step.
|
||||
"wd": 0.000001, // Weight decay weight.
|
||||
"checkpoint": true, // If true, it saves checkpoints per "save_step"
|
||||
"save_step": 5000, // Number of training steps expected to save traning stats and checkpoints.
|
||||
"print_step": 50, // Number of steps to log traning on console.
|
||||
"tb_model_param_stats": false, // true, plots param stats per layer on tensorboard. Might be memory consuming, but good for debugging.
|
||||
"batch_group_size": 8, //Number of batches to shuffle after bucketing.
|
||||
"save_step": 1000, // Number of training steps expected to save traning stats and checkpoints.
|
||||
"print_step": 10, // Number of steps to log traning on console.
|
||||
"batch_group_size": 0, //Number of batches to shuffle after bucketing.
|
||||
|
||||
"run_eval": true,
|
||||
"test_delay_epochs": 100, //Until attention is aligned, testing only wastes computation time.
|
||||
"data_path": "/media/erogol/data_ssd/Data/LJSpeech-1.1", // DATASET-RELATED: can overwritten from command argument
|
||||
"meta_file_train": "metadata_train.csv", // DATASET-RELATED: metafile for training dataloader.
|
||||
"meta_file_val": "metadata_val.csv", // DATASET-RELATED: metafile for evaluation dataloader.
|
||||
"dataset": "ljspeech", // DATASET-RELATED: one of TTS.dataset.preprocessors depending on your target dataset. Use "tts_cache" for pre-computed dataset by extract_features.py
|
||||
"test_delay_epochs": 5, //Until attention is aligned, testing only wastes computation time.
|
||||
"test_sentences_file": null, // set a file to load sentences to be used for testing. If it is null then we use default english sentences.
|
||||
"data_path": "/media/erogol/data_ssd/Data/Mozilla/", // DATASET-RELATED: can be overwritten by the command-line argument
|
||||
"meta_file_train": "metadata_train.txt", // DATASET-RELATED: metafile for training dataloader.
|
||||
"meta_file_val": "metadata_val.txt", // DATASET-RELATED: metafile for evaluation dataloader.
|
||||
"dataset": "mozilla", // DATASET-RELATED: one of TTS.dataset.preprocessors depending on your target dataset. Use "tts_cache" for pre-computed dataset by extract_features.py
|
||||
"min_seq_len": 0, // DATASET-RELATED: minimum text length to use in training
|
||||
"max_seq_len": 300, // DATASET-RELATED: maximum text length
|
||||
"output_path": "/media/erogol/data_ssd/Data/models/ljspeech_models/", // DATASET-RELATED: output path for all training outputs.
|
||||
"num_loader_workers": 8, // number of training data loader processes. Don't set it too big. 4-8 are good values.
|
||||
"max_seq_len": 150, // DATASET-RELATED: maximum text length
|
||||
"output_path": "../keep/", // DATASET-RELATED: output path for all training outputs.
|
||||
"num_loader_workers": 4, // number of training data loader processes. Don't set it too big. 4-8 are good values.
|
||||
"num_val_loader_workers": 4, // number of evaluation data loader processes.
|
||||
"phoneme_cache_path": "ljspeech_us_phonemes", // phoneme computation is slow, therefore, it caches results in the given folder.
|
||||
"phoneme_cache_path": "mozilla_us_phonemes", // phoneme computation is slow, therefore, it caches results in the given folder.
|
||||
"use_phonemes": true, // use phonemes instead of raw characters. It is suggested for better pronunciation.
|
||||
"phoneme_language": "en-us" // depending on your target language, pick one from https://github.com/bootphon/phonemizer#languages
|
||||
"phoneme_language": "en-us", // depending on your target language, pick one from https://github.com/bootphon/phonemizer#languages
|
||||
"text_cleaner": "phoneme_cleaners"
|
||||
}
|
||||
|
||||
|
|
|
@ -1,13 +1,12 @@
|
|||
{
|
||||
"model_name": "tts-master",
|
||||
"model_description": "tts master with symbols update",
|
||||
"run_name": "mozilla-no-loc-fattn-stopnet-sigmoid-loss_masking",
|
||||
"run_description": "using forward attention, with original prenet, loss masking,separate stopnet, sigmoid. Compare this with 4817. Pytorch DPP",
|
||||
|
||||
"audio":{
|
||||
"audio_processor": "audio", // to use dictate different audio processors, if available.
|
||||
// Audio processing parameters
|
||||
"num_mels": 80, // size of the mel spec frame.
|
||||
"num_freq": 1025, // number of stft frequency levels. Size of the linear spectogram frame.
|
||||
"sample_rate": 22050, // wav sample-rate. If different than the original data, it is resampled.
|
||||
"sample_rate": 22050, // DATASET-RELATED: wav sample-rate. If different than the original data, it is resampled.
|
||||
"frame_length_ms": 50, // stft window length in ms.
|
||||
"frame_shift_ms": 12.5, // stft window hop-lengh in ms.
|
||||
"preemphasis": 0.98, // pre-emphasis to reduce spec noise and make it more structured. If 0.0, no -pre-emphasis.
|
||||
|
@ -20,8 +19,8 @@
|
|||
"symmetric_norm": false, // move normalization to range [-1, 1]
|
||||
"max_norm": 1, // scale normalization to range [-max_norm, max_norm] or [0, max_norm]
|
||||
"clip_norm": true, // clip normalized values into the range.
|
||||
"mel_fmin": null, // minimum freq level for mel-spec. ~50 for male and ~95 for female voices. Tune for dataset!!
|
||||
"mel_fmax": null, // maximum freq level for mel-spec. Tune for dataset!!
|
||||
"mel_fmin": 0.0, // minimum freq level for mel-spec. ~50 for male and ~95 for female voices. Tune for dataset!!
|
||||
"mel_fmax": 8000.0, // maximum freq level for mel-spec. Tune for dataset!!
|
||||
"do_trim_silence": true // enable trimming of slience of audio as you load it. LJspeech (false), TWEB (false), Nancy (true)
|
||||
},
|
||||
|
||||
|
@ -30,36 +29,52 @@
|
|||
"url": "tcp:\/\/localhost:54321"
|
||||
},
|
||||
|
||||
"embedding_size": 256, // Character embedding vector length. You don't need to change it in general.
|
||||
"text_cleaner": "phoneme_cleaners",
|
||||
"reinit_layers": [],
|
||||
|
||||
"model": "Tacotron2", // one of the model in models/
|
||||
"grad_clip": 1, // upper limit for gradients for clipping.
|
||||
"epochs": 1000, // total number of epochs to train.
|
||||
"lr": 0.0001, // Initial learning rate. If Noam decay is active, maximum learning rate.
|
||||
"lr_decay": false, // if true, Noam learning rate decaying is applied through training.
|
||||
"loss_weight": 0.0, // loss weight to emphasize lower frequencies. Lower frequencies are in general more important for speech signals.
|
||||
"warmup_steps": 4000, // Noam decay steps to increase the learning rate from 0 to "lr"
|
||||
"windowing": false, // Enables attention windowing. Used only in eval mode.
|
||||
"memory_size": 5, // memory queue size used to queue network predictions to feed autoregressive connection. Useful if r < 5.
|
||||
|
||||
"batch_size": 32, // Batch size for training. Lower values than 32 might cause hard to learn attention.
|
||||
"eval_batch_size":32,
|
||||
"r": 5, // Number of frames to predict for step.
|
||||
"wd": 0.00001, // Weight decay weight.
|
||||
"checkpoint": true, // If true, it saves checkpoints per "save_step"
|
||||
"save_step": 5000, // Number of training steps expected to save training stats and checkpoints.
|
||||
"print_step": 50, // Number of steps to log training on console.
|
||||
"memory_size": 5, // ONLY TACOTRON - memory queue size used to queue network predictions to feed autoregressive connection. Useful if r < 5.
|
||||
"attention_norm": "sigmoid", // softmax or sigmoid. Suggested to use softmax for Tacotron2 and sigmoid for Tacotron.
|
||||
"prenet_type": "original", // ONLY TACOTRON2 - "original" or "bn".
|
||||
"prenet_dropout": true, // ONLY TACOTRON2 - enable/disable dropout at prenet.
|
||||
"use_forward_attn": true, // ONLY TACOTRON2 - if it uses forward attention. In general, it aligns faster.
|
||||
"transition_agent": false, // ONLY TACOTRON2 - enable/disable transition agent of forward attention.
|
||||
"location_attn": false, // ONLY TACOTRON2 - enable_disable location sensitive attention. It is enabled for TACOTRON by default.
|
||||
"loss_masking": true, // enable / disable loss masking against the sequence padding.
|
||||
"enable_eos_bos_chars": false, // enable/disable beginning of sentence and end of sentence chars.
|
||||
"stopnet": true, // Train stopnet predicting the end of synthesis.
|
||||
"separate_stopnet": true, // Train stopnet separately if 'stopnet==true'. It prevents the stopnet loss from influencing the rest of the model. It causes a better model, but it trains SLOWER.
|
||||
"tb_model_param_stats": false, // true, plots param stats per layer on tensorboard. Might be memory consuming, but good for debugging.
|
||||
|
||||
"batch_size": 32, // Batch size for training. Lower values than 32 might cause hard to learn attention.
|
||||
"eval_batch_size":16,
|
||||
"r": 1, // Number of frames to predict for step.
|
||||
"wd": 0.000001, // Weight decay weight.
|
||||
"checkpoint": true, // If true, it saves checkpoints per "save_step"
|
||||
"save_step": 1000, // Number of training steps expected to save training stats and checkpoints.
|
||||
"print_step": 10, // Number of steps to log training on console.
|
||||
"batch_group_size": 0, //Number of batches to shuffle after bucketing.
|
||||
|
||||
"run_eval": true,
|
||||
"data_path": "/media/erogol/data_ssd/Data/LJSpeech-1.1", // DATASET-RELATED: can be overwritten from the command-line argument
|
||||
"meta_file_train": "metadata_train.csv", // DATASET-RELATED: metafile for training dataloader.
|
||||
"meta_file_val": "metadata_val.csv", // DATASET-RELATED: metafile for evaluation dataloader.
|
||||
"dataset": "ljspeech", // DATASET-RELATED: one of TTS.dataset.preprocessors depending on your target dataset. Use "tts_cache" for pre-computed dataset by extract_features.py
|
||||
"test_delay_epochs": 5, //Until attention is aligned, testing only wastes computation time.
|
||||
"test_sentences_file": null, // set a file to load sentences to be used for testing. If it is null then we use default english sentences.
|
||||
"data_path": "/media/erogol/data_ssd/Data/Mozilla/", // DATASET-RELATED: can be overwritten from the command-line argument
|
||||
"meta_file_train": "metadata_train.txt", // DATASET-RELATED: metafile for training dataloader.
|
||||
"meta_file_val": "metadata_val.txt", // DATASET-RELATED: metafile for evaluation dataloader.
|
||||
"dataset": "mozilla", // DATASET-RELATED: one of TTS.dataset.preprocessors depending on your target dataset. Use "tts_cache" for pre-computed dataset by extract_features.py
|
||||
"min_seq_len": 0, // DATASET-RELATED: minimum text length to use in training
|
||||
"max_seq_len": 300, // DATASET-RELATED: maximum text length
|
||||
"output_path": "models/", // DATASET-RELATED: output path for all training outputs.
|
||||
"num_loader_workers": 8, // number of training data loader processes. Don't set it too big. 4-8 are good values.
|
||||
"max_seq_len": 150, // DATASET-RELATED: maximum text length
|
||||
"output_path": "../keep/", // DATASET-RELATED: output path for all training outputs.
|
||||
"num_loader_workers": 4, // number of training data loader processes. Don't set it too big. 4-8 are good values.
|
||||
"num_val_loader_workers": 4, // number of evaluation data loader processes.
|
||||
"phoneme_cache_path": "phonemes_cache", // phoneme computation is slow, therefore, it caches results in the given folder.
|
||||
"phoneme_cache_path": "mozilla_us_phonemes", // phoneme computation is slow, therefore, it caches results in the given folder.
|
||||
"use_phonemes": true, // use phonemes instead of raw characters. It is suggested for better pronunciation.
|
||||
"phoneme_language": "en-us" // depending on your target language, pick one from https://github.com/bootphon/phonemizer#languages
|
||||
}
|
||||
"phoneme_language": "en-us", // depending on your target language, pick one from https://github.com/bootphon/phonemizer#languages
|
||||
"text_cleaner": "phoneme_cleaners"
|
||||
}
|
||||
|
||||
|
|
|
@ -0,0 +1,79 @@
|
|||
{
|
||||
"run_name": "mozilla-tacotron-tagent",
|
||||
"run_description": "using forward attention with transition agent, with original prenet, loss masking, separate stopnet, sigmoid norm. Compare this with 4841",
|
||||
|
||||
"audio":{
|
||||
// Audio processing parameters
|
||||
"num_mels": 80, // size of the mel spec frame.
|
||||
"num_freq": 1025, // number of stft frequency levels. Size of the linear spectrogram frame.
|
||||
"sample_rate": 22050, // DATASET-RELATED: wav sample-rate. If different than the original data, it is resampled.
|
||||
"frame_length_ms": 50, // stft window length in ms.
|
||||
"frame_shift_ms": 12.5, // stft window hop-length in ms.
|
||||
"preemphasis": 0.98, // pre-emphasis to reduce spec noise and make it more structured. If 0.0, no -pre-emphasis.
|
||||
"min_level_db": -100, // normalization range
|
||||
"ref_level_db": 20, // reference level db, theoretically 20db is the sound of air.
|
||||
"power": 1.5, // value to sharpen wav signals after GL algorithm.
|
||||
"griffin_lim_iters": 60,// #griffin-lim iterations. 30-60 is a good range. Larger the value, slower the generation.
|
||||
// Normalization parameters
|
||||
"signal_norm": true, // normalize the spec values in range [0, 1]
|
||||
"symmetric_norm": false, // move normalization to range [-1, 1]
|
||||
"max_norm": 1, // scale normalization to range [-max_norm, max_norm] or [0, max_norm]
|
||||
"clip_norm": true, // clip normalized values into the range.
|
||||
"mel_fmin": 0.0, // minimum freq level for mel-spec. ~50 for male and ~95 for female voices. Tune for dataset!!
|
||||
"mel_fmax": 8000.0, // maximum freq level for mel-spec. Tune for dataset!!
|
||||
"do_trim_silence": true // enable trimming of silence of audio as you load it. LJspeech (false), TWEB (false), Nancy (true)
|
||||
},
|
||||
|
||||
"distributed":{
|
||||
"backend": "nccl",
|
||||
"url": "tcp:\/\/localhost:54321"
|
||||
},
|
||||
|
||||
"reinit_layers": [],
|
||||
|
||||
"model": "Tacotron", // one of the model in models/
|
||||
"grad_clip": 1, // upper limit for gradients for clipping.
|
||||
"epochs": 1000, // total number of epochs to train.
|
||||
"lr": 0.0001, // Initial learning rate. If Noam decay is active, maximum learning rate.
|
||||
"lr_decay": false, // if true, Noam learning rate decaying is applied through training.
|
||||
"warmup_steps": 4000, // Noam decay steps to increase the learning rate from 0 to "lr"
|
||||
"windowing": false, // Enables attention windowing. Used only in eval mode.
|
||||
"memory_size": 5, // ONLY TACOTRON - memory queue size used to queue network predictions to feed autoregressive connection. Useful if r < 5.
|
||||
"attention_norm": "sigmoid", // softmax or sigmoid. Suggested to use softmax for Tacotron2 and sigmoid for Tacotron.
|
||||
"prenet_type": "original", // ONLY TACOTRON2 - "original" or "bn".
|
||||
"prenet_dropout": true, // ONLY TACOTRON2 - enable/disable dropout at prenet.
|
||||
"use_forward_attn": true, // ONLY TACOTRON2 - if it uses forward attention. In general, it aligns faster.
|
||||
"transition_agent": true, // ONLY TACOTRON2 - enable/disable transition agent of forward attention.
|
||||
"location_attn": false, // ONLY TACOTRON2 - enable_disable location sensitive attention. It is enabled for TACOTRON by default.
|
||||
"loss_masking": true, // enable / disable loss masking against the sequence padding.
|
||||
"enable_eos_bos_chars": false, // enable/disable beginning of sentence and end of sentence chars.
|
||||
"stopnet": true, // Train stopnet predicting the end of synthesis.
|
||||
"separate_stopnet": true, // Train stopnet separately if 'stopnet==true'. It prevents the stopnet loss from influencing the rest of the model. It causes a better model, but it trains SLOWER.
|
||||
"tb_model_param_stats": false, // true, plots param stats per layer on tensorboard. Might be memory consuming, but good for debugging.
|
||||
|
||||
"batch_size": 32, // Batch size for training. Lower values than 32 might cause hard to learn attention.
|
||||
"eval_batch_size":16,
|
||||
"r": 5, // Number of frames to predict for step.
|
||||
"wd": 0.000001, // Weight decay weight.
|
||||
"checkpoint": true, // If true, it saves checkpoints per "save_step"
|
||||
"save_step": 1000, // Number of training steps expected to save training stats and checkpoints.
|
||||
"print_step": 10, // Number of steps to log training on console.
|
||||
"batch_group_size": 0, //Number of batches to shuffle after bucketing.
|
||||
|
||||
"run_eval": true,
|
||||
"test_delay_epochs": 5, //Until attention is aligned, testing only wastes computation time.
|
||||
"data_path": "/media/erogol/data_ssd/Data/Mozilla/", // DATASET-RELATED: can be overwritten from the command-line argument
|
||||
"meta_file_train": "metadata_train.txt", // DATASET-RELATED: metafile for training dataloader.
|
||||
"meta_file_val": "metadata_val.txt", // DATASET-RELATED: metafile for evaluation dataloader.
|
||||
"dataset": "mozilla", // DATASET-RELATED: one of TTS.dataset.preprocessors depending on your target dataset. Use "tts_cache" for pre-computed dataset by extract_features.py
|
||||
"min_seq_len": 0, // DATASET-RELATED: minimum text length to use in training
|
||||
"max_seq_len": 150, // DATASET-RELATED: maximum text length
|
||||
"output_path": "../keep/", // DATASET-RELATED: output path for all training outputs.
|
||||
"num_loader_workers": 4, // number of training data loader processes. Don't set it too big. 4-8 are good values.
|
||||
"num_val_loader_workers": 4, // number of evaluation data loader processes.
|
||||
"phoneme_cache_path": "mozilla_us_phonemes", // phoneme computation is slow, therefore, it caches results in the given folder.
|
||||
"use_phonemes": true, // use phonemes instead of raw characters. It is suggested for better pronunciation.
|
||||
"phoneme_language": "en-us", // depending on your target language, pick one from https://github.com/bootphon/phonemizer#languages
|
||||
"text_cleaner": "phoneme_cleaners"
|
||||
}
|
||||
|
|
@ -0,0 +1,94 @@
|
|||
{
|
||||
"run_name": "german-tacotron-tagent",
|
||||
"run_description": "using forward attention with transition agent, with original prenet, loss masking, separate stopnet, sigmoid norm. First run German data.",
|
||||
|
||||
"audio":{
|
||||
// Audio processing parameters
|
||||
"num_mels": 80, // size of the mel spec frame.
|
||||
"num_freq": 1025, // number of stft frequency levels. Size of the linear spectrogram frame.
|
||||
"sample_rate": 16000, // DATASET-RELATED: wav sample-rate. If different than the original data, it is resampled.
|
||||
"frame_length_ms": 50, // stft window length in ms.
|
||||
"frame_shift_ms": 12.5, // stft window hop-length in ms.
|
||||
"preemphasis": 0.98, // pre-emphasis to reduce spec noise and make it more structured. If 0.0, no -pre-emphasis.
|
||||
"min_level_db": -100, // normalization range
|
||||
"ref_level_db": 20, // reference level db, theoretically 20db is the sound of air.
|
||||
"power": 1.5, // value to sharpen wav signals after GL algorithm.
|
||||
"griffin_lim_iters": 60,// #griffin-lim iterations. 30-60 is a good range. Larger the value, slower the generation.
|
||||
// Normalization parameters
|
||||
"signal_norm": true, // normalize the spec values in range [0, 1]
|
||||
"symmetric_norm": false, // move normalization to range [-1, 1]
|
||||
"max_norm": 1, // scale normalization to range [-max_norm, max_norm] or [0, max_norm]
|
||||
"clip_norm": true, // clip normalized values into the range.
|
||||
"mel_fmin": 0.0, // minimum freq level for mel-spec. ~50 for male and ~95 for female voices. Tune for dataset!!
|
||||
"mel_fmax": 8000.0, // maximum freq level for mel-spec. Tune for dataset!!
|
||||
"do_trim_silence": true // enable trimming of silence of audio as you load it. LJspeech (false), TWEB (false), Nancy (true)
|
||||
},
|
||||
|
||||
"distributed":{
|
||||
"backend": "nccl",
|
||||
"url": "tcp:\/\/localhost:54321"
|
||||
},
|
||||
|
||||
"reinit_layers": [],
|
||||
|
||||
"model": "Tacotron", // one of the model in models/
|
||||
"grad_clip": 1, // upper limit for gradients for clipping.
|
||||
"epochs": 1000, // total number of epochs to train.
|
||||
"lr": 0.0001, // Initial learning rate. If Noam decay is active, maximum learning rate.
|
||||
"lr_decay": false, // if true, Noam learning rate decaying is applied through training.
|
||||
"warmup_steps": 4000, // Noam decay steps to increase the learning rate from 0 to "lr"
|
||||
"windowing": false, // Enables attention windowing. Used only in eval mode.
|
||||
"memory_size": 5, // ONLY TACOTRON - memory queue size used to queue network predictions to feed autoregressive connection. Useful if r < 5.
|
||||
"attention_norm": "sigmoid", // softmax or sigmoid. Suggested to use softmax for Tacotron2 and sigmoid for Tacotron.
|
||||
"prenet_type": "original", // ONLY TACOTRON2 - "original" or "bn".
|
||||
"prenet_dropout": true, // ONLY TACOTRON2 - enable/disable dropout at prenet.
|
||||
"use_forward_attn": true, // ONLY TACOTRON2 - if it uses forward attention. In general, it aligns faster.
|
||||
"transition_agent": false, // ONLY TACOTRON2 - enable/disable transition agent of forward attention.
|
||||
"location_attn": false, // ONLY TACOTRON2 - enable_disable location sensitive attention. It is enabled for TACOTRON by default.
|
||||
"loss_masking": true, // enable / disable loss masking against the sequence padding.
|
||||
"enable_eos_bos_chars": false, // enable/disable beginning of sentence and end of sentence chars.
|
||||
"stopnet": true, // Train stopnet predicting the end of synthesis.
|
||||
"separate_stopnet": true, // Train stopnet separately if 'stopnet==true'. It prevents the stopnet loss from influencing the rest of the model. It causes a better model, but it trains SLOWER.
|
||||
"tb_model_param_stats": false, // true, plots param stats per layer on tensorboard. Might be memory consuming, but good for debugging.
|
||||
|
||||
"batch_size": 32, // Batch size for training. Lower values than 32 might cause hard to learn attention.
|
||||
"eval_batch_size":16,
|
||||
"r": 5, // Number of frames to predict for step.
|
||||
"wd": 0.000001, // Weight decay weight.
|
||||
"checkpoint": true, // If true, it saves checkpoints per "save_step"
|
||||
"save_step": 1000, // Number of training steps expected to save training stats and checkpoints.
|
||||
"print_step": 10, // Number of steps to log training on console.
|
||||
"batch_group_size": 0, //Number of batches to shuffle after bucketing.
|
||||
|
||||
"run_eval": false,
|
||||
"test_sentences_file": "de_sentences.txt", // set a file to load sentences to be used for testing. If it is null then we use default english sentences.
|
||||
"test_delay_epochs": 5, //Until attention is aligned, testing only wastes computation time.
|
||||
"data_path": "/media/erogol/data_ssd/Data/Mozilla/", // DATASET-RELATED: can be overwritten from the command-line argument
|
||||
"meta_file_train": [
|
||||
"kleinzaches/metadata.csv",
|
||||
"spiegel_kaetzchen/metadata.csv",
|
||||
"herrnarnesschatz/metadata.csv",
|
||||
"maedchen_von_moorhof/metadata.csv",
|
||||
"koenigsgaukler/metadata.csv",
|
||||
"altehous/metadata.csv",
|
||||
"odysseus/metadata.csv",
|
||||
"undine/metadata.csv",
|
||||
"reise_tilsit/metadata.csv",
|
||||
"schmied_seines_glueckes/metadata.csv",
|
||||
"kammmacher/metadata.csv",
|
||||
"unterm_birnbaum/metadata.csv",
|
||||
"liebesbriefe/metadata.csv",
|
||||
"sandmann/metadata.csv"], // DATASET-RELATED: metafile for training dataloader.
|
||||
"meta_file_val": "metadata_val.txt", // DATASET-RELATED: metafile for evaluation dataloader.
|
||||
"dataset": "mailabs", // DATASET-RELATED: one of TTS.dataset.preprocessors depending on your target dataset. Use "tts_cache" for pre-computed dataset by extract_features.py
|
||||
"min_seq_len": 0, // DATASET-RELATED: minimum text length to use in training
|
||||
"max_seq_len": 150, // DATASET-RELATED: maximum text length
|
||||
"output_path": "../keep/", // DATASET-RELATED: output path for all training outputs.
|
||||
"num_loader_workers": 4, // number of training data loader processes. Don't set it too big. 4-8 are good values.
|
||||
"num_val_loader_workers": 4, // number of evaluation data loader processes.
|
||||
"phoneme_cache_path": "phoneme_cache", // phoneme computation is slow, therefore, it caches results in the given folder.
|
||||
"use_phonemes": true, // use phonemes instead of raw characters. It is suggested for better pronunciation.
|
||||
"phoneme_language": "de", // depending on your target language, pick one from https://github.com/bootphon/phonemizer#languages
|
||||
"text_cleaner": "phoneme_cleaners"
|
||||
}
|
||||
|
File diff suppressed because one or more lines are too long
|
@ -22,10 +22,10 @@ class MyDataset(Dataset):
|
|||
batch_group_size=0,
|
||||
min_seq_len=0,
|
||||
max_seq_len=float("inf"),
|
||||
cached=False,
|
||||
use_phonemes=True,
|
||||
phoneme_cache_path=None,
|
||||
phoneme_language="en-us",
|
||||
enable_eos_bos=False,
|
||||
verbose=False):
|
||||
"""
|
||||
Args:
|
||||
|
@ -42,12 +42,11 @@ class MyDataset(Dataset):
|
|||
min_seq_len (int): (0) minimum sequence length to be processed
|
||||
by the loader.
|
||||
max_seq_len (int): (float("inf")) maximum sequence length.
|
||||
cached (bool): (false) true if the given data path is created
|
||||
by extract_features.py.
|
||||
use_phonemes (bool): (true) if true, text converted to phonemes.
|
||||
phoneme_cache_path (str): path to cache phoneme features.
|
||||
phoneme_language (str): one the languages from
|
||||
https://github.com/bootphon/phonemizer#languages
|
||||
enable_eos_bos (bool): enable end of sentence and beginning of sentences characters.
|
||||
verbose (bool): print diagnostic information.
|
||||
"""
|
||||
self.root_path = root_path
|
||||
|
@ -59,10 +58,10 @@ class MyDataset(Dataset):
|
|||
self.min_seq_len = min_seq_len
|
||||
self.max_seq_len = max_seq_len
|
||||
self.ap = ap
|
||||
self.cached = cached
|
||||
self.use_phonemes = use_phonemes
|
||||
self.phoneme_cache_path = phoneme_cache_path
|
||||
self.phoneme_language = phoneme_language
|
||||
self.enable_eos_bos = enable_eos_bos
|
||||
self.verbose = verbose
|
||||
if use_phonemes and not os.path.isdir(phoneme_cache_path):
|
||||
os.makedirs(phoneme_cache_path, exist_ok=True)
|
||||
|
@ -72,16 +71,12 @@ class MyDataset(Dataset):
|
|||
print(" | > Use phonemes: {}".format(self.use_phonemes))
|
||||
if use_phonemes:
|
||||
print(" | > phoneme language: {}".format(phoneme_language))
|
||||
print(" | > Cached dataset: {}".format(self.cached))
|
||||
print(" | > Number of instances : {}".format(len(self.items)))
|
||||
self.sort_items()
|
||||
|
||||
def load_wav(self, filename):
    """Load a waveform through the audio processor.

    Args:
        filename: path handed to ``self.ap.load_wav``.

    Returns:
        Whatever ``self.ap.load_wav`` returns. If it raises
        ``RuntimeError`` a message is printed and ``None`` is returned,
        so one unreadable file does not raise here.
    """
    try:
        return self.ap.load_wav(filename)
    except RuntimeError:
        print(" !! Cannot read file : {}".format(filename))
        return None
|
||||
|
||||
def load_np(self, filename):
|
||||
data = np.load(filename).astype('float32')
|
||||
|
@ -89,7 +84,8 @@ class MyDataset(Dataset):
|
|||
|
||||
def load_phoneme_sequence(self, wav_file, text):
|
||||
file_name = os.path.basename(wav_file).split('.')[0]
|
||||
tmp_path = os.path.join(self.phoneme_cache_path, file_name+'_phoneme.npy')
|
||||
tmp_path = os.path.join(self.phoneme_cache_path,
|
||||
file_name + '_phoneme.npy')
|
||||
if os.path.isfile(tmp_path):
|
||||
try:
|
||||
text = np.load(tmp_path)
|
||||
|
@ -97,40 +93,35 @@ class MyDataset(Dataset):
|
|||
print(" > ERROR: phoneme connot be loaded for {}. Recomputing.".format(wav_file))
|
||||
text = np.asarray(
|
||||
phoneme_to_sequence(
|
||||
text, [self.cleaners], language=self.phoneme_language),
|
||||
text, [self.cleaners], language=self.phoneme_language, enable_eos_bos=self.enable_eos_bos),
|
||||
dtype=np.int32)
|
||||
np.save(tmp_path, text)
|
||||
else:
|
||||
text = np.asarray(
|
||||
phoneme_to_sequence(text, [self.cleaners], language=self.phoneme_language), dtype=np.int32)
|
||||
phoneme_to_sequence(
|
||||
text, [self.cleaners], language=self.phoneme_language, enable_eos_bos=self.enable_eos_bos),
|
||||
dtype=np.int32)
|
||||
np.save(tmp_path, text)
|
||||
return text
|
||||
|
||||
def load_data(self, idx):
|
||||
if self.cached:
|
||||
wav_name = self.items[idx][1]
|
||||
mel_name = self.items[idx][2]
|
||||
linear_name = self.items[idx][3]
|
||||
text = self.items[idx][0]
|
||||
|
||||
if wav_name.split('.')[-1] == 'npy':
|
||||
wav = self.load_np(wav_name)
|
||||
else:
|
||||
wav = np.asarray(self.load_wav(wav_name), dtype=np.float32)
|
||||
mel = self.load_np(mel_name)
|
||||
linear = self.load_np(linear_name)
|
||||
else:
|
||||
text, wav_file = self.items[idx]
|
||||
wav = np.asarray(self.load_wav(wav_file), dtype=np.float32)
|
||||
mel = None
|
||||
linear = None
|
||||
|
||||
if self.use_phonemes:
|
||||
text = self.load_phoneme_sequence(wav_file, text)
|
||||
else:
|
||||
text = np.asarray(
|
||||
text_to_sequence(text, [self.cleaners]), dtype=np.int32)
|
||||
sample = {'text': text, 'wav': wav, 'item_idx': self.items[idx][1], 'mel':mel, 'linear': linear}
|
||||
|
||||
assert text.size > 0, self.items[idx][1]
|
||||
assert wav.size > 0, self.items[idx][1]
|
||||
|
||||
sample = {
|
||||
'text': text,
|
||||
'wav': wav,
|
||||
'item_idx': self.items[idx][1]
|
||||
}
|
||||
return sample
|
||||
|
||||
def sort_items(self):
|
||||
|
@ -151,9 +142,9 @@ class MyDataset(Dataset):
|
|||
for i in range(len(new_items) // self.batch_group_size):
|
||||
offset = i * self.batch_group_size
|
||||
end_offset = offset + self.batch_group_size
|
||||
temp_items = new_items[offset : end_offset]
|
||||
temp_items = new_items[offset:end_offset]
|
||||
random.shuffle(temp_items)
|
||||
new_items[offset : end_offset] = temp_items
|
||||
new_items[offset:end_offset] = temp_items
|
||||
self.items = new_items
|
||||
|
||||
if self.verbose:
|
||||
|
@ -181,22 +172,20 @@ class MyDataset(Dataset):
|
|||
|
||||
# Puts each data field into a tensor with outer dimension batch size
|
||||
if isinstance(batch[0], collections.Mapping):
|
||||
keys = list()
|
||||
|
||||
wav = [d['wav'] for d in batch]
|
||||
item_idxs = [d['item_idx'] for d in batch]
|
||||
text = [d['text'] for d in batch]
|
||||
text_lenghts = np.array([len(d["text"]) for d in batch])
|
||||
text_lenghts, ids_sorted_decreasing = torch.sort(
|
||||
torch.LongTensor(text_lenghts), dim=0, descending=True)
|
||||
|
||||
text_lenghts = np.array([len(x) for x in text])
|
||||
max_text_len = np.max(text_lenghts)
|
||||
wav = [batch[idx]['wav'] for idx in ids_sorted_decreasing]
|
||||
item_idxs = [
|
||||
batch[idx]['item_idx'] for idx in ids_sorted_decreasing
|
||||
]
|
||||
text = [batch[idx]['text'] for idx in ids_sorted_decreasing]
|
||||
|
||||
# if specs are not computed, compute them.
|
||||
if batch[0]['mel'] is None and batch[0]['linear'] is None:
|
||||
mel = [self.ap.melspectrogram(w).astype('float32') for w in wav]
|
||||
linear = [self.ap.spectrogram(w).astype('float32') for w in wav]
|
||||
else:
|
||||
mel = [d['mel'] for d in batch]
|
||||
linear = [d['linear'] for d in batch]
|
||||
|
||||
mel_lengths = [m.shape[1] + 1 for m in mel] # +1 for zero-frame
|
||||
|
||||
# compute 'stop token' targets
|
||||
|
|
|
@ -1,179 +0,0 @@
|
|||
import os
|
||||
import random
|
||||
import numpy as np
|
||||
import collections
|
||||
import librosa
|
||||
import torch
|
||||
from tqdm import tqdm
|
||||
from torch.utils.data import Dataset
|
||||
|
||||
from utils.text import text_to_sequence
|
||||
from datasets.preprocess import tts_cache
|
||||
from utils.data import (prepare_data, pad_per_step, prepare_tensor,
|
||||
prepare_stop_target)
|
||||
|
||||
|
||||
class MyDataset(Dataset):
    """Dataset that preloads cached wav/mel/linear arrays into memory.

    Items are read with ``tts_cache(root_path, meta_file)``. Each item is
    indexed as ``item[0]`` (wav array file), ``item[1]`` (mel array file),
    ``item[2]`` (linear array file) and ``item[-1]`` (text); the three
    files are loaded with ``np.load``.
    """

    # TODO: Merge to TTSDataset.py, but it is not fast as it is supposed to be
    def __init__(self,
                 root_path,
                 meta_file,
                 outputs_per_step,
                 text_cleaner,
                 ap,
                 batch_group_size=0,
                 min_seq_len=0,
                 **kwargs):
        """Read the item list, sort it and load all features into memory.

        Args:
            root_path: dataset root, passed to ``tts_cache``.
            meta_file: meta file name, passed to ``tts_cache``.
            outputs_per_step: value handed to ``prepare_stop_target`` and
                ``prepare_tensor`` in ``collate_fn``.
            text_cleaner: cleaner name passed to ``text_to_sequence``.
            ap: audio processor; only ``ap.sample_rate`` is read here.
            batch_group_size: when > 0, items are shuffled inside
                consecutive groups of this size after sorting.
            min_seq_len: items whose text is shorter than this are dropped.
            **kwargs: accepted and ignored.
        """
        self.root_path = root_path
        self.batch_group_size = batch_group_size
        self.feat_dir = os.path.join(root_path, 'loader_data')
        self.items = tts_cache(root_path, meta_file)
        self.outputs_per_step = outputs_per_step
        self.sample_rate = ap.sample_rate
        self.cleaners = text_cleaner
        self.min_seq_len = min_seq_len
        self.wavs = None
        self.mels = None
        self.linears = None
        print(" > Reading LJSpeech from - {}".format(root_path))
        print(" | > Number of instances : {}".format(len(self.items)))
        # Sort first so the in-memory lists built by fill_data() line up
        # index-for-index with the final order of self.items.
        self.sort_items()
        self.fill_data()

    def fill_data(self):
        """Load every item's arrays and encoded text into memory.

        Runs only while both ``self.wavs`` and ``self.mels`` are ``None``.
        """
        if self.wavs is None and self.mels is None:
            self.wavs = []
            self.mels = []
            self.linears = []
            self.texts = []
            for item in tqdm(self.items):
                wav_file = item[0]
                mel_file = item[1]
                linear_file = item[2]
                text = item[-1]
                self.wavs.append(self.load_np(wav_file))
                self.mels.append(self.load_np(mel_file))
                self.linears.append(self.load_np(linear_file))
                self.texts.append(np.asarray(
                    text_to_sequence(text, [self.cleaners]), dtype=np.int32))
            print(" > Data loaded to memory")

    def load_wav(self, filename):
        """Load ``filename`` with librosa at ``self.sample_rate``.

        Returns:
            The audio time series only. ``librosa.core.load`` returns an
            ``(audio, sample_rate)`` pair; previously the whole pair was
            returned under the name ``audio``. ``None`` is returned (after
            printing a message) if loading raises ``RuntimeError``.
        """
        try:
            audio, _ = librosa.core.load(filename, sr=self.sample_rate)
        except RuntimeError:
            print(" !! Cannot read file : {}".format(filename))
            return None
        return audio

    def load_np(self, filename):
        """Load a ``.npy`` file and cast it to float32."""
        return np.load(filename).astype('float32')

    def sort_items(self):
        r"""Sort items by ascending text length and drop the too-short ones.

        The text is taken from ``item[-1]``. Items shorter than
        ``self.min_seq_len`` are ignored. When ``self.batch_group_size``
        is positive, items are then shuffled within consecutive groups of
        that size (a trailing partial group is left in sorted order).
        """
        lengths = np.array([len(ins[-1]) for ins in self.items])

        print(" | > Max length sequence {}".format(np.max(lengths)))
        print(" | > Min length sequence {}".format(np.min(lengths)))
        print(" | > Avg length sequence {}".format(np.mean(lengths)))

        new_frames = []
        ignored = []
        for idx in np.argsort(lengths):
            if lengths[idx] < self.min_seq_len:
                ignored.append(idx)
            else:
                new_frames.append(self.items[idx])
        print(" | > {} instances are ignored by min_seq_len ({})".format(
            len(ignored), self.min_seq_len))
        # shuffle batch groups
        if self.batch_group_size > 0:
            print(" | > Batch group shuffling is active.")
            for i in range(len(new_frames) // self.batch_group_size):
                offset = i * self.batch_group_size
                end_offset = offset + self.batch_group_size
                temp_frames = new_frames[offset:end_offset]
                random.shuffle(temp_frames)
                new_frames[offset:end_offset] = temp_frames
        self.items = new_frames

    def __len__(self):
        """Return the number of items kept after ``sort_items``."""
        return len(self.items)

    def __getitem__(self, idx):
        """Return the preloaded sample at ``idx`` as a dict.

        Keys: ``text``, ``wav``, ``item_idx`` (``self.items[idx][0]``),
        ``mel`` and ``linear``.
        """
        return {
            'text': self.texts[idx],
            'wav': self.wavs[idx],
            'item_idx': self.items[idx][0],
            'mel': self.mels[idx],
            'linear': self.linears[idx],
        }

    def collate_fn(self, batch):
        r"""
            Build a padded batch from a list of sample dicts:
                1. Build zero stop targets, one per mel frame, and pad
                   them with ``prepare_stop_target``.
                2. PAD text (and wav) with ``prepare_data``.
                3. PAD the precomputed linear/mel features with
                   ``prepare_tensor`` and move them to B x T x D.
                4. Convert Numpy to Torch tensors.

            Returns:
                ``(text, text_lenghts, linear, mel, mel_lengths,
                stop_targets, item_idxs)``.

            Raises:
                TypeError: if the batch elements are not mappings.
        """
        # collections.Mapping was removed in Python 3.10; the ABC lives in
        # collections.abc.
        from collections.abc import Mapping

        if not isinstance(batch[0], Mapping):
            raise TypeError(
                "batch must contain tensors, numbers, dicts or lists; "
                "found {}".format(type(batch[0])))

        # Puts each data field into a tensor with outer dimension batch size
        wav = [d['wav'] for d in batch]
        item_idxs = [d['item_idx'] for d in batch]
        text = [d['text'] for d in batch]
        mel = [d['mel'] for d in batch]
        linear = [d['linear'] for d in batch]

        text_lenghts = np.array([len(x) for x in text])
        mel_lengths = [m.shape[1] + 1 for m in mel]  # +1 for zero-frame

        # compute 'stop token' targets
        stop_targets = [
            np.array([0.] * (mel_len - 1)) for mel_len in mel_lengths
        ]

        # PAD stop targets
        stop_targets = prepare_stop_target(stop_targets,
                                           self.outputs_per_step)

        # PAD sequences with largest length of the batch
        text = prepare_data(text).astype(np.int32)
        # NOTE: the padded wav is computed as before but is not part of
        # the returned tuple.
        wav = prepare_data(wav)

        # PAD features with largest length + a zero frame
        linear = prepare_tensor(linear, self.outputs_per_step)
        mel = prepare_tensor(mel, self.outputs_per_step)

        # B x T x D
        linear = linear.transpose(0, 2, 1)
        mel = mel.transpose(0, 2, 1)

        # convert things to pytorch
        text_lenghts = torch.LongTensor(text_lenghts)
        text = torch.LongTensor(text)
        linear = torch.FloatTensor(linear)
        mel = torch.FloatTensor(mel)
        mel_lengths = torch.LongTensor(mel_lengths)
        stop_targets = torch.FloatTensor(stop_targets)

        return text, text_lenghts, linear, mel, mel_lengths, stop_targets, item_idxs
|
|
@ -1,16 +1,4 @@
|
|||
import os
|
||||
import random
|
||||
|
||||
def tts_cache(root_path, meta_file):
    """This format is set for the meta-file generated by extract_features.py

    Each non-blank line of ``root_path/meta_file`` is split on ``'| '``
    into one item (a list of column strings). The line terminator is
    removed first so the last column does not carry a trailing newline,
    and blank lines are skipped instead of yielding one-column items.

    Returns:
        list[list[str]]: the items, in shuffled order.
    """
    txt_file = os.path.join(root_path, meta_file)
    items = []
    with open(txt_file, 'r', encoding='utf8') as f:
        for line in f:
            line = line.rstrip('\r\n')
            if not line.strip():
                continue
            # text, wav_full_path, mel_name, linear_name, wav_len, mel_len
            items.append(line.split('| '))
    random.shuffle(items)
    return items
|
||||
|
||||
|
||||
def tweb(root_path, meta_file):
|
||||
|
@ -22,10 +10,9 @@ def tweb(root_path, meta_file):
|
|||
with open(txt_file, 'r') as ttf:
|
||||
for line in ttf:
|
||||
cols = line.split('\t')
|
||||
wav_file = os.path.join(root_path, cols[0]+'.wav')
|
||||
wav_file = os.path.join(root_path, cols[0] + '.wav')
|
||||
text = cols[1]
|
||||
items.append([text, wav_file])
|
||||
random.shuffle(items)
|
||||
return items
|
||||
|
||||
|
||||
|
@ -42,34 +29,39 @@ def tweb(root_path, meta_file):
|
|||
# return {'text': texts, 'wavs': wavs}
|
||||
|
||||
|
||||
def mozilla(root_path, meta_files):
|
||||
def mozilla_old(root_path, meta_file):
|
||||
"""Normalizes Mozilla meta data files to TTS format"""
|
||||
import glob
|
||||
meta_files = glob.glob(root_path + "**/batch*.txt", recursive=True)
|
||||
folders = [os.path.dirname(f.strip()) for f in meta_files]
|
||||
items = []
|
||||
for idx, meta_file in enumerate(meta_files):
|
||||
print(" | > {}".format(meta_file))
|
||||
folder = folders[idx]
|
||||
txt_file = os.path.join(root_path, meta_file)
|
||||
items = []
|
||||
with open(txt_file, 'r') as ttf:
|
||||
for line in ttf:
|
||||
cols = line.split('|')
|
||||
wav_file = os.path.join(root_path, folder, 'wavs_no_processing', cols[1].strip())
|
||||
if os.path.isfile(wav_file):
|
||||
batch_no = int(cols[1].strip().split("_")[0])
|
||||
wav_folder = "batch{}".format(batch_no)
|
||||
wav_file = os.path.join(root_path, wav_folder, "wavs_no_processing", cols[1].strip())
|
||||
text = cols[0].strip()
|
||||
items.append([text, wav_file])
|
||||
else:
|
||||
print(" > Error: {}", line)
|
||||
continue
|
||||
random.shuffle(items)
|
||||
return items
|
||||
|
||||
|
||||
def mozilla(root_path, meta_file):
    """Normalize a Mozilla meta data file to the TTS item format.

    Every line is expected to look like ``<text>|<wav file name>``. Both
    fields are stripped of surrounding whitespace and the wav name is
    resolved under ``root_path/wavs``.

    Args:
        root_path (str): dataset root folder.
        meta_file (str): meta file name, relative to ``root_path``.

    Returns:
        list: ``[text, wav_file]`` pairs in file order (not shuffled).
    """
    items = []
    with open(os.path.join(root_path, meta_file), 'r') as meta:
        for entry in meta:
            fields = entry.split('|')
            wav_name = fields[1].strip()
            transcript = fields[0].strip()
            items.append(
                [transcript, os.path.join(root_path, "wavs", wav_name)])
    return items
|
||||
|
||||
|
||||
def mailabs(root_path, meta_files):
|
||||
"""Normalizes M-AI-Labs meta data files to TTS format"""
|
||||
folders = [os.path.dirname(f.strip()) for f in meta_files.split(",")]
|
||||
meta_files = [f.strip() for f in meta_files.split(",")]
|
||||
folders = [os.path.dirname(f.strip()) for f in meta_files]
|
||||
# meta_files = [f.strip() for f in meta_files.split(",")]
|
||||
items = []
|
||||
for idx, meta_file in enumerate(meta_files):
|
||||
print(" | > {}".format(meta_file))
|
||||
|
@ -78,13 +70,13 @@ def mailabs(root_path, meta_files):
|
|||
with open(txt_file, 'r') as ttf:
|
||||
for line in ttf:
|
||||
cols = line.split('|')
|
||||
wav_file = os.path.join(root_path, folder, 'wavs', cols[0]+'.wav')
|
||||
wav_file = os.path.join(root_path, folder, 'wavs',
|
||||
cols[0] + '.wav')
|
||||
if os.path.isfile(wav_file):
|
||||
text = cols[1]
|
||||
items.append([text, wav_file])
|
||||
else:
|
||||
continue
|
||||
random.shuffle(items)
|
||||
return items
|
||||
|
||||
|
||||
|
@ -95,10 +87,9 @@ def ljspeech(root_path, meta_file):
|
|||
with open(txt_file, 'r') as ttf:
|
||||
for line in ttf:
|
||||
cols = line.split('|')
|
||||
wav_file = os.path.join(root_path, 'wavs', cols[0]+'.wav')
|
||||
wav_file = os.path.join(root_path, 'wavs', cols[0] + '.wav')
|
||||
text = cols[1]
|
||||
items.append([text, wav_file])
|
||||
random.shuffle(items)
|
||||
return items
|
||||
|
||||
|
||||
|
@ -109,8 +100,22 @@ def nancy(root_path, meta_file):
|
|||
with open(txt_file, 'r') as ttf:
|
||||
for line in ttf:
|
||||
id = line.split()[1]
|
||||
text = line[line.find('"')+1:line.rfind('"')-1]
|
||||
wav_file = root_path + 'wavn/' + id + '.wav'
|
||||
text = line[line.find('"') + 1:line.rfind('"') - 1]
|
||||
wav_file = os.path.join(root_path, "wavn", id + ".wav")
|
||||
items.append([text, wav_file])
|
||||
return items
|
||||
|
||||
|
||||
def common_voice(root_path, meta_file):
    """Normalize a Common Voice meta data file to the TTS item format.

    The file is read as tab-separated values. Lines starting with
    ``client_id`` (the header row) are skipped. For every other line the
    third column is used as the text and the second column, with ``.wav``
    appended, is resolved under ``root_path/clips``.

    Args:
        root_path (str): dataset root folder.
        meta_file (str): meta file name, relative to ``root_path``.

    Returns:
        list: ``[text, wav_file]`` pairs in random order.
    """
    entries = []
    with open(os.path.join(root_path, meta_file), 'r') as meta:
        for row in meta:
            if not row.startswith("client_id"):
                fields = row.split("\t")
                sentence = fields[2]
                clip = os.path.join(root_path, "clips", fields[1] + ".wav")
                entries.append([sentence, clip])
    random.shuffle(entries)
    return entries
|
|
@ -0,0 +1,4 @@
|
|||
Herzlieb, fragte er noch einmal, ist Papa wohl?
|
||||
Eine große Ueberraschung.
|
||||
Dann gab ihm sein kleines zärtliches Herz plötzlich ein, beide Aermchen um den Hals der Mutter zu schlingen und sie wieder und wieder zu küssen und seine weiche.
|
||||
als ob sie ihn nie mehr von sich lassen wollte, und weinte bitterlich.
|
|
@ -131,7 +131,7 @@ def main(args):
|
|||
Call train.py as a new process and pass command arguments
|
||||
"""
|
||||
CONFIG = load_config(args.config_path)
|
||||
OUT_PATH = create_experiment_folder(CONFIG.output_path, CONFIG.model_name,
|
||||
OUT_PATH = create_experiment_folder(CONFIG.output_path, CONFIG.run_name,
|
||||
True)
|
||||
stdout_path = os.path.join(OUT_PATH, "process_stdout/")
|
||||
|
||||
|
|
|
@ -1,126 +0,0 @@
|
|||
'''
|
||||
Extract spectrograms and save them to file for training
|
||||
'''
|
||||
import os
|
||||
import sys
|
||||
import time
|
||||
import glob
|
||||
import argparse
|
||||
import librosa
|
||||
import importlib
|
||||
import numpy as np
|
||||
import tqdm
|
||||
from utils.generic_utils import load_config, copy_config_file
|
||||
from utils.audio import AudioProcessor
|
||||
|
||||
from multiprocessing import Pool
|
||||
|
||||
def str2bool(value):
    """Parse a command-line boolean.

    ``argparse`` with ``type=bool`` calls ``bool()`` on the raw string, so
    any non-empty value (including "False") becomes True. This converter
    maps the usual spellings explicitly.

    Args:
        value (str or bool): raw option value.

    Returns:
        bool: the parsed flag.

    Raises:
        argparse.ArgumentTypeError: if the value is not a known spelling.
    """
    if isinstance(value, bool):
        return value
    lowered = value.strip().lower()
    if lowered in ("true", "t", "yes", "y", "1"):
        return True
    if lowered in ("false", "f", "no", "n", "0", ""):
        return False
    raise argparse.ArgumentTypeError(
        "expected a boolean value, got {!r}".format(value))


if __name__ == "__main__":
    # Command-line interface of the feature extraction script.
    parser = argparse.ArgumentParser()
    parser.add_argument('--data_path', type=str, help='Data folder.')
    parser.add_argument('--cache_path', type=str, help='Cache folder, place to output all the spectrogram files.')
    parser.add_argument(
        '--config', type=str, help='conf.json file for run settings.')
    parser.add_argument(
        "--num_proc", type=int, default=8, help="number of processes.")
    parser.add_argument(
        "--trim_silence",
        type=str2bool,
        default=False,
        help="trim silence in the voice clip.")
    parser.add_argument("--only_mel", type=str2bool, default=False, help="If True, only melsceptrogram is extracted.")
    parser.add_argument("--dataset", type=str, help="Target dataset to be processed.")
    parser.add_argument("--val_split", type=int, default=0, help="Number of instances for validation.")
    parser.add_argument("--meta_file", type=str, help="Meta data file to be used for the dataset.")
    parser.add_argument("--process_audio", type=str2bool, default=False, help="Preprocess audio files.")
    args = parser.parse_args()

    DATA_PATH = args.data_path
    CACHE_PATH = args.cache_path
    CONFIG = load_config(args.config)

    # load the right preprocessor: the function in datasets.preprocess
    # whose name is the lower-cased --dataset value
    preprocessor = importlib.import_module('datasets.preprocess')
    preprocessor = getattr(preprocessor, args.dataset.lower())
    items = preprocessor(args.data_path, args.meta_file)

    print(" > Input path: ", DATA_PATH)
    print(" > Cache path: ", CACHE_PATH)

    ap = AudioProcessor(**CONFIG.audio)
|
||||
|
||||
|
||||
def extract_mel(item):
    """Compute and cache the spectrograms of one dataset item.

    Relies on the module-level ``ap`` (audio processor), ``CACHE_PATH`` and
    ``args`` set up by the script entry point.

    Args:
        item (list): ``[text, wav_file_path]``.

    Returns:
        list: string columns of the meta-data row for this item. By default
            ``[text, wav_path, mel_path, linear_path, wav_len, mel_len]``;
            the linear path is omitted when ``args.only_mel`` is set.
    """
    text, file_path = item[0], item[1]
    x = ap.load_wav(file_path, ap.sample_rate)
    file_name = os.path.basename(file_path).replace(".wav", "")

    # mel spectrogram is always extracted; np.save appends the ".npy" suffix
    mel_path = os.path.join(CACHE_PATH, 'mel', file_name + "_mel")
    mel = ap.melspectrogram(x.astype('float32')).astype('float32')
    np.save(mel_path, mel, allow_pickle=False)
    mel_len = mel.shape[1]
    wav_len = x.shape[0]
    output = [text, file_path, mel_path + ".npy", str(wav_len), str(mel_len)]

    if not args.only_mel:
        linear_path = os.path.join(CACHE_PATH, 'linear',
                                   file_name + "_linear")
        linear = ap.spectrogram(x.astype('float32')).astype('float32')
        linear_len = linear.shape[1]
        np.save(linear_path, linear, allow_pickle=False)
        # the linear path goes right after the mel path
        output[3:3] = [linear_path + ".npy"]
        assert mel_len == linear_len

    if args.process_audio:
        audio_path = os.path.join(CACHE_PATH, 'audio', file_name + "_audio")
        np.save(audio_path, x, allow_pickle=False)
        # NOTE(review): this drops the first column (the text) and inserts
        # the audio path after the wav path; it looks like the wav path was
        # meant to be replaced instead - confirm before relying on it.
        output = [output[1], audio_path + ".npy"] + output[2:]
    return output
|
||||
|
||||
|
||||
if __name__ == "__main__":
    print(" > Number of files: %i" % (len(items)))

    # Make sure every output sub-folder exists. exist_ok keeps this working
    # when the cache folder is already there but a sub-folder is missing
    # (previously nothing was created in that case and np.save failed).
    cache_is_new = not os.path.exists(CACHE_PATH)
    os.makedirs(os.path.join(CACHE_PATH, 'mel'), exist_ok=True)
    if not args.only_mel:
        os.makedirs(os.path.join(CACHE_PATH, 'linear'), exist_ok=True)
    if args.process_audio:
        os.makedirs(os.path.join(CACHE_PATH, 'audio'), exist_ok=True)
    if cache_is_new:
        print(" > A new folder created at {}".format(CACHE_PATH))

    # Extract features
    r = []
    if args.num_proc > 1:
        print(" > Using {} processes.".format(args.num_proc))
        with Pool(args.num_proc) as p:
            r = list(
                tqdm.tqdm(
                    p.imap(extract_mel, items),
                    total=len(items)))
    else:
        print(" > Using single process run.")
        for item in items:
            print(" > ", item[1])
            r.append(extract_mel(item))

    # Save meta data: the first `val_split` rows go to the validation file,
    # the rest to the training file. Files are written as utf8 because
    # tts_cache() reads them back with encoding='utf8'.
    if args.cache_path is not None:
        splits = (
            ("tts_metadata_val.csv", r[:args.val_split]),
            ("tts_metadata.csv", r[args.val_split:]),
        )
        for meta_name, rows in splits:
            file_path = os.path.join(CACHE_PATH, meta_name)
            # `with` closes the file even if a row fails to serialize
            with open(file_path, "w", encoding="utf8") as meta_out:
                for line in rows:
                    meta_out.write("| ".join(line) + '\n')

    # copy the used config file to output path for sanity
    copy_config_file(args.config, CACHE_PATH)
|
Binary file not shown.
After Width: | Height: | Size: 193 KiB |
|
@ -1,181 +0,0 @@
|
|||
import torch
|
||||
from torch import nn
|
||||
from torch.nn import functional as F
|
||||
from utils.generic_utils import sequence_mask
|
||||
|
||||
|
||||
class BahdanauAttention(nn.Module):
    """Additive alignment scorer: ``v(tanh(W_q * query + W_a * annots))``.

    Args:
        annot_dim (int): feature size of the annotation vectors.
        query_dim (int): feature size of the query vector.
        attn_dim (int): size of the hidden attention space.
    """

    def __init__(self, annot_dim, query_dim, attn_dim):
        super().__init__()
        self.query_layer = nn.Linear(query_dim, attn_dim, bias=True)
        self.annot_layer = nn.Linear(annot_dim, attn_dim, bias=True)
        self.v = nn.Linear(attn_dim, 1, bias=False)

    def forward(self, annots, query):
        """Score every annotation against the query.

        Shapes:
            - annots: (batch, max_time, dim)
            - query: (batch, 1, dim) or (batch, dim)

        Returns:
            Unnormalized alignment scores of shape (batch, max_time).
        """
        # a 2-D query gets a time axis so it broadcasts over max_time
        query_3d = query.unsqueeze(1) if query.dim() == 2 else query
        hidden = torch.tanh(
            self.query_layer(query_3d) + self.annot_layer(annots))
        # (batch, max_time, 1) -> (batch, max_time)
        scores = self.v(hidden)
        return scores.squeeze(-1)
|
||||
|
||||
|
||||
class LocationSensitiveAttention(nn.Module):
    """Location sensitive attention following
    https://arxiv.org/pdf/1506.07503.pdf

    On top of the additive query/annotation score, a 1-D convolution over
    the two-channel ``loc`` input contributes a location term. The projected
    annotations are cached between calls until ``reset()`` is invoked.
    """

    def __init__(self,
                 annot_dim,
                 query_dim,
                 attn_dim,
                 kernel_size=31,
                 filters=32):
        super().__init__()
        self.kernel_size = kernel_size
        self.filters = filters
        # explicit zero padding on both sides, then an unpadded convolution
        half = (kernel_size - 1) // 2
        padding = [half, half]
        conv = nn.Conv1d(
            2,
            filters,
            kernel_size=kernel_size,
            stride=1,
            padding=0,
            bias=False)
        self.loc_conv = nn.Sequential(nn.ConstantPad1d(padding, 0), conv)
        self.loc_linear = nn.Linear(filters, attn_dim, bias=True)
        self.query_layer = nn.Linear(query_dim, attn_dim, bias=True)
        self.annot_layer = nn.Linear(annot_dim, attn_dim, bias=True)
        self.v = nn.Linear(attn_dim, 1, bias=False)
        self.processed_annots = None
        # self.init_layers()

    def init_layers(self):
        """Re-initialize the linear layers with Xavier-uniform weights."""
        tanh_gain = torch.nn.init.calculate_gain('tanh')
        for layer in (self.loc_linear, self.query_layer, self.annot_layer):
            torch.nn.init.xavier_uniform_(layer.weight, gain=tanh_gain)
        torch.nn.init.xavier_uniform_(
            self.v.weight,
            gain=torch.nn.init.calculate_gain('linear'))

    def reset(self):
        """Drop the cached annotation projection."""
        self.processed_annots = None

    def forward(self, annot, query, loc):
        """Score every annotation against the query and location features.

        Shapes:
            - annot: (batch, max_time, dim)
            - query: (batch, 1, dim) or (batch, dim)
            - loc: (batch, 2, max_time)

        Returns:
            Unnormalized alignment scores of shape (batch, max_time).
        """
        # a 2-D query gets a time axis so it broadcasts over max_time
        query_3d = query.unsqueeze(1) if query.dim() == 2 else query
        location_term = self.loc_linear(self.loc_conv(loc).transpose(1, 2))
        query_term = self.query_layer(query_3d)
        # the annotation projection is computed once and reused afterwards
        if self.processed_annots is None:
            self.processed_annots = self.annot_layer(annot)
        hidden = torch.tanh(
            query_term + self.processed_annots + location_term)
        # (batch, max_time, 1) -> (batch, max_time)
        return self.v(hidden).squeeze(-1)
|
||||
|
||||
|
||||
class AttentionRNNCell(nn.Module):
    def __init__(self, out_dim, rnn_dim, annot_dim, memory_dim, align_model, windowing=False):
        r"""
        General Attention RNN wrapper

        Args:
            out_dim (int): context vector feature dimension.
            rnn_dim (int): rnn hidden state dimension.
            annot_dim (int): annotation vector feature dimension.
            memory_dim (int): memory vector (decoder output) feature dimension.
            align_model (str): 'b' for Bahdanau, 'ls' Location Sensitive alignment.
            windowing (bool): attention windowing forcing monotonic attention.
                It is only active in eval mode.

        Raises:
            RuntimeError: if ``align_model`` is neither 'b' nor 'ls'.
        """
        super(AttentionRNNCell, self).__init__()
        self.align_model = align_model
        self.rnn_cell = nn.GRUCell(annot_dim + memory_dim, rnn_dim)
        self.windowing = windowing
        if self.windowing:
            self.win_back = 3
            self.win_front = 6
            self.win_idx = None
        # pick bahdanau or location sensitive attention.
        # This must be a single if/elif/else chain: with two separate `if`s
        # the 'b' choice fell through to the final `else` and always raised.
        if align_model == 'b':
            self.alignment_model = BahdanauAttention(annot_dim, rnn_dim,
                                                     out_dim)
        elif align_model == 'ls':
            self.alignment_model = LocationSensitiveAttention(
                annot_dim, rnn_dim, out_dim)
        else:
            raise RuntimeError(" Wrong alignment model name: {}. Use\
                'b' (Bahdanau) or 'ls' (Location Sensitive).".format(
                    align_model))

    def forward(self, memory, context, rnn_state, annots, atten, mask, t):
        """
        Run one attention-RNN step.

        Shapes:
            - memory: (batch, 1, dim) or (batch, dim)
            - context: (batch, dim)
            - rnn_state: (batch, out_dim)
            - annots: (batch, max_time, annot_dim)
            - atten: (batch, 2, max_time)
            - mask: (batch,)

        Args:
            t (int): decoder step; at ``t == 0`` the per-utterance state
                (cached annotations, window position) is reset.

        Returns:
            tuple: ``(rnn_output, context, alignment)`` where ``alignment``
                is normalized over the time axis.
        """
        if t == 0:
            # only the location sensitive model caches state and has reset()
            reset = getattr(self.alignment_model, "reset", None)
            if reset is not None:
                reset()
            self.win_idx = 0
        # Feed it to RNN
        # s_i = f(y_{i-1}, c_{i}, s_{i-1})
        rnn_output = self.rnn_cell(torch.cat((memory, context), -1), rnn_state)
        # Alignment
        # (batch, max_time)
        # e_{ij} = a(s_{i-1}, h_j)
        # compare by value: `is` on a string literal tests object identity
        if self.align_model == 'b':
            alignment = self.alignment_model(annots, rnn_output)
        else:
            alignment = self.alignment_model(annots, rnn_output, atten)
        if mask is not None:
            mask = mask.view(memory.size(0), -1)
            # `mask == 0` selects the padded positions for both uint8 and
            # bool masks (`1 - mask` is rejected for bool tensors)
            alignment.masked_fill_(mask == 0, -float("inf"))
        # Windowing
        if not self.training and self.windowing:
            back_win = self.win_idx - self.win_back
            front_win = self.win_idx + self.win_front
            if back_win > 0:
                alignment[:, :back_win] = -float("inf")
            # bound by the time axis of the alignment; `memory` has no
            # time axis here, its second dimension is the feature size
            if front_win < alignment.shape[1]:
                alignment[:, front_win:] = -float("inf")
            # Update the window (position taken from the first batch item)
            self.win_idx = torch.argmax(alignment, 1).long()[0].item()
        # Normalize context weight
        # alignment = F.softmax(alignment, dim=-1)
        # alignment = 5 * alignment
        squashed = torch.sigmoid(alignment)
        alignment = squashed / squashed.sum(dim=1).unsqueeze(1)
        # Attention context vector
        # (batch, 1, dim)
        # c_i = \sum_{j=1}^{T_x} \alpha_{ij} h_j
        context = torch.bmm(alignment.unsqueeze(1), annots)
        context = context.squeeze(1)
        return rnn_output, context, alignment
|
|
@ -0,0 +1,255 @@
|
|||
from math import sqrt
|
||||
import torch
|
||||
from torch.autograd import Variable
|
||||
from torch import nn
|
||||
from torch.nn import functional as F
|
||||
|
||||
|
||||
class Linear(nn.Module):
    """``torch.nn.Linear`` whose weight is Xavier-uniform initialized.

    Args:
        in_features (int): size of each input sample.
        out_features (int): size of each output sample.
        bias (bool): whether the wrapped layer has a bias term.
        init_gain (str): nonlinearity name passed to
            ``torch.nn.init.calculate_gain`` to scale the initialization.
    """

    def __init__(self,
                 in_features,
                 out_features,
                 bias=True,
                 init_gain='linear'):
        super().__init__()
        self.linear_layer = torch.nn.Linear(
            in_features, out_features, bias=bias)
        self._init_w(init_gain)

    def _init_w(self, init_gain):
        """Apply Xavier-uniform init with the gain of ``init_gain``."""
        gain = torch.nn.init.calculate_gain(init_gain)
        torch.nn.init.xavier_uniform_(self.linear_layer.weight, gain=gain)

    def forward(self, x):
        """Apply the wrapped linear layer to ``x``."""
        return self.linear_layer(x)
|
||||
|
||||
|
||||
class LinearBN(nn.Module):
    """Xavier-initialized linear layer followed by ``BatchNorm1d``.

    Args:
        in_features (int): size of each input sample.
        out_features (int): size of each output sample.
        bias (bool): whether the linear layer has a bias term.
        init_gain (str): nonlinearity name passed to
            ``torch.nn.init.calculate_gain`` to scale the initialization.
    """

    def __init__(self,
                 in_features,
                 out_features,
                 bias=True,
                 init_gain='linear'):
        super().__init__()
        self.linear_layer = torch.nn.Linear(
            in_features, out_features, bias=bias)
        self.bn = nn.BatchNorm1d(out_features)
        self._init_w(init_gain)

    def _init_w(self, init_gain):
        """Apply Xavier-uniform init with the gain of ``init_gain``."""
        gain = torch.nn.init.calculate_gain(init_gain)
        torch.nn.init.xavier_uniform_(self.linear_layer.weight, gain=gain)

    def forward(self, x):
        """Project ``x`` and batch-normalize over the feature axis.

        For a 3-D input the feature (last) axis is moved to position 1, as
        ``BatchNorm1d`` expects, and the original layout is restored after.
        """
        projected = self.linear_layer(x)
        if projected.dim() != 3:
            return self.bn(projected)
        # (d0, d1, feat) -> (d1, feat, d0) -> bn -> (d0, d1, feat)
        normed = self.bn(projected.permute(1, 2, 0))
        return normed.permute(2, 0, 1)
|
||||
|
||||
|
||||
class Prenet(nn.Module):
    """Stack of linear layers, each followed by ReLU and optional dropout.

    Args:
        in_features (int): size of the input vector.
        prenet_type (str): "original" stacks ``Linear`` layers, "bn" stacks
            ``LinearBN`` layers. For any other value no layers are created.
        prenet_dropout (bool): apply dropout (p=0.5) after every layer.
        out_features (list): output size of each layer; one layer per entry.
        bias (bool): whether the layers have a bias term.
    """

    def __init__(self,
                 in_features,
                 prenet_type="original",
                 prenet_dropout=True,
                 out_features=[256, 256],
                 bias=True):
        super().__init__()
        self.prenet_type = prenet_type
        self.prenet_dropout = prenet_dropout
        # every layer consumes the output of the previous one
        layer_sizes = list(
            zip([in_features] + out_features[:-1], out_features))
        if prenet_type == "bn":
            self.layers = nn.ModuleList(
                [LinearBN(n_in, n_out, bias=bias)
                 for n_in, n_out in layer_sizes])
        elif prenet_type == "original":
            self.layers = nn.ModuleList(
                [Linear(n_in, n_out, bias=bias)
                 for n_in, n_out in layer_sizes])

    def forward(self, x):
        """Run ``x`` through all layers."""
        for layer in self.layers:
            x = F.relu(layer(x))
            if self.prenet_dropout:
                # dropout is only active while the module is in training mode
                x = F.dropout(x, p=0.5, training=self.training)
        return x
|
||||
|
||||
|
||||
class LocationLayer(nn.Module):
    """Location features for attention: a 1-D convolution over the
    two-channel attention history followed by a linear projection.

    Args:
        attention_n_filters (int): number of convolution filters.
        attention_kernel_size (int): convolution kernel size. It should be
            odd so that the "same" padding keeps the time length unchanged.
        attention_dim (int): size of the projected location features.
    """

    def __init__(self, attention_n_filters, attention_kernel_size,
                 attention_dim):
        super(LocationLayer, self).__init__()
        # honour the configured kernel size (it used to be ignored in
        # favour of a hard-coded 31)
        self.location_conv = nn.Conv1d(
            in_channels=2,
            out_channels=attention_n_filters,
            kernel_size=attention_kernel_size,
            stride=1,
            padding=(attention_kernel_size - 1) // 2,
            bias=False)
        self.location_dense = Linear(
            attention_n_filters, attention_dim, bias=False, init_gain='tanh')

    def forward(self, attention_cat):
        """Compute location features.

        Args:
            attention_cat: tensor with 2 channels on axis 1, i.e.
                (batch, 2, max_time).

        Returns:
            Tensor of shape (batch, max_time, attention_dim).
        """
        processed_attention = self.location_conv(attention_cat)
        # channels last, so the dense layer acts on the filter axis
        processed_attention = self.location_dense(
            processed_attention.transpose(1, 2))
        return processed_attention
|
||||
|
||||
|
||||
class Attention(nn.Module):
    """Configurable attention module for the decoder.

    Combines an additive (query + inputs) score with optional location
    features, optional eval-time windowing, softmax or sigmoid
    normalization, and optional forward attention with a transition agent.
    Per-utterance state is kept on the module and must be initialized with
    ``init_states`` before the first decoding step.

    Args:
        attention_rnn_dim (int): feature size of the query.
        embedding_dim (int): feature size of the attended inputs.
        attention_dim (int): size of the hidden attention space.
        location_attention (bool): add location features to the score.
        attention_location_n_filters (int): filters of the location conv.
        attention_location_kernel_size (int): kernel of the location conv.
        windowing (bool): restrict attention to a moving window (eval only).
        norm (str): "softmax" or "sigmoid".
        forward_attn (bool): enable forward attention.
        trans_agent (bool): learn the forward-attention transition
            probability instead of keeping it at 0.5.
    """

    def __init__(self, attention_rnn_dim, embedding_dim, attention_dim,
                 location_attention, attention_location_n_filters,
                 attention_location_kernel_size, windowing, norm, forward_attn,
                 trans_agent):
        super(Attention, self).__init__()
        self.query_layer = Linear(
            attention_rnn_dim, attention_dim, bias=False, init_gain='tanh')
        self.inputs_layer = Linear(
            embedding_dim, attention_dim, bias=False, init_gain='tanh')
        self.v = Linear(attention_dim, 1, bias=True)
        if trans_agent:
            self.ta = nn.Linear(
                attention_rnn_dim + embedding_dim, 1, bias=True)
        if location_attention:
            self.location_layer = LocationLayer(
                attention_location_n_filters, attention_location_kernel_size,
                attention_dim)
        self._mask_value = -float("inf")
        self.windowing = windowing
        self.win_idx = None
        self.norm = norm
        self.forward_attn = forward_attn
        self.trans_agent = trans_agent
        self.location_attention = location_attention

    def init_win_idx(self):
        """Reset the attention window; -1 marks 'no step taken yet'."""
        self.win_idx = -1
        self.win_back = 2
        self.win_front = 6

    def init_forward_attn(self, inputs):
        """Reset forward-attention state for a new batch.

        ``alpha`` starts with (almost) all mass on the first input step and
        the transition probability ``u`` starts at 0.5.
        """
        B = inputs.shape[0]
        T = inputs.shape[1]
        self.alpha = torch.cat(
            [torch.ones([B, 1]),
             torch.zeros([B, T])[:, :-1] + 1e-7], dim=1).to(inputs.device)
        self.u = (0.5 * torch.ones([B, 1])).to(inputs.device)

    def init_location_attention(self, inputs):
        """Zero the cumulative attention weights, shape (B, T)."""
        B = inputs.shape[0]
        T = inputs.shape[1]
        self.attention_weights_cum = Variable(inputs.data.new(B, T).zero_())

    def init_states(self, inputs):
        """Reset all per-utterance state for ``inputs`` of shape (B, T, ...)."""
        B = inputs.shape[0]
        T = inputs.shape[1]
        self.attention_weights = Variable(inputs.data.new(B, T).zero_())
        if self.location_attention:
            self.init_location_attention(inputs)
        if self.forward_attn:
            self.init_forward_attn(inputs)
        if self.windowing:
            self.init_win_idx()

    def update_location_attention(self, alignments):
        """Accumulate the current alignment into the cumulative weights."""
        self.attention_weights_cum += alignments

    def get_location_attention(self, query, processed_inputs):
        """Score with query, inputs and location features.

        Returns:
            tuple: ``(energies, processed_query)``; ``energies`` has one
                value per input step.
        """
        # previous and cumulative weights form the 2 location channels
        attention_cat = torch.cat((self.attention_weights.unsqueeze(1),
                                   self.attention_weights_cum.unsqueeze(1)),
                                  dim=1)
        processed_query = self.query_layer(query.unsqueeze(1))
        processed_attention_weights = self.location_layer(attention_cat)
        energies = self.v(
            torch.tanh(processed_query + processed_attention_weights +
                       processed_inputs))
        energies = energies.squeeze(-1)
        return energies, processed_query

    def get_attention(self, query, processed_inputs):
        """Score with query and inputs only.

        Returns:
            tuple: ``(energies, processed_query)``.
        """
        processed_query = self.query_layer(query.unsqueeze(1))
        energies = self.v(torch.tanh(processed_query + processed_inputs))
        energies = energies.squeeze(-1)
        return energies, processed_query

    def apply_windowing(self, attention, inputs):
        """Mask energies outside the window around the last attended step.

        The window position is tracked for the first batch item only.
        """
        back_win = self.win_idx - self.win_back
        front_win = self.win_idx + self.win_front
        if back_win > 0:
            attention[:, :back_win] = -float("inf")
        if front_win < inputs.shape[1]:
            attention[:, front_win:] = -float("inf")
        # this is a trick to solve a special problem.
        # but it does not hurt.
        if self.win_idx == -1:
            attention[:, 0] = attention.max()
        # Update the window
        self.win_idx = torch.argmax(attention, 1).long()[0].item()
        return attention

    def apply_forward_attention(self, inputs, alignment, query):
        """Apply forward attention on top of the normalized ``alignment``.

        Returns:
            tuple: ``(context, alpha)`` where ``alpha`` is the new forward
                attention distribution over the input steps.
        """
        # forward attention: previous weights shifted one step forward
        prev_alpha = F.pad(self.alpha[:, :-1].clone(),
                           (1, 0, 0, 0)).to(inputs.device)
        # compute transition potentials
        alpha = (((1 - self.u) * self.alpha.clone().to(inputs.device) +
                  self.u * prev_alpha) + 1e-8) * alignment
        # force incremental alignment - TODO: make configurable
        if not self.training:
            _, n = prev_alpha.max(1)
            val, _ = alpha.max(1)
            for b in range(alignment.shape[0]):
                # use the peak of *this* batch item; indexing with the whole
                # tensor only worked for a batch of one
                peak = int(n[b])
                alpha[b, peak + 2:] = 0
                alpha[b, :(peak - 1)] = 0  # ignore all previous states to prevent repetition.
                # NOTE(review): for peak < 2 the index below is negative and
                # wraps to the end of the sequence - confirm this is intended.
                alpha[b, (peak - 2)] = 0.01 * val[b]  # smoothing factor for the prev step
        # compute attention weights
        self.alpha = alpha / alpha.sum(dim=1).unsqueeze(1)
        # compute context
        context = torch.bmm(self.alpha.unsqueeze(1), inputs)
        context = context.squeeze(1)
        # compute transition agent
        if self.trans_agent:
            ta_input = torch.cat([context, query.squeeze(1)], dim=-1)
            self.u = torch.sigmoid(self.ta(ta_input))
        return context, self.alpha

    def forward(self, attention_hidden_state, inputs, processed_inputs, mask):
        """Compute the attention context for one decoder step.

        Args:
            attention_hidden_state: query tensor, (batch, attention_rnn_dim).
            inputs: attended sequence, (batch, max_time, embedding_dim).
            processed_inputs: ``inputs`` already projected by
                ``inputs_layer``.
            mask: None, or a mask that is nonzero/True at valid input steps.

        Returns:
            Context tensor of shape (batch, embedding_dim). The alignment is
            stored in ``self.attention_weights``.

        Raises:
            RuntimeError: if ``self.norm`` is not "softmax" or "sigmoid".
        """
        if self.location_attention:
            attention, processed_query = self.get_location_attention(
                attention_hidden_state, processed_inputs)
        else:
            attention, processed_query = self.get_attention(
                attention_hidden_state, processed_inputs)
        # apply masking
        if mask is not None:
            # `mask == 0` selects the padded positions for both uint8 and
            # bool masks (`1 - mask` is rejected for bool tensors)
            attention.data.masked_fill_(mask == 0, self._mask_value)
        # apply windowing - only in eval mode
        if not self.training and self.windowing:
            attention = self.apply_windowing(attention, inputs)
        # normalize attention values
        if self.norm == "softmax":
            alignment = torch.softmax(attention, dim=-1)
        elif self.norm == "sigmoid":
            alignment = torch.sigmoid(attention) / torch.sigmoid(
                attention).sum(dim=1).unsqueeze(1)
        else:
            raise RuntimeError("Unknown value for attention norm type")
        if self.location_attention:
            self.update_location_attention(alignment)
        # apply forward attention if enabled
        if self.forward_attn:
            context, self.attention_weights = self.apply_forward_attention(
                inputs, alignment, attention_hidden_state)
        else:
            context = torch.bmm(alignment.unsqueeze(1), inputs)
            context = context.squeeze(1)
            self.attention_weights = alignment
        return context
|
|
@ -23,6 +23,7 @@ class L1LossMasked(nn.Module):
|
|||
loss: An average loss value masked by the length.
|
||||
"""
|
||||
# mask: (batch, max_len, 1)
|
||||
target.requires_grad = False
|
||||
mask = sequence_mask(
|
||||
sequence_length=length, max_len=target.size(1)).unsqueeze(2).float()
|
||||
mask = mask.expand_as(input)
|
||||
|
@ -50,22 +51,13 @@ class MSELossMasked(nn.Module):
|
|||
Returns:
|
||||
loss: An average loss value masked by the length.
|
||||
"""
|
||||
input = input.contiguous()
|
||||
target = target.contiguous()
|
||||
|
||||
# logits_flat: (batch * max_len, dim)
|
||||
input = input.view(-1, input.shape[-1])
|
||||
# target_flat: (batch * max_len, dim)
|
||||
target_flat = target.view(-1, target.shape[-1])
|
||||
# losses_flat: (batch * max_len, dim)
|
||||
losses_flat = functional.mse_loss(
|
||||
input, target_flat, size_average=False, reduce=False)
|
||||
# losses: (batch, max_len, dim)
|
||||
losses = losses_flat.view(*target.size())
|
||||
|
||||
# mask: (batch, max_len, 1)
|
||||
target.requires_grad = False
|
||||
mask = sequence_mask(
|
||||
sequence_length=length, max_len=target.size(1)).unsqueeze(2)
|
||||
losses = losses * mask.float()
|
||||
loss = losses.sum() / (length.float().sum() * float(target.shape[2]))
|
||||
sequence_length=length, max_len=target.size(1)).unsqueeze(2).float()
|
||||
mask = mask.expand_as(input)
|
||||
loss = functional.mse_loss(
|
||||
input * mask, target * mask, reduction="sum")
|
||||
loss = loss / mask.sum()
|
||||
return loss
|
||||
|
||||
|
|
|
@ -1,39 +1,7 @@
|
|||
# coding: utf-8
|
||||
import torch
|
||||
from torch import nn
|
||||
from .attention import AttentionRNNCell
|
||||
|
||||
|
||||
class Prenet(nn.Module):
|
||||
r""" Prenet as explained at https://arxiv.org/abs/1703.10135.
|
||||
It creates as many layers as given by 'out_features'
|
||||
|
||||
Args:
|
||||
in_features (int): size of the input vector
|
||||
out_features (int or list): size of each output sample.
|
||||
If it is a list, for each value, there is created a new layer.
|
||||
"""
|
||||
|
||||
def __init__(self, in_features, out_features=[256, 128]):
|
||||
super(Prenet, self).__init__()
|
||||
in_features = [in_features] + out_features[:-1]
|
||||
self.layers = nn.ModuleList([
|
||||
nn.Linear(in_size, out_size)
|
||||
for (in_size, out_size) in zip(in_features, out_features)
|
||||
])
|
||||
self.relu = nn.ReLU()
|
||||
self.dropout = nn.Dropout(0.5)
|
||||
# self.init_layers()
|
||||
|
||||
def init_layers(self):
|
||||
for layer in self.layers:
|
||||
torch.nn.init.xavier_uniform_(
|
||||
layer.weight, gain=torch.nn.init.calculate_gain('relu'))
|
||||
|
||||
def forward(self, inputs):
|
||||
for linear in self.layers:
|
||||
inputs = self.dropout(self.relu(linear(inputs)))
|
||||
return inputs
|
||||
from .common_layers import Prenet, Attention
|
||||
|
||||
|
||||
class BatchNormConv1d(nn.Module):
|
||||
|
@ -301,23 +269,34 @@ class Decoder(nn.Module):
|
|||
memory_size (int): size of the past window. if <= 0 memory_size = r
|
||||
"""
|
||||
|
||||
def __init__(self, in_features, memory_dim, r, memory_size, attn_windowing):
|
||||
def __init__(self, in_features, memory_dim, r, memory_size, attn_windowing,
|
||||
attn_norm, prenet_type, prenet_dropout, forward_attn,
|
||||
trans_agent, location_attn, separate_stopnet):
|
||||
super(Decoder, self).__init__()
|
||||
self.r = r
|
||||
self.in_features = in_features
|
||||
self.max_decoder_steps = 500
|
||||
self.memory_size = memory_size if memory_size > 0 else r
|
||||
self.memory_dim = memory_dim
|
||||
self.separate_stopnet = separate_stopnet
|
||||
# memory -> |Prenet| -> processed_memory
|
||||
self.prenet = Prenet(memory_dim * self.memory_size, out_features=[256, 128])
|
||||
self.prenet = Prenet(
|
||||
memory_dim * self.memory_size,
|
||||
prenet_type,
|
||||
prenet_dropout,
|
||||
out_features=[256, 128])
|
||||
# processed_inputs, processed_memory -> |Attention| -> Attention, attention, RNN_State
|
||||
self.attention_rnn = AttentionRNNCell(
|
||||
out_dim=128,
|
||||
rnn_dim=256,
|
||||
annot_dim=in_features,
|
||||
memory_dim=128,
|
||||
align_model='ls',
|
||||
windowing=attn_windowing)
|
||||
self.attention_rnn = nn.GRUCell(in_features + 128, 256)
|
||||
self.attention_layer = Attention(attention_rnn_dim=256,
|
||||
embedding_dim=in_features,
|
||||
attention_dim=128,
|
||||
location_attention=location_attn,
|
||||
attention_location_n_filters=32,
|
||||
attention_location_kernel_size=31,
|
||||
windowing=attn_windowing,
|
||||
norm=attn_norm,
|
||||
forward_attn=forward_attn,
|
||||
trans_agent=trans_agent)
|
||||
# (processed_memory | attention context) -> |Linear| -> decoder_RNN_input
|
||||
self.project_to_decoder_in = nn.Linear(256 + in_features, 256)
|
||||
# decoder_RNN_input -> |RNN| -> RNN_state
|
||||
|
@ -360,80 +339,45 @@ class Decoder(nn.Module):
|
|||
B = inputs.size(0)
|
||||
T = inputs.size(1)
|
||||
# go frame as zeros matrix
|
||||
initial_memory = self.memory_init(inputs.data.new_zeros(B).long())
|
||||
self.memory_input = self.memory_init(inputs.data.new_zeros(B).long())
|
||||
|
||||
# decoder states
|
||||
attention_rnn_hidden = self.attention_rnn_init(inputs.data.new_zeros(B).long())
|
||||
decoder_rnn_hiddens = [
|
||||
self.decoder_rnn_inits(inputs.data.new_tensor([idx]*B).long())
|
||||
self.attention_rnn_hidden = self.attention_rnn_init(
|
||||
inputs.data.new_zeros(B).long())
|
||||
self.decoder_rnn_hiddens = [
|
||||
self.decoder_rnn_inits(inputs.data.new_tensor([idx] * B).long())
|
||||
for idx in range(len(self.decoder_rnns))
|
||||
]
|
||||
current_context_vec = inputs.data.new(B, self.in_features).zero_()
|
||||
self.current_context_vec = inputs.data.new(B, self.in_features).zero_()
|
||||
# attention states
|
||||
attention = inputs.data.new(B, T).zero_()
|
||||
attention_cum = inputs.data.new(B, T).zero_()
|
||||
return (initial_memory, attention_rnn_hidden, decoder_rnn_hiddens,
|
||||
current_context_vec, attention, attention_cum)
|
||||
self.attention = inputs.data.new(B, T).zero_()
|
||||
self.attention_cum = inputs.data.new(B, T).zero_()
|
||||
# cache attention inputs
|
||||
self.processed_inputs = self.attention_layer.inputs_layer(inputs)
|
||||
|
||||
def forward(self, inputs, memory=None, mask=None):
|
||||
"""
|
||||
Decoder forward step.
|
||||
def _parse_outputs(self, outputs, attentions, stop_tokens):
|
||||
# Back to batch first
|
||||
attentions = torch.stack(attentions).transpose(0, 1)
|
||||
outputs = torch.stack(outputs).transpose(0, 1).contiguous()
|
||||
stop_tokens = torch.stack(stop_tokens).transpose(0, 1).squeeze(-1)
|
||||
return outputs, attentions, stop_tokens
|
||||
|
||||
If decoder inputs are not given (e.g., at testing time), as noted in
|
||||
Tacotron paper, greedy decoding is adapted.
|
||||
|
||||
Args:
|
||||
inputs: Encoder outputs.
|
||||
memory (None): Decoder memory (autoregression. If None (at eval-time),
|
||||
decoder outputs are used as decoder inputs. If None, it uses the last
|
||||
output as the input.
|
||||
mask (None): Attention mask for sequence padding.
|
||||
|
||||
Shapes:
|
||||
- inputs: batch x time x encoder_out_dim
|
||||
- memory: batch x #mel_specs x mel_spec_dim
|
||||
"""
|
||||
# Run greedy decoding if memory is None
|
||||
greedy = not self.training
|
||||
if memory is not None:
|
||||
memory = self._reshape_memory(memory)
|
||||
T_decoder = memory.size(0)
|
||||
outputs = []
|
||||
attentions = []
|
||||
stop_tokens = []
|
||||
t = 0
|
||||
memory_input, attention_rnn_hidden, decoder_rnn_hiddens,\
|
||||
current_context_vec, attention, attention_cum = self._init_states(inputs)
|
||||
while True:
|
||||
if t > 0:
|
||||
if memory is None:
|
||||
new_memory = outputs[-1]
|
||||
else:
|
||||
new_memory = memory[t - 1]
|
||||
# Queuing if memory size defined else use previous prediction only.
|
||||
if self.memory_size > 0:
|
||||
memory_input = torch.cat([memory_input[:, self.r * self.memory_dim:].clone(), new_memory], dim=-1)
|
||||
else:
|
||||
memory_input = new_memory
|
||||
def decode(self, inputs, mask=None):
|
||||
# Prenet
|
||||
processed_memory = self.prenet(memory_input)
|
||||
processed_memory = self.prenet(self.memory_input)
|
||||
# Attention RNN
|
||||
attention_cat = torch.cat(
|
||||
(attention.unsqueeze(1), attention_cum.unsqueeze(1)), dim=1)
|
||||
attention_rnn_hidden, current_context_vec, attention = self.attention_rnn(
|
||||
processed_memory, current_context_vec, attention_rnn_hidden,
|
||||
inputs, attention_cat, mask, t)
|
||||
del attention_cat
|
||||
attention_cum += attention
|
||||
self.attention_rnn_hidden = self.attention_rnn(torch.cat((processed_memory, self.current_context_vec), -1), self.attention_rnn_hidden)
|
||||
self.current_context_vec = self.attention_layer(self.attention_rnn_hidden, inputs, self.processed_inputs, mask)
|
||||
# Concat RNN output and attention context vector
|
||||
decoder_input = self.project_to_decoder_in(
|
||||
torch.cat((attention_rnn_hidden, current_context_vec), -1))
|
||||
torch.cat((self.attention_rnn_hidden, self.current_context_vec),
|
||||
-1))
|
||||
# Pass through the decoder RNNs
|
||||
for idx in range(len(self.decoder_rnns)):
|
||||
decoder_rnn_hiddens[idx] = self.decoder_rnns[idx](
|
||||
decoder_input, decoder_rnn_hiddens[idx])
|
||||
self.decoder_rnn_hiddens[idx] = self.decoder_rnns[idx](
|
||||
decoder_input, self.decoder_rnn_hiddens[idx])
|
||||
# Residual connection
|
||||
decoder_input = decoder_rnn_hiddens[idx] + decoder_input
|
||||
decoder_input = self.decoder_rnn_hiddens[idx] + decoder_input
|
||||
decoder_output = decoder_input
|
||||
del decoder_input
|
||||
# predict mel vectors from decoder vectors
|
||||
|
@ -442,34 +386,91 @@ class Decoder(nn.Module):
|
|||
# predict stop token
|
||||
stopnet_input = torch.cat([decoder_output, output], -1)
|
||||
del decoder_output
|
||||
if self.separate_stopnet:
|
||||
stop_token = self.stopnet(stopnet_input.detach())
|
||||
else:
|
||||
stop_token = self.stopnet(stopnet_input)
|
||||
del stopnet_input
|
||||
return output, stop_token, self.attention_layer.attention_weights
|
||||
|
||||
def _update_memory_queue(self, new_memory):
|
||||
if self.memory_size > 0:
|
||||
self.memory_input = torch.cat([
|
||||
self.memory_input[:, self.r * self.memory_dim:].clone(),
|
||||
new_memory
|
||||
],
|
||||
dim=-1)
|
||||
else:
|
||||
self.memory_input = new_memory
|
||||
|
||||
def forward(self, inputs, memory, mask):
|
||||
"""
|
||||
Args:
|
||||
inputs: Encoder outputs.
|
||||
memory: Decoder memory (autoregression. If None (at eval-time),
|
||||
decoder outputs are used as decoder inputs. If None, it uses the last
|
||||
output as the input.
|
||||
mask: Attention mask for sequence padding.
|
||||
|
||||
Shapes:
|
||||
- inputs: batch x time x encoder_out_dim
|
||||
- memory: batch x #mel_specs x mel_spec_dim
|
||||
"""
|
||||
# Run greedy decoding if memory is None
|
||||
memory = self._reshape_memory(memory)
|
||||
outputs = []
|
||||
attentions = []
|
||||
stop_tokens = []
|
||||
t = 0
|
||||
self._init_states(inputs)
|
||||
self.attention_layer.init_states(inputs)
|
||||
while len(outputs) < memory.size(0):
|
||||
if t > 0:
|
||||
new_memory = memory[t - 1]
|
||||
self._update_memory_queue(new_memory)
|
||||
output, stop_token, attention = self.decode(inputs, mask)
|
||||
outputs += [output]
|
||||
attentions += [attention]
|
||||
stop_tokens += [stop_token]
|
||||
del output
|
||||
t += 1
|
||||
if memory is not None:
|
||||
if t >= T_decoder:
|
||||
break
|
||||
else:
|
||||
if t > inputs.shape[1] / 4 and (stop_token > 0.6 or
|
||||
attention[:, -1].item() > 0.6):
|
||||
|
||||
return self._parse_outputs(outputs, attentions, stop_tokens)
|
||||
|
||||
def inference(self, inputs):
|
||||
"""
|
||||
Args:
|
||||
inputs: Encoder outputs.
|
||||
|
||||
Shapes:
|
||||
- inputs: batch x time x encoder_out_dim
|
||||
"""
|
||||
outputs = []
|
||||
attentions = []
|
||||
stop_tokens = []
|
||||
t = 0
|
||||
self._init_states(inputs)
|
||||
self.attention_layer.init_win_idx()
|
||||
self.attention_layer.init_states(inputs)
|
||||
while True:
|
||||
if t > 0:
|
||||
new_memory = outputs[-1]
|
||||
self._update_memory_queue(new_memory)
|
||||
output, stop_token, attention = self.decode(inputs, None)
|
||||
stop_token = torch.sigmoid(stop_token.data)
|
||||
outputs += [output]
|
||||
attentions += [attention]
|
||||
stop_tokens += [stop_token]
|
||||
t += 1
|
||||
if t > inputs.shape[1] / 4 and (stop_token > 0.6
|
||||
or attention[:, -1].item() > 0.6):
|
||||
break
|
||||
elif t > self.max_decoder_steps:
|
||||
print(" | > Decoder stopped with 'max_decoder_steps")
|
||||
break
|
||||
# Back to batch first
|
||||
attentions = torch.stack(attentions).transpose(0, 1)
|
||||
outputs = torch.stack(outputs).transpose(0, 1).contiguous()
|
||||
stop_tokens = torch.stack(stop_tokens).transpose(0, 1)
|
||||
return outputs, attentions, stop_tokens
|
||||
return self._parse_outputs(outputs, attentions, stop_tokens)
|
||||
|
||||
|
||||
class StopNet(nn.Module):
|
||||
r"""
|
||||
Predicting stop-token in decoder.
|
||||
|
||||
Args:
|
||||
in_features (int): feature dimension of input.
|
||||
"""
|
||||
|
@ -478,12 +479,10 @@ class StopNet(nn.Module):
|
|||
super(StopNet, self).__init__()
|
||||
self.dropout = nn.Dropout(0.1)
|
||||
self.linear = nn.Linear(in_features, 1)
|
||||
self.sigmoid = nn.Sigmoid()
|
||||
torch.nn.init.xavier_uniform_(
|
||||
self.linear.weight, gain=torch.nn.init.calculate_gain('linear'))
|
||||
|
||||
def forward(self, inputs):
|
||||
outputs = self.dropout(inputs)
|
||||
outputs = self.linear(outputs)
|
||||
outputs = self.sigmoid(outputs)
|
||||
return outputs
|
||||
|
|
|
@ -0,0 +1,334 @@
|
|||
from math import sqrt
|
||||
import torch
|
||||
from torch.autograd import Variable
|
||||
from torch import nn
|
||||
from torch.nn import functional as F
|
||||
from .common_layers import Attention, Prenet, Linear, LinearBN
|
||||
|
||||
|
||||
class ConvBNBlock(nn.Module):
    """Conv1d -> BatchNorm1d -> optional activation -> Dropout(0.5).

    Args:
        in_channels (int): number of channels of the input.
        out_channels (int): number of channels produced by the convolution.
        kernel_size (int): odd kernel width. The padding is
            ``(kernel_size - 1) // 2`` so the last dimension keeps its size.
        nonlinear (str or None): ``'relu'``, ``'tanh'`` or ``None`` for no
            activation between the batch norm and the dropout.

    Raises:
        ValueError: if ``kernel_size`` is even or ``nonlinear`` is not one
            of the supported values.
    """

    def __init__(self, in_channels, out_channels, kernel_size, nonlinear=None):
        super(ConvBNBlock, self).__init__()
        # Explicit check instead of `assert`: asserts vanish under `python -O`.
        if (kernel_size - 1) % 2 != 0:
            raise ValueError(
                "kernel_size must be odd, got {}".format(kernel_size))
        padding = (kernel_size - 1) // 2
        conv1d = nn.Conv1d(
            in_channels, out_channels, kernel_size, padding=padding)
        norm = nn.BatchNorm1d(out_channels)
        dropout = nn.Dropout(p=0.5)

        # Layer order (and therefore the Sequential indices used in
        # checkpoints) is kept: conv, norm, [activation], dropout.
        layers = [conv1d, norm]
        if nonlinear == 'relu':
            layers.append(nn.ReLU())
        elif nonlinear == 'tanh':
            layers.append(nn.Tanh())
        elif nonlinear is not None:
            # A misspelled activation used to be dropped silently.
            raise ValueError(
                "nonlinear must be 'relu', 'tanh' or None, got {!r}".format(
                    nonlinear))
        layers.append(dropout)
        self.net = nn.Sequential(*layers)

    def forward(self, x):
        """Run ``x`` through the block and return the result."""
        return self.net(x)
|
||||
|
||||
|
||||
class Postnet(nn.Module):
    """Stack of ``ConvBNBlock`` layers applied one after another.

    The first block maps ``mel_dim`` channels to 512 with tanh, the
    ``num_convs - 2`` middle blocks keep 512 channels with tanh, and the
    last block maps back to ``mel_dim`` channels without an activation.

    Args:
        mel_dim (int): channel count of the input and of the output.
        num_convs (int): total number of convolution blocks.
    """

    def __init__(self, mel_dim, num_convs=5):
        super(Postnet, self).__init__()
        # Blocks are created in first/middle/last order, same as before,
        # so parameter initialisation order is unchanged.
        entry = ConvBNBlock(mel_dim, 512, kernel_size=5, nonlinear='tanh')
        hidden = [
            ConvBNBlock(512, 512, kernel_size=5, nonlinear='tanh')
            for _ in range(1, num_convs - 1)
        ]
        exit_block = ConvBNBlock(512, mel_dim, kernel_size=5, nonlinear=None)
        self.convolutions = nn.ModuleList([entry] + hidden + [exit_block])

    def forward(self, x):
        """Feed ``x`` through every block in order and return the result."""
        hidden_state = x
        for conv_block in self.convolutions:
            hidden_state = conv_block(hidden_state)
        return hidden_state
|
||||
|
||||
|
||||
class Encoder(nn.Module):
    """Three ReLU ``ConvBNBlock`` layers followed by a bidirectional LSTM.

    Args:
        in_features (int): channel count of the input; the LSTM uses
            ``in_features / 2`` hidden units per direction.
    """

    def __init__(self, in_features=512):
        super(Encoder, self).__init__()
        conv_stack = [
            ConvBNBlock(in_features, in_features, 5, 'relu') for _ in range(3)
        ]
        self.convolutions = nn.Sequential(*conv_stack)
        self.lstm = nn.LSTM(
            in_features,
            in_features // 2,
            num_layers=1,
            batch_first=True,
            bidirectional=True)
        # LSTM state carried between calls of `inference_truncated`.
        self.rnn_state = None

    def _conv_features(self, x):
        """Apply the conv stack and swap dims 1 and 2 for the LSTM."""
        return self.convolutions(x).transpose(1, 2)

    def forward(self, x, input_lengths):
        """Encode a padded batch.

        ``input_lengths`` is moved to CPU/numpy and used to pack the
        sequences before the LSTM; the LSTM output is padded back to a
        batch-first tensor.
        """
        features = self._conv_features(x)
        lengths = input_lengths.cpu().numpy()
        packed = nn.utils.rnn.pack_padded_sequence(
            features, lengths, batch_first=True)
        self.lstm.flatten_parameters()
        packed_out, _ = self.lstm(packed)
        outputs, _ = nn.utils.rnn.pad_packed_sequence(
            packed_out, batch_first=True)
        return outputs

    def inference(self, x):
        """Encode without packing (no length information needed)."""
        features = self._conv_features(x)
        self.lstm.flatten_parameters()
        outputs, _ = self.lstm(features)
        return outputs

    def inference_truncated(self, x):
        """
        Preserve encoder state for continuous inference
        """
        features = self._conv_features(x)
        self.lstm.flatten_parameters()
        # Resume from the stored state and remember the new one.
        outputs, self.rnn_state = self.lstm(features, self.rnn_state)
        return outputs
|
||||
|
||||
|
||||
# adapted from https://github.com/NVIDIA/tacotron2/
|
||||
class Decoder(nn.Module):
    """Autoregressive decoder: prenet -> attention LSTM cell -> attention
    -> decoder LSTM cell -> linear projection, plus a stop-token predictor.

    Recurrent state (hidden/cell vectors, context, cached encoder outputs)
    is stored on the instance by ``_init_states`` and updated in place by
    ``decode``.

    Args:
        in_features (int): size of the encoder output vectors.
        inputs_dim (int): number of mel channels of one frame.
        r (int): number of frames predicted per decoder step.
        attn_win, attn_norm, forward_attn, trans_agent, location_attn:
            forwarded to ``Attention``.
        prenet_type, prenet_dropout: forwarded to ``Prenet``.
        separate_stopnet (bool): if True, the stopnet input is detached so
            the stop-token loss does not back-propagate into the decoder.
    """

    def __init__(self, in_features, inputs_dim, r, attn_win, attn_norm,
                 prenet_type, prenet_dropout, forward_attn, trans_agent,
                 location_attn, separate_stopnet):
        super(Decoder, self).__init__()
        self.mel_channels = inputs_dim
        self.r = r
        self.encoder_embedding_dim = in_features
        self.separate_stopnet = separate_stopnet
        self.attention_rnn_dim = 1024
        self.decoder_rnn_dim = 1024
        self.prenet_dim = 256
        self.max_decoder_steps = 1000
        self.gate_threshold = 0.5
        self.p_attention_dropout = 0.1
        self.p_decoder_dropout = 0.1

        self.prenet = Prenet(self.mel_channels * r, prenet_type,
                             prenet_dropout,
                             [self.prenet_dim, self.prenet_dim], bias=False)

        self.attention_rnn = nn.LSTMCell(self.prenet_dim + in_features,
                                         self.attention_rnn_dim)

        self.attention_layer = Attention(self.attention_rnn_dim, in_features,
                                         128, location_attn, 32, 31, attn_win,
                                         attn_norm, forward_attn, trans_agent)

        # The third positional argument of LSTMCell is `bias`; the original
        # passed a bare `1` there, which is spelled out here.
        self.decoder_rnn = nn.LSTMCell(self.attention_rnn_dim + in_features,
                                       self.decoder_rnn_dim, bias=True)

        self.linear_projection = Linear(self.decoder_rnn_dim + in_features,
                                        self.mel_channels * r)

        self.stopnet = nn.Sequential(
            nn.Dropout(0.1),
            Linear(
                self.decoder_rnn_dim + self.mel_channels * r,
                1,
                bias=True,
                init_gain='sigmoid'))

        # Learned initial states / go frame, looked up with index 0.
        self.attention_rnn_init = nn.Embedding(1, self.attention_rnn_dim)
        self.go_frame_init = nn.Embedding(1, self.mel_channels * r)
        self.decoder_rnn_inits = nn.Embedding(1, self.decoder_rnn_dim)
        # Last decoder input kept between `inference_truncated` calls.
        self.memory_truncated = None

    def get_go_frame(self, inputs):
        """Return the learned go frame repeated for every batch item."""
        B = inputs.size(0)
        return self.go_frame_init(inputs.data.new_zeros(B).long())

    def _init_states(self, inputs, mask, keep_states=False):
        """Reset the recurrent state and cache the encoder outputs.

        Args:
            inputs: encoder outputs; dim 0 is the batch.
            mask: attention mask handed to the attention layer (may be None).
            keep_states (bool): if True, hidden/cell/context vectors from
                the previous call are kept and only the encoder-dependent
                caches are refreshed.
        """
        B = inputs.size(0)

        if not keep_states:
            zero_idx = inputs.data.new_zeros(B).long()
            self.attention_hidden = self.attention_rnn_init(zero_idx)
            self.attention_cell = inputs.data.new_zeros(
                (B, self.attention_rnn_dim))

            self.decoder_hidden = self.decoder_rnn_inits(
                inputs.data.new_zeros(B).long())
            self.decoder_cell = inputs.data.new_zeros(
                (B, self.decoder_rnn_dim))

            self.context = inputs.data.new_zeros(
                (B, self.encoder_embedding_dim))

        self.inputs = inputs
        self.processed_inputs = self.attention_layer.inputs_layer(inputs)
        self.mask = mask

    def _reshape_memory(self, memories):
        """Group ``r`` consecutive frames per step and move steps to dim 0.

        ``(B, T, D)`` becomes ``(T // r, B, D * r)``.
        """
        memories = memories.view(
            memories.size(0), memories.size(1) // self.r, -1)
        return memories.transpose(0, 1)

    def _parse_outputs(self, outputs, stop_tokens, alignments):
        """Stack the per-step lists into batch-first tensors.

        The stacked outputs are additionally un-grouped into single frames
        of ``mel_channels`` values and returned channel-first
        (``B x mel_channels x frames``).
        """
        alignments = torch.stack(alignments).transpose(0, 1)
        stop_tokens = torch.stack(stop_tokens).transpose(0, 1)
        stop_tokens = stop_tokens.contiguous()
        outputs = torch.stack(outputs).transpose(0, 1).contiguous()
        outputs = outputs.view(outputs.size(0), -1, self.mel_channels)
        outputs = outputs.transpose(1, 2)
        return outputs, stop_tokens, alignments

    def decode(self, memory):
        """Run one decoder step and update the stored state.

        Args:
            memory: prenet output for the previous frame group.

        Returns:
            tuple: (projected output, stop-token logit, attention weights
            read from ``self.attention_layer.attention_weights``).
        """
        cell_input = torch.cat((memory, self.context), -1)
        self.attention_hidden, self.attention_cell = self.attention_rnn(
            cell_input, (self.attention_hidden, self.attention_cell))
        self.attention_hidden = F.dropout(
            self.attention_hidden, self.p_attention_dropout, self.training)
        self.attention_cell = F.dropout(
            self.attention_cell, self.p_attention_dropout, self.training)

        self.context = self.attention_layer(self.attention_hidden, self.inputs,
                                            self.processed_inputs, self.mask)

        memory = torch.cat((self.attention_hidden, self.context), -1)
        self.decoder_hidden, self.decoder_cell = self.decoder_rnn(
            memory, (self.decoder_hidden, self.decoder_cell))
        self.decoder_hidden = F.dropout(self.decoder_hidden,
                                        self.p_decoder_dropout, self.training)
        self.decoder_cell = F.dropout(self.decoder_cell,
                                      self.p_decoder_dropout, self.training)

        decoder_hidden_context = torch.cat((self.decoder_hidden, self.context),
                                           dim=1)

        decoder_output = self.linear_projection(decoder_hidden_context)

        stopnet_input = torch.cat((self.decoder_hidden, decoder_output), dim=1)

        if self.separate_stopnet:
            stop_token = self.stopnet(stopnet_input.detach())
        else:
            stop_token = self.stopnet(stopnet_input)
        return decoder_output, stop_token, self.attention_layer.attention_weights

    def forward(self, inputs, memories, mask):
        """Teacher-forced decoding.

        Args:
            inputs: encoder outputs.
            memories: ground-truth frames, batch first; each step is fed
                the previous ground-truth frame group (go frame first).
            mask: attention mask for padded encoder positions.

        Returns:
            tuple: (outputs, stop_tokens, alignments) as produced by
            ``_parse_outputs``.
        """
        memory = self.get_go_frame(inputs).unsqueeze(0)
        memories = self._reshape_memory(memories)
        memories = torch.cat((memory, memories), dim=0)
        memories = self.prenet(memories)

        self._init_states(inputs, mask=mask)
        self.attention_layer.init_states(inputs)

        outputs, stop_tokens, alignments = [], [], []
        # The last entry of `memories` is never used as an input.
        for step in range(memories.size(0) - 1):
            mel_output, stop_token, attention_weights = self.decode(
                memories[step])
            outputs += [mel_output.squeeze(1)]
            stop_tokens += [stop_token.squeeze(1)]
            alignments += [attention_weights]

        return self._parse_outputs(outputs, stop_tokens, alignments)

    def _greedy_decode(self, inputs, memory):
        """Shared free-running loop of ``inference``/``inference_truncated``.

        Each step feeds the previous prediction back through the prenet.
        Decoding stops once all stop flags have been set for more than 20
        steps, or when ``max_decoder_steps`` outputs were produced.

        Returns:
            tuple: ``((outputs, stop_tokens, alignments), last_input)``
            where ``last_input`` is the decoder input of the final step.
        """
        outputs, stop_tokens, alignments, t = [], [], [], 0
        # NOTE(review): flag 0 starts as True, so the stop-token test below
        # never influences termination; kept as in the original.
        stop_flags = [True, False, False]
        stop_count = 0
        while True:
            mel_output, stop_token, alignment = self.decode(
                self.prenet(memory))
            stop_token = torch.sigmoid(stop_token.data)
            outputs += [mel_output.squeeze(1)]
            stop_tokens += [stop_token]
            alignments += [alignment]

            stop_flags[0] = stop_flags[0] or stop_token > self.gate_threshold
            stop_flags[1] = stop_flags[1] or (alignment[0, -2:].sum() > 0.8
                                              and t > inputs.shape[1])
            stop_flags[2] = t > inputs.shape[1] * 2
            if all(stop_flags):
                stop_count += 1
                if stop_count > 20:
                    break
            elif len(outputs) >= self.max_decoder_steps:
                print(" | > Decoder stopped with 'max_decoder_steps'")
                break

            memory = mel_output
            t += 1

        parsed = self._parse_outputs(outputs, stop_tokens, alignments)
        return parsed, memory

    def inference(self, inputs):
        """Greedy decoding starting from the go frame with fresh states."""
        memory = self.get_go_frame(inputs)
        self._init_states(inputs, mask=None)

        self.attention_layer.init_win_idx()
        self.attention_layer.init_states(inputs)

        parsed, _ = self._greedy_decode(inputs, memory)
        return parsed

    def inference_truncated(self, inputs):
        """
        Preserve decoder states for continuous inference
        """
        if self.memory_truncated is None:
            self.memory_truncated = self.get_go_frame(inputs)
            self._init_states(inputs, mask=None, keep_states=False)
        else:
            self._init_states(inputs, mask=None, keep_states=True)

        self.attention_layer.init_win_idx()
        self.attention_layer.init_states(inputs)

        # Remember the last decoder input so the next call continues there.
        parsed, self.memory_truncated = self._greedy_decode(
            inputs, self.memory_truncated)
        return parsed

    def inference_step(self, inputs, t, memory=None):
        """
        For debug purposes

        Runs a single decoding step. At ``t == 0`` the go frame is used and
        decoder and attention states are reset, mirroring ``inference``;
        for later steps the caller passes the previous output as ``memory``.
        """
        if t == 0:
            memory = self.get_go_frame(inputs)
            self._init_states(inputs, mask=None)
            # Same attention reset `inference` performs before decoding.
            self.attention_layer.init_win_idx()
            self.attention_layer.init_states(inputs)

        memory = self.prenet(memory)
        mel_output, stop_token, alignment = self.decode(memory)
        stop_token = torch.sigmoid(stop_token.data)
        return mel_output, stop_token, alignment
|
|
@ -3,42 +3,58 @@ import torch
|
|||
from torch import nn
|
||||
from math import sqrt
|
||||
from layers.tacotron import Prenet, Encoder, Decoder, PostCBHG
|
||||
from utils.generic_utils import sequence_mask
|
||||
|
||||
|
||||
class Tacotron(nn.Module):
|
||||
def __init__(self,
|
||||
num_chars,
|
||||
embedding_dim=256,
|
||||
r=5,
|
||||
linear_dim=1025,
|
||||
mel_dim=80,
|
||||
r=5,
|
||||
padding_idx=None,
|
||||
memory_size=5,
|
||||
attn_windowing=False):
|
||||
attn_win=False,
|
||||
attn_norm="sigmoid",
|
||||
prenet_type="original",
|
||||
prenet_dropout=True,
|
||||
forward_attn=False,
|
||||
trans_agent=False,
|
||||
location_attn=True,
|
||||
separate_stopnet=True):
|
||||
super(Tacotron, self).__init__()
|
||||
self.r = r
|
||||
self.mel_dim = mel_dim
|
||||
self.linear_dim = linear_dim
|
||||
self.embedding = nn.Embedding(
|
||||
num_chars, embedding_dim, padding_idx=padding_idx)
|
||||
self.embedding = nn.Embedding(num_chars, 256)
|
||||
self.embedding.weight.data.normal_(0, 0.3)
|
||||
self.encoder = Encoder(embedding_dim)
|
||||
self.decoder = Decoder(256, mel_dim, r, memory_size, attn_windowing)
|
||||
self.encoder = Encoder(256)
|
||||
self.decoder = Decoder(256, mel_dim, r, memory_size, attn_win,
|
||||
attn_norm, prenet_type, prenet_dropout,
|
||||
forward_attn, trans_agent, location_attn,
|
||||
separate_stopnet)
|
||||
self.postnet = PostCBHG(mel_dim)
|
||||
self.last_linear = nn.Sequential(
|
||||
nn.Linear(self.postnet.cbhg.gru_features * 2, linear_dim),
|
||||
nn.Sigmoid())
|
||||
|
||||
def forward(self, characters, mel_specs=None, mask=None):
|
||||
def forward(self, characters, text_lengths, mel_specs):
|
||||
B = characters.size(0)
|
||||
mask = sequence_mask(text_lengths).to(characters.device)
|
||||
inputs = self.embedding(characters)
|
||||
# batch x time x dim
|
||||
encoder_outputs = self.encoder(inputs)
|
||||
# batch x time x dim*r
|
||||
mel_outputs, alignments, stop_tokens = self.decoder(
|
||||
encoder_outputs, mel_specs, mask)
|
||||
# Reshape
|
||||
# batch x time x dim
|
||||
mel_outputs = mel_outputs.view(B, -1, self.mel_dim)
|
||||
linear_outputs = self.postnet(mel_outputs)
|
||||
linear_outputs = self.last_linear(linear_outputs)
|
||||
return mel_outputs, linear_outputs, alignments, stop_tokens
|
||||
|
||||
def inference(self, characters):
|
||||
B = characters.size(0)
|
||||
inputs = self.embedding(characters)
|
||||
encoder_outputs = self.encoder(inputs)
|
||||
mel_outputs, alignments, stop_tokens = self.decoder.inference(
|
||||
encoder_outputs)
|
||||
mel_outputs = mel_outputs.view(B, -1, self.mel_dim)
|
||||
linear_outputs = self.postnet(mel_outputs)
|
||||
linear_outputs = self.last_linear(linear_outputs)
|
||||
|
|
|
@ -0,0 +1,78 @@
|
|||
from math import sqrt
|
||||
import torch
|
||||
from torch.autograd import Variable
|
||||
from torch import nn
|
||||
from torch.nn import functional as F
|
||||
from layers.tacotron2 import Encoder, Decoder, Postnet
|
||||
from utils.generic_utils import sequence_mask
|
||||
|
||||
|
||||
# TODO: match function arguments with tacotron
|
||||
class Tacotron2(nn.Module):
    """Character embedding -> Encoder -> Decoder -> Postnet (residual).

    Args:
        num_chars (int): size of the input symbol table.
        r (int): frames predicted per decoder step.
        attn_win, attn_norm, prenet_type, prenet_dropout, forward_attn,
        trans_agent, location_attn, separate_stopnet: forwarded unchanged
            to ``Decoder``.
    """

    def __init__(self,
                 num_chars,
                 r,
                 attn_win=False,
                 attn_norm="softmax",
                 prenet_type="original",
                 prenet_dropout=True,
                 forward_attn=False,
                 trans_agent=False,
                 location_attn=True,
                 separate_stopnet=True):
        super(Tacotron2, self).__init__()
        self.n_mel_channels = 80
        self.n_frames_per_step = r
        self.embedding = nn.Embedding(num_chars, 512)
        # Uniform init whose bounds correspond to std = sqrt(2 / (fan sum)).
        bound = sqrt(3.0) * sqrt(2.0 / (num_chars + 512))
        self.embedding.weight.data.uniform_(-bound, bound)
        self.encoder = Encoder(512)
        self.decoder = Decoder(512, self.n_mel_channels, r, attn_win,
                               attn_norm, prenet_type, prenet_dropout,
                               forward_attn, trans_agent, location_attn,
                               separate_stopnet)
        self.postnet = Postnet(self.n_mel_channels)

    def shape_outputs(self, mel_outputs, mel_outputs_postnet, alignments):
        """Swap dims 1 and 2 of both mel tensors; alignments pass through."""
        return (mel_outputs.transpose(1, 2),
                mel_outputs_postnet.transpose(1, 2),
                alignments)

    def _finalize(self, mel_outputs, stop_tokens, alignments):
        """Add the postnet residual and reorder the decoder results.

        Returns:
            tuple: (mel_outputs, mel_outputs_postnet, alignments,
            stop_tokens), the mel tensors already passed through
            ``shape_outputs``.
        """
        residual = self.postnet(mel_outputs)
        refined = mel_outputs + residual
        mel_outputs, refined, alignments = self.shape_outputs(
            mel_outputs, refined, alignments)
        return mel_outputs, refined, alignments, stop_tokens

    def forward(self, text, text_lengths, mel_specs=None):
        """Teacher-forced pass over a padded batch.

        Args:
            text: character ids.
            text_lengths: length of every sequence in ``text``; used for
                the padding mask and for packing inside the encoder.
            mel_specs: target frames given to the decoder.
        """
        # compute mask for padding
        mask = sequence_mask(text_lengths).to(text.device)
        embedded = self.embedding(text).transpose(1, 2)
        encoded = self.encoder(embedded, text_lengths)
        mel_outputs, stop_tokens, alignments = self.decoder(
            encoded, mel_specs, mask)
        return self._finalize(mel_outputs, stop_tokens, alignments)

    def inference(self, text):
        """Free-running synthesis for ``text`` with fresh model states."""
        embedded = self.embedding(text).transpose(1, 2)
        encoded = self.encoder.inference(embedded)
        mel_outputs, stop_tokens, alignments = self.decoder.inference(encoded)
        return self._finalize(mel_outputs, stop_tokens, alignments)

    def inference_truncated(self, text):
        """
        Preserve model states for continuous inference
        """
        embedded = self.embedding(text).transpose(1, 2)
        encoded = self.encoder.inference_truncated(embedded)
        mel_outputs, stop_tokens, alignments = self.decoder.inference_truncated(
            encoded)
        return self._finalize(mel_outputs, stop_tokens, alignments)
|
File diff suppressed because one or more lines are too long
File diff suppressed because one or more lines are too long
|
@ -0,0 +1,274 @@
|
|||
{
|
||||
"cells": [
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"This is a notebook to generate mel-spectrograms from a TTS model to be used for WaveRNN training."
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"TTS_PATH = \"/home/erogol/projects/\""
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"import os\n",
|
||||
"import sys\n",
|
||||
"sys.path.append(TTS_PATH)\n",
|
||||
"import torch\n",
|
||||
"import importlib\n",
|
||||
"import numpy as np\n",
|
||||
"from tqdm import tqdm as tqdm\n",
|
||||
"from torch.utils.data import DataLoader\n",
|
||||
"from TTS.models.tacotron2 import Tacotron2\n",
|
||||
"from TTS.datasets.TTSDataset import MyDataset\n",
|
||||
"from TTS.utils.audio import AudioProcessor\n",
|
||||
"from TTS.utils.visual import plot_spectrogram\n",
|
||||
"from TTS.utils.generic_utils import load_config\n",
|
||||
"from TTS.datasets.preprocess import ljspeech\n",
|
||||
"%matplotlib inline\n",
|
||||
"\n",
|
||||
"import os\n",
|
||||
"os.environ['CUDA_VISIBLE_DEVICES']='0'"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"def set_filename(wav_path, out_path):\n",
|
||||
" wav_file = os.path.basename(wav_path)\n",
|
||||
" file_name = wav_file.split('.')[0]\n",
|
||||
" os.makedirs(os.path.join(out_path, \"quant\"), exist_ok=True)\n",
|
||||
" os.makedirs(os.path.join(out_path, \"mel\"), exist_ok=True)\n",
|
||||
" os.makedirs(os.path.join(out_path, \"wav_gl\"), exist_ok=True)\n",
|
||||
" wavq_path = os.path.join(out_path, \"quant\", file_name)\n",
|
||||
" mel_path = os.path.join(out_path, \"mel\", file_name)\n",
|
||||
" wav_path = os.path.join(out_path, \"wav_gl\", file_name)\n",
|
||||
" return file_name, wavq_path, mel_path, wav_path"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"OUT_PATH = \"/home/erogol/Data/LJSpeech-1.1/wavernn_4152/\"\n",
|
||||
"DATA_PATH = \"/home/erogol/Data/LJSpeech-1.1/\"\n",
|
||||
"METADATA_FILE = \"metadata_train.csv\"\n",
|
||||
"CONFIG_PATH = \"/media/erogol/data_ssd/Data/models/ljspeech_models/4258_nancy/config.json\"\n",
|
||||
"MODEL_FILE = \"/home/erogol/checkpoint_92000.pth.tar\"\n",
|
||||
"DRY_RUN = True # if True, does not generate output files, only computes loss and visuals.\n",
|
||||
"BATCH_SIZE = 16\n",
|
||||
"\n",
|
||||
"use_cuda = torch.cuda.is_available()\n",
|
||||
"\n",
|
||||
"C = load_config(CONFIG_PATH)\n",
|
||||
"ap = AudioProcessor(bits=9, **C.audio)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"dataset = MyDataset(DATA_PATH, METADATA_FILE, C.r, C.text_cleaner, ap, ljspeech, use_phonemes=C.use_phonemes, phoneme_cache_path=C.phoneme_cache_path)\n",
|
||||
"loader = DataLoader(dataset, batch_size=BATCH_SIZE, num_workers=4, collate_fn=dataset.collate_fn, shuffle=False, drop_last=False)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"from utils.text.symbols import symbols, phonemes\n",
|
||||
"from utils.generic_utils import sequence_mask\n",
|
||||
"from layers.losses import L1LossMasked\n",
|
||||
"# load the model\n",
|
||||
"MyModel = importlib.import_module('TTS.models.'+C.model.lower())\n",
|
||||
"MyModel = getattr(MyModel, C.model)\n",
|
||||
"num_chars = len(phonemes) if C.use_phonemes else len(symbols)\n",
|
||||
"model = MyModel(num_chars, C.r, attn_win=False)\n",
|
||||
"checkpoint = torch.load(MODEL_FILE)\n",
|
||||
"model.load_state_dict(checkpoint['model'])\n",
|
||||
"print(checkpoint['step'])\n",
|
||||
"model.eval()\n",
|
||||
"if use_cuda:\n",
|
||||
" model = model.cuda()"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"### Generate model outputs "
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"import pickle\n",
|
||||
"\n",
|
||||
"file_idxs = []\n",
|
||||
"losses = []\n",
|
||||
"postnet_losses = []\n",
|
||||
"criterion = L1LossMasked()\n",
|
||||
"for data in tqdm(loader):\n",
|
||||
" # setup input data\n",
|
||||
" text_input = data[0]\n",
|
||||
" text_lengths = data[1]\n",
|
||||
" linear_input = data[2]\n",
|
||||
" mel_input = data[3]\n",
|
||||
" mel_lengths = data[4]\n",
|
||||
" stop_targets = data[5]\n",
|
||||
" item_idx = data[6]\n",
|
||||
" \n",
|
||||
" # dispatch data to GPU\n",
|
||||
" if use_cuda:\n",
|
||||
" text_input = text_input.cuda()\n",
|
||||
" text_lengths = text_lengths.cuda()\n",
|
||||
" mel_input = mel_input.cuda()\n",
|
||||
" mel_lengths = mel_lengths.cuda()\n",
|
||||
"# linear_input = linear_input.cuda()\n",
|
||||
" stop_targets = stop_targets.cuda()\n",
|
||||
" \n",
|
||||
" mask = sequence_mask(text_lengths)\n",
|
||||
" mel_outputs, mel_postnet_outputs, alignments, stop_tokens = model.forward(text_input, text_lengths, mel_input, mask)\n",
|
||||
" \n",
|
||||
" loss = criterion(mel_outputs, mel_input, mel_lengths)\n",
|
||||
" loss_postnet = criterion(mel_postnet_outputs, mel_input, mel_lengths)\n",
|
||||
" losses.append(loss.item())\n",
|
||||
" postnet_losses.append(loss_postnet.item())\n",
|
||||
" if not DRY_RUN:\n",
|
||||
" for idx in range(text_input.shape[0]):\n",
|
||||
" wav_file_path = item_idx[idx]\n",
|
||||
" wav = ap.load_wav(wav_file_path)\n",
|
||||
" file_name, wavq_path, mel_path, wav_path = set_filename(wav_file_path, OUT_PATH)\n",
|
||||
" file_idxs.append(file_name)\n",
|
||||
"\n",
|
||||
" # quantize and save wav\n",
|
||||
" wavq = ap.quantize(wav)\n",
|
||||
" np.save(wavq_path, wavq)\n",
|
||||
"\n",
|
||||
" # save TTS mel\n",
|
||||
" mel = mel_postnet_outputs[idx]\n",
|
||||
" mel = mel.data.cpu().numpy()\n",
|
||||
" mel_length = mel_lengths[idx]\n",
|
||||
" mel = mel[:mel_length, :].T\n",
|
||||
" np.save(mel_path, mel)\n",
|
||||
"\n",
|
||||
" # save GL voice\n",
|
||||
" # wav_gen = ap.inv_mel_spectrogram(mel.T) # mel to wav\n",
|
||||
" # wav_gen = ap.quantize(wav_gen)\n",
|
||||
" # np.save(wav_path, wav_gen)\n",
|
||||
"\n",
|
||||
"if not DRY_RUN:\n",
|
||||
" pickle.dump(file_idxs, open(OUT_PATH+\"/dataset_ids.pkl\", \"wb\")) \n",
|
||||
" \n",
|
||||
"\n",
|
||||
"print(np.mean(losses))\n",
|
||||
"print(np.mean(postnet_losses))"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"### Check model performance"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"idx = 1\n",
|
||||
"mel_example = mel_postnet_outputs[idx].data.cpu().numpy()\n",
|
||||
"plot_spectrogram(mel_example[:mel_lengths[idx], :], ap);\n",
|
||||
"print(mel_example[:mel_lengths[1], :].shape)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"wav = ap.load_wav(item_idx[idx])\n",
|
||||
"melt = ap.melspectrogram(wav)\n",
|
||||
"print(melt.shape)\n",
|
||||
"plot_spectrogram(melt.T, ap);"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"from matplotlib import pylab as plt\n",
|
||||
"mel_diff = mel_outputs[idx] - mel_postnet_outputs[idx]\n",
|
||||
"plt.figure(figsize=(16, 10))\n",
|
||||
"plt.imshow(abs(mel_diff.detach().cpu().numpy()[:mel_lengths[idx],:]).T,aspect=\"auto\", origin=\"lower\");\n",
|
||||
"plt.colorbar()\n",
|
||||
"plt.tight_layout()"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"from matplotlib import pylab as plt\n",
|
||||
"# mel = mel_poutputs[idx].detach().cpu().numpy()\n",
|
||||
"mel = mel_postnet_outputs[idx].detach().cpu().numpy()\n",
|
||||
"mel_diff2 = melt.T - mel[:melt.shape[1]]\n",
|
||||
"plt.figure(figsize=(16, 10))\n",
|
||||
"plt.imshow(abs(mel_diff2).T,aspect=\"auto\", origin=\"lower\");\n",
|
||||
"plt.colorbar()\n",
|
||||
"plt.tight_layout()"
|
||||
]
|
||||
}
|
||||
],
|
||||
"metadata": {
|
||||
"kernelspec": {
|
||||
"display_name": "Python 3",
|
||||
"language": "python",
|
||||
"name": "python3"
|
||||
},
|
||||
"language_info": {
|
||||
"codemirror_mode": {
|
||||
"name": "ipython",
|
||||
"version": 3
|
||||
},
|
||||
"file_extension": ".py",
|
||||
"mimetype": "text/x-python",
|
||||
"name": "python",
|
||||
"nbconvert_exporter": "python",
|
||||
"pygments_lexer": "ipython3",
|
||||
"version": "3.7.2"
|
||||
}
|
||||
},
|
||||
"nbformat": 4,
|
||||
"nbformat_minor": 2
|
||||
}
|
File diff suppressed because one or more lines are too long
File diff suppressed because one or more lines are too long
|
@ -1,52 +0,0 @@
|
|||
import io
|
||||
import librosa
|
||||
import torch
|
||||
import numpy as np
|
||||
from TTS.utils.text import text_to_sequence
|
||||
from matplotlib import pylab as plt
|
||||
|
||||
hop_length = 250
|
||||
|
||||
|
||||
def create_speech(m, s, CONFIG, use_cuda, ap):
|
||||
text_cleaner = [CONFIG.text_cleaner]
|
||||
seq = np.array(text_to_sequence(s, text_cleaner))
|
||||
chars_var = torch.from_numpy(seq).unsqueeze(0)
|
||||
if use_cuda:
|
||||
chars_var = chars_var.cuda()
|
||||
mel_out, linear_out, alignments, stop_tokens = m.forward(chars_var.long())
|
||||
linear_out = linear_out[0].data.cpu().numpy()
|
||||
alignment = alignments[0].cpu().data.numpy()
|
||||
spec = ap._denormalize(linear_out)
|
||||
wav = ap.inv_spectrogram(linear_out.T)
|
||||
wav = wav[:ap.find_endpoint(wav)]
|
||||
out = io.BytesIO()
|
||||
ap.save_wav(wav, out)
|
||||
return wav, alignment, spec, stop_tokens
|
||||
|
||||
|
||||
def visualize(alignment, spectrogram, stop_tokens, CONFIG):
|
||||
label_fontsize = 16
|
||||
plt.figure(figsize=(16, 24))
|
||||
|
||||
plt.subplot(3, 1, 1)
|
||||
plt.imshow(alignment.T, aspect="auto", origin="lower", interpolation=None)
|
||||
plt.xlabel("Decoder timestamp", fontsize=label_fontsize)
|
||||
plt.ylabel("Encoder timestamp", fontsize=label_fontsize)
|
||||
plt.colorbar()
|
||||
|
||||
stop_tokens = stop_tokens.squeeze().detach().to('cpu').numpy()
|
||||
plt.subplot(3, 1, 2)
|
||||
plt.plot(range(len(stop_tokens)), list(stop_tokens))
|
||||
|
||||
plt.subplot(3, 1, 3)
|
||||
librosa.display.specshow(
|
||||
spectrogram.T,
|
||||
sr=CONFIG.sample_rate,
|
||||
hop_length=hop_length,
|
||||
x_axis="time",
|
||||
y_axis="linear")
|
||||
plt.xlabel("Time", fontsize=label_fontsize)
|
||||
plt.ylabel("Hz", fontsize=label_fontsize)
|
||||
plt.tight_layout()
|
||||
plt.colorbar()
|
|
@ -1,5 +1,4 @@
|
|||
numpy==1.14.3
|
||||
lws
|
||||
torch>=0.4.1
|
||||
librosa==0.5.1
|
||||
Unidecode==0.4.20
|
||||
|
|
|
@ -1,7 +1,13 @@
|
|||
{
|
||||
"model_path":"/home/erogol/projects/runs/2579/keep/November-04-2018_06+19PM-TTS-master-_tmp-debug/",
|
||||
"model_name":"best_model.pth.tar",
|
||||
"model_config":"config.json",
|
||||
"tts_path":"/media/erogol/data_ssd/Data/models/ljspeech_models/ljspeech-April-08-2019_07+32PM-8a47b46/", // tts model root folder
|
||||
"tts_file":"checkpoint_261000.pth.tar", // tts checkpoint file
|
||||
"tts_config":"config.json", // tts config.json file
|
||||
"wavernn_lib_path": "/home/erogol/projects/", // Root path of the WaveRNN project folder to be imported. If this is none, the model uses GL for speech synthesis.
|
||||
"wavernn_path":"/media/erogol/data_ssd/Data/models/wavernn/ljspeech/mold_ljspeech_best_model/", // wavernn model root path
|
||||
"wavernn_file":"checkpoint_433000.pth.tar", // wavernn checkpoint file name
|
||||
"wavernn_config":"config.json", // wavernn config file
|
||||
"is_wavernn_batched":true,
|
||||
"port": 5002,
|
||||
"use_cuda": true
|
||||
"use_cuda": true,
|
||||
"debug": true
|
||||
}
|
||||
|
|
|
@ -11,10 +11,7 @@ args = parser.parse_args()
|
|||
|
||||
config = load_config(args.config_path)
|
||||
app = Flask(__name__)
|
||||
synthesizer = Synthesizer()
|
||||
synthesizer.load_model(config.model_path, config.model_name,
|
||||
config.model_config, config.use_cuda)
|
||||
|
||||
synthesizer = Synthesizer(config)
|
||||
|
||||
@app.route('/')
|
||||
def index():
|
||||
|
@ -30,4 +27,4 @@ def tts():
|
|||
|
||||
|
||||
if __name__ == '__main__':
|
||||
app.run(debug=True, host='0.0.0.0', port=config.port)
|
||||
app.run(debug=config.debug, host='0.0.0.0', port=config.port)
|
||||
|
|
|
@ -1,8 +1,28 @@
|
|||
import io
|
||||
import os
import re
import sys

import numpy as np
import torch

# Unresolved merge-conflict markers were committed at this point. The
# dev-tacotron2 side is kept because its imports are a superset of HEAD's.
from utils.generic_utils import load_config, setup_model
from utils.text import phoneme_to_sequence, phonemes, symbols, text_to_sequence, sequence_to_phoneme

# Regex fragments consumed by Synthesizer.split_into_sentences.
# Raw strings are used so that escapes such as ``\s`` reach the regex engine
# unchanged instead of being treated as (invalid) string escapes.
alphabets = r"([A-Za-z])"
prefixes = r"(Mr|St|Mrs|Ms|Dr)[.]"
suffixes = r"(Inc|Ltd|Jr|Sr|Co)"
starters = r"(Mr|Mrs|Ms|Dr|He\s|She\s|It\s|They\s|Their\s|Our\s|We\s|But\s|However\s|That\s|This\s|Wherever)"
acronyms = r"([A-Z][.][A-Z][.](?:[A-Z][.])?)"
websites = r"[.](com|net|org|io|gov)"
|
||||
|
||||
from models.tacotron import Tacotron
|
||||
from utils.audio import AudioProcessor
|
||||
|
@ -10,61 +30,144 @@ from utils.generic_utils import load_config
|
|||
from utils.text import phoneme_to_sequence, phonemes, symbols, text_to_sequence
|
||||
|
||||
class Synthesizer(object):
|
||||
def load_model(self, model_path, model_name, model_config, use_cuda):
|
||||
model_config = os.path.join(model_path, model_config)
|
||||
self.model_file = os.path.join(model_path, model_name)
|
||||
print(" > Loading model ...")
|
||||
print(" | > model config: ", model_config)
|
||||
print(" | > model file: ", self.model_file)
|
||||
config = load_config(model_config)
|
||||
def __init__(self, config):
|
||||
self.wavernn = None
|
||||
self.config = config
|
||||
self.use_cuda = use_cuda
|
||||
self.use_phonemes = config.use_phonemes
|
||||
self.ap = AudioProcessor(**config.audio)
|
||||
self.use_cuda = config.use_cuda
|
||||
if self.use_cuda:
|
||||
assert torch.cuda.is_available(), "CUDA is not availabe on this machine."
|
||||
self.load_tts(self.config.tts_path, self.config.tts_file, self.config.tts_config, config.use_cuda)
|
||||
if self.config.wavernn_lib_path:
|
||||
self.load_wavernn(config.wavernn_lib_path, config.wavernn_path, config.wavernn_file, config.wavernn_config, config.use_cuda)
|
||||
|
||||
def load_tts(self, model_path, model_file, model_config, use_cuda):
|
||||
tts_config = os.path.join(model_path, model_config)
|
||||
self.model_file = os.path.join(model_path, model_file)
|
||||
print(" > Loading TTS model ...")
|
||||
print(" | > model config: ", tts_config)
|
||||
print(" | > model file: ", model_file)
|
||||
self.tts_config = load_config(tts_config)
|
||||
self.use_phonemes = self.tts_config.use_phonemes
|
||||
self.ap = AudioProcessor(**self.tts_config.audio)
|
||||
if self.use_phonemes:
|
||||
self.input_size = len(phonemes)
|
||||
self.input_adapter = lambda sen: phoneme_to_sequence(sen, [self.config.text_cleaner], self.config.phoneme_language)
|
||||
self.input_adapter = lambda sen: phoneme_to_sequence(sen, [self.tts_config.text_cleaner], self.tts_config.phoneme_language, self.tts_config.enable_eos_bos_chars)
|
||||
else:
|
||||
self.input_size = len(symbols)
|
||||
self.input_adapter = lambda sen: text_to_sequence(sen, [self.config.text_cleaner])
|
||||
|
||||
self.model = Tacotron(self.input_size, config.embedding_size, self.ap.num_freq, self.ap.num_mels, config.r)
|
||||
self.input_adapter = lambda sen: text_to_sequence(sen, [self.tts_config.text_cleaner])
|
||||
self.tts_model = setup_model(self.input_size, self.tts_config)
|
||||
# load model state
|
||||
if use_cuda:
|
||||
cp = torch.load(self.model_file)
|
||||
else:
|
||||
cp = torch.load(self.model_file, map_location=lambda storage, loc: storage)
|
||||
# load the model
|
||||
self.model.load_state_dict(cp['model'])
|
||||
self.tts_model.load_state_dict(cp['model'])
|
||||
if use_cuda:
|
||||
self.model.cuda()
|
||||
self.model.eval()
|
||||
self.tts_model.cuda()
|
||||
self.tts_model.eval()
|
||||
self.tts_model.decoder.max_decoder_steps = 3000
|
||||
|
||||
def load_wavernn(self, lib_path, model_path, model_file, model_config, use_cuda):
    """Build the WaveRNN vocoder, load its checkpoint and keep it on ``self.wavernn``.

    Args:
        lib_path: directory appended to ``sys.path`` so that the ``WaveRNN``
            package can be imported.
        model_path: folder that contains the checkpoint and its config file.
        model_file: checkpoint file name inside ``model_path``.
        model_config: config file name inside ``model_path``.
        use_cuda: when true the model is moved to the GPU; otherwise the
            checkpoint is mapped to CPU storage and the model stays on CPU.
    """
    sys.path.append(lib_path)  # set this if TTS is not installed globally
    from WaveRNN.models.wavernn import Model
    wavernn_config = os.path.join(model_path, model_config)
    model_file = os.path.join(model_path, model_file)
    print(" > Loading WaveRNN model ...")
    print(" | > model config: ", wavernn_config)
    print(" | > model file: ", model_file)
    self.wavernn_config = load_config(wavernn_config)
    # The model is created on CPU and only moved to the GPU below when
    # requested, so a CPU-only host can load it as well.
    self.wavernn = Model(
        rnn_dims=512,
        fc_dims=512,
        mode=self.wavernn_config.mode,
        pad=2,
        upsample_factors=self.wavernn_config.upsample_factors,  # set this depending on dataset
        feat_dims=80,
        compute_dims=128,
        res_out_dims=128,
        res_blocks=10,
        hop_length=self.ap.hop_length,
        sample_rate=self.ap.sample_rate,
    )

    # Same loading convention as load_tts: remap storages to CPU unless CUDA
    # was asked for.
    if use_cuda:
        check = torch.load(model_file)
    else:
        check = torch.load(model_file, map_location=lambda storage, loc: storage)
    self.wavernn.load_state_dict(check['model'])
    if use_cuda:
        self.wavernn.cuda()
    self.wavernn.eval()
|
||||
|
||||
def save_wav(self, wav, path):
    """Convert ``wav`` to a numpy array and write it to ``path``.

    The actual writing is delegated to ``self.ap.save_wav``.
    """
    samples = np.array(wav)
    self.ap.save_wav(samples, path)
|
||||
|
||||
def split_into_sentences(self, text):
    """Split ``text`` into a list of stripped sentences.

    Periods that do not end a sentence (titles, web domains, "Ph.D.",
    initials, acronyms, company suffixes) are temporarily rewritten to
    ``<prd>``. Every remaining ``.``, ``?`` and ``!`` is then followed by a
    ``<stop>`` marker, the protected periods are restored, and the text is
    split on the markers.

    Text left after the last terminator used to be dropped silently; it is
    now returned as a final sentence with a ``.`` appended.

    Args:
        text: raw input text, possibly spanning several lines.

    Returns:
        list of sentence strings, each ending with its terminator.
    """
    text = " " + text + " "
    text = text.replace("\n", " ")

    # protect periods that are part of a token rather than a sentence end
    text = re.sub(prefixes, r"\1<prd>", text)
    text = re.sub(websites, r"<prd>\1", text)
    text = text.replace("Ph.D.", "Ph<prd>D<prd>")
    text = re.sub(r"\s" + alphabets + "[.] ", r" \1<prd> ", text)
    text = re.sub(acronyms + " " + starters, r"\1<stop> \2", text)
    text = re.sub(alphabets + "[.]" + alphabets + "[.]" + alphabets + "[.]", r"\1<prd>\2<prd>\3<prd>", text)
    text = re.sub(alphabets + "[.]" + alphabets + "[.]", r"\1<prd>\2<prd>", text)
    text = re.sub(" " + suffixes + "[.] " + starters, r" \1<stop> \2", text)
    text = re.sub(" " + suffixes + "[.]", r" \1<prd>", text)
    text = re.sub(" " + alphabets + "[.]", r" \1<prd>", text)

    # move terminators outside closing quotes so the quote stays with its sentence
    text = text.replace(".”", "”.")
    text = text.replace(".\"", "\".")
    text = text.replace("!\"", "\"!")
    text = text.replace("?\"", "\"?")

    # mark sentence boundaries, then restore the protected periods
    text = text.replace(".", ".<stop>")
    text = text.replace("?", "?<stop>")
    text = text.replace("!", "!<stop>")
    text = text.replace("<prd>", ".")

    *sentences, trailing = [s.strip() for s in text.split("<stop>")]
    if trailing:
        # unterminated tail: keep it instead of discarding it
        sentences.append(trailing + ".")
    return sentences
|
||||
|
||||
def tts(self, text):
|
||||
wavs = []
|
||||
for sen in text.split('.'):
|
||||
sens = self.split_into_sentences(text)
|
||||
if len(sens) == 0:
|
||||
sens = [text+'.']
|
||||
for sen in sens:
|
||||
if len(sen) < 3:
|
||||
continue
|
||||
sen = sen.strip()
|
||||
sen += '.'
|
||||
print(sen)
|
||||
<<<<<<< HEAD
|
||||
sen = sen.strip()
|
||||
|
||||
seq = np.array(self.input_adapter(sen))
|
||||
=======
|
||||
|
||||
seq = np.array(self.input_adapter(sen))
|
||||
text_hat = sequence_to_phoneme(seq)
|
||||
print(text_hat)
|
||||
>>>>>>> dev-tacotron2
|
||||
|
||||
chars_var = torch.from_numpy(seq).unsqueeze(0).long()
|
||||
|
||||
if self.use_cuda:
|
||||
chars_var = chars_var.cuda()
|
||||
mel_out, linear_out, alignments, stop_tokens = self.model.forward(
|
||||
decoder_out, postnet_out, alignments, stop_tokens = self.tts_model.inference(
|
||||
chars_var)
|
||||
<<<<<<< HEAD
|
||||
linear_out = linear_out[0].data.cpu().numpy()
|
||||
wav = self.ap.inv_spectrogram(linear_out.T)
|
||||
=======
|
||||
postnet_out = postnet_out[0].data.cpu().numpy()
|
||||
if self.tts_config.model == "Tacotron":
|
||||
wav = self.ap.inv_spectrogram(postnet_out.T)
|
||||
elif self.tts_config.model == "Tacotron2":
|
||||
if self.wavernn:
|
||||
wav = self.wavernn.generate(torch.FloatTensor(postnet_out.T).unsqueeze(0).cuda(), batched=self.config.is_wavernn_batched, target=11000, overlap=550)
|
||||
else:
|
||||
wav = self.ap.inv_mel_spectrogram(postnet_out.T)
|
||||
>>>>>>> dev-tacotron2
|
||||
wavs += list(wav)
|
||||
wavs += [0] * 10000
|
||||
|
||||
|
|
|
@ -56,11 +56,10 @@
|
|||
<div class="container">
|
||||
<div class="row">
|
||||
<div class="col-lg-12 text-center">
|
||||
<h1 class="mt-5">Mozilla TTS</h1>
|
||||
<p class="lead">"work-in-progress"</p>
|
||||
<img class="mt-5" src="https://user-images.githubusercontent.com/1402048/52643646-c2102980-2edd-11e9-8c37-b72f3c89a640.png" alt=></img>
|
||||
<ul class="list-unstyled">
|
||||
</ul>
|
||||
<input id="text" placeholder="Enter text" size=45 type="text" name="text">
|
||||
<input id="text" placeholder="Type here..." size=45 type="text" name="text">
|
||||
<button id="speak-button" name="speak">Speak</button><br/><br/>
|
||||
<audio id="audio" controls autoplay hidden></audio>
|
||||
<p id="message"></p>
|
||||
|
|
1
setup.py
1
setup.py
|
@ -83,6 +83,7 @@ setup(
|
|||
# "lws",
|
||||
"tqdm",
|
||||
"phonemizer",
|
||||
"soundfile",
|
||||
],
|
||||
dependency_links=[
|
||||
'http://github.com/bootphon/phonemizer/tarball/master#egg=phonemizer'
|
||||
|
|
|
@ -0,0 +1,16 @@
|
|||
import os
|
||||
|
||||
|
||||
def get_tests_path():
    """Returns the absolute path of the directory that contains this module.

    Symlinks are resolved with ``os.path.realpath`` first.
    """
    return os.path.dirname(os.path.realpath(__file__))
|
||||
|
||||
|
||||
def get_tests_input_path():
    """Returns the path to the test data directory (``inputs`` under the tests path)."""
    return os.path.join(get_tests_path(), "inputs")
|
||||
|
||||
|
||||
def get_tests_output_path():
    """Returns the path to the directory for test outputs (``outputs`` under the tests path)."""
    return os.path.join(get_tests_path(), "outputs")
|
|
@ -0,0 +1,10 @@
|
|||
client_id path sentence up_votes down_votes age gender accent
|
||||
aa7af576605fee2c78c26b85497c64cb9c9fd97228071f8666d9f49f15bce01899bbb930fa60b76d212091d779d83b92e0b54c73cbb21d2c7e1eedc817e41cb3 21fce545b24d9a5af0403b949e95e8dd3c10c4ff3e371f14e4d5b4ebf588670b7c9e618285fc872d94a89ed7f0217d9019fe5de33f1577b49dcd518eacf63c4b Man sollte den Länderfinanzausgleich durch einen Bundesliga-Soli ersetzen. 2 0 fourties male germany
|
||||
aa7af576605fee2c78c26b85497c64cb9c9fd97228071f8666d9f49f15bce01899bbb930fa60b76d212091d779d83b92e0b54c73cbb21d2c7e1eedc817e41cb3 42758baa4e91ef6b82b78b11a04bc5117a035a8d3bc42c33c0bb3084909af17043a194cfd8cd9839f0d6ef1ea5413acda5de5d1936abcc8ca073e2da7f9488ea Folgende Lektüre kann ich Ihnen zum Thema Kognitionspsychologie empfehlen. 2 0 fourties male germany
|
||||
aa7af576605fee2c78c26b85497c64cb9c9fd97228071f8666d9f49f15bce01899bbb930fa60b76d212091d779d83b92e0b54c73cbb21d2c7e1eedc817e41cb3 478f172c2dbda6675247e9674ade79a5b49efeefb7c9e99040dcc69a847a01d69398cf180570859b0cdb6fc887717e04cd8b149c723d48d00b5d18f41314667c Touristen winkten den Leuten am Ufer zu. 2 0 fourties male germany
|
||||
aa7af576605fee2c78c26b85497c64cb9c9fd97228071f8666d9f49f15bce01899bbb930fa60b76d212091d779d83b92e0b54c73cbb21d2c7e1eedc817e41cb3 4854368d6d21cb44103e432b5332f31e8d14030582a40850501bcf9377d699314a5ff27a8206fa89254ddde7f3f1c65d33836f3dfcfa16bcabec08537f2b5f08 Valentin hat das Handtuch geworfen. 2 0 fourties male germany
|
||||
aa7af576605fee2c78c26b85497c64cb9c9fd97228071f8666d9f49f15bce01899bbb930fa60b76d212091d779d83b92e0b54c73cbb21d2c7e1eedc817e41cb3 a841a9f3e032495dd47560e65fba99eeacb3618c07de8b1351c20188e5b71e33cc52f73315f721a3a24b65763c65bb52fbf3ae052eb5774e834dcb57f296db5c Ohne Gehörschutz bei der Arbeit wäre Klaus wohl nach zwei Wochen taub. 2 0 fourties male germany
|
||||
aa7af576605fee2c78c26b85497c64cb9c9fd97228071f8666d9f49f15bce01899bbb930fa60b76d212091d779d83b92e0b54c73cbb21d2c7e1eedc817e41cb3 03ab970a5bf5410bc3260b073cce1c7f49c688ace83dc8836b1c0f79a09fea45a27725c769f4a9d2e6181defd016d22642789d7ac51da252b42958a9192bd4c7 Gerrit erinnerte sich daran, dass er einst einen Eid geschworen hatte. 2 0 fourties male germany
|
||||
aa7af576605fee2c78c26b85497c64cb9c9fd97228071f8666d9f49f15bce01899bbb930fa60b76d212091d779d83b92e0b54c73cbb21d2c7e1eedc817e41cb3 c4a94df443ad5f2c7241413ef7145d5f0de41ae929759073917fe96166da3c7d3a612c920ed7b0f3d5950a38d6205e9dba24af5bfb27e390a220d004e6e26744 Auf das, was jetzt kommt, habe ich nämlich absolut keinen Bock. 2 0 fourties male germany
|
||||
aa7af576605fee2c78c26b85497c64cb9c9fd97228071f8666d9f49f15bce01899bbb930fa60b76d212091d779d83b92e0b54c73cbb21d2c7e1eedc817e41cb3 104695983b1112229b4a48696405d044dad9ddef713aa6eb1a6240cc16b7b7a2a96354ae9da99783850dde08a982091e48d3037288a3a58269cac9fe70a6bd7a Von Salzburg ist es doch nicht weit bis zum Chiemsee. 2 0 fourties male germany
|
||||
d5b5da343bb0f65e3580bc2e1902a4f5d004241488d751503f2020bc1c93f89715e355e35f6e25def2b90cb3eea99fda403eb92ae3afbb84d039a54a4ed2d875 ad2f69e053b0e20e01c82b9821fe5787f1cc8e4b0b97f0e4cab1e9a652c577169c8244fb222281a60ee3081854014113e04c4ca43643100b7c01dab0fac11974 Warum werden da keine strafrechtlichen Konsequenzen gezogen? 2 0 thirties male germany
|
|
|
@ -2,21 +2,23 @@ import os
|
|||
import unittest
|
||||
import numpy as np
|
||||
import torch as T
|
||||
|
||||
from tests import get_tests_path, get_tests_input_path, get_tests_output_path
|
||||
from utils.audio import AudioProcessor
|
||||
from utils.generic_utils import load_config
|
||||
|
||||
file_path = os.path.dirname(os.path.realpath(__file__))
|
||||
INPUTPATH = os.path.join(file_path, 'inputs')
|
||||
OUTPATH = os.path.join(file_path, "outputs/audio_tests")
|
||||
os.makedirs(OUTPATH, exist_ok=True)
|
||||
TESTS_PATH = get_tests_path()
|
||||
OUT_PATH = os.path.join(get_tests_output_path(), "audio_tests")
|
||||
WAV_FILE = os.path.join(get_tests_input_path(), "example_1.wav")
|
||||
|
||||
c = load_config(os.path.join(file_path, 'test_config.json'))
|
||||
os.makedirs(OUT_PATH, exist_ok=True)
|
||||
conf = load_config(os.path.join(TESTS_PATH, 'test_config.json'))
|
||||
|
||||
|
||||
class TestAudio(unittest.TestCase):
|
||||
def __init__(self, *args, **kwargs):
|
||||
super(TestAudio, self).__init__(*args, **kwargs)
|
||||
self.ap = AudioProcessor(**c.audio)
|
||||
self.ap = AudioProcessor(**conf.audio)
|
||||
|
||||
def test_audio_synthesis(self):
|
||||
""" 1. load wav
|
||||
|
@ -31,13 +33,13 @@ class TestAudio(unittest.TestCase):
|
|||
self.ap.signal_norm = signal_norm
|
||||
self.ap.symmetric_norm = symmetric_norm
|
||||
self.ap.clip_norm = clip_norm
|
||||
wav = self.ap.load_wav(INPUTPATH + "/example_1.wav")
|
||||
wav = self.ap.load_wav(WAV_FILE)
|
||||
mel = self.ap.melspectrogram(wav)
|
||||
wav_ = self.ap.inv_mel_spectrogram(mel)
|
||||
file_name = "/audio_test-melspec_max_norm_{}-signal_norm_{}-symmetric_{}-clip_norm_{}.wav"\
|
||||
.format(max_norm, signal_norm, symmetric_norm, clip_norm)
|
||||
print(" | > Creating wav file at : ", file_name)
|
||||
self.ap.save_wav(wav_, OUTPATH + file_name)
|
||||
self.ap.save_wav(wav_, OUT_PATH + file_name)
|
||||
|
||||
# maxnorm = 1.0
|
||||
_test(1., False, False, False)
|
||||
|
@ -55,7 +57,7 @@ class TestAudio(unittest.TestCase):
|
|||
def test_normalize(self):
|
||||
"""Check normalization and denormalization for range values and consistency """
|
||||
print(" > Testing normalization and denormalization.")
|
||||
wav = self.ap.load_wav(INPUTPATH + "/example_1.wav")
|
||||
wav = self.ap.load_wav(WAV_FILE)
|
||||
self.ap.signal_norm = False
|
||||
x = self.ap.melspectrogram(wav)
|
||||
x_old = x
|
|
@ -36,7 +36,6 @@
|
|||
|
||||
"save_step": 200,
|
||||
"data_path": "/home/erogol/Data/LJSpeech-1.1/",
|
||||
"data_path_cache": "/media/erogol/data_ssd/Data/Nancy/tts_cache/",
|
||||
"output_path": "result",
|
||||
"min_seq_len": 0,
|
||||
"max_seq_len": 300,
|
||||
|
|
|
@ -38,18 +38,16 @@ class CBHGTests(unittest.TestCase):
|
|||
|
||||
class DecoderTests(unittest.TestCase):
|
||||
def test_in_out(self):
|
||||
layer = Decoder(in_features=256, memory_dim=80, r=2, memory_size=4, attn_windowing=False)
|
||||
layer = Decoder(in_features=256, memory_dim=80, r=2, memory_size=4, attn_windowing=False, attn_norm="sigmoid")
|
||||
dummy_input = T.rand(4, 8, 256)
|
||||
dummy_memory = T.rand(4, 2, 80)
|
||||
|
||||
output, alignment, stop_tokens = layer(dummy_input, dummy_memory)
|
||||
output, alignment, stop_tokens = layer(dummy_input, dummy_memory, mask=None)
|
||||
|
||||
assert output.shape[0] == 4
|
||||
assert output.shape[1] == 1, "size not {}".format(output.shape[1])
|
||||
assert output.shape[2] == 80 * 2, "size not {}".format(output.shape[2])
|
||||
assert stop_tokens.shape[0] == 4
|
||||
assert stop_tokens.max() <= 1.0
|
||||
assert stop_tokens.min() >= 0
|
||||
|
||||
|
||||
class EncoderTests(unittest.TestCase):
|
|
@ -7,7 +7,7 @@ from torch.utils.data import DataLoader
|
|||
from utils.generic_utils import load_config
|
||||
from utils.audio import AudioProcessor
|
||||
from datasets import TTSDataset
|
||||
from datasets.preprocess import ljspeech, tts_cache
|
||||
from datasets.preprocess import ljspeech
|
||||
|
||||
file_path = os.path.dirname(os.path.realpath(__file__))
|
||||
OUTPATH = os.path.join(file_path, "outputs/loader_tests/")
|
||||
|
@ -16,15 +16,11 @@ c = load_config(os.path.join(file_path, 'test_config.json'))
|
|||
ok_ljspeech = os.path.exists(c.data_path)
|
||||
|
||||
DATA_EXIST = True
|
||||
CACHE_EXIST = True
|
||||
if not os.path.exists(c.data_path_cache):
|
||||
CACHE_EXIST = False
|
||||
|
||||
if not os.path.exists(c.data_path):
|
||||
DATA_EXIST = False
|
||||
|
||||
print(" > Dynamic data loader test: {}".format(DATA_EXIST))
|
||||
print(" > Cache data loader test: {}".format(CACHE_EXIST))
|
||||
|
||||
|
||||
class TestTTSDataset(unittest.TestCase):
|
||||
def __init__(self, *args, **kwargs):
|
||||
|
@ -126,8 +122,9 @@ class TestTTSDataset(unittest.TestCase):
|
|||
wav = self.ap.load_wav(item_idx[0])
|
||||
mel = self.ap.melspectrogram(wav)
|
||||
mel_dl = mel_input[0].cpu().numpy()
|
||||
assert (
|
||||
abs(mel.T).astype("float32") - abs(mel_dl[:-1])).sum() == 0
|
||||
assert (abs(mel.T).astype("float32")
|
||||
- abs(mel_dl[:-1])
|
||||
).sum() == 0
|
||||
|
||||
# check mel-spec correctness
|
||||
mel_spec = mel_input[0].cpu().numpy()
|
||||
|
@ -139,7 +136,8 @@ class TestTTSDataset(unittest.TestCase):
|
|||
linear_spec = linear_input[0].cpu().numpy()
|
||||
wav = self.ap.inv_spectrogram(linear_spec.T)
|
||||
self.ap.save_wav(wav, OUTPATH + '/linear_inv_dataloader.wav')
|
||||
shutil.copy(item_idx[0], OUTPATH + '/linear_target_dataloader.wav')
|
||||
shutil.copy(item_idx[0],
|
||||
OUTPATH + '/linear_target_dataloader.wav')
|
||||
|
||||
# check the last time step to be zero padded
|
||||
assert linear_input[0, -1].sum() == 0
|
|
@ -0,0 +1,28 @@
|
|||
import unittest
|
||||
import os
|
||||
from tests import get_tests_input_path
|
||||
|
||||
from datasets.preprocess import common_voice
|
||||
|
||||
|
||||
class TestPreprocessors(unittest.TestCase):
    """Tests for the dataset metadata preprocessors."""

    def test_common_voice_preprocessor(self):
        """First and last items parsed from ``common_voice.tsv`` carry the
        expected sentence and a wav path under ``<inputs>/clips``."""
        root_path = get_tests_input_path()
        meta_file = "common_voice.tsv"
        items = common_voice(root_path, meta_file)
        clips_path = os.path.join(get_tests_input_path(), "clips")

        assert items[0][0] == "Man sollte den Länderfinanzausgleich durch " \
                              "einen Bundesliga-Soli ersetzen."
        assert items[0][1] == os.path.join(clips_path,
                                           "21fce545b24d9a5af0403b949e95e8dd3"
                                           "c10c4ff3e371f14e4d5b4ebf588670b7c"
                                           "9e618285fc872d94a89ed7f0217d9019f"
                                           "e5de33f1577b49dcd518eacf63c4b.wav")

        assert items[-1][0] == "Warum werden da keine strafrechtlichen " \
                               "Konsequenzen gezogen?"
        assert items[-1][1] == os.path.join(clips_path,
                                            "ad2f69e053b0e20e01c82b9821fe5787f1"
                                            "cc8e4b0b97f0e4cab1e9a652c577169c82"
                                            "44fb222281a60ee3081854014113e04c4c"
                                            "a43643100b7c01dab0fac11974.wav")
|
|
@ -0,0 +1,69 @@
|
|||
import os
|
||||
import copy
|
||||
import torch
|
||||
import unittest
|
||||
import numpy as np
|
||||
|
||||
from torch import optim
|
||||
from torch import nn
|
||||
from utils.generic_utils import load_config
|
||||
from layers.losses import MSELossMasked
|
||||
from models.tacotron2 import Tacotron2
|
||||
|
||||
torch.manual_seed(1)
|
||||
use_cuda = torch.cuda.is_available()
|
||||
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
|
||||
|
||||
file_path = os.path.dirname(os.path.realpath(__file__))
|
||||
c = load_config(os.path.join(file_path, 'test_config.json'))
|
||||
|
||||
|
||||
class TacotronTrainTest(unittest.TestCase):
    """Checks that optimisation steps on random data update every Tacotron2 parameter."""

    def test_train_step(self):
        """Run five Adam steps and assert each parameter differs from its initial copy."""
        text_input = torch.randint(0, 24, (8, 128)).long().to(device)
        input_lengths = torch.randint(100, 128, (8, )).long().to(device)
        input_lengths = torch.sort(input_lengths, descending=True)[0]
        mel_spec = torch.rand(8, 30, c.audio['num_mels']).to(device)
        mel_postnet_spec = torch.rand(8, 30, c.audio['num_mels']).to(device)
        mel_lengths = torch.randint(20, 30, (8, )).long().to(device)
        stop_targets = torch.zeros(8, 30, 1).float().to(device)

        # Flag the frames at or after each sample's own length. The earlier
        # code wrote to every row for every length, so all samples ended up
        # with the shortest length's stop targets.
        for sample_idx, length in enumerate(mel_lengths):
            stop_targets[sample_idx, int(length.item()):, 0] = 1.0

        # group frames in chunks of r; a chunk is a stop step if any frame in it is
        stop_targets = stop_targets.view(text_input.shape[0],
                                         stop_targets.size(1) // c.r, -1)
        stop_targets = (stop_targets.sum(2) > 0.0).unsqueeze(2).float().squeeze()

        criterion = MSELossMasked().to(device)
        criterion_st = nn.BCEWithLogitsLoss().to(device)
        model = Tacotron2(24, c.r).to(device)
        model.train()
        model_ref = copy.deepcopy(model)

        # sanity check: the reference copy starts out identical
        count = 0
        for param, param_ref in zip(model.parameters(),
                                    model_ref.parameters()):
            assert (param - param_ref).sum() == 0, param
            count += 1

        optimizer = optim.Adam(model.parameters(), lr=c.lr)
        for _ in range(5):
            mel_out, mel_postnet_out, align, stop_tokens = model.forward(
                text_input, input_lengths, mel_spec)
            assert torch.sigmoid(stop_tokens).data.max() <= 1.0
            assert torch.sigmoid(stop_tokens).data.min() >= 0.0
            optimizer.zero_grad()
            loss = criterion(mel_out, mel_spec, mel_lengths)
            stop_loss = criterion_st(stop_tokens, stop_targets)
            loss = loss + criterion(mel_postnet_out, mel_postnet_spec, mel_lengths) + stop_loss
            loss.backward()
            optimizer.step()

        # check parameter changes
        count = 0
        for param, param_ref in zip(model.parameters(),
                                    model_ref.parameters()):
            assert (param != param_ref).any(
            ), "param {} with shape {} not updated!! \n{}\n{}".format(
                count, param.shape, param, param_ref)
            count += 1
|
|
@ -18,9 +18,16 @@ file_path = os.path.dirname(os.path.realpath(__file__))
|
|||
c = load_config(os.path.join(file_path, 'test_config.json'))
|
||||
|
||||
|
||||
def count_parameters(model):
    r"""Count number of trainable parameters in a network.

    Only parameters with ``requires_grad`` set are counted; each contributes
    its element count (``numel``).
    """
    return sum(p.numel() for p in model.parameters() if p.requires_grad)
|
||||
|
||||
|
||||
class TacotronTrainTest(unittest.TestCase):
|
||||
def test_train_step(self):
|
||||
input = torch.randint(0, 24, (8, 128)).long().to(device)
|
||||
input_lengths = torch.randint(100, 129, (8, )).long().to(device)
|
||||
input_lengths[-1] = 128
|
||||
mel_spec = torch.rand(8, 30, c.audio['num_mels']).to(device)
|
||||
linear_spec = torch.rand(8, 30, c.audio['num_freq']).to(device)
|
||||
mel_lengths = torch.randint(20, 30, (8, )).long().to(device)
|
||||
|
@ -31,13 +38,19 @@ class TacotronTrainTest(unittest.TestCase):
|
|||
|
||||
stop_targets = stop_targets.view(input.shape[0],
|
||||
stop_targets.size(1) // c.r, -1)
|
||||
stop_targets = (stop_targets.sum(2) > 0.0).unsqueeze(2).float()
|
||||
stop_targets = (stop_targets.sum(2) >
|
||||
0.0).unsqueeze(2).float().squeeze()
|
||||
|
||||
criterion = L1LossMasked().to(device)
|
||||
criterion_st = nn.BCELoss().to(device)
|
||||
model = Tacotron(32, c.embedding_size, c.audio['num_freq'], c.audio['num_mels'],
|
||||
c.r, c.memory_size).to(device)
|
||||
criterion_st = nn.BCEWithLogitsLoss().to(device)
|
||||
model = Tacotron(
|
||||
32,
|
||||
linear_dim=c.audio['num_freq'],
|
||||
mel_dim=c.audio['num_mels'],
|
||||
r=c.r,
|
||||
memory_size=c.memory_size).to(device)
|
||||
model.train()
|
||||
print(" > Num parameters for Tacotron model:%s"%(count_parameters(model)))
|
||||
model_ref = copy.deepcopy(model)
|
||||
count = 0
|
||||
for param, param_ref in zip(model.parameters(),
|
||||
|
@ -47,9 +60,7 @@ class TacotronTrainTest(unittest.TestCase):
|
|||
optimizer = optim.Adam(model.parameters(), lr=c.lr)
|
||||
for i in range(5):
|
||||
mel_out, linear_out, align, stop_tokens = model.forward(
|
||||
input, mel_spec)
|
||||
assert stop_tokens.data.max() <= 1.0
|
||||
assert stop_tokens.data.min() >= 0.0
|
||||
input, input_lengths, mel_spec)
|
||||
optimizer.zero_grad()
|
||||
loss = criterion(mel_out, mel_spec, mel_lengths)
|
||||
stop_loss = criterion_st(stop_tokens, stop_targets)
|
|
@ -0,0 +1,76 @@
|
|||
import unittest
|
||||
import torch as T
|
||||
|
||||
from utils.text import *
|
||||
|
||||
def test_phoneme_to_sequence():
    """Round-trip several texts through ``phoneme_to_sequence`` and
    ``sequence_to_phoneme`` and compare with the expected phoneme strings."""
    text_cleaner = ["phoneme_cleaners"]
    lang = "en-us"

    # (description, input text, extra positional args, expected phonemes)
    cases = [
        ("long sentence",
         "Recent research at Harvard has shown meditating for as little as 8 weeks can actually increase, the grey matter in the parts of the brain responsible for emotional regulation and learning!",
         (),
         "ɹiːsənt ɹɪsɜːtʃ æt hɑːɹvɚd hɐz ʃoʊn mɛdᵻteɪɾɪŋ fɔːɹ æz lɪɾəl æz eɪt wiːks kæn æktʃuːəli ɪnkɹiːs, ðə ɡɹeɪ mæɾɚɹ ɪnðə pɑːɹts ʌvðə bɹeɪn ɹɪspɑːnsəbəl fɔːɹ ɪmoʊʃənəl ɹɛɡjuːleɪʃən ænd lɜːnɪŋ!"),
        ("multiple punctuations",
         "Be a voice, not an! echo?",
         (),
         "biː ɐ vɔɪs, nɑːt ɐn! ɛkoʊ?"),
        ("not ending with punctuation",
         "Be a voice, not an! echo",
         (),
         "biː ɐ vɔɪs, nɑːt ɐn! ɛkoʊ"),
        ("original",
         "Be a voice, not an echo!",
         (),
         "biː ɐ vɔɪs, nɑːt ɐn ɛkoʊ!"),
        ("extra space after the sentence",
         "Be a voice, not an! echo. ",
         (),
         "biː ɐ vɔɪs, nɑːt ɐn! ɛkoʊ."),
        ("extra space after the sentence, fourth argument set to True",
         "Be a voice, not an! echo. ",
         (True,),
         "^biː ɐ vɔɪs, nɑːt ɐn! ɛkoʊ.~"),
        ("padding char",
         "_Be a _voice, not an! echo_",
         (),
         "biː ɐ vɔɪs, nɑːt ɐn! ɛkoʊ"),
    ]

    for description, text, extra_args, gt in cases:
        sequence = phoneme_to_sequence(text, text_cleaner, lang, *extra_args)
        text_hat = sequence_to_phoneme(sequence)
        print(text_hat)
        print(len(sequence))
        assert text_hat == gt, description
|
||||
|
||||
|
||||
def test_text2phone():
    """``text2phone`` output for an English sentence must match the expected
    ``|``-separated phoneme string."""
    text = "Recent research at Harvard has shown meditating for as little as 8 weeks can actually increase, the grey matter in the parts of the brain responsible for emotional regulation and learning!"
    gt = "ɹ|iː|s|ə|n|t| |ɹ|ɪ|s|ɜː|tʃ| |æ|t| |h|ɑːɹ|v|ɚ|d| |h|ɐ|z| |ʃ|oʊ|n| |m|ɛ|d|ᵻ|t|eɪ|ɾ|ɪ|ŋ| |f|ɔː|ɹ| |æ|z| |l|ɪ|ɾ|əl| |æ|z| |eɪ|t| |w|iː|k|s| |k|æ|n| |æ|k|tʃ|uː|əl|i|| |ɪ|n|k|ɹ|iː|s|,| |ð|ə| |ɡ|ɹ|eɪ| |m|æ|ɾ|ɚ|ɹ| |ɪ|n|ð|ə| |p|ɑːɹ|t|s| |ʌ|v|ð|ə| |b|ɹ|eɪ|n| |ɹ|ɪ|s|p|ɑː|n|s|ə|b|əl| |f|ɔː|ɹ| |ɪ|m|oʊ|ʃ|ə|n|əl| |ɹ|ɛ|ɡ|j|uː|l|eɪ|ʃ|ə|n||| |æ|n|d| |l|ɜː|n|ɪ|ŋ|!"
    lang = "en-us"
    # named ``phones`` so the local does not shadow anything pulled in by the star import
    phones = text2phone(text, lang)
    assert gt == phones
|
308
train.py
308
train.py
|
@ -14,20 +14,20 @@ from torch import optim
|
|||
from torch.utils.data import DataLoader
|
||||
|
||||
from datasets.TTSDataset import MyDataset
|
||||
from layers.losses import L1LossMasked
|
||||
from models.tacotron import Tacotron
|
||||
from distribute import (DistributedSampler, apply_gradient_allreduce,
|
||||
init_distributed, reduce_tensor)
|
||||
from layers.losses import L1LossMasked, MSELossMasked
|
||||
from utils.audio import AudioProcessor
|
||||
from utils.generic_utils import (
|
||||
NoamLR, check_update, count_parameters, create_experiment_folder,
|
||||
get_commit_hash, load_config, lr_decay, remove_experiment_folder,
|
||||
save_best_model, save_checkpoint, sequence_mask, weight_decay)
|
||||
from utils.generic_utils import (NoamLR, check_update, count_parameters,
|
||||
create_experiment_folder, get_git_branch,
|
||||
load_config, lr_decay,
|
||||
remove_experiment_folder, save_best_model,
|
||||
save_checkpoint, sequence_mask, weight_decay,
|
||||
set_init_dict, copy_config_file, setup_model)
|
||||
from utils.logger import Logger
|
||||
from utils.synthesis import synthesis
|
||||
from utils.text.symbols import phonemes, symbols
|
||||
from utils.visual import plot_alignment, plot_spectrogram
|
||||
from distribute import init_distributed, apply_gradient_allreduce, reduce_tensor
|
||||
from distribute import DistributedSampler
|
||||
|
||||
|
||||
torch.backends.cudnn.enabled = True
|
||||
torch.backends.cudnn.benchmark = False
|
||||
|
@ -53,10 +53,10 @@ def setup_loader(is_val=False, verbose=False):
|
|||
batch_group_size=0 if is_val else c.batch_group_size * c.batch_size,
|
||||
min_seq_len=0 if is_val else c.min_seq_len,
|
||||
max_seq_len=float("inf") if is_val else c.max_seq_len,
|
||||
cached=False if c.dataset != "tts_cache" else True,
|
||||
phoneme_cache_path=c.phoneme_cache_path,
|
||||
use_phonemes=c.use_phonemes,
|
||||
phoneme_language=c.phoneme_language,
|
||||
enable_eos_bos=c.enable_eos_bos_chars,
|
||||
verbose=verbose)
|
||||
sampler = DistributedSampler(dataset) if num_gpus > 1 else None
|
||||
loader = DataLoader(
|
||||
|
@ -77,24 +77,19 @@ def train(model, criterion, criterion_st, optimizer, optimizer_st, scheduler,
|
|||
data_loader = setup_loader(is_val=False, verbose=(epoch==0))
|
||||
model.train()
|
||||
epoch_time = 0
|
||||
avg_linear_loss = 0
|
||||
avg_mel_loss = 0
|
||||
avg_postnet_loss = 0
|
||||
avg_decoder_loss = 0
|
||||
avg_stop_loss = 0
|
||||
avg_step_time = 0
|
||||
print("\n > Epoch {}/{}".format(epoch, c.epochs), flush=True)
|
||||
n_priority_freq = int(
|
||||
3000 / (c.audio['sample_rate'] * 0.5) * c.audio['num_freq'])
|
||||
if num_gpus > 0:
|
||||
batch_n_iter = int(len(data_loader.dataset) / (c.batch_size * num_gpus))
|
||||
else:
|
||||
batch_n_iter = int(len(data_loader.dataset) / c.batch_size)
|
||||
for num_iter, data in enumerate(data_loader):
|
||||
start_time = time.time()
|
||||
|
||||
# setup input data
|
||||
text_input = data[0]
|
||||
text_lengths = data[1]
|
||||
linear_input = data[2]
|
||||
linear_input = data[2] if c.model == "Tacotron" else None
|
||||
mel_input = data[3]
|
||||
mel_lengths = data[4]
|
||||
stop_targets = data[5]
|
||||
|
@ -104,7 +99,7 @@ def train(model, criterion, criterion_st, optimizer, optimizer_st, scheduler,
|
|||
# set stop targets view, we predict a single stop token per r frames prediction
|
||||
stop_targets = stop_targets.view(text_input.shape[0],
|
||||
stop_targets.size(1) // c.r, -1)
|
||||
stop_targets = (stop_targets.sum(2) > 0.0).unsqueeze(2).float()
|
||||
stop_targets = (stop_targets.sum(2) > 0.0).unsqueeze(2).float().squeeze(2)
|
||||
|
||||
current_step = num_iter + args.restore_step + \
|
||||
epoch * len(data_loader) + 1
|
||||
|
@ -113,7 +108,7 @@ def train(model, criterion, criterion_st, optimizer, optimizer_st, scheduler,
|
|||
if c.lr_decay:
|
||||
scheduler.step()
|
||||
optimizer.zero_grad()
|
||||
optimizer_st.zero_grad()
|
||||
if optimizer_st: optimizer_st.zero_grad();
|
||||
|
||||
# dispatch data to GPU
|
||||
if use_cuda:
|
||||
|
@ -121,85 +116,90 @@ def train(model, criterion, criterion_st, optimizer, optimizer_st, scheduler,
|
|||
text_lengths = text_lengths.cuda(non_blocking=True)
|
||||
mel_input = mel_input.cuda(non_blocking=True)
|
||||
mel_lengths = mel_lengths.cuda(non_blocking=True)
|
||||
linear_input = linear_input.cuda(non_blocking=True)
|
||||
linear_input = linear_input.cuda(non_blocking=True) if c.model == "Tacotron" else None
|
||||
stop_targets = stop_targets.cuda(non_blocking=True)
|
||||
|
||||
# compute mask for padding
|
||||
mask = sequence_mask(text_lengths)
|
||||
|
||||
# forward pass
|
||||
mel_output, linear_output, alignments, stop_tokens = model(
|
||||
text_input, mel_input, mask)
|
||||
# forward pass model
|
||||
decoder_output, postnet_output, alignments, stop_tokens = model(
|
||||
text_input, text_lengths, mel_input)
|
||||
|
||||
# loss computation
|
||||
stop_loss = criterion_st(stop_tokens, stop_targets)
|
||||
mel_loss = criterion(mel_output, mel_input, mel_lengths)
|
||||
linear_loss = (1 - c.loss_weight) * criterion(linear_output, linear_input, mel_lengths)\
|
||||
+ c.loss_weight * criterion(linear_output[:, :, :n_priority_freq],
|
||||
linear_input[:, :, :n_priority_freq],
|
||||
mel_lengths)
|
||||
loss = mel_loss + linear_loss
|
||||
stop_loss = criterion_st(stop_tokens, stop_targets) if c.stopnet else torch.zeros(1)
|
||||
if c.loss_masking:
|
||||
decoder_loss = criterion(decoder_output, mel_input, mel_lengths)
|
||||
if c.model == "Tacotron":
|
||||
postnet_loss = criterion(postnet_output, linear_input, mel_lengths)
|
||||
else:
|
||||
postnet_loss = criterion(postnet_output, mel_input, mel_lengths)
|
||||
else:
|
||||
decoder_loss = criterion(decoder_output, mel_input)
|
||||
if c.model == "Tacotron":
|
||||
postnet_loss = criterion(postnet_output, linear_input)
|
||||
else:
|
||||
postnet_loss = criterion(postnet_output, mel_input)
|
||||
loss = decoder_loss + postnet_loss
|
||||
if not c.separate_stopnet and c.stopnet:
|
||||
loss += stop_loss
|
||||
|
||||
# backpass and check the grad norm for spec losses
|
||||
loss.backward(retain_graph=True)
|
||||
loss.backward()
|
||||
optimizer, current_lr = weight_decay(optimizer, c.wd)
|
||||
grad_norm, _ = check_update(model, 1.0)
|
||||
grad_norm, _ = check_update(model, c.grad_clip)
|
||||
optimizer.step()
|
||||
|
||||
# backpass and check the grad norm for stop loss
|
||||
if c.separate_stopnet:
|
||||
stop_loss.backward()
|
||||
optimizer_st, _ = weight_decay(optimizer_st, c.wd)
|
||||
grad_norm_st, _ = check_update(model.decoder.stopnet, 1.0)
|
||||
optimizer_st.step()
|
||||
else:
|
||||
grad_norm_st = 0
|
||||
|
||||
step_time = time.time() - start_time
|
||||
epoch_time += step_time
|
||||
|
||||
if current_step % c.print_step == 0:
|
||||
print(
|
||||
" | > Step:{}/{} GlobalStep:{} TotalLoss:{:.5f} LinearLoss:{:.5f} "
|
||||
"MelLoss:{:.5f} StopLoss:{:.5f} GradNorm:{:.5f} "
|
||||
"GradNormST:{:.5f} AvgTextLen:{:.1f} AvgSpecLen:{:.1f} StepTime:{:.2f} LR:{:.6f}"
|
||||
.format(num_iter, batch_n_iter, current_step, loss.item(),
|
||||
linear_loss.item(), mel_loss.item(), stop_loss.item(),
|
||||
grad_norm, grad_norm_st, avg_text_length,
|
||||
avg_spec_length, step_time, current_lr),
|
||||
" | > Step:{}/{} GlobalStep:{} TotalLoss:{:.5f} PostnetLoss:{:.5f} "
|
||||
"DecoderLoss:{:.5f} StopLoss:{:.5f} GradNorm:{:.5f} "
|
||||
"GradNormST:{:.5f} AvgTextLen:{:.1f} AvgSpecLen:{:.1f} StepTime:{:.2f} LR:{:.6f}".format(
|
||||
num_iter, batch_n_iter, current_step, loss.item(),
|
||||
postnet_loss.item(), decoder_loss.item(), stop_loss.item(),
|
||||
grad_norm, grad_norm_st, avg_text_length, avg_spec_length, step_time, current_lr),
|
||||
flush=True)
|
||||
|
||||
# aggregate losses from processes
|
||||
if num_gpus > 1:
|
||||
linear_loss = reduce_tensor(linear_loss.data, num_gpus)
|
||||
mel_loss = reduce_tensor(mel_loss.data, num_gpus)
|
||||
postnet_loss = reduce_tensor(postnet_loss.data, num_gpus)
|
||||
decoder_loss = reduce_tensor(decoder_loss.data, num_gpus)
|
||||
loss = reduce_tensor(loss.data, num_gpus)
|
||||
stop_loss = reduce_tensor(stop_loss.data, num_gpus)
|
||||
stop_loss = reduce_tensor(stop_loss.data, num_gpus) if c.stopnet else stop_loss
|
||||
|
||||
if args.rank == 0:
|
||||
avg_linear_loss += float(linear_loss.item())
|
||||
avg_mel_loss += float(mel_loss.item())
|
||||
avg_stop_loss += stop_loss.item()
|
||||
avg_postnet_loss += float(postnet_loss.item())
|
||||
avg_decoder_loss += float(decoder_loss.item())
|
||||
avg_stop_loss += stop_loss if type(stop_loss) is float else float(stop_loss.item())
|
||||
avg_step_time += step_time
|
||||
|
||||
# Plot Training Iter Stats
|
||||
iter_stats = {
|
||||
"loss_posnet": linear_loss.item(),
|
||||
"loss_decoder": mel_loss.item(),
|
||||
iter_stats = {"loss_posnet": postnet_loss.item(),
|
||||
"loss_decoder": decoder_loss.item(),
|
||||
"lr": current_lr,
|
||||
"grad_norm": grad_norm,
|
||||
"grad_norm_st": grad_norm_st,
|
||||
"step_time": step_time
|
||||
}
|
||||
"step_time": step_time}
|
||||
tb_logger.tb_train_iter_stats(current_step, iter_stats)
|
||||
|
||||
if current_step % c.save_step == 0:
|
||||
if c.checkpoint:
|
||||
# save model
|
||||
save_checkpoint(model, optimizer, optimizer_st,
|
||||
linear_loss.item(), OUT_PATH, current_step,
|
||||
postnet_loss.item(), OUT_PATH, current_step,
|
||||
epoch)
|
||||
|
||||
# Diagnostic visualizations
|
||||
const_spec = linear_output[0].data.cpu().numpy()
|
||||
gt_spec = linear_input[0].data.cpu().numpy()
|
||||
const_spec = postnet_output[0].data.cpu().numpy()
|
||||
gt_spec = linear_input[0].data.cpu().numpy() if c.model == "Tacotron" else mel_input[0].data.cpu().numpy()
|
||||
align_img = alignments[0].data.cpu().numpy()
|
||||
|
||||
figures = {
|
||||
|
@ -210,57 +210,61 @@ def train(model, criterion, criterion_st, optimizer, optimizer_st, scheduler,
|
|||
tb_logger.tb_train_figures(current_step, figures)
|
||||
|
||||
# Sample audio
|
||||
tb_logger.tb_train_audios(
|
||||
current_step, {'TrainAudio': ap.inv_spectrogram(const_spec.T)},
|
||||
if c.model == "Tacotron":
|
||||
train_audio = ap.inv_spectrogram(const_spec.T)
|
||||
else:
|
||||
train_audio = ap.inv_mel_spectrogram(const_spec.T)
|
||||
tb_logger.tb_train_audios(current_step,
|
||||
{'TrainAudio': train_audio},
|
||||
c.audio["sample_rate"])
|
||||
|
||||
avg_linear_loss /= (num_iter + 1)
|
||||
avg_mel_loss /= (num_iter + 1)
|
||||
avg_postnet_loss /= (num_iter + 1)
|
||||
avg_decoder_loss /= (num_iter + 1)
|
||||
avg_stop_loss /= (num_iter + 1)
|
||||
avg_total_loss = avg_mel_loss + avg_linear_loss + avg_stop_loss
|
||||
avg_total_loss = avg_decoder_loss + avg_postnet_loss + avg_stop_loss
|
||||
avg_step_time /= (num_iter + 1)
|
||||
|
||||
# print epoch stats
|
||||
print(
|
||||
" | > EPOCH END -- GlobalStep:{} AvgTotalLoss:{:.5f} "
|
||||
"AvgLinearLoss:{:.5f} AvgMelLoss:{:.5f} "
|
||||
"AvgPostnetLoss:{:.5f} AvgDecoderLoss:{:.5f} "
|
||||
"AvgStopLoss:{:.5f} EpochTime:{:.2f} "
|
||||
"AvgStepTime:{:.2f}".format(current_step, avg_total_loss,
|
||||
avg_linear_loss, avg_mel_loss,
|
||||
avg_postnet_loss, avg_decoder_loss,
|
||||
avg_stop_loss, epoch_time, avg_step_time),
|
||||
flush=True)
|
||||
|
||||
# Plot Epoch Stats
|
||||
if args.rank == 0:
|
||||
# Plot Training Epoch Stats
|
||||
epoch_stats = {
|
||||
"loss_postnet": avg_linear_loss,
|
||||
"loss_decoder": avg_mel_loss,
|
||||
epoch_stats = {"loss_postnet": avg_postnet_loss,
|
||||
"loss_decoder": avg_decoder_loss,
|
||||
"stop_loss": avg_stop_loss,
|
||||
"epoch_time": epoch_time
|
||||
}
|
||||
"epoch_time": epoch_time}
|
||||
tb_logger.tb_train_epoch_stats(current_step, epoch_stats)
|
||||
if c.tb_model_param_stats:
|
||||
tb_logger.tb_model_weights(model, current_step)
|
||||
return avg_linear_loss, current_step
|
||||
return avg_postnet_loss, current_step
|
||||
|
||||
|
||||
def evaluate(model, criterion, criterion_st, ap, current_step, epoch):
|
||||
data_loader = setup_loader(is_val=True)
|
||||
model.eval()
|
||||
epoch_time = 0
|
||||
avg_linear_loss = 0
|
||||
avg_mel_loss = 0
|
||||
avg_postnet_loss = 0
|
||||
avg_decoder_loss = 0
|
||||
avg_stop_loss = 0
|
||||
print("\n > Validation")
|
||||
if c.test_sentences_file is None:
|
||||
test_sentences = [
|
||||
"It took me quite a long time to develop a voice, and now that I have it I'm not going to be silent.",
|
||||
"Be a voice, not an echo.",
|
||||
"I'm sorry Dave. I'm afraid I can't do that.",
|
||||
"This cake is great. It's so delicious and moist."
|
||||
]
|
||||
n_priority_freq = int(
|
||||
3000 / (c.audio['sample_rate'] * 0.5) * c.audio['num_freq'])
|
||||
else:
|
||||
with open(c.test_sentences_file, "r") as f:
|
||||
test_sentences = [s.strip() for s in f.readlines()]
|
||||
with torch.no_grad():
|
||||
if data_loader is not None:
|
||||
for num_iter, data in enumerate(data_loader):
|
||||
|
@ -269,7 +273,7 @@ def evaluate(model, criterion, criterion_st, ap, current_step, epoch):
|
|||
# setup input data
|
||||
text_input = data[0]
|
||||
text_lengths = data[1]
|
||||
linear_input = data[2]
|
||||
linear_input = data[2] if c.model == "Tacotron" else None
|
||||
mel_input = data[3]
|
||||
mel_lengths = data[4]
|
||||
stop_targets = data[5]
|
||||
|
@ -278,56 +282,64 @@ def evaluate(model, criterion, criterion_st, ap, current_step, epoch):
|
|||
stop_targets = stop_targets.view(text_input.shape[0],
|
||||
stop_targets.size(1) // c.r,
|
||||
-1)
|
||||
stop_targets = (stop_targets.sum(2) > 0.0).unsqueeze(2).float()
|
||||
stop_targets = (stop_targets.sum(2) > 0.0).unsqueeze(2).float().squeeze(2)
|
||||
|
||||
# dispatch data to GPU
|
||||
if use_cuda:
|
||||
text_input = text_input.cuda()
|
||||
mel_input = mel_input.cuda()
|
||||
mel_lengths = mel_lengths.cuda()
|
||||
linear_input = linear_input.cuda()
|
||||
linear_input = linear_input.cuda() if c.model == "Tacotron" else None
|
||||
stop_targets = stop_targets.cuda()
|
||||
|
||||
# forward pass
|
||||
mel_output, linear_output, alignments, stop_tokens =\
|
||||
model.forward(text_input, mel_input)
|
||||
decoder_output, postnet_output, alignments, stop_tokens =\
|
||||
model.forward(text_input, text_lengths, mel_input)
|
||||
|
||||
# loss computation
|
||||
stop_loss = criterion_st(stop_tokens, stop_targets)
|
||||
mel_loss = criterion(mel_output, mel_input, mel_lengths)
|
||||
linear_loss = 0.5 * criterion(linear_output, linear_input, mel_lengths) \
|
||||
+ 0.5 * criterion(linear_output[:, :, :n_priority_freq],
|
||||
linear_input[:, :, :n_priority_freq],
|
||||
mel_lengths)
|
||||
loss = mel_loss + linear_loss + stop_loss
|
||||
stop_loss = criterion_st(stop_tokens, stop_targets) if c.stopnet else torch.zeros(1)
|
||||
if c.loss_masking:
|
||||
decoder_loss = criterion(decoder_output, mel_input, mel_lengths)
|
||||
if c.model == "Tacotron":
|
||||
postnet_loss = criterion(postnet_output, linear_input, mel_lengths)
|
||||
else:
|
||||
postnet_loss = criterion(postnet_output, mel_input, mel_lengths)
|
||||
else:
|
||||
decoder_loss = criterion(decoder_output, mel_input)
|
||||
if c.model == "Tacotron":
|
||||
postnet_loss = criterion(postnet_output, linear_input)
|
||||
else:
|
||||
postnet_loss = criterion(postnet_output, mel_input)
|
||||
loss = decoder_loss + postnet_loss + stop_loss
|
||||
|
||||
step_time = time.time() - start_time
|
||||
epoch_time += step_time
|
||||
|
||||
if num_iter % c.print_step == 0:
|
||||
print(
|
||||
" | > TotalLoss: {:.5f} LinearLoss: {:.5f} MelLoss:{:.5f} "
|
||||
" | > TotalLoss: {:.5f} PostnetLoss: {:.5f} DecoderLoss:{:.5f} "
|
||||
"StopLoss: {:.5f} ".format(loss.item(),
|
||||
linear_loss.item(),
|
||||
mel_loss.item(),
|
||||
postnet_loss.item(),
|
||||
decoder_loss.item(),
|
||||
stop_loss.item()),
|
||||
flush=True)
|
||||
|
||||
# aggregate losses from processes
|
||||
if num_gpus > 1:
|
||||
linear_loss = reduce_tensor(linear_loss.data, num_gpus)
|
||||
mel_loss = reduce_tensor(mel_loss.data, num_gpus)
|
||||
postnet_loss = reduce_tensor(postnet_loss.data, num_gpus)
|
||||
decoder_loss = reduce_tensor(decoder_loss.data, num_gpus)
|
||||
if c.stopnet:
|
||||
stop_loss = reduce_tensor(stop_loss.data, num_gpus)
|
||||
|
||||
avg_linear_loss += float(linear_loss.item())
|
||||
avg_mel_loss += float(mel_loss.item())
|
||||
avg_postnet_loss += float(postnet_loss.item())
|
||||
avg_decoder_loss += float(decoder_loss.item())
|
||||
avg_stop_loss += stop_loss.item()
|
||||
|
||||
if args.rank == 0:
|
||||
# Diagnostic visualizations
|
||||
idx = np.random.randint(mel_input.shape[0])
|
||||
const_spec = linear_output[idx].data.cpu().numpy()
|
||||
gt_spec = linear_input[idx].data.cpu().numpy()
|
||||
const_spec = postnet_output[idx].data.cpu().numpy()
|
||||
gt_spec = linear_input[idx].data.cpu().numpy() if c.model == "Tacotron" else mel_input[idx].data.cpu().numpy()
|
||||
align_img = alignments[idx].data.cpu().numpy()
|
||||
|
||||
eval_figures = {
|
||||
|
@ -338,21 +350,21 @@ def evaluate(model, criterion, criterion_st, ap, current_step, epoch):
|
|||
tb_logger.tb_eval_figures(current_step, eval_figures)
|
||||
|
||||
# Sample audio
|
||||
tb_logger.tb_eval_audios(
|
||||
current_step, {"ValAudio": ap.inv_spectrogram(const_spec.T)},
|
||||
c.audio["sample_rate"])
|
||||
if c.model == "Tacotron":
|
||||
eval_audio = ap.inv_spectrogram(const_spec.T)
|
||||
else:
|
||||
eval_audio = ap.inv_mel_spectrogram(const_spec.T)
|
||||
tb_logger.tb_eval_audios(current_step, {"ValAudio": eval_audio}, c.audio["sample_rate"])
|
||||
|
||||
# compute average losses
|
||||
avg_linear_loss /= (num_iter + 1)
|
||||
avg_mel_loss /= (num_iter + 1)
|
||||
avg_postnet_loss /= (num_iter + 1)
|
||||
avg_decoder_loss /= (num_iter + 1)
|
||||
avg_stop_loss /= (num_iter + 1)
|
||||
|
||||
# Plot Validation Stats
|
||||
epoch_stats = {
|
||||
"loss_postnet": avg_linear_loss,
|
||||
"loss_decoder": avg_mel_loss,
|
||||
"stop_loss": avg_stop_loss
|
||||
}
|
||||
epoch_stats = {"loss_postnet": avg_postnet_loss,
|
||||
"loss_decoder": avg_decoder_loss,
|
||||
"stop_loss": avg_stop_loss}
|
||||
tb_logger.tb_eval_stats(current_step, epoch_stats)
|
||||
|
||||
if args.rank == 0 and epoch > c.test_delay_epochs:
|
||||
|
@ -362,7 +374,7 @@ def evaluate(model, criterion, criterion_st, ap, current_step, epoch):
|
|||
print(" | > Synthesizing test sentences")
|
||||
for idx, test_sentence in enumerate(test_sentences):
|
||||
try:
|
||||
wav, alignment, linear_spec, _, stop_tokens = synthesis(
|
||||
wav, alignment, decoder_output, postnet_output, stop_tokens = synthesis(
|
||||
model, test_sentence, c, use_cuda, ap)
|
||||
file_path = os.path.join(AUDIO_PATH, str(current_step))
|
||||
os.makedirs(file_path, exist_ok=True)
|
||||
|
@ -370,16 +382,14 @@ def evaluate(model, criterion, criterion_st, ap, current_step, epoch):
|
|||
"TestSentence_{}.wav".format(idx))
|
||||
ap.save_wav(wav, file_path)
|
||||
test_audios['{}-audio'.format(idx)] = wav
|
||||
test_figures['{}-prediction'.format(idx)] = plot_spectrogram(
|
||||
linear_spec, ap)
|
||||
test_figures['{}-alignment'.format(idx)] = plot_alignment(
|
||||
alignment)
|
||||
test_figures['{}-prediction'.format(idx)] = plot_spectrogram(postnet_output, ap)
|
||||
test_figures['{}-alignment'.format(idx)] = plot_alignment(alignment)
|
||||
except:
|
||||
print(" !! Error creating Test Sentence -", idx)
|
||||
traceback.print_exc()
|
||||
tb_logger.tb_test_audios(current_step, test_audios, c.audio['sample_rate'])
|
||||
tb_logger.tb_test_figures(current_step, test_figures)
|
||||
return avg_linear_loss
|
||||
return avg_postnet_loss
|
||||
|
||||
|
||||
def main(args):
|
||||
|
@ -388,65 +398,52 @@ def main(args):
|
|||
init_distributed(args.rank, num_gpus, args.group_id,
|
||||
c.distributed["backend"], c.distributed["url"])
|
||||
num_chars = len(phonemes) if c.use_phonemes else len(symbols)
|
||||
model = Tacotron(
|
||||
num_chars=num_chars,
|
||||
embedding_dim=c.embedding_size,
|
||||
linear_dim=ap.num_freq,
|
||||
mel_dim=ap.num_mels,
|
||||
r=c.r,
|
||||
memory_size=c.memory_size)
|
||||
model = setup_model(num_chars, c)
|
||||
|
||||
print(" | > Num output units : {}".format(ap.num_freq), flush=True)
|
||||
|
||||
optimizer = optim.Adam(model.parameters(), lr=c.lr, weight_decay=0)
|
||||
if c.stopnet and c.separate_stopnet:
|
||||
optimizer_st = optim.Adam(
|
||||
model.decoder.stopnet.parameters(), lr=c.lr, weight_decay=0)
|
||||
else:
|
||||
optimizer_st = None
|
||||
|
||||
criterion = L1LossMasked()
|
||||
criterion_st = nn.BCELoss()
|
||||
if c.loss_masking:
|
||||
criterion = L1LossMasked() if c.model == "Tacotron" else MSELossMasked()
|
||||
else:
|
||||
criterion = nn.L1Loss() if c.model == "Tacotron" else nn.MSELoss()
|
||||
criterion_st = nn.BCEWithLogitsLoss() if c.stopnet else None
|
||||
|
||||
if args.restore_path:
|
||||
checkpoint = torch.load(args.restore_path)
|
||||
try:
|
||||
model.load_state_dict(checkpoint['model'])
|
||||
# TODO: fix optimizer init, model.cuda() needs to be called before
|
||||
# optimizer restore
|
||||
# optimizer.load_state_dict(checkpoint['optimizer'])
|
||||
if len(c.reinit_layers) > 0:
|
||||
raise RuntimeError
|
||||
model.load_state_dict(checkpoint['model'])
|
||||
except:
|
||||
print(" > Partial model initialization.")
|
||||
partial_init_flag = True
|
||||
model_dict = model.state_dict()
|
||||
# Partial initialization: if there is a mismatch with new and old layer, it is skipped.
|
||||
# 1. filter out unnecessary keys
|
||||
pretrained_dict = {
|
||||
k: v
|
||||
for k, v in checkpoint['model'].items() if k in model_dict
|
||||
}
|
||||
# 2. filter out different size layers
|
||||
pretrained_dict = {
|
||||
k: v
|
||||
for k, v in pretrained_dict.items()
|
||||
if v.numel() == model_dict[k].numel()
|
||||
}
|
||||
# 3. overwrite entries in the existing state dict
|
||||
model_dict.update(pretrained_dict)
|
||||
# 4. load the new state dict
|
||||
model_dict = set_init_dict(model_dict, checkpoint, c)
|
||||
model.load_state_dict(model_dict)
|
||||
print(" | > {} / {} layers are initialized".format(
|
||||
len(pretrained_dict), len(model_dict)))
|
||||
if use_cuda:
|
||||
model = model.cuda()
|
||||
criterion.cuda()
|
||||
criterion_st.cuda()
|
||||
del model_dict
|
||||
for group in optimizer.param_groups:
|
||||
group['lr'] = c.lr
|
||||
print(
|
||||
" > Model restored from step %d" % checkpoint['step'], flush=True)
|
||||
start_epoch = checkpoint['epoch']
|
||||
best_loss = checkpoint['linear_loss']
|
||||
args.restore_step = checkpoint['step']
|
||||
else:
|
||||
args.restore_step = 0
|
||||
|
||||
if use_cuda:
|
||||
model = model.cuda()
|
||||
criterion.cuda()
|
||||
criterion_st.cuda()
|
||||
if criterion_st: criterion_st.cuda();
|
||||
|
||||
# DISTRUBUTED
|
||||
if num_gpus > 1:
|
||||
|
@ -497,7 +494,7 @@ if __name__ == '__main__':
|
|||
parser.add_argument(
|
||||
'--debug',
|
||||
type=bool,
|
||||
default=False,
|
||||
default=True,
|
||||
help='Do not verify commit integrity to run training.')
|
||||
parser.add_argument(
|
||||
'--data_path',
|
||||
|
@ -509,6 +506,12 @@ if __name__ == '__main__':
|
|||
type=str,
|
||||
help='path for training outputs.',
|
||||
default='')
|
||||
parser.add_argument(
|
||||
'--output_folder',
|
||||
type=str,
|
||||
default='',
|
||||
help='folder name for traning outputs.'
|
||||
)
|
||||
|
||||
# DISTRUBUTED
|
||||
parser.add_argument(
|
||||
|
@ -534,15 +537,20 @@ if __name__ == '__main__':
|
|||
else:
|
||||
OUT_PATH = args.output_path
|
||||
|
||||
if args.group_id == '':
|
||||
OUT_PATH = create_experiment_folder(OUT_PATH, c.model_name, args.debug)
|
||||
if args.group_id == '' and args.output_folder == '':
|
||||
OUT_PATH = create_experiment_folder(OUT_PATH, c.run_name, args.debug)
|
||||
else:
|
||||
OUT_PATH = os.path.join(OUT_PATH, args.output_folder)
|
||||
|
||||
AUDIO_PATH = os.path.join(OUT_PATH, 'test_audios')
|
||||
|
||||
if args.rank == 0:
|
||||
os.makedirs(AUDIO_PATH, exist_ok=True)
|
||||
shutil.copyfile(args.config_path, os.path.join(OUT_PATH,
|
||||
'config.json'))
|
||||
new_fields = {}
|
||||
if args.restore_path:
|
||||
new_fields["restore_path"] = args.restore_path
|
||||
new_fields["github_branch"] = get_git_branch()
|
||||
copy_config_file(args.config_path, os.path.join(OUT_PATH, 'config.json'), new_fields)
|
||||
os.chmod(AUDIO_PATH, 0o775)
|
||||
os.chmod(OUT_PATH, 0o775)
|
||||
|
||||
|
|
|
@ -1,5 +1,6 @@
|
|||
import os
|
||||
import librosa
|
||||
import soundfile as sf
|
||||
import pickle
|
||||
import copy
|
||||
import numpy as np
|
||||
|
@ -172,6 +173,14 @@ class AudioProcessor(object):
|
|||
else:
|
||||
return self._griffin_lim(S**self.power)
|
||||
|
||||
def out_linear_to_mel(self, linear_spec):
    """Convert a linear spectrogram in the processor's output scale to a mel spectrogram.

    The input is passed through ``_denormalize``, shifted by ``ref_level_db``
    and mapped with ``_db_to_amp``; its absolute value is projected with
    ``_linear_to_mel``; the result is mapped back with ``_amp_to_db``, shifted
    by ``-ref_level_db`` and passed through ``_normalize``.

    Args:
        linear_spec: linear spectrogram as produced by the model
            (presumably a numpy array in normalized scale -- TODO confirm).

    Returns:
        The value returned by ``_normalize`` for the mel-projected spectrogram.
    """
    denormalized = self._denormalize(linear_spec)
    amplitude = self._db_to_amp(denormalized + self.ref_level_db)
    # np.abs guards the mel projection against negative values
    mel_amplitude = self._linear_to_mel(np.abs(amplitude))
    mel_db = self._amp_to_db(mel_amplitude) - self.ref_level_db
    return self._normalize(mel_db)
|
||||
|
||||
def _griffin_lim(self, S):
|
||||
angles = np.exp(2j * np.pi * np.random.rand(*S.shape))
|
||||
S_complex = np.abs(S).astype(np.complex)
|
||||
|
@ -230,7 +239,7 @@ class AudioProcessor(object):
|
|||
# return np.sign(signal) * magnitude
|
||||
|
||||
def load_wav(self, filename, encode=False):
|
||||
x, sr = librosa.load(filename, sr=self.sample_rate)
|
||||
x, sr = sf.read(filename)
|
||||
if self.do_trim_silence:
|
||||
x = self.trim_silence(x)
|
||||
# sr, x = io.wavfile.read(filename)
|
||||
|
|
|
@ -8,6 +8,7 @@ import datetime
|
|||
import json
|
||||
import torch
|
||||
import subprocess
|
||||
import importlib
|
||||
import numpy as np
|
||||
from collections import OrderedDict
|
||||
from torch.autograd import Variable
|
||||
|
@ -31,6 +32,12 @@ def load_config(config_path):
|
|||
return config
|
||||
|
||||
|
||||
def get_git_branch():
    """Return the name of the currently checked-out git branch.

    Runs ``git branch`` in the current working directory and returns the
    entry that git marks with a leading ``*``.

    Returns:
        str: the branch name with the ``"* "`` marker removed, or
        ``"unknown"`` when it cannot be determined (``git`` is not
        installed, the command fails -- e.g. outside a repository -- or no
        line is marked as current).
    """
    try:
        out = subprocess.check_output(["git", "branch"]).decode("utf8")
    except (subprocess.CalledProcessError, OSError):
        # The branch is only recorded as metadata; a missing git binary or a
        # non-repository working directory should not abort the caller.
        return "unknown"
    for line in out.split("\n"):
        if line.startswith("*"):
            return line.replace("* ", "")
    return "unknown"
|
||||
|
||||
|
||||
def get_commit_hash():
|
||||
"""https://stackoverflow.com/questions/14989858/get-the-current-git-hash-in-a-python-script"""
|
||||
# try:
|
||||
|
@ -48,9 +55,9 @@ def get_commit_hash():
|
|||
def create_experiment_folder(root_path, model_name, debug):
|
||||
""" Create a folder with the current date and time """
|
||||
date_str = datetime.datetime.now().strftime("%B-%d-%Y_%I+%M%p")
|
||||
if debug:
|
||||
commit_hash = 'debug'
|
||||
else:
|
||||
# if debug:
|
||||
# commit_hash = 'debug'
|
||||
# else:
|
||||
commit_hash = get_commit_hash()
|
||||
output_folder = os.path.join(
|
||||
root_path, model_name + '-' + date_str + '-' + commit_hash)
|
||||
|
@ -71,10 +78,19 @@ def remove_experiment_folder(experiment_path):
|
|||
print(" ! Run is kept in {}".format(experiment_path))
|
||||
|
||||
|
||||
def copy_config_file(config_file, out_path, new_fields):
    """Copy a config file to ``out_path`` with extra fields injected.

    The file is treated as a list of text lines. Each entry of ``new_fields``
    is written as its own ``"key":value,`` line and inserted at line index 1,
    i.e. directly after the first line of the file (presumably the opening
    ``{`` -- verify against the config format). Because every insertion
    happens at index 1, later entries of ``new_fields`` end up above earlier
    ones.

    Args:
        config_file (str): path of the config file to read.
        out_path (str): path of the file to write.
        new_fields (dict): extra key/value pairs to add to the copy. Values
            must be JSON-serializable.
    """
    with open(config_file, "r") as config_in:
        config_lines = config_in.readlines()
    # add extra information fields
    for key, value in new_fields.items():
        # json.dumps quotes and escapes strings (e.g. backslashes in paths)
        # and renders booleans/None as valid JSON literals.
        new_line = '"{}":{},\n'.format(key, json.dumps(value, ensure_ascii=False))
        config_lines.insert(1, new_line)
    with open(out_path, "w") as config_out:
        config_out.writelines(config_lines)
|
||||
|
||||
|
||||
def _trim_model_state_dict(state_dict):
|
||||
|
@ -99,7 +115,6 @@ def save_checkpoint(model, optimizer, optimizer_st, model_loss, out_path,
|
|||
state = {
|
||||
'model': new_state_dict,
|
||||
'optimizer': optimizer.state_dict(),
|
||||
'optimizer_st': optimizer_st.state_dict(),
|
||||
'step': current_step,
|
||||
'epoch': epoch,
|
||||
'linear_loss': model_loss,
|
||||
|
@ -191,7 +206,72 @@ def sequence_mask(sequence_length, max_len=None):
|
|||
seq_range_expand = seq_range.unsqueeze(0).expand(batch_size, max_len)
|
||||
if sequence_length.is_cuda:
|
||||
seq_range_expand = seq_range_expand.cuda()
|
||||
seq_length_expand = (sequence_length.unsqueeze(1)
|
||||
.expand_as(seq_range_expand))
|
||||
seq_length_expand = (
|
||||
sequence_length.unsqueeze(1).expand_as(seq_range_expand))
|
||||
# B x T_max
|
||||
return seq_range_expand < seq_length_expand
|
||||
|
||||
|
||||
def set_init_dict(model_dict, checkpoint, c):
    """Partially initialize a model state dict from a checkpoint.

    A checkpoint entry is copied into ``model_dict`` only when
      1. its key exists in ``model_dict``,
      2. its tensor has the same shape as the model's tensor, and
      3. its key does not contain any of the names in ``c.reinit_layers``.
    Everything else keeps the value already present in ``model_dict``.

    Args:
        model_dict (dict): state dict of the freshly built model; updated
            in place.
        checkpoint (dict): loaded checkpoint; its ``'model'`` entry is the
            saved state dict.
        c: config object; ``c.reinit_layers`` is an iterable of layer-name
            substrings to leave at their fresh initialization, or None.

    Returns:
        dict: ``model_dict`` with the restorable entries overwritten.
    """
    saved_state = checkpoint['model']
    # Partial initialization: if there is a mismatch with new and old layer, it is skipped.
    for k in saved_state:
        if k not in model_dict:
            print(" | > Layer missing in the model definition: {}".format(k))
    reinit_layers = c.reinit_layers if c.reinit_layers is not None else ()
    # Shapes (not just element counts) must match: a tensor with the same
    # number of elements but a different shape would still be rejected when
    # the state dict is loaded into the model.
    pretrained_dict = {
        k: v
        for k, v in saved_state.items()
        if k in model_dict
        and v.shape == model_dict[k].shape
        and not any(layer_name in k for layer_name in reinit_layers)
    }
    # overwrite entries in the existing state dict
    model_dict.update(pretrained_dict)
    print(" | > {} / {} layers are restored.".format(
        len(pretrained_dict), len(model_dict)))
    return model_dict
|
||||
|
||||
|
||||
def setup_model(num_chars, c):
    """Build the model selected by ``c.model``.

    The module ``models.<c.model lower-cased>`` is imported and the attribute
    named exactly ``c.model`` is taken from it as the model class, which is
    then called with keyword arguments read from the config.

    Args:
        num_chars (int): value passed to the model as ``num_chars``.
        c: config object providing ``model``, ``r``, ``windowing``,
            ``attention_norm``, ``prenet_type``, ``prenet_dropout``,
            ``use_forward_attn``, ``transition_agent``, ``location_attn``,
            ``separate_stopnet`` and, for Tacotron, ``memory_size``.

    Returns:
        The constructed model instance.

    Raises:
        ValueError: if ``c.model`` is neither "Tacotron" nor "Tacotron2"
            (compared case-insensitively).
    """
    print(" > Using model: {}".format(c.model))
    MyModel = importlib.import_module('models.' + c.model.lower())
    MyModel = getattr(MyModel, c.model)
    model_name = c.model.lower()
    if model_name not in ("tacotron", "tacotron2"):
        # Without this check the function would fail on an unbound local.
        raise ValueError(" [!] Unsupported model: {}".format(c.model))
    # keyword arguments shared by both model families
    model_kwargs = dict(
        num_chars=num_chars,
        r=c.r,
        attn_win=c.windowing,
        attn_norm=c.attention_norm,
        prenet_type=c.prenet_type,
        prenet_dropout=c.prenet_dropout,
        forward_attn=c.use_forward_attn,
        trans_agent=c.transition_agent,
        location_attn=c.location_attn,
        separate_stopnet=c.separate_stopnet)
    if model_name == "tacotron":
        # NOTE(review): linear_dim/mel_dim are fixed here rather than read
        # from the audio settings -- confirm they match the configured
        # spectrogram sizes.
        model_kwargs.update(
            linear_dim=1025,
            mel_dim=80,
            memory_size=c.memory_size)
    return MyModel(**model_kwargs)
|
|
@ -8,23 +8,49 @@ from .visual import visualize
|
|||
from matplotlib import pylab as plt
|
||||
|
||||
|
||||
def synthesis(m, s, CONFIG, use_cuda, ap):
|
||||
""" Given the text, synthesising the audio """
|
||||
def synthesis(model, text, CONFIG, use_cuda, ap, truncated=False, enable_eos_bos_chars=False, trim_silence=False):
|
||||
"""Synthesize voice for the given text.
|
||||
|
||||
Args:
|
||||
model (TTS.models): model to synthesize.
|
||||
text (str): target text
|
||||
CONFIG (dict): config dictionary to be loaded from config.json.
|
||||
use_cuda (bool): enable cuda.
|
||||
ap (TTS.utils.audio.AudioProcessor): audio processor to process
|
||||
model outputs.
|
||||
truncated (bool): keep model states after inference. It can be used
|
||||
for continuous inference at long texts.
|
||||
enable_eos_bos_chars (bool): enable special chars for end of sentence and start of sentence.
|
||||
trim_silence (bool): trim silence after synthesis.
|
||||
"""
|
||||
# preprocess the given text
|
||||
text_cleaner = [CONFIG.text_cleaner]
|
||||
if CONFIG.use_phonemes:
|
||||
seq = np.asarray(
|
||||
phoneme_to_sequence(s, text_cleaner, CONFIG.phoneme_language),
|
||||
phoneme_to_sequence(text, text_cleaner, CONFIG.phoneme_language, enable_eos_bos_chars),
|
||||
dtype=np.int32)
|
||||
else:
|
||||
seq = np.asarray(text_to_sequence(s, text_cleaner), dtype=np.int32)
|
||||
seq = np.asarray(text_to_sequence(text, text_cleaner), dtype=np.int32)
|
||||
chars_var = torch.from_numpy(seq).unsqueeze(0)
|
||||
# synthesize voice
|
||||
if use_cuda:
|
||||
chars_var = chars_var.cuda()
|
||||
mel_spec, linear_spec, alignments, stop_tokens = m.forward(
|
||||
if truncated:
|
||||
decoder_output, postnet_output, alignments, stop_tokens = model.inference_truncated(
|
||||
chars_var.long())
|
||||
linear_spec = linear_spec[0].data.cpu().numpy()
|
||||
mel_spec = mel_spec[0].data.cpu().numpy()
|
||||
else:
|
||||
decoder_output, postnet_output, alignments, stop_tokens = model.inference(
|
||||
chars_var.long())
|
||||
# convert outputs to numpy
|
||||
postnet_output = postnet_output[0].data.cpu().numpy()
|
||||
decoder_output = decoder_output[0].data.cpu().numpy()
|
||||
alignment = alignments[0].cpu().data.numpy()
|
||||
wav = ap.inv_spectrogram(linear_spec.T)
|
||||
# plot results
|
||||
if CONFIG.model == "Tacotron":
|
||||
wav = ap.inv_spectrogram(postnet_output.T)
|
||||
else:
|
||||
wav = ap.inv_mel_spectrogram(postnet_output.T)
|
||||
# trim silence
|
||||
if trim_silence:
|
||||
wav = wav[:ap.find_endpoint(wav)]
|
||||
return wav, alignment, linear_spec, mel_spec, stop_tokens
|
||||
return wav, alignment, decoder_output, postnet_output, stop_tokens
|
|
@ -17,7 +17,7 @@ _id_to_phonemes = {i: s for i, s in enumerate(phonemes)}
|
|||
_curly_re = re.compile(r'(.*?)\{(.+?)\}(.*)')
|
||||
|
||||
# Regular expression matching punctuations, ignoring empty space
|
||||
pat = r'['+_phoneme_punctuations[:-1]+']+'
|
||||
pat = r'['+_phoneme_punctuations+']+'
|
||||
|
||||
|
||||
def text2phone(text, language):
|
||||
|
@ -28,31 +28,38 @@ def text2phone(text, language):
|
|||
#try:
|
||||
punctuations = re.findall(pat, text)
|
||||
ph = phonemize(text, separator=seperator, strip=False, njobs=1, backend='espeak', language=language)
|
||||
ph = ph[:-1].strip() # skip the last empty character
|
||||
# Replace \n with matching punctuations.
|
||||
if len(punctuations) > 0:
|
||||
# if text ends with a punctuation.
|
||||
if text[-1] == punctuations[-1]:
|
||||
for punct in punctuations[:-1]:
|
||||
ph = ph.replace('| |\n', '|'+punct+'| |', 1)
|
||||
try:
|
||||
ph = ph[:-1] + punctuations[-1]
|
||||
ph = ph + punctuations[-1]
|
||||
except:
|
||||
print(text)
|
||||
else:
|
||||
for punct in punctuations:
|
||||
ph = ph.replace('| |\n', '|'+punct+'| |', 1)
|
||||
return ph
|
||||
|
||||
|
||||
def phoneme_to_sequence(text, cleaner_names, language):
|
||||
'''
|
||||
TODO: This ignores punctuations
|
||||
'''
|
||||
def phoneme_to_sequence(text, cleaner_names, language, enable_eos_bos=False):
|
||||
if enable_eos_bos:
|
||||
sequence = [_phonemes_to_id['^']]
|
||||
else:
|
||||
sequence = []
|
||||
text = text.replace(":", "")
|
||||
clean_text = _clean_text(text, cleaner_names)
|
||||
phonemes = text2phone(clean_text, language)
|
||||
# print(phonemes.replace('|', ''))
|
||||
if phonemes is None:
|
||||
print("!! After phoneme conversion the result is None. -- {} ".format(clean_text))
|
||||
for phoneme in phonemes.split('|'):
|
||||
# print(word, ' -- ', phonemes_text)
|
||||
# iterate by skipping empty strings - NOTE: might be useful to keep it to have a better intonation.
|
||||
for phoneme in filter(None, phonemes.split('|')):
|
||||
sequence += _phoneme_to_sequence(phoneme)
|
||||
# Append EOS char
|
||||
if enable_eos_bos:
|
||||
sequence.append(_phonemes_to_id['~'])
|
||||
return sequence
|
||||
|
||||
|
@ -81,7 +88,6 @@ def text_to_sequence(text, cleaner_names):
|
|||
List of integers corresponding to the symbols in the text
|
||||
'''
|
||||
sequence = []
|
||||
|
||||
# Check for curly braces and treat their contents as ARPAbet:
|
||||
while len(text):
|
||||
m = _curly_re.match(text)
|
||||
|
@ -92,9 +98,6 @@ def text_to_sequence(text, cleaner_names):
|
|||
_clean_text(m.group(1), cleaner_names))
|
||||
sequence += _arpabet_to_sequence(m.group(2))
|
||||
text = m.group(3)
|
||||
|
||||
# Append EOS token
|
||||
sequence.append(_symbol_to_id['~'])
|
||||
return sequence
|
||||
|
||||
|
||||
|
@ -133,8 +136,8 @@ def _arpabet_to_sequence(text):
|
|||
|
||||
|
||||
def _should_keep_symbol(s):
|
||||
return s in _symbol_to_id and s is not '_' and s is not '~'
|
||||
return s in _symbol_to_id and s not in ['~', '^', '_']
|
||||
|
||||
|
||||
def _should_keep_phoneme(p):
|
||||
return p in _phonemes_to_id and p is not '_' and p is not '~'
|
||||
return p in _phonemes_to_id and p not in ['~', '^', '_']
|
||||
|
|
|
@ -56,7 +56,7 @@ def lowercase(text):
|
|||
|
||||
|
||||
def collapse_whitespace(text):
|
||||
return re.sub(_whitespace_re, ' ', text)
|
||||
return re.sub(_whitespace_re, ' ', text).strip()
|
||||
|
||||
|
||||
def convert_to_ascii(text):
|
||||
|
|
|
@ -8,6 +8,7 @@ through Unidecode. For other data, you can modify _characters. See TRAINING_DATA
|
|||
|
||||
_pad = '_'
|
||||
_eos = '~'
|
||||
_bos = '^'
|
||||
_characters = 'ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz!\'(),-.:;? '
|
||||
_punctuations = '!\'(),-.:;? '
|
||||
_phoneme_punctuations = '.!;:,?'
|
||||
|
@ -25,8 +26,12 @@ _phonemes = sorted(list(_vowels + _non_pulmonic_consonants + _pulmonic_consonant
|
|||
_arpabet = ['@' + s for s in _phonemes]
|
||||
|
||||
# Export all symbols:
|
||||
symbols = [_pad, _eos] + list(_characters) + _arpabet
|
||||
phonemes = [_pad, _eos] + _phonemes + list(_punctuations)
|
||||
symbols = [_pad, _eos, _bos] + list(_characters) + _arpabet
|
||||
phonemes = [_pad, _eos, _bos] + list(_phonemes) + list(_punctuations)
|
||||
|
||||
# Generate ALIEN language
|
||||
# from random import shuffle
|
||||
# shuffle(phonemes)
|
||||
|
||||
if __name__ == '__main__':
|
||||
print(" > TTS symbols {}".format(len(symbols)))
|
||||
|
|
|
@ -30,22 +30,23 @@ def plot_spectrogram(linear_output, audio):
|
|||
return fig
|
||||
|
||||
|
||||
def visualize(alignment, spectrogram_postnet, stop_tokens, text, hop_length, CONFIG, spectrogram=None):
|
||||
def visualize(alignment, spectrogram_postnet, stop_tokens, text, hop_length, CONFIG, spectrogram=None, output_path=None):
|
||||
if spectrogram is not None:
|
||||
num_plot = 4
|
||||
else:
|
||||
num_plot = 3
|
||||
|
||||
label_fontsize = 16
|
||||
plt.figure(figsize=(16, 48))
|
||||
fig = plt.figure(figsize=(8, 24))
|
||||
|
||||
plt.subplot(num_plot, 1, 1)
|
||||
plt.imshow(alignment.T, aspect="auto", origin="lower", interpolation=None)
|
||||
plt.xlabel("Decoder timestamp", fontsize=label_fontsize)
|
||||
plt.ylabel("Encoder timestamp", fontsize=label_fontsize)
|
||||
if CONFIG.use_phonemes:
|
||||
seq = phoneme_to_sequence(text, [CONFIG.text_cleaner], CONFIG.phoneme_language)
|
||||
seq = phoneme_to_sequence(text, [CONFIG.text_cleaner], CONFIG.phoneme_language, CONFIG.enable_eos_bos_chars)
|
||||
text = sequence_to_phoneme(seq)
|
||||
print(text)
|
||||
plt.yticks(range(len(text)), list(text))
|
||||
plt.colorbar()
|
||||
|
||||
|
@ -69,3 +70,8 @@ def visualize(alignment, spectrogram_postnet, stop_tokens, text, hop_length, CON
|
|||
plt.ylabel("Hz", fontsize=label_fontsize)
|
||||
plt.tight_layout()
|
||||
plt.colorbar()
|
||||
|
||||
if output_path:
|
||||
print(output_path)
|
||||
fig.savefig(output_path)
|
||||
plt.close()
|
||||
|
|
Loading…
Reference in New Issue