From 73581cd94cea080ffb09be443f3817d2ef3fb7dc Mon Sep 17 00:00:00 2001
From: erogol
Date: Thu, 29 Oct 2020 16:50:07 +0100
Subject: [PATCH] renaming train scripts and updating tests

---
 README.md                                        | 16 +++++++++-------
 TTS/bin/{train_tts.py => train_tacotron.py}      | 13 +++++--------
 ...train_gan_vocoder.py => train_vocoder_gan.py} |  0
 ...ain_wavegrad.py => train_vocoder_wavegrad.py} |  8 +++-----
 ...vernn_vocoder.py => train_vocoder_wavernn.py} |  0
 TTS/tts/configs/config.json                      |  9 ++++++---
 run_tests.sh                                     |  3 ++-
 tests/inputs/test_train_config.json              | 10 ++++++++++
 ...{test_tts_train.sh => test_tacotron_train.sh} |  0
 tests/test_vocoder_gan_train.sh                  |  4 ++--
 tests/test_vocoder_wavernn_train.sh              |  4 ++--
 11 files changed, 39 insertions(+), 28 deletions(-)
 rename TTS/bin/{train_tts.py => train_tacotron.py} (98%)
 rename TTS/bin/{train_gan_vocoder.py => train_vocoder_gan.py} (100%)
 rename TTS/bin/{train_wavegrad.py => train_vocoder_wavegrad.py} (99%)
 rename TTS/bin/{train_wavernn_vocoder.py => train_vocoder_wavernn.py} (100%)
 rename tests/{test_tts_train.sh => test_tacotron_train.sh} (100%)

diff --git a/README.md b/README.md
index 5b048c42..7488103c 100644
--- a/README.md
+++ b/README.md
@@ -150,23 +150,25 @@ head -n 12000 metadata_shuf.csv > metadata_train.csv
 tail -n 1100 metadata_shuf.csv > metadata_val.csv
 ```
 
-To train a new model, you need to define your own ```config.json``` file (check the example) and call with the command below. You also set the model architecture in ```config.json```.
+To train a new model, you need to write your own ```config.json``` file that sets the model details, training configuration and more (check the examples). Then call the corresponding train script.
 
-```python TTS/bin/train_tts.py --config_path TTS/tts/configs/config.json```
+For instance, to train a Tacotron or Tacotron2 model on the LJSpeech dataset, run the command below.
+
+```python TTS/bin/train_tacotron.py --config_path TTS/tts/configs/config.json```
 
 To fine-tune a model, use ```--restore_path```.
 
-```python TTS/bin/train_tts.py --config_path TTS/tts/configs/config.json --restore_path /path/to/your/model.pth.tar```
+```python TTS/bin/train_tacotron.py --config_path TTS/tts/configs/config.json --restore_path /path/to/your/model.pth.tar```
 
 To continue an old training run, use ```--continue_path```.
 
-```python TTS/bin/train_tts.py --continue_path /path/to/your/run_folder/```
+```python TTS/bin/train_tacotron.py --continue_path /path/to/your/run_folder/```
 
-For multi-GPU training use ```distribute.py```. It enables process based multi-GPU training where each process uses a single GPU.
+For multi-GPU training, call ```distribute.py```. It runs any of the provided train scripts in a multi-GPU setting.
 
-```CUDA_VISIBLE_DEVICES="0,1,4" TTS/bin/distribute.py --script train_tts.py --config_path TTS/tts/configs/config.json```
+```CUDA_VISIBLE_DEVICES="0,1,4" python TTS/bin/distribute.py --script train_tacotron.py --config_path TTS/tts/configs/config.json```
 
-Each run creates a new output folder and ```config.json``` is copied under this folder.
+Each run creates a new output folder containing the used ```config.json```, model checkpoints and tensorboard logs. If the run errors out or is interrupted before any checkpoint is written to the output folder, the whole folder is removed.
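Note on the ```distribute.py``` usage in the README hunk above: the README describes process-based multi-GPU training where each process is pinned to a single GPU. Below is a minimal sketch of that launching pattern, assuming a simple subprocess-per-GPU design; the ```launch``` helper and the ```--rank``` flag are hypothetical illustrations, not the actual ```TTS/bin/distribute.py``` interface.

```python
import os
import subprocess
import sys

def launch(script, config_path, gpu_ids):
    """Spawn one training process per GPU, each seeing only a single device."""
    procs = []
    for rank, gpu in enumerate(gpu_ids):
        env = os.environ.copy()
        env["CUDA_VISIBLE_DEVICES"] = str(gpu)   # isolate one GPU per process
        cmd = [sys.executable, script,
               "--config_path", config_path,
               "--rank", str(rank)]              # hypothetical rank flag
        procs.append(subprocess.Popen(cmd, env=env))
    for p in procs:
        p.wait()

if __name__ == "__main__":
    launch("TTS/bin/train_tacotron.py", "TTS/tts/configs/config.json", [0, 1, 4])
```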
diff --git a/TTS/bin/train_tts.py b/TTS/bin/train_tacotron.py
similarity index 98%
rename from TTS/bin/train_tts.py
rename to TTS/bin/train_tacotron.py
index 8029ab21..dd9f0e55 100644
--- a/TTS/bin/train_tts.py
+++ b/TTS/bin/train_tacotron.py
@@ -7,27 +7,25 @@ import os
 import sys
 import time
 import traceback
+from random import randrange
 
 import numpy as np
 import torch
-
-from random import randrange
 from torch.utils.data import DataLoader
 
 from TTS.tts.datasets.preprocess import load_meta_data
 from TTS.tts.datasets.TTSDataset import MyDataset
 from TTS.tts.layers.losses import TacotronLoss
-from TTS.tts.utils.distribute import (DistributedSampler,
-                                      apply_gradient_allreduce,
-                                      init_distributed, reduce_tensor)
-from TTS.tts.utils.generic_utils import setup_model, check_config_tts
+from TTS.tts.utils.generic_utils import check_config_tts, setup_model
 from TTS.tts.utils.io import save_best_model, save_checkpoint
 from TTS.tts.utils.measures import alignment_diagonal_score
-from TTS.tts.utils.speakers import parse_speakers, load_speaker_mapping
+from TTS.tts.utils.speakers import load_speaker_mapping, parse_speakers
 from TTS.tts.utils.synthesis import synthesis
 from TTS.tts.utils.text.symbols import make_symbols, phonemes, symbols
 from TTS.tts.utils.visual import plot_alignment, plot_spectrogram
 from TTS.utils.audio import AudioProcessor
 from TTS.utils.console_logger import ConsoleLogger
+from TTS.utils.distribute import (DistributedSampler, apply_gradient_allreduce,
+                                  init_distributed, reduce_tensor)
 from TTS.utils.generic_utils import (KeepAverage, count_parameters,
                                      create_experiment_folder, get_git_branch,
                                      remove_experiment_folder, set_init_dict)
@@ -38,7 +36,6 @@ from TTS.utils.training import (NoamLR, adam_weight_decay, check_update,
                                 gradual_training_scheduler, set_weight_decay,
                                 setup_torch_training_env)
 
-
 use_cuda, num_gpus = setup_torch_training_env(True, False)
diff --git a/TTS/bin/train_gan_vocoder.py b/TTS/bin/train_vocoder_gan.py
similarity index 100%
rename from TTS/bin/train_gan_vocoder.py
rename to TTS/bin/train_vocoder_gan.py
diff --git a/TTS/bin/train_wavegrad.py b/TTS/bin/train_vocoder_wavegrad.py
similarity index 99%
rename from TTS/bin/train_wavegrad.py
rename to TTS/bin/train_vocoder_wavegrad.py
index 13434979..96191569 100644
--- a/TTS/bin/train_wavegrad.py
+++ b/TTS/bin/train_vocoder_wavegrad.py
@@ -132,10 +132,6 @@ def train(model, criterion, optimizer,
 
         optimizer.zero_grad()
 
-        # schedule update
-        if scheduler is not None:
-            scheduler.step()
-
         # backward pass with loss scaling
         if c.mixed_precision:
             scaler.scale(loss).backward()
@@ -150,7 +146,9 @@ def train(model, criterion, optimizer,
                                         c.clip_grad)
             optimizer.step()
 
-
+        # schedule update
+        if scheduler is not None:
+            scheduler.step()
 
         # disconnect loss values
         loss_dict = dict()
diff --git a/TTS/bin/train_wavernn_vocoder.py b/TTS/bin/train_vocoder_wavernn.py
similarity index 100%
rename from TTS/bin/train_wavernn_vocoder.py
rename to TTS/bin/train_vocoder_wavernn.py
diff --git a/TTS/tts/configs/config.json b/TTS/tts/configs/config.json
index 1b63b037..55f9306c 100644
--- a/TTS/tts/configs/config.json
+++ b/TTS/tts/configs/config.json
@@ -68,11 +68,14 @@
     "apex_amp_level": null, // level of optimization with NVIDIA's apex feature for automatic mixed FP16/FP32 precision (AMP), NOTE: currently only O1 is supported, and use "O1" to activate.
 
     // LOSS SETTINGS
-    "loss_masking": true,       // enable / disable loss masking against the sequence padding.
+    "loss_masking": false,      // enable / disable loss masking against the sequence padding.
"decoder_loss_alpha": 0.5, // decoder loss weight. If > 0, it is enabled "postnet_loss_alpha": 0.25, // postnet loss weight. If > 0, it is enabled - "ga_alpha": 5.0, // weight for guided attention loss. If > 0, guided attention is enabled. - "diff_spec_alpha": 0.25, // differential spectral loss weight. If > 0, it is enabled + "ga_alpha": 10.0, // weight for guided attention loss. If > 0, guided attention is enabled. + "decoder_diff_spec_alpha": 0.25, // differential spectral loss weight. If > 0, it is enabled + "postnet_diff_spec_alpha": 0.25, // differential spectral loss weight. If > 0, it is enabled + "decoder_ssim_alpha": 0.5, // differential spectral loss weight. If > 0, it is enabled + "postnet_ssim_alpha": 0.25, // differential spectral loss weight. If > 0, it is enabled // VALIDATION "run_eval": true, diff --git a/run_tests.sh b/run_tests.sh index 998d8ec4..46f18f01 100755 --- a/run_tests.sh +++ b/run_tests.sh @@ -6,9 +6,10 @@ TF_CPP_MIN_LOG_LEVEL=3 # runtime tests ./tests/test_server_package.sh && \ ./tests/test_tts_train.sh && \ +./tests/test_glow-tts_train.sh && \ ./tests/test_vocoder_gan_train.sh && \ ./tests/test_vocoder_wavernn_train.sh && \ -./tests/test_glow-tts_train.sh && \ +./tests/test_vocoder_wavegrad_train.sh && \ # linter check cardboardlinter --refspec master \ No newline at end of file diff --git a/tests/inputs/test_train_config.json b/tests/inputs/test_train_config.json index ddb71384..2e2d6d46 100644 --- a/tests/inputs/test_train_config.json +++ b/tests/inputs/test_train_config.json @@ -74,6 +74,16 @@ "test_delay_epochs": 0, //Until attention is aligned, testing only wastes computation time. "test_sentences_file": null, // set a file to load sentences to be used for testing. If it is null then we use default english sentences. + // LOSS SETTINGS + "loss_masking": false, // enable / disable loss masking against the sequence padding. + "decoder_loss_alpha": 0.5, // decoder loss weight. If > 0, it is enabled + "postnet_loss_alpha": 0.25, // postnet loss weight. If > 0, it is enabled + "ga_alpha": 10.0, // weight for guided attention loss. If > 0, guided attention is enabled. + "decoder_diff_spec_alpha": 0.25, // differential spectral loss weight. If > 0, it is enabled + "postnet_diff_spec_alpha": 0.25, // differential spectral loss weight. If > 0, it is enabled + "decoder_ssim_alpha": 0.5, // differential spectral loss weight. If > 0, it is enabled + "postnet_ssim_alpha": 0.25, // differential spectral loss weight. If > 0, it is enabled + // OPTIMIZER "noam_schedule": false, // use noam warmup and lr schedule. "grad_clip": 1.0, // upper limit for gradients for clipping. 
diff --git a/tests/test_tts_train.sh b/tests/test_tacotron_train.sh
similarity index 100%
rename from tests/test_tts_train.sh
rename to tests/test_tacotron_train.sh
diff --git a/tests/test_vocoder_gan_train.sh b/tests/test_vocoder_gan_train.sh
index 75773cc3..474ef9a7 100755
--- a/tests/test_vocoder_gan_train.sh
+++ b/tests/test_vocoder_gan_train.sh
@@ -5,11 +5,11 @@ echo "$BASEDIR"
 # create run dir
 mkdir $BASEDIR/train_outputs
 # run training
-CUDA_VISIBLE_DEVICES="" python TTS/bin/train_gan_vocoder.py --config_path $BASEDIR/inputs/test_vocoder_multiband_melgan_config.json
+CUDA_VISIBLE_DEVICES="" python TTS/bin/train_vocoder_gan.py --config_path $BASEDIR/inputs/test_vocoder_multiband_melgan_config.json
 # find the training folder
 LATEST_FOLDER=$(ls $BASEDIR/train_outputs/| sort | tail -1)
 echo $LATEST_FOLDER
 # continue the previous training
-CUDA_VISIBLE_DEVICES="" python TTS/bin/train_gan_vocoder.py --continue_path $BASEDIR/train_outputs/$LATEST_FOLDER
+CUDA_VISIBLE_DEVICES="" python TTS/bin/train_vocoder_gan.py --continue_path $BASEDIR/train_outputs/$LATEST_FOLDER
 # remove all the outputs
 rm -rf $BASEDIR/train_outputs/$LATEST_FOLDER
diff --git a/tests/test_vocoder_wavernn_train.sh b/tests/test_vocoder_wavernn_train.sh
index f2e32116..ffa30d40 100755
--- a/tests/test_vocoder_wavernn_train.sh
+++ b/tests/test_vocoder_wavernn_train.sh
@@ -5,11 +5,11 @@ echo "$BASEDIR"
 # create run dir
 mkdir $BASEDIR/train_outputs
 # run training
-CUDA_VISIBLE_DEVICES="" python TTS/bin/train_wavernn_vocoder.py --config_path $BASEDIR/inputs/test_vocoder_wavernn_config.json
+CUDA_VISIBLE_DEVICES="" python TTS/bin/train_vocoder_wavernn.py --config_path $BASEDIR/inputs/test_vocoder_wavernn_config.json
 # find the training folder
 LATEST_FOLDER=$(ls $BASEDIR/train_outputs/| sort | tail -1)
 echo $LATEST_FOLDER
 # continue the previous training
-CUDA_VISIBLE_DEVICES="" python TTS/bin/train_wavernn_vocoder.py --continue_path $BASEDIR/train_outputs/$LATEST_FOLDER
+CUDA_VISIBLE_DEVICES="" python TTS/bin/train_vocoder_wavernn.py --continue_path $BASEDIR/train_outputs/$LATEST_FOLDER
 # remove all the outputs
 rm -rf $BASEDIR/train_outputs/$LATEST_FOLDER
\ No newline at end of file
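One remark on the ```train_vocoder_wavegrad.py``` hunks earlier in this patch: they move ```scheduler.step()``` from before the backward pass to after ```optimizer.step()```, which matches the ordering PyTorch has required since 1.1.0 (calling the scheduler first skips the initial learning-rate value and triggers a warning). A minimal standalone sketch of the corrected order, with a dummy model and loop standing in for the real training code:

```python
import torch

# Dummy stand-ins for the real model/optimizer/scheduler setup.
model = torch.nn.Linear(10, 1)
optimizer = torch.optim.Adam(model.parameters(), lr=1e-4)
scheduler = torch.optim.lr_scheduler.StepLR(optimizer, step_size=1000)

for step in range(5):  # dummy training loop
    optimizer.zero_grad()
    loss = model(torch.randn(4, 10)).mean()
    loss.backward()
    optimizer.step()            # update the weights first...
    if scheduler is not None:
        scheduler.step()        # ...then advance the LR schedule
```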