Merge branch 'dev' into main

pull/430/head v0.0.11
Eren Gölge 2021-04-02 16:42:37 +02:00
commit 2344379cb8
79 changed files with 2317 additions and 3925 deletions

5
.github/stale.yml vendored

@ -1,5 +1,5 @@
# Number of days of inactivity before an issue becomes stale
daysUntilStale: 60
daysUntilStale: 30
# Number of days of inactivity before a stale issue is closed
daysUntilClose: 7
# Issues with these labels will never be considered stale
@ -12,8 +12,7 @@ staleLabel: wontfix
markComment: >
This issue has been automatically marked as stale because it has not had
recent activity. It will be closed if no further activity occurs. Thank you
for your contributions. You might also look our discourse page for further help.
https://discourse.mozilla.org/c/tts
for your contributions. You might also check our discussion channels.
# Comment to post when closing a stale issue. Set to `false` to disable
closeComment: false


@ -35,7 +35,8 @@ jobs:
- name: Install dependencies
run: |
sudo apt update
sudo apt install espeak-ng git
sudo apt install -y espeak-ng git
sudo apt install -y python3-wheel gcc
- name: Upgrade pip
# so we can take advantage of pyproject.toml build-dependency support
run: python3 -m pip install --upgrade pip
@ -45,7 +46,7 @@ jobs:
python3 setup.py egg_info
- name: Lint check
run: |
cardboardlinter -n auto
cardboardlinter
- name: Unit tests
run: nosetests tests --nocapture --processes=0 --process-timeout=20 --process-restartworker
- name: Test scripts
@ -57,3 +58,5 @@ jobs:
./tests/test_vocoder_wavegrad_train.sh
./tests/test_vocoder_wavernn_train.sh
./tests/test_speedy_speech_train.sh
./tests/test_resample.sh
./tests/test_compute_statistics.sh

4
.gitignore vendored

@ -130,4 +130,6 @@ TODO.txt
data/*
notebooks/data/*
TTS/tts/layers/glow_tts/monotonic_align/core.c
temp_build/*
.vscode-upload.json
temp_build/*
recipes/*


@ -74,12 +74,14 @@ Underlined "TTS*" and "Judy*" are 🐸TTS models
- Tacotron2: [paper](https://arxiv.org/abs/1712.05884)
- Glow-TTS: [paper](https://arxiv.org/abs/2005.11129)
- Speedy-Speech: [paper](https://arxiv.org/abs/2008.03802)
- Align-TTS: [paper](https://arxiv.org/abs/2003.01950)
### Attention Methods
- Guided Attention: [paper](https://arxiv.org/abs/1710.08969)
- Forward Backward Decoding: [paper](https://arxiv.org/abs/1907.09006)
- Graves Attention: [paper](https://arxiv.org/abs/1907.09006)
- Double Decoder Consistency: [blog](https://erogol.com/solving-attention-problems-of-tts-models-with-double-decoder-consistency/)
- Dynamic Convolutional Attention: [paper](https://arxiv.org/pdf/1910.10288.pdf)
### Speaker Encoder
- GE2E: [paper](https://arxiv.org/abs/1710.10467)
@ -174,7 +176,7 @@ Run a tts and a vocoder model from the released model list. (Simply copy and pas
tts --text "Text for TTS" \
--model_name "<type>/<language>/<dataset>/<model_name>" \
--vocoder_name "<type>/<language>/<dataset>/<model_name>" \
--out_path folder/to/save/output/
--out_path folder/to/save/output.wav
```
Run your own TTS model (Using Griffin-Lim Vocoder)
@ -182,7 +184,7 @@ Run your own TTS model (Using Griffin-Lim Vocoder)
tts --text "Text for TTS" \
--model_path path/to/model.pth.tar \
--config_path path/to/config.json \
--out_path output/path/speech.wav
--out_path folder/to/save/output.wav
```
Run your own TTS and Vocoder models
@ -190,7 +192,7 @@ Run your own TTS and Vocoder models
tts --text "Text for TTS" \
--model_path path/to/model.pth.tar \
--config_path path/to/config.json \
--out_path output/path/speech.wav \
--out_path folder/to/save/output.wav \
--vocoder_path path/to/vocoder.pth.tar \
--vocoder_config_path path/to/vocoder_config.json
```
@ -263,7 +265,6 @@ cardboardlinter --refspec master
Feel free to ping us at any step you need help using our communication channels.
[Here](https://github.com/firstcontributions/first-contributions) is a good resource for complete beginners.
### Acknowledgement
- https://github.com/keithito/tacotron (Dataset pre-processing)
- https://github.com/r9y9/tacotron_pytorch (Initial Tacotron architecture)


@ -94,6 +94,16 @@
"contact": "egolge@coqui.com"
}
}
},
"de":{
"thorsten":{
"tacotron2-DCA":{
"github_rls_url": "https://github.com/coqui-ai/TTS/releases/download/v0.0.11/tts_models--de--thorsten--tacotron2-DCA.zip",
"default_vocoder": "vocoder_models/de/thorsten/wavegrad",
"author": "@thorstenMueller",
"commit": "unknown"
}
}
}
},
"vocoder_models":{
@ -141,6 +151,15 @@
"commit": "unknown"
}
}
},
"de":{
"thorsten":{
"wavegrad":{
"github_rls_url": "https://github.com/coqui-ai/TTS/releases/download/v0.0.11/vocoder_models--de--thorsten--wavegrad.zip",
"author": "@thorstenMueller",
"commit": "unknown"
}
}
}
}
}


@ -1,11 +1,7 @@
# %%
# %%
import argparse
from difflib import SequenceMatcher
import os
import sys
# %%
# print variable match
from pprint import pprint
import numpy as np

68
TTS/bin/resample.py Normal file

@ -0,0 +1,68 @@
import argparse
import glob
import os
import librosa
from distutils.dir_util import copy_tree
from argparse import RawTextHelpFormatter
from multiprocessing import Pool
from tqdm import tqdm
def resample_file(func_args):
filename, output_sr = func_args
y, sr = librosa.load(filename, sr=output_sr)
librosa.output.write_wav(filename, y, sr)
if __name__ == '__main__':
parser = argparse.ArgumentParser(
description='''Resample a folder recursively with librosa
Can be used in place or create a copy of the folder as an output.\n\n
Example run:
python TTS/bin/resample.py
--input_dir /root/LJSpeech-1.1/
--output_sr 22050
--output_dir /root/resampled_LJSpeech-1.1/
--n_jobs 24
''',
formatter_class=RawTextHelpFormatter)
parser.add_argument('--input_dir',
type=str,
default=None,
required=True,
help='Path of the folder containing the audio files to resample')
parser.add_argument('--output_sr',
type=int,
default=22050,
required=False,
help='Sample rate to which the audio files should be resampled')
parser.add_argument('--output_dir',
type=str,
default=None,
required=False,
help='Path of the destination folder. If not defined, the operation is done in place')
parser.add_argument('--n_jobs',
type=int,
default=None,
help='Number of threads to use, by default it uses all cores')
args = parser.parse_args()
if args.output_dir:
print('Recursively copying the input folder...')
copy_tree(args.input_dir, args.output_dir)
args.input_dir = args.output_dir
print('Resampling the audio files...')
audio_files = glob.glob(os.path.join(args.input_dir, '**/*.wav'), recursive=True)
print(f'Found {len(audio_files)} files...')
audio_files = list(zip(audio_files, len(audio_files)*[args.output_sr]))
with Pool(processes=args.n_jobs) as p:
with tqdm(total=len(audio_files)) as pbar:
for i, _ in enumerate(p.imap_unordered(resample_file, audio_files)):
pbar.update()
print('Done !')
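
One caveat on the write call above: `librosa.output.write_wav` was removed in librosa 0.8, so on newer environments the in-place write fails. A minimal drop-in sketch using `soundfile` (an assumption about the target environment; it is not used by the script in this diff) could look like:

```python
import librosa
import soundfile as sf  # assumed to be installed; not used by the script above

def resample_file(func_args):
    """Resample one file in place, writing with soundfile instead of the
    librosa.output API that was removed in librosa 0.8."""
    filename, output_sr = func_args
    y, _ = librosa.load(filename, sr=output_sr)  # load and resample to output_sr
    sf.write(filename, y, output_sr)             # overwrite the original file
```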


@ -2,9 +2,7 @@
# -*- coding: utf-8 -*-
import argparse
import os
import sys
import string
from argparse import RawTextHelpFormatter
# pylint: disable=redefined-outer-name, unused-argument
from pathlib import Path
@ -103,8 +101,8 @@ def main():
parser.add_argument(
'--out_path',
type=str,
default=Path(__file__).resolve().parent,
help='Path to save final wav file. Wav file will be named as the given text.',
default='tts_output.wav',
help='Output wav file path.',
)
parser.add_argument(
'--use_cuda',
@ -218,12 +216,8 @@ def main():
wav = synthesizer.tts(args.text)
# save the results
file_name = args.text.replace(" ", "_")[0:20]
file_name = file_name.translate(
str.maketrans('', '', string.punctuation.replace('_', ''))) + '.wav'
out_path = os.path.join(args.out_path, file_name)
print(" > Saving output to {}".format(out_path))
synthesizer.save_wav(wav, out_path,)
print(" > Saving output to {}".format(args.out_path))
synthesizer.save_wav(wav, args.out_path)
if __name__ == "__main__":

625
TTS/bin/train_align_tts.py Normal file

@ -0,0 +1,625 @@
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
import os
import sys
import time
import traceback
from random import randrange
import numpy as np
import torch
from torch.nn.parallel import DistributedDataParallel as DDP_th
from torch.utils.data import DataLoader
from torch.utils.data.distributed import DistributedSampler
from TTS.tts.datasets.preprocess import load_meta_data
from TTS.tts.datasets.TTSDataset import MyDataset
from TTS.tts.layers.losses import AlignTTSLoss
from TTS.tts.utils.generic_utils import setup_model
from TTS.tts.utils.io import save_best_model, save_checkpoint
from TTS.tts.utils.measures import alignment_diagonal_score
from TTS.tts.utils.speakers import parse_speakers
from TTS.tts.utils.synthesis import synthesis
from TTS.tts.utils.text.symbols import make_symbols, phonemes, symbols
from TTS.tts.utils.visual import plot_alignment, plot_spectrogram
from TTS.utils.arguments import parse_arguments, process_args
from TTS.utils.audio import AudioProcessor
from TTS.utils.distribute import init_distributed, reduce_tensor
from TTS.utils.generic_utils import (KeepAverage, count_parameters,
remove_experiment_folder, set_init_dict)
from TTS.utils.radam import RAdam
from TTS.utils.training import NoamLR, setup_torch_training_env
if __name__ == '__main__':
use_cuda, num_gpus = setup_torch_training_env(True, False)
# torch.autograd.set_detect_anomaly(True)
def setup_loader(ap, r, is_val=False, verbose=False):
if is_val and not c.run_eval:
loader = None
else:
dataset = MyDataset(
r,
c.text_cleaner,
compute_linear_spec=False,
meta_data=meta_data_eval if is_val else meta_data_train,
ap=ap,
tp=c.characters if 'characters' in c.keys() else None,
add_blank=c['add_blank'] if 'add_blank' in c.keys() else False,
batch_group_size=0 if is_val else c.batch_group_size *
c.batch_size,
min_seq_len=c.min_seq_len,
max_seq_len=c.max_seq_len,
phoneme_cache_path=c.phoneme_cache_path,
use_phonemes=c.use_phonemes,
phoneme_language=c.phoneme_language,
enable_eos_bos=c.enable_eos_bos_chars,
use_noise_augment=not is_val,
verbose=verbose,
speaker_mapping=speaker_mapping if c.use_speaker_embedding
and c.use_external_speaker_embedding_file else None)
if c.use_phonemes and c.compute_input_seq_cache:
# precompute phonemes to have a better estimate of sequence lengths.
dataset.compute_input_seq(c.num_loader_workers)
dataset.sort_items()
sampler = DistributedSampler(dataset) if num_gpus > 1 else None
loader = DataLoader(
dataset,
batch_size=c.eval_batch_size if is_val else c.batch_size,
shuffle=False,
collate_fn=dataset.collate_fn,
drop_last=False,
sampler=sampler,
num_workers=c.num_val_loader_workers
if is_val else c.num_loader_workers,
pin_memory=False)
return loader
def format_data(data):
# setup input data
text_input = data[0]
text_lengths = data[1]
speaker_names = data[2]
mel_input = data[4].permute(0, 2, 1) # B x D x T
mel_lengths = data[5]
item_idx = data[7]
avg_text_length = torch.mean(text_lengths.float())
avg_spec_length = torch.mean(mel_lengths.float())
if c.use_speaker_embedding:
if c.use_external_speaker_embedding_file:
# return precomputed embedding vector
speaker_c = data[8]
else:
# return speaker_id to be used by an embedding layer
speaker_c = [
speaker_mapping[speaker_name]
for speaker_name in speaker_names
]
speaker_c = torch.LongTensor(speaker_c)
else:
speaker_c = None
# dispatch data to GPU
if use_cuda:
text_input = text_input.cuda(non_blocking=True)
text_lengths = text_lengths.cuda(non_blocking=True)
mel_input = mel_input.cuda(non_blocking=True)
mel_lengths = mel_lengths.cuda(non_blocking=True)
if speaker_c is not None:
speaker_c = speaker_c.cuda(non_blocking=True)
return text_input, text_lengths, mel_input, mel_lengths, speaker_c,\
avg_text_length, avg_spec_length, item_idx
def train(data_loader, model, criterion, optimizer, scheduler, ap,
global_step, epoch, training_phase):
model.train()
epoch_time = 0
keep_avg = KeepAverage()
if use_cuda:
batch_n_iter = int(
len(data_loader.dataset) / (c.batch_size * num_gpus))
else:
batch_n_iter = int(len(data_loader.dataset) / c.batch_size)
end_time = time.time()
c_logger.print_train_start()
scaler = torch.cuda.amp.GradScaler() if c.mixed_precision else None
for num_iter, data in enumerate(data_loader):
start_time = time.time()
# format data
text_input, text_lengths, mel_targets, mel_lengths, speaker_c,\
avg_text_length, avg_spec_length, _ = format_data(data)
loader_time = time.time() - end_time
global_step += 1
optimizer.zero_grad()
# forward pass model
with torch.cuda.amp.autocast(enabled=c.mixed_precision):
decoder_output, dur_output, dur_mas_output, alignments, _, _, logp = model.forward(
text_input,
text_lengths,
mel_targets,
mel_lengths,
g=speaker_c,
phase=training_phase)
# compute loss
loss_dict = criterion(logp,
decoder_output,
mel_targets,
mel_lengths,
dur_output,
dur_mas_output,
text_lengths,
global_step,
phase=training_phase)
# backward pass with loss scaling
if c.mixed_precision:
scaler.scale(loss_dict['loss']).backward()
scaler.unscale_(optimizer)
grad_norm = torch.nn.utils.clip_grad_norm_(
model.parameters(), c.grad_clip)
scaler.step(optimizer)
scaler.update()
else:
loss_dict['loss'].backward()
grad_norm = torch.nn.utils.clip_grad_norm_(
model.parameters(), c.grad_clip)
optimizer.step()
# setup lr
if c.noam_schedule:
scheduler.step()
# current_lr
current_lr = optimizer.param_groups[0]['lr']
# compute alignment error (the lower the better )
align_error = 1 - alignment_diagonal_score(alignments, binary=True)
loss_dict['align_error'] = align_error
step_time = time.time() - start_time
epoch_time += step_time
# aggregate losses from processes
if num_gpus > 1:
loss_dict['loss_l1'] = reduce_tensor(loss_dict['loss_l1'].data,
num_gpus)
loss_dict['loss_ssim'] = reduce_tensor(
loss_dict['loss_ssim'].data, num_gpus)
loss_dict['loss_dur'] = reduce_tensor(
loss_dict['loss_dur'].data, num_gpus)
loss_dict['loss'] = reduce_tensor(loss_dict['loss'].data,
num_gpus)
# detach loss values
loss_dict_new = dict()
for key, value in loss_dict.items():
if isinstance(value, (int, float)):
loss_dict_new[key] = value
else:
loss_dict_new[key] = value.item()
loss_dict = loss_dict_new
# update avg stats
update_train_values = dict()
for key, value in loss_dict.items():
update_train_values['avg_' + key] = value
update_train_values['avg_loader_time'] = loader_time
update_train_values['avg_step_time'] = step_time
keep_avg.update_values(update_train_values)
# print training progress
if global_step % c.print_step == 0:
log_dict = {
"avg_spec_length": [avg_spec_length,
1], # value, precision
"avg_text_length": [avg_text_length, 1],
"step_time": [step_time, 4],
"loader_time": [loader_time, 2],
"current_lr": current_lr,
}
c_logger.print_train_step(batch_n_iter, num_iter, global_step,
log_dict, loss_dict,
keep_avg.avg_values)
if args.rank == 0:
# Plot Training Iter Stats
# reduce TB load
if global_step % c.tb_plot_step == 0:
iter_stats = {
"lr": current_lr,
"grad_norm": grad_norm,
"step_time": step_time
}
iter_stats.update(loss_dict)
tb_logger.tb_train_iter_stats(global_step, iter_stats)
if global_step % c.save_step == 0:
if c.checkpoint:
# save model
save_checkpoint(model,
optimizer,
global_step,
epoch,
1,
OUT_PATH,
model_characters,
model_loss=loss_dict['loss'])
# wait all kernels to be completed
torch.cuda.synchronize()
# Diagnostic visualizations
if decoder_output is not None:
idx = np.random.randint(mel_targets.shape[0])
pred_spec = decoder_output[idx].detach().data.cpu(
).numpy().T
gt_spec = mel_targets[idx].data.cpu().numpy().T
align_img = alignments[idx].data.cpu()
figures = {
"prediction": plot_spectrogram(pred_spec, ap),
"ground_truth": plot_spectrogram(gt_spec, ap),
"alignment": plot_alignment(align_img),
}
tb_logger.tb_train_figures(global_step, figures)
# Sample audio
train_audio = ap.inv_melspectrogram(pred_spec.T)
tb_logger.tb_train_audios(global_step,
{'TrainAudio': train_audio},
c.audio["sample_rate"])
end_time = time.time()
# print epoch stats
c_logger.print_train_epoch_end(global_step, epoch, epoch_time,
keep_avg)
# Plot Epoch Stats
if args.rank == 0:
epoch_stats = {"epoch_time": epoch_time}
epoch_stats.update(keep_avg.avg_values)
tb_logger.tb_train_epoch_stats(global_step, epoch_stats)
if c.tb_model_param_stats:
tb_logger.tb_model_weights(model, global_step)
return keep_avg.avg_values, global_step
@torch.no_grad()
def evaluate(data_loader, model, criterion, ap, global_step, epoch,
training_phase):
model.eval()
epoch_time = 0
keep_avg = KeepAverage()
c_logger.print_eval_start()
if data_loader is not None:
for num_iter, data in enumerate(data_loader):
start_time = time.time()
# format data
text_input, text_lengths, mel_targets, mel_lengths, speaker_c,\
_, _, _ = format_data(data)
# forward pass model
with torch.cuda.amp.autocast(enabled=c.mixed_precision):
decoder_output, dur_output, dur_mas_output, alignments, _, _, logp = model.forward(
text_input,
text_lengths,
mel_targets,
mel_lengths,
g=speaker_c,
phase=training_phase)
# compute loss
loss_dict = criterion(logp,
decoder_output,
mel_targets,
mel_lengths,
dur_output,
dur_mas_output,
text_lengths,
global_step,
phase=training_phase)
# step time
step_time = time.time() - start_time
epoch_time += step_time
# compute alignment score
align_error = 1 - alignment_diagonal_score(alignments,
binary=True)
loss_dict['align_error'] = align_error
# aggregate losses from processes
if num_gpus > 1:
loss_dict['loss_l1'] = reduce_tensor(
loss_dict['loss_l1'].data, num_gpus)
loss_dict['loss_ssim'] = reduce_tensor(
loss_dict['loss_ssim'].data, num_gpus)
loss_dict['loss_dur'] = reduce_tensor(
loss_dict['loss_dur'].data, num_gpus)
loss_dict['loss'] = reduce_tensor(loss_dict['loss'].data,
num_gpus)
# detach loss values
loss_dict_new = dict()
for key, value in loss_dict.items():
if isinstance(value, (int, float)):
loss_dict_new[key] = value
else:
loss_dict_new[key] = value.item()
loss_dict = loss_dict_new
# update avg stats
update_train_values = dict()
for key, value in loss_dict.items():
update_train_values['avg_' + key] = value
keep_avg.update_values(update_train_values)
if c.print_eval:
c_logger.print_eval_step(num_iter, loss_dict,
keep_avg.avg_values)
if args.rank == 0:
# Diagnostic visualizations
idx = np.random.randint(mel_targets.shape[0])
pred_spec = decoder_output[idx].detach().data.cpu().numpy().T
gt_spec = mel_targets[idx].data.cpu().numpy().T
align_img = alignments[idx].data.cpu()
eval_figures = {
"prediction": plot_spectrogram(pred_spec,
ap,
output_fig=False),
"ground_truth": plot_spectrogram(gt_spec,
ap,
output_fig=False),
"alignment": plot_alignment(align_img, output_fig=False)
}
# Sample audio
eval_audio = ap.inv_melspectrogram(pred_spec.T)
tb_logger.tb_eval_audios(global_step, {"ValAudio": eval_audio},
c.audio["sample_rate"])
# Plot Validation Stats
tb_logger.tb_eval_stats(global_step, keep_avg.avg_values)
tb_logger.tb_eval_figures(global_step, eval_figures)
if args.rank == 0 and epoch >= c.test_delay_epochs:
if c.test_sentences_file is None:
test_sentences = [
"It took me quite a long time to develop a voice, and now that I have it I'm not going to be silent.",
"Be a voice, not an echo.",
"I'm sorry Dave. I'm afraid I can't do that.",
"This cake is great. It's so delicious and moist.",
"Prior to November 22, 1963."
]
else:
with open(c.test_sentences_file, "r") as f:
test_sentences = [s.strip() for s in f.readlines()]
# test sentences
test_audios = {}
test_figures = {}
print(" | > Synthesizing test sentences")
if c.use_speaker_embedding:
if c.use_external_speaker_embedding_file:
speaker_embedding = speaker_mapping[list(
speaker_mapping.keys())[randrange(
len(speaker_mapping) - 1)]]['embedding']
speaker_id = None
else:
speaker_id = 0
speaker_embedding = None
else:
speaker_id = None
speaker_embedding = None
style_wav = c.get("style_wav_for_test")
for idx, test_sentence in enumerate(test_sentences):
try:
wav, alignment, _, postnet_output, _, _ = synthesis(
model,
test_sentence,
c,
use_cuda,
ap,
speaker_id=speaker_id,
speaker_embedding=speaker_embedding,
style_wav=style_wav,
truncated=False,
enable_eos_bos_chars=c.enable_eos_bos_chars, #pylint: disable=unused-argument
use_griffin_lim=True,
do_trim_silence=False)
file_path = os.path.join(AUDIO_PATH, str(global_step))
os.makedirs(file_path, exist_ok=True)
file_path = os.path.join(file_path,
"TestSentence_{}.wav".format(idx))
ap.save_wav(wav, file_path)
test_audios['{}-audio'.format(idx)] = wav
test_figures['{}-prediction'.format(
idx)] = plot_spectrogram(postnet_output, ap)
test_figures['{}-alignment'.format(idx)] = plot_alignment(
alignment)
except: #pylint: disable=bare-except
print(" !! Error creating Test Sentence -", idx)
traceback.print_exc()
tb_logger.tb_test_audios(global_step, test_audios,
c.audio['sample_rate'])
tb_logger.tb_test_figures(global_step, test_figures)
return keep_avg.avg_values
def main(args): # pylint: disable=redefined-outer-name
# pylint: disable=global-variable-undefined
global meta_data_train, meta_data_eval, symbols, phonemes, model_characters, speaker_mapping
# Audio processor
ap = AudioProcessor(**c.audio)
if 'characters' in c.keys():
symbols, phonemes = make_symbols(**c.characters)
# DISTRIBUTED
if num_gpus > 1:
init_distributed(args.rank, num_gpus, args.group_id,
c.distributed["backend"], c.distributed["url"])
# set model characters
model_characters = phonemes if c.use_phonemes else symbols
num_chars = len(model_characters)
# load data instances
meta_data_train, meta_data_eval = load_meta_data(c.datasets,
eval_split=True)
# set the portion of the data used for training if set in config.json
if 'train_portion' in c.keys():
meta_data_train = meta_data_train[:int(
len(meta_data_train) * c.train_portion)]
if 'eval_portion' in c.keys():
meta_data_eval = meta_data_eval[:int(
len(meta_data_eval) * c.eval_portion)]
# parse speakers
num_speakers, speaker_embedding_dim, speaker_mapping = parse_speakers(
c, args, meta_data_train, OUT_PATH)
# setup model
model = setup_model(num_chars,
num_speakers,
c,
speaker_embedding_dim=speaker_embedding_dim)
optimizer = RAdam(model.parameters(),
lr=c.lr,
weight_decay=0,
betas=(0.9, 0.98),
eps=1e-9)
criterion = AlignTTSLoss(c)
if args.restore_path:
print(
f" > Restoring from {os.path.basename(args.restore_path)} ...")
checkpoint = torch.load(args.restore_path, map_location='cpu')
try:
# TODO: fix optimizer init, model.cuda() needs to be called before
# optimizer restore
optimizer.load_state_dict(checkpoint['optimizer'])
if c.reinit_layers:
raise RuntimeError
model.load_state_dict(checkpoint['model'])
except: #pylint: disable=bare-except
print(" > Partial model initialization.")
model_dict = model.state_dict()
model_dict = set_init_dict(model_dict, checkpoint['model'], c)
model.load_state_dict(model_dict)
del model_dict
for group in optimizer.param_groups:
group['initial_lr'] = c.lr
print(" > Model restored from step %d" % checkpoint['step'],
flush=True)
args.restore_step = checkpoint['step']
else:
args.restore_step = 0
if use_cuda:
model.cuda()
criterion.cuda()
# DISTRIBUTED
if num_gpus > 1:
model = DDP_th(model, device_ids=[args.rank])
if c.noam_schedule:
scheduler = NoamLR(optimizer,
warmup_steps=c.warmup_steps,
last_epoch=args.restore_step - 1)
else:
scheduler = None
num_params = count_parameters(model)
print("\n > Model has {} parameters".format(num_params), flush=True)
if args.restore_step == 0 or not args.best_path:
best_loss = float('inf')
print(" > Starting with inf best loss.")
else:
print(" > Restoring best loss from "
f"{os.path.basename(args.best_path)} ...")
best_loss = torch.load(args.best_path,
map_location='cpu')['model_loss']
print(f" > Starting with loaded last best loss {best_loss}.")
keep_all_best = c.get('keep_all_best', False)
keep_after = c.get('keep_after', 10000) # void if keep_all_best False
# define dataloaders
train_loader = setup_loader(ap, 1, is_val=False, verbose=True)
eval_loader = setup_loader(ap, 1, is_val=True, verbose=True)
global_step = args.restore_step
def set_phase():
"""Set AlignTTS training phase"""
if isinstance(c.phase_start_steps, list):
vals = [i < global_step for i in c.phase_start_steps]
if not True in vals:
phase = 0
else:
phase = len(c.phase_start_steps) - [
i < global_step for i in c.phase_start_steps
][::-1].index(True) - 1
else:
phase = None
return phase
for epoch in range(0, c.epochs):
cur_phase = set_phase()
print(f"\n > Current AlignTTS phase: {cur_phase}")
c_logger.print_epoch_start(epoch, c.epochs)
train_avg_loss_dict, global_step = train(train_loader, model,
criterion, optimizer,
scheduler, ap,
global_step, epoch,
cur_phase)
eval_avg_loss_dict = evaluate(eval_loader, model, criterion, ap,
global_step, epoch, cur_phase)
c_logger.print_epoch_end(epoch, eval_avg_loss_dict)
target_loss = train_avg_loss_dict['avg_loss']
if c.run_eval:
target_loss = eval_avg_loss_dict['avg_loss']
best_loss = save_best_model(target_loss,
best_loss,
model,
optimizer,
global_step,
epoch,
1,
OUT_PATH,
model_characters,
keep_all_best=keep_all_best,
keep_after=keep_after)
args = parse_arguments(sys.argv)
c, OUT_PATH, AUDIO_PATH, c_logger, tb_logger = process_args(
args, model_class='tts')
try:
main(args)
except KeyboardInterrupt:
remove_experiment_folder(OUT_PATH)
try:
sys.exit(0)
except SystemExit:
os._exit(0) # pylint: disable=protected-access
except Exception: # pylint: disable=broad-except
remove_experiment_folder(OUT_PATH)
traceback.print_exc()
sys.exit(1)
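
The nested `set_phase()` above packs the phase lookup into one expression; here is a standalone sketch of the same logic with an assumed `phase_start_steps` schedule (the values are made up for illustration, the real ones come from config.json):

```python
# Assumed schedule for illustration only.
phase_start_steps = [0, 10000, 20000]

def phase_for(global_step):
    """Mirror of set_phase(): index of the last phase whose start step has passed."""
    started = [s < global_step for s in phase_start_steps]
    if True not in started:
        return 0
    return len(started) - started[::-1].index(True) - 1

print(phase_for(500))     # 0 -> only the first phase has started
print(phase_for(15000))   # 1 -> second phase
print(phase_for(25000))   # 2 -> third phase
```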


@ -580,7 +580,7 @@ def main(args): # pylint: disable=redefined-outer-name
if __name__ == '__main__':
args = parse_arguments(sys.argv)
c, OUT_PATH, AUDIO_PATH, c_logger, tb_logger = process_args(
args, model_type='glow_tts')
args, model_class='tts')
try:
main(args)


@ -540,7 +540,7 @@ def main(args): # pylint: disable=redefined-outer-name
if __name__ == '__main__':
args = parse_arguments(sys.argv)
c, OUT_PATH, AUDIO_PATH, c_logger, tb_logger = process_args(
args, model_type='tts')
args, model_class='tts')
try:
main(args)


@ -85,7 +85,7 @@ def format_data(data):
text_input = data[0]
text_lengths = data[1]
speaker_names = data[2]
linear_input = data[3] if c.model in ["Tacotron"] else None
linear_input = data[3] if c.model.lower() in ["tacotron"] else None
mel_input = data[4]
mel_lengths = data[5]
stop_targets = data[6]
@ -658,7 +658,7 @@ def main(args): # pylint: disable=redefined-outer-name
if __name__ == '__main__':
args = parse_arguments(sys.argv)
c, OUT_PATH, AUDIO_PATH, c_logger, tb_logger = process_args(
args, model_type='tacotron')
args, model_class='tts')
try:
main(args)


@ -1,4 +1,5 @@
#!/usr/bin/env python3
# TODO: mixed precision training
"""Trains GAN based vocoder model."""
import os
@ -590,7 +591,7 @@ def main(args): # pylint: disable=redefined-outer-name
if __name__ == '__main__':
args = parse_arguments(sys.argv)
c, OUT_PATH, AUDIO_PATH, c_logger, tb_logger = process_args(
args, model_type='gan')
args, model_class='vocoder')
try:
main(args)


@ -436,7 +436,7 @@ def main(args): # pylint: disable=redefined-outer-name
if __name__ == '__main__':
args = parse_arguments(sys.argv)
c, OUT_PATH, AUDIO_PATH, c_logger, tb_logger = process_args(
args, model_type='wavegrad')
args, model_class='vocoder')
try:
main(args)


@ -460,7 +460,7 @@ def main(args): # pylint: disable=redefined-outer-name
if __name__ == "__main__":
args = parse_arguments(sys.argv)
c, OUT_PATH, AUDIO_PATH, c_logger, tb_logger = process_args(
args, model_type='wavernn')
args, model_class='vocoder')
try:
main(args)


@ -1,7 +1,7 @@
import torch
import torch.nn as nn
import torch.nn.functional as F
import numpy as np
# adapted from https://github.com/cvqluu/GE2E-Loss
class GE2ELoss(nn.Module):
@ -155,6 +155,6 @@ class AngleProtoLoss(nn.Module):
cos_sim_matrix = F.cosine_similarity(out_positive.unsqueeze(-1).expand(-1, -1, num_speakers), out_anchor.unsqueeze(-1).expand(-1, -1, num_speakers).transpose(0, 2))
torch.clamp(self.w, 1e-6)
cos_sim_matrix = cos_sim_matrix * self.w + self.b
label = torch.from_numpy(np.asarray(range(0, num_speakers))).to(cos_sim_matrix.device)
label = torch.arange(num_speakers).to(cos_sim_matrix.device)
L = self.criterion(cos_sim_matrix, label)
return L
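
The label change above is behavior-preserving; a quick standalone check of the old and new constructions (a sketch, not part of the diff):

```python
import numpy as np
import torch

num_speakers = 4
old_label = torch.from_numpy(np.asarray(range(0, num_speakers)))  # removed construction
new_label = torch.arange(num_speakers)                            # replacement in this diff
assert bool((old_label == new_label).all())                       # identical values, no numpy round-trip
```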


@ -1,12 +1,12 @@
import os
from glob import glob
import re
import sys
import xml.etree.ElementTree as ET
from glob import glob
from pathlib import Path
from typing import List
from tqdm import tqdm
from TTS.tts.utils.generic_utils import split_dataset
####################
@ -35,7 +35,7 @@ def load_meta_data(datasets, eval_split=True):
meta_data_eval_all += meta_data_eval
meta_data_train_all += meta_data_train
# load attention masks for duration predictor training
if 'meta_file_attn_mask' in dataset:
if 'meta_file_attn_mask' in dataset and dataset['meta_file_attn_mask'] is not None:
meta_data = dict(load_attention_mask_meta_data(dataset['meta_file_attn_mask']))
for idx, ins in enumerate(meta_data_train_all):
attn_file = meta_data[ins[1]].strip()
@ -159,7 +159,7 @@ def ljspeech(root_path, meta_file):
txt_file = os.path.join(root_path, meta_file)
items = []
speaker_name = "ljspeech"
with open(txt_file, 'r') as ttf:
with open(txt_file, 'r', encoding="utf-8") as ttf:
for line in ttf:
cols = line.split('|')
wav_file = os.path.join(root_path, 'wavs', cols[0] + '.wav')
@ -168,13 +168,30 @@ def ljspeech(root_path, meta_file):
return items
def sam_accenture(root_path, meta_file):
"""Normalizes the sam-accenture meta data file to TTS format
https://github.com/Sam-Accenture-Non-Binary-Voice/non-binary-voice-files"""
xml_file = os.path.join(root_path, 'voice_over_recordings', meta_file)
xml_root = ET.parse(xml_file).getroot()
items = []
speaker_name = "sam_accenture"
for item in xml_root.findall('./fileid'):
text = item.text
wav_file = os.path.join(root_path, 'vo_voice_quality_transformation', item.get('id')+'.wav')
if not os.path.exists(wav_file):
print(f' [!] {wav_file} in metafile does not exist. Skipping...')
continue
items.append([text, wav_file, speaker_name])
return items
def ruslan(root_path, meta_file):
"""Normalizes the RUSLAN meta data file to TTS format
https://ruslan-corpus.github.io/"""
txt_file = os.path.join(root_path, meta_file)
items = []
speaker_name = "ljspeech"
with open(txt_file, 'r') as ttf:
with open(txt_file, 'r', encoding="utf-8") as ttf:
for line in ttf:
cols = line.split('|')
wav_file = os.path.join(root_path, 'RUSLAN', cols[0] + '.wav')


@ -0,0 +1,20 @@
from torch import nn
from TTS.tts.layers.generic.transformer import FFTransformerBlock
from TTS.tts.layers.generic.pos_encoding import PositionalEncoding
class DurationPredictor(nn.Module):
def __init__(self, num_chars, hidden_channels, hidden_channels_ffn, num_heads):
super().__init__()
self.embed = nn.Embedding(num_chars, hidden_channels)
self.pos_enc = PositionalEncoding(hidden_channels, dropout_p=0.1)
self.FFT = FFTransformerBlock(hidden_channels, num_heads, hidden_channels_ffn, 2, 0.1)
self.out_layer = nn.Conv1d(hidden_channels, 1, 1)
def forward(self, text, text_lengths):
# B, L -> B, L
emb = self.embed(text)
emb = self.pos_enc(emb.transpose(1, 2))
x = self.FFT(emb, text_lengths)
x = self.out_layer(x).squeeze(-1)
return x
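
A minimal smoke test for the new FFT-based duration predictor, with made-up sizes and an assumed module path (the diff view above does not show the new file's location); passing `None` for the lengths skips masking:

```python
import torch
# Module path assumed; the diff view above does not show the new file's location.
from TTS.tts.layers.align_tts.duration_predictor import DurationPredictor

dp = DurationPredictor(num_chars=60, hidden_channels=128,
                       hidden_channels_ffn=256, num_heads=2)
tokens = torch.randint(0, 60, (2, 13))   # [B, T] character ids
log_durs = dp(tokens, None)              # no length mask: attend over the full sequence
print(log_durs.shape)                    # torch.Size([2, 1, 13]); squeeze(-1) only drops a trailing singleton
```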


@ -0,0 +1,29 @@
from torch import nn
class MDNBlock(nn.Module):
"""Mixture of Density Network implementation
https://arxiv.org/pdf/2003.01950.pdf
"""
def __init__(self, in_channels, out_channels):
super().__init__()
self.out_channels = out_channels
self.conv1 = nn.Conv1d(in_channels, in_channels, 1)
self.norm = nn.LayerNorm(in_channels)
self.relu = nn.ReLU()
self.dropout = nn.Dropout(0.1)
self.conv2 = nn.Conv1d(in_channels, out_channels, 1)
def forward(self, x):
o = self.conv1(x)
o = o.transpose(1, 2)
o = self.norm(o)
o = o.transpose(1, 2)
o = self.relu(o)
o = self.dropout(o)
mu_sigma = self.conv2(o)
# TODO: check this sigmoid
# mu = torch.sigmoid(mu_sigma[:, :self.out_channels//2, :])
mu = mu_sigma[:, :self.out_channels//2, :]
log_sigma = mu_sigma[:, self.out_channels//2:, :]
return mu, log_sigma
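
A quick shape check for `MDNBlock`, using illustrative sizes (80 mel channels assumed):

```python
import torch
from TTS.tts.layers.align_tts.mdn import MDNBlock  # path taken from the align_tts.py import below

mdn = MDNBlock(in_channels=128, out_channels=2 * 80)  # mu and log_sigma for 80 mel bins
x = torch.randn(2, 128, 50)                           # [B, C, T] encoder outputs
mu, log_sigma = mdn(x)
print(mu.shape, log_sigma.shape)                      # torch.Size([2, 80, 50]) each
```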


@ -3,6 +3,7 @@ from torch import nn
from TTS.tts.layers.generic.res_conv_bn import Conv1dBNBlock, ResidualConv1dBNBlock, Conv1dBN
from TTS.tts.layers.generic.wavenet import WNBlocks
from TTS.tts.layers.glow_tts.transformer import RelativePositionTransformer
from TTS.tts.layers.generic.transformer import FFTransformerBlock
class WaveNetDecoder(nn.Module):
@ -89,6 +90,37 @@ class RelativePositionTransformerDecoder(nn.Module):
return o
class FFTransformerDecoder(nn.Module):
"""Decoder with FeedForwardTransformer.
Default params
params={
'hidden_channels_ffn': 1024,
'num_heads': 2,
"dropout_p": 0.1,
"num_layers": 6,
}
Args:
in_channels (int): number of input channels.
out_channels (int): number of output channels.
hidden_channels (int): number of hidden channels including Transformer layers.
params (dict): dictionary of transformer block parameters.
"""
def __init__(self, in_channels, out_channels, params):
super().__init__()
self.transformer_block = FFTransformerBlock(in_channels, **params)
self.postnet = nn.Conv1d(in_channels, out_channels, 1)
def forward(self, x, x_mask=None, g=None): # pylint: disable=unused-argument
# TODO: handle multi-speaker
x_mask = 1 if x_mask is None else x_mask
o = self.transformer_block(x) * x_mask
o = self.postnet(o) * x_mask
return o
class ResidualConv1dBNDecoder(nn.Module):
"""Residual Convolutional Decoder as in the original Speedy Speech paper
@ -159,24 +191,26 @@ class Decoder(nn.Module):
c_in_channels=0):
super().__init__()
if decoder_type == 'transformer':
if decoder_type.lower() == "relative_position_transformer":
self.decoder = RelativePositionTransformerDecoder(
in_channels=in_hidden_channels,
out_channels=out_channels,
hidden_channels=in_hidden_channels,
params=decoder_params)
elif decoder_type == 'residual_conv_bn':
elif decoder_type.lower() == 'residual_conv_bn':
self.decoder = ResidualConv1dBNDecoder(
in_channels=in_hidden_channels,
out_channels=out_channels,
hidden_channels=in_hidden_channels,
params=decoder_params)
elif decoder_type == 'wavenet':
elif decoder_type.lower() == 'wavenet':
self.decoder = WaveNetDecoder(in_channels=in_hidden_channels,
out_channels=out_channels,
hidden_channels=in_hidden_channels,
c_in_channels=c_in_channels,
params=decoder_params)
elif decoder_type.lower() == 'fftransformer':
self.decoder = FFTransformerDecoder(in_hidden_channels, out_channels, decoder_params)
else:
raise ValueError(f'[!] Unknown decoder type - {decoder_type}')
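
A small shape check for the new `fftransformer` decoder path, with illustrative sizes; the module path is assumed to match the `TTS.tts.layers.feed_forward.decoder` import used elsewhere in this PR:

```python
import torch
from TTS.tts.layers.feed_forward.decoder import FFTransformerDecoder  # assumed module path

dec = FFTransformerDecoder(in_channels=128, out_channels=80,
                           params={'hidden_channels_ffn': 256, 'num_heads': 2,
                                   'dropout_p': 0.1, 'num_layers': 2})
x = torch.randn(2, 128, 50)  # [B, C, T] expanded encoder frames
o = dec(x)                   # x_mask defaults to 1; postnet maps 128 -> 80 channels
print(o.shape)               # torch.Size([2, 80, 50])
```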


@ -1,62 +1,8 @@
import math
import torch
from torch import nn
from TTS.tts.layers.glow_tts.transformer import RelativePositionTransformer
from TTS.tts.layers.generic.res_conv_bn import ResidualConv1dBNBlock
class PositionalEncoding(nn.Module):
"""Sinusoidal positional encoding for non-recurrent neural networks.
Implementation based on "Attention Is All You Need"
Args:
channels (int): embedding size
dropout (float): dropout parameter
"""
def __init__(self, channels, dropout=0.0, max_len=5000):
super().__init__()
if channels % 2 != 0:
raise ValueError(
"Cannot use sin/cos positional encoding with "
"odd channels (got channels={:d})".format(channels))
pe = torch.zeros(max_len, channels)
position = torch.arange(0, max_len).unsqueeze(1)
div_term = torch.exp((torch.arange(0, channels, 2, dtype=torch.float) *
-(math.log(10000.0) / channels)))
pe[:, 0::2] = torch.sin(position.float() * div_term)
pe[:, 1::2] = torch.cos(position.float() * div_term)
pe = pe.unsqueeze(0).transpose(1, 2)
self.register_buffer('pe', pe)
if dropout > 0:
self.dropout = nn.Dropout(p=dropout)
self.channels = channels
def forward(self, x, mask=None, first_idx=None, last_idx=None):
"""
Shapes:
x: [B, C, T]
mask: [B, 1, T]
first_idx: int
last_idx: int
"""
x = x * math.sqrt(self.channels)
if first_idx is None:
if self.pe.size(2) < x.size(2):
raise RuntimeError(
f"Sequence is {x.size(2)} but PositionalEncoding is"
f" limited to {self.pe.size(2)}. See max_len argument.")
if mask is not None:
pos_enc = (self.pe[:, :, :x.size(2)] * mask)
else:
pos_enc = self.pe[:, :, :x.size(2)]
x = x + pos_enc
else:
x = x + self.pe[:, :, first_idx:last_idx]
if hasattr(self, 'dropout'):
x = self.dropout(x)
return x
from TTS.tts.layers.generic.transformer import FFTransformerBlock
class RelativePositionTransformerEncoder(nn.Module):
@ -138,26 +84,36 @@ class Encoder(nn.Module):
c_in_channels (int): number of channels for conditional input.
Note:
Default encoder_params...
Default encoder_params to be set in config.json...
for 'transformer'
encoder_params={
'hidden_channels_ffn': 128,
'num_heads': 2,
"kernel_size": 3,
"dropout_p": 0.1,
"num_layers": 6,
"rel_attn_window_size": 4,
"input_length": None
},
```python
# for 'relative_position_transformer'
encoder_params={
'hidden_channels_ffn': 128,
'num_heads': 2,
"kernel_size": 3,
"dropout_p": 0.1,
"num_layers": 6,
"rel_attn_window_size": 4,
"input_length": None
},
for 'residual_conv_bn'
encoder_params = {
"kernel_size": 4,
"dilations": 4 * [1, 2, 4] + [1],
"num_conv_blocks": 2,
"num_res_blocks": 13
}
# for 'residual_conv_bn'
encoder_params = {
"kernel_size": 4,
"dilations": 4 * [1, 2, 4] + [1],
"num_conv_blocks": 2,
"num_res_blocks": 13
}
# for 'fftransformer'
encoder_params = {
"hidden_channels_ffn": 1024 ,
"num_heads": 2,
"num_layers": 6,
"dropout_p": 0.1
}
```
"""
def __init__(
self,
@ -179,7 +135,7 @@ class Encoder(nn.Module):
self.c_in_channels = c_in_channels
# init encoder
if encoder_type.lower() == "transformer":
if encoder_type.lower() == "relative_position_transformer":
# text encoder
self.encoder = RelativePositionTransformerEncoder(
in_hidden_channels, out_channels, in_hidden_channels,
@ -189,11 +145,13 @@ class Encoder(nn.Module):
out_channels,
in_hidden_channels,
encoder_params)
elif encoder_type.lower() == 'fftransformer':
assert in_hidden_channels == out_channels, \
"[!] must be `in_channels` == `out_channels` when encoder type is 'fftransformer'"
self.encoder = FFTransformerBlock(in_hidden_channels, **encoder_params) # pylint: disable=unexpected-keyword-arg
else:
raise NotImplementedError(' [!] unknown encoder type.')
# final projection layers
def forward(self, x, x_mask, g=None): # pylint: disable=unused-argument
"""


@ -0,0 +1,56 @@
import torch
import math
from torch import nn
class PositionalEncoding(nn.Module):
"""Sinusoidal positional encoding for non-recurrent neural networks.
Implementation based on "Attention Is All You Need"
Args:
channels (int): embedding size
dropout (float): dropout parameter
"""
def __init__(self, channels, dropout_p=0.0, max_len=5000):
super().__init__()
if channels % 2 != 0:
raise ValueError(
"Cannot use sin/cos positional encoding with "
"odd channels (got channels={:d})".format(channels))
pe = torch.zeros(max_len, channels)
position = torch.arange(0, max_len).unsqueeze(1)
div_term = torch.pow(10000,
torch.arange(0, channels, 2).float() / channels)
pe[:, 0::2] = torch.sin(position.float() * div_term)
pe[:, 1::2] = torch.cos(position.float() * div_term)
pe = pe.unsqueeze(0).transpose(1, 2)
self.register_buffer('pe', pe)
if dropout_p > 0:
self.dropout = nn.Dropout(p=dropout_p)
self.channels = channels
def forward(self, x, mask=None, first_idx=None, last_idx=None):
"""
Shapes:
x: [B, C, T]
mask: [B, 1, T]
first_idx: int
last_idx: int
"""
x = x * math.sqrt(self.channels)
if first_idx is None:
if self.pe.size(2) < x.size(2):
raise RuntimeError(
f"Sequence is {x.size(2)} but PositionalEncoding is"
f" limited to {self.pe.size(2)}. See max_len argument.")
if mask is not None:
pos_enc = (self.pe[:, :, :x.size(2)] * mask)
else:
pos_enc = self.pe[:, :, :x.size(2)]
x = x + pos_enc
else:
x = x + self.pe[:, :, first_idx:last_idx]
if hasattr(self, 'dropout'):
x = self.dropout(x)
return x
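
A minimal usage sketch for the relocated `PositionalEncoding` (channel count illustrative):

```python
import torch
from TTS.tts.layers.generic.pos_encoding import PositionalEncoding

pe = PositionalEncoding(channels=128)  # channels must be even
x = torch.randn(2, 128, 50)            # [B, C, T]
y = pe(x)                              # scales x by sqrt(channels) and adds the sinusoidal table
print(y.shape)                         # torch.Size([2, 128, 50])
```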


@ -0,0 +1,74 @@
import torch
import torch.nn as nn
import torch.nn.functional as F
class FFTransformer(nn.Module):
def __init__(self,
in_out_channels,
num_heads,
hidden_channels_ffn=1024,
kernel_size_fft=3,
dropout_p=0.1):
super().__init__()
self.self_attn = nn.MultiheadAttention(in_out_channels,
num_heads,
dropout=dropout_p)
padding = (kernel_size_fft - 1) // 2
self.conv1 = nn.Conv1d(in_out_channels, hidden_channels_ffn, kernel_size=kernel_size_fft, padding=padding)
self.conv2 = nn.Conv1d(hidden_channels_ffn, in_out_channels, kernel_size=kernel_size_fft, padding=padding)
self.norm1 = nn.LayerNorm(in_out_channels)
self.norm2 = nn.LayerNorm(in_out_channels)
self.dropout = nn.Dropout(dropout_p)
def forward(self, src, src_mask=None, src_key_padding_mask=None):
"""😦 ugly looking with all the transposing """
src = src.permute(2, 0, 1)
src2, enc_align = self.self_attn(src,
src,
src,
attn_mask=src_mask,
key_padding_mask=src_key_padding_mask)
src = self.norm1(src + src2)
# T x B x D -> B x D x T
src = src.permute(1, 2, 0)
src2 = self.conv2(F.relu(self.conv1(src)))
src2 = self.dropout(src2)
src = src + src2
src = src.transpose(1, 2)
src = self.norm2(src)
src = src.transpose(1, 2)
return src, enc_align
class FFTransformerBlock(nn.Module):
def __init__(self, in_out_channels, num_heads, hidden_channels_ffn,
num_layers, dropout_p):
super().__init__()
self.fft_layers = nn.ModuleList([
FFTransformer(in_out_channels=in_out_channels,
num_heads=num_heads,
hidden_channels_ffn=hidden_channels_ffn,
dropout_p=dropout_p) for _ in range(num_layers)
])
def forward(self, x, mask=None, g=None): # pylint: disable=unused-argument
"""
TODO: handle multi-speaker
Shapes:
x: [B, C, T]
mask: [B, 1, T] or [B, T]
"""
if mask is not None and mask.ndim == 3:
mask = mask.squeeze(1)
# mask is negated, torch uses 1s and 0s reversely.
mask = ~mask.bool()
alignments = []
for layer in self.fft_layers:
x, align = layer(x, src_key_padding_mask=mask)
alignments.append(align.unsqueeze(1))
alignments = torch.cat(alignments, 1)
return x
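
A short smoke test for `FFTransformerBlock` with a padded batch, using illustrative sizes; note the mask is flipped internally into a key-padding mask:

```python
import torch
from TTS.tts.layers.generic.transformer import FFTransformerBlock

block = FFTransformerBlock(in_out_channels=128, num_heads=2,
                           hidden_channels_ffn=256, num_layers=2, dropout_p=0.1)
x = torch.randn(2, 128, 50)   # [B, C, T]
mask = torch.ones(2, 1, 50)   # [B, 1, T]; 1 marks valid frames
mask[1, :, 40:] = 0           # second item is padded after frame 40
y = block(x, mask)            # mask is negated into a key_padding_mask internally
print(y.shape)                # torch.Size([2, 128, 50])
```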


@ -91,6 +91,7 @@ class WN(torch.nn.Module):
def forward(self, x, x_mask=None, g=None, **kwargs): # pylint: disable=unused-argument
output = torch.zeros_like(x)
n_channels_tensor = torch.IntTensor([self.hidden_channels])
x_mask = 1.0 if x_mask is None else x_mask
if g is not None:
g = self.cond_layer(g)
for i in range(self.num_layers):
@ -163,7 +164,7 @@ class WNBlocks(nn.Module):
weight_norm=weight_norm)
self.wn_blocks.append(layer)
def forward(self, x, x_mask, g=None):
def forward(self, x, x_mask=None, g=None):
o = x
for layer in self.wn_blocks:
o = layer(o, x_mask, g)


@ -23,7 +23,6 @@ def generate_path(duration, mask):
mask: [b, t_x, t_y]
"""
device = duration.device
b, t_x, t_y = mask.shape
cum_duration = torch.cumsum(duration, 1)
path = torch.zeros(b, t_x, t_y, dtype=mask.dtype).to(device=device)


@ -297,6 +297,11 @@ class TacotronLoss(torch.nn.Module):
stopnet_output, stopnet_target, output_lens, decoder_b_output,
alignments, alignment_lens, alignments_backwards, input_lens):
# decoder outputs linear or mel spectrograms for Tacotron and Tacotron2
# the target should be set accordingly
postnet_target = linear_input if self.config.model.lower() in ["tacotron"] else mel_input
return_dict = {}
# remove lengths if no masking is applied
if not self.config.loss_masking:
@ -307,20 +312,13 @@ class TacotronLoss(torch.nn.Module):
decoder_loss = self.criterion(decoder_output, mel_input,
output_lens)
if self.postnet_alpha > 0:
if self.config.model in ["Tacotron", "TacotronGST"]:
postnet_loss = self.criterion(postnet_output, linear_input,
output_lens)
else:
postnet_loss = self.criterion(postnet_output, mel_input,
output_lens)
postnet_loss = self.criterion(postnet_output, postnet_target,
output_lens)
else:
if self.decoder_alpha > 0:
decoder_loss = self.criterion(decoder_output, mel_input)
if self.postnet_alpha > 0:
if self.config.model in ["Tacotron", "TacotronGST"]:
postnet_loss = self.criterion(postnet_output, linear_input)
else:
postnet_loss = self.criterion(postnet_output, mel_input)
postnet_loss = self.criterion(postnet_output, postnet_target)
loss = self.decoder_alpha * decoder_loss + self.postnet_alpha * postnet_loss
return_dict['decoder_loss'] = decoder_loss
return_dict['postnet_loss'] = postnet_loss
@ -373,7 +371,7 @@ class TacotronLoss(torch.nn.Module):
# postnet differential spectral loss
if self.config.postnet_diff_spec_alpha > 0:
postnet_diff_spec_loss = self.criterion_diff_spec(postnet_output, mel_input, output_lens)
postnet_diff_spec_loss = self.criterion_diff_spec(postnet_output, postnet_target, output_lens)
loss += postnet_diff_spec_loss * self.postnet_diff_spec_alpha
return_dict['postnet_diff_spec_loss'] = postnet_diff_spec_loss
@ -385,7 +383,7 @@ class TacotronLoss(torch.nn.Module):
# postnet ssim loss
if self.config.postnet_ssim_alpha > 0:
postnet_ssim_loss = self.criterion_ssim(postnet_output, mel_input, output_lens)
postnet_ssim_loss = self.criterion_ssim(postnet_output, postnet_target, output_lens)
loss += postnet_ssim_loss * self.postnet_ssim_alpha
return_dict['postnet_ssim_loss'] = postnet_ssim_loss
@ -442,5 +440,117 @@ class SpeedySpeechLoss(nn.Module):
l1_loss = self.l1(decoder_output, decoder_target, decoder_output_lens)
ssim_loss = self.ssim(decoder_output, decoder_target, decoder_output_lens)
huber_loss = self.huber(dur_output, dur_target, input_lens)
loss = l1_loss + ssim_loss + huber_loss
loss = self.l1_alpha * l1_loss + self.ssim_alpha * ssim_loss + self.huber_alpha * huber_loss
return {'loss': loss, 'loss_l1': l1_loss, 'loss_ssim': ssim_loss, 'loss_dur': huber_loss}
def mse_loss_custom(x, y):
"""MSE loss using the torch back-end without reduction.
It uses less VRAM than the raw code"""
expanded_x, expanded_y = torch.broadcast_tensors(x, y)
return torch._C._nn.mse_loss(expanded_x, expanded_y, 0) # pylint: disable=protected-access, c-extension-no-member
class MDNLoss(nn.Module):
"""Mixture of Density Network Loss as described in https://arxiv.org/pdf/2003.01950.pdf.
"""
def forward(self, logp, text_lengths, mel_lengths): # pylint: disable=no-self-use
'''
Shapes:
logp: [B, T_seq, T_mel]
text_lengths: [B]
mel_lengths: [B]
'''
B, T_seq, T_mel = logp.shape
log_alpha = logp.new_ones(B, T_seq, T_mel)*(-1e4)
log_alpha[:, 0, 0] = logp[:, 0, 0]
for t in range(1, T_mel):
prev_step = torch.cat([log_alpha[:, :, t-1:t], functional.pad(log_alpha[:, :, t-1:t],
(0, 0, 1, -1), value=-1e4)], dim=-1)
log_alpha[:, :, t] = torch.logsumexp(prev_step + 1e-4, dim=-1) + logp[:, :, t]
alpha_last = log_alpha[torch.arange(B), text_lengths-1, mel_lengths-1]
mdn_loss = -alpha_last.mean() / T_seq
return mdn_loss#, log_prob_matrix
class AlignTTSLoss(nn.Module):
"""Modified AlignTTS Loss.
Computes following losses
- L1 and SSIM losses from output spectrograms.
- Huber loss for duration predictor.
- MDNLoss for Mixture of Density Network.
All the losses are aggregated by a weighted sum with the loss alphas.
Alphas can be scheduled based on number of steps.
Args:
c (dict): TTS model configuration.
"""
def __init__(self, c):
super().__init__()
self.mdn_loss = MDNLoss()
self.spec_loss = MSELossMasked(False)
self.ssim = SSIMLoss()
self.dur_loss = MSELossMasked(False)
self.ssim_alpha = c.ssim_alpha
self.dur_loss_alpha = c.dur_loss_alpha
self.spec_loss_alpha = c.spec_loss_alpha
self.mdn_alpha = c.mdn_alpha
def forward(self, logp, decoder_output, decoder_target, decoder_output_lens, dur_output, dur_target,
input_lens, step, phase):
ssim_alpha, dur_loss_alpha, spec_loss_alpha, mdn_alpha = self.set_alphas(
step)
spec_loss, ssim_loss, dur_loss, mdn_loss = 0, 0, 0, 0
if phase == 0:
mdn_loss = self.mdn_loss(logp, input_lens, decoder_output_lens)
elif phase == 1:
spec_loss = self.spec_loss(decoder_output, decoder_target, decoder_output_lens)
ssim_loss = self.ssim(decoder_output, decoder_target, decoder_output_lens)
elif phase == 2:
mdn_loss = self.mdn_loss(logp, input_lens, decoder_output_lens)
spec_loss = self.spec_loss(decoder_output, decoder_target, decoder_output_lens)
ssim_loss = self.ssim(decoder_output, decoder_target, decoder_output_lens)
elif phase == 3:
dur_loss = self.dur_loss(dur_output.unsqueeze(2), dur_target.unsqueeze(2), input_lens)
else:
mdn_loss = self.mdn_loss(logp, input_lens, decoder_output_lens)
spec_loss = self.spec_loss(decoder_output, decoder_target, decoder_output_lens)
ssim_loss = self.ssim(decoder_output, decoder_target, decoder_output_lens)
dur_loss = self.dur_loss(dur_output.unsqueeze(2), dur_target.unsqueeze(2), input_lens)
loss = spec_loss_alpha * spec_loss + ssim_alpha * ssim_loss + dur_loss_alpha * dur_loss + mdn_alpha * mdn_loss
return {'loss': loss, 'loss_l1': spec_loss, 'loss_ssim': ssim_loss, 'loss_dur': dur_loss, 'mdn_loss': mdn_loss}
@staticmethod
def _set_alpha(step, alpha_settings):
'''Set the loss alpha wrt number of steps.
Return the corresponding value if no schedule is set.
Example:
Setting an alpha schedule.
if ```alpha_settings``` is ```[[0, 1], [10000, 0.1]]``` then ```return_alpha == 1``` until 10k steps, then set to 0.1.
if ```alpha_settings``` is a constant value then ```return_alpha``` is set to that constant.
Args:
step (int): number of training steps.
alpha_settings (int or list): constant alpha value or a list defining the schedule as explained above.
'''
return_alpha = None
if isinstance(alpha_settings, list):
for key, alpha in alpha_settings:
if key < step:
return_alpha = alpha
elif isinstance(alpha_settings, (float, int)):
return_alpha = alpha_settings
return return_alpha
def set_alphas(self, step):
'''Set the alpha values for all the loss functions
'''
ssim_alpha = self._set_alpha(step, self.ssim_alpha)
dur_loss_alpha = self._set_alpha(step, self.dur_loss_alpha)
spec_loss_alpha = self._set_alpha(step, self.spec_loss_alpha)
mdn_alpha = self._set_alpha(step, self.mdn_alpha)
return ssim_alpha, dur_loss_alpha, spec_loss_alpha, mdn_alpha
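
`_set_alpha` resolves each loss weight to a plain value per step; a quick check with a hypothetical schedule (the numbers are made up for illustration):

```python
from TTS.tts.layers.losses import AlignTTSLoss

# Hypothetical schedule: weight 1.0 until step 10k, then 0.1.
schedule = [[0, 1.0], [10000, 0.1]]
print(AlignTTSLoss._set_alpha(5000, schedule))   # 1.0
print(AlignTTSLoss._set_alpha(20000, schedule))  # 0.1
print(AlignTTSLoss._set_alpha(20000, 0.5))       # constant alphas pass through unchanged: 0.5
```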


@ -2,7 +2,7 @@ import torch
from torch import nn
from torch.nn import functional as F
from TTS.tts.layers.common_layers import Linear
from TTS.tts.layers.tacotron.common_layers import Linear
from scipy.stats import betabinom

323
TTS/tts/models/align_tts.py Normal file

@ -0,0 +1,323 @@
import torch
import torch.nn as nn
from TTS.tts.layers.generic.pos_encoding import PositionalEncoding
from TTS.tts.layers.feed_forward.duration_predictor import DurationPredictor
from TTS.tts.layers.glow_tts.monotonic_align import generate_path, maximum_path
from TTS.tts.utils.generic_utils import sequence_mask
from TTS.tts.layers.align_tts.mdn import MDNBlock
from TTS.tts.layers.feed_forward.encoder import Encoder
from TTS.tts.layers.feed_forward.decoder import Decoder
class AlignTTS(nn.Module):
"""AlignTTS with modified duration predictor.
https://arxiv.org/pdf/2003.01950.pdf
Encoder -> DurationPredictor -> Decoder
AlignTTS's Abstract - Targeting at both high efficiency and performance, we propose AlignTTS to predict the
mel-spectrum in parallel. AlignTTS is based on a Feed-Forward Transformer which generates mel-spectrum from a
sequence of characters, and the duration of each character is determined by a duration predictor. Instead of
adopting the attention mechanism in Transformer TTS to align text to mel-spectrum, the alignment loss is presented
to consider all possible alignments in training by use of dynamic programming. Experiments on the LJSpeech dataset
show that our model achieves not only state-of-the-art performance which outperforms Transformer TTS by 0.03 in mean
opinion score (MOS), but also a high efficiency which is more than 50 times faster than real-time.
Note:
The original model uses a separate character embedding layer for the duration predictor. However, it causes the
duration predictor to overfit and prevents learning higher level interactions among characters. Therefore,
we predict durations based on encoder outputs, which carry higher level information about the input characters. This
enables training without phases as in the original paper.
The original model uses Transformers in the encoder and decoder layers. However, here you can set the architecture
differently based on your requirements using ```encoder_type``` and ```decoder_type``` parameters.
Args:
num_chars (int):
number of unique input characters
out_channels (int):
number of output tensor channels. It is equal to the expected spectrogram size.
hidden_channels (int):
number of channels in all the model layers.
hidden_channels_ffn (int):
number of channels in transformer's conv layers.
hidden_channels_dp (int):
number of channels in duration predictor network.
num_heads (int):
number of attention heads in transformer networks.
num_transformer_layers (int):
number of layers in encoder and decoder transformer blocks.
dropout_p (int):
dropout rate in transformer layers.
length_scale (int, optional):
coefficient to set the speech speed. <1 slower, >1 faster. Defaults to 1.
num_speakers (int, optional):
number of speakers for multi-speaker training. Defaults to 0.
external_c (bool, optional):
enable external speaker embeddings. Defaults to False.
c_in_channels (int, optional):
number of channels in speaker embedding vectors. Defaults to 0.
"""
# pylint: disable=dangerous-default-value
def __init__(
self,
num_chars,
out_channels,
hidden_channels=256,
hidden_channels_dp=256,
encoder_type='fftransformer',
encoder_params={
'hidden_channels_ffn': 1024,
'num_heads': 2,
'num_layers': 6,
'dropout_p': 0.1
},
decoder_type='fftransformer',
decoder_params={
'hidden_channels_ffn': 1024,
'num_heads': 2,
'num_layers': 6,
'dropout_p': 0.1
},
length_scale=1,
num_speakers=0,
external_c=False,
c_in_channels=0):
super().__init__()
self.length_scale = float(length_scale) if isinstance(
length_scale, int) else length_scale
self.emb = nn.Embedding(num_chars, hidden_channels)
self.pos_encoder = PositionalEncoding(hidden_channels)
self.encoder = Encoder(hidden_channels, hidden_channels, encoder_type,
encoder_params, c_in_channels)
self.decoder = Decoder(out_channels, hidden_channels, decoder_type,
decoder_params)
self.duration_predictor = DurationPredictor(hidden_channels_dp)
self.mod_layer = nn.Conv1d(hidden_channels, hidden_channels, 1)
self.mdn_block = MDNBlock(hidden_channels, 2 * out_channels)
if num_speakers > 1 and not external_c:
# speaker embedding layer
self.emb_g = nn.Embedding(num_speakers, c_in_channels)
nn.init.uniform_(self.emb_g.weight, -0.1, 0.1)
if c_in_channels > 0 and c_in_channels != hidden_channels:
self.proj_g = nn.Conv1d(c_in_channels, hidden_channels, 1)
@staticmethod
def compute_log_probs(mu, log_sigma, y):
# pylint: disable=protected-access, c-extension-no-member
y = y.transpose(1, 2).unsqueeze(1) # [B, 1, T1, D]
mu = mu.transpose(1, 2).unsqueeze(2) # [B, T2, 1, D]
log_sigma = log_sigma.transpose(1, 2).unsqueeze(2) # [B, T2, 1, D]
expanded_y, expanded_mu = torch.broadcast_tensors(y, mu)
exponential = -0.5 * torch.mean(torch._C._nn.mse_loss(
expanded_y, expanded_mu, 0) / torch.pow(log_sigma.exp(), 2),
dim=-1) # B, L, T
logp = exponential - 0.5 * log_sigma.mean(dim=-1)
return logp
def compute_align_path(self, mu, log_sigma, y, x_mask, y_mask):
# find the max alignment path
attn_mask = torch.unsqueeze(x_mask, -1) * torch.unsqueeze(y_mask, 2)
log_p = self.compute_log_probs(mu, log_sigma, y)
# [B, T_en, T_dec]
attn = maximum_path(log_p, attn_mask.squeeze(1)).unsqueeze(1)
dr_mas = torch.sum(attn, -1)
return dr_mas.squeeze(1), log_p
@staticmethod
def convert_dr_to_align(dr, x_mask, y_mask):
attn_mask = torch.unsqueeze(x_mask, -1) * torch.unsqueeze(y_mask, 2)
attn = generate_path(dr, attn_mask.squeeze(1)).to(dr.dtype)
return attn
def expand_encoder_outputs(self, en, dr, x_mask, y_mask):
"""Generate attention alignment map from durations and
expand encoder outputs
Example:
encoder output: [a,b,c,d]
durations: [1, 3, 2, 1]
expanded: [a, b, b, b, c, c, d]
attention map: [[0, 0, 0, 0, 0, 0, 1],
[0, 0, 0, 0, 1, 1, 0],
[0, 1, 1, 1, 0, 0, 0],
[1, 0, 0, 0, 0, 0, 0]]
"""
attn = self.convert_dr_to_align(dr, x_mask, y_mask)
o_en_ex = torch.matmul(
attn.squeeze(1).transpose(1, 2), en.transpose(1,
2)).transpose(1, 2)
return o_en_ex, attn
def format_durations(self, o_dr_log, x_mask):
o_dr = (torch.exp(o_dr_log) - 1) * x_mask * self.length_scale
o_dr[o_dr < 1] = 1.0
o_dr = torch.round(o_dr)
return o_dr
@staticmethod
def _concat_speaker_embedding(o_en, g):
g_exp = g.expand(-1, -1, o_en.size(-1)) # [B, C, T_en]
o_en = torch.cat([o_en, g_exp], 1)
return o_en
def _sum_speaker_embedding(self, x, g):
# project g to decoder dim.
if hasattr(self, 'proj_g'):
g = self.proj_g(g)
return x + g
def _forward_encoder(self, x, x_lengths, g=None):
if hasattr(self, 'emb_g'):
g = nn.functional.normalize(self.emb_g(g)) # [B, C, 1]
if g is not None:
g = g.unsqueeze(-1)
# [B, T, C]
x_emb = self.emb(x)
# [B, C, T]
x_emb = torch.transpose(x_emb, 1, -1)
# compute sequence masks
x_mask = torch.unsqueeze(sequence_mask(x_lengths, x.shape[1]),
1).to(x.dtype)
# encoder pass
o_en = self.encoder(x_emb, x_mask)
# speaker conditioning for duration predictor
if g is not None:
o_en_dp = self._concat_speaker_embedding(o_en, g)
else:
o_en_dp = o_en
return o_en, o_en_dp, x_mask, g
def _forward_decoder(self, o_en, o_en_dp, dr, x_mask, y_lengths, g):
y_mask = torch.unsqueeze(sequence_mask(y_lengths, None),
1).to(o_en_dp.dtype)
# expand o_en with durations
o_en_ex, attn = self.expand_encoder_outputs(o_en, dr, x_mask, y_mask)
# positional encoding
if hasattr(self, 'pos_encoder'):
o_en_ex = self.pos_encoder(o_en_ex, y_mask)
# speaker embedding
if g is not None:
o_en_ex = self._sum_speaker_embedding(o_en_ex, g)
# decoder pass
o_de = self.decoder(o_en_ex, y_mask, g=g)
return o_de, attn.transpose(1, 2)
def _forward_mdn(self, o_en, y, y_lengths, x_mask):
# MAS potentials and alignment
mu, log_sigma = self.mdn_block(o_en)
y_mask = torch.unsqueeze(sequence_mask(y_lengths, None),
1).to(o_en.dtype)
dr_mas, logp = self.compute_align_path(mu, log_sigma, y, x_mask,
y_mask)
return dr_mas, mu, log_sigma, logp
def forward(self, x, x_lengths, y, y_lengths, phase=None, g=None): # pylint: disable=unused-argument
"""
Shapes:
x: [B, T_max]
x_lengths: [B]
y_lengths: [B]
dr: [B, T_max]
g: [B, C]
"""
o_de, o_dr_log, dr_mas_log, attn, mu, log_sigma, logp = None, None, None, None, None, None, None
if phase == 0:
# train encoder and MDN
o_en, o_en_dp, x_mask, g = self._forward_encoder(x, x_lengths, g)
dr_mas, mu, log_sigma, logp = self._forward_mdn(
o_en, y, y_lengths, x_mask)
y_mask = torch.unsqueeze(sequence_mask(y_lengths, None),
1).to(o_en_dp.dtype)
attn = self.convert_dr_to_align(dr_mas, x_mask, y_mask)
elif phase == 1:
# train decoder
o_en, o_en_dp, x_mask, g = self._forward_encoder(x, x_lengths, g)
dr_mas, _, _, _ = self._forward_mdn(o_en, y, y_lengths, x_mask)
o_de, attn = self._forward_decoder(o_en.detach(),
o_en_dp.detach(),
dr_mas.detach(),
x_mask,
y_lengths,
g=g)
elif phase == 2:
# train the whole except duration predictor
o_en, o_en_dp, x_mask, g = self._forward_encoder(x, x_lengths, g)
dr_mas, mu, log_sigma, logp = self._forward_mdn(
o_en, y, y_lengths, x_mask)
o_de, attn = self._forward_decoder(o_en,
o_en_dp,
dr_mas,
x_mask,
y_lengths,
g=g)
elif phase == 3:
# train duration predictor
o_en, o_en_dp, x_mask, g = self._forward_encoder(x, x_lengths, g)
o_dr_log = self.duration_predictor(x, x_mask)
dr_mas, mu, log_sigma, logp = self._forward_mdn(
o_en, y, y_lengths, x_mask)
o_de, attn = self._forward_decoder(o_en,
o_en_dp,
dr_mas,
x_mask,
y_lengths,
g=g)
o_dr_log = o_dr_log.squeeze(1)
else:
o_en, o_en_dp, x_mask, g = self._forward_encoder(x, x_lengths, g)
o_dr_log = self.duration_predictor(o_en_dp.detach(), x_mask)
dr_mas, mu, log_sigma, logp = self._forward_mdn(
o_en, y, y_lengths, x_mask)
o_de, attn = self._forward_decoder(o_en,
o_en_dp,
dr_mas,
x_mask,
y_lengths,
g=g)
o_dr_log = o_dr_log.squeeze(1)
dr_mas_log = torch.log(dr_mas + 1).squeeze(1)
return o_de, o_dr_log, dr_mas_log, attn, mu, log_sigma, logp
@torch.no_grad()
def inference(self, x, x_lengths, g=None): # pylint: disable=unused-argument
"""
Shapes:
x: [B, T_max]
x_lengths: [B]
g: [B, C]
"""
# pad input to prevent dropping the last word
# x = torch.nn.functional.pad(x, pad=(0, 5), mode='constant', value=0)
o_en, o_en_dp, x_mask, g = self._forward_encoder(x, x_lengths, g)
# o_dr_log = self.duration_predictor(x, x_mask)
o_dr_log = self.duration_predictor(o_en_dp, x_mask)
# duration predictor pass
o_dr = self.format_durations(o_dr_log, x_mask).squeeze(1)
y_lengths = o_dr.sum(1)
o_de, attn = self._forward_decoder(o_en,
o_en_dp,
o_dr,
x_mask,
y_lengths,
g=g)
return o_de, attn
def load_checkpoint(self, config, checkpoint_path, eval=False): # pylint: disable=unused-argument, redefined-builtin
state = torch.load(checkpoint_path, map_location=torch.device('cpu'))
self.load_state_dict(state['model'])
if eval:
self.eval()
assert not self.training
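The phase argument of forward() above selects which sub-networks are trained. A minimal
sketch, not part of this commit, of how a training script could derive it from the global
step, assuming a "phase_start_steps" list like the one in the AlignTTS test config added
later in this diff:

def step_to_phase(global_step, phase_start_steps):
    """Return the index of the last training phase whose start step has been reached."""
    if phase_start_steps is None:
        return None  # fall back to the model's default branch
    phase = 0
    for idx, start_step in enumerate(phase_start_steps):
        if global_step >= start_step:
            phase = idx
    return phase

# e.g. with phase_start_steps = [0, 40000, 80000, 160000, 170000]:
#   step_to_phase(10000, phase_start_steps) -> 0  (train encoder and MDN)
#   step_to_phase(90000, phase_start_steps) -> 2  (train everything except the duration predictor)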

View File

@ -9,7 +9,7 @@ from TTS.tts.utils.generic_utils import sequence_mask
from TTS.tts.layers.glow_tts.monotonic_align import maximum_path, generate_path
class GlowTts(nn.Module):
class GlowTTS(nn.Module):
"""Glow TTS models from https://arxiv.org/abs/2005.11129
Args:

View File

@ -1,8 +1,9 @@
import torch
from torch import nn
from TTS.tts.layers.speedy_speech.decoder import Decoder
from TTS.tts.layers.speedy_speech.duration_predictor import DurationPredictor
from TTS.tts.layers.speedy_speech.encoder import Encoder, PositionalEncoding
from TTS.tts.layers.feed_forward.decoder import Decoder
from TTS.tts.layers.feed_forward.duration_predictor import DurationPredictor
from TTS.tts.layers.feed_forward.encoder import Encoder
from TTS.tts.layers.generic.pos_encoding import PositionalEncoding
from TTS.tts.utils.generic_utils import sequence_mask
from TTS.tts.layers.glow_tts.monotonic_align import generate_path

View File

@ -2,8 +2,8 @@
import torch
from torch import nn
from TTS.tts.layers.gst_layers import GST
from TTS.tts.layers.tacotron import Decoder, Encoder, PostCBHG
from TTS.tts.layers.tacotron.gst_layers import GST
from TTS.tts.layers.tacotron.tacotron import Decoder, Encoder, PostCBHG
from TTS.tts.models.tacotron_abstract import TacotronAbstract

View File

@ -1,8 +1,8 @@
import torch
from torch import nn
from TTS.tts.layers.gst_layers import GST
from TTS.tts.layers.tacotron2 import Decoder, Encoder, Postnet
from TTS.tts.layers.tacotron.gst_layers import GST
from TTS.tts.layers.tacotron.tacotron2 import Decoder, Encoder, Postnet
from TTS.tts.models.tacotron_abstract import TacotronAbstract
# TODO: match function arguments with tacotron
@ -17,7 +17,7 @@ class Tacotron2(TacotronAbstract):
r (int): initial model reduction rate.
postnet_output_dim (int, optional): postnet output channels. Defaults to 80.
decoder_output_dim (int, optional): decoder output channels. Defaults to 80.
attn_type (str, optional): attention type. Check ```TTS.tts.layers.common_layers.init_attn```. Defaults to 'original'.
attn_type (str, optional): attention type. Check ```TTS.tts.layers.tacotron.common_layers.init_attn```. Defaults to 'original'.
attn_win (bool, optional): enable/disable attention windowing.
It is especially useful at inference to keep the attention alignment diagonal. Defaults to False.
attn_norm (str, optional): Attention normalization method. "sigmoid" or "softmax". Defaults to "softmax".

View File

@ -149,8 +149,7 @@ class TacotronAbstract(ABC, nn.Module):
def _backward_pass(self, mel_specs, encoder_outputs, mask):
""" Run backwards decoder """
decoder_outputs_b, alignments_b, _ = self.decoder_backward(
encoder_outputs, torch.flip(mel_specs, dims=(1,)), mask,
self.speaker_embeddings_projected)
encoder_outputs, torch.flip(mel_specs, dims=(1,)), mask)
decoder_outputs_b = decoder_outputs_b.transpose(1, 2).contiguous()
return decoder_outputs_b, alignments_b

View File

View File

@ -1,7 +1,7 @@
import tensorflow as tf
from tensorflow import keras
from TTS.tts.tf.utils.tf_utils import shape_list
from TTS.tts.tf.layers.common_layers import Prenet, Attention
from TTS.tts.tf.layers.tacotron.common_layers import Prenet, Attention
# NOTE: linter has a problem with the current TF release

View File

@ -1,7 +1,7 @@
import tensorflow as tf
from tensorflow import keras
from TTS.tts.tf.layers.tacotron2 import Encoder, Decoder, Postnet
from TTS.tts.tf.layers.tacotron.tacotron2 import Encoder, Decoder, Postnet
from TTS.tts.tf.utils.tf_utils import shape_list

View File

@ -41,7 +41,9 @@ def sequence_mask(sequence_length, max_len=None):
def to_camel(text):
text = text.capitalize()
return re.sub(r'(?!^)_([a-zA-Z])', lambda m: m.group(1).upper(), text)
text = re.sub(r'(?!^)_([a-zA-Z])', lambda m: m.group(1).upper(), text)
text = text.replace('Tts', 'TTS')
return text
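# Editor's note (illustrative): with the extra replace, snake_case model names from the config
# now resolve to the renamed classes, e.g.
#   to_camel('glow_tts')  -> 'GlowTTS'
#   to_camel('align_tts') -> 'AlignTTS'
# which is presumably what setup_model() below relies on when looking up the model class.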
def setup_model(num_chars, num_speakers, c, speaker_embedding_dim=None):
@ -132,13 +134,23 @@ def setup_model(num_chars, num_speakers, c, speaker_embedding_dim=None):
decoder_type=c['decoder_type'],
decoder_params=c['decoder_params'],
c_in_channels=0)
elif c.model.lower() == "align_tts":
model = MyModel(num_chars=num_chars + getattr(c, "add_blank", False),
out_channels=c.audio['num_mels'],
hidden_channels=c['hidden_channels'],
hidden_channels_dp=c['hidden_channels_dp'],
encoder_type=c['encoder_type'],
encoder_params=c['encoder_params'],
decoder_type=c['decoder_type'],
decoder_params=c['decoder_params'],
c_in_channels=0)
return model
def is_tacotron(c):
return not c['model'] in ['speedy_speech', 'glow_tts']
return 'tacotron' in c['model'].lower()
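# e.g. (editor's illustration): is_tacotron({'model': 'Tacotron2'}) -> True,
# is_tacotron({'model': 'align_tts'}) -> False, so tacotron-only parameters are not
# required in the config of models such as align_tts by check_config_tts() below.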
def check_config_tts(c):
check_argument('model', c, enum_list=['tacotron', 'tacotron2', 'glow_tts', 'speedy_speech'], restricted=True, val_type=str)
check_argument('model', c, enum_list=['tacotron', 'tacotron2', 'glow_tts', 'speedy_speech', 'align_tts'], restricted=True, val_type=str)
check_argument('run_name', c, restricted=True, val_type=str)
check_argument('run_description', c, val_type=str)
@ -195,7 +207,7 @@ def check_config_tts(c):
check_argument('decoder_ssim_alpha', c, restricted=True, val_type=float, min_val=0)
check_argument('postnet_ssim_alpha', c, restricted=True, val_type=float, min_val=0)
check_argument('ga_alpha', c, restricted=True, val_type=float, min_val=0)
if c['model'].lower == "speedy_speech":
if c['model'].lower() in ["speedy_speech", "align_tts"]:
check_argument('ssim_alpha', c, restricted=True, val_type=float, min_val=0)
check_argument('l1_alpha', c, restricted=True, val_type=float, min_val=0)
check_argument('huber_alpha', c, restricted=True, val_type=float, min_val=0)
@ -239,7 +251,7 @@ def check_config_tts(c):
check_argument('separate_stopnet', c, restricted=is_tacotron(c), val_type=bool)
# Model Parameters for non-tacotron models
if c['model'].lower == "speedy_speech":
if c['model'].lower() in ["speedy_speech", "align_tts"]:
check_argument('positional_encoding', c, restricted=True, val_type=type)
check_argument('encoder_type', c, restricted=True, val_type=str)
check_argument('encoder_params', c, restricted=True, val_type=dict)

View File

@ -77,7 +77,7 @@ def run_model_torch(model, inputs, CONFIG, truncated, speaker_id=None, style_mel
# these only belong to tacotron models.
decoder_output = None
stop_tokens = None
elif 'speedy_speech' in CONFIG.model.lower():
elif CONFIG.model.lower() in ['speedy_speech', 'align_tts']:
inputs_lengths = torch.tensor(inputs.shape[1:2]).to(inputs.device) # pylint: disable=not-callable
if hasattr(model, 'module'):
# distributed model
@ -88,6 +88,8 @@ def run_model_torch(model, inputs, CONFIG, truncated, speaker_id=None, style_mel
# these only belong to tacotron models.
decoder_output = None
stop_tokens = None
else:
raise ValueError('[!] Unknown model name.')
return decoder_output, postnet_output, alignments, stop_tokens

View File

@ -24,7 +24,7 @@ abbreviations_en = [(re.compile('\\b%s\\.' % x[0], re.IGNORECASE), x[1])
]]
# List of (regular expression, replacement) pairs for abbreviations in french:
abbreviations_fr = [(re.compile('\\b%s\\.?' % x[0], re.IGNORECASE), x[1])
abbreviations_fr = [(re.compile('\\b%s\\.' % x[0], re.IGNORECASE), x[1])
for x in [
('M', 'monsieur'),
('Mlle', 'mademoiselle'),
@ -58,4 +58,9 @@ abbreviations_fr = [(re.compile('\\b%s\\.?' % x[0], re.IGNORECASE), x[1])
('ex', 'exemple'),
('excl', 'exclusivement'),
('boul', 'boulevard'),
]] + [(re.compile('\\b%s' % x[0]), x[1]) for x in [
('Mlle', 'mademoiselle'),
('Mlles', 'mesdemoiselles'),
('Mme', 'Madame'),
('Mmes', 'Mesdames'),
]]
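# Editor's note (illustrative behaviour of the change above, assuming the module's
# expand_abbreviations(text, lang='fr') helper):
#   "M. Dupont habite boul. Saint-Michel" -> "monsieur Dupont habite boulevard Saint-Michel"
#   "Mme Dupont" -> "Madame Dupont"  # honorifics in the second list also expand without a period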

View File

@ -94,6 +94,7 @@ def basic_turkish_cleaners(text):
text = collapse_whitespace(text)
return text
def english_cleaners(text):
'''Pipeline for English text, including number and abbreviation expansion.'''
text = convert_to_ascii(text)
@ -106,15 +107,17 @@ def english_cleaners(text):
text = collapse_whitespace(text)
return text
def french_cleaners(text):
'''Pipeline for French text. There is no need to expand numbers, phonemizer already does that'''
text = lowercase(text)
text = expand_abbreviations(text, lang='fr')
text = lowercase(text)
text = replace_symbols(text, lang='fr')
text = remove_aux_symbols(text)
text = collapse_whitespace(text)
return text
def portuguese_cleaners(text):
'''Basic pipeline for Portuguese text. There is no need to expand abbreviation and
numbers, phonemizer already does that'''
@ -124,13 +127,13 @@ def portuguese_cleaners(text):
text = collapse_whitespace(text)
return text
def chinese_mandarin_cleaners(text: str) -> str:
'''Basic pipeline for chinese'''
text = replace_numbers_to_characters_in_text(text)
return text
def phoneme_cleaners(text):
'''Pipeline for phonemes mode, including number and abbreviation expansion.'''
text = expand_numbers(text)

View File

@ -7,7 +7,7 @@ import glob
import os
import re
from TTS.tts.utils.generic_utils import check_config_tts
import torch
from TTS.tts.utils.text.symbols import parse_symbols
from TTS.utils.console_logger import ConsoleLogger
from TTS.utils.generic_utils import create_experiment_folder, get_git_branch
@ -104,7 +104,7 @@ def get_last_checkpoint(path):
key_file_names = [fn for fn in file_names if key in fn]
if last_model is None and len(key_file_names) > 0:
last_model = max(key_file_names, key=os.path.getctime)
last_model_num = os.path.getctime(last_model)
last_model_num = torch.load(last_model)['step']
if last_model is not None:
last_models[key] = last_model
@ -125,19 +125,13 @@ def get_last_checkpoint(path):
return last_models['checkpoint'], last_models['best_model']
def process_args(args, model_type):
"""Process parsed comand line arguments.
def process_args(args, model_class):
"""Process parsed comand line arguments based on model class (tts or vocoder).
Args:
args (argparse.Namespace or dict like): Parsed input arguments.
model_type (str): Model type used to check config parameters and setup
the TensorBoard logger. One of:
- tacotron
- glow_tts
- speedy_speech
- gan
- wavegrad
- wavernn
the TensorBoard logger. One of ['tts', 'vocoder'].
Raises:
ValueError: If `model_type` is not one of implemented choices.
@ -160,23 +154,9 @@ def process_args(args, model_type):
# setup output paths and read configs
c = load_config(args.config_path)
if model_type in "tacotron glow_tts speedy_speech":
model_class = "TTS"
elif model_type in "gan wavegrad wavernn":
model_class = "VOCODER"
else:
raise ValueError("model type {model_type} not recognized!")
if model_class == "TTS":
check_config_tts(c)
elif model_class == "VOCODER":
print("Vocoder config checker not implemented, skipping ...")
else:
raise ValueError(f"model type {model_type} not recognized!")
_ = os.path.dirname(os.path.realpath(__file__))
if model_type in "tacotron wavegrad wavernn" and c.mixed_precision:
if 'mixed_precision' in c and c.mixed_precision:
print(" > Mixed precision mode is ON")
out_path = args.continue_path
@ -198,7 +178,7 @@ def process_args(args, model_type):
# if model characters are not set in the config file
# save the default set to the config file for future
# compatibility.
if model_class == 'TTS' and 'characters' not in c:
if model_class == 'tts' and 'characters' not in c:
used_characters = parse_symbols()
new_fields['characters'] = used_characters
copy_model_files(c, args.config_path,
@ -208,7 +188,7 @@ def process_args(args, model_type):
log_path = out_path
tb_logger = TensorboardLogger(log_path, model_name=model_class)
tb_logger = TensorboardLogger(log_path, model_name=model_class.upper())
# write model desc to tensorboard
tb_logger.tb_add_text("model-description", c["run_description"], 0)

View File

@ -15,6 +15,8 @@ def get_git_branch():
current.replace("* ", "")
except subprocess.CalledProcessError:
current = "inside_docker"
except FileNotFoundError:
current = "unknown"
return current
@ -30,7 +32,7 @@ def get_commit_hash():
commit = subprocess.check_output(
['git', 'rev-parse', '--short', 'HEAD']).decode().strip()
# Not copying .git folder into docker container
except subprocess.CalledProcessError:
except (subprocess.CalledProcessError, FileNotFoundError):
commit = "0000000"
print(' > Git Hash: {}'.format(commit))
return commit

View File

@ -69,7 +69,7 @@ def copy_model_files(c, config_file, out_path, new_fields):
else:
new_line = '"{}":{},\n'.format(key, json.dumps(value, ensure_ascii=False))
config_lines.insert(1, new_line)
config_out_file = open(copy_config_path, "w")
config_out_file = open(copy_config_path, "w", encoding="utf-8")
config_out_file.writelines(config_lines)
config_out_file.close()
# copy model stats file if available

View File

@ -43,7 +43,7 @@ class ModelManager(object):
Args:
file_path (str): path to .models.json.
"""
with open(file_path) as json_file:
with open(file_path, "r", encoding="utf-8") as json_file:
self.models_dict = json.load(json_file)
def list_langs(self):

View File

@ -12,7 +12,6 @@ from TTS.vocoder.utils.generic_utils import setup_generator, interpolate_vocoder
# pylint: disable=unused-wildcard-import
# pylint: disable=wildcard-import
from TTS.tts.utils.synthesis import synthesis, trim_silence
from TTS.tts.utils.text import make_symbols, phonemes, symbols

View File

@ -1,4 +1,4 @@
dependencies = ['torch', 'gdown', 'pysbd', 'phonemizer', 'unidecode'] # apt install espeak-ng
dependencies = ['torch', 'gdown', 'pysbd', 'phonemizer', 'unidecode', 'pypinyin'] # apt install espeak-ng
import torch
from TTS.utils.synthesizer import Synthesizer
@ -9,7 +9,7 @@ def tts(model_name='tts_models/en/ljspeech/tacotron2-DCA', vocoder_name=None, us
"""TTS entry point for PyTorch Hub that provides a Synthesizer object to synthesize speech from a give text.
Example:
>>> synthesizer = torch.hub.load('mozilla/TTS', 'tts', source='github')
>>> synthesizer = torch.hub.load('coqui-ai/TTS', 'tts', source='github')
>>> wavs = synthesizer.tts("This is a test! This is also a test!!")
wavs - is a list of values of the synthesized speech.
@ -33,5 +33,5 @@ def tts(model_name='tts_models/en/ljspeech/tacotron2-DCA', vocoder_name=None, us
if __name__ == '__main__':
synthesizer = torch.hub.load('mozilla/TTS:hub_conf', 'tts', source='github')
synthesizer = torch.hub.load('coqui-ai/TTS:hub_conf', 'tts', source='github')
synthesizer.tts("This is a test!")

File diff suppressed because one or more lines are too long

View File

@ -393,7 +393,7 @@
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.7.7"
"version": "3.8.5"
}
},
"nbformat": 4,

View File

@ -4,7 +4,7 @@
"cell_type": "markdown",
"metadata": {},
"source": [
"This notebook computes the average SNR a given Voice Dataset. If the SNR is too low, that might reduce the performance or prevent model to learn.\n",
"This notebook computes the average SNR a given Voice Dataset. If the SNR is too low, that might reduce the performance or prevent model to learn. SNR paper can be seen here: https://www.cs.cmu.edu/~robust/Papers/KimSternIS08.pdf\n",
"\n",
"To use this notebook, you need:\n",
"- WADA SNR estimation: http://www.cs.cmu.edu/~robust/archive/algorithms/WADA_SNR_IS_2008/\n",
@ -18,12 +18,7 @@
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": true,
"jupyter": {
"outputs_hidden": true
}
},
"metadata": {},
"outputs": [],
"source": [
"import os, sys\n",
@ -42,12 +37,7 @@
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": true,
"jupyter": {
"outputs_hidden": true
}
},
"metadata": {},
"outputs": [],
"source": [
"# Set the meta parameters\n",
@ -60,10 +50,7 @@
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": true,
"jupyter": {
"outputs_hidden": true
}
"tags": []
},
"outputs": [],
"source": [
@ -89,12 +76,7 @@
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": true,
"jupyter": {
"outputs_hidden": true
}
},
"metadata": {},
"outputs": [],
"source": [
"wav_file = \"/home/erogol/Data/LJSpeech-1.1/wavs/LJ001-0001.wav\"\n",
@ -136,7 +118,7 @@
"snrs = [tup[0] for tup in file_snrs]\n",
"\n",
"error_idxs = np.where(np.isnan(snrs) == True)[0]\n",
"error_files = [file_names[idx] for idx in error_idxs]\n",
"error_files = [wav_files[idx] for idx in error_idxs]\n",
"\n",
"file_snrs = [i for j, i in enumerate(file_snrs) if j not in error_idxs]\n",
"file_names = [tup[1] for tup in file_snrs]\n",
@ -150,12 +132,7 @@
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": true,
"jupyter": {
"outputs_hidden": true
}
},
"metadata": {},
"outputs": [],
"source": [
"def output_snr_with_audio(idx):\n",
@ -205,12 +182,7 @@
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": true,
"jupyter": {
"outputs_hidden": true
}
},
"metadata": {},
"outputs": [],
"source": []
}
@ -231,9 +203,9 @@
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.7.4"
"version": "3.8.5"
}
},
"nbformat": 4,
"nbformat_minor": 4
}
}

File diff suppressed because one or more lines are too long

View File

@ -243,9 +243,9 @@
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.6.9-final"
"version": "3.8.5"
}
},
"nbformat": 4,
"nbformat_minor": 4
}
}

View File

@ -22,6 +22,6 @@ nose==1.3.7
cardboardlint==1.3.0
pylint==2.5.3
gdown
umap-learn
umap-learn==0.4.6
cython
pyyaml

View File

@ -1,18 +1,20 @@
set -e
TF_CPP_MIN_LOG_LEVEL=3
# tests
# # tests
nosetests tests -x &&\
# runtime tests
./tests/test_demo_server.sh && \
./tests/test_resample.sh && \
./tests/test_tacotron_train.sh && \
./tests/test_glow-tts_train.sh && \
./tests/test_vocoder_gan_train.sh && \
./tests/test_vocoder_wavernn_train.sh && \
./tests/test_vocoder_wavegrad_train.sh && \
./tests/test_speedy_speech_train.sh && \
./tests/test_aligntts_train.sh && \
./tests/test_compute_statistics.sh && \
# linter check
cardboardlinter --refspec main
cardboardlinter --refspec main

View File

@ -19,7 +19,7 @@ if LooseVersion(sys.version) < LooseVersion("3.6") or LooseVersion(sys.version)
)
version = '0.0.10.3'
version = '0.0.11'
cwd = os.path.dirname(os.path.abspath(__file__))
class build_py(setuptools.command.build_py.build_py): # pylint: disable=too-many-ancestors

View File

@ -0,0 +1,157 @@
{
"model": "align_tts",
"run_name": "test_sample_dataset_run",
"run_description": "sample dataset test run",
// AUDIO PARAMETERS
"audio":{
// stft parameters
"fft_size": 1024, // number of stft frequency levels. Size of the linear spectogram frame.
"win_length": 1024, // stft window length in ms.
"hop_length": 256, // stft window hop-lengh in ms.
"frame_length_ms": null, // stft window length in ms.If null, 'win_length' is used.
"frame_shift_ms": null, // stft window hop-lengh in ms. If null, 'hop_length' is used.
// Audio processing parameters
"sample_rate": 22050, // DATASET-RELATED: wav sample-rate.
"preemphasis": 0.0, // pre-emphasis to reduce spec noise and make it more structured. If 0.0, no -pre-emphasis.
"ref_level_db": 20, // reference level db, theoretically 20db is the sound of air.
// Silence trimming
"do_trim_silence": true,// enable trimming of slience of audio as you load it. LJspeech (true), TWEB (false), Nancy (true)
"trim_db": 60, // threshold for timming silence. Set this according to your dataset.
// Griffin-Lim
"power": 1.5, // value to sharpen wav signals after GL algorithm.
"griffin_lim_iters": 60,// #griffin-lim iterations. 30-60 is a good range. Larger the value, slower the generation.
// MelSpectrogram parameters
"num_mels": 80, // size of the mel spec frame.
"mel_fmin": 50.0, // minimum freq level for mel-spec. ~50 for male and ~95 for female voices. Tune for dataset!!
"mel_fmax": 7600.0, // maximum freq level for mel-spec. Tune for dataset!!
"spec_gain": 1,
// Normalization parameters
"signal_norm": true, // normalize spec values. Mean-Var normalization if 'stats_path' is defined otherwise range normalization defined by the other params.
"min_level_db": -100, // lower bound for normalization
"symmetric_norm": true, // move normalization to range [-1, 1]
"max_norm": 4.0, // scale normalization to range [-max_norm, max_norm] or [0, max_norm]
"clip_norm": true, // clip normalized values into the range.
"stats_path": null // DO NOT USE WITH MULTI_SPEAKER MODEL. scaler stats file computed by 'compute_statistics.py'. If it is defined, mean-std based notmalization is used and other normalization params are ignored
},
// VOCABULARY PARAMETERS
// if custom character set is not defined,
// default set in symbols.py is used
// "characters":{
// "pad": "_",
// "eos": "&",
// "bos": "*",
// "characters": "ABCDEFGHIJKLMNOPQRSTUVWXYZÇÃÀÁÂÊÉÍÓÔÕÚÛabcdefghijklmnopqrstuvwxyzçãàáâêéíóôõúû!(),-.:;? ",
// "punctuations":"!'(),-.:;? ",
// "phonemes":"iyɨʉɯuɪʏʊeøɘəɵɤoɛœɜɞʌɔæɐaɶɑɒᵻʘɓǀɗǃʄǂɠǁʛpbtdʈɖcɟkɡʔɴŋɲɳnɱmʙrʀⱱɾɽɸβfvθðszʃʒʂʐçʝxɣχʁħʕhɦɬɮʋɹɻjɰlɭʎʟˈˌːˑʍwɥʜʢʡɕʑɺɧɚ˞ɫ'̃' "
// },
"add_blank": false, // if true add a new token after each token of the sentence. This increases the size of the input sequence, but has considerably improved the prosody of the GlowTTS model.
// DISTRIBUTED TRAINING
"distributed":{
"backend": "nccl",
"url": "tcp:\/\/localhost:54321"
},
"reinit_layers": [], // give a list of layer names to restore from the given checkpoint. If not defined, it reloads all heuristically matching layers.
// MODEL PARAMETERS
"positional_encoding": true,
"hidden_channels": 256,
"encoder_type": "fftransformer",
"encoder_params":{
"hidden_channels_ffn": 1024 ,
"num_heads": 2,
"num_layers": 6,
"dropout_p": 0.1
},
"decoder_type": "fftransformer",
"decoder_params":{
"hidden_channels_ffn": 1024 ,
"num_heads": 2,
"num_layers": 6,
"dropout_p": 0.1
},
// TRAINING
"batch_size":2, // Batch size for training. Lower values than 32 might cause hard to learn attention. It is overwritten by 'gradual_training'.
"eval_batch_size":1,
"r": 1, // Number of decoder frames to predict per iteration. Set the initial values if gradual training is enabled.
"loss_masking": true, // enable / disable loss masking against the sequence padding.
"phase_start_steps": [0, 40000, 80000, 160000, 170000],
// LOSS PARAMETERS
"ssim_alpha": 1,
"spec_loss_alpha": 1,
"dur_loss_alpha": 1,
"mdn_alpha": 1,
// VALIDATION
"run_eval": true,
"test_delay_epochs": -1, //Until attention is aligned, testing only wastes computation time.
"test_sentences_file": null, // set a file to load sentences to be used for testing. If it is null then we use default english sentences.
// OPTIMIZER
"noam_schedule": true, // use noam warmup and lr schedule.
"grad_clip": 1.0, // upper limit for gradients for clipping.
"epochs": 1, // total number of epochs to train.
"lr": 0.002, // Initial learning rate. If Noam decay is active, maximum learning rate.
"warmup_steps": 4000, // Noam decay steps to increase the learning rate from 0 to "lr"
// TENSORBOARD and LOGGING
"print_step": 1, // Number of steps to log training on console.
"tb_plot_step": 100, // Number of steps to plot TB training figures.
"print_eval": false, // If True, it prints intermediate loss values in evalulation.
"save_step": 5000, // Number of training steps expected to save traninpg stats and checkpoints.
"checkpoint": true, // If true, it saves checkpoints per "save_step"
"keep_all_best": true, // If true, keeps all best_models after keep_after steps
"keep_after": 10000, // Global step after which to keep best models if keep_all_best is true
"tb_model_param_stats": false, // true, plots param stats per layer on tensorboard. Might be memory consuming, but good for debugging.:set n
"mixed_precision": false,
// DATA LOADING
"text_cleaner": "english_cleaners",
"enable_eos_bos_chars": false, // enable/disable beginning of sentence and end of sentence chars.
"num_loader_workers": 0, // number of training data loader processes. Don't set it too big. 4-8 are good values.
"num_val_loader_workers": 0, // number of evaluation data loader processes.
"batch_group_size": 0, //Number of batches to shuffle after bucketing.
"min_seq_len": 2, // DATASET-RELATED: minimum text length to use in training
"max_seq_len": 300, // DATASET-RELATED: maximum text length
"compute_f0": false, // compute f0 values in data-loader
"compute_input_seq_cache": false, // if true, text sequences are computed before starting training. If phonemes are enabled, they are also computed at this stage.
// PATHS
"output_path": "tests/train_outputs/",
// PHONEMES
"phoneme_cache_path": "tests/train_outputs/phoneme_cache/", // phoneme computation is slow, therefore, it caches results in the given folder.
"use_phonemes": false, // use phonemes instead of raw characters. It is suggested for better pronoun[ciation.
"phoneme_language": "en-us", // depending on your target language, pick one from https://github.com/bootphon/phonemizer#languages
// MULTI-SPEAKER and GST
"use_speaker_embedding": false, // use speaker embedding to enable multi-speaker learning.
"use_external_speaker_embedding_file": false, // if true, forces the model to use external embedding per sample instead of nn.embeddings, that is, it supports external embeddings such as those used at: https://arxiv.org/abs /1806.04558
"external_speaker_embedding_file": "/home/erogol/Data/libritts/speakers.json", // if not null and use_external_speaker_embedding_file is true, it is used to load a specific embedding file and thus uses these embeddings instead of nn.embeddings, that is, it supports external embeddings such as those used at: https://arxiv.org/abs /1806.04558
// DATASETS
"datasets": // List of datasets. They all merged and they get different speaker_ids.
[
{
"name": "ljspeech",
"path": "tests/data/ljspeech/",
"meta_file_train": "metadata.csv",
"meta_file_val": "metadata.csv",
"meta_file_attn_mask": null
}
]
}
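A quick sketch, not part of the commit, of loading this test config with the repo's config
loader (as the new AlignTTS training test script does indirectly); the asserted values are
the ones defined above:

from TTS.utils.io import load_config

c = load_config("tests/inputs/test_align_tts.json")
assert c["model"] == "align_tts"
assert c["phase_start_steps"] == [0, 40000, 80000, 160000, 170000]
assert c["encoder_type"] == c["decoder_type"] == "fftransformer"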

View File

@ -0,0 +1,177 @@
{
"model": "Tacotron",
"run_name": "test_sample_dataset_run",
"run_description": "sample dataset test run",
// AUDIO PARAMETERS
"audio":{
// stft parameters
"fft_size": 1024, // number of stft frequency levels. Size of the linear spectogram frame.
"win_length": 1024, // stft window length in ms.
"hop_length": 256, // stft window hop-lengh in ms.
"frame_length_ms": null, // stft window length in ms.If null, 'win_length' is used.
"frame_shift_ms": null, // stft window hop-lengh in ms. If null, 'hop_length' is used.
// Audio processing parameters
"sample_rate": 22050, // DATASET-RELATED: wav sample-rate.
"preemphasis": 0.0, // pre-emphasis to reduce spec noise and make it more structured. If 0.0, no -pre-emphasis.
"ref_level_db": 20, // reference level db, theoretically 20db is the sound of air.
// Silence trimming
"do_trim_silence": true,// enable trimming of slience of audio as you load it. LJspeech (true), TWEB (false), Nancy (true)
"trim_db": 60, // threshold for timming silence. Set this according to your dataset.
// Griffin-Lim
"power": 1.5, // value to sharpen wav signals after GL algorithm.
"griffin_lim_iters": 60,// #griffin-lim iterations. 30-60 is a good range. Larger the value, slower the generation.
// MelSpectrogram parameters
"num_mels": 80, // size of the mel spec frame.
"mel_fmin": 0.0, // minimum freq level for mel-spec. ~50 for male and ~95 for female voices. Tune for dataset!!
"mel_fmax": 8000.0, // maximum freq level for mel-spec. Tune for dataset!!
"spec_gain": 20.0,
// Normalization parameters
"signal_norm": true, // normalize spec values. Mean-Var normalization if 'stats_path' is defined otherwise range normalization defined by the other params.
"min_level_db": -100, // lower bound for normalization
"symmetric_norm": true, // move normalization to range [-1, 1]
"max_norm": 4.0, // scale normalization to range [-max_norm, max_norm] or [0, max_norm]
"clip_norm": true, // clip normalized values into the range.
"stats_path": null // DO NOT USE WITH MULTI_SPEAKER MODEL. scaler stats file computed by 'compute_statistics.py'. If it is defined, mean-std based notmalization is used and other normalization params are ignored
},
// VOCABULARY PARAMETERS
// if custom character set is not defined,
// default set in symbols.py is used
// "characters":{
// "pad": "_",
// "eos": "~",
// "bos": "^",
// "characters": "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz!'(),-.:;? ",
// "punctuations":"!'(),-.:;? ",
// "phonemes":"iyɨʉɯuɪʏʊeøɘəɵɤoɛœɜɞʌɔæɐaɶɑɒᵻʘɓǀɗǃʄǂɠǁʛpbtdʈɖcɟkɡʔɴŋɲɳnɱmʙrʀⱱɾɽɸβfvθðszʃʒʂʐçʝxɣχʁħʕhɦɬɮʋɹɻjɰlɭʎʟˈˌːˑʍwɥʜʢʡɕʑɺɧɚ˞ɫ"
// },
// DISTRIBUTED TRAINING
"distributed":{
"backend": "nccl",
"url": "tcp:\/\/localhost:54321"
},
"reinit_layers": [], // give a list of layer names to restore from the given checkpoint. If not defined, it reloads all heuristically matching layers.
// TRAINING
"batch_size": 1, // Batch size for training. Lower values than 32 might cause hard to learn attention. It is overwritten by 'gradual_training'.
"eval_batch_size":1,
"r": 7, // Number of decoder frames to predict per iteration. Set the initial values if gradual training is enabled.
"gradual_training": [[0, 7, 4]], //set gradual training steps [first_step, r, batch_size]. If it is null, gradual training is disabled. For Tacotron, you might need to reduce the 'batch_size' as you proceeed.
"loss_masking": true, // enable / disable loss masking against the sequence padding.
"ga_alpha": 10.0, // weight for guided attention loss. If > 0, guided attention is enabled.
"mixed_precision": false,
// VALIDATION
"run_eval": true,
"test_delay_epochs": 0, //Until attention is aligned, testing only wastes computation time.
"test_sentences_file": null, // set a file to load sentences to be used for testing. If it is null then we use default english sentences.
// LOSS SETTINGS
"loss_masking": true, // enable / disable loss masking against the sequence padding.
"decoder_loss_alpha": 0.5, // original decoder loss weight. If > 0, it is enabled
"postnet_loss_alpha": 0.25, // original postnet loss weight. If > 0, it is enabled
"postnet_diff_spec_alpha": 0.25, // differential spectral loss weight. If > 0, it is enabled
"decoder_diff_spec_alpha": 0.25, // differential spectral loss weight. If > 0, it is enabled
"decoder_ssim_alpha": 0.5, // decoder ssim loss weight. If > 0, it is enabled
"postnet_ssim_alpha": 0.25, // postnet ssim loss weight. If > 0, it is enabled
"ga_alpha": 5.0, // weight for guided attention loss. If > 0, guided attention is enabled.
"stopnet_pos_weight": 15.0, // pos class weight for stopnet loss since there are way more negative samples than positive samples.
// OPTIMIZER
"noam_schedule": false, // use noam warmup and lr schedule.
"grad_clip": 1.0, // upper limit for gradients for clipping.
"epochs": 1, // total number of epochs to train.
"lr": 0.0001, // Initial learning rate. If Noam decay is active, maximum learning rate.
"wd": 0.000001, // Weight decay weight.
"warmup_steps": 4000, // Noam decay steps to increase the learning rate from 0 to "lr"
"seq_len_norm": false, // Normalize eash sample loss with its length to alleviate imbalanced datasets. Use it if your dataset is small or has skewed distribution of sequence lengths.
// TACOTRON PRENET
"memory_size": -1, // ONLY TACOTRON - size of the memory queue used fro storing last decoder predictions for auto-regression. If < 0, memory queue is disabled and decoder only uses the last prediction frame.
"prenet_type": "bn", // "original" or "bn".
"prenet_dropout": false, // enable/disable dropout at prenet.
// TACOTRON ATTENTION
"attention_type": "original", // 'original' , 'graves', 'dynamic_convolution'
"attention_heads": 4, // number of attention heads (only for 'graves')
"attention_norm": "sigmoid", // softmax or sigmoid.
"windowing": false, // Enables attention windowing. Used only in eval mode.
"use_forward_attn": false, // if it uses forward attention. In general, it aligns faster.
"forward_attn_mask": false, // Additional masking forcing monotonicity only in eval mode.
"transition_agent": false, // enable/disable transition agent of forward attention.
"location_attn": true, // enable_disable location sensitive attention. It is enabled for TACOTRON by default.
"bidirectional_decoder": true, // use https://arxiv.org/abs/1907.09006. Use it, if attention does not work well with your dataset.
"double_decoder_consistency": false, // use DDC explained here https://erogol.com/solving-attention-problems-of-tts-models-with-double-decoder-consistency-draft/
"ddc_r": 7, // reduction rate for coarse decoder.
// STOPNET
"stopnet": true, // Train stopnet predicting the end of synthesis.
"separate_stopnet": true, // Train stopnet seperately if 'stopnet==true'. It prevents stopnet loss to influence the rest of the model. It causes a better model, but it trains SLOWER.
// TENSORBOARD and LOGGING
"print_step": 1, // Number of steps to log training on console.
"tb_plot_step": 100, // Number of steps to plot TB training figures.
"print_eval": false, // If True, it prints intermediate loss values in evalulation.
"save_step": 10000, // Number of training steps expected to save traninpg stats and checkpoints.
"checkpoint": true, // If true, it saves checkpoints per "save_step"
"keep_all_best": true, // If true, keeps all best_models after keep_after steps
"keep_after": 10000, // Global step after which to keep best models if keep_all_best is true
"tb_model_param_stats": false, // true, plots param stats per layer on tensorboard. Might be memory consuming, but good for debugging.
// DATA LOADING
"text_cleaner": "phoneme_cleaners",
"enable_eos_bos_chars": false, // enable/disable beginning of sentence and end of sentence chars.
"num_loader_workers": 0, // number of training data loader processes. Don't set it too big. 4-8 are good values.
"num_val_loader_workers": 0, // number of evaluation data loader processes.
"batch_group_size": 0, //Number of batches to shuffle after bucketing.
"min_seq_len": 6, // DATASET-RELATED: minimum text length to use in training
"max_seq_len": 153, // DATASET-RELATED: maximum text length
"compute_input_seq_cache": true,
// PATHS
"output_path": "tests/train_outputs/",
// PHONEMES
"phoneme_cache_path": "tests/train_outputs/phoneme_cache/", // phoneme computation is slow, therefore, it caches results in the given folder.
"use_phonemes": true, // use phonemes instead of raw characters. It is suggested for better pronounciation.
"phoneme_language": "en-us", // depending on your target language, pick one from https://github.com/bootphon/phonemizer#languages
// MULTI-SPEAKER and GST
"use_external_speaker_embedding_file": false,
"external_speaker_embedding_file": null,
"use_speaker_embedding": false, // use speaker embedding to enable multi-speaker learning.
"use_gst": true, // use global style tokens
"gst": { // gst parameter if gst is enabled
"gst_style_input": null, // Condition the style input either on a
// -> wave file [path to wave] or
// -> dictionary using the style tokens {'token1': 'value', 'token2': 'value'} example {"0": 0.15, "1": 0.15, "5": -0.15}
// with the dictionary being len(dict) == len(gst_style_tokens).
"gst_use_speaker_embedding": true, // if true pass speaker embedding in attention input GST.
"gst_embedding_dim": 512,
"gst_num_heads": 4,
"gst_style_tokens": 10
},
// DATASETS
"train_portion": 0.1, // dataset portion used for training. It is mainly for internal experiments.
"eval_portion": 0.1, // dataset portion used for training. It is mainly for internal experiments.
"datasets": // List of datasets. They all merged and they get different speaker_ids.
[
{
"name": "ljspeech",
"path": "tests/data/ljspeech/",
"meta_file_train": "metadata.csv",
"meta_file_val": "metadata.csv"
}
]
}
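The "gradual_training" entry above is a list of [first_step, r, batch_size] triplets. A
minimal sketch, an assumption rather than code from this commit, of how a trainer could pick
the active pair for a given global step:

def gradual_training_scheduler(global_step, gradual_training):
    """Return the (r, batch_size) of the last schedule entry whose first_step was reached."""
    values = None
    for first_step, r, batch_size in gradual_training:
        if global_step >= first_step:
            values = (r, batch_size)
    return values

# with "gradual_training": [[0, 7, 4]] every step trains with r=7 and batch_size=4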

View File

@ -0,0 +1,177 @@
{
"model": "Tacotron",
"run_name": "test_sample_dataset_run",
"run_description": "sample dataset test run",
// AUDIO PARAMETERS
"audio":{
// stft parameters
"fft_size": 1024, // number of stft frequency levels. Size of the linear spectogram frame.
"win_length": 1024, // stft window length in ms.
"hop_length": 256, // stft window hop-lengh in ms.
"frame_length_ms": null, // stft window length in ms.If null, 'win_length' is used.
"frame_shift_ms": null, // stft window hop-lengh in ms. If null, 'hop_length' is used.
// Audio processing parameters
"sample_rate": 22050, // DATASET-RELATED: wav sample-rate.
"preemphasis": 0.0, // pre-emphasis to reduce spec noise and make it more structured. If 0.0, no -pre-emphasis.
"ref_level_db": 20, // reference level db, theoretically 20db is the sound of air.
// Silence trimming
"do_trim_silence": true,// enable trimming of slience of audio as you load it. LJspeech (true), TWEB (false), Nancy (true)
"trim_db": 60, // threshold for timming silence. Set this according to your dataset.
// Griffin-Lim
"power": 1.5, // value to sharpen wav signals after GL algorithm.
"griffin_lim_iters": 60,// #griffin-lim iterations. 30-60 is a good range. Larger the value, slower the generation.
// MelSpectrogram parameters
"num_mels": 80, // size of the mel spec frame.
"mel_fmin": 0.0, // minimum freq level for mel-spec. ~50 for male and ~95 for female voices. Tune for dataset!!
"mel_fmax": 8000.0, // maximum freq level for mel-spec. Tune for dataset!!
"spec_gain": 20.0,
// Normalization parameters
"signal_norm": true, // normalize spec values. Mean-Var normalization if 'stats_path' is defined otherwise range normalization defined by the other params.
"min_level_db": -100, // lower bound for normalization
"symmetric_norm": true, // move normalization to range [-1, 1]
"max_norm": 4.0, // scale normalization to range [-max_norm, max_norm] or [0, max_norm]
"clip_norm": true, // clip normalized values into the range.
"stats_path": null // DO NOT USE WITH MULTI_SPEAKER MODEL. scaler stats file computed by 'compute_statistics.py'. If it is defined, mean-std based notmalization is used and other normalization params are ignored
},
// VOCABULARY PARAMETERS
// if custom character set is not defined,
// default set in symbols.py is used
// "characters":{
// "pad": "_",
// "eos": "~",
// "bos": "^",
// "characters": "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz!'(),-.:;? ",
// "punctuations":"!'(),-.:;? ",
// "phonemes":"iyɨʉɯuɪʏʊeøɘəɵɤoɛœɜɞʌɔæɐaɶɑɒᵻʘɓǀɗǃʄǂɠǁʛpbtdʈɖcɟkɡʔɴŋɲɳnɱmʙrʀⱱɾɽɸβfvθðszʃʒʂʐçʝxɣχʁħʕhɦɬɮʋɹɻjɰlɭʎʟˈˌːˑʍwɥʜʢʡɕʑɺɧɚ˞ɫ"
// },
// DISTRIBUTED TRAINING
"distributed":{
"backend": "nccl",
"url": "tcp:\/\/localhost:54321"
},
"reinit_layers": [], // give a list of layer names to restore from the given checkpoint. If not defined, it reloads all heuristically matching layers.
// TRAINING
"batch_size": 1, // Batch size for training. Lower values than 32 might cause hard to learn attention. It is overwritten by 'gradual_training'.
"eval_batch_size":1,
"r": 7, // Number of decoder frames to predict per iteration. Set the initial values if gradual training is enabled.
"gradual_training": [[0, 7, 4]], //set gradual training steps [first_step, r, batch_size]. If it is null, gradual training is disabled. For Tacotron, you might need to reduce the 'batch_size' as you proceeed.
"loss_masking": true, // enable / disable loss masking against the sequence padding.
"ga_alpha": 10.0, // weight for guided attention loss. If > 0, guided attention is enabled.
"mixed_precision": false,
// VALIDATION
"run_eval": true,
"test_delay_epochs": 0, //Until attention is aligned, testing only wastes computation time.
"test_sentences_file": null, // set a file to load sentences to be used for testing. If it is null then we use default english sentences.
// LOSS SETTINGS
"loss_masking": true, // enable / disable loss masking against the sequence padding.
"decoder_loss_alpha": 0.5, // original decoder loss weight. If > 0, it is enabled
"postnet_loss_alpha": 0.25, // original postnet loss weight. If > 0, it is enabled
"postnet_diff_spec_alpha": 0.25, // differential spectral loss weight. If > 0, it is enabled
"decoder_diff_spec_alpha": 0.25, // differential spectral loss weight. If > 0, it is enabled
"decoder_ssim_alpha": 0.5, // decoder ssim loss weight. If > 0, it is enabled
"postnet_ssim_alpha": 0.25, // postnet ssim loss weight. If > 0, it is enabled
"ga_alpha": 5.0, // weight for guided attention loss. If > 0, guided attention is enabled.
"stopnet_pos_weight": 15.0, // pos class weight for stopnet loss since there are way more negative samples than positive samples.
// OPTIMIZER
"noam_schedule": false, // use noam warmup and lr schedule.
"grad_clip": 1.0, // upper limit for gradients for clipping.
"epochs": 1, // total number of epochs to train.
"lr": 0.0001, // Initial learning rate. If Noam decay is active, maximum learning rate.
"wd": 0.000001, // Weight decay weight.
"warmup_steps": 4000, // Noam decay steps to increase the learning rate from 0 to "lr"
"seq_len_norm": false, // Normalize eash sample loss with its length to alleviate imbalanced datasets. Use it if your dataset is small or has skewed distribution of sequence lengths.
// TACOTRON PRENET
"memory_size": -1, // ONLY TACOTRON - size of the memory queue used fro storing last decoder predictions for auto-regression. If < 0, memory queue is disabled and decoder only uses the last prediction frame.
"prenet_type": "bn", // "original" or "bn".
"prenet_dropout": false, // enable/disable dropout at prenet.
// TACOTRON ATTENTION
"attention_type": "original", // 'original' , 'graves', 'dynamic_convolution'
"attention_heads": 4, // number of attention heads (only for 'graves')
"attention_norm": "sigmoid", // softmax or sigmoid.
"windowing": false, // Enables attention windowing. Used only in eval mode.
"use_forward_attn": false, // if it uses forward attention. In general, it aligns faster.
"forward_attn_mask": false, // Additional masking forcing monotonicity only in eval mode.
"transition_agent": false, // enable/disable transition agent of forward attention.
"location_attn": true, // enable_disable location sensitive attention. It is enabled for TACOTRON by default.
"bidirectional_decoder": false, // use https://arxiv.org/abs/1907.09006. Use it, if attention does not work well with your dataset.
"double_decoder_consistency": true, // use DDC explained here https://erogol.com/solving-attention-problems-of-tts-models-with-double-decoder-consistency-draft/
"ddc_r": 7, // reduction rate for coarse decoder.
// STOPNET
"stopnet": true, // Train stopnet predicting the end of synthesis.
"separate_stopnet": true, // Train stopnet seperately if 'stopnet==true'. It prevents stopnet loss to influence the rest of the model. It causes a better model, but it trains SLOWER.
// TENSORBOARD and LOGGING
"print_step": 1, // Number of steps to log training on console.
"tb_plot_step": 100, // Number of steps to plot TB training figures.
"print_eval": false, // If True, it prints intermediate loss values in evalulation.
"save_step": 10000, // Number of training steps expected to save traninpg stats and checkpoints.
"checkpoint": true, // If true, it saves checkpoints per "save_step"
"keep_all_best": true, // If true, keeps all best_models after keep_after steps
"keep_after": 10000, // Global step after which to keep best models if keep_all_best is true
"tb_model_param_stats": false, // true, plots param stats per layer on tensorboard. Might be memory consuming, but good for debugging.
// DATA LOADING
"text_cleaner": "phoneme_cleaners",
"enable_eos_bos_chars": false, // enable/disable beginning of sentence and end of sentence chars.
"num_loader_workers": 0, // number of training data loader processes. Don't set it too big. 4-8 are good values.
"num_val_loader_workers": 0, // number of evaluation data loader processes.
"batch_group_size": 0, //Number of batches to shuffle after bucketing.
"min_seq_len": 6, // DATASET-RELATED: minimum text length to use in training
"max_seq_len": 153, // DATASET-RELATED: maximum text length
"compute_input_seq_cache": true,
// PATHS
"output_path": "tests/train_outputs/",
// PHONEMES
"phoneme_cache_path": "tests/train_outputs/phoneme_cache/", // phoneme computation is slow, therefore, it caches results in the given folder.
"use_phonemes": true, // use phonemes instead of raw characters. It is suggested for better pronounciation.
"phoneme_language": "en-us", // depending on your target language, pick one from https://github.com/bootphon/phonemizer#languages
// MULTI-SPEAKER and GST
"use_external_speaker_embedding_file": false,
"external_speaker_embedding_file": null,
"use_speaker_embedding": false, // use speaker embedding to enable multi-speaker learning.
"use_gst": true, // use global style tokens
"gst": { // gst parameter if gst is enabled
"gst_style_input": null, // Condition the style input either on a
// -> wave file [path to wave] or
// -> dictionary using the style tokens {'token1': 'value', 'token2': 'value'} example {"0": 0.15, "1": 0.15, "5": -0.15}
// with the dictionary being len(dict) == len(gst_style_tokens).
"gst_use_speaker_embedding": true, // if true pass speaker embedding in attention input GST.
"gst_embedding_dim": 512,
"gst_num_heads": 4,
"gst_style_tokens": 10
},
// DATASETS
"train_portion": 0.1, // dataset portion used for training. It is mainly for internal experiments.
"eval_portion": 0.1, // dataset portion used for training. It is mainly for internal experiments.
"datasets": // List of datasets. They all merged and they get different speaker_ids.
[
{
"name": "ljspeech",
"path": "tests/data/ljspeech/",
"meta_file_train": "metadata.csv",
"meta_file_val": "metadata.csv"
}
]
}

View File

@ -0,0 +1,13 @@
#!/usr/bin/env bash
set -xe
BASEDIR=$(dirname "$0")
echo "$BASEDIR"
# run training
CUDA_VISIBLE_DEVICES="" python TTS/bin/train_align_tts.py --config_path $BASEDIR/inputs/test_align_tts.json
# find the training folder
LATEST_FOLDER=$(ls $BASEDIR/train_outputs/| sort | tail -1)
echo $LATEST_FOLDER
# continue the previous training
CUDA_VISIBLE_DEVICES="" python TTS/bin/train_align_tts.py --continue_path $BASEDIR/train_outputs/$LATEST_FOLDER
# remove all the outputs
rm -rf $BASEDIR/train_outputs/

View File

@ -0,0 +1,106 @@
import torch
from TTS.tts.layers.feed_forward.decoder import Decoder
from TTS.tts.layers.feed_forward.encoder import Encoder
from TTS.tts.utils.generic_utils import sequence_mask
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
def test_encoder():
input_dummy = torch.rand(8, 14, 37).to(device)
input_lengths = torch.randint(31, 37, (8, )).long().to(device)
input_lengths[-1] = 37
input_mask = torch.unsqueeze(
sequence_mask(input_lengths, input_dummy.size(2)), 1).to(device)
# relative positional transformer encoder
layer = Encoder(out_channels=11,
in_hidden_channels=14,
encoder_type='relative_position_transformer',
encoder_params={
'hidden_channels_ffn': 768,
'num_heads': 2,
"kernel_size": 3,
"dropout_p": 0.1,
"num_layers": 6,
"rel_attn_window_size": 4,
"input_length": None
}).to(device)
output = layer(input_dummy, input_mask)
assert list(output.shape) == [8, 11, 37]
# residual conv bn encoder
layer = Encoder(out_channels=11,
in_hidden_channels=14,
encoder_type='residual_conv_bn',
encoder_params={
"kernel_size": 4,
"dilations": 4 * [1, 2, 4] + [1],
"num_conv_blocks": 2,
"num_res_blocks": 13
}).to(device)
output = layer(input_dummy, input_mask)
assert list(output.shape) == [8, 11, 37]
# FFTransformer encoder
layer = Encoder(out_channels=14,
in_hidden_channels=14,
encoder_type='fftransformer',
encoder_params={
"hidden_channels_ffn": 31,
"num_heads": 2,
"num_layers": 2,
"dropout_p": 0.1
}).to(device)
output = layer(input_dummy, input_mask)
assert list(output.shape) == [8, 14, 37]
def test_decoder():
input_dummy = torch.rand(8, 128, 37).to(device)
input_lengths = torch.randint(31, 37, (8, )).long().to(device)
input_lengths[-1] = 37
input_mask = torch.unsqueeze(
sequence_mask(input_lengths, input_dummy.size(2)), 1).to(device)
# residual bn conv decoder
layer = Decoder(out_channels=11, in_hidden_channels=128).to(device)
output = layer(input_dummy, input_mask)
assert list(output.shape) == [8, 11, 37]
# transformer decoder
layer = Decoder(out_channels=11,
in_hidden_channels=128,
decoder_type='relative_position_transformer',
decoder_params={
'hidden_channels_ffn': 128,
'num_heads': 2,
"kernel_size": 3,
"dropout_p": 0.1,
"num_layers": 8,
"rel_attn_window_size": 4,
"input_length": None
}).to(device)
output = layer(input_dummy, input_mask)
assert list(output.shape) == [8, 11, 37]
# wavenet decoder
layer = Decoder(out_channels=11,
in_hidden_channels=128,
decoder_type='wavenet',
decoder_params={
"num_blocks": 12,
"hidden_channels": 192,
"kernel_size": 5,
"dilation_rate": 1,
"num_layers": 4,
"dropout_p": 0.05
}).to(device)
output = layer(input_dummy, input_mask)
assert list(output.shape) == [8, 11, 37]
# FFTransformer decoder
layer = Decoder(out_channels=11,
in_hidden_channels=128,
decoder_type='fftransformer',
decoder_params={
'hidden_channels_ffn': 31,
'num_heads': 2,
"dropout_p": 0.1,
"num_layers": 2,
}).to(device)
output = layer(input_dummy, input_mask)
assert list(output.shape) == [8, 11, 37]

View File

@ -7,7 +7,7 @@ from tests import get_tests_input_path
from torch import optim
from TTS.tts.layers.losses import GlowTTSLoss
from TTS.tts.models.glow_tts import GlowTts
from TTS.tts.models.glow_tts import GlowTTS
from TTS.utils.io import load_config
from TTS.utils.audio import AudioProcessor
@ -35,14 +35,13 @@ class GlowTTSTrainTest(unittest.TestCase):
input_lengths = torch.randint(100, 129, (8, )).long().to(device)
input_lengths[-1] = 128
mel_spec = torch.rand(8, c.audio['num_mels'], 30).to(device)
linear_spec = torch.rand(8, 30, c.audio['fft_size']).to(device)
mel_lengths = torch.randint(20, 30, (8, )).long().to(device)
speaker_ids = torch.randint(0, 5, (8, )).long().to(device)
criterion = criterion = GlowTTSLoss()
criterion = GlowTTSLoss()
# model to train
model = GlowTts(
model = GlowTTS(
num_chars=32,
hidden_channels_enc=48,
hidden_channels_dec=48,
@ -60,7 +59,7 @@ class GlowTTSTrainTest(unittest.TestCase):
use_encoder_prenet=True,
num_flow_blocks_dec=12,
kernel_size_dec=5,
dilation_rate=5,
dilation_rate=1,
num_block_layers=4,
dropout_p_dec=0.,
num_speakers=0,
@ -71,7 +70,7 @@ class GlowTTSTrainTest(unittest.TestCase):
mean_only=False).to(device)
# reference model to compare model weights
model_ref = GlowTts(
model_ref = GlowTTS(
num_chars=32,
hidden_channels_enc=48,
hidden_channels_dec=48,
@ -89,7 +88,7 @@ class GlowTTSTrainTest(unittest.TestCase):
use_encoder_prenet=True,
num_flow_blocks_dec=12,
kernel_size_dec=5,
dilation_rate=5,
dilation_rate=1,
num_block_layers=4,
dropout_p_dec=0.,
num_speakers=0,
@ -112,11 +111,11 @@ class GlowTTSTrainTest(unittest.TestCase):
assert (param - param_ref).sum() == 0, param
count += 1
optimizer = optim.Adam(model.parameters(), lr=c.lr)
optimizer = optim.Adam(model.parameters(), lr=0.001)
for _ in range(5):
optimizer.zero_grad()
z, logdet, y_mean, y_log_scale, alignments, o_dur_log, o_total_dur = model.forward(
input_dummy, input_lengths, mel_spec, mel_lengths, None)
optimizer.zero_grad()
loss_dict = criterion(z, y_mean, y_log_scale, logdet, mel_lengths,
o_dur_log, o_total_dur, input_lengths)
loss = loss_dict['loss']

View File

@ -1,7 +1,7 @@
import unittest
import torch as T
from TTS.tts.layers.tacotron import Prenet, CBHG, Decoder, Encoder
from TTS.tts.layers.tacotron.tacotron import Prenet, CBHG, Decoder, Encoder
from TTS.tts.layers.losses import L1LossMasked, SSIMLoss
from TTS.tts.utils.generic_utils import sequence_mask

View File

@ -1,20 +1,20 @@
#!/usr/bin/env python3`
import os
import shutil
import glob
from tests import get_tests_output_path
from TTS.utils.manage import ModelManager
# #!/usr/bin/env python3`
# import os
# import shutil
# import glob
# from tests import get_tests_output_path
# from TTS.utils.manage import ModelManager
def test_if_all_models_available():
"""Check if all the models are downloadable."""
print(" > Checking the availability of all the models under the ModelManager.")
manager = ModelManager(output_prefix=get_tests_output_path())
model_names = manager.list_models()
for model_name in model_names:
manager.download_model(model_name)
print(f" | > OK: {model_name}")
# def test_if_all_models_available():
# """Check if all the models are downloadable."""
# print(" > Checking the availability of all the models under the ModelManager.")
# manager = ModelManager(output_prefix=get_tests_output_path())
# model_names = manager.list_models()
# for model_name in model_names:
# manager.download_model(model_name)
# print(f" | > OK: {model_name}")
folders = glob.glob(os.path.join(manager.output_prefix, '*'))
assert len(folders) == len(model_names)
shutil.rmtree(manager.output_prefix)
# folders = glob.glob(os.path.join(manager.output_prefix, '*'))
# assert len(folders) == len(model_names)
# shutil.rmtree(manager.output_prefix)

16
tests/test_resample.sh Executable file
View File

@ -0,0 +1,16 @@
#!/usr/bin/env bash
set -xe
BASEDIR=$(dirname "$0")
TARGET_SR=16000
echo "$BASEDIR"
#run the resample script
python TTS/bin/resample.py --input_dir $BASEDIR/data/ljspeech --output_dir $BASEDIR/outputs/resample_tests --output_sr $TARGET_SR
#check samplerate of output
OUT_SR=$( (echo "import librosa" ; echo "y, sr = librosa.load('"$BASEDIR"/outputs/resample_tests/wavs/LJ001-0012.wav', sr=None)" ; echo "print(sr)") | python )
OUT_SR=$(($OUT_SR + 0))
if [[ $OUT_SR -ne $TARGET_SR ]]; then
echo "Missmatch between target and output sample rates"
exit 1
fi
#cleaning up
rm -rf $BASEDIR/outputs/resample_tests

View File

@ -1,8 +1,5 @@
import torch
from TTS.tts.layers.speedy_speech.encoder import Encoder
from TTS.tts.layers.speedy_speech.decoder import Decoder
from TTS.tts.layers.speedy_speech.duration_predictor import DurationPredictor
from TTS.tts.layers.feed_forward.duration_predictor import DurationPredictor
from TTS.tts.utils.generic_utils import sequence_mask
from TTS.tts.models.speedy_speech import SpeedySpeech
@ -11,84 +8,6 @@ use_cuda = torch.cuda.is_available()
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
def test_encoder():
input_dummy = torch.rand(8, 14, 37).to(device)
input_lengths = torch.randint(31, 37, (8, )).long().to(device)
input_lengths[-1] = 37
input_mask = torch.unsqueeze(
sequence_mask(input_lengths, input_dummy.size(2)), 1).to(device)
# residual bn conv encoder
layer = Encoder(out_channels=11,
in_hidden_channels=14,
encoder_type='residual_conv_bn').to(device)
output = layer(input_dummy, input_mask)
assert list(output.shape) == [8, 11, 37]
# transformer encoder
layer = Encoder(out_channels=11,
in_hidden_channels=14,
encoder_type='transformer',
encoder_params={
'hidden_channels_ffn': 768,
'num_heads': 2,
"kernel_size": 3,
"dropout_p": 0.1,
"num_layers": 6,
"rel_attn_window_size": 4,
"input_length": None
}).to(device)
output = layer(input_dummy, input_mask)
assert list(output.shape) == [8, 11, 37]
def test_decoder():
input_dummy = torch.rand(8, 128, 37).to(device)
input_lengths = torch.randint(31, 37, (8, )).long().to(device)
input_lengths[-1] = 37
input_mask = torch.unsqueeze(
sequence_mask(input_lengths, input_dummy.size(2)), 1).to(device)
# residual bn conv decoder
layer = Decoder(out_channels=11, in_hidden_channels=128).to(device)
output = layer(input_dummy, input_mask)
assert list(output.shape) == [8, 11, 37]
# transformer decoder
layer = Decoder(out_channels=11,
in_hidden_channels=128,
decoder_type='transformer',
decoder_params={
'hidden_channels_ffn': 128,
'num_heads': 2,
"kernel_size": 3,
"dropout_p": 0.1,
"num_layers": 8,
"rel_attn_window_size": 4,
"input_length": None
}).to(device)
output = layer(input_dummy, input_mask)
assert list(output.shape) == [8, 11, 37]
# wavenet decoder
layer = Decoder(out_channels=11,
in_hidden_channels=128,
decoder_type='wavenet',
decoder_params={
"num_blocks": 12,
"hidden_channels": 192,
"kernel_size": 5,
"dilation_rate": 1,
"num_layers": 4,
"dropout_p": 0.05
}).to(device)
output = layer(input_dummy, input_mask)
assert list(output.shape) == [8, 11, 37]
def test_duration_predictor():
input_dummy = torch.rand(8, 128, 27).to(device)
input_lengths = torch.randint(20, 27, (8, )).long().to(device)
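
All of these layer tests build their input masks the same way. A minimal sketch of what `sequence_mask` returns, assuming it keeps the `(lengths, max_len)` call shape used above:

```python
# sequence_mask turns a batch of lengths into a boolean [B, T_max] mask that
# is True on valid time steps; the tests unsqueeze it to [B, 1, T_max] so it
# broadcasts over the channel axis of [B, C, T] feature tensors.
import torch
from TTS.tts.utils.generic_utils import sequence_mask

lengths = torch.tensor([3, 5])
mask = sequence_mask(lengths, 5)   # [2, 5]
# -> [[ True,  True,  True, False, False],
#     [ True,  True,  True,  True,  True]]
mask = torch.unsqueeze(mask, 1)    # [2, 1, 5], as passed to Encoder/Decoder above
```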

View File

@ -2,8 +2,9 @@
set -xe
BASEDIR=$(dirname "$0")
echo "$BASEDIR"
# run training
CUDA_VISIBLE_DEVICES="" python TTS/bin/train_tacotron.py --config_path $BASEDIR/inputs/test_train_config.json
CUDA_VISIBLE_DEVICES="" python TTS/bin/train_tacotron.py --config_path $BASEDIR/inputs/test_tacotron_config.json
# find the training folder
LATEST_FOLDER=$(ls $BASEDIR/train_outputs/| sort | tail -1)
echo $LATEST_FOLDER
@ -11,3 +12,25 @@ echo $LATEST_FOLDER
CUDA_VISIBLE_DEVICES="" python TTS/bin/train_tacotron.py --continue_path $BASEDIR/train_outputs/$LATEST_FOLDER
# remove all the outputs
rm -rf $BASEDIR/train_outputs/
# run Tacotron bi-directional decoder
CUDA_VISIBLE_DEVICES="" python TTS/bin/train_tacotron.py --config_path $BASEDIR/inputs/test_tacotron_bd_config.json
# find the training folder
LATEST_FOLDER=$(ls $BASEDIR/train_outputs/| sort | tail -1)
echo $LATEST_FOLDER
# continue the previous training
CUDA_VISIBLE_DEVICES="" python TTS/bin/train_tacotron.py --continue_path $BASEDIR/train_outputs/$LATEST_FOLDER
# remove all the outputs
rm -rf $BASEDIR/train_outputs/
# Tacotron2
# run training
CUDA_VISIBLE_DEVICES="" python TTS/bin/train_tacotron.py --config_path $BASEDIR/inputs/test_tacotron2_config.json
# find the training folder
LATEST_FOLDER=$(ls $BASEDIR/train_outputs/| sort | tail -1)
echo $LATEST_FOLDER
# continue the previous training
CUDA_VISIBLE_DEVICES="" python TTS/bin/train_tacotron.py --continue_path $BASEDIR/train_outputs/$LATEST_FOLDER
# remove all the outputs
rm -rf $BASEDIR/train_outputs/
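
The continue-training step depends on `ls | sort | tail -1` picking the newest run folder, which assumes folder names sort lexicographically by creation time. A rough Python equivalent of that selection (paths are illustrative):

```python
# Pick the newest training output folder, mirroring `ls | sort | tail -1`.
import os

train_outputs = "tests/train_outputs"               # illustrative path
latest = sorted(os.listdir(train_outputs))[-1]       # newest run, by name order
continue_path = os.path.join(train_outputs, latest)  # passed via --continue_path
print(continue_path)
```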

View File

@ -4,7 +4,7 @@ import torch
import soundfile as sf
from librosa.core import load
from tests import get_tests_path, get_tests_input_path
from tests import get_tests_path, get_tests_input_path, get_tests_output_path
from TTS.vocoder.layers.pqmf import PQMF
@ -24,4 +24,5 @@ def test_pqmf():
print(w2_.max())
print(w2_.min())
print(w2_.mean())
sf.write('pqmf_output.wav', w2_.flatten().detach(), sr)
sf.write(os.path.join(get_tests_output_path(), 'pqmf_output.wav'),
w2_.flatten().detach(), sr)
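
For context, `w2_` is the result of a PQMF analysis/synthesis round trip on the test waveform. A minimal sketch of that round trip, assuming the layer keeps the `analysis`/`synthesis` pair and the band settings used elsewhere in the repo:

```python
# PQMF round trip: split a waveform into N sub-bands and reconstruct it.
# Constructor arguments and the input path are assumptions for illustration.
import torch
from librosa.core import load
from TTS.vocoder.layers.pqmf import PQMF

w, sr = load("tests/inputs/example.wav")      # illustrative input file
x = torch.from_numpy(w[None, None, :])        # [B, 1, T]
pqmf = PQMF(N=4, taps=62, cutoff=0.15, beta=9.0)
bands = pqmf.analysis(x)                      # roughly [B, N, T // N]
x_hat = pqmf.synthesis(bands)                 # reconstructed [B, 1, ~T]
```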

View File

@ -4,7 +4,7 @@ import tensorflow as tf
import soundfile as sf
from librosa.core import load
from tests import get_tests_path, get_tests_input_path
from tests import get_tests_path, get_tests_input_path, get_tests_output_path
from TTS.vocoder.tf.layers.pqmf import PQMF
@ -25,4 +25,5 @@ def test_pqmf():
print(w2_.max())
print(w2_.min())
print(w2_.mean())
sf.write('tf_pqmf_output.wav', w2_.flatten(), sr)
sf.write(os.path.join(get_tests_output_path(), 'tf_pqmf_output.wav'),
w2_.flatten(), sr)