From 5e148038be5971f2c7c811d46a1d7b28c759ecda Mon Sep 17 00:00:00 2001 From: root Date: Thu, 9 Jan 2020 15:56:09 +0100 Subject: [PATCH 01/36] simpler gmm attention implementaiton --- config.json | 2 +- layers/common_layers.py | 15 ++++++--------- 2 files changed, 7 insertions(+), 10 deletions(-) diff --git a/config.json b/config.json index 91863c4c..d23246a7 100644 --- a/config.json +++ b/config.json @@ -109,7 +109,7 @@ [ { "name": "ljspeech", - "path": "/data5/ro/shared/data/keithito/LJSpeech-1.1/", + "path": "/root/LJSpeech-1.1/", // "path": "/home/erogol/Data/LJSpeech-1.1", "meta_file_train": "metadata_train.csv", "meta_file_val": "metadata_val.csv" diff --git a/layers/common_layers.py b/layers/common_layers.py index 8b8ff073..112760b3 100644 --- a/layers/common_layers.py +++ b/layers/common_layers.py @@ -132,7 +132,7 @@ class GravesAttention(nn.Module): def init_states(self, inputs): if self.J is None or inputs.shape[1] > self.J.shape[-1]: - self.J = torch.arange(0, inputs.shape[1]).to(inputs.device).expand([inputs.shape[0], self.K, inputs.shape[1]]) + self.J = torch.arange(0, inputs.shape[1]+1).to(inputs.device) + 0.5 self.attention_weights = torch.zeros(inputs.shape[0], inputs.shape[1]).to(inputs.device) self.mu_prev = torch.zeros(inputs.shape[0], self.K).to(inputs.device) @@ -164,17 +164,14 @@ class GravesAttention(nn.Module): mu_t = self.mu_prev + torch.nn.functional.softplus(k_t) g_t = torch.softmax(g_t, dim=-1) / sig_t + self.eps - # each B x K x T_in - g_t = g_t.unsqueeze(2).expand(g_t.size(0), - g_t.size(1), - inputs.size(1)) - sig_t = sig_t.unsqueeze(2).expand_as(g_t) - mu_t_ = mu_t.unsqueeze(2).expand_as(g_t) - j = self.J[:g_t.size(0), :, :inputs.size(1)] + j = self.J[:inputs.size(1)+1] # attention weights - phi_t = g_t * torch.exp(-0.5 * (mu_t_ - j)**2 / (sig_t**2)) + phi_t = g_t.unsqueeze(-1) * torch.exp(-0.5 * (mu_t.unsqueeze(-1) - j)**2 / (sig_t.unsqueeze(-1)**2)) + + # discritize attention weights alpha_t = self.COEF * torch.sum(phi_t, 1) + alpha_t = alpha_t[:, 1:] - alpha_t[:, :-1] # apply masking if mask is not None: From e5bf2719bdfd23be8c118276c1009853d1b146ca Mon Sep 17 00:00:00 2001 From: root Date: Tue, 14 Jan 2020 13:22:23 +0100 Subject: [PATCH 02/36] graves attention as in melnet paper --- layers/common_layers.py | 15 ++++++++------- utils/measures.py | 11 +++++++++-- 2 files changed, 17 insertions(+), 9 deletions(-) diff --git a/layers/common_layers.py b/layers/common_layers.py index 112760b3..64a3d201 100644 --- a/layers/common_layers.py +++ b/layers/common_layers.py @@ -131,8 +131,8 @@ class GravesAttention(nn.Module): torch.nn.init.constant_(self.N_a[2].bias[self.K:(2*self.K)], 10) def init_states(self, inputs): - if self.J is None or inputs.shape[1] > self.J.shape[-1]: - self.J = torch.arange(0, inputs.shape[1]+1).to(inputs.device) + 0.5 + if self.J is None or inputs.shape[1]+1 > self.J.shape[-1]: + self.J = torch.arange(0, inputs.shape[1]+2).to(inputs.device) + 0.5 self.attention_weights = torch.zeros(inputs.shape[0], inputs.shape[1]).to(inputs.device) self.mu_prev = torch.zeros(inputs.shape[0], self.K).to(inputs.device) @@ -160,24 +160,25 @@ class GravesAttention(nn.Module): # attention GMM parameters sig_t = torch.nn.functional.softplus(b_t) + self.eps - mu_t = self.mu_prev + torch.nn.functional.softplus(k_t) g_t = torch.softmax(g_t, dim=-1) / sig_t + self.eps - j = self.J[:inputs.size(1)+1] # attention weights - phi_t = g_t.unsqueeze(-1) * torch.exp(-0.5 * (mu_t.unsqueeze(-1) - j)**2 / (sig_t.unsqueeze(-1)**2)) + phi_t = g_t.unsqueeze(-1) * (1 / (1 + 
torch.exp((mu_t.unsqueeze(-1) - j) / sig_t.unsqueeze(-1)))) # discritize attention weights - alpha_t = self.COEF * torch.sum(phi_t, 1) + alpha_t = torch.sum(phi_t, 1) alpha_t = alpha_t[:, 1:] - alpha_t[:, :-1] + alpha_t[alpha_t == 0] = 1e-8 # apply masking if mask is not None: alpha_t.data.masked_fill_(~mask, self._mask_value) context = torch.bmm(alpha_t.unsqueeze(1), inputs).squeeze(1) + # for better visualization + # self.attention_weights = torch.clamp(alpha_t, min=0) self.attention_weights = alpha_t self.mu_prev = mu_t return context @@ -350,7 +351,7 @@ class OriginalAttention(nn.Module): if self.forward_attn: alignment = self.apply_forward_attention(alignment) self.alpha = alignment - + context = torch.bmm(alignment.unsqueeze(1), inputs) context = context.squeeze(1) self.attention_weights = alignment diff --git a/utils/measures.py b/utils/measures.py index a76a2225..01d25695 100644 --- a/utils/measures.py +++ b/utils/measures.py @@ -1,11 +1,18 @@ +import torch -def alignment_diagonal_score(alignments): + +def alignment_diagonal_score(alignments, binary=False): """ Compute how diagonal alignment predictions are. It is useful to measure the alignment consistency of a model Args: alignments (torch.Tensor): batch of alignments. + binary (bool): if True, ignore scores and consider attention + as a binary mask. Shape: alignments : batch x decoder_steps x encoder_steps """ - return alignments.max(dim=1)[0].mean(dim=1).mean(dim=0).item() + maxs = alignments.max(dim=1)[0] + if binary: + maxs[maxs > 0] = 1 + return maxs.mean(dim=1).mean(dim=0).item() From 3d59e61c6025f077cb0bc9d44dc830f83656080b Mon Sep 17 00:00:00 2001 From: root Date: Wed, 15 Jan 2020 01:53:27 +0100 Subject: [PATCH 03/36] graves v2 --- config.json | 2 +- layers/common_layers.py | 10 +++++----- 2 files changed, 6 insertions(+), 6 deletions(-) diff --git a/config.json b/config.json index d23246a7..115f4dc6 100644 --- a/config.json +++ b/config.json @@ -1,6 +1,6 @@ { "model": "Tacotron2", // one of the model in models/ - "run_name": "ljspeech-graves", + "run_name": "ljspeech-gravesv2", "run_description": "tacotron2 wuth graves attention", // AUDIO PARAMETERS diff --git a/layers/common_layers.py b/layers/common_layers.py index 64a3d201..1337977a 100644 --- a/layers/common_layers.py +++ b/layers/common_layers.py @@ -113,7 +113,7 @@ class GravesAttention(nn.Module): def __init__(self, query_dim, K): super(GravesAttention, self).__init__() - self._mask_value = 0.0 + self._mask_value = 1e-8 self.K = K # self.attention_alignment = 0.05 self.eps = 1e-5 @@ -160,12 +160,14 @@ class GravesAttention(nn.Module): # attention GMM parameters sig_t = torch.nn.functional.softplus(b_t) + self.eps + mu_t = self.mu_prev + torch.nn.functional.softplus(k_t) g_t = torch.softmax(g_t, dim=-1) / sig_t + self.eps + j = self.J[:inputs.size(1)+1] # attention weights - phi_t = g_t.unsqueeze(-1) * (1 / (1 + torch.exp((mu_t.unsqueeze(-1) - j) / sig_t.unsqueeze(-1)))) + phi_t = g_t.unsqueeze(-1) * (1 / (1 + torch.sigmoid((mu_t.unsqueeze(-1) - j) / sig_t.unsqueeze(-1)))) # discritize attention weights alpha_t = torch.sum(phi_t, 1) @@ -177,8 +179,6 @@ class GravesAttention(nn.Module): alpha_t.data.masked_fill_(~mask, self._mask_value) context = torch.bmm(alpha_t.unsqueeze(1), inputs).squeeze(1) - # for better visualization - # self.attention_weights = torch.clamp(alpha_t, min=0) self.attention_weights = alpha_t self.mu_prev = mu_t return context @@ -351,7 +351,7 @@ class OriginalAttention(nn.Module): if self.forward_attn: alignment = 
self.apply_forward_attention(alignment) self.alpha = alignment - + context = torch.bmm(alignment.unsqueeze(1), inputs) context = context.squeeze(1) self.attention_weights = alignment From 04ba700b1f5d9af3be7d33bf10dbdd69e188592f Mon Sep 17 00:00:00 2001 From: root Date: Wed, 15 Jan 2020 12:30:07 +0100 Subject: [PATCH 04/36] seq_len_norm for imbalanced datasets --- layers/losses.py | 40 ++++++++++++++++++++++++++++++++-------- train.py | 4 ++-- 2 files changed, 34 insertions(+), 10 deletions(-) diff --git a/layers/losses.py b/layers/losses.py index e7ecff5f..b8b17c17 100644 --- a/layers/losses.py +++ b/layers/losses.py @@ -6,6 +6,11 @@ from TTS.utils.generic_utils import sequence_mask class L1LossMasked(nn.Module): + + def __init__(self, seq_len_norm): + super(L1LossMasked, self).__init__() + self.seq_len_norm = seq_len_norm + def forward(self, x, target, length): """ Args: @@ -24,14 +29,26 @@ class L1LossMasked(nn.Module): target.requires_grad = False mask = sequence_mask( sequence_length=length, max_len=target.size(1)).unsqueeze(2).float() - mask = mask.expand_as(x) - loss = functional.l1_loss( - x * mask, target * mask, reduction="sum") - loss = loss / mask.sum() + if self.seq_len_norm: + norm_w = mask / mask.sum(dim=1, keepdim=True) + out_weights = norm_w.div(target.shape[0] * target.shape[2]) + mask = mask.expand_as(x) + loss = functional.l1_loss( + x * mask, target * mask, reduction='none') + loss = loss.mul(out_weights.cuda()).sum() + else: + loss = functional.l1_loss( + x * mask, target * mask, reduction='sum') + loss = loss / mask.sum() return loss class MSELossMasked(nn.Module): + + def __init__(self, seq_len_norm): + super(MSELossMasked, self).__init__() + self.seq_len_norm = seq_len_norm + def forward(self, x, target, length): """ Args: @@ -50,10 +67,17 @@ class MSELossMasked(nn.Module): target.requires_grad = False mask = sequence_mask( sequence_length=length, max_len=target.size(1)).unsqueeze(2).float() - mask = mask.expand_as(x) - loss = functional.mse_loss( - x * mask, target * mask, reduction="sum") - loss = loss / mask.sum() + if self.seq_len_norm: + norm_w = mask / mask.sum(dim=1, keepdim=True) + out_weights = norm_w.div(target.shape[0] * target.shape[2]) + mask = mask.expand_as(x) + loss = functional.mse_loss( + x * mask, target * mask, reduction='none') + loss = loss.mul(out_weights.cuda()).sum() + else: + loss = functional.mse_loss( + x * mask, target * mask, reduction='sum') + loss = loss / mask.sum() return loss diff --git a/train.py b/train.py index 81bc2c72..f52d24c1 100644 --- a/train.py +++ b/train.py @@ -561,8 +561,8 @@ def main(args): # pylint: disable=redefined-outer-name optimizer_st = None if c.loss_masking: - criterion = L1LossMasked() if c.model in ["Tacotron", "TacotronGST" - ] else MSELossMasked() + criterion = L1LossMasked(c.seq_len_norm) if c.model in ["Tacotron", "TacotronGST" + ] else MSELossMasked(c.seq_len_norm) else: criterion = nn.L1Loss() if c.model in ["Tacotron", "TacotronGST" ] else nn.MSELoss() From 6fd61e82b0965381cb6905b57ebf5048f641e71e Mon Sep 17 00:00:00 2001 From: root Date: Wed, 15 Jan 2020 12:38:04 +0100 Subject: [PATCH 05/36] seq_len_norm set in config --- config.json | 1 + 1 file changed, 1 insertion(+) diff --git a/config.json b/config.json index ef999fa9..1829fde8 100644 --- a/config.json +++ b/config.json @@ -53,6 +53,7 @@ "lr": 0.0001, // Initial learning rate. If Noam decay is active, maximum learning rate. "wd": 0.000001, // Weight decay weight. 
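For intuition, a minimal standalone sketch of what the new seq_len_norm branch in the masked losses computes (illustrative only, not the repo class): per-sample weights that sum to one over each utterance's valid frames, so short and long utterances contribute equally to the batch loss instead of long ones dominating it.

import torch
from torch.nn import functional

def l1_seq_len_norm(x, target, mask):
    # x, target: B x T x D; mask: B x T x 1 with 1.0 on valid frames, 0.0 on padding
    norm_w = mask / mask.sum(dim=1, keepdim=True)               # each sample's weights sum to 1
    out_weights = norm_w / (target.shape[0] * target.shape[2])  # average over batch and feature dims
    loss = functional.l1_loss(x * mask, target * mask, reduction='none')
    return loss.mul(out_weights).sum()                          # comparable in scale to the masked mean

With seq_len_norm disabled, the loss falls back to the plain masked mean, as in the else branch of the patch.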
"warmup_steps": 4000, // Noam decay steps to increase the learning rate from 0 to "lr" + "seq_len_norm": false, // Normalize eash sample loss with its length to alleviate imbalanced datasets. Use it if your dataset is small or has skewed distribution of sequence lengths. // TACOTRON PRENET "memory_size": -1, // ONLY TACOTRON - size of the memory queue used fro storing last decoder predictions for auto-regression. If < 0, memory queue is disabled and decoder only uses the last prediction frame. From 3af989643b16a36c182e614461bf301489028293 Mon Sep 17 00:00:00 2001 From: root Date: Wed, 15 Jan 2020 23:17:55 +0100 Subject: [PATCH 06/36] bug fix for losses --- layers/losses.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/layers/losses.py b/layers/losses.py index b8b17c17..90d2ac80 100644 --- a/layers/losses.py +++ b/layers/losses.py @@ -37,6 +37,7 @@ class L1LossMasked(nn.Module): x * mask, target * mask, reduction='none') loss = loss.mul(out_weights.cuda()).sum() else: + mask = mask.expand_as(x) loss = functional.l1_loss( x * mask, target * mask, reduction='sum') loss = loss / mask.sum() @@ -75,6 +76,7 @@ class MSELossMasked(nn.Module): x * mask, target * mask, reduction='none') loss = loss.mul(out_weights.cuda()).sum() else: + mask = mask.expand_as(x) loss = functional.mse_loss( x * mask, target * mask, reduction='sum') loss = loss / mask.sum() From a391a7f859463744d7f67d42f2e475945cd91336 Mon Sep 17 00:00:00 2001 From: root Date: Sat, 18 Jan 2020 00:33:51 +0100 Subject: [PATCH 07/36] stop dividing g_t with sig_t and commenting --- layers/common_layers.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/layers/common_layers.py b/layers/common_layers.py index 1337977a..fbedc2b9 100644 --- a/layers/common_layers.py +++ b/layers/common_layers.py @@ -127,8 +127,8 @@ class GravesAttention(nn.Module): self.init_layers() def init_layers(self): - torch.nn.init.constant_(self.N_a[2].bias[(2*self.K):(3*self.K)], 1.) - torch.nn.init.constant_(self.N_a[2].bias[self.K:(2*self.K)], 10) + torch.nn.init.constant_(self.N_a[2].bias[(2*self.K):(3*self.K)], 1.) 
# bias mean + torch.nn.init.constant_(self.N_a[2].bias[self.K:(2*self.K)], 10) # bias std def init_states(self, inputs): if self.J is None or inputs.shape[1]+1 > self.J.shape[-1]: @@ -162,7 +162,7 @@ class GravesAttention(nn.Module): sig_t = torch.nn.functional.softplus(b_t) + self.eps mu_t = self.mu_prev + torch.nn.functional.softplus(k_t) - g_t = torch.softmax(g_t, dim=-1) / sig_t + self.eps + g_t = torch.softmax(g_t, dim=-1) + self.eps j = self.J[:inputs.size(1)+1] From eb63c95d979a0156af95122479d92c2ebf3609e1 Mon Sep 17 00:00:00 2001 From: root Date: Mon, 27 Jan 2020 15:42:56 +0100 Subject: [PATCH 08/36] bug fixes --- utils/audio.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/utils/audio.py b/utils/audio.py index 05694dce..82e5aa47 100644 --- a/utils/audio.py +++ b/utils/audio.py @@ -66,12 +66,11 @@ class AudioProcessor(object): return np.maximum(1e-10, np.dot(inv_mel_basis, mel_spec)) def _build_mel_basis(self, ): - n_fft = (self.num_freq - 1) * 2 if self.mel_fmax is not None: assert self.mel_fmax <= self.sample_rate // 2 return librosa.filters.mel( self.sample_rate, - n_fft, + self.n_fft, n_mels=self.num_mels, fmin=self.mel_fmin, fmax=self.mel_fmax) @@ -197,6 +196,7 @@ class AudioProcessor(object): n_fft=self.n_fft, hop_length=self.hop_length, win_length=self.win_length, + pad_mode='constant' ) def _istft(self, y): @@ -217,7 +217,7 @@ class AudioProcessor(object): margin = int(self.sample_rate * 0.01) wav = wav[margin:-margin] return librosa.effects.trim( - wav, top_db=60, frame_length=self.win_length, hop_length=self.hop_length)[0] + wav, top_db=40, frame_length=self.win_length, hop_length=self.hop_length)[0] @staticmethod def mulaw_encode(wav, qc): @@ -247,7 +247,7 @@ class AudioProcessor(object): print(f' [!] File cannot be trimmed for silence - {filename}') assert self.sample_rate == sr, "%s vs %s"%(self.sample_rate, sr) if self.sound_norm: - x = x / abs(x.max()) * 0.9 + x = x / abs(x).max() * 0.9 return x @staticmethod From 7a616aa9ef85fa4833eff6b78fd8e155a152a002 Mon Sep 17 00:00:00 2001 From: root Date: Mon, 27 Jan 2020 15:46:59 +0100 Subject: [PATCH 09/36] remove old graves --- layers/common_layers.py | 79 ++--------------------------------------- 1 file changed, 2 insertions(+), 77 deletions(-) diff --git a/layers/common_layers.py b/layers/common_layers.py index 023c7404..592f017c 100644 --- a/layers/common_layers.py +++ b/layers/common_layers.py @@ -111,8 +111,9 @@ class LocationLayer(nn.Module): class GravesAttention(nn.Module): - """ Graves attention as described here: + """ Discretized Graves attention: - https://arxiv.org/abs/1910.10288 + - https://arxiv.org/pdf/1906.01083.pdf """ COEF = 0.3989422917366028 # numpy.sqrt(1/(2*numpy.pi)) @@ -368,82 +369,6 @@ class OriginalAttention(nn.Module): return context -class GravesAttention(nn.Module): - """ Graves attention as described here: - - https://arxiv.org/abs/1910.10288 - """ - COEF = 0.3989422917366028 # numpy.sqrt(1/(2*numpy.pi)) - - def __init__(self, query_dim, K): - super(GravesAttention, self).__init__() - self._mask_value = 0.0 - self.K = K - # self.attention_alignment = 0.05 - self.eps = 1e-5 - self.J = None - self.N_a = nn.Sequential( - nn.Linear(query_dim, query_dim, bias=True), - nn.ReLU(), - nn.Linear(query_dim, 3*K, bias=True)) - self.attention_weights = None - self.mu_prev = None - self.init_layers() - - def init_layers(self): - torch.nn.init.constant_(self.N_a[2].bias[(2*self.K):(3*self.K)], 1.) 
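The Graves-attention patches above (01, 02, 03 and 07) converge on a discretized GMM attention. Below is a minimal self-contained sketch of that computation, assuming the logistic-CDF form of patch 02: each mixture component's CDF is evaluated at half-offset step edges, and first differences give the attention mass per encoder step. Tensor names mirror the patches, but the standalone function itself is illustrative, not code from the repo.

import torch

def discretized_graves_step(gbk_t, mu_prev, inputs, eps=1e-5):
    # gbk_t: B x 3K raw outputs of N_a(query); mu_prev: B x K; inputs: B x T_in x D_encoder
    T = inputs.shape[1]
    K = gbk_t.shape[1] // 3
    g_t, b_t, k_t = gbk_t[:, :K], gbk_t[:, K:2 * K], gbk_t[:, 2 * K:]
    sig_t = torch.nn.functional.softplus(b_t) + eps                  # component widths
    mu_t = mu_prev + torch.nn.functional.softplus(k_t)               # means only move forward
    g_t = torch.softmax(g_t, dim=-1) + eps                           # mixture weights
    j = torch.arange(0, T + 1, device=inputs.device).float() + 0.5   # edges of each encoder step
    # logistic CDF of every mixture component at the step edges, summed over components
    cdf = (g_t.unsqueeze(-1) * torch.sigmoid((j - mu_t.unsqueeze(-1)) / sig_t.unsqueeze(-1))).sum(1)
    alpha_t = cdf[:, 1:] - cdf[:, :-1]                               # probability mass per encoder step
    context = torch.bmm(alpha_t.unsqueeze(1), inputs).squeeze(1)
    return context, alpha_t, mu_t

Masking of padded encoder steps and the 1e-8 floor applied in the patch are omitted here for brevity.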
- torch.nn.init.constant_(self.N_a[2].bias[self.K:(2*self.K)], 10) - - def init_states(self, inputs): - if self.J is None or inputs.shape[1] > self.J.shape[-1]: - self.J = torch.arange(0, inputs.shape[1]).to(inputs.device) - self.attention_weights = torch.zeros(inputs.shape[0], inputs.shape[1]).to(inputs.device) - self.mu_prev = torch.zeros(inputs.shape[0], self.K).to(inputs.device) - - # pylint: disable=R0201 - # pylint: disable=unused-argument - def preprocess_inputs(self, inputs): - return None - - def forward(self, query, inputs, processed_inputs, mask): - """ - shapes: - query: B x D_attention_rnn - inputs: B x T_in x D_encoder - processed_inputs: place_holder - mask: B x T_in - """ - gbk_t = self.N_a(query) - gbk_t = gbk_t.view(gbk_t.size(0), -1, self.K) - - # attention model parameters - # each B x K - g_t = gbk_t[:, 0, :] - b_t = gbk_t[:, 1, :] - k_t = gbk_t[:, 2, :] - - # attention GMM parameters - sig_t = torch.nn.functional.softplus(b_t) + self.eps - - mu_t = self.mu_prev + torch.nn.functional.softplus(k_t) - g_t = torch.softmax(g_t, dim=-1) / sig_t + self.eps - - # each B x K x T_in - j = self.J[:inputs.size(1)] - - # attention weights - phi_t = g_t.unsqueeze(-1) * torch.exp(-0.5 * (mu_t.unsqueeze(-1) - j)**2 / (sig_t.unsqueeze(-1)**2)) - alpha_t = self.COEF * torch.sum(phi_t, 1) - - # apply masking - if mask is not None: - alpha_t.data.masked_fill_(~mask, self._mask_value) - - context = torch.bmm(alpha_t.unsqueeze(1), inputs).squeeze(1) - self.attention_weights = alpha_t - self.mu_prev = mu_t - return context - - def init_attn(attn_type, query_dim, embedding_dim, attention_dim, location_attention, attention_location_n_filters, attention_location_kernel_size, windowing, norm, forward_attn, From f4678cbd6bd9281957a35627d26bd96b307fa04a Mon Sep 17 00:00:00 2001 From: root Date: Mon, 27 Jan 2020 16:02:34 +0100 Subject: [PATCH 10/36] testing seq_len_norm --- layers/losses.py | 4 ++-- tests/test_layers.py | 41 +++++++++++++++++++++++++++++++---- tests/test_tacotron2_model.py | 2 +- tests/test_tacotron_model.py | 2 +- 4 files changed, 41 insertions(+), 8 deletions(-) diff --git a/layers/losses.py b/layers/losses.py index 90d2ac80..176e2f09 100644 --- a/layers/losses.py +++ b/layers/losses.py @@ -35,7 +35,7 @@ class L1LossMasked(nn.Module): mask = mask.expand_as(x) loss = functional.l1_loss( x * mask, target * mask, reduction='none') - loss = loss.mul(out_weights.cuda()).sum() + loss = loss.mul(out_weights.to(loss.device)).sum() else: mask = mask.expand_as(x) loss = functional.l1_loss( @@ -74,7 +74,7 @@ class MSELossMasked(nn.Module): mask = mask.expand_as(x) loss = functional.mse_loss( x * mask, target * mask, reduction='none') - loss = loss.mul(out_weights.cuda()).sum() + loss = loss.mul(out_weights.to(loss.device)).sum() else: mask = mask.expand_as(x) loss = functional.mse_loss( diff --git a/tests/test_layers.py b/tests/test_layers.py index 6e3c4b13..d7c8829f 100644 --- a/tests/test_layers.py +++ b/tests/test_layers.py @@ -119,7 +119,7 @@ class EncoderTests(unittest.TestCase): class L1LossMaskedTests(unittest.TestCase): def test_in_out(self): # test input == target - layer = L1LossMasked() + layer = L1LossMasked(seq_len_norm=False) dummy_input = T.ones(4, 8, 128).float() dummy_target = T.ones(4, 8, 128).float() dummy_length = (T.ones(4) * 8).long() @@ -131,7 +131,7 @@ class L1LossMaskedTests(unittest.TestCase): dummy_target = T.zeros(4, 8, 128).float() dummy_length = (T.ones(4) * 8).long() output = layer(dummy_input, dummy_target, dummy_length) - assert output.item() == 1.0, "1.0 
vs {}".format(output.data[0]) + assert output.item() == 1.0, "1.0 vs {}".format(output.item()) # test if padded values of input makes any difference dummy_input = T.ones(4, 8, 128).float() @@ -140,7 +140,7 @@ class L1LossMaskedTests(unittest.TestCase): mask = ( (sequence_mask(dummy_length).float() - 1.0) * 100.0).unsqueeze(2) output = layer(dummy_input + mask, dummy_target, dummy_length) - assert output.item() == 1.0, "1.0 vs {}".format(output.data[0]) + assert output.item() == 1.0, "1.0 vs {}".format(output.item()) dummy_input = T.rand(4, 8, 128).float() dummy_target = dummy_input.detach() @@ -148,4 +148,37 @@ class L1LossMaskedTests(unittest.TestCase): mask = ( (sequence_mask(dummy_length).float() - 1.0) * 100.0).unsqueeze(2) output = layer(dummy_input + mask, dummy_target, dummy_length) - assert output.item() == 0, "0 vs {}".format(output.data[0]) + assert output.item() == 0, "0 vs {}".format(output.item()) + + # seq_len_norm = True + # test input == target + layer = L1LossMasked(seq_len_norm=True) + dummy_input = T.ones(4, 8, 128).float() + dummy_target = T.ones(4, 8, 128).float() + dummy_length = (T.ones(4) * 8).long() + output = layer(dummy_input, dummy_target, dummy_length) + assert output.item() == 0.0 + + # test input != target + dummy_input = T.ones(4, 8, 128).float() + dummy_target = T.zeros(4, 8, 128).float() + dummy_length = (T.ones(4) * 8).long() + output = layer(dummy_input, dummy_target, dummy_length) + assert output.item() == 1.0, "1.0 vs {}".format(output.item()) + + # test if padded values of input makes any difference + dummy_input = T.ones(4, 8, 128).float() + dummy_target = T.zeros(4, 8, 128).float() + dummy_length = (T.arange(5, 9)).long() + mask = ( + (sequence_mask(dummy_length).float() - 1.0) * 100.0).unsqueeze(2) + output = layer(dummy_input + mask, dummy_target, dummy_length) + assert abs(output.item() - 1.0) < 1e-5, "1.0 vs {}".format(output.item()) + + dummy_input = T.rand(4, 8, 128).float() + dummy_target = dummy_input.detach() + dummy_length = (T.arange(5, 9)).long() + mask = ( + (sequence_mask(dummy_length).float() - 1.0) * 100.0).unsqueeze(2) + output = layer(dummy_input + mask, dummy_target, dummy_length) + assert output.item() == 0, "0 vs {}".format(output.item()) diff --git a/tests/test_tacotron2_model.py b/tests/test_tacotron2_model.py index a26f1ddf..aa2869eb 100644 --- a/tests/test_tacotron2_model.py +++ b/tests/test_tacotron2_model.py @@ -38,7 +38,7 @@ class TacotronTrainTest(unittest.TestCase): stop_targets.size(1) // c.r, -1) stop_targets = (stop_targets.sum(2) > 0.0).unsqueeze(2).float().squeeze() - criterion = MSELossMasked().to(device) + criterion = MSELossMasked(seq_len_norm=False).to(device) criterion_st = nn.BCEWithLogitsLoss().to(device) model = Tacotron2(num_chars=24, r=c.r, num_speakers=5).to(device) model.train() diff --git a/tests/test_tacotron_model.py b/tests/test_tacotron_model.py index 7e5e8daf..48627697 100644 --- a/tests/test_tacotron_model.py +++ b/tests/test_tacotron_model.py @@ -106,7 +106,7 @@ class TacotronGSTTrainTest(unittest.TestCase): stop_targets = (stop_targets.sum(2) > 0.0).unsqueeze(2).float().squeeze() - criterion = L1LossMasked().to(device) + criterion = L1LossMasked(seq_len_norm=False).to(device) criterion_st = nn.BCEWithLogitsLoss().to(device) model = Tacotron( num_chars=32, From 9d669d1024e554d3460963c7a31a94810dd2f442 Mon Sep 17 00:00:00 2001 From: root Date: Mon, 27 Jan 2020 16:28:47 +0100 Subject: [PATCH 11/36] bug fix --- tests/test_tacotron_model.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff 
--git a/tests/test_tacotron_model.py b/tests/test_tacotron_model.py index 48627697..ac6712b0 100644 --- a/tests/test_tacotron_model.py +++ b/tests/test_tacotron_model.py @@ -44,7 +44,7 @@ class TacotronTrainTest(unittest.TestCase): stop_targets = (stop_targets.sum(2) > 0.0).unsqueeze(2).float().squeeze() - criterion = L1LossMasked().to(device) + criterion = L1LossMasked(seq_len_norm=False).to(device) criterion_st = nn.BCEWithLogitsLoss().to(device) model = Tacotron( num_chars=32, From 542141c9783c2f7445827460fb00f5c63ace0c61 Mon Sep 17 00:00:00 2001 From: root Date: Mon, 3 Feb 2020 14:16:40 +0100 Subject: [PATCH 12/36] set silence trimming threshold in config --- config.json | 1 + utils/audio.py | 4 +++- 2 files changed, 4 insertions(+), 1 deletion(-) diff --git a/config.json b/config.json index 71ba261e..89266a94 100644 --- a/config.json +++ b/config.json @@ -24,6 +24,7 @@ "mel_fmin": 0.0, // minimum freq level for mel-spec. ~50 for male and ~95 for female voices. Tune for dataset!! "mel_fmax": 8000.0, // maximum freq level for mel-spec. Tune for dataset!! "do_trim_silence": true // enable trimming of slience of audio as you load it. LJspeech (false), TWEB (false), Nancy (true) + "trim_db": 60, // threshold for timming silence. Set this according to your dataset. }, // DISTRIBUTED TRAINING diff --git a/utils/audio.py b/utils/audio.py index 82e5aa47..7b2c4834 100644 --- a/utils/audio.py +++ b/utils/audio.py @@ -24,6 +24,7 @@ class AudioProcessor(object): clip_norm=True, griffin_lim_iters=None, do_trim_silence=False, + trim_db=60, sound_norm=False, **_): @@ -46,6 +47,7 @@ class AudioProcessor(object): self.max_norm = 1.0 if max_norm is None else float(max_norm) self.clip_norm = clip_norm self.do_trim_silence = do_trim_silence + self.trim_db = trim_db self.sound_norm = sound_norm self.n_fft, self.hop_length, self.win_length = self._stft_parameters() assert min_level_db != 0.0, " [!] 
min_level_db is 0" @@ -217,7 +219,7 @@ class AudioProcessor(object): margin = int(self.sample_rate * 0.01) wav = wav[margin:-margin] return librosa.effects.trim( - wav, top_db=40, frame_length=self.win_length, hop_length=self.hop_length)[0] + wav, top_db=self.trim_db, frame_length=self.win_length, hop_length=self.hop_length)[0] @staticmethod def mulaw_encode(wav, qc): From 9feec72d44a59fdaea7d5af0b66e97c762a56f7b Mon Sep 17 00:00:00 2001 From: root Date: Mon, 3 Feb 2020 15:29:44 +0100 Subject: [PATCH 13/36] tacotron2 stop condition --- layers/tacotron2.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/layers/tacotron2.py b/layers/tacotron2.py index 78bdd10d..c195b277 100644 --- a/layers/tacotron2.py +++ b/layers/tacotron2.py @@ -290,7 +290,7 @@ class Decoder(nn.Module): stop_tokens += [stop_token] alignments += [alignment] - if stop_token > 0.7: + if stop_token > 0.7 and t > inputs.shape[0] / 2: break if len(outputs) == self.max_decoder_steps: print(" | > Decoder stopped with 'max_decoder_steps") From 57e7c1de08c527dbd97dddfe9804d7de969739cd Mon Sep 17 00:00:00 2001 From: Reuben Morais Date: Tue, 4 Feb 2020 11:16:48 +0100 Subject: [PATCH 14/36] Only use embedded model files if they're not overriden by CLI flags --- server/server.py | 35 ++++++++++++++++++++--------------- 1 file changed, 20 insertions(+), 15 deletions(-) diff --git a/server/server.py b/server/server.py index d40e2427..3be66f9e 100644 --- a/server/server.py +++ b/server/server.py @@ -24,20 +24,32 @@ def create_argparser(): return parser -config = None synthesizer = None embedded_model_folder = os.path.join(os.path.dirname(os.path.realpath(__file__)), 'model') checkpoint_file = os.path.join(embedded_model_folder, 'checkpoint.pth.tar') config_file = os.path.join(embedded_model_folder, 'config.json') -if os.path.isfile(checkpoint_file) and os.path.isfile(config_file): - # Use default config with embedded model files - config = create_argparser().parse_args([]) - config.tts_checkpoint = checkpoint_file - config.tts_config = config_file - synthesizer = Synthesizer(config) +# Default options with embedded model files +if os.path.isfile(checkpoint_file): + default_tts_checkpoint = checkpoint_file +else: + default_tts_checkpoint = None +if os.path.isfile(config_file): + default_tts_config = config_file +else: + default_tts_config = None + +args = create_argparser().parse_args() + +# If these were not specified in the CLI args, use default values +if not args.tts_checkpoint: + args.tts_checkpoint = default_tts_checkpoint +if not args.tts_config: + args.tts_config = default_tts_config + +synthesizer = Synthesizer(args) app = Flask(__name__) @@ -55,11 +67,4 @@ def tts(): if __name__ == '__main__': - args = create_argparser().parse_args() - - # Setup synthesizer from CLI args if they're specified or no embedded model - # is present. 
- if not config or not synthesizer or args.tts_checkpoint or args.tts_config: - synthesizer = Synthesizer(args) - - app.run(debug=config.debug, host='0.0.0.0', port=config.port) + app.run(debug=args.debug, host='0.0.0.0', port=args.port) From 5c78816f5181a743dc46df3a0ee1746207a57da9 Mon Sep 17 00:00:00 2001 From: root Date: Tue, 4 Feb 2020 17:09:59 +0100 Subject: [PATCH 15/36] update server and synthesizer to handle ParallelWaveGAN --- server/server.py | 9 ++++++--- server/synthesizer.py | 46 ++++++++++++++++++++++++++++++------------- 2 files changed, 38 insertions(+), 17 deletions(-) diff --git a/server/server.py b/server/server.py index 3be66f9e..6af119bf 100644 --- a/server/server.py +++ b/server/server.py @@ -14,10 +14,13 @@ def create_argparser(): parser.add_argument('--tts_checkpoint', type=str, help='path to TTS checkpoint file') parser.add_argument('--tts_config', type=str, help='path to TTS config.json file') parser.add_argument('--tts_speakers', type=str, help='path to JSON file containing speaker ids, if speaker ids are used in the model') - parser.add_argument('--wavernn_lib_path', type=str, help='path to WaveRNN project folder to be imported. If this is not passed, model uses Griffin-Lim for synthesis.') - parser.add_argument('--wavernn_file', type=str, help='path to WaveRNN checkpoint file.') - parser.add_argument('--wavernn_config', type=str, help='path to WaveRNN config file.') + parser.add_argument('--wavernn_lib_path', type=str, default=None, help='path to WaveRNN project folder to be imported. If this is not passed, model uses Griffin-Lim for synthesis.') + parser.add_argument('--wavernn_file', type=str, default=None, help='path to WaveRNN checkpoint file.') + parser.add_argument('--wavernn_config', type=str, default=None, help='path to WaveRNN config file.') parser.add_argument('--is_wavernn_batched', type=convert_boolean, default=False, help='true to use batched WaveRNN.') + parser.add_argument('--pwgan_lib_path', type=str, help='path to ParallelWaveGAN project folder to be imported. 
If this is not passed, model uses Griffin-Lim for synthesis.') + parser.add_argument('--pwgan_file', type=str, help='path to ParallelWaveGAN checkpoint file.') + parser.add_argument('--pwgan_config', type=str, help='path to ParallelWaveGAN config file.') parser.add_argument('--port', type=int, default=5002, help='port to listen on.') parser.add_argument('--use_cuda', type=convert_boolean, default=False, help='true to use CUDA.') parser.add_argument('--debug', type=convert_boolean, default=False, help='true to enable Flask debug mode.') diff --git a/server/synthesizer.py b/server/synthesizer.py index d8852a3e..b703c62e 100644 --- a/server/synthesizer.py +++ b/server/synthesizer.py @@ -1,17 +1,18 @@ import io import os +import re +import sys import numpy as np import torch -import sys +import yaml from TTS.utils.audio import AudioProcessor from TTS.utils.generic_utils import load_config, setup_model -from TTS.utils.text import phonemes, symbols from TTS.utils.speakers import load_speaker_mapping from TTS.utils.synthesis import * +from TTS.utils.text import phonemes, symbols -import re alphabets = r"([A-Za-z])" prefixes = r"(Mr|St|Mrs|Ms|Dr)[.]" suffixes = r"(Inc|Ltd|Jr|Sr|Co)" @@ -23,6 +24,7 @@ websites = r"[.](com|net|org|io|gov)" class Synthesizer(object): def __init__(self, config): self.wavernn = None + self.pwgan = None self.config = config self.use_cuda = self.config.use_cuda if self.use_cuda: @@ -30,9 +32,11 @@ class Synthesizer(object): self.load_tts(self.config.tts_checkpoint, self.config.tts_config, self.config.use_cuda) if self.config.wavernn_lib_path: - self.load_wavernn(self.config.wavernn_lib_path, self.config.wavernn_path, - self.config.wavernn_file, self.config.wavernn_config, - self.config.use_cuda) + self.load_wavernn(self.config.wavernn_lib_path, self.config.wavernn_file, + self.config.wavernn_config, self.config.use_cuda) + if self.config.pwgan_lib_path: + self.load_pwgan(self.config.pwgan_lib_path, self.config.pwgan_file, + self.config.pwgan_config, self.config.use_cuda) def load_tts(self, tts_checkpoint, tts_config, use_cuda): print(" > Loading TTS model ...") @@ -45,9 +49,9 @@ class Synthesizer(object): self.input_size = len(phonemes) else: self.input_size = len(symbols) - # load speakers + # TODO: fix this for multi-speaker model - load speakers if self.config.tts_speakers is not None: - self.tts_speakers = load_speaker_mapping(os.path.join(model_path, self.config.tts_speakers)) + self.tts_speakers = load_speaker_mapping(self.config.tts_speakers) num_speakers = len(self.tts_speakers) else: num_speakers = 0 @@ -63,16 +67,14 @@ class Synthesizer(object): if 'r' in cp: self.tts_model.decoder.set_r(cp['r']) - def load_wavernn(self, lib_path, model_path, model_file, model_config, use_cuda): + def load_wavernn(self, lib_path, model_file, model_config, use_cuda): # TODO: set a function in wavernn code base for model setup and call it here. 
sys.path.append(lib_path) # set this if TTS is not installed globally from WaveRNN.models.wavernn import Model - wavernn_config = os.path.join(model_path, model_config) - model_file = os.path.join(model_path, model_file) print(" > Loading WaveRNN model ...") - print(" | > model config: ", wavernn_config) + print(" | > model config: ", model_config) print(" | > model file: ", model_file) - self.wavernn_config = load_config(wavernn_config) + self.wavernn_config = load_config(model_config) self.wavernn = Model( rnn_dims=512, fc_dims=512, @@ -91,11 +93,27 @@ class Synthesizer(object): ).cuda() check = torch.load(model_file) - self.wavernn.load_state_dict(check['model']) + self.wavernn.load_state_dict(check['model'], map_location="cpu") if use_cuda: self.wavernn.cuda() self.wavernn.eval() + def load_pwgan(self, lib_path, model_file, model_config, use_cuda): + sys.path.append(lib_path) # set this if TTS is not installed globally + from parallel_wavegan.models import ParallelWaveGANGenerator + from parallel_wavegan.utils.audio import AudioProcessor as AudioProcessorVocoder + print(" > Loading PWGAN model ...") + print(" | > model config: ", model_config) + print(" | > model file: ", model_file) + with open(model_config) as f: + self.pwgan_config = yaml.load(f, Loader=yaml.Loader) + self.pwgan = ParallelWaveGANGenerator(**self.pwgan_config["generator_params"]) + self.pwgan.load_state_dict(torch.load(model_file, map_location="cpu")["model"]["generator"]) + self.pwgan.remove_weight_norm() + if use_cuda: + self.pwgan.cuda() + self.pwgan.eval() + def save_wav(self, wav, path): # wav *= 32767 / max(1e-8, np.max(np.abs(wav))) wav = np.array(wav) From 61bdb265540321889a3e959676a0995842833562 Mon Sep 17 00:00:00 2001 From: root Date: Tue, 4 Feb 2020 17:19:12 +0100 Subject: [PATCH 16/36] README update --- server/README.md | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/server/README.md b/server/README.md index 95297225..0563ef94 100644 --- a/server/README.md +++ b/server/README.md @@ -6,6 +6,10 @@ Instructions below are based on a Ubuntu 18.04 machine, but it should be simple #### Development server: +##### Using server.py +If you have the environment set already for TTS, then you can directly call ```setup.py```. + +##### Using .whl 1. apt-get install -y espeak libsndfile1 python3-venv 2. python3 -m venv /tmp/venv 3. 
source /tmp/venv/bin/activate From 2a6bce31cb41fb365c5d5f605bb1084ff49f1b5f Mon Sep 17 00:00:00 2001 From: root Date: Tue, 4 Feb 2020 17:31:02 +0100 Subject: [PATCH 17/36] update server test --- server/synthesizer.py | 2 -- tests/inputs/server_config.json | 4 +++- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/server/synthesizer.py b/server/synthesizer.py index b703c62e..63f2080a 100644 --- a/server/synthesizer.py +++ b/server/synthesizer.py @@ -1,5 +1,4 @@ import io -import os import re import sys @@ -101,7 +100,6 @@ class Synthesizer(object): def load_pwgan(self, lib_path, model_file, model_config, use_cuda): sys.path.append(lib_path) # set this if TTS is not installed globally from parallel_wavegan.models import ParallelWaveGANGenerator - from parallel_wavegan.utils.audio import AudioProcessor as AudioProcessorVocoder print(" > Loading PWGAN model ...") print(" | > model config: ", model_config) print(" | > model file: ", model_file) diff --git a/tests/inputs/server_config.json b/tests/inputs/server_config.json index 3988db4c..7f5a60fb 100644 --- a/tests/inputs/server_config.json +++ b/tests/inputs/server_config.json @@ -3,9 +3,11 @@ "tts_config":"dummy_model_config.json", // tts config.json file "tts_speakers": null, // json file listing speaker ids. null if no speaker embedding. "wavernn_lib_path": null, // Rootpath to wavernn project folder to be imported. If this is null, model uses GL for speech synthesis. - "wavernn_path": null, // wavernn model root path "wavernn_file": null, // wavernn checkpoint file name "wavernn_config": null, // wavernn config file + "pwgan_lib_path": null, + "pwgan_file": null, + "pwgan_config": null, "is_wavernn_batched":true, "port": 5002, "use_cuda": false, From 451f7da6980301820402b82d502b29976fd6ca31 Mon Sep 17 00:00:00 2001 From: erogol Date: Thu, 6 Feb 2020 15:16:29 +0100 Subject: [PATCH 18/36] pylint check --- server/synthesizer.py | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/server/synthesizer.py b/server/synthesizer.py index 63f2080a..75fd4e76 100644 --- a/server/synthesizer.py +++ b/server/synthesizer.py @@ -68,12 +68,15 @@ class Synthesizer(object): def load_wavernn(self, lib_path, model_file, model_config, use_cuda): # TODO: set a function in wavernn code base for model setup and call it here. - sys.path.append(lib_path) # set this if TTS is not installed globally + sys.path.append(lib_path) # set this if WaveRNN is not installed globally + #pylint: disable=import-outside-toplevel from WaveRNN.models.wavernn import Model print(" > Loading WaveRNN model ...") print(" | > model config: ", model_config) print(" | > model file: ", model_file) self.wavernn_config = load_config(model_config) + # This is the default architecture we use for our models. 
+ # You might need to update it self.wavernn = Model( rnn_dims=512, fc_dims=512, @@ -98,7 +101,8 @@ class Synthesizer(object): self.wavernn.eval() def load_pwgan(self, lib_path, model_file, model_config, use_cuda): - sys.path.append(lib_path) # set this if TTS is not installed globally + sys.path.append(lib_path) # set this if ParallelWaveGAN is not installed globally + #pylint: disable=import-outside-toplevel from parallel_wavegan.models import ParallelWaveGANGenerator print(" > Loading PWGAN model ...") print(" | > model config: ", model_config) From 631fbdcb8e158733b4ec1c9996c6c7cc105cd114 Mon Sep 17 00:00:00 2001 From: Markus Toman Date: Fri, 7 Feb 2020 11:08:21 +0100 Subject: [PATCH 19/36] Fix vocoder normalization when no vocoder is used When G&L is used, ap_vocoder is None and crashes --- synthesize.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/synthesize.py b/synthesize.py index cb0ee8af..eec022ab 100644 --- a/synthesize.py +++ b/synthesize.py @@ -31,8 +31,8 @@ def tts(model, postnet_output = ap.out_linear_to_mel(postnet_output.T).T # correct if there is a scale difference b/w two models postnet_output = ap._denormalize(postnet_output) - postnet_output = ap_vocoder._normalize(postnet_output) if use_vocoder_model: + postnet_output = ap_vocoder._normalize(postnet_output) vocoder_input = torch.FloatTensor(postnet_output.T).unsqueeze(0) waveform = vocoder_model.generate( vocoder_input.cuda() if use_cuda else vocoder_input, From 3f54c39b0a4bb4678aec99a2e6b13b825387d712 Mon Sep 17 00:00:00 2001 From: Markus Toman Date: Fri, 7 Feb 2020 12:35:03 +0100 Subject: [PATCH 20/36] Pacify pylint --- synthesize.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/synthesize.py b/synthesize.py index eec022ab..47b409ef 100644 --- a/synthesize.py +++ b/synthesize.py @@ -30,9 +30,9 @@ def tts(model, if C.model == "Tacotron" and use_vocoder_model: postnet_output = ap.out_linear_to_mel(postnet_output.T).T # correct if there is a scale difference b/w two models - postnet_output = ap._denormalize(postnet_output) + postnet_output = ap._denormalize(postnet_output) # pylint: disable=W021 if use_vocoder_model: - postnet_output = ap_vocoder._normalize(postnet_output) + postnet_output = ap_vocoder._normalize(postnet_output) # pylint: disable=W021 vocoder_input = torch.FloatTensor(postnet_output.T).unsqueeze(0) waveform = vocoder_model.generate( vocoder_input.cuda() if use_cuda else vocoder_input, From 8f37ea9b84c556440c0fca3c7682f101be03cb0a Mon Sep 17 00:00:00 2001 From: Markus Toman Date: Fri, 7 Feb 2020 12:58:58 +0100 Subject: [PATCH 21/36] Pacify pylint even more --- synthesize.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/synthesize.py b/synthesize.py index 47b409ef..8312d78d 100644 --- a/synthesize.py +++ b/synthesize.py @@ -1,3 +1,4 @@ +# pylint: disable=redefined-outer-name, unused-argument import os import time import argparse @@ -30,9 +31,9 @@ def tts(model, if C.model == "Tacotron" and use_vocoder_model: postnet_output = ap.out_linear_to_mel(postnet_output.T).T # correct if there is a scale difference b/w two models - postnet_output = ap._denormalize(postnet_output) # pylint: disable=W021 + postnet_output = ap._denormalize(postnet_output) # pylint: disable=protected-access if use_vocoder_model: - postnet_output = ap_vocoder._normalize(postnet_output) # pylint: disable=W021 + postnet_output = ap_vocoder._normalize(postnet_output) # pylint: disable=protected-access vocoder_input = torch.FloatTensor(postnet_output.T).unsqueeze(0) 
waveform = vocoder_model.generate( vocoder_input.cuda() if use_cuda else vocoder_input, From 2996d631457388be3970ec742447d30cd1bc03f0 Mon Sep 17 00:00:00 2001 From: erogol Date: Fri, 7 Feb 2020 13:00:04 +0100 Subject: [PATCH 22/36] config fixes and enable graves attention wq --- config.json | 20 ++++++++++---------- 1 file changed, 10 insertions(+), 10 deletions(-) diff --git a/config.json b/config.json index 89266a94..9e4fa906 100644 --- a/config.json +++ b/config.json @@ -23,8 +23,8 @@ "clip_norm": true, // clip normalized values into the range. "mel_fmin": 0.0, // minimum freq level for mel-spec. ~50 for male and ~95 for female voices. Tune for dataset!! "mel_fmax": 8000.0, // maximum freq level for mel-spec. Tune for dataset!! - "do_trim_silence": true // enable trimming of slience of audio as you load it. LJspeech (false), TWEB (false), Nancy (true) - "trim_db": 60, // threshold for timming silence. Set this according to your dataset. + "do_trim_silence": true, // enable trimming of slience of audio as you load it. LJspeech (false), TWEB (false), Nancy (true) + "trim_db": 60 // threshold for timming silence. Set this according to your dataset. }, // DISTRIBUTED TRAINING @@ -62,14 +62,14 @@ "prenet_dropout": true, // enable/disable dropout at prenet. // ATTENTION - "attention_type": "original", // 'original' or 'graves' - "attention_heads": 5, // number of attention heads (only for 'graves') + "attention_type": "graves", // 'original' or 'graves' + "attention_heads": 4, // number of attention heads (only for 'graves') "attention_norm": "sigmoid", // softmax or sigmoid. Suggested to use softmax for Tacotron2 and sigmoid for Tacotron. "windowing": false, // Enables attention windowing. Used only in eval mode. "use_forward_attn": false, // if it uses forward attention. In general, it aligns faster. "forward_attn_mask": false, // Additional masking forcing monotonicity only in eval mode. "transition_agent": false, // enable/disable transition agent of forward attention. - "location_attn": true, // enable_disable location sensitive attention. It is enabled for TACOTRON by default. + "location_attn": false, // enable_disable location sensitive attention. It is enabled for TACOTRON by default. "bidirectional_decoder": false, // use https://arxiv.org/abs/1907.09006. Use it, if attention does not work well with your dataset. // STOPNET @@ -92,8 +92,8 @@ "max_seq_len": 150, // DATASET-RELATED: maximum text length // PATHS - "output_path": "/data5/rw/pit/keep/", // DATASET-RELATED: output path for all training outputs. - // "output_path": "/media/erogol/data_ssd/Models/runs/", + // "output_path": "/data5/rw/pit/keep/", // DATASET-RELATED: output path for all training outputs. + "output_path": "/home/erogol/Models/LJSpeech/", // PHONEMES "phoneme_cache_path": "mozilla_us_phonemes", // phoneme computation is slow, therefore, it caches results in the given folder. 
@@ -110,10 +110,10 @@ [ { "name": "ljspeech", - "path": "/root/LJSpeech-1.1/", + "path": "/home/erogol/Data/LJSpeech-1.1/", // "path": "/home/erogol/Data/LJSpeech-1.1", - "meta_file_train": "metadata_train.csv", - "meta_file_val": "metadata_val.csv" + "meta_file_train": "metadata.csv", + "meta_file_val": null } ] From 6ee7653fcf8e81d47c0114d9057d3fe070aa83e7 Mon Sep 17 00:00:00 2001 From: erogol Date: Fri, 7 Feb 2020 13:00:48 +0100 Subject: [PATCH 23/36] Notebook for PWGAN vocoder --- notebooks/Benchmark-PWGAN.ipynb | 578 ++++++++++++++++++++++++++++++++ 1 file changed, 578 insertions(+) create mode 100644 notebooks/Benchmark-PWGAN.ipynb diff --git a/notebooks/Benchmark-PWGAN.ipynb b/notebooks/Benchmark-PWGAN.ipynb new file mode 100644 index 00000000..430d329f --- /dev/null +++ b/notebooks/Benchmark-PWGAN.ipynb @@ -0,0 +1,578 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "This is to test TTS models with benchmark sentences for speech synthesis.\n", + "\n", + "Before running this script please DON'T FORGET: \n", + "- to set file paths.\n", + "- to download related model files from TTS and PWGAN.\n", + "- download or clone related repos, linked below.\n", + "- setup the repositories. ```python setup.py install```\n", + "- to checkout right commit versions (given next to the model) of TTS and PWGAN.\n", + "- to set the right paths in the cell below.\n", + "\n", + "Repositories:\n", + "- TTS: https://github.com/mozilla/TTS\n", + "- PWGAN: https://github.com/erogol/ParallelWaveGAN" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "scrolled": true + }, + "outputs": [], + "source": [ + "%load_ext autoreload\n", + "%autoreload 2\n", + "import os\n", + "import sys\n", + "import io\n", + "import torch \n", + "import time\n", + "import json\n", + "import yaml\n", + "import numpy as np\n", + "from collections import OrderedDict\n", + "import matplotlib.pyplot as plt\n", + "plt.rcParams[\"figure.figsize\"] = (16,5)\n", + "\n", + "import librosa\n", + "import librosa.display\n", + "\n", + "from TTS.models.tacotron import Tacotron \n", + "from TTS.layers import *\n", + "from TTS.utils.data import *\n", + "from TTS.utils.audio import AudioProcessor\n", + "from TTS.utils.generic_utils import load_config, setup_model\n", + "from TTS.utils.text import text_to_sequence\n", + "from TTS.utils.synthesis import synthesis\n", + "from TTS.utils.visual import visualize\n", + "\n", + "import IPython\n", + "from IPython.display import Audio\n", + "\n", + "import os\n", + "\n", + "# you may need to change this depending on your system\n", + "os.environ['CUDA_VISIBLE_DEVICES']='1'\n", + "%matplotlib inline" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "def tts(model, text, CONFIG, use_cuda, ap, use_gl, figures=True):\n", + " t_1 = time.time()\n", + " waveform, alignment, mel_spec, mel_postnet_spec, stop_tokens = synthesis(model, text, CONFIG, use_cuda, ap, speaker_id, False, CONFIG.enable_eos_bos_chars)\n", + " if CONFIG.model == \"Tacotron\" and not use_gl:\n", + " # coorect the normalization differences b/w TTS and the Vocoder.\n", + " mel_postnet_spec = ap.out_linear_to_mel(mel_postnet_spec.T).T\n", + " mel_postnet_spec = ap._denormalize(mel_postnet_spec)\n", + "# mel_postnet_spec = np.pad(mel_postnet_spec, pad_width=((2, 2), (0, 0)))\n", + " print(mel_postnet_spec.shape)\n", + " print(\"max- \", mel_postnet_spec.max(), \" -- min- \", mel_postnet_spec.min())\n", + " if not 
use_gl:\n", + " waveform = vocoder_model.inference(torch.FloatTensor(ap_vocoder._normalize(mel_postnet_spec).T).unsqueeze(0), hop_size=ap_vocoder.hop_length)\n", + "# waveform = waveform / abs(waveform).max() * 0.9\n", + " if use_cuda:\n", + " waveform = waveform.cpu()\n", + " waveform = waveform.numpy()\n", + " print(\" > Run-time: {}\".format(time.time() - t_1))\n", + " if figures: \n", + " visualize(alignment, mel_postnet_spec, stop_tokens, text, ap.hop_length, CONFIG, ap._denormalize(mel_spec)) \n", + " IPython.display.display(Audio(waveform, rate=CONFIG.audio['sample_rate'], normalize=False)) \n", + " os.makedirs(OUT_FOLDER, exist_ok=True)\n", + " file_name = text.replace(\" \", \"_\").replace(\".\",\"\") + \".wav\"\n", + " out_path = os.path.join(OUT_FOLDER, file_name)\n", + " ap.save_wav(waveform, out_path)\n", + " return alignment, mel_postnet_spec, stop_tokens, waveform" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Set constants\n", + "ROOT_PATH = '/home/erogol/Models/LJSpeech/ljspeech-bn-December-23-2019_08+34AM-ffea133/'\n", + "MODEL_PATH = ROOT_PATH + '/checkpoint_670000.pth.tar'\n", + "CONFIG_PATH = ROOT_PATH + '/config.json'\n", + "OUT_FOLDER = '/home/erogol/Dropbox/AudioSamples/benchmark_samples/'\n", + "CONFIG = load_config(CONFIG_PATH)\n", + "VOCODER_MODEL_PATH = \"/home/erogol/Models/LJSpeech/pwgan-ljspeech/checkpoint-400000steps.pkl\"\n", + "VOCODER_CONFIG_PATH = \"/home/erogol/Models/LJSpeech/pwgan-ljspeech/config.yml\"\n", + "\n", + "# load PWGAN config\n", + "with open(VOCODER_CONFIG_PATH) as f:\n", + " VOCODER_CONFIG = yaml.load(f, Loader=yaml.Loader)\n", + " \n", + "# Run FLAGs\n", + "use_cuda = False\n", + "# Set some config fields manually for testing\n", + "CONFIG.windowing = True\n", + "CONFIG.use_forward_attn = True \n", + "# Set the vocoder\n", + "use_gl = False # use GL if True\n", + "batched_wavernn = True # use batched wavernn inference if True" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# LOAD TTS MODEL\n", + "from TTS.utils.text.symbols import symbols, phonemes\n", + "\n", + "# multi speaker \n", + "if CONFIG.use_speaker_embedding:\n", + " speakers = json.load(open(f\"{ROOT_PATH}/speakers.json\", 'r'))\n", + " speakers_idx_to_id = {v: k for k, v in speakers.items()}\n", + "else:\n", + " speakers = []\n", + " speaker_id = None\n", + "\n", + "# load the model\n", + "num_chars = len(phonemes) if CONFIG.use_phonemes else len(symbols)\n", + "model = setup_model(num_chars, len(speakers), CONFIG)\n", + "\n", + "# load the audio processor\n", + "ap = AudioProcessor(**CONFIG.audio) \n", + "\n", + "\n", + "# load model state\n", + "cp = torch.load(MODEL_PATH, map_location=torch.device('cpu'))\n", + "\n", + "# load the model\n", + "model.load_state_dict(cp['model'])\n", + "if use_cuda:\n", + " model.cuda()\n", + "model.eval()\n", + "print(cp['step'])\n", + "print(cp['r'])\n", + "\n", + "# set model stepsize\n", + "if 'r' in cp:\n", + " model.decoder.set_r(cp['r'])" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# LOAD WAVERNN\n", + "if use_gl == False:\n", + " from parallel_wavegan.models import ParallelWaveGANGenerator\n", + " from parallel_wavegan.utils.audio import AudioProcessor as AudioProcessorVocoder\n", + " \n", + " vocoder_model = ParallelWaveGANGenerator(**VOCODER_CONFIG[\"generator_params\"])\n", + " 
vocoder_model.load_state_dict(torch.load(VOCODER_MODEL_PATH, map_location=\"cpu\")[\"model\"][\"generator\"])\n", + " vocoder_model.remove_weight_norm()\n", + " ap_vocoder = AudioProcessorVocoder(**VOCODER_CONFIG['audio']) \n", + " if use_cuda:\n", + " vocoder_model.cuda()\n", + " vocoder_model.eval();" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Comparision with https://mycroft.ai/blog/available-voices/" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "model.eval()\n", + "model.decoder.max_decoder_steps = 2000\n", + "model.decoder.prenet.eval()\n", + "speaker_id = None\n", + "sentence = '''A breeding jennet, lusty, young, and proud,'''\n", + "align, spec, stop_tokens, wav = tts(model, sentence, CONFIG, use_cuda, ap, use_gl=use_gl, figures=True)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "sentence = \"Bill got in the habit of asking himself “Is that thought true?” and if he wasn’t absolutely certain it was, he just let it go.\"\n", + "align, spec, stop_tokens, wav = tts(model, sentence, CONFIG, use_cuda, ap, use_gl=use_gl, figures=True)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### https://espnet.github.io/icassp2020-tts/" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "sentence = \"The Commission also recommends\"\n", + "align, spec, stop_tokens, wav = tts(model, sentence, CONFIG, use_cuda, ap, use_gl=use_gl, figures=True)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "sentence = \"As a result of these studies, the planning document submitted by the Secretary of the Treasury to the Bureau of the Budget on August thirty-one.\"\n", + "align, spec, stop_tokens, wav = tts(model, sentence, CONFIG, use_cuda, ap, use_gl=use_gl, figures=True)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "sentence = \"The FBI now transmits information on all defectors, a category which would, of course, have included Oswald.\"\n", + "align, spec, stop_tokens, wav = tts(model, sentence, CONFIG, use_cuda, ap, use_gl=use_gl, figures=True)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "sentence = \"they seem unduly restrictive in continuing to require some manifestation of animus against a Government official.\"\n", + "align, spec, stop_tokens, wav = tts(model, sentence, CONFIG, use_cuda, ap, use_gl=use_gl, figures=True)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "sentence = \"and each agency given clear understanding of the assistance which the Secret Service expects.\"\n", + "align, spec, stop_tokens, wav = tts(model, sentence, CONFIG, use_cuda, ap, use_gl=use_gl, figures=True)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Other examples" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "sentence = \"Be a voice, not an echo.\" # 'echo' is not in training set. 
\n", + "align, spec, stop_tokens, wav = tts(model, sentence, CONFIG, use_cuda, ap, use_gl=use_gl, figures=True)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "sentence = \"The human voice is the most perfect instrument of all.\"\n", + "align, spec, stop_tokens, wav = tts(model, sentence, CONFIG, use_cuda, ap, use_gl=use_gl, figures=True)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "sentence = \"I'm sorry Dave. I'm afraid I can't do that.\"\n", + "align, spec, stop_tokens, wav = tts(model, sentence, CONFIG, use_cuda, ap, use_gl=use_gl, figures=True)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "sentence = \"This cake is great. It's so delicious and moist.\"\n", + "align, spec, stop_tokens, wav = tts(model, sentence, CONFIG, use_cuda, ap, use_gl=use_gl, figures=True)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Comparison with https://keithito.github.io/audio-samples/" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "sentence = \"Generative adversarial network or variational auto-encoder.\"\n", + "align, spec, stop_tokens, wav = tts(model, sentence, CONFIG, use_cuda, ap, use_gl=use_gl, figures=True)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "sentence = \"Scientists at the CERN laboratory say they have discovered a new particle.\"\n", + "align, spec, stop_tokens, wav = tts(model, sentence, CONFIG, use_cuda, ap, use_gl=use_gl, figures=True)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "sentence = \"Here’s a way to measure the acute emotional intelligence that has never gone out of style.\"\n", + "align, spec, stop_tokens, wav = tts(model, sentence, CONFIG, use_cuda, ap, use_gl=use_gl, figures=True)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "sentence = \"President Trump met with other leaders at the Group of 20 conference.\"\n", + "align, spec, stop_tokens, wav = tts(model, sentence, CONFIG, use_cuda, ap, use_gl=use_gl, figures=True)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "sentence = \"The buses aren't the problem, they actually provide a solution.\"\n", + "align, spec, stop_tokens, wav = tts(model, sentence, CONFIG, use_cuda, ap, use_gl=use_gl, figures=True)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Comparison with https://google.github.io/tacotron/publications/tacotron/index.html" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "sentence = \"Generative adversarial network or variational auto-encoder.\"\n", + "align, spec, stop_tokens, wav = tts(model, sentence, CONFIG, use_cuda, ap, use_gl=use_gl, figures=True)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "sentence = \"Basilar membrane and otolaryngology are not auto-correlations.\"\n", + "align, spec, stop_tokens, wav = tts(model, sentence, CONFIG, use_cuda, ap, use_gl=use_gl, figures=True)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + 
"outputs": [], + "source": [ + "sentence = \" He has read the whole thing.\"\n", + "align, spec, stop_tokens, wav = tts(model, sentence, CONFIG, use_cuda, ap, use_gl=use_gl, figures=True)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "sentence = \"He reads books.\"\n", + "align, spec, stop_tokens, wav = tts(model, sentence, CONFIG, use_cuda, ap, use_gl=use_gl, figures=True)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "sentence = \"Thisss isrealy awhsome.\"\n", + "align, spec, stop_tokens, wav = tts(model, sentence, CONFIG, use_cuda, ap, use_gl=use_gl, figures=True)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "sentence = \"This is your internet browser, Firefox.\"\n", + "align, spec, stop_tokens, wav = tts(model, sentence, CONFIG, use_cuda, ap, use_gl=use_gl, figures=True)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "sentence = \"This is your internet browser Firefox.\"\n", + "align, spec, stop_tokens, wav = tts(model, sentence, CONFIG, use_cuda, ap, use_gl=use_gl, figures=True)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "sentence = \"The quick brown fox jumps over the lazy dog.\"\n", + "align, spec, stop_tokens, wav = tts(model, sentence, CONFIG, use_cuda, ap, use_gl=use_gl, figures=True)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "sentence = \"Does the quick brown fox jump over the lazy dog?\"\n", + "align, spec, stop_tokens, wav = tts(model, sentence, CONFIG, use_cuda, ap, use_gl=use_gl, figures=True)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "sentence = \"Eren, how are you?\"\n", + "align, spec, stop_tokens, wav = tts(model, sentence, CONFIG, use_cuda, ap, use_gl=use_gl, figures=True)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Hard Sentences" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "sentence = \"Encouraged, he started with a minute a day.\"\n", + "align, spec, stop_tokens, wav = tts(model, sentence, CONFIG, use_cuda, ap, use_gl=use_gl, figures=True)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "sentence = \"His meditation consisted of “body scanning” which involved focusing his mind and energy on each section of the body from head to toe .\"\n", + "align, spec, stop_tokens, wav = tts(model, sentence, CONFIG, use_cuda, ap, use_gl=use_gl, figures=True)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "sentence = \"Recent research at Harvard has shown meditating for as little as 8 weeks can actually increase the grey matter in the parts of the brain responsible for emotional regulation and learning . 
\"\n", + "align, spec, stop_tokens, wav = tts(model, sentence, CONFIG, use_cuda, ap, use_gl=use_gl, figures=True)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "sentence = \"If he decided to watch TV he really watched it.\"\n", + "align, spec, stop_tokens, wav = tts(model, sentence, CONFIG, use_cuda, ap, use_gl=use_gl, figures=True)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "sentence = \"Often we try to bring about change through sheer effort and we put all of our energy into a new initiative .\"\n", + "align, spec, stop_tokens, wav = tts(model, sentence, CONFIG, use_cuda, ap, use_gl=use_gl, figures=True)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# for twb dataset\n", + "sentence = \"In our preparation for Easter, God in his providence offers us each year the season of Lent as a sacramental sign of our conversion.\"\n", + "align, spec, stop_tokens, wav = tts(model, sentence, CONFIG, use_cuda, ap, use_gl=use_gl, figures=True)" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.7.4" + } + }, + "nbformat": 4, + "nbformat_minor": 4 +} From c553c7ecd42e8dab2cf55e4dff127b1ee776b035 Mon Sep 17 00:00:00 2001 From: erogol Date: Fri, 7 Feb 2020 14:21:57 +0100 Subject: [PATCH 24/36] use decorater for torch.no_grad --- train.py | 224 +++++++++++++++++++++++++++---------------------------- 1 file changed, 112 insertions(+), 112 deletions(-) diff --git a/train.py b/train.py index f52d24c1..b9f5fefb 100644 --- a/train.py +++ b/train.py @@ -327,6 +327,7 @@ def train(model, criterion, criterion_st, optimizer, optimizer_st, scheduler, return keep_avg['avg_postnet_loss'], global_step +@torch.no_grad() def evaluate(model, criterion, criterion_st, ap, global_step, epoch): data_loader = setup_loader(ap, model.decoder.r, is_val=True) if c.use_speaker_embedding: @@ -346,125 +347,124 @@ def evaluate(model, criterion, criterion_st, ap, global_step, epoch): keep_avg.add_values(eval_values_dict) print("\n > Validation") - with torch.no_grad(): - if data_loader is not None: - for num_iter, data in enumerate(data_loader): - start_time = time.time() + if data_loader is not None: + for num_iter, data in enumerate(data_loader): + start_time = time.time() - # format data - text_input, text_lengths, mel_input, mel_lengths, linear_input, stop_targets, speaker_ids, _, _ = format_data(data) - assert mel_input.shape[1] % model.decoder.r == 0 + # format data + text_input, text_lengths, mel_input, mel_lengths, linear_input, stop_targets, speaker_ids, _, _ = format_data(data) + assert mel_input.shape[1] % model.decoder.r == 0 - # forward pass model - if c.bidirectional_decoder: - decoder_output, postnet_output, alignments, stop_tokens, decoder_backward_output, alignments_backward = model( - text_input, text_lengths, mel_input, speaker_ids=speaker_ids) - else: - decoder_output, postnet_output, alignments, stop_tokens = model( - text_input, text_lengths, mel_input, speaker_ids=speaker_ids) + # forward pass model + if c.bidirectional_decoder: + decoder_output, postnet_output, alignments, stop_tokens, 
decoder_backward_output, alignments_backward = model( + text_input, text_lengths, mel_input, speaker_ids=speaker_ids) + else: + decoder_output, postnet_output, alignments, stop_tokens = model( + text_input, text_lengths, mel_input, speaker_ids=speaker_ids) - # loss computation - stop_loss = criterion_st( - stop_tokens, stop_targets) if c.stopnet else torch.zeros(1) - if c.loss_masking: - decoder_loss = criterion(decoder_output, mel_input, - mel_lengths) - if c.model in ["Tacotron", "TacotronGST"]: - postnet_loss = criterion(postnet_output, linear_input, - mel_lengths) - else: - postnet_loss = criterion(postnet_output, mel_input, - mel_lengths) - else: - decoder_loss = criterion(decoder_output, mel_input) - if c.model in ["Tacotron", "TacotronGST"]: - postnet_loss = criterion(postnet_output, linear_input) - else: - postnet_loss = criterion(postnet_output, mel_input) - loss = decoder_loss + postnet_loss + stop_loss - - # backward decoder loss - if c.bidirectional_decoder: - if c.loss_masking: - decoder_backward_loss = criterion(torch.flip(decoder_backward_output, dims=(1, )), mel_input, mel_lengths) - else: - decoder_backward_loss = criterion(torch.flip(decoder_backward_output, dims=(1, )), mel_input) - decoder_c_loss = torch.nn.functional.l1_loss(torch.flip(decoder_backward_output, dims=(1, )), decoder_output) - loss += decoder_backward_loss + decoder_c_loss - keep_avg.update_values({'avg_decoder_b_loss': decoder_backward_loss.item(), 'avg_decoder_c_loss': decoder_c_loss.item()}) - - step_time = time.time() - start_time - epoch_time += step_time - - # compute alignment score - align_score = alignment_diagonal_score(alignments) - keep_avg.update_value('avg_align_score', align_score) - - # aggregate losses from processes - if num_gpus > 1: - postnet_loss = reduce_tensor(postnet_loss.data, num_gpus) - decoder_loss = reduce_tensor(decoder_loss.data, num_gpus) - if c.stopnet: - stop_loss = reduce_tensor(stop_loss.data, num_gpus) - - keep_avg.update_values({ - 'avg_postnet_loss': - float(postnet_loss.item()), - 'avg_decoder_loss': - float(decoder_loss.item()), - 'avg_stop_loss': - float(stop_loss.item()), - }) - - if num_iter % c.print_step == 0: - print( - " | > TotalLoss: {:.5f} PostnetLoss: {:.5f} - {:.5f} DecoderLoss:{:.5f} - {:.5f} " - "StopLoss: {:.5f} - {:.5f} AlignScore: {:.4f} : {:.4f}" - .format(loss.item(), postnet_loss.item(), - keep_avg['avg_postnet_loss'], - decoder_loss.item(), - keep_avg['avg_decoder_loss'], stop_loss.item(), - keep_avg['avg_stop_loss'], align_score, - keep_avg['avg_align_score']), - flush=True) - - if args.rank == 0: - # Diagnostic visualizations - idx = np.random.randint(mel_input.shape[0]) - const_spec = postnet_output[idx].data.cpu().numpy() - gt_spec = linear_input[idx].data.cpu().numpy() if c.model in [ - "Tacotron", "TacotronGST" - ] else mel_input[idx].data.cpu().numpy() - align_img = alignments[idx].data.cpu().numpy() - - eval_figures = { - "prediction": plot_spectrogram(const_spec, ap), - "ground_truth": plot_spectrogram(gt_spec, ap), - "alignment": plot_alignment(align_img) - } - - # Sample audio + # loss computation + stop_loss = criterion_st( + stop_tokens, stop_targets) if c.stopnet else torch.zeros(1) + if c.loss_masking: + decoder_loss = criterion(decoder_output, mel_input, + mel_lengths) if c.model in ["Tacotron", "TacotronGST"]: - eval_audio = ap.inv_spectrogram(const_spec.T) + postnet_loss = criterion(postnet_output, linear_input, + mel_lengths) else: - eval_audio = ap.inv_mel_spectrogram(const_spec.T) - tb_logger.tb_eval_audios(global_step, 
{"ValAudio": eval_audio}, - c.audio["sample_rate"]) + postnet_loss = criterion(postnet_output, mel_input, + mel_lengths) + else: + decoder_loss = criterion(decoder_output, mel_input) + if c.model in ["Tacotron", "TacotronGST"]: + postnet_loss = criterion(postnet_output, linear_input) + else: + postnet_loss = criterion(postnet_output, mel_input) + loss = decoder_loss + postnet_loss + stop_loss - # Plot Validation Stats - epoch_stats = { - "loss_postnet": keep_avg['avg_postnet_loss'], - "loss_decoder": keep_avg['avg_decoder_loss'], - "stop_loss": keep_avg['avg_stop_loss'], - "alignment_score": keep_avg['avg_align_score'] - } + # backward decoder loss + if c.bidirectional_decoder: + if c.loss_masking: + decoder_backward_loss = criterion(torch.flip(decoder_backward_output, dims=(1, )), mel_input, mel_lengths) + else: + decoder_backward_loss = criterion(torch.flip(decoder_backward_output, dims=(1, )), mel_input) + decoder_c_loss = torch.nn.functional.l1_loss(torch.flip(decoder_backward_output, dims=(1, )), decoder_output) + loss += decoder_backward_loss + decoder_c_loss + keep_avg.update_values({'avg_decoder_b_loss': decoder_backward_loss.item(), 'avg_decoder_c_loss': decoder_c_loss.item()}) - if c.bidirectional_decoder: - epoch_stats['loss_decoder_backward'] = keep_avg['avg_decoder_b_loss'] - align_b_img = alignments_backward[idx].data.cpu().numpy() - eval_figures['alignment_backward'] = plot_alignment(align_b_img) - tb_logger.tb_eval_stats(global_step, epoch_stats) - tb_logger.tb_eval_figures(global_step, eval_figures) + step_time = time.time() - start_time + epoch_time += step_time + + # compute alignment score + align_score = alignment_diagonal_score(alignments) + keep_avg.update_value('avg_align_score', align_score) + + # aggregate losses from processes + if num_gpus > 1: + postnet_loss = reduce_tensor(postnet_loss.data, num_gpus) + decoder_loss = reduce_tensor(decoder_loss.data, num_gpus) + if c.stopnet: + stop_loss = reduce_tensor(stop_loss.data, num_gpus) + + keep_avg.update_values({ + 'avg_postnet_loss': + float(postnet_loss.item()), + 'avg_decoder_loss': + float(decoder_loss.item()), + 'avg_stop_loss': + float(stop_loss.item()), + }) + + if num_iter % c.print_step == 0: + print( + " | > TotalLoss: {:.5f} PostnetLoss: {:.5f} - {:.5f} DecoderLoss:{:.5f} - {:.5f} " + "StopLoss: {:.5f} - {:.5f} AlignScore: {:.4f} : {:.4f}" + .format(loss.item(), postnet_loss.item(), + keep_avg['avg_postnet_loss'], + decoder_loss.item(), + keep_avg['avg_decoder_loss'], stop_loss.item(), + keep_avg['avg_stop_loss'], align_score, + keep_avg['avg_align_score']), + flush=True) + + if args.rank == 0: + # Diagnostic visualizations + idx = np.random.randint(mel_input.shape[0]) + const_spec = postnet_output[idx].data.cpu().numpy() + gt_spec = linear_input[idx].data.cpu().numpy() if c.model in [ + "Tacotron", "TacotronGST" + ] else mel_input[idx].data.cpu().numpy() + align_img = alignments[idx].data.cpu().numpy() + + eval_figures = { + "prediction": plot_spectrogram(const_spec, ap), + "ground_truth": plot_spectrogram(gt_spec, ap), + "alignment": plot_alignment(align_img) + } + + # Sample audio + if c.model in ["Tacotron", "TacotronGST"]: + eval_audio = ap.inv_spectrogram(const_spec.T) + else: + eval_audio = ap.inv_mel_spectrogram(const_spec.T) + tb_logger.tb_eval_audios(global_step, {"ValAudio": eval_audio}, + c.audio["sample_rate"]) + + # Plot Validation Stats + epoch_stats = { + "loss_postnet": keep_avg['avg_postnet_loss'], + "loss_decoder": keep_avg['avg_decoder_loss'], + "stop_loss": keep_avg['avg_stop_loss'], 
+ "alignment_score": keep_avg['avg_align_score'] + } + + if c.bidirectional_decoder: + epoch_stats['loss_decoder_backward'] = keep_avg['avg_decoder_b_loss'] + align_b_img = alignments_backward[idx].data.cpu().numpy() + eval_figures['alignment_backward'] = plot_alignment(align_b_img) + tb_logger.tb_eval_stats(global_step, epoch_stats) + tb_logger.tb_eval_figures(global_step, eval_figures) if args.rank == 0 and epoch > c.test_delay_epochs: if c.test_sentences_file is None: From 566c2a4678856d23d1cce4b22ff1a960855d315a Mon Sep 17 00:00:00 2001 From: erogol Date: Wed, 12 Feb 2020 10:29:30 +0100 Subject: [PATCH 25/36] add torch.no_grad decorator for inference --- models/tacotron.py | 1 + models/tacotron2.py | 1 + 2 files changed, 2 insertions(+) diff --git a/models/tacotron.py b/models/tacotron.py index a2d9e1c4..04ecd573 100644 --- a/models/tacotron.py +++ b/models/tacotron.py @@ -132,6 +132,7 @@ class Tacotron(nn.Module): return decoder_outputs, postnet_outputs, alignments, stop_tokens, decoder_outputs_backward, alignments_backward return decoder_outputs, postnet_outputs, alignments, stop_tokens + @torch.no_grad() def inference(self, characters, speaker_ids=None, style_mel=None): inputs = self.embedding(characters) self._init_states() diff --git a/models/tacotron2.py b/models/tacotron2.py index 852b1886..3a3863de 100644 --- a/models/tacotron2.py +++ b/models/tacotron2.py @@ -82,6 +82,7 @@ class Tacotron2(nn.Module): return decoder_outputs, postnet_outputs, alignments, stop_tokens, decoder_outputs_backward, alignments_backward return decoder_outputs, postnet_outputs, alignments, stop_tokens + @torch.no_grad() def inference(self, text, speaker_ids=None): embedded_inputs = self.embedding(text).transpose(1, 2) encoder_outputs = self.encoder.inference(embedded_inputs) From 78464f1eada09f2332194bc004f2a98c138997cf Mon Sep 17 00:00:00 2001 From: erogol Date: Wed, 12 Feb 2020 10:32:52 +0100 Subject: [PATCH 26/36] linter fix --- train.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/train.py b/train.py index b9f5fefb..e8c240f3 100644 --- a/train.py +++ b/train.py @@ -368,13 +368,13 @@ def evaluate(model, criterion, criterion_st, ap, global_step, epoch): stop_tokens, stop_targets) if c.stopnet else torch.zeros(1) if c.loss_masking: decoder_loss = criterion(decoder_output, mel_input, - mel_lengths) + mel_lengths) if c.model in ["Tacotron", "TacotronGST"]: postnet_loss = criterion(postnet_output, linear_input, - mel_lengths) + mel_lengths) else: postnet_loss = criterion(postnet_output, mel_input, - mel_lengths) + mel_lengths) else: decoder_loss = criterion(decoder_output, mel_input) if c.model in ["Tacotron", "TacotronGST"]: @@ -449,7 +449,7 @@ def evaluate(model, criterion, criterion_st, ap, global_step, epoch): else: eval_audio = ap.inv_mel_spectrogram(const_spec.T) tb_logger.tb_eval_audios(global_step, {"ValAudio": eval_audio}, - c.audio["sample_rate"]) + c.audio["sample_rate"]) # Plot Validation Stats epoch_stats = { From 4130674e46b5f5be8bc751c95e0ee0cf72c9f96c Mon Sep 17 00:00:00 2001 From: erogol Date: Wed, 12 Feb 2020 12:21:53 +0100 Subject: [PATCH 27/36] update for phonemizer 2.1 --- tests/test_text_processing.py | 4 ++-- utils/text/__init__.py | 43 +++++++++++++++++++++++------------ 2 files changed, 30 insertions(+), 17 deletions(-) diff --git a/tests/test_text_processing.py b/tests/test_text_processing.py index 8f8e6fab..0ecb9962 100644 --- a/tests/test_text_processing.py +++ b/tests/test_text_processing.py @@ -69,7 +69,7 @@ def test_phoneme_to_sequence(): def 
test_text2phone(): text = "Recent research at Harvard has shown meditating for as little as 8 weeks can actually increase, the grey matter in the parts of the brain responsible for emotional regulation and learning!" - gt = "ɹ|iː|s|ə|n|t| |ɹ|ɪ|s|ɜː|tʃ| |æ|t| |h|ɑːɹ|v|ɚ|d| |h|ɐ|z| |ʃ|oʊ|n| |m|ɛ|d|ᵻ|t|eɪ|ɾ|ɪ|ŋ| |f|ɔː|ɹ| |æ|z| |l|ɪ|ɾ|əl| |æ|z| |eɪ|t| |w|iː|k|s| |k|æ|n| |æ|k|tʃ|uː|əl|i|| |ɪ|n|k|ɹ|iː|s|,| |ð|ə| |ɡ|ɹ|eɪ| |m|æ|ɾ|ɚ|ɹ| |ɪ|n|ð|ə| |p|ɑːɹ|t|s| |ʌ|v|ð|ə| |b|ɹ|eɪ|n| |ɹ|ɪ|s|p|ɑː|n|s|ə|b|əl| |f|ɔː|ɹ| |ɪ|m|oʊ|ʃ|ə|n|əl| |ɹ|ɛ|ɡ|j|uː|l|eɪ|ʃ|ə|n||| |æ|n|d| |l|ɜː|n|ɪ|ŋ|!" + gt = "ɹ|iː|s|ə|n|t| |ɹ|ɪ|s|ɜː|tʃ| |æ|t| |h|ɑːɹ|v|ɚ|d| |h|ɐ|z| |ʃ|oʊ|n| |m|ɛ|d|ᵻ|t|eɪ|ɾ|ɪ|ŋ| |f|ɔː|ɹ| |æ|z| |l|ɪ|ɾ|əl| |æ|z| |eɪ|t| |w|iː|k|s| |k|æ|n| |æ|k|tʃ|uː|əl|i| |ɪ|n|k|ɹ|iː|s|,| |ð|ə| |ɡ|ɹ|eɪ| |m|æ|ɾ|ɚ|ɹ| |ɪ|n|ð|ə| |p|ɑːɹ|t|s| |ʌ|v|ð|ə| |b|ɹ|eɪ|n| |ɹ|ɪ|s|p|ɑː|n|s|ə|b|əl| |f|ɔː|ɹ| |ɪ|m|oʊ|ʃ|ə|n|əl| |ɹ|ɛ|ɡ|j|uː|l|eɪ|ʃ|ə|n| |æ|n|d| |l|ɜː|n|ɪ|ŋ|!" lang = "en-us" phonemes = text2phone(text, lang) - assert gt == phonemes + assert gt == phonemes, f"\n{phonemes} \n vs \n{gt}" diff --git a/utils/text/__init__.py b/utils/text/__init__.py index 1c5b98c3..e6842dfa 100644 --- a/utils/text/__init__.py +++ b/utils/text/__init__.py @@ -28,21 +28,34 @@ def text2phone(text, language): seperator = phonemizer.separator.Separator(' |', '', '|') #try: punctuations = re.findall(PHONEME_PUNCTUATION_PATTERN, text) - ph = phonemize(text, separator=seperator, strip=False, njobs=1, backend='espeak', language=language) - ph = ph[:-1].strip() # skip the last empty character - # Replace \n with matching punctuations. - if punctuations: - # if text ends with a punctuation. - if text[-1] == punctuations[-1]: - for punct in punctuations[:-1]: - ph = ph.replace('| |\n', '|'+punct+'| |', 1) - try: - ph = ph + punctuations[-1] - except: - print(text) - else: - for punct in punctuations: - ph = ph.replace('| |\n', '|'+punct+'| |', 1) + if float(phonemizer.__version__) < 2.1: + ph = phonemize(text, separator=seperator, strip=False, njobs=1, backend='espeak', language=language) + ph = ph[:-1].strip() # skip the last empty character + # phonemizer does not tackle punctuations. Here we do. + # Replace \n with matching punctuations. + if punctuations: + # if text ends with a punctuation. + if text[-1] == punctuations[-1]: + for punct in punctuations[:-1]: + ph = ph.replace('| |\n', '|'+punct+'| |', 1) + try: + ph = ph + punctuations[-1] + except: + print(text) + else: + for punct in punctuations: + ph = ph.replace('| |\n', '|'+punct+'| |', 1) + elif float(phonemizer.__version__) == 2.1: + ph = phonemize(text, separator=seperator, strip=False, njobs=1, backend='espeak', language=language, preserve_punctuation=True) + # this is a simple fix for phonemizer. + # https://github.com/bootphon/phonemizer/issues/32 + if punctuations: + for punctuation in punctuations: + ph = ph.replace(f"| |{punctuation} ", f"|{punctuation}| |").replace(f"| |{punctuation}", f"|{punctuation}| |") + ph = ph[:-3] + else: + raise RuntimeError(" [!] 
Use 'phonemizer' version 2.1 or older.") + return ph From 02e6d0538272f589d6c3c290b81575b7bd866991 Mon Sep 17 00:00:00 2001 From: Reuben Morais Date: Thu, 13 Feb 2020 15:49:46 +0100 Subject: [PATCH 28/36] Use PWGAN if available in Synthesizer.tts --- server/synthesizer.py | 11 +++++++++-- 1 file changed, 9 insertions(+), 2 deletions(-) diff --git a/server/synthesizer.py b/server/synthesizer.py index 75fd4e76..455bd332 100644 --- a/server/synthesizer.py +++ b/server/synthesizer.py @@ -168,9 +168,16 @@ class Synthesizer(object): postnet_output, decoder_output, _ = parse_outputs( postnet_output, decoder_output, alignments) + if self.pwgan: + vocoder_input = torch.FloatTensor(postnet_output.T).unsqueeze(0) + if self.use_cuda: + vocoder_input.cuda() + wav = self.pwgan.inference(vocoder_input, hop_size=self.ap.hop_length) if self.wavernn: - postnet_output = postnet_output[0].data.cpu().numpy() - wav = self.wavernn.generate(torch.FloatTensor(postnet_output.T).unsqueeze(0).cuda(), batched=self.config.is_wavernn_batched, target=11000, overlap=550) + vocoder_input = torch.FloatTensor(postnet_output.T).unsqueeze(0) + if self.use_cuda: + vocoder_input.cuda() + wav = self.wavernn.generate(vocoder_input, batched=self.config.is_wavernn_batched, target=11000, overlap=550) else: wav = inv_spectrogram(postnet_output, self.ap, self.tts_config) # trim silence From b539ffafc0a0c185438bab262719f4259b6c8f9f Mon Sep 17 00:00:00 2001 From: Reuben Morais Date: Thu, 13 Feb 2020 15:54:30 +0100 Subject: [PATCH 29/36] Load PWGAN/WaveRNN embedded files if present --- server/server.py | 47 +++++++++++++++++++++++++++-------------------- 1 file changed, 27 insertions(+), 20 deletions(-) diff --git a/server/server.py b/server/server.py index 6af119bf..705937e2 100644 --- a/server/server.py +++ b/server/server.py @@ -18,9 +18,9 @@ def create_argparser(): parser.add_argument('--wavernn_file', type=str, default=None, help='path to WaveRNN checkpoint file.') parser.add_argument('--wavernn_config', type=str, default=None, help='path to WaveRNN config file.') parser.add_argument('--is_wavernn_batched', type=convert_boolean, default=False, help='true to use batched WaveRNN.') - parser.add_argument('--pwgan_lib_path', type=str, help='path to ParallelWaveGAN project folder to be imported. If this is not passed, model uses Griffin-Lim for synthesis.') - parser.add_argument('--pwgan_file', type=str, help='path to ParallelWaveGAN checkpoint file.') - parser.add_argument('--pwgan_config', type=str, help='path to ParallelWaveGAN config file.') + parser.add_argument('--pwgan_lib_path', type=str, default=None, help='path to ParallelWaveGAN project folder to be imported. 
If this is not passed, model uses Griffin-Lim for synthesis.') + parser.add_argument('--pwgan_file', type=str, default=None, help='path to ParallelWaveGAN checkpoint file.') + parser.add_argument('--pwgan_config', type=str, default=None, help='path to ParallelWaveGAN config file.') parser.add_argument('--port', type=int, default=5002, help='port to listen on.') parser.add_argument('--use_cuda', type=convert_boolean, default=False, help='true to use CUDA.') parser.add_argument('--debug', type=convert_boolean, default=False, help='true to enable Flask debug mode.') @@ -29,28 +29,35 @@ def create_argparser(): synthesizer = None -embedded_model_folder = os.path.join(os.path.dirname(os.path.realpath(__file__)), 'model') -checkpoint_file = os.path.join(embedded_model_folder, 'checkpoint.pth.tar') -config_file = os.path.join(embedded_model_folder, 'config.json') +embedded_models_folder = os.path.join(os.path.dirname(os.path.realpath(__file__)), 'model') -# Default options with embedded model files -if os.path.isfile(checkpoint_file): - default_tts_checkpoint = checkpoint_file -else: - default_tts_checkpoint = None +embedded_tts_folder = os.path.join(embedded_models_folder, 'tts') +tts_checkpoint_file = os.path.join(embedded_tts_folder, 'checkpoint.pth.tar') +tts_config_file = os.path.join(embedded_tts_folder, 'config.json') -if os.path.isfile(config_file): - default_tts_config = config_file -else: - default_tts_config = None +embedded_wavernn_folder = os.path.join(embedded_models_folder, 'wavernn') +wavernn_checkpoint_file = os.path.join(embedded_wavernn_folder, 'checkpoint.pth.tar') +wavernn_config_file = os.path.join(embedded_wavernn_folder, 'config.json') + +embedded_pwgan_folder = os.path.join(embedded_models_folder, 'pwgan') +pwgan_checkpoint_file = os.path.join(embedded_pwgan_folder, 'checkpoint.pkl') +pwgan_config_file = os.path.join(embedded_pwgan_folder, 'config.yml') args = create_argparser().parse_args() -# If these were not specified in the CLI args, use default values -if not args.tts_checkpoint: - args.tts_checkpoint = default_tts_checkpoint -if not args.tts_config: - args.tts_config = default_tts_config +# If these were not specified in the CLI args, use default values with embedded model files +if not args.tts_checkpoint and os.path.isfile(tts_checkpoint_file): + args.tts_checkpoint = tts_checkpoint_file +if not args.tts_config and os.path.isfile(tts_config_file): + args.tts_config = tts_config_file +if not args.wavernn_file and os.path.isfile(wavernn_checkpoint_file): + args.wavernn_file = wavernn_checkpoint_file +if not args.wavernn_config and os.path.isfile(wavernn_config_file): + args.wavernn_config = wavernn_config_file +if not args.pwgan_file and os.path.isfile(pwgan_checkpoint_file): + args.pwgan_file = pwgan_checkpoint_file +if not args.pwgan_config and os.path.isfile(pwgan_config_file): + args.pwgan_config = pwgan_config_file synthesizer = Synthesizer(args) From 995eb1bf074caae257a87f5ef54ae5f63617b227 Mon Sep 17 00:00:00 2001 From: Reuben Morais Date: Thu, 13 Feb 2020 16:03:30 +0100 Subject: [PATCH 30/36] Fix bug where sometimes the second sentence disappears if it doesn't end with punctuation --- server/synthesizer.py | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/server/synthesizer.py b/server/synthesizer.py index 455bd332..1082b73a 100644 --- a/server/synthesizer.py +++ b/server/synthesizer.py @@ -122,7 +122,7 @@ class Synthesizer(object): self.ap.save_wav(wav, path) def split_into_sentences(self, text): - text = " " + text + " " + text = " " 
+ text + " " text = text.replace("\n", " ") text = re.sub(prefixes, "\\1", text) text = re.sub(websites, "\\1", text) @@ -149,15 +149,13 @@ class Synthesizer(object): text = text.replace("", ".") sentences = text.split("") sentences = sentences[:-1] - sentences = [s.strip() for s in sentences] + sentences = list(filter(None, [s.strip() for s in sentences])) # remove empty sentences return sentences def tts(self, text): wavs = [] sens = self.split_into_sentences(text) print(sens) - if not sens: - sens = [text+'.'] for sen in sens: # preprocess the given text inputs = text_to_seqvec(sen, self.tts_config, self.use_cuda) From 0e35fdc2a1c8a4bc669e3c6d755c551489ee221b Mon Sep 17 00:00:00 2001 From: erogol Date: Thu, 13 Feb 2020 17:23:37 +0100 Subject: [PATCH 31/36] fix linter problems and loader test --- tests/test_loader.py | 4 +--- tests/test_text_processing.py | 4 ++-- utils/text/__init__.py | 3 --- 3 files changed, 3 insertions(+), 8 deletions(-) diff --git a/tests/test_loader.py b/tests/test_loader.py index 751bc181..d8727895 100644 --- a/tests/test_loader.py +++ b/tests/test_loader.py @@ -137,9 +137,7 @@ class TestTTSDataset(unittest.TestCase): # NOTE: Below needs to check == 0 but due to an unknown reason # there is a slight difference between two matrices. # TODO: Check this assert cond more in detail. - assert abs((abs(mel.T) - - abs(mel_dl) - ).sum()) < 1e-5, (abs(mel.T) - abs(mel_dl)).sum() + assert abs(mel.T - mel_dl).max() < 1e-5, abs(mel.T - mel_dl).max() # check mel-spec correctness mel_spec = mel_input[0].cpu().numpy() diff --git a/tests/test_text_processing.py b/tests/test_text_processing.py index 0ecb9962..aa17f694 100644 --- a/tests/test_text_processing.py +++ b/tests/test_text_processing.py @@ -71,5 +71,5 @@ def test_text2phone(): text = "Recent research at Harvard has shown meditating for as little as 8 weeks can actually increase, the grey matter in the parts of the brain responsible for emotional regulation and learning!" gt = "ɹ|iː|s|ə|n|t| |ɹ|ɪ|s|ɜː|tʃ| |æ|t| |h|ɑːɹ|v|ɚ|d| |h|ɐ|z| |ʃ|oʊ|n| |m|ɛ|d|ᵻ|t|eɪ|ɾ|ɪ|ŋ| |f|ɔː|ɹ| |æ|z| |l|ɪ|ɾ|əl| |æ|z| |eɪ|t| |w|iː|k|s| |k|æ|n| |æ|k|tʃ|uː|əl|i| |ɪ|n|k|ɹ|iː|s|,| |ð|ə| |ɡ|ɹ|eɪ| |m|æ|ɾ|ɚ|ɹ| |ɪ|n|ð|ə| |p|ɑːɹ|t|s| |ʌ|v|ð|ə| |b|ɹ|eɪ|n| |ɹ|ɪ|s|p|ɑː|n|s|ə|b|əl| |f|ɔː|ɹ| |ɪ|m|oʊ|ʃ|ə|n|əl| |ɹ|ɛ|ɡ|j|uː|l|eɪ|ʃ|ə|n| |æ|n|d| |l|ɜː|n|ɪ|ŋ|!" 
     lang = "en-us"
-    phonemes = text2phone(text, lang)
-    assert gt == phonemes
+    ph = text2phone(text, lang)
+    assert gt == ph, f"\n{ph} \n vs \n{gt}"
diff --git a/utils/text/__init__.py b/utils/text/__init__.py
index e6842dfa..0e6684d2 100644
--- a/utils/text/__init__.py
+++ b/utils/text/__init__.py
@@ -38,10 +38,7 @@ def text2phone(text, language):
             if text[-1] == punctuations[-1]:
                 for punct in punctuations[:-1]:
                     ph = ph.replace('| |\n', '|'+punct+'| |', 1)
-                try:
                     ph = ph + punctuations[-1]
-                except:
-                    print(text)
             else:
                 for punct in punctuations:
                     ph = ph.replace('| |\n', '|'+punct+'| |', 1)

From ffd00ce295e8b68e59dccda99bc467823a62940d Mon Sep 17 00:00:00 2001
From: Reuben Morais
Date: Thu, 13 Feb 2020 17:30:41 +0100
Subject: [PATCH 32/36] Fix linter and server package test

---
 server/synthesizer.py        | 3 ++-
 setup.py                     | 7 ++++---
 tests/test_server_package.sh | 2 +-
 3 files changed, 7 insertions(+), 5 deletions(-)

diff --git a/server/synthesizer.py b/server/synthesizer.py
index 1082b73a..fcdc8787 100644
--- a/server/synthesizer.py
+++ b/server/synthesizer.py
@@ -121,7 +121,8 @@ class Synthesizer(object):
         wav = np.array(wav)
         self.ap.save_wav(wav, path)

-    def split_into_sentences(self, text):
+    @staticmethod
+    def split_into_sentences(text):
         text = " " + text + " "
         text = text.replace("\n", " ")
         text = re.sub(prefixes, "\\1", text)
diff --git a/setup.py b/setup.py
index 63782800..f92dac8a 100644
--- a/setup.py
+++ b/setup.py
@@ -61,10 +61,11 @@ package_data = ['server/templates/*']
 if 'bdist_wheel' in unknown_args and args.checkpoint and args.model_config:
     print('Embedding model in wheel file...')
     model_dir = os.path.join('server', 'model')
-    os.makedirs(model_dir, exist_ok=True)
-    embedded_checkpoint_path = os.path.join(model_dir, 'checkpoint.pth.tar')
+    tts_dir = os.path.join(model_dir, 'tts')
+    os.makedirs(tts_dir, exist_ok=True)
+    embedded_checkpoint_path = os.path.join(tts_dir, 'checkpoint.pth.tar')
     shutil.copy(args.checkpoint, embedded_checkpoint_path)
-    embedded_config_path = os.path.join(model_dir, 'config.json')
+    embedded_config_path = os.path.join(tts_dir, 'config.json')
     shutil.copy(args.model_config, embedded_config_path)
     package_data.extend([embedded_checkpoint_path, embedded_config_path])

diff --git a/tests/test_server_package.sh b/tests/test_server_package.sh
index 01e42843..9fe5e8b1 100755
--- a/tests/test_server_package.sh
+++ b/tests/test_server_package.sh
@@ -11,7 +11,7 @@ source /tmp/venv/bin/activate
 pip install --quiet --upgrade pip setuptools wheel

 rm -f dist/*.whl
-python setup.py bdist_wheel --checkpoint tests/outputs/checkpoint_10.pth.tar --model_config tests/outputs/dummy_model_config.json
+python setup.py --quiet bdist_wheel --checkpoint tests/outputs/checkpoint_10.pth.tar --model_config tests/outputs/dummy_model_config.json
 pip install --quiet dist/TTS*.whl

 python -m TTS.server.server &

From 9c5c68626825fdebd4af5d02f0bb792fb9f6fa44 Mon Sep 17 00:00:00 2001
From: erogol
Date: Thu, 13 Feb 2020 22:16:40 +0100
Subject: [PATCH 33/36] check config with a function

---
 config.json            |   9 +--
 train.py               |   3 +-
 utils/generic_utils.py | 128 ++++++++++++++++++++++++++++++++++++++++
 3 files changed, 135 insertions(+), 5 deletions(-)

diff --git a/config.json b/config.json
index 9e4fa906..c1a8158d 100644
--- a/config.json
+++ b/config.json
@@ -9,7 +9,7 @@
     "num_mels": 80,         // size of the mel spec frame.
     "num_freq": 1025,       // number of stft frequency levels. Size of the linear spectogram frame.
     "sample_rate": 22050,   // DATASET-RELATED: wav sample-rate.
If different than the original data, it is resampled. - "frame_length_ms": 50, // stft window length in ms. + "frame_length_ms": 50.0, // stft window length in ms. "frame_shift_ms": 12.5, // stft window hop-lengh in ms. "preemphasis": 0.98, // pre-emphasis to reduce spec noise and make it more structured. If 0.0, no -pre-emphasis. "min_level_db": -100, // normalization range @@ -19,7 +19,7 @@ // Normalization parameters "signal_norm": true, // normalize the spec values in range [0, 1] "symmetric_norm": true, // move normalization to range [-1, 1] - "max_norm": 4, // scale normalization to range [-max_norm, max_norm] or [0, max_norm] + "max_norm": 4.0, // scale normalization to range [-max_norm, max_norm] or [0, max_norm] "clip_norm": true, // clip normalized values into the range. "mel_fmin": 0.0, // minimum freq level for mel-spec. ~50 for male and ~95 for female voices. Tune for dataset!! "mel_fmax": 8000.0, // maximum freq level for mel-spec. Tune for dataset!! @@ -36,11 +36,12 @@ "reinit_layers": [], // give a list of layer names to restore from the given checkpoint. If not defined, it reloads all heuristically matching layers. // TRAINING - "batch_size": 32, // Batch size for training. Lower values than 32 might cause hard to learn attention. It is overwritten by 'gradual_training'. + "batch_size": 2, // Batch size for training. Lower values than 32 might cause hard to learn attention. It is overwritten by 'gradual_training'. "eval_batch_size":16, "r": 7, // Number of decoder frames to predict per iteration. Set the initial values if gradual training is enabled. "gradual_training": [[0, 7, 64], [1, 5, 64], [50000, 3, 32], [130000, 2, 32], [290000, 1, 32]], //set gradual training steps [first_step, r, batch_size]. If it is null, gradual training is disabled. For Tacotron, you might need to reduce the 'batch_size' as you proceeed. "loss_masking": true, // enable / disable loss masking against the sequence padding. + "grad_accum": 2, // if N > 1, enable gradient accumulation for N iterations. It is useful for low memory GPUs. // VALIDATION "run_eval": true, @@ -49,7 +50,7 @@ // OPTIMIZER "noam_schedule": false, // use noam warmup and lr schedule. - "grad_clip": 1, // upper limit for gradients for clipping. + "grad_clip": 1.0, // upper limit for gradients for clipping. "epochs": 1000, // total number of epochs to train. "lr": 0.0001, // Initial learning rate. If Noam decay is active, maximum learning rate. "wd": 0.000001, // Weight decay weight. 
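The integer literals that become floats above ("frame_length_ms": 50 -> 50.0, "max_norm": 4 -> 4.0, "grad_clip": 1 -> 1.0) line up with the type checks this patch adds below in utils/generic_utils.py, where those keys are validated with val_type=float. A minimal sketch of that behaviour, assuming only the _check_argument helper introduced in this patch; the toy dict and the printed message are illustrative, not part of the repo:

    from TTS.utils.generic_utils import _check_argument  # helper added by this patch

    # A float literal, as in the updated config.json, passes all checks silently.
    _check_argument('frame_length_ms', {"frame_length_ms": 50.0},
                    restricted=True, val_type=float, min_val=10, max_val=1000)

    # The old integer literal fails the same check, because 50 is not a float.
    try:
        _check_argument('frame_length_ms', {"frame_length_ms": 50},
                        restricted=True, val_type=float, min_val=10, max_val=1000)
    except AssertionError as err:
        print(err)  # " [!] frame_length_ms has wrong type - <class 'int'> vs <class 'float'>"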
diff --git a/train.py b/train.py index e8c240f3..7bfb8751 100644 --- a/train.py +++ b/train.py @@ -20,7 +20,7 @@ from TTS.utils.generic_utils import ( get_git_branch, load_config, remove_experiment_folder, save_best_model, save_checkpoint, adam_weight_decay, set_init_dict, copy_config_file, setup_model, gradual_training_scheduler, KeepAverage, - set_weight_decay) + set_weight_decay, check_config) from TTS.utils.logger import Logger from TTS.utils.speakers import load_speaker_mapping, save_speaker_mapping, \ get_speakers @@ -687,6 +687,7 @@ if __name__ == '__main__': # setup output paths and read configs c = load_config(args.config_path) + check_config(c) _ = os.path.dirname(os.path.realpath(__file__)) OUT_PATH = args.continue_path diff --git a/utils/generic_utils.py b/utils/generic_utils.py index cf1a83a6..7a5c2ac2 100644 --- a/utils/generic_utils.py +++ b/utils/generic_utils.py @@ -389,3 +389,131 @@ class KeepAverage(): def update_values(self, value_dict): for key, value in value_dict.items(): self.update_value(key, value) + + +def _check_argument(name, c, enum_list=None, max_val=None, min_val=None, restricted=False, val_type=None): + if restricted: + assert name in c.keys(), f' [!] {name} not defined in config.json' + if name in c.keys(): + if max_val: + assert c[name] <= max_val, f' [!] {name} is larger than max value {max_val}' + if min_val: + assert c[name] >= min_val, f' [!] {name} is smaller than min value {min_val}' + if enum_list: + assert c[name].lower() in enum_list, f' [!] {name} is not a valid value' + if val_type: + assert type(c[name]) is val_type or c[name] is None, f' [!] {name} has wrong type - {type(c[name])} vs {val_type}' + + + +def check_config(c): + _check_argument('model', c, enum_list=['tacotron', 'tacotron2'], restricted=True, val_type=str) + _check_argument('run_name', c, restricted=True, val_type=str) + _check_argument('run_description', c, val_type=str) + + # AUDIO + _check_argument('audio', c, restricted=True, val_type=dict) + + # audio processing parameters + _check_argument('num_mels', c['audio'], restricted=True, val_type=int, min_val=10, max_val=2056) + _check_argument('num_freq', c['audio'], restricted=True, val_type=int, min_val=128, max_val=4058) + _check_argument('sample_rate', c['audio'], restricted=True, val_type=int, min_val=512, max_val=100000) + _check_argument('frame_length_ms', c['audio'], restricted=True, val_type=float, min_val=10, max_val=1000) + _check_argument('frame_shift_ms', c['audio'], restricted=True, val_type=float, min_val=1, max_val=1000) + _check_argument('preemphasis', c['audio'], restricted=True, val_type=float, min_val=0, max_val=1) + _check_argument('min_level_db', c['audio'], restricted=True, val_type=int, min_val=-1000, max_val=10) + _check_argument('ref_level_db', c['audio'], restricted=True, val_type=int, min_val=0, max_val=1000) + _check_argument('power', c['audio'], restricted=True, val_type=float, min_val=1, max_val=5) + _check_argument('griffin_lim_iters', c['audio'], restricted=True, val_type=int, min_val=10, max_val=1000) + + # normalization parameters + _check_argument('signal_norm', c['audio'], restricted=True, val_type=bool) + _check_argument('symmetric_norm', c['audio'], restricted=True, val_type=bool) + _check_argument('max_norm', c['audio'], restricted=True, val_type=float, min_val=0.1, max_val=1000) + _check_argument('clip_norm', c['audio'], restricted=True, val_type=bool) + _check_argument('mel_fmin', c['audio'], restricted=True, val_type=float, min_val=0.0, max_val=1000) + _check_argument('mel_fmax', 
c['audio'], restricted=True, val_type=float, min_val=500.0) + _check_argument('do_trim_silence', c['audio'], restricted=True, val_type=bool) + _check_argument('trim_db', c['audio'], restricted=True, val_type=int) + + # training parameters + _check_argument('batch_size', c, restricted=True, val_type=int, min_val=1) + _check_argument('eval_batch_size', c, restricted=True, val_type=int, min_val=1) + _check_argument('r', c, restricted=True, val_type=int, min_val=1) + _check_argument('gradual_training', c, restricted=False, val_type=list) + _check_argument('loss_masking', c, restricted=True, val_type=bool) + _check_argument('grad_accum', c, restricted=True, val_type=int, min_val=1, max_val=100) + + # validation parameters + _check_argument('run_eval', c, restricted=True, val_type=bool) + _check_argument('test_delay_epochs', c, restricted=True, val_type=int, min_val=0) + _check_argument('test_sentences_file', c, restricted=False, val_type=str) + + # optimizer + _check_argument('noam_schedule', c, restricted=False, val_type=bool) + _check_argument('grad_clip', c, restricted=True, val_type=float, min_val=0.0) + _check_argument('epochs', c, restricted=True, val_type=int, min_val=1) + _check_argument('lr', c, restricted=True, val_type=float, min_val=0) + _check_argument('wd', c, restricted=True, val_type=float, min_val=0) + _check_argument('warmup_steps', c, restricted=True, val_type=int, min_val=0) + _check_argument('seq_len_norm', c, restricted=True, val_type=bool) + + # tacotron prenet + _check_argument('memory_size', c, restricted=True, val_type=int, min_val=-1) + _check_argument('prenet_type', c, restricted=True, val_type=str, enum_list=['original', 'bn']) + _check_argument('prenet_dropout', c, restricted=True, val_type=bool) + + # attention + _check_argument('attention_type', c, restricted=True, val_type=str, enum_list=['graves', 'original']) + _check_argument('attention_heads', c, restricted=True, val_type=int) + _check_argument('attention_norm', c, restricted=True, val_type=str, enum_list=['sigmoid', 'softmax']) + _check_argument('windowing', c, restricted=True, val_type=bool) + _check_argument('use_forward_attn', c, restricted=True, val_type=bool) + _check_argument('forward_attn_mask', c, restricted=True, val_type=bool) + _check_argument('transition_agent', c, restricted=True, val_type=bool) + _check_argument('transition_agent', c, restricted=True, val_type=bool) + _check_argument('location_attn', c, restricted=True, val_type=bool) + _check_argument('bidirectional_decoder', c, restricted=True, val_type=bool) + + # stopnet + _check_argument('stopnet', c, restricted=True, val_type=bool) + _check_argument('separate_stopnet', c, restricted=True, val_type=bool) + + # tensorboard + _check_argument('print_step', c, restricted=True, val_type=int, min_val=1) + _check_argument('save_step', c, restricted=True, val_type=int, min_val=1) + _check_argument('checkpoint', c, restricted=True, val_type=bool) + _check_argument('tb_model_param_stats', c, restricted=True, val_type=bool) + + # dataloading + _check_argument('text_cleaner', c, restricted=True, val_type=str, enum_list=['english_cleaners', 'phoneme_cleaners', 'transliteration_cleaners', 'basic_cleaners']) + _check_argument('enable_eos_bos_chars', c, restricted=True, val_type=bool) + _check_argument('num_loader_workers', c, restricted=True, val_type=int, min_val=0) + _check_argument('num_val_loader_workers', c, restricted=True, val_type=int, min_val=0) + _check_argument('batch_group_size', c, restricted=True, val_type=int, min_val=0) + 
_check_argument('min_seq_len', c, restricted=True, val_type=int, min_val=0) + _check_argument('max_seq_len', c, restricted=True, val_type=int, min_val=10) + + # paths + _check_argument('output_path', c, restricted=True, val_type=str) + + # multi-speaker gst + _check_argument('use_speaker_embedding', c, restricted=True, val_type=bool) + _check_argument('style_wav_for_test', c, restricted=True, val_type=str) + _check_argument('use_gst', c, restricted=True, val_type=bool) + + # datasets - checking only the first entry + _check_argument('datasets', c, restricted=True, val_type=list) + for dataset_entry in c['datasets']: + _check_argument('name', dataset_entry, restricted=True, val_type=str) + _check_argument('path', dataset_entry, restricted=True, val_type=str) + _check_argument('meta_file_train', dataset_entry, restricted=True, val_type=str) + _check_argument('meta_file_val', dataset_entry, restricted=True, val_type=str) + + + + + + + + From 3331afa21932596ca791260e1c14e6942c1d6df2 Mon Sep 17 00:00:00 2001 From: erogol Date: Fri, 14 Feb 2020 17:47:33 +0100 Subject: [PATCH 34/36] remove grad_accum from config checker --- utils/generic_utils.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/utils/generic_utils.py b/utils/generic_utils.py index 7a5c2ac2..942fedf9 100644 --- a/utils/generic_utils.py +++ b/utils/generic_utils.py @@ -405,7 +405,6 @@ def _check_argument(name, c, enum_list=None, max_val=None, min_val=None, restric assert type(c[name]) is val_type or c[name] is None, f' [!] {name} has wrong type - {type(c[name])} vs {val_type}' - def check_config(c): _check_argument('model', c, enum_list=['tacotron', 'tacotron2'], restricted=True, val_type=str) _check_argument('run_name', c, restricted=True, val_type=str) @@ -442,7 +441,7 @@ def check_config(c): _check_argument('r', c, restricted=True, val_type=int, min_val=1) _check_argument('gradual_training', c, restricted=False, val_type=list) _check_argument('loss_masking', c, restricted=True, val_type=bool) - _check_argument('grad_accum', c, restricted=True, val_type=int, min_val=1, max_val=100) + # _check_argument('grad_accum', c, restricted=True, val_type=int, min_val=1, max_val=100) # validation parameters _check_argument('run_eval', c, restricted=True, val_type=bool) From c48b053cdee1a183d747c8151b96febdb102a291 Mon Sep 17 00:00:00 2001 From: erogol Date: Fri, 14 Feb 2020 18:00:15 +0100 Subject: [PATCH 35/36] linter fixes --- utils/generic_utils.py | 14 +++----------- 1 file changed, 3 insertions(+), 11 deletions(-) diff --git a/utils/generic_utils.py b/utils/generic_utils.py index 942fedf9..a8de5bbb 100644 --- a/utils/generic_utils.py +++ b/utils/generic_utils.py @@ -402,8 +402,8 @@ def _check_argument(name, c, enum_list=None, max_val=None, min_val=None, restric if enum_list: assert c[name].lower() in enum_list, f' [!] {name} is not a valid value' if val_type: - assert type(c[name]) is val_type or c[name] is None, f' [!] {name} has wrong type - {type(c[name])} vs {val_type}' - + assert isinstance(c[name], val_type) or c[name] is None, f' [!] 
{name} has wrong type - {type(c[name])} vs {val_type}' + def check_config(c): _check_argument('model', c, enum_list=['tacotron', 'tacotron2'], restricted=True, val_type=str) @@ -507,12 +507,4 @@ def check_config(c): _check_argument('name', dataset_entry, restricted=True, val_type=str) _check_argument('path', dataset_entry, restricted=True, val_type=str) _check_argument('meta_file_train', dataset_entry, restricted=True, val_type=str) - _check_argument('meta_file_val', dataset_entry, restricted=True, val_type=str) - - - - - - - - + _check_argument('meta_file_val', dataset_entry, restricted=True, val_type=str) \ No newline at end of file From 02df28c7d6059afa31d615a6f24eb27b7c017cff Mon Sep 17 00:00:00 2001 From: richardburleigh Date: Sat, 15 Feb 2020 14:47:50 +1100 Subject: [PATCH 36/36] Fix GL overriding PWGAN inference --- server/synthesizer.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/server/synthesizer.py b/server/synthesizer.py index fcdc8787..347bef21 100644 --- a/server/synthesizer.py +++ b/server/synthesizer.py @@ -172,7 +172,7 @@ class Synthesizer(object): if self.use_cuda: vocoder_input.cuda() wav = self.pwgan.inference(vocoder_input, hop_size=self.ap.hop_length) - if self.wavernn: + elif self.wavernn: vocoder_input = torch.FloatTensor(postnet_output.T).unsqueeze(0) if self.use_cuda: vocoder_input.cuda()
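Read together, patch 28 and patch 36 leave Synthesizer.tts with a fixed vocoder priority: ParallelWaveGAN if one was loaded, otherwise WaveRNN, otherwise Griffin-Lim; the one-word change above (if -> elif) is what stops Griffin-Lim from overwriting a finished PWGAN waveform. The sketch below condenses that control flow into a standalone helper for illustration only: select_vocoder_wav is a hypothetical name, is_wavernn_batched stands in for config.is_wavernn_batched, and the inv_spectrogram import path is assumed rather than taken from the diffs.

    import torch
    from TTS.utils.synthesis import inv_spectrogram  # assumed location; synthesizer.py imports it at module level

    def select_vocoder_wav(postnet_output, ap, tts_config,
                           pwgan=None, wavernn=None, is_wavernn_batched=False):
        """Condensed view of the vocoder fallback order in Synthesizer.tts."""
        if pwgan is not None:
            vocoder_input = torch.FloatTensor(postnet_output.T).unsqueeze(0)
            return pwgan.inference(vocoder_input, hop_size=ap.hop_length)
        if wavernn is not None:
            vocoder_input = torch.FloatTensor(postnet_output.T).unsqueeze(0)
            return wavernn.generate(vocoder_input, batched=is_wavernn_batched,
                                    target=11000, overlap=550)
        # Only reached when no neural vocoder is set. Before patch 36 the Griffin-Lim
        # branch was tied to the WaveRNN check alone, so a PWGAN-only setup fell
        # through to here and its output was overwritten.
        return inv_spectrogram(postnet_output, ap, tts_config)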