Fix a bug in the attention module and add a new notebook for experimenting with spectrogram reconstruction

2018-01-31 07:21:22 -08:00 · 2018-01-31 07:21:22 -08:00 · 1320d5344a
parent 9a22f5d085
commit 1320d5344a
17 changed files with 928 additions and 51 deletions
--- a/PlayGround.ipynb
+++ b/PlayGround.ipynb
--- a/config.json
+++ b/config.json
@@ -1,8 +1,8 @@
 {
  "num_mels": 80,
-  "num_freq": 1024,
+  "num_freq": 1025,
  "sample_rate": 20000,
-  "frame_length_ms": 50.0,
+  "frame_length_ms": 50,
  "frame_shift_ms": 12.5,
  "preemphasis": 0.97,
  "min_level_db": -100,
@@ -12,11 +12,11 @@
  "text_cleaner": "english_cleaners",

  "epochs": 2000,
-  "lr": 0.001,
-  "lr_patience": 2,
+  "lr": 0.003,
+  "lr_patience": 5,
  "lr_decay": 0.5,
  "batch_size": 256,
-  "griffinf_lim_iters": 60,
+  "griffin_lim_iters": 60,
  "power": 1.5,
  "r": 5,

--- a/BIN
+++ b/BIN
--- a/datasets/.LJSpeech.py.swp
+++ b/datasets/.LJSpeech.py.swp
--- a/debug_config.py
+++ b/debug_config.py
@@ -20,6 +20,8 @@
  "power": 1.5,
  "r": 5,

+  "num_loader_workers": 16,
+
  "save_step": 1,
  "data_path": "/data/shared/KeithIto/LJSpeech-1.0",
  "output_path": "result",
--- a/layers/.attention.py.swp
+++ b/layers/.attention.py.swp
--- a/layers/.tacotron.py.swp
+++ b/layers/.tacotron.py.swp
--- a/layers/attention.py
+++ b/layers/attention.py
@@ -73,7 +73,8 @@ class AttentionWrapper(nn.Module):
            alignment.data.masked_fill_(mask, self.score_mask_value)

        # Normalize attention weight
-        alignment = F.softmax(alignment, dim=0)
+        alignment = F.softmax(alignment, dim=-1) ## TODO: might be buggy
+        print(alignment.size())

        # Attention context vector
        # (batch, 1, dim)
--- a/models/.tacotron.py.swp
+++ b/models/.tacotron.py.swp
--- a/models/tacotron.py
+++ b/models/tacotron.py
@@ -2,7 +2,7 @@
 import torch
 from torch.autograd import Variable
 from torch import nn
-from utils.text.symbols import symbols
+from TTS.utils.text.symbols import symbols
 from TTS.layers.tacotron import Prenet, Encoder, Decoder, CBHG

 class Tacotron(nn.Module):
--- a/notebooks/PlayGround.ipynb
+++ b/notebooks/PlayGround.ipynb
--- a/notebooks/TestModelOnLJSpeech.ipynb
+++ b/notebooks/TestModelOnLJSpeech.ipynb
--- a/notebooks/utils.py
+++ b/notebooks/utils.py
@@ -0,0 +1,51 @@
+import io
+import librosa
+import torch
+import numpy as np
+from TTS.utils.text import text_to_sequence
+from matplotlib import pylab as plt
+
+hop_length = 250
+
+def create_speech(m, s, CONFIG, use_cuda, ap):
+    text_cleaner = [CONFIG.text_cleaner]
+    seq = np.array(text_to_sequence(s, text_cleaner))
+            
+#     mel = np.zeros([seq.shape[0], CONFIG.num_mels, 1], dtype=np.float32)
+    
+    if use_cuda:
+        chars_var = torch.autograd.Variable(torch.from_numpy(seq), volatile=True).unsqueeze(0).cuda()
+#         mel_var = torch.autograd.Variable(torch.from_numpy(mel).type(torch.cuda.FloatTensor), volatile=True).cuda() 
+    else:
+        chars_var = torch.autograd.Variable(torch.from_numpy(seq), volatile=True).unsqueeze(0)
+#         mel_var = torch.autograd.Variable(torch.from_numpy(mel).type(torch.FloatTensor), volatile=True)
+        
+    mel_out, linear_out, alignments =m.forward(chars_var)
+    linear_out = linear_out[0].data.cpu().numpy()
+    alignment = alignments[0].cpu().data.numpy()
+    spec = ap._denormalize(linear_out)
+    wav = ap.inv_spectrogram(linear_out.T)
+    wav = wav[:ap.find_endpoint(wav)]
+    out = io.BytesIO()
+    ap.save_wav(wav, out)
+    return wav, alignment, spec
+
+
+def visualize(alignment, spectrogram, CONFIG):
+    label_fontsize = 16
+    plt.figure(figsize=(16,16))
+
+    plt.subplot(2,1,1)
+    plt.imshow(alignment.T, aspect="auto", origin="lower", interpolation=None)
+    plt.xlabel("Decoder timestamp", fontsize=label_fontsize)
+    plt.ylabel("Encoder timestamp", fontsize=label_fontsize)
+    plt.colorbar()
+
+    plt.subplot(2,1,2)
+    librosa.display.specshow(spectrogram.T, sr=CONFIG.sample_rate, 
+                             hop_length=hop_length, x_axis="time", y_axis="linear")
+    plt.xlabel("Time", fontsize=label_fontsize)
+    plt.ylabel("Hz", fontsize=label_fontsize)
+    plt.tight_layout()
+    plt.colorbar()
+
--- a/synthesis.py
+++ b/synthesis.py
@@ -38,17 +38,11 @@ def main(args):

    # Sentences for generation
    sentences = [
-        "And it is worth mention in passing that, as an example of fine typography,",
-        # From July 8, 2017 New York Times:
-        'Scientists at the CERN laboratory say they have discovered a new particle.',
-        'There’s a way to measure the acute emotional intelligence that has never gone out of style.',
-        'President Trump met with other leaders at the Group of 20 conference.',
-        'The Senate\'s bill to repeal and replace the Affordable Care Act is now imperiled.',
-        # From Google's Tacotron example page:
-        'Generative adversarial network or variational auto-encoder.',
-        'The buses aren\'t the problem, they actually provide a solution.',
-        'Does the quick brown fox jump over the lazy dog?',
-        'Talib Kweli confirmed to AllHipHop that he will be releasing an album in the next year.',
+        "I try my best to translate text to speech. But I know I need more work",
+        "The new Firefox, Fast for good.",
+        "Technology is continually providing us with new ways to create and publish stories.",
+        "For these stories to achieve their full impact, it requires tool.",
+        "I am allien and I am here to destron your world."
    ]

    # Synthesis and save to wav files
--- a/train.py
+++ b/train.py
@@ -111,6 +111,8 @@ def main(args):
        progbar = Progbar(len(dataset) / c.batch_size)

        for i, data in enumerate(dataloader):
+            start_time = time.time()
+
            text_input = data[0]
            magnitude_input = data[1]
            mel_input = data[2]
@@ -128,42 +130,40 @@ def main(args):

            if use_cuda:
                text_input_var = Variable(torch.from_numpy(text_input).type(
-                    torch.cuda.LongTensor), requires_grad=False).cuda()
+                    torch.cuda.LongTensor)).cuda()
                mel_input_var = Variable(torch.from_numpy(mel_input).type(
-                    torch.cuda.FloatTensor), requires_grad=False).cuda()
+                    torch.cuda.FloatTensor)).cuda()
                mel_spec_var = Variable(torch.from_numpy(mel_input).type(
-                    torch.cuda.FloatTensor), requires_grad=False).cuda()
+                    torch.cuda.FloatTensor)).cuda()
                linear_spec_var = Variable(torch.from_numpy(magnitude_input)
-                    .type(torch.cuda.FloatTensor), requires_grad=False).cuda()
+                    .type(torch.cuda.FloatTensor)).cuda()

            else:
                text_input_var = Variable(torch.from_numpy(text_input).type(
-                    torch.LongTensor), requires_grad=False)
+                    torch.LongTensor),)
                mel_input_var = Variable(torch.from_numpy(mel_input).type(
-                    torch.FloatTensor), requires_grad=False)
+                    torch.FloatTensor))
                mel_spec_var = Variable(torch.from_numpy(
-                    mel_input).type(torch.FloatTensor), requires_grad=False)
+                    mel_input).type(torch.FloatTensor))
                linear_spec_var = Variable(torch.from_numpy(
-                    magnitude_input).type(torch.FloatTensor),
-                                          requires_grad=False)
+                    magnitude_input).type(torch.FloatTensor))

            mel_output, linear_output, alignments =\
                model.forward(text_input_var, mel_input_var)

            mel_loss = criterion(mel_output, mel_spec_var)
-            linear_loss = torch.abs(linear_output - linear_spec_var)
-            linear_loss = 0.5 * \
-                torch.mean(linear_loss) + 0.5 * \
-                torch.mean(linear_loss[:, :n_priority_freq, :])
+            #linear_loss = torch.abs(linear_output - linear_spec_var)
+            #linear_loss = 0.5 * \
+                #torch.mean(linear_loss) + 0.5 * \
+                #torch.mean(linear_loss[:, :n_priority_freq, :])
+            linear_loss = 0.5 * criterion(linear_output, linear_spec_var) \
+                    + 0.5 * criterion(linear_output[:, :, :n_priority_freq],
+                                      linear_spec_var[: ,: ,:n_priority_freq])
            loss = mel_loss + linear_loss
-            loss = loss.cuda()
-
-            start_time = time.time()
+            # loss = loss.cuda()

            loss.backward()
-
-            nn.utils.clip_grad_norm(model.parameters(), 1.)
-
+            grad_norm = nn.utils.clip_grad_norm(model.parameters(), 1.)
            optimizer.step()

            step_time = time.time() - start_time
@@ -171,7 +171,8 @@ def main(args):

            progbar.update(i+1, values=[('total_loss', loss.data[0]),
                                      ('linear_loss', linear_loss.data[0]),
-                                      ('mel_loss', mel_loss.data[0])])
+                                      ('mel_loss', mel_loss.data[0]),
+                                      ('grad_norm', grad_norm)])

            tb.add_scalar('Train/TotalLoss', loss.data[0], current_step)
            tb.add_scalar('Train/LinearLoss', linear_loss.data[0],
--- a/utils/audio.py
+++ b/utils/audio.py
@@ -81,10 +81,10 @@ class AudioProcessor(object):

    def inv_spectrogram(self, spectrogram):
        '''Converts spectrogram to waveform using librosa'''
-        S = _denormalize(spectrogram)
-        S = _db_to_amp(S + self.ref_level_db)  # Convert back to linear
+        S = self._denormalize(spectrogram)
+        S = self._db_to_amp(S + self.ref_level_db)  # Convert back to linear
        # Reconstruct phase
-        return inv_preemphasis(_griffin_lim(S ** self.power))
+        return self.apply_inv_preemphasis(self._griffin_lim(S ** self.power))


    def _griffin_lim(self, S):
@@ -93,18 +93,13 @@ class AudioProcessor(object):
        '''
        angles = np.exp(2j * np.pi * np.random.rand(*S.shape))
        S_complex = np.abs(S).astype(np.complex)
-        y = _istft(S_complex * angles)
+        y = self._istft(S_complex * angles)
        for i in range(self.griffin_lim_iters):
-            angles = np.exp(1j * np.angle(_stft(y)))
-            y = _istft(S_complex * angles)
+            angles = np.exp(1j * np.angle(self._stft(y)))
+            y = self._istft(S_complex * angles)
        return y


-    def _istft(self, y):
-        _, hop_length, win_length = _stft_parameters()
-        return librosa.istft(y, hop_length=hop_length, win_length=win_length)
-
-
    def melspectrogram(self, y):
        D = self._stft(self.apply_preemphasis(y))
        S = self._amp_to_db(self._linear_to_mel(np.abs(D)))
@@ -115,11 +110,15 @@ class AudioProcessor(object):
        n_fft, hop_length, win_length = self._stft_parameters()
        return librosa.stft(y=y, n_fft=n_fft, hop_length=hop_length, win_length=win_length)

+    def _istft(self, y):
+        _, hop_length, win_length = self._stft_parameters()
+        return librosa.istft(y, hop_length=hop_length, win_length=win_length)
+

    def find_endpoint(self, wav, threshold_db=-40, min_silence_sec=0.8):
        window_length = int(self.sample_rate * min_silence_sec)
        hop_length = int(window_length / 4)
-        threshold = _db_to_amp(threshold_db)
+        threshold = self._db_to_amp(threshold_db)
        for x in range(hop_length, len(wav) - window_length, hop_length):
            if np.max(wav[x:x + window_length]) < threshold:
                return x + hop_length
--- a/utils/data.py
+++ b/utils/data.py
@@ -3,7 +3,9 @@ import numpy as np

 def pad_data(x, length):
    _pad = 0
-    return np.pad(x, (0, length - x.shape[0]), mode='constant', constant_values=_pad)
+    return np.pad(x, (0, length - x.shape[0]),
+                  mode='constant',
+                  constant_values=_pad)


 def prepare_data(inputs):