Update synthesis function for stop tokens

2018-05-13 06:34:46 -07:00 · 2018-05-13 06:34:46 -07:00 · f27281d529
parent 060200991e
commit f27281d529
1 changed files with 12 additions and 6 deletions
--- a/notebooks/utils.py
+++ b/notebooks/utils.py
@@ -23,7 +23,7 @@ def create_speech(m, s, CONFIG, use_cuda, ap):
            torch.from_numpy(seq), volatile=True).unsqueeze(0)
 #         mel_var = torch.autograd.Variable(torch.from_numpy(mel).type(torch.FloatTensor), volatile=True)

-    mel_out, linear_out, alignments = m.forward(chars_var)
+    mel_out, linear_out, alignments, stop_tokens = m.forward(chars_var)
    linear_out = linear_out[0].data.cpu().numpy()
    alignment = alignments[0].cpu().data.numpy()
    spec = ap._denormalize(linear_out)
@@ -31,23 +31,29 @@ def create_speech(m, s, CONFIG, use_cuda, ap):
    wav = wav[:ap.find_endpoint(wav)]
    out = io.BytesIO()
    ap.save_wav(wav, out)
-    return wav, alignment, spec
+    return wav, alignment, spec, stop_tokens


-def visualize(alignment, spectrogram, CONFIG):
+def visualize(alignment, spectrogram, stop_tokens, CONFIG):
    label_fontsize = 16
-    plt.figure(figsize=(16, 16))
+    plt.figure(figsize=(16, 24))

-    plt.subplot(2, 1, 1)
+    plt.subplot(3, 1, 1)
    plt.imshow(alignment.T, aspect="auto", origin="lower", interpolation=None)
    plt.xlabel("Decoder timestamp", fontsize=label_fontsize)
    plt.ylabel("Encoder timestamp", fontsize=label_fontsize)
    plt.colorbar()
    
-    plt.subplot(2, 1, 2)
+    stop_tokens = stop_tokens.squeeze().detach().to('cpu').numpy()
+    plt.subplot(3, 1, 2)
+    plt.plot(range(len(stop_tokens)), list(stop_tokens))
+
+    plt.subplot(3, 1, 3)
    librosa.display.specshow(spectrogram.T, sr=CONFIG.sample_rate,
                             hop_length=hop_length, x_axis="time", y_axis="linear")
    plt.xlabel("Time", fontsize=label_fontsize)
    plt.ylabel("Hz", fontsize=label_fontsize)
    plt.tight_layout()
    plt.colorbar()
+    
+