diff --git a/notebooks/utils.py b/notebooks/utils.py index 5d19e204..47d8f857 100644 --- a/notebooks/utils.py +++ b/notebooks/utils.py @@ -23,7 +23,7 @@ def create_speech(m, s, CONFIG, use_cuda, ap): torch.from_numpy(seq), volatile=True).unsqueeze(0) # mel_var = torch.autograd.Variable(torch.from_numpy(mel).type(torch.FloatTensor), volatile=True) - mel_out, linear_out, alignments = m.forward(chars_var) + mel_out, linear_out, alignments, stop_tokens = m.forward(chars_var) linear_out = linear_out[0].data.cpu().numpy() alignment = alignments[0].cpu().data.numpy() spec = ap._denormalize(linear_out) @@ -31,23 +31,29 @@ def create_speech(m, s, CONFIG, use_cuda, ap): wav = wav[:ap.find_endpoint(wav)] out = io.BytesIO() ap.save_wav(wav, out) - return wav, alignment, spec + return wav, alignment, spec, stop_tokens -def visualize(alignment, spectrogram, CONFIG): +def visualize(alignment, spectrogram, stop_tokens, CONFIG): label_fontsize = 16 - plt.figure(figsize=(16, 16)) + plt.figure(figsize=(16, 24)) - plt.subplot(2, 1, 1) + plt.subplot(3, 1, 1) plt.imshow(alignment.T, aspect="auto", origin="lower", interpolation=None) plt.xlabel("Decoder timestamp", fontsize=label_fontsize) plt.ylabel("Encoder timestamp", fontsize=label_fontsize) plt.colorbar() + + stop_tokens = stop_tokens.squeeze().detach().to('cpu').numpy() + plt.subplot(3, 1, 2) + plt.plot(range(len(stop_tokens)), list(stop_tokens)) - plt.subplot(2, 1, 2) + plt.subplot(3, 1, 3) librosa.display.specshow(spectrogram.T, sr=CONFIG.sample_rate, hop_length=hop_length, x_axis="time", y_axis="linear") plt.xlabel("Time", fontsize=label_fontsize) plt.ylabel("Hz", fontsize=label_fontsize) plt.tight_layout() plt.colorbar() + + \ No newline at end of file