2019-09-05 14:48:36 +00:00
|
|
|
import torch
|
2018-11-02 15:13:51 +00:00
|
|
|
import librosa
|
2018-02-04 16:25:00 +00:00
|
|
|
import matplotlib
|
|
|
|
matplotlib.use('Agg')
|
|
|
|
import matplotlib.pyplot as plt
|
2019-08-29 09:49:53 +00:00
|
|
|
from TTS.utils.text import phoneme_to_sequence, sequence_to_phoneme
|
2018-02-04 16:25:00 +00:00
|
|
|
|
|
|
|
|
2019-09-05 14:48:36 +00:00
|
|
|
def plot_alignment(alignment, info=None, fig_size=(16, 10), title=None):
    """Render an attention alignment matrix as a heat-map figure.

    Args:
        alignment: Alignment matrix. A ``torch.Tensor`` is detached, moved to
            CPU, converted to numpy and squeezed; anything else is used as-is
            and only needs to expose ``.T``. Presumably laid out as
            (decoder steps, encoder steps) given the axis labels -- confirm
            against callers.
        info: Optional text appended (after a blank line) below the x-axis
            label.
        fig_size: Figure size forwarded to ``plt.subplots``.
        title: Optional title drawn above the plot.

    Returns:
        The matplotlib figure. It is not closed here, so the caller owns it.
    """
    if isinstance(alignment, torch.Tensor):
        alignment_ = alignment.detach().cpu().numpy().squeeze()
    else:
        alignment_ = alignment

    fig, ax = plt.subplots(figsize=fig_size)
    # The matrix is transposed so its first axis runs along x.
    im = ax.imshow(
        alignment_.T, aspect='auto', origin='lower', interpolation='none')
    fig.colorbar(im, ax=ax)

    xlabel = 'Decoder timestep'
    if info is not None:
        xlabel += '\n\n' + info
    # Address the axes explicitly instead of relying on pyplot's notion of the
    # "current" axes.
    ax.set_xlabel(xlabel)
    ax.set_ylabel('Encoder timestep')
    if title is not None:
        ax.set_title(title)

    # Lay out last: tight_layout only reserves room for artists that already
    # exist, so a title added afterwards could be clipped at the figure edge.
    fig.tight_layout()
    return fig
|
2018-02-04 16:25:00 +00:00
|
|
|
|
|
|
|
|
2019-09-05 14:48:36 +00:00
|
|
|
def plot_spectrogram(linear_output, audio, fig_size=(16, 10)):
    """Plot a denormalized spectrogram and return the figure.

    Args:
        linear_output: Spectrogram frames. A ``torch.Tensor`` is detached,
            moved to CPU, converted to numpy and squeezed; any other object is
            used unchanged and must expose ``.T``.
        audio: Object providing ``_denormalize``; it receives the transposed
            frames and its result is what gets drawn.
        fig_size: Figure size forwarded to ``plt.figure``.

    Returns:
        The matplotlib figure (left open for the caller).
    """
    is_tensor = isinstance(linear_output, torch.Tensor)
    frames = linear_output.detach().cpu().numpy().squeeze() if is_tensor else linear_output

    # Denormalize before any figure exists so a failure here leaves no stray figure.
    denormalized = audio._denormalize(frames.T)  # pylint: disable=protected-access

    figure = plt.figure(figsize=fig_size)
    plt.imshow(denormalized, aspect="auto", origin="lower")
    plt.colorbar()
    plt.tight_layout()
    return figure
|
2018-11-02 15:13:51 +00:00
|
|
|
|
|
|
|
|
2020-04-23 13:46:45 +00:00
|
|
|
def _plot_spectrogram_panel(spectrogram, hop_length, CONFIG, label_fontsize):
    """Draw one spectrogram, with axis labels and a colorbar, on the current axes."""
    # A bare `import librosa` does not load the `display` submodule on every
    # librosa release; import it explicitly so `librosa.display.specshow`
    # cannot fail with AttributeError.
    import librosa.display  # pylint: disable=import-outside-toplevel

    librosa.display.specshow(spectrogram.T, sr=CONFIG.audio['sample_rate'],
                             hop_length=hop_length, x_axis="time", y_axis="linear",
                             fmin=CONFIG.audio['mel_fmin'],
                             fmax=CONFIG.audio['mel_fmax'])
    plt.xlabel("Time", fontsize=label_fontsize)
    plt.ylabel("Hz", fontsize=label_fontsize)
    plt.tight_layout()
    plt.colorbar()


def visualize(alignment, postnet_output, stop_tokens, text, hop_length, CONFIG, decoder_output=None, output_path=None, figsize=(8, 24)):
    """Plot alignment, stop-token predictions and spectrogram(s) in one figure.

    The figure has three stacked subplots (alignment, stop tokens, postnet
    spectrogram) plus a fourth for ``decoder_output`` when it is given.

    Args:
        alignment: Alignment matrix; its transpose is shown as an image.
        postnet_output: Spectrogram drawn in the third subplot (transposed
            before plotting).
        stop_tokens: Sequence of stop-token values plotted against their index.
        text: Input text; each character becomes a y-tick label on the
            alignment plot. When ``CONFIG.use_phonemes`` is set it is first
            converted to a phoneme sequence and back, and the result is
            printed.
        hop_length: Hop length forwarded to ``librosa.display.specshow``.
        CONFIG: Configuration object. Reads ``use_phonemes``, ``text_cleaner``,
            ``phoneme_language``, ``enable_eos_bos_chars``, the optional
            ``characters`` entry, and ``audio['sample_rate' | 'mel_fmin' |
            'mel_fmax']``.
        decoder_output: Optional second spectrogram drawn in a fourth subplot.
        output_path: If truthy, the path is printed, the figure is saved there
            and then closed. Otherwise the figure is left open.
        figsize: Figure size forwarded to ``plt.figure``.

    Returns:
        None.
    """
    num_plot = 4 if decoder_output is not None else 3
    label_fontsize = 16
    fig = plt.figure(figsize=figsize)

    # alignment plot
    plt.subplot(num_plot, 1, 1)
    plt.imshow(alignment.T, aspect="auto", origin="lower", interpolation=None)
    plt.xlabel("Decoder timestamp", fontsize=label_fontsize)
    plt.ylabel("Encoder timestamp", fontsize=label_fontsize)
    # compute phoneme representation and back
    if CONFIG.use_phonemes:
        # Custom character set, if the config defines one (looked up once).
        tp = CONFIG.characters if 'characters' in CONFIG.keys() else None
        seq = phoneme_to_sequence(text, [CONFIG.text_cleaner], CONFIG.phoneme_language, CONFIG.enable_eos_bos_chars, tp=tp)
        text = sequence_to_phoneme(seq, tp=tp)
        print(text)
    plt.yticks(range(len(text)), list(text))
    plt.colorbar()

    # plot stopnet predictions
    plt.subplot(num_plot, 1, 2)
    plt.plot(range(len(stop_tokens)), list(stop_tokens))

    # plot postnet spectrogram
    plt.subplot(num_plot, 1, 3)
    _plot_spectrogram_panel(postnet_output, hop_length, CONFIG, label_fontsize)

    # plot decoder spectrogram, when provided
    if decoder_output is not None:
        plt.subplot(num_plot, 1, 4)
        _plot_spectrogram_panel(decoder_output, hop_length, CONFIG, label_fontsize)

    if output_path:
        print(output_path)
        fig.savefig(output_path)
        # Close this specific figure rather than whichever one is "current".
        plt.close(fig)
|