bug fixes and consider the fmin fmax plotting specs

pull/10/head
erogol 2020-04-09 12:28:52 +02:00
parent 3293d4e05f
commit 668a695763
3 changed files with 17 additions and 12 deletions

View File

@@ -36,7 +36,7 @@
     "symmetric_norm": true, // move normalization to range [-1, 1]
     "max_norm": 1.0, // scale normalization to range [-max_norm, max_norm] or [0, max_norm]
     "clip_norm": true, // clip normalized values into the range.
-    "stats_path": "/home/erogol/Data/LJSpeech-1.1/scale_stats.npy" // DO NOT USE WITH MULTI_SPEAKER MODEL. scaler stats file computed by 'compute_statistics.py'. If it is defined, mean-std based notmalization is used and other normalization params are ignored
+    "stats_path": null // DO NOT USE WITH MULTI_SPEAKER MODEL. scaler stats file computed by 'compute_statistics.py'. If it is defined, mean-std based notmalization is used and other normalization params are ignored
 },
 // VOCABULARY PARAMETERS
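The stats_path option points at the scaler statistics produced by 'compute_statistics.py'; when it is set, mean-std normalization replaces the symmetric_norm/max_norm scaling. Below is a minimal sketch of that normalization, assuming the .npy file stores per-bin mel mean/std arrays under the keys shown (the keys and file layout are assumptions, not the repo's exact format):

import numpy as np

def load_scale_stats(stats_path):
    # assumed layout: a pickled dict with per-bin statistics for the mel features
    stats = np.load(stats_path, allow_pickle=True).item()
    return stats["mel_mean"], stats["mel_std"]

def normalize_mel(mel, mel_mean, mel_std):
    # mean-std normalization; max_norm / symmetric_norm / clip_norm are ignored in this mode
    return (mel - mel_mean) / mel_std

def denormalize_mel(mel_norm, mel_mean, mel_std):
    # inverse transform, applied before vocoding or Griffin-Lim
    return mel_norm * mel_std + mel_mean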

View File

@@ -269,7 +269,7 @@ class AudioProcessor(object):
         y = self._istft(S_complex * angles)
         return y
-    def compute_stft_paddings(x, pad_sides=1):
+    def compute_stft_paddings(self, x, pad_sides=1):
         '''compute right padding (final frame) or both sides padding (first and final frames)
         '''
         assert pad_sides in (1, 2)
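The fix above adds the missing self parameter so compute_stft_paddings binds correctly when called on an AudioProcessor instance. As a rough illustration of what such a helper does, here is a standalone sketch that pads a signal so the STFT frame count lines up with len(x) // hop_length (the hop_length handling and rounding are assumptions, not necessarily the repo's exact behaviour):

import numpy as np

def compute_stft_paddings(x, hop_length, pad_sides=1):
    '''Return (left, right) padding: pad_sides=1 pads only the final frame,
    pad_sides=2 splits the padding between the first and final frames.'''
    assert pad_sides in (1, 2)
    pad = (x.shape[0] // hop_length + 1) * hop_length - x.shape[0]
    if pad_sides == 1:
        return 0, pad
    return pad // 2, pad // 2 + pad % 2

# usage: np.pad(x, compute_stft_paddings(x, 256), mode="constant")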

View File

@@ -40,8 +40,8 @@ def plot_spectrogram(linear_output, audio, fig_size=(16, 10)):
     return fig
-def visualize(alignment, spectrogram_postnet, stop_tokens, text, hop_length, CONFIG, spectrogram=None, output_path=None, figsize=[8, 24]):
-    if spectrogram is not None:
+def visualize(alignment, postnet_output, stop_tokens, text, hop_length, CONFIG, decoder_output=None, output_path=None, figsize=[8, 24]):
+    if decoder_output is not None:
         num_plot = 4
     else:
         num_plot = 3
@@ -53,30 +53,35 @@ def visualize(alignment, spectrogram_postnet, stop_tokens, text, hop_length, CONFIG, spectrogram=None, output_path=None, figsize=[8, 24]):
     plt.imshow(alignment.T, aspect="auto", origin="lower", interpolation=None)
     plt.xlabel("Decoder timestamp", fontsize=label_fontsize)
     plt.ylabel("Encoder timestamp", fontsize=label_fontsize)
+    # compute phoneme representation and back
     if CONFIG.use_phonemes:
         seq = phoneme_to_sequence(text, [CONFIG.text_cleaner], CONFIG.phoneme_language, CONFIG.enable_eos_bos_chars, tp=CONFIG.characters if 'characters' in CONFIG.keys() else None)
         text = sequence_to_phoneme(seq, tp=CONFIG.characters if 'characters' in CONFIG.keys() else None)
         print(text)
     plt.yticks(range(len(text)), list(text))
     plt.colorbar()
+    # plot stopnet predictions
     stop_tokens = stop_tokens.squeeze().detach().to('cpu').numpy()
     plt.subplot(num_plot, 1, 2)
     plt.plot(range(len(stop_tokens)), list(stop_tokens))
+    # plot postnet spectrogram
     plt.subplot(num_plot, 1, 3)
-    librosa.display.specshow(spectrogram_postnet.T, sr=CONFIG.audio['sample_rate'],
-                             hop_length=hop_length, x_axis="time", y_axis="linear")
+    librosa.display.specshow(postnet_output.T, sr=CONFIG.audio['sample_rate'],
+                             hop_length=hop_length, x_axis="time", y_axis="linear",
+                             fmin=CONFIG.audio['mel_fmin'],
+                             fmax=CONFIG.audio['mel_fmax'])
     plt.xlabel("Time", fontsize=label_fontsize)
     plt.ylabel("Hz", fontsize=label_fontsize)
     plt.tight_layout()
     plt.colorbar()
-    if spectrogram is not None:
+    if decoder_output is not None:
         plt.subplot(num_plot, 1, 4)
-        librosa.display.specshow(spectrogram.T, sr=CONFIG.audio['sample_rate'],
-                                 hop_length=hop_length, x_axis="time", y_axis="linear")
+        librosa.display.specshow(decoder_output.T, sr=CONFIG.audio['sample_rate'],
+                                 hop_length=hop_length, x_axis="time", y_axis="linear",
+                                 fmin=CONFIG.audio['mel_fmin'],
+                                 fmax=CONFIG.audio['mel_fmax'])
         plt.xlabel("Time", fontsize=label_fontsize)
         plt.ylabel("Hz", fontsize=label_fontsize)
         plt.tight_layout()
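The new fmin/fmax arguments forward CONFIG.audio['mel_fmin'] and CONFIG.audio['mel_fmax'] to librosa.display.specshow, which documents them as bounds on the frequency axis for mel/CQT scales. Below is a self-contained sketch of the same call pattern; it uses y_axis="mel" so the bounds are visible, and the sample_rate, hop_length, and fmin/fmax values stand in for the CONFIG.audio entries rather than coming from the repo:

import numpy as np
import librosa
import librosa.display
import matplotlib.pyplot as plt

sample_rate = 22050                  # assumed CONFIG.audio['sample_rate']
hop_length = 256                     # assumed CONFIG.audio['hop_length']
mel_fmin, mel_fmax = 0.0, 8000.0     # assumed CONFIG.audio['mel_fmin'] / ['mel_fmax']

wav = np.random.randn(sample_rate)   # stand-in for a synthesized waveform
mel = librosa.feature.melspectrogram(y=wav, sr=sample_rate, hop_length=hop_length,
                                     fmin=mel_fmin, fmax=mel_fmax)
librosa.display.specshow(librosa.power_to_db(mel, ref=np.max), sr=sample_rate,
                         hop_length=hop_length, x_axis="time", y_axis="mel",
                         fmin=mel_fmin, fmax=mel_fmax)
plt.colorbar()
plt.tight_layout()
plt.show()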