From 668a69576357cd0382a432d85b87ef623c7b42d3 Mon Sep 17 00:00:00 2001 From: erogol Date: Thu, 9 Apr 2020 12:28:52 +0200 Subject: [PATCH] bug fixes and consider the fmin fmax plotting specs --- config.json | 2 +- utils/audio.py | 2 +- utils/visual.py | 25 +++++++++++++++---------- 3 files changed, 17 insertions(+), 12 deletions(-) diff --git a/config.json b/config.json index 1b497646..e19ea9de 100644 --- a/config.json +++ b/config.json @@ -36,7 +36,7 @@ "symmetric_norm": true, // move normalization to range [-1, 1] "max_norm": 1.0, // scale normalization to range [-max_norm, max_norm] or [0, max_norm] "clip_norm": true, // clip normalized values into the range. - "stats_path": "/home/erogol/Data/LJSpeech-1.1/scale_stats.npy" // DO NOT USE WITH MULTI_SPEAKER MODEL. scaler stats file computed by 'compute_statistics.py'. If it is defined, mean-std based notmalization is used and other normalization params are ignored + "stats_path": null // DO NOT USE WITH MULTI_SPEAKER MODEL. scaler stats file computed by 'compute_statistics.py'. If it is defined, mean-std based notmalization is used and other normalization params are ignored }, // VOCABULARY PARAMETERS diff --git a/utils/audio.py b/utils/audio.py index be44cc42..27605800 100644 --- a/utils/audio.py +++ b/utils/audio.py @@ -269,7 +269,7 @@ class AudioProcessor(object): y = self._istft(S_complex * angles) return y - def compute_stft_paddings(x, pad_sides=1): + def compute_stft_paddings(self,x, pad_sides=1): '''compute right padding (final frame) or both sides padding (first and final frames) ''' assert pad_sides in (1, 2) diff --git a/utils/visual.py b/utils/visual.py index b0db7b04..56b2ac76 100644 --- a/utils/visual.py +++ b/utils/visual.py @@ -40,8 +40,8 @@ def plot_spectrogram(linear_output, audio, fig_size=(16, 10)): return fig -def visualize(alignment, spectrogram_postnet, stop_tokens, text, hop_length, CONFIG, spectrogram=None, output_path=None, figsize=[8, 24]): - if spectrogram is not None: +def visualize(alignment, postnet_output, stop_tokens, text, hop_length, CONFIG, decoder_output=None, output_path=None, figsize=[8, 24]): + if decoder_output is not None: num_plot = 4 else: num_plot = 3 @@ -53,30 +53,35 @@ def visualize(alignment, spectrogram_postnet, stop_tokens, text, hop_length, CON plt.imshow(alignment.T, aspect="auto", origin="lower", interpolation=None) plt.xlabel("Decoder timestamp", fontsize=label_fontsize) plt.ylabel("Encoder timestamp", fontsize=label_fontsize) + # compute phoneme representation and back if CONFIG.use_phonemes: seq = phoneme_to_sequence(text, [CONFIG.text_cleaner], CONFIG.phoneme_language, CONFIG.enable_eos_bos_chars, tp=CONFIG.characters if 'characters' in CONFIG.keys() else None) text = sequence_to_phoneme(seq, tp=CONFIG.characters if 'characters' in CONFIG.keys() else None) print(text) - plt.yticks(range(len(text)), list(text)) plt.colorbar() - + # plot stopnet predictions stop_tokens = stop_tokens.squeeze().detach().to('cpu').numpy() plt.subplot(num_plot, 1, 2) plt.plot(range(len(stop_tokens)), list(stop_tokens)) - + # plot postnet spectrogram plt.subplot(num_plot, 1, 3) - librosa.display.specshow(spectrogram_postnet.T, sr=CONFIG.audio['sample_rate'], - hop_length=hop_length, x_axis="time", y_axis="linear") + librosa.display.specshow(postnet_output.T, sr=CONFIG.audio['sample_rate'], + hop_length=hop_length, x_axis="time", y_axis="linear", + fmin=CONFIG.audio['mel_fmin'], + fmax=CONFIG.audio['mel_fmax']) + plt.xlabel("Time", fontsize=label_fontsize) plt.ylabel("Hz", fontsize=label_fontsize) plt.tight_layout() plt.colorbar() - if spectrogram is not None: + if decoder_output is not None: plt.subplot(num_plot, 1, 4) - librosa.display.specshow(spectrogram.T, sr=CONFIG.audio['sample_rate'], - hop_length=hop_length, x_axis="time", y_axis="linear") + librosa.display.specshow(decoder_output.T, sr=CONFIG.audio['sample_rate'], + hop_length=hop_length, x_axis="time", y_axis="linear", + fmin=CONFIG.audio['mel_fmin'], + fmax=CONFIG.audio['mel_fmax']) plt.xlabel("Time", fontsize=label_fontsize) plt.ylabel("Hz", fontsize=label_fontsize) plt.tight_layout()