From 668a69576357cd0382a432d85b87ef623c7b42d3 Mon Sep 17 00:00:00 2001
From: erogol <erogol@hotmail.com>
Date: Thu, 9 Apr 2020 12:28:52 +0200
Subject: [PATCH] bug fixes and consider the fmin fmax plotting specs

---
 config.json     |  2 +-
 utils/audio.py  |  2 +-
 utils/visual.py | 25 +++++++++++++++----------
 3 files changed, 17 insertions(+), 12 deletions(-)

diff --git a/config.json b/config.json
index 1b497646..e19ea9de 100644
--- a/config.json
+++ b/config.json
@@ -36,7 +36,7 @@
         "symmetric_norm": true, // move normalization to range [-1, 1]
         "max_norm": 1.0,        // scale normalization to range [-max_norm, max_norm] or [0, max_norm]
         "clip_norm": true,      // clip normalized values into the range.
-        "stats_path": "/home/erogol/Data/LJSpeech-1.1/scale_stats.npy"    // DO NOT USE WITH MULTI_SPEAKER MODEL. scaler stats file computed by 'compute_statistics.py'. If it is defined, mean-std based notmalization is used and other normalization params are ignored
+        "stats_path": null    // DO NOT USE WITH MULTI_SPEAKER MODEL. scaler stats file computed by 'compute_statistics.py'. If it is defined, mean-std based normalization is used and other normalization params are ignored
     },
 
     // VOCABULARY PARAMETERS
diff --git a/utils/audio.py b/utils/audio.py
index be44cc42..27605800 100644
--- a/utils/audio.py
+++ b/utils/audio.py
@@ -269,7 +269,7 @@ class AudioProcessor(object):
             y = self._istft(S_complex * angles)
         return y
 
-    def compute_stft_paddings(x, pad_sides=1):
+    def compute_stft_paddings(self,x, pad_sides=1):
         '''compute right padding (final frame) or both sides padding (first and final frames)
         '''
         assert pad_sides in (1, 2)
diff --git a/utils/visual.py b/utils/visual.py
index b0db7b04..56b2ac76 100644
--- a/utils/visual.py
+++ b/utils/visual.py
@@ -40,8 +40,8 @@ def plot_spectrogram(linear_output, audio, fig_size=(16, 10)):
     return fig
 
 
-def visualize(alignment, spectrogram_postnet, stop_tokens, text, hop_length, CONFIG, spectrogram=None, output_path=None, figsize=[8, 24]):
-    if spectrogram is not None:
+def visualize(alignment, postnet_output, stop_tokens, text, hop_length, CONFIG, decoder_output=None, output_path=None, figsize=[8, 24]):
+    if decoder_output is not None:
         num_plot = 4
     else:
         num_plot = 3
@@ -53,30 +53,35 @@ def visualize(alignment, spectrogram_postnet, stop_tokens, text, hop_length, CON
     plt.imshow(alignment.T, aspect="auto", origin="lower", interpolation=None)
     plt.xlabel("Decoder timestamp", fontsize=label_fontsize)
     plt.ylabel("Encoder timestamp", fontsize=label_fontsize)
+    # compute phoneme representation and back
     if CONFIG.use_phonemes:
         seq = phoneme_to_sequence(text, [CONFIG.text_cleaner], CONFIG.phoneme_language, CONFIG.enable_eos_bos_chars, tp=CONFIG.characters if 'characters' in CONFIG.keys() else None)
         text = sequence_to_phoneme(seq, tp=CONFIG.characters if 'characters' in CONFIG.keys() else None)
         print(text)
-
     plt.yticks(range(len(text)), list(text))
     plt.colorbar()
-
+    # plot stopnet predictions
     stop_tokens = stop_tokens.squeeze().detach().to('cpu').numpy()
     plt.subplot(num_plot, 1, 2)
     plt.plot(range(len(stop_tokens)), list(stop_tokens))
-
+    # plot postnet spectrogram
     plt.subplot(num_plot, 1, 3)
-    librosa.display.specshow(spectrogram_postnet.T, sr=CONFIG.audio['sample_rate'],
-                             hop_length=hop_length, x_axis="time", y_axis="linear")
+    librosa.display.specshow(postnet_output.T, sr=CONFIG.audio['sample_rate'],
+                             hop_length=hop_length, x_axis="time", y_axis="linear",
+                             fmin=CONFIG.audio['mel_fmin'],
+                             fmax=CONFIG.audio['mel_fmax'])
+
     plt.xlabel("Time", fontsize=label_fontsize)
     plt.ylabel("Hz", fontsize=label_fontsize)
     plt.tight_layout()
     plt.colorbar()
 
-    if spectrogram is not None:
+    if decoder_output is not None:
         plt.subplot(num_plot, 1, 4)
-        librosa.display.specshow(spectrogram.T, sr=CONFIG.audio['sample_rate'],
-                                 hop_length=hop_length, x_axis="time", y_axis="linear")
+        librosa.display.specshow(decoder_output.T, sr=CONFIG.audio['sample_rate'],
+                                 hop_length=hop_length, x_axis="time", y_axis="linear",
+                                 fmin=CONFIG.audio['mel_fmin'],
+                                 fmax=CONFIG.audio['mel_fmax'])
         plt.xlabel("Time", fontsize=label_fontsize)
         plt.ylabel("Hz", fontsize=label_fontsize)
         plt.tight_layout()