Bug fixes; pass the mel fmin/fmax audio specs to spectrogram plotting

2020-04-09 12:28:52 +02:00 · 2020-04-09 12:28:52 +02:00 · 668a695763
parent 3293d4e05f
commit 668a695763
3 changed files with 17 additions and 12 deletions
--- a/config.json
+++ b/config.json
@@ -36,7 +36,7 @@
        "symmetric_norm": true, // move normalization to range [-1, 1]
        "max_norm": 1.0,        // scale normalization to range [-max_norm, max_norm] or [0, max_norm]
        "clip_norm": true,      // clip normalized values into the range.
-        "stats_path": "/home/erogol/Data/LJSpeech-1.1/scale_stats.npy"    // DO NOT USE WITH MULTI_SPEAKER MODEL. scaler stats file computed by 'compute_statistics.py'. If it is defined, mean-std based notmalization is used and other normalization params are ignored
+        "stats_path": null    // DO NOT USE WITH MULTI_SPEAKER MODEL. scaler stats file computed by 'compute_statistics.py'. If it is defined, mean-std based normalization is used and other normalization params are ignored
    },

    // VOCABULARY PARAMETERS
--- a/utils/audio.py
+++ b/utils/audio.py
@@ -269,7 +269,7 @@ class AudioProcessor(object):
            y = self._istft(S_complex * angles)
        return y

-    def compute_stft_paddings(x, pad_sides=1):
+    def compute_stft_paddings(self,x, pad_sides=1):
        '''compute right padding (final frame) or both sides padding (first and final frames)
        '''
        assert pad_sides in (1, 2)
--- a/utils/visual.py
+++ b/utils/visual.py
@@ -40,8 +40,8 @@ def plot_spectrogram(linear_output, audio, fig_size=(16, 10)):
    return fig


-def visualize(alignment, spectrogram_postnet, stop_tokens, text, hop_length, CONFIG, spectrogram=None, output_path=None, figsize=[8, 24]):
-    if spectrogram is not None:
+def visualize(alignment, postnet_output, stop_tokens, text, hop_length, CONFIG, decoder_output=None, output_path=None, figsize=[8, 24]):
+    if decoder_output is not None:
        num_plot = 4
    else:
        num_plot = 3
@@ -53,30 +53,35 @@ def visualize(alignment, spectrogram_postnet, stop_tokens, text, hop_length, CON
    plt.imshow(alignment.T, aspect="auto", origin="lower", interpolation=None)
    plt.xlabel("Decoder timestamp", fontsize=label_fontsize)
    plt.ylabel("Encoder timestamp", fontsize=label_fontsize)
+    # compute phoneme representation and back
    if CONFIG.use_phonemes:
        seq = phoneme_to_sequence(text, [CONFIG.text_cleaner], CONFIG.phoneme_language, CONFIG.enable_eos_bos_chars, tp=CONFIG.characters if 'characters' in CONFIG.keys() else None)
        text = sequence_to_phoneme(seq, tp=CONFIG.characters if 'characters' in CONFIG.keys() else None)
        print(text)
-
    plt.yticks(range(len(text)), list(text))
    plt.colorbar()
-
+    # plot stopnet predictions
    stop_tokens = stop_tokens.squeeze().detach().to('cpu').numpy()
    plt.subplot(num_plot, 1, 2)
    plt.plot(range(len(stop_tokens)), list(stop_tokens))
-
+    # plot postnet spectrogram
    plt.subplot(num_plot, 1, 3)
-    librosa.display.specshow(spectrogram_postnet.T, sr=CONFIG.audio['sample_rate'],
-                             hop_length=hop_length, x_axis="time", y_axis="linear")
+    librosa.display.specshow(postnet_output.T, sr=CONFIG.audio['sample_rate'],
+                             hop_length=hop_length, x_axis="time", y_axis="linear",
+                             fmin=CONFIG.audio['mel_fmin'],
+                             fmax=CONFIG.audio['mel_fmax'])
+
    plt.xlabel("Time", fontsize=label_fontsize)
    plt.ylabel("Hz", fontsize=label_fontsize)
    plt.tight_layout()
    plt.colorbar()

-    if spectrogram is not None:
+    if decoder_output is not None:
        plt.subplot(num_plot, 1, 4)
-        librosa.display.specshow(spectrogram.T, sr=CONFIG.audio['sample_rate'],
-                                 hop_length=hop_length, x_axis="time", y_axis="linear")
+        librosa.display.specshow(decoder_output.T, sr=CONFIG.audio['sample_rate'],
+                                 hop_length=hop_length, x_axis="time", y_axis="linear",
+                                 fmin=CONFIG.audio['mel_fmin'],
+                                 fmax=CONFIG.audio['mel_fmax'])
        plt.xlabel("Time", fontsize=label_fontsize)
        plt.ylabel("Hz", fontsize=label_fontsize)
        plt.tight_layout()