From 28eb3abfd6b19648eec95588d940f0b9f66cf1d1 Mon Sep 17 00:00:00 2001
From: erogol <erogol@hotmail.com>
Date: Wed, 12 Feb 2020 23:54:33 +0100
Subject: [PATCH] setting stft parameters with constants

---
 config.json    | 12 +++++-------
 utils/audio.py |  9 ++++++++-
 2 files changed, 13 insertions(+), 8 deletions(-)

diff --git a/config.json b/config.json
index c1a8158d..094bb2c6 100644
--- a/config.json
+++ b/config.json
@@ -9,8 +9,8 @@
         "num_mels": 80,         // size of the mel spec frame. 
         "num_freq": 1025,       // number of stft frequency levels. Size of the linear spectogram frame.
         "sample_rate": 22050,   // DATASET-RELATED: wav sample-rate. If different than the original data, it is resampled.
-        "frame_length_ms": 50.0,  // stft window length in ms.
-        "frame_shift_ms": 12.5, // stft window hop-lengh in ms.
+        "win_length": 1024,     // stft window length in samples (not ms).
+        "hop_length": 256,      // stft window hop-length in samples (not ms).
         "preemphasis": 0.98,    // pre-emphasis to reduce spec noise and make it more structured. If 0.0, no -pre-emphasis.
         "min_level_db": -100,   // normalization range
         "ref_level_db": 20,     // reference level db, theoretically 20db is the sound of air.
@@ -63,7 +63,7 @@
     "prenet_dropout": true,        // enable/disable dropout at prenet. 
 
     // ATTENTION
-    "attention_type": "graves",  // 'original' or 'graves'
+    "attention_type": "original",  // 'original' or 'graves'
     "attention_heads": 4,          // number of attention heads (only for 'graves')
     "attention_norm": "sigmoid",   // softmax or sigmoid. Suggested to use softmax for Tacotron2 and sigmoid for Tacotron.
     "windowing": false,            // Enables attention windowing. Used only in eval mode.
@@ -93,8 +93,7 @@
     "max_seq_len": 150,     // DATASET-RELATED: maximum text length
 
     // PATHS
-    // "output_path": "/data5/rw/pit/keep/",      // DATASET-RELATED: output path for all training outputs.
-    "output_path": "/home/erogol/Models/LJSpeech/",
+    "output_path": "/data4/rw/home/Trainings/",
  
     // PHONEMES
     "phoneme_cache_path": "mozilla_us_phonemes",  // phoneme computation is slow, therefore, it caches results in the given folder.
@@ -111,8 +110,7 @@
         [
             {
                 "name": "ljspeech",
-                "path": "/home/erogol/Data/LJSpeech-1.1/",
-                // "path": "/home/erogol/Data/LJSpeech-1.1",
+                "path": "/root/LJSpeech-1.1/",
                 "meta_file_train": "metadata.csv",
                 "meta_file_val": null
             }
diff --git a/utils/audio.py b/utils/audio.py
index 7b2c4834..771e6a43 100644
--- a/utils/audio.py
+++ b/utils/audio.py
@@ -12,6 +12,8 @@ class AudioProcessor(object):
                  min_level_db=None,
                  frame_shift_ms=None,
                  frame_length_ms=None,
+                 hop_length=None,
+                 win_length=None,
                  ref_level_db=None,
                  num_freq=None,
                  power=None,
@@ -49,7 +51,12 @@ class AudioProcessor(object):
         self.do_trim_silence = do_trim_silence
         self.trim_db = trim_db
         self.sound_norm = sound_norm
-        self.n_fft, self.hop_length, self.win_length = self._stft_parameters()
+        if hop_length is None:
+            self.n_fft, self.hop_length, self.win_length = self._stft_parameters()
+        else:
+            self.hop_length = hop_length
+            self.win_length = win_length
+            self.n_fft = (self.num_freq - 1) * 2
         assert min_level_db != 0.0, " [!] min_level_db is 0"
         members = vars(self)
         for key, value in members.items():