From 28eb3abfd6b19648eec95588d940f0b9f66cf1d1 Mon Sep 17 00:00:00 2001 From: erogol Date: Wed, 12 Feb 2020 23:54:33 +0100 Subject: [PATCH] setting stft parameters with constants --- config.json | 12 +++++------- utils/audio.py | 9 ++++++++- 2 files changed, 13 insertions(+), 8 deletions(-) diff --git a/config.json b/config.json index c1a8158d..094bb2c6 100644 --- a/config.json +++ b/config.json @@ -9,8 +9,8 @@ "num_mels": 80, // size of the mel spec frame. "num_freq": 1025, // number of stft frequency levels. Size of the linear spectogram frame. "sample_rate": 22050, // DATASET-RELATED: wav sample-rate. If different than the original data, it is resampled. - "frame_length_ms": 50.0, // stft window length in ms. - "frame_shift_ms": 12.5, // stft window hop-lengh in ms. + "win_length": 1024, // stft window length in samples. + "hop_length": 256, // stft window hop-length in samples. "preemphasis": 0.98, // pre-emphasis to reduce spec noise and make it more structured. If 0.0, no -pre-emphasis. "min_level_db": -100, // normalization range "ref_level_db": 20, // reference level db, theoretically 20db is the sound of air. @@ -63,7 +63,7 @@ "prenet_dropout": true, // enable/disable dropout at prenet. // ATTENTION - "attention_type": "graves", // 'original' or 'graves' + "attention_type": "original", // 'original' or 'graves' "attention_heads": 4, // number of attention heads (only for 'graves') "attention_norm": "sigmoid", // softmax or sigmoid. Suggested to use softmax for Tacotron2 and sigmoid for Tacotron. "windowing": false, // Enables attention windowing. Used only in eval mode. @@ -93,8 +93,7 @@ "max_seq_len": 150, // DATASET-RELATED: maximum text length // PATHS - // "output_path": "/data5/rw/pit/keep/", // DATASET-RELATED: output path for all training outputs. 
- "output_path": "/home/erogol/Models/LJSpeech/", + "output_path": "/data4/rw/home/Trainings/", // PHONEMES "phoneme_cache_path": "mozilla_us_phonemes", // phoneme computation is slow, therefore, it caches results in the given folder. @@ -111,8 +110,7 @@ [ { "name": "ljspeech", - "path": "/home/erogol/Data/LJSpeech-1.1/", - // "path": "/home/erogol/Data/LJSpeech-1.1", + "path": "/root/LJSpeech-1.1/", "meta_file_train": "metadata.csv", "meta_file_val": null } diff --git a/utils/audio.py b/utils/audio.py index 7b2c4834..771e6a43 100644 --- a/utils/audio.py +++ b/utils/audio.py @@ -12,6 +12,8 @@ class AudioProcessor(object): min_level_db=None, frame_shift_ms=None, frame_length_ms=None, + hop_length=None, + win_length=None, ref_level_db=None, num_freq=None, power=None, @@ -49,7 +51,12 @@ class AudioProcessor(object): self.do_trim_silence = do_trim_silence self.trim_db = trim_db self.sound_norm = sound_norm - self.n_fft, self.hop_length, self.win_length = self._stft_parameters() + if hop_length is None: + self.n_fft, self.hop_length, self.win_length = self._stft_parameters() + else: + self.hop_length = hop_length + self.win_length = win_length + self.n_fft = (self.num_freq - 1) * 2 assert min_level_db != 0.0, " [!] min_level_db is 0" members = vars(self) for key, value in members.items():