mirror of https://github.com/coqui-ai/TTS.git
setting stft parameters with constants
parent
4130674e46
commit
9eb3c3a342
12
config.json
12
config.json
|
@ -9,8 +9,8 @@
|
||||||
"num_mels": 80, // size of the mel spec frame.
|
"num_mels": 80, // size of the mel spec frame.
|
||||||
"num_freq": 1025, // number of stft frequency levels. Size of the linear spectrogram frame.
|
"num_freq": 1025, // number of stft frequency levels. Size of the linear spectrogram frame.
|
||||||
"sample_rate": 22050, // DATASET-RELATED: wav sample-rate. If different than the original data, it is resampled.
|
"sample_rate": 22050, // DATASET-RELATED: wav sample-rate. If different than the original data, it is resampled.
|
||||||
"frame_length_ms": 50, // stft window length in ms.
|
"win_length": 1024, // stft window length in samples (not ms).
|
||||||
"frame_shift_ms": 12.5, // stft window hop-length in ms.
|
"hop_length": 256, // stft window hop-length in samples (not ms).
|
||||||
"preemphasis": 0.98, // pre-emphasis to reduce spec noise and make it more structured. If 0.0, no pre-emphasis.
|
"preemphasis": 0.98, // pre-emphasis to reduce spec noise and make it more structured. If 0.0, no pre-emphasis.
|
||||||
"min_level_db": -100, // normalization range
|
"min_level_db": -100, // normalization range
|
||||||
"ref_level_db": 20, // reference level db, theoretically 20db is the sound of air.
|
"ref_level_db": 20, // reference level db, theoretically 20db is the sound of air.
|
||||||
|
@ -62,7 +62,7 @@
|
||||||
"prenet_dropout": true, // enable/disable dropout at prenet.
|
"prenet_dropout": true, // enable/disable dropout at prenet.
|
||||||
|
|
||||||
// ATTENTION
|
// ATTENTION
|
||||||
"attention_type": "graves", // 'original' or 'graves'
|
"attention_type": "original", // 'original' or 'graves'
|
||||||
"attention_heads": 4, // number of attention heads (only for 'graves')
|
"attention_heads": 4, // number of attention heads (only for 'graves')
|
||||||
"attention_norm": "sigmoid", // softmax or sigmoid. Suggested to use softmax for Tacotron2 and sigmoid for Tacotron.
|
"attention_norm": "sigmoid", // softmax or sigmoid. Suggested to use softmax for Tacotron2 and sigmoid for Tacotron.
|
||||||
"windowing": false, // Enables attention windowing. Used only in eval mode.
|
"windowing": false, // Enables attention windowing. Used only in eval mode.
|
||||||
|
@ -92,8 +92,7 @@
|
||||||
"max_seq_len": 150, // DATASET-RELATED: maximum text length
|
"max_seq_len": 150, // DATASET-RELATED: maximum text length
|
||||||
|
|
||||||
// PATHS
|
// PATHS
|
||||||
// "output_path": "/data5/rw/pit/keep/", // DATASET-RELATED: output path for all training outputs.
|
"output_path": "/data4/rw/home/Trainings/",
|
||||||
"output_path": "/home/erogol/Models/LJSpeech/",
|
|
||||||
|
|
||||||
// PHONEMES
|
// PHONEMES
|
||||||
"phoneme_cache_path": "mozilla_us_phonemes", // phoneme computation is slow, therefore, it caches results in the given folder.
|
"phoneme_cache_path": "mozilla_us_phonemes", // phoneme computation is slow, therefore, it caches results in the given folder.
|
||||||
|
@ -110,8 +109,7 @@
|
||||||
[
|
[
|
||||||
{
|
{
|
||||||
"name": "ljspeech",
|
"name": "ljspeech",
|
||||||
"path": "/home/erogol/Data/LJSpeech-1.1/",
|
"path": "/root/LJSpeech-1.1/",
|
||||||
// "path": "/home/erogol/Data/LJSpeech-1.1",
|
|
||||||
"meta_file_train": "metadata.csv",
|
"meta_file_train": "metadata.csv",
|
||||||
"meta_file_val": null
|
"meta_file_val": null
|
||||||
}
|
}
|
||||||
|
|
|
@ -12,6 +12,8 @@ class AudioProcessor(object):
|
||||||
min_level_db=None,
|
min_level_db=None,
|
||||||
frame_shift_ms=None,
|
frame_shift_ms=None,
|
||||||
frame_length_ms=None,
|
frame_length_ms=None,
|
||||||
|
hop_length=None,
|
||||||
|
win_length=None,
|
||||||
ref_level_db=None,
|
ref_level_db=None,
|
||||||
num_freq=None,
|
num_freq=None,
|
||||||
power=None,
|
power=None,
|
||||||
|
@ -49,7 +51,12 @@ class AudioProcessor(object):
|
||||||
self.do_trim_silence = do_trim_silence
|
self.do_trim_silence = do_trim_silence
|
||||||
self.trim_db = trim_db
|
self.trim_db = trim_db
|
||||||
self.sound_norm = sound_norm
|
self.sound_norm = sound_norm
|
||||||
self.n_fft, self.hop_length, self.win_length = self._stft_parameters()
|
if hop_length is None:
|
||||||
|
self.n_fft, self.hop_length, self.win_length = self._stft_parameters()
|
||||||
|
else:
|
||||||
|
self.hop_length = hop_length
|
||||||
|
self.win_length = win_length
|
||||||
|
self.n_fft = (self.num_freq - 1) * 2
|
||||||
assert min_level_db != 0.0, " [!] min_level_db is 0"
|
assert min_level_db != 0.0, " [!] min_level_db is 0"
|
||||||
members = vars(self)
|
members = vars(self)
|
||||||
for key, value in members.items():
|
for key, value in members.items():
|
||||||
|
|
Loading…
Reference in New Issue