"preemphasis":0.0,// pre-emphasis to reduce spec noise and make it more structured. If 0.0, no pre-emphasis.
"ref_level_db":20,// reference level db, theoretically 20db is the sound of air.
// Silence trimming
"do_trim_silence":true,// enable trimming of silence from audio as you load it. LJspeech (true), TWEB (false), Nancy (true)
"trim_db":60,// threshold for trimming silence. Set this according to your dataset.
// Griffin-Lim
"power":1.5,// value to sharpen wav signals after GL algorithm.
"griffin_lim_iters":60,// #griffin-lim iterations. 30-60 is a good range. Larger the value, slower the generation.
// MelSpectrogram parameters
"num_mels":80,// size of the mel spec frame.
"mel_fmin":50.0,// minimum freq level for mel-spec. ~50 for male and ~95 for female voices. Tune for dataset!!
"mel_fmax":7600.0,// maximum freq level for mel-spec. Tune for dataset!!
"spec_gain":1,
// Normalization parameters
"signal_norm":true,// normalize spec values. Mean-Var normalization if 'stats_path' is defined otherwise range normalization defined by the other params.
"min_level_db":-100,// lower bound for normalization
"symmetric_norm":true,// move normalization to range [-1, 1]
"max_norm":4.0,// scale normalization to range [-max_norm, max_norm] or [0, max_norm]
"clip_norm":true,// clip normalized values into the range.
"stats_path":null,// DO NOT USE WITH MULTI_SPEAKER MODEL. scaler stats file computed by 'compute_statistics.py'. If it is defined, mean-std based normalization is used and other normalization params are ignored
"add_blank":false,// if true add a new token after each token of the sentence. This increases the size of the input sequence, but has considerably improved the prosody of the GlowTTS model.
// DISTRIBUTED TRAINING
"distributed":{
"backend":"nccl",
"url":"tcp:\/\/localhost:54321"
},
"reinit_layers":[],// give a list of layer names to restore from the given checkpoint. If not defined, it reloads all heuristically matching layers.
"batch_group_size":0,//Number of batches to shuffle after bucketing.
"min_seq_len":2,// DATASET-RELATED: minimum text length to use in training
"max_seq_len":300,// DATASET-RELATED: maximum text length
"compute_f0":false,// compute f0 values in data-loader
"compute_input_seq_cache":false,// if true, text sequences are computed before starting training. If phonemes are enabled, they are also computed at this stage.
// PATHS
"output_path":"tests/train_outputs/",
// PHONEMES
"phoneme_cache_path":"tests/train_outputs/phoneme_cache/",// phoneme computation is slow, therefore, it caches results in the given folder.
"use_d_vector_file":false,// if true, forces the model to use external embedding per sample instead of nn.embeddings, that is, it supports external embeddings such as those used at: https://arxiv.org/abs/1806.04558
"d_vector_file":"/home/erogol/Data/libritts/speakers.json",// if not null and use_d_vector_file is true, it is used to load a specific embedding file and thus uses these embeddings instead of nn.embeddings, that is, it supports external embeddings such as those used at: https://arxiv.org/abs/1806.04558