"run_description":"glow-tts model training with gated conv.",
// AUDIO PARAMETERS
"audio":{
"fft_size":1024,// number of stft frequency levels. Size of the linear spectogram frame.
"win_length":1024,// stft window length in ms.
"hop_length":256,// stft window hop-lengh in ms.
"frame_length_ms":null,// stft window length in ms.If null, 'win_length' is used.
"frame_shift_ms":null,// stft window hop-lengh in ms. If null, 'hop_length' is used.
// Audio processing parameters
"sample_rate":22050,// DATASET-RELATED: wav sample-rate. If different than the original data, it is resampled.
"preemphasis":0.0,// pre-emphasis to reduce spec noise and make it more structured. If 0.0, no -pre-emphasis.
"ref_level_db":0,// reference level db, theoretically 20db is the sound of air.
// Griffin-Lim
"power":1.1,// value to sharpen wav signals after GL algorithm.
"griffin_lim_iters":60,// #griffin-lim iterations. 30-60 is a good range. Larger the value, slower the generation.
// Silence trimming
"do_trim_silence":true,// enable trimming of slience of audio as you load it. LJspeech (false), TWEB (false), Nancy (true)
"trim_db":60,// threshold for timming silence. Set this according to your dataset.
// MelSpectrogram parameters
"num_mels":80,// size of the mel spec frame.
"mel_fmin":50.0,// minimum freq level for mel-spec. ~50 for male and ~95 for female voices. Tune for dataset!!
"mel_fmax":7600.0,// maximum freq level for mel-spec. Tune for dataset!!
"spec_gain":1.0,// scaler value appplied after log transform of spectrogram.
// Normalization parameters
"signal_norm":true,// normalize spec values. Mean-Var normalization if 'stats_path' is defined otherwise range normalization defined by the other params.
"min_level_db":-100,// lower bound for normalization
"symmetric_norm":true,// move normalization to range [-1, 1]
"max_norm":1.0,// scale normalization to range [-max_norm, max_norm] or [0, max_norm]
"clip_norm":true,// clip normalized values into the range.
"stats_path":null// DO NOT USE WITH MULTI_SPEAKER MODEL. scaler stats file computed by 'compute_statistics.py'. If it is defined, mean-std based notmalization is used and other normalization params are ignored
"add_blank":false,// if true add a new token after each token of the sentence. This increases the size of the input sequence, but has considerably improved the prosody of the GlowTTS model.
// DISTRIBUTED TRAINING
"mixed_precision":false,
"distributed":{
"backend":"nccl",
"url":"tcp:\/\/localhost:54323"
},
"reinit_layers":[],// give a list of layer names to restore from the given checkpoint. If not defined, it reloads all heuristically matching layers.
// MODEL PARAMETERS
"use_mas":false,// use Monotonic Alignment Search if true. Otherwise use pre-computed attention alignments.
// TRAINING
"batch_size":2,// Batch size for training. Lower values than 32 might cause hard to learn attention. It is overwritten by 'gradual_training'.
"eval_batch_size":1,
"r":1,// Number of decoder frames to predict per iteration. Set the initial values if gradual training is enabled.
"loss_masking":true,// enable / disable loss masking against the sequence padding.
"test_delay_epochs":0,//Until attention is aligned, testing only wastes computation time.
"test_sentences_file":null,// set a file to load sentences to be used for testing. If it is null then we use default english sentences.
// OPTIMIZER
"noam_schedule":true,// use noam warmup and lr schedule.
"grad_clip":5.0,// upper limit for gradients for clipping.
"epochs":1,// total number of epochs to train.
"lr":1e-3,// Initial learning rate. If Noam decay is active, maximum learning rate.
"wd":0.000001,// Weight decay weight.
"warmup_steps":4000,// Noam decay steps to increase the learning rate from 0 to "lr"
"seq_len_norm":false,// Normalize eash sample loss with its length to alleviate imbalanced datasets. Use it if your dataset is small or has skewed distribution of sequence lengths.