"preemphasis":0.0,// pre-emphasis to reduce spec noise and make it more structured. If 0.0, no -pre-emphasis.
"ref_level_db":20,// reference level db, theoretically 20db is the sound of air.
// Silence trimming
"do_trim_silence":true,// enable trimming of slience of audio as you load it. LJspeech (true), TWEB (false), Nancy (true)
"trim_db":60,// threshold for timming silence. Set this according to your dataset.
// Griffin-Lim
"power":1.5,// value to sharpen wav signals after GL algorithm.
"griffin_lim_iters":60,// #griffin-lim iterations. 30-60 is a good range. Larger the value, slower the generation.
// MelSpectrogram parameters
"num_mels":80,// size of the mel spec frame.
"mel_fmin":0.0,// minimum freq level for mel-spec. ~50 for male and ~95 for female voices. Tune for dataset!!
"mel_fmax":8000.0,// maximum freq level for mel-spec. Tune for dataset!!
"spec_gain":20.0,
// Normalization parameters
"signal_norm":true,// normalize spec values. Mean-Var normalization if 'stats_path' is defined otherwise range normalization defined by the other params.
"min_level_db":-100,// lower bound for normalization
"symmetric_norm":true,// move normalization to range [-1, 1]
"max_norm":4.0,// scale normalization to range [-max_norm, max_norm] or [0, max_norm]
"clip_norm":true,// clip normalized values into the range.
"stats_path":null// DO NOT USE WITH MULTI_SPEAKER MODEL. scaler stats file computed by 'compute_statistics.py'. If it is defined, mean-std based notmalization is used and other normalization params are ignored
"gradual_training":[[0,7,4],[1,5,2]],//set gradual training steps [first_step, r, batch_size]. If it is null, gradual training is disabled. For Tacotron, you might need to reduce the 'batch_size' as you proceeed.
"loss_masking":true,// enable / disable loss masking against the sequence padding.
"ga_alpha":10.0,// weight for guided attention loss. If > 0, guided attention is enabled.
"mixed_precision":false,
// VALIDATION
"run_eval":true,
"test_delay_epochs":0,//Until attention is aligned, testing only wastes computation time.
"test_sentences_file":null,// set a file to load sentences to be used for testing. If it is null then we use default english sentences.
// LOSS SETTINGS
"loss_masking":true,// enable / disable loss masking against the sequence padding.
"decoder_loss_alpha":0.5,// original decoder loss weight. If > 0, it is enabled
"postnet_loss_alpha":0.25,// original postnet loss weight. If > 0, it is enabled
"postnet_diff_spec_alpha":0.25,// differential spectral loss weight. If > 0, it is enabled
"decoder_diff_spec_alpha":0.25,// differential spectral loss weight. If > 0, it is enabled
"decoder_ssim_alpha":0.5,// decoder ssim loss weight. If > 0, it is enabled
"postnet_ssim_alpha":0.25,// postnet ssim loss weight. If > 0, it is enabled
"ga_alpha":5.0,// weight for guided attention loss. If > 0, guided attention is enabled.
"stopnet_pos_weight":15.0,// pos class weight for stopnet loss since there are way more negative samples than positive samples.
// OPTIMIZER
"noam_schedule":false,// use noam warmup and lr schedule.
"grad_clip":1.0,// upper limit for gradients for clipping.
"epochs":1,// total number of epochs to train.
"lr":0.0001,// Initial learning rate. If Noam decay is active, maximum learning rate.
"wd":0.000001,// Weight decay weight.
"warmup_steps":4000,// Noam decay steps to increase the learning rate from 0 to "lr"
"seq_len_norm":false,// Normalize eash sample loss with its length to alleviate imbalanced datasets. Use it if your dataset is small or has skewed distribution of sequence lengths.
// TACOTRON PRENET
"memory_size":-1,// ONLY TACOTRON - size of the memory queue used fro storing last decoder predictions for auto-regression. If < 0, memory queue is disabled and decoder only uses the last prediction frame.
"prenet_type":"bn",// "original" or "bn".
"prenet_dropout":false,// enable/disable dropout at prenet.
"attention_heads":4,// number of attention heads (only for 'graves')
"attention_norm":"sigmoid",// softmax or sigmoid.
"windowing":false,// Enables attention windowing. Used only in eval mode.
"use_forward_attn":false,// if it uses forward attention. In general, it aligns faster.
"forward_attn_mask":false,// Additional masking forcing monotonicity only in eval mode.
"transition_agent":false,// enable/disable transition agent of forward attention.
"location_attn":true,// enable_disable location sensitive attention. It is enabled for TACOTRON by default.
"bidirectional_decoder":false,// use https://arxiv.org/abs/1907.09006. Use it, if attention does not work well with your dataset.
"double_decoder_consistency":true,// use DDC explained here https://erogol.com/solving-attention-problems-of-tts-models-with-double-decoder-consistency-draft/
"ddc_r":7,// reduction rate for coarse decoder.
// STOPNET
"stopnet":true,// Train stopnet predicting the end of synthesis.
"separate_stopnet":true,// Train stopnet seperately if 'stopnet==true'. It prevents stopnet loss to influence the rest of the model. It causes a better model, but it trains SLOWER.
// TENSORBOARD and LOGGING
"print_step":1,// Number of steps to log training on console.
"tb_plot_step":100,// Number of steps to plot TB training figures.
"print_eval":false,// If True, it prints intermediate loss values in evalulation.
"save_step":10000,// Number of training steps expected to save traninpg stats and checkpoints.
"checkpoint":true,// If true, it saves checkpoints per "save_step"
"keep_all_best":true,// If true, keeps all best_models after keep_after steps
"keep_after":10000,// Global step after which to keep best models if keep_all_best is true
"tb_model_param_stats":false,// true, plots param stats per layer on tensorboard. Might be memory consuming, but good for debugging.
// DATA LOADING
"text_cleaner":"phoneme_cleaners",
"enable_eos_bos_chars":false,// enable/disable beginning of sentence and end of sentence chars.
"num_loader_workers":0,// number of training data loader processes. Don't set it too big. 4-8 are good values.