"num_freq":1025,// number of stft frequency levels. Size of the linear spectogram frame.
"sample_rate":16000,// DATASET-RELATED: wav sample-rate. If different than the original data, it is resampled.
"frame_length_ms":50,// stft window length in ms.
"frame_shift_ms":12.5,// stft window hop-lengh in ms.
"preemphasis":0.98,// pre-emphasis to reduce spec noise and make it more structured. If 0.0, no -pre-emphasis.
"min_level_db":-100,// normalization range
"ref_level_db":20,// reference level db, theoretically 20db is the sound of air.
"power":1.5,// value to sharpen wav signals after GL algorithm.
"griffin_lim_iters":60,// #griffin-lim iterations. 30-60 is a good range. Larger the value, slower the generation.
// Normalization parameters
"signal_norm":true,// normalize the spec values in range [0, 1]
"symmetric_norm":false,// move normalization to range [-1, 1]
"max_norm":1,// scale normalization to range [-max_norm, max_norm] or [0, max_norm]
"clip_norm":true,// clip normalized values into the range.
"mel_fmin":0.0,// minimum freq level for mel-spec. ~50 for male and ~95 for female voices. Tune for dataset!!
"mel_fmax":8000.0,// maximum freq level for mel-spec. Tune for dataset!!
"do_trim_silence":true// enable trimming of slience of audio as you load it. LJspeech (false), TWEB (false), Nancy (true)
},
"distributed":{
"backend":"nccl",
"url":"tcp:\/\/localhost:54321"
},
"reinit_layers":[],
"model":"Tacotron",// one of the model in models/
"grad_clip":1,// upper limit for gradients for clipping.
"epochs":10000,// total number of epochs to train.
"lr":0.0001,// Initial learning rate. If Noam decay is active, maximum learning rate.
"lr_decay":false,// if true, Noam learning rate decaying is applied through training.
"warmup_steps":4000,// Noam decay steps to increase the learning rate from 0 to "lr"
"windowing":false,// Enables attention windowing. Used only in eval mode.
"memory_size":5,// ONLY TACOTRON - memory queue size used to queue network predictions to feed autoregressive connection. Useful if r < 5.
"attention_norm":"softmax",// softmax or sigmoid. Suggested to use softmax for Tacotron2 and sigmoid for Tacotron.
"prenet_type":"original",// ONLY TACOTRON2 - "original" or "bn".
"prenet_dropout":true,// ONLY TACOTRON2 - enable/disable dropout at prenet.
"use_forward_attn":true,// ONLY TACOTRON2 - if it uses forward attention. In general, it aligns faster.
"transition_agent":false,// ONLY TACOTRON2 - enable/disable transition agent of forward attention.
"forward_attn_mask":true,
"location_attn":false,// ONLY TACOTRON2 - enable_disable location sensitive attention. It is enabled for TACOTRON by default.
"loss_masking":true,// enable / disable loss masking against the sequence padding.
"enable_eos_bos_chars":false,// enable/disable beginning of sentence and end of sentence chars.
"stopnet":true,// Train stopnet predicting the end of synthesis.
"separate_stopnet":true,// Train stopnet seperately if 'stopnet==true'. It prevents stopnet loss to influence the rest of the model. It causes a better model, but it trains SLOWER.
"tb_model_param_stats":false,// true, plots param stats per layer on tensorboard. Might be memory consuming, but good for debugging.
"batch_size":32,// Batch size for training. Lower values than 32 might cause hard to learn attention.
"eval_batch_size":32,
"r":5,// Number of frames to predict for step.
"wd":0.000001,// Weight decay weight.
"checkpoint":true,// If true, it saves checkpoints per "save_step"
"save_step":1000,// Number of training steps expected to save traning stats and checkpoints.
"print_step":10,// Number of steps to log traning on console.
"batch_group_size":0,//Number of batches to shuffle after bucketing.
"run_eval":false,
"test_sentences_file":"de_sentences.txt",// set a file to load sentences to be used for testing. If it is null then we use default english sentences.
"test_delay_epochs":5,//Until attention is aligned, testing only wastes computation time.
"data_path":"/home/erogol/Data/m-ai-labs/de_DE/by_book/",// DATASET-RELATED: can overwritten from command argument
"meta_file_train":[],// DATASET-RELATED: metafile(s) for the training dataloader.
"meta_file_val":null,// DATASET-RELATED: metafile for evaluation dataloader.
"dataset":"mailabs",// DATASET-RELATED: one of TTS.dataset.preprocessors depending on your target dataset. Use "tts_cache" for pre-computed dataset by extract_features.py
"min_seq_len":15,// DATASET-RELATED: minimum text length to use in training
"max_seq_len":200,// DATASET-RELATED: maximum text length
"output_path":"/media/erogol/data_ssd/Data/models/mozilla_models/",// DATASET-RELATED: output path for all training outputs.
"num_loader_workers":0,// number of training data loader processes. Don't set it too big. 4-8 are good values.
"num_val_loader_workers":4,// number of evaluation data loader processes.
"phoneme_cache_path":"phoneme_cache",// phoneme computation is slow, therefore, it caches results in the given folder.
"use_phonemes":true,// use phonemes instead of raw characters. It is suggested for better pronounciation.
"phoneme_language":"de",// depending on your target language, pick one from https://github.com/bootphon/phonemizer#languages