diff --git a/tests/outputs/dummy_model_config.json b/tests/outputs/dummy_model_config.json
index 2f56c6ce..d301b61a 100644
--- a/tests/outputs/dummy_model_config.json
+++ b/tests/outputs/dummy_model_config.json
@@ -4,7 +4,7 @@
 
     "audio":{
         // Audio processing parameters
-        "num_mels": 80,         // size of the mel spec frame. 
+        "num_mels": 80,         // size of the mel spec frame.
         "num_freq": 1025,       // number of stft frequency levels. Size of the linear spectogram frame.
         "sample_rate": 22050,   // DATASET-RELATED: wav sample-rate. If different than the original data, it is resampled.
         "frame_length_ms": 50,  // stft window length in ms.
@@ -31,19 +31,19 @@
 
     "reinit_layers": [],
 
-    "model": "Tacotron2",          // one of the model in models/ 
+    "model": "Tacotron2",          // one of the model in models/
     "grad_clip": 1,                // upper limit for gradients for clipping.
     "epochs": 1000,                // total number of epochs to train.
     "lr": 0.0001,                  // Initial learning rate. If Noam decay is active, maximum learning rate.
     "lr_decay": false,             // if true, Noam learning rate decaying is applied through training.
     "warmup_steps": 4000,          // Noam decay steps to increase the learning rate from 0 to "lr"
     "windowing": false,            // Enables attention windowing. Used only in eval mode.
-    "memory_size": 5,              // ONLY TACOTRON - memory queue size used to queue network predictions to feed autoregressive connection. Useful if r < 5. 
+    "memory_size": 5,              // ONLY TACOTRON - memory queue size used to queue network predictions to feed autoregressive connection. Useful if r < 5.
     "attention_norm": "sigmoid",   // softmax or sigmoid. Suggested to use softmax for Tacotron2 and sigmoid for Tacotron.
     "prenet_type": "original",     // ONLY TACOTRON2 - "original" or "bn".
-    "prenet_dropout": true,        // ONLY TACOTRON2 - enable/disable dropout at prenet. 
+    "prenet_dropout": true,        // ONLY TACOTRON2 - enable/disable dropout at prenet.
     "use_forward_attn": true,      // ONLY TACOTRON2 - if it uses forward attention. In general, it aligns faster.
-    "forward_attn_mask": false, 
+    "forward_attn_mask": false,
     "attention_type": "original",
     "attention_heads": 5,
     "bidirectional_decoder": false,
@@ -51,13 +51,15 @@
     "location_attn": false,        // ONLY TACOTRON2 - enable_disable location sensitive attention. It is enabled for TACOTRON by default.
     "loss_masking": true,          // enable / disable loss masking against the sequence padding.
     "enable_eos_bos_chars": false, // enable/disable beginning of sentence and end of sentence chars.
-    "stopnet": true,               // Train stopnet predicting the end of synthesis. 
+    "stopnet": true,               // Train stopnet predicting the end of synthesis.
     "separate_stopnet": true,      // Train stopnet seperately if 'stopnet==true'. It prevents stopnet loss to influence the rest of the model. It causes a better model, but it trains SLOWER.
     "tb_model_param_stats": false, // true, plots param stats per layer on tensorboard. Might be memory consuming, but good for debugging.
-    "use_gst": false, 
-    
+    "use_gst": false,
+    "double_decoder_consistency": true,  // use DDC explained here https://erogol.com/solving-attention-problems-of-tts-models-with-double-decoder-consistency-draft/
+    "ddc_r": 7,                          // reduction rate for coarse decoder.
+
     "batch_size": 32,       // Batch size for training. Lower values than 32 might cause hard to learn attention.
-    "eval_batch_size":16, 
+    "eval_batch_size":16,
     "r": 1,                 // Number of frames to predict for step.
     "wd": 0.000001,         // Weight decay weight.
     "checkpoint": true,     // If true, it saves checkpoints per "save_step"
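Note that the `//` comments above make the file invalid strict JSON, so `json.load` alone will not parse it. A minimal sketch of a comment-tolerant loader; the `load_config` helper here is illustrative, not necessarily the repo's actual implementation:

```python
import json
import re


def load_config(path):
    """Parse a JSON config that may contain // line comments (sketch).

    Strips everything from '//' to end of line before parsing, since the
    standard json module rejects comments. Naive by design: it assumes
    '//' never appears inside a string *value* (true for this config,
    where URLs only occur inside comments).
    """
    with open(path, "r", encoding="utf-8") as f:
        text = f.read()
    text = re.sub(r"//.*$", "", text, flags=re.MULTILINE)
    return json.loads(text)


config = load_config("tests/outputs/dummy_model_config.json")
assert config["double_decoder_consistency"] is True
assert config["ddc_r"] == 7
```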
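The new `double_decoder_consistency` / `ddc_r` keys enable DDC: per the linked post, a second coarse decoder runs with reduction rate `ddc_r: 7` (one decoder step per 7 frames) alongside the fine decoder (`"r": 1`), and a consistency term pulls the two attention alignments together. A hedged sketch of such a term, assuming PyTorch and alignment tensors shaped `(batch, decoder_steps, text_len)`; the function name and shapes are assumptions for illustration, not the repo's exact API:

```python
import torch
import torch.nn.functional as F


def ddc_attention_loss(fine_align, coarse_align):
    """Double Decoder Consistency term (sketch, not the repo's exact code).

    fine_align:   (B, T_fine, L)   attention weights of the fine decoder (r=1)
    coarse_align: (B, T_coarse, L) attention weights of the coarse decoder
                  (ddc_r=7, so T_coarse is roughly T_fine / 7)

    Upsamples the coarse alignment along the decoder-time axis to the fine
    resolution and penalizes the mean absolute difference, pushing both
    decoders toward the same text-to-frame alignment.
    """
    coarse_up = F.interpolate(
        coarse_align.transpose(1, 2),   # (B, L, T_coarse)
        size=fine_align.shape[1],       # T_fine
        mode="nearest",
    ).transpose(1, 2)                   # back to (B, T_fine, L)
    return (fine_align - coarse_up).abs().mean()


# Usage sketch: add the term to the total training loss.
# total_loss = decoder_loss + ddc_attention_loss(fine_align, coarse_align)
```

The coarse decoder sees 7 frames per step, so it tends to align quickly and robustly; the consistency term then transfers that alignment to the fine decoder, which is the one used at inference.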