diff --git a/config_libritts.json b/config_libritts.json
index 10db4714..f9a752ec 100644
--- a/config_libritts.json
+++ b/config_libritts.json
@@ -37,22 +37,22 @@
     "lr": 0.0001,                // Initial learning rate. If Noam decay is active, maximum learning rate.
     "lr_decay": false,           // if true, Noam learning rate decaying is applied through training.
     "warmup_steps": 4000,        // Noam decay steps to increase the learning rate from 0 to "lr"
-    "windowing": false,          // Enables attention windowing. Used only in eval mode.
     "memory_size": 5,            // ONLY TACOTRON - memory queue size used to queue network predictions to feed autoregressive connection. Useful if r < 5.
     "attention_norm": "sigmoid", // softmax or sigmoid. Suggested to use softmax for Tacotron2 and sigmoid for Tacotron.
     "prenet_type": "original",   // ONLY TACOTRON2 - "original" or "bn".
     "prenet_dropout": true,      // ONLY TACOTRON2 - enable/disable dropout at prenet.
-    "use_forward_attn": true,    // ONLY TACOTRON2 - if it uses forward attention. In general, it aligns faster.
+    "windowing": false,          // Enables attention windowing. Used only in eval mode.
+    "use_forward_attn": false,   // ONLY TACOTRON2 - if it uses forward attention. In general, it aligns faster.
     "forward_attn_mask": false,
     "transition_agent": false,   // ONLY TACOTRON2 - enable/disable transition agent of forward attention.
-    "location_attn": false,      // ONLY TACOTRON2 - enable_disable location sensitive attention. It is enabled for TACOTRON by default.
+    "location_attn": true,       // ONLY TACOTRON2 - enable_disable location sensitive attention. It is enabled for TACOTRON by default.
     "loss_masking": true,        // enable / disable loss masking against the sequence padding.
     "enable_eos_bos_chars": false, // enable/disable beginning of sentence and end of sentence chars.
     "stopnet": true,             // Train stopnet predicting the end of synthesis.
     "separate_stopnet": true,    // Train stopnet seperately if 'stopnet==true'. It prevents stopnet loss to influence the rest of the model. It causes a better model, but it trains SLOWER.
-    "tb_model_param_stats": false, // true, plots param stats per layer on tensorboard. Might be memory consuming, but good for debugging.
+    "tb_model_param_stats": true,  // true, plots param stats per layer on tensorboard. Might be memory consuming, but good for debugging.

-    "batch_size": 16,            // Batch size for training. Lower values than 32 might cause hard to learn attention.
+    "batch_size": 24,            // Batch size for training. Lower values than 32 might cause hard to learn attention.
     "eval_batch_size": 16,
     "r": 1,                      // Number of frames to predict for step.
     "wd": 0.000001,              // Weight decay weight.
@@ -61,7 +61,7 @@
     "print_step": 10,            // Number of steps to log traning on console.
     "batch_group_size": 0,       // Number of batches to shuffle after bucketing.

-    "run_eval": false,
+    "run_eval": true,
     "test_delay_epochs": 5,      // Until attention is aligned, testing only wastes computation time.
     "test_sentences_file": null, // set a file to load sentences to be used for testing. If it is null then we use default english sentences.
     "data_path": "/home/erogol/Data/Libri-TTS/train-clean-360/", // DATASET-RELATED: can overwritten from command argument
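Worth noting for anyone editing these values by hand: the config is JSON with `//` line comments, which plain `json.loads` rejects, so the comments have to be stripped before parsing. A minimal sketch of that approach follows; the function name and regex are an illustration of the idea, not the repo's exact loader.

```python
import json
import re


def load_commented_json(path):
    """Parse a JSON file that allows //-style line comments.

    Standard JSON forbids comments, so they are stripped first. This
    naive regex would also eat a literal "//" inside a string value
    (e.g. in a URL), so it is a sketch, not a robust parser.
    """
    with open(path, "r") as f:
        text = f.read()
    text = re.sub(r"//.*", "", text)  # drop everything after // on each line
    return json.loads(text)


c = load_commented_json("config_libritts.json")
print(c["batch_size"], c["location_attn"], c["run_eval"])  # 24 True True
```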
"data_path": "/home/erogol/Data/Libri-TTS/train-clean-360/", // DATASET-RELATED: can overwritten from command argument diff --git a/utils/generic_utils.py b/utils/generic_utils.py index e911c643..ff971850 100644 --- a/utils/generic_utils.py +++ b/utils/generic_utils.py @@ -275,7 +275,7 @@ def setup_model(num_chars, num_speakers, c): elif c.model.lower() == "tacotron2": model = MyModel( num_chars=num_chars, - num_speakers=c.num_speakers, + num_speakers=num_speakers, r=c.r, attn_win=c.windowing, attn_norm=c.attention_norm,