config update and initial bias for graves attention

pull/10/head
Eren Golge 2019-11-05 16:30:23 +01:00
parent 926a4d36ce
commit b904bc02d6
3 changed files with 8 additions and 11 deletions

View File

@@ -34,7 +34,6 @@
     "reinit_layers": [],          // give a list of layer names to restore from the given checkpoint. If not defined, it reloads all heuristically matching layers.
-<<<<<<< HEAD
     // TRAINING
     "batch_size": 32,             // Batch size for training. Lower values than 32 might cause hard to learn attention. It is overwritten by 'gradual_training'.
     "eval_batch_size":16,
@@ -48,9 +47,6 @@
     "test_sentences_file": null,  // set a file to load sentences to be used for testing. If it is null then we use default english sentences.
     // OPTIMIZER
-=======
-    "model": "Tacotron2",         // one of the model in models/
->>>>>>> config update and bug fixes
     "grad_clip": 1,               // upper limit for gradients for clipping.
     "epochs": 1000,               // total number of epochs to train.
     "lr": 0.0001,                 // Initial learning rate. If Noam decay is active, maximum learning rate.
@@ -63,12 +59,8 @@
     "prenet_type": "original",    // "original" or "bn".
     "prenet_dropout": true,       // enable/disable dropout at prenet.
-<<<<<<< HEAD
     // ATTENTION
     "attention_type": "original", // 'original' or 'graves'
-=======
-    "attention_type": "graves",   // 'original' or 'graves'
->>>>>>> config update and bug fixes
     "attention_heads": 5,         // number of attention heads (only for 'graves')
     "attention_norm": "sigmoid",  // softmax or sigmoid. Suggested to use softmax for Tacotron2 and sigmoid for Tacotron.
     "windowing": false,           // Enables attention windowing. Used only in eval mode.

View File

@@ -119,11 +119,16 @@ class GravesAttention(nn.Module):
         self.epsilon = 1e-5
         self.J = None
         self.N_a = nn.Sequential(
-            nn.Linear(query_dim, query_dim),
+            nn.Linear(query_dim, query_dim, bias=True),
             nn.Tanh(),
-            nn.Linear(query_dim, 3*K))
+            nn.Linear(query_dim, 3*K, bias=True))
         self.attention_weights = None
         self.mu_prev = None
+        self.init_layers()
+
+    def init_layers(self):
+        torch.nn.init.constant_(self.N_a[2].bias[10:15], 0.5)
+        torch.nn.init.constant_(self.N_a[2].bias[5:10], 10)
 
     def init_states(self, inputs):
         if self.J is None or inputs.shape[1] > self.J.shape[-1]:
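
A note on the new init_layers(): it sets constant biases on the last Linear of N_a so the Graves (GMM) attention starts from a sensible state before training. The snippet below is only an illustrative sketch, not code from this repository; it assumes the 3*K outputs are reshaped into K-sized chunks (g, b, k), as is usual for Graves attention, and uses a made-up query_dim of 256.

    import torch
    import torch.nn as nn

    K = 5                                  # matches "attention_heads": 5 in the config
    query_dim = 256                        # illustrative value, not taken from the diff

    N_a = nn.Sequential(
        nn.Linear(query_dim, query_dim, bias=True),
        nn.Tanh(),
        nn.Linear(query_dim, 3 * K, bias=True))

    # Same constants as the commit: slice [5:10] is the second K-chunk of the
    # last layer's bias and slice [10:15] is the third K-chunk.
    torch.nn.init.constant_(N_a[2].bias[10:15], 0.5)
    torch.nn.init.constant_(N_a[2].bias[5:10], 10)

    # Assumed split of the 3*K outputs into per-component attention parameters.
    query = torch.zeros(2, query_dim)      # dummy decoder queries
    gbk = N_a(query).view(2, 3, K)
    g_t, b_t, k_t = gbk[:, 0, :], gbk[:, 1, :], gbk[:, 2, :]
    print(b_t.mean().item(), k_t.mean().item())   # close to 10 and 0.5 at init

Under that assumed layout, the chunk biased to 10 would seed the width term of each mixture component and the chunk biased to 0.5 a small positive step for the component means, so attention starts broad and moves forward from the first decoder step.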

View File

@@ -198,7 +198,7 @@ def train(model, criterion, criterion_st, optimizer, optimizer_st, scheduler,
         loss.backward()
         optimizer, current_lr = adam_weight_decay(optimizer)
-        grad_norm, _ = check_update(model, c.grad_clip)
+        grad_norm, _ = check_update(model.decoder, c.grad_clip)
         optimizer.step()
 
         # compute alignment score
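
check_update() itself is not shown in this diff. As a rough sketch of what the decoder-only call implies, assuming check_update wraps torch.nn.utils.clip_grad_norm_ plus a finite-norm check (a common pattern, not confirmed by this commit), only the decoder's parameters are now clipped against c.grad_clip:

    import math
    import torch

    def check_update(module, grad_clip):
        # Hypothetical stand-in for the repository's check_update(): clip the
        # gradients of the given module and report whether the norm is finite.
        grad_norm = torch.nn.utils.clip_grad_norm_(module.parameters(), grad_clip)
        skip_update = not math.isfinite(float(grad_norm))
        return grad_norm, skip_update

    # Call site after this commit (decoder-only clipping):
    # grad_norm, _ = check_update(model.decoder, c.grad_clip)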