mirror of https://github.com/coqui-ai/TTS.git
config update and initial bias for graves attention
parent 926a4d36ce
commit b904bc02d6
@@ -34,7 +34,6 @@
     "reinit_layers": [],    // give a list of layer names to restore from the given checkpoint. If not defined, it reloads all heuristically matching layers.

-<<<<<<< HEAD
     // TRAINING
     "batch_size": 32,       // Batch size for training. Lower values than 32 might cause hard to learn attention. It is overwritten by 'gradual_training'.
     "eval_batch_size":16,
@@ -48,9 +47,6 @@
     "test_sentences_file": null,    // set a file to load sentences to be used for testing. If it is null then we use default english sentences.

     // OPTIMIZER
-=======
-    "model": "Tacotron2",           // one of the model in models/
->>>>>>> config update and bug fixes
     "grad_clip": 1,                 // upper limit for gradients for clipping.
     "epochs": 1000,                 // total number of epochs to train.
     "lr": 0.0001,                   // Initial learning rate. If Noam decay is active, maximum learning rate.
@@ -63,12 +59,8 @@
     "prenet_type": "original",      // "original" or "bn".
     "prenet_dropout": true,         // enable/disable dropout at prenet.

-<<<<<<< HEAD
     // ATTENTION
-    "attention_type": "original",   // 'original' or 'graves'
-=======
     "attention_type": "graves",     // 'original' or 'graves'
->>>>>>> config update and bug fixes
     "attention_heads": 5,           // number of attention heads (only for 'graves')
     "attention_norm": "sigmoid",    // softmax or sigmoid. Suggested to use softmax for Tacotron2 and sigmoid for Tacotron.
     "windowing": false,             // Enables attention windowing. Used only in eval mode.
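For reference, once the conflict markers are dropped, the attention-related settings boil down to the block below. This is a minimal sketch, mirroring the JSON above as a Python dict and assuming the 'graves' side of the conflict is the one kept (consistent with the commit title); the "attention_heads" value of 5 is the K that sizes the 3*K output layer in the GravesAttention change that follows.

# A minimal sketch, not the config file itself; values copied from the hunks above.
attention_config = {
    "attention_type": "graves",   # 'original' or 'graves'
    "attention_heads": 5,         # K mixture components, used only by 'graves'
    "attention_norm": "sigmoid",  # softmax or sigmoid
    "windowing": False,           # attention windowing, used only in eval mode
}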
@@ -119,11 +119,16 @@ class GravesAttention(nn.Module):
         self.epsilon = 1e-5
         self.J = None
         self.N_a = nn.Sequential(
-            nn.Linear(query_dim, query_dim),
+            nn.Linear(query_dim, query_dim, bias=True),
             nn.Tanh(),
-            nn.Linear(query_dim, 3*K))
+            nn.Linear(query_dim, 3*K, bias=True))
         self.attention_weights = None
         self.mu_prev = None
+        self.init_layers()
+
+    def init_layers(self):
+        torch.nn.init.constant_(self.N_a[2].bias[10:15], 0.5)
+        torch.nn.init.constant_(self.N_a[2].bias[5:10], 10)

     def init_states(self, inputs):
         if self.J is None or inputs.shape[1] > self.J.shape[-1]:
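The final Linear in N_a emits 3*K values per decoder step (K = attention_heads = 5), which Graves-style GMM attention typically splits into K mixture gains, K width-related terms, and K step-related terms. Under that assumption, the new bias initialization pre-sets the second group of K outputs to 10 and the third group to 0.5 before any training. A minimal sketch of how the bias slices line up with those groups; the group names are illustrative, not the repository's exact forward pass:

import torch

K = 5                                       # attention_heads in the config above
query_dim = 128                             # illustrative value, not from the repo
layer = torch.nn.Linear(query_dim, 3 * K, bias=True)
torch.nn.init.constant_(layer.bias[10:15], 0.5)  # third group of K outputs
torch.nn.init.constant_(layer.bias[5:10], 10)    # second group of K outputs

with torch.no_grad():
    out = layer(torch.zeros(1, query_dim))  # zero input -> output equals the bias
    g, b, k = out.view(1, 3, K).unbind(dim=1)
print(g)  # zeros: mixture gains start flat (uniform if a softmax is applied later)
print(b)  # tens: a strong starting value for the width-related terms (assumed role)
print(k)  # 0.5: a small positive start for the step-related terms (assumed role)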
train.py
@@ -198,7 +198,7 @@ def train(model, criterion, criterion_st, optimizer, optimizer_st, scheduler,

         loss.backward()
         optimizer, current_lr = adam_weight_decay(optimizer)
-        grad_norm, _ = check_update(model, c.grad_clip)
+        grad_norm, _ = check_update(model.decoder, c.grad_clip)
         optimizer.step()

         # compute alignment score
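This change narrows gradient clipping from the whole model to the decoder's parameters only. The repository's check_update implementation is not shown in this commit; the sketch below is a rough approximation of what a helper with that call shape typically does (clip to c.grad_clip and flag non-finite gradients), with names of my own:

import torch

def check_update_sketch(module: torch.nn.Module, grad_clip: float):
    """Clip gradients of one module (e.g. model.decoder) and report the norm.

    A hedged approximation of a check_update-style helper, not the
    repository's actual implementation.
    """
    grad_norm = torch.nn.utils.clip_grad_norm_(module.parameters(), grad_clip)
    skip_step = not torch.isfinite(grad_norm)  # skip the optimizer step on inf/NaN
    return grad_norm, skip_step

Clipping only model.decoder leaves the rest of the network's gradients untouched, presumably to focus the guard on the decoder/attention stack where spikes are most likely.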