diff --git a/config.json b/config.json
index 837aa167..c41b1aa3 100644
--- a/config.json
+++ b/config.json
@@ -39,6 +39,7 @@
     "warmup_steps": 4000,        // Noam decay steps to increase the learning rate from 0 to "lr"
     "windowing": false,          // Enables attention windowing. Used only in eval mode.
     "memory_size": 5,            // TO BE IMPLEMENTED -- memory queue size used to queue network predictions to feed autoregressive connection. Useful if r < 5.
+    "attention_norm": "softmax", // softmax or sigmoid. Suggested to use softmax for Tacotron2 and sigmoid for Tacotron.
 
     "batch_size": 16,            // Batch size for training. Lower values than 32 might cause hard to learn attention.
     "eval_batch_size":16,
diff --git a/models/tacotron.py b/models/tacotron.py
index 7bda5ea2..adf74ab3 100644
--- a/models/tacotron.py
+++ b/models/tacotron.py
@@ -14,7 +14,7 @@ class Tacotron(nn.Module):
                  r=5,
                  padding_idx=None,
                  memory_size=5,
-                 attn_windowing=False,
+                 attn_win=False,
                  attn_norm="sigmoid"):
         super(Tacotron, self).__init__()
         self.r = r
@@ -23,7 +23,7 @@ class Tacotron(nn.Module):
         self.embedding = nn.Embedding(num_chars, 256, padding_idx=padding_idx)
         self.embedding.weight.data.normal_(0, 0.3)
         self.encoder = Encoder(256)
-        self.decoder = Decoder(256, mel_dim, r, memory_size, attn_windowing, attn_norm)
+        self.decoder = Decoder(256, mel_dim, r, memory_size, attn_win, attn_norm)
         self.postnet = PostCBHG(mel_dim)
         self.last_linear = nn.Sequential(
             nn.Linear(self.postnet.cbhg.gru_features * 2, linear_dim),
diff --git a/train.py b/train.py
index e4e4a674..83836bba 100644
--- a/train.py
+++ b/train.py
@@ -375,7 +375,7 @@ def main(args):
         init_distributed(args.rank, num_gpus, args.group_id,
                          c.distributed["backend"], c.distributed["url"])
     num_chars = len(phonemes) if c.use_phonemes else len(symbols)
-    model = MyModel(num_chars=num_chars, r=c.r, attention_norm=c.attention_norm)
+    model = MyModel(num_chars=num_chars, r=c.r, attn_norm=c.attention_norm)
     print(" | > Num output units : {}".format(ap.num_freq), flush=True)
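For context, here is a minimal sketch of what the two `attention_norm` modes exposed in config.json imply for the attention layer. The `normalize_alignment` helper, the tensor shape, and the re-normalization of the sigmoid weights are illustrative assumptions, not code from this patch.

```python
import torch
import torch.nn.functional as F

def normalize_alignment(energies: torch.Tensor, attn_norm: str) -> torch.Tensor:
    """Map raw attention energies of shape [batch, T_encoder] to alignment
    weights, following the "softmax" / "sigmoid" switch from config.json.
    Hypothetical helper for illustration only.
    """
    if attn_norm == "softmax":
        # Softmax: weights compete across encoder steps and sum to 1.
        return F.softmax(energies, dim=-1)
    if attn_norm == "sigmoid":
        # Sigmoid: each weight is scored independently; re-normalizing by the
        # sum (an assumption here) keeps the context vector on a stable scale.
        weights = torch.sigmoid(energies)
        return weights / weights.sum(dim=-1, keepdim=True)
    raise ValueError("attn_norm must be 'softmax' or 'sigmoid', got: " + attn_norm)
```

This mirrors the comment added to config.json: softmax is suggested for Tacotron2 and sigmoid for Tacotron.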