diff --git a/layers/tacotron.py b/layers/tacotron.py
index 74738810..d59f76c4 100644
--- a/layers/tacotron.py
+++ b/layers/tacotron.py
@@ -309,7 +309,7 @@ class Decoder(nn.Module):
         self.memory_size = memory_size if memory_size > 0 else r
         self.memory_dim = memory_dim
         # memory -> |Prenet| -> processed_memory
-        self.prenet = Prenet(memory_dim * memory_dim * self.memory_size, out_features=[256, 128])
+        self.prenet = Prenet(memory_dim * self.memory_size, out_features=[256, 128])
         # processed_inputs, processed_memory -> |Attention| -> Attention, attention, RNN_State
         self.attention_rnn = AttentionRNNCell(
             out_dim=128,
diff --git a/models/tacotron.py b/models/tacotron.py
index 435e11d7..6a12d257 100644
--- a/models/tacotron.py
+++ b/models/tacotron.py
@@ -13,6 +13,7 @@ class Tacotron(nn.Module):
                  mel_dim=80,
                  r=5,
                  padding_idx=None,
+                 memory_size=5,
                  attn_windowing=False):
         super(Tacotron, self).__init__()
         self.r = r
@@ -23,7 +24,7 @@ class Tacotron(nn.Module):
         print(" | > Number of characters : {}".format(num_chars))
         self.embedding.weight.data.normal_(0, 0.3)
         self.encoder = Encoder(embedding_dim)
-        self.decoder = Decoder(256, mel_dim, r, attn_windowing)
+        self.decoder = Decoder(256, mel_dim, r, memory_size, attn_windowing)
         self.postnet = PostCBHG(mel_dim)
         self.last_linear = nn.Sequential(
             nn.Linear(self.postnet.cbhg.gru_features * 2, linear_dim),
diff --git a/train.py b/train.py
index 74eb8b97..076b7042 100644
--- a/train.py
+++ b/train.py
@@ -357,7 +357,7 @@ def evaluate(model, criterion, criterion_st, ap, current_step):


 def main(args):
     num_chars = len(phonemes) if c.use_phonemes else len(symbols)
-    model = Tacotron(num_chars, c.embedding_size, ap.num_freq, ap.num_mels, c.r)
+    model = Tacotron(num_chars, c.embedding_size, ap.num_freq, ap.num_mels, c.r, c.memory_size)
     print(" | > Num output units : {}".format(ap.num_freq), flush=True)
     optimizer = optim.Adam(model.parameters(), lr=c.lr, weight_decay=0)
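
The layers/tacotron.py hunk is the actual bug fix: the decoder's Prenet consumes a flattened queue of the last memory_size output frames, so its input width must be memory_dim * memory_size; the old memory_dim * memory_dim * self.memory_size expression over-sized the Prenet's first linear layer and would raise a shape-mismatch error in the forward pass. The other two hunks just plumb the new memory_size hyperparameter from the config (c.memory_size) through Tacotron into Decoder. Below is a minimal, self-contained sketch of that shape argument; the queue handling is an assumption inferred from the diff, not code copied from the repo:

import torch
import torch.nn as nn

batch, memory_dim, memory_size = 2, 80, 5

# Hypothetical memory queue: the last `memory_size` mel frames, one
# memory_dim-wide frame per slot, as the decoder is assumed to keep it.
memory_queue = torch.zeros(batch, memory_size, memory_dim)

# The queue is flattened before entering the Prenet, so the Prenet's
# first linear layer must accept memory_dim * memory_size features.
prenet_input = memory_queue.reshape(batch, -1)  # shape: (2, 400)

fixed = nn.Linear(memory_dim * memory_size, 256)                # in_features = 400
broken = nn.Linear(memory_dim * memory_dim * memory_size, 256)  # in_features = 32000

fixed(prenet_input)        # works: input width matches 400
try:
    broken(prenet_input)   # fails: layer expects 32000 features, gets 400
except RuntimeError as err:
    print("shape mismatch:", err)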