diff --git a/config.json b/config.json
index 0ad2921b..838f1510 100644
--- a/config.json
+++ b/config.json
@@ -14,7 +14,7 @@
     "epochs": 2000,
     "lr": 0.0006,
     "warmup_steps": 4000,
-    "batch_size": 180,
+    "batch_size": 32,
     "r": 5,
 
     "griffin_lim_iters": 60,
diff --git a/datasets/LJSpeech.py b/datasets/LJSpeech.py
index 334047a1..a42a626e 100644
--- a/datasets/LJSpeech.py
+++ b/datasets/LJSpeech.py
@@ -26,6 +26,7 @@ class LJSpeechDataset(Dataset):
                              frame_length_ms, preemphasis, ref_level_db, num_freq, power)
         print(" > Reading LJSpeech from - {}".format(root_dir))
         print(" | > Number of instances : {}".format(len(self.frames)))
+        self._sort_frames()
 
     def load_wav(self, filename):
         try:
@@ -34,6 +35,20 @@ class LJSpeechDataset(Dataset):
         except RuntimeError as e:
             print(" !! Cannot read file : {}".format(filename))
 
+    def _sort_frames(self):
+        r"""Sort sequences in ascending order of length."""
+        lengths = np.array([len(ins[1]) for ins in self.frames])
+
+        print(" | > Max length sequence {}".format(np.max(lengths)))
+        print(" | > Min length sequence {}".format(np.min(lengths)))
+        print(" | > Avg length sequence {}".format(np.mean(lengths)))
+
+        idxs = np.argsort(lengths)
+        new_frames = [None] * len(lengths)
+        for i, idx in enumerate(idxs):
+            new_frames[i] = self.frames[idx]
+        self.frames = new_frames
+
     def __len__(self):
         return len(self.frames)
 
@@ -47,9 +62,17 @@ class LJSpeechDataset(Dataset):
         return sample
 
     def get_dummy_data(self):
+        r"""Get a dummy input for testing"""
         return torch.autograd.Variable(torch.ones(16, 143)).type(torch.LongTensor)
 
     def collate_fn(self, batch):
+        r"""
+        Perform preprocessing and create the final data batch:
+        1. Pad sequences to the length of the longest sequence in the batch.
+        2. Convert audio signals to spectrograms.
+        3. Pad spectrograms so that their length is divisible by r.
+        4. Convert Numpy arrays to Torch tensors.
+        """
 
         # Puts each data field into a tensor with outer dimension batch size
         if isinstance(batch[0], collections.Mapping):
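Sorting the frames by length means that, together with the non-shuffling loaders set up in train.py further down, each batch holds sequences of similar length and collate_fn adds little padding. A minimal sketch of the same idea, using a hypothetical frames list of (wav_id, text) pairs instead of the real LJSpeech metadata:

    import numpy as np

    # Hypothetical stand-in for LJSpeechDataset.frames: (wav_id, text) pairs.
    frames = [("a", "a fairly long transcript with many words"),
              ("b", "short"),
              ("c", "medium length text")]

    lengths = np.array([len(text) for _, text in frames])
    order = np.argsort(lengths)              # indices from shortest to longest
    frames = [frames[i] for i in order]      # same effect as _sort_frames()
    print([wav_id for wav_id, _ in frames])  # ['b', 'c', 'a']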
+ """ # Puts each data field into a tensor with outer dimension batch size if isinstance(batch[0], collections.Mapping): diff --git a/layers/attention.py b/layers/attention.py index 8d993cea..e7385149 100644 --- a/layers/attention.py +++ b/layers/attention.py @@ -5,26 +5,27 @@ from torch.nn import functional as F class BahdanauAttention(nn.Module): - def __init__(self, dim): + def __init__(self, annot_dim, query_dim, hidden_dim): super(BahdanauAttention, self).__init__() - self.query_layer = nn.Linear(dim, dim, bias=False) - self.tanh = nn.Tanh() - self.v = nn.Linear(dim, 1, bias=False) + self.query_layer = nn.Linear(query_dim, hidden_dim, bias=True) + self.annot_layer = nn.Linear(annot_dim, hidden_dim, bias=True) + self.v = nn.Linear(hidden_dim, 1, bias=False) - def forward(self, query, processed_inputs): + def forward(self, annots, query): """ - Args: - query: (batch, 1, dim) or (batch, dim) - processed_inputs: (batch, max_time, dim) + Shapes: + - query: (batch, 1, dim) or (batch, dim) + - annots: (batch, max_time, dim) """ if query.dim() == 2: # insert time-axis for broadcasting query = query.unsqueeze(1) # (batch, 1, dim) processed_query = self.query_layer(query) + processed_annots = self.annot_layer(annots) # (batch, max_time, 1) - alignment = self.v(self.tanh(processed_query + processed_inputs)) + alignment = self.v(nn.functional.tanh(processed_query + processed_annots)) # (batch, max_time) return alignment.squeeze(-1) @@ -34,7 +35,7 @@ def get_mask_from_lengths(inputs, inputs_lengths): """Get mask tensor from list of length Args: - inputs: (batch, max_time, dim) + inputs: Tensor in size (batch, max_time, dim) inputs_lengths: array like """ mask = inputs.data.new(inputs.size(0), inputs.size(1)).byte().zero_() @@ -43,52 +44,48 @@ def get_mask_from_lengths(inputs, inputs_lengths): return ~mask -class AttentionWrapper(nn.Module): - def __init__(self, rnn_cell, alignment_model, +class AttentionRNN(nn.Module): + def __init__(self, out_dim, annot_dim, memory_dim, score_mask_value=-float("inf")): - super(AttentionWrapper, self).__init__() - self.rnn_cell = rnn_cell - self.alignment_model = alignment_model + super(AttentionRNN, self).__init__() + self.rnn_cell = nn.GRUCell(annot_dim + memory_dim, out_dim) + self.alignment_model = BahdanauAttention(annot_dim, out_dim, out_dim) self.score_mask_value = score_mask_value - def forward(self, query, context_vec, cell_state, inputs, - processed_inputs=None, mask=None, inputs_lengths=None): + def forward(self, memory, context, rnn_state, annotations, + mask=None, annotations_lengths=None): - if processed_inputs is None: - processed_inputs = inputs - - if inputs_lengths is not None and mask is None: - mask = get_mask_from_lengths(inputs, inputs_lengths) + if annotations_lengths is not None and mask is None: + mask = get_mask_from_lengths(annotations, annotations_lengths) # Alignment # (batch, max_time) # e_{ij} = a(s_{i-1}, h_j) - # import ipdb - # ipdb.set_trace() - alignment = self.alignment_model(cell_state, processed_inputs) + alignment = self.alignment_model(annotations, rnn_state) + # TODO: needs recheck. 
diff --git a/layers/tacotron.py b/layers/tacotron.py
index ac348017..6f5926a8 100644
--- a/layers/tacotron.py
+++ b/layers/tacotron.py
@@ -3,7 +3,7 @@ import torch
 from torch.autograd import Variable
 from torch import nn
 
-from .attention import BahdanauAttention, AttentionWrapper
+from .attention import AttentionRNN
 from .attention import get_mask_from_lengths
 
 class Prenet(nn.Module):
@@ -219,15 +219,10 @@ class Decoder(nn.Module):
         self.memory_dim = memory_dim
         self.eps = eps
         self.r = r
-        # input -> |Linear| -> processed_inputs
-        self.input_layer = nn.Linear(in_features, 256, bias=False)
         # memory -> |Prenet| -> processed_memory
         self.prenet = Prenet(memory_dim * r, out_features=[256, 128])
         # processed_inputs, processed_memory -> |Attention| -> Attention, Alignment, RNN_State
-        self.attention_rnn = AttentionWrapper(
-            nn.GRUCell(in_features + 128, 256),
-            BahdanauAttention(256)
-        )
+        self.attention_rnn = AttentionRNN(256, in_features, 128)
         # (processed_memory | attention context) -> |Linear| -> decoder_RNN_input
         self.project_to_decoder_in = nn.Linear(256+in_features, 256)
         # decoder_RNN_input -> |RNN| -> RNN_state
@@ -245,9 +240,9 @@
 
         Args:
             inputs: Encoder outputs.
-            memory: Decoder memory (autoregression. If None (at eval-time),
+            memory (None): Decoder memory (autoregression). If None (at eval-time),
             decoder outputs are used as decoder inputs.
-            input_lengths: Encoder output (memory) lengths. If not None, used for
+            input_lengths (None): input lengths. If not None, used for
             attention masking.
 
         Shapes:
@@ -256,12 +251,11 @@
         """
         B = inputs.size(0)
 
-        # TODO: take this segment into Attention module.
-        processed_inputs = self.input_layer(inputs)
-        if input_lengths is not None:
-            mask = get_mask_from_lengths(processed_inputs, input_lengths)
-        else:
-            mask = None
+
+        # if input_lengths is not None:
+        #     mask = get_mask_from_lengths(processed_inputs, input_lengths)
+        # else:
+        #     mask = None
 
         # Run greedy decoding if memory is None
         greedy = memory is None
@@ -300,20 +294,7 @@
         memory_input = initial_memory
         while True:
             if t > 0:
-                # using harmonized teacher-forcing.
-                # from https://arxiv.org/abs/1707.06588
-                if greedy:
-                    memory_input = outputs[-1]
-                else:
-                    # combine prev. model output and prev. real target
-                    memory_input = torch.div(outputs[-1] + memory[t-1], 2.0)
-                    memory_input = torch.nn.functional.dropout(memory_input,
-                                                               0.1,
-                                                               training=True)
-                    # add a random noise
-                    noise = torch.autograd.Variable(
-                        memory_input.data.new(memory_input.size()).normal_(0.0, 1.0))
-                    memory_input = memory_input + noise
+                memory_input = outputs[-1] if greedy else memory[t - 1]
 
             # Prenet
             processed_memory = self.prenet(memory_input)
@@ -321,7 +302,7 @@
             # Attention RNN
             attention_rnn_hidden, current_context_vec, alignment = self.attention_rnn(
                 processed_memory, current_context_vec, attention_rnn_hidden,
-                inputs, processed_inputs=processed_inputs, mask=mask)
+                inputs)
 
             # Concat RNN output and attention context vector
             decoder_input = self.project_to_decoder_in(
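The decoder loop above drops the harmonized teacher-forcing scheme (averaging the previous prediction with the previous target, plus dropout and added noise) in favour of plain teacher forcing: feed back the ground-truth frame group during training, and the model's own prediction when decoding greedily. A toy sketch of that selection, with hypothetical names rather than the real Decoder internals:

    def next_memory_input(outputs, memory, t, greedy, initial_memory):
        """Pick the decoder input for step t (sketch, not the real Decoder code)."""
        if t == 0:
            return initial_memory                       # all-zero go frame
        # greedy is True at inference time, when no ground truth is available
        return outputs[-1] if greedy else memory[t - 1]

Because the reduction factor r makes the decoder emit r frames per step, memory[t - 1] here stands for the group of ground-truth frames belonging to the previous decoder step.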
diff --git a/models/tacotron.py b/models/tacotron.py
index c6218e40..57c9b43d 100644
--- a/models/tacotron.py
+++ b/models/tacotron.py
@@ -9,11 +9,11 @@ from TTS.layers.tacotron import Prenet, Encoder, Decoder, CBHG
 class Tacotron(nn.Module):
     def __init__(self, embedding_dim=256, linear_dim=1025, mel_dim=80,
                  freq_dim=1025, r=5, padding_idx=None,
-                 use_memory_mask=False):
+                 use_atten_mask=False):
         super(Tacotron, self).__init__()
         self.mel_dim = mel_dim
         self.linear_dim = linear_dim
-        self.use_memory_mask = use_memory_mask
+        self.use_atten_mask = use_atten_mask
         self.embedding = nn.Embedding(len(symbols), embedding_dim,
                                       padding_idx=padding_idx)
         print(" | > Embedding dim : {}".format(len(symbols)))
@@ -33,9 +33,7 @@ class Tacotron(nn.Module):
         # (B, T', in_dim)
         encoder_outputs = self.encoder(inputs)
 
-        if self.use_memory_mask:
-            input_lengths = input_lengths
-        else:
+        if not self.use_atten_mask:
             input_lengths = None
 
         # (B, T', mel_dim*r)
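use_atten_mask only controls whether input_lengths is handed on to the decoder; turning lengths into an actual padding mask is the job of get_mask_from_lengths in attention.py above (and that step is still commented out inside the decoder at this point in the diff). A small, modernised sketch of what such a mask looks like for made-up lengths:

    import torch

    # Made-up lengths for a batch of two padded sequences (max_time = 5).
    lengths = [3, 5]
    mask = torch.zeros(2, 5, dtype=torch.uint8)
    for idx, l in enumerate(lengths):
        mask[idx][:l] = 1
    padding_mask = ~mask.bool()   # True at padded positions, i.e. where scores get masked
    print(padding_mask)
    # tensor([[False, False, False,  True,  True],
    #         [False, False, False, False, False]])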
diff --git a/train.py b/train.py
index 6e7c7726..8aa6567d 100644
--- a/train.py
+++ b/train.py
@@ -199,7 +199,7 @@ def evaluate(model, criterion, data_loader, current_step):
     model = model.train()
     epoch_time = 0
 
-    print("\n | > Validation")
+    print(" | > Validation")
 
     n_priority_freq = int(3000 / (c.sample_rate * 0.5) * c.num_freq)
     progbar = Progbar(len(data_loader.dataset) / c.batch_size)
@@ -246,10 +246,10 @@
                                 ('mel_loss', mel_loss.data[0])])
 
         avg_linear_loss += linear_loss.data[0]
-        avg_mel_loss += avg_mel_loss.data[0]
+        avg_mel_loss += mel_loss.data[0]
 
         # Diagnostic visualizations
-        idx = np.random.randint(c.batch_size)
+        idx = np.random.randint(mel_input.shape[0])
         const_spec = linear_output[idx].data.cpu().numpy()
         gt_spec = linear_spec_var[idx].data.cpu().numpy()
         align_img = alignments[idx].data.cpu().numpy()
@@ -270,7 +270,7 @@
             tb.add_audio('ValSampleAudio', audio_signal, current_step,
                          sample_rate=c.sample_rate)
         except:
-            print("\n > Error at audio signal on TB!!")
+            print(" | > Error at audio signal on TB!!")
             print(audio_signal.max())
             print(audio_signal.min())
 
@@ -305,8 +305,8 @@
                                    )
 
     train_loader = DataLoader(train_dataset, batch_size=c.batch_size,
-                              shuffle=True, collate_fn=train_dataset.collate_fn,
-                              drop_last=True, num_workers=c.num_loader_workers,
+                              shuffle=False, collate_fn=train_dataset.collate_fn,
+                              drop_last=False, num_workers=c.num_loader_workers,
                               pin_memory=True)
 
     val_dataset = LJSpeechDataset(os.path.join(c.data_path, 'metadata_val.csv'),
@@ -325,15 +325,16 @@
                                  )
 
     val_loader = DataLoader(val_dataset, batch_size=c.batch_size,
-                            shuffle=True, collate_fn=val_dataset.collate_fn,
-                            drop_last=True, num_workers= 4,
+                            shuffle=False, collate_fn=val_dataset.collate_fn,
+                            drop_last=False, num_workers= 4,
                             pin_memory=True)
 
     model = Tacotron(c.embedding_size,
                      c.hidden_size,
                      c.num_mels,
                      c.num_freq,
-                     c.r)
+                     c.r,
+                     use_atten_mask=True)
 
     optimizer = optim.Adam(model.parameters(), lr=c.lr)
 
@@ -352,6 +353,7 @@
         start_epoch = 0
         args.restore_step = checkpoint['step']
     else:
+        args.restore_step = 0
        print("\n > Starting a new training")
 
     if use_cuda:
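Two of the evaluate() fixes are easy to miss: the running mel loss previously accumulated itself (avg_mel_loss += avg_mel_loss.data[0]) rather than the batch loss, and the visualization index was drawn from the configured batch size, which can overshoot a final, smaller batch now that drop_last=False. A toy illustration of the indexing point, with made-up sizes:

    import numpy as np

    batch_size = 32                          # configured batch size
    mel_input = np.zeros((20, 200, 80))      # a last batch holding only 20 items

    # idx = np.random.randint(batch_size)        # may return >= 20 -> IndexError
    idx = np.random.randint(mel_input.shape[0])  # always a valid row
    print(mel_input[idx].shape)                  # (200, 80)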