diff --git a/layers/tacotron.py b/layers/tacotron.py
index 41bd15d2..195df4fa 100644
--- a/layers/tacotron.py
+++ b/layers/tacotron.py
@@ -57,6 +57,7 @@ class BatchNormConv1d(nn.Module):
                  activation=None):
         super(BatchNormConv1d, self).__init__()
         self.padding = padding
+        self.padder = nn.ConstantPad1d(padding, 0)
         self.conv1d = nn.Conv1d(
             in_channels,
             out_channels,
@@ -69,7 +70,7 @@ class BatchNormConv1d(nn.Module):
         self.activation = activation
 
     def forward(self, x):
-        x = nn.functional.pad(x, self.padding)
+        x = self.padder(x)
         x = self.conv1d(x)
         if self.activation is not None:
             x = self.activation(x)
@@ -135,9 +136,11 @@ class CBHG(nn.Module):
                 padding=[(k - 1) // 2, k // 2],
                 activation=self.relu) for k in range(1, K + 1)
         ])
-        # max pooling of conv bank, padding with nn.functional
+        # max pooling of conv bank, padding with nn.functional
         # TODO: try average pooling OR larger kernel size
-        self.max_pool1d = nn.MaxPool1d(kernel_size=2, stride=1, padding=0)
+        self.max_pool1d = nn.Sequential(
+            nn.ConstantPad1d([0, 1], value=0),
+            nn.MaxPool1d(kernel_size=2, stride=1, padding=0))
         out_features = [K * conv_bank_features] + conv_projections[:-1]
         activations = [self.relu] * (len(conv_projections) - 1)
         activations += [None]
@@ -186,7 +189,6 @@ class CBHG(nn.Module):
             outs.append(out)
         x = torch.cat(outs, dim=1)
         assert x.size(1) == self.conv_bank_features * len(self.conv1d_banks)
-        x = nn.functional.pad(x, [0, 1])
         x = self.max_pool1d(x)
         for conv1d in self.conv1d_projections:
             x = conv1d(x)
@@ -256,6 +258,7 @@ class PostCBHG(nn.Module):
             highway_features=128,
             gru_features=128,
             num_highways=4)
+
     def forward(self, x):
         return self.cbhg(x)
 
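
For reference, a minimal sketch (not part of the patch, input shape chosen arbitrarily) checking that the module-based padding introduced above produces the same result as the previous `nn.functional.pad` call and keeps the time dimension unchanged:

```python
import torch
from torch import nn

# Hypothetical sanity check: (batch, channels, time) input, values are arbitrary.
x = torch.randn(2, 8, 50)

# Old style: functional padding followed by max pooling.
old_pool = nn.MaxPool1d(kernel_size=2, stride=1, padding=0)
old_out = old_pool(nn.functional.pad(x, [0, 1]))

# New style from the diff: padding folded into the module via ConstantPad1d.
new_pool = nn.Sequential(
    nn.ConstantPad1d([0, 1], value=0),
    nn.MaxPool1d(kernel_size=2, stride=1, padding=0))
new_out = new_pool(x)

# Both variants pad one step on the right, so kernel_size=2 / stride=1
# pooling preserves the sequence length and the outputs match exactly.
assert old_out.shape == x.shape == new_out.shape
assert torch.allclose(old_out, new_out)
```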