docstrings for common layers

pull/10/head
erogol 2021-01-11 15:06:12 +01:00
parent a70917a030
commit cc2b1e043d
2 changed files with 46 additions and 2 deletions

@@ -44,7 +44,7 @@ def tts(model, vocoder_model, text, CONFIG, use_cuda, ap, use_gl, speaker_fileid
    # Use alternative when using output npy file from tune_wavegrad
    # beta = np.load("output-tune-wavegrad.npy", allow_pickle=True).item()
    # vocoder_model.compute_noise_level(beta['beta'])
    device_type = "cuda" if use_cuda else "cpu"
    waveform = vocoder_model.inference(torch.FloatTensor(mel_postnet_spec.T).to(device_type).unsqueeze(0))
    if use_cuda and not use_gl:

@@ -1,10 +1,17 @@
import torch
from torch import nn
from torch.nn import functional as F
from scipy.stats import betabinom
class Linear(nn.Module):
"""Linear layer with a specific initialization.
Args:
in_features (int): number of channels in the input tensor.
out_features (int): number of channels in the output tensor.
bias (bool, optional): enable/disable bias in the layer. Defaults to True.
init_gain (str, optional): method to compute the gain in the weight initializtion based on the nonlinear activation used afterwards. Defaults to 'linear'.
"""
    def __init__(self,
                 in_features,
                 out_features,
@@ -25,6 +32,16 @@ class Linear(nn.Module):
class LinearBN(nn.Module):
"""Linear layer with Batch Normalization.
x -> linear -> BN -> o
Args:
in_features (int): number of channels in the input tensor.
out_features (int ): number of channels in the output tensor.
bias (bool, optional): enable/disable bias in the linear layer. Defaults to True.
init_gain (str, optional): method to set the gain for weight initialization. Defaults to 'linear'.
"""
    def __init__(self,
                 in_features,
                 out_features,
@@ -42,6 +59,10 @@ class LinearBN(nn.Module):
                                    gain=torch.nn.init.calculate_gain(init_gain))

    def forward(self, x):
"""
Shapes:
x: [T, B, C] or [B, C]
"""
        out = self.linear_layer(x)
        if len(out.shape) == 3:
            out = out.permute(1, 2, 0)
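A hedged sketch of the shape contract above (same assumed import path as before): a 3D [T, B, C] input is permuted so BatchNorm1d normalizes over the channel dimension, then permuted back.

import torch
from TTS.tts.layers.common_layers import LinearBN  # assumed module path

layer = LinearBN(in_features=80, out_features=256)
x3 = torch.randn(50, 4, 80)  # [T, B, C]
y3 = layer(x3)               # -> [50, 4, 256]
x2 = torch.randn(4, 80)      # [B, C]
y2 = layer(x2)               # -> [4, 256]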
@@ -52,6 +73,29 @@ class LinearBN(nn.Module):
class Prenet(nn.Module):
"""Tacotron specific Prenet with an optional Batch Normalization.
Note:
Prenet with BN improves the model performance significantly especially
if it is enabled after learning a diagonal attention alignment with the original
prenet. However, if the target dataset is high quality then it also works from
the start. It is also suggested to disable dropout if BN is in use.
prenet_type == "original"
x -> [linear -> ReLU -> Dropout]xN -> o
prenet_type == "bn"
x -> [linear -> BN -> ReLU -> Dropout]xN -> o
Args:
in_features (int): number of channels in the input tensor and the inner layers.
prenet_type (str, optional): prenet type "original" or "bn". Defaults to "original".
prenet_dropout (bool, optional): dropout rate. Defaults to True.
out_features (list, optional): List of output channels for each prenet block.
It also defines number of the prenet blocks based on the length of argument list.
Defaults to [256, 256].
bias (bool, optional): enable/disable bias in prenet linear layers. Defaults to True.
"""
    # pylint: disable=dangerous-default-value
    def __init__(self,
                 in_features,
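A small usage sketch matching the Prenet docstring (import path assumed; a per-decoder-step [B, C] input as in the Tacotron decoder):

import torch
from TTS.tts.layers.common_layers import Prenet  # assumed module path

prenet = Prenet(in_features=80, prenet_type="original",
                prenet_dropout=True, out_features=[256, 256])
x = torch.randn(4, 80)  # one decoder frame: [B, C]
y = prenet(x)           # -> [4, 256] after two [linear -> ReLU -> Dropout] blocks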