From cc2b1e043d44d761cf4b53530609c0b70724b6ca Mon Sep 17 00:00:00 2001
From: erogol
Date: Mon, 11 Jan 2021 15:06:12 +0100
Subject: [PATCH] docstrings for common layers

---
 TTS/bin/synthesize.py           |  2 +-
 TTS/tts/layers/common_layers.py | 46 ++++++++++++++++++++++++++++++++-
 2 files changed, 46 insertions(+), 2 deletions(-)

diff --git a/TTS/bin/synthesize.py b/TTS/bin/synthesize.py
index 12ff4d30..25459f79 100644
--- a/TTS/bin/synthesize.py
+++ b/TTS/bin/synthesize.py
@@ -44,7 +44,7 @@ def tts(model, vocoder_model, text, CONFIG, use_cuda, ap, use_gl, speaker_fileid
         # Use alternative when using output npy file from tune_wavegrad
         # beta = np.load("output-tune-wavegrad.npy", allow_pickle=True).item()
         # vocoder_model.compute_noise_level(beta['beta'])
-
+
         device_type = "cuda" if use_cuda else "cpu"
         waveform = vocoder_model.inference(torch.FloatTensor(mel_postnet_spec.T).to(device_type).unsqueeze(0))
         if use_cuda and not use_gl:
diff --git a/TTS/tts/layers/common_layers.py b/TTS/tts/layers/common_layers.py
index 615f766e..a2eefba5 100644
--- a/TTS/tts/layers/common_layers.py
+++ b/TTS/tts/layers/common_layers.py
@@ -1,10 +1,17 @@
 import torch
 from torch import nn
 from torch.nn import functional as F
-from scipy.stats import betabinom
 
 
 class Linear(nn.Module):
+    """Linear layer with a specific initialization.
+
+    Args:
+        in_features (int): number of channels in the input tensor.
+        out_features (int): number of channels in the output tensor.
+        bias (bool, optional): enable/disable bias in the layer. Defaults to True.
+        init_gain (str, optional): method to compute the gain in the weight initialization based on the nonlinear activation used afterwards. Defaults to 'linear'.
+    """
     def __init__(self,
                  in_features,
                  out_features,
@@ -25,6 +32,16 @@
 
 
 class LinearBN(nn.Module):
+    """Linear layer with Batch Normalization.
+
+    x -> linear -> BN -> o
+
+    Args:
+        in_features (int): number of channels in the input tensor.
+        out_features (int): number of channels in the output tensor.
+        bias (bool, optional): enable/disable bias in the linear layer. Defaults to True.
+        init_gain (str, optional): method to set the gain for weight initialization. Defaults to 'linear'.
+    """
     def __init__(self,
                  in_features,
                  out_features,
@@ -42,6 +59,10 @@
                                 gain=torch.nn.init.calculate_gain(init_gain))
 
     def forward(self, x):
+        """
+        Shapes:
+            x: [T, B, C] or [B, C]
+        """
         out = self.linear_layer(x)
         if len(out.shape) == 3:
             out = out.permute(1, 2, 0)
@@ -52,6 +73,29 @@
 
 
 class Prenet(nn.Module):
+    """Tacotron specific Prenet with an optional Batch Normalization.
+
+    Note:
+        Prenet with BN improves the model performance significantly, especially
+        if it is enabled after learning a diagonal attention alignment with the
+        original prenet. However, if the target dataset is high quality, it also
+        works from the start. It is also suggested to disable dropout if BN is in use.
+
+        prenet_type == "original"
+            x -> [linear -> ReLU -> Dropout]xN -> o
+
+        prenet_type == "bn"
+            x -> [linear -> BN -> ReLU -> Dropout]xN -> o
+
+    Args:
+        in_features (int): number of channels in the input tensor and the inner layers.
+        prenet_type (str, optional): prenet type, "original" or "bn". Defaults to "original".
+        prenet_dropout (bool, optional): enable/disable dropout. Defaults to True.
+        out_features (list, optional): list of output channels for each prenet block.
+            It also defines the number of prenet blocks based on the length of the list.
+            Defaults to [256, 256].
+        bias (bool, optional): enable/disable bias in prenet linear layers. Defaults to True.
+    """
    # pylint: disable=dangerous-default-value
     def __init__(self,
                  in_features,
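
A minimal usage sketch of the layers documented above, for reviewers. It is not part of the patch; the constructor and forward signatures are assumed to match the docstrings and shape notes introduced here, and the import path is the file this patch touches:

    import torch
    from TTS.tts.layers.common_layers import Linear, LinearBN, Prenet

    x = torch.randn(8, 80)                    # [B, C]
    print(Linear(80, 256)(x).shape)           # torch.Size([8, 256])

    seq = torch.randn(50, 8, 80)              # [T, B, C], per the Shapes note
    print(LinearBN(80, 256)(seq).shape)       # torch.Size([50, 8, 256])

    # "bn" prenet with dropout disabled, following the Note in the docstring.
    prenet = Prenet(80, prenet_type="bn", prenet_dropout=False,
                    out_features=[256, 256])
    print(prenet(x).shape)                    # torch.Size([8, 256])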