mirror of https://github.com/coqui-ai/TTS.git

docstrings for common layers

parent a70917a030
commit cc2b1e043d
@@ -44,7 +44,7 @@ def tts(model, vocoder_model, text, CONFIG, use_cuda, ap, use_gl, speaker_fileid
    # Use alternative when using output npy file from tune_wavegrad
    # beta = np.load("output-tune-wavegrad.npy", allow_pickle=True).item()
    # vocoder_model.compute_noise_level(beta['beta'])

    device_type = "cuda" if use_cuda else "cpu"
    waveform = vocoder_model.inference(torch.FloatTensor(mel_postnet_spec.T).to(device_type).unsqueeze(0))
    if use_cuda and not use_gl:
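The hunk above is mostly tensor plumbing: the mel spectrogram arrives as [T, C] frames, is transposed to [C, T], moved to the selected device, and given a batch dimension before the vocoder sees it. A minimal sketch of that plumbing with a dummy spectrogram and no real vocoder (all names and shapes here are illustrative, not taken from the repo):

    import torch

    mel_postnet_spec = torch.rand(200, 80).numpy()   # dummy [T, C] spectrogram
    use_cuda = torch.cuda.is_available()
    device_type = "cuda" if use_cuda else "cpu"

    mel = torch.FloatTensor(mel_postnet_spec.T)      # [T, C] -> [C, T]
    mel = mel.to(device_type).unsqueeze(0)           # -> [1, C, T], a batch of one
    print(mel.shape)                                 # torch.Size([1, 80, 200])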
@@ -1,10 +1,17 @@
import torch
from torch import nn
from torch.nn import functional as F
from scipy.stats import betabinom
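The betabinom import is new in this hunk but unused in the lines shown; in TTS codebases scipy's beta-binomial distribution is typically used to build a soft diagonal attention-alignment prior, though that use is only an assumption here. A quick look at what the scipy API provides:

    import numpy as np
    from scipy.stats import betabinom

    # betabinom(n, a, b) is a distribution over {0, ..., n}; evaluating its pmf
    # per decoder step yields one row of a soft alignment band (the prior use is
    # an assumption; the hunk only shows the import).
    T_enc = 10
    prior_row = betabinom.pmf(np.arange(T_enc + 1), T_enc, 2, 5)
    print(prior_row.sum())                           # ~1.0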
class Linear(nn.Module):
    """Linear layer with a specific initialization.

    Args:
        in_features (int): number of channels in the input tensor.
        out_features (int): number of channels in the output tensor.
        bias (bool, optional): enable/disable bias in the layer. Defaults to True.
        init_gain (str, optional): method to compute the gain in the weight
            initialization based on the nonlinear activation used afterwards. Defaults to 'linear'.
    """
    def __init__(self,
                 in_features,
                 out_features,
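The hunk truncates the class body at the __init__ signature. A minimal sketch of what a layer matching this docstring plausibly looks like; the xavier-uniform initializer is an assumption, since the diff never shows the body:

    import torch
    from torch import nn

    class Linear(nn.Module):
        # Sketch only: the xavier_uniform_ choice is assumed, not shown in the hunk.
        def __init__(self, in_features, out_features, bias=True, init_gain='linear'):
            super().__init__()
            self.linear_layer = nn.Linear(in_features, out_features, bias=bias)
            # Scale the init by the gain of the activation that follows the layer.
            torch.nn.init.xavier_uniform_(
                self.linear_layer.weight,
                gain=torch.nn.init.calculate_gain(init_gain))

        def forward(self, x):
            return self.linear_layer(x)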
@@ -25,6 +32,16 @@ class Linear(nn.Module):
class LinearBN(nn.Module):
    """Linear layer with Batch Normalization.

    x -> linear -> BN -> o

    Args:
        in_features (int): number of channels in the input tensor.
        out_features (int): number of channels in the output tensor.
        bias (bool, optional): enable/disable bias in the linear layer. Defaults to True.
        init_gain (str, optional): method to set the gain for weight initialization. Defaults to 'linear'.
    """
    def __init__(self,
                 in_features,
                 out_features,
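Again the body is cut off. A sketch of the x -> linear -> BN -> o block the docstring describes, with the forward tail completed from the shape contract in the next hunk (attribute names other than linear_layer, and the permute back to [T, B, C], are assumptions):

    import torch
    from torch import nn

    class LinearBN(nn.Module):
        # Sketch only: composes the Linear idea above with BatchNorm1d.
        def __init__(self, in_features, out_features, bias=True, init_gain='linear'):
            super().__init__()
            self.linear_layer = nn.Linear(in_features, out_features, bias=bias)
            self.batch_normalization = nn.BatchNorm1d(out_features)
            torch.nn.init.xavier_uniform_(
                self.linear_layer.weight,
                gain=torch.nn.init.calculate_gain(init_gain))

        def forward(self, x):
            out = self.linear_layer(x)
            if len(out.shape) == 3:
                out = out.permute(1, 2, 0)    # [T, B, C] -> [B, C, T] for BatchNorm1d
            out = self.batch_normalization(out)
            if len(out.shape) == 3:
                out = out.permute(2, 0, 1)    # [B, C, T] -> [T, B, C] again (assumed)
            return out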
@@ -42,6 +59,10 @@ class LinearBN(nn.Module):
            gain=torch.nn.init.calculate_gain(init_gain))

    def forward(self, x):
        """
        Shapes:
            x: [T, B, C] or [B, C]
        """
        out = self.linear_layer(x)
        if len(out.shape) == 3:
            out = out.permute(1, 2, 0)
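The permute exists because nn.BatchNorm1d normalizes over dimension 1, so a [T, B, C] sequence has to be viewed as [B, C, T] first. A quick shape check against the docstring's contract, using the LinearBN sketch above:

    layer = LinearBN(80, 256)
    layer.eval()                      # skip train-mode batch statistics in a toy check
    x3 = torch.rand(50, 2, 80)        # [T, B, C]
    x2 = torch.rand(4, 80)            # [B, C]
    print(layer(x3).shape)            # torch.Size([50, 2, 256])
    print(layer(x2).shape)            # torch.Size([4, 256])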
@@ -52,6 +73,29 @@ class LinearBN(nn.Module):
class Prenet(nn.Module):
    """Tacotron-specific Prenet with an optional Batch Normalization.

    Note:
        Prenet with BN improves the model performance significantly, especially
        if it is enabled after learning a diagonal attention alignment with the
        original prenet. However, if the target dataset is high quality, then it
        also works from the start. It is also suggested to disable dropout if BN
        is in use.

        prenet_type == "original"
            x -> [linear -> ReLU -> Dropout]xN -> o

        prenet_type == "bn"
            x -> [linear -> BN -> ReLU -> Dropout]xN -> o

    Args:
        in_features (int): number of channels in the input tensor and the inner layers.
        prenet_type (str, optional): prenet type, "original" or "bn". Defaults to "original".
        prenet_dropout (bool, optional): enable/disable dropout. Defaults to True.
        out_features (list, optional): list of output channels for each prenet block.
            It also defines the number of prenet blocks based on the length of the
            argument list. Defaults to [256, 256].
        bias (bool, optional): enable/disable bias in prenet linear layers. Defaults to True.
    """
    # pylint: disable=dangerous-default-value
    def __init__(self,
                 in_features,
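The __init__ is truncated once more. A minimal sketch of a prenet matching the docstring, reusing the Linear and LinearBN sketches above (the exact block wiring and the p=0.5 dropout rate are assumptions, not shown in the hunk):

    from torch import nn
    from torch.nn import functional as F

    class Prenet(nn.Module):
        # pylint: disable=dangerous-default-value
        def __init__(self, in_features, prenet_type="original", prenet_dropout=True,
                     out_features=[256, 256], bias=True):
            super().__init__()
            self.prenet_dropout = prenet_dropout
            # Chain the blocks: each block consumes the previous block's output size.
            in_sizes = [in_features] + out_features[:-1]
            layer_cls = LinearBN if prenet_type == "bn" else Linear
            self.linear_layers = nn.ModuleList([
                layer_cls(in_size, out_size, bias=bias)
                for in_size, out_size in zip(in_sizes, out_features)])

        def forward(self, x):
            for linear in self.linear_layers:
                x = F.relu(linear(x))
                if self.prenet_dropout:
                    # p=0.5 is the usual Tacotron prenet value; assumed here.
                    x = F.dropout(x, p=0.5, training=self.training)
            return x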