mirror of https://github.com/coqui-ai/TTS.git
a ton of linter updates
parent 4422642ec0
commit 9a48ba3821
@@ -170,7 +170,7 @@ def main():
         args.vocoder_name = model_item['default_vocoder'] if args.vocoder_name is None else args.vocoder_name

     if args.vocoder_name is not None:
-        vocoder_path, vocoder_config_path, vocoder_item = manager.download_model(args.vocoder_name)
+        vocoder_path, vocoder_config_path, _ = manager.download_model(args.vocoder_name)

     # CASE3: load custome models
     if args.model_path is not None:
@@ -573,7 +573,7 @@ def main(args):  # pylint: disable=redefined-outer-name
         if c.run_eval:
             target_loss = eval_avg_loss_dict['avg_loss']
         best_loss = save_best_model(target_loss, best_loss, model, optimizer,
-                                    global_step, epoch, c.r, OUT_PATH,
+                                    global_step, epoch, c.r, OUT_PATH, model_characters,
                                     keep_all_best=keep_all_best, keep_after=keep_after)
@@ -1,8 +1,6 @@
 #!/usr/bin/env python3
 # -*- coding: utf-8 -*-

 import argparse
 import glob
 import os
 import sys
 import time
@@ -535,7 +533,7 @@ def main(args):  # pylint: disable=redefined-outer-name
         if c.run_eval:
             target_loss = eval_avg_loss_dict['avg_loss']
         best_loss = save_best_model(target_loss, best_loss, model, optimizer,
-                                    global_step, epoch, c.r, OUT_PATH,
+                                    global_step, epoch, c.r, OUT_PATH, model_characters,
                                     keep_all_best=keep_all_best, keep_after=keep_after)
@@ -648,12 +648,14 @@ def main(args):  # pylint: disable=redefined-outer-name
                 epoch,
                 c.r,
                 OUT_PATH,
+                model_characters,
                 keep_all_best=keep_all_best,
                 keep_after=keep_after,
                 scaler=scaler.state_dict() if c.mixed_precision else None
             )


 if __name__ == '__main__':
     args = parse_arguments(sys.argv)
     c, OUT_PATH, AUDIO_PATH, c_logger, tb_logger = process_args(
@@ -50,7 +50,7 @@ def setup_loader(ap, is_val=False, verbose=False):
     sampler = DistributedSampler(dataset, shuffle=True) if num_gpus > 1 else None
     loader = DataLoader(dataset,
                         batch_size=1 if is_val else c.batch_size,
-                        shuffle=False if num_gpus > 1 else True,
+                        shuffle=num_gpus == 0,
                         drop_last=False,
                         sampler=sampler,
                         num_workers=c.num_val_loader_workers
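Side note (not part of the commit): PyTorch's DataLoader rejects shuffle=True when an explicit sampler is supplied, which is why shuffling is tied to the GPU count here. A minimal standalone sketch of that constraint, using a hypothetical toy dataset:

import torch
from torch.utils.data import DataLoader, TensorDataset
from torch.utils.data.distributed import DistributedSampler

# Toy stand-ins for the variables used in the hunk above.
dataset = TensorDataset(torch.arange(16).float())
num_gpus = 0  # single-process run, so no DistributedSampler is built

sampler = DistributedSampler(dataset, shuffle=True) if num_gpus > 1 else None
loader = DataLoader(dataset,
                    batch_size=4,
                    shuffle=num_gpus == 0,  # must stay False whenever sampler is not None
                    sampler=sampler,
                    drop_last=False)
for (batch,) in loader:
    print(batch)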
@@ -1,7 +1,7 @@
 import collections
 import os
 import random
-from multiprocessing import Manager, Pool
+from multiprocessing import Pool

 import numpy as np
 import torch
@@ -3,7 +3,7 @@ from glob import glob
 import re
 import sys
 from pathlib import Path
-from typing import List, Tuple
+from typing import List

 from tqdm import tqdm
@@ -366,8 +366,10 @@ class RelativePositionTransformer(nn.Module):
             self.proj = nn.Conv1d(hidden_channels, out_channels, 1)

         self.ffn_layers.append(
-            FeedForwardNetwork(hidden_channels,
-                               hidden_channels if (idx + 1) != self.num_layers else out_channels,
+            FeedForwardNetwork(
+                hidden_channels,
+                hidden_channels if
+                (idx + 1) != self.num_layers else out_channels,
                 hidden_channels_ffn,
                 kernel_size,
                 dropout_p=dropout_p))
@@ -75,7 +75,7 @@ class ReferenceEncoder(nn.Module):
         # x: 3D tensor [batch_size, post_conv_width,
         #       num_channels*post_conv_height]
         self.recurrence.flatten_parameters()
-        memory, out = self.recurrence(x)
+        _, out = self.recurrence(x)
         # out: 3D tensor [seq_len==1, batch_size, encoding_size=128]

         return out.squeeze(0)
@@ -2,13 +2,12 @@ import math
 import numpy as np
 import torch
 from torch import nn
-from inspect import signature
 from torch.nn import functional
 from TTS.tts.utils.generic_utils import sequence_mask
 from TTS.tts.utils.ssim import ssim


-# pylint: disable=abstract-method Method
+# pylint: disable=abstract-method
 # relates https://github.com/pytorch/pytorch/issues/42305
 class L1LossMasked(nn.Module):
     def __init__(self, seq_len_norm):
@@ -78,8 +78,7 @@ class RelativePositionTransformerEncoder(nn.Module):
             kernel_size=5,
             num_res_blocks=3,
             num_conv_blocks=1,
-            dilations=[1, 1, 1]
-        )
+            dilations=[1, 1, 1])
         self.rel_pos_transformer = RelativePositionTransformer(
             hidden_channels, out_channels, hidden_channels, **params)
@@ -104,8 +103,7 @@ class ResidualConv1dBNEncoder(nn.Module):
     """
     def __init__(self, in_channels, out_channels, hidden_channels, params):
         super().__init__()
-        self.prenet = nn.Sequential(
-            nn.Conv1d(in_channels, hidden_channels, 1),
+        self.prenet = nn.Sequential(nn.Conv1d(in_channels, hidden_channels, 1),
                                     nn.ReLU())
         self.res_conv_block = ResidualConv1dBNBlock(hidden_channels,
                                                     hidden_channels,
@@ -183,9 +181,8 @@ class Encoder(nn.Module):
         # init encoder
         if encoder_type.lower() == "transformer":
             # text encoder
-            self.encoder = RelativePositionTransformerEncoder(in_hidden_channels,
-                                                              out_channels,
-                                                              in_hidden_channels,
+            self.encoder = RelativePositionTransformerEncoder(
+                in_hidden_channels, out_channels, in_hidden_channels,
                 encoder_params)  # pylint: disable=unexpected-keyword-arg
         elif encoder_type.lower() == 'residual_conv_bn':
             self.encoder = ResidualConv1dBNEncoder(in_hidden_channels,
@@ -32,7 +32,7 @@ def _num2chinese(num :str, big=False, simp=True, o=False, twoalt=False) -> str:
     nd = str(num)
     if abs(float(nd)) >= 1e48:
         raise ValueError('number out of range')
-    elif 'e' in nd:
+    if 'e' in nd:
         raise ValueError('scientific notation is not supported')
     c_symbol = '正负点' if simp else '正負點'
     if o:  # formal
@@ -69,7 +69,7 @@ def _num2chinese(num :str, big=False, simp=True, o=False, twoalt=False) -> str:
             if int(unit) == 0:  # 0000
                 intresult.append(c_basic[0])
                 continue
-            elif nu > 0 and int(unit) == 2:  # 0002
+            if nu > 0 and int(unit) == 2:  # 0002
                 intresult.append(c_twoalt + c_unit2[nu - 1])
                 continue
             ulist = []
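Side note (not part of the commit): both hunks above follow pylint's no-else-raise / no-else-continue advice: when every earlier branch ends in raise or continue, the following elif can become a plain if without changing behavior. A hypothetical minimal example of the same cleanup:

def classify(n):
    if n < 0:
        raise ValueError('negative')
    if n == 0:  # was: elif n == 0:
        return 'zero'
    return 'positive'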
@@ -135,7 +135,7 @@ def setup_model(num_chars, num_speakers, c, speaker_embedding_dim=None):
     return model

 def is_tacotron(c):
-    return False if c['model'] in ['speedy_speech', 'glow_tts'] else True
+    return not c['model'] in ['speedy_speech', 'glow_tts']

 def check_config_tts(c):
     check_argument('model', c, enum_list=['tacotron', 'tacotron2', 'glow_tts', 'speedy_speech'], restricted=True, val_type=str)
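Side note (not part of the commit): the old is_tacotron body is the pattern pylint reports as simplifiable-if-expression; returning the boolean test directly is equivalent. A standalone illustration with a hypothetical config dict, not the project's config object:

def is_tacotron(config):
    # was: return False if config['model'] in ['speedy_speech', 'glow_tts'] else True
    return config['model'] not in ['speedy_speech', 'glow_tts']

assert is_tacotron({'model': 'tacotron2'}) is True
assert is_tacotron({'model': 'glow_tts'}) is False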
@@ -7,7 +7,7 @@ from TTS.utils.io import RenamingUnpickler


-def load_checkpoint(model, checkpoint_path, amp=None, use_cuda=False, eval=False):
+def load_checkpoint(model, checkpoint_path, amp=None, use_cuda=False, eval=False):  # pylint: disable=redefined-builtin
     """Load ```TTS.tts.models``` checkpoints.

     Args:
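Side note (not part of the commit): the trailing pylint directive silences redefined-builtin, which fires because the keyword argument eval shadows Python's built-in eval(). A hypothetical minimal reproduction of the warning and its suppression:

def load_checkpoint_demo(path, eval=False):  # pylint: disable=redefined-builtin
    # Inside this function, `eval` names the boolean flag, not builtins.eval.
    return {'path': path, 'eval_mode': eval}

print(load_checkpoint_demo('model.pth', eval=True))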
@@ -63,8 +63,8 @@ def parse_speakers(c, args, meta_data_train, OUT_PATH):
                 speaker_embedding_dim = None
             save_speaker_mapping(OUT_PATH, speaker_mapping)
             num_speakers = len(speaker_mapping)
-            print(" > Training with {} speakers: {}".format(len(speakers),
-                                                            ", ".join(speakers)))
+            print(" > Training with {} speakers: {}".format(
+                len(speakers), ", ".join(speakers)))
     else:
         num_speakers = 0
         speaker_embedding_dim = None
@@ -17,6 +17,7 @@ def create_window(window_size, channel):
     window = Variable(_2D_window.expand(channel, 1, window_size, window_size).contiguous())
     return window

+
 def _ssim(img1, img2, window, window_size, channel, size_average=True):
     mu1 = F.conv2d(img1, window, padding=window_size // 2, groups=channel)
     mu2 = F.conv2d(img2, window, padding=window_size // 2, groups=channel)
@@ -25,9 +26,13 @@ def _ssim(img1, img2, window, window_size, channel, size_average = True):
     mu2_sq = mu2.pow(2)
     mu1_mu2 = mu1*mu2

-    sigma1_sq = F.conv2d(img1*img1, window, padding = window_size//2, groups = channel) - mu1_sq
-    sigma2_sq = F.conv2d(img2*img2, window, padding = window_size//2, groups = channel) - mu2_sq
-    sigma12 = F.conv2d(img1*img2, window, padding = window_size//2, groups = channel) - mu1_mu2
+    sigma1_sq = F.conv2d(
+        img1 * img1, window, padding=window_size // 2, groups=channel) - mu1_sq
+    sigma2_sq = F.conv2d(
+        img2 * img2, window, padding=window_size // 2, groups=channel) - mu2_sq
+    sigma12 = F.conv2d(
+        img1 * img2, window, padding=window_size // 2,
+        groups=channel) - mu1_mu2

     C1 = 0.01**2
     C2 = 0.03**2
@@ -64,6 +69,7 @@ class SSIM(torch.nn.Module):

         return _ssim(img1, img2, window, self.window_size, channel, self.size_average)

+
 def ssim(img1, img2, window_size=11, size_average=True):
     (_, channel, _, _) = img1.size()
     window = create_window(window_size, channel)
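Side note (not part of the commit): the module-level ssim helper shown above compares two image-shaped tensors of shape [batch, channels, height, width]. A minimal usage sketch, assuming the signature from the hunk and the import path used elsewhere in this diff (TTS.tts.utils.ssim):

import torch
from TTS.tts.utils.ssim import ssim

img = torch.rand(1, 1, 64, 64)
noisy = img + 0.1 * torch.randn_like(img)
print(ssim(img, img).item())    # ~1.0 for identical inputs
print(ssim(img, noisy).item())  # lower for the noisy copy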
@@ -20,9 +20,13 @@ def text_to_seqvec(text, CONFIG):
                 add_blank=CONFIG['add_blank'] if 'add_blank' in CONFIG.keys() else False),
             dtype=np.int32)
     else:
-        seq = np.asarray(
-            text_to_sequence(text, text_cleaner, tp=CONFIG.characters if 'characters' in CONFIG.keys() else None,
-                             add_blank=CONFIG['add_blank'] if 'add_blank' in CONFIG.keys() else False), dtype=np.int32)
+        seq = np.asarray(text_to_sequence(
+            text,
+            text_cleaner,
+            tp=CONFIG.characters if 'characters' in CONFIG.keys() else None,
+            add_blank=CONFIG['add_blank']
+            if 'add_blank' in CONFIG.keys() else False),
+                         dtype=np.int32)
     return seq

@@ -144,8 +144,3 @@ class ModelManager(object):
             if isinstance(key, str) and len(my_dict[key]) > 0:
                 return True
         return False
-
-
-
-
-
@@ -4,7 +4,7 @@ from torch import nn
 from torch.nn import functional as F


-class TorchSTFT(nn.Module):
+class TorchSTFT(nn.Module):  # pylint: disable=abstract-method
     def __init__(self, n_fft, hop_length, win_length, window='hann_window'):
         """ Torch based STFT operation """
         super(TorchSTFT, self).__init__()
@@ -23,7 +23,9 @@ class PositionalEncoding(nn.Module):
     def forward(self, x, noise_level):
         if x.shape[2] > self.pe.shape[1]:
             self.init_pe_matrix(x.shape[1], x.shape[2], x)
-        return x + noise_level[..., None, None] + self.pe[:, :x.size(2)].repeat(x.shape[0], 1, 1) / self.C
+        return x + noise_level[..., None,
+                               None] + self.pe[:, :x.size(2)].repeat(
+                                   x.shape[0], 1, 1) / self.C

     def init_pe_matrix(self, n_channels, max_len, x):
         pe = torch.zeros(max_len, n_channels)
@@ -172,4 +174,3 @@ class DBlock(nn.Module):
         for idx, layer in enumerate(self.main_block):
             if len(layer.state_dict()) != 0:
                 self.main_block[idx] = weight_norm(layer)
-
@@ -79,7 +79,7 @@ class Wavegrad(nn.Module):
         return x

     def load_noise_schedule(self, path):
-        beta = np.load(path, allow_pickle=True).item()['beta']
+        beta = np.load(path, allow_pickle=True).item()['beta']  # pylint: disable=unexpected-keyword-arg
         self.compute_noise_level(beta)

     @torch.no_grad()
@@ -91,8 +91,8 @@ class Wavegrad(nn.Module):
         y_n = torch.FloatTensor(y_n).unsqueeze(0).unsqueeze(0).to(x)
         sqrt_alpha_hat = self.noise_level.to(x)
         for n in range(len(self.alpha) - 1, -1, -1):
-            y_n = self.c1[n] * (y_n -
-                                self.c2[n] * self.forward(y_n, x, sqrt_alpha_hat[n].repeat(x.shape[0])))
+            y_n = self.c1[n] * (y_n - self.c2[n] * self.forward(
+                y_n, x, sqrt_alpha_hat[n].repeat(x.shape[0])))
             if n > 0:
                 z = torch.randn_like(y_n)
                 y_n += self.sigma[n - 1] * z
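Side note (not part of the commit): this loop is the usual diffusion-model ancestral-sampling update used by WaveGrad. Assuming c1, c2 and sigma are precomputed from the noise schedule (as done via compute_noise_level in the hunk above), each iteration has the form

    y_{n-1} = c1[n] * (y_n - c2[n] * f_theta(y_n, x, sqrt_alpha_hat[n])) + sigma[n-1] * z,   z ~ N(0, I)

with the noise term added only while n > 0, exactly as in the reformatted lines.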
@@ -118,9 +118,8 @@ class UpsampleNetwork(nn.Module):


 class Upsample(nn.Module):
-    def __init__(
-        self, scale, pad, num_res_blocks, feat_dims, compute_dims, res_out_dims, use_aux_net
-    ):
+    def __init__(self, scale, pad, num_res_blocks, feat_dims, compute_dims,
+                 res_out_dims, use_aux_net):
         super().__init__()
         self.scale = scale
         self.pad = pad
@@ -44,9 +44,11 @@ def log_sum_exp(x):


 # It is adapted from https://github.com/r9y9/wavenet_vocoder/blob/master/wavenet_vocoder/mixture.py
-def discretized_mix_logistic_loss(
-    y_hat, y, num_classes=65536, log_scale_min=None, reduce=True
-):
+def discretized_mix_logistic_loss(y_hat,
+                                  y,
+                                  num_classes=65536,
+                                  log_scale_min=None,
+                                  reduce=True):
     if log_scale_min is None:
         log_scale_min = float(np.log(1e-14))
     y_hat = y_hat.permute(0, 2, 1)
@@ -7,7 +7,7 @@ import pickle as pickle_tts
 from TTS.utils.io import RenamingUnpickler


-def load_checkpoint(model, checkpoint_path, use_cuda=False, eval=False):
+def load_checkpoint(model, checkpoint_path, use_cuda=False, eval=False):  # pylint: disable=redefined-builtin
     try:
         state = torch.load(checkpoint_path, map_location=torch.device('cpu'))
     except ModuleNotFoundError:
@@ -217,4 +217,3 @@ class SSIMLossTests(unittest.TestCase):
             (sequence_mask(dummy_length).float() - 1.0) * 100.0).unsqueeze(2)
         output = layer(dummy_input + mask, dummy_target, dummy_length)
         assert output.item() == 0, "0 vs {}".format(output.item())
-
@@ -356,4 +356,3 @@ class SCGSTMultiSpeakeTacotronTrainTest(unittest.TestCase):
                 ), "param {} with shape {} not updated!! \n{}\n{}".format(
                     count, param.shape, param, param_ref)
                 count += 1
-
@@ -17,5 +17,5 @@ def test_currency() -> None:


 def test_expand_numbers() -> None:
-    assert "minus one" == phoneme_cleaners("-1")
-    assert "one" == phoneme_cleaners("1")
+    assert phoneme_cleaners("-1") == 'minus one'
+    assert phoneme_cleaners("1") == 'one'
@@ -17,7 +17,7 @@ def test_phoneme_to_sequence():
     lang = "en-us"
     sequence = phoneme_to_sequence(text, text_cleaner, lang)
     text_hat = sequence_to_phoneme(sequence)
-    sequence_with_params = phoneme_to_sequence(text, text_cleaner, lang, tp=conf.characters)
+    _ = phoneme_to_sequence(text, text_cleaner, lang, tp=conf.characters)
     text_hat_with_params = sequence_to_phoneme(sequence, tp=conf.characters)
     gt = 'ɹiːsənt ɹᵻsɜːtʃ æt hɑːɹvɚd hɐz ʃoʊn mɛdᵻteɪɾɪŋ fɔːɹ æz lɪɾəl æz eɪt wiːks kæn æktʃuːəli ɪŋkɹiːs, ðə ɡɹeɪ mæɾɚɹ ɪnðə pɑːɹts ʌvðə bɹeɪn ɹᵻspɑːnsᵻbəl fɔːɹ ɪmoʊʃənəl ɹɛɡjʊleɪʃən ænd lɜːnɪŋ!'
     assert text_hat == text_hat_with_params == gt