mirror of https://github.com/coqui-ai/TTS.git

formatting

parent 87d674a038 · commit 8cb27267a4
@@ -29,7 +29,6 @@ import pandas
 import soundfile as sf
 from absl import logging
 
 
 SUBSETS = {
     "vox1_dev_wav": [
         "http://www.robots.ox.ac.uk/~vgg/data/voxceleb/vox1a/vox1_dev_wav_partaa",
@@ -191,7 +190,7 @@ def convert_audio_and_make_label(input_dir, subset, output_dir, output_file):
 
 
 def processor(directory, subset, force_process):
-    """ download and process """
+    """download and process"""
     urls = SUBSETS
     if subset not in urls:
         raise ValueError(subset, "is not in voxceleb")
@@ -18,7 +18,7 @@ class FFTransformer(nn.Module):
         self.dropout = nn.Dropout(dropout_p)
 
     def forward(self, src, src_mask=None, src_key_padding_mask=None):
-        """😦 ugly looking with all the transposing """
+        """😦 ugly looking with all the transposing"""
         src = src.permute(2, 0, 1)
         src2, enc_align = self.self_attn(src, src, src, attn_mask=src_mask, key_padding_mask=src_key_padding_mask)
         src = self.norm1(src + src2)
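The transposing the docstring laments exists because `nn.MultiheadAttention` (with its default layout) expects (time, batch, channels), while the surrounding TTS code passes tensors as (batch, channels, time). A minimal sketch of the round trip, with made-up dimensions:

```python
import torch
import torch.nn as nn

batch, channels, time = 2, 128, 50
src = torch.randn(batch, channels, time)  # (B, D, T), as produced upstream

attn = nn.MultiheadAttention(embed_dim=channels, num_heads=2)

# nn.MultiheadAttention expects (T, B, D) with batch_first left at its default
src_t = src.permute(2, 0, 1)       # (T, B, D)
out, align = attn(src_t, src_t, src_t)  # self-attention: query = key = value
out = out.permute(1, 2, 0)         # back to (B, D, T) for the rest of the network
print(out.shape)                   # torch.Size([2, 128, 50])
```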
@@ -39,7 +39,7 @@ class TacotronAbstract(ABC, nn.Module):
         gst_style_tokens=10,
         gst_use_speaker_embedding=False,
     ):
-        """ Abstract Tacotron class """
+        """Abstract Tacotron class"""
         super().__init__()
         self.num_chars = num_chars
         self.r = r
@@ -153,7 +153,7 @@ class TacotronAbstract(ABC, nn.Module):
         return input_mask, output_mask
 
     def _backward_pass(self, mel_specs, encoder_outputs, mask):
-        """ Run backwards decoder """
+        """Run backwards decoder"""
         decoder_outputs_b, alignments_b, _ = self.decoder_backward(
             encoder_outputs, torch.flip(mel_specs, dims=(1,)), mask
         )
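The backward decoder sees the target spectrogram reversed along the time axis via `torch.flip`; a tiny sketch of that reversal, assuming the (batch, time, mel) layout used in the call above:

```python
import torch

mel_specs = torch.arange(6.0).reshape(1, 3, 2)     # (B=1, T=3, n_mel=2)
reversed_specs = torch.flip(mel_specs, dims=(1,))  # reverse the time dimension only

print(mel_specs[0, :, 0])       # tensor([0., 2., 4.])
print(reversed_specs[0, :, 0])  # tensor([4., 2., 0.])
```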
@@ -161,7 +161,7 @@ class TacotronAbstract(ABC, nn.Module):
         return decoder_outputs_b, alignments_b
 
     def _coarse_decoder_pass(self, mel_specs, encoder_outputs, alignments, input_mask):
-        """ Double Decoder Consistency """
+        """Double Decoder Consistency"""
         T = mel_specs.shape[1]
         if T % self.coarse_decoder.r > 0:
             padding_size = self.coarse_decoder.r - (T % self.coarse_decoder.r)
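The coarse decoder consumes frames in groups of its reduction factor `r`, so `T` is padded up to the next multiple of `r` when it does not divide evenly. A worked sketch of the arithmetic with illustrative numbers:

```python
import torch
import torch.nn.functional as F

T, r = 157, 6                    # 157 mel frames, coarse decoder reduction factor 6
mel = torch.randn(1, T, 80)      # (B, T, n_mel)
if T % r > 0:
    padding_size = r - (T % r)   # 6 - (157 % 6) = 6 - 1 = 5
    # pad the time dimension (second-to-last) on the right
    mel = F.pad(mel, (0, 0, 0, padding_size))
print(mel.shape)                 # torch.Size([1, 162, 80]); 162 % 6 == 0
```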
@@ -182,7 +182,7 @@ class TacotronAbstract(ABC, nn.Module):
     #############################
 
     def compute_speaker_embedding(self, speaker_ids):
-        """ Compute speaker embedding vectors """
+        """Compute speaker embedding vectors"""
         if hasattr(self, "speaker_embedding") and speaker_ids is None:
             raise RuntimeError(" [!] Model has speaker embedding layer but speaker_id is not provided")
         if hasattr(self, "speaker_embedding") and speaker_ids is not None:
@@ -191,7 +191,7 @@ class TacotronAbstract(ABC, nn.Module):
             self.speaker_embeddings_projected = self.speaker_project_mel(self.speaker_embeddings).squeeze(1)
 
     def compute_gst(self, inputs, style_input, speaker_embedding=None):
-        """ Compute global style token """
+        """Compute global style token"""
         device = inputs.device
         if isinstance(style_input, dict):
             query = torch.zeros(1, 1, self.gst_embedding_dim // 2).to(device)
@@ -140,7 +140,7 @@ class Attention(keras.layers.Layer):
         return tuple(states)
 
     def process_values(self, values):
-        """ cache values for decoder iterations """
+        """cache values for decoder iterations"""
         # pylint: disable=attribute-defined-outside-init
         self.processed_values = self.inputs_layer(values)
         self.values = values
@@ -158,14 +158,14 @@ class Attention(keras.layers.Layer):
         return score, processed_query
 
     def get_attn(self, query):
-        """ compute query layer and unnormalized attention weights """
+        """compute query layer and unnormalized attention weights"""
         processed_query = self.query_layer(tf.expand_dims(query, 1))
         score = self.v(tf.nn.tanh(self.processed_values + processed_query))
         score = tf.squeeze(score, axis=2)
         return score, processed_query
 
     def apply_score_masking(self, score, mask):  # pylint: disable=no-self-use
-        """ ignore sequence paddings """
+        """ignore sequence paddings"""
         padding_mask = tf.expand_dims(math_ops.logical_not(mask), 2)
         # Bias so padding positions do not contribute to attention distribution.
         score -= 1.0e9 * math_ops.cast(padding_mask, dtype=tf.float32)
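The `-1.0e9` bias makes padded positions contribute essentially zero probability once the scores go through a softmax. A minimal TensorFlow sketch of the same idea, using a toy score row where the last position is padding:

```python
import tensorflow as tf

score = tf.constant([[2.0, 1.0, 3.0]])     # unnormalized attention scores, (batch, time)
mask = tf.constant([[True, True, False]])  # True = real frame, False = padding

# bias padded positions so softmax assigns them ~0 probability
score -= 1.0e9 * tf.cast(tf.math.logical_not(mask), tf.float32)
weights = tf.nn.softmax(score, axis=-1)
print(weights.numpy().round(3))            # [[0.731 0.269 0.   ]]
```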
@@ -7,7 +7,7 @@ import tensorflow as tf
 
 
 def tf_create_dummy_inputs():
-    """ Create dummy inputs for TF Tacotron2 model """
+    """Create dummy inputs for TF Tacotron2 model"""
     batch_size = 4
     max_input_length = 32
     max_mel_length = 128
@@ -25,12 +25,12 @@ def tf_create_dummy_inputs():
 
 
 def compare_torch_tf(torch_tensor, tf_tensor):
-    """ Compute the average absolute difference b/w torch and tf tensors """
+    """Compute the average absolute difference b/w torch and tf tensors"""
     return abs(torch_tensor.detach().numpy() - tf_tensor.numpy()).mean()
 
 
 def convert_tf_name(tf_name):
-    """ Convert certain patterns in TF layer names to Torch patterns """
+    """Convert certain patterns in TF layer names to Torch patterns"""
     tf_name_tmp = tf_name
     tf_name_tmp = tf_name_tmp.replace(":0", "")
     tf_name_tmp = tf_name_tmp.replace("/forward_lstm/lstm_cell_1/recurrent_kernel", "/weight_hh_l0")
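Applied to a concrete variable name, the chain of `str.replace` calls strips the TF `:0` suffix and maps layer-name patterns to Torch equivalents. A sketch using only the two substitutions visible above, on a hypothetical variable name (for illustration only):

```python
def convert_tf_name_sketch(tf_name):
    """Abbreviated version with just the two substitutions shown in the diff."""
    tf_name_tmp = tf_name.replace(":0", "")
    return tf_name_tmp.replace("/forward_lstm/lstm_cell_1/recurrent_kernel", "/weight_hh_l0")

# hypothetical TF variable name, for illustration only
print(convert_tf_name_sketch("encoder/lstm/forward_lstm/lstm_cell_1/recurrent_kernel:0"))
# -> encoder/lstm/weight_hh_l0
```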
@@ -44,7 +44,7 @@ def convert_tf_name(tf_name):
 
 
 def transfer_weights_torch_to_tf(tf_vars, var_map_dict, state_dict):
-    """ Transfer weigths from torch state_dict to TF variables """
+    """Transfer weigths from torch state_dict to TF variables"""
     print(" > Passing weights from Torch to TF ...")
     for tf_var in tf_vars:
         torch_var_name = var_map_dict[tf_var.name]
@@ -33,7 +33,7 @@ def _pad_stop_target(x, length):
 
 
 def prepare_stop_target(inputs, out_steps):
-    """ Pad row vectors with 1. """
+    """Pad row vectors with 1."""
     max_len = max((x.shape[0] for x in inputs))
     remainder = max_len % out_steps
     pad_len = max_len + (out_steps - remainder) if remainder > 0 else max_len
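Stop targets are padded with 1 (the "stop" value) up to the next multiple of the decoder's output steps, so every decoder step maps to a complete frame group. The arithmetic, with illustrative numbers:

```python
max_len, out_steps = 103, 7      # longest stop-target vector; decoder emits 7 frames/step
remainder = max_len % out_steps  # 103 % 7 = 5
pad_len = max_len + (out_steps - remainder) if remainder > 0 else max_len
print(pad_len)                   # 105, the next multiple of 7
```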
@@ -40,7 +40,7 @@ def get_speakers(items):
 
 
 def parse_speakers(c, args, meta_data_train, OUT_PATH):
-    """ Returns number of speakers, speaker embedding shape and speaker mapping"""
+    """Returns number of speakers, speaker embedding shape and speaker mapping"""
     if c.use_speaker_embedding:
         speakers = get_speakers(meta_data_train)
         if args.restore_path:
@@ -366,7 +366,7 @@ class AudioProcessor(object):
         return len(wav)
 
     def trim_silence(self, wav):
-        """ Trim silent parts with a threshold and 0.01 sec margin """
+        """Trim silent parts with a threshold and 0.01 sec margin"""
         margin = int(self.sample_rate * 0.01)
         wav = wav[margin:-margin]
         return librosa.effects.trim(wav, top_db=self.trim_db, frame_length=self.win_length, hop_length=self.hop_length)[
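`librosa.effects.trim` returns the trimmed signal plus the kept interval; the method takes element `[0]` and first shaves a 0.01 s margin off both ends. A minimal sketch with assumed settings (the sample rate, dB threshold, and window sizes below are illustrative, not the project's config):

```python
import numpy as np
import librosa

sample_rate, trim_db = 22050, 60
win_length, hop_length = 1024, 256

# synthetic signal: 0.5 s silence, 1 s noise, 0.5 s silence
wav = np.concatenate([
    np.zeros(sample_rate // 2),
    0.5 * np.random.randn(sample_rate),
    np.zeros(sample_rate // 2),
])

margin = int(sample_rate * 0.01)  # ~0.01 s shaved off each end first
wav = wav[margin:-margin]
trimmed, _ = librosa.effects.trim(wav, top_db=trim_db, frame_length=win_length, hop_length=hop_length)
print(len(wav), len(trimmed))     # the trimmed signal is roughly the 1 s noise burst
```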
@@ -37,7 +37,7 @@ def get_commit_hash():
 
 
 def create_experiment_folder(root_path, model_name, debug):
-    """ Create a folder with the current date and time """
+    """Create a folder with the current date and time"""
     date_str = datetime.datetime.now().strftime("%B-%d-%Y_%I+%M%p")
     if debug:
         commit_hash = "debug"
@@ -105,7 +105,7 @@ class GANDataset(Dataset):
         random.shuffle(self.G_to_D_mappings)
 
     def load_item(self, idx):
-        """ load (audio, feat) couple """
+        """load (audio, feat) couple"""
         if self.compute_feat:
             # compute features from wav
             wavpath = self.item_list[idx]
@@ -78,7 +78,7 @@ class WaveGradDataset(Dataset):
         return samples
 
     def load_item(self, idx):
-        """ load (audio, feat) couple """
+        """load (audio, feat) couple"""
         # compute features from wav
         wavpath = self.item_list[idx]
 
@@ -131,7 +131,7 @@ class MultiScaleSTFTLoss(torch.nn.Module):
 
 
 class L1SpecLoss(nn.Module):
-    """ L1 Loss over Spectrograms as described in HiFiGAN paper https://arxiv.org/pdf/2010.05646.pdf"""
+    """L1 Loss over Spectrograms as described in HiFiGAN paper https://arxiv.org/pdf/2010.05646.pdf"""
 
     def __init__(
         self, sample_rate, n_fft, hop_length, win_length, mel_fmin=None, mel_fmax=None, n_mels=None, use_mel=True
@@ -169,7 +169,7 @@ class MultiScaleSubbandSTFTLoss(MultiScaleSTFTLoss):
 
 
 class MSEGLoss(nn.Module):
-    """ Mean Squared Generator Loss """
+    """Mean Squared Generator Loss"""
 
     # pylint: disable=no-self-use
     def forward(self, score_real):
@@ -178,7 +178,7 @@ class MSEGLoss(nn.Module):
 
 
 class HingeGLoss(nn.Module):
-    """ Hinge Discriminator Loss """
+    """Hinge Discriminator Loss"""
 
     # pylint: disable=no-self-use
    def forward(self, score_real):
@@ -193,7 +193,7 @@ class HingeGLoss(nn.Module):
 
 
 class MSEDLoss(nn.Module):
-    """ Mean Squared Discriminator Loss """
+    """Mean Squared Discriminator Loss"""
 
     def __init__(
         self,
@@ -210,7 +210,7 @@ class MSEDLoss(nn.Module):
 
 
 class HingeDLoss(nn.Module):
-    """ Hinge Discriminator Loss """
+    """Hinge Discriminator Loss"""
 
     # pylint: disable=no-self-use
     def forward(self, score_fake, score_real):
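These classes implement the standard LSGAN (mean-squared) and hinge adversarial objectives for the vocoder. A minimal functional sketch of what such forward passes typically compute, assuming `score_real`/`score_fake` are raw discriminator outputs (this mirrors the textbook losses, not necessarily the exact bodies elided from the diff):

```python
import torch
import torch.nn.functional as F

def mse_g_loss(score_fake):
    # LSGAN generator objective: drive D's score on generated audio toward 1
    return F.mse_loss(score_fake, score_fake.new_ones(score_fake.shape))

def hinge_g_loss(score_fake):
    # hinge generator objective: raise the raw fake score
    return -score_fake.mean()

def mse_d_loss(score_fake, score_real):
    # LSGAN discriminator objective: real -> 1, fake -> 0
    return F.mse_loss(score_real, score_real.new_ones(score_real.shape)) + F.mse_loss(
        score_fake, score_fake.new_zeros(score_fake.shape)
    )

def hinge_d_loss(score_fake, score_real):
    # hinge discriminator objective, margin 1 on both branches
    return F.relu(1.0 - score_real).mean() + F.relu(1.0 + score_fake).mean()

fake, real = torch.randn(4, 1), torch.randn(4, 1)
print(mse_d_loss(fake, real).item(), hinge_d_loss(fake, real).item())
```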
@@ -260,7 +260,7 @@ def _apply_G_adv_loss(scores_fake, loss_func):
 
 
 def _apply_D_loss(scores_fake, scores_real, loss_func):
-    """ Compute D loss func and normalize loss values """
+    """Compute D loss func and normalize loss values"""
     loss = 0
     real_loss = 0
     fake_loss = 0
@@ -84,7 +84,7 @@ class Wavegrad(nn.Module):
 
     @torch.no_grad()
    def inference(self, x, y_n=None):
-        """ x: B x D X T """
+        """x: B x D X T"""
         if y_n is None:
             y_n = torch.randn(x.shape[0], 1, self.hop_len * x.shape[-1], dtype=torch.float32).to(x)
         else:
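When no starting waveform `y_n` is supplied, WaveGrad inference begins from Gaussian noise whose length is the conditioning spectrogram's frame count times the hop length. A shape-only sketch with assumed dimensions:

```python
import torch

hop_len = 256
x = torch.randn(2, 80, 100)  # conditioning mels: B x D x T = 2 x 80 x 100

# start denoising from pure noise: one audio sample per mel frame * hop
y_n = torch.randn(x.shape[0], 1, hop_len * x.shape[-1], dtype=torch.float32).to(x)
print(y_n.shape)             # torch.Size([2, 1, 25600])
```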
@@ -3,12 +3,12 @@ import tensorflow as tf
 
 
 def compare_torch_tf(torch_tensor, tf_tensor):
-    """ Compute the average absolute difference b/w torch and tf tensors """
+    """Compute the average absolute difference b/w torch and tf tensors"""
     return abs(torch_tensor.detach().numpy() - tf_tensor.numpy()).mean()
 
 
 def convert_tf_name(tf_name):
-    """ Convert certain patterns in TF layer names to Torch patterns """
+    """Convert certain patterns in TF layer names to Torch patterns"""
     tf_name_tmp = tf_name
     tf_name_tmp = tf_name_tmp.replace(":0", "")
     tf_name_tmp = tf_name_tmp.replace("/forward_lstm/lstm_cell_1/recurrent_kernel", "/weight_hh_l0")
@@ -22,7 +22,7 @@ def convert_tf_name(tf_name):
 
 
 def transfer_weights_torch_to_tf(tf_vars, var_map_dict, state_dict):
-    """ Transfer weigths from torch state_dict to TF variables """
+    """Transfer weigths from torch state_dict to TF variables"""
     print(" > Passing weights from Torch to TF ...")
     for tf_var in tf_vars:
         torch_var_name = var_map_dict[tf_var.name]
@@ -5,7 +5,7 @@ import tensorflow as tf
 
 
 def save_checkpoint(model, current_step, epoch, output_path, **kwargs):
-    """ Save TF Vocoder model """
+    """Save TF Vocoder model"""
     state = {
         "model": model.weights,
         "step": current_step,
@@ -17,7 +17,7 @@ def save_checkpoint(model, current_step, epoch, output_path, **kwargs):
 
 
 def load_checkpoint(model, checkpoint_path):
-    """ Load TF Vocoder model """
+    """Load TF Vocoder model"""
     checkpoint = pickle.load(open(checkpoint_path, "rb"))
     chkp_var_dict = {var.name: var.numpy() for var in checkpoint["model"]}
     tf_vars = model.weights
@@ -31,7 +31,7 @@ def sample_from_gaussian(y_hat, log_std_min=-7.0, scale_factor=1.0):
 
 
 def log_sum_exp(x):
-    """ numerically stable log_sum_exp implementation that prevents overflow """
+    """numerically stable log_sum_exp implementation that prevents overflow"""
     # TF ordering
     axis = len(x.size()) - 1
     m, _ = torch.max(x, dim=axis)
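The stability trick is the usual one: factor out the per-row maximum m so that exp never sees a large argument, using log Σ exp(x) = m + log Σ exp(x − m). A minimal sketch of the same computation, checked against PyTorch's built-in:

```python
import torch

def log_sum_exp_sketch(x):
    # numerically stable log(sum(exp(x))) over the last axis ("TF ordering")
    axis = len(x.size()) - 1
    m, _ = torch.max(x, dim=axis)
    m_keep, _ = torch.max(x, dim=axis, keepdim=True)
    return m + torch.log(torch.sum(torch.exp(x - m_keep), dim=axis))

x = torch.tensor([[1000.0, 1000.0]])    # naive exp(1000) would overflow to inf
print(log_sum_exp_sketch(x))            # tensor([1000.6931])
print(torch.logsumexp(x, dim=-1))       # built-in reference gives the same value
```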
@@ -30,7 +30,7 @@ def interpolate_vocoder_input(scale_factor, spec):
 
 
 def plot_results(y_hat, y, ap, global_step, name_prefix):
-    """ Plot vocoder model results """
+    """Plot vocoder model results"""
 
     # select an instance from batch
     y_hat = y_hat[0].squeeze(0).detach().cpu().numpy()
@ -1,7 +1,8 @@
|
|||
#!/usr/bin/env python3`
|
||||
import glob
|
||||
import os
|
||||
import shutil
|
||||
import glob
|
||||
|
||||
from tests import get_tests_output_path
|
||||
from TTS.utils.manage import ModelManager
|
||||
|
||||
|
@@ -15,6 +16,6 @@ def test_if_all_models_available():
         manager.download_model(model_name)
         print(f" | > OK: {model_name}")
 
-    folders = glob.glob(os.path.join(manager.output_prefix, '*'))
+    folders = glob.glob(os.path.join(manager.output_prefix, "*"))
     assert len(folders) == len(model_names)
     shutil.rmtree(manager.output_prefix)
@@ -55,7 +55,7 @@ class TestAudio(unittest.TestCase):
         _test(4.0, True, True, True)
 
     def test_normalize(self):
-        """Check normalization and denormalization for range values and consistency """
+        """Check normalization and denormalization for range values and consistency"""
         print(" > Testing normalization and denormalization.")
         wav = self.ap.load_wav(WAV_FILE)
         wav = self.ap.sound_norm(wav)  # normalize audio to get abetter normalization range below.
@@ -40,7 +40,7 @@ class TacotronTFTrainTest(unittest.TestCase):
         return chars_seq, chars_seq_lengths, mel_spec, mel_postnet_spec, mel_lengths, stop_targets, speaker_ids
 
     def test_train_step(self):
-        """ test forward pass """
+        """test forward pass"""
         (
             chars_seq,
             chars_seq_lengths,
@@ -22,7 +22,7 @@ ok_ljspeech = os.path.exists(test_data_path)
 def gan_dataset_case(
     batch_size, seq_len, hop_len, conv_pad, return_pairs, return_segments, use_noise_augment, use_cache, num_workers
 ):
-    """Run dataloader with given parameters and check conditions """
+    """Run dataloader with given parameters and check conditions"""
     ap = AudioProcessor(**C.audio)
     _, train_items = load_wav_data(test_data_path, 10)
     dataset = GANDataset(
@@ -90,7 +90,7 @@ def gan_dataset_case(
 
 
 def test_parametrized_gan_dataset():
-    """ test dataloader with different parameters """
+    """test dataloader with different parameters"""
     params = [
         [32, C.audio["hop_length"] * 10, C.audio["hop_length"], 0, True, True, False, True, 0],
         [32, C.audio["hop_length"] * 10, C.audio["hop_length"], 0, True, True, False, True, 4],
@@ -23,7 +23,7 @@ ok_ljspeech = os.path.exists(test_data_path)
 
 
 def wavernn_dataset_case(batch_size, seq_len, hop_len, pad, mode, mulaw, num_workers):
-    """ run dataloader with given parameters and check conditions """
+    """run dataloader with given parameters and check conditions"""
     ap = AudioProcessor(**C.audio)
 
     C.batch_size = batch_size
@@ -69,7 +69,7 @@ def wavernn_dataset_case(batch_size, seq_len, hop_len, pad, mode, mulaw, num_workers):
 
 
 def test_parametrized_wavernn_dataset():
-    """ test dataloader with different parameters """
+    """test dataloader with different parameters"""
     params = [
         [16, C.audio["hop_length"] * 10, C.audio["hop_length"], 2, 10, True, 0],
         [16, C.audio["hop_length"] * 10, C.audio["hop_length"], 2, "mold", False, 4],