diff --git a/TTS/.models.json b/TTS/.models.json index 6f763840..aae2a1c2 100644 --- a/TTS/.models.json +++ b/TTS/.models.json @@ -47,15 +47,6 @@ "license": "MPL", "contact": "egolge@coqui.com" }, - "speedy-speech-wn": { - "description": "Speedy Speech model with wavenet decoder.", - "github_rls_url": "https://github.com/coqui-ai/TTS/releases/download/v0.1.0/tts_models--en--ljspeech--speedy-speech-wn.zip", - "default_vocoder": "vocoder_models/en/ljspeech/multiband-melgan", - "commit": "77b6145", - "author": "Eren Gölge @erogol", - "license": "MPL", - "contact": "egolge@coqui.com" - }, "vits": { "description": "VITS is an End2End TTS model trained on LJSpeech dataset with phonemes.", "github_rls_url": "https://github.com/coqui-ai/TTS/releases/download/v0.2.0/tts_models--en--ljspeech--vits.zip", diff --git a/TTS/tts/layers/__init__.py b/TTS/tts/layers/__init__.py index 78f56a5d..f93efdb7 100644 --- a/TTS/tts/layers/__init__.py +++ b/TTS/tts/layers/__init__.py @@ -1,15 +1 @@ from TTS.tts.layers.losses import * - - -def setup_loss(config): - if config.model.lower() in ["tacotron", "tacotron2"]: - model = TacotronLoss(config) - elif config.model.lower() == "glow_tts": - model = GlowTTSLoss() - elif config.model.lower() == "speedy_speech": - model = SpeedySpeechLoss(config) - elif config.model.lower() == "align_tts": - model = AlignTTSLoss(config) - else: - raise ValueError(f" [!] loss for model {config.model.lower()} cannot be found.") - return model diff --git a/TTS/tts/utils/data.py b/TTS/tts/utils/data.py index d91a828e..b0d88740 100644 --- a/TTS/tts/utils/data.py +++ b/TTS/tts/utils/data.py @@ -1,5 +1,4 @@ import numpy as np -import torch def _pad_data(x, length): diff --git a/TTS/tts/utils/helpers.py b/TTS/tts/utils/helpers.py index 76abf2bc..b0a010b0 100644 --- a/TTS/tts/utils/helpers.py +++ b/TTS/tts/utils/helpers.py @@ -11,11 +11,11 @@ except ModuleNotFoundError: class StandardScaler: - """StandardScaler for mean-std normalization with the given mean and std values.""" + """StandardScaler for mean-scale normalization with the given mean and scale values.""" - def __init__(self, mean: np.ndarray = None, std: np.ndarray = None) -> None: + def __init__(self, mean: np.ndarray = None, scale: np.ndarray = None) -> None: self.mean_ = mean - self.std_ = std + self.scale_ = scale def set_stats(self, mean, scale): self.mean_ = mean diff --git a/tests/tts_tests/test_forward_tts.py b/tests/tts_tests/test_forward_tts.py index 9bb60f48..cec0f211 100644 --- a/tests/tts_tests/test_forward_tts.py +++ b/tests/tts_tests/test_forward_tts.py @@ -1,5 +1,3 @@ -import unittest - import torch as T from TTS.tts.models.forward_tts import ForwardTTS, ForwardTTSArgs @@ -54,12 +52,12 @@ def model_input_output_test(): assert (outputs["x_mask"] - x_mask).sum() == 0.0 assert (outputs["y_mask"] - y_mask).sum() == 0.0 - assert outputs["alignment_soft"] == None - assert outputs["alignment_mas"] == None - assert outputs["alignment_logprob"] == None - assert outputs["o_alignment_dur"] == None - assert outputs["pitch_avg"] == None - assert outputs["pitch_avg_gt"] == None + assert outputs["alignment_soft"] is None + assert outputs["alignment_mas"] is None + assert outputs["alignment_logprob"] is None + assert outputs["o_alignment_dur"] is None + assert outputs["pitch_avg"] is None + assert outputs["pitch_avg_gt"] is None # USE PITCH model = ForwardTTS(ForwardTTSArgs(num_chars=10, use_pitch=True, use_aligner=False)) @@ -85,10 +83,10 @@ def model_input_output_test(): assert outputs["pitch_avg"].shape == (2, 1, 21) assert outputs["pitch_avg_gt"].shape == (2, 1, 21) - assert outputs["alignment_soft"] == None - assert outputs["alignment_mas"] == None - assert outputs["alignment_logprob"] == None - assert outputs["o_alignment_dur"] == None + assert outputs["alignment_soft"] is None + assert outputs["alignment_mas"] is None + assert outputs["alignment_logprob"] is None + assert outputs["o_alignment_dur"] is None # USE ALIGNER NETWORK model = ForwardTTS(ForwardTTSArgs(num_chars=10, use_pitch=False, use_aligner=True)) @@ -116,8 +114,8 @@ def model_input_output_test(): assert outputs["alignment_logprob"].shape == (2, 1, durations.sum(1).max(), 21) assert outputs["o_alignment_dur"].shape == (2, 21) - assert outputs["pitch_avg"] == None - assert outputs["pitch_avg_gt"] == None + assert outputs["pitch_avg"] is None + assert outputs["pitch_avg_gt"] is None # USE ALIGNER NETWORK AND PITCH model = ForwardTTS(ForwardTTSArgs(num_chars=10, use_pitch=True, use_aligner=True)) diff --git a/tests/tts_tests/test_helpers.py b/tests/tts_tests/test_helpers.py index 0aac5473..6a2f260d 100644 --- a/tests/tts_tests/test_helpers.py +++ b/tests/tts_tests/test_helpers.py @@ -1,6 +1,6 @@ import torch as T -from TTS.tts.utils.helpers import * +from TTS.tts.utils.helpers import average_over_durations, generate_path, segment, sequence_mask def average_over_durations_test(): # pylint: disable=no-self-use @@ -47,7 +47,7 @@ def generate_path_test(): durations = durations * x_mask.squeeze(1) y_length = durations.sum(1) y_mask = sequence_mask(y_length).unsqueeze(1).long() - attn_mask = (torch.unsqueeze(x_mask, -1) * torch.unsqueeze(y_mask, 2)).squeeze(1).long() + attn_mask = (T.unsqueeze(x_mask, -1) * T.unsqueeze(y_mask, 2)).squeeze(1).long() print(attn_mask.shape) path = generate_path(durations, attn_mask) assert path.shape == (10, 21, durations.sum(1).max().item())