mirror of https://github.com/coqui-ai/TTS.git
Remove SpeedySpeech from .models.json
parent
d97952611d
commit
26f76fce22
|
@ -47,15 +47,6 @@
|
|||
"license": "MPL",
|
||||
"contact": "egolge@coqui.com"
|
||||
},
|
||||
"speedy-speech-wn": {
|
||||
"description": "Speedy Speech model with wavenet decoder.",
|
||||
"github_rls_url": "https://github.com/coqui-ai/TTS/releases/download/v0.1.0/tts_models--en--ljspeech--speedy-speech-wn.zip",
|
||||
"default_vocoder": "vocoder_models/en/ljspeech/multiband-melgan",
|
||||
"commit": "77b6145",
|
||||
"author": "Eren Gölge @erogol",
|
||||
"license": "MPL",
|
||||
"contact": "egolge@coqui.com"
|
||||
},
|
||||
"vits": {
|
||||
"description": "VITS is an End2End TTS model trained on LJSpeech dataset with phonemes.",
|
||||
"github_rls_url": "https://github.com/coqui-ai/TTS/releases/download/v0.2.0/tts_models--en--ljspeech--vits.zip",
|
||||
|
|
|
@ -1,15 +1 @@
|
|||
from TTS.tts.layers.losses import *
|
||||
|
||||
|
||||
def setup_loss(config):
|
||||
if config.model.lower() in ["tacotron", "tacotron2"]:
|
||||
model = TacotronLoss(config)
|
||||
elif config.model.lower() == "glow_tts":
|
||||
model = GlowTTSLoss()
|
||||
elif config.model.lower() == "speedy_speech":
|
||||
model = SpeedySpeechLoss(config)
|
||||
elif config.model.lower() == "align_tts":
|
||||
model = AlignTTSLoss(config)
|
||||
else:
|
||||
raise ValueError(f" [!] loss for model {config.model.lower()} cannot be found.")
|
||||
return model
|
||||
|
|
|
@ -1,5 +1,4 @@
|
|||
import numpy as np
|
||||
import torch
|
||||
|
||||
|
||||
def _pad_data(x, length):
|
||||
|
|
|
@ -11,11 +11,11 @@ except ModuleNotFoundError:
|
|||
|
||||
|
||||
class StandardScaler:
|
||||
"""StandardScaler for mean-std normalization with the given mean and std values."""
|
||||
"""StandardScaler for mean-scale normalization with the given mean and scale values."""
|
||||
|
||||
def __init__(self, mean: np.ndarray = None, std: np.ndarray = None) -> None:
|
||||
def __init__(self, mean: np.ndarray = None, scale: np.ndarray = None) -> None:
|
||||
self.mean_ = mean
|
||||
self.std_ = std
|
||||
self.scale_ = scale
|
||||
|
||||
def set_stats(self, mean, scale):
|
||||
self.mean_ = mean
|
||||
|
|
|
@ -1,5 +1,3 @@
|
|||
import unittest
|
||||
|
||||
import torch as T
|
||||
|
||||
from TTS.tts.models.forward_tts import ForwardTTS, ForwardTTSArgs
|
||||
|
@ -54,12 +52,12 @@ def model_input_output_test():
|
|||
assert (outputs["x_mask"] - x_mask).sum() == 0.0
|
||||
assert (outputs["y_mask"] - y_mask).sum() == 0.0
|
||||
|
||||
assert outputs["alignment_soft"] == None
|
||||
assert outputs["alignment_mas"] == None
|
||||
assert outputs["alignment_logprob"] == None
|
||||
assert outputs["o_alignment_dur"] == None
|
||||
assert outputs["pitch_avg"] == None
|
||||
assert outputs["pitch_avg_gt"] == None
|
||||
assert outputs["alignment_soft"] is None
|
||||
assert outputs["alignment_mas"] is None
|
||||
assert outputs["alignment_logprob"] is None
|
||||
assert outputs["o_alignment_dur"] is None
|
||||
assert outputs["pitch_avg"] is None
|
||||
assert outputs["pitch_avg_gt"] is None
|
||||
|
||||
# USE PITCH
|
||||
model = ForwardTTS(ForwardTTSArgs(num_chars=10, use_pitch=True, use_aligner=False))
|
||||
|
@ -85,10 +83,10 @@ def model_input_output_test():
|
|||
assert outputs["pitch_avg"].shape == (2, 1, 21)
|
||||
assert outputs["pitch_avg_gt"].shape == (2, 1, 21)
|
||||
|
||||
assert outputs["alignment_soft"] == None
|
||||
assert outputs["alignment_mas"] == None
|
||||
assert outputs["alignment_logprob"] == None
|
||||
assert outputs["o_alignment_dur"] == None
|
||||
assert outputs["alignment_soft"] is None
|
||||
assert outputs["alignment_mas"] is None
|
||||
assert outputs["alignment_logprob"] is None
|
||||
assert outputs["o_alignment_dur"] is None
|
||||
|
||||
# USE ALIGNER NETWORK
|
||||
model = ForwardTTS(ForwardTTSArgs(num_chars=10, use_pitch=False, use_aligner=True))
|
||||
|
@ -116,8 +114,8 @@ def model_input_output_test():
|
|||
assert outputs["alignment_logprob"].shape == (2, 1, durations.sum(1).max(), 21)
|
||||
assert outputs["o_alignment_dur"].shape == (2, 21)
|
||||
|
||||
assert outputs["pitch_avg"] == None
|
||||
assert outputs["pitch_avg_gt"] == None
|
||||
assert outputs["pitch_avg"] is None
|
||||
assert outputs["pitch_avg_gt"] is None
|
||||
|
||||
# USE ALIGNER NETWORK AND PITCH
|
||||
model = ForwardTTS(ForwardTTSArgs(num_chars=10, use_pitch=True, use_aligner=True))
|
||||
|
|
|
@ -1,6 +1,6 @@
|
|||
import torch as T
|
||||
|
||||
from TTS.tts.utils.helpers import *
|
||||
from TTS.tts.utils.helpers import average_over_durations, generate_path, segment, sequence_mask
|
||||
|
||||
|
||||
def average_over_durations_test(): # pylint: disable=no-self-use
|
||||
|
@ -47,7 +47,7 @@ def generate_path_test():
|
|||
durations = durations * x_mask.squeeze(1)
|
||||
y_length = durations.sum(1)
|
||||
y_mask = sequence_mask(y_length).unsqueeze(1).long()
|
||||
attn_mask = (torch.unsqueeze(x_mask, -1) * torch.unsqueeze(y_mask, 2)).squeeze(1).long()
|
||||
attn_mask = (T.unsqueeze(x_mask, -1) * T.unsqueeze(y_mask, 2)).squeeze(1).long()
|
||||
print(attn_mask.shape)
|
||||
path = generate_path(durations, attn_mask)
|
||||
assert path.shape == (10, 21, durations.sum(1).max().item())
|
||||
|
|
Loading…
Reference in New Issue