mirror of https://github.com/coqui-ai/TTS.git
parent 483888b9d8
commit 69f080eb47
@@ -140,13 +140,13 @@ class DelightfulTTSConfig(BaseTTSConfig):
     d_vector_dim: int = None
 
     # testing
-    test_sentences: List[str] = field(
+    test_sentences: List[List[str]] = field(
         default_factory=lambda: [
-            "It took me quite a long time to develop a voice, and now that I have it I'm not going to be silent.",
-            "Be a voice, not an echo.",
-            "I'm sorry Dave. I'm afraid I can't do that.",
-            "This cake is great. It's so delicious and moist.",
-            "Prior to November 22, 1963.",
+            ["It took me quite a long time to develop a voice, and now that I have it I'm not going to be silent."],
+            ["Be a voice, not an echo."],
+            ["I'm sorry Dave. I'm afraid I can't do that."],
+            ["This cake is great. It's so delicious and moist."],
+            ["Prior to November 22, 1963."],
         ]
     )
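Note: this hunk changes test_sentences from List[str] to List[List[str]], wrapping each sentence in its own list so an entry can carry more than the text (the recipe hunks at the end of this commit pass a speaker name as a second element). A minimal sketch of the new shape, using a stand-in dataclass rather than the real DelightfulTTSConfig:

    from dataclasses import dataclass, field
    from typing import List

    @dataclass
    class ConfigSketch:  # stand-in for DelightfulTTSConfig, not the real class
        test_sentences: List[List[str]] = field(
            default_factory=lambda: [
                ["Be a voice, not an echo."],
                ["Prior to November 22, 1963."],
            ]
        )

    print(ConfigSketch().test_sentences[0][0])  # Be a voice, not an echo.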
@@ -602,6 +602,7 @@ def kss(root_path, meta_file, **kwargs): # pylint: disable=unused-argument
         items.append({"text": text, "audio_file": wav_file, "speaker_name": speaker_name, "root_path": root_path})
     return items
 
+
 def bel_tts_formatter(root_path, meta_file, **kwargs): # pylint: disable=unused-argument
     txt_file = os.path.join(root_path, meta_file)
     items = []
@@ -49,7 +49,7 @@ def id_to_torch(aux_id, cuda=False):
 def embedding_to_torch(d_vector, cuda=False):
     if d_vector is not None:
         d_vector = np.asarray(d_vector)
-        d_vector = torch.from_numpy(d_vector).type(torch.FloatTensor)
+        d_vector = torch.from_numpy(d_vector).float()
         d_vector = d_vector.squeeze().unsqueeze(0)
     if cuda:
         return d_vector.cuda()
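Note: for a CPU tensor, .type(torch.FloatTensor) and .float() produce the same float32 result, but .float() preserves the tensor's device instead of forcing a CPU tensor, which is why it is the safer spelling. A small runnable check of that equivalence:

    import torch

    x = torch.arange(3, dtype=torch.float64)
    a = x.type(torch.FloatTensor)  # old spelling: always a CPU float32 tensor
    b = x.float()                  # new spelling: float32, kept on the source device
    assert a.dtype == b.dtype == torch.float32
    print(torch.equal(a, b))  # True, since x lives on the CPU here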
@@ -1151,7 +1151,7 @@ class DelightfulTTS(BaseTTSE2E):
         if speaker_name is None:
             speaker_id = self.speaker_manager.get_random_id()
         else:
-            speaker_id = self.speaker_manager.ids[speaker_name]
+            speaker_id = self.speaker_manager.name_to_id[speaker_name]
 
         return {"text": text, "speaker_id": speaker_id, "style_wav": style_wav, "d_vector": d_vector}
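Note: the SpeakerManager attribute ids was renamed to the clearer name_to_id; both map speaker names to the integer IDs consumed by the embedding layer. A stand-in sketch of that mapping (not the real SpeakerManager, which builds it from dataset metadata):

    class SpeakerManagerSketch:
        def __init__(self, speaker_names):
            # sorted, de-duplicated names -> stable integer IDs
            self.name_to_id = {name: i for i, name in enumerate(sorted(set(speaker_names)))}

    sm = SpeakerManagerSketch(["p226", "p225", "p226"])
    print(sm.name_to_id)  # {'p225': 0, 'p226': 1}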
@@ -1208,17 +1208,16 @@ class DelightfulTTS(BaseTTSE2E):
 
         # set speaker inputs
         _speaker_id = None
-        if speaker_id is not None and (self.args.use_speaker_embedding or self.args.use_d_vector_file):
-            if speaker_id is not None and self.args.use_speaker_embedding:
+        if isinstance(speaker_id, str) and self.args.use_speaker_embedding:
             # get the speaker id for the speaker embedding layer
             _speaker_id = self.speaker_manager.name_to_id[speaker_id]
             _speaker_id = id_to_torch(_speaker_id, cuda=is_cuda)
-            else:
-                # get the average d_vector for the speaker
-                d_vector = self.speaker_manager.get_mean_embedding(speaker_id, num_samples=None, randomize=False)
 
-        if d_vector is not None and self.args.use_d_vector_file:
-            d_vector = embedding_to_torch(d_vector, cuda=is_cuda)
+        if speaker_id is not None and self.args.use_d_vector_file:
+            # get the average d_vector for the speaker
+            d_vector = self.speaker_manager.get_mean_embedding(speaker_id, num_samples=None, randomize=False)
+            d_vector = embedding_to_torch(d_vector, cuda=is_cuda)
 
         text_inputs = numpy_to_torch(text_inputs, torch.long, cuda=is_cuda)
         text_inputs = text_inputs.unsqueeze(0)
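Note: the rewritten block unnests the two speaker conditioning paths: an isinstance check gates the embedding-layer lookup (speaker name to integer ID), and use_d_vector_file independently triggers the mean d-vector lookup, so the d-vector path no longer depends on d_vector being set beforehand. A self-contained sketch of that control flow, with stubs standing in for the real args and speaker manager:

    import torch

    class ArgsStub:
        use_speaker_embedding = False
        use_d_vector_file = True

    class ManagerStub:
        name_to_id = {"mary": 0}
        def get_mean_embedding(self, name, num_samples=None, randomize=False):
            return torch.zeros(256)  # stand-in for the stored average d-vector

    args, manager = ArgsStub(), ManagerStub()
    speaker_id, _speaker_id, d_vector = "mary", None, None

    if isinstance(speaker_id, str) and args.use_speaker_embedding:
        # embedding-layer path: map the name to an integer ID
        _speaker_id = torch.tensor([manager.name_to_id[speaker_id]])

    if speaker_id is not None and args.use_d_vector_file:
        # d-vector path: fetch the speaker's average embedding
        d_vector = manager.get_mean_embedding(speaker_id, num_samples=None, randomize=False)

    print(_speaker_id, None if d_vector is None else tuple(d_vector.shape))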
@@ -1814,7 +1814,7 @@ class Vits(BaseTTS):
         # rollback values
         _forward = self.forward
         disc = None
-        if hasattr(self, 'disc'):
+        if hasattr(self, "disc"):
             disc = self.disc
         training = self.training
@@ -1916,7 +1916,7 @@ class Vits(BaseTTS):
                 "input_lengths": x_lengths,
                 "scales": scales,
                 "sid": None if speaker_id is None else torch.tensor([speaker_id]).cpu().numpy(),
-                "langid": None if language_id is None else torch.tensor([language_id]).cpu().numpy()
+                "langid": None if language_id is None else torch.tensor([language_id]).cpu().numpy(),
             },
         )
         return audio[0][0]
@@ -164,6 +164,7 @@ def multilingual_cleaners(text):
     text = collapse_whitespace(text)
     return text
 
+
 def no_cleaners(text):
     # remove newline characters
     text = text.replace("\n", "")
@@ -1,7 +1,8 @@
-from train_glowtts import config
 import json
 import re
 
+from train_glowtts import config
+
 s = json.dumps(config, default=vars, indent=2)
-s = re.sub(r'"test_sentences":\s*\[\],', '', s)
+s = re.sub(r'"test_sentences":\s*\[\],', "", s)
 print(s)
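Note: besides the import reorder (third-party imports first, then the local train_glowtts import), this script strips an empty test_sentences entry from the JSON dump via re.sub. A self-contained illustration with a toy dict standing in for the imported config:

    import json
    import re

    config = {"test_sentences": [], "run_name": "glow-tts-bel"}  # toy stand-in

    s = json.dumps(config, indent=2)
    s = re.sub(r'"test_sentences":\s*\[\],', "", s)
    print(s)  # the empty "test_sentences" entry is gone; "run_name" remains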
@@ -3,30 +3,31 @@ import os
 # Trainer: Where the ✨️ happens.
 # TrainingArgs: Defines the set of arguments of the Trainer.
 from trainer import Trainer, TrainerArgs
-from TTS.tts.configs.shared_configs import BaseAudioConfig
 
 # GlowTTSConfig: all model related values for training, validating and testing.
 from TTS.tts.configs.glow_tts_config import GlowTTSConfig
 
 # BaseDatasetConfig: defines name, formatter and path of the dataset.
-from TTS.tts.configs.shared_configs import BaseDatasetConfig, CharactersConfig
+from TTS.tts.configs.shared_configs import BaseAudioConfig, BaseDatasetConfig, CharactersConfig
 from TTS.tts.datasets import load_tts_samples
 from TTS.tts.models.glow_tts import GlowTTS
 from TTS.tts.utils.text.tokenizer import TTSTokenizer
 from TTS.utils.audio import AudioProcessor
 
 # we use the same path as this script as our training folder.
-output_path = '/storage/output-glowtts/'
+output_path = "/storage/output-glowtts/"
 
 
 # DEFINE DATASET CONFIG
 # Set LJSpeech as our target dataset and define its path.
 # You can also use a simple Dict to define the dataset and pass it to your custom formatter.
 dataset_config = BaseDatasetConfig(
-    formatter="bel_tts_formatter", meta_file_train="ipa_final_dataset.csv", path=os.path.join(output_path, "/storage/filtered_dataset/")
+    formatter="bel_tts_formatter",
+    meta_file_train="ipa_final_dataset.csv",
+    path=os.path.join(output_path, "/storage/filtered_dataset/"),
 )
 
-characters=CharactersConfig(
+characters = CharactersConfig(
     characters_class="TTS.tts.utils.text.characters.Graphemes",
     pad="_",
     eos="~",
@@ -71,22 +72,21 @@ config = GlowTTSConfig(
 )
 
 if __name__ == "__main__":
-
-    # INITIALIZE THE AUDIO PROCESSOR
-    # Audio processor is used for feature extraction and audio I/O.
-    # It mainly serves to the dataloader and the training loggers.
+    # INITIALIZE THE AUDIO PROCESSOR
+    # Audio processor is used for feature extraction and audio I/O.
+    # It mainly serves to the dataloader and the training loggers.
     ap = AudioProcessor.init_from_config(config)
 
-    # INITIALIZE THE TOKENIZER
-    # Tokenizer is used to convert text to sequences of token IDs.
-    # If characters are not defined in the config, default characters are passed to the config
+    # INITIALIZE THE TOKENIZER
+    # Tokenizer is used to convert text to sequences of token IDs.
+    # If characters are not defined in the config, default characters are passed to the config
     tokenizer, config = TTSTokenizer.init_from_config(config)
 
-    # LOAD DATA SAMPLES
-    # Each sample is a list of ```[text, audio_file_path, speaker_name]```
-    # You can define your custom sample loader returning the list of samples.
-    # Or define your custom formatter and pass it to the `load_tts_samples`.
-    # Check `TTS.tts.datasets.load_tts_samples` for more details.
+    # LOAD DATA SAMPLES
+    # Each sample is a list of ```[text, audio_file_path, speaker_name]```
+    # You can define your custom sample loader returning the list of samples.
+    # Or define your custom formatter and pass it to the `load_tts_samples`.
+    # Check `TTS.tts.datasets.load_tts_samples` for more details.
     train_samples, eval_samples = load_tts_samples(
         dataset_config,
         eval_split=True,
@@ -94,18 +94,18 @@ if __name__ == "__main__":
         eval_split_size=config.eval_split_size,
     )
 
-    # INITIALIZE THE MODEL
-    # Models take a config object and a speaker manager as input
-    # Config defines the details of the model like the number of layers, the size of the embedding, etc.
-    # Speaker manager is used by multi-speaker models.
+    # INITIALIZE THE MODEL
+    # Models take a config object and a speaker manager as input
+    # Config defines the details of the model like the number of layers, the size of the embedding, etc.
+    # Speaker manager is used by multi-speaker models.
     model = GlowTTS(config, ap, tokenizer, speaker_manager=None)
 
-    # INITIALIZE THE TRAINER
-    # Trainer provides a generic API to train all the 🐸TTS models with all its perks like mixed-precision training,
-    # distributed training, etc.
+    # INITIALIZE THE TRAINER
+    # Trainer provides a generic API to train all the 🐸TTS models with all its perks like mixed-precision training,
+    # distributed training, etc.
     trainer = Trainer(
         TrainerArgs(), config, output_path, model=model, train_samples=train_samples, eval_samples=eval_samples
     )
 
-    # AND... 3,2,1... 🚀
+    # AND... 3,2,1... 🚀
     trainer.fit()
@@ -1,15 +1,15 @@
 import os
 
-from trainer import Trainer, TrainerArgs
-from TTS.tts.configs.shared_configs import BaseAudioConfig
 from coqpit import Coqpit
+from trainer import Trainer, TrainerArgs
 
+from TTS.tts.configs.shared_configs import BaseAudioConfig
 from TTS.utils.audio import AudioProcessor
-from TTS.vocoder.configs.hifigan_config import *;
+from TTS.vocoder.configs.hifigan_config import *
 from TTS.vocoder.datasets.preprocess import load_wav_data
 from TTS.vocoder.models.gan import GAN
 
-output_path = '/storage/output-hifigan/'
+output_path = "/storage/output-hifigan/"
 
 audio_config = BaseAudioConfig(
     mel_fmin=50,
@@ -5,7 +5,7 @@ from trainer import Trainer, TrainerArgs
 from TTS.config.shared_configs import BaseDatasetConfig
 from TTS.tts.configs.delightful_tts_config import DelightfulTtsAudioConfig, DelightfulTTSConfig
 from TTS.tts.datasets import load_tts_samples
-from TTS.tts.models.delightful_tts import DelightfulTtsArgs, DelightfulTTS, VocoderConfig
+from TTS.tts.models.delightful_tts import DelightfulTTS, DelightfulTtsArgs, VocoderConfig
 from TTS.tts.utils.speakers import SpeakerManager
 from TTS.tts.utils.text.tokenizer import TTSTokenizer
 from TTS.utils.audio.processor import AudioProcessor
@@ -14,7 +14,9 @@ data_path = "/raid/datasets/vctk_v092_48khz_removed_silence_silero_vad"
 output_path = os.path.dirname(os.path.abspath(__file__))
 
 
-dataset_config = BaseDatasetConfig(dataset_name="vctk", formatter="vctk", meta_file_train="", path=data_path, language="en-us")
+dataset_config = BaseDatasetConfig(
+    dataset_name="vctk", formatter="vctk", meta_file_train="", path=data_path, language="en-us"
+)
 
 audio_config = DelightfulTtsAudioConfig()
 
@@ -73,9 +75,7 @@ speaker_manager.set_ids_from_data(train_samples + eval_samples, parse_key="speak
 config.model_args.num_speakers = speaker_manager.num_speakers
 
 
-model = DelightfulTTS(
-    ap=ap, config=config, tokenizer=tokenizer, speaker_manager=speaker_manager, emotion_manager=None
-)
+model = DelightfulTTS(ap=ap, config=config, tokenizer=tokenizer, speaker_manager=speaker_manager, emotion_manager=None)
 
 trainer = Trainer(
     TrainerArgs(), config, output_path, model=model, train_samples=train_samples, eval_samples=eval_samples
@@ -39,7 +39,9 @@ config = DelightfulTTSConfig(
     print_eval=True,
     binary_align_loss_alpha=0.0,
     use_attn_priors=False,
-    test_sentences=["Be a voice, not an echo."],
+    test_sentences=[
+        ["Be a voice, not an echo.", "ljspeech-0"],
+    ],
     output_path=output_path,
     use_speaker_embedding=False,
     use_d_vector_file=True,
@@ -37,7 +37,9 @@ config = DelightfulTTSConfig(
     print_eval=True,
     binary_align_loss_alpha=0.0,
     use_attn_priors=False,
-    test_sentences=["Be a voice, not an echo."],
+    test_sentences=[
+        ["Be a voice, not an echo.", "ljspeech"],
+    ],
     output_path=output_path,
     num_speakers=4,
     use_speaker_embedding=True,
@@ -51,7 +51,7 @@ config = DelightfulTTSConfig(
     use_attn_priors=False,
     print_eval=True,
     test_sentences=[
-        "Be a voice, not an echo.",
+        ["Be a voice, not an echo."],
     ],
     use_speaker_embedding=False,
 )
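Note: taken together, the recipe hunks show the entry shapes test_sentences accepts under the new List[List[str]] schema; the optional second element is a speaker name, used only by multi-speaker configs. A short sketch reusing the recipes' own values:

    test_sentences = [
        ["Be a voice, not an echo."],                # single-speaker recipe
        ["Be a voice, not an echo.", "ljspeech"],    # speaker-embedding recipe
        ["Be a voice, not an echo.", "ljspeech-0"],  # d-vector recipe
    ]
    for entry in test_sentences:
        text, speaker = entry[0], entry[1] if len(entry) > 1 else None
        print(f"{text!r} -> speaker={speaker}")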