Training recipes for thorsten dataset (#1020)

* Fix style

* Fix isort

* Remove tensorboardX from requirements

Co-authored-by: logan hart <72301874+loganhart420@users.noreply.github.com>
Co-authored-by: Eren Gölge <egolge@coqui.ai>
pull/1675/head speaker_encoder_model
Noran Raskin 2022-05-30 12:07:31 +02:00 committed by GitHub
parent 71111d14e4
commit a790df4e94
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
17 changed files with 822 additions and 4 deletions

View File

@ -62,4 +62,4 @@ jobs:
platforms: linux/${{ matrix.arch }}
push: ${{ github.event_name == 'push' }}
build-args: "BASE=${{ matrix.base }}"
tags: ${{ steps.compute-tag.outputs.tags }}
tags: ${{ steps.compute-tag.outputs.tags }}

2
.gitignore vendored
View File

@ -166,4 +166,4 @@ internal/*
*_phoneme.npy
wandb
depot/*
coqui_recipes/*
coqui_recipes/*

View File

@ -17,4 +17,4 @@ RUN ["/bin/bash", "-c", "pip install -r <(cat requirements.txt requirements.dev.
COPY . /root
RUN make install
ENTRYPOINT ["tts"]
CMD ["--help"]
CMD ["--help"]

View File

@ -141,6 +141,21 @@ def ljspeech_test(root_path, meta_file, **kwargs): # pylint: disable=unused-arg
return items
def thorsten(root_path, meta_file, **kwargs):  # pylint: disable=unused-argument
    """Normalize the thorsten meta data file to TTS format.

    https://github.com/thorstenMueller/deep-learning-german-tts/

    Every non-blank line of the metadata file is split on ``|``. The first field
    is the wav file stem, resolved to ``<root_path>/wavs/<stem>.wav``, and the
    second field is the transcript. Any further fields are ignored.

    Args:
        root_path: Dataset root folder holding the metadata file and ``wavs``.
        meta_file: Metadata file name, relative to ``root_path``.
        **kwargs: Accepted and ignored.

    Returns:
        A list of dicts with the keys ``"text"``, ``"audio_file"`` and
        ``"speaker_name"`` (always ``"thorsten"``).

    Raises:
        IndexError: If a non-blank line contains no ``|`` separator.
    """
    txt_file = os.path.join(root_path, meta_file)
    items = []
    speaker_name = "thorsten"
    with open(txt_file, "r", encoding="utf-8") as ttf:
        for line in ttf:
            # Drop the line terminator so a two-column row does not carry a
            # trailing newline into the transcript.
            line = line.rstrip("\r\n")
            # Blank lines (e.g. a trailing empty line) carry no sample.
            if not line.strip():
                continue
            cols = line.split("|")
            wav_file = os.path.join(root_path, "wavs", cols[0] + ".wav")
            text = cols[1]
            items.append({"text": text, "audio_file": wav_file, "speaker_name": speaker_name})
    return items
def sam_accenture(root_path, meta_file, **kwargs): # pylint: disable=unused-argument
"""Normalizes the sam-accenture meta data file to TTS format
https://github.com/Sam-Accenture-Non-Binary-Voice/non-binary-voice-files"""

View File

@ -0,0 +1,15 @@
# 🐸💬 TTS Thorsten Recipes
For running the recipes you need the [Thorsten-Voice](https://github.com/thorstenMueller/Thorsten-Voice) dataset.
You can download it manually from [the official website](https://www.thorsten-voice.de/) or use ```download_thorsten_de.sh```. Alternatively, running any of the **train_modelX.py** scripts will download the dataset if it is not already present.
Then, go to your desired model folder and run the training.
Run the Python training file of your chosen model. (Choose the desired GPU ID for your run and set ```CUDA_VISIBLE_DEVICES```.)
```terminal
CUDA_VISIBLE_DEVICES="0" python train_modelX.py
```
💡 Note that these runs are just templates to help you start training your first model. They are not optimized for the best
result. Double-check the configurations and feel free to share your experiments to find better parameters together 💪.

View File

@ -0,0 +1,84 @@
import os
from trainer import Trainer, TrainerArgs
from TTS.tts.configs.align_tts_config import AlignTTSConfig
from TTS.tts.configs.shared_configs import BaseDatasetConfig
from TTS.tts.datasets import load_tts_samples
from TTS.tts.models.align_tts import AlignTTS
from TTS.tts.utils.text.tokenizer import TTSTokenizer
from TTS.utils.audio import AudioProcessor
from TTS.utils.downloaders import download_thorsten_de
# The folder holding this script doubles as the training output folder.
recipe_dir = os.path.dirname(os.path.abspath(__file__))

# DATASET CONFIG
# The dataset is looked up in a `thorsten-de` folder next to this recipe folder.
dataset_config = BaseDatasetConfig(
    name="thorsten",
    meta_file_train="metadata.csv",
    path=os.path.join(recipe_dir, "../thorsten-de/"),
)

# Download the dataset into the parent of the dataset folder when it is missing.
dataset_root = dataset_config.path
if not os.path.exists(dataset_root):
    print("Downloading dataset")
    download_thorsten_de(os.path.dirname(os.path.abspath(dataset_root)))

# Sentences handed to the config as `test_sentences`.
test_sentences = [
    "Es hat mich viel Zeit gekostet ein Stimme zu entwickeln, jetzt wo ich sie habe werde ich nicht mehr schweigen.",
    "Sei eine Stimme, kein Echo.",
    "Es tut mir Leid David. Das kann ich leider nicht machen.",
    "Dieser Kuchen ist großartig. Er ist so lecker und feucht.",
    "Vor dem 22. November 1963.",
]

# TRAINING CONFIGURATION
config = AlignTTSConfig(
    batch_size=32,
    eval_batch_size=16,
    num_loader_workers=4,
    num_eval_loader_workers=4,
    run_eval=True,
    test_delay_epochs=-1,
    epochs=1000,
    text_cleaner="phoneme_cleaners",
    use_phonemes=False,
    phoneme_language="de",
    phoneme_cache_path=os.path.join(recipe_dir, "phoneme_cache"),
    print_step=25,
    print_eval=True,
    mixed_precision=False,
    test_sentences=test_sentences,
    output_path=recipe_dir,
    datasets=[dataset_config],
)

# AUDIO PROCESSOR
# Handles feature extraction and audio I/O for the dataloader and the training loggers.
audio_processor = AudioProcessor.init_from_config(config)

# TOKENIZER
# Converts text to sequences of token IDs; it also returns the (possibly updated) config.
tokenizer, config = TTSTokenizer.init_from_config(config)

# DATA SAMPLES
# `load_tts_samples` returns the train and eval splits.
# See `TTS.tts.datasets.load_tts_samples` for custom loaders and formatters.
train_samples, eval_samples = load_tts_samples(
    dataset_config,
    eval_split=True,
    eval_split_max_size=config.eval_split_max_size,
    eval_split_size=config.eval_split_size,
)

# MODEL
model = AlignTTS(config, audio_processor, tokenizer)

# TRAINER
# Build the trainer and start training right away. 🚀
Trainer(
    TrainerArgs(),
    config,
    recipe_dir,
    model=model,
    train_samples=train_samples,
    eval_samples=eval_samples,
).fit()

View File

@ -0,0 +1,21 @@
# Download the Thorsten_DE dataset into ./thorsten-de using a temporary virtualenv.

# create venv
python3 -m venv env
# activate the venv created above (it lives in ./env, not ./.env)
source env/bin/activate
pip install pip --upgrade

# download Thorsten_DE dataset
pip install gdown
gdown --id 1yKJM1LAOQpRVojKunD9r8WN_p5KzBxjc -O dataset.tgz
tar -xzf dataset.tgz

# create train-val splits: shuffle the metadata, then take the first 20668 lines
# for training and the last 2000 lines for validation
shuf LJSpeech-1.1/metadata.csv > LJSpeech-1.1/metadata_shuf.csv
head -n 20668 LJSpeech-1.1/metadata_shuf.csv > LJSpeech-1.1/metadata_train.csv
tail -n 2000 LJSpeech-1.1/metadata_shuf.csv > LJSpeech-1.1/metadata_val.csv

# rename dataset and remove archive
mv LJSpeech-1.1 thorsten-de
rm dataset.tgz

# leave the venv before destroying it
deactivate
rm -rf env

View File

@ -0,0 +1,97 @@
import os
# Trainer: Where the ✨️ happens.
# TrainerArgs: Defines the set of arguments of the Trainer.
from trainer import Trainer, TrainerArgs
# GlowTTSConfig: all model related values for training, validating and testing.
from TTS.tts.configs.glow_tts_config import GlowTTSConfig
# BaseDatasetConfig: defines name, formatter and path of the dataset.
from TTS.tts.configs.shared_configs import BaseDatasetConfig
from TTS.tts.datasets import load_tts_samples
from TTS.tts.models.glow_tts import GlowTTS
from TTS.tts.utils.text.tokenizer import TTSTokenizer
from TTS.utils.audio import AudioProcessor
from TTS.utils.downloaders import download_thorsten_de
# The folder holding this script doubles as the training output folder.
recipe_dir = os.path.dirname(os.path.abspath(__file__))

# DATASET CONFIG
# Point the recipe at the Thorsten dataset, expected in a `thorsten-de` folder
# next to this recipe folder.
# A plain dict passed to a custom formatter works as well.
dataset_config = BaseDatasetConfig(
    name="thorsten",
    meta_file_train="metadata.csv",
    path=os.path.join(recipe_dir, "../thorsten-de/"),
)

# Download the dataset into the parent of the dataset folder when it is missing.
dataset_root = dataset_config.path
if not os.path.exists(dataset_root):
    print("Downloading dataset")
    download_thorsten_de(os.path.dirname(os.path.abspath(dataset_root)))

# Sentences handed to the config as `test_sentences`.
test_sentences = [
    "Es hat mich viel Zeit gekostet ein Stimme zu entwickeln, jetzt wo ich sie habe werde ich nicht mehr schweigen.",
    "Sei eine Stimme, kein Echo.",
    "Es tut mir Leid David. Das kann ich leider nicht machen.",
    "Dieser Kuchen ist großartig. Er ist so lecker und feucht.",
    "Vor dem 22. November 1963.",
]

# TRAINING CONFIGURATION
# Every config class inherits the BaseTTSConfig.
config = GlowTTSConfig(
    batch_size=32,
    eval_batch_size=16,
    num_loader_workers=4,
    num_eval_loader_workers=4,
    run_eval=True,
    test_delay_epochs=-1,
    epochs=1000,
    text_cleaner="phoneme_cleaners",
    use_phonemes=True,
    phoneme_language="de",
    phoneme_cache_path=os.path.join(recipe_dir, "phoneme_cache"),
    print_step=25,
    print_eval=False,
    mixed_precision=True,
    test_sentences=test_sentences,
    output_path=recipe_dir,
    datasets=[dataset_config],
)

# AUDIO PROCESSOR
# Handles feature extraction and audio I/O for the dataloader and the training loggers.
audio_processor = AudioProcessor.init_from_config(config)

# TOKENIZER
# Converts text to sequences of token IDs; it also returns the (possibly updated) config.
tokenizer, config = TTSTokenizer.init_from_config(config)

# DATA SAMPLES
# `load_tts_samples` returns the train and eval splits.
# See `TTS.tts.datasets.load_tts_samples` for custom loaders and formatters.
train_samples, eval_samples = load_tts_samples(
    dataset_config,
    eval_split=True,
    eval_split_max_size=config.eval_split_max_size,
    eval_split_size=config.eval_split_size,
)

# MODEL
# No speaker manager is passed to the model in this recipe.
model = GlowTTS(config, audio_processor, tokenizer, speaker_manager=None)

# TRAINER
# Build the trainer and start training right away. 🚀
Trainer(
    TrainerArgs(),
    config,
    recipe_dir,
    model=model,
    train_samples=train_samples,
    eval_samples=eval_samples,
).fit()

View File

@ -0,0 +1,53 @@
import os
from trainer import Trainer, TrainerArgs
from TTS.utils.audio import AudioProcessor
from TTS.utils.downloaders import download_thorsten_de
from TTS.vocoder.configs import HifiganConfig
from TTS.vocoder.datasets.preprocess import load_wav_data
from TTS.vocoder.models.gan import GAN
# The folder holding this script doubles as the training output folder.
recipe_dir = os.path.dirname(os.path.abspath(__file__))

# TRAINING CONFIGURATION
# The wav files are looked up in `thorsten-de/wavs` next to this recipe folder.
config = HifiganConfig(
    batch_size=32,
    eval_batch_size=16,
    num_loader_workers=4,
    num_eval_loader_workers=4,
    run_eval=True,
    test_delay_epochs=5,
    epochs=1000,
    seq_len=8192,
    pad_short=2000,
    use_noise_augment=True,
    eval_split_size=10,
    print_step=25,
    print_eval=False,
    mixed_precision=False,
    lr_gen=1e-4,
    lr_disc=1e-4,
    data_path=os.path.join(recipe_dir, "../thorsten-de/wavs/"),
    output_path=recipe_dir,
)

# Download the dataset two levels above the wav folder when that folder is missing.
wav_dir = config.data_path
if not os.path.exists(wav_dir):
    print("Downloading dataset")
    download_thorsten_de(os.path.abspath(os.path.join(os.path.abspath(wav_dir), os.pardir, os.pardir)))

# AUDIO PROCESSOR
audio_processor = AudioProcessor(**config.audio.to_dict())

# DATA SAMPLES
# The first value returned by `load_wav_data` is bound to the eval split, the second to the train split.
eval_samples, train_samples = load_wav_data(config.data_path, config.eval_split_size)

# MODEL
model = GAN(config, audio_processor)

# TRAINER
# Build the trainer and start training right away. 🚀
Trainer(
    TrainerArgs(),
    config,
    recipe_dir,
    model=model,
    train_samples=train_samples,
    eval_samples=eval_samples,
).fit()

View File

@ -0,0 +1,53 @@
import os
from trainer import Trainer, TrainerArgs
from TTS.utils.audio import AudioProcessor
from TTS.utils.downloaders import download_thorsten_de
from TTS.vocoder.configs import MultibandMelganConfig
from TTS.vocoder.datasets.preprocess import load_wav_data
from TTS.vocoder.models.gan import GAN
# The folder holding this script doubles as the training output folder.
recipe_dir = os.path.dirname(os.path.abspath(__file__))

# TRAINING CONFIGURATION
# The wav files are looked up in `thorsten-de/wavs` next to this recipe folder.
config = MultibandMelganConfig(
    batch_size=32,
    eval_batch_size=16,
    num_loader_workers=4,
    num_eval_loader_workers=4,
    run_eval=True,
    test_delay_epochs=5,
    epochs=1000,
    seq_len=8192,
    pad_short=2000,
    use_noise_augment=True,
    eval_split_size=10,
    print_step=25,
    print_eval=False,
    mixed_precision=False,
    lr_gen=1e-4,
    lr_disc=1e-4,
    data_path=os.path.join(recipe_dir, "../thorsten-de/wavs/"),
    output_path=recipe_dir,
)

# Download the dataset two levels above the wav folder when that folder is missing.
wav_dir = config.data_path
if not os.path.exists(wav_dir):
    print("Downloading dataset")
    download_thorsten_de(os.path.abspath(os.path.join(os.path.abspath(wav_dir), os.pardir, os.pardir)))

# AUDIO PROCESSOR
audio_processor = AudioProcessor(**config.audio.to_dict())

# DATA SAMPLES
# The first value returned by `load_wav_data` is bound to the eval split, the second to the train split.
eval_samples, train_samples = load_wav_data(config.data_path, config.eval_split_size)

# MODEL
model = GAN(config, audio_processor)

# TRAINER
# Build the trainer and start training right away. 🚀
Trainer(
    TrainerArgs(),
    config,
    recipe_dir,
    model=model,
    train_samples=train_samples,
    eval_samples=eval_samples,
).fit()

View File

@ -0,0 +1,102 @@
import os
from trainer import Trainer, TrainerArgs
from TTS.config import BaseAudioConfig, BaseDatasetConfig
from TTS.tts.configs.speedy_speech_config import SpeedySpeechConfig
from TTS.tts.datasets import load_tts_samples
from TTS.tts.models.forward_tts import ForwardTTS
from TTS.tts.utils.text.tokenizer import TTSTokenizer
from TTS.utils.audio import AudioProcessor
from TTS.utils.downloaders import download_thorsten_de
# The folder holding this script doubles as the training output folder.
recipe_dir = os.path.dirname(os.path.abspath(__file__))

# DATASET CONFIG
# The dataset is looked up in a `thorsten-de` folder next to this recipe folder.
dataset_config = BaseDatasetConfig(
    name="thorsten",
    meta_file_train="metadata.csv",
    path=os.path.join(recipe_dir, "../thorsten-de/"),
)

# Download the dataset into the parent of the dataset folder when it is missing.
dataset_root = dataset_config.path
if not os.path.exists(dataset_root):
    print("Downloading dataset")
    download_thorsten_de(os.path.dirname(os.path.abspath(dataset_root)))

# AUDIO CONFIG
audio_config = BaseAudioConfig(
    sample_rate=22050,
    do_trim_silence=True,
    trim_db=60.0,
    signal_norm=False,
    mel_fmin=0.0,
    mel_fmax=8000,
    spec_gain=1.0,
    log_func="np.log",
    ref_level_db=20,
    preemphasis=0.0,
)

# Sentences handed to the config as `test_sentences`.
test_sentences = [
    "Es hat mich viel Zeit gekostet ein Stimme zu entwickeln, jetzt wo ich sie habe werde ich nicht mehr schweigen.",
    "Sei eine Stimme, kein Echo.",
    "Es tut mir Leid David. Das kann ich leider nicht machen.",
    "Dieser Kuchen ist großartig. Er ist so lecker und feucht.",
    "Vor dem 22. November 1963.",
]

# TRAINING CONFIGURATION
config = SpeedySpeechConfig(
    run_name="speedy_speech_thorsten-de",
    audio=audio_config,
    batch_size=32,
    eval_batch_size=16,
    num_loader_workers=4,
    num_eval_loader_workers=4,
    compute_input_seq_cache=True,
    run_eval=True,
    test_delay_epochs=-1,
    epochs=1000,
    min_audio_len=11050,  # min_audio_len is raised to avoid a speedy speech error
    text_cleaner="phoneme_cleaners",
    use_phonemes=True,
    phoneme_language="de",
    phoneme_cache_path=os.path.join(recipe_dir, "phoneme_cache"),
    precompute_num_workers=4,
    print_step=50,
    print_eval=False,
    mixed_precision=False,
    test_sentences=test_sentences,
    sort_by_audio_len=True,
    max_seq_len=500000,
    output_path=recipe_dir,
    datasets=[dataset_config],
)

# AUDIO PROCESSOR
# Handles feature extraction and audio I/O for the dataloader and the training loggers.
audio_processor = AudioProcessor.init_from_config(config)

# TOKENIZER
# Converts text to sequences of token IDs; it also returns the (possibly updated) config.
tokenizer, config = TTSTokenizer.init_from_config(config)

# DATA SAMPLES
# `load_tts_samples` returns the train and eval splits.
# See `TTS.tts.datasets.load_tts_samples` for custom loaders and formatters.
train_samples, eval_samples = load_tts_samples(
    dataset_config,
    eval_split=True,
    eval_split_max_size=config.eval_split_max_size,
    eval_split_size=config.eval_split_size,
)

# MODEL
model = ForwardTTS(config, audio_processor, tokenizer)

# TRAINER
# Build the trainer and start training right away. 🚀
Trainer(
    TrainerArgs(),
    config,
    recipe_dir,
    model=model,
    train_samples=train_samples,
    eval_samples=eval_samples,
).fit()

View File

@ -0,0 +1,108 @@
import os
from trainer import Trainer, TrainerArgs
from TTS.config.shared_configs import BaseAudioConfig
from TTS.tts.configs.shared_configs import BaseDatasetConfig
from TTS.tts.configs.tacotron2_config import Tacotron2Config
from TTS.tts.datasets import load_tts_samples
from TTS.tts.models.tacotron2 import Tacotron2
from TTS.tts.utils.text.tokenizer import TTSTokenizer
from TTS.utils.audio import AudioProcessor
from TTS.utils.downloaders import download_thorsten_de
# from TTS.tts.datasets.tokenizer import Tokenizer
# The folder holding this script doubles as the training output folder.
output_path = os.path.dirname(os.path.abspath(__file__))

# DATASET CONFIG
# The dataset is looked up in a `thorsten-de` folder next to this recipe folder.
dataset_config = BaseDatasetConfig(
    name="thorsten", meta_file_train="metadata.csv", path=os.path.join(output_path, "../thorsten-de/")
)

# download dataset if not already present
if not os.path.exists(dataset_config.path):
    print("Downloading dataset")
    download_thorsten_de(os.path.split(os.path.abspath(dataset_config.path))[0])

# AUDIO CONFIG
audio_config = BaseAudioConfig(
    sample_rate=22050,
    do_trim_silence=True,
    trim_db=60.0,
    signal_norm=False,
    mel_fmin=0.0,
    mel_fmax=8000,
    spec_gain=1.0,
    log_func="np.log",
    ref_level_db=20,
    preemphasis=0.0,
)

config = Tacotron2Config(  # This is the config that is saved for the future use
    audio=audio_config,
    batch_size=40,  # BS of 40 and max length of 10s will use about 20GB of GPU memory
    eval_batch_size=16,
    num_loader_workers=4,
    num_eval_loader_workers=4,
    run_eval=True,
    test_delay_epochs=-1,
    r=6,
    gradual_training=[[0, 6, 64], [10000, 4, 32], [50000, 3, 32], [100000, 2, 32]],
    double_decoder_consistency=True,
    epochs=1000,
    text_cleaner="phoneme_cleaners",
    use_phonemes=True,
    phoneme_language="de",
    phoneme_cache_path=os.path.join(output_path, "phoneme_cache"),
    precompute_num_workers=8,
    print_step=25,
    print_eval=True,
    mixed_precision=False,
    test_sentences=[
        "Es hat mich viel Zeit gekostet ein Stimme zu entwickeln, jetzt wo ich sie habe werde ich nicht mehr schweigen.",
        "Sei eine Stimme, kein Echo.",
        "Es tut mir Leid David. Das kann ich leider nicht machen.",
        "Dieser Kuchen ist großartig. Er ist so lecker und feucht.",
        "Vor dem 22. November 1963.",
    ],
    # max audio length of 10 seconds, feel free to increase if you got more than 20GB GPU memory
    max_audio_len=22050 * 10,
    output_path=output_path,
    datasets=[dataset_config],
)

# INITIALIZE THE AUDIO PROCESSOR
# Audio processor is used for feature extraction and audio I/O.
# It mainly serves to the dataloader and the training loggers.
# The processor is built exactly once; an earlier, immediately overwritten
# `AudioProcessor(**config.audio.to_dict())` construction was dropped as dead code.
ap = AudioProcessor.init_from_config(config)

# INITIALIZE THE TOKENIZER
# Tokenizer is used to convert text to sequences of token IDs.
# If characters are not defined in the config, default characters are passed to the config
tokenizer, config = TTSTokenizer.init_from_config(config)

# LOAD DATA SAMPLES
# `load_tts_samples` returns the train and eval splits.
# You can define your custom sample loader returning the list of samples.
# Or define your custom formatter and pass it to the `load_tts_samples`.
# Check `TTS.tts.datasets.load_tts_samples` for more details.
train_samples, eval_samples = load_tts_samples(
    dataset_config,
    eval_split=True,
    eval_split_max_size=config.eval_split_max_size,
    eval_split_size=config.eval_split_size,
)

# INITIALIZE THE MODEL
# Config defines the details of the model like the number of layers, the size of the embedding, etc.
# No speaker manager is passed to the model in this recipe.
model = Tacotron2(config, ap, tokenizer, speaker_manager=None)

# init the trainer and 🚀
trainer = Trainer(
    TrainerArgs(), config, output_path, model=model, train_samples=train_samples, eval_samples=eval_samples
)
trainer.fit()

View File

@ -0,0 +1,52 @@
import os
from trainer import Trainer, TrainerArgs
from TTS.utils.audio import AudioProcessor
from TTS.utils.downloaders import download_thorsten_de
from TTS.vocoder.configs import UnivnetConfig
from TTS.vocoder.datasets.preprocess import load_wav_data
from TTS.vocoder.models.gan import GAN
# The folder holding this script doubles as the training output folder.
recipe_dir = os.path.dirname(os.path.abspath(__file__))

# TRAINING CONFIGURATION
# The wav files are looked up in `thorsten-de/wavs` next to this recipe folder.
config = UnivnetConfig(
    batch_size=64,
    eval_batch_size=16,
    num_loader_workers=4,
    num_eval_loader_workers=4,
    run_eval=True,
    test_delay_epochs=-1,
    epochs=1000,
    seq_len=8192,
    pad_short=2000,
    use_noise_augment=True,
    eval_split_size=10,
    print_step=25,
    print_eval=False,
    mixed_precision=False,
    lr_gen=1e-4,
    lr_disc=1e-4,
    data_path=os.path.join(recipe_dir, "../thorsten-de/wavs/"),
    output_path=recipe_dir,
)

# Download the dataset two levels above the wav folder when that folder is missing.
wav_dir = config.data_path
if not os.path.exists(wav_dir):
    print("Downloading dataset")
    download_thorsten_de(os.path.abspath(os.path.join(os.path.abspath(wav_dir), os.pardir, os.pardir)))

# AUDIO PROCESSOR
audio_processor = AudioProcessor(**config.audio.to_dict())

# DATA SAMPLES
# The first value returned by `load_wav_data` is bound to the eval split, the second to the train split.
eval_samples, train_samples = load_wav_data(config.data_path, config.eval_split_size)

# MODEL
model = GAN(config, audio_processor)

# TRAINER
# Build the trainer and start training right away. 🚀
Trainer(
    TrainerArgs(),
    config,
    recipe_dir,
    model=model,
    train_samples=train_samples,
    eval_samples=eval_samples,
).fit()

View File

@ -0,0 +1,105 @@
import os
from trainer import Trainer, TrainerArgs
from TTS.config.shared_configs import BaseAudioConfig
from TTS.tts.configs.shared_configs import BaseDatasetConfig
from TTS.tts.configs.vits_config import VitsConfig
from TTS.tts.datasets import load_tts_samples
from TTS.tts.models.vits import Vits
from TTS.tts.utils.text.tokenizer import TTSTokenizer
from TTS.utils.audio import AudioProcessor
from TTS.utils.downloaders import download_thorsten_de
# The folder holding this script doubles as the training output folder.
recipe_dir = os.path.dirname(os.path.abspath(__file__))

# DATASET CONFIG
# The dataset is looked up in a `thorsten-de` folder next to this recipe folder.
dataset_config = BaseDatasetConfig(
    name="thorsten",
    meta_file_train="metadata.csv",
    path=os.path.join(recipe_dir, "../thorsten-de/"),
)

# Download the dataset into the parent of the dataset folder when it is missing.
dataset_root = dataset_config.path
if not os.path.exists(dataset_root):
    print("Downloading dataset")
    download_thorsten_de(os.path.dirname(os.path.abspath(dataset_root)))

# AUDIO CONFIG
audio_config = BaseAudioConfig(
    sample_rate=22050,
    win_length=1024,
    hop_length=256,
    num_mels=80,
    preemphasis=0.0,
    ref_level_db=20,
    log_func="np.log",
    do_trim_silence=True,
    trim_db=45,
    mel_fmin=0,
    mel_fmax=None,
    spec_gain=1.0,
    signal_norm=False,
    do_amp_to_db_linear=False,
)

# Sentences handed to the config as `test_sentences`.
test_sentences = [
    "Es hat mich viel Zeit gekostet ein Stimme zu entwickeln, jetzt wo ich sie habe werde ich nicht mehr schweigen.",
    "Sei eine Stimme, kein Echo.",
    "Es tut mir Leid David. Das kann ich leider nicht machen.",
    "Dieser Kuchen ist großartig. Er ist so lecker und feucht.",
    "Vor dem 22. November 1963.",
]

# TRAINING CONFIGURATION
config = VitsConfig(
    audio=audio_config,
    run_name="vits_thorsten-de",
    batch_size=32,
    eval_batch_size=16,
    batch_group_size=5,
    num_loader_workers=0,
    num_eval_loader_workers=4,
    run_eval=True,
    test_delay_epochs=-1,
    epochs=1000,
    text_cleaner="phoneme_cleaners",
    use_phonemes=True,
    phoneme_language="de",
    phoneme_cache_path=os.path.join(recipe_dir, "phoneme_cache"),
    compute_input_seq_cache=True,
    print_step=25,
    print_eval=True,
    mixed_precision=True,
    test_sentences=test_sentences,
    output_path=recipe_dir,
    datasets=[dataset_config],
)

# AUDIO PROCESSOR
# Handles feature extraction and audio I/O for the dataloader and the training loggers.
audio_processor = AudioProcessor.init_from_config(config)

# TOKENIZER
# Converts text to sequences of token IDs; it also returns the (possibly updated) config.
tokenizer, config = TTSTokenizer.init_from_config(config)

# DATA SAMPLES
# `load_tts_samples` returns the train and eval splits.
# See `TTS.tts.datasets.load_tts_samples` for custom loaders and formatters.
train_samples, eval_samples = load_tts_samples(
    dataset_config,
    eval_split=True,
    eval_split_max_size=config.eval_split_max_size,
    eval_split_size=config.eval_split_size,
)

# MODEL
# No speaker manager is passed to the model in this recipe.
model = Vits(config, audio_processor, tokenizer, speaker_manager=None)

# TRAINER
# Build the trainer and start training right away. 🚀
Trainer(
    TrainerArgs(), config, recipe_dir, model=model, train_samples=train_samples, eval_samples=eval_samples
).fit()

View File

@ -0,0 +1,56 @@
import os
from trainer import Trainer, TrainerArgs
from TTS.utils.audio import AudioProcessor
from TTS.utils.downloaders import download_thorsten_de
from TTS.vocoder.configs import WavegradConfig
from TTS.vocoder.datasets.preprocess import load_wav_data
from TTS.vocoder.models.wavegrad import Wavegrad
# The folder holding this script doubles as the training output folder.
recipe_dir = os.path.dirname(os.path.abspath(__file__))

# TRAINING CONFIGURATION
# The wav files are looked up in `thorsten-de/wavs` next to this recipe folder.
config = WavegradConfig(
    batch_size=32,
    eval_batch_size=16,
    num_loader_workers=4,
    num_eval_loader_workers=4,
    run_eval=True,
    test_delay_epochs=-1,
    epochs=1000,
    seq_len=6144,
    pad_short=2000,
    use_noise_augment=True,
    eval_split_size=50,
    print_step=50,
    print_eval=True,
    mixed_precision=False,
    data_path=os.path.join(recipe_dir, "../thorsten-de/wavs/"),
    output_path=recipe_dir,
)

# Download the dataset two levels above the wav folder when that folder is missing.
wav_dir = config.data_path
if not os.path.exists(wav_dir):
    print("Downloading dataset")
    download_thorsten_de(os.path.abspath(os.path.join(os.path.abspath(wav_dir), os.pardir, os.pardir)))

# AUDIO PROCESSOR
audio_processor = AudioProcessor(**config.audio.to_dict())

# DATA SAMPLES
# The first value returned by `load_wav_data` is bound to the eval split, the second to the train split.
eval_samples, train_samples = load_wav_data(config.data_path, config.eval_split_size)

# MODEL
model = Wavegrad(config)

# TRAINER
# The audio processor is handed to the trainer as a training asset instead of to the model.
Trainer(
    TrainerArgs(),
    config,
    recipe_dir,
    model=model,
    train_samples=train_samples,
    eval_samples=eval_samples,
    training_assets={"audio_processor": audio_processor},
).fit()

View File

@ -0,0 +1,58 @@
import os
from trainer import Trainer, TrainerArgs
from TTS.utils.audio import AudioProcessor
from TTS.utils.downloaders import download_thorsten_de
from TTS.vocoder.configs import WavernnConfig
from TTS.vocoder.datasets.preprocess import load_wav_data
from TTS.vocoder.models.wavernn import Wavernn
# The folder holding this script doubles as the training output folder.
recipe_dir = os.path.dirname(os.path.abspath(__file__))

# TRAINING CONFIGURATION
# The wav files are looked up in `thorsten-de/wavs` next to this recipe folder.
config = WavernnConfig(
    batch_size=64,
    eval_batch_size=16,
    num_loader_workers=4,
    num_eval_loader_workers=4,
    run_eval=True,
    test_delay_epochs=-1,
    epochs=10000,
    seq_len=1280,
    pad_short=2000,
    use_noise_augment=False,
    eval_split_size=10,
    print_step=25,
    print_eval=True,
    mixed_precision=False,
    lr=1e-4,
    grad_clip=4,
    data_path=os.path.join(recipe_dir, "../thorsten-de/wavs/"),
    output_path=recipe_dir,
)

# Download the dataset two levels above the wav folder when that folder is missing.
wav_dir = config.data_path
if not os.path.exists(wav_dir):
    print("Downloading dataset")
    download_thorsten_de(os.path.abspath(os.path.join(os.path.abspath(wav_dir), os.pardir, os.pardir)))

# AUDIO PROCESSOR
audio_processor = AudioProcessor(**config.audio.to_dict())

# DATA SAMPLES
# The first value returned by `load_wav_data` is bound to the eval split, the second to the train split.
eval_samples, train_samples = load_wav_data(config.data_path, config.eval_split_size)

# MODEL
model = Wavernn(config)

# TRAINER
# The audio processor is handed to the trainer as a training asset instead of to the model.
Trainer(
    TrainerArgs(),
    config,
    recipe_dir,
    model=model,
    train_samples=train_samples,
    eval_samples=eval_samples,
    training_assets={"audio_processor": audio_processor},
).fit()

View File

@ -21,7 +21,6 @@ umap-learn==0.5.1
pandas
# deps for training
matplotlib
tensorboardX
pyworld==0.2.10 # > 0.2.10 is not p3.10.x compatible
# coqui stack
trainer