diff --git a/.gitignore b/.gitignore index 7e9da0d8..f8d6e644 100644 --- a/.gitignore +++ b/.gitignore @@ -164,4 +164,5 @@ internal/* *_pitch.npy *_phoneme.npy wandb -depot/* \ No newline at end of file +depot/* +coqui_recipes/* \ No newline at end of file diff --git a/TTS/bin/train_tts.py b/TTS/bin/train_tts.py index 79b78767..73063731 100644 --- a/TTS/bin/train_tts.py +++ b/TTS/bin/train_tts.py @@ -1,7 +1,8 @@ import os -from TTS.config import load_config, register_config from trainer import Trainer, TrainerArgs + +from TTS.config import load_config, register_config from TTS.tts.datasets import load_tts_samples from TTS.tts.models import setup_model diff --git a/TTS/bin/train_vocoder.py b/TTS/bin/train_vocoder.py index 081fdd56..6d4df610 100644 --- a/TTS/bin/train_vocoder.py +++ b/TTS/bin/train_vocoder.py @@ -1,7 +1,8 @@ import os -from TTS.config import load_config, register_config from trainer import Trainer, TrainerArgs + +from TTS.config import load_config, register_config from TTS.utils.audio import AudioProcessor from TTS.vocoder.datasets.preprocess import load_wav_data, load_wav_feat_data from TTS.vocoder.models import setup_model diff --git a/TTS/tts/models/base_tts.py b/TTS/tts/models/base_tts.py index 0eb2b5f3..ca3c4a28 100644 --- a/TTS/tts/models/base_tts.py +++ b/TTS/tts/models/base_tts.py @@ -324,9 +324,9 @@ class BaseTTS(BaseModel): loader = DataLoader( dataset, batch_size=config.eval_batch_size if is_eval else config.batch_size, - shuffle=False, # shuffle is done in the dataset. + shuffle=False, # shuffle is done in the dataset. collate_fn=dataset.collate_fn, - drop_last=True, # setting this False might cause issues in AMP training. + drop_last=True, # setting this False might cause issues in AMP training. sampler=sampler, num_workers=config.num_eval_loader_workers if is_eval else config.num_loader_workers, pin_memory=False, diff --git a/TTS/tts/models/vits.py b/TTS/tts/models/vits.py index f02090cf..256ea3af 100644 --- a/TTS/tts/models/vits.py +++ b/TTS/tts/models/vits.py @@ -994,9 +994,9 @@ class Vits(BaseTTS): print(" !! Error creating Test Sentence -", idx) return {"figures": test_figures, "audios": test_audios} - def test_log(self, outputs: dict, logger: "Logger", assets: dict, steps:int) -> None: - logger.test_audios(steps, outputs['audios'], self.ap.sample_rate) - logger.test_figures(steps, outputs['figures']) + def test_log(self, outputs: dict, logger: "Logger", assets: dict, steps: int) -> None: + logger.test_audios(steps, outputs["audios"], self.ap.sample_rate) + logger.test_figures(steps, outputs["figures"]) def get_optimizer(self) -> List: """Initiate and return the GAN optimizers based on the config parameters. diff --git a/TTS/tts/utils/text/phonemizers/espeak_wrapper.py b/TTS/tts/utils/text/phonemizers/espeak_wrapper.py index 2fe0c39c..442dcef2 100644 --- a/TTS/tts/utils/text/phonemizers/espeak_wrapper.py +++ b/TTS/tts/utils/text/phonemizers/espeak_wrapper.py @@ -11,6 +11,7 @@ def is_tool(name): return which(name) is not None + # priority: espeakng > espeak if is_tool("espeak-ng"): _DEF_ESPEAK_LIB = "espeak-ng" @@ -116,7 +117,6 @@ class ESpeak(BasePhonemizer): # ^ self.num_skip_chars = 1 - def auto_set_espeak_lib(self) -> None: if is_tool("espeak-ng"): self._ESPEAK_LIB = "espeak-ng" @@ -163,7 +163,7 @@ class ESpeak(BasePhonemizer): phonemes = "" for line in _espeak_exe(self._ESPEAK_LIB, args, sync=True): logging.debug("line: %s", repr(line)) - phonemes += line.decode("utf8").strip()[self.num_skip_chars:] # skip initial redundant characters + phonemes += line.decode("utf8").strip()[self.num_skip_chars :] # skip initial redundant characters return phonemes.replace("_", separator) def _phonemize(self, text, separator=None): diff --git a/recipes/ljspeech/align_tts/train_aligntts.py b/recipes/ljspeech/align_tts/train_aligntts.py index a4b868aa..f1b29025 100644 --- a/recipes/ljspeech/align_tts/train_aligntts.py +++ b/recipes/ljspeech/align_tts/train_aligntts.py @@ -1,6 +1,7 @@ import os from trainer import Trainer, TrainerArgs + from TTS.tts.configs.align_tts_config import AlignTTSConfig from TTS.tts.configs.shared_configs import BaseDatasetConfig from TTS.tts.datasets import load_tts_samples diff --git a/recipes/ljspeech/fast_pitch/train_fast_pitch.py b/recipes/ljspeech/fast_pitch/train_fast_pitch.py index fcb62282..a3fc35c9 100644 --- a/recipes/ljspeech/fast_pitch/train_fast_pitch.py +++ b/recipes/ljspeech/fast_pitch/train_fast_pitch.py @@ -1,7 +1,8 @@ import os -from TTS.config.shared_configs import BaseAudioConfig, BaseDatasetConfig from trainer import Trainer, TrainerArgs + +from TTS.config.shared_configs import BaseAudioConfig, BaseDatasetConfig from TTS.tts.configs.fast_pitch_config import FastPitchConfig from TTS.tts.datasets import load_tts_samples from TTS.tts.models.forward_tts import ForwardTTS diff --git a/recipes/ljspeech/fast_speech/train_fast_speech.py b/recipes/ljspeech/fast_speech/train_fast_speech.py index 183c8ebb..560d3de2 100644 --- a/recipes/ljspeech/fast_speech/train_fast_speech.py +++ b/recipes/ljspeech/fast_speech/train_fast_speech.py @@ -1,7 +1,8 @@ import os -from TTS.config import BaseAudioConfig, BaseDatasetConfig from trainer import Trainer, TrainerArgs + +from TTS.config import BaseAudioConfig, BaseDatasetConfig from TTS.tts.configs.fast_speech_config import FastSpeechConfig from TTS.tts.datasets import load_tts_samples from TTS.tts.models.forward_tts import ForwardTTS diff --git a/recipes/ljspeech/hifigan/train_hifigan.py b/recipes/ljspeech/hifigan/train_hifigan.py index 964a6420..1e5bbf30 100644 --- a/recipes/ljspeech/hifigan/train_hifigan.py +++ b/recipes/ljspeech/hifigan/train_hifigan.py @@ -1,6 +1,7 @@ import os from trainer import Trainer, TrainerArgs + from TTS.utils.audio import AudioProcessor from TTS.vocoder.configs import HifiganConfig from TTS.vocoder.datasets.preprocess import load_wav_data diff --git a/recipes/ljspeech/multiband_melgan/train_multiband_melgan.py b/recipes/ljspeech/multiband_melgan/train_multiband_melgan.py index 6f528a83..40ff5a00 100644 --- a/recipes/ljspeech/multiband_melgan/train_multiband_melgan.py +++ b/recipes/ljspeech/multiband_melgan/train_multiband_melgan.py @@ -1,6 +1,7 @@ import os from trainer import Trainer, TrainerArgs + from TTS.utils.audio import AudioProcessor from TTS.vocoder.configs import MultibandMelganConfig from TTS.vocoder.datasets.preprocess import load_wav_data diff --git a/recipes/ljspeech/speedy_speech/train_speedy_speech.py b/recipes/ljspeech/speedy_speech/train_speedy_speech.py index 6a9ddf16..7ad132b2 100644 --- a/recipes/ljspeech/speedy_speech/train_speedy_speech.py +++ b/recipes/ljspeech/speedy_speech/train_speedy_speech.py @@ -1,7 +1,8 @@ import os -from TTS.config import BaseAudioConfig, BaseDatasetConfig from trainer import Trainer, TrainerArgs + +from TTS.config import BaseAudioConfig, BaseDatasetConfig from TTS.tts.configs.speedy_speech_config import SpeedySpeechConfig from TTS.tts.datasets import load_tts_samples from TTS.tts.models.forward_tts import ForwardTTS diff --git a/recipes/ljspeech/tacotron2-DCA/train_tacotron_dca.py b/recipes/ljspeech/tacotron2-DCA/train_tacotron_dca.py index c3a1c51c..ea1b0874 100644 --- a/recipes/ljspeech/tacotron2-DCA/train_tacotron_dca.py +++ b/recipes/ljspeech/tacotron2-DCA/train_tacotron_dca.py @@ -1,7 +1,8 @@ import os -from TTS.config.shared_configs import BaseAudioConfig from trainer import Trainer, TrainerArgs + +from TTS.config.shared_configs import BaseAudioConfig from TTS.tts.configs.shared_configs import BaseDatasetConfig from TTS.tts.configs.tacotron2_config import Tacotron2Config from TTS.tts.datasets import load_tts_samples diff --git a/recipes/ljspeech/tacotron2-DDC/train_tacotron_ddc.py b/recipes/ljspeech/tacotron2-DDC/train_tacotron_ddc.py index a7482b32..d00f8ed7 100644 --- a/recipes/ljspeech/tacotron2-DDC/train_tacotron_ddc.py +++ b/recipes/ljspeech/tacotron2-DDC/train_tacotron_ddc.py @@ -1,7 +1,8 @@ import os -from TTS.config.shared_configs import BaseAudioConfig from trainer import Trainer, TrainerArgs + +from TTS.config.shared_configs import BaseAudioConfig from TTS.tts.configs.shared_configs import BaseDatasetConfig from TTS.tts.configs.tacotron2_config import Tacotron2Config from TTS.tts.datasets import load_tts_samples diff --git a/recipes/ljspeech/univnet/train.py b/recipes/ljspeech/univnet/train.py index 35240c5b..19c91925 100644 --- a/recipes/ljspeech/univnet/train.py +++ b/recipes/ljspeech/univnet/train.py @@ -1,6 +1,7 @@ import os from trainer import Trainer, TrainerArgs + from TTS.utils.audio import AudioProcessor from TTS.vocoder.configs import UnivnetConfig from TTS.vocoder.datasets.preprocess import load_wav_data diff --git a/recipes/ljspeech/vits_tts/train_vits.py b/recipes/ljspeech/vits_tts/train_vits.py index 24ff4d0f..cfb3351d 100644 --- a/recipes/ljspeech/vits_tts/train_vits.py +++ b/recipes/ljspeech/vits_tts/train_vits.py @@ -1,7 +1,8 @@ import os -from TTS.config.shared_configs import BaseAudioConfig from trainer import Trainer, TrainerArgs + +from TTS.config.shared_configs import BaseAudioConfig from TTS.tts.configs.shared_configs import BaseDatasetConfig from TTS.tts.configs.vits_config import VitsConfig from TTS.tts.datasets import load_tts_samples diff --git a/recipes/ljspeech/wavegrad/train_wavegrad.py b/recipes/ljspeech/wavegrad/train_wavegrad.py index 095773d6..1abdf45d 100644 --- a/recipes/ljspeech/wavegrad/train_wavegrad.py +++ b/recipes/ljspeech/wavegrad/train_wavegrad.py @@ -1,6 +1,7 @@ import os from trainer import Trainer, TrainerArgs + from TTS.utils.audio import AudioProcessor from TTS.vocoder.configs import WavegradConfig from TTS.vocoder.datasets.preprocess import load_wav_data diff --git a/recipes/ljspeech/wavernn/train_wavernn.py b/recipes/ljspeech/wavernn/train_wavernn.py index 172b489a..640f5092 100644 --- a/recipes/ljspeech/wavernn/train_wavernn.py +++ b/recipes/ljspeech/wavernn/train_wavernn.py @@ -1,6 +1,7 @@ import os from trainer import Trainer, TrainerArgs + from TTS.utils.audio import AudioProcessor from TTS.vocoder.configs import WavernnConfig from TTS.vocoder.datasets.preprocess import load_wav_data diff --git a/recipes/multilingual/vits_tts/train_vits_tts.py b/recipes/multilingual/vits_tts/train_vits_tts.py index 391f31cb..ea4f377b 100644 --- a/recipes/multilingual/vits_tts/train_vits_tts.py +++ b/recipes/multilingual/vits_tts/train_vits_tts.py @@ -1,8 +1,9 @@ import os from glob import glob -from TTS.config.shared_configs import BaseAudioConfig from trainer import Trainer, TrainerArgs + +from TTS.config.shared_configs import BaseAudioConfig from TTS.tts.configs.shared_configs import BaseDatasetConfig from TTS.tts.configs.vits_config import VitsConfig from TTS.tts.datasets import load_tts_samples diff --git a/recipes/vctk/fast_pitch/train_fast_pitch.py b/recipes/vctk/fast_pitch/train_fast_pitch.py index aeb62055..986202c5 100644 --- a/recipes/vctk/fast_pitch/train_fast_pitch.py +++ b/recipes/vctk/fast_pitch/train_fast_pitch.py @@ -1,7 +1,8 @@ import os -from TTS.config import BaseAudioConfig, BaseDatasetConfig from trainer import Trainer, TrainerArgs + +from TTS.config import BaseAudioConfig, BaseDatasetConfig from TTS.tts.configs.fast_pitch_config import FastPitchConfig from TTS.tts.datasets import load_tts_samples from TTS.tts.models.forward_tts import ForwardTTS diff --git a/recipes/vctk/fast_speech/train_fast_speech.py b/recipes/vctk/fast_speech/train_fast_speech.py index 578fbd1a..fe785a41 100644 --- a/recipes/vctk/fast_speech/train_fast_speech.py +++ b/recipes/vctk/fast_speech/train_fast_speech.py @@ -1,7 +1,8 @@ import os -from TTS.config import BaseAudioConfig, BaseDatasetConfig from trainer import Trainer, TrainerArgs + +from TTS.config import BaseAudioConfig, BaseDatasetConfig from TTS.tts.configs.fast_speech_config import FastSpeechConfig from TTS.tts.datasets import load_tts_samples from TTS.tts.models.forward_tts import ForwardTTS diff --git a/recipes/vctk/glow_tts/train_glow_tts.py b/recipes/vctk/glow_tts/train_glow_tts.py index 0f198a86..ebdbfb37 100644 --- a/recipes/vctk/glow_tts/train_glow_tts.py +++ b/recipes/vctk/glow_tts/train_glow_tts.py @@ -1,7 +1,8 @@ import os -from TTS.config.shared_configs import BaseAudioConfig from trainer import Trainer, TrainerArgs + +from TTS.config.shared_configs import BaseAudioConfig from TTS.tts.configs.glow_tts_config import GlowTTSConfig from TTS.tts.configs.shared_configs import BaseDatasetConfig from TTS.tts.datasets import load_tts_samples diff --git a/recipes/vctk/speedy_speech/train_speedy_speech.py b/recipes/vctk/speedy_speech/train_speedy_speech.py index fbb1af2d..80d21ca2 100644 --- a/recipes/vctk/speedy_speech/train_speedy_speech.py +++ b/recipes/vctk/speedy_speech/train_speedy_speech.py @@ -1,7 +1,8 @@ import os -from TTS.config import BaseAudioConfig, BaseDatasetConfig from trainer import Trainer, TrainerArgs + +from TTS.config import BaseAudioConfig, BaseDatasetConfig from TTS.tts.configs.speedy_speech_config import SpeedySpeechConfig from TTS.tts.datasets import load_tts_samples from TTS.tts.models.forward_tts import ForwardTTS diff --git a/recipes/vctk/tacotron-DDC/train_tacotron-DDC.py b/recipes/vctk/tacotron-DDC/train_tacotron-DDC.py index 917c5588..bed21ad9 100644 --- a/recipes/vctk/tacotron-DDC/train_tacotron-DDC.py +++ b/recipes/vctk/tacotron-DDC/train_tacotron-DDC.py @@ -1,7 +1,8 @@ import os -from TTS.config.shared_configs import BaseAudioConfig from trainer import Trainer, TrainerArgs + +from TTS.config.shared_configs import BaseAudioConfig from TTS.tts.configs.shared_configs import BaseDatasetConfig from TTS.tts.configs.tacotron_config import TacotronConfig from TTS.tts.datasets import load_tts_samples diff --git a/recipes/vctk/tacotron2-DDC/train_tacotron2-ddc.py b/recipes/vctk/tacotron2-DDC/train_tacotron2-ddc.py index 759ddd57..caa745b3 100644 --- a/recipes/vctk/tacotron2-DDC/train_tacotron2-ddc.py +++ b/recipes/vctk/tacotron2-DDC/train_tacotron2-ddc.py @@ -1,7 +1,8 @@ import os -from TTS.config.shared_configs import BaseAudioConfig from trainer import Trainer, TrainerArgs + +from TTS.config.shared_configs import BaseAudioConfig from TTS.tts.configs.shared_configs import BaseDatasetConfig from TTS.tts.configs.tacotron2_config import Tacotron2Config from TTS.tts.datasets import load_tts_samples diff --git a/recipes/vctk/tacotron2/train_tacotron2.py b/recipes/vctk/tacotron2/train_tacotron2.py index 0c62da48..43f5d4e6 100644 --- a/recipes/vctk/tacotron2/train_tacotron2.py +++ b/recipes/vctk/tacotron2/train_tacotron2.py @@ -1,7 +1,8 @@ import os -from TTS.config.shared_configs import BaseAudioConfig from trainer import Trainer, TrainerArgs + +from TTS.config.shared_configs import BaseAudioConfig from TTS.tts.configs.shared_configs import BaseDatasetConfig from TTS.tts.configs.tacotron2_config import Tacotron2Config from TTS.tts.datasets import load_tts_samples diff --git a/recipes/vctk/vits/train_vits.py b/recipes/vctk/vits/train_vits.py index 53d7242c..dff4eefc 100644 --- a/recipes/vctk/vits/train_vits.py +++ b/recipes/vctk/vits/train_vits.py @@ -1,7 +1,8 @@ import os -from TTS.config.shared_configs import BaseAudioConfig from trainer import Trainer, TrainerArgs + +from TTS.config.shared_configs import BaseAudioConfig from TTS.tts.configs.shared_configs import BaseDatasetConfig from TTS.tts.configs.vits_config import VitsConfig from TTS.tts.datasets import load_tts_samples @@ -57,7 +58,7 @@ config = VitsConfig( print_step=25, print_eval=False, mixed_precision=True, - max_text_len= 325, # change this if you have a larger VRAM than 16GB + max_text_len=325, # change this if you have a larger VRAM than 16GB output_path=output_path, datasets=[dataset_config], )