From 0f8932a6a9a1d71c7429c8e07d3c37f1f9a2da25 Mon Sep 17 00:00:00 2001 From: Eren G??lge Date: Wed, 21 Jun 2023 11:59:27 +0200 Subject: [PATCH] Fix here and ther --- TTS/tts/layers/bark/hubert/hubert_manager.py | 2 + TTS/tts/layers/bark/hubert/tokenizer.py | 4 +- TTS/tts/layers/bark/inference_funcs.py | 95 +++++------ TTS/tts/layers/bark/load_model.py | 159 +++++++++---------- TTS/tts/layers/bark/model.py | 10 +- TTS/tts/layers/bark/model_fine.py | 2 +- TTS/utils/synthesizer.py | 2 +- docs/source/models/tortoise.md | 24 +-- 8 files changed, 138 insertions(+), 160 deletions(-) diff --git a/TTS/tts/layers/bark/hubert/hubert_manager.py b/TTS/tts/layers/bark/hubert/hubert_manager.py index baa26438..4bc19929 100644 --- a/TTS/tts/layers/bark/hubert/hubert_manager.py +++ b/TTS/tts/layers/bark/hubert/hubert_manager.py @@ -17,6 +17,7 @@ class HubertManager: urllib.request.urlretrieve(download_url, model_path) print("Downloaded HuBERT") return model_path + return None @staticmethod def make_sure_tokenizer_installed( @@ -31,3 +32,4 @@ class HubertManager: shutil.move(os.path.join(model_dir, model), model_path) print("Downloaded tokenizer") return model_path + return None diff --git a/TTS/tts/layers/bark/hubert/tokenizer.py b/TTS/tts/layers/bark/hubert/tokenizer.py index 474a08db..be9a50f8 100644 --- a/TTS/tts/layers/bark/hubert/tokenizer.py +++ b/TTS/tts/layers/bark/hubert/tokenizer.py @@ -16,7 +16,7 @@ from torch.serialization import MAP_LOCATION class HubertTokenizer(nn.Module): def __init__(self, hidden_size=1024, input_size=768, output_size=10000, version=0): - super(HubertTokenizer, self).__init__() + super().__init__() next_size = input_size if version == 0: self.lstm = nn.LSTM(input_size, hidden_size, 2, batch_first=True) @@ -181,7 +181,7 @@ def auto_train(data_path, save_path="model.pth", load_model: str = None, save_ep epoch = 1 while 1: - for i in range(save_epochs): + for _ in range(save_epochs): j = 0 for x, y in zip(data_x, data_y): model_training.train_step( diff --git a/TTS/tts/layers/bark/inference_funcs.py b/TTS/tts/layers/bark/inference_funcs.py index 6fa87c37..2b27246d 100644 --- a/TTS/tts/layers/bark/inference_funcs.py +++ b/TTS/tts/layers/bark/inference_funcs.py @@ -16,7 +16,7 @@ from torch.nn import functional as F from TTS.tts.layers.bark.hubert.hubert_manager import HubertManager from TTS.tts.layers.bark.hubert.kmeans_hubert import CustomHubert from TTS.tts.layers.bark.hubert.tokenizer import HubertTokenizer -from TTS.tts.layers.bark.load_model import _clear_cuda_cache, _inference_mode +from TTS.tts.layers.bark.load_model import clear_cuda_cache, inference_mode logger = logging.getLogger(__name__) @@ -34,34 +34,53 @@ def _normalize_whitespace(text): def get_voices(extra_voice_dirs: List[str] = []): - voices = {} - for dir in extra_voice_dirs: - paths = list(glob(f"{dir}/*.npz")) - for path in paths: - name = os.path.basename(path).replace(".npz", "") - voices[name] = path + dirs = extra_voice_dirs + voices: Dict[str, List[str]] = {} + for d in dirs: + subs = os.listdir(d) + for sub in subs: + subj = os.path.join(d, sub) + if os.path.isdir(subj): + voices[sub] = list(glob(f"{subj}/*.npz")) + # fetch audio files if no npz files are found + if len(voices[sub]) == 0: + voices[sub] = list(glob(f"{subj}/*.wav")) + list(glob(f"{subj}/*.mp3")) return voices -def load_voice(voice: str, extra_voice_dirs: List[str] = []): - def load_npz(npz_file): +def load_npz(npz_file): x_history = np.load(npz_file) semantic = x_history["semantic_prompt"] coarse = x_history["coarse_prompt"] fine = 
x_history["fine_prompt"] return semantic, coarse, fine + +def load_voice(model, voice: str, extra_voice_dirs: List[str] = []): # pylint: disable=dangerous-default-value if voice == "random": return None, None, None voices = get_voices(extra_voice_dirs) + paths = voices[voice] + + # bark only uses a single sample for cloning + if len(paths) > 1: + raise ValueError(f"Voice {voice} has multiple paths: {paths}") + try: path = voices[voice] - except KeyError: - raise KeyError(f"Voice {voice} not found in {extra_voice_dirs}") - prompt = load_npz(path) - return prompt + except KeyError as e: + raise KeyError(f"Voice {voice} not found in {extra_voice_dirs}") from e + if len(paths) == 1 and paths[0].endswith(".npz"): + return load_npz(path[0]) + else: + audio_path = paths[0] + # replace the file extension with .npz + output_path = os.path.splitext(audio_path)[0] + ".npz" + generate_voice(audio=audio_path, model=model, output_path=output_path) + breakpoint() + return load_voice(model, voice, extra_voice_dirs) def zero_crossing_rate(audio, frame_length=1024, hop_length=512): zero_crossings = np.sum(np.abs(np.diff(np.sign(audio))) / 2) @@ -85,7 +104,6 @@ def compute_average_bass_energy(audio_data, sample_rate, max_bass_freq=250): def generate_voice( audio, - text, model, output_path, ): @@ -106,9 +124,6 @@ def generate_voice( encoded_frames = model.encodec.encode(audio) codes = torch.cat([encoded[0] for encoded in encoded_frames], dim=-1).squeeze() # [n_q, T] - # get seconds of audio - seconds = audio.shape[-1] / model.config.sample_rate - # move codes to cpu codes = codes.cpu().numpy() @@ -133,36 +148,6 @@ def generate_voice( np.savez(output_path, fine_prompt=codes, coarse_prompt=codes[:2, :], semantic_prompt=semantic_tokens) - # while attempts < max_attempts: - # if attempts > 0 and base is not None: - # # Reset the base model token - # print(f"Reset the base model token Regenerating...") - # base = None - - # audio_array, x = model.generate_audio(text, history_promp=None, base=base, **kwargs) - # zcr = zero_crossing_rate(audio_array) - # spectral_contrast = compute_spectral_contrast(audio_array, model.config.sample_rate) - # bass_energy = compute_average_bass_energy(audio_array, model.config.sample_rate) - # print(f"Attempt {attempts + 1}: ZCR = {zcr}, Spectral Contrast = {spectral_contrast:.2f}, Bass Energy = {bass_energy:.2f}") - - # # Save the audio array to the output_array directory with a random name for debugging - # #output_file = os.path.join(output_directory, f"audio_{zcr:.2f}_sc{spectral_contrast:.2f}_be{bass_energy:.2f}.wav") - # #wavfile.write(output_file, sample_rate, audio_array) - # #print(f"Saved audio array to {output_file}") - - # if zcr < zcr_threshold and spectral_contrast < spectral_threshold and bass_energy < bass_energy_threshold: - # print(f"Audio passed ZCR, Spectral Contrast, and Bass Energy thresholds. No need to regenerate.") - # break - # else: - # print(f"Audio failed ZCR, Spectral Contrast, and/or Bass Energy thresholds. Regenerating...") - - # attempts += 1 - - # if attempts == max_attempts: - # print("Reached maximum attempts. 
Returning the last generated audio.") - - # return audio_array, x, zcr, spectral_contrast, bass_energy - def generate_text_semantic( text, @@ -224,7 +209,7 @@ def generate_text_semantic( np.hstack([encoded_text, semantic_history, np.array([model.config.SEMANTIC_INFER_TOKEN])]).astype(np.int64) )[None] assert x.shape[1] == 256 + 256 + 1 - with _inference_mode(): + with inference_mode(): x = x.to(model.device) n_tot_steps = 768 # custom tqdm updates since we don't know when eos will occur @@ -285,8 +270,8 @@ def generate_text_semantic( pbar_state = req_pbar_state pbar.close() out = x.detach().cpu().numpy().squeeze()[256 + 256 + 1 :] - assert all(0 <= out) and all(out < model.config.SEMANTIC_VOCAB_SIZE) - _clear_cuda_cache() + assert all(out >= 0) and all(out < model.config.SEMANTIC_VOCAB_SIZE) + clear_cuda_cache() return out @@ -382,7 +367,7 @@ def generate_coarse( x_semantic = np.hstack([x_semantic_history, x_semantic]).astype(np.int32) x_coarse = x_coarse_history.astype(np.int32) base_semantic_idx = len(x_semantic_history) - with _inference_mode(): + with inference_mode(): x_semantic_in = torch.from_numpy(x_semantic)[None].to(model.device) x_coarse_in = torch.from_numpy(x_coarse)[None].to(model.device) n_window_steps = int(np.ceil(n_steps / sliding_window_len)) @@ -456,7 +441,7 @@ def generate_coarse( ) for n in range(1, model.config.N_COARSE_CODEBOOKS): gen_coarse_audio_arr[n, :] -= n * model.config.CODEBOOK_SIZE - _clear_cuda_cache() + clear_cuda_cache() return gen_coarse_audio_arr @@ -526,7 +511,7 @@ def generate_fine( ) # we can be lazy about fractional loop and just keep overwriting codebooks n_loops = np.max([0, int(np.ceil((x_coarse_gen.shape[1] - (1024 - n_history)) / 512))]) + 1 - with _inference_mode(): + with inference_mode(): in_arr = torch.tensor(in_arr.T).to(model.device) for n in tqdm.tqdm(range(n_loops), disable=silent): start_idx = np.min([n * 512, in_arr.shape[0] - 1024]) @@ -558,14 +543,12 @@ def generate_fine( if n_remove_from_end > 0: gen_fine_arr = gen_fine_arr[:, :-n_remove_from_end] assert gen_fine_arr.shape[-1] == x_coarse_gen.shape[-1] - _clear_cuda_cache() + clear_cuda_cache() return gen_fine_arr def codec_decode(fine_tokens, model): """Turn quantized audio codes into audio array using encodec.""" - from TTS.utils.audio.numpy_transforms import save_wav - arr = torch.from_numpy(fine_tokens)[None] arr = arr.to(model.device) arr = arr.transpose(0, 1) diff --git a/TTS/tts/layers/bark/load_model.py b/TTS/tts/layers/bark/load_model.py index dbd861d0..33144ed5 100644 --- a/TTS/tts/layers/bark/load_model.py +++ b/TTS/tts/layers/bark/load_model.py @@ -1,17 +1,12 @@ import contextlib - -# import funcy import functools import hashlib import logging import os -import re import requests import torch import tqdm -from encodec import EncodecModel -from transformers import BertTokenizer from TTS.tts.layers.bark.model import GPT, GPTConfig from TTS.tts.layers.bark.model_fine import FineGPT, FineGPTConfig @@ -31,8 +26,6 @@ else: # hold models in global scope to lazy load -global models -models = {} logger = logging.getLogger(__name__) @@ -44,10 +37,10 @@ if not hasattr(torch.nn.functional, "scaled_dot_product_attention"): ) -def _string_md5(s): - m = hashlib.md5() - m.update(s.encode("utf-8")) - return m.hexdigest() +# def _string_md5(s): +# m = hashlib.md5() +# m.update(s.encode("utf-8")) +# return m.hexdigest() def _md5(fname): @@ -58,18 +51,18 @@ def _md5(fname): return hash_md5.hexdigest() -def _get_ckpt_path(model_type, CACHE_DIR): - model_name = 
_string_md5(REMOTE_MODEL_PATHS[model_type]["path"]) - return os.path.join(CACHE_DIR, f"{model_name}.pt") +# def _get_ckpt_path(model_type, CACHE_DIR): +# model_name = _string_md5(REMOTE_MODEL_PATHS[model_type]["path"]) +# return os.path.join(CACHE_DIR, f"{model_name}.pt") -S3_BUCKET_PATH_RE = r"s3\:\/\/(.+?)\/" +# S3_BUCKET_PATH_RE = r"s3\:\/\/(.+?)\/" -def _parse_s3_filepath(s3_filepath): - bucket_name = re.search(S3_BUCKET_PATH_RE, s3_filepath).group(1) - rel_s3_filepath = re.sub(S3_BUCKET_PATH_RE, "", s3_filepath) - return bucket_name, rel_s3_filepath +# def _parse_s3_filepath(s3_filepath): +# bucket_name = re.search(S3_BUCKET_PATH_RE, s3_filepath).group(1) +# rel_s3_filepath = re.sub(S3_BUCKET_PATH_RE, "", s3_filepath) +# return bucket_name, rel_s3_filepath def _download(from_s3_path, to_local_path, CACHE_DIR): @@ -83,7 +76,7 @@ def _download(from_s3_path, to_local_path, CACHE_DIR): progress_bar.update(len(data)) file.write(data) progress_bar.close() - if total_size_in_bytes != 0 and progress_bar.n != total_size_in_bytes: + if total_size_in_bytes not in [0, progress_bar.n]: raise ValueError("ERROR, something went wrong") @@ -107,27 +100,27 @@ if torch.cuda.is_available(): @contextlib.contextmanager -def _inference_mode(): +def inference_mode(): with InferenceContext(), torch.inference_mode(), torch.no_grad(), autocast(): yield -def _clear_cuda_cache(): +def clear_cuda_cache(): if torch.cuda.is_available(): torch.cuda.empty_cache() torch.cuda.synchronize() -def clean_models(model_key=None): - global models - model_keys = [model_key] if model_key is not None else models.keys() - for k in model_keys: - if k in models: - del models[k] - _clear_cuda_cache() +# def clean_models(model_key=None): +# global models +# model_keys = [model_key] if model_key is not None else models.keys() +# for k in model_keys: +# if k in models: +# del models[k] +# clear_cuda_cache() -def _load_model(ckpt_path, device, config, model_type="text"): +def load_model(ckpt_path, device, config, model_type="text"): logger.info(f"loading {model_type} model from {ckpt_path}...") if device == "cpu": @@ -174,13 +167,13 @@ def _load_model(ckpt_path, device, config, model_type="text"): state_dict = checkpoint["model"] # fixup checkpoint unwanted_prefix = "_orig_mod." 
- for k, v in list(state_dict.items()): + for k, _ in list(state_dict.items()): if k.startswith(unwanted_prefix): state_dict[k[len(unwanted_prefix) :]] = state_dict.pop(k) extra_keys = set(state_dict.keys()) - set(model.state_dict().keys()) - extra_keys = set([k for k in extra_keys if not k.endswith(".attn.bias")]) + extra_keys = set(k for k in extra_keys if not k.endswith(".attn.bias")) missing_keys = set(model.state_dict().keys()) - set(state_dict.keys()) - missing_keys = set([k for k in missing_keys if not k.endswith(".attn.bias")]) + missing_keys = set(k for k in missing_keys if not k.endswith(".attn.bias")) if len(extra_keys) != 0: raise ValueError(f"extra keys found: {extra_keys}") if len(missing_keys) != 0: @@ -192,63 +185,63 @@ def _load_model(ckpt_path, device, config, model_type="text"): model.eval() model.to(device) del checkpoint, state_dict - _clear_cuda_cache() + clear_cuda_cache() return model, config -def _load_codec_model(device): - model = EncodecModel.encodec_model_24khz() - model.set_target_bandwidth(6.0) - model.eval() - model.to(device) - _clear_cuda_cache() - return model +# def _load_codec_model(device): +# model = EncodecModel.encodec_model_24khz() +# model.set_target_bandwidth(6.0) +# model.eval() +# model.to(device) +# clear_cuda_cache() +# return model -def load_model(ckpt_path=None, use_gpu=True, force_reload=False, model_type="text"): - _load_model_f = functools.partial(_load_model, model_type=model_type) - if model_type not in ("text", "coarse", "fine"): - raise NotImplementedError() - global models - if torch.cuda.device_count() == 0 or not use_gpu: - device = "cpu" - else: - device = "cuda" - model_key = str(device) + f"__{model_type}" - if model_key not in models or force_reload: - if ckpt_path is None: - ckpt_path = _get_ckpt_path(model_type) - clean_models(model_key=model_key) - model = _load_model_f(ckpt_path, device) - models[model_key] = model - return models[model_key] +# def load_model(ckpt_path=None, use_gpu=True, force_reload=False, model_type="text"): +# _load_model_f = functools.partial(_load_model, model_type=model_type) +# if model_type not in ("text", "coarse", "fine"): +# raise NotImplementedError() +# global models +# if torch.cuda.device_count() == 0 or not use_gpu: +# device = "cpu" +# else: +# device = "cuda" +# model_key = str(device) + f"__{model_type}" +# if model_key not in models or force_reload: +# if ckpt_path is None: +# ckpt_path = _get_ckpt_path(model_type) +# clean_models(model_key=model_key) +# model = _load_model_f(ckpt_path, device) +# models[model_key] = model +# return models[model_key] -def load_codec_model(use_gpu=True, force_reload=False): - global models - if torch.cuda.device_count() == 0 or not use_gpu: - device = "cpu" - else: - device = "cuda" - model_key = str(device) + f"__codec" - if model_key not in models or force_reload: - clean_models(model_key=model_key) - model = _load_codec_model(device) - models[model_key] = model - return models[model_key] +# def load_codec_model(use_gpu=True, force_reload=False): +# global models +# if torch.cuda.device_count() == 0 or not use_gpu: +# device = "cpu" +# else: +# device = "cuda" +# model_key = str(device) + f"__codec" +# if model_key not in models or force_reload: +# clean_models(model_key=model_key) +# model = _load_codec_model(device) +# models[model_key] = model +# return models[model_key] -def preload_models( - text_ckpt_path=None, coarse_ckpt_path=None, fine_ckpt_path=None, use_gpu=True, use_smaller_models=False -): - global USE_SMALLER_MODELS - global 
REMOTE_MODEL_PATHS - if use_smaller_models: - USE_SMALLER_MODELS = True - logger.info("Using smaller models generation.py") - REMOTE_MODEL_PATHS = SMALL_REMOTE_MODEL_PATHS +# def preload_models( +# text_ckpt_path=None, coarse_ckpt_path=None, fine_ckpt_path=None, use_gpu=True, use_smaller_models=False +# ): +# global USE_SMALLER_MODELS +# global REMOTE_MODEL_PATHS +# if use_smaller_models: +# USE_SMALLER_MODELS = True +# logger.info("Using smaller models generation.py") +# REMOTE_MODEL_PATHS = SMALL_REMOTE_MODEL_PATHS - _ = load_model(ckpt_path=text_ckpt_path, model_type="text", use_gpu=use_gpu, force_reload=True) - _ = load_model(ckpt_path=coarse_ckpt_path, model_type="coarse", use_gpu=use_gpu, force_reload=True) - _ = load_model(ckpt_path=fine_ckpt_path, model_type="fine", use_gpu=use_gpu, force_reload=True) - _ = load_codec_model(use_gpu=use_gpu, force_reload=True) +# _ = load_model(ckpt_path=text_ckpt_path, model_type="text", use_gpu=use_gpu, force_reload=True) +# _ = load_model(ckpt_path=coarse_ckpt_path, model_type="coarse", use_gpu=use_gpu, force_reload=True) +# _ = load_model(ckpt_path=fine_ckpt_path, model_type="fine", use_gpu=use_gpu, force_reload=True) +# _ = load_codec_model(use_gpu=use_gpu, force_reload=True) diff --git a/TTS/tts/layers/bark/model.py b/TTS/tts/layers/bark/model.py index bcc87a4b..c84022bd 100644 --- a/TTS/tts/layers/bark/model.py +++ b/TTS/tts/layers/bark/model.py @@ -6,8 +6,8 @@ import math from dataclasses import dataclass import torch -import torch.nn as nn from coqpit import Coqpit +from torch import nn from torch.nn import functional as F @@ -19,8 +19,8 @@ class LayerNorm(nn.Module): self.weight = nn.Parameter(torch.ones(ndim)) self.bias = nn.Parameter(torch.zeros(ndim)) if bias else None - def forward(self, input): - return F.layer_norm(input, self.weight.shape, self.weight, self.bias, 1e-5) + def forward(self, x): + return F.layer_norm(x, self.weight.shape, self.weight, self.bias, 1e-5) class CausalSelfAttention(nn.Module): @@ -177,7 +177,7 @@ class GPT(nn.Module): def forward(self, idx, merge_context=False, past_kv=None, position_ids=None, use_cache=False): device = idx.device - b, t = idx.size() + _, t = idx.size() if past_kv is not None: assert t == 1 tok_emb = self.transformer.wte(idx) # token embeddings of shape (b, t, n_embd) @@ -219,7 +219,7 @@ class GPT(nn.Module): new_kv = () if use_cache else None - for i, (block, past_layer_kv) in enumerate(zip(self.transformer.h, past_kv)): + for _, (block, past_layer_kv) in enumerate(zip(self.transformer.h, past_kv)): x, kv = block(x, past_kv=past_layer_kv, use_cache=use_cache) if use_cache: diff --git a/TTS/tts/layers/bark/model_fine.py b/TTS/tts/layers/bark/model_fine.py index 8a426107..09e5f476 100644 --- a/TTS/tts/layers/bark/model_fine.py +++ b/TTS/tts/layers/bark/model_fine.py @@ -6,7 +6,7 @@ import math from dataclasses import dataclass import torch -import torch.nn as nn +from torch import nn from torch.nn import functional as F from .model import GPT, MLP, GPTConfig diff --git a/TTS/utils/synthesizer.py b/TTS/utils/synthesizer.py index 4f7761b9..bbaf2904 100644 --- a/TTS/utils/synthesizer.py +++ b/TTS/utils/synthesizer.py @@ -341,7 +341,7 @@ class Synthesizer(object): use_gl = self.vocoder_model is None - if not reference_wav: + if not reference_wav: # not voice conversion for sen in sens: if hasattr(self.tts_model, "synthesize"): sp_name = "random" if speaker_name is None else speaker_name diff --git a/docs/source/models/tortoise.md b/docs/source/models/tortoise.md index c49a0fcb..d602d597 100644 
--- a/docs/source/models/tortoise.md +++ b/docs/source/models/tortoise.md @@ -1,7 +1,7 @@ # Tortoise 🐢 Tortoise is a very expressive TTS system with impressive voice cloning capabilities. It is based on an GPT like autogressive acoustic model that converts input text to discritized acouistic tokens, a diffusion model that converts these tokens to melspeectrogram frames and a Univnet vocoder to convert the spectrograms to -the final audio signal. The important downside is that Tortoise is very slow compared to the parallel TTS models like VITS. +the final audio signal. The important downside is that Tortoise is very slow compared to the parallel TTS models like VITS. Big thanks to 👑[@manmay-nakhashi](https://github.com/manmay-nakhashi) who helped us implement Tortoise in 🐸TTS. @@ -12,7 +12,7 @@ from TTS.tts.configs.tortoise_config import TortoiseConfig from TTS.tts.models.tortoise import Tortoise config = TortoiseConfig() -model = Tortoise.inif_from_config(config) +model = Tortoise.init_from_config(config) model.load_checkpoint(config, checkpoint_dir="paths/to/models_dir/", eval=True) # with random speaker @@ -29,23 +29,23 @@ from TTS.api import TTS tts = TTS("tts_models/en/multi-dataset/tortoise-v2") # cloning `lj` voice from `TTS/tts/utils/assets/tortoise/voices/lj` -# with custom inference settings overriding defaults. -tts.tts_to_file(text="Hello, my name is Manmay , how are you?", +# with custom inference settings overriding defaults. +tts.tts_to_file(text="Hello, my name is Manmay , how are you?", file_path="output.wav", - voice_dir="TTS/tts/utils/assets/tortoise/voices/", + voice_dir="path/to/tortoise/voices/dir/", speaker="lj", num_autoregressive_samples=1, diffusion_iterations=10) # Using presets with the same voice -tts.tts_to_file(text="Hello, my name is Manmay , how are you?", +tts.tts_to_file(text="Hello, my name is Manmay , how are you?", file_path="output.wav", - voice_dir="TTS/tts/utils/assets/tortoise/voices/", + voice_dir="path/to/tortoise/voices/dir/", speaker="lj", preset="ultra_fast") # Random voice generation -tts.tts_to_file(text="Hello, my name is Manmay , how are you?", +tts.tts_to_file(text="Hello, my name is Manmay , how are you?", file_path="output.wav") ``` @@ -54,16 +54,16 @@ Using 🐸TTS Command line: ```console # cloning the `lj` voice tts --model_name tts_models/en/multi-dataset/tortoise-v2 \ ---text "This is an example." \ ---out_path "/data/speech_synth/coqui-tts/TTS/tests/outputs/output.wav" \ ---voice_dir TTS/tts/utils/assets/tortoise/voices/ \ +--text "This is an example." \ +--out_path "output.wav" \ +--voice_dir path/to/tortoise/voices/dir/ \ --speaker_idx "lj" \ --progress_bar True # Random voice generation tts --model_name tts_models/en/multi-dataset/tortoise-v2 \ --text "This is an example." \ ---out_path "/data/speech_synth/coqui-tts/TTS/tests/outputs/output.wav" \ +--out_path "output.wav" \ --progress_bar True ```
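
For illustration, a minimal sketch of how the reworked Bark voice loading in `TTS/tts/layers/bark/inference_funcs.py` is meant to be used after this change. The directory name `my_bark_voices` and the speaker name `speaker1` are assumptions made for the example; the layout follows the new `get_voices`, which expects one subdirectory per voice inside each extra voice dir, holding either a cached `.npz` prompt or a single `.wav`/`.mp3` sample.

```python
# Sketch only: the paths and names below are hypothetical.
# Assumed layout:
#   my_bark_voices/
#   └── speaker1/
#       └── speaker1.npz   # cached (semantic, coarse, fine) prompt arrays
from TTS.tts.layers.bark.inference_funcs import get_voices, load_voice

extra_voice_dirs = ["my_bark_voices"]

# Maps each voice subdirectory to its prompt files,
# e.g. {"speaker1": ["my_bark_voices/speaker1/speaker1.npz"]}
voices = get_voices(extra_voice_dirs)

# With a cached .npz the model argument is never touched, so None is enough here;
# a loaded Bark model is only needed when the voice has just a .wav/.mp3 sample and
# the .npz prompt must first be built via generate_voice().
semantic, coarse, fine = load_voice(None, "speaker1", extra_voice_dirs)
```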