Fix here and there

pull/2685/head
Eren Gölge 2023-06-21 11:59:27 +02:00
parent 03c347b7f3
commit 0f8932a6a9
8 changed files with 138 additions and 160 deletions

View File

@ -17,6 +17,7 @@ class HubertManager:
urllib.request.urlretrieve(download_url, model_path)
print("Downloaded HuBERT")
return model_path
return None
@staticmethod
def make_sure_tokenizer_installed(
@ -31,3 +32,4 @@ class HubertManager:
shutil.move(os.path.join(model_dir, model), model_path)
print("Downloaded tokenizer")
return model_path
return None

View File

@ -16,7 +16,7 @@ from torch.serialization import MAP_LOCATION
class HubertTokenizer(nn.Module):
def __init__(self, hidden_size=1024, input_size=768, output_size=10000, version=0):
super(HubertTokenizer, self).__init__()
super().__init__()
next_size = input_size
if version == 0:
self.lstm = nn.LSTM(input_size, hidden_size, 2, batch_first=True)
@ -181,7 +181,7 @@ def auto_train(data_path, save_path="model.pth", load_model: str = None, save_ep
epoch = 1
while 1:
for i in range(save_epochs):
for _ in range(save_epochs):
j = 0
for x, y in zip(data_x, data_y):
model_training.train_step(

View File

@ -16,7 +16,7 @@ from torch.nn import functional as F
from TTS.tts.layers.bark.hubert.hubert_manager import HubertManager
from TTS.tts.layers.bark.hubert.kmeans_hubert import CustomHubert
from TTS.tts.layers.bark.hubert.tokenizer import HubertTokenizer
from TTS.tts.layers.bark.load_model import _clear_cuda_cache, _inference_mode
from TTS.tts.layers.bark.load_model import clear_cuda_cache, inference_mode
logger = logging.getLogger(__name__)
@ -34,34 +34,53 @@ def _normalize_whitespace(text):
def get_voices(extra_voice_dirs: List[str] = []):
voices = {}
for dir in extra_voice_dirs:
paths = list(glob(f"{dir}/*.npz"))
for path in paths:
name = os.path.basename(path).replace(".npz", "")
voices[name] = path
dirs = extra_voice_dirs
voices: Dict[str, List[str]] = {}
for d in dirs:
subs = os.listdir(d)
for sub in subs:
subj = os.path.join(d, sub)
if os.path.isdir(subj):
voices[sub] = list(glob(f"{subj}/*.npz"))
# fetch audio files if no npz files are found
if len(voices[sub]) == 0:
voices[sub] = list(glob(f"{subj}/*.wav")) + list(glob(f"{subj}/*.mp3"))
return voices
def load_voice(voice: str, extra_voice_dirs: List[str] = []):
def load_npz(npz_file):
def load_npz(npz_file):
x_history = np.load(npz_file)
semantic = x_history["semantic_prompt"]
coarse = x_history["coarse_prompt"]
fine = x_history["fine_prompt"]
return semantic, coarse, fine
def load_voice(model, voice: str, extra_voice_dirs: List[str] = []): # pylint: disable=dangerous-default-value
if voice == "random":
return None, None, None
voices = get_voices(extra_voice_dirs)
paths = voices[voice]
# bark only uses a single sample for cloning
if len(paths) > 1:
raise ValueError(f"Voice {voice} has multiple paths: {paths}")
try:
path = voices[voice]
except KeyError:
raise KeyError(f"Voice {voice} not found in {extra_voice_dirs}")
prompt = load_npz(path)
return prompt
except KeyError as e:
raise KeyError(f"Voice {voice} not found in {extra_voice_dirs}") from e
if len(paths) == 1 and paths[0].endswith(".npz"):
return load_npz(path[0])
else:
audio_path = paths[0]
# replace the file extension with .npz
output_path = os.path.splitext(audio_path)[0] + ".npz"
generate_voice(audio=audio_path, model=model, output_path=output_path)
return load_voice(model, voice, extra_voice_dirs)
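For reference, a minimal standalone sketch of the per-voice directory lookup introduced above: each voice is a subdirectory holding either a cached `.npz` prompt or raw `.wav`/`.mp3` samples to build one from. The function name `list_bark_voices` and the example layout are illustrative only, not part of the library.

```python
import os
from glob import glob
from typing import Dict, List

def list_bark_voices(voice_dirs: List[str]) -> Dict[str, List[str]]:
    """Mirror of the reworked get_voices: one subdirectory per voice,
    preferring a cached .npz prompt, else falling back to audio samples."""
    voices: Dict[str, List[str]] = {}
    for d in voice_dirs:
        for sub in os.listdir(d):
            subj = os.path.join(d, sub)
            if not os.path.isdir(subj):
                continue
            files = list(glob(f"{subj}/*.npz"))
            if not files:  # no cached prompt yet, use the raw audio instead
                files = list(glob(f"{subj}/*.wav")) + list(glob(f"{subj}/*.mp3"))
            voices[sub] = files
    return voices

# Example layout (hypothetical):
#   my_voices/
#   └── lj/
#       └── sample.wav   # load_voice() converts this to sample.npz on first use
```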
def zero_crossing_rate(audio, frame_length=1024, hop_length=512):
zero_crossings = np.sum(np.abs(np.diff(np.sign(audio))) / 2)
@ -85,7 +104,6 @@ def compute_average_bass_energy(audio_data, sample_rate, max_bass_freq=250):
def generate_voice(
audio,
text,
model,
output_path,
):
@ -106,9 +124,6 @@ def generate_voice(
encoded_frames = model.encodec.encode(audio)
codes = torch.cat([encoded[0] for encoded in encoded_frames], dim=-1).squeeze() # [n_q, T]
# get seconds of audio
seconds = audio.shape[-1] / model.config.sample_rate
# move codes to cpu
codes = codes.cpu().numpy()
@ -133,36 +148,6 @@ def generate_voice(
np.savez(output_path, fine_prompt=codes, coarse_prompt=codes[:2, :], semantic_prompt=semantic_tokens)
# while attempts < max_attempts:
# if attempts > 0 and base is not None:
# # Reset the base model token
# print(f"Reset the base model token Regenerating...")
# base = None
# audio_array, x = model.generate_audio(text, history_promp=None, base=base, **kwargs)
# zcr = zero_crossing_rate(audio_array)
# spectral_contrast = compute_spectral_contrast(audio_array, model.config.sample_rate)
# bass_energy = compute_average_bass_energy(audio_array, model.config.sample_rate)
# print(f"Attempt {attempts + 1}: ZCR = {zcr}, Spectral Contrast = {spectral_contrast:.2f}, Bass Energy = {bass_energy:.2f}")
# # Save the audio array to the output_array directory with a random name for debugging
# #output_file = os.path.join(output_directory, f"audio_{zcr:.2f}_sc{spectral_contrast:.2f}_be{bass_energy:.2f}.wav")
# #wavfile.write(output_file, sample_rate, audio_array)
# #print(f"Saved audio array to {output_file}")
# if zcr < zcr_threshold and spectral_contrast < spectral_threshold and bass_energy < bass_energy_threshold:
# print(f"Audio passed ZCR, Spectral Contrast, and Bass Energy thresholds. No need to regenerate.")
# break
# else:
# print(f"Audio failed ZCR, Spectral Contrast, and/or Bass Energy thresholds. Regenerating...")
# attempts += 1
# if attempts == max_attempts:
# print("Reached maximum attempts. Returning the last generated audio.")
# return audio_array, x, zcr, spectral_contrast, bass_energy
def generate_text_semantic(
text,
@ -224,7 +209,7 @@ def generate_text_semantic(
np.hstack([encoded_text, semantic_history, np.array([model.config.SEMANTIC_INFER_TOKEN])]).astype(np.int64)
)[None]
assert x.shape[1] == 256 + 256 + 1
with _inference_mode():
with inference_mode():
x = x.to(model.device)
n_tot_steps = 768
# custom tqdm updates since we don't know when eos will occur
@ -285,8 +270,8 @@ def generate_text_semantic(
pbar_state = req_pbar_state
pbar.close()
out = x.detach().cpu().numpy().squeeze()[256 + 256 + 1 :]
assert all(0 <= out) and all(out < model.config.SEMANTIC_VOCAB_SIZE)
_clear_cuda_cache()
assert all(out >= 0) and all(out < model.config.SEMANTIC_VOCAB_SIZE)
clear_cuda_cache()
return out
@ -382,7 +367,7 @@ def generate_coarse(
x_semantic = np.hstack([x_semantic_history, x_semantic]).astype(np.int32)
x_coarse = x_coarse_history.astype(np.int32)
base_semantic_idx = len(x_semantic_history)
with _inference_mode():
with inference_mode():
x_semantic_in = torch.from_numpy(x_semantic)[None].to(model.device)
x_coarse_in = torch.from_numpy(x_coarse)[None].to(model.device)
n_window_steps = int(np.ceil(n_steps / sliding_window_len))
@ -456,7 +441,7 @@ def generate_coarse(
)
for n in range(1, model.config.N_COARSE_CODEBOOKS):
gen_coarse_audio_arr[n, :] -= n * model.config.CODEBOOK_SIZE
_clear_cuda_cache()
clear_cuda_cache()
return gen_coarse_audio_arr
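The loop above removes the per-codebook offset that keeps token ids from different codebooks disjoint while they are generated as one flat sequence. A small self-contained sketch of that round trip; `CODEBOOK_SIZE = 1024` and two coarse codebooks are assumptions here, the model reads both from its config:

```python
import numpy as np

CODEBOOK_SIZE = 1024        # assumed; the diff reads model.config.CODEBOOK_SIZE
N_COARSE_CODEBOOKS = 2      # assumed; the diff reads model.config.N_COARSE_CODEBOOKS

# per-codebook tokens, shape [n_codebooks, T]
codes = np.random.randint(0, CODEBOOK_SIZE, size=(N_COARSE_CODEBOOKS, 8))

# flatten: offset codebook n by n * CODEBOOK_SIZE so ids from different codebooks never collide
flat = (codes + np.arange(N_COARSE_CODEBOOKS)[:, None] * CODEBOOK_SIZE).T.ravel()

# unflatten: reshape back to [n_codebooks, T] and subtract the offset again
recovered = flat.reshape(-1, N_COARSE_CODEBOOKS).T.copy()
for n in range(1, N_COARSE_CODEBOOKS):
    recovered[n, :] -= n * CODEBOOK_SIZE

assert np.array_equal(codes, recovered)
```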
@ -526,7 +511,7 @@ def generate_fine(
)
# we can be lazy about fractional loop and just keep overwriting codebooks
n_loops = np.max([0, int(np.ceil((x_coarse_gen.shape[1] - (1024 - n_history)) / 512))]) + 1
with _inference_mode():
with inference_mode():
in_arr = torch.tensor(in_arr.T).to(model.device)
for n in tqdm.tqdm(range(n_loops), disable=silent):
start_idx = np.min([n * 512, in_arr.shape[0] - 1024])
@ -558,14 +543,12 @@ def generate_fine(
if n_remove_from_end > 0:
gen_fine_arr = gen_fine_arr[:, :-n_remove_from_end]
assert gen_fine_arr.shape[-1] == x_coarse_gen.shape[-1]
_clear_cuda_cache()
clear_cuda_cache()
return gen_fine_arr
def codec_decode(fine_tokens, model):
"""Turn quantized audio codes into audio array using encodec."""
from TTS.utils.audio.numpy_transforms import save_wav
arr = torch.from_numpy(fine_tokens)[None]
arr = arr.to(model.device)
arr = arr.transpose(0, 1)

View File

@ -1,17 +1,12 @@
import contextlib
# import funcy
import functools
import hashlib
import logging
import os
import re
import requests
import torch
import tqdm
from encodec import EncodecModel
from transformers import BertTokenizer
from TTS.tts.layers.bark.model import GPT, GPTConfig
from TTS.tts.layers.bark.model_fine import FineGPT, FineGPTConfig
@ -31,8 +26,6 @@ else:
# hold models in global scope to lazy load
global models
models = {}
logger = logging.getLogger(__name__)
@ -44,10 +37,10 @@ if not hasattr(torch.nn.functional, "scaled_dot_product_attention"):
)
def _string_md5(s):
m = hashlib.md5()
m.update(s.encode("utf-8"))
return m.hexdigest()
# def _string_md5(s):
# m = hashlib.md5()
# m.update(s.encode("utf-8"))
# return m.hexdigest()
def _md5(fname):
@ -58,18 +51,18 @@ def _md5(fname):
return hash_md5.hexdigest()
def _get_ckpt_path(model_type, CACHE_DIR):
model_name = _string_md5(REMOTE_MODEL_PATHS[model_type]["path"])
return os.path.join(CACHE_DIR, f"{model_name}.pt")
# def _get_ckpt_path(model_type, CACHE_DIR):
# model_name = _string_md5(REMOTE_MODEL_PATHS[model_type]["path"])
# return os.path.join(CACHE_DIR, f"{model_name}.pt")
S3_BUCKET_PATH_RE = r"s3\:\/\/(.+?)\/"
# S3_BUCKET_PATH_RE = r"s3\:\/\/(.+?)\/"
def _parse_s3_filepath(s3_filepath):
bucket_name = re.search(S3_BUCKET_PATH_RE, s3_filepath).group(1)
rel_s3_filepath = re.sub(S3_BUCKET_PATH_RE, "", s3_filepath)
return bucket_name, rel_s3_filepath
# def _parse_s3_filepath(s3_filepath):
# bucket_name = re.search(S3_BUCKET_PATH_RE, s3_filepath).group(1)
# rel_s3_filepath = re.sub(S3_BUCKET_PATH_RE, "", s3_filepath)
# return bucket_name, rel_s3_filepath
def _download(from_s3_path, to_local_path, CACHE_DIR):
@ -83,7 +76,7 @@ def _download(from_s3_path, to_local_path, CACHE_DIR):
progress_bar.update(len(data))
file.write(data)
progress_bar.close()
if total_size_in_bytes != 0 and progress_bar.n != total_size_in_bytes:
if total_size_in_bytes not in [0, progress_bar.n]:
raise ValueError("ERROR, something went wrong")
@ -107,27 +100,27 @@ if torch.cuda.is_available():
@contextlib.contextmanager
def _inference_mode():
def inference_mode():
with InferenceContext(), torch.inference_mode(), torch.no_grad(), autocast():
yield
def _clear_cuda_cache():
def clear_cuda_cache():
if torch.cuda.is_available():
torch.cuda.empty_cache()
torch.cuda.synchronize()
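A minimal usage sketch of the two renamed helpers, using the import path shown in the inference file earlier in this diff; `sample_logits` and `model` are hypothetical stand-ins for the callers shown above:

```python
import torch
from TTS.tts.layers.bark.load_model import clear_cuda_cache, inference_mode

def sample_logits(model, x: torch.Tensor) -> torch.Tensor:
    # run the forward pass without autograd (and with autocast when CUDA is available)
    with inference_mode():
        x = x.to(model.device)
        logits = model(x)
    # release cached CUDA blocks once the intermediate tensors are out of scope
    clear_cuda_cache()
    return logits.cpu()
```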
def clean_models(model_key=None):
global models
model_keys = [model_key] if model_key is not None else models.keys()
for k in model_keys:
if k in models:
del models[k]
_clear_cuda_cache()
# def clean_models(model_key=None):
# global models
# model_keys = [model_key] if model_key is not None else models.keys()
# for k in model_keys:
# if k in models:
# del models[k]
# clear_cuda_cache()
def _load_model(ckpt_path, device, config, model_type="text"):
def load_model(ckpt_path, device, config, model_type="text"):
logger.info(f"loading {model_type} model from {ckpt_path}...")
if device == "cpu":
@ -174,13 +167,13 @@ def _load_model(ckpt_path, device, config, model_type="text"):
state_dict = checkpoint["model"]
# fixup checkpoint
unwanted_prefix = "_orig_mod."
for k, v in list(state_dict.items()):
for k, _ in list(state_dict.items()):
if k.startswith(unwanted_prefix):
state_dict[k[len(unwanted_prefix) :]] = state_dict.pop(k)
extra_keys = set(state_dict.keys()) - set(model.state_dict().keys())
extra_keys = set([k for k in extra_keys if not k.endswith(".attn.bias")])
extra_keys = set(k for k in extra_keys if not k.endswith(".attn.bias"))
missing_keys = set(model.state_dict().keys()) - set(state_dict.keys())
missing_keys = set([k for k in missing_keys if not k.endswith(".attn.bias")])
missing_keys = set(k for k in missing_keys if not k.endswith(".attn.bias"))
if len(extra_keys) != 0:
raise ValueError(f"extra keys found: {extra_keys}")
if len(missing_keys) != 0:
@ -192,63 +185,63 @@ def _load_model(ckpt_path, device, config, model_type="text"):
model.eval()
model.to(device)
del checkpoint, state_dict
_clear_cuda_cache()
clear_cuda_cache()
return model, config
def _load_codec_model(device):
model = EncodecModel.encodec_model_24khz()
model.set_target_bandwidth(6.0)
model.eval()
model.to(device)
_clear_cuda_cache()
return model
# def _load_codec_model(device):
# model = EncodecModel.encodec_model_24khz()
# model.set_target_bandwidth(6.0)
# model.eval()
# model.to(device)
# clear_cuda_cache()
# return model
def load_model(ckpt_path=None, use_gpu=True, force_reload=False, model_type="text"):
_load_model_f = functools.partial(_load_model, model_type=model_type)
if model_type not in ("text", "coarse", "fine"):
raise NotImplementedError()
global models
if torch.cuda.device_count() == 0 or not use_gpu:
device = "cpu"
else:
device = "cuda"
model_key = str(device) + f"__{model_type}"
if model_key not in models or force_reload:
if ckpt_path is None:
ckpt_path = _get_ckpt_path(model_type)
clean_models(model_key=model_key)
model = _load_model_f(ckpt_path, device)
models[model_key] = model
return models[model_key]
# def load_model(ckpt_path=None, use_gpu=True, force_reload=False, model_type="text"):
# _load_model_f = functools.partial(_load_model, model_type=model_type)
# if model_type not in ("text", "coarse", "fine"):
# raise NotImplementedError()
# global models
# if torch.cuda.device_count() == 0 or not use_gpu:
# device = "cpu"
# else:
# device = "cuda"
# model_key = str(device) + f"__{model_type}"
# if model_key not in models or force_reload:
# if ckpt_path is None:
# ckpt_path = _get_ckpt_path(model_type)
# clean_models(model_key=model_key)
# model = _load_model_f(ckpt_path, device)
# models[model_key] = model
# return models[model_key]
def load_codec_model(use_gpu=True, force_reload=False):
global models
if torch.cuda.device_count() == 0 or not use_gpu:
device = "cpu"
else:
device = "cuda"
model_key = str(device) + f"__codec"
if model_key not in models or force_reload:
clean_models(model_key=model_key)
model = _load_codec_model(device)
models[model_key] = model
return models[model_key]
# def load_codec_model(use_gpu=True, force_reload=False):
# global models
# if torch.cuda.device_count() == 0 or not use_gpu:
# device = "cpu"
# else:
# device = "cuda"
# model_key = str(device) + f"__codec"
# if model_key not in models or force_reload:
# clean_models(model_key=model_key)
# model = _load_codec_model(device)
# models[model_key] = model
# return models[model_key]
def preload_models(
text_ckpt_path=None, coarse_ckpt_path=None, fine_ckpt_path=None, use_gpu=True, use_smaller_models=False
):
global USE_SMALLER_MODELS
global REMOTE_MODEL_PATHS
if use_smaller_models:
USE_SMALLER_MODELS = True
logger.info("Using smaller models generation.py")
REMOTE_MODEL_PATHS = SMALL_REMOTE_MODEL_PATHS
# def preload_models(
# text_ckpt_path=None, coarse_ckpt_path=None, fine_ckpt_path=None, use_gpu=True, use_smaller_models=False
# ):
# global USE_SMALLER_MODELS
# global REMOTE_MODEL_PATHS
# if use_smaller_models:
# USE_SMALLER_MODELS = True
# logger.info("Using smaller models generation.py")
# REMOTE_MODEL_PATHS = SMALL_REMOTE_MODEL_PATHS
_ = load_model(ckpt_path=text_ckpt_path, model_type="text", use_gpu=use_gpu, force_reload=True)
_ = load_model(ckpt_path=coarse_ckpt_path, model_type="coarse", use_gpu=use_gpu, force_reload=True)
_ = load_model(ckpt_path=fine_ckpt_path, model_type="fine", use_gpu=use_gpu, force_reload=True)
_ = load_codec_model(use_gpu=use_gpu, force_reload=True)
# _ = load_model(ckpt_path=text_ckpt_path, model_type="text", use_gpu=use_gpu, force_reload=True)
# _ = load_model(ckpt_path=coarse_ckpt_path, model_type="coarse", use_gpu=use_gpu, force_reload=True)
# _ = load_model(ckpt_path=fine_ckpt_path, model_type="fine", use_gpu=use_gpu, force_reload=True)
# _ = load_codec_model(use_gpu=use_gpu, force_reload=True)

View File

@ -6,8 +6,8 @@ import math
from dataclasses import dataclass
import torch
import torch.nn as nn
from coqpit import Coqpit
from torch import nn
from torch.nn import functional as F
@ -19,8 +19,8 @@ class LayerNorm(nn.Module):
self.weight = nn.Parameter(torch.ones(ndim))
self.bias = nn.Parameter(torch.zeros(ndim)) if bias else None
def forward(self, input):
return F.layer_norm(input, self.weight.shape, self.weight, self.bias, 1e-5)
def forward(self, x):
return F.layer_norm(x, self.weight.shape, self.weight, self.bias, 1e-5)
class CausalSelfAttention(nn.Module):
@ -177,7 +177,7 @@ class GPT(nn.Module):
def forward(self, idx, merge_context=False, past_kv=None, position_ids=None, use_cache=False):
device = idx.device
b, t = idx.size()
_, t = idx.size()
if past_kv is not None:
assert t == 1
tok_emb = self.transformer.wte(idx) # token embeddings of shape (b, t, n_embd)
@ -219,7 +219,7 @@ class GPT(nn.Module):
new_kv = () if use_cache else None
for i, (block, past_layer_kv) in enumerate(zip(self.transformer.h, past_kv)):
for _, (block, past_layer_kv) in enumerate(zip(self.transformer.h, past_kv)):
x, kv = block(x, past_kv=past_layer_kv, use_cache=use_cache)
if use_cache:

View File

@ -6,7 +6,7 @@ import math
from dataclasses import dataclass
import torch
import torch.nn as nn
from torch import nn
from torch.nn import functional as F
from .model import GPT, MLP, GPTConfig

View File

@ -341,7 +341,7 @@ class Synthesizer(object):
use_gl = self.vocoder_model is None
if not reference_wav:
if not reference_wav: # not voice conversion
for sen in sens:
if hasattr(self.tts_model, "synthesize"):
sp_name = "random" if speaker_name is None else speaker_name

View File

@ -1,7 +1,7 @@
# Tortoise 🐢
Tortoise is a very expressive TTS system with impressive voice cloning capabilities. It is based on a GPT-like autoregressive acoustic model that converts input
text to discretized acoustic tokens, a diffusion model that converts these tokens to mel-spectrogram frames, and a UnivNet vocoder to convert the spectrograms to
the final audio signal. The important downside is that Tortoise is very slow compared to the parallel TTS models like VITS.
the final audio signal. The important downside is that Tortoise is very slow compared to the parallel TTS models like VITS.
Big thanks to 👑[@manmay-nakhashi](https://github.com/manmay-nakhashi) who helped us implement Tortoise in 🐸TTS.
@ -12,7 +12,7 @@ from TTS.tts.configs.tortoise_config import TortoiseConfig
from TTS.tts.models.tortoise import Tortoise
config = TortoiseConfig()
model = Tortoise.inif_from_config(config)
model = Tortoise.init_from_config(config)
model.load_checkpoint(config, checkpoint_dir="paths/to/models_dir/", eval=True)
# with random speaker
@ -29,23 +29,23 @@ from TTS.api import TTS
tts = TTS("tts_models/en/multi-dataset/tortoise-v2")
# cloning `lj` voice from `TTS/tts/utils/assets/tortoise/voices/lj`
# with custom inference settings overriding defaults.
tts.tts_to_file(text="Hello, my name is Manmay , how are you?",
# with custom inference settings overriding defaults.
tts.tts_to_file(text="Hello, my name is Manmay , how are you?",
file_path="output.wav",
voice_dir="TTS/tts/utils/assets/tortoise/voices/",
voice_dir="path/to/tortoise/voices/dir/",
speaker="lj",
num_autoregressive_samples=1,
diffusion_iterations=10)
# Using presets with the same voice
tts.tts_to_file(text="Hello, my name is Manmay , how are you?",
tts.tts_to_file(text="Hello, my name is Manmay , how are you?",
file_path="output.wav",
voice_dir="TTS/tts/utils/assets/tortoise/voices/",
voice_dir="path/to/tortoise/voices/dir/",
speaker="lj",
preset="ultra_fast")
# Random voice generation
tts.tts_to_file(text="Hello, my name is Manmay , how are you?",
tts.tts_to_file(text="Hello, my name is Manmay , how are you?",
file_path="output.wav")
```
@ -54,16 +54,16 @@ Using 🐸TTS Command line:
```console
# cloning the `lj` voice
tts --model_name tts_models/en/multi-dataset/tortoise-v2 \
--text "This is an example." \
--out_path "/data/speech_synth/coqui-tts/TTS/tests/outputs/output.wav" \
--voice_dir TTS/tts/utils/assets/tortoise/voices/ \
--text "This is an example." \
--out_path "output.wav" \
--voice_dir path/to/tortoise/voices/dir/ \
--speaker_idx "lj" \
--progress_bar True
# Random voice generation
tts --model_name tts_models/en/multi-dataset/tortoise-v2 \
--text "This is an example." \
--out_path "/data/speech_synth/coqui-tts/TTS/tests/outputs/output.wav" \
--out_path "output.wav" \
--progress_bar True
```