From 0f8932a6a9a1d71c7429c8e07d3c37f1f9a2da25 Mon Sep 17 00:00:00 2001 From: Eren G??lge Date: Wed, 21 Jun 2023 11:59:27 +0200 Subject: [PATCH] Fix here and ther --- TTS/tts/layers/bark/hubert/hubert_manager.py | 2 + TTS/tts/layers/bark/hubert/tokenizer.py | 4 +- TTS/tts/layers/bark/inference_funcs.py | 95 +++++------ TTS/tts/layers/bark/load_model.py | 159 +++++++++---------- TTS/tts/layers/bark/model.py | 10 +- TTS/tts/layers/bark/model_fine.py | 2 +- TTS/utils/synthesizer.py | 2 +- docs/source/models/tortoise.md | 24 +-- 8 files changed, 138 insertions(+), 160 deletions(-) diff --git a/TTS/tts/layers/bark/hubert/hubert_manager.py b/TTS/tts/layers/bark/hubert/hubert_manager.py index baa26438..4bc19929 100644 --- a/TTS/tts/layers/bark/hubert/hubert_manager.py +++ b/TTS/tts/layers/bark/hubert/hubert_manager.py @@ -17,6 +17,7 @@ class HubertManager: urllib.request.urlretrieve(download_url, model_path) print("Downloaded HuBERT") return model_path + return None @staticmethod def make_sure_tokenizer_installed( @@ -31,3 +32,4 @@ class HubertManager: shutil.move(os.path.join(model_dir, model), model_path) print("Downloaded tokenizer") return model_path + return None diff --git a/TTS/tts/layers/bark/hubert/tokenizer.py b/TTS/tts/layers/bark/hubert/tokenizer.py index 474a08db..be9a50f8 100644 --- a/TTS/tts/layers/bark/hubert/tokenizer.py +++ b/TTS/tts/layers/bark/hubert/tokenizer.py @@ -16,7 +16,7 @@ from torch.serialization import MAP_LOCATION class HubertTokenizer(nn.Module): def __init__(self, hidden_size=1024, input_size=768, output_size=10000, version=0): - super(HubertTokenizer, self).__init__() + super().__init__() next_size = input_size if version == 0: self.lstm = nn.LSTM(input_size, hidden_size, 2, batch_first=True) @@ -181,7 +181,7 @@ def auto_train(data_path, save_path="model.pth", load_model: str = None, save_ep epoch = 1 while 1: - for i in range(save_epochs): + for _ in range(save_epochs): j = 0 for x, y in zip(data_x, data_y): model_training.train_step( diff --git a/TTS/tts/layers/bark/inference_funcs.py b/TTS/tts/layers/bark/inference_funcs.py index 6fa87c37..2b27246d 100644 --- a/TTS/tts/layers/bark/inference_funcs.py +++ b/TTS/tts/layers/bark/inference_funcs.py @@ -16,7 +16,7 @@ from torch.nn import functional as F from TTS.tts.layers.bark.hubert.hubert_manager import HubertManager from TTS.tts.layers.bark.hubert.kmeans_hubert import CustomHubert from TTS.tts.layers.bark.hubert.tokenizer import HubertTokenizer -from TTS.tts.layers.bark.load_model import _clear_cuda_cache, _inference_mode +from TTS.tts.layers.bark.load_model import clear_cuda_cache, inference_mode logger = logging.getLogger(__name__) @@ -34,34 +34,53 @@ def _normalize_whitespace(text): def get_voices(extra_voice_dirs: List[str] = []): - voices = {} - for dir in extra_voice_dirs: - paths = list(glob(f"{dir}/*.npz")) - for path in paths: - name = os.path.basename(path).replace(".npz", "") - voices[name] = path + dirs = extra_voice_dirs + voices: Dict[str, List[str]] = {} + for d in dirs: + subs = os.listdir(d) + for sub in subs: + subj = os.path.join(d, sub) + if os.path.isdir(subj): + voices[sub] = list(glob(f"{subj}/*.npz")) + # fetch audio files if no npz files are found + if len(voices[sub]) == 0: + voices[sub] = list(glob(f"{subj}/*.wav")) + list(glob(f"{subj}/*.mp3")) return voices -def load_voice(voice: str, extra_voice_dirs: List[str] = []): - def load_npz(npz_file): +def load_npz(npz_file): x_history = np.load(npz_file) semantic = x_history["semantic_prompt"] coarse = x_history["coarse_prompt"] fine = 
x_history["fine_prompt"] return semantic, coarse, fine + +def load_voice(model, voice: str, extra_voice_dirs: List[str] = []): # pylint: disable=dangerous-default-value if voice == "random": return None, None, None voices = get_voices(extra_voice_dirs) + paths = voices[voice] + + # bark only uses a single sample for cloning + if len(paths) > 1: + raise ValueError(f"Voice {voice} has multiple paths: {paths}") + try: path = voices[voice] - except KeyError: - raise KeyError(f"Voice {voice} not found in {extra_voice_dirs}") - prompt = load_npz(path) - return prompt + except KeyError as e: + raise KeyError(f"Voice {voice} not found in {extra_voice_dirs}") from e + if len(paths) == 1 and paths[0].endswith(".npz"): + return load_npz(path[0]) + else: + audio_path = paths[0] + # replace the file extension with .npz + output_path = os.path.splitext(audio_path)[0] + ".npz" + generate_voice(audio=audio_path, model=model, output_path=output_path) + breakpoint() + return load_voice(model, voice, extra_voice_dirs) def zero_crossing_rate(audio, frame_length=1024, hop_length=512): zero_crossings = np.sum(np.abs(np.diff(np.sign(audio))) / 2) @@ -85,7 +104,6 @@ def compute_average_bass_energy(audio_data, sample_rate, max_bass_freq=250): def generate_voice( audio, - text, model, output_path, ): @@ -106,9 +124,6 @@ def generate_voice( encoded_frames = model.encodec.encode(audio) codes = torch.cat([encoded[0] for encoded in encoded_frames], dim=-1).squeeze() # [n_q, T] - # get seconds of audio - seconds = audio.shape[-1] / model.config.sample_rate - # move codes to cpu codes = codes.cpu().numpy() @@ -133,36 +148,6 @@ def generate_voice( np.savez(output_path, fine_prompt=codes, coarse_prompt=codes[:2, :], semantic_prompt=semantic_tokens) - # while attempts < max_attempts: - # if attempts > 0 and base is not None: - # # Reset the base model token - # print(f"Reset the base model token Regenerating...") - # base = None - - # audio_array, x = model.generate_audio(text, history_promp=None, base=base, **kwargs) - # zcr = zero_crossing_rate(audio_array) - # spectral_contrast = compute_spectral_contrast(audio_array, model.config.sample_rate) - # bass_energy = compute_average_bass_energy(audio_array, model.config.sample_rate) - # print(f"Attempt {attempts + 1}: ZCR = {zcr}, Spectral Contrast = {spectral_contrast:.2f}, Bass Energy = {bass_energy:.2f}") - - # # Save the audio array to the output_array directory with a random name for debugging - # #output_file = os.path.join(output_directory, f"audio_{zcr:.2f}_sc{spectral_contrast:.2f}_be{bass_energy:.2f}.wav") - # #wavfile.write(output_file, sample_rate, audio_array) - # #print(f"Saved audio array to {output_file}") - - # if zcr < zcr_threshold and spectral_contrast < spectral_threshold and bass_energy < bass_energy_threshold: - # print(f"Audio passed ZCR, Spectral Contrast, and Bass Energy thresholds. No need to regenerate.") - # break - # else: - # print(f"Audio failed ZCR, Spectral Contrast, and/or Bass Energy thresholds. Regenerating...") - - # attempts += 1 - - # if attempts == max_attempts: - # print("Reached maximum attempts. 
Returning the last generated audio.") - - # return audio_array, x, zcr, spectral_contrast, bass_energy - def generate_text_semantic( text, @@ -224,7 +209,7 @@ def generate_text_semantic( np.hstack([encoded_text, semantic_history, np.array([model.config.SEMANTIC_INFER_TOKEN])]).astype(np.int64) )[None] assert x.shape[1] == 256 + 256 + 1 - with _inference_mode(): + with inference_mode(): x = x.to(model.device) n_tot_steps = 768 # custom tqdm updates since we don't know when eos will occur @@ -285,8 +270,8 @@ def generate_text_semantic( pbar_state = req_pbar_state pbar.close() out = x.detach().cpu().numpy().squeeze()[256 + 256 + 1 :] - assert all(0 <= out) and all(out < model.config.SEMANTIC_VOCAB_SIZE) - _clear_cuda_cache() + assert all(out >= 0) and all(out < model.config.SEMANTIC_VOCAB_SIZE) + clear_cuda_cache() return out @@ -382,7 +367,7 @@ def generate_coarse( x_semantic = np.hstack([x_semantic_history, x_semantic]).astype(np.int32) x_coarse = x_coarse_history.astype(np.int32) base_semantic_idx = len(x_semantic_history) - with _inference_mode(): + with inference_mode(): x_semantic_in = torch.from_numpy(x_semantic)[None].to(model.device) x_coarse_in = torch.from_numpy(x_coarse)[None].to(model.device) n_window_steps = int(np.ceil(n_steps / sliding_window_len)) @@ -456,7 +441,7 @@ def generate_coarse( ) for n in range(1, model.config.N_COARSE_CODEBOOKS): gen_coarse_audio_arr[n, :] -= n * model.config.CODEBOOK_SIZE - _clear_cuda_cache() + clear_cuda_cache() return gen_coarse_audio_arr @@ -526,7 +511,7 @@ def generate_fine( ) # we can be lazy about fractional loop and just keep overwriting codebooks n_loops = np.max([0, int(np.ceil((x_coarse_gen.shape[1] - (1024 - n_history)) / 512))]) + 1 - with _inference_mode(): + with inference_mode(): in_arr = torch.tensor(in_arr.T).to(model.device) for n in tqdm.tqdm(range(n_loops), disable=silent): start_idx = np.min([n * 512, in_arr.shape[0] - 1024]) @@ -558,14 +543,12 @@ def generate_fine( if n_remove_from_end > 0: gen_fine_arr = gen_fine_arr[:, :-n_remove_from_end] assert gen_fine_arr.shape[-1] == x_coarse_gen.shape[-1] - _clear_cuda_cache() + clear_cuda_cache() return gen_fine_arr def codec_decode(fine_tokens, model): """Turn quantized audio codes into audio array using encodec.""" - from TTS.utils.audio.numpy_transforms import save_wav - arr = torch.from_numpy(fine_tokens)[None] arr = arr.to(model.device) arr = arr.transpose(0, 1) diff --git a/TTS/tts/layers/bark/load_model.py b/TTS/tts/layers/bark/load_model.py index dbd861d0..33144ed5 100644 --- a/TTS/tts/layers/bark/load_model.py +++ b/TTS/tts/layers/bark/load_model.py @@ -1,17 +1,12 @@ import contextlib - -# import funcy import functools import hashlib import logging import os -import re import requests import torch import tqdm -from encodec import EncodecModel -from transformers import BertTokenizer from TTS.tts.layers.bark.model import GPT, GPTConfig from TTS.tts.layers.bark.model_fine import FineGPT, FineGPTConfig @@ -31,8 +26,6 @@ else: # hold models in global scope to lazy load -global models -models = {} logger = logging.getLogger(__name__) @@ -44,10 +37,10 @@ if not hasattr(torch.nn.functional, "scaled_dot_product_attention"): ) -def _string_md5(s): - m = hashlib.md5() - m.update(s.encode("utf-8")) - return m.hexdigest() +# def _string_md5(s): +# m = hashlib.md5() +# m.update(s.encode("utf-8")) +# return m.hexdigest() def _md5(fname): @@ -58,18 +51,18 @@ def _md5(fname): return hash_md5.hexdigest() -def _get_ckpt_path(model_type, CACHE_DIR): - model_name = 
_string_md5(REMOTE_MODEL_PATHS[model_type]["path"]) - return os.path.join(CACHE_DIR, f"{model_name}.pt") +# def _get_ckpt_path(model_type, CACHE_DIR): +# model_name = _string_md5(REMOTE_MODEL_PATHS[model_type]["path"]) +# return os.path.join(CACHE_DIR, f"{model_name}.pt") -S3_BUCKET_PATH_RE = r"s3\:\/\/(.+?)\/" +# S3_BUCKET_PATH_RE = r"s3\:\/\/(.+?)\/" -def _parse_s3_filepath(s3_filepath): - bucket_name = re.search(S3_BUCKET_PATH_RE, s3_filepath).group(1) - rel_s3_filepath = re.sub(S3_BUCKET_PATH_RE, "", s3_filepath) - return bucket_name, rel_s3_filepath +# def _parse_s3_filepath(s3_filepath): +# bucket_name = re.search(S3_BUCKET_PATH_RE, s3_filepath).group(1) +# rel_s3_filepath = re.sub(S3_BUCKET_PATH_RE, "", s3_filepath) +# return bucket_name, rel_s3_filepath def _download(from_s3_path, to_local_path, CACHE_DIR): @@ -83,7 +76,7 @@ def _download(from_s3_path, to_local_path, CACHE_DIR): progress_bar.update(len(data)) file.write(data) progress_bar.close() - if total_size_in_bytes != 0 and progress_bar.n != total_size_in_bytes: + if total_size_in_bytes not in [0, progress_bar.n]: raise ValueError("ERROR, something went wrong") @@ -107,27 +100,27 @@ if torch.cuda.is_available(): @contextlib.contextmanager -def _inference_mode(): +def inference_mode(): with InferenceContext(), torch.inference_mode(), torch.no_grad(), autocast(): yield -def _clear_cuda_cache(): +def clear_cuda_cache(): if torch.cuda.is_available(): torch.cuda.empty_cache() torch.cuda.synchronize() -def clean_models(model_key=None): - global models - model_keys = [model_key] if model_key is not None else models.keys() - for k in model_keys: - if k in models: - del models[k] - _clear_cuda_cache() +# def clean_models(model_key=None): +# global models +# model_keys = [model_key] if model_key is not None else models.keys() +# for k in model_keys: +# if k in models: +# del models[k] +# clear_cuda_cache() -def _load_model(ckpt_path, device, config, model_type="text"): +def load_model(ckpt_path, device, config, model_type="text"): logger.info(f"loading {model_type} model from {ckpt_path}...") if device == "cpu": @@ -174,13 +167,13 @@ def _load_model(ckpt_path, device, config, model_type="text"): state_dict = checkpoint["model"] # fixup checkpoint unwanted_prefix = "_orig_mod." 
- for k, v in list(state_dict.items()): + for k, _ in list(state_dict.items()): if k.startswith(unwanted_prefix): state_dict[k[len(unwanted_prefix) :]] = state_dict.pop(k) extra_keys = set(state_dict.keys()) - set(model.state_dict().keys()) - extra_keys = set([k for k in extra_keys if not k.endswith(".attn.bias")]) + extra_keys = set(k for k in extra_keys if not k.endswith(".attn.bias")) missing_keys = set(model.state_dict().keys()) - set(state_dict.keys()) - missing_keys = set([k for k in missing_keys if not k.endswith(".attn.bias")]) + missing_keys = set(k for k in missing_keys if not k.endswith(".attn.bias")) if len(extra_keys) != 0: raise ValueError(f"extra keys found: {extra_keys}") if len(missing_keys) != 0: @@ -192,63 +185,63 @@ def _load_model(ckpt_path, device, config, model_type="text"): model.eval() model.to(device) del checkpoint, state_dict - _clear_cuda_cache() + clear_cuda_cache() return model, config -def _load_codec_model(device): - model = EncodecModel.encodec_model_24khz() - model.set_target_bandwidth(6.0) - model.eval() - model.to(device) - _clear_cuda_cache() - return model +# def _load_codec_model(device): +# model = EncodecModel.encodec_model_24khz() +# model.set_target_bandwidth(6.0) +# model.eval() +# model.to(device) +# clear_cuda_cache() +# return model -def load_model(ckpt_path=None, use_gpu=True, force_reload=False, model_type="text"): - _load_model_f = functools.partial(_load_model, model_type=model_type) - if model_type not in ("text", "coarse", "fine"): - raise NotImplementedError() - global models - if torch.cuda.device_count() == 0 or not use_gpu: - device = "cpu" - else: - device = "cuda" - model_key = str(device) + f"__{model_type}" - if model_key not in models or force_reload: - if ckpt_path is None: - ckpt_path = _get_ckpt_path(model_type) - clean_models(model_key=model_key) - model = _load_model_f(ckpt_path, device) - models[model_key] = model - return models[model_key] +# def load_model(ckpt_path=None, use_gpu=True, force_reload=False, model_type="text"): +# _load_model_f = functools.partial(_load_model, model_type=model_type) +# if model_type not in ("text", "coarse", "fine"): +# raise NotImplementedError() +# global models +# if torch.cuda.device_count() == 0 or not use_gpu: +# device = "cpu" +# else: +# device = "cuda" +# model_key = str(device) + f"__{model_type}" +# if model_key not in models or force_reload: +# if ckpt_path is None: +# ckpt_path = _get_ckpt_path(model_type) +# clean_models(model_key=model_key) +# model = _load_model_f(ckpt_path, device) +# models[model_key] = model +# return models[model_key] -def load_codec_model(use_gpu=True, force_reload=False): - global models - if torch.cuda.device_count() == 0 or not use_gpu: - device = "cpu" - else: - device = "cuda" - model_key = str(device) + f"__codec" - if model_key not in models or force_reload: - clean_models(model_key=model_key) - model = _load_codec_model(device) - models[model_key] = model - return models[model_key] +# def load_codec_model(use_gpu=True, force_reload=False): +# global models +# if torch.cuda.device_count() == 0 or not use_gpu: +# device = "cpu" +# else: +# device = "cuda" +# model_key = str(device) + f"__codec" +# if model_key not in models or force_reload: +# clean_models(model_key=model_key) +# model = _load_codec_model(device) +# models[model_key] = model +# return models[model_key] -def preload_models( - text_ckpt_path=None, coarse_ckpt_path=None, fine_ckpt_path=None, use_gpu=True, use_smaller_models=False -): - global USE_SMALLER_MODELS - global 
REMOTE_MODEL_PATHS - if use_smaller_models: - USE_SMALLER_MODELS = True - logger.info("Using smaller models generation.py") - REMOTE_MODEL_PATHS = SMALL_REMOTE_MODEL_PATHS +# def preload_models( +# text_ckpt_path=None, coarse_ckpt_path=None, fine_ckpt_path=None, use_gpu=True, use_smaller_models=False +# ): +# global USE_SMALLER_MODELS +# global REMOTE_MODEL_PATHS +# if use_smaller_models: +# USE_SMALLER_MODELS = True +# logger.info("Using smaller models generation.py") +# REMOTE_MODEL_PATHS = SMALL_REMOTE_MODEL_PATHS - _ = load_model(ckpt_path=text_ckpt_path, model_type="text", use_gpu=use_gpu, force_reload=True) - _ = load_model(ckpt_path=coarse_ckpt_path, model_type="coarse", use_gpu=use_gpu, force_reload=True) - _ = load_model(ckpt_path=fine_ckpt_path, model_type="fine", use_gpu=use_gpu, force_reload=True) - _ = load_codec_model(use_gpu=use_gpu, force_reload=True) +# _ = load_model(ckpt_path=text_ckpt_path, model_type="text", use_gpu=use_gpu, force_reload=True) +# _ = load_model(ckpt_path=coarse_ckpt_path, model_type="coarse", use_gpu=use_gpu, force_reload=True) +# _ = load_model(ckpt_path=fine_ckpt_path, model_type="fine", use_gpu=use_gpu, force_reload=True) +# _ = load_codec_model(use_gpu=use_gpu, force_reload=True) diff --git a/TTS/tts/layers/bark/model.py b/TTS/tts/layers/bark/model.py index bcc87a4b..c84022bd 100644 --- a/TTS/tts/layers/bark/model.py +++ b/TTS/tts/layers/bark/model.py @@ -6,8 +6,8 @@ import math from dataclasses import dataclass import torch -import torch.nn as nn from coqpit import Coqpit +from torch import nn from torch.nn import functional as F @@ -19,8 +19,8 @@ class LayerNorm(nn.Module): self.weight = nn.Parameter(torch.ones(ndim)) self.bias = nn.Parameter(torch.zeros(ndim)) if bias else None - def forward(self, input): - return F.layer_norm(input, self.weight.shape, self.weight, self.bias, 1e-5) + def forward(self, x): + return F.layer_norm(x, self.weight.shape, self.weight, self.bias, 1e-5) class CausalSelfAttention(nn.Module): @@ -177,7 +177,7 @@ class GPT(nn.Module): def forward(self, idx, merge_context=False, past_kv=None, position_ids=None, use_cache=False): device = idx.device - b, t = idx.size() + _, t = idx.size() if past_kv is not None: assert t == 1 tok_emb = self.transformer.wte(idx) # token embeddings of shape (b, t, n_embd) @@ -219,7 +219,7 @@ class GPT(nn.Module): new_kv = () if use_cache else None - for i, (block, past_layer_kv) in enumerate(zip(self.transformer.h, past_kv)): + for _, (block, past_layer_kv) in enumerate(zip(self.transformer.h, past_kv)): x, kv = block(x, past_kv=past_layer_kv, use_cache=use_cache) if use_cache: diff --git a/TTS/tts/layers/bark/model_fine.py b/TTS/tts/layers/bark/model_fine.py index 8a426107..09e5f476 100644 --- a/TTS/tts/layers/bark/model_fine.py +++ b/TTS/tts/layers/bark/model_fine.py @@ -6,7 +6,7 @@ import math from dataclasses import dataclass import torch -import torch.nn as nn +from torch import nn from torch.nn import functional as F from .model import GPT, MLP, GPTConfig diff --git a/TTS/utils/synthesizer.py b/TTS/utils/synthesizer.py index 4f7761b9..bbaf2904 100644 --- a/TTS/utils/synthesizer.py +++ b/TTS/utils/synthesizer.py @@ -341,7 +341,7 @@ class Synthesizer(object): use_gl = self.vocoder_model is None - if not reference_wav: + if not reference_wav: # not voice conversion for sen in sens: if hasattr(self.tts_model, "synthesize"): sp_name = "random" if speaker_name is None else speaker_name diff --git a/docs/source/models/tortoise.md b/docs/source/models/tortoise.md index c49a0fcb..d602d597 100644 
--- a/docs/source/models/tortoise.md +++ b/docs/source/models/tortoise.md @@ -1,7 +1,7 @@ # Tortoise 🐢 Tortoise is a very expressive TTS system with impressive voice cloning capabilities. It is based on an GPT like autogressive acoustic model that converts input text to discritized acouistic tokens, a diffusion model that converts these tokens to melspeectrogram frames and a Univnet vocoder to convert the spectrograms to -the final audio signal. The important downside is that Tortoise is very slow compared to the parallel TTS models like VITS. +the final audio signal. The important downside is that Tortoise is very slow compared to the parallel TTS models like VITS. Big thanks to 👑[@manmay-nakhashi](https://github.com/manmay-nakhashi) who helped us implement Tortoise in 🐸TTS. @@ -12,7 +12,7 @@ from TTS.tts.configs.tortoise_config import TortoiseConfig from TTS.tts.models.tortoise import Tortoise config = TortoiseConfig() -model = Tortoise.inif_from_config(config) +model = Tortoise.init_from_config(config) model.load_checkpoint(config, checkpoint_dir="paths/to/models_dir/", eval=True) # with random speaker @@ -29,23 +29,23 @@ from TTS.api import TTS tts = TTS("tts_models/en/multi-dataset/tortoise-v2") # cloning `lj` voice from `TTS/tts/utils/assets/tortoise/voices/lj` -# with custom inference settings overriding defaults. -tts.tts_to_file(text="Hello, my name is Manmay , how are you?", +# with custom inference settings overriding defaults. +tts.tts_to_file(text="Hello, my name is Manmay , how are you?", file_path="output.wav", - voice_dir="TTS/tts/utils/assets/tortoise/voices/", + voice_dir="path/to/tortoise/voices/dir/", speaker="lj", num_autoregressive_samples=1, diffusion_iterations=10) # Using presets with the same voice -tts.tts_to_file(text="Hello, my name is Manmay , how are you?", +tts.tts_to_file(text="Hello, my name is Manmay , how are you?", file_path="output.wav", - voice_dir="TTS/tts/utils/assets/tortoise/voices/", + voice_dir="path/to/tortoise/voices/dir/", speaker="lj", preset="ultra_fast") # Random voice generation -tts.tts_to_file(text="Hello, my name is Manmay , how are you?", +tts.tts_to_file(text="Hello, my name is Manmay , how are you?", file_path="output.wav") ``` @@ -54,16 +54,16 @@ Using 🐸TTS Command line: ```console # cloning the `lj` voice tts --model_name tts_models/en/multi-dataset/tortoise-v2 \ ---text "This is an example." \ ---out_path "/data/speech_synth/coqui-tts/TTS/tests/outputs/output.wav" \ ---voice_dir TTS/tts/utils/assets/tortoise/voices/ \ +--text "This is an example." \ +--out_path "output.wav" \ +--voice_dir path/to/tortoise/voices/dir/ \ --speaker_idx "lj" \ --progress_bar True # Random voice generation tts --model_name tts_models/en/multi-dataset/tortoise-v2 \ --text "This is an example." \ ---out_path "/data/speech_synth/coqui-tts/TTS/tests/outputs/output.wav" \ +--out_path "output.wav" \ --progress_bar True ```
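
For illustration, a minimal sketch of how the reworked Bark voice loading in `TTS/tts/layers/bark/inference_funcs.py` is meant to be used after this change. The directory name `my_bark_voices` and the speaker name `speaker1` are assumptions made for the example; the layout follows the new `get_voices`, which expects one subdirectory per voice inside each extra voice dir, holding either a cached `.npz` prompt or a single `.wav`/`.mp3` sample.

```python
# Sketch only: the paths and names below are hypothetical.
# Assumed layout:
#   my_bark_voices/
#   └── speaker1/
#       └── speaker1.npz   # cached (semantic, coarse, fine) prompt arrays
from TTS.tts.layers.bark.inference_funcs import get_voices, load_voice

extra_voice_dirs = ["my_bark_voices"]

# Maps each voice subdirectory to its prompt files,
# e.g. {"speaker1": ["my_bark_voices/speaker1/speaker1.npz"]}
voices = get_voices(extra_voice_dirs)

# With a cached .npz the model argument is never touched, so None is enough here;
# a loaded Bark model is only needed when the voice has just a .wav/.mp3 sample and
# the .npz prompt must first be built via generate_voice().
semantic, coarse, fine = load_voice(None, "speaker1", extra_voice_dirs)
```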