mirror of https://github.com/coqui-ai/TTS.git
Fix here and there
parent 03c347b7f3
commit 0f8932a6a9

@@ -17,6 +17,7 @@ class HubertManager:
            urllib.request.urlretrieve(download_url, model_path)
            print("Downloaded HuBERT")
            return model_path
        return None

    @staticmethod
    def make_sure_tokenizer_installed(

@@ -31,3 +32,4 @@ class HubertManager:
            shutil.move(os.path.join(model_dir, model), model_path)
            print("Downloaded tokenizer")
            return model_path
        return None
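
For orientation, the manager shown above is what fetches the HuBERT checkpoint and custom tokenizer on demand before Bark voice cloning. A minimal sketch of a call site follows; the exact keyword arguments are not visible in these hunks, so defaults are assumed.

```python
# Minimal sketch: only the static method visible in the hunk above is referenced,
# and its default arguments are assumed to exist.
from TTS.tts.layers.bark.hubert.hubert_manager import HubertManager

HubertManager.make_sure_tokenizer_installed()  # downloads the tokenizer checkpoint if it is missing
```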

@@ -16,7 +16,7 @@ from torch.serialization import MAP_LOCATION

class HubertTokenizer(nn.Module):
    def __init__(self, hidden_size=1024, input_size=768, output_size=10000, version=0):
        super(HubertTokenizer, self).__init__()
        super().__init__()
        next_size = input_size
        if version == 0:
            self.lstm = nn.LSTM(input_size, hidden_size, 2, batch_first=True)

@@ -181,7 +181,7 @@ def auto_train(data_path, save_path="model.pth", load_model: str = None, save_ep
    epoch = 1

    while 1:
        for i in range(save_epochs):
        for _ in range(save_epochs):
            j = 0
            for x, y in zip(data_x, data_y):
                model_training.train_step(
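
The `auto_train` loop above runs indefinitely (`while 1:`), checkpointing after every `save_epochs` passes over the prepared data. A hedged usage sketch, assuming the data directory already contains the feature/token pairs `auto_train()` expects:

```python
# Sketch only: `prepared_data/` is a placeholder, and the import path mirrors the
# tokenizer import used elsewhere in this diff.
from TTS.tts.layers.bark.hubert.tokenizer import HubertTokenizer, auto_train

tokenizer = HubertTokenizer(hidden_size=1024, input_size=768, output_size=10000, version=0)
auto_train("prepared_data/", save_path="hubert_tokenizer.pth")  # trains until interrupted, saving as it goes
```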

@@ -16,7 +16,7 @@ from torch.nn import functional as F
from TTS.tts.layers.bark.hubert.hubert_manager import HubertManager
from TTS.tts.layers.bark.hubert.kmeans_hubert import CustomHubert
from TTS.tts.layers.bark.hubert.tokenizer import HubertTokenizer
from TTS.tts.layers.bark.load_model import _clear_cuda_cache, _inference_mode
from TTS.tts.layers.bark.load_model import clear_cuda_cache, inference_mode

logger = logging.getLogger(__name__)

@@ -34,34 +34,53 @@ def _normalize_whitespace(text):


def get_voices(extra_voice_dirs: List[str] = []):
    voices = {}
    for dir in extra_voice_dirs:
        paths = list(glob(f"{dir}/*.npz"))
        for path in paths:
            name = os.path.basename(path).replace(".npz", "")
            voices[name] = path
    dirs = extra_voice_dirs
    voices: Dict[str, List[str]] = {}
    for d in dirs:
        subs = os.listdir(d)
        for sub in subs:
            subj = os.path.join(d, sub)
            if os.path.isdir(subj):
                voices[sub] = list(glob(f"{subj}/*.npz"))
                # fetch audio files if no npz files are found
                if len(voices[sub]) == 0:
                    voices[sub] = list(glob(f"{subj}/*.wav")) + list(glob(f"{subj}/*.mp3"))
    return voices


def load_voice(voice: str, extra_voice_dirs: List[str] = []):
def load_npz(npz_file):
    x_history = np.load(npz_file)
    semantic = x_history["semantic_prompt"]
    coarse = x_history["coarse_prompt"]
    fine = x_history["fine_prompt"]
    return semantic, coarse, fine


def load_voice(model, voice: str, extra_voice_dirs: List[str] = []): # pylint: disable=dangerous-default-value
    if voice == "random":
        return None, None, None

    voices = get_voices(extra_voice_dirs)
    paths = voices[voice]

    # bark only uses a single sample for cloning
    if len(paths) > 1:
        raise ValueError(f"Voice {voice} has multiple paths: {paths}")

    try:
        path = voices[voice]
    except KeyError:
        raise KeyError(f"Voice {voice} not found in {extra_voice_dirs}")
    prompt = load_npz(path)
    return prompt
    except KeyError as e:
        raise KeyError(f"Voice {voice} not found in {extra_voice_dirs}") from e

    if len(paths) == 1 and paths[0].endswith(".npz"):
        return load_npz(path[0])
    else:
        audio_path = paths[0]
        # replace the file extension with .npz
        output_path = os.path.splitext(audio_path)[0] + ".npz"
        generate_voice(audio=audio_path, model=model, output_path=output_path)
        breakpoint()
        return load_voice(model, voice, extra_voice_dirs)

def zero_crossing_rate(audio, frame_length=1024, hop_length=512):
    zero_crossings = np.sum(np.abs(np.diff(np.sign(audio))) / 2)
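
In the new layout each voice is a sub-directory of a voice dir, with `.npz` prompt files preferred and raw `.wav`/`.mp3` samples used as a fallback. A small illustration follows; the directory names are hypothetical, the module path is assumed, and `model` stands for an already-loaded Bark model.

```python
# Hypothetical layout:
#   my_voices/
#       alice/alice.npz      # cached prompt -> loaded directly
#       bob/bob.wav          # raw sample -> converted to an .npz prompt on first use
from TTS.tts.layers.bark.inference_funcs import get_voices, load_voice  # module path assumed

voices = get_voices(["my_voices/"])
# e.g. {"alice": ["my_voices/alice/alice.npz"], "bob": ["my_voices/bob/bob.wav"]}

# `model` is assumed to be a loaded Bark model exposing the attributes used above.
semantic_prompt, coarse_prompt, fine_prompt = load_voice(model, "alice", ["my_voices/"])
```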

@@ -85,7 +104,6 @@ def compute_average_bass_energy(audio_data, sample_rate, max_bass_freq=250):

def generate_voice(
    audio,
    text,
    model,
    output_path,
):

@@ -106,9 +124,6 @@ def generate_voice(
        encoded_frames = model.encodec.encode(audio)
    codes = torch.cat([encoded[0] for encoded in encoded_frames], dim=-1).squeeze() # [n_q, T]

    # get seconds of audio
    seconds = audio.shape[-1] / model.config.sample_rate

    # move codes to cpu
    codes = codes.cpu().numpy()
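
The extraction above uses the standard EnCodec API. Outside the Bark wrapper the same step looks roughly like this; the silent one-second input is just a placeholder.

```python
# Standalone sketch of the EnCodec extraction mirrored above.
import torch
from encodec import EncodecModel

codec = EncodecModel.encodec_model_24khz()
codec.set_target_bandwidth(6.0)

wav = torch.zeros(1, 1, 24000)  # [batch, channels, samples] at 24 kHz; placeholder audio
with torch.no_grad():
    encoded_frames = codec.encode(wav)
codes = torch.cat([encoded[0] for encoded in encoded_frames], dim=-1).squeeze()  # [n_q, T]
coarse_prompt = codes[:2, :].cpu().numpy()  # first two codebooks, as later saved by generate_voice()
```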

@@ -133,36 +148,6 @@ def generate_voice(
    np.savez(output_path, fine_prompt=codes, coarse_prompt=codes[:2, :], semantic_prompt=semantic_tokens)

    # while attempts < max_attempts:
    # if attempts > 0 and base is not None:
    # # Reset the base model token
    # print(f"Reset the base model token Regenerating...")
    # base = None

    # audio_array, x = model.generate_audio(text, history_promp=None, base=base, **kwargs)
    # zcr = zero_crossing_rate(audio_array)
    # spectral_contrast = compute_spectral_contrast(audio_array, model.config.sample_rate)
    # bass_energy = compute_average_bass_energy(audio_array, model.config.sample_rate)
    # print(f"Attempt {attempts + 1}: ZCR = {zcr}, Spectral Contrast = {spectral_contrast:.2f}, Bass Energy = {bass_energy:.2f}")

    # # Save the audio array to the output_array directory with a random name for debugging
    # #output_file = os.path.join(output_directory, f"audio_{zcr:.2f}_sc{spectral_contrast:.2f}_be{bass_energy:.2f}.wav")
    # #wavfile.write(output_file, sample_rate, audio_array)
    # #print(f"Saved audio array to {output_file}")

    # if zcr < zcr_threshold and spectral_contrast < spectral_threshold and bass_energy < bass_energy_threshold:
    # print(f"Audio passed ZCR, Spectral Contrast, and Bass Energy thresholds. No need to regenerate.")
    # break
    # else:
    # print(f"Audio failed ZCR, Spectral Contrast, and/or Bass Energy thresholds. Regenerating...")

    # attempts += 1

    # if attempts == max_attempts:
    # print("Reached maximum attempts. Returning the last generated audio.")

    # return audio_array, x, zcr, spectral_contrast, bass_energy


def generate_text_semantic(
    text,

@@ -224,7 +209,7 @@ def generate_text_semantic(
        np.hstack([encoded_text, semantic_history, np.array([model.config.SEMANTIC_INFER_TOKEN])]).astype(np.int64)
    )[None]
    assert x.shape[1] == 256 + 256 + 1
    with _inference_mode():
    with inference_mode():
        x = x.to(model.device)
        n_tot_steps = 768
        # custom tqdm updates since we don't know when eos will occur

@@ -285,8 +270,8 @@ def generate_text_semantic(
            pbar_state = req_pbar_state
        pbar.close()
        out = x.detach().cpu().numpy().squeeze()[256 + 256 + 1 :]
    assert all(0 <= out) and all(out < model.config.SEMANTIC_VOCAB_SIZE)
    _clear_cuda_cache()
    assert all(out >= 0) and all(out < model.config.SEMANTIC_VOCAB_SIZE)
    clear_cuda_cache()
    return out

@@ -382,7 +367,7 @@ def generate_coarse(
    x_semantic = np.hstack([x_semantic_history, x_semantic]).astype(np.int32)
    x_coarse = x_coarse_history.astype(np.int32)
    base_semantic_idx = len(x_semantic_history)
    with _inference_mode():
    with inference_mode():
        x_semantic_in = torch.from_numpy(x_semantic)[None].to(model.device)
        x_coarse_in = torch.from_numpy(x_coarse)[None].to(model.device)
        n_window_steps = int(np.ceil(n_steps / sliding_window_len))

@@ -456,7 +441,7 @@ def generate_coarse(
    )
    for n in range(1, model.config.N_COARSE_CODEBOOKS):
        gen_coarse_audio_arr[n, :] -= n * model.config.CODEBOOK_SIZE
    _clear_cuda_cache()
    clear_cuda_cache()
    return gen_coarse_audio_arr

@@ -526,7 +511,7 @@ def generate_fine(
    )
    # we can be lazy about fractional loop and just keep overwriting codebooks
    n_loops = np.max([0, int(np.ceil((x_coarse_gen.shape[1] - (1024 - n_history)) / 512))]) + 1
    with _inference_mode():
    with inference_mode():
        in_arr = torch.tensor(in_arr.T).to(model.device)
        for n in tqdm.tqdm(range(n_loops), disable=silent):
            start_idx = np.min([n * 512, in_arr.shape[0] - 1024])

@@ -558,14 +543,12 @@ def generate_fine(
    if n_remove_from_end > 0:
        gen_fine_arr = gen_fine_arr[:, :-n_remove_from_end]
    assert gen_fine_arr.shape[-1] == x_coarse_gen.shape[-1]
    _clear_cuda_cache()
    clear_cuda_cache()
    return gen_fine_arr


def codec_decode(fine_tokens, model):
    """Turn quantized audio codes into audio array using encodec."""
    from TTS.utils.audio.numpy_transforms import save_wav

    arr = torch.from_numpy(fine_tokens)[None]
    arr = arr.to(model.device)
    arr = arr.transpose(0, 1)
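
`codec_decode()` above prepares the codes as `[n_q, B, T]` (hence the transpose). For reference, a decode through EnCodec's public API, which instead takes `[B, n_q, T]` frames, looks roughly like this; the zero-valued codes are placeholders.

```python
# Decode sketch using EnCodec's public API (an alternative to the lower-level path above).
import numpy as np
import torch
from encodec import EncodecModel

codec = EncodecModel.encodec_model_24khz()
fine_tokens = np.zeros((8, 75), dtype=np.int64)   # placeholder [n_q, T] codes (~1 s at 75 frames/s)
arr = torch.from_numpy(fine_tokens)[None]         # [1, n_q, T]
with torch.no_grad():
    audio = codec.decode([(arr, None)])           # -> [1, 1, samples] waveform at 24 kHz
```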

@@ -1,17 +1,12 @@
import contextlib

# import funcy
import functools
import hashlib
import logging
import os
import re

import requests
import torch
import tqdm
from encodec import EncodecModel
from transformers import BertTokenizer

from TTS.tts.layers.bark.model import GPT, GPTConfig
from TTS.tts.layers.bark.model_fine import FineGPT, FineGPTConfig

@@ -31,8 +26,6 @@ else:


# hold models in global scope to lazy load
global models
models = {}

logger = logging.getLogger(__name__)

@@ -44,10 +37,10 @@ if not hasattr(torch.nn.functional, "scaled_dot_product_attention"):
    )


def _string_md5(s):
    m = hashlib.md5()
    m.update(s.encode("utf-8"))
    return m.hexdigest()
# def _string_md5(s):
# m = hashlib.md5()
# m.update(s.encode("utf-8"))
# return m.hexdigest()


def _md5(fname):

@@ -58,18 +51,18 @@ def _md5(fname):
    return hash_md5.hexdigest()


def _get_ckpt_path(model_type, CACHE_DIR):
    model_name = _string_md5(REMOTE_MODEL_PATHS[model_type]["path"])
    return os.path.join(CACHE_DIR, f"{model_name}.pt")
# def _get_ckpt_path(model_type, CACHE_DIR):
# model_name = _string_md5(REMOTE_MODEL_PATHS[model_type]["path"])
# return os.path.join(CACHE_DIR, f"{model_name}.pt")


S3_BUCKET_PATH_RE = r"s3\:\/\/(.+?)\/"
# S3_BUCKET_PATH_RE = r"s3\:\/\/(.+?)\/"


def _parse_s3_filepath(s3_filepath):
    bucket_name = re.search(S3_BUCKET_PATH_RE, s3_filepath).group(1)
    rel_s3_filepath = re.sub(S3_BUCKET_PATH_RE, "", s3_filepath)
    return bucket_name, rel_s3_filepath
# def _parse_s3_filepath(s3_filepath):
# bucket_name = re.search(S3_BUCKET_PATH_RE, s3_filepath).group(1)
# rel_s3_filepath = re.sub(S3_BUCKET_PATH_RE, "", s3_filepath)
# return bucket_name, rel_s3_filepath


def _download(from_s3_path, to_local_path, CACHE_DIR):

@@ -83,7 +76,7 @@ def _download(from_s3_path, to_local_path, CACHE_DIR):
            progress_bar.update(len(data))
            file.write(data)
    progress_bar.close()
    if total_size_in_bytes != 0 and progress_bar.n != total_size_in_bytes:
    if total_size_in_bytes not in [0, progress_bar.n]:
        raise ValueError("ERROR, something went wrong")
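
The `_download()` helper above streams the response in blocks behind a tqdm progress bar and checks the transferred byte count at the end. In isolation the same requests + tqdm pattern is roughly:

```python
# Generic streaming-download sketch (URL, file name and block size are placeholders).
import requests
import tqdm

def download(url: str, to_local_path: str, block_size: int = 1024) -> None:
    response = requests.get(url, stream=True)
    total_size_in_bytes = int(response.headers.get("content-length", 0))
    progress_bar = tqdm.tqdm(total=total_size_in_bytes, unit="iB", unit_scale=True)
    with open(to_local_path, "wb") as file:
        for data in response.iter_content(block_size):
            progress_bar.update(len(data))
            file.write(data)
    progress_bar.close()
    if total_size_in_bytes not in [0, progress_bar.n]:
        raise ValueError("ERROR, something went wrong")
```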

@@ -107,27 +100,27 @@ if torch.cuda.is_available():


@contextlib.contextmanager
def _inference_mode():
def inference_mode():
    with InferenceContext(), torch.inference_mode(), torch.no_grad(), autocast():
        yield


def _clear_cuda_cache():
def clear_cuda_cache():
    if torch.cuda.is_available():
        torch.cuda.empty_cache()
        torch.cuda.synchronize()


def clean_models(model_key=None):
    global models
    model_keys = [model_key] if model_key is not None else models.keys()
    for k in model_keys:
        if k in models:
            del models[k]
    _clear_cuda_cache()
# def clean_models(model_key=None):
# global models
# model_keys = [model_key] if model_key is not None else models.keys()
# for k in model_keys:
# if k in models:
# del models[k]
# clear_cuda_cache()


def _load_model(ckpt_path, device, config, model_type="text"):
def load_model(ckpt_path, device, config, model_type="text"):
    logger.info(f"loading {model_type} model from {ckpt_path}...")

    if device == "cpu":

@@ -174,13 +167,13 @@ def _load_model(ckpt_path, device, config, model_type="text"):
    state_dict = checkpoint["model"]
    # fixup checkpoint
    unwanted_prefix = "_orig_mod."
    for k, v in list(state_dict.items()):
    for k, _ in list(state_dict.items()):
        if k.startswith(unwanted_prefix):
            state_dict[k[len(unwanted_prefix) :]] = state_dict.pop(k)
    extra_keys = set(state_dict.keys()) - set(model.state_dict().keys())
    extra_keys = set([k for k in extra_keys if not k.endswith(".attn.bias")])
    extra_keys = set(k for k in extra_keys if not k.endswith(".attn.bias"))
    missing_keys = set(model.state_dict().keys()) - set(state_dict.keys())
    missing_keys = set([k for k in missing_keys if not k.endswith(".attn.bias")])
    missing_keys = set(k for k in missing_keys if not k.endswith(".attn.bias"))
    if len(extra_keys) != 0:
        raise ValueError(f"extra keys found: {extra_keys}")
    if len(missing_keys) != 0:

@@ -192,63 +185,63 @@ def _load_model(ckpt_path, device, config, model_type="text"):
    model.eval()
    model.to(device)
    del checkpoint, state_dict
    _clear_cuda_cache()
    clear_cuda_cache()
    return model, config


def _load_codec_model(device):
    model = EncodecModel.encodec_model_24khz()
    model.set_target_bandwidth(6.0)
    model.eval()
    model.to(device)
    _clear_cuda_cache()
    return model
# def _load_codec_model(device):
# model = EncodecModel.encodec_model_24khz()
# model.set_target_bandwidth(6.0)
# model.eval()
# model.to(device)
# clear_cuda_cache()
# return model


def load_model(ckpt_path=None, use_gpu=True, force_reload=False, model_type="text"):
    _load_model_f = functools.partial(_load_model, model_type=model_type)
    if model_type not in ("text", "coarse", "fine"):
        raise NotImplementedError()
    global models
    if torch.cuda.device_count() == 0 or not use_gpu:
        device = "cpu"
    else:
        device = "cuda"
    model_key = str(device) + f"__{model_type}"
    if model_key not in models or force_reload:
        if ckpt_path is None:
            ckpt_path = _get_ckpt_path(model_type)
        clean_models(model_key=model_key)
        model = _load_model_f(ckpt_path, device)
        models[model_key] = model
    return models[model_key]
# def load_model(ckpt_path=None, use_gpu=True, force_reload=False, model_type="text"):
# _load_model_f = functools.partial(_load_model, model_type=model_type)
# if model_type not in ("text", "coarse", "fine"):
# raise NotImplementedError()
# global models
# if torch.cuda.device_count() == 0 or not use_gpu:
# device = "cpu"
# else:
# device = "cuda"
# model_key = str(device) + f"__{model_type}"
# if model_key not in models or force_reload:
# if ckpt_path is None:
# ckpt_path = _get_ckpt_path(model_type)
# clean_models(model_key=model_key)
# model = _load_model_f(ckpt_path, device)
# models[model_key] = model
# return models[model_key]


def load_codec_model(use_gpu=True, force_reload=False):
    global models
    if torch.cuda.device_count() == 0 or not use_gpu:
        device = "cpu"
    else:
        device = "cuda"
    model_key = str(device) + f"__codec"
    if model_key not in models or force_reload:
        clean_models(model_key=model_key)
        model = _load_codec_model(device)
        models[model_key] = model
    return models[model_key]
# def load_codec_model(use_gpu=True, force_reload=False):
# global models
# if torch.cuda.device_count() == 0 or not use_gpu:
# device = "cpu"
# else:
# device = "cuda"
# model_key = str(device) + f"__codec"
# if model_key not in models or force_reload:
# clean_models(model_key=model_key)
# model = _load_codec_model(device)
# models[model_key] = model
# return models[model_key]


def preload_models(
    text_ckpt_path=None, coarse_ckpt_path=None, fine_ckpt_path=None, use_gpu=True, use_smaller_models=False
):
    global USE_SMALLER_MODELS
    global REMOTE_MODEL_PATHS
    if use_smaller_models:
        USE_SMALLER_MODELS = True
        logger.info("Using smaller models generation.py")
        REMOTE_MODEL_PATHS = SMALL_REMOTE_MODEL_PATHS
# def preload_models(
# text_ckpt_path=None, coarse_ckpt_path=None, fine_ckpt_path=None, use_gpu=True, use_smaller_models=False
# ):
# global USE_SMALLER_MODELS
# global REMOTE_MODEL_PATHS
# if use_smaller_models:
# USE_SMALLER_MODELS = True
# logger.info("Using smaller models generation.py")
# REMOTE_MODEL_PATHS = SMALL_REMOTE_MODEL_PATHS

    _ = load_model(ckpt_path=text_ckpt_path, model_type="text", use_gpu=use_gpu, force_reload=True)
    _ = load_model(ckpt_path=coarse_ckpt_path, model_type="coarse", use_gpu=use_gpu, force_reload=True)
    _ = load_model(ckpt_path=fine_ckpt_path, model_type="fine", use_gpu=use_gpu, force_reload=True)
    _ = load_codec_model(use_gpu=use_gpu, force_reload=True)
# _ = load_model(ckpt_path=text_ckpt_path, model_type="text", use_gpu=use_gpu, force_reload=True)
# _ = load_model(ckpt_path=coarse_ckpt_path, model_type="coarse", use_gpu=use_gpu, force_reload=True)
# _ = load_model(ckpt_path=fine_ckpt_path, model_type="fine", use_gpu=use_gpu, force_reload=True)
# _ = load_codec_model(use_gpu=use_gpu, force_reload=True)

@@ -6,8 +6,8 @@ import math
from dataclasses import dataclass

import torch
import torch.nn as nn
from coqpit import Coqpit
from torch import nn
from torch.nn import functional as F

@@ -19,8 +19,8 @@ class LayerNorm(nn.Module):
        self.weight = nn.Parameter(torch.ones(ndim))
        self.bias = nn.Parameter(torch.zeros(ndim)) if bias else None

    def forward(self, input):
        return F.layer_norm(input, self.weight.shape, self.weight, self.bias, 1e-5)
    def forward(self, x):
        return F.layer_norm(x, self.weight.shape, self.weight, self.bias, 1e-5)


class CausalSelfAttention(nn.Module):

@@ -177,7 +177,7 @@ class GPT(nn.Module):

    def forward(self, idx, merge_context=False, past_kv=None, position_ids=None, use_cache=False):
        device = idx.device
        b, t = idx.size()
        _, t = idx.size()
        if past_kv is not None:
            assert t == 1
            tok_emb = self.transformer.wte(idx) # token embeddings of shape (b, t, n_embd)

@@ -219,7 +219,7 @@ class GPT(nn.Module):

        new_kv = () if use_cache else None

        for i, (block, past_layer_kv) in enumerate(zip(self.transformer.h, past_kv)):
        for _, (block, past_layer_kv) in enumerate(zip(self.transformer.h, past_kv)):
            x, kv = block(x, past_kv=past_layer_kv, use_cache=use_cache)

            if use_cache:

@@ -6,7 +6,7 @@ import math
from dataclasses import dataclass

import torch
import torch.nn as nn
from torch import nn
from torch.nn import functional as F

from .model import GPT, MLP, GPTConfig

@@ -341,7 +341,7 @@ class Synthesizer(object):

        use_gl = self.vocoder_model is None

        if not reference_wav:
        if not reference_wav: # not voice conversion
            for sen in sens:
                if hasattr(self.tts_model, "synthesize"):
                    sp_name = "random" if speaker_name is None else speaker_name

@@ -1,7 +1,7 @@
# Tortoise 🐢
Tortoise is a very expressive TTS system with impressive voice cloning capabilities. It is based on a GPT-like
autoregressive acoustic model that converts input text to discretized acoustic tokens, a diffusion model that converts
these tokens to mel spectrogram frames, and a UnivNet vocoder to convert the spectrograms to the final audio signal.
The important downside is that Tortoise is very slow compared to parallel TTS models like VITS.

Big thanks to 👑[@manmay-nakhashi](https://github.com/manmay-nakhashi) who helped us implement Tortoise in 🐸TTS.

@@ -12,7 +12,7 @@ from TTS.tts.configs.tortoise_config import TortoiseConfig
from TTS.tts.models.tortoise import Tortoise

config = TortoiseConfig()
model = Tortoise.inif_from_config(config)
model = Tortoise.init_from_config(config)
model.load_checkpoint(config, checkpoint_dir="paths/to/models_dir/", eval=True)

# with random speaker

@@ -29,23 +29,23 @@ from TTS.api import TTS
tts = TTS("tts_models/en/multi-dataset/tortoise-v2")

# cloning `lj` voice from `TTS/tts/utils/assets/tortoise/voices/lj`
# with custom inference settings overriding defaults.
tts.tts_to_file(text="Hello, my name is Manmay, how are you?",
                file_path="output.wav",
                voice_dir="TTS/tts/utils/assets/tortoise/voices/",
                voice_dir="path/to/tortoise/voices/dir/",
                speaker="lj",
                num_autoregressive_samples=1,
                diffusion_iterations=10)

# Using presets with the same voice
tts.tts_to_file(text="Hello, my name is Manmay, how are you?",
                file_path="output.wav",
                voice_dir="TTS/tts/utils/assets/tortoise/voices/",
                voice_dir="path/to/tortoise/voices/dir/",
                speaker="lj",
                preset="ultra_fast")

# Random voice generation
tts.tts_to_file(text="Hello, my name is Manmay, how are you?",
                file_path="output.wav")
```

@@ -54,16 +54,16 @@ Using 🐸TTS Command line:
```console
# cloning the `lj` voice
tts --model_name tts_models/en/multi-dataset/tortoise-v2 \
    --text "This is an example." \
    --out_path "/data/speech_synth/coqui-tts/TTS/tests/outputs/output.wav" \
    --voice_dir TTS/tts/utils/assets/tortoise/voices/ \
    --out_path "output.wav" \
    --voice_dir path/to/tortoise/voices/dir/ \
    --speaker_idx "lj" \
    --progress_bar True

# Random voice generation
tts --model_name tts_models/en/multi-dataset/tortoise-v2 \
    --text "This is an example." \
    --out_path "/data/speech_synth/coqui-tts/TTS/tests/outputs/output.wav" \
    --out_path "output.wav" \
    --progress_bar True
```