Fix here and there

pull/2685/head
Eren Gölge 2023-06-21 11:59:27 +02:00
parent 03c347b7f3
commit 0f8932a6a9
8 changed files with 138 additions and 160 deletions

View File

@ -17,6 +17,7 @@ class HubertManager:
urllib.request.urlretrieve(download_url, model_path)
print("Downloaded HuBERT")
return model_path
return None
@staticmethod
def make_sure_tokenizer_installed(
@ -31,3 +32,4 @@ class HubertManager:
shutil.move(os.path.join(model_dir, model), model_path)
print("Downloaded tokenizer")
return model_path
return None

View File

@ -16,7 +16,7 @@ from torch.serialization import MAP_LOCATION
class HubertTokenizer(nn.Module):
def __init__(self, hidden_size=1024, input_size=768, output_size=10000, version=0):
super(HubertTokenizer, self).__init__()
super().__init__()
next_size = input_size
if version == 0:
self.lstm = nn.LSTM(input_size, hidden_size, 2, batch_first=True)
@ -181,7 +181,7 @@ def auto_train(data_path, save_path="model.pth", load_model: str = None, save_ep
epoch = 1
while 1:
for i in range(save_epochs):
for _ in range(save_epochs):
j = 0
for x, y in zip(data_x, data_y):
model_training.train_step(

View File

@ -16,7 +16,7 @@ from torch.nn import functional as F
from TTS.tts.layers.bark.hubert.hubert_manager import HubertManager
from TTS.tts.layers.bark.hubert.kmeans_hubert import CustomHubert
from TTS.tts.layers.bark.hubert.tokenizer import HubertTokenizer
from TTS.tts.layers.bark.load_model import _clear_cuda_cache, _inference_mode
from TTS.tts.layers.bark.load_model import clear_cuda_cache, inference_mode
logger = logging.getLogger(__name__)
@ -34,34 +34,53 @@ def _normalize_whitespace(text):
def get_voices(extra_voice_dirs: List[str] = []):
voices = {}
for dir in extra_voice_dirs:
paths = list(glob(f"{dir}/*.npz"))
for path in paths:
name = os.path.basename(path).replace(".npz", "")
voices[name] = path
dirs = extra_voice_dirs
voices: Dict[str, List[str]] = {}
for d in dirs:
subs = os.listdir(d)
for sub in subs:
subj = os.path.join(d, sub)
if os.path.isdir(subj):
voices[sub] = list(glob(f"{subj}/*.npz"))
# fetch audio files if no npz files are found
if len(voices[sub]) == 0:
voices[sub] = list(glob(f"{subj}/*.wav")) + list(glob(f"{subj}/*.mp3"))
return voices
def load_voice(voice: str, extra_voice_dirs: List[str] = []):
def load_npz(npz_file):
def load_npz(npz_file):
x_history = np.load(npz_file)
semantic = x_history["semantic_prompt"]
coarse = x_history["coarse_prompt"]
fine = x_history["fine_prompt"]
return semantic, coarse, fine
def load_voice(model, voice: str, extra_voice_dirs: List[str] = []): # pylint: disable=dangerous-default-value
if voice == "random":
return None, None, None
voices = get_voices(extra_voice_dirs)
paths = voices[voice]
# bark only uses a single sample for cloning
if len(paths) > 1:
raise ValueError(f"Voice {voice} has multiple paths: {paths}")
try:
path = voices[voice]
except KeyError:
raise KeyError(f"Voice {voice} not found in {extra_voice_dirs}")
prompt = load_npz(path)
return prompt
except KeyError as e:
raise KeyError(f"Voice {voice} not found in {extra_voice_dirs}") from e
if len(paths) == 1 and paths[0].endswith(".npz"):
return load_npz(path[0])
else:
audio_path = paths[0]
# replace the file extension with .npz
output_path = os.path.splitext(audio_path)[0] + ".npz"
generate_voice(audio=audio_path, model=model, output_path=output_path)
return load_voice(model, voice, extra_voice_dirs)
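For reference, a minimal standalone sketch of the per-voice directory lookup introduced above: each voice is a subdirectory holding either a cached `.npz` prompt or raw `.wav`/`.mp3` samples to build one from. The function name `list_bark_voices` and the example layout are illustrative only, not part of the library.

```python
import os
from glob import glob
from typing import Dict, List

def list_bark_voices(voice_dirs: List[str]) -> Dict[str, List[str]]:
    """Mirror of the reworked get_voices: one subdirectory per voice,
    preferring a cached .npz prompt, else falling back to audio samples."""
    voices: Dict[str, List[str]] = {}
    for d in voice_dirs:
        for sub in os.listdir(d):
            subj = os.path.join(d, sub)
            if not os.path.isdir(subj):
                continue
            files = list(glob(f"{subj}/*.npz"))
            if not files:  # no cached prompt yet, use the raw audio instead
                files = list(glob(f"{subj}/*.wav")) + list(glob(f"{subj}/*.mp3"))
            voices[sub] = files
    return voices

# Example layout (hypothetical):
#   my_voices/
#   └── lj/
#       └── sample.wav   # load_voice() converts this to sample.npz on first use
```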
def zero_crossing_rate(audio, frame_length=1024, hop_length=512):
zero_crossings = np.sum(np.abs(np.diff(np.sign(audio))) / 2)
@ -85,7 +104,6 @@ def compute_average_bass_energy(audio_data, sample_rate, max_bass_freq=250):
def generate_voice(
audio,
text,
model,
output_path,
):
@ -106,9 +124,6 @@ def generate_voice(
encoded_frames = model.encodec.encode(audio)
codes = torch.cat([encoded[0] for encoded in encoded_frames], dim=-1).squeeze() # [n_q, T]
# get seconds of audio
seconds = audio.shape[-1] / model.config.sample_rate
# move codes to cpu
codes = codes.cpu().numpy()
@ -133,36 +148,6 @@ def generate_voice(
np.savez(output_path, fine_prompt=codes, coarse_prompt=codes[:2, :], semantic_prompt=semantic_tokens)
# while attempts < max_attempts:
# if attempts > 0 and base is not None:
# # Reset the base model token
# print(f"Reset the base model token Regenerating...")
# base = None
# audio_array, x = model.generate_audio(text, history_promp=None, base=base, **kwargs)
# zcr = zero_crossing_rate(audio_array)
# spectral_contrast = compute_spectral_contrast(audio_array, model.config.sample_rate)
# bass_energy = compute_average_bass_energy(audio_array, model.config.sample_rate)
# print(f"Attempt {attempts + 1}: ZCR = {zcr}, Spectral Contrast = {spectral_contrast:.2f}, Bass Energy = {bass_energy:.2f}")
# # Save the audio array to the output_array directory with a random name for debugging
# #output_file = os.path.join(output_directory, f"audio_{zcr:.2f}_sc{spectral_contrast:.2f}_be{bass_energy:.2f}.wav")
# #wavfile.write(output_file, sample_rate, audio_array)
# #print(f"Saved audio array to {output_file}")
# if zcr < zcr_threshold and spectral_contrast < spectral_threshold and bass_energy < bass_energy_threshold:
# print(f"Audio passed ZCR, Spectral Contrast, and Bass Energy thresholds. No need to regenerate.")
# break
# else:
# print(f"Audio failed ZCR, Spectral Contrast, and/or Bass Energy thresholds. Regenerating...")
# attempts += 1
# if attempts == max_attempts:
# print("Reached maximum attempts. Returning the last generated audio.")
# return audio_array, x, zcr, spectral_contrast, bass_energy
def generate_text_semantic(
text,
@ -224,7 +209,7 @@ def generate_text_semantic(
np.hstack([encoded_text, semantic_history, np.array([model.config.SEMANTIC_INFER_TOKEN])]).astype(np.int64)
)[None]
assert x.shape[1] == 256 + 256 + 1
with _inference_mode():
with inference_mode():
x = x.to(model.device)
n_tot_steps = 768
# custom tqdm updates since we don't know when eos will occur
@ -285,8 +270,8 @@ def generate_text_semantic(
pbar_state = req_pbar_state
pbar.close()
out = x.detach().cpu().numpy().squeeze()[256 + 256 + 1 :]
assert all(0 <= out) and all(out < model.config.SEMANTIC_VOCAB_SIZE)
_clear_cuda_cache()
assert all(out >= 0) and all(out < model.config.SEMANTIC_VOCAB_SIZE)
clear_cuda_cache()
return out
@ -382,7 +367,7 @@ def generate_coarse(
x_semantic = np.hstack([x_semantic_history, x_semantic]).astype(np.int32)
x_coarse = x_coarse_history.astype(np.int32)
base_semantic_idx = len(x_semantic_history)
with _inference_mode():
with inference_mode():
x_semantic_in = torch.from_numpy(x_semantic)[None].to(model.device)
x_coarse_in = torch.from_numpy(x_coarse)[None].to(model.device)
n_window_steps = int(np.ceil(n_steps / sliding_window_len))
@ -456,7 +441,7 @@ def generate_coarse(
)
for n in range(1, model.config.N_COARSE_CODEBOOKS):
gen_coarse_audio_arr[n, :] -= n * model.config.CODEBOOK_SIZE
_clear_cuda_cache()
clear_cuda_cache()
return gen_coarse_audio_arr
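The loop above removes the per-codebook offset that keeps token ids from different codebooks disjoint while they are generated as one flat sequence. A small self-contained sketch of that round trip; `CODEBOOK_SIZE = 1024` and two coarse codebooks are assumptions here, the model reads both from its config:

```python
import numpy as np

CODEBOOK_SIZE = 1024        # assumed; the diff reads model.config.CODEBOOK_SIZE
N_COARSE_CODEBOOKS = 2      # assumed; the diff reads model.config.N_COARSE_CODEBOOKS

# per-codebook tokens, shape [n_codebooks, T]
codes = np.random.randint(0, CODEBOOK_SIZE, size=(N_COARSE_CODEBOOKS, 8))

# flatten: offset codebook n by n * CODEBOOK_SIZE so ids from different codebooks never collide
flat = (codes + np.arange(N_COARSE_CODEBOOKS)[:, None] * CODEBOOK_SIZE).T.ravel()

# unflatten: reshape back to [n_codebooks, T] and subtract the offset again
recovered = flat.reshape(-1, N_COARSE_CODEBOOKS).T.copy()
for n in range(1, N_COARSE_CODEBOOKS):
    recovered[n, :] -= n * CODEBOOK_SIZE

assert np.array_equal(codes, recovered)
```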
@ -526,7 +511,7 @@ def generate_fine(
)
# we can be lazy about fractional loop and just keep overwriting codebooks
n_loops = np.max([0, int(np.ceil((x_coarse_gen.shape[1] - (1024 - n_history)) / 512))]) + 1
with _inference_mode():
with inference_mode():
in_arr = torch.tensor(in_arr.T).to(model.device)
for n in tqdm.tqdm(range(n_loops), disable=silent):
start_idx = np.min([n * 512, in_arr.shape[0] - 1024])
@ -558,14 +543,12 @@ def generate_fine(
if n_remove_from_end > 0:
gen_fine_arr = gen_fine_arr[:, :-n_remove_from_end]
assert gen_fine_arr.shape[-1] == x_coarse_gen.shape[-1]
_clear_cuda_cache()
clear_cuda_cache()
return gen_fine_arr
def codec_decode(fine_tokens, model):
"""Turn quantized audio codes into audio array using encodec."""
from TTS.utils.audio.numpy_transforms import save_wav
arr = torch.from_numpy(fine_tokens)[None]
arr = arr.to(model.device)
arr = arr.transpose(0, 1)

View File

@ -1,17 +1,12 @@
import contextlib
# import funcy
import functools
import hashlib
import logging
import os
import re
import requests
import torch
import tqdm
from encodec import EncodecModel
from transformers import BertTokenizer
from TTS.tts.layers.bark.model import GPT, GPTConfig
from TTS.tts.layers.bark.model_fine import FineGPT, FineGPTConfig
@ -31,8 +26,6 @@ else:
# hold models in global scope to lazy load
global models
models = {}
logger = logging.getLogger(__name__)
@ -44,10 +37,10 @@ if not hasattr(torch.nn.functional, "scaled_dot_product_attention"):
)
def _string_md5(s):
m = hashlib.md5()
m.update(s.encode("utf-8"))
return m.hexdigest()
# def _string_md5(s):
# m = hashlib.md5()
# m.update(s.encode("utf-8"))
# return m.hexdigest()
def _md5(fname):
@ -58,18 +51,18 @@ def _md5(fname):
return hash_md5.hexdigest()
def _get_ckpt_path(model_type, CACHE_DIR):
model_name = _string_md5(REMOTE_MODEL_PATHS[model_type]["path"])
return os.path.join(CACHE_DIR, f"{model_name}.pt")
# def _get_ckpt_path(model_type, CACHE_DIR):
# model_name = _string_md5(REMOTE_MODEL_PATHS[model_type]["path"])
# return os.path.join(CACHE_DIR, f"{model_name}.pt")
S3_BUCKET_PATH_RE = r"s3\:\/\/(.+?)\/"
# S3_BUCKET_PATH_RE = r"s3\:\/\/(.+?)\/"
def _parse_s3_filepath(s3_filepath):
bucket_name = re.search(S3_BUCKET_PATH_RE, s3_filepath).group(1)
rel_s3_filepath = re.sub(S3_BUCKET_PATH_RE, "", s3_filepath)
return bucket_name, rel_s3_filepath
# def _parse_s3_filepath(s3_filepath):
# bucket_name = re.search(S3_BUCKET_PATH_RE, s3_filepath).group(1)
# rel_s3_filepath = re.sub(S3_BUCKET_PATH_RE, "", s3_filepath)
# return bucket_name, rel_s3_filepath
def _download(from_s3_path, to_local_path, CACHE_DIR):
@ -83,7 +76,7 @@ def _download(from_s3_path, to_local_path, CACHE_DIR):
progress_bar.update(len(data))
file.write(data)
progress_bar.close()
if total_size_in_bytes != 0 and progress_bar.n != total_size_in_bytes:
if total_size_in_bytes not in [0, progress_bar.n]:
raise ValueError("ERROR, something went wrong")
@ -107,27 +100,27 @@ if torch.cuda.is_available():
@contextlib.contextmanager
def _inference_mode():
def inference_mode():
with InferenceContext(), torch.inference_mode(), torch.no_grad(), autocast():
yield
def _clear_cuda_cache():
def clear_cuda_cache():
if torch.cuda.is_available():
torch.cuda.empty_cache()
torch.cuda.synchronize()
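A minimal usage sketch of the two renamed helpers, using the import path shown in the inference file earlier in this diff; `sample_logits` and `model` are hypothetical stand-ins for the callers shown above:

```python
import torch
from TTS.tts.layers.bark.load_model import clear_cuda_cache, inference_mode

def sample_logits(model, x: torch.Tensor) -> torch.Tensor:
    # run the forward pass without autograd (and with autocast when CUDA is available)
    with inference_mode():
        x = x.to(model.device)
        logits = model(x)
    # release cached CUDA blocks once the intermediate tensors are out of scope
    clear_cuda_cache()
    return logits.cpu()
```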
def clean_models(model_key=None):
global models
model_keys = [model_key] if model_key is not None else models.keys()
for k in model_keys:
if k in models:
del models[k]
_clear_cuda_cache()
# def clean_models(model_key=None):
# global models
# model_keys = [model_key] if model_key is not None else models.keys()
# for k in model_keys:
# if k in models:
# del models[k]
# clear_cuda_cache()
def _load_model(ckpt_path, device, config, model_type="text"):
def load_model(ckpt_path, device, config, model_type="text"):
logger.info(f"loading {model_type} model from {ckpt_path}...")
if device == "cpu":
@ -174,13 +167,13 @@ def _load_model(ckpt_path, device, config, model_type="text"):
state_dict = checkpoint["model"]
# fixup checkpoint
unwanted_prefix = "_orig_mod."
for k, v in list(state_dict.items()):
for k, _ in list(state_dict.items()):
if k.startswith(unwanted_prefix):
state_dict[k[len(unwanted_prefix) :]] = state_dict.pop(k)
extra_keys = set(state_dict.keys()) - set(model.state_dict().keys())
extra_keys = set([k for k in extra_keys if not k.endswith(".attn.bias")])
extra_keys = set(k for k in extra_keys if not k.endswith(".attn.bias"))
missing_keys = set(model.state_dict().keys()) - set(state_dict.keys())
missing_keys = set([k for k in missing_keys if not k.endswith(".attn.bias")])
missing_keys = set(k for k in missing_keys if not k.endswith(".attn.bias"))
if len(extra_keys) != 0:
raise ValueError(f"extra keys found: {extra_keys}")
if len(missing_keys) != 0:
@ -192,63 +185,63 @@ def _load_model(ckpt_path, device, config, model_type="text"):
model.eval()
model.to(device)
del checkpoint, state_dict
_clear_cuda_cache()
clear_cuda_cache()
return model, config
def _load_codec_model(device):
model = EncodecModel.encodec_model_24khz()
model.set_target_bandwidth(6.0)
model.eval()
model.to(device)
_clear_cuda_cache()
return model
# def _load_codec_model(device):
# model = EncodecModel.encodec_model_24khz()
# model.set_target_bandwidth(6.0)
# model.eval()
# model.to(device)
# clear_cuda_cache()
# return model
def load_model(ckpt_path=None, use_gpu=True, force_reload=False, model_type="text"):
_load_model_f = functools.partial(_load_model, model_type=model_type)
if model_type not in ("text", "coarse", "fine"):
raise NotImplementedError()
global models
if torch.cuda.device_count() == 0 or not use_gpu:
device = "cpu"
else:
device = "cuda"
model_key = str(device) + f"__{model_type}"
if model_key not in models or force_reload:
if ckpt_path is None:
ckpt_path = _get_ckpt_path(model_type)
clean_models(model_key=model_key)
model = _load_model_f(ckpt_path, device)
models[model_key] = model
return models[model_key]
# def load_model(ckpt_path=None, use_gpu=True, force_reload=False, model_type="text"):
# _load_model_f = functools.partial(_load_model, model_type=model_type)
# if model_type not in ("text", "coarse", "fine"):
# raise NotImplementedError()
# global models
# if torch.cuda.device_count() == 0 or not use_gpu:
# device = "cpu"
# else:
# device = "cuda"
# model_key = str(device) + f"__{model_type}"
# if model_key not in models or force_reload:
# if ckpt_path is None:
# ckpt_path = _get_ckpt_path(model_type)
# clean_models(model_key=model_key)
# model = _load_model_f(ckpt_path, device)
# models[model_key] = model
# return models[model_key]
def load_codec_model(use_gpu=True, force_reload=False):
global models
if torch.cuda.device_count() == 0 or not use_gpu:
device = "cpu"
else:
device = "cuda"
model_key = str(device) + f"__codec"
if model_key not in models or force_reload:
clean_models(model_key=model_key)
model = _load_codec_model(device)
models[model_key] = model
return models[model_key]
# def load_codec_model(use_gpu=True, force_reload=False):
# global models
# if torch.cuda.device_count() == 0 or not use_gpu:
# device = "cpu"
# else:
# device = "cuda"
# model_key = str(device) + f"__codec"
# if model_key not in models or force_reload:
# clean_models(model_key=model_key)
# model = _load_codec_model(device)
# models[model_key] = model
# return models[model_key]
def preload_models(
text_ckpt_path=None, coarse_ckpt_path=None, fine_ckpt_path=None, use_gpu=True, use_smaller_models=False
):
global USE_SMALLER_MODELS
global REMOTE_MODEL_PATHS
if use_smaller_models:
USE_SMALLER_MODELS = True
logger.info("Using smaller models generation.py")
REMOTE_MODEL_PATHS = SMALL_REMOTE_MODEL_PATHS
# def preload_models(
# text_ckpt_path=None, coarse_ckpt_path=None, fine_ckpt_path=None, use_gpu=True, use_smaller_models=False
# ):
# global USE_SMALLER_MODELS
# global REMOTE_MODEL_PATHS
# if use_smaller_models:
# USE_SMALLER_MODELS = True
# logger.info("Using smaller models generation.py")
# REMOTE_MODEL_PATHS = SMALL_REMOTE_MODEL_PATHS
_ = load_model(ckpt_path=text_ckpt_path, model_type="text", use_gpu=use_gpu, force_reload=True)
_ = load_model(ckpt_path=coarse_ckpt_path, model_type="coarse", use_gpu=use_gpu, force_reload=True)
_ = load_model(ckpt_path=fine_ckpt_path, model_type="fine", use_gpu=use_gpu, force_reload=True)
_ = load_codec_model(use_gpu=use_gpu, force_reload=True)
# _ = load_model(ckpt_path=text_ckpt_path, model_type="text", use_gpu=use_gpu, force_reload=True)
# _ = load_model(ckpt_path=coarse_ckpt_path, model_type="coarse", use_gpu=use_gpu, force_reload=True)
# _ = load_model(ckpt_path=fine_ckpt_path, model_type="fine", use_gpu=use_gpu, force_reload=True)
# _ = load_codec_model(use_gpu=use_gpu, force_reload=True)

View File

@ -6,8 +6,8 @@ import math
from dataclasses import dataclass
import torch
import torch.nn as nn
from coqpit import Coqpit
from torch import nn
from torch.nn import functional as F
@ -19,8 +19,8 @@ class LayerNorm(nn.Module):
self.weight = nn.Parameter(torch.ones(ndim))
self.bias = nn.Parameter(torch.zeros(ndim)) if bias else None
def forward(self, input):
return F.layer_norm(input, self.weight.shape, self.weight, self.bias, 1e-5)
def forward(self, x):
return F.layer_norm(x, self.weight.shape, self.weight, self.bias, 1e-5)
class CausalSelfAttention(nn.Module):
@ -177,7 +177,7 @@ class GPT(nn.Module):
def forward(self, idx, merge_context=False, past_kv=None, position_ids=None, use_cache=False):
device = idx.device
b, t = idx.size()
_, t = idx.size()
if past_kv is not None:
assert t == 1
tok_emb = self.transformer.wte(idx) # token embeddings of shape (b, t, n_embd)
@ -219,7 +219,7 @@ class GPT(nn.Module):
new_kv = () if use_cache else None
for i, (block, past_layer_kv) in enumerate(zip(self.transformer.h, past_kv)):
for _, (block, past_layer_kv) in enumerate(zip(self.transformer.h, past_kv)):
x, kv = block(x, past_kv=past_layer_kv, use_cache=use_cache)
if use_cache:

View File

@ -6,7 +6,7 @@ import math
from dataclasses import dataclass
import torch
import torch.nn as nn
from torch import nn
from torch.nn import functional as F
from .model import GPT, MLP, GPTConfig

View File

@ -341,7 +341,7 @@ class Synthesizer(object):
use_gl = self.vocoder_model is None
if not reference_wav:
if not reference_wav: # not voice conversion
for sen in sens:
if hasattr(self.tts_model, "synthesize"):
sp_name = "random" if speaker_name is None else speaker_name

View File

@ -1,7 +1,7 @@
# Tortoise 🐢
Tortoise is a very expressive TTS system with impressive voice cloning capabilities. It is based on a GPT-like autoregressive acoustic model that converts input
text to discretized acoustic tokens, a diffusion model that converts these tokens to mel-spectrogram frames, and a UnivNet vocoder to convert the spectrograms to
the final audio signal. The important downside is that Tortoise is very slow compared to the parallel TTS models like VITS.
the final audio signal. The important downside is that Tortoise is very slow compared to the parallel TTS models like VITS.
Big thanks to 👑[@manmay-nakhashi](https://github.com/manmay-nakhashi) who helped us implement Tortoise in 🐸TTS.
@ -12,7 +12,7 @@ from TTS.tts.configs.tortoise_config import TortoiseConfig
from TTS.tts.models.tortoise import Tortoise
config = TortoiseConfig()
model = Tortoise.inif_from_config(config)
model = Tortoise.init_from_config(config)
model.load_checkpoint(config, checkpoint_dir="paths/to/models_dir/", eval=True)
# with random speaker
@ -29,23 +29,23 @@ from TTS.api import TTS
tts = TTS("tts_models/en/multi-dataset/tortoise-v2")
# cloning `lj` voice from `TTS/tts/utils/assets/tortoise/voices/lj`
# with custom inference settings overriding defaults.
tts.tts_to_file(text="Hello, my name is Manmay , how are you?",
# with custom inference settings overriding defaults.
tts.tts_to_file(text="Hello, my name is Manmay , how are you?",
file_path="output.wav",
voice_dir="TTS/tts/utils/assets/tortoise/voices/",
voice_dir="path/to/tortoise/voices/dir/",
speaker="lj",
num_autoregressive_samples=1,
diffusion_iterations=10)
# Using presets with the same voice
tts.tts_to_file(text="Hello, my name is Manmay , how are you?",
tts.tts_to_file(text="Hello, my name is Manmay , how are you?",
file_path="output.wav",
voice_dir="TTS/tts/utils/assets/tortoise/voices/",
voice_dir="path/to/tortoise/voices/dir/",
speaker="lj",
preset="ultra_fast")
# Random voice generation
tts.tts_to_file(text="Hello, my name is Manmay , how are you?",
tts.tts_to_file(text="Hello, my name is Manmay , how are you?",
file_path="output.wav")
```
@ -54,16 +54,16 @@ Using 🐸TTS Command line:
```console
# cloning the `lj` voice
tts --model_name tts_models/en/multi-dataset/tortoise-v2 \
--text "This is an example." \
--out_path "/data/speech_synth/coqui-tts/TTS/tests/outputs/output.wav" \
--voice_dir TTS/tts/utils/assets/tortoise/voices/ \
--text "This is an example." \
--out_path "output.wav" \
--voice_dir path/to/tortoise/voices/dir/ \
--speaker_idx "lj" \
--progress_bar True
# Random voice generation
tts --model_name tts_models/en/multi-dataset/tortoise-v2 \
--text "This is an example." \
--out_path "/data/speech_synth/coqui-tts/TTS/tests/outputs/output.wav" \
--out_path "output.wav" \
--progress_bar True
```