mirror of https://github.com/coqui-ai/TTS.git
213 lines
6.3 KiB
Plaintext
213 lines
6.3 KiB
Plaintext
{
|
|
"cells": [
|
|
{
|
|
"cell_type": "markdown",
|
|
"metadata": {},
|
|
"source": [
|
|
"This is a notebook used to generate the speaker embeddings with the CorentinJ GE2E model trained with Angular Prototypical loss for multi-speaker training.\n",
|
|
"\n",
|
|
"Before running this script please DON'T FORGET:\n",
|
|
"- to set the right paths in the cell below.\n",
|
|
"\n",
|
|
"Repositories:\n",
|
|
"- TTS: https://github.com/coqui-ai/TTS\n",
|
|
"- CorentinJ GE2E: https://github.com/Edresson/GE2E-Speaker-Encoder"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": null,
|
|
"metadata": {
|
|
"scrolled": true
|
|
},
|
|
"outputs": [],
|
|
"source": [
|
|
"import os\n",
|
|
"import importlib\n",
|
|
"import random\n",
|
|
"import librosa\n",
|
|
"import torch\n",
|
|
"\n",
|
|
"import numpy as np\n",
|
|
"from TTS.utils.io import load_config\n",
|
|
"from tqdm import tqdm\n",
|
|
"from TTS.tts.utils.speakers import save_speaker_mapping, load_speaker_mapping\n",
|
|
"\n",
|
|
"# you may need to change this depending on your system\n",
|
|
"os.environ['CUDA_VISIBLE_DEVICES']='0'"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": null,
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"source": [
|
|
"# Clone encoder \n",
|
|
"!git clone https://github.com/CorentinJ/Real-Time-Voice-Cloning.git\n",
|
|
"os.chdir('Real-Time-Voice-Cloning/')"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": null,
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"source": [
|
|
"# Install the requirements of the cloned Real-Time-Voice-Cloning encoder\n",
|
|
"# Version specifiers are quoted so the shell does not treat '>' as an output redirection.\n",
|
|
"!python -m pip install umap-learn visdom webrtcvad \"librosa>=0.5.1\" \"matplotlib>=2.0.2\" \"numpy>=1.14.0\" \"scipy>=1.0.0\" tqdm sounddevice Unidecode inflect multiprocess numba"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": null,
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"source": [
|
|
"#Download encoder Checkpoint\n",
|
|
"!wget https://github.com/Edresson/Real-Time-Voice-Cloning/releases/download/checkpoints/pretrained.zip\n",
|
|
"!unzip pretrained.zip"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": null,
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"source": [
|
|
"from encoder import inference as encoder\n",
|
|
"from encoder.params_model import model_embedding_size as speaker_embedding_size\n",
|
|
"from pathlib import Path"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": null,
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"source": [
|
|
"print(\"Preparing the encoder...\")\n",
|
|
"encoder.load_model(Path('encoder/saved_models/pretrained.pt'))\n",
|
|
"print(\"Testing your configuration with small inputs.\")\n",
|
|
"# Forward an audio waveform of zeroes that lasts 1 second. Notice how we can get the encoder's\n",
|
|
"# sampling rate, which may differ.\n",
|
|
"# If you're unfamiliar with digital audio, know that it is encoded as an array of floats \n",
|
|
"# (or sometimes integers, but mostly floats in this project) ranging from -1 to 1.\n",
|
|
"# The sampling rate is the number of values (samples) recorded per second, it is set to\n",
|
|
"# 16000 for the encoder. Creating an array of length <sampling_rate> will always correspond \n",
|
|
"# to an audio of 1 second.\n",
|
|
"print(\"\\tTesting the encoder...\")\n",
|
|
"\n",
|
|
"wav = np.zeros(encoder.sampling_rate) \n",
|
|
"embed = encoder.embed_utterance(wav)\n",
|
|
"print(embed.shape)\n",
|
|
"\n",
|
|
"# Embeddings are L2-normalized (this isn't important here, but if you want to make your own \n",
|
|
"# embeddings it will be).\n",
|
|
"#embed /= np.linalg.norm(embed) # for random embedding\n"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": null,
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"source": [
|
|
"SAVE_PATH = '../'"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": null,
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"source": [
|
|
"# Set constants\n",
|
|
"DATASETS_NAME = ['vctk'] # list the datasets\n",
|
|
"DATASETS_PATH = ['../../../../../datasets/VCTK-Corpus-removed-silence/']\n",
|
|
"DATASETS_METAFILE = ['']\n",
|
|
"USE_CUDA = True"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": null,
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"source": [
|
|
"# Preprocess datasets: gather the metadata items of every configured dataset.\n",
|
|
"# DATASETS_NAME / DATASETS_PATH / DATASETS_METAFILE are parallel lists (one entry per dataset).\n",
|
|
"assert len(DATASETS_NAME) == len(DATASETS_PATH) == len(DATASETS_METAFILE), 'DATASETS_NAME, DATASETS_PATH and DATASETS_METAFILE must have the same length'\n",
|
|
"\n",
|
|
"# Import the preprocess module once; each dataset's loader is looked up by its lower-cased name.\n",
|
|
"preprocess_module = importlib.import_module('TTS.tts.datasets.preprocess')\n",
|
|
"meta_data = []\n",
|
|
"for dataset_name, dataset_path, dataset_metafile in zip(DATASETS_NAME, DATASETS_PATH, DATASETS_METAFILE):\n",
|
|
"    preprocessor = getattr(preprocess_module, dataset_name.lower())\n",
|
|
"    meta_data += preprocessor(dataset_path, dataset_metafile)\n",
|
|
"\n",
|
|
"# Compute one embedding per audio file, keyed by the file's base name.\n",
|
|
"# NOTE(review): files sharing a base name across datasets would overwrite each other - confirm names are unique.\n",
|
|
"embeddings_dict = {}\n",
|
|
"# Each item unpacks as (unused first field, wav path, speaker id).\n",
|
|
"for _, wave_file_path, speaker_id in tqdm(meta_data):\n",
|
|
"    wav_file_name = os.path.basename(wave_file_path)\n",
|
|
"    # Extract Embedding\n",
|
|
"    preprocessed_wav = encoder.preprocess_wav(wave_file_path)\n",
|
|
"    file_embedding = encoder.embed_utterance(preprocessed_wav)\n",
|
|
"    # Store as a flat Python list so the mapping can be serialized later.\n",
|
|
"    embeddings_dict[wav_file_name] = [file_embedding.reshape(-1).tolist(), speaker_id]"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": null,
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"source": [
|
|
"# create and export speakers.json; embeddings are stored as returned by the encoder (no extra L2 norm is applied here)\n",
|
|
"speaker_mapping = {sample: {'name': speaker_name, 'embedding': embedding} for sample, (embedding, speaker_name) in embeddings_dict.items()}\n",
|
|
"save_speaker_mapping(SAVE_PATH, speaker_mapping)\n"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": null,
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"source": [
|
|
"#test load integrity\n",
|
|
"speaker_mapping_load = load_speaker_mapping(SAVE_PATH)\n",
|
|
"assert speaker_mapping == speaker_mapping_load\n",
|
|
"print(\"The file speakers.json has been exported to \",SAVE_PATH, ' with ', len(embeddings_dict.keys()), ' samples')"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": null,
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"source": []
|
|
}
|
|
],
|
|
"metadata": {
|
|
"kernelspec": {
|
|
"display_name": "Python 3",
|
|
"language": "python",
|
|
"name": "python3"
|
|
},
|
|
"language_info": {
|
|
"codemirror_mode": {
|
|
"name": "ipython",
|
|
"version": 3
|
|
},
|
|
"file_extension": ".py",
|
|
"mimetype": "text/x-python",
|
|
"name": "python",
|
|
"nbconvert_exporter": "python",
|
|
"pygments_lexer": "ipython3",
|
|
"version": "3.8.5"
|
|
}
|
|
},
|
|
"nbformat": 4,
|
|
"nbformat_minor": 4
|
|
}
|