mirror of https://github.com/coqui-ai/TTS.git
add Jupyter Notebook for Extract Speaker Embedding per sample using AngleProto
parent
bd4c6ee42a
commit
f37159c135
|
@ -1,60 +0,0 @@
|
|||
|
||||
"github_branch":"* dev-gst-embeddings",
|
||||
{
|
||||
"run_name": "libritts_100+360-angleproto",
|
||||
"run_description": "train speaker encoder for libritts 100 and 360",
|
||||
"audio":{
|
||||
// Audio processing parameters
|
||||
"num_mels": 80, // size of the mel spec frame.
|
||||
"num_freq": 1024, // number of stft frequency levels. Size of the linear spectrogram frame.
|
||||
"sample_rate": 22050, // DATASET-RELATED: wav sample-rate. If different than the original data, it is resampled.
|
||||
"win_length": 1024, // stft window length in samples.
|
||||
"hop_length": 256, // stft window hop-length in samples.
|
||||
"frame_length_ms": null, // stft window length in ms. If null, 'win_length' is used.
|
||||
"frame_shift_ms": null, // stft window hop-length in ms. If null, 'hop_length' is used.
|
||||
"preemphasis": 0.98, // pre-emphasis to reduce spec noise and make it more structured. If 0.0, no pre-emphasis.
|
||||
"min_level_db": -100, // normalization range
|
||||
"ref_level_db": 20, // reference level db, theoretically 20db is the sound of air.
|
||||
"power": 1.5, // value to sharpen wav signals after GL algorithm.
|
||||
"griffin_lim_iters": 60,// #griffin-lim iterations. 30-60 is a good range. Larger the value, slower the generation.
|
||||
// Normalization parameters
|
||||
"signal_norm": true, // normalize the spec values in range [0, 1]
|
||||
"symmetric_norm": true, // move normalization to range [-1, 1]
|
||||
"max_norm": 4.0, // scale normalization to range [-max_norm, max_norm] or [0, max_norm]
|
||||
"clip_norm": true, // clip normalized values into the range.
|
||||
"mel_fmin": 0.0, // minimum freq level for mel-spec. ~50 for male and ~95 for female voices. Tune for dataset!!
|
||||
"mel_fmax": 8000.0, // maximum freq level for mel-spec. Tune for dataset!!
|
||||
"do_trim_silence": false, // enable trimming of silence from audio as you load it. LJspeech (false), TWEB (false), Nancy (true)
|
||||
"trim_db": 60 // threshold for trimming silence. Set this according to your dataset.
|
||||
},
|
||||
"reinit_layers": [],
|
||||
"loss": "angleproto", // "ge2e" to use Generalized End-to-End loss and "angleproto" to use Angular Prototypical loss (new SOTA)
|
||||
"grad_clip": 3.0, // upper limit for gradients for clipping.
|
||||
"epochs": 1000, // total number of epochs to train.
|
||||
"lr": 0.0001, // Initial learning rate. If Noam decay is active, maximum learning rate.
|
||||
"lr_decay": false, // if true, Noam learning rate decaying is applied through training.
|
||||
"warmup_steps": 4000, // Noam decay steps to increase the learning rate from 0 to "lr"
|
||||
"tb_model_param_stats": false, // true, plots param stats per layer on tensorboard. Might be memory consuming, but good for debugging.
|
||||
"steps_plot_stats": 10, // number of steps to plot embeddings.
|
||||
"num_speakers_in_batch": 32, // Batch size for training. Lower values than 32 might cause hard to learn attention. It is overwritten by 'gradual_training'.
|
||||
"wd": 0.000001, // Weight decay weight.
|
||||
"checkpoint": true, // If true, it saves checkpoints per "save_step"
|
||||
"save_step": 1000, // Number of training steps between saves of training stats and checkpoints.
|
||||
"print_step": 1, // Number of steps between training log lines on the console.
|
||||
"output_path": "../../checkpoints/libri_tts/speaker_encoder/", // DATASET-RELATED: output path for all training outputs.
|
||||
"model": {
|
||||
"input_dim": 80, // input_dim == num_mels
|
||||
"proj_dim": 128,
|
||||
"lstm_dim": 384,
|
||||
"num_lstm_layers": 3
|
||||
},
|
||||
"datasets":
|
||||
[
|
||||
{
|
||||
"name": "vctk",
|
||||
"path": "../../../datasets/VCTK-Corpus-removed-silence/",
|
||||
"meta_file_train": null,
|
||||
"meta_file_val": null
|
||||
}
|
||||
]
|
||||
}
|
|
@ -1,60 +0,0 @@
|
|||
|
||||
"github_branch":"* dev-gst-embeddings",
|
||||
{
|
||||
"run_name": "libritts_100+360-angleproto",
|
||||
"run_description": "train speaker encoder for libritts 100 and 360",
|
||||
"audio":{
|
||||
// Audio processing parameters
|
||||
"num_mels": 80, // size of the mel spec frame.
|
||||
"num_freq": 1024, // number of stft frequency levels. Size of the linear spectrogram frame.
|
||||
"sample_rate": 22050, // DATASET-RELATED: wav sample-rate. If different than the original data, it is resampled.
|
||||
"win_length": 1024, // stft window length in samples.
|
||||
"hop_length": 256, // stft window hop-length in samples.
|
||||
"frame_length_ms": null, // stft window length in ms. If null, 'win_length' is used.
|
||||
"frame_shift_ms": null, // stft window hop-length in ms. If null, 'hop_length' is used.
|
||||
"preemphasis": 0.98, // pre-emphasis to reduce spec noise and make it more structured. If 0.0, no pre-emphasis.
|
||||
"min_level_db": -100, // normalization range
|
||||
"ref_level_db": 20, // reference level db, theoretically 20db is the sound of air.
|
||||
"power": 1.5, // value to sharpen wav signals after GL algorithm.
|
||||
"griffin_lim_iters": 60,// #griffin-lim iterations. 30-60 is a good range. Larger the value, slower the generation.
|
||||
// Normalization parameters
|
||||
"signal_norm": true, // normalize the spec values in range [0, 1]
|
||||
"symmetric_norm": true, // move normalization to range [-1, 1]
|
||||
"max_norm": 4.0, // scale normalization to range [-max_norm, max_norm] or [0, max_norm]
|
||||
"clip_norm": true, // clip normalized values into the range.
|
||||
"mel_fmin": 0.0, // minimum freq level for mel-spec. ~50 for male and ~95 for female voices. Tune for dataset!!
|
||||
"mel_fmax": 8000.0, // maximum freq level for mel-spec. Tune for dataset!!
|
||||
"do_trim_silence": false, // enable trimming of silence from audio as you load it. LJspeech (false), TWEB (false), Nancy (true)
|
||||
"trim_db": 60 // threshold for trimming silence. Set this according to your dataset.
|
||||
},
|
||||
"reinit_layers": [],
|
||||
"loss": "angleproto", // "ge2e" to use Generalized End-to-End loss and "angleproto" to use Angular Prototypical loss (new SOTA)
|
||||
"grad_clip": 3.0, // upper limit for gradients for clipping.
|
||||
"epochs": 1000, // total number of epochs to train.
|
||||
"lr": 0.0001, // Initial learning rate. If Noam decay is active, maximum learning rate.
|
||||
"lr_decay": false, // if true, Noam learning rate decaying is applied through training.
|
||||
"warmup_steps": 4000, // Noam decay steps to increase the learning rate from 0 to "lr"
|
||||
"tb_model_param_stats": false, // true, plots param stats per layer on tensorboard. Might be memory consuming, but good for debugging.
|
||||
"steps_plot_stats": 10, // number of steps to plot embeddings.
|
||||
"num_speakers_in_batch": 32, // Batch size for training. Lower values than 32 might cause hard to learn attention. It is overwritten by 'gradual_training'.
|
||||
"wd": 0.000001, // Weight decay weight.
|
||||
"checkpoint": true, // If true, it saves checkpoints per "save_step"
|
||||
"save_step": 1000, // Number of training steps between saves of training stats and checkpoints.
|
||||
"print_step": 1, // Number of steps between training log lines on the console.
|
||||
"output_path": "../../checkpoints/libri_tts/speaker_encoder/", // DATASET-RELATED: output path for all training outputs.
|
||||
"model": {
|
||||
"input_dim": 80, // input_dim == num_mels
|
||||
"proj_dim": 128,
|
||||
"lstm_dim": 384,
|
||||
"num_lstm_layers": 3
|
||||
},
|
||||
"datasets":
|
||||
[
|
||||
{
|
||||
"name": "vctk",
|
||||
"path": "../../../datasets/VCTK-Corpus-removed-silence/",
|
||||
"meta_file_train": null,
|
||||
"meta_file_val": null
|
||||
}
|
||||
]
|
||||
}
|
|
@ -1,60 +0,0 @@
|
|||
|
||||
"github_branch":"* dev-gst-embeddings",
|
||||
{
|
||||
"run_name": "libritts_100+360-angleproto",
|
||||
"run_description": "train speaker encoder for libritts 100 and 360",
|
||||
"audio":{
|
||||
// Audio processing parameters
|
||||
"num_mels": 80, // size of the mel spec frame.
|
||||
"num_freq": 1024, // number of stft frequency levels. Size of the linear spectrogram frame.
|
||||
"sample_rate": 22050, // DATASET-RELATED: wav sample-rate. If different than the original data, it is resampled.
|
||||
"win_length": 1024, // stft window length in samples.
|
||||
"hop_length": 256, // stft window hop-length in samples.
|
||||
"frame_length_ms": null, // stft window length in ms. If null, 'win_length' is used.
|
||||
"frame_shift_ms": null, // stft window hop-length in ms. If null, 'hop_length' is used.
|
||||
"preemphasis": 0.98, // pre-emphasis to reduce spec noise and make it more structured. If 0.0, no pre-emphasis.
|
||||
"min_level_db": -100, // normalization range
|
||||
"ref_level_db": 20, // reference level db, theoretically 20db is the sound of air.
|
||||
"power": 1.5, // value to sharpen wav signals after GL algorithm.
|
||||
"griffin_lim_iters": 60,// #griffin-lim iterations. 30-60 is a good range. Larger the value, slower the generation.
|
||||
// Normalization parameters
|
||||
"signal_norm": true, // normalize the spec values in range [0, 1]
|
||||
"symmetric_norm": true, // move normalization to range [-1, 1]
|
||||
"max_norm": 4.0, // scale normalization to range [-max_norm, max_norm] or [0, max_norm]
|
||||
"clip_norm": true, // clip normalized values into the range.
|
||||
"mel_fmin": 0.0, // minimum freq level for mel-spec. ~50 for male and ~95 for female voices. Tune for dataset!!
|
||||
"mel_fmax": 8000.0, // maximum freq level for mel-spec. Tune for dataset!!
|
||||
"do_trim_silence": false, // enable trimming of silence from audio as you load it. LJspeech (false), TWEB (false), Nancy (true)
|
||||
"trim_db": 60 // threshold for trimming silence. Set this according to your dataset.
|
||||
},
|
||||
"reinit_layers": [],
|
||||
"loss": "angleproto", // "ge2e" to use Generalized End-to-End loss and "angleproto" to use Angular Prototypical loss (new SOTA)
|
||||
"grad_clip": 3.0, // upper limit for gradients for clipping.
|
||||
"epochs": 1000, // total number of epochs to train.
|
||||
"lr": 0.0001, // Initial learning rate. If Noam decay is active, maximum learning rate.
|
||||
"lr_decay": false, // if true, Noam learning rate decaying is applied through training.
|
||||
"warmup_steps": 4000, // Noam decay steps to increase the learning rate from 0 to "lr"
|
||||
"tb_model_param_stats": false, // true, plots param stats per layer on tensorboard. Might be memory consuming, but good for debugging.
|
||||
"steps_plot_stats": 10, // number of steps to plot embeddings.
|
||||
"num_speakers_in_batch": 32, // Batch size for training. Lower values than 32 might cause hard to learn attention. It is overwritten by 'gradual_training'.
|
||||
"wd": 0.000001, // Weight decay weight.
|
||||
"checkpoint": true, // If true, it saves checkpoints per "save_step"
|
||||
"save_step": 1000, // Number of training steps between saves of training stats and checkpoints.
|
||||
"print_step": 1, // Number of steps between training log lines on the console.
|
||||
"output_path": "../../checkpoints/libri_tts/speaker_encoder/", // DATASET-RELATED: output path for all training outputs.
|
||||
"model": {
|
||||
"input_dim": 80, // input_dim == num_mels
|
||||
"proj_dim": 128,
|
||||
"lstm_dim": 384,
|
||||
"num_lstm_layers": 3
|
||||
},
|
||||
"datasets":
|
||||
[
|
||||
{
|
||||
"name": "vctk",
|
||||
"path": "../../../datasets/VCTK-Corpus-removed-silence/",
|
||||
"meta_file_train": null,
|
||||
"meta_file_val": null
|
||||
}
|
||||
]
|
||||
}
|
|
@ -23,7 +23,7 @@ class GE2ELoss(nn.Module):
|
|||
self.b = nn.Parameter(torch.tensor(init_b))
|
||||
self.loss_method = loss_method
|
||||
|
||||
print('Initialised Generalized End-to-End loss')
|
||||
print(' > Initialised Generalized End-to-End loss')
|
||||
|
||||
assert self.loss_method in ["softmax", "contrast"]
|
||||
|
||||
|
@ -142,7 +142,7 @@ class AngleProtoLoss(nn.Module):
|
|||
self.b = nn.Parameter(torch.tensor(init_b))
|
||||
self.criterion = torch.nn.CrossEntropyLoss()
|
||||
|
||||
print('Initialised Angular Prototypical loss')
|
||||
print(' > Initialised Angular Prototypical loss')
|
||||
|
||||
def forward(self, x):
|
||||
"""
|
||||
|
|
|
@ -0,0 +1,163 @@
|
|||
{
|
||||
"cells": [
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"This is a notebook used to generate the speaker embeddings with the AngleProto speaker encoder model for multi-speaker training.\n",
|
||||
"\n",
|
||||
"Before running this script please DON'T FORGET: \n",
|
||||
"- to set file paths.\n",
|
||||
"- to download related model files from TTS.\n",
|
||||
"- download or clone related repos, linked below.\n",
|
||||
"- setup the repositories. ```python setup.py install```\n",
|
||||
"- to checkout right commit versions (given next to the model) of TTS.\n",
|
||||
"- to set the right paths in the cell below.\n",
|
||||
"\n",
|
||||
"Repository:\n",
|
||||
"- TTS: https://github.com/mozilla/TTS"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"%load_ext autoreload\n",
|
||||
"%autoreload 2\n",
|
||||
"import os\n",
|
||||
"import importlib\n",
|
||||
"import random\n",
|
||||
"import librosa\n",
|
||||
"import torch\n",
|
||||
"\n",
|
||||
"import numpy as np\n",
|
||||
"from tqdm import tqdm\n",
|
||||
"from TTS.tts.utils.speakers import save_speaker_mapping, load_speaker_mapping\n",
|
||||
"\n",
|
||||
"# you may need to change this depending on your system\n",
|
||||
"os.environ['CUDA_VISIBLE_DEVICES']='0'\n",
|
||||
"\n",
|
||||
"\n",
|
||||
"from TTS.tts.utils.speakers import save_speaker_mapping, load_speaker_mapping\n",
|
||||
"from TTS.utils.audio import AudioProcessor\n",
|
||||
"from TTS.utils.io import load_config"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"You should also adjust all the path constants to point at the relevant locations for you locally"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"MODEL_RUN_PATH = \"../../Mozilla-TTS/checkpoints/libritts_100+360-angleproto-June-06-2020_04+12PM-9c04d1f/\"\n",
|
||||
"MODEL_PATH = MODEL_RUN_PATH + \"best_model.pth.tar\"\n",
|
||||
"CONFIG_PATH = MODEL_RUN_PATH + \"config.json\"\n",
|
||||
"\n",
|
||||
"\n",
|
||||
"DATASETS_NAME = ['vctk'] # list the datasets\n",
|
||||
"DATASETS_PATH = ['../../../datasets/VCTK/']\n",
|
||||
"DATASETS_METAFILE = ['']\n",
|
||||
"\n",
|
||||
"USE_CUDA = True"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"#Preprocess dataset\n",
|
||||
"meta_data = []\n",
|
||||
"for i in range(len(DATASETS_NAME)):\n",
|
||||
" preprocessor = importlib.import_module('TTS.datasets.preprocess')\n",
|
||||
" preprocessor = getattr(preprocessor, DATASETS_NAME[i].lower())\n",
|
||||
" meta_data += preprocessor(DATASETS_PATH[i],DATASETS_METAFILE[i])\n",
|
||||
" \n",
|
||||
"meta_data= list(meta_data)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"c = load_config(CONFIG_PATH)\n",
|
||||
"ap = AudioProcessor(**c['audio'])\n",
|
||||
"\n",
|
||||
"model = SpeakerEncoder(**c.model)\n",
|
||||
"model.load_state_dict(torch.load(MODEL_PATH)['model'])\n",
|
||||
"model.eval()\n",
|
||||
"if USE_CUDA:\n",
|
||||
" model.cuda()\n",
|
||||
"\n",
|
||||
"embeddings_dict = {}\n",
|
||||
"len_meta_data= len(meta_data)\n",
|
||||
"\n",
|
||||
"for i in tqdm(range(len_meta_data)):\n",
|
||||
" _, wav_file, speaker_id = meta_data[i]\n",
|
||||
" wav_file_name = os.path.basename(wav_file)\n",
|
||||
" mel_spec = ap.melspectrogram(ap.load_wav(wav_file)).T\n",
|
||||
" mel_spec = torch.FloatTensor(mel_spec[None, :, :])\n",
|
||||
" if USE_CUDA:\n",
|
||||
" mel_spec = mel_spec.cuda()\n",
|
||||
" embedd = model.compute_embedding(mel_spec).cpu().detach().numpy().reshape(-1)\n",
|
||||
" embeddings_dict[wav_file_name] = [embedd,speaker_id]\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"# create and export speakers.json\n",
|
||||
"speaker_mapping = {sample: {'name': embeddings_dict[sample][1], 'embedding':embeddings_dict[sample][0].reshape(-1).tolist()} for i, sample in enumerate(embeddings_dict.keys())}\n",
|
||||
"save_speaker_mapping(MODEL_RUN_PATH, speaker_mapping)\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"#test load integrity\n",
|
||||
"speaker_mapping_load = load_speaker_mapping(MODEL_RUN_PATH)\n",
|
||||
"assert speaker_mapping == speaker_mapping_load\n",
|
||||
"print(\"The file speakers.json has been exported to \",MODEL_RUN_PATH, ' with ', len(embeddings_dict.keys()), ' speakers')"
|
||||
]
|
||||
}
|
||||
],
|
||||
"metadata": {
|
||||
"kernelspec": {
|
||||
"display_name": "Python 3",
|
||||
"language": "python",
|
||||
"name": "python3"
|
||||
},
|
||||
"language_info": {
|
||||
"codemirror_mode": {
|
||||
"name": "ipython",
|
||||
"version": 3
|
||||
},
|
||||
"file_extension": ".py",
|
||||
"mimetype": "text/x-python",
|
||||
"name": "python",
|
||||
"nbconvert_exporter": "python",
|
||||
"pygments_lexer": "ipython3",
|
||||
"version": "3.7.6"
|
||||
}
|
||||
},
|
||||
"nbformat": 4,
|
||||
"nbformat_minor": 4
|
||||
}
|
|
@ -63,9 +63,9 @@
|
|||
"CONFIG_PATH = MODEL_RUN_PATH + \"config.json\"\n",
|
||||
"\n",
|
||||
"\n",
|
||||
"DATASETS_NAME = ['brspeech'] # list the datasets\n",
|
||||
"DATASETS_PATH = ['../../../datasets/BRSpeech-2.0-beta8']\n",
|
||||
"DATASETS_METAFILE = ['TTS_metadata_brspeech2+cv_all_valited_lines.csv']\n",
|
||||
"DATASETS_NAME = ['vctk'] # list the datasets\n",
|
||||
"DATASETS_PATH = ['../../../datasets/VCTK/']\n",
|
||||
"DATASETS_METAFILE = ['']\n",
|
||||
"\n",
|
||||
"USE_CUDA = True"
|
||||
]
|
||||
|
|
Loading…
Reference in New Issue