add Jupyter Notebook for Extract Speaker Embedding per sample using AngleProto

2020-07-31 01:03:03 -03:00 · 2020-07-31 01:03:03 -03:00 · f37159c135
parent bd4c6ee42a
commit f37159c135
6 changed files with 168 additions and 185 deletions
--- a/checkpoints/libri_tts/speaker_encoder/libritts_100+360-angleproto-July-31-2020_12+37AM-debug/config.json
+++ b/checkpoints/libri_tts/speaker_encoder/libritts_100+360-angleproto-July-31-2020_12+37AM-debug/config.json
@ -1,60 +0,0 @@
 "github_branch":"* dev-gst-embeddings",
 {
    "run_name": "libritts_100+360-angleproto",
    "run_description": "train speaker encoder for libritts 100 and 360",
    "audio":{
        // Audio processing parameters
        "num_mels": 80,         // size of the mel spec frame. 
        "num_freq": 1024,       // number of stft frequency levels. Size of the linear spectogram frame.
        "sample_rate": 22050,   // DATASET-RELATED: wav sample-rate. If different than the original data, it is resampled.
        "win_length": 1024,     // stft window length in ms.
        "hop_length": 256,      // stft window hop-lengh in ms.
        "frame_length_ms": null,  // stft window length in ms.If null, 'win_length' is used.
        "frame_shift_ms": null,   // stft window hop-lengh in ms. If null, 'hop_length' is used.
        "preemphasis": 0.98,    // pre-emphasis to reduce spec noise and make it more structured. If 0.0, no -pre-emphasis.
        "min_level_db": -100,   // normalization range
        "ref_level_db": 20,     // reference level db, theoretically 20db is the sound of air.
        "power": 1.5,           // value to sharpen wav signals after GL algorithm.
        "griffin_lim_iters": 60,// #griffin-lim iterations. 30-60 is a good range. Larger the value, slower the generation.
        // Normalization parameters
        "signal_norm": true,    // normalize the spec values in range [0, 1]
        "symmetric_norm": true, // move normalization to range [-1, 1]
        "max_norm": 4.0,          // scale normalization to range [-max_norm, max_norm] or [0, max_norm]
        "clip_norm": true,      // clip normalized values into the range.
        "mel_fmin": 0.0,         // minimum freq level for mel-spec. ~50 for male and ~95 for female voices. Tune for dataset!!
        "mel_fmax": 8000.0,        // maximum freq level for mel-spec. Tune for dataset!!
        "do_trim_silence": false,  // enable trimming of slience of audio as you load it. LJspeech (false), TWEB (false), Nancy (true)
        "trim_db": 60          // threshold for timming silence. Set this according to your dataset.
    },
    "reinit_layers": [],
    "loss": "angleproto", // "ge2e" to use Generalized End-to-End loss and "angleproto" to use Angular Prototypical loss (new SOTA)
    "grad_clip": 3.0, // upper limit for gradients for clipping.
    "epochs": 1000, // total number of epochs to train.
    "lr": 0.0001, // Initial learning rate. If Noam decay is active, maximum learning rate.
    "lr_decay": false, // if true, Noam learning rate decaying is applied through training.
    "warmup_steps": 4000, // Noam decay steps to increase the learning rate from 0 to "lr"
    "tb_model_param_stats": false, // true, plots param stats per layer on tensorboard. Might be memory consuming, but good for debugging. 
    "steps_plot_stats": 10, // number of steps to plot embeddings.
    "num_speakers_in_batch": 32, // Batch size for training. Lower values than 32 might cause hard to learn attention. It is overwritten by 'gradual_training'.
    "wd": 0.000001, // Weight decay weight.
    "checkpoint": true, // If true, it saves checkpoints per "save_step"
    "save_step": 1000, // Number of training steps expected to save traning stats and checkpoints.
    "print_step": 1, // Number of steps to log traning on console.
    "output_path": "../../checkpoints/libri_tts/speaker_encoder/", // DATASET-RELATED: output path for all training outputs.
    "model": {
        "input_dim": 80, // input_dim == num_mels
        "proj_dim": 128,
        "lstm_dim": 384,
        "num_lstm_layers": 3
    },
    "datasets": 
        [
            {
                "name": "vctk",
                "path": "../../../datasets/VCTK-Corpus-removed-silence/",
                "meta_file_train": null,
                "meta_file_val": null
            }
        ]
 }
--- a/checkpoints/libri_tts/speaker_encoder/libritts_100+360-angleproto-July-31-2020_12+38AM-debug/config.json
+++ b/checkpoints/libri_tts/speaker_encoder/libritts_100+360-angleproto-July-31-2020_12+38AM-debug/config.json
@ -1,60 +0,0 @@
 "github_branch":"* dev-gst-embeddings",
 {
    "run_name": "libritts_100+360-angleproto",
    "run_description": "train speaker encoder for libritts 100 and 360",
    "audio":{
        // Audio processing parameters
        "num_mels": 80,         // size of the mel spec frame. 
        "num_freq": 1024,       // number of stft frequency levels. Size of the linear spectogram frame.
        "sample_rate": 22050,   // DATASET-RELATED: wav sample-rate. If different than the original data, it is resampled.
        "win_length": 1024,     // stft window length in ms.
        "hop_length": 256,      // stft window hop-lengh in ms.
        "frame_length_ms": null,  // stft window length in ms.If null, 'win_length' is used.
        "frame_shift_ms": null,   // stft window hop-lengh in ms. If null, 'hop_length' is used.
        "preemphasis": 0.98,    // pre-emphasis to reduce spec noise and make it more structured. If 0.0, no -pre-emphasis.
        "min_level_db": -100,   // normalization range
        "ref_level_db": 20,     // reference level db, theoretically 20db is the sound of air.
        "power": 1.5,           // value to sharpen wav signals after GL algorithm.
        "griffin_lim_iters": 60,// #griffin-lim iterations. 30-60 is a good range. Larger the value, slower the generation.
        // Normalization parameters
        "signal_norm": true,    // normalize the spec values in range [0, 1]
        "symmetric_norm": true, // move normalization to range [-1, 1]
        "max_norm": 4.0,          // scale normalization to range [-max_norm, max_norm] or [0, max_norm]
        "clip_norm": true,      // clip normalized values into the range.
        "mel_fmin": 0.0,         // minimum freq level for mel-spec. ~50 for male and ~95 for female voices. Tune for dataset!!
        "mel_fmax": 8000.0,        // maximum freq level for mel-spec. Tune for dataset!!
        "do_trim_silence": false,  // enable trimming of slience of audio as you load it. LJspeech (false), TWEB (false), Nancy (true)
        "trim_db": 60          // threshold for timming silence. Set this according to your dataset.
    },
    "reinit_layers": [],
    "loss": "angleproto", // "ge2e" to use Generalized End-to-End loss and "angleproto" to use Angular Prototypical loss (new SOTA)
    "grad_clip": 3.0, // upper limit for gradients for clipping.
    "epochs": 1000, // total number of epochs to train.
    "lr": 0.0001, // Initial learning rate. If Noam decay is active, maximum learning rate.
    "lr_decay": false, // if true, Noam learning rate decaying is applied through training.
    "warmup_steps": 4000, // Noam decay steps to increase the learning rate from 0 to "lr"
    "tb_model_param_stats": false, // true, plots param stats per layer on tensorboard. Might be memory consuming, but good for debugging. 
    "steps_plot_stats": 10, // number of steps to plot embeddings.
    "num_speakers_in_batch": 32, // Batch size for training. Lower values than 32 might cause hard to learn attention. It is overwritten by 'gradual_training'.
    "wd": 0.000001, // Weight decay weight.
    "checkpoint": true, // If true, it saves checkpoints per "save_step"
    "save_step": 1000, // Number of training steps expected to save traning stats and checkpoints.
    "print_step": 1, // Number of steps to log traning on console.
    "output_path": "../../checkpoints/libri_tts/speaker_encoder/", // DATASET-RELATED: output path for all training outputs.
    "model": {
        "input_dim": 80, // input_dim == num_mels
        "proj_dim": 128,
        "lstm_dim": 384,
        "num_lstm_layers": 3
    },
    "datasets": 
        [
            {
                "name": "vctk",
                "path": "../../../datasets/VCTK-Corpus-removed-silence/",
                "meta_file_train": null,
                "meta_file_val": null
            }
        ]
 }
--- a/checkpoints/libri_tts/speaker_encoder/libritts_100+360-angleproto-July-31-2020_12+40AM-debug/config.json
+++ b/checkpoints/libri_tts/speaker_encoder/libritts_100+360-angleproto-July-31-2020_12+40AM-debug/config.json
@ -1,60 +0,0 @@
 "github_branch":"* dev-gst-embeddings",
 {
    "run_name": "libritts_100+360-angleproto",
    "run_description": "train speaker encoder for libritts 100 and 360",
    "audio":{
        // Audio processing parameters
        "num_mels": 80,         // size of the mel spec frame. 
        "num_freq": 1024,       // number of stft frequency levels. Size of the linear spectogram frame.
        "sample_rate": 22050,   // DATASET-RELATED: wav sample-rate. If different than the original data, it is resampled.
        "win_length": 1024,     // stft window length in ms.
        "hop_length": 256,      // stft window hop-lengh in ms.
        "frame_length_ms": null,  // stft window length in ms.If null, 'win_length' is used.
        "frame_shift_ms": null,   // stft window hop-lengh in ms. If null, 'hop_length' is used.
        "preemphasis": 0.98,    // pre-emphasis to reduce spec noise and make it more structured. If 0.0, no -pre-emphasis.
        "min_level_db": -100,   // normalization range
        "ref_level_db": 20,     // reference level db, theoretically 20db is the sound of air.
        "power": 1.5,           // value to sharpen wav signals after GL algorithm.
        "griffin_lim_iters": 60,// #griffin-lim iterations. 30-60 is a good range. Larger the value, slower the generation.
        // Normalization parameters
        "signal_norm": true,    // normalize the spec values in range [0, 1]
        "symmetric_norm": true, // move normalization to range [-1, 1]
        "max_norm": 4.0,          // scale normalization to range [-max_norm, max_norm] or [0, max_norm]
        "clip_norm": true,      // clip normalized values into the range.
        "mel_fmin": 0.0,         // minimum freq level for mel-spec. ~50 for male and ~95 for female voices. Tune for dataset!!
        "mel_fmax": 8000.0,        // maximum freq level for mel-spec. Tune for dataset!!
        "do_trim_silence": false,  // enable trimming of slience of audio as you load it. LJspeech (false), TWEB (false), Nancy (true)
        "trim_db": 60          // threshold for timming silence. Set this according to your dataset.
    },
    "reinit_layers": [],
    "loss": "angleproto", // "ge2e" to use Generalized End-to-End loss and "angleproto" to use Angular Prototypical loss (new SOTA)
    "grad_clip": 3.0, // upper limit for gradients for clipping.
    "epochs": 1000, // total number of epochs to train.
    "lr": 0.0001, // Initial learning rate. If Noam decay is active, maximum learning rate.
    "lr_decay": false, // if true, Noam learning rate decaying is applied through training.
    "warmup_steps": 4000, // Noam decay steps to increase the learning rate from 0 to "lr"
    "tb_model_param_stats": false, // true, plots param stats per layer on tensorboard. Might be memory consuming, but good for debugging. 
    "steps_plot_stats": 10, // number of steps to plot embeddings.
    "num_speakers_in_batch": 32, // Batch size for training. Lower values than 32 might cause hard to learn attention. It is overwritten by 'gradual_training'.
    "wd": 0.000001, // Weight decay weight.
    "checkpoint": true, // If true, it saves checkpoints per "save_step"
    "save_step": 1000, // Number of training steps expected to save traning stats and checkpoints.
    "print_step": 1, // Number of steps to log traning on console.
    "output_path": "../../checkpoints/libri_tts/speaker_encoder/", // DATASET-RELATED: output path for all training outputs.
    "model": {
        "input_dim": 80, // input_dim == num_mels
        "proj_dim": 128,
        "lstm_dim": 384,
        "num_lstm_layers": 3
    },
    "datasets": 
        [
            {
                "name": "vctk",
                "path": "../../../datasets/VCTK-Corpus-removed-silence/",
                "meta_file_train": null,
                "meta_file_val": null
            }
        ]
 }
--- a/mozilla_voice_tts/speaker_encoder/losses.py
+++ b/mozilla_voice_tts/speaker_encoder/losses.py
@ -23,7 +23,7 @@ class GE2ELoss(nn.Module):
        self.b = nn.Parameter(torch.tensor(init_b))
        self.loss_method = loss_method
-        print('Initialised Generalized End-to-End loss')
+        print(' > Initialised Generalized End-to-End loss')
        assert self.loss_method in ["softmax", "contrast"]
@ -142,7 +142,7 @@ class AngleProtoLoss(nn.Module):
        self.b = nn.Parameter(torch.tensor(init_b))
        self.criterion = torch.nn.CrossEntropyLoss()
-        print('Initialised Angular Prototypical loss')
+        print(' > Initialised Angular Prototypical loss')
    def forward(self, x):
        """
--- a/ExtractSpeakerEmbeddings-by-sample.ipynb
+++ b/ExtractSpeakerEmbeddings-by-sample.ipynb
@ -0,0 +1,163 @@
 {
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "This is a noteboook used to generate the speaker embeddings with the AngleProto speaker encoder model for multi-speaker training.\n",
    "\n",
    "Before running this script please DON'T FORGET: \n",
    "- to set file paths.\n",
    "- to download related model files from TTS.\n",
    "- download or clone related repos, linked below.\n",
    "- setup the repositories. ```python setup.py install```\n",
    "- to checkout right commit versions (given next to the model) of TTS.\n",
    "- to set the right paths in the cell below.\n",
    "\n",
    "Repository:\n",
    "- TTS: https://github.com/mozilla/TTS"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "%load_ext autoreload\n",
    "%autoreload 2\n",
    "import os\n",
    "import importlib\n",
    "import random\n",
    "import librosa\n",
    "import torch\n",
    "\n",
    "import numpy as np\n",
    "from tqdm import tqdm\n",
    "from TTS.tts.utils.speakers import save_speaker_mapping, load_speaker_mapping\n",
    "\n",
    "# you may need to change this depending on your system\n",
    "os.environ['CUDA_VISIBLE_DEVICES']='0'\n",
    "\n",
    "\n",
    "from TTS.tts.utils.speakers import save_speaker_mapping, load_speaker_mapping\n",
    "from TTS.utils.audio import AudioProcessor\n",
    "from TTS.utils.io import load_config"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "You should also adjust all the path constants to point at the relevant locations for you locally"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "MODEL_RUN_PATH = \"../../Mozilla-TTS/checkpoints/libritts_100+360-angleproto-June-06-2020_04+12PM-9c04d1f/\"\n",
    "MODEL_PATH = MODEL_RUN_PATH + \"best_model.pth.tar\"\n",
    "CONFIG_PATH = MODEL_RUN_PATH + \"config.json\"\n",
    "\n",
    "\n",
    "DATASETS_NAME = ['vctk'] # list the datasets\n",
    "DATASETS_PATH = ['../../../datasets/VCTK/']\n",
    "DATASETS_METAFILE = ['']\n",
    "\n",
    "USE_CUDA = True"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "#Preprocess dataset\n",
    "meta_data = []\n",
    "for i in range(len(DATASETS_NAME)):\n",
    "    preprocessor = importlib.import_module('TTS.datasets.preprocess')\n",
    "    preprocessor = getattr(preprocessor, DATASETS_NAME[i].lower())\n",
    "    meta_data += preprocessor(DATASETS_PATH[i],DATASETS_METAFILE[i])\n",
    "      \n",
    "meta_data= list(meta_data)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "c = load_config(CONFIG_PATH)\n",
    "ap = AudioProcessor(**c['audio'])\n",
    "\n",
    "model = SpeakerEncoder(**c.model)\n",
    "model.load_state_dict(torch.load(MODEL_PATH)['model'])\n",
    "model.eval()\n",
    "if USE_CUDA:\n",
    "    model.cuda()\n",
    "\n",
    "embeddings_dict = {}\n",
    "len_meta_data= len(meta_data)\n",
    "\n",
    "for i in tqdm(range(len_meta_data)):\n",
    "    _, wav_file, speaker_id = meta_data[i]\n",
    "    wav_file_name = os.path.basename(wav_file)\n",
    "    mel_spec = ap.melspectrogram(ap.load_wav(wav_file)).T\n",
    "    mel_spec = torch.FloatTensor(mel_spec[None, :, :])\n",
    "    if USE_CUDA:\n",
    "        mel_spec = mel_spec.cuda()\n",
    "    embedd = model.compute_embedding(mel_spec).cpu().detach().numpy().reshape(-1)\n",
    "    embeddings_dict[wav_file_name] = [embedd,speaker_id]\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# create and export speakers.json\n",
    "speaker_mapping = {sample: {'name': embeddings_dict[sample][1], 'embedding':embeddings_dict[sample][0].reshape(-1).tolist()} for i, sample in enumerate(embeddings_dict.keys())}\n",
    "save_speaker_mapping(MODEL_RUN_PATH, speaker_mapping)\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "#test load integrity\n",
    "speaker_mapping_load = load_speaker_mapping(MODEL_RUN_PATH)\n",
    "assert speaker_mapping == speaker_mapping_load\n",
    "print(\"The file speakers.json has been exported to \",MODEL_RUN_PATH, ' with ', len(embeddings_dict.keys()), ' speakers')"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.7.6"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 4
 }
--- a/ExtractSpeakerEmbeddings-by-sample.ipynb
+++ b/ExtractSpeakerEmbeddings-by-sample.ipynb
@ -63,9 +63,9 @@
    "CONFIG_PATH = MODEL_RUN_PATH + \"config.json\"\n",
    "\n",
    "\n",
-    "DATASETS_NAME = ['brspeech'] # list the datasets\n",
+    "DATASETS_NAME = ['vctk'] # list the datasets\n",
-    "DATASETS_PATH = ['../../../datasets/BRSpeech-2.0-beta8']\n",
+    "DATASETS_PATH = ['../../../datasets/VCTK/']\n",
-    "DATASETS_METAFILE = ['TTS_metadata_brspeech2+cv_all_valited_lines.csv']\n",
+    "DATASETS_METAFILE = ['']\n",
    "\n",
    "USE_CUDA = True"
   ]