add Jupyter Notebook for Extract Speaker Embedding per sample using AngleProto

2020-07-31 01:03:03 -03:00 · 2020-07-31 01:03:03 -03:00 · f37159c135
parent bd4c6ee42a
commit f37159c135
6 changed files with 168 additions and 185 deletions
--- a/checkpoints/libri_tts/speaker_encoder/libritts_100+360-angleproto-July-31-2020_12+37AM-debug/config.json
+++ b/checkpoints/libri_tts/speaker_encoder/libritts_100+360-angleproto-July-31-2020_12+37AM-debug/config.json
@ -1,60 +0,0 @@
-
-"github_branch":"* dev-gst-embeddings",
-{
-    "run_name": "libritts_100+360-angleproto",
-    "run_description": "train speaker encoder for libritts 100 and 360",
-    "audio":{
-        // Audio processing parameters
-        "num_mels": 80,         // size of the mel spec frame. 
-        "num_freq": 1024,       // number of stft frequency levels. Size of the linear spectogram frame.
-        "sample_rate": 22050,   // DATASET-RELATED: wav sample-rate. If different than the original data, it is resampled.
-        "win_length": 1024,     // stft window length in ms.
-        "hop_length": 256,      // stft window hop-lengh in ms.
-        "frame_length_ms": null,  // stft window length in ms.If null, 'win_length' is used.
-        "frame_shift_ms": null,   // stft window hop-lengh in ms. If null, 'hop_length' is used.
-        "preemphasis": 0.98,    // pre-emphasis to reduce spec noise and make it more structured. If 0.0, no -pre-emphasis.
-        "min_level_db": -100,   // normalization range
-        "ref_level_db": 20,     // reference level db, theoretically 20db is the sound of air.
-        "power": 1.5,           // value to sharpen wav signals after GL algorithm.
-        "griffin_lim_iters": 60,// #griffin-lim iterations. 30-60 is a good range. Larger the value, slower the generation.
-        // Normalization parameters
-        "signal_norm": true,    // normalize the spec values in range [0, 1]
-        "symmetric_norm": true, // move normalization to range [-1, 1]
-        "max_norm": 4.0,          // scale normalization to range [-max_norm, max_norm] or [0, max_norm]
-        "clip_norm": true,      // clip normalized values into the range.
-        "mel_fmin": 0.0,         // minimum freq level for mel-spec. ~50 for male and ~95 for female voices. Tune for dataset!!
-        "mel_fmax": 8000.0,        // maximum freq level for mel-spec. Tune for dataset!!
-        "do_trim_silence": false,  // enable trimming of slience of audio as you load it. LJspeech (false), TWEB (false), Nancy (true)
-        "trim_db": 60          // threshold for timming silence. Set this according to your dataset.
-    },
-    "reinit_layers": [],
-    "loss": "angleproto", // "ge2e" to use Generalized End-to-End loss and "angleproto" to use Angular Prototypical loss (new SOTA)
-    "grad_clip": 3.0, // upper limit for gradients for clipping.
-    "epochs": 1000, // total number of epochs to train.
-    "lr": 0.0001, // Initial learning rate. If Noam decay is active, maximum learning rate.
-    "lr_decay": false, // if true, Noam learning rate decaying is applied through training.
-    "warmup_steps": 4000, // Noam decay steps to increase the learning rate from 0 to "lr"
-    "tb_model_param_stats": false, // true, plots param stats per layer on tensorboard. Might be memory consuming, but good for debugging. 
-    "steps_plot_stats": 10, // number of steps to plot embeddings.
-    "num_speakers_in_batch": 32, // Batch size for training. Lower values than 32 might cause hard to learn attention. It is overwritten by 'gradual_training'.
-    "wd": 0.000001, // Weight decay weight.
-    "checkpoint": true, // If true, it saves checkpoints per "save_step"
-    "save_step": 1000, // Number of training steps expected to save traning stats and checkpoints.
-    "print_step": 1, // Number of steps to log traning on console.
-    "output_path": "../../checkpoints/libri_tts/speaker_encoder/", // DATASET-RELATED: output path for all training outputs.
-    "model": {
-        "input_dim": 80, // input_dim == num_mels
-        "proj_dim": 128,
-        "lstm_dim": 384,
-        "num_lstm_layers": 3
-    },
-    "datasets": 
-        [
-            {
-                "name": "vctk",
-                "path": "../../../datasets/VCTK-Corpus-removed-silence/",
-                "meta_file_train": null,
-                "meta_file_val": null
-            }
-        ]
-}
--- a/checkpoints/libri_tts/speaker_encoder/libritts_100+360-angleproto-July-31-2020_12+38AM-debug/config.json
+++ b/checkpoints/libri_tts/speaker_encoder/libritts_100+360-angleproto-July-31-2020_12+38AM-debug/config.json
@ -1,60 +0,0 @@
-
-"github_branch":"* dev-gst-embeddings",
-{
-    "run_name": "libritts_100+360-angleproto",
-    "run_description": "train speaker encoder for libritts 100 and 360",
-    "audio":{
-        // Audio processing parameters
-        "num_mels": 80,         // size of the mel spec frame. 
-        "num_freq": 1024,       // number of stft frequency levels. Size of the linear spectogram frame.
-        "sample_rate": 22050,   // DATASET-RELATED: wav sample-rate. If different than the original data, it is resampled.
-        "win_length": 1024,     // stft window length in ms.
-        "hop_length": 256,      // stft window hop-lengh in ms.
-        "frame_length_ms": null,  // stft window length in ms.If null, 'win_length' is used.
-        "frame_shift_ms": null,   // stft window hop-lengh in ms. If null, 'hop_length' is used.
-        "preemphasis": 0.98,    // pre-emphasis to reduce spec noise and make it more structured. If 0.0, no -pre-emphasis.
-        "min_level_db": -100,   // normalization range
-        "ref_level_db": 20,     // reference level db, theoretically 20db is the sound of air.
-        "power": 1.5,           // value to sharpen wav signals after GL algorithm.
-        "griffin_lim_iters": 60,// #griffin-lim iterations. 30-60 is a good range. Larger the value, slower the generation.
-        // Normalization parameters
-        "signal_norm": true,    // normalize the spec values in range [0, 1]
-        "symmetric_norm": true, // move normalization to range [-1, 1]
-        "max_norm": 4.0,          // scale normalization to range [-max_norm, max_norm] or [0, max_norm]
-        "clip_norm": true,      // clip normalized values into the range.
-        "mel_fmin": 0.0,         // minimum freq level for mel-spec. ~50 for male and ~95 for female voices. Tune for dataset!!
-        "mel_fmax": 8000.0,        // maximum freq level for mel-spec. Tune for dataset!!
-        "do_trim_silence": false,  // enable trimming of slience of audio as you load it. LJspeech (false), TWEB (false), Nancy (true)
-        "trim_db": 60          // threshold for timming silence. Set this according to your dataset.
-    },
-    "reinit_layers": [],
-    "loss": "angleproto", // "ge2e" to use Generalized End-to-End loss and "angleproto" to use Angular Prototypical loss (new SOTA)
-    "grad_clip": 3.0, // upper limit for gradients for clipping.
-    "epochs": 1000, // total number of epochs to train.
-    "lr": 0.0001, // Initial learning rate. If Noam decay is active, maximum learning rate.
-    "lr_decay": false, // if true, Noam learning rate decaying is applied through training.
-    "warmup_steps": 4000, // Noam decay steps to increase the learning rate from 0 to "lr"
-    "tb_model_param_stats": false, // true, plots param stats per layer on tensorboard. Might be memory consuming, but good for debugging. 
-    "steps_plot_stats": 10, // number of steps to plot embeddings.
-    "num_speakers_in_batch": 32, // Batch size for training. Lower values than 32 might cause hard to learn attention. It is overwritten by 'gradual_training'.
-    "wd": 0.000001, // Weight decay weight.
-    "checkpoint": true, // If true, it saves checkpoints per "save_step"
-    "save_step": 1000, // Number of training steps expected to save traning stats and checkpoints.
-    "print_step": 1, // Number of steps to log traning on console.
-    "output_path": "../../checkpoints/libri_tts/speaker_encoder/", // DATASET-RELATED: output path for all training outputs.
-    "model": {
-        "input_dim": 80, // input_dim == num_mels
-        "proj_dim": 128,
-        "lstm_dim": 384,
-        "num_lstm_layers": 3
-    },
-    "datasets": 
-        [
-            {
-                "name": "vctk",
-                "path": "../../../datasets/VCTK-Corpus-removed-silence/",
-                "meta_file_train": null,
-                "meta_file_val": null
-            }
-        ]
-}
--- a/checkpoints/libri_tts/speaker_encoder/libritts_100+360-angleproto-July-31-2020_12+40AM-debug/config.json
+++ b/checkpoints/libri_tts/speaker_encoder/libritts_100+360-angleproto-July-31-2020_12+40AM-debug/config.json
@ -1,60 +0,0 @@
-
-"github_branch":"* dev-gst-embeddings",
-{
-    "run_name": "libritts_100+360-angleproto",
-    "run_description": "train speaker encoder for libritts 100 and 360",
-    "audio":{
-        // Audio processing parameters
-        "num_mels": 80,         // size of the mel spec frame. 
-        "num_freq": 1024,       // number of stft frequency levels. Size of the linear spectogram frame.
-        "sample_rate": 22050,   // DATASET-RELATED: wav sample-rate. If different than the original data, it is resampled.
-        "win_length": 1024,     // stft window length in ms.
-        "hop_length": 256,      // stft window hop-lengh in ms.
-        "frame_length_ms": null,  // stft window length in ms.If null, 'win_length' is used.
-        "frame_shift_ms": null,   // stft window hop-lengh in ms. If null, 'hop_length' is used.
-        "preemphasis": 0.98,    // pre-emphasis to reduce spec noise and make it more structured. If 0.0, no -pre-emphasis.
-        "min_level_db": -100,   // normalization range
-        "ref_level_db": 20,     // reference level db, theoretically 20db is the sound of air.
-        "power": 1.5,           // value to sharpen wav signals after GL algorithm.
-        "griffin_lim_iters": 60,// #griffin-lim iterations. 30-60 is a good range. Larger the value, slower the generation.
-        // Normalization parameters
-        "signal_norm": true,    // normalize the spec values in range [0, 1]
-        "symmetric_norm": true, // move normalization to range [-1, 1]
-        "max_norm": 4.0,          // scale normalization to range [-max_norm, max_norm] or [0, max_norm]
-        "clip_norm": true,      // clip normalized values into the range.
-        "mel_fmin": 0.0,         // minimum freq level for mel-spec. ~50 for male and ~95 for female voices. Tune for dataset!!
-        "mel_fmax": 8000.0,        // maximum freq level for mel-spec. Tune for dataset!!
-        "do_trim_silence": false,  // enable trimming of slience of audio as you load it. LJspeech (false), TWEB (false), Nancy (true)
-        "trim_db": 60          // threshold for timming silence. Set this according to your dataset.
-    },
-    "reinit_layers": [],
-    "loss": "angleproto", // "ge2e" to use Generalized End-to-End loss and "angleproto" to use Angular Prototypical loss (new SOTA)
-    "grad_clip": 3.0, // upper limit for gradients for clipping.
-    "epochs": 1000, // total number of epochs to train.
-    "lr": 0.0001, // Initial learning rate. If Noam decay is active, maximum learning rate.
-    "lr_decay": false, // if true, Noam learning rate decaying is applied through training.
-    "warmup_steps": 4000, // Noam decay steps to increase the learning rate from 0 to "lr"
-    "tb_model_param_stats": false, // true, plots param stats per layer on tensorboard. Might be memory consuming, but good for debugging. 
-    "steps_plot_stats": 10, // number of steps to plot embeddings.
-    "num_speakers_in_batch": 32, // Batch size for training. Lower values than 32 might cause hard to learn attention. It is overwritten by 'gradual_training'.
-    "wd": 0.000001, // Weight decay weight.
-    "checkpoint": true, // If true, it saves checkpoints per "save_step"
-    "save_step": 1000, // Number of training steps expected to save traning stats and checkpoints.
-    "print_step": 1, // Number of steps to log traning on console.
-    "output_path": "../../checkpoints/libri_tts/speaker_encoder/", // DATASET-RELATED: output path for all training outputs.
-    "model": {
-        "input_dim": 80, // input_dim == num_mels
-        "proj_dim": 128,
-        "lstm_dim": 384,
-        "num_lstm_layers": 3
-    },
-    "datasets": 
-        [
-            {
-                "name": "vctk",
-                "path": "../../../datasets/VCTK-Corpus-removed-silence/",
-                "meta_file_train": null,
-                "meta_file_val": null
-            }
-        ]
-}
--- a/mozilla_voice_tts/speaker_encoder/losses.py
+++ b/mozilla_voice_tts/speaker_encoder/losses.py
@ -23,7 +23,7 @@ class GE2ELoss(nn.Module):
        self.b = nn.Parameter(torch.tensor(init_b))
        self.loss_method = loss_method

-        print('Initialised Generalized End-to-End loss')
+        print(' > Initialised Generalized End-to-End loss')

        assert self.loss_method in ["softmax", "contrast"]

@ -142,7 +142,7 @@ class AngleProtoLoss(nn.Module):
        self.b = nn.Parameter(torch.tensor(init_b))
        self.criterion = torch.nn.CrossEntropyLoss()

-        print('Initialised Angular Prototypical loss')
+        print(' > Initialised Angular Prototypical loss')

    def forward(self, x):
        """
--- a/ExtractSpeakerEmbeddings-by-sample.ipynb
+++ b/ExtractSpeakerEmbeddings-by-sample.ipynb
@ -0,0 +1,163 @@
+{
+ "cells": [
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "This is a notebook used to generate the speaker embeddings with the AngleProto speaker encoder model for multi-speaker training.\n",
+    "\n",
+    "Before running this script please DON'T FORGET: \n",
+    "- to set file paths.\n",
+    "- to download related model files from TTS.\n",
+    "- download or clone related repos, linked below.\n",
+    "- setup the repositories. ```python setup.py install```\n",
+    "- to checkout right commit versions (given next to the model) of TTS.\n",
+    "- to set the right paths in the cell below.\n",
+    "\n",
+    "Repository:\n",
+    "- TTS: https://github.com/mozilla/TTS"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "%load_ext autoreload\n",
+    "%autoreload 2\n",
+    "import os\n",
+    "import importlib\n",
+    "import random\n",
+    "import librosa\n",
+    "import torch\n",
+    "\n",
+    "import numpy as np\n",
+    "from tqdm import tqdm\n",
+    "from TTS.tts.utils.speakers import save_speaker_mapping, load_speaker_mapping\n",
+    "\n",
+    "# you may need to change this depending on your system\n",
+    "os.environ['CUDA_VISIBLE_DEVICES']='0'\n",
+    "\n",
+    "# SpeakerEncoder is instantiated in the embedding-extraction cell below\n",
+    "from TTS.speaker_encoder.model import SpeakerEncoder\n",
+    "from TTS.utils.audio import AudioProcessor\n",
+    "from TTS.utils.io import load_config"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "You should also adjust all the path constants to point at the relevant locations for you locally"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "MODEL_RUN_PATH = \"../../Mozilla-TTS/checkpoints/libritts_100+360-angleproto-June-06-2020_04+12PM-9c04d1f/\"\n",
+    "MODEL_PATH = MODEL_RUN_PATH + \"best_model.pth.tar\"\n",
+    "CONFIG_PATH = MODEL_RUN_PATH + \"config.json\"\n",
+    "\n",
+    "\n",
+    "DATASETS_NAME = ['vctk'] # list the datasets\n",
+    "DATASETS_PATH = ['../../../datasets/VCTK/']\n",
+    "DATASETS_METAFILE = ['']\n",
+    "\n",
+    "USE_CUDA = True"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Preprocess datasets: gather the metadata items of every configured dataset\n",
+    "preprocess_module = importlib.import_module('TTS.tts.datasets.preprocess')\n",
+    "assert len(DATASETS_NAME) == len(DATASETS_PATH) == len(DATASETS_METAFILE), 'dataset lists must have the same length'\n",
+    "meta_data = []\n",
+    "for name, path, meta_file in zip(DATASETS_NAME, DATASETS_PATH, DATASETS_METAFILE):\n",
+    "    # the formatter function is looked up by the lower-cased dataset name\n",
+    "    formatter = getattr(preprocess_module, name.lower())\n",
+    "    meta_data += formatter(path, meta_file)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "c = load_config(CONFIG_PATH)\n",
+    "ap = AudioProcessor(**c['audio'])\n",
+    "\n",
+    "model = SpeakerEncoder(**c['model'])\n",
+    "# map_location='cpu' lets a checkpoint saved on GPU load on CPU-only machines\n",
+    "model.load_state_dict(torch.load(MODEL_PATH, map_location='cpu')['model'])\n",
+    "model.eval()\n",
+    "if USE_CUDA:\n",
+    "    model.cuda()\n",
+    "\n",
+    "# maps wav file name -> [embedding vector, speaker id]\n",
+    "embeddings_dict = {}\n",
+    "for _, wav_file, speaker_id in tqdm(meta_data):\n",
+    "    wav_file_name = os.path.basename(wav_file)\n",
+    "    mel_spec = ap.melspectrogram(ap.load_wav(wav_file)).T\n",
+    "    mel_spec = torch.FloatTensor(mel_spec[None, :, :])\n",
+    "    if USE_CUDA:\n",
+    "        mel_spec = mel_spec.cuda()\n",
+    "    with torch.no_grad():  # inference only, no autograd graph needed\n",
+    "        embedd = model.compute_embedding(mel_spec).cpu().detach().numpy().reshape(-1)\n",
+    "    embeddings_dict[wav_file_name] = [embedd,speaker_id]\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# build the per-sample mapping (file name -> speaker name + embedding) and write speakers.json\n",
+    "speaker_mapping = {sample: {'name': speaker_id, 'embedding': embedd.reshape(-1).tolist()} for sample, (embedd, speaker_id) in embeddings_dict.items()}\n",
+    "save_speaker_mapping(MODEL_RUN_PATH, speaker_mapping)\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# test load integrity: the reloaded file must match what was exported\n",
+    "speaker_mapping_load = load_speaker_mapping(MODEL_RUN_PATH)\n",
+    "assert speaker_mapping == speaker_mapping_load\n",
+    "print(\"The file speakers.json has been exported to \",MODEL_RUN_PATH, ' with ', len(embeddings_dict), ' samples')"
+   ]
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "Python 3",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.7.6"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 4
+}
--- a/ExtractSpeakerEmbeddings-by-sample.ipynb
+++ b/ExtractSpeakerEmbeddings-by-sample.ipynb
@ -63,9 +63,9 @@
    "CONFIG_PATH = MODEL_RUN_PATH + \"config.json\"\n",
    "\n",
    "\n",
-    "DATASETS_NAME = ['brspeech'] # list the datasets\n",
-    "DATASETS_PATH = ['../../../datasets/BRSpeech-2.0-beta8']\n",
-    "DATASETS_METAFILE = ['TTS_metadata_brspeech2+cv_all_valited_lines.csv']\n",
+    "DATASETS_NAME = ['vctk'] # list the datasets\n",
+    "DATASETS_PATH = ['../../../datasets/VCTK/']\n",
+    "DATASETS_METAFILE = ['']\n",
    "\n",
    "USE_CUDA = True"
   ]