add support for CorentinJ Speaker encoder and add notebook for extract embeddings

2020-08-05 08:43:27 -03:00 · 2020-08-05 08:43:27 -03:00 · 07c961382f
parent ac032f00f3
commit 07c961382f
4 changed files with 25541 additions and 21 deletions
--- a/mozilla_voice_tts/speaker_encoder/config.json
+++ b/mozilla_voice_tts/speaker_encoder/config.json
@ -1,14 +1,14 @@

 {
-    "run_name": "libritts_100+360-angleproto",
-    "run_description": "train speaker encoder for libritts 100 and 360",
+    "run_name": "Model compatible to  CorentinJ/Real-Time-Voice-Cloning",
+    "run_description": "train speaker encoder with voxceleb1, voxceleb2 and libriSpeech ",
    "audio":{
        // Audio processing parameters
-        "num_mels": 80,         // size of the mel spec frame. 
-        "num_freq": 1024,       // number of stft frequency levels. Size of the linear spectogram frame.
-        "sample_rate": 22050,   // DATASET-RELATED: wav sample-rate. If different than the original data, it is resampled.
-        "win_length": 1024,     // stft window length in ms.
-        "hop_length": 256,      // stft window hop-lengh in ms.
+        "num_mels": 40,         // size of the mel spec frame. 
+        "fft_size": 400,       // number of FFT points used by the stft. Sets the size of the linear spectrogram frame.
+        "sample_rate": 16000,   // DATASET-RELATED: wav sample-rate. If different than the original data, it is resampled.
+        "win_length": 400,     // stft window length in samples (25 ms at 16 kHz).
+        "hop_length": 160,      // stft window hop length in samples (10 ms at 16 kHz).
        "frame_length_ms": null,  // stft window length in ms. If null, 'win_length' is used.
        "frame_shift_ms": null,   // stft window hop length in ms. If null, 'hop_length' is used.
        "preemphasis": 0.98,    // pre-emphasis to reduce spec noise and make it more structured. If 0.0, no -pre-emphasis.
@ -27,7 +27,7 @@
        "trim_db": 60          // threshold for trimming silence. Set this according to your dataset.
    },
    "reinit_layers": [],
-    "loss": "angleproto", // "ge2e" to use Generalized End-to-End loss and "angleproto" to use Angular Prototypical loss (new SOTA)
+    "loss": "ge2e", // "ge2e" to use Generalized End-to-End loss and "angleproto" to use Angular Prototypical loss (new SOTA)
    "grad_clip": 3.0, // upper limit for gradients for clipping.
    "epochs": 1000, // total number of epochs to train.
    "lr": 0.0001, // Initial learning rate. If Noam decay is active, maximum learning rate.
@ -41,12 +41,13 @@
    "checkpoint": true, // If true, it saves checkpoints per "save_step"
    "save_step": 1000, // Number of training steps between saves of training stats and checkpoints.
    "print_step": 1, // Number of training steps between log messages on the console.
-    "output_path": "../../checkpoints/libri_tts/speaker_encoder/", // DATASET-RELATED: output path for all training outputs.
+    "output_path": "../../checkpoints/voxceleb_librispeech/speaker_encoder/", // DATASET-RELATED: output path for all training outputs.
    "model": {
-        "input_dim": 80, // input_dim == num_mels
-        "proj_dim": 128,
-        "lstm_dim": 384,
-        "num_lstm_layers": 3
+        "input_dim": 40,
+        "proj_dim": 256,
+        "lstm_dim": 256,
+        "num_lstm_layers": 3,
+        "use_lstm_with_projection": false
    },
    "datasets": 
        [
--- a/mozilla_voice_tts/speaker_encoder/model.py
+++ b/mozilla_voice_tts/speaker_encoder/model.py
@ -16,15 +16,33 @@ class LSTMWithProjection(nn.Module):
        o, (_, _) = self.lstm(x)
        return self.linear(o)

+class LSTMWithoutProjection(nn.Module):
+    def __init__(self, input_dim, lstm_dim, proj_dim, num_lstm_layers):
+        super().__init__()
+        self.lstm = nn.LSTM(input_size=input_dim,
+                            hidden_size=lstm_dim,
+                            num_layers=num_lstm_layers,
+                            batch_first=True)
+        self.linear = nn.Linear(lstm_dim, proj_dim, bias=True)
+        self.relu = nn.ReLU()
+    def forward(self, x):
+        _, (hidden, _) = self.lstm(x)
+        return self.relu(self.linear(hidden[-1]))

 class SpeakerEncoder(nn.Module):
-    def __init__(self, input_dim, proj_dim=256, lstm_dim=768, num_lstm_layers=3):
+    def __init__(self, input_dim, proj_dim=256, lstm_dim=768, num_lstm_layers=3, use_lstm_with_projection=True):
        super().__init__()
+        self.use_lstm_with_projection = use_lstm_with_projection
        layers = []
+        # choose the LSTM layer type
+        if use_lstm_with_projection:
            layers.append(LSTMWithProjection(input_dim, lstm_dim, proj_dim))
            for _ in range(num_lstm_layers - 1):
                layers.append(LSTMWithProjection(proj_dim, lstm_dim, proj_dim))
            self.layers = nn.Sequential(*layers)
+        else:
+            self.layers = LSTMWithoutProjection(input_dim, lstm_dim, proj_dim, num_lstm_layers)
+
        self._init_layers()

    def _init_layers(self):
@ -37,12 +55,18 @@ class SpeakerEncoder(nn.Module):
    def forward(self, x):
        # TODO: implement state passing for lstms
        d = self.layers(x)
+        if self.use_lstm_with_projection:
            d = torch.nn.functional.normalize(d[:, -1], p=2, dim=1)
+        else:
+            d = torch.nn.functional.normalize(d, p=2, dim=1)
        return d

    def inference(self, x):
        d = self.layers.forward(x)
+        if self.use_lstm_with_projection:
            d = torch.nn.functional.normalize(d[:, -1], p=2, dim=1)
+        else:
+            d = torch.nn.functional.normalize(d, p=2, dim=1)
        return d

    def compute_embedding(self, x, num_frames=160, overlap=0.5):
--- a/ExtractSpeakerEmbeddings-by-sample.ipynb
+++ b/ExtractSpeakerEmbeddings-by-sample.ipynb
@ -79,7 +79,7 @@
    "#Preprocess dataset\n",
    "meta_data = []\n",
    "for i in range(len(DATASETS_NAME)):\n",
-    "    preprocessor = importlib.import_module('TTS.datasets.preprocess')\n",
+    "    preprocessor = importlib.import_module('TTS.tts.datasets.preprocess')\n",
    "    preprocessor = getattr(preprocessor, DATASETS_NAME[i].lower())\n",
    "    meta_data += preprocessor(DATASETS_PATH[i],DATASETS_METAFILE[i])\n",
    "      \n",
--- a/notebooks/GE2E-CorentinJ-ExtractSpeakerEmbeddings-by-sample.ipynb
+++ b/notebooks/GE2E-CorentinJ-ExtractSpeakerEmbeddings-by-sample.ipynb