From 78bad25f2b6e7261442665d8c5c320fedfdfa6db Mon Sep 17 00:00:00 2001 From: Edresson Date: Fri, 7 May 2021 23:45:15 -0300 Subject: [PATCH 01/18] update voxceleb download link --- TTS/speaker_encoder/utils/prepare_voxceleb.py | 28 +++++++++---------- 1 file changed, 14 insertions(+), 14 deletions(-) diff --git a/TTS/speaker_encoder/utils/prepare_voxceleb.py b/TTS/speaker_encoder/utils/prepare_voxceleb.py index bc043a58..05a65bea 100644 --- a/TTS/speaker_encoder/utils/prepare_voxceleb.py +++ b/TTS/speaker_encoder/utils/prepare_voxceleb.py @@ -31,23 +31,23 @@ from absl import logging SUBSETS = { "vox1_dev_wav": [ - "http://www.robots.ox.ac.uk/~vgg/data/voxceleb/vox1a/vox1_dev_wav_partaa", - "http://www.robots.ox.ac.uk/~vgg/data/voxceleb/vox1a/vox1_dev_wav_partab", - "http://www.robots.ox.ac.uk/~vgg/data/voxceleb/vox1a/vox1_dev_wav_partac", - "http://www.robots.ox.ac.uk/~vgg/data/voxceleb/vox1a/vox1_dev_wav_partad", + "https://thor.robots.ox.ac.uk/~vgg/data/voxceleb/vox1a/vox1_dev_wav_partaa", + "https://thor.robots.ox.ac.uk/~vgg/data/voxceleb/vox1a/vox1_dev_wav_partab", + "https://thor.robots.ox.ac.uk/~vgg/data/voxceleb/vox1a/vox1_dev_wav_partac", + "https://thor.robots.ox.ac.uk/~vgg/data/voxceleb/vox1a/vox1_dev_wav_partad", ], - "vox1_test_wav": ["http://www.robots.ox.ac.uk/~vgg/data/voxceleb/vox1a/vox1_test_wav.zip"], + "vox1_test_wav": ["https://thor.robots.ox.ac.uk/~vgg/data/voxceleb/vox1a/vox1_test_wav.zip"], "vox2_dev_aac": [ - "http://www.robots.ox.ac.uk/~vgg/data/voxceleb/vox1a/vox2_dev_aac_partaa", - "http://www.robots.ox.ac.uk/~vgg/data/voxceleb/vox1a/vox2_dev_aac_partab", - "http://www.robots.ox.ac.uk/~vgg/data/voxceleb/vox1a/vox2_dev_aac_partac", - "http://www.robots.ox.ac.uk/~vgg/data/voxceleb/vox1a/vox2_dev_aac_partad", - "http://www.robots.ox.ac.uk/~vgg/data/voxceleb/vox1a/vox2_dev_aac_partae", - "http://www.robots.ox.ac.uk/~vgg/data/voxceleb/vox1a/vox2_dev_aac_partaf", - "http://www.robots.ox.ac.uk/~vgg/data/voxceleb/vox1a/vox2_dev_aac_partag", - "http://www.robots.ox.ac.uk/~vgg/data/voxceleb/vox1a/vox2_dev_aac_partah", + "https://thor.robots.ox.ac.uk/~vgg/data/voxceleb/vox1a/vox2_dev_aac_partaa", + "https://thor.robots.ox.ac.uk/~vgg/data/voxceleb/vox1a/vox2_dev_aac_partab", + "https://thor.robots.ox.ac.uk/~vgg/data/voxceleb/vox1a/vox2_dev_aac_partac", + "https://thor.robots.ox.ac.uk/~vgg/data/voxceleb/vox1a/vox2_dev_aac_partad", + "https://thor.robots.ox.ac.uk/~vgg/data/voxceleb/vox1a/vox2_dev_aac_partae", + "https://thor.robots.ox.ac.uk/~vgg/data/voxceleb/vox1a/vox2_dev_aac_partaf", + "https://thor.robots.ox.ac.uk/~vgg/data/voxceleb/vox1a/vox2_dev_aac_partag", + "https://thor.robots.ox.ac.uk/~vgg/data/voxceleb/vox1a/vox2_dev_aac_partah", ], - "vox2_test_aac": ["http://www.robots.ox.ac.uk/~vgg/data/voxceleb/vox1a/vox2_test_aac.zip"], + "vox2_test_aac": ["https://thor.robots.ox.ac.uk/~vgg/data/voxceleb/vox1a/vox2_test_aac.zip"], } MD5SUM = { From 77d85c6cc5a3fb0cd5e852d0430468112c28a66c Mon Sep 17 00:00:00 2001 From: Edresson Date: Mon, 10 May 2021 17:08:38 -0300 Subject: [PATCH 02/18] add softmaxproto loss and bug fix in data loader --- TTS/bin/train_encoder.py | 47 ++++++----- TTS/speaker_encoder/{ => configs}/config.json | 3 + .../configs/config_softmaxproto.json | 78 +++++++++++++++++++ TTS/speaker_encoder/dataset.py | 58 +++++++++++--- TTS/speaker_encoder/losses.py | 64 ++++++++++++++- TTS/speaker_encoder/utils/generic_utils.py | 2 +- 6 files changed, 219 insertions(+), 33 deletions(-) rename TTS/speaker_encoder/{ => configs}/config.json (97%) create mode 100644 
TTS/speaker_encoder/configs/config_softmaxproto.json diff --git a/TTS/bin/train_encoder.py b/TTS/bin/train_encoder.py index 3a3f876e..05a76b68 100644 --- a/TTS/bin/train_encoder.py +++ b/TTS/bin/train_encoder.py @@ -11,7 +11,7 @@ import torch from torch.utils.data import DataLoader from TTS.speaker_encoder.dataset import MyDataset -from TTS.speaker_encoder.losses import AngleProtoLoss, GE2ELoss +from TTS.speaker_encoder.losses import AngleProtoLoss, GE2ELoss, SoftmaxLoss, SoftmaxAngleProtoLoss from TTS.speaker_encoder.model import SpeakerEncoder from TTS.speaker_encoder.utils.generic_utils import check_config_speaker_encoder, save_best_model from TTS.speaker_encoder.utils.visual import plot_embeddings @@ -45,15 +45,16 @@ def setup_loader(ap: AudioProcessor, is_val: bool = False, verbose: bool = False dataset = MyDataset( ap, meta_data_eval if is_val else meta_data_train, - voice_len=1.6, + voice_len=getattr(c, "voice_len", 1.6), num_utter_per_speaker=c.num_utters_per_speaker, num_speakers_in_batch=c.num_speakers_in_batch, - skip_speakers=False, + skip_speakers=getattr(c, "skip_speakers", False), storage_size=c.storage["storage_size"], sample_from_storage_p=c.storage["sample_from_storage_p"], additive_noise=c.storage["additive_noise"], verbose=verbose, ) + # sampler = DistributedSampler(dataset) if num_gpus > 1 else None loader = DataLoader( dataset, @@ -62,11 +63,25 @@ def setup_loader(ap: AudioProcessor, is_val: bool = False, verbose: bool = False num_workers=c.num_loader_workers, collate_fn=dataset.collate_fn, ) - return loader + return loader, dataset.get_num_speakers() -def train(model, criterion, optimizer, scheduler, ap, global_step): - data_loader = setup_loader(ap, is_val=False, verbose=True) +def train(model, optimizer, scheduler, ap, global_step): + data_loader, num_speakers = setup_loader(ap, is_val=False, verbose=True) + + if c.loss == "ge2e": + criterion = GE2ELoss(loss_method="softmax") + elif c.loss == "angleproto": + criterion = AngleProtoLoss() + elif c.loss == "softmaxproto": + criterion = SoftmaxAngleProtoLoss(c.model["proj_dim"], num_speakers) + else: + raise Exception("The %s not is a loss supported" % c.loss) + + if use_cuda: + model = model.cuda() + criterion.cuda() + model.train() epoch_time = 0 best_loss = float("inf") @@ -77,7 +92,8 @@ def train(model, criterion, optimizer, scheduler, ap, global_step): start_time = time.time() # setup input data - inputs = data[0] + inputs, labels = data + loader_time = time.time() - end_time global_step += 1 @@ -89,13 +105,13 @@ def train(model, criterion, optimizer, scheduler, ap, global_step): # dispatch data to GPU if use_cuda: inputs = inputs.cuda(non_blocking=True) - # labels = labels.cuda(non_blocking=True) + labels = labels.cuda(non_blocking=True) # forward pass model outputs = model(inputs) # loss computation - loss = criterion(outputs.view(c.num_speakers_in_batch, outputs.shape[0] // c.num_speakers_in_batch, -1)) + loss = criterion(outputs.view(c.num_speakers_in_batch, outputs.shape[0] // c.num_speakers_in_batch, -1), labels) loss.backward() grad_norm, _ = check_update(model, c.grad_clip) optimizer.step() @@ -158,13 +174,6 @@ def main(args): # pylint: disable=redefined-outer-name ) optimizer = RAdam(model.parameters(), lr=c.lr) - if c.loss == "ge2e": - criterion = GE2ELoss(loss_method="softmax") - elif c.loss == "angleproto": - criterion = AngleProtoLoss() - else: - raise Exception("The %s not is a loss supported" % c.loss) - if args.restore_path: checkpoint = torch.load(args.restore_path) try: @@ -187,10 +196,6 @@ def 
main(args): # pylint: disable=redefined-outer-name else: args.restore_step = 0 - if use_cuda: - model = model.cuda() - criterion.cuda() - if c.lr_decay: scheduler = NoamLR(optimizer, warmup_steps=c.warmup_steps, last_epoch=args.restore_step - 1) else: @@ -203,7 +208,7 @@ def main(args): # pylint: disable=redefined-outer-name meta_data_train, meta_data_eval = load_meta_data(c.datasets) global_step = args.restore_step - _, global_step = train(model, criterion, optimizer, scheduler, ap, global_step) + _, global_step = train(model, optimizer, scheduler, ap, global_step) if __name__ == "__main__": diff --git a/TTS/speaker_encoder/config.json b/TTS/speaker_encoder/configs/config.json similarity index 97% rename from TTS/speaker_encoder/config.json rename to TTS/speaker_encoder/configs/config.json index 4fbd84cc..2b437e5a 100644 --- a/TTS/speaker_encoder/config.json +++ b/TTS/speaker_encoder/configs/config.json @@ -37,6 +37,9 @@ "steps_plot_stats": 10, // number of steps to plot embeddings. "num_speakers_in_batch": 64, // Batch size for training. Lower values than 32 might cause hard to learn attention. It is overwritten by 'gradual_training'. "num_utters_per_speaker": 10, // + "skip_speakers": false, // skip speakers with samples less than "num_utters_per_speaker" + + "voice_len": 1.6, // number of seconds for each training instance "num_loader_workers": 8, // number of training data loader processes. Don't set it too big. 4-8 are good values. "wd": 0.000001, // Weight decay weight. "checkpoint": true, // If true, it saves checkpoints per "save_step" diff --git a/TTS/speaker_encoder/configs/config_softmaxproto.json b/TTS/speaker_encoder/configs/config_softmaxproto.json new file mode 100644 index 00000000..f7a24a15 --- /dev/null +++ b/TTS/speaker_encoder/configs/config_softmaxproto.json @@ -0,0 +1,78 @@ + +{ + "run_name": "speaker_encoder", + "run_description": "train speaker encoder with VCTK", + "audio":{ + // Audio processing parameters + "num_mels": 80, // size of the mel spec frame. + "fft_size": 1024, // number of stft frequency levels. Size of the linear spectogram frame. + "sample_rate": 16000, // DATASET-RELATED: wav sample-rate. If different than the original data, it is resampled. + "win_length": 1024, // stft window length in ms. + "hop_length": 256, // stft window hop-lengh in ms. + "frame_length_ms": null, // stft window length in ms.If null, 'win_length' is used. + "frame_shift_ms": null, // stft window hop-lengh in ms. If null, 'hop_length' is used. + "preemphasis": 0.98, // pre-emphasis to reduce spec noise and make it more structured. If 0.0, no -pre-emphasis. + "min_level_db": -100, // normalization range + "ref_level_db": 20, // reference level db, theoretically 20db is the sound of air. + "power": 1.5, // value to sharpen wav signals after GL algorithm. + "griffin_lim_iters": 60,// #griffin-lim iterations. 30-60 is a good range. Larger the value, slower the generation. + "stft_pad_mode": "reflect", + // Normalization parameters + "signal_norm": true, // normalize the spec values in range [0, 1] + "symmetric_norm": true, // move normalization to range [-1, 1] + "max_norm": 4.0, // scale normalization to range [-max_norm, max_norm] or [0, max_norm] + "clip_norm": true, // clip normalized values into the range. + "mel_fmin": 0.0, // minimum freq level for mel-spec. ~50 for male and ~95 for female voices. Tune for dataset!! + "mel_fmax": 8000.0, // maximum freq level for mel-spec. Tune for dataset!! 
+ "spec_gain": 20.0, + "do_trim_silence": false, // enable trimming of slience of audio as you load it. LJspeech (false), TWEB (false), Nancy (true) + "trim_db": 60, // threshold for timming silence. Set this according to your dataset. + "stats_path": null // DO NOT USE WITH MULTI_SPEAKER MODEL. scaler stats file computed by 'compute_statistics.py'. If it is defined, mean-std based notmalization is used and other normalization params are ignored + }, + "reinit_layers": [], + + "loss": "softmaxproto", // "ge2e" to use Generalized End-to-End loss, "angleproto" to use Angular Prototypical loss and "softmaxproto" to use Softmax with Angular Prototypical loss + "grad_clip": 3.0, // upper limit for gradients for clipping. + "epochs": 1000, // total number of epochs to train. + "lr": 0.0001, // Initial learning rate. If Noam decay is active, maximum learning rate. + "lr_decay": false, // if true, Noam learning rate decaying is applied through training. + "warmup_steps": 4000, // Noam decay steps to increase the learning rate from 0 to "lr" + "tb_model_param_stats": false, // true, plots param stats per layer on tensorboard. Might be memory consuming, but good for debugging. + "steps_plot_stats": 10, // number of steps to plot embeddings. + + // Speakers config + "num_speakers_in_batch": 108, // Batch size for training. + "num_utters_per_speaker": 2, // + "skip_speakers": true, // skip speakers with samples less than "num_utters_per_speaker" + + "voice_len": 2, // number of seconds for each training instance + + "num_loader_workers": 8, // number of training data loader processes. Don't set it too big. 4-8 are good values. + "wd": 0.000001, // Weight decay weight. + "checkpoint": true, // If true, it saves checkpoints per "save_step" + "save_step": 1000, // Number of training steps expected to save traning stats and checkpoints. + "print_step": 20, // Number of steps to log traning on console. + "output_path": "../../../checkpoints/speaker_encoder/", // DATASET-RELATED: output path for all training outputs. 
+ + "model": { + "input_dim": 80, + "proj_dim": 512, + "lstm_dim": 768, + "num_lstm_layers": 3, + "use_lstm_with_projection": true + }, + "storage": { + "sample_from_storage_p": 0.66, // the probability with which we'll sample from the DataSet in-memory storage + "storage_size": 15, // the size of the in-memory storage with respect to a single batch + "additive_noise": 1e-5 // add very small gaussian noise to the data in order to increase robustness + }, + "datasets": + [ + { + "name": "vctk", + "path": "/workspace/store/ecasanova/datasets/VCTK-Corpus-removed-silence/", + "meta_file_train": null, + "meta_file_val": null + } + ] +} \ No newline at end of file diff --git a/TTS/speaker_encoder/dataset.py b/TTS/speaker_encoder/dataset.py index 38d8b5f9..3308fef9 100644 --- a/TTS/speaker_encoder/dataset.py +++ b/TTS/speaker_encoder/dataset.py @@ -30,7 +30,6 @@ class MyDataset(Dataset): super().__init__() self.items = meta_data self.sample_rate = ap.sample_rate - self.voice_len = voice_len self.seq_len = int(voice_len * self.sample_rate) self.num_speakers_in_batch = num_speakers_in_batch self.num_utter_per_speaker = num_utter_per_speaker @@ -41,10 +40,15 @@ class MyDataset(Dataset): self.storage = queue.Queue(maxsize=storage_size * num_speakers_in_batch) self.sample_from_storage_p = float(sample_from_storage_p) self.additive_noise = float(additive_noise) + + speakers_aux = list(self.speakers) + speakers_aux.sort() + self.speakerid_to_classid = {key : i for i, key in enumerate(speakers_aux)} + if self.verbose: print("\n > DataLoader initialization") print(f" | > Speakers per Batch: {num_speakers_in_batch}") - print(f" | > Storage Size: {self.storage.maxsize} speakers, each with {num_utter_per_speaker} utters") + print(f" | > Storage Size: {self.storage.maxsize} instances, each with {num_utter_per_speaker} utters") print(f" | > Sample_from_storage_p : {self.sample_from_storage_p}") print(f" | > Noise added : {self.additive_noise}") print(f" | > Number of instances : {len(self.items)}") @@ -110,8 +114,16 @@ class MyDataset(Dataset): def __len__(self): return int(1e10) - def __sample_speaker(self): + def get_num_speakers(self): + return len(self.speakers) + + def __sample_speaker(self, ignore_speakers=None): speaker = random.sample(self.speakers, 1)[0] + # if list of speakers_id is provide make sure that it's will be ignored + if ignore_speakers: + while self.speakerid_to_classid[speaker] in ignore_speakers: + speaker = random.sample(self.speakers, 1)[0] + if self.num_utter_per_speaker > len(self.speaker_to_utters[speaker]): utters = random.choices(self.speaker_to_utters[speaker], k=self.num_utter_per_speaker) else: @@ -127,7 +139,8 @@ class MyDataset(Dataset): for _ in range(self.num_utter_per_speaker): # TODO:dummy but works while True: - if len(self.speaker_to_utters[speaker]) > 0: + # remove speakers that have num_utter less than 2 + if len(self.speaker_to_utters[speaker]) > 1: utter = random.sample(self.speaker_to_utters[speaker], 1)[0] else: self.speakers.remove(speaker) @@ -139,21 +152,47 @@ class MyDataset(Dataset): self.speaker_to_utters[speaker].remove(utter) wavs.append(wav) - labels.append(speaker) + labels.append(self.speakerid_to_classid[speaker]) return wavs, labels def __getitem__(self, idx): speaker, _ = self.__sample_speaker() - return speaker + speaker_id = self.speakerid_to_classid[speaker] + return speaker, speaker_id def collate_fn(self, batch): + # get the batch speaker_ids + batch = np.array(batch) + speakers_id_in_batch = set(batch[:, 1].astype(np.int32)) + labels = [] feats 
= [] - for speaker in batch: + speakers = set() + for speaker, speaker_id in batch: + if random.random() < self.sample_from_storage_p and self.storage.full(): # sample from storage (if full), ignoring the speaker wavs_, labels_ = random.choice(self.storage.queue) + + # force choose the current speaker or other not in batch + '''while labels_[0] in speakers_id_in_batch: + if labels_[0] == speaker_id: + break + wavs_, labels_ = random.choice(self.storage.queue)''' + + speakers.add(labels_[0]) + speakers_id_in_batch.add(labels_[0]) + else: + # ensure that an speaker appears only once in the batch + if speaker_id in speakers: + speaker, _ = self.__sample_speaker(speakers_id_in_batch) + speaker_id = self.speakerid_to_classid[speaker] + # append the new speaker from batch + speakers_id_in_batch.add(speaker_id) + + speakers.add(speaker_id) + # don't sample from storage, but from HDD wavs_, labels_ = self.__sample_speaker_utterances(speaker) # if storage is full, remove an item @@ -167,14 +206,15 @@ class MyDataset(Dataset): noises_ = [np.random.normal(0, self.additive_noise, size=len(w)) for w in wavs_] wavs_ = [wavs_[i] + noises_[i] for i in range(len(wavs_))] - # get a random subset of each of the wavs and convert to MFCC. + # get a random subset of each of the wavs and extract mel spectrograms. offsets_ = [random.randint(0, wav.shape[0] - self.seq_len) for wav in wavs_] mels_ = [ self.ap.melspectrogram(wavs_[i][offsets_[i] : offsets_[i] + self.seq_len]) for i in range(len(wavs_)) ] feats_ = [torch.FloatTensor(mel) for mel in mels_] - labels.append(labels_) + labels.append(torch.LongTensor(labels_)) feats.extend(feats_) feats = torch.stack(feats) + labels = torch.stack(labels) return feats.transpose(1, 2), labels diff --git a/TTS/speaker_encoder/losses.py b/TTS/speaker_encoder/losses.py index 69264ab4..52871fb4 100644 --- a/TTS/speaker_encoder/losses.py +++ b/TTS/speaker_encoder/losses.py @@ -103,10 +103,13 @@ class GE2ELoss(nn.Module): L.append(L_row) return torch.stack(L) - def forward(self, dvecs): + def forward(self, dvecs, label=None): """ Calculates the GE2E loss for an input of dimensions (num_speakers, num_utts_per_speaker, dvec_feats) """ + + assert x.size()[1] >= 2 + centroids = torch.mean(dvecs, 1) cos_sim_matrix = self.calc_cosine_sim(dvecs, centroids) torch.clamp(self.w, 1e-6) @@ -138,10 +141,13 @@ class AngleProtoLoss(nn.Module): print(" > Initialised Angular Prototypical loss") - def forward(self, x): + def forward(self, x, label=None): """ Calculates the AngleProto loss for an input of dimensions (num_speakers, num_utts_per_speaker, dvec_feats) """ + + assert x.size()[1] >= 2 + out_anchor = torch.mean(x[:, 1:, :], 1) out_positive = x[:, 0, :] num_speakers = out_anchor.size()[0] @@ -155,3 +161,57 @@ class AngleProtoLoss(nn.Module): label = torch.arange(num_speakers).to(cos_sim_matrix.device) L = self.criterion(cos_sim_matrix, label) return L + +class SoftmaxLoss(nn.Module): + """ + Implementation of the Softmax loss as defined in https://arxiv.org/abs/2003.11982 + Args: + - embedding_dim (float): speaker embedding dim + - n_speakers (float): number of speakers + """ + def __init__(self, embedding_dim, n_speakers): + super().__init__() + + self.criterion = torch.nn.CrossEntropyLoss() + self.fc = nn.Linear(embedding_dim, n_speakers) + + print('Initialised Softmax Loss') + + def forward(self, x, label=None): + + x = self.fc(x) + L = self.criterion(x, label) + + return L + +class SoftmaxAngleProtoLoss(nn.Module): + """ + Implementation of the Softmax AnglePrototypical loss as defined 
in https://arxiv.org/abs/2009.14153 + Args: + - embedding_dim (float): speaker embedding dim + - n_speakers (float): number of speakers + - init_w (float): defines the initial value of w + - init_b (float): definies the initial value of b + """ + def __init__(self, embedding_dim, n_speakers, init_w=10.0, init_b=-5.0): + super().__init__() + + self.softmax = SoftmaxLoss(embedding_dim, n_speakers) + self.angleproto = AngleProtoLoss(init_w, init_b) + + print('Initialised SoftmaxAnglePrototypical Loss') + + def forward(self, x, label=None): + """ + Calculates the SoftmaxAnglePrototypical loss for an input of dimensions (num_speakers, num_utts_per_speaker, dvec_feats) + """ + + assert x.size()[1] == 2 + + Lp = self.angleproto(x) + + x = x.reshape(-1, x.size()[-1]) + label = label.reshape(-1) + Ls = self.softmax(x, label) + + return Ls+Lp diff --git a/TTS/speaker_encoder/utils/generic_utils.py b/TTS/speaker_encoder/utils/generic_utils.py index c9bfa679..c50304cc 100644 --- a/TTS/speaker_encoder/utils/generic_utils.py +++ b/TTS/speaker_encoder/utils/generic_utils.py @@ -82,7 +82,7 @@ def check_config_speaker_encoder(c): check_argument("griffin_lim_iters", c["audio"], restricted=True, val_type=int, min_val=10, max_val=1000) # training parameters - check_argument("loss", c, enum_list=["ge2e", "angleproto"], restricted=True, val_type=str) + check_argument("loss", c, enum_list=["ge2e", "angleproto", "softmaxproto"], restricted=True, val_type=str) check_argument("grad_clip", c, restricted=True, val_type=float) check_argument("epochs", c, restricted=True, val_type=int, min_val=1) check_argument("lr", c, restricted=True, val_type=float, min_val=0) From 85ccad7e0aca3ac0173040b983210465ae35638f Mon Sep 17 00:00:00 2001 From: Edresson Date: Tue, 11 May 2021 00:59:57 -0300 Subject: [PATCH 03/18] add Audio data augamentation Addtive and RIR --- TTS/bin/train_encoder.py | 2 +- TTS/speaker_encoder/configs/config.json | 10 ++ .../configs/config_softmaxproto.json | 49 ++++++- TTS/speaker_encoder/dataset.py | 43 ++++-- TTS/speaker_encoder/utils/generic_utils.py | 133 +++++++++++++++++- 5 files changed, 217 insertions(+), 20 deletions(-) diff --git a/TTS/bin/train_encoder.py b/TTS/bin/train_encoder.py index 05a76b68..5eca376c 100644 --- a/TTS/bin/train_encoder.py +++ b/TTS/bin/train_encoder.py @@ -51,8 +51,8 @@ def setup_loader(ap: AudioProcessor, is_val: bool = False, verbose: bool = False skip_speakers=getattr(c, "skip_speakers", False), storage_size=c.storage["storage_size"], sample_from_storage_p=c.storage["sample_from_storage_p"], - additive_noise=c.storage["additive_noise"], verbose=verbose, + augmentation_config=getattr(c, "audio_augmentation", None) ) # sampler = DistributedSampler(dataset) if num_gpus > 1 else None diff --git a/TTS/speaker_encoder/configs/config.json b/TTS/speaker_encoder/configs/config.json index 2b437e5a..6d983e86 100644 --- a/TTS/speaker_encoder/configs/config.json +++ b/TTS/speaker_encoder/configs/config.json @@ -53,6 +53,16 @@ "num_lstm_layers": 3, "use_lstm_with_projection": true }, + + "audio_augmentation": { + "p": 0, + //add a gaussian noise to the data in order to increase robustness + "gaussian":{ // as the insertion of Gaussian noise is quick to be calculated, we added it after loading the wav file, this way, even audios that were reused with the cache can receive this noise + "p": 1, // propability of apply this method, 0 is disable + "min_amplitude": 0.0, + "max_amplitude": 1e-5 + } + }, "storage": { "sample_from_storage_p": 0.66, // the probability with which we'll sample 
from the DataSet in-memory storage "storage_size": 15, // the size of the in-memory storage with respect to a single batch diff --git a/TTS/speaker_encoder/configs/config_softmaxproto.json b/TTS/speaker_encoder/configs/config_softmaxproto.json index f7a24a15..f1bf9be0 100644 --- a/TTS/speaker_encoder/configs/config_softmaxproto.json +++ b/TTS/speaker_encoder/configs/config_softmaxproto.json @@ -25,7 +25,7 @@ "mel_fmin": 0.0, // minimum freq level for mel-spec. ~50 for male and ~95 for female voices. Tune for dataset!! "mel_fmax": 8000.0, // maximum freq level for mel-spec. Tune for dataset!! "spec_gain": 20.0, - "do_trim_silence": false, // enable trimming of slience of audio as you load it. LJspeech (false), TWEB (false), Nancy (true) + "do_trim_silence": true, // enable trimming of slience of audio as you load it. LJspeech (false), TWEB (false), Nancy (true) "trim_db": 60, // threshold for timming silence. Set this according to your dataset. "stats_path": null // DO NOT USE WITH MULTI_SPEAKER MODEL. scaler stats file computed by 'compute_statistics.py'. If it is defined, mean-std based notmalization is used and other normalization params are ignored }, @@ -41,7 +41,7 @@ "steps_plot_stats": 10, // number of steps to plot embeddings. // Speakers config - "num_speakers_in_batch": 108, // Batch size for training. + "num_speakers_in_batch": 2, // Batch size for training. "num_utters_per_speaker": 2, // "skip_speakers": true, // skip speakers with samples less than "num_utters_per_speaker" @@ -54,6 +54,41 @@ "print_step": 20, // Number of steps to log traning on console. "output_path": "../../../checkpoints/speaker_encoder/", // DATASET-RELATED: output path for all training outputs. + "audio_augmentation": { + "p": 0.75, // propability of apply this method, 0 is disable rir and additive noise augmentation + "rir":{ + "rir_path": "/workspace/store/ecasanova/ComParE/RIRS_NOISES/simulated_rirs/", + "conv_mode": "full" + }, + "additive":{ + "sounds_path": "/workspace/store/ecasanova/ComParE/musan/", + // list of each of the directories in your data augmentation, if a directory is in "sounds_path" but is not listed here it will be ignored + "speech":{ + "min_snr_in_db": 13, + "max_snr_in_db": 20, + "min_num_noises": 3, + "max_num_noises": 7 + }, + "noise":{ + "min_snr_in_db": 0, + "max_snr_in_db": 15, + "min_num_noises": 1, + "max_num_noises": 1 + }, + "music":{ + "min_snr_in_db": 5, + "max_snr_in_db": 15, + "min_num_noises": 1, + "max_num_noises": 1 + } + }, + //add a gaussian noise to the data in order to increase robustness + "gaussian":{ // as the insertion of Gaussian noise is quick to be calculated, we added it after loading the wav file, this way, even audios that were reused with the cache can receive this noise + "p": 1, // propability of apply this method, 0 is disable + "min_amplitude": 0.0, + "max_amplitude": 1e-5 + } + }, "model": { "input_dim": 80, "proj_dim": 512, @@ -63,11 +98,17 @@ }, "storage": { "sample_from_storage_p": 0.66, // the probability with which we'll sample from the DataSet in-memory storage - "storage_size": 15, // the size of the in-memory storage with respect to a single batch - "additive_noise": 1e-5 // add very small gaussian noise to the data in order to increase robustness + "storage_size": 1 // the size of the in-memory storage with respect to a single batch }, "datasets": [ + + { + "name": "common_voice", + "path": "/workspace/scratch/ecasanova/datasets/common-voice/cv-corpus-6.1-2020-12-11_16khz/ab/", + "meta_file_train": "train.tsv", + "meta_file_val": 
"test.tsv" + }, { "name": "vctk", "path": "/workspace/store/ecasanova/datasets/VCTK-Corpus-removed-silence/", diff --git a/TTS/speaker_encoder/dataset.py b/TTS/speaker_encoder/dataset.py index 3308fef9..14bb57c8 100644 --- a/TTS/speaker_encoder/dataset.py +++ b/TTS/speaker_encoder/dataset.py @@ -4,7 +4,7 @@ import random import numpy as np import torch from torch.utils.data import Dataset - +from TTS.speaker_encoder.utils.generic_utils import AugmentWAV class MyDataset(Dataset): def __init__( @@ -15,10 +15,11 @@ class MyDataset(Dataset): num_speakers_in_batch=64, storage_size=1, sample_from_storage_p=0.5, - additive_noise=0, + additive_noise= 1e-5, num_utter_per_speaker=10, skip_speakers=False, verbose=False, + augmentation_config=None ): """ Args: @@ -39,18 +40,27 @@ class MyDataset(Dataset): self.__parse_items() self.storage = queue.Queue(maxsize=storage_size * num_speakers_in_batch) self.sample_from_storage_p = float(sample_from_storage_p) - self.additive_noise = float(additive_noise) speakers_aux = list(self.speakers) speakers_aux.sort() self.speakerid_to_classid = {key : i for i, key in enumerate(speakers_aux)} + # Augmentation + self.augmentator = None + self.gaussian_augmentation_config = None + if augmentation_config: + self.data_augmentation_p = augmentation_config['p'] + if self.data_augmentation_p and ('additive' in augmentation_config or 'rir' in augmentation_config): + self.augmentator = AugmentWAV(ap, augmentation_config) + + if 'gaussian' in augmentation_config.keys(): + self.gaussian_augmentation_config = augmentation_config['gaussian'] + if self.verbose: print("\n > DataLoader initialization") print(f" | > Speakers per Batch: {num_speakers_in_batch}") print(f" | > Storage Size: {self.storage.maxsize} instances, each with {num_utter_per_speaker} utters") print(f" | > Sample_from_storage_p : {self.sample_from_storage_p}") - print(f" | > Noise added : {self.additive_noise}") print(f" | > Number of instances : {len(self.items)}") print(f" | > Sequence length: {self.seq_len}") print(f" | > Num speakers: {len(self.speakers)}") @@ -151,6 +161,10 @@ class MyDataset(Dataset): break self.speaker_to_utters[speaker].remove(utter) + if self.augmentator is not None and self.data_augmentation_p: + if random.random() < self.data_augmentation_p: + wav = self.augmentator.apply_one(wav) + wavs.append(wav) labels.append(self.speakerid_to_classid[speaker]) return wavs, labels @@ -201,20 +215,21 @@ class MyDataset(Dataset): # put the newly loaded item into storage self.storage.put_nowait((wavs_, labels_)) - # add random gaussian noise - if self.additive_noise > 0: - noises_ = [np.random.normal(0, self.additive_noise, size=len(w)) for w in wavs_] - wavs_ = [wavs_[i] + noises_[i] for i in range(len(wavs_))] - # get a random subset of each of the wavs and extract mel spectrograms. 
- offsets_ = [random.randint(0, wav.shape[0] - self.seq_len) for wav in wavs_] - mels_ = [ - self.ap.melspectrogram(wavs_[i][offsets_[i] : offsets_[i] + self.seq_len]) for i in range(len(wavs_)) - ] - feats_ = [torch.FloatTensor(mel) for mel in mels_] + feats_ = [] + for wav in wavs_: + offset = random.randint(0, wav.shape[0] - self.seq_len) + wav = wav[offset : offset + self.seq_len] + # add random gaussian noise + if self.gaussian_augmentation_config and self.gaussian_augmentation_config['p']: + if random.random() < self.gaussian_augmentation_config['p']: + wav += np.random.normal(self.gaussian_augmentation_config['min_amplitude'], self.gaussian_augmentation_config['max_amplitude'], size=len(wav)) + mel = self.ap.melspectrogram(wav) + feats_.append(torch.FloatTensor(mel)) labels.append(torch.LongTensor(labels_)) feats.extend(feats_) feats = torch.stack(feats) labels = torch.stack(labels) + return feats.transpose(1, 2), labels diff --git a/TTS/speaker_encoder/utils/generic_utils.py b/TTS/speaker_encoder/utils/generic_utils.py index c50304cc..d1dbf3ae 100644 --- a/TTS/speaker_encoder/utils/generic_utils.py +++ b/TTS/speaker_encoder/utils/generic_utils.py @@ -2,11 +2,143 @@ import datetime import os import re +import numpy as np import torch +import glob +import random +from scipy import signal from TTS.speaker_encoder.model import SpeakerEncoder from TTS.utils.generic_utils import check_argument + +class AugmentWAV(object): + def __init__(self, ap, augmentation_config): + + self.ap = ap + + '''augmentation_config = { + "p": 1, + "rir":{ + "rir_path": "rir_path/" + "conv_mode": "full" + }, + "additive":{ + "sounds_path": "musan/", + # directorys in sounds_path + "speech":{ + "min_snr_in_db": 13, + "max_snr_in_db": 20, + "min_num_noises": 3, + "max_num_noises": 7 + }, + "noise":{ + "min_snr_in_db": 0, + "max_snr_in_db": 15, + "min_num_noises": 1, + "max_num_noises": 1 + }, + "music":{ + "min_snr_in_db": 5, + "max_snr_in_db": 15, + "min_num_noises": 1, + "max_num_noises": 1 + } + } + }''' + + self.use_additive_noise = False + if 'additive' in augmentation_config.keys(): + self.additive_noise_config = augmentation_config['additive'] + additive_path = self.additive_noise_config['sounds_path'] + if additive_path: + self.use_additive_noise = True + # get noise types + self.additive_noise_types = [] + for key in self.additive_noise_config.keys(): + if isinstance(self.additive_noise_config[key], dict): + self.additive_noise_types.append(key) + + additive_files = glob.glob(os.path.join(additive_path,'**/*.wav'), recursive=True) + + self.noise_list = {} + + for wav_file in additive_files: + noise_dir = wav_file.replace(additive_path, '').split(os.sep)[0] + # ignore not listed directories + if noise_dir not in self.additive_noise_types: + continue + if not noise_dir in self.noise_list: + self.noise_list[noise_dir] = [] + self.noise_list[noise_dir].append(wav_file) + + print(f" | > Using Additive Noise Augmentation: with {len(additive_files)} audios instances from {self.additive_noise_types}") + + self.use_rir = False + if 'rir' in augmentation_config.keys(): + self.rir_config = augmentation_config['rir'] + if self.rir_config['rir_path']: + self.rir_files = glob.glob(os.path.join(self.rir_config['rir_path'],'**/*.wav'), recursive=True) + self.use_rir = True + + print(f" | > Using RIR Noise Augmentation: with {len(self.rir_files)} audios instances") + + self.create_augmentation_global_list() + + def create_augmentation_global_list(self): + if self.use_additive_noise: + self.global_noise_list = 
self.additive_noise_types + else: + self.global_noise_list = [] + if self.use_rir: + self.global_noise_list.append("RIR_AUG") + + def additive_noise(self, noise_type, audio): + + clean_db = 10 * np.log10(np.mean(audio**2) + 1e-4) + + noise_list = random.sample(self.noise_list[noise_type], random.randint(self.additive_noise_config[noise_type]['min_num_noises'], self.additive_noise_config[noise_type]['max_num_noises'])) + + audio_len = audio.shape[0] + noises_wav = None + for noise in noise_list: + noiseaudio = self.ap.load_wav(noise, sr=self.ap.sample_rate)[:audio_len] + + if noiseaudio.shape[0] < audio_len: + continue + + noise_snr = random.uniform(self.additive_noise_config[noise_type]['min_snr_in_db'], self.additive_noise_config[noise_type]['max_snr_in_db']) + noise_db = 10 * np.log10(np.mean(noiseaudio ** 2) + 1e-4) + noise_wav = np.sqrt(10 ** ((clean_db - noise_db - noise_snr) / 10)) * noiseaudio + + if noises_wav is None: + noises_wav = noise_wav + else: + noises_wav += noise_wav + + # if all sampled noise clips were shorter than the audio, sample again + if noises_wav is None: + print(" > Audio ignored: sampled noise clips were too short, resampling") + return self.additive_noise(noise_type, audio) + + return audio + noises_wav + + def reverberate(self, audio): + audio_len = audio.shape[0] + + rir_file = random.choice(self.rir_files) + + rir = self.ap.load_wav(rir_file, sr=self.ap.sample_rate) + rir = rir / np.sqrt(np.sum(rir ** 2)) + return signal.convolve(audio, rir, mode=self.rir_config['conv_mode'])[:audio_len] + + def apply_one(self, audio): + noise_type = random.choice(self.global_noise_list) + if noise_type == "RIR_AUG": + return self.reverberate(audio) + else: + return self.additive_noise(noise_type, audio) def to_camel(text): text = text.capitalize() @@ -112,7 +244,6 @@ def check_config_speaker_encoder(c): check_argument("storage", c, restricted=True, val_type=dict) check_argument("sample_from_storage_p", c["storage"], restricted=True, val_type=float, min_val=0.0, max_val=1.0) check_argument("storage_size", c["storage"], restricted=True, val_type=int, min_val=1, max_val=100) - check_argument("additive_noise", c["storage"], restricted=True, val_type=float, min_val=0.0, max_val=1.0) # datasets - checking only the first entry check_argument("datasets", c, restricted=True, val_type=list) From 3fcc748b2ede723b19b039313c5d29dfefb8d351 Mon Sep 17 00:00:00 2001 From: Edresson Date: Tue, 11 May 2021 16:27:05 -0300 Subject: [PATCH 04/18] implement the Speaker Encoder H/ASP --- .gitignore | 3 + TTS/bin/train_encoder.py | 60 +++---- TTS/speaker_encoder/configs/config.json | 1 + .../configs/config_softmaxproto.json | 8 +- .../configs/config_voxceleb_trainer.json | 110 ++++++++++++ .../{model.py => models/lstm.py} | 2 +- TTS/speaker_encoder/models/resnet.py | 157 ++++++++++++++++++ TTS/speaker_encoder/utils/generic_utils.py | 24 ++- 8 files changed, 322 insertions(+), 43 deletions(-) create mode 100644 TTS/speaker_encoder/configs/config_voxceleb_trainer.json rename TTS/speaker_encoder/{model.py => models/lstm.py} (99%) create mode 100644 TTS/speaker_encoder/models/resnet.py diff --git a/.gitignore b/.gitignore index 1829dd93..df077bc7 100644 --- a/.gitignore +++ b/.gitignore @@ -133,3 +133,6 @@ TTS/tts/layers/glow_tts/monotonic_align/core.c .vscode-upload.json temp_build/* recipes/* + +# nohup logs +*.out \ No newline at end of file diff --git a/TTS/bin/train_encoder.py b/TTS/bin/train_encoder.py index 5eca376c..264ac74f 100644 --- a/TTS/bin/train_encoder.py +++ b/TTS/bin/train_encoder.py @@ -12,8 +12,7 @@ from
torch.utils.data import DataLoader from TTS.speaker_encoder.dataset import MyDataset from TTS.speaker_encoder.losses import AngleProtoLoss, GE2ELoss, SoftmaxLoss, SoftmaxAngleProtoLoss -from TTS.speaker_encoder.model import SpeakerEncoder -from TTS.speaker_encoder.utils.generic_utils import check_config_speaker_encoder, save_best_model +from TTS.speaker_encoder.utils.generic_utils import check_config_speaker_encoder, save_best_model, save_checkpoint, setup_model from TTS.speaker_encoder.utils.visual import plot_embeddings from TTS.tts.datasets.preprocess import load_meta_data from TTS.utils.audio import AudioProcessor @@ -66,21 +65,7 @@ def setup_loader(ap: AudioProcessor, is_val: bool = False, verbose: bool = False return loader, dataset.get_num_speakers() -def train(model, optimizer, scheduler, ap, global_step): - data_loader, num_speakers = setup_loader(ap, is_val=False, verbose=True) - - if c.loss == "ge2e": - criterion = GE2ELoss(loss_method="softmax") - elif c.loss == "angleproto": - criterion = AngleProtoLoss() - elif c.loss == "softmaxproto": - criterion = SoftmaxAngleProtoLoss(c.model["proj_dim"], num_speakers) - else: - raise Exception("The %s not is a loss supported" % c.loss) - - if use_cuda: - model = model.cuda() - criterion.cuda() +def train(model, optimizer, scheduler, criterion, data_loader, ap, global_step): model.train() epoch_time = 0 @@ -154,7 +139,7 @@ def train(model, optimizer, scheduler, ap, global_step): ) # save best model - best_loss = save_best_model(model, optimizer, avg_loss, best_loss, OUT_PATH, global_step) + best_loss = save_best_model(model, optimizer, criterion, avg_loss, best_loss, OUT_PATH, global_step) end_time = time.time() return avg_loss, global_step @@ -166,14 +151,24 @@ def main(args): # pylint: disable=redefined-outer-name global meta_data_eval ap = AudioProcessor(**c.audio) - model = SpeakerEncoder( - input_dim=c.model["input_dim"], - proj_dim=c.model["proj_dim"], - lstm_dim=c.model["lstm_dim"], - num_lstm_layers=c.model["num_lstm_layers"], - ) + model = setup_model(c) optimizer = RAdam(model.parameters(), lr=c.lr) + # pylint: disable=redefined-outer-name + meta_data_train, meta_data_eval = load_meta_data(c.datasets) + + data_loader, num_speakers = setup_loader(ap, is_val=False, verbose=True) + + if c.loss == "ge2e": + criterion = GE2ELoss(loss_method="softmax") + elif c.loss == "angleproto": + criterion = AngleProtoLoss() + elif c.loss == "softmaxproto": + criterion = SoftmaxAngleProtoLoss(c.model["proj_dim"], num_speakers) + else: + raise Exception("The %s not is a loss supported" % c.loss) + + if args.restore_path: checkpoint = torch.load(args.restore_path) try: @@ -183,14 +178,19 @@ def main(args): # pylint: disable=redefined-outer-name if c.reinit_layers: raise RuntimeError model.load_state_dict(checkpoint["model"]) - except KeyError: + + if 'criterion' in checkpoint: + criterion.load_state_dict(checkpoint["criterion"]) + + except (KeyError, RuntimeError): print(" > Partial model initialization.") model_dict = model.state_dict() - model_dict = set_init_dict(model_dict, checkpoint, c) + model_dict = set_init_dict(model_dict, checkpoint['model'], c) model.load_state_dict(model_dict) del model_dict for group in optimizer.param_groups: group["lr"] = c.lr + print(" > Model restored from step %d" % checkpoint["step"], flush=True) args.restore_step = checkpoint["step"] else: @@ -204,11 +204,13 @@ def main(args): # pylint: disable=redefined-outer-name num_params = count_parameters(model) print("\n > Model has {} parameters".format(num_params), 
flush=True) - # pylint: disable=redefined-outer-name - meta_data_train, meta_data_eval = load_meta_data(c.datasets) + if use_cuda: + model = model.cuda() + criterion.cuda() global_step = args.restore_step - _, global_step = train(model, optimizer, scheduler, ap, global_step) + # save_checkpoint(model, optimizer, criterion, 0.9, '../', global_step, 1) + _, global_step = train(model, optimizer, scheduler, criterion, data_loader, ap, global_step) if __name__ == "__main__": diff --git a/TTS/speaker_encoder/configs/config.json b/TTS/speaker_encoder/configs/config.json index 6d983e86..84253b6e 100644 --- a/TTS/speaker_encoder/configs/config.json +++ b/TTS/speaker_encoder/configs/config.json @@ -1,5 +1,6 @@ { + "model_name": "lstm", "run_name": "mueller91", "run_description": "train speaker encoder with voxceleb1, voxceleb2 and libriSpeech ", "audio":{ diff --git a/TTS/speaker_encoder/configs/config_softmaxproto.json b/TTS/speaker_encoder/configs/config_softmaxproto.json index f1bf9be0..2283eb9c 100644 --- a/TTS/speaker_encoder/configs/config_softmaxproto.json +++ b/TTS/speaker_encoder/configs/config_softmaxproto.json @@ -1,5 +1,6 @@ { + "model_name": "resnet", "run_name": "speaker_encoder", "run_description": "train speaker encoder with VCTK", "audio":{ @@ -41,7 +42,7 @@ "steps_plot_stats": 10, // number of steps to plot embeddings. // Speakers config - "num_speakers_in_batch": 2, // Batch size for training. + "num_speakers_in_batch": 128, // Batch size for training. "num_utters_per_speaker": 2, // "skip_speakers": true, // skip speakers with samples less than "num_utters_per_speaker" @@ -91,10 +92,7 @@ }, "model": { "input_dim": 80, - "proj_dim": 512, - "lstm_dim": 768, - "num_lstm_layers": 3, - "use_lstm_with_projection": true + "proj_dim": 512 }, "storage": { "sample_from_storage_p": 0.66, // the probability with which we'll sample from the DataSet in-memory storage diff --git a/TTS/speaker_encoder/configs/config_voxceleb_trainer.json b/TTS/speaker_encoder/configs/config_voxceleb_trainer.json new file mode 100644 index 00000000..e5e487f0 --- /dev/null +++ b/TTS/speaker_encoder/configs/config_voxceleb_trainer.json @@ -0,0 +1,110 @@ + +{ + "model_name": "resnet", + "run_name": "speaker_encoder", + "run_description": "train speaker encoder with VCTK", + "audio":{ + // Audio processing parameters + "num_mels": 64, // size of the mel spec frame. + "fft_size": 1024, // number of stft frequency levels. Size of the linear spectogram frame. + "sample_rate": 16000, // DATASET-RELATED: wav sample-rate. If different than the original data, it is resampled. + "win_length": 1024, // stft window length in ms. + "hop_length": 256, // stft window hop-lengh in ms. + "frame_length_ms": null, // stft window length in ms.If null, 'win_length' is used. + "frame_shift_ms": null, // stft window hop-lengh in ms. If null, 'hop_length' is used. + "preemphasis": 0.98, // pre-emphasis to reduce spec noise and make it more structured. If 0.0, no -pre-emphasis. + "min_level_db": -100, // normalization range + "ref_level_db": 20, // reference level db, theoretically 20db is the sound of air. + "power": 1.5, // value to sharpen wav signals after GL algorithm. + "griffin_lim_iters": 60,// #griffin-lim iterations. 30-60 is a good range. Larger the value, slower the generation. 
+ "stft_pad_mode": "reflect", + // Normalization parameters + "signal_norm": true, // normalize the spec values in range [0, 1] + "symmetric_norm": true, // move normalization to range [-1, 1] + "max_norm": 4.0, // scale normalization to range [-max_norm, max_norm] or [0, max_norm] + "clip_norm": true, // clip normalized values into the range. + "mel_fmin": 0.0, // minimum freq level for mel-spec. ~50 for male and ~95 for female voices. Tune for dataset!! + "mel_fmax": 8000.0, // maximum freq level for mel-spec. Tune for dataset!! + "spec_gain": 20.0, + "do_trim_silence": true, // enable trimming of slience of audio as you load it. LJspeech (false), TWEB (false), Nancy (true) + "trim_db": 60, // threshold for timming silence. Set this according to your dataset. + "stats_path": null // DO NOT USE WITH MULTI_SPEAKER MODEL. scaler stats file computed by 'compute_statistics.py'. If it is defined, mean-std based notmalization is used and other normalization params are ignored + }, + "reinit_layers": [], + + "loss": "softmaxproto", // "ge2e" to use Generalized End-to-End loss, "angleproto" to use Angular Prototypical loss and "softmaxproto" to use Softmax with Angular Prototypical loss + "grad_clip": 3.0, // upper limit for gradients for clipping. + "epochs": 1000, // total number of epochs to train. + "lr": 0.0001, // Initial learning rate. If Noam decay is active, maximum learning rate. + "lr_decay": false, // if true, Noam learning rate decaying is applied through training. + "warmup_steps": 4000, // Noam decay steps to increase the learning rate from 0 to "lr" + "tb_model_param_stats": false, // true, plots param stats per layer on tensorboard. Might be memory consuming, but good for debugging. + "steps_plot_stats": 10, // number of steps to plot embeddings. + + // Speakers config + "num_speakers_in_batch": 256, // Batch size for training. + "num_utters_per_speaker": 2, // + "skip_speakers": true, // skip speakers with samples less than "num_utters_per_speaker" + + "voice_len": 2, // number of seconds for each training instance + + "num_loader_workers": 8, // number of training data loader processes. Don't set it too big. 4-8 are good values. + "wd": 0.000001, // Weight decay weight. + "checkpoint": true, // If true, it saves checkpoints per "save_step" + "save_step": 1000, // Number of training steps expected to save traning stats and checkpoints. + "print_step": 20, // Number of steps to log traning on console. + "output_path": "../../../checkpoints/speaker_encoder/continue-training-voxceleb-trainer/", // DATASET-RELATED: output path for all training outputs. 
+ + "audio_augmentation": { + "p": 0.75, // propability of apply this method, 0 is disable rir and additive noise augmentation + "rir":{ + "rir_path": "/workspace/store/ecasanova/ComParE/RIRS_NOISES/simulated_rirs/", + "conv_mode": "full" + }, + "additive":{ + "sounds_path": "/workspace/store/ecasanova/ComParE/musan/", + // list of each of the directories in your data augmentation, if a directory is in "sounds_path" but is not listed here it will be ignored + "speech":{ + "min_snr_in_db": 13, + "max_snr_in_db": 20, + "min_num_noises": 3, + "max_num_noises": 7 + }, + "noise":{ + "min_snr_in_db": 0, + "max_snr_in_db": 15, + "min_num_noises": 1, + "max_num_noises": 1 + }, + "music":{ + "min_snr_in_db": 5, + "max_snr_in_db": 15, + "min_num_noises": 1, + "max_num_noises": 1 + } + }, + //add a gaussian noise to the data in order to increase robustness + "gaussian":{ // as the insertion of Gaussian noise is quick to be calculated, we added it after loading the wav file, this way, even audios that were reused with the cache can receive this noise + "p": 0.5, // propability of apply this method, 0 is disable + "min_amplitude": 0.0, + "max_amplitude": 1e-5 + } + }, + "model": { + "input_dim": 64, + "proj_dim": 512 + }, + "storage": { + "sample_from_storage_p": 0.66, // the probability with which we'll sample from the DataSet in-memory storage + "storage_size": 1 // the size of the in-memory storage with respect to a single batch + }, + "datasets": + [ + { + "name": "voxceleb2", + "path": "/workspace/scratch/ecasanova/datasets/VoxCeleb/vox2_dev_aac/", + "meta_file_train": null, + "meta_file_val": null + } + ] +} \ No newline at end of file diff --git a/TTS/speaker_encoder/model.py b/TTS/speaker_encoder/models/lstm.py similarity index 99% rename from TTS/speaker_encoder/model.py rename to TTS/speaker_encoder/models/lstm.py index 3d52382a..05a56675 100644 --- a/TTS/speaker_encoder/model.py +++ b/TTS/speaker_encoder/models/lstm.py @@ -29,7 +29,7 @@ class LSTMWithoutProjection(nn.Module): return self.relu(self.linear(hidden[-1])) -class SpeakerEncoder(nn.Module): +class LSTMSpeakerEncoder(nn.Module): def __init__(self, input_dim, proj_dim=256, lstm_dim=768, num_lstm_layers=3, use_lstm_with_projection=True): super().__init__() self.use_lstm_with_projection = use_lstm_with_projection diff --git a/TTS/speaker_encoder/models/resnet.py b/TTS/speaker_encoder/models/resnet.py new file mode 100644 index 00000000..b35a9c89 --- /dev/null +++ b/TTS/speaker_encoder/models/resnet.py @@ -0,0 +1,157 @@ +import torch +import torch.nn as nn +import torch.nn.functional as F + +class SELayer(nn.Module): + def __init__(self, channel, reduction=8): + super(SELayer, self).__init__() + self.avg_pool = nn.AdaptiveAvgPool2d(1) + self.fc = nn.Sequential( + nn.Linear(channel, channel // reduction), + nn.ReLU(inplace=True), + nn.Linear(channel // reduction, channel), + nn.Sigmoid() + ) + + def forward(self, x): + b, c, _, _ = x.size() + y = self.avg_pool(x).view(b, c) + y = self.fc(y).view(b, c, 1, 1) + return x * y + +class SEBasicBlock(nn.Module): + expansion = 1 + + def __init__(self, inplanes, planes, stride=1, downsample=None, reduction=8): + super(SEBasicBlock, self).__init__() + self.conv1 = nn.Conv2d(inplanes, planes, kernel_size=3, stride=stride, padding=1, bias=False) + self.bn1 = nn.BatchNorm2d(planes) + self.conv2 = nn.Conv2d(planes, planes, kernel_size=3, padding=1, bias=False) + self.bn2 = nn.BatchNorm2d(planes) + self.relu = nn.ReLU(inplace=True) + self.se = SELayer(planes, reduction) + self.downsample = downsample + 
self.stride = stride + + def forward(self, x): + residual = x + + out = self.conv1(x) + out = self.relu(out) + out = self.bn1(out) + + out = self.conv2(out) + out = self.bn2(out) + out = self.se(out) + + if self.downsample is not None: + residual = self.downsample(x) + + out += residual + out = self.relu(out) + return out + +class ResNetSpeakerEncoder(nn.Module): + """Implementation of the model H/ASP without batch normalization in speaker embedding. This model was proposed in: https://arxiv.org/abs/2009.14153 + Adapted from: https://github.com/clovaai/voxceleb_trainer + """ + def __init__(self, input_dim=64, proj_dim=512, layers=[3, 4, 6, 3], num_filters=[32, 64, 128, 256], encoder_type='ASP', log_input=False): + super(ResNetSpeakerEncoder, self).__init__() + + self.encoder_type = encoder_type + self.input_dim = input_dim + self.log_input = log_input + self.conv1 = nn.Conv2d(1, num_filters[0] , kernel_size=3, stride=1, padding=1) + self.relu = nn.ReLU(inplace=True) + self.bn1 = nn.BatchNorm2d(num_filters[0]) + + self.inplanes = num_filters[0] + self.layer1 = self.create_layer(SEBasicBlock, num_filters[0], layers[0]) + self.layer2 = self.create_layer(SEBasicBlock, num_filters[1], layers[1], stride=(2, 2)) + self.layer3 = self.create_layer(SEBasicBlock, num_filters[2], layers[2], stride=(2, 2)) + self.layer4 = self.create_layer(SEBasicBlock, num_filters[3], layers[3], stride=(2, 2)) + + self.instancenorm = nn.InstanceNorm1d(input_dim) + + outmap_size = int(self.input_dim/8) + + self.attention = nn.Sequential( + nn.Conv1d(num_filters[3] * outmap_size, 128, kernel_size=1), + nn.ReLU(), + nn.BatchNorm1d(128), + nn.Conv1d(128, num_filters[3] * outmap_size, kernel_size=1), + nn.Softmax(dim=2), + ) + + if self.encoder_type == "SAP": + out_dim = num_filters[3] * outmap_size + elif self.encoder_type == "ASP": + out_dim = num_filters[3] * outmap_size * 2 + else: + raise ValueError('Undefined encoder') + + self.fc = nn.Linear(out_dim, proj_dim) + + self._init_layers() + + def _init_layers(self): + for m in self.modules(): + if isinstance(m, nn.Conv2d): + nn.init.kaiming_normal_(m.weight, mode='fan_out', nonlinearity='relu') + elif isinstance(m, nn.BatchNorm2d): + nn.init.constant_(m.weight, 1) + nn.init.constant_(m.bias, 0) + + def create_layer(self, block, planes, blocks, stride=1): + downsample = None + if stride != 1 or self.inplanes != planes * block.expansion: + downsample = nn.Sequential( + nn.Conv2d(self.inplanes, planes * block.expansion, + kernel_size=1, stride=stride, bias=False), + nn.BatchNorm2d(planes * block.expansion), + ) + + layers = [] + layers.append(block(self.inplanes, planes, stride, downsample)) + self.inplanes = planes * block.expansion + for i in range(1, blocks): + layers.append(block(self.inplanes, planes)) + + return nn.Sequential(*layers) + + def new_parameter(self, *size): + out = nn.Parameter(torch.FloatTensor(*size)) + nn.init.xavier_normal_(out) + return out + + def forward(self, x): + x = x.transpose(1, 2) + with torch.no_grad(): + with torch.cuda.amp.autocast(enabled=False): + if self.log_input: x = (x+1e-6).log() + x = self.instancenorm(x).unsqueeze(1) + + x = self.conv1(x) + x = self.relu(x) + x = self.bn1(x) + + x = self.layer1(x) + x = self.layer2(x) + x = self.layer3(x) + x = self.layer4(x) + + x = x.reshape(x.size()[0],-1,x.size()[-1]) + + w = self.attention(x) + + if self.encoder_type == "SAP": + x = torch.sum(x * w, dim=2) + elif self.encoder_type == "ASP": + mu = torch.sum(x * w, dim=2) + sg = torch.sqrt((torch.sum((x**2) * w, dim=2) - mu ** 2 
).clamp(min=1e-5) ) + x = torch.cat((mu, sg),1) + + x = x.view(x.size()[0], -1) + x = self.fc(x) + + return x diff --git a/TTS/speaker_encoder/utils/generic_utils.py b/TTS/speaker_encoder/utils/generic_utils.py index d1dbf3ae..42ef3086 100644 --- a/TTS/speaker_encoder/utils/generic_utils.py +++ b/TTS/speaker_encoder/utils/generic_utils.py @@ -8,7 +8,8 @@ import glob import random from scipy import signal -from TTS.speaker_encoder.model import SpeakerEncoder +from TTS.speaker_encoder.models.lstm import LSTMSpeakerEncoder +from TTS.speaker_encoder.models.resnet import ResNetSpeakerEncoder from TTS.utils.generic_utils import check_argument class AugmentWAV(object): @@ -146,11 +147,14 @@ def to_camel(text): def setup_model(c): - model = SpeakerEncoder(c.model["input_dim"], c.model["proj_dim"], c.model["lstm_dim"], c.model["num_lstm_layers"]) + if c.model_name.lower() == 'lstm': + model = LSTMSpeakerEncoder(c.model["input_dim"], c.model["proj_dim"], c.model["lstm_dim"], c.model["num_lstm_layers"]) + elif c.model_name.lower() == 'resnet': + model = ResNetSpeakerEncoder(input_dim=c.model["input_dim"], proj_dim=c.model["proj_dim"]) return model -def save_checkpoint(model, optimizer, model_loss, out_path, current_step, epoch): +def save_checkpoint(model, optimizer, criterion, model_loss, out_path, current_step, epoch): checkpoint_path = "checkpoint_{}.pth.tar".format(current_step) checkpoint_path = os.path.join(out_path, checkpoint_path) print(" | | > Checkpoint saving : {}".format(checkpoint_path)) @@ -159,6 +163,7 @@ def save_checkpoint(model, optimizer, model_loss, out_path, current_step, epoch) state = { "model": new_state_dict, "optimizer": optimizer.state_dict() if optimizer is not None else None, + "criterion": criterion.state_dict(), "step": current_step, "epoch": epoch, "loss": model_loss, @@ -167,12 +172,13 @@ def save_checkpoint(model, optimizer, model_loss, out_path, current_step, epoch) torch.save(state, checkpoint_path) -def save_best_model(model, optimizer, model_loss, best_loss, out_path, current_step): +def save_best_model(model, optimizer, criterion, model_loss, best_loss, out_path, current_step): if model_loss < best_loss: new_state_dict = model.state_dict() state = { "model": new_state_dict, "optimizer": optimizer.state_dict(), + "criterion": criterion.state_dict(), "step": current_step, "loss": model_loss, "date": datetime.date.today().strftime("%B %d, %Y"), @@ -234,11 +240,13 @@ def check_config_speaker_encoder(c): # model parameters check_argument("model", c, restricted=True, val_type=dict) + check_argument("model_name", c, restricted=True, val_type=str) check_argument("input_dim", c["model"], restricted=True, val_type=int) - check_argument("proj_dim", c["model"], restricted=True, val_type=int) - check_argument("lstm_dim", c["model"], restricted=True, val_type=int) - check_argument("num_lstm_layers", c["model"], restricted=True, val_type=int) - check_argument("use_lstm_with_projection", c["model"], restricted=True, val_type=bool) + if c.model_name.lower() == 'lstm': + check_argument("proj_dim", c["model"], restricted=True, val_type=int) + check_argument("lstm_dim", c["model"], restricted=True, val_type=int) + check_argument("num_lstm_layers", c["model"], restricted=True, val_type=int) + check_argument("use_lstm_with_projection", c["model"], restricted=True, val_type=bool) # in-memory storage parameters check_argument("storage", c, restricted=True, val_type=dict) From 3433c2f3485a3cb7e4b03a178f303ae168b99fbd Mon Sep 17 00:00:00 2001 From: Edresson Date: Wed, 12 May 2021 
03:06:46 -0300 Subject: [PATCH 05/18] add compute embedding for the new speaker encoder --- TTS/bin/compute_embeddings.py | 4 ++-- TTS/speaker_encoder/models/resnet.py | 33 ++++++++++++++++++++++++++-- 2 files changed, 33 insertions(+), 4 deletions(-) diff --git a/TTS/bin/compute_embeddings.py b/TTS/bin/compute_embeddings.py index c38e0e7e..410086de 100644 --- a/TTS/bin/compute_embeddings.py +++ b/TTS/bin/compute_embeddings.py @@ -6,7 +6,7 @@ import numpy as np import torch from tqdm import tqdm -from TTS.speaker_encoder.model import SpeakerEncoder +from TTS.speaker_encoder.utils.generic_utils import setup_model from TTS.tts.datasets.preprocess import load_meta_data from TTS.tts.utils.speakers import save_speaker_mapping from TTS.utils.audio import AudioProcessor @@ -77,7 +77,7 @@ for output_file in output_files: os.makedirs(os.path.dirname(output_file), exist_ok=True) # define Encoder model -model = SpeakerEncoder(**c.model) +model = setup_model(c) model.load_state_dict(torch.load(args.model_path)["model"]) model.eval() if args.use_cuda: diff --git a/TTS/speaker_encoder/models/resnet.py b/TTS/speaker_encoder/models/resnet.py index b35a9c89..9b79b7a7 100644 --- a/TTS/speaker_encoder/models/resnet.py +++ b/TTS/speaker_encoder/models/resnet.py @@ -124,7 +124,7 @@ class ResNetSpeakerEncoder(nn.Module): nn.init.xavier_normal_(out) return out - def forward(self, x): + def forward(self, x, training=True): x = x.transpose(1, 2) with torch.no_grad(): with torch.cuda.amp.autocast(enabled=False): @@ -140,7 +140,7 @@ class ResNetSpeakerEncoder(nn.Module): x = self.layer3(x) x = self.layer4(x) - x = x.reshape(x.size()[0],-1,x.size()[-1]) + x = x.reshape(x.size()[0], -1, x.size()[-1]) w = self.attention(x) @@ -154,4 +154,33 @@ class ResNetSpeakerEncoder(nn.Module): x = x.view(x.size()[0], -1) x = self.fc(x) + if not training: + x = torch.nn.functional.normalize(x, p=2, dim=1) return x + + @torch.no_grad() + def compute_embedding(self, x, num_frames=250, overlap=0.5): + """ + Generate embeddings for a batch of utterances + x: 1xTxD + """ + num_overlap = int(num_frames * overlap) + max_len = x.shape[1] + embed = None + cur_iter = 0 + for offset in range(0, max_len, num_frames - num_overlap): + cur_iter += 1 + end_offset = min(x.shape[1], offset + num_frames) + + # ignore slices with two or less frames, because it's can break instance normalization + if end_offset-offset <= 1: + continue + + frames = x[:, offset:end_offset] + + if embed is None: + embed = self.forward(frames, training=False) + else: + embed += self.forward(frames, training=False) + + return embed / cur_iter From 856ea1975800aafb45cf24a20b4edf75d071f56d Mon Sep 17 00:00:00 2001 From: Edresson Date: Tue, 18 May 2021 03:43:16 -0300 Subject: [PATCH 06/18] bug fix in dataloader and update inference --- TTS/bin/train_encoder.py | 12 +- TTS/speaker_encoder/configs/config.json | 3 +- .../configs/config_resnet_commonvoice.json | 957 ++++++++++++++++++ ...on => config_resnet_voxcebel_trainer.json} | 7 +- .../configs/config_resnet_voxceleb1and2.json | 117 +++ TTS/speaker_encoder/configs/config_temp.json | 117 +++ TTS/speaker_encoder/dataset.py | 101 +- TTS/speaker_encoder/losses.py | 7 +- TTS/speaker_encoder/models/resnet.py | 33 +- TTS/speaker_encoder/utils/generic_utils.py | 46 +- 10 files changed, 1336 insertions(+), 64 deletions(-) create mode 100644 TTS/speaker_encoder/configs/config_resnet_commonvoice.json rename TTS/speaker_encoder/configs/{config_voxceleb_trainer.json => config_resnet_voxcebel_trainer.json} (96%) create mode 100644 
TTS/speaker_encoder/configs/config_resnet_voxceleb1and2.json create mode 100644 TTS/speaker_encoder/configs/config_temp.json diff --git a/TTS/bin/train_encoder.py b/TTS/bin/train_encoder.py index 264ac74f..a39bfccf 100644 --- a/TTS/bin/train_encoder.py +++ b/TTS/bin/train_encoder.py @@ -71,8 +71,10 @@ def train(model, optimizer, scheduler, criterion, data_loader, ap, global_step): epoch_time = 0 best_loss = float("inf") avg_loss = 0 + avg_loss_all = 0 avg_loader_time = 0 end_time = time.time() + for _, data in enumerate(data_loader): start_time = time.time() @@ -137,9 +139,13 @@ def train(model, optimizer, scheduler, criterion, data_loader, ap, global_step): ), flush=True, ) + + avg_loss_all += avg_loss - # save best model - best_loss = save_best_model(model, optimizer, criterion, avg_loss, best_loss, OUT_PATH, global_step) + if global_step % c.save_step == 0: + # save best model + best_loss = save_best_model(model, optimizer, criterion, avg_loss, best_loss, OUT_PATH, global_step) + avg_loss_all = 0 end_time = time.time() return avg_loss, global_step @@ -155,7 +161,7 @@ def main(args): # pylint: disable=redefined-outer-name optimizer = RAdam(model.parameters(), lr=c.lr) # pylint: disable=redefined-outer-name - meta_data_train, meta_data_eval = load_meta_data(c.datasets) + meta_data_train, meta_data_eval = load_meta_data(c.datasets, eval_split=False) data_loader, num_speakers = setup_loader(ap, is_val=False, verbose=True) diff --git a/TTS/speaker_encoder/configs/config.json b/TTS/speaker_encoder/configs/config.json index 84253b6e..30d83e51 100644 --- a/TTS/speaker_encoder/configs/config.json +++ b/TTS/speaker_encoder/configs/config.json @@ -25,7 +25,8 @@ "mel_fmin": 0.0, // minimum freq level for mel-spec. ~50 for male and ~95 for female voices. Tune for dataset!! "mel_fmax": 8000.0, // maximum freq level for mel-spec. Tune for dataset!! "do_trim_silence": true, // enable trimming of slience of audio as you load it. LJspeech (false), TWEB (false), Nancy (true) - "trim_db": 60 // threshold for timming silence. Set this according to your dataset. + "trim_db": 60, // threshold for timming silence. Set this according to your dataset. + "stats_path": null // DO NOT USE WITH MULTI_SPEAKER MODEL. scaler stats file computed by 'compute_statistics.py'. If it is defined, mean-std based notmalization is used and other normalization params are ignored }, "reinit_layers": [], "loss": "angleproto", // "ge2e" to use Generalized End-to-End loss and "angleproto" to use Angular Prototypical loss (new SOTA) diff --git a/TTS/speaker_encoder/configs/config_resnet_commonvoice.json b/TTS/speaker_encoder/configs/config_resnet_commonvoice.json new file mode 100644 index 00000000..b3223824 --- /dev/null +++ b/TTS/speaker_encoder/configs/config_resnet_commonvoice.json @@ -0,0 +1,957 @@ + +{ + "model_name": "resnet", + "run_name": "speaker_encoder", + "run_description": "resnet speaker encoder trained with commonvoice all languages dev and train, Voxceleb 1 dev and Voxceleb 2 dev", + // AUDIO PARAMETERS + "audio":{ + // Audio processing parameters + "num_mels": 80, // size of the mel spec frame. + "fft_size": 1024, // number of stft frequency levels. Size of the linear spectogram frame. + "sample_rate": 16000, // DATASET-RELATED: wav sample-rate. If different than the original data, it is resampled. + "win_length": 1024, // stft window length in ms. + "hop_length": 256, // stft window hop-lengh in ms. + "frame_length_ms": null, // stft window length in ms.If null, 'win_length' is used. 
+ "frame_shift_ms": null, // stft window hop-lengh in ms. If null, 'hop_length' is used. + "preemphasis": 0.98, // pre-emphasis to reduce spec noise and make it more structured. If 0.0, no -pre-emphasis. + "min_level_db": -100, // normalization range + "ref_level_db": 20, // reference level db, theoretically 20db is the sound of air. + "power": 1.5, // value to sharpen wav signals after GL algorithm. + "griffin_lim_iters": 60,// #griffin-lim iterations. 30-60 is a good range. Larger the value, slower the generation. + "stft_pad_mode": "reflect", + // Normalization parameters + "signal_norm": true, // normalize the spec values in range [0, 1] + "symmetric_norm": true, // move normalization to range [-1, 1] + "max_norm": 4.0, // scale normalization to range [-max_norm, max_norm] or [0, max_norm] + "clip_norm": true, // clip normalized values into the range. + "mel_fmin": 0.0, // minimum freq level for mel-spec. ~50 for male and ~95 for female voices. Tune for dataset!! + "mel_fmax": 8000.0, // maximum freq level for mel-spec. Tune for dataset!! + "spec_gain": 20.0, + "do_trim_silence": false, // enable trimming of slience of audio as you load it. LJspeech (false), TWEB (false), Nancy (true) + "trim_db": 60, // threshold for timming silence. Set this according to your dataset. + "stats_path": null // DO NOT USE WITH MULTI_SPEAKER MODEL. scaler stats file computed by 'compute_statistics.py'. If it is defined, mean-std based notmalization is used and other normalization params are ignored + }, + "reinit_layers": [], + + "loss": "softmaxproto", // "ge2e" to use Generalized End-to-End loss, "angleproto" to use Angular Prototypical loss and "softmaxproto" to use Softmax with Angular Prototypical loss + "grad_clip": 3.0, // upper limit for gradients for clipping. + "epochs": 1000, // total number of epochs to train. + "lr": 0.0001, // Initial learning rate. If Noam decay is active, maximum learning rate. + "lr_decay": false, // if true, Noam learning rate decaying is applied through training. + "warmup_steps": 4000, // Noam decay steps to increase the learning rate from 0 to "lr" + "tb_model_param_stats": false, // true, plots param stats per layer on tensorboard. Might be memory consuming, but good for debugging. + "steps_plot_stats": 100, // number of steps to plot embeddings. + + // Speakers config + "num_speakers_in_batch": 200, // Batch size for training. + "num_utters_per_speaker": 2, // + "skip_speakers": true, // skip speakers with samples less than "num_utters_per_speaker" + + "voice_len": 2, // number of seconds for each training instance + + "num_loader_workers": 8, // number of training data loader processes. Don't set it too big. 4-8 are good values. + "wd": 0.000001, // Weight decay weight. + "checkpoint": true, // If true, it saves checkpoints per "save_step" + "save_step": 1000, // Number of training steps expected to save the best checkpoints in training. + "print_step": 50, // Number of steps to log traning on console. + "output_path": "../../../checkpoints/speaker_encoder/resnet_voxceleb1_and_voxceleb2-and-common-voice-all-continue/", // DATASET-RELATED: output path for all training outputs. 
+ + "audio_augmentation": { + "p": 0.5, // propability of apply this method, 0 is disable rir and additive noise augmentation + "rir":{ + "rir_path": "/workspace/store/ecasanova/ComParE/RIRS_NOISES/simulated_rirs/", + "conv_mode": "full" + }, + "additive":{ + "sounds_path": "/workspace/store/ecasanova/ComParE/musan/", + // list of each of the directories in your data augmentation, if a directory is in "sounds_path" but is not listed here it will be ignored + "speech":{ + "min_snr_in_db": 13, + "max_snr_in_db": 20, + "min_num_noises": 2, + "max_num_noises": 3 + }, + "noise":{ + "min_snr_in_db": 0, + "max_snr_in_db": 15, + "min_num_noises": 1, + "max_num_noises": 1 + }, + "music":{ + "min_snr_in_db": 5, + "max_snr_in_db": 15, + "min_num_noises": 1, + "max_num_noises": 1 + } + }, + //add a gaussian noise to the data in order to increase robustness + "gaussian":{ // as the insertion of Gaussian noise is quick to be calculated, we added it after loading the wav file, this way, even audios that were reused with the cache can receive this noise + "p": 0.5, // propability of apply this method, 0 is disable + "min_amplitude": 0.0, + "max_amplitude": 1e-5 + } + }, + "model": { + "input_dim": 80, + "proj_dim": 512 + }, + "storage": { + "sample_from_storage_p": 0.66, // the probability with which we'll sample from the DataSet in-memory storage + "storage_size": 35 // the size of the in-memory storage with respect to a single batch + }, + "datasets": + [ + { + "name": "voxceleb2", + "path": "/workspace/scratch/ecasanova/datasets/VoxCeleb/vox2_dev_aac/", + "meta_file_train": null, + "meta_file_val": null + }, + { + "name": "voxceleb1", + "path": "/workspace/scratch/ecasanova/datasets/VoxCeleb/vox1_dev_wav/", + "meta_file_train": null, + "meta_file_val": null + }, + { + "name": "common_voice", + "path": "/workspace/scratch/ecasanova/datasets/common-voice/cv-corpus-6.1-2020-12-11_16khz/fi", + "meta_file_train": "train.tsv", + "meta_file_val": null + }, + + { + "name": "common_voice", + "path": "/workspace/scratch/ecasanova/datasets/common-voice/cv-corpus-6.1-2020-12-11_16khz/fi", + "meta_file_train": "dev.tsv", + "meta_file_val": null + }, + + { + "name": "common_voice", + "path": "/workspace/scratch/ecasanova/datasets/common-voice/cv-corpus-6.1-2020-12-11_16khz/zh-CN", + "meta_file_train": "train.tsv", + "meta_file_val": null + }, + + { + "name": "common_voice", + "path": "/workspace/scratch/ecasanova/datasets/common-voice/cv-corpus-6.1-2020-12-11_16khz/zh-CN", + "meta_file_train": "dev.tsv", + "meta_file_val": null + }, + + { + "name": "common_voice", + "path": "/workspace/scratch/ecasanova/datasets/common-voice/cv-corpus-6.1-2020-12-11_16khz/rm-sursilv", + "meta_file_train": "train.tsv", + "meta_file_val": null + }, + + { + "name": "common_voice", + "path": "/workspace/scratch/ecasanova/datasets/common-voice/cv-corpus-6.1-2020-12-11_16khz/rm-sursilv", + "meta_file_train": "dev.tsv", + "meta_file_val": null + }, + + { + "name": "common_voice", + "path": "/workspace/scratch/ecasanova/datasets/common-voice/cv-corpus-6.1-2020-12-11_16khz/lt", + "meta_file_train": "train.tsv", + "meta_file_val": null + }, + + { + "name": "common_voice", + "path": "/workspace/scratch/ecasanova/datasets/common-voice/cv-corpus-6.1-2020-12-11_16khz/lt", + "meta_file_train": "dev.tsv", + "meta_file_val": null + }, + + { + "name": "common_voice", + "path": "/workspace/scratch/ecasanova/datasets/common-voice/cv-corpus-6.1-2020-12-11_16khz/ka", + "meta_file_train": "train.tsv", + "meta_file_val": null + }, + + { + "name": 
"common_voice", + "path": "/workspace/scratch/ecasanova/datasets/common-voice/cv-corpus-6.1-2020-12-11_16khz/ka", + "meta_file_train": "dev.tsv", + "meta_file_val": null + }, + + { + "name": "common_voice", + "path": "/workspace/scratch/ecasanova/datasets/common-voice/cv-corpus-6.1-2020-12-11_16khz/sv-SE", + "meta_file_train": "train.tsv", + "meta_file_val": null + }, + + { + "name": "common_voice", + "path": "/workspace/scratch/ecasanova/datasets/common-voice/cv-corpus-6.1-2020-12-11_16khz/sv-SE", + "meta_file_train": "dev.tsv", + "meta_file_val": null + }, + + { + "name": "common_voice", + "path": "/workspace/scratch/ecasanova/datasets/common-voice/cv-corpus-6.1-2020-12-11_16khz/pl", + "meta_file_train": "train.tsv", + "meta_file_val": null + }, + + { + "name": "common_voice", + "path": "/workspace/scratch/ecasanova/datasets/common-voice/cv-corpus-6.1-2020-12-11_16khz/pl", + "meta_file_train": "dev.tsv", + "meta_file_val": null + }, + + { + "name": "common_voice", + "path": "/workspace/scratch/ecasanova/datasets/common-voice/cv-corpus-6.1-2020-12-11_16khz/ru", + "meta_file_train": "train.tsv", + "meta_file_val": null + }, + + { + "name": "common_voice", + "path": "/workspace/scratch/ecasanova/datasets/common-voice/cv-corpus-6.1-2020-12-11_16khz/ru", + "meta_file_train": "dev.tsv", + "meta_file_val": null + }, + + { + "name": "common_voice", + "path": "/workspace/scratch/ecasanova/datasets/common-voice/cv-corpus-6.1-2020-12-11_16khz/mn", + "meta_file_train": "train.tsv", + "meta_file_val": null + }, + + { + "name": "common_voice", + "path": "/workspace/scratch/ecasanova/datasets/common-voice/cv-corpus-6.1-2020-12-11_16khz/mn", + "meta_file_train": "dev.tsv", + "meta_file_val": null + }, + + { + "name": "common_voice", + "path": "/workspace/scratch/ecasanova/datasets/common-voice/cv-corpus-6.1-2020-12-11_16khz/nl", + "meta_file_train": "train.tsv", + "meta_file_val": null + }, + + { + "name": "common_voice", + "path": "/workspace/scratch/ecasanova/datasets/common-voice/cv-corpus-6.1-2020-12-11_16khz/nl", + "meta_file_train": "dev.tsv", + "meta_file_val": null + }, + + { + "name": "common_voice", + "path": "/workspace/scratch/ecasanova/datasets/common-voice/cv-corpus-6.1-2020-12-11_16khz/sl", + "meta_file_train": "train.tsv", + "meta_file_val": null + }, + + { + "name": "common_voice", + "path": "/workspace/scratch/ecasanova/datasets/common-voice/cv-corpus-6.1-2020-12-11_16khz/sl", + "meta_file_train": "dev.tsv", + "meta_file_val": null + }, + + { + "name": "common_voice", + "path": "/workspace/scratch/ecasanova/datasets/common-voice/cv-corpus-6.1-2020-12-11_16khz/es", + "meta_file_train": "train.tsv", + "meta_file_val": null + }, + + { + "name": "common_voice", + "path": "/workspace/scratch/ecasanova/datasets/common-voice/cv-corpus-6.1-2020-12-11_16khz/es", + "meta_file_train": "dev.tsv", + "meta_file_val": null + }, + + { + "name": "common_voice", + "path": "/workspace/scratch/ecasanova/datasets/common-voice/cv-corpus-6.1-2020-12-11_16khz/pt", + "meta_file_train": "train.tsv", + "meta_file_val": null + }, + + { + "name": "common_voice", + "path": "/workspace/scratch/ecasanova/datasets/common-voice/cv-corpus-6.1-2020-12-11_16khz/pt", + "meta_file_train": "dev.tsv", + "meta_file_val": null + }, + + { + "name": "common_voice", + "path": "/workspace/scratch/ecasanova/datasets/common-voice/cv-corpus-6.1-2020-12-11_16khz/hi", + "meta_file_train": "train.tsv", + "meta_file_val": null + }, + + { + "name": "common_voice", + "path": 
"/workspace/scratch/ecasanova/datasets/common-voice/cv-corpus-6.1-2020-12-11_16khz/hi", + "meta_file_train": "dev.tsv", + "meta_file_val": null + }, + + { + "name": "common_voice", + "path": "/workspace/scratch/ecasanova/datasets/common-voice/cv-corpus-6.1-2020-12-11_16khz/ja", + "meta_file_train": "train.tsv", + "meta_file_val": null + }, + + { + "name": "common_voice", + "path": "/workspace/scratch/ecasanova/datasets/common-voice/cv-corpus-6.1-2020-12-11_16khz/ja", + "meta_file_train": "dev.tsv", + "meta_file_val": null + }, + + { + "name": "common_voice", + "path": "/workspace/scratch/ecasanova/datasets/common-voice/cv-corpus-6.1-2020-12-11_16khz/ia", + "meta_file_train": "train.tsv", + "meta_file_val": null + }, + + { + "name": "common_voice", + "path": "/workspace/scratch/ecasanova/datasets/common-voice/cv-corpus-6.1-2020-12-11_16khz/ia", + "meta_file_train": "dev.tsv", + "meta_file_val": null + }, + + { + "name": "common_voice", + "path": "/workspace/scratch/ecasanova/datasets/common-voice/cv-corpus-6.1-2020-12-11_16khz/br", + "meta_file_train": "train.tsv", + "meta_file_val": null + }, + + { + "name": "common_voice", + "path": "/workspace/scratch/ecasanova/datasets/common-voice/cv-corpus-6.1-2020-12-11_16khz/br", + "meta_file_train": "dev.tsv", + "meta_file_val": null + }, + + { + "name": "common_voice", + "path": "/workspace/scratch/ecasanova/datasets/common-voice/cv-corpus-6.1-2020-12-11_16khz/id", + "meta_file_train": "train.tsv", + "meta_file_val": null + }, + + { + "name": "common_voice", + "path": "/workspace/scratch/ecasanova/datasets/common-voice/cv-corpus-6.1-2020-12-11_16khz/id", + "meta_file_train": "dev.tsv", + "meta_file_val": null + }, + + { + "name": "common_voice", + "path": "/workspace/scratch/ecasanova/datasets/common-voice/cv-corpus-6.1-2020-12-11_16khz/dv", + "meta_file_train": "train.tsv", + "meta_file_val": null + }, + + { + "name": "common_voice", + "path": "/workspace/scratch/ecasanova/datasets/common-voice/cv-corpus-6.1-2020-12-11_16khz/dv", + "meta_file_train": "dev.tsv", + "meta_file_val": null + }, + + { + "name": "common_voice", + "path": "/workspace/scratch/ecasanova/datasets/common-voice/cv-corpus-6.1-2020-12-11_16khz/ta", + "meta_file_train": "train.tsv", + "meta_file_val": null + }, + + { + "name": "common_voice", + "path": "/workspace/scratch/ecasanova/datasets/common-voice/cv-corpus-6.1-2020-12-11_16khz/ta", + "meta_file_train": "dev.tsv", + "meta_file_val": null + }, + + { + "name": "common_voice", + "path": "/workspace/scratch/ecasanova/datasets/common-voice/cv-corpus-6.1-2020-12-11_16khz/or", + "meta_file_train": "train.tsv", + "meta_file_val": null + }, + + { + "name": "common_voice", + "path": "/workspace/scratch/ecasanova/datasets/common-voice/cv-corpus-6.1-2020-12-11_16khz/or", + "meta_file_train": "dev.tsv", + "meta_file_val": null + }, + + { + "name": "common_voice", + "path": "/workspace/scratch/ecasanova/datasets/common-voice/cv-corpus-6.1-2020-12-11_16khz/zh-HK", + "meta_file_train": "train.tsv", + "meta_file_val": null + }, + + { + "name": "common_voice", + "path": "/workspace/scratch/ecasanova/datasets/common-voice/cv-corpus-6.1-2020-12-11_16khz/zh-HK", + "meta_file_train": "dev.tsv", + "meta_file_val": null + }, + + { + "name": "common_voice", + "path": "/workspace/scratch/ecasanova/datasets/common-voice/cv-corpus-6.1-2020-12-11_16khz/de", + "meta_file_train": "train.tsv", + "meta_file_val": null + }, + + { + "name": "common_voice", + "path": "/workspace/scratch/ecasanova/datasets/common-voice/cv-corpus-6.1-2020-12-11_16khz/de", + 
"meta_file_train": "dev.tsv", + "meta_file_val": null + }, + + { + "name": "common_voice", + "path": "/workspace/scratch/ecasanova/datasets/common-voice/cv-corpus-6.1-2020-12-11_16khz/uk", + "meta_file_train": "train.tsv", + "meta_file_val": null + }, + + { + "name": "common_voice", + "path": "/workspace/scratch/ecasanova/datasets/common-voice/cv-corpus-6.1-2020-12-11_16khz/uk", + "meta_file_train": "dev.tsv", + "meta_file_val": null + }, + + { + "name": "common_voice", + "path": "/workspace/scratch/ecasanova/datasets/common-voice/cv-corpus-6.1-2020-12-11_16khz/en", + "meta_file_train": "train.tsv", + "meta_file_val": null + }, + + { + "name": "common_voice", + "path": "/workspace/scratch/ecasanova/datasets/common-voice/cv-corpus-6.1-2020-12-11_16khz/en", + "meta_file_train": "dev.tsv", + "meta_file_val": null + }, + + { + "name": "common_voice", + "path": "/workspace/scratch/ecasanova/datasets/common-voice/cv-corpus-6.1-2020-12-11_16khz/fa", + "meta_file_train": "train.tsv", + "meta_file_val": null + }, + + { + "name": "common_voice", + "path": "/workspace/scratch/ecasanova/datasets/common-voice/cv-corpus-6.1-2020-12-11_16khz/fa", + "meta_file_train": "dev.tsv", + "meta_file_val": null + }, + + { + "name": "common_voice", + "path": "/workspace/scratch/ecasanova/datasets/common-voice/cv-corpus-6.1-2020-12-11_16khz/vi", + "meta_file_train": "train.tsv", + "meta_file_val": null + }, + + { + "name": "common_voice", + "path": "/workspace/scratch/ecasanova/datasets/common-voice/cv-corpus-6.1-2020-12-11_16khz/vi", + "meta_file_train": "dev.tsv", + "meta_file_val": null + }, + + { + "name": "common_voice", + "path": "/workspace/scratch/ecasanova/datasets/common-voice/cv-corpus-6.1-2020-12-11_16khz/ab", + "meta_file_train": "train.tsv", + "meta_file_val": null + }, + + { + "name": "common_voice", + "path": "/workspace/scratch/ecasanova/datasets/common-voice/cv-corpus-6.1-2020-12-11_16khz/ab", + "meta_file_train": "dev.tsv", + "meta_file_val": null + }, + + { + "name": "common_voice", + "path": "/workspace/scratch/ecasanova/datasets/common-voice/cv-corpus-6.1-2020-12-11_16khz/sah", + "meta_file_train": "train.tsv", + "meta_file_val": null + }, + + { + "name": "common_voice", + "path": "/workspace/scratch/ecasanova/datasets/common-voice/cv-corpus-6.1-2020-12-11_16khz/sah", + "meta_file_train": "dev.tsv", + "meta_file_val": null + }, + + { + "name": "common_voice", + "path": "/workspace/scratch/ecasanova/datasets/common-voice/cv-corpus-6.1-2020-12-11_16khz/vot", + "meta_file_train": "train.tsv", + "meta_file_val": null + }, + + { + "name": "common_voice", + "path": "/workspace/scratch/ecasanova/datasets/common-voice/cv-corpus-6.1-2020-12-11_16khz/vot", + "meta_file_train": "dev.tsv", + "meta_file_val": null + }, + + { + "name": "common_voice", + "path": "/workspace/scratch/ecasanova/datasets/common-voice/cv-corpus-6.1-2020-12-11_16khz/fr", + "meta_file_train": "train.tsv", + "meta_file_val": null + }, + + { + "name": "common_voice", + "path": "/workspace/scratch/ecasanova/datasets/common-voice/cv-corpus-6.1-2020-12-11_16khz/fr", + "meta_file_train": "dev.tsv", + "meta_file_val": null + }, + + { + "name": "common_voice", + "path": "/workspace/scratch/ecasanova/datasets/common-voice/cv-corpus-6.1-2020-12-11_16khz/tr", + "meta_file_train": "train.tsv", + "meta_file_val": null + }, + + { + "name": "common_voice", + "path": "/workspace/scratch/ecasanova/datasets/common-voice/cv-corpus-6.1-2020-12-11_16khz/tr", + "meta_file_train": "dev.tsv", + "meta_file_val": null + }, + + { + "name": "common_voice", + 
"path": "/workspace/scratch/ecasanova/datasets/common-voice/cv-corpus-6.1-2020-12-11_16khz/lg", + "meta_file_train": "train.tsv", + "meta_file_val": null + }, + + { + "name": "common_voice", + "path": "/workspace/scratch/ecasanova/datasets/common-voice/cv-corpus-6.1-2020-12-11_16khz/lg", + "meta_file_train": "dev.tsv", + "meta_file_val": null + }, + + { + "name": "common_voice", + "path": "/workspace/scratch/ecasanova/datasets/common-voice/cv-corpus-6.1-2020-12-11_16khz/mt", + "meta_file_train": "train.tsv", + "meta_file_val": null + }, + + { + "name": "common_voice", + "path": "/workspace/scratch/ecasanova/datasets/common-voice/cv-corpus-6.1-2020-12-11_16khz/mt", + "meta_file_train": "dev.tsv", + "meta_file_val": null + }, + + { + "name": "common_voice", + "path": "/workspace/scratch/ecasanova/datasets/common-voice/cv-corpus-6.1-2020-12-11_16khz/rw", + "meta_file_train": "train.tsv", + "meta_file_val": null + }, + + { + "name": "common_voice", + "path": "/workspace/scratch/ecasanova/datasets/common-voice/cv-corpus-6.1-2020-12-11_16khz/rw", + "meta_file_train": "dev.tsv", + "meta_file_val": null + }, + + { + "name": "common_voice", + "path": "/workspace/scratch/ecasanova/datasets/common-voice/cv-corpus-6.1-2020-12-11_16khz/hu", + "meta_file_train": "train.tsv", + "meta_file_val": null + }, + + { + "name": "common_voice", + "path": "/workspace/scratch/ecasanova/datasets/common-voice/cv-corpus-6.1-2020-12-11_16khz/hu", + "meta_file_train": "dev.tsv", + "meta_file_val": null + }, + + { + "name": "common_voice", + "path": "/workspace/scratch/ecasanova/datasets/common-voice/cv-corpus-6.1-2020-12-11_16khz/rm-vallader", + "meta_file_train": "train.tsv", + "meta_file_val": null + }, + + { + "name": "common_voice", + "path": "/workspace/scratch/ecasanova/datasets/common-voice/cv-corpus-6.1-2020-12-11_16khz/rm-vallader", + "meta_file_train": "dev.tsv", + "meta_file_val": null + }, + + { + "name": "common_voice", + "path": "/workspace/scratch/ecasanova/datasets/common-voice/cv-corpus-6.1-2020-12-11_16khz/el", + "meta_file_train": "train.tsv", + "meta_file_val": null + }, + + { + "name": "common_voice", + "path": "/workspace/scratch/ecasanova/datasets/common-voice/cv-corpus-6.1-2020-12-11_16khz/el", + "meta_file_train": "dev.tsv", + "meta_file_val": null + }, + + { + "name": "common_voice", + "path": "/workspace/scratch/ecasanova/datasets/common-voice/cv-corpus-6.1-2020-12-11_16khz/tt", + "meta_file_train": "train.tsv", + "meta_file_val": null + }, + + { + "name": "common_voice", + "path": "/workspace/scratch/ecasanova/datasets/common-voice/cv-corpus-6.1-2020-12-11_16khz/tt", + "meta_file_train": "dev.tsv", + "meta_file_val": null + }, + + { + "name": "common_voice", + "path": "/workspace/scratch/ecasanova/datasets/common-voice/cv-corpus-6.1-2020-12-11_16khz/zh-TW", + "meta_file_train": "train.tsv", + "meta_file_val": null + }, + + { + "name": "common_voice", + "path": "/workspace/scratch/ecasanova/datasets/common-voice/cv-corpus-6.1-2020-12-11_16khz/zh-TW", + "meta_file_train": "dev.tsv", + "meta_file_val": null + }, + + { + "name": "common_voice", + "path": "/workspace/scratch/ecasanova/datasets/common-voice/cv-corpus-6.1-2020-12-11_16khz/et", + "meta_file_train": "train.tsv", + "meta_file_val": null + }, + + { + "name": "common_voice", + "path": "/workspace/scratch/ecasanova/datasets/common-voice/cv-corpus-6.1-2020-12-11_16khz/et", + "meta_file_train": "dev.tsv", + "meta_file_val": null + }, + + { + "name": "common_voice", + "path": 
"/workspace/scratch/ecasanova/datasets/common-voice/cv-corpus-6.1-2020-12-11_16khz/fy-NL", + "meta_file_train": "train.tsv", + "meta_file_val": null + }, + + { + "name": "common_voice", + "path": "/workspace/scratch/ecasanova/datasets/common-voice/cv-corpus-6.1-2020-12-11_16khz/fy-NL", + "meta_file_train": "dev.tsv", + "meta_file_val": null + }, + + { + "name": "common_voice", + "path": "/workspace/scratch/ecasanova/datasets/common-voice/cv-corpus-6.1-2020-12-11_16khz/cs", + "meta_file_train": "train.tsv", + "meta_file_val": null + }, + + { + "name": "common_voice", + "path": "/workspace/scratch/ecasanova/datasets/common-voice/cv-corpus-6.1-2020-12-11_16khz/cs", + "meta_file_train": "dev.tsv", + "meta_file_val": null + }, + + { + "name": "common_voice", + "path": "/workspace/scratch/ecasanova/datasets/common-voice/cv-corpus-6.1-2020-12-11_16khz/as", + "meta_file_train": "train.tsv", + "meta_file_val": null + }, + + { + "name": "common_voice", + "path": "/workspace/scratch/ecasanova/datasets/common-voice/cv-corpus-6.1-2020-12-11_16khz/as", + "meta_file_train": "dev.tsv", + "meta_file_val": null + }, + + { + "name": "common_voice", + "path": "/workspace/scratch/ecasanova/datasets/common-voice/cv-corpus-6.1-2020-12-11_16khz/ro", + "meta_file_train": "train.tsv", + "meta_file_val": null + }, + + { + "name": "common_voice", + "path": "/workspace/scratch/ecasanova/datasets/common-voice/cv-corpus-6.1-2020-12-11_16khz/ro", + "meta_file_train": "dev.tsv", + "meta_file_val": null + }, + + { + "name": "common_voice", + "path": "/workspace/scratch/ecasanova/datasets/common-voice/cv-corpus-6.1-2020-12-11_16khz/eo", + "meta_file_train": "train.tsv", + "meta_file_val": null + }, + + { + "name": "common_voice", + "path": "/workspace/scratch/ecasanova/datasets/common-voice/cv-corpus-6.1-2020-12-11_16khz/eo", + "meta_file_train": "dev.tsv", + "meta_file_val": null + }, + + { + "name": "common_voice", + "path": "/workspace/scratch/ecasanova/datasets/common-voice/cv-corpus-6.1-2020-12-11_16khz/pa-IN", + "meta_file_train": "train.tsv", + "meta_file_val": null + }, + + { + "name": "common_voice", + "path": "/workspace/scratch/ecasanova/datasets/common-voice/cv-corpus-6.1-2020-12-11_16khz/pa-IN", + "meta_file_train": "dev.tsv", + "meta_file_val": null + }, + + { + "name": "common_voice", + "path": "/workspace/scratch/ecasanova/datasets/common-voice/cv-corpus-6.1-2020-12-11_16khz/th", + "meta_file_train": "train.tsv", + "meta_file_val": null + }, + + { + "name": "common_voice", + "path": "/workspace/scratch/ecasanova/datasets/common-voice/cv-corpus-6.1-2020-12-11_16khz/th", + "meta_file_train": "dev.tsv", + "meta_file_val": null + }, + + { + "name": "common_voice", + "path": "/workspace/scratch/ecasanova/datasets/common-voice/cv-corpus-6.1-2020-12-11_16khz/it", + "meta_file_train": "train.tsv", + "meta_file_val": null + }, + + { + "name": "common_voice", + "path": "/workspace/scratch/ecasanova/datasets/common-voice/cv-corpus-6.1-2020-12-11_16khz/it", + "meta_file_train": "dev.tsv", + "meta_file_val": null + }, + + { + "name": "common_voice", + "path": "/workspace/scratch/ecasanova/datasets/common-voice/cv-corpus-6.1-2020-12-11_16khz/ga-IE", + "meta_file_train": "train.tsv", + "meta_file_val": null + }, + + { + "name": "common_voice", + "path": "/workspace/scratch/ecasanova/datasets/common-voice/cv-corpus-6.1-2020-12-11_16khz/ga-IE", + "meta_file_train": "dev.tsv", + "meta_file_val": null + }, + + { + "name": "common_voice", + "path": 
"/workspace/scratch/ecasanova/datasets/common-voice/cv-corpus-6.1-2020-12-11_16khz/cnh", + "meta_file_train": "train.tsv", + "meta_file_val": null + }, + + { + "name": "common_voice", + "path": "/workspace/scratch/ecasanova/datasets/common-voice/cv-corpus-6.1-2020-12-11_16khz/cnh", + "meta_file_train": "dev.tsv", + "meta_file_val": null + }, + + { + "name": "common_voice", + "path": "/workspace/scratch/ecasanova/datasets/common-voice/cv-corpus-6.1-2020-12-11_16khz/ky", + "meta_file_train": "train.tsv", + "meta_file_val": null + }, + + { + "name": "common_voice", + "path": "/workspace/scratch/ecasanova/datasets/common-voice/cv-corpus-6.1-2020-12-11_16khz/ky", + "meta_file_train": "dev.tsv", + "meta_file_val": null + }, + + { + "name": "common_voice", + "path": "/workspace/scratch/ecasanova/datasets/common-voice/cv-corpus-6.1-2020-12-11_16khz/ar", + "meta_file_train": "train.tsv", + "meta_file_val": null + }, + + { + "name": "common_voice", + "path": "/workspace/scratch/ecasanova/datasets/common-voice/cv-corpus-6.1-2020-12-11_16khz/ar", + "meta_file_train": "dev.tsv", + "meta_file_val": null + }, + + { + "name": "common_voice", + "path": "/workspace/scratch/ecasanova/datasets/common-voice/cv-corpus-6.1-2020-12-11_16khz/eu", + "meta_file_train": "train.tsv", + "meta_file_val": null + }, + + { + "name": "common_voice", + "path": "/workspace/scratch/ecasanova/datasets/common-voice/cv-corpus-6.1-2020-12-11_16khz/eu", + "meta_file_train": "dev.tsv", + "meta_file_val": null + }, + + { + "name": "common_voice", + "path": "/workspace/scratch/ecasanova/datasets/common-voice/cv-corpus-6.1-2020-12-11_16khz/ca", + "meta_file_train": "train.tsv", + "meta_file_val": null + }, + + { + "name": "common_voice", + "path": "/workspace/scratch/ecasanova/datasets/common-voice/cv-corpus-6.1-2020-12-11_16khz/ca", + "meta_file_train": "dev.tsv", + "meta_file_val": null + }, + + { + "name": "common_voice", + "path": "/workspace/scratch/ecasanova/datasets/common-voice/cv-corpus-6.1-2020-12-11_16khz/kab", + "meta_file_train": "train.tsv", + "meta_file_val": null + }, + + { + "name": "common_voice", + "path": "/workspace/scratch/ecasanova/datasets/common-voice/cv-corpus-6.1-2020-12-11_16khz/kab", + "meta_file_train": "dev.tsv", + "meta_file_val": null + }, + + { + "name": "common_voice", + "path": "/workspace/scratch/ecasanova/datasets/common-voice/cv-corpus-6.1-2020-12-11_16khz/cy", + "meta_file_train": "train.tsv", + "meta_file_val": null + }, + + { + "name": "common_voice", + "path": "/workspace/scratch/ecasanova/datasets/common-voice/cv-corpus-6.1-2020-12-11_16khz/cy", + "meta_file_train": "dev.tsv", + "meta_file_val": null + }, + + { + "name": "common_voice", + "path": "/workspace/scratch/ecasanova/datasets/common-voice/cv-corpus-6.1-2020-12-11_16khz/cv", + "meta_file_train": "train.tsv", + "meta_file_val": null + }, + + { + "name": "common_voice", + "path": "/workspace/scratch/ecasanova/datasets/common-voice/cv-corpus-6.1-2020-12-11_16khz/cv", + "meta_file_train": "dev.tsv", + "meta_file_val": null + }, + + { + "name": "common_voice", + "path": "/workspace/scratch/ecasanova/datasets/common-voice/cv-corpus-6.1-2020-12-11_16khz/hsb", + "meta_file_train": "train.tsv", + "meta_file_val": null + }, + + { + "name": "common_voice", + "path": "/workspace/scratch/ecasanova/datasets/common-voice/cv-corpus-6.1-2020-12-11_16khz/hsb", + "meta_file_train": "dev.tsv", + "meta_file_val": null + }, + + { + "name": "common_voice", + "path": "/workspace/scratch/ecasanova/datasets/common-voice/cv-corpus-6.1-2020-12-11_16khz/lv", + 
"meta_file_train": "train.tsv", + "meta_file_val": null + }, + + { + "name": "common_voice", + "path": "/workspace/scratch/ecasanova/datasets/common-voice/cv-corpus-6.1-2020-12-11_16khz/lv", + "meta_file_train": "dev.tsv", + "meta_file_val": null + } + + ] +} \ No newline at end of file diff --git a/TTS/speaker_encoder/configs/config_voxceleb_trainer.json b/TTS/speaker_encoder/configs/config_resnet_voxcebel_trainer.json similarity index 96% rename from TTS/speaker_encoder/configs/config_voxceleb_trainer.json rename to TTS/speaker_encoder/configs/config_resnet_voxcebel_trainer.json index e5e487f0..bf9d1c4c 100644 --- a/TTS/speaker_encoder/configs/config_voxceleb_trainer.json +++ b/TTS/speaker_encoder/configs/config_resnet_voxcebel_trainer.json @@ -1,8 +1,7 @@ - { "model_name": "resnet", "run_name": "speaker_encoder", - "run_description": "train speaker encoder with VCTK", + "run_description": "train speaker encoder with VoxCeleb", "audio":{ // Audio processing parameters "num_mels": 64, // size of the mel spec frame. @@ -51,7 +50,7 @@ "num_loader_workers": 8, // number of training data loader processes. Don't set it too big. 4-8 are good values. "wd": 0.000001, // Weight decay weight. "checkpoint": true, // If true, it saves checkpoints per "save_step" - "save_step": 1000, // Number of training steps expected to save traning stats and checkpoints. + "save_step": 2000, // Number of training steps expected to save traning stats and checkpoints. "print_step": 20, // Number of steps to log traning on console. "output_path": "../../../checkpoints/speaker_encoder/continue-training-voxceleb-trainer/", // DATASET-RELATED: output path for all training outputs. @@ -96,7 +95,7 @@ }, "storage": { "sample_from_storage_p": 0.66, // the probability with which we'll sample from the DataSet in-memory storage - "storage_size": 1 // the size of the in-memory storage with respect to a single batch + "storage_size": 25 // the size of the in-memory storage with respect to a single batch }, "datasets": [ diff --git a/TTS/speaker_encoder/configs/config_resnet_voxceleb1and2.json b/TTS/speaker_encoder/configs/config_resnet_voxceleb1and2.json new file mode 100644 index 00000000..3fafd165 --- /dev/null +++ b/TTS/speaker_encoder/configs/config_resnet_voxceleb1and2.json @@ -0,0 +1,117 @@ + +{ + "model_name": "resnet", + "run_name": "speaker_encoder", + "run_description": "resnet speaker encoder trained with commonvoice all languages dev and train, Voxceleb dev and Voxceleb 2 dev", + // AUDIO PARAMETERS + "audio":{ + // Audio processing parameters + "num_mels": 80, // size of the mel spec frame. + "fft_size": 1024, // number of stft frequency levels. Size of the linear spectogram frame. + "sample_rate": 16000, // DATASET-RELATED: wav sample-rate. If different than the original data, it is resampled. + "win_length": 1024, // stft window length in ms. + "hop_length": 256, // stft window hop-lengh in ms. + "frame_length_ms": null, // stft window length in ms.If null, 'win_length' is used. + "frame_shift_ms": null, // stft window hop-lengh in ms. If null, 'hop_length' is used. + "preemphasis": 0.98, // pre-emphasis to reduce spec noise and make it more structured. If 0.0, no -pre-emphasis. + "min_level_db": -100, // normalization range + "ref_level_db": 20, // reference level db, theoretically 20db is the sound of air. + "power": 1.5, // value to sharpen wav signals after GL algorithm. + "griffin_lim_iters": 60,// #griffin-lim iterations. 30-60 is a good range. Larger the value, slower the generation. 
+ "stft_pad_mode": "reflect", + // Normalization parameters + "signal_norm": true, // normalize the spec values in range [0, 1] + "symmetric_norm": true, // move normalization to range [-1, 1] + "max_norm": 4.0, // scale normalization to range [-max_norm, max_norm] or [0, max_norm] + "clip_norm": true, // clip normalized values into the range. + "mel_fmin": 0.0, // minimum freq level for mel-spec. ~50 for male and ~95 for female voices. Tune for dataset!! + "mel_fmax": 8000.0, // maximum freq level for mel-spec. Tune for dataset!! + "spec_gain": 20.0, + "do_trim_silence": false, // enable trimming of slience of audio as you load it. LJspeech (false), TWEB (false), Nancy (true) + "trim_db": 60, // threshold for timming silence. Set this according to your dataset. + "stats_path": null // DO NOT USE WITH MULTI_SPEAKER MODEL. scaler stats file computed by 'compute_statistics.py'. If it is defined, mean-std based notmalization is used and other normalization params are ignored + }, + "reinit_layers": [], + + "loss": "softmaxproto", // "ge2e" to use Generalized End-to-End loss, "angleproto" to use Angular Prototypical loss and "softmaxproto" to use Softmax with Angular Prototypical loss + "grad_clip": 3.0, // upper limit for gradients for clipping. + "epochs": 1000, // total number of epochs to train. + "lr": 0.0001, // Initial learning rate. If Noam decay is active, maximum learning rate. + "lr_decay": false, // if true, Noam learning rate decaying is applied through training. + "warmup_steps": 4000, // Noam decay steps to increase the learning rate from 0 to "lr" + "tb_model_param_stats": false, // true, plots param stats per layer on tensorboard. Might be memory consuming, but good for debugging. + "steps_plot_stats": 10, // number of steps to plot embeddings. + + // Speakers config + "num_speakers_in_batch": 200, // Batch size for training. + "num_utters_per_speaker": 2, // + "skip_speakers": true, // skip speakers with samples less than "num_utters_per_speaker" + + "voice_len": 2, // number of seconds for each training instance + + "num_loader_workers": 1, // number of training data loader processes. Don't set it too big. 4-8 are good values. + "wd": 0.000001, // Weight decay weight. + "checkpoint": true, // If true, it saves checkpoints per "save_step" + "save_step": 2000, // Number of training steps expected to save the best checkpoints in training. + "print_step": 20, // Number of steps to log traning on console. + "output_path": "../../../checkpoints/speaker_encoder/resnet_voxceleb12-pre-training/", // DATASET-RELATED: output path for all training outputs. 
+ + "audio_augmentation": { + "p": 0.75, // propability of apply this method, 0 is disable rir and additive noise augmentation + "rir":{ + "rir_path": "/workspace/store/ecasanova/ComParE/RIRS_NOISES/simulated_rirs/", + "conv_mode": "full" + }, + "additive":{ + "sounds_path": "/workspace/store/ecasanova/ComParE/musan/", + // list of each of the directories in your data augmentation, if a directory is in "sounds_path" but is not listed here it will be ignored + "speech":{ + "min_snr_in_db": 13, + "max_snr_in_db": 20, + "min_num_noises": 3, + "max_num_noises": 7 + }, + "noise":{ + "min_snr_in_db": 0, + "max_snr_in_db": 15, + "min_num_noises": 1, + "max_num_noises": 1 + }, + "music":{ + "min_snr_in_db": 5, + "max_snr_in_db": 15, + "min_num_noises": 1, + "max_num_noises": 1 + } + }, + //add a gaussian noise to the data in order to increase robustness + "gaussian":{ // as the insertion of Gaussian noise is quick to be calculated, we added it after loading the wav file, this way, even audios that were reused with the cache can receive this noise + "p": 1, // propability of apply this method, 0 is disable + "min_amplitude": 0.0, + "max_amplitude": 1e-5 + } + }, + "model": { + "input_dim": 80, + "proj_dim": 512 + }, + "storage": { + "sample_from_storage_p": 0.66, // the probability with which we'll sample from the DataSet in-memory storage + "storage_size": 35 // the size of the in-memory storage with respect to a single batch + }, + "datasets": + [ + { + "name": "voxceleb2", + "path": "/workspace/scratch/ecasanova/datasets/VoxCeleb/vox2_dev_aac/", + "meta_file_train": null, + "meta_file_val": null + }, + { + "name": "voxceleb1", + "path": "/workspace/scratch/ecasanova/datasets/VoxCeleb/vox1_dev_wav/", + "meta_file_train": null, + "meta_file_val": null + } + ] +} \ No newline at end of file diff --git a/TTS/speaker_encoder/configs/config_temp.json b/TTS/speaker_encoder/configs/config_temp.json new file mode 100644 index 00000000..737b16d8 --- /dev/null +++ b/TTS/speaker_encoder/configs/config_temp.json @@ -0,0 +1,117 @@ + +{ + "model_name": "resnet", + "run_name": "speaker_encoder", + "run_description": "resnet speaker encoder trained with commonvoice all languages dev and train, Voxceleb dev and Voxceleb 2 dev", + // AUDIO PARAMETERS + "audio":{ + // Audio processing parameters + "num_mels": 80, // size of the mel spec frame. + "fft_size": 1024, // number of stft frequency levels. Size of the linear spectogram frame. + "sample_rate": 22050, //22050, // DATASET-RELATED: wav sample-rate. If different than the original data, it is resampled. + "win_length": 1024, // stft window length in ms. + "hop_length": 256, // stft window hop-lengh in ms. + "frame_length_ms": null, // stft window length in ms.If null, 'win_length' is used. + "frame_shift_ms": null, // stft window hop-lengh in ms. If null, 'hop_length' is used. + "preemphasis": 0.98, // pre-emphasis to reduce spec noise and make it more structured. If 0.0, no -pre-emphasis. + "min_level_db": -100, // normalization range + "ref_level_db": 20, // reference level db, theoretically 20db is the sound of air. + "power": 1.5, // value to sharpen wav signals after GL algorithm. + "griffin_lim_iters": 60,// #griffin-lim iterations. 30-60 is a good range. Larger the value, slower the generation. 
+ "stft_pad_mode": "reflect", + // Normalization parameters + "signal_norm": true, // normalize the spec values in range [0, 1] + "symmetric_norm": true, // move normalization to range [-1, 1] + "max_norm": 4.0, // scale normalization to range [-max_norm, max_norm] or [0, max_norm] + "clip_norm": true, // clip normalized values into the range. + "mel_fmin": 0.0, // minimum freq level for mel-spec. ~50 for male and ~95 for female voices. Tune for dataset!! + "mel_fmax": 8000.0, // maximum freq level for mel-spec. Tune for dataset!! + "spec_gain": 20.0, + "do_trim_silence": false, // enable trimming of slience of audio as you load it. LJspeech (false), TWEB (false), Nancy (true) + "trim_db": 60, // threshold for timming silence. Set this according to your dataset. + "stats_path": null // DO NOT USE WITH MULTI_SPEAKER MODEL. scaler stats file computed by 'compute_statistics.py'. If it is defined, mean-std based notmalization is used and other normalization params are ignored + }, + "reinit_layers": [], + + "loss": "softmaxproto", // "ge2e" to use Generalized End-to-End loss, "angleproto" to use Angular Prototypical loss and "softmaxproto" to use Softmax with Angular Prototypical loss + "grad_clip": 3.0, // upper limit for gradients for clipping. + "epochs": 1000, // total number of epochs to train. + "lr": 0.0001, // Initial learning rate. If Noam decay is active, maximum learning rate. + "lr_decay": false, // if true, Noam learning rate decaying is applied through training. + "warmup_steps": 4000, // Noam decay steps to increase the learning rate from 0 to "lr" + "tb_model_param_stats": false, // true, plots param stats per layer on tensorboard. Might be memory consuming, but good for debugging. + "steps_plot_stats": 10, // number of steps to plot embeddings. + + // Speakers config + "num_speakers_in_batch": 256, // Batch size for training. + "num_utters_per_speaker": 2, // + "skip_speakers": true, // skip speakers with samples less than "num_utters_per_speaker" + + "voice_len": 2, // number of seconds for each training instance + + "num_loader_workers": 8, // number of training data loader processes. Don't set it too big. 4-8 are good values. + "wd": 0.000001, // Weight decay weight. + "checkpoint": true, // If true, it saves checkpoints per "save_step" + "save_step": 5000, // Number of training steps expected to save the best checkpoints in training. + "print_step": 20, // Number of steps to log traning on console. + "output_path": "../../../checkpoints/speaker_encoder/continue-training-voxceleb-trainer-test/", // DATASET-RELATED: output path for all training outputs. 
+ + "audio_augmentation": { + "p": 0.75, // propability of apply this method, 0 is disable rir and additive noise augmentation + "rir":{ + "rir_path": "/workspace/store/ecasanova/ComParE/RIRS_NOISES/simulated_rirs/", + "conv_mode": "full" + }, + "additive":{ + "sounds_path": "/workspace/store/ecasanova/ComParE/musan/", + // list of each of the directories in your data augmentation, if a directory is in "sounds_path" but is not listed here it will be ignored + "speech":{ + "min_snr_in_db": 13, + "max_snr_in_db": 20, + "min_num_noises": 3, + "max_num_noises": 7 + }, + "noise":{ + "min_snr_in_db": 0, + "max_snr_in_db": 15, + "min_num_noises": 1, + "max_num_noises": 1 + }, + "music":{ + "min_snr_in_db": 5, + "max_snr_in_db": 15, + "min_num_noises": 1, + "max_num_noises": 1 + } + }, + //add a gaussian noise to the data in order to increase robustness + "gaussian":{ // as the insertion of Gaussian noise is quick to be calculated, we added it after loading the wav file, this way, even audios that were reused with the cache can receive this noise + "p": 1, // propability of apply this method, 0 is disable + "min_amplitude": 0.0, + "max_amplitude": 1e-5 + } + }, + "model": { + "input_dim": 80, + "proj_dim": 512 + }, + "storage": { + "sample_from_storage_p": 0.66, // the probability with which we'll sample from the DataSet in-memory storage + "storage_size": 35 // the size of the in-memory storage with respect to a single batch + }, + "datasets": + [ + { + "name": "common_voice", + "path": "/workspace/scratch/ecasanova/datasets/common-voice/cv-corpus-6.1-2020-12-11_16khz/pt", + "meta_file_train": "dev.tsv", + "meta_file_val": null + }, + { + "name": "common_voice", + "path": "/workspace/scratch/ecasanova/datasets/common-voice/cv-corpus-6.1-2020-12-11_16khz/pt", + "meta_file_train": "dev.tsv", + "meta_file_val": null + } + ] +} \ No newline at end of file diff --git a/TTS/speaker_encoder/dataset.py b/TTS/speaker_encoder/dataset.py index 14bb57c8..9c308a73 100644 --- a/TTS/speaker_encoder/dataset.py +++ b/TTS/speaker_encoder/dataset.py @@ -1,10 +1,10 @@ -import queue + import random import numpy as np import torch from torch.utils.data import Dataset -from TTS.speaker_encoder.utils.generic_utils import AugmentWAV +from TTS.speaker_encoder.utils.generic_utils import AugmentWAV, Storage class MyDataset(Dataset): def __init__( @@ -38,7 +38,8 @@ class MyDataset(Dataset): self.ap = ap self.verbose = verbose self.__parse_items() - self.storage = queue.Queue(maxsize=storage_size * num_speakers_in_batch) + storage_max_size = storage_size * num_speakers_in_batch + self.storage = Storage(maxsize=storage_max_size, storage_batchs=storage_size, num_speakers_in_batch=num_speakers_in_batch) self.sample_from_storage_p = float(sample_from_storage_p) speakers_aux = list(self.speakers) @@ -59,7 +60,7 @@ class MyDataset(Dataset): if self.verbose: print("\n > DataLoader initialization") print(f" | > Speakers per Batch: {num_speakers_in_batch}") - print(f" | > Storage Size: {self.storage.maxsize} instances, each with {num_utter_per_speaker} utters") + print(f" | > Storage Size: {storage_max_size} instances, each with {num_utter_per_speaker} utters") print(f" | > Sample_from_storage_p : {self.sample_from_storage_p}") print(f" | > Number of instances : {len(self.items)}") print(f" | > Sequence length: {self.seq_len}") @@ -130,9 +131,11 @@ class MyDataset(Dataset): def __sample_speaker(self, ignore_speakers=None): speaker = random.sample(self.speakers, 1)[0] # if list of speakers_id is provide make sure that it's will be 
ignored - if ignore_speakers: - while self.speakerid_to_classid[speaker] in ignore_speakers: + if ignore_speakers and self.speakerid_to_classid[speaker] in ignore_speakers: + while True: speaker = random.sample(self.speakers, 1)[0] + if self.speakerid_to_classid[speaker] not in ignore_speakers: + break if self.num_utter_per_speaker > len(self.speaker_to_utters[speaker]): utters = random.choices(self.speaker_to_utters[speaker], k=self.num_utter_per_speaker) @@ -153,13 +156,18 @@ class MyDataset(Dataset): if len(self.speaker_to_utters[speaker]) > 1: utter = random.sample(self.speaker_to_utters[speaker], 1)[0] else: - self.speakers.remove(speaker) + if speaker in self.speakers: + self.speakers.remove(speaker) + speaker, _ = self.__sample_speaker() continue + wav = self.load_wav(utter) if wav.shape[0] - self.seq_len > 0: break - self.speaker_to_utters[speaker].remove(utter) + + if utter in self.speaker_to_utters[speaker]: + self.speaker_to_utters[speaker].remove(utter) if self.augmentator is not None and self.data_augmentation_p: if random.random() < self.data_augmentation_p: @@ -174,6 +182,13 @@ class MyDataset(Dataset): speaker_id = self.speakerid_to_classid[speaker] return speaker, speaker_id + def __load_from_disk_and_storage(self, speaker): + # don't sample from storage, but from HDD + wavs_, labels_ = self.__sample_speaker_utterances(speaker) + # put the newly loaded item into storage + self.storage.append((wavs_, labels_)) + return wavs_, labels_ + def collate_fn(self, batch): # get the batch speaker_ids batch = np.array(batch) @@ -182,38 +197,50 @@ class MyDataset(Dataset): labels = [] feats = [] speakers = set() - for speaker, speaker_id in batch: + from_disk = 0 + from_storage = 0 + for speaker, speaker_id in batch: + speaker_id = int(speaker_id) + + # ensure that an speaker appears only once in the batch + if speaker_id in speakers: + speaker, _ = self.__sample_speaker(ignore_speakers=speakers_id_in_batch) + speaker_id = self.speakerid_to_classid[speaker] if random.random() < self.sample_from_storage_p and self.storage.full(): - # sample from storage (if full), ignoring the speaker - wavs_, labels_ = random.choice(self.storage.queue) - - # force choose the current speaker or other not in batch - '''while labels_[0] in speakers_id_in_batch: - if labels_[0] == speaker_id: - break - wavs_, labels_ = random.choice(self.storage.queue)''' - - speakers.add(labels_[0]) - speakers_id_in_batch.add(labels_[0]) + # sample from storage (if full) + # print(help(self.storage)) + wavs_, labels_ = self.storage.get_random_sample_fast() + from_storage += 1 + # force choose the current speaker or other not in batch + # It's necessary for ideal training with AngleProto and GE2E losses + if labels_[0] in speakers_id_in_batch and labels_[0] != speaker_id: + attempts = 0 + while True: + wavs_, labels_ = self.storage.get_random_sample_fast() + if labels_[0] == speaker_id or labels_[0] not in speakers_id_in_batch: + break + attempts += 1 + # Try 5 times after that load from disk + if attempts >= 5: + wavs_, labels_ = self.__load_from_disk_and_storage(speaker) + from_storage -= 1 + from_disk += 1 + break else: - # ensure that an speaker appears only once in the batch - if speaker_id in speakers: - speaker, _ = self.__sample_speaker(speakers_id_in_batch) - speaker_id = self.speakerid_to_classid[speaker] - # append the new speaker from batch - speakers_id_in_batch.add(speaker_id) - - speakers.add(speaker_id) - # don't sample from storage, but from HDD - wavs_, labels_ = self.__sample_speaker_utterances(speaker) 
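
Note on the collate_fn hunk above: with probability sample_from_storage_p (and only once the storage is full) a batch entry is now drawn from the in-memory storage, retried a few times if the cached speaker is already in the batch, and otherwise the utterances are read from disk and pushed into the storage. Below is a minimal sketch of that decision path, assuming a dataset object exposing the patched storage and a disk-loading helper; the function and helper names are illustrative, not part of the patch.

    import random

    def draw_batch_entry(dataset, speaker, speaker_id, speakers_id_in_batch,
                         sample_from_storage_p=0.66, max_attempts=5):
        # Sample from the in-memory storage only when it is full, as in the patched collate_fn.
        if random.random() < sample_from_storage_p and dataset.storage.full():
            wavs, labels = dataset.storage.get_random_sample_fast()
            attempts = 0
            # GE2E/AngleProto need each speaker to appear exactly once per batch,
            # so reject cached entries whose speaker is already in the batch.
            while labels[0] in speakers_id_in_batch and labels[0] != speaker_id:
                wavs, labels = dataset.storage.get_random_sample_fast()
                attempts += 1
                if attempts >= max_attempts:
                    # give up on the cache and read fresh utterances from disk
                    wavs, labels = dataset.load_from_disk_and_storage(speaker)
                    break
        else:
            wavs, labels = dataset.load_from_disk_and_storage(speaker)
        return wavs, labels
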
- # if storage is full, remove an item - if self.storage.full(): - _ = self.storage.get_nowait() - # put the newly loaded item into storage - self.storage.put_nowait((wavs_, labels_)) + wavs_, labels_ = self.__load_from_disk_and_storage(speaker) + from_disk += 1 + + # append speaker for control + speakers.add(labels_[0]) + + # remove current speaker and append other + if speaker_id in speakers_id_in_batch: + speakers_id_in_batch.remove(speaker_id) + + speakers_id_in_batch.add(labels_[0]) # get a random subset of each of the wavs and extract mel spectrograms. feats_ = [] @@ -229,6 +256,10 @@ class MyDataset(Dataset): labels.append(torch.LongTensor(labels_)) feats.extend(feats_) + + if self.num_speakers_in_batch != len(speakers): + raise ValueError('Speakers appear more than once on the Batch. This cannot happen because the loss functions AngleProto and GE2E consider these samples to be from another speaker.') + feats = torch.stack(feats) labels = torch.stack(labels) diff --git a/TTS/speaker_encoder/losses.py b/TTS/speaker_encoder/losses.py index 52871fb4..e521fe81 100644 --- a/TTS/speaker_encoder/losses.py +++ b/TTS/speaker_encoder/losses.py @@ -178,6 +178,9 @@ class SoftmaxLoss(nn.Module): print('Initialised Softmax Loss') def forward(self, x, label=None): + # reshape for compatibility + x = x.reshape(-1, x.size()[-1]) + label = label.reshape(-1) x = self.fc(x) L = self.criterion(x, label) @@ -206,12 +209,8 @@ class SoftmaxAngleProtoLoss(nn.Module): Calculates the SoftmaxAnglePrototypical loss for an input of dimensions (num_speakers, num_utts_per_speaker, dvec_feats) """ - assert x.size()[1] == 2 - Lp = self.angleproto(x) - x = x.reshape(-1, x.size()[-1]) - label = label.reshape(-1) Ls = self.softmax(x, label) return Ls+Lp diff --git a/TTS/speaker_encoder/models/resnet.py b/TTS/speaker_encoder/models/resnet.py index 9b79b7a7..23464527 100644 --- a/TTS/speaker_encoder/models/resnet.py +++ b/TTS/speaker_encoder/models/resnet.py @@ -1,6 +1,8 @@ import torch +import numpy as np import torch.nn as nn import torch.nn.functional as F +import numpy as np class SELayer(nn.Module): def __init__(self, channel, reduction=8): @@ -159,28 +161,27 @@ class ResNetSpeakerEncoder(nn.Module): return x @torch.no_grad() - def compute_embedding(self, x, num_frames=250, overlap=0.5): + def compute_embedding(self, x, num_frames=250, num_eval=10, return_mean=True): """ Generate embeddings for a batch of utterances x: 1xTxD """ - num_overlap = int(num_frames * overlap) max_len = x.shape[1] - embed = None - cur_iter = 0 - for offset in range(0, max_len, num_frames - num_overlap): - cur_iter += 1 - end_offset = min(x.shape[1], offset + num_frames) - # ignore slices with two or less frames, because it's can break instance normalization - if end_offset-offset <= 1: - continue + if max_len < num_frames: + num_frames = max_len - frames = x[:, offset:end_offset] + offsets = np.linspace(0, max_len-num_frames, num=num_eval) - if embed is None: - embed = self.forward(frames, training=False) - else: - embed += self.forward(frames, training=False) + embeddings = [] + for offset in offsets: + offset = int(offset) + end_offset = int(offset+num_frames) + frames = x[:,offset:end_offset] + embed = self.forward(frames, training=False) + embeddings.append(embed) - return embed / cur_iter + embeddings = torch.stack(embeddings) + if return_mean: + embeddings = torch.mean(embeddings, dim=0) + return embeddings \ No newline at end of file diff --git a/TTS/speaker_encoder/utils/generic_utils.py b/TTS/speaker_encoder/utils/generic_utils.py 
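
Note on the resnet.py hunk above: compute_embedding now drops the overlapping-window loop in favour of num_eval windows at evenly spaced offsets whose embeddings are optionally averaged. A self-contained sketch of that windowing follows, assuming a model whose forward pass maps a [1, T, D] feature tensor to a single embedding; the function name and the model argument are illustrative.

    import numpy as np
    import torch

    @torch.no_grad()
    def windowed_embedding(model, feats, num_frames=250, num_eval=10, return_mean=True):
        # feats: [1, T, D] mel-spectrogram; clamp the window to the utterance length
        max_len = feats.shape[1]
        num_frames = min(num_frames, max_len)
        # num_eval evenly spaced window start positions, as in the patched compute_embedding
        offsets = np.linspace(0, max_len - num_frames, num=num_eval)
        embeddings = []
        for offset in offsets:
            start = int(offset)
            frames = feats[:, start:start + num_frames]
            embeddings.append(model(frames))
        embeddings = torch.stack(embeddings)           # [num_eval, 1, proj_dim]
        return embeddings.mean(dim=0) if return_mean else embeddings
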
index 42ef3086..78e7cb49 100644 --- a/TTS/speaker_encoder/utils/generic_utils.py +++ b/TTS/speaker_encoder/utils/generic_utils.py @@ -8,10 +8,54 @@ import glob import random from scipy import signal +from multiprocessing import Manager + from TTS.speaker_encoder.models.lstm import LSTMSpeakerEncoder from TTS.speaker_encoder.models.resnet import ResNetSpeakerEncoder from TTS.utils.generic_utils import check_argument - + +class Storage(object): + def __init__(self, maxsize, storage_batchs, num_speakers_in_batch, num_threads=8): + # use multiprocessing for threading safe + self.storage = Manager().list() + self.maxsize = maxsize + self.num_speakers_in_batch = num_speakers_in_batch + self.num_threads = num_threads + self.ignore_last_batch = False + + if storage_batchs >= 3: + self.ignore_last_batch = True + + # used for fast random sample + self.safe_storage_size = self.maxsize - self.num_threads + if self.ignore_last_batch: + self.safe_storage_size -= self.num_speakers_in_batch + + def __len__(self): + return len(self.storage) + + def full(self): + return len(self.storage) >= self.maxsize + + def append(self, item): + # if storage is full, remove an item + if self.full(): + self.storage.pop(0) + + self.storage.append(item) + + def get_random_sample(self): + # safe storage size considering all threads remove one item from storage in same time + storage_size = len(self.storage) - self.num_threads + + if self.ignore_last_batch: + storage_size -= self.num_speakers_in_batch + + return self.storage[random.randint(0, storage_size)] + def get_random_sample_fast(self): + '''Call this method only when storage is full''' + return self.storage[random.randint(0, self.safe_storage_size)] + class AugmentWAV(object): def __init__(self, ap, augmentation_config): From df6a98d0c34685f2204ad86afcb877f310ef28c8 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Eren=20G=C3=B6lge?= Date: Wed, 19 May 2021 14:00:44 +0200 Subject: [PATCH 07/18] type def for gradual_training --- TTS/tts/configs/tacotron_config.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/TTS/tts/configs/tacotron_config.py b/TTS/tts/configs/tacotron_config.py index d3a54269..a567cd88 100644 --- a/TTS/tts/configs/tacotron_config.py +++ b/TTS/tts/configs/tacotron_config.py @@ -122,7 +122,7 @@ class TacotronConfig(BaseTTSConfig): gst_style_input: str = None # model specific params r: int = 2 - gradual_training: List[List] = None + gradual_training: List[List[int]] = None memory_size: int = -1 prenet_type: str = "original" prenet_dropout: bool = True From d570c2d790d2c4e262606669e2361bbe739aa184 Mon Sep 17 00:00:00 2001 From: Edresson Date: Wed, 26 May 2021 01:11:37 -0300 Subject: [PATCH 08/18] pylint fix and data loader bug fix --- TTS/bin/train_encoder.py | 15 +- .../configs/config_resnet_angleproto.json | 956 ++++++++++++++++++ ... 
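
Note on the Storage class added in the generic_utils.py hunk above: it is essentially a bounded, worker-shared list. Manager().list() lets DataLoader worker processes share one cache, append() evicts the oldest entry once maxsize is reached, and get_random_sample_fast() samples from a slightly reduced index range so concurrent removals cannot push the index out of bounds. A small usage sketch under those assumptions; the sizes and dummy item below are illustrative, not taken from the configs.

    from TTS.speaker_encoder.utils.generic_utils import Storage

    # a storage holding up to 10 batches of 4 speakers each (illustrative sizes)
    storage = Storage(maxsize=10 * 4, storage_batchs=10, num_speakers_in_batch=4)

    dummy_item = (["utter_a.wav", "utter_b.wav"], [3, 3])  # stand-in for a (wavs, labels) pair
    for _ in range(10 * 4):
        storage.append(dummy_item)     # the oldest entry is dropped once the storage is full

    if storage.full():
        wavs_, labels_ = storage.get_random_sample_fast()  # intended only for a full storage
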
=> config_resnet_softmax_angleproto.json} | 2 +- .../config_resnet_voxcebel_trainer.json | 109 -- .../configs/config_resnet_voxceleb1and2.json | 117 --- .../configs/config_softmaxproto.json | 117 --- TTS/speaker_encoder/configs/config_temp.json | 117 --- TTS/speaker_encoder/dataset.py | 59 +- TTS/speaker_encoder/losses.py | 9 +- TTS/speaker_encoder/models/resnet.py | 29 +- TTS/speaker_encoder/utils/generic_utils.py | 55 +- 11 files changed, 1015 insertions(+), 570 deletions(-) create mode 100644 TTS/speaker_encoder/configs/config_resnet_angleproto.json rename TTS/speaker_encoder/configs/{config_resnet_commonvoice.json => config_resnet_softmax_angleproto.json} (99%) delete mode 100644 TTS/speaker_encoder/configs/config_resnet_voxcebel_trainer.json delete mode 100644 TTS/speaker_encoder/configs/config_resnet_voxceleb1and2.json delete mode 100644 TTS/speaker_encoder/configs/config_softmaxproto.json delete mode 100644 TTS/speaker_encoder/configs/config_temp.json diff --git a/TTS/bin/train_encoder.py b/TTS/bin/train_encoder.py index a39bfccf..055062fe 100644 --- a/TTS/bin/train_encoder.py +++ b/TTS/bin/train_encoder.py @@ -11,8 +11,8 @@ import torch from torch.utils.data import DataLoader from TTS.speaker_encoder.dataset import MyDataset -from TTS.speaker_encoder.losses import AngleProtoLoss, GE2ELoss, SoftmaxLoss, SoftmaxAngleProtoLoss -from TTS.speaker_encoder.utils.generic_utils import check_config_speaker_encoder, save_best_model, save_checkpoint, setup_model +from TTS.speaker_encoder.losses import AngleProtoLoss, GE2ELoss, SoftmaxAngleProtoLoss +from TTS.speaker_encoder.utils.generic_utils import check_config_speaker_encoder, save_best_model, setup_model from TTS.speaker_encoder.utils.visual import plot_embeddings from TTS.tts.datasets.preprocess import load_meta_data from TTS.utils.audio import AudioProcessor @@ -51,9 +51,8 @@ def setup_loader(ap: AudioProcessor, is_val: bool = False, verbose: bool = False storage_size=c.storage["storage_size"], sample_from_storage_p=c.storage["sample_from_storage_p"], verbose=verbose, - augmentation_config=getattr(c, "audio_augmentation", None) + augmentation_config=getattr(c, "audio_augmentation", None) ) - # sampler = DistributedSampler(dataset) if num_gpus > 1 else None loader = DataLoader( dataset, @@ -65,8 +64,7 @@ def setup_loader(ap: AudioProcessor, is_val: bool = False, verbose: bool = False return loader, dataset.get_num_speakers() -def train(model, optimizer, scheduler, criterion, data_loader, ap, global_step): - +def train(model, optimizer, scheduler, criterion, data_loader, global_step): model.train() epoch_time = 0 best_loss = float("inf") @@ -80,7 +78,6 @@ def train(model, optimizer, scheduler, criterion, data_loader, ap, global_step): # setup input data inputs, labels = data - loader_time = time.time() - end_time global_step += 1 @@ -139,7 +136,6 @@ def train(model, optimizer, scheduler, criterion, data_loader, ap, global_step): ), flush=True, ) - avg_loss_all += avg_loss if global_step % c.save_step == 0: @@ -215,8 +211,7 @@ def main(args): # pylint: disable=redefined-outer-name criterion.cuda() global_step = args.restore_step - # save_checkpoint(model, optimizer, criterion, 0.9, '../', global_step, 1) - _, global_step = train(model, optimizer, scheduler, criterion, data_loader, ap, global_step) + _, global_step = train(model, optimizer, scheduler, criterion, data_loader, global_step) if __name__ == "__main__": diff --git a/TTS/speaker_encoder/configs/config_resnet_angleproto.json 
b/TTS/speaker_encoder/configs/config_resnet_angleproto.json new file mode 100644 index 00000000..7cae1b25 --- /dev/null +++ b/TTS/speaker_encoder/configs/config_resnet_angleproto.json @@ -0,0 +1,956 @@ +{ + "model_name": "resnet", + "run_name": "speaker_encoder", + "run_description": "resnet speaker encoder trained with commonvoice all languages dev and train, Voxceleb 1 dev and Voxceleb 2 dev", + // AUDIO PARAMETERS + "audio":{ + // Audio processing parameters + "num_mels": 80, // size of the mel spec frame. + "fft_size": 1024, // number of stft frequency levels. Size of the linear spectogram frame. + "sample_rate": 16000, // DATASET-RELATED: wav sample-rate. If different than the original data, it is resampled. + "win_length": 1024, // stft window length in ms. + "hop_length": 256, // stft window hop-lengh in ms. + "frame_length_ms": null, // stft window length in ms.If null, 'win_length' is used. + "frame_shift_ms": null, // stft window hop-lengh in ms. If null, 'hop_length' is used. + "preemphasis": 0.98, // pre-emphasis to reduce spec noise and make it more structured. If 0.0, no -pre-emphasis. + "min_level_db": -100, // normalization range + "ref_level_db": 20, // reference level db, theoretically 20db is the sound of air. + "power": 1.5, // value to sharpen wav signals after GL algorithm. + "griffin_lim_iters": 60,// #griffin-lim iterations. 30-60 is a good range. Larger the value, slower the generation. + "stft_pad_mode": "reflect", + // Normalization parameters + "signal_norm": true, // normalize the spec values in range [0, 1] + "symmetric_norm": true, // move normalization to range [-1, 1] + "max_norm": 4.0, // scale normalization to range [-max_norm, max_norm] or [0, max_norm] + "clip_norm": true, // clip normalized values into the range. + "mel_fmin": 0.0, // minimum freq level for mel-spec. ~50 for male and ~95 for female voices. Tune for dataset!! + "mel_fmax": 8000.0, // maximum freq level for mel-spec. Tune for dataset!! + "spec_gain": 20.0, + "do_trim_silence": false, // enable trimming of slience of audio as you load it. LJspeech (false), TWEB (false), Nancy (true) + "trim_db": 60, // threshold for timming silence. Set this according to your dataset. + "stats_path": null // DO NOT USE WITH MULTI_SPEAKER MODEL. scaler stats file computed by 'compute_statistics.py'. If it is defined, mean-std based notmalization is used and other normalization params are ignored + }, + "reinit_layers": [], + + "loss": "angleproto", // "ge2e" to use Generalized End-to-End loss, "angleproto" to use Angular Prototypical loss and "softmaxproto" to use Softmax with Angular Prototypical loss + "grad_clip": 3.0, // upper limit for gradients for clipping. + "epochs": 1000, // total number of epochs to train. + "lr": 0.0001, // Initial learning rate. If Noam decay is active, maximum learning rate. + "lr_decay": false, // if true, Noam learning rate decaying is applied through training. + "warmup_steps": 4000, // Noam decay steps to increase the learning rate from 0 to "lr" + "tb_model_param_stats": false, // true, plots param stats per layer on tensorboard. Might be memory consuming, but good for debugging. + "steps_plot_stats": 100, // number of steps to plot embeddings. + + // Speakers config + "num_speakers_in_batch": 200, // Batch size for training. 
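The speaker-batching keys in this block ("num_speakers_in_batch" above and "num_utters_per_speaker" just below) define the shape expected by the prototypical-style losses selectable via "loss" ("ge2e", "angleproto", "softmaxproto"): each batch carries N speakers with M utterances apiece, and the N*M encoder outputs are regrouped by speaker before the loss contrasts utterances against per-speaker centroids. A minimal sketch of that regrouping follows, using the values from this config; tensor names are hypothetical and PyTorch is assumed, so it illustrates the expected layout rather than the trainer's exact code.

# Illustrative only: regrouping a speaker-balanced batch for a prototypical-style loss.
import torch

num_speakers_in_batch = 200   # N, as set in this config
num_utters_per_speaker = 2    # M, as set in this config
proj_dim = 512                # embedding size, model["proj_dim"] in this config

# The loader yields N * M utterances; the encoder produces one embedding per utterance.
flat_embeddings = torch.randn(num_speakers_in_batch * num_utters_per_speaker, proj_dim)

# GE2E / AngleProto-style losses expect (N, M, D): utterances grouped by speaker,
# so each speaker's centroid can be compared against every other speaker's utterances.
grouped = flat_embeddings.view(num_speakers_in_batch, num_utters_per_speaker, proj_dim)
assert grouped.shape == (num_speakers_in_batch, num_utters_per_speaker, proj_dim)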
+ "num_utters_per_speaker": 2, // + "skip_speakers": true, // skip speakers with samples less than "num_utters_per_speaker" + + "voice_len": 2, // number of seconds for each training instance + + "num_loader_workers": 8, // number of training data loader processes. Don't set it too big. 4-8 are good values. + "wd": 0.000001, // Weight decay weight. + "checkpoint": true, // If true, it saves checkpoints per "save_step" + "save_step": 1000, // Number of training steps expected to save the best checkpoints in training. + "print_step": 50, // Number of steps to log traning on console. + "output_path": "../../../checkpoints/speaker_encoder/resnet_voxceleb1_and_voxceleb2-and-common-voice-all-using-angleproto/", // DATASET-RELATED: output path for all training outputs. + + "audio_augmentation": { + "p": 0.5, // propability of apply this method, 0 is disable rir and additive noise augmentation + "rir":{ + "rir_path": "/workspace/store/ecasanova/ComParE/RIRS_NOISES/simulated_rirs/", + "conv_mode": "full" + }, + "additive":{ + "sounds_path": "/workspace/store/ecasanova/ComParE/musan/", + // list of each of the directories in your data augmentation, if a directory is in "sounds_path" but is not listed here it will be ignored + "speech":{ + "min_snr_in_db": 13, + "max_snr_in_db": 20, + "min_num_noises": 2, + "max_num_noises": 3 + }, + "noise":{ + "min_snr_in_db": 0, + "max_snr_in_db": 15, + "min_num_noises": 1, + "max_num_noises": 1 + }, + "music":{ + "min_snr_in_db": 5, + "max_snr_in_db": 15, + "min_num_noises": 1, + "max_num_noises": 1 + } + }, + //add a gaussian noise to the data in order to increase robustness + "gaussian":{ // as the insertion of Gaussian noise is quick to be calculated, we added it after loading the wav file, this way, even audios that were reused with the cache can receive this noise + "p": 0.5, // propability of apply this method, 0 is disable + "min_amplitude": 0.0, + "max_amplitude": 1e-5 + } + }, + "model": { + "input_dim": 80, + "proj_dim": 512 + }, + "storage": { + "sample_from_storage_p": 0.66, // the probability with which we'll sample from the DataSet in-memory storage + "storage_size": 35 // the size of the in-memory storage with respect to a single batch + }, + "datasets": + [ + { + "name": "voxceleb2", + "path": "/workspace/scratch/ecasanova/datasets/VoxCeleb/vox2_dev_aac/", + "meta_file_train": null, + "meta_file_val": null + }, + { + "name": "voxceleb1", + "path": "/workspace/scratch/ecasanova/datasets/VoxCeleb/vox1_dev_wav/", + "meta_file_train": null, + "meta_file_val": null + }, + { + "name": "common_voice", + "path": "/workspace/scratch/ecasanova/datasets/common-voice/cv-corpus-6.1-2020-12-11_16khz/fi", + "meta_file_train": "train.tsv", + "meta_file_val": null + }, + + { + "name": "common_voice", + "path": "/workspace/scratch/ecasanova/datasets/common-voice/cv-corpus-6.1-2020-12-11_16khz/fi", + "meta_file_train": "dev.tsv", + "meta_file_val": null + }, + + { + "name": "common_voice", + "path": "/workspace/scratch/ecasanova/datasets/common-voice/cv-corpus-6.1-2020-12-11_16khz/zh-CN", + "meta_file_train": "train.tsv", + "meta_file_val": null + }, + + { + "name": "common_voice", + "path": "/workspace/scratch/ecasanova/datasets/common-voice/cv-corpus-6.1-2020-12-11_16khz/zh-CN", + "meta_file_train": "dev.tsv", + "meta_file_val": null + }, + + { + "name": "common_voice", + "path": "/workspace/scratch/ecasanova/datasets/common-voice/cv-corpus-6.1-2020-12-11_16khz/rm-sursilv", + "meta_file_train": "train.tsv", + "meta_file_val": null + }, + + { + "name": 
"common_voice", + "path": "/workspace/scratch/ecasanova/datasets/common-voice/cv-corpus-6.1-2020-12-11_16khz/rm-sursilv", + "meta_file_train": "dev.tsv", + "meta_file_val": null + }, + + { + "name": "common_voice", + "path": "/workspace/scratch/ecasanova/datasets/common-voice/cv-corpus-6.1-2020-12-11_16khz/lt", + "meta_file_train": "train.tsv", + "meta_file_val": null + }, + + { + "name": "common_voice", + "path": "/workspace/scratch/ecasanova/datasets/common-voice/cv-corpus-6.1-2020-12-11_16khz/lt", + "meta_file_train": "dev.tsv", + "meta_file_val": null + }, + + { + "name": "common_voice", + "path": "/workspace/scratch/ecasanova/datasets/common-voice/cv-corpus-6.1-2020-12-11_16khz/ka", + "meta_file_train": "train.tsv", + "meta_file_val": null + }, + + { + "name": "common_voice", + "path": "/workspace/scratch/ecasanova/datasets/common-voice/cv-corpus-6.1-2020-12-11_16khz/ka", + "meta_file_train": "dev.tsv", + "meta_file_val": null + }, + + { + "name": "common_voice", + "path": "/workspace/scratch/ecasanova/datasets/common-voice/cv-corpus-6.1-2020-12-11_16khz/sv-SE", + "meta_file_train": "train.tsv", + "meta_file_val": null + }, + + { + "name": "common_voice", + "path": "/workspace/scratch/ecasanova/datasets/common-voice/cv-corpus-6.1-2020-12-11_16khz/sv-SE", + "meta_file_train": "dev.tsv", + "meta_file_val": null + }, + + { + "name": "common_voice", + "path": "/workspace/scratch/ecasanova/datasets/common-voice/cv-corpus-6.1-2020-12-11_16khz/pl", + "meta_file_train": "train.tsv", + "meta_file_val": null + }, + + { + "name": "common_voice", + "path": "/workspace/scratch/ecasanova/datasets/common-voice/cv-corpus-6.1-2020-12-11_16khz/pl", + "meta_file_train": "dev.tsv", + "meta_file_val": null + }, + + { + "name": "common_voice", + "path": "/workspace/scratch/ecasanova/datasets/common-voice/cv-corpus-6.1-2020-12-11_16khz/ru", + "meta_file_train": "train.tsv", + "meta_file_val": null + }, + + { + "name": "common_voice", + "path": "/workspace/scratch/ecasanova/datasets/common-voice/cv-corpus-6.1-2020-12-11_16khz/ru", + "meta_file_train": "dev.tsv", + "meta_file_val": null + }, + + { + "name": "common_voice", + "path": "/workspace/scratch/ecasanova/datasets/common-voice/cv-corpus-6.1-2020-12-11_16khz/mn", + "meta_file_train": "train.tsv", + "meta_file_val": null + }, + + { + "name": "common_voice", + "path": "/workspace/scratch/ecasanova/datasets/common-voice/cv-corpus-6.1-2020-12-11_16khz/mn", + "meta_file_train": "dev.tsv", + "meta_file_val": null + }, + + { + "name": "common_voice", + "path": "/workspace/scratch/ecasanova/datasets/common-voice/cv-corpus-6.1-2020-12-11_16khz/nl", + "meta_file_train": "train.tsv", + "meta_file_val": null + }, + + { + "name": "common_voice", + "path": "/workspace/scratch/ecasanova/datasets/common-voice/cv-corpus-6.1-2020-12-11_16khz/nl", + "meta_file_train": "dev.tsv", + "meta_file_val": null + }, + + { + "name": "common_voice", + "path": "/workspace/scratch/ecasanova/datasets/common-voice/cv-corpus-6.1-2020-12-11_16khz/sl", + "meta_file_train": "train.tsv", + "meta_file_val": null + }, + + { + "name": "common_voice", + "path": "/workspace/scratch/ecasanova/datasets/common-voice/cv-corpus-6.1-2020-12-11_16khz/sl", + "meta_file_train": "dev.tsv", + "meta_file_val": null + }, + + { + "name": "common_voice", + "path": "/workspace/scratch/ecasanova/datasets/common-voice/cv-corpus-6.1-2020-12-11_16khz/es", + "meta_file_train": "train.tsv", + "meta_file_val": null + }, + + { + "name": "common_voice", + "path": 
"/workspace/scratch/ecasanova/datasets/common-voice/cv-corpus-6.1-2020-12-11_16khz/es", + "meta_file_train": "dev.tsv", + "meta_file_val": null + }, + + { + "name": "common_voice", + "path": "/workspace/scratch/ecasanova/datasets/common-voice/cv-corpus-6.1-2020-12-11_16khz/pt", + "meta_file_train": "train.tsv", + "meta_file_val": null + }, + + { + "name": "common_voice", + "path": "/workspace/scratch/ecasanova/datasets/common-voice/cv-corpus-6.1-2020-12-11_16khz/pt", + "meta_file_train": "dev.tsv", + "meta_file_val": null + }, + + { + "name": "common_voice", + "path": "/workspace/scratch/ecasanova/datasets/common-voice/cv-corpus-6.1-2020-12-11_16khz/hi", + "meta_file_train": "train.tsv", + "meta_file_val": null + }, + + { + "name": "common_voice", + "path": "/workspace/scratch/ecasanova/datasets/common-voice/cv-corpus-6.1-2020-12-11_16khz/hi", + "meta_file_train": "dev.tsv", + "meta_file_val": null + }, + + { + "name": "common_voice", + "path": "/workspace/scratch/ecasanova/datasets/common-voice/cv-corpus-6.1-2020-12-11_16khz/ja", + "meta_file_train": "train.tsv", + "meta_file_val": null + }, + + { + "name": "common_voice", + "path": "/workspace/scratch/ecasanova/datasets/common-voice/cv-corpus-6.1-2020-12-11_16khz/ja", + "meta_file_train": "dev.tsv", + "meta_file_val": null + }, + + { + "name": "common_voice", + "path": "/workspace/scratch/ecasanova/datasets/common-voice/cv-corpus-6.1-2020-12-11_16khz/ia", + "meta_file_train": "train.tsv", + "meta_file_val": null + }, + + { + "name": "common_voice", + "path": "/workspace/scratch/ecasanova/datasets/common-voice/cv-corpus-6.1-2020-12-11_16khz/ia", + "meta_file_train": "dev.tsv", + "meta_file_val": null + }, + + { + "name": "common_voice", + "path": "/workspace/scratch/ecasanova/datasets/common-voice/cv-corpus-6.1-2020-12-11_16khz/br", + "meta_file_train": "train.tsv", + "meta_file_val": null + }, + + { + "name": "common_voice", + "path": "/workspace/scratch/ecasanova/datasets/common-voice/cv-corpus-6.1-2020-12-11_16khz/br", + "meta_file_train": "dev.tsv", + "meta_file_val": null + }, + + { + "name": "common_voice", + "path": "/workspace/scratch/ecasanova/datasets/common-voice/cv-corpus-6.1-2020-12-11_16khz/id", + "meta_file_train": "train.tsv", + "meta_file_val": null + }, + + { + "name": "common_voice", + "path": "/workspace/scratch/ecasanova/datasets/common-voice/cv-corpus-6.1-2020-12-11_16khz/id", + "meta_file_train": "dev.tsv", + "meta_file_val": null + }, + + { + "name": "common_voice", + "path": "/workspace/scratch/ecasanova/datasets/common-voice/cv-corpus-6.1-2020-12-11_16khz/dv", + "meta_file_train": "train.tsv", + "meta_file_val": null + }, + + { + "name": "common_voice", + "path": "/workspace/scratch/ecasanova/datasets/common-voice/cv-corpus-6.1-2020-12-11_16khz/dv", + "meta_file_train": "dev.tsv", + "meta_file_val": null + }, + + { + "name": "common_voice", + "path": "/workspace/scratch/ecasanova/datasets/common-voice/cv-corpus-6.1-2020-12-11_16khz/ta", + "meta_file_train": "train.tsv", + "meta_file_val": null + }, + + { + "name": "common_voice", + "path": "/workspace/scratch/ecasanova/datasets/common-voice/cv-corpus-6.1-2020-12-11_16khz/ta", + "meta_file_train": "dev.tsv", + "meta_file_val": null + }, + + { + "name": "common_voice", + "path": "/workspace/scratch/ecasanova/datasets/common-voice/cv-corpus-6.1-2020-12-11_16khz/or", + "meta_file_train": "train.tsv", + "meta_file_val": null + }, + + { + "name": "common_voice", + "path": "/workspace/scratch/ecasanova/datasets/common-voice/cv-corpus-6.1-2020-12-11_16khz/or", + 
"meta_file_train": "dev.tsv", + "meta_file_val": null + }, + + { + "name": "common_voice", + "path": "/workspace/scratch/ecasanova/datasets/common-voice/cv-corpus-6.1-2020-12-11_16khz/zh-HK", + "meta_file_train": "train.tsv", + "meta_file_val": null + }, + + { + "name": "common_voice", + "path": "/workspace/scratch/ecasanova/datasets/common-voice/cv-corpus-6.1-2020-12-11_16khz/zh-HK", + "meta_file_train": "dev.tsv", + "meta_file_val": null + }, + + { + "name": "common_voice", + "path": "/workspace/scratch/ecasanova/datasets/common-voice/cv-corpus-6.1-2020-12-11_16khz/de", + "meta_file_train": "train.tsv", + "meta_file_val": null + }, + + { + "name": "common_voice", + "path": "/workspace/scratch/ecasanova/datasets/common-voice/cv-corpus-6.1-2020-12-11_16khz/de", + "meta_file_train": "dev.tsv", + "meta_file_val": null + }, + + { + "name": "common_voice", + "path": "/workspace/scratch/ecasanova/datasets/common-voice/cv-corpus-6.1-2020-12-11_16khz/uk", + "meta_file_train": "train.tsv", + "meta_file_val": null + }, + + { + "name": "common_voice", + "path": "/workspace/scratch/ecasanova/datasets/common-voice/cv-corpus-6.1-2020-12-11_16khz/uk", + "meta_file_train": "dev.tsv", + "meta_file_val": null + }, + + { + "name": "common_voice", + "path": "/workspace/scratch/ecasanova/datasets/common-voice/cv-corpus-6.1-2020-12-11_16khz/en", + "meta_file_train": "train.tsv", + "meta_file_val": null + }, + + { + "name": "common_voice", + "path": "/workspace/scratch/ecasanova/datasets/common-voice/cv-corpus-6.1-2020-12-11_16khz/en", + "meta_file_train": "dev.tsv", + "meta_file_val": null + }, + + { + "name": "common_voice", + "path": "/workspace/scratch/ecasanova/datasets/common-voice/cv-corpus-6.1-2020-12-11_16khz/fa", + "meta_file_train": "train.tsv", + "meta_file_val": null + }, + + { + "name": "common_voice", + "path": "/workspace/scratch/ecasanova/datasets/common-voice/cv-corpus-6.1-2020-12-11_16khz/fa", + "meta_file_train": "dev.tsv", + "meta_file_val": null + }, + + { + "name": "common_voice", + "path": "/workspace/scratch/ecasanova/datasets/common-voice/cv-corpus-6.1-2020-12-11_16khz/vi", + "meta_file_train": "train.tsv", + "meta_file_val": null + }, + + { + "name": "common_voice", + "path": "/workspace/scratch/ecasanova/datasets/common-voice/cv-corpus-6.1-2020-12-11_16khz/vi", + "meta_file_train": "dev.tsv", + "meta_file_val": null + }, + + { + "name": "common_voice", + "path": "/workspace/scratch/ecasanova/datasets/common-voice/cv-corpus-6.1-2020-12-11_16khz/ab", + "meta_file_train": "train.tsv", + "meta_file_val": null + }, + + { + "name": "common_voice", + "path": "/workspace/scratch/ecasanova/datasets/common-voice/cv-corpus-6.1-2020-12-11_16khz/ab", + "meta_file_train": "dev.tsv", + "meta_file_val": null + }, + + { + "name": "common_voice", + "path": "/workspace/scratch/ecasanova/datasets/common-voice/cv-corpus-6.1-2020-12-11_16khz/sah", + "meta_file_train": "train.tsv", + "meta_file_val": null + }, + + { + "name": "common_voice", + "path": "/workspace/scratch/ecasanova/datasets/common-voice/cv-corpus-6.1-2020-12-11_16khz/sah", + "meta_file_train": "dev.tsv", + "meta_file_val": null + }, + + { + "name": "common_voice", + "path": "/workspace/scratch/ecasanova/datasets/common-voice/cv-corpus-6.1-2020-12-11_16khz/vot", + "meta_file_train": "train.tsv", + "meta_file_val": null + }, + + { + "name": "common_voice", + "path": "/workspace/scratch/ecasanova/datasets/common-voice/cv-corpus-6.1-2020-12-11_16khz/vot", + "meta_file_train": "dev.tsv", + "meta_file_val": null + }, + + { + "name": 
"common_voice", + "path": "/workspace/scratch/ecasanova/datasets/common-voice/cv-corpus-6.1-2020-12-11_16khz/fr", + "meta_file_train": "train.tsv", + "meta_file_val": null + }, + + { + "name": "common_voice", + "path": "/workspace/scratch/ecasanova/datasets/common-voice/cv-corpus-6.1-2020-12-11_16khz/fr", + "meta_file_train": "dev.tsv", + "meta_file_val": null + }, + + { + "name": "common_voice", + "path": "/workspace/scratch/ecasanova/datasets/common-voice/cv-corpus-6.1-2020-12-11_16khz/tr", + "meta_file_train": "train.tsv", + "meta_file_val": null + }, + + { + "name": "common_voice", + "path": "/workspace/scratch/ecasanova/datasets/common-voice/cv-corpus-6.1-2020-12-11_16khz/tr", + "meta_file_train": "dev.tsv", + "meta_file_val": null + }, + + { + "name": "common_voice", + "path": "/workspace/scratch/ecasanova/datasets/common-voice/cv-corpus-6.1-2020-12-11_16khz/lg", + "meta_file_train": "train.tsv", + "meta_file_val": null + }, + + { + "name": "common_voice", + "path": "/workspace/scratch/ecasanova/datasets/common-voice/cv-corpus-6.1-2020-12-11_16khz/lg", + "meta_file_train": "dev.tsv", + "meta_file_val": null + }, + + { + "name": "common_voice", + "path": "/workspace/scratch/ecasanova/datasets/common-voice/cv-corpus-6.1-2020-12-11_16khz/mt", + "meta_file_train": "train.tsv", + "meta_file_val": null + }, + + { + "name": "common_voice", + "path": "/workspace/scratch/ecasanova/datasets/common-voice/cv-corpus-6.1-2020-12-11_16khz/mt", + "meta_file_train": "dev.tsv", + "meta_file_val": null + }, + + { + "name": "common_voice", + "path": "/workspace/scratch/ecasanova/datasets/common-voice/cv-corpus-6.1-2020-12-11_16khz/rw", + "meta_file_train": "train.tsv", + "meta_file_val": null + }, + + { + "name": "common_voice", + "path": "/workspace/scratch/ecasanova/datasets/common-voice/cv-corpus-6.1-2020-12-11_16khz/rw", + "meta_file_train": "dev.tsv", + "meta_file_val": null + }, + + { + "name": "common_voice", + "path": "/workspace/scratch/ecasanova/datasets/common-voice/cv-corpus-6.1-2020-12-11_16khz/hu", + "meta_file_train": "train.tsv", + "meta_file_val": null + }, + + { + "name": "common_voice", + "path": "/workspace/scratch/ecasanova/datasets/common-voice/cv-corpus-6.1-2020-12-11_16khz/hu", + "meta_file_train": "dev.tsv", + "meta_file_val": null + }, + + { + "name": "common_voice", + "path": "/workspace/scratch/ecasanova/datasets/common-voice/cv-corpus-6.1-2020-12-11_16khz/rm-vallader", + "meta_file_train": "train.tsv", + "meta_file_val": null + }, + + { + "name": "common_voice", + "path": "/workspace/scratch/ecasanova/datasets/common-voice/cv-corpus-6.1-2020-12-11_16khz/rm-vallader", + "meta_file_train": "dev.tsv", + "meta_file_val": null + }, + + { + "name": "common_voice", + "path": "/workspace/scratch/ecasanova/datasets/common-voice/cv-corpus-6.1-2020-12-11_16khz/el", + "meta_file_train": "train.tsv", + "meta_file_val": null + }, + + { + "name": "common_voice", + "path": "/workspace/scratch/ecasanova/datasets/common-voice/cv-corpus-6.1-2020-12-11_16khz/el", + "meta_file_train": "dev.tsv", + "meta_file_val": null + }, + + { + "name": "common_voice", + "path": "/workspace/scratch/ecasanova/datasets/common-voice/cv-corpus-6.1-2020-12-11_16khz/tt", + "meta_file_train": "train.tsv", + "meta_file_val": null + }, + + { + "name": "common_voice", + "path": "/workspace/scratch/ecasanova/datasets/common-voice/cv-corpus-6.1-2020-12-11_16khz/tt", + "meta_file_train": "dev.tsv", + "meta_file_val": null + }, + + { + "name": "common_voice", + "path": 
"/workspace/scratch/ecasanova/datasets/common-voice/cv-corpus-6.1-2020-12-11_16khz/zh-TW", + "meta_file_train": "train.tsv", + "meta_file_val": null + }, + + { + "name": "common_voice", + "path": "/workspace/scratch/ecasanova/datasets/common-voice/cv-corpus-6.1-2020-12-11_16khz/zh-TW", + "meta_file_train": "dev.tsv", + "meta_file_val": null + }, + + { + "name": "common_voice", + "path": "/workspace/scratch/ecasanova/datasets/common-voice/cv-corpus-6.1-2020-12-11_16khz/et", + "meta_file_train": "train.tsv", + "meta_file_val": null + }, + + { + "name": "common_voice", + "path": "/workspace/scratch/ecasanova/datasets/common-voice/cv-corpus-6.1-2020-12-11_16khz/et", + "meta_file_train": "dev.tsv", + "meta_file_val": null + }, + + { + "name": "common_voice", + "path": "/workspace/scratch/ecasanova/datasets/common-voice/cv-corpus-6.1-2020-12-11_16khz/fy-NL", + "meta_file_train": "train.tsv", + "meta_file_val": null + }, + + { + "name": "common_voice", + "path": "/workspace/scratch/ecasanova/datasets/common-voice/cv-corpus-6.1-2020-12-11_16khz/fy-NL", + "meta_file_train": "dev.tsv", + "meta_file_val": null + }, + + { + "name": "common_voice", + "path": "/workspace/scratch/ecasanova/datasets/common-voice/cv-corpus-6.1-2020-12-11_16khz/cs", + "meta_file_train": "train.tsv", + "meta_file_val": null + }, + + { + "name": "common_voice", + "path": "/workspace/scratch/ecasanova/datasets/common-voice/cv-corpus-6.1-2020-12-11_16khz/cs", + "meta_file_train": "dev.tsv", + "meta_file_val": null + }, + + { + "name": "common_voice", + "path": "/workspace/scratch/ecasanova/datasets/common-voice/cv-corpus-6.1-2020-12-11_16khz/as", + "meta_file_train": "train.tsv", + "meta_file_val": null + }, + + { + "name": "common_voice", + "path": "/workspace/scratch/ecasanova/datasets/common-voice/cv-corpus-6.1-2020-12-11_16khz/as", + "meta_file_train": "dev.tsv", + "meta_file_val": null + }, + + { + "name": "common_voice", + "path": "/workspace/scratch/ecasanova/datasets/common-voice/cv-corpus-6.1-2020-12-11_16khz/ro", + "meta_file_train": "train.tsv", + "meta_file_val": null + }, + + { + "name": "common_voice", + "path": "/workspace/scratch/ecasanova/datasets/common-voice/cv-corpus-6.1-2020-12-11_16khz/ro", + "meta_file_train": "dev.tsv", + "meta_file_val": null + }, + + { + "name": "common_voice", + "path": "/workspace/scratch/ecasanova/datasets/common-voice/cv-corpus-6.1-2020-12-11_16khz/eo", + "meta_file_train": "train.tsv", + "meta_file_val": null + }, + + { + "name": "common_voice", + "path": "/workspace/scratch/ecasanova/datasets/common-voice/cv-corpus-6.1-2020-12-11_16khz/eo", + "meta_file_train": "dev.tsv", + "meta_file_val": null + }, + + { + "name": "common_voice", + "path": "/workspace/scratch/ecasanova/datasets/common-voice/cv-corpus-6.1-2020-12-11_16khz/pa-IN", + "meta_file_train": "train.tsv", + "meta_file_val": null + }, + + { + "name": "common_voice", + "path": "/workspace/scratch/ecasanova/datasets/common-voice/cv-corpus-6.1-2020-12-11_16khz/pa-IN", + "meta_file_train": "dev.tsv", + "meta_file_val": null + }, + + { + "name": "common_voice", + "path": "/workspace/scratch/ecasanova/datasets/common-voice/cv-corpus-6.1-2020-12-11_16khz/th", + "meta_file_train": "train.tsv", + "meta_file_val": null + }, + + { + "name": "common_voice", + "path": "/workspace/scratch/ecasanova/datasets/common-voice/cv-corpus-6.1-2020-12-11_16khz/th", + "meta_file_train": "dev.tsv", + "meta_file_val": null + }, + + { + "name": "common_voice", + "path": 
"/workspace/scratch/ecasanova/datasets/common-voice/cv-corpus-6.1-2020-12-11_16khz/it", + "meta_file_train": "train.tsv", + "meta_file_val": null + }, + + { + "name": "common_voice", + "path": "/workspace/scratch/ecasanova/datasets/common-voice/cv-corpus-6.1-2020-12-11_16khz/it", + "meta_file_train": "dev.tsv", + "meta_file_val": null + }, + + { + "name": "common_voice", + "path": "/workspace/scratch/ecasanova/datasets/common-voice/cv-corpus-6.1-2020-12-11_16khz/ga-IE", + "meta_file_train": "train.tsv", + "meta_file_val": null + }, + + { + "name": "common_voice", + "path": "/workspace/scratch/ecasanova/datasets/common-voice/cv-corpus-6.1-2020-12-11_16khz/ga-IE", + "meta_file_train": "dev.tsv", + "meta_file_val": null + }, + + { + "name": "common_voice", + "path": "/workspace/scratch/ecasanova/datasets/common-voice/cv-corpus-6.1-2020-12-11_16khz/cnh", + "meta_file_train": "train.tsv", + "meta_file_val": null + }, + + { + "name": "common_voice", + "path": "/workspace/scratch/ecasanova/datasets/common-voice/cv-corpus-6.1-2020-12-11_16khz/cnh", + "meta_file_train": "dev.tsv", + "meta_file_val": null + }, + + { + "name": "common_voice", + "path": "/workspace/scratch/ecasanova/datasets/common-voice/cv-corpus-6.1-2020-12-11_16khz/ky", + "meta_file_train": "train.tsv", + "meta_file_val": null + }, + + { + "name": "common_voice", + "path": "/workspace/scratch/ecasanova/datasets/common-voice/cv-corpus-6.1-2020-12-11_16khz/ky", + "meta_file_train": "dev.tsv", + "meta_file_val": null + }, + + { + "name": "common_voice", + "path": "/workspace/scratch/ecasanova/datasets/common-voice/cv-corpus-6.1-2020-12-11_16khz/ar", + "meta_file_train": "train.tsv", + "meta_file_val": null + }, + + { + "name": "common_voice", + "path": "/workspace/scratch/ecasanova/datasets/common-voice/cv-corpus-6.1-2020-12-11_16khz/ar", + "meta_file_train": "dev.tsv", + "meta_file_val": null + }, + + { + "name": "common_voice", + "path": "/workspace/scratch/ecasanova/datasets/common-voice/cv-corpus-6.1-2020-12-11_16khz/eu", + "meta_file_train": "train.tsv", + "meta_file_val": null + }, + + { + "name": "common_voice", + "path": "/workspace/scratch/ecasanova/datasets/common-voice/cv-corpus-6.1-2020-12-11_16khz/eu", + "meta_file_train": "dev.tsv", + "meta_file_val": null + }, + + { + "name": "common_voice", + "path": "/workspace/scratch/ecasanova/datasets/common-voice/cv-corpus-6.1-2020-12-11_16khz/ca", + "meta_file_train": "train.tsv", + "meta_file_val": null + }, + + { + "name": "common_voice", + "path": "/workspace/scratch/ecasanova/datasets/common-voice/cv-corpus-6.1-2020-12-11_16khz/ca", + "meta_file_train": "dev.tsv", + "meta_file_val": null + }, + + { + "name": "common_voice", + "path": "/workspace/scratch/ecasanova/datasets/common-voice/cv-corpus-6.1-2020-12-11_16khz/kab", + "meta_file_train": "train.tsv", + "meta_file_val": null + }, + + { + "name": "common_voice", + "path": "/workspace/scratch/ecasanova/datasets/common-voice/cv-corpus-6.1-2020-12-11_16khz/kab", + "meta_file_train": "dev.tsv", + "meta_file_val": null + }, + + { + "name": "common_voice", + "path": "/workspace/scratch/ecasanova/datasets/common-voice/cv-corpus-6.1-2020-12-11_16khz/cy", + "meta_file_train": "train.tsv", + "meta_file_val": null + }, + + { + "name": "common_voice", + "path": "/workspace/scratch/ecasanova/datasets/common-voice/cv-corpus-6.1-2020-12-11_16khz/cy", + "meta_file_train": "dev.tsv", + "meta_file_val": null + }, + + { + "name": "common_voice", + "path": "/workspace/scratch/ecasanova/datasets/common-voice/cv-corpus-6.1-2020-12-11_16khz/cv", 
+ "meta_file_train": "train.tsv", + "meta_file_val": null + }, + + { + "name": "common_voice", + "path": "/workspace/scratch/ecasanova/datasets/common-voice/cv-corpus-6.1-2020-12-11_16khz/cv", + "meta_file_train": "dev.tsv", + "meta_file_val": null + }, + + { + "name": "common_voice", + "path": "/workspace/scratch/ecasanova/datasets/common-voice/cv-corpus-6.1-2020-12-11_16khz/hsb", + "meta_file_train": "train.tsv", + "meta_file_val": null + }, + + { + "name": "common_voice", + "path": "/workspace/scratch/ecasanova/datasets/common-voice/cv-corpus-6.1-2020-12-11_16khz/hsb", + "meta_file_train": "dev.tsv", + "meta_file_val": null + }, + + { + "name": "common_voice", + "path": "/workspace/scratch/ecasanova/datasets/common-voice/cv-corpus-6.1-2020-12-11_16khz/lv", + "meta_file_train": "train.tsv", + "meta_file_val": null + }, + + { + "name": "common_voice", + "path": "/workspace/scratch/ecasanova/datasets/common-voice/cv-corpus-6.1-2020-12-11_16khz/lv", + "meta_file_train": "dev.tsv", + "meta_file_val": null + } + + ] +} \ No newline at end of file diff --git a/TTS/speaker_encoder/configs/config_resnet_commonvoice.json b/TTS/speaker_encoder/configs/config_resnet_softmax_angleproto.json similarity index 99% rename from TTS/speaker_encoder/configs/config_resnet_commonvoice.json rename to TTS/speaker_encoder/configs/config_resnet_softmax_angleproto.json index b3223824..4baca49a 100644 --- a/TTS/speaker_encoder/configs/config_resnet_commonvoice.json +++ b/TTS/speaker_encoder/configs/config_resnet_softmax_angleproto.json @@ -54,7 +54,7 @@ "checkpoint": true, // If true, it saves checkpoints per "save_step" "save_step": 1000, // Number of training steps expected to save the best checkpoints in training. "print_step": 50, // Number of steps to log traning on console. - "output_path": "../../../checkpoints/speaker_encoder/resnet_voxceleb1_and_voxceleb2-and-common-voice-all-continue/", // DATASET-RELATED: output path for all training outputs. + "output_path": "../../../checkpoints/speaker_encoder/resnet_voxceleb1_and_voxceleb2-and-common-voice-all/", // DATASET-RELATED: output path for all training outputs. "audio_augmentation": { "p": 0.5, // propability of apply this method, 0 is disable rir and additive noise augmentation diff --git a/TTS/speaker_encoder/configs/config_resnet_voxcebel_trainer.json b/TTS/speaker_encoder/configs/config_resnet_voxcebel_trainer.json deleted file mode 100644 index bf9d1c4c..00000000 --- a/TTS/speaker_encoder/configs/config_resnet_voxcebel_trainer.json +++ /dev/null @@ -1,109 +0,0 @@ -{ - "model_name": "resnet", - "run_name": "speaker_encoder", - "run_description": "train speaker encoder with VoxCeleb", - "audio":{ - // Audio processing parameters - "num_mels": 64, // size of the mel spec frame. - "fft_size": 1024, // number of stft frequency levels. Size of the linear spectogram frame. - "sample_rate": 16000, // DATASET-RELATED: wav sample-rate. If different than the original data, it is resampled. - "win_length": 1024, // stft window length in ms. - "hop_length": 256, // stft window hop-lengh in ms. - "frame_length_ms": null, // stft window length in ms.If null, 'win_length' is used. - "frame_shift_ms": null, // stft window hop-lengh in ms. If null, 'hop_length' is used. - "preemphasis": 0.98, // pre-emphasis to reduce spec noise and make it more structured. If 0.0, no -pre-emphasis. - "min_level_db": -100, // normalization range - "ref_level_db": 20, // reference level db, theoretically 20db is the sound of air. 
- "power": 1.5, // value to sharpen wav signals after GL algorithm. - "griffin_lim_iters": 60,// #griffin-lim iterations. 30-60 is a good range. Larger the value, slower the generation. - "stft_pad_mode": "reflect", - // Normalization parameters - "signal_norm": true, // normalize the spec values in range [0, 1] - "symmetric_norm": true, // move normalization to range [-1, 1] - "max_norm": 4.0, // scale normalization to range [-max_norm, max_norm] or [0, max_norm] - "clip_norm": true, // clip normalized values into the range. - "mel_fmin": 0.0, // minimum freq level for mel-spec. ~50 for male and ~95 for female voices. Tune for dataset!! - "mel_fmax": 8000.0, // maximum freq level for mel-spec. Tune for dataset!! - "spec_gain": 20.0, - "do_trim_silence": true, // enable trimming of slience of audio as you load it. LJspeech (false), TWEB (false), Nancy (true) - "trim_db": 60, // threshold for timming silence. Set this according to your dataset. - "stats_path": null // DO NOT USE WITH MULTI_SPEAKER MODEL. scaler stats file computed by 'compute_statistics.py'. If it is defined, mean-std based notmalization is used and other normalization params are ignored - }, - "reinit_layers": [], - - "loss": "softmaxproto", // "ge2e" to use Generalized End-to-End loss, "angleproto" to use Angular Prototypical loss and "softmaxproto" to use Softmax with Angular Prototypical loss - "grad_clip": 3.0, // upper limit for gradients for clipping. - "epochs": 1000, // total number of epochs to train. - "lr": 0.0001, // Initial learning rate. If Noam decay is active, maximum learning rate. - "lr_decay": false, // if true, Noam learning rate decaying is applied through training. - "warmup_steps": 4000, // Noam decay steps to increase the learning rate from 0 to "lr" - "tb_model_param_stats": false, // true, plots param stats per layer on tensorboard. Might be memory consuming, but good for debugging. - "steps_plot_stats": 10, // number of steps to plot embeddings. - - // Speakers config - "num_speakers_in_batch": 256, // Batch size for training. - "num_utters_per_speaker": 2, // - "skip_speakers": true, // skip speakers with samples less than "num_utters_per_speaker" - - "voice_len": 2, // number of seconds for each training instance - - "num_loader_workers": 8, // number of training data loader processes. Don't set it too big. 4-8 are good values. - "wd": 0.000001, // Weight decay weight. - "checkpoint": true, // If true, it saves checkpoints per "save_step" - "save_step": 2000, // Number of training steps expected to save traning stats and checkpoints. - "print_step": 20, // Number of steps to log traning on console. - "output_path": "../../../checkpoints/speaker_encoder/continue-training-voxceleb-trainer/", // DATASET-RELATED: output path for all training outputs. 
- - "audio_augmentation": { - "p": 0.75, // propability of apply this method, 0 is disable rir and additive noise augmentation - "rir":{ - "rir_path": "/workspace/store/ecasanova/ComParE/RIRS_NOISES/simulated_rirs/", - "conv_mode": "full" - }, - "additive":{ - "sounds_path": "/workspace/store/ecasanova/ComParE/musan/", - // list of each of the directories in your data augmentation, if a directory is in "sounds_path" but is not listed here it will be ignored - "speech":{ - "min_snr_in_db": 13, - "max_snr_in_db": 20, - "min_num_noises": 3, - "max_num_noises": 7 - }, - "noise":{ - "min_snr_in_db": 0, - "max_snr_in_db": 15, - "min_num_noises": 1, - "max_num_noises": 1 - }, - "music":{ - "min_snr_in_db": 5, - "max_snr_in_db": 15, - "min_num_noises": 1, - "max_num_noises": 1 - } - }, - //add a gaussian noise to the data in order to increase robustness - "gaussian":{ // as the insertion of Gaussian noise is quick to be calculated, we added it after loading the wav file, this way, even audios that were reused with the cache can receive this noise - "p": 0.5, // propability of apply this method, 0 is disable - "min_amplitude": 0.0, - "max_amplitude": 1e-5 - } - }, - "model": { - "input_dim": 64, - "proj_dim": 512 - }, - "storage": { - "sample_from_storage_p": 0.66, // the probability with which we'll sample from the DataSet in-memory storage - "storage_size": 25 // the size of the in-memory storage with respect to a single batch - }, - "datasets": - [ - { - "name": "voxceleb2", - "path": "/workspace/scratch/ecasanova/datasets/VoxCeleb/vox2_dev_aac/", - "meta_file_train": null, - "meta_file_val": null - } - ] -} \ No newline at end of file diff --git a/TTS/speaker_encoder/configs/config_resnet_voxceleb1and2.json b/TTS/speaker_encoder/configs/config_resnet_voxceleb1and2.json deleted file mode 100644 index 3fafd165..00000000 --- a/TTS/speaker_encoder/configs/config_resnet_voxceleb1and2.json +++ /dev/null @@ -1,117 +0,0 @@ - -{ - "model_name": "resnet", - "run_name": "speaker_encoder", - "run_description": "resnet speaker encoder trained with commonvoice all languages dev and train, Voxceleb dev and Voxceleb 2 dev", - // AUDIO PARAMETERS - "audio":{ - // Audio processing parameters - "num_mels": 80, // size of the mel spec frame. - "fft_size": 1024, // number of stft frequency levels. Size of the linear spectogram frame. - "sample_rate": 16000, // DATASET-RELATED: wav sample-rate. If different than the original data, it is resampled. - "win_length": 1024, // stft window length in ms. - "hop_length": 256, // stft window hop-lengh in ms. - "frame_length_ms": null, // stft window length in ms.If null, 'win_length' is used. - "frame_shift_ms": null, // stft window hop-lengh in ms. If null, 'hop_length' is used. - "preemphasis": 0.98, // pre-emphasis to reduce spec noise and make it more structured. If 0.0, no -pre-emphasis. - "min_level_db": -100, // normalization range - "ref_level_db": 20, // reference level db, theoretically 20db is the sound of air. - "power": 1.5, // value to sharpen wav signals after GL algorithm. - "griffin_lim_iters": 60,// #griffin-lim iterations. 30-60 is a good range. Larger the value, slower the generation. - "stft_pad_mode": "reflect", - // Normalization parameters - "signal_norm": true, // normalize the spec values in range [0, 1] - "symmetric_norm": true, // move normalization to range [-1, 1] - "max_norm": 4.0, // scale normalization to range [-max_norm, max_norm] or [0, max_norm] - "clip_norm": true, // clip normalized values into the range. 
- "mel_fmin": 0.0, // minimum freq level for mel-spec. ~50 for male and ~95 for female voices. Tune for dataset!! - "mel_fmax": 8000.0, // maximum freq level for mel-spec. Tune for dataset!! - "spec_gain": 20.0, - "do_trim_silence": false, // enable trimming of slience of audio as you load it. LJspeech (false), TWEB (false), Nancy (true) - "trim_db": 60, // threshold for timming silence. Set this according to your dataset. - "stats_path": null // DO NOT USE WITH MULTI_SPEAKER MODEL. scaler stats file computed by 'compute_statistics.py'. If it is defined, mean-std based notmalization is used and other normalization params are ignored - }, - "reinit_layers": [], - - "loss": "softmaxproto", // "ge2e" to use Generalized End-to-End loss, "angleproto" to use Angular Prototypical loss and "softmaxproto" to use Softmax with Angular Prototypical loss - "grad_clip": 3.0, // upper limit for gradients for clipping. - "epochs": 1000, // total number of epochs to train. - "lr": 0.0001, // Initial learning rate. If Noam decay is active, maximum learning rate. - "lr_decay": false, // if true, Noam learning rate decaying is applied through training. - "warmup_steps": 4000, // Noam decay steps to increase the learning rate from 0 to "lr" - "tb_model_param_stats": false, // true, plots param stats per layer on tensorboard. Might be memory consuming, but good for debugging. - "steps_plot_stats": 10, // number of steps to plot embeddings. - - // Speakers config - "num_speakers_in_batch": 200, // Batch size for training. - "num_utters_per_speaker": 2, // - "skip_speakers": true, // skip speakers with samples less than "num_utters_per_speaker" - - "voice_len": 2, // number of seconds for each training instance - - "num_loader_workers": 1, // number of training data loader processes. Don't set it too big. 4-8 are good values. - "wd": 0.000001, // Weight decay weight. - "checkpoint": true, // If true, it saves checkpoints per "save_step" - "save_step": 2000, // Number of training steps expected to save the best checkpoints in training. - "print_step": 20, // Number of steps to log traning on console. - "output_path": "../../../checkpoints/speaker_encoder/resnet_voxceleb12-pre-training/", // DATASET-RELATED: output path for all training outputs. 
- - "audio_augmentation": { - "p": 0.75, // propability of apply this method, 0 is disable rir and additive noise augmentation - "rir":{ - "rir_path": "/workspace/store/ecasanova/ComParE/RIRS_NOISES/simulated_rirs/", - "conv_mode": "full" - }, - "additive":{ - "sounds_path": "/workspace/store/ecasanova/ComParE/musan/", - // list of each of the directories in your data augmentation, if a directory is in "sounds_path" but is not listed here it will be ignored - "speech":{ - "min_snr_in_db": 13, - "max_snr_in_db": 20, - "min_num_noises": 3, - "max_num_noises": 7 - }, - "noise":{ - "min_snr_in_db": 0, - "max_snr_in_db": 15, - "min_num_noises": 1, - "max_num_noises": 1 - }, - "music":{ - "min_snr_in_db": 5, - "max_snr_in_db": 15, - "min_num_noises": 1, - "max_num_noises": 1 - } - }, - //add a gaussian noise to the data in order to increase robustness - "gaussian":{ // as the insertion of Gaussian noise is quick to be calculated, we added it after loading the wav file, this way, even audios that were reused with the cache can receive this noise - "p": 1, // propability of apply this method, 0 is disable - "min_amplitude": 0.0, - "max_amplitude": 1e-5 - } - }, - "model": { - "input_dim": 80, - "proj_dim": 512 - }, - "storage": { - "sample_from_storage_p": 0.66, // the probability with which we'll sample from the DataSet in-memory storage - "storage_size": 35 // the size of the in-memory storage with respect to a single batch - }, - "datasets": - [ - { - "name": "voxceleb2", - "path": "/workspace/scratch/ecasanova/datasets/VoxCeleb/vox2_dev_aac/", - "meta_file_train": null, - "meta_file_val": null - }, - { - "name": "voxceleb1", - "path": "/workspace/scratch/ecasanova/datasets/VoxCeleb/vox1_dev_wav/", - "meta_file_train": null, - "meta_file_val": null - } - ] -} \ No newline at end of file diff --git a/TTS/speaker_encoder/configs/config_softmaxproto.json b/TTS/speaker_encoder/configs/config_softmaxproto.json deleted file mode 100644 index 2283eb9c..00000000 --- a/TTS/speaker_encoder/configs/config_softmaxproto.json +++ /dev/null @@ -1,117 +0,0 @@ - -{ - "model_name": "resnet", - "run_name": "speaker_encoder", - "run_description": "train speaker encoder with VCTK", - "audio":{ - // Audio processing parameters - "num_mels": 80, // size of the mel spec frame. - "fft_size": 1024, // number of stft frequency levels. Size of the linear spectogram frame. - "sample_rate": 16000, // DATASET-RELATED: wav sample-rate. If different than the original data, it is resampled. - "win_length": 1024, // stft window length in ms. - "hop_length": 256, // stft window hop-lengh in ms. - "frame_length_ms": null, // stft window length in ms.If null, 'win_length' is used. - "frame_shift_ms": null, // stft window hop-lengh in ms. If null, 'hop_length' is used. - "preemphasis": 0.98, // pre-emphasis to reduce spec noise and make it more structured. If 0.0, no -pre-emphasis. - "min_level_db": -100, // normalization range - "ref_level_db": 20, // reference level db, theoretically 20db is the sound of air. - "power": 1.5, // value to sharpen wav signals after GL algorithm. - "griffin_lim_iters": 60,// #griffin-lim iterations. 30-60 is a good range. Larger the value, slower the generation. - "stft_pad_mode": "reflect", - // Normalization parameters - "signal_norm": true, // normalize the spec values in range [0, 1] - "symmetric_norm": true, // move normalization to range [-1, 1] - "max_norm": 4.0, // scale normalization to range [-max_norm, max_norm] or [0, max_norm] - "clip_norm": true, // clip normalized values into the range. 
- "mel_fmin": 0.0, // minimum freq level for mel-spec. ~50 for male and ~95 for female voices. Tune for dataset!! - "mel_fmax": 8000.0, // maximum freq level for mel-spec. Tune for dataset!! - "spec_gain": 20.0, - "do_trim_silence": true, // enable trimming of slience of audio as you load it. LJspeech (false), TWEB (false), Nancy (true) - "trim_db": 60, // threshold for timming silence. Set this according to your dataset. - "stats_path": null // DO NOT USE WITH MULTI_SPEAKER MODEL. scaler stats file computed by 'compute_statistics.py'. If it is defined, mean-std based notmalization is used and other normalization params are ignored - }, - "reinit_layers": [], - - "loss": "softmaxproto", // "ge2e" to use Generalized End-to-End loss, "angleproto" to use Angular Prototypical loss and "softmaxproto" to use Softmax with Angular Prototypical loss - "grad_clip": 3.0, // upper limit for gradients for clipping. - "epochs": 1000, // total number of epochs to train. - "lr": 0.0001, // Initial learning rate. If Noam decay is active, maximum learning rate. - "lr_decay": false, // if true, Noam learning rate decaying is applied through training. - "warmup_steps": 4000, // Noam decay steps to increase the learning rate from 0 to "lr" - "tb_model_param_stats": false, // true, plots param stats per layer on tensorboard. Might be memory consuming, but good for debugging. - "steps_plot_stats": 10, // number of steps to plot embeddings. - - // Speakers config - "num_speakers_in_batch": 128, // Batch size for training. - "num_utters_per_speaker": 2, // - "skip_speakers": true, // skip speakers with samples less than "num_utters_per_speaker" - - "voice_len": 2, // number of seconds for each training instance - - "num_loader_workers": 8, // number of training data loader processes. Don't set it too big. 4-8 are good values. - "wd": 0.000001, // Weight decay weight. - "checkpoint": true, // If true, it saves checkpoints per "save_step" - "save_step": 1000, // Number of training steps expected to save traning stats and checkpoints. - "print_step": 20, // Number of steps to log traning on console. - "output_path": "../../../checkpoints/speaker_encoder/", // DATASET-RELATED: output path for all training outputs. 
- - "audio_augmentation": { - "p": 0.75, // propability of apply this method, 0 is disable rir and additive noise augmentation - "rir":{ - "rir_path": "/workspace/store/ecasanova/ComParE/RIRS_NOISES/simulated_rirs/", - "conv_mode": "full" - }, - "additive":{ - "sounds_path": "/workspace/store/ecasanova/ComParE/musan/", - // list of each of the directories in your data augmentation, if a directory is in "sounds_path" but is not listed here it will be ignored - "speech":{ - "min_snr_in_db": 13, - "max_snr_in_db": 20, - "min_num_noises": 3, - "max_num_noises": 7 - }, - "noise":{ - "min_snr_in_db": 0, - "max_snr_in_db": 15, - "min_num_noises": 1, - "max_num_noises": 1 - }, - "music":{ - "min_snr_in_db": 5, - "max_snr_in_db": 15, - "min_num_noises": 1, - "max_num_noises": 1 - } - }, - //add a gaussian noise to the data in order to increase robustness - "gaussian":{ // as the insertion of Gaussian noise is quick to be calculated, we added it after loading the wav file, this way, even audios that were reused with the cache can receive this noise - "p": 1, // propability of apply this method, 0 is disable - "min_amplitude": 0.0, - "max_amplitude": 1e-5 - } - }, - "model": { - "input_dim": 80, - "proj_dim": 512 - }, - "storage": { - "sample_from_storage_p": 0.66, // the probability with which we'll sample from the DataSet in-memory storage - "storage_size": 1 // the size of the in-memory storage with respect to a single batch - }, - "datasets": - [ - - { - "name": "common_voice", - "path": "/workspace/scratch/ecasanova/datasets/common-voice/cv-corpus-6.1-2020-12-11_16khz/ab/", - "meta_file_train": "train.tsv", - "meta_file_val": "test.tsv" - }, - { - "name": "vctk", - "path": "/workspace/store/ecasanova/datasets/VCTK-Corpus-removed-silence/", - "meta_file_train": null, - "meta_file_val": null - } - ] -} \ No newline at end of file diff --git a/TTS/speaker_encoder/configs/config_temp.json b/TTS/speaker_encoder/configs/config_temp.json deleted file mode 100644 index 737b16d8..00000000 --- a/TTS/speaker_encoder/configs/config_temp.json +++ /dev/null @@ -1,117 +0,0 @@ - -{ - "model_name": "resnet", - "run_name": "speaker_encoder", - "run_description": "resnet speaker encoder trained with commonvoice all languages dev and train, Voxceleb dev and Voxceleb 2 dev", - // AUDIO PARAMETERS - "audio":{ - // Audio processing parameters - "num_mels": 80, // size of the mel spec frame. - "fft_size": 1024, // number of stft frequency levels. Size of the linear spectogram frame. - "sample_rate": 22050, //22050, // DATASET-RELATED: wav sample-rate. If different than the original data, it is resampled. - "win_length": 1024, // stft window length in ms. - "hop_length": 256, // stft window hop-lengh in ms. - "frame_length_ms": null, // stft window length in ms.If null, 'win_length' is used. - "frame_shift_ms": null, // stft window hop-lengh in ms. If null, 'hop_length' is used. - "preemphasis": 0.98, // pre-emphasis to reduce spec noise and make it more structured. If 0.0, no -pre-emphasis. - "min_level_db": -100, // normalization range - "ref_level_db": 20, // reference level db, theoretically 20db is the sound of air. - "power": 1.5, // value to sharpen wav signals after GL algorithm. - "griffin_lim_iters": 60,// #griffin-lim iterations. 30-60 is a good range. Larger the value, slower the generation. 
- "stft_pad_mode": "reflect", - // Normalization parameters - "signal_norm": true, // normalize the spec values in range [0, 1] - "symmetric_norm": true, // move normalization to range [-1, 1] - "max_norm": 4.0, // scale normalization to range [-max_norm, max_norm] or [0, max_norm] - "clip_norm": true, // clip normalized values into the range. - "mel_fmin": 0.0, // minimum freq level for mel-spec. ~50 for male and ~95 for female voices. Tune for dataset!! - "mel_fmax": 8000.0, // maximum freq level for mel-spec. Tune for dataset!! - "spec_gain": 20.0, - "do_trim_silence": false, // enable trimming of slience of audio as you load it. LJspeech (false), TWEB (false), Nancy (true) - "trim_db": 60, // threshold for timming silence. Set this according to your dataset. - "stats_path": null // DO NOT USE WITH MULTI_SPEAKER MODEL. scaler stats file computed by 'compute_statistics.py'. If it is defined, mean-std based notmalization is used and other normalization params are ignored - }, - "reinit_layers": [], - - "loss": "softmaxproto", // "ge2e" to use Generalized End-to-End loss, "angleproto" to use Angular Prototypical loss and "softmaxproto" to use Softmax with Angular Prototypical loss - "grad_clip": 3.0, // upper limit for gradients for clipping. - "epochs": 1000, // total number of epochs to train. - "lr": 0.0001, // Initial learning rate. If Noam decay is active, maximum learning rate. - "lr_decay": false, // if true, Noam learning rate decaying is applied through training. - "warmup_steps": 4000, // Noam decay steps to increase the learning rate from 0 to "lr" - "tb_model_param_stats": false, // true, plots param stats per layer on tensorboard. Might be memory consuming, but good for debugging. - "steps_plot_stats": 10, // number of steps to plot embeddings. - - // Speakers config - "num_speakers_in_batch": 256, // Batch size for training. - "num_utters_per_speaker": 2, // - "skip_speakers": true, // skip speakers with samples less than "num_utters_per_speaker" - - "voice_len": 2, // number of seconds for each training instance - - "num_loader_workers": 8, // number of training data loader processes. Don't set it too big. 4-8 are good values. - "wd": 0.000001, // Weight decay weight. - "checkpoint": true, // If true, it saves checkpoints per "save_step" - "save_step": 5000, // Number of training steps expected to save the best checkpoints in training. - "print_step": 20, // Number of steps to log traning on console. - "output_path": "../../../checkpoints/speaker_encoder/continue-training-voxceleb-trainer-test/", // DATASET-RELATED: output path for all training outputs. 
- - "audio_augmentation": { - "p": 0.75, // propability of apply this method, 0 is disable rir and additive noise augmentation - "rir":{ - "rir_path": "/workspace/store/ecasanova/ComParE/RIRS_NOISES/simulated_rirs/", - "conv_mode": "full" - }, - "additive":{ - "sounds_path": "/workspace/store/ecasanova/ComParE/musan/", - // list of each of the directories in your data augmentation, if a directory is in "sounds_path" but is not listed here it will be ignored - "speech":{ - "min_snr_in_db": 13, - "max_snr_in_db": 20, - "min_num_noises": 3, - "max_num_noises": 7 - }, - "noise":{ - "min_snr_in_db": 0, - "max_snr_in_db": 15, - "min_num_noises": 1, - "max_num_noises": 1 - }, - "music":{ - "min_snr_in_db": 5, - "max_snr_in_db": 15, - "min_num_noises": 1, - "max_num_noises": 1 - } - }, - //add a gaussian noise to the data in order to increase robustness - "gaussian":{ // as the insertion of Gaussian noise is quick to be calculated, we added it after loading the wav file, this way, even audios that were reused with the cache can receive this noise - "p": 1, // propability of apply this method, 0 is disable - "min_amplitude": 0.0, - "max_amplitude": 1e-5 - } - }, - "model": { - "input_dim": 80, - "proj_dim": 512 - }, - "storage": { - "sample_from_storage_p": 0.66, // the probability with which we'll sample from the DataSet in-memory storage - "storage_size": 35 // the size of the in-memory storage with respect to a single batch - }, - "datasets": - [ - { - "name": "common_voice", - "path": "/workspace/scratch/ecasanova/datasets/common-voice/cv-corpus-6.1-2020-12-11_16khz/pt", - "meta_file_train": "dev.tsv", - "meta_file_val": null - }, - { - "name": "common_voice", - "path": "/workspace/scratch/ecasanova/datasets/common-voice/cv-corpus-6.1-2020-12-11_16khz/pt", - "meta_file_train": "dev.tsv", - "meta_file_val": null - } - ] -} \ No newline at end of file diff --git a/TTS/speaker_encoder/dataset.py b/TTS/speaker_encoder/dataset.py index 9c308a73..0b673753 100644 --- a/TTS/speaker_encoder/dataset.py +++ b/TTS/speaker_encoder/dataset.py @@ -8,18 +8,17 @@ from TTS.speaker_encoder.utils.generic_utils import AugmentWAV, Storage class MyDataset(Dataset): def __init__( - self, - ap, - meta_data, - voice_len=1.6, - num_speakers_in_batch=64, - storage_size=1, - sample_from_storage_p=0.5, - additive_noise= 1e-5, - num_utter_per_speaker=10, - skip_speakers=False, - verbose=False, - augmentation_config=None + self, + ap, + meta_data, + voice_len=1.6, + num_speakers_in_batch=64, + storage_size=1, + sample_from_storage_p=0.5, + num_utter_per_speaker=10, + skip_speakers=False, + verbose=False, + augmentation_config=None ): """ Args: @@ -105,23 +104,6 @@ class MyDataset(Dataset): self.speakers = [k for (k, v) in self.speaker_to_utters.items()] - # def __parse_items(self): - # """ - # Find unique speaker ids and create a dict mapping utterances from speaker id - # """ - # speakers = list({item[-1] for item in self.items}) - # self.speaker_to_utters = {} - # self.speakers = [] - # for speaker in speakers: - # speaker_utters = [item[1] for item in self.items if item[2] == speaker] - # if len(speaker_utters) < self.num_utter_per_speaker and self.skip_speakers: - # print( - # f" [!] Skipped speaker {speaker}. Not enough utterances {self.num_utter_per_speaker} vs {len(speaker_utters)}." 
- # ) - # else: - # self.speakers.append(speaker) - # self.speaker_to_utters[speaker] = speaker_utters - def __len__(self): return int(1e10) @@ -197,22 +179,26 @@ class MyDataset(Dataset): labels = [] feats = [] speakers = set() - from_disk = 0 - from_storage = 0 + for speaker, speaker_id in batch: speaker_id = int(speaker_id) # ensure that an speaker appears only once in the batch if speaker_id in speakers: + + # remove current speaker + if speaker_id in speakers_id_in_batch: + speakers_id_in_batch.remove(speaker_id) + speaker, _ = self.__sample_speaker(ignore_speakers=speakers_id_in_batch) speaker_id = self.speakerid_to_classid[speaker] + speakers_id_in_batch.add(speaker_id) if random.random() < self.sample_from_storage_p and self.storage.full(): # sample from storage (if full) - # print(help(self.storage)) wavs_, labels_ = self.storage.get_random_sample_fast() - from_storage += 1 - # force choose the current speaker or other not in batch + + # force choose the current speaker or other not in batch # It's necessary for ideal training with AngleProto and GE2E losses if labels_[0] in speakers_id_in_batch and labels_[0] != speaker_id: attempts = 0 @@ -225,13 +211,10 @@ class MyDataset(Dataset): # Try 5 times after that load from disk if attempts >= 5: wavs_, labels_ = self.__load_from_disk_and_storage(speaker) - from_storage -= 1 - from_disk += 1 break else: # don't sample from storage, but from HDD wavs_, labels_ = self.__load_from_disk_and_storage(speaker) - from_disk += 1 # append speaker for control speakers.add(labels_[0]) @@ -258,7 +241,7 @@ class MyDataset(Dataset): feats.extend(feats_) if self.num_speakers_in_batch != len(speakers): - raise ValueError('Speakers appear more than once on the Batch. This cannot happen because the loss functions AngleProto and GE2E consider these samples to be from another speaker.') + raise ValueError('Error: Speakers appear more than once on the Batch. 
This cannot happen because the loss functions AngleProto and GE2E consider these samples to be from another speaker.') feats = torch.stack(feats) labels = torch.stack(labels) diff --git a/TTS/speaker_encoder/losses.py b/TTS/speaker_encoder/losses.py index e521fe81..f0165739 100644 --- a/TTS/speaker_encoder/losses.py +++ b/TTS/speaker_encoder/losses.py @@ -103,18 +103,18 @@ class GE2ELoss(nn.Module): L.append(L_row) return torch.stack(L) - def forward(self, dvecs, label=None): + def forward(self, x, label=None): """ Calculates the GE2E loss for an input of dimensions (num_speakers, num_utts_per_speaker, dvec_feats) """ assert x.size()[1] >= 2 - centroids = torch.mean(dvecs, 1) - cos_sim_matrix = self.calc_cosine_sim(dvecs, centroids) + centroids = torch.mean(x, 1) + cos_sim_matrix = self.calc_cosine_sim(x, centroids) torch.clamp(self.w, 1e-6) cos_sim_matrix = self.w * cos_sim_matrix + self.b - L = self.embed_loss(dvecs, cos_sim_matrix) + L = self.embed_loss(x, cos_sim_matrix) return L.mean() @@ -141,6 +141,7 @@ class AngleProtoLoss(nn.Module): print(" > Initialised Angular Prototypical loss") + # pylint: disable=W0613 def forward(self, x, label=None): """ Calculates the AngleProto loss for an input of dimensions (num_speakers, num_utts_per_speaker, dvec_feats) diff --git a/TTS/speaker_encoder/models/resnet.py b/TTS/speaker_encoder/models/resnet.py index 23464527..0cb7e0a8 100644 --- a/TTS/speaker_encoder/models/resnet.py +++ b/TTS/speaker_encoder/models/resnet.py @@ -1,18 +1,16 @@ import torch import numpy as np import torch.nn as nn -import torch.nn.functional as F -import numpy as np class SELayer(nn.Module): def __init__(self, channel, reduction=8): super(SELayer, self).__init__() self.avg_pool = nn.AdaptiveAvgPool2d(1) self.fc = nn.Sequential( - nn.Linear(channel, channel // reduction), - nn.ReLU(inplace=True), - nn.Linear(channel // reduction, channel), - nn.Sigmoid() + nn.Linear(channel, channel // reduction), + nn.ReLU(inplace=True), + nn.Linear(channel // reduction, channel), + nn.Sigmoid() ) def forward(self, x): @@ -57,16 +55,17 @@ class ResNetSpeakerEncoder(nn.Module): """Implementation of the model H/ASP without batch normalization in speaker embedding. 
This model was proposed in: https://arxiv.org/abs/2009.14153 Adapted from: https://github.com/clovaai/voxceleb_trainer """ + # pylint: disable=W0102 def __init__(self, input_dim=64, proj_dim=512, layers=[3, 4, 6, 3], num_filters=[32, 64, 128, 256], encoder_type='ASP', log_input=False): super(ResNetSpeakerEncoder, self).__init__() self.encoder_type = encoder_type self.input_dim = input_dim self.log_input = log_input - self.conv1 = nn.Conv2d(1, num_filters[0] , kernel_size=3, stride=1, padding=1) + self.conv1 = nn.Conv2d(1, num_filters[0], kernel_size=3, stride=1, padding=1) self.relu = nn.ReLU(inplace=True) self.bn1 = nn.BatchNorm2d(num_filters[0]) - + self.inplanes = num_filters[0] self.layer1 = self.create_layer(SEBasicBlock, num_filters[0], layers[0]) self.layer2 = self.create_layer(SEBasicBlock, num_filters[1], layers[1], stride=(2, 2)) @@ -116,11 +115,12 @@ class ResNetSpeakerEncoder(nn.Module): layers = [] layers.append(block(self.inplanes, planes, stride, downsample)) self.inplanes = planes * block.expansion - for i in range(1, blocks): + for _ in range(1, blocks): layers.append(block(self.inplanes, planes)) return nn.Sequential(*layers) + # pylint: disable=R0201 def new_parameter(self, *size): out = nn.Parameter(torch.FloatTensor(*size)) nn.init.xavier_normal_(out) @@ -130,7 +130,8 @@ class ResNetSpeakerEncoder(nn.Module): x = x.transpose(1, 2) with torch.no_grad(): with torch.cuda.amp.autocast(enabled=False): - if self.log_input: x = (x+1e-6).log() + if self.log_input: + x = (x+1e-6).log() x = self.instancenorm(x).unsqueeze(1) x = self.conv1(x) @@ -150,8 +151,8 @@ class ResNetSpeakerEncoder(nn.Module): x = torch.sum(x * w, dim=2) elif self.encoder_type == "ASP": mu = torch.sum(x * w, dim=2) - sg = torch.sqrt((torch.sum((x**2) * w, dim=2) - mu ** 2 ).clamp(min=1e-5) ) - x = torch.cat((mu, sg),1) + sg = torch.sqrt((torch.sum((x**2) * w, dim=2) - mu ** 2).clamp(min=1e-5)) + x = torch.cat((mu, sg), 1) x = x.view(x.size()[0], -1) x = self.fc(x) @@ -177,11 +178,11 @@ class ResNetSpeakerEncoder(nn.Module): for offset in offsets: offset = int(offset) end_offset = int(offset+num_frames) - frames = x[:,offset:end_offset] + frames = x[:, offset:end_offset] embed = self.forward(frames, training=False) embeddings.append(embed) embeddings = torch.stack(embeddings) if return_mean: embeddings = torch.mean(embeddings, dim=0) - return embeddings \ No newline at end of file + return embeddings diff --git a/TTS/speaker_encoder/utils/generic_utils.py b/TTS/speaker_encoder/utils/generic_utils.py index 78e7cb49..ff8f0447 100644 --- a/TTS/speaker_encoder/utils/generic_utils.py +++ b/TTS/speaker_encoder/utils/generic_utils.py @@ -52,47 +52,18 @@ class Storage(object): storage_size -= self.num_speakers_in_batch return self.storage[random.randint(0, storage_size)] + def get_random_sample_fast(self): '''Call this method only when storage is full''' return self.storage[random.randint(0, self.safe_storage_size)] - + class AugmentWAV(object): def __init__(self, ap, augmentation_config): self.ap = ap - - '''augmentation_config = { - "p": 1, - "rir":{ - "rir_path": "rir_path/" - "conv_mode": "full" - }, - "additive":{ - "sounds_path": "musan/", - # directorys in sounds_path - "speech":{ - "min_snr_in_db": 13, - "max_snr_in_db": 20, - "min_num_noises": 3, - "max_num_noises": 7 - }, - "noise":{ - "min_snr_in_db": 0, - "max_snr_in_db": 15, - "min_num_noises": 1, - "max_num_noises": 1 - }, - "music":{ - "min_snr_in_db": 5, - "max_snr_in_db": 15, - "min_num_noises": 1, - "max_num_noises": 1 - } - } - }''' - 
self.use_additive_noise = False + if 'additive' in augmentation_config.keys(): self.additive_noise_config = augmentation_config['additive'] additive_path = self.additive_noise_config['sounds_path'] @@ -104,7 +75,7 @@ class AugmentWAV(object): if isinstance(self.additive_noise_config[key], dict): self.additive_noise_types.append(key) - additive_files = glob.glob(os.path.join(additive_path,'**/*.wav'), recursive=True) + additive_files = glob.glob(os.path.join(additive_path, '**/*.wav'), recursive=True) self.noise_list = {} @@ -118,12 +89,13 @@ class AugmentWAV(object): self.noise_list[noise_dir].append(wav_file) print(f" | > Using Additive Noise Augmentation: with {len(additive_files)} audios instances from {self.additive_noise_types}") - + self.use_rir = False + if 'rir' in augmentation_config.keys(): self.rir_config = augmentation_config['rir'] if self.rir_config['rir_path']: - self.rir_files = glob.glob(os.path.join(self.rir_config['rir_path'],'**/*.wav'), recursive=True) + self.rir_files = glob.glob(os.path.join(self.rir_config['rir_path'], '**/*.wav'), recursive=True) self.use_rir = True print(f" | > Using RIR Noise Augmentation: with {len(self.rir_files)} audios instances") @@ -161,9 +133,8 @@ class AugmentWAV(object): else: noises_wav += noise_wav - # if all possibel files is less than audio, choose other files + # if all possible files is less than audio, choose other files if noises_wav is None: - print("audio ignorado") return self.additive_noise(noise_type, audio) return audio + noises_wav @@ -172,24 +143,21 @@ class AugmentWAV(object): audio_len = audio.shape[0] rir_file = random.choice(self.rir_files) - rir = self.ap.load_wav(rir_file, sr=self.ap.sample_rate) rir = rir / np.sqrt(np.sum(rir ** 2)) return signal.convolve(audio, rir, mode=self.rir_config['conv_mode'])[:audio_len] def apply_one(self, audio): - return self.reverberate(audio) noise_type = random.choice(self.global_noise_list) if noise_type == "RIR_AUG": return self.reverberate(audio) - else: - return self.additive_noise(noise_type, audio) + + return self.additive_noise(noise_type, audio) def to_camel(text): text = text.capitalize() return re.sub(r"(?!^)_([a-zA-Z])", lambda m: m.group(1).upper(), text) - def setup_model(c): if c.model_name.lower() == 'lstm': model = LSTMSpeakerEncoder(c.model["input_dim"], c.model["proj_dim"], c.model["lstm_dim"], c.model["num_lstm_layers"]) @@ -286,8 +254,9 @@ def check_config_speaker_encoder(c): check_argument("model", c, restricted=True, val_type=dict) check_argument("model_name", c, restricted=True, val_type=str) check_argument("input_dim", c["model"], restricted=True, val_type=int) + check_argument("proj_dim", c["model"], restricted=True, val_type=int) + if c.model_name.lower() == 'lstm': - check_argument("proj_dim", c["model"], restricted=True, val_type=int) check_argument("lstm_dim", c["model"], restricted=True, val_type=int) check_argument("num_lstm_layers", c["model"], restricted=True, val_type=int) check_argument("use_lstm_with_projection", c["model"], restricted=True, val_type=bool) From c90037c2e9b384d569154e1b5dab2ef43b3498d5 Mon Sep 17 00:00:00 2001 From: Edresson Date: Wed, 26 May 2021 16:01:30 -0300 Subject: [PATCH 09/18] solve merge problems --- TTS/bin/train_encoder.py | 21 +++-- TTS/config/shared_configs.py | 2 +- .../configs/config_resnet_angleproto.json | 14 ++-- .../config_resnet_softmax_angleproto.json | 8 +- TTS/speaker_encoder/dataset.py | 3 - TTS/speaker_encoder/losses.py | 5 +- TTS/speaker_encoder/speaker_encoder_config.py | 14 +++- 
TTS/speaker_encoder/utils/generic_utils.py | 83 ++----------------- 8 files changed, 40 insertions(+), 110 deletions(-) diff --git a/TTS/bin/train_encoder.py b/TTS/bin/train_encoder.py index 7e9f662f..a4191dfb 100644 --- a/TTS/bin/train_encoder.py +++ b/TTS/bin/train_encoder.py @@ -12,7 +12,7 @@ from torch.utils.data import DataLoader from TTS.speaker_encoder.dataset import MyDataset from TTS.speaker_encoder.losses import AngleProtoLoss, GE2ELoss, SoftmaxAngleProtoLoss -from TTS.speaker_encoder.utils.generic_utils import check_config_speaker_encoder, save_best_model, setup_model +from TTS.speaker_encoder.utils.generic_utils import save_best_model, setup_model from TTS.speaker_encoder.utils.visual import plot_embeddings from TTS.tts.datasets.preprocess import load_meta_data @@ -38,15 +38,16 @@ def setup_loader(ap: AudioProcessor, is_val: bool = False, verbose: bool = False dataset = MyDataset( ap, meta_data_eval if is_val else meta_data_train, - voice_len=getattr(c, "voice_len", 1.6), + voice_len=c.voice_len, num_utter_per_speaker=c.num_utters_per_speaker, num_speakers_in_batch=c.num_speakers_in_batch, - skip_speakers=getattr(c, "skip_speakers", False), + skip_speakers=c.skip_speakers, storage_size=c.storage["storage_size"], sample_from_storage_p=c.storage["sample_from_storage_p"], verbose=verbose, - augmentation_config=getattr(c, "audio_augmentation", None) + augmentation_config=c.audio_augmentation ) + # sampler = DistributedSampler(dataset) if num_gpus > 1 else None loader = DataLoader( dataset, @@ -133,17 +134,15 @@ def train(model, optimizer, scheduler, criterion, data_loader, global_step): ) avg_loss_all += avg_loss - if global_step % c.save_step == 0: - # save best model + if global_step >= c.max_train_step or global_step % c.save_step == 0: + # save best model only best_loss = save_best_model(model, optimizer, criterion, avg_loss, best_loss, OUT_PATH, global_step) avg_loss_all = 0 - end_time = time.time() - - # checkpoint and check stop train cond. - if global_step >= c.max_train_step or global_step % c.save_step == 0: - save_checkpoint(model, optimizer, avg_loss, OUT_PATH, global_step) if global_step >= c.max_train_step: break + + end_time = time.time() + return avg_loss, global_step diff --git a/TTS/config/shared_configs.py b/TTS/config/shared_configs.py index 69f1ee31..a7976db7 100644 --- a/TTS/config/shared_configs.py +++ b/TTS/config/shared_configs.py @@ -226,7 +226,7 @@ class BaseTrainingConfig(Coqpit): run_description: str = "" # training params epochs: int = 10000 - batch_size: int = MISSING + batch_size: int = None eval_batch_size: int = None mixed_precision: bool = False # eval params diff --git a/TTS/speaker_encoder/configs/config_resnet_angleproto.json b/TTS/speaker_encoder/configs/config_resnet_angleproto.json index 7cae1b25..95cf5ccf 100644 --- a/TTS/speaker_encoder/configs/config_resnet_angleproto.json +++ b/TTS/speaker_encoder/configs/config_resnet_angleproto.json @@ -1,5 +1,5 @@ { - "model_name": "resnet", + "model": "speaker_encoder", "run_name": "speaker_encoder", "run_description": "resnet speaker encoder trained with commonvoice all languages dev and train, Voxceleb 1 dev and Voxceleb 2 dev", // AUDIO PARAMETERS @@ -34,7 +34,7 @@ "loss": "angleproto", // "ge2e" to use Generalized End-to-End loss, "angleproto" to use Angular Prototypical loss and "softmaxproto" to use Softmax with Angular Prototypical loss "grad_clip": 3.0, // upper limit for gradients for clipping. - "epochs": 1000, // total number of epochs to train. 
+ "max_train_step": 1000000, // total number of steps to train. "lr": 0.0001, // Initial learning rate. If Noam decay is active, maximum learning rate. "lr_decay": false, // if true, Noam learning rate decaying is applied through training. "warmup_steps": 4000, // Noam decay steps to increase the learning rate from 0 to "lr" @@ -45,15 +45,14 @@ "num_speakers_in_batch": 200, // Batch size for training. "num_utters_per_speaker": 2, // "skip_speakers": true, // skip speakers with samples less than "num_utters_per_speaker" - "voice_len": 2, // number of seconds for each training instance - "num_loader_workers": 8, // number of training data loader processes. Don't set it too big. 4-8 are good values. + "num_loader_workers": 4, // number of training data loader processes. Don't set it too big. 4-8 are good values. "wd": 0.000001, // Weight decay weight. "checkpoint": true, // If true, it saves checkpoints per "save_step" "save_step": 1000, // Number of training steps expected to save the best checkpoints in training. "print_step": 50, // Number of steps to log traning on console. - "output_path": "../../../checkpoints/speaker_encoder/resnet_voxceleb1_and_voxceleb2-and-common-voice-all-using-angleproto/", // DATASET-RELATED: output path for all training outputs. + "output_path": "../checkpoints/speaker_encoder/resnet_voxceleb1_and_voxceleb2-and-common-voice-all-using-angleproto-continue/", // DATASET-RELATED: output path for all training outputs. "audio_augmentation": { "p": 0.5, // propability of apply this method, 0 is disable rir and additive noise augmentation @@ -90,12 +89,13 @@ "max_amplitude": 1e-5 } }, - "model": { + "model_params": { + "model_name": "resnet", "input_dim": 80, "proj_dim": 512 }, "storage": { - "sample_from_storage_p": 0.66, // the probability with which we'll sample from the DataSet in-memory storage + "sample_from_storage_p": 0.5, // the probability with which we'll sample from the DataSet in-memory storage "storage_size": 35 // the size of the in-memory storage with respect to a single batch }, "datasets": diff --git a/TTS/speaker_encoder/configs/config_resnet_softmax_angleproto.json b/TTS/speaker_encoder/configs/config_resnet_softmax_angleproto.json index 4baca49a..ccbd751a 100644 --- a/TTS/speaker_encoder/configs/config_resnet_softmax_angleproto.json +++ b/TTS/speaker_encoder/configs/config_resnet_softmax_angleproto.json @@ -1,6 +1,6 @@ { - "model_name": "resnet", + "model": "speaker_encoder", "run_name": "speaker_encoder", "run_description": "resnet speaker encoder trained with commonvoice all languages dev and train, Voxceleb 1 dev and Voxceleb 2 dev", // AUDIO PARAMETERS @@ -35,7 +35,7 @@ "loss": "softmaxproto", // "ge2e" to use Generalized End-to-End loss, "angleproto" to use Angular Prototypical loss and "softmaxproto" to use Softmax with Angular Prototypical loss "grad_clip": 3.0, // upper limit for gradients for clipping. - "epochs": 1000, // total number of epochs to train. + "max_train_step": 1000000, // total number of steps to train. "lr": 0.0001, // Initial learning rate. If Noam decay is active, maximum learning rate. "lr_decay": false, // if true, Noam learning rate decaying is applied through training. "warmup_steps": 4000, // Noam decay steps to increase the learning rate from 0 to "lr" @@ -46,7 +46,6 @@ "num_speakers_in_batch": 200, // Batch size for training. 
"num_utters_per_speaker": 2, // "skip_speakers": true, // skip speakers with samples less than "num_utters_per_speaker" - "voice_len": 2, // number of seconds for each training instance "num_loader_workers": 8, // number of training data loader processes. Don't set it too big. 4-8 are good values. @@ -91,7 +90,8 @@ "max_amplitude": 1e-5 } }, - "model": { + "model_params": { + "model_name": "resnet", "input_dim": 80, "proj_dim": 512 }, diff --git a/TTS/speaker_encoder/dataset.py b/TTS/speaker_encoder/dataset.py index 0b673753..45a7bc12 100644 --- a/TTS/speaker_encoder/dataset.py +++ b/TTS/speaker_encoder/dataset.py @@ -240,9 +240,6 @@ class MyDataset(Dataset): labels.append(torch.LongTensor(labels_)) feats.extend(feats_) - if self.num_speakers_in_batch != len(speakers): - raise ValueError('Error: Speakers appear more than once on the Batch. This cannot happen because the loss functions AngleProto and GE2E consider these samples to be from another speaker.') - feats = torch.stack(feats) labels = torch.stack(labels) diff --git a/TTS/speaker_encoder/losses.py b/TTS/speaker_encoder/losses.py index 64f0773d..9b573b6d 100644 --- a/TTS/speaker_encoder/losses.py +++ b/TTS/speaker_encoder/losses.py @@ -103,7 +103,7 @@ class GE2ELoss(nn.Module): L.append(L_row) return torch.stack(L) - def forward(self, x, label=None): + def forward(self, x, _label=None): """ Calculates the GE2E loss for an input of dimensions (num_speakers, num_utts_per_speaker, dvec_feats) """ @@ -141,8 +141,7 @@ class AngleProtoLoss(nn.Module): print(" > Initialized Angular Prototypical loss") - # pylint: disable=W0613 - def forward(self, x, label=None): + def forward(self, x, _label=None): """ Calculates the AngleProto loss for an input of dimensions (num_speakers, num_utts_per_speaker, dvec_feats) """ diff --git a/TTS/speaker_encoder/speaker_encoder_config.py b/TTS/speaker_encoder/speaker_encoder_config.py index dcba3b6c..31149822 100644 --- a/TTS/speaker_encoder/speaker_encoder_config.py +++ b/TTS/speaker_encoder/speaker_encoder_config.py @@ -13,11 +13,11 @@ class SpeakerEncoderConfig(BaseTrainingConfig): model: str = "speaker_encoder" audio: BaseAudioConfig = field(default_factory=BaseAudioConfig) datasets: List[BaseDatasetConfig] = field(default_factory=lambda: [BaseDatasetConfig()]) - # model params model_params: dict = field( default_factory=lambda: { - "input_dim": 40, + "model_name": "lstm", + "input_dim": 80, "proj_dim": 256, "lstm_dim": 768, "num_lstm_layers": 3, @@ -25,16 +25,20 @@ class SpeakerEncoderConfig(BaseTrainingConfig): } ) + audio_augmentation : dict = field( + default_factory=lambda: { + } + ) + storage: dict = field( default_factory=lambda: { "sample_from_storage_p": 0.66, # the probability with which we'll sample from the DataSet in-memory storage "storage_size": 15, # the size of the in-memory storage with respect to a single batch - "additive_noise": 1e-5, # add very small gaussian noise to the data in order to increase robustness } ) # training params - max_train_step: int = 1000 # end training when number of training steps reaches this value. + max_train_step: int = 1000000 # end training when number of training steps reaches this value. 
loss: str = "angleproto" grad_clip: float = 3.0 lr: float = 0.0001 @@ -53,6 +57,8 @@ class SpeakerEncoderConfig(BaseTrainingConfig): num_speakers_in_batch: int = MISSING num_utters_per_speaker: int = MISSING num_loader_workers: int = MISSING + skip_speakers: bool = False + voice_len: float = 1.6 def check_values(self): super().check_values() diff --git a/TTS/speaker_encoder/utils/generic_utils.py b/TTS/speaker_encoder/utils/generic_utils.py index f3c4b8f8..3299f75a 100644 --- a/TTS/speaker_encoder/utils/generic_utils.py +++ b/TTS/speaker_encoder/utils/generic_utils.py @@ -1,16 +1,17 @@ import re +import os import numpy as np import torch import glob import random +import datetime from scipy import signal from multiprocessing import Manager from TTS.speaker_encoder.models.lstm import LSTMSpeakerEncoder from TTS.speaker_encoder.models.resnet import ResNetSpeakerEncoder -from TTS.utils.generic_utils import check_argument class Storage(object): def __init__(self, maxsize, storage_batchs, num_speakers_in_batch, num_threads=8): @@ -157,10 +158,10 @@ def to_camel(text): return re.sub(r"(?!^)_([a-zA-Z])", lambda m: m.group(1).upper(), text) def setup_model(c): - if c.model_name.lower() == 'lstm': - model = LSTMSpeakerEncoder(c.model["input_dim"], c.model["proj_dim"], c.model["lstm_dim"], c.model["num_lstm_layers"]) - elif c.model_name.lower() == 'resnet': - model = ResNetSpeakerEncoder(input_dim=c.model["input_dim"], proj_dim=c.model["proj_dim"]) + if c.model_params['model_name'].lower() == 'lstm': + model = LSTMSpeakerEncoder(c.model_params["input_dim"], c.model_params["proj_dim"], c.model_params["lstm_dim"], c.model_params["num_lstm_layers"]) + elif c.model_params['model_name'].lower() == 'resnet': + model = ResNetSpeakerEncoder(input_dim=c.model_params["input_dim"], proj_dim=c.model_params["proj_dim"]) return model def save_checkpoint(model, optimizer, criterion, model_loss, out_path, current_step, epoch): @@ -198,75 +199,3 @@ def save_best_model(model, optimizer, criterion, model_loss, best_loss, out_path print("\n > BEST MODEL ({0:.5f}) : {1:}".format(model_loss, bestmodel_path)) torch.save(state, bestmodel_path) return best_loss - - -def check_config_speaker_encoder(c): - """Check the config.json file of the speaker encoder""" - check_argument("run_name", c, restricted=True, val_type=str) - check_argument("run_description", c, val_type=str) - - # audio processing parameters - check_argument("audio", c, restricted=True, val_type=dict) - check_argument("num_mels", c["audio"], restricted=True, val_type=int, min_val=10, max_val=2056) - check_argument("fft_size", c["audio"], restricted=True, val_type=int, min_val=128, max_val=4058) - check_argument("sample_rate", c["audio"], restricted=True, val_type=int, min_val=512, max_val=100000) - check_argument( - "frame_length_ms", - c["audio"], - restricted=True, - val_type=float, - min_val=10, - max_val=1000, - alternative="win_length", - ) - check_argument( - "frame_shift_ms", c["audio"], restricted=True, val_type=float, min_val=1, max_val=1000, alternative="hop_length" - ) - check_argument("preemphasis", c["audio"], restricted=True, val_type=float, min_val=0, max_val=1) - check_argument("min_level_db", c["audio"], restricted=True, val_type=int, min_val=-1000, max_val=10) - check_argument("ref_level_db", c["audio"], restricted=True, val_type=int, min_val=0, max_val=1000) - check_argument("power", c["audio"], restricted=True, val_type=float, min_val=1, max_val=5) - check_argument("griffin_lim_iters", c["audio"], restricted=True, val_type=int, min_val=10, 
max_val=1000) - - # training parameters - check_argument("loss", c, enum_list=["ge2e", "angleproto", "softmaxproto"], restricted=True, val_type=str) - check_argument("grad_clip", c, restricted=True, val_type=float) - check_argument("epochs", c, restricted=True, val_type=int, min_val=1) - check_argument("lr", c, restricted=True, val_type=float, min_val=0) - check_argument("lr_decay", c, restricted=True, val_type=bool) - check_argument("warmup_steps", c, restricted=True, val_type=int, min_val=0) - check_argument("tb_model_param_stats", c, restricted=True, val_type=bool) - check_argument("num_speakers_in_batch", c, restricted=True, val_type=int) - check_argument("num_loader_workers", c, restricted=True, val_type=int) - check_argument("wd", c, restricted=True, val_type=float, min_val=0.0, max_val=1.0) - - # checkpoint and output parameters - check_argument("steps_plot_stats", c, restricted=True, val_type=int) - check_argument("checkpoint", c, restricted=True, val_type=bool) - check_argument("save_step", c, restricted=True, val_type=int) - check_argument("print_step", c, restricted=True, val_type=int) - check_argument("output_path", c, restricted=True, val_type=str) - - # model parameters - check_argument("model", c, restricted=True, val_type=dict) - check_argument("model_name", c, restricted=True, val_type=str) - check_argument("input_dim", c["model"], restricted=True, val_type=int) - check_argument("proj_dim", c["model"], restricted=True, val_type=int) - - if c.model_name.lower() == 'lstm': - check_argument("lstm_dim", c["model"], restricted=True, val_type=int) - check_argument("num_lstm_layers", c["model"], restricted=True, val_type=int) - check_argument("use_lstm_with_projection", c["model"], restricted=True, val_type=bool) - - # in-memory storage parameters - check_argument("storage", c, restricted=True, val_type=dict) - check_argument("sample_from_storage_p", c["storage"], restricted=True, val_type=float, min_val=0.0, max_val=1.0) - check_argument("storage_size", c["storage"], restricted=True, val_type=int, min_val=1, max_val=100) - - # datasets - checking only the first entry - check_argument("datasets", c, restricted=True, val_type=list) - for dataset_entry in c["datasets"]: - check_argument("name", dataset_entry, restricted=True, val_type=str) - check_argument("path", dataset_entry, restricted=True, val_type=str) - check_argument("meta_file_train", dataset_entry, restricted=True, val_type=[str, list]) - check_argument("meta_file_val", dataset_entry, restricted=True, val_type=str) From 7a9a27282ace3756f7c46e5e8156e3ee86b81f94 Mon Sep 17 00:00:00 2001 From: Edresson Date: Wed, 26 May 2021 18:14:06 -0300 Subject: [PATCH 10/18] fix unit tests --- tests/test_speaker_encoder.py | 20 +++----------------- 1 file changed, 3 insertions(+), 17 deletions(-) diff --git a/tests/test_speaker_encoder.py b/tests/test_speaker_encoder.py index 8939ccf6..3b45d2e2 100644 --- a/tests/test_speaker_encoder.py +++ b/tests/test_speaker_encoder.py @@ -4,8 +4,8 @@ import torch as T from tests import get_tests_input_path from TTS.speaker_encoder.losses import AngleProtoLoss, GE2ELoss -from TTS.speaker_encoder.model import SpeakerEncoder - +from TTS.speaker_encoder.models.lstm import LSTMSpeakerEncoder +# from TTS.speaker_encoder.models.resnet import ResNetSpeakerEncoder file_path = get_tests_input_path() @@ -14,7 +14,7 @@ class SpeakerEncoderTests(unittest.TestCase): def test_in_out(self): dummy_input = T.rand(4, 20, 80) # B x T x D dummy_hidden = [T.rand(2, 4, 128), T.rand(2, 4, 128)] - model = 
SpeakerEncoder(input_dim=80, proj_dim=256, lstm_dim=768, num_lstm_layers=3) + model = LSTMSpeakerEncoder(input_dim=80, proj_dim=256, lstm_dim=768, num_lstm_layers=3) # computing d vectors output = model.forward(dummy_input) assert output.shape[0] == 4 @@ -96,17 +96,3 @@ class AngleProtoLossTests(unittest.TestCase): loss = AngleProtoLoss() output = loss.forward(dummy_input) assert output.item() < 0.005 - - -# class LoaderTest(unittest.TestCase): -# def test_output(self): -# items = libri_tts("/home/erogol/Data/Libri-TTS/train-clean-360/") -# ap = AudioProcessor(**c['audio']) -# dataset = MyDataset(ap, items, 1.6, 64, 10) -# loader = DataLoader(dataset, batch_size=32, shuffle=False, num_workers=0, collate_fn=dataset.collate_fn) -# count = 0 -# for mel, spk in loader: -# print(mel.shape) -# if count == 4: -# break -# count += 1 From bc5307caa03efb44ca4194950fd913fe498dae75 Mon Sep 17 00:00:00 2001 From: Edresson Date: Wed, 26 May 2021 20:35:58 -0300 Subject: [PATCH 11/18] add unit tests for SoftmaxAngleProtoLoss and ResnetSpeakerEncoder and bugfix --- TTS/speaker_encoder/models/resnet.py | 6 +-- tests/inputs/test_speaker_encoder_config.json | 4 +- tests/test_speaker_encoder.py | 53 +++++++++++++++++-- tests/test_speaker_encoder_train.py | 2 +- tests/test_speaker_manager.py | 4 +- 5 files changed, 57 insertions(+), 12 deletions(-) diff --git a/TTS/speaker_encoder/models/resnet.py b/TTS/speaker_encoder/models/resnet.py index 0cb7e0a8..fe89c5aa 100644 --- a/TTS/speaker_encoder/models/resnet.py +++ b/TTS/speaker_encoder/models/resnet.py @@ -126,7 +126,7 @@ class ResNetSpeakerEncoder(nn.Module): nn.init.xavier_normal_(out) return out - def forward(self, x, training=True): + def forward(self, x, l2_norm=False): x = x.transpose(1, 2) with torch.no_grad(): with torch.cuda.amp.autocast(enabled=False): @@ -157,7 +157,7 @@ class ResNetSpeakerEncoder(nn.Module): x = x.view(x.size()[0], -1) x = self.fc(x) - if not training: + if l2_norm: x = torch.nn.functional.normalize(x, p=2, dim=1) return x @@ -179,7 +179,7 @@ class ResNetSpeakerEncoder(nn.Module): offset = int(offset) end_offset = int(offset+num_frames) frames = x[:, offset:end_offset] - embed = self.forward(frames, training=False) + embed = self.forward(frames, l2_norm=True) embeddings.append(embed) embeddings = torch.stack(embeddings) diff --git a/tests/inputs/test_speaker_encoder_config.json b/tests/inputs/test_speaker_encoder_config.json index 4f3678e1..09a2f6a4 100644 --- a/tests/inputs/test_speaker_encoder_config.json +++ b/tests/inputs/test_speaker_encoder_config.json @@ -46,6 +46,7 @@ "batch_size": 32, "output_path": "", // DATASET-RELATED: output path for all training outputs. 
"model_params": { + "model_name": "lstm", "input_dim": 40, "proj_dim": 256, "lstm_dim": 768, @@ -54,8 +55,7 @@ }, "storage": { "sample_from_storage_p": 0.66, // the probability with which we'll sample from the DataSet in-memory storage - "storage_size": 15, // the size of the in-memory storage with respect to a single batch - "additive_noise": 1e-5 // add very small gaussian noise to the data in order to increase robustness + "storage_size": 15 // the size of the in-memory storage with respect to a single batch }, "datasets":null } \ No newline at end of file diff --git a/tests/test_speaker_encoder.py b/tests/test_speaker_encoder.py index 3b45d2e2..f56a9577 100644 --- a/tests/test_speaker_encoder.py +++ b/tests/test_speaker_encoder.py @@ -3,13 +3,13 @@ import unittest import torch as T from tests import get_tests_input_path -from TTS.speaker_encoder.losses import AngleProtoLoss, GE2ELoss +from TTS.speaker_encoder.losses import AngleProtoLoss, GE2ELoss, SoftmaxAngleProtoLoss from TTS.speaker_encoder.models.lstm import LSTMSpeakerEncoder -# from TTS.speaker_encoder.models.resnet import ResNetSpeakerEncoder +from TTS.speaker_encoder.models.resnet import ResNetSpeakerEncoder file_path = get_tests_input_path() -class SpeakerEncoderTests(unittest.TestCase): +class LSTMSpeakerEncoderTests(unittest.TestCase): # pylint: disable=R0201 def test_in_out(self): dummy_input = T.rand(4, 20, 80) # B x T x D @@ -39,6 +39,31 @@ class SpeakerEncoderTests(unittest.TestCase): assert output.shape[1] == 256 assert len(output.shape) == 2 +class ResNetSpeakerEncoderTests(unittest.TestCase): + # pylint: disable=R0201 + def test_in_out(self): + dummy_input = T.rand(4, 20, 80) # B x T x D + dummy_hidden = [T.rand(2, 4, 128), T.rand(2, 4, 128)] + model = ResNetSpeakerEncoder(input_dim=80, proj_dim=256) + # computing d vectors + output = model.forward(dummy_input) + assert output.shape[0] == 4 + assert output.shape[1] == 256 + output = model.forward(dummy_input, l2_norm=True) + assert output.shape[0] == 4 + assert output.shape[1] == 256 + + # check normalization + output_norm = T.nn.functional.normalize(output, dim=1, p=2) + assert_diff = (output_norm - output).sum().item() + assert output.type() == "torch.FloatTensor" + assert abs(assert_diff) < 1e-4, f" [!] 
output_norm has wrong values - {assert_diff}" + # compute d for a given batch + dummy_input = T.rand(1, 240, 80) # B x T x D + output = model.compute_embedding(dummy_input, num_frames=160, num_eval=10) + assert output.shape[0] == 1 + assert output.shape[1] == 256 + assert len(output.shape) == 2 class GE2ELossTests(unittest.TestCase): # pylint: disable=R0201 @@ -67,7 +92,6 @@ class GE2ELossTests(unittest.TestCase): output = loss.forward(dummy_input) assert output.item() < 0.005 - class AngleProtoLossTests(unittest.TestCase): # pylint: disable=R0201 def test_in_out(self): @@ -96,3 +120,24 @@ class AngleProtoLossTests(unittest.TestCase): loss = AngleProtoLoss() output = loss.forward(dummy_input) assert output.item() < 0.005 + +class SoftmaxAngleProtoLossTests(unittest.TestCase): + # pylint: disable=R0201 + def test_in_out(self): + + embedding_dim = 64 + num_speakers = 5 + batch_size = 4 + + dummy_label = T.randint(low=0, high=num_speakers, size=(batch_size, num_speakers)) + # check random input + dummy_input = T.rand(batch_size, num_speakers, embedding_dim) # num_speaker x num_utterance x dim + loss = SoftmaxAngleProtoLoss(embedding_dim=embedding_dim, n_speakers=num_speakers) + output = loss.forward(dummy_input, dummy_label) + assert output.item() >= 0.0 + + # check all zeros + dummy_input = T.ones(batch_size, num_speakers, embedding_dim) # num_speaker x num_utterance x dim + loss = SoftmaxAngleProtoLoss(embedding_dim=embedding_dim, n_speakers=num_speakers) + output = loss.forward(dummy_input, dummy_label) + assert output.item() >= 0.0 diff --git a/tests/test_speaker_encoder_train.py b/tests/test_speaker_encoder_train.py index 525730f2..831c48f2 100644 --- a/tests/test_speaker_encoder_train.py +++ b/tests/test_speaker_encoder_train.py @@ -19,7 +19,7 @@ config = SpeakerEncoderConfig( print_step=1, save_step=1, print_eval=True, - audio=BaseAudioConfig(num_mels=40), + audio=BaseAudioConfig(num_mels=80), ) config.audio.do_trim_silence = True config.audio.trim_db = 60 diff --git a/tests/test_speaker_manager.py b/tests/test_speaker_manager.py index ffb98ed7..f80e56fc 100644 --- a/tests/test_speaker_manager.py +++ b/tests/test_speaker_manager.py @@ -6,7 +6,7 @@ import torch from tests import get_tests_input_path from TTS.config import load_config -from TTS.speaker_encoder.model import SpeakerEncoder +from TTS.speaker_encoder.utils.generic_utils import setup_model from TTS.speaker_encoder.utils.io import save_checkpoint from TTS.tts.utils.speakers import SpeakerManager from TTS.utils.audio import AudioProcessor @@ -28,7 +28,7 @@ class SpeakerManagerTest(unittest.TestCase): config.audio.resample = True # create a dummy speaker encoder - model = SpeakerEncoder(**config.model_params) + model = setup_model(config) save_checkpoint(model, None, None, get_tests_input_path(), 0) # load audio processor and speaker encoder From 5af505ff333a0f0219877c42fc6c24ed4e22a18c Mon Sep 17 00:00:00 2001 From: Edresson Date: Wed, 26 May 2021 21:43:51 -0300 Subject: [PATCH 12/18] remove unused notebooks --- ...- ExtractSpeakerEmbeddings-by-sample.ipynb | 163 -------------- ...J-ExtractSpeakerEmbeddings-by-sample.ipynb | 212 ------------------ ...- ExtractSpeakerEmbeddings-by-sample.ipynb | 163 -------------- 3 files changed, 538 deletions(-) delete mode 100644 notebooks/AngleProto-Speaker_Encoder- ExtractSpeakerEmbeddings-by-sample.ipynb delete mode 100644 notebooks/GE2E-CorentinJ-ExtractSpeakerEmbeddings-by-sample.ipynb delete mode 100644 notebooks/GE2E-Speaker_Encoder- ExtractSpeakerEmbeddings-by-sample.ipynb diff --git 
a/notebooks/AngleProto-Speaker_Encoder- ExtractSpeakerEmbeddings-by-sample.ipynb b/notebooks/AngleProto-Speaker_Encoder- ExtractSpeakerEmbeddings-by-sample.ipynb deleted file mode 100644 index 15206130..00000000 --- a/notebooks/AngleProto-Speaker_Encoder- ExtractSpeakerEmbeddings-by-sample.ipynb +++ /dev/null @@ -1,163 +0,0 @@ -{ - "cells": [ - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "This is a noteboook used to generate the speaker embeddings with the AngleProto speaker encoder model for multi-speaker training.\n", - "\n", - "Before running this script please DON'T FORGET: \n", - "- to set file paths.\n", - "- to download related model files from TTS.\n", - "- download or clone related repos, linked below.\n", - "- setup the repositories. ```python setup.py install```\n", - "- to checkout right commit versions (given next to the model) of TTS.\n", - "- to set the right paths in the cell below.\n", - "\n", - "Repository:\n", - "- TTS: https://github.com/mozilla/TTS" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "%load_ext autoreload\n", - "%autoreload 2\n", - "import os\n", - "import importlib\n", - "import random\n", - "import librosa\n", - "import torch\n", - "\n", - "import numpy as np\n", - "from tqdm import tqdm\n", - "from TTS.tts.utils.speakers import save_speaker_mapping, load_speaker_mapping\n", - "\n", - "# you may need to change this depending on your system\n", - "os.environ['CUDA_VISIBLE_DEVICES']='0'\n", - "\n", - "\n", - "from TTS.tts.utils.speakers import save_speaker_mapping, load_speaker_mapping\n", - "from TTS.utils.audio import AudioProcessor\n", - "from TTS.utils.io import load_config" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "You should also adjust all the path constants to point at the relevant locations for you locally" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "MODEL_RUN_PATH = \"../../Mozilla-TTS/checkpoints/libritts_100+360-angleproto-June-06-2020_04+12PM-9c04d1f/\"\n", - "MODEL_PATH = MODEL_RUN_PATH + \"best_model.pth.tar\"\n", - "CONFIG_PATH = MODEL_RUN_PATH + \"config.json\"\n", - "\n", - "\n", - "DATASETS_NAME = ['vctk'] # list the datasets\n", - "DATASETS_PATH = ['../../../datasets/VCTK/']\n", - "DATASETS_METAFILE = ['']\n", - "\n", - "USE_CUDA = True" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "#Preprocess dataset\n", - "meta_data = []\n", - "for i in range(len(DATASETS_NAME)):\n", - " preprocessor = importlib.import_module('TTS.tts.datasets.preprocess')\n", - " preprocessor = getattr(preprocessor, DATASETS_NAME[i].lower())\n", - " meta_data += preprocessor(DATASETS_PATH[i],DATASETS_METAFILE[i])\n", - " \n", - "meta_data= list(meta_data)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "c = load_config(CONFIG_PATH)\n", - "ap = AudioProcessor(**c['audio'])\n", - "\n", - "model = SpeakerEncoder(**c.model)\n", - "model.load_state_dict(torch.load(MODEL_PATH)['model'])\n", - "model.eval()\n", - "if USE_CUDA:\n", - " model.cuda()\n", - "\n", - "embeddings_dict = {}\n", - "len_meta_data= len(meta_data)\n", - "\n", - "for i in tqdm(range(len_meta_data)):\n", - " _, wav_file, speaker_id = meta_data[i]\n", - " wav_file_name = os.path.basename(wav_file)\n", - " mel_spec = ap.melspectrogram(ap.load_wav(wav_file)).T\n", - " mel_spec = 
torch.FloatTensor(mel_spec[None, :, :])\n", - " if USE_CUDA:\n", - " mel_spec = mel_spec.cuda()\n", - " embedd = model.compute_embedding(mel_spec).cpu().detach().numpy().reshape(-1)\n", - " embeddings_dict[wav_file_name] = [embedd,speaker_id]\n" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# create and export speakers.json\n", - "speaker_mapping = {sample: {'name': embeddings_dict[sample][1], 'embedding':embeddings_dict[sample][0].reshape(-1).tolist()} for i, sample in enumerate(embeddings_dict.keys())}\n", - "save_speaker_mapping(MODEL_RUN_PATH, speaker_mapping)\n" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "#test load integrity\n", - "speaker_mapping_load = load_speaker_mapping(MODEL_RUN_PATH)\n", - "assert speaker_mapping == speaker_mapping_load\n", - "print(\"The file speakers.json has been exported to \",MODEL_RUN_PATH, ' with ', len(embeddings_dict.keys()), ' speakers')" - ] - } - ], - "metadata": { - "kernelspec": { - "display_name": "Python 3", - "language": "python", - "name": "python3" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.7.6" - } - }, - "nbformat": 4, - "nbformat_minor": 4 -} diff --git a/notebooks/GE2E-CorentinJ-ExtractSpeakerEmbeddings-by-sample.ipynb b/notebooks/GE2E-CorentinJ-ExtractSpeakerEmbeddings-by-sample.ipynb deleted file mode 100644 index 1c4e8759..00000000 --- a/notebooks/GE2E-CorentinJ-ExtractSpeakerEmbeddings-by-sample.ipynb +++ /dev/null @@ -1,212 +0,0 @@ -{ - "cells": [ - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "This is a noteboook used to generate the speaker embeddings with the CorentinJ GE2E model trained with Angular Prototypical loss for multi-speaker training.\n", - "\n", - "Before running this script please DON'T FORGET:\n", - "- to set the right paths in the cell below.\n", - "\n", - "Repositories:\n", - "- TTS: https://github.com/coqui/TTS\n", - "- CorentinJ GE2E: https://github.com/Edresson/GE2E-Speaker-Encoder" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "scrolled": true - }, - "outputs": [], - "source": [ - "import os\n", - "import importlib\n", - "import random\n", - "import librosa\n", - "import torch\n", - "\n", - "import numpy as np\n", - "from TTS.utils.io import load_config\n", - "from tqdm import tqdm\n", - "from TTS.tts.utils.speakers import save_speaker_mapping, load_speaker_mapping\n", - "\n", - "# you may need to change this depending on your system\n", - "os.environ['CUDA_VISIBLE_DEVICES']='0'" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# Clone encoder \n", - "!git clone https://github.com/CorentinJ/Real-Time-Voice-Cloning.git\n", - "os.chdir('Real-Time-Voice-Cloning/')" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "#Install voxceleb_trainer Requeriments\n", - "!python -m pip install umap-learn visdom webrtcvad librosa>=0.5.1 matplotlib>=2.0.2 numpy>=1.14.0 scipy>=1.0.0 tqdm sounddevice Unidecode inflect multiprocess numba" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "#Download encoder Checkpoint\n", - "!wget 
https://github.com/Edresson/Real-Time-Voice-Cloning/releases/download/checkpoints/pretrained.zip\n", - "!unzip pretrained.zip" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "from encoder import inference as encoder\n", - "from encoder.params_model import model_embedding_size as speaker_embedding_size\n", - "from pathlib import Path" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "print(\"Preparing the encoder, the synthesizer and the vocoder...\")\n", - "encoder.load_model(Path('encoder/saved_models/pretrained.pt'))\n", - "print(\"Testing your configuration with small inputs.\")\n", - "# Forward an audio waveform of zeroes that lasts 1 second. Notice how we can get the encoder's\n", - "# sampling rate, which may differ.\n", - "# If you're unfamiliar with digital audio, know that it is encoded as an array of floats \n", - "# (or sometimes integers, but mostly floats in this projects) ranging from -1 to 1.\n", - "# The sampling rate is the number of values (samples) recorded per second, it is set to\n", - "# 16000 for the encoder. Creating an array of length will always correspond \n", - "# to an audio of 1 second.\n", - "print(\"\\tTesting the encoder...\")\n", - "\n", - "wav = np.zeros(encoder.sampling_rate) \n", - "embed = encoder.embed_utterance(wav)\n", - "print(embed.shape)\n", - "\n", - "# Embeddings are L2-normalized (this isn't important here, but if you want to make your own \n", - "# embeddings it will be).\n", - "#embed /= np.linalg.norm(embed) # for random embedding\n" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "SAVE_PATH = '../'" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# Set constants\n", - "DATASETS_NAME = ['vctk'] # list the datasets\n", - "DATASETS_PATH = ['../../../../../datasets/VCTK-Corpus-removed-silence/']\n", - "DATASETS_METAFILE = ['']\n", - "USE_CUDA = True" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "#Preprocess dataset\n", - "meta_data = []\n", - "for i in range(len(DATASETS_NAME)):\n", - " preprocessor = importlib.import_module('TTS.tts.datasets.preprocess')\n", - " preprocessor = getattr(preprocessor, DATASETS_NAME[i].lower())\n", - " meta_data += preprocessor(DATASETS_PATH[i],DATASETS_METAFILE[i])\n", - " \n", - "meta_data= list(meta_data)\n", - "\n", - "meta_data = meta_data\n", - "embeddings_dict = {}\n", - "len_meta_data= len(meta_data)\n", - "for i in tqdm(range(len_meta_data)):\n", - " _, wave_file_path, speaker_id = meta_data[i]\n", - " wav_file_name = os.path.basename(wave_file_path)\n", - " # Extract Embedding\n", - " preprocessed_wav = encoder.preprocess_wav(wave_file_path)\n", - " file_embedding = encoder.embed_utterance(preprocessed_wav)\n", - " embeddings_dict[wav_file_name] = [file_embedding.reshape(-1).tolist(), speaker_id]\n", - " del file_embedding" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# create and export speakers.json and aplly a L2_norm in embedding\n", - "speaker_mapping = {sample: {'name': embeddings_dict[sample][1], 'embedding':embeddings_dict[sample][0]} for i, sample in enumerate(embeddings_dict.keys())}\n", - "save_speaker_mapping(SAVE_PATH, speaker_mapping)\n" - ] - }, - { - "cell_type": "code", - "execution_count": 
null, - "metadata": {}, - "outputs": [], - "source": [ - "#test load integrity\n", - "speaker_mapping_load = load_speaker_mapping(SAVE_PATH)\n", - "assert speaker_mapping == speaker_mapping_load\n", - "print(\"The file speakers.json has been exported to \",SAVE_PATH, ' with ', len(embeddings_dict.keys()), ' samples')" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [] - } - ], - "metadata": { - "kernelspec": { - "display_name": "Python 3", - "language": "python", - "name": "python3" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.8.5" - } - }, - "nbformat": 4, - "nbformat_minor": 4 -} diff --git a/notebooks/GE2E-Speaker_Encoder- ExtractSpeakerEmbeddings-by-sample.ipynb b/notebooks/GE2E-Speaker_Encoder- ExtractSpeakerEmbeddings-by-sample.ipynb deleted file mode 100644 index 09add419..00000000 --- a/notebooks/GE2E-Speaker_Encoder- ExtractSpeakerEmbeddings-by-sample.ipynb +++ /dev/null @@ -1,163 +0,0 @@ -{ - "cells": [ - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "This is a noteboook used to generate the speaker embeddings with the GE2E speaker encoder model for multi-speaker training.\n", - "\n", - "Before running this script please DON'T FORGET: \n", - "- to set file paths.\n", - "- to download related model files from TTS.\n", - "- download or clone related repos, linked below.\n", - "- setup the repositories. ```python setup.py install```\n", - "- to checkout right commit versions (given next to the model) of TTS.\n", - "- to set the right paths in the cell below.\n", - "\n", - "Repository:\n", - "- TTS: https://github.com/coqui/TTS" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "%load_ext autoreload\n", - "%autoreload 2\n", - "import os\n", - "import importlib\n", - "import random\n", - "import librosa\n", - "import torch\n", - "\n", - "import numpy as np\n", - "from tqdm import tqdm\n", - "from TTS.tts.utils.speakers import save_speaker_mapping, load_speaker_mapping\n", - "\n", - "# you may need to change this depending on your system\n", - "os.environ['CUDA_VISIBLE_DEVICES']='0'\n", - "\n", - "\n", - "from TTS.tts.utils.speakers import save_speaker_mapping, load_speaker_mapping\n", - "from TTS.utils.audio import AudioProcessor\n", - "from TTS.utils.io import load_config" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "You should also adjust all the path constants to point at the relevant locations for you locally" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "MODEL_RUN_PATH = \"../../Coqui-TTS/checkpoints/libritts_360-half-September-28-2019_10+46AM-8565c50-20200323T115637Z-001/\"\n", - "MODEL_PATH = MODEL_RUN_PATH + \"best_model.pth.tar\"\n", - "CONFIG_PATH = MODEL_RUN_PATH + \"config.json\"\n", - "\n", - "\n", - "DATASETS_NAME = ['vctk'] # list the datasets\n", - "DATASETS_PATH = ['../../../datasets/VCTK/']\n", - "DATASETS_METAFILE = ['']\n", - "\n", - "USE_CUDA = True" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "#Preprocess dataset\n", - "meta_data = []\n", - "for i in range(len(DATASETS_NAME)):\n", - " preprocessor = importlib.import_module('TTS.datasets.preprocess')\n", - 
" preprocessor = getattr(preprocessor, DATASETS_NAME[i].lower())\n", - " meta_data += preprocessor(DATASETS_PATH[i],DATASETS_METAFILE[i])\n", - " \n", - "meta_data= list(meta_data)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "c = load_config(CONFIG_PATH)\n", - "ap = AudioProcessor(**c['audio'])\n", - "\n", - "model = SpeakerEncoder(**c.model)\n", - "model.load_state_dict(torch.load(MODEL_PATH)['model'])\n", - "model.eval()\n", - "if USE_CUDA:\n", - " model.cuda()\n", - "\n", - "embeddings_dict = {}\n", - "len_meta_data= len(meta_data)\n", - "\n", - "for i in tqdm(range(len_meta_data)):\n", - " _, wav_file, speaker_id = meta_data[i]\n", - " wav_file_name = os.path.basename(wav_file)\n", - " mel_spec = ap.melspectrogram(ap.load_wav(wav_file)).T\n", - " mel_spec = torch.FloatTensor(mel_spec[None, :, :])\n", - " if USE_CUDA:\n", - " mel_spec = mel_spec.cuda()\n", - " embedd = model.compute_embedding(mel_spec).cpu().detach().numpy().reshape(-1)\n", - " embeddings_dict[wav_file_name] = [embedd,speaker_id]\n" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# create and export speakers.json\n", - "speaker_mapping = {sample: {'name': embeddings_dict[sample][1], 'embedding':embeddings_dict[sample][0].reshape(-1).tolist()} for i, sample in enumerate(embeddings_dict.keys())}\n", - "save_speaker_mapping(MODEL_RUN_PATH, speaker_mapping)\n" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "#test load integrity\n", - "speaker_mapping_load = load_speaker_mapping(MODEL_RUN_PATH)\n", - "assert speaker_mapping == speaker_mapping_load\n", - "print(\"The file speakers.json has been exported to \",MODEL_RUN_PATH, ' with ', len(embeddings_dict.keys()), ' speakers')" - ] - } - ], - "metadata": { - "kernelspec": { - "display_name": "Python 3", - "language": "python", - "name": "python3" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.8.5" - } - }, - "nbformat": 4, - "nbformat_minor": 4 -} From 1496f271dc23daaa6be9a4bf100f9e52d9bd7921 Mon Sep 17 00:00:00 2001 From: Edresson Date: Thu, 27 May 2021 00:45:18 -0300 Subject: [PATCH 13/18] update Compute embeddings script --- TTS/bin/compute_embeddings.py | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/TTS/bin/compute_embeddings.py b/TTS/bin/compute_embeddings.py index 410086de..dce9ea83 100644 --- a/TTS/bin/compute_embeddings.py +++ b/TTS/bin/compute_embeddings.py @@ -10,7 +10,7 @@ from TTS.speaker_encoder.utils.generic_utils import setup_model from TTS.tts.datasets.preprocess import load_meta_data from TTS.tts.utils.speakers import save_speaker_mapping from TTS.utils.audio import AudioProcessor -from TTS.utils.io import load_config +from TTS.config import load_config, BaseDatasetConfig parser = argparse.ArgumentParser( description='Compute embedding vectors for each wav file in a dataset. If "target_dataset" is defined, it generates "speakers.json" necessary for training a multi-speaker model.' 
@@ -44,7 +44,7 @@ sep = args.separator if args.target_dataset != "": # if target dataset is defined dataset_config = [ - {"name": args.target_dataset, "path": args.data_path, "meta_file_train": None, "meta_file_val": None}, + BaseDatasetConfig(name=args.target_dataset, path=args.data_path, meta_file_train=None, meta_file_val=None), ] wav_files, _ = load_meta_data(dataset_config, eval_split=False) output_files = [wav_file[1].replace(data_path, args.output_path).replace(".wav", ".npy") for wav_file in wav_files] @@ -106,6 +106,7 @@ for idx, wav_file in enumerate(tqdm(wav_files)): speaker_mapping[wav_file_name]["embedding"] = embedd.flatten().tolist() if args.target_dataset != "": - # save speaker_mapping if target dataset is defined - mapping_file_path = os.path.join(args.output_path, "speakers.json") - save_speaker_mapping(args.output_path, speaker_mapping) + if speaker_mapping: + # save speaker_mapping if target dataset is defined + mapping_file_path = os.path.join(args.output_path, "speakers.json") + save_speaker_mapping(args.output_path, speaker_mapping) From 825734a3a912bbc61cda419e7cfcdf392ff337f2 Mon Sep 17 00:00:00 2001 From: Edresson Date: Thu, 27 May 2021 19:10:24 -0300 Subject: [PATCH 14/18] remove unused embeddings export --- TTS/bin/compute_embeddings.py | 9 +++------ 1 file changed, 3 insertions(+), 6 deletions(-) diff --git a/TTS/bin/compute_embeddings.py b/TTS/bin/compute_embeddings.py index dce9ea83..045aa372 100644 --- a/TTS/bin/compute_embeddings.py +++ b/TTS/bin/compute_embeddings.py @@ -22,7 +22,7 @@ parser.add_argument( help="Path to config file for training.", ) parser.add_argument("data_path", type=str, help="Data path for wav files - directory or CSV file") -parser.add_argument("output_path", type=str, help="path for training outputs.") +parser.add_argument("output_path", type=str, help="path for output speakers.json.") parser.add_argument( "--target_dataset", type=str, @@ -47,7 +47,6 @@ if args.target_dataset != "": BaseDatasetConfig(name=args.target_dataset, path=args.data_path, meta_file_train=None, meta_file_val=None), ] wav_files, _ = load_meta_data(dataset_config, eval_split=False) - output_files = [wav_file[1].replace(data_path, args.output_path).replace(".wav", ".npy") for wav_file in wav_files] else: # if target dataset is not defined if len(split_ext) > 0 and split_ext[1].lower() == ".csv": @@ -71,10 +70,8 @@ else: # Parse all wav files in data_path wav_files = glob.glob(data_path + "/**/*.wav", recursive=True) - output_files = [wav_file.replace(data_path, args.output_path).replace(".wav", ".npy") for wav_file in wav_files] -for output_file in output_files: - os.makedirs(os.path.dirname(output_file), exist_ok=True) +os.makedirs(args.output_path, exist_ok=True) # define Encoder model model = setup_model(c) @@ -96,7 +93,6 @@ for idx, wav_file in enumerate(tqdm(wav_files)): mel_spec = mel_spec.cuda() embedd = model.compute_embedding(mel_spec) embedd = embedd.detach().cpu().numpy() - np.save(output_files[idx], embedd) if args.target_dataset != "": # create speaker_mapping if target dataset is defined @@ -110,3 +106,4 @@ if args.target_dataset != "": # save speaker_mapping if target dataset is defined mapping_file_path = os.path.join(args.output_path, "speakers.json") save_speaker_mapping(args.output_path, speaker_mapping) + print("Speaker embedding saved at:", mapping_file_path) From 208bb0f0ee75cec191e6e1f3875d4c9025dfa85a Mon Sep 17 00:00:00 2001 From: Edresson Date: Thu, 27 May 2021 20:01:00 -0300 Subject: [PATCH 15/18] add batched speaker encoder inference 
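
Instead of embedding each evaluation window with its own forward pass, ResNetSpeakerEncoder.compute_embedding() now collects all windows into one batch, runs a single forward pass, and averages the window embeddings. A rough sketch of the batched windowing as a standalone helper (illustrative only — the real change lives in compute_embedding() in the diff below; the num_frames/num_eval values mirror the unit tests, the short-utterance guard is an assumption, and forward(..., l2_norm=True) follows the model code):

    import numpy as np
    import torch

    def embed_utterance(model, x, num_frames=160, num_eval=10):
        # x: (1, T, num_mels) mel spectrogram of a single utterance
        max_len = x.shape[1]
        if max_len < num_frames:
            num_frames = max_len  # guard for utterances shorter than one window
        offsets = np.linspace(0, max_len - num_frames, num=num_eval)
        # gather all evaluation windows first ...
        frames_batch = torch.cat(
            [x[:, int(o):int(o) + num_frames] for o in offsets], dim=0
        )  # (num_eval, num_frames, num_mels)
        # ... then embed them in one forward pass and average over windows
        embeddings = model.forward(frames_batch, l2_norm=True)
        return torch.mean(embeddings, dim=0, keepdim=True)  # (1, proj_dim)

This trades a little extra memory (all num_eval windows held at once) for a single forward pass per utterance during embedding extraction.
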
--- TTS/bin/compute_embeddings.py | 1 - TTS/speaker_encoder/models/resnet.py | 12 +++++++----- 2 files changed, 7 insertions(+), 6 deletions(-) diff --git a/TTS/bin/compute_embeddings.py b/TTS/bin/compute_embeddings.py index 045aa372..9affac64 100644 --- a/TTS/bin/compute_embeddings.py +++ b/TTS/bin/compute_embeddings.py @@ -2,7 +2,6 @@ import argparse import glob import os -import numpy as np import torch from tqdm import tqdm diff --git a/TTS/speaker_encoder/models/resnet.py b/TTS/speaker_encoder/models/resnet.py index fe89c5aa..aa2171ed 100644 --- a/TTS/speaker_encoder/models/resnet.py +++ b/TTS/speaker_encoder/models/resnet.py @@ -174,15 +174,17 @@ class ResNetSpeakerEncoder(nn.Module): offsets = np.linspace(0, max_len-num_frames, num=num_eval) - embeddings = [] + frames_batch = [] for offset in offsets: offset = int(offset) end_offset = int(offset+num_frames) frames = x[:, offset:end_offset] - embed = self.forward(frames, l2_norm=True) - embeddings.append(embed) + frames_batch.append(frames) + + frames_batch = torch.cat(frames_batch, dim=0) + embeddings = self.forward(frames_batch, l2_norm=True) - embeddings = torch.stack(embeddings) if return_mean: - embeddings = torch.mean(embeddings, dim=0) + embeddings = torch.mean(embeddings, dim=0, keepdim=True) + return embeddings From 099142d4ddb83330865185ea090c55738777fa65 Mon Sep 17 00:00:00 2001 From: Edresson Date: Thu, 27 May 2021 21:50:56 -0300 Subject: [PATCH 16/18] bug fix --- TTS/tts/configs/shared_configs.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/TTS/tts/configs/shared_configs.py b/TTS/tts/configs/shared_configs.py index 6c710ca2..4690e76f 100644 --- a/TTS/tts/configs/shared_configs.py +++ b/TTS/tts/configs/shared_configs.py @@ -80,12 +80,12 @@ class CharactersConfig(Coqpit): ): """Check config fields""" c = asdict(self) - check_argument("pad", c, "characters", restricted=True) - check_argument("eos", c, "characters", restricted=True) - check_argument("bos", c, "characters", restricted=True) - check_argument("characters", c, "characters", restricted=True) + check_argument("pad", c, prerequest="characters", restricted=True) + check_argument("eos", c, prerequest="characters", restricted=True) + check_argument("bos", c, prerequest="characters", restricted=True) + check_argument("characters", c, prerequest="characters", restricted=True) check_argument("phonemes", c, restricted=True) - check_argument("punctuations", c, "characters", restricted=True) + check_argument("punctuations", c, prerequest="characters", restricted=True) @dataclass From 7448177b72442b90136af6ba39423db3c4a70aeb Mon Sep 17 00:00:00 2001 From: Edresson Date: Sat, 29 May 2021 21:11:53 -0300 Subject: [PATCH 17/18] use SpeakerManager on compute embeddings script --- TTS/bin/compute_embeddings.py | 35 ++++++++++--------- TTS/bin/train_encoder.py | 4 +-- .../configs/config_resnet_angleproto.json | 2 +- TTS/speaker_encoder/dataset.py | 2 +- 4 files changed, 23 insertions(+), 20 deletions(-) diff --git a/TTS/bin/compute_embeddings.py b/TTS/bin/compute_embeddings.py index 9affac64..003da1e5 100644 --- a/TTS/bin/compute_embeddings.py +++ b/TTS/bin/compute_embeddings.py @@ -7,7 +7,7 @@ from tqdm import tqdm from TTS.speaker_encoder.utils.generic_utils import setup_model from TTS.tts.datasets.preprocess import load_meta_data -from TTS.tts.utils.speakers import save_speaker_mapping +from TTS.tts.utils.speakers import SpeakerManager from TTS.utils.audio import AudioProcessor from TTS.config import load_config, BaseDatasetConfig @@ -28,7 +28,7 
@@ parser.add_argument( default="", help="Target dataset to pick a processor from TTS.tts.dataset.preprocess. Necessary to create a speakers.json file.", ) -parser.add_argument("--use_cuda", type=bool, help="flag to set cuda.", default=False) +parser.add_argument("--use_cuda", type=bool, help="flag to set cuda.", default=True) parser.add_argument("--separator", type=str, help="Separator used in file if CSV is passed for data_path", default="|") args = parser.parse_args() @@ -69,9 +69,6 @@ else: # Parse all wav files in data_path wav_files = glob.glob(data_path + "/**/*.wav", recursive=True) - -os.makedirs(args.output_path, exist_ok=True) - # define Encoder model model = setup_model(c) model.load_state_dict(torch.load(args.model_path)["model"]) @@ -85,6 +82,8 @@ for idx, wav_file in enumerate(tqdm(wav_files)): if isinstance(wav_file, list): speaker_name = wav_file[2] wav_file = wav_file[1] + else: + speaker_name = None mel_spec = ap.melspectrogram(ap.load_wav(wav_file, sr=ap.sample_rate)).T mel_spec = torch.FloatTensor(mel_spec[None, :, :]) @@ -93,16 +92,20 @@ for idx, wav_file in enumerate(tqdm(wav_files)): embedd = model.compute_embedding(mel_spec) embedd = embedd.detach().cpu().numpy() - if args.target_dataset != "": - # create speaker_mapping if target dataset is defined - wav_file_name = os.path.basename(wav_file) - speaker_mapping[wav_file_name] = {} - speaker_mapping[wav_file_name]["name"] = speaker_name - speaker_mapping[wav_file_name]["embedding"] = embedd.flatten().tolist() + # create speaker_mapping if target dataset is defined + wav_file_name = os.path.basename(wav_file) + speaker_mapping[wav_file_name] = {} + speaker_mapping[wav_file_name]["name"] = speaker_name + speaker_mapping[wav_file_name]["embedding"] = embedd.flatten().tolist() -if args.target_dataset != "": - if speaker_mapping: - # save speaker_mapping if target dataset is defined +if speaker_mapping: + # save speaker_mapping if target dataset is defined + if '.json' not in args.output_path: mapping_file_path = os.path.join(args.output_path, "speakers.json") - save_speaker_mapping(args.output_path, speaker_mapping) - print("Speaker embedding saved at:", mapping_file_path) + else: + mapping_file_path = args.output_path + os.makedirs(os.path.dirname(mapping_file_path), exist_ok=True) + speaker_manager = SpeakerManager() + # pylint: disable=W0212 + speaker_manager._save_json(mapping_file_path, speaker_mapping) + print("Speaker embeddings saved at:", mapping_file_path) diff --git a/TTS/bin/train_encoder.py b/TTS/bin/train_encoder.py index a4191dfb..c9493535 100644 --- a/TTS/bin/train_encoder.py +++ b/TTS/bin/train_encoder.py @@ -9,7 +9,7 @@ import traceback import torch from torch.utils.data import DataLoader -from TTS.speaker_encoder.dataset import MyDataset +from TTS.speaker_encoder.dataset import SpeakerEncoderDataset from TTS.speaker_encoder.losses import AngleProtoLoss, GE2ELoss, SoftmaxAngleProtoLoss from TTS.speaker_encoder.utils.generic_utils import save_best_model, setup_model @@ -35,7 +35,7 @@ def setup_loader(ap: AudioProcessor, is_val: bool = False, verbose: bool = False if is_val: loader = None else: - dataset = MyDataset( + dataset = SpeakerEncoderDataset( ap, meta_data_eval if is_val else meta_data_train, voice_len=c.voice_len, diff --git a/TTS/speaker_encoder/configs/config_resnet_angleproto.json b/TTS/speaker_encoder/configs/config_resnet_angleproto.json index 95cf5ccf..c26d29ce 100644 --- a/TTS/speaker_encoder/configs/config_resnet_angleproto.json +++ 
b/TTS/speaker_encoder/configs/config_resnet_angleproto.json @@ -52,7 +52,7 @@ "checkpoint": true, // If true, it saves checkpoints per "save_step" "save_step": 1000, // Number of training steps expected to save the best checkpoints in training. "print_step": 50, // Number of steps to log traning on console. - "output_path": "../checkpoints/speaker_encoder/resnet_voxceleb1_and_voxceleb2-and-common-voice-all-using-angleproto-continue/", // DATASET-RELATED: output path for all training outputs. + "output_path": "../checkpoints/speaker_encoder/angleproto/resnet_voxceleb1_and_voxceleb2-and-common-voice-all-using-angleproto/", // DATASET-RELATED: output path for all training outputs. "audio_augmentation": { "p": 0.5, // propability of apply this method, 0 is disable rir and additive noise augmentation diff --git a/TTS/speaker_encoder/dataset.py b/TTS/speaker_encoder/dataset.py index 45a7bc12..cd95a4f5 100644 --- a/TTS/speaker_encoder/dataset.py +++ b/TTS/speaker_encoder/dataset.py @@ -6,7 +6,7 @@ import torch from torch.utils.data import Dataset from TTS.speaker_encoder.utils.generic_utils import AugmentWAV, Storage -class MyDataset(Dataset): +class SpeakerEncoderDataset(Dataset): def __init__( self, ap, From cc192b6843e7020a6665f8c699ca44b1171540c8 Mon Sep 17 00:00:00 2001 From: Edresson Date: Sat, 29 May 2021 22:43:41 -0300 Subject: [PATCH 18/18] add resnet speaker encoder train unit test --- tests/test_speaker_encoder_train.py | 26 +++++++++++++++++++++++++- 1 file changed, 25 insertions(+), 1 deletion(-) diff --git a/tests/test_speaker_encoder_train.py b/tests/test_speaker_encoder_train.py index 831c48f2..e168a785 100644 --- a/tests/test_speaker_encoder_train.py +++ b/tests/test_speaker_encoder_train.py @@ -9,7 +9,6 @@ from TTS.speaker_encoder.speaker_encoder_config import SpeakerEncoderConfig config_path = os.path.join(get_tests_output_path(), "test_model_config.json") output_path = os.path.join(get_tests_output_path(), "train_outputs") - config = SpeakerEncoderConfig( batch_size=4, num_speakers_in_batch=1, @@ -45,3 +44,28 @@ command_train = ( ) run_cli(command_train) shutil.rmtree(continue_path) + +# test resnet speaker encoder +config.model_params['model_name'] = "resnet" +config.save_json(config_path) + +# train the model for one epoch +command_train = ( + f"CUDA_VISIBLE_DEVICES='{get_device_id()}' python TTS/bin/train_encoder.py --config_path {config_path} " + f"--coqpit.output_path {output_path} " + "--coqpit.datasets.0.name ljspeech " + "--coqpit.datasets.0.meta_file_train metadata.csv " + "--coqpit.datasets.0.meta_file_val metadata.csv " + "--coqpit.datasets.0.path tests/data/ljspeech " +) +run_cli(command_train) + +# Find latest folder +continue_path = max(glob.glob(os.path.join(output_path, "*/")), key=os.path.getmtime) + +# restore the model and continue training for one more epoch +command_train = ( + f"CUDA_VISIBLE_DEVICES='{get_device_id()}' python TTS/bin/train_encoder.py --continue_path {continue_path} " +) +run_cli(command_train) +shutil.rmtree(continue_path)
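To make the end result of this series concrete, a hedged sketch of reading back the speakers.json written by the updated TTS/bin/compute_embeddings.py; the per-utterance fields ("name", "embedding") come from the PATCH 17 diff, while the file location and the per-speaker averaging are illustrative assumptions, not part of the patches:

import json
import numpy as np

with open("speakers.json", "r", encoding="utf-8") as f:
    speaker_mapping = json.load(f)   # {wav_file_name: {"name": ..., "embedding": [...]}}

# group utterance-level embeddings by speaker and average them into one d-vector each
per_speaker = {}
for wav_name, entry in speaker_mapping.items():
    per_speaker.setdefault(entry["name"], []).append(np.asarray(entry["embedding"], dtype=np.float32))

d_vectors = {name: np.stack(vecs).mean(axis=0) for name, vecs in per_speaker.items()}
print(len(speaker_mapping), "utterances embedded for", len(d_vectors), "speakers")

Averaging per speaker is only one possible aggregation; the script itself stores the raw per-utterance embeddings, so downstream consumers can aggregate them however they need.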