diff --git a/TTS/bin/distribute.py b/TTS/bin/distribute.py index 97e2f0e3..b5552e32 100644 --- a/TTS/bin/distribute.py +++ b/TTS/bin/distribute.py @@ -35,7 +35,7 @@ def main(): command += unargs command.append("") - # run processes + # run one process per GPU processes = [] for i in range(num_gpus): my_env = os.environ.copy() diff --git a/TTS/bin/eval_encoder.py b/TTS/bin/eval_encoder.py index a03bfd82..de9e5865 100644 --- a/TTS/bin/eval_encoder.py +++ b/TTS/bin/eval_encoder.py @@ -1,17 +1,18 @@ import argparse -import torch from argparse import RawTextHelpFormatter +import torch from tqdm import tqdm from TTS.config import load_config from TTS.tts.datasets import load_tts_samples from TTS.tts.utils.speakers import SpeakerManager + def compute_encoder_accuracy(dataset_items, encoder_manager): class_name_key = encoder_manager.speaker_encoder_config.class_name_key - map_classid_to_classname = getattr(encoder_manager.speaker_encoder_config, 'map_classid_to_classname', None) + map_classid_to_classname = getattr(encoder_manager.speaker_encoder_config, "map_classid_to_classname", None) class_acc_dict = {} @@ -43,11 +44,11 @@ def compute_encoder_accuracy(dataset_items, encoder_manager): acc_avg = 0 for key, values in class_acc_dict.items(): - acc = sum(values)/len(values) + acc = sum(values) / len(values) print("Class", key, "Accuracy:", acc) acc_avg += acc - print("Average Accuracy:", acc_avg/len(class_acc_dict)) + print("Average Accuracy:", acc_avg / len(class_acc_dict)) if __name__ == "__main__": diff --git a/TTS/bin/synthesize.py b/TTS/bin/synthesize.py index fe31c510..8b3f53db 100755 --- a/TTS/bin/synthesize.py +++ b/TTS/bin/synthesize.py @@ -210,7 +210,13 @@ If you don't specify any models, then it uses LJSpeech based English model. args = parser.parse_args() # print the description if either text or list_models is not set - if not args.text and not args.list_models and not args.list_speaker_idxs and not args.list_language_idxs and not args.reference_wav: + if ( + not args.text + and not args.list_models + and not args.list_speaker_idxs + and not args.list_language_idxs + and not args.reference_wav + ): parser.parse_args(["-h"]) # load model manager @@ -296,7 +302,14 @@ If you don't specify any models, then it uses LJSpeech based English model. 
print(" > Text: {}".format(args.text)) # kick it - wav = synthesizer.tts(args.text, args.speaker_idx, args.language_idx, args.speaker_wav, reference_wav=args.reference_wav, reference_speaker_name=args.reference_speaker_idx) + wav = synthesizer.tts( + args.text, + args.speaker_idx, + args.language_idx, + args.speaker_wav, + reference_wav=args.reference_wav, + reference_speaker_name=args.reference_speaker_idx, + ) # save the results print(" > Saving output to {}".format(args.out_path)) diff --git a/TTS/bin/train_encoder.py b/TTS/bin/train_encoder.py index af3e6ec4..b8d38bac 100644 --- a/TTS/bin/train_encoder.py +++ b/TTS/bin/train_encoder.py @@ -9,6 +9,7 @@ import traceback import torch from torch.utils.data import DataLoader from trainer.torch import NoamLR +from trainer.trainer_utils import get_optimizer from TTS.encoder.dataset import EncoderDataset from TTS.encoder.utils.generic_utils import save_best_model, save_checkpoint, setup_speaker_encoder_model @@ -19,7 +20,6 @@ from TTS.tts.datasets import load_tts_samples from TTS.utils.audio import AudioProcessor from TTS.utils.generic_utils import count_parameters, remove_experiment_folder from TTS.utils.io import copy_model_files -from trainer.trainer_utils import get_optimizer from TTS.utils.training import check_update torch.backends.cudnn.enabled = True @@ -52,16 +52,21 @@ def setup_loader(ap: AudioProcessor, is_val: bool = False, verbose: bool = False sampler = PerfectBatchSampler( dataset.items, classes, - batch_size=num_classes_in_batch*num_utter_per_class, # total batch size + batch_size=num_classes_in_batch * num_utter_per_class, # total batch size num_classes_in_batch=num_classes_in_batch, num_gpus=1, shuffle=not is_val, - drop_last=True) + drop_last=True, + ) if len(classes) < num_classes_in_batch: if is_val: - raise RuntimeError(f"config.eval_num_classes_in_batch ({num_classes_in_batch}) need to be <= {len(classes)} (Number total of Classes in the Eval dataset) !") - raise RuntimeError(f"config.num_classes_in_batch ({num_classes_in_batch}) need to be <= {len(classes)} (Number total of Classes in the Train dataset) !") + raise RuntimeError( + f"config.eval_num_classes_in_batch ({num_classes_in_batch}) need to be <= {len(classes)} (Number total of Classes in the Eval dataset) !" + ) + raise RuntimeError( + f"config.num_classes_in_batch ({num_classes_in_batch}) need to be <= {len(classes)} (Number total of Classes in the Train dataset) !" + ) # set the classes to avoid get wrong class_id when the number of training and eval classes are not equal if is_val: @@ -76,6 +81,7 @@ def setup_loader(ap: AudioProcessor, is_val: bool = False, verbose: bool = False return loader, classes, dataset.get_map_classid_to_classname() + def evaluation(model, criterion, data_loader, global_step): eval_loss = 0 for _, data in enumerate(data_loader): @@ -84,8 +90,12 @@ def evaluation(model, criterion, data_loader, global_step): inputs, labels = data # agroup samples of each class in the batch. 
perfect sampler produces [3,2,1,3,2,1] we need [3,3,2,2,1,1] - labels = torch.transpose(labels.view(c.eval_num_utter_per_class, c.eval_num_classes_in_batch), 0, 1).reshape(labels.shape) - inputs = torch.transpose(inputs.view(c.eval_num_utter_per_class, c.eval_num_classes_in_batch, -1), 0, 1).reshape(inputs.shape) + labels = torch.transpose( + labels.view(c.eval_num_utter_per_class, c.eval_num_classes_in_batch), 0, 1 + ).reshape(labels.shape) + inputs = torch.transpose( + inputs.view(c.eval_num_utter_per_class, c.eval_num_classes_in_batch, -1), 0, 1 + ).reshape(inputs.shape) # dispatch data to GPU if use_cuda: @@ -96,20 +106,23 @@ def evaluation(model, criterion, data_loader, global_step): outputs = model(inputs) # loss computation - loss = criterion(outputs.view(c.eval_num_classes_in_batch, outputs.shape[0] // c.eval_num_classes_in_batch, -1), labels) + loss = criterion( + outputs.view(c.eval_num_classes_in_batch, outputs.shape[0] // c.eval_num_classes_in_batch, -1), labels + ) eval_loss += loss.item() - eval_avg_loss = eval_loss/len(data_loader) + eval_avg_loss = eval_loss / len(data_loader) # save stats dashboard_logger.eval_stats(global_step, {"loss": eval_avg_loss}) # plot the last batch in the evaluation figures = { - "UMAP Plot": plot_embeddings(outputs.detach().cpu().numpy(), c.num_classes_in_batch), + "UMAP Plot": plot_embeddings(outputs.detach().cpu().numpy(), c.num_classes_in_batch), } dashboard_logger.eval_figures(global_step, figures) return eval_avg_loss + def train(model, optimizer, scheduler, criterion, data_loader, eval_data_loader, global_step): model.train() best_loss = float("inf") @@ -124,8 +137,12 @@ def train(model, optimizer, scheduler, criterion, data_loader, eval_data_loader, # setup input data inputs, labels = data # agroup samples of each class in the batch. 
perfect sampler produces [3,2,1,3,2,1] we need [3,3,2,2,1,1] - labels = torch.transpose(labels.view(c.num_utter_per_class, c.num_classes_in_batch), 0, 1).reshape(labels.shape) - inputs = torch.transpose(inputs.view(c.num_utter_per_class, c.num_classes_in_batch, -1), 0, 1).reshape(inputs.shape) + labels = torch.transpose(labels.view(c.num_utter_per_class, c.num_classes_in_batch), 0, 1).reshape( + labels.shape + ) + inputs = torch.transpose(inputs.view(c.num_utter_per_class, c.num_classes_in_batch, -1), 0, 1).reshape( + inputs.shape + ) # ToDo: move it to a unit test # labels_converted = torch.transpose(labels.view(c.num_utter_per_class, c.num_classes_in_batch), 0, 1).reshape(labels.shape) # inputs_converted = torch.transpose(inputs.view(c.num_utter_per_class, c.num_classes_in_batch, -1), 0, 1).reshape(inputs.shape) @@ -157,7 +174,9 @@ def train(model, optimizer, scheduler, criterion, data_loader, eval_data_loader, outputs = model(inputs) # loss computation - loss = criterion(outputs.view(c.num_classes_in_batch, outputs.shape[0] // c.num_classes_in_batch, -1), labels) + loss = criterion( + outputs.view(c.num_classes_in_batch, outputs.shape[0] // c.num_classes_in_batch, -1), labels + ) loss.backward() grad_norm, _ = check_update(model, c.grad_clip) optimizer.step() @@ -211,7 +230,7 @@ def train(model, optimizer, scheduler, criterion, data_loader, eval_data_loader, print( ">>> Epoch:{} AvgLoss: {:.5f} GradNorm:{:.5f} " "EpochTime:{:.2f} AvGLoaderTime:{:.2f} ".format( - epoch, tot_loss/len(data_loader), grad_norm, epoch_time, avg_loader_time + epoch, tot_loss / len(data_loader), grad_norm, epoch_time, avg_loader_time ), flush=True, ) @@ -222,10 +241,8 @@ def train(model, optimizer, scheduler, criterion, data_loader, eval_data_loader, print("\n\n") print("--> EVAL PERFORMANCE") print( - " | > Epoch:{} AvgLoss: {:.5f} ".format( - epoch, eval_loss - ), - flush=True, + " | > Epoch:{} AvgLoss: {:.5f} ".format(epoch, eval_loss), + flush=True, ) # save the best checkpoint best_loss = save_best_model(model, optimizer, criterion, eval_loss, best_loss, OUT_PATH, global_step, epoch) @@ -262,7 +279,9 @@ def main(args): # pylint: disable=redefined-outer-name copy_model_files(c, OUT_PATH) if args.restore_path: - criterion, args.restore_step = model.load_checkpoint(c, args.restore_path, eval=False, use_cuda=use_cuda, criterion=criterion) + criterion, args.restore_step = model.load_checkpoint( + c, args.restore_path, eval=False, use_cuda=use_cuda, criterion=criterion + ) print(" > Model restored from step %d" % args.restore_step, flush=True) else: args.restore_step = 0 diff --git a/TTS/encoder/configs/base_encoder_config.py b/TTS/encoder/configs/base_encoder_config.py index 02b88d66..ebbaa045 100644 --- a/TTS/encoder/configs/base_encoder_config.py +++ b/TTS/encoder/configs/base_encoder_config.py @@ -33,10 +33,7 @@ class BaseEncoderConfig(BaseTrainingConfig): grad_clip: float = 3.0 lr: float = 0.0001 optimizer: str = "radam" - optimizer_params: Dict = field(default_factory=lambda: { - "betas": [0.9, 0.999], - "weight_decay": 0 - }) + optimizer_params: Dict = field(default_factory=lambda: {"betas": [0.9, 0.999], "weight_decay": 0}) lr_decay: bool = False warmup_steps: int = 4000 diff --git a/TTS/encoder/dataset.py b/TTS/encoder/dataset.py index a4db021b..582b1fe9 100644 --- a/TTS/encoder/dataset.py +++ b/TTS/encoder/dataset.py @@ -5,6 +5,7 @@ from torch.utils.data import Dataset from TTS.encoder.utils.generic_utils import AugmentWAV + class EncoderDataset(Dataset): def __init__( self, @@ -57,7 +58,6 @@ class 
EncoderDataset(Dataset): print(f" | > Num Classes: {len(self.classes)}") print(f" | > Classes: {self.classes}") - def load_wav(self, filename): audio = self.ap.load_wav(filename, sr=self.ap.sample_rate) return audio @@ -75,9 +75,7 @@ class EncoderDataset(Dataset): ] # skip classes with number of samples >= self.num_utter_per_class - class_to_utters = { - k: v for (k, v) in class_to_utters.items() if len(v) >= self.num_utter_per_class - } + class_to_utters = {k: v for (k, v) in class_to_utters.items() if len(v) >= self.num_utter_per_class} classes = list(class_to_utters.keys()) classes.sort() @@ -105,11 +103,11 @@ class EncoderDataset(Dataset): def get_class_list(self): return self.classes + def set_classes(self, classes): self.classes = classes self.classname_to_classid = {key: i for i, key in enumerate(self.classes)} - def get_map_classid_to_classname(self): return dict((c_id, c_n) for c_n, c_id in self.classname_to_classid.items()) diff --git a/TTS/encoder/losses.py b/TTS/encoder/losses.py index de65d8d6..5b5aa0fc 100644 --- a/TTS/encoder/losses.py +++ b/TTS/encoder/losses.py @@ -195,6 +195,7 @@ class SoftmaxLoss(nn.Module): class_id = torch.argmax(activations) return class_id + class SoftmaxAngleProtoLoss(nn.Module): """ Implementation of the Softmax AnglePrototypical loss as defined in https://arxiv.org/abs/2009.14153 diff --git a/TTS/encoder/models/base_encoder.py b/TTS/encoder/models/base_encoder.py index c35c636d..ac7d7dd5 100644 --- a/TTS/encoder/models/base_encoder.py +++ b/TTS/encoder/models/base_encoder.py @@ -1,12 +1,13 @@ +import numpy as np import torch import torchaudio -import numpy as np +from coqpit import Coqpit from torch import nn -from TTS.utils.io import load_fsspec from TTS.encoder.losses import AngleProtoLoss, GE2ELoss, SoftmaxAngleProtoLoss from TTS.utils.generic_utils import set_init_dict -from coqpit import Coqpit +from TTS.utils.io import load_fsspec + class PreEmphasis(nn.Module): def __init__(self, coefficient=0.97): @@ -20,6 +21,7 @@ class PreEmphasis(nn.Module): x = torch.nn.functional.pad(x.unsqueeze(1), (1, 0), "reflect") return torch.nn.functional.conv1d(x, self.filter).squeeze(1) + class BaseEncoder(nn.Module): """Base `encoder` class. Every new `encoder` model must inherit this. 
@@ -32,31 +34,31 @@ class BaseEncoder(nn.Module): def get_torch_mel_spectrogram_class(self, audio_config): return torch.nn.Sequential( - PreEmphasis(audio_config["preemphasis"]), - # TorchSTFT( - # n_fft=audio_config["fft_size"], - # hop_length=audio_config["hop_length"], - # win_length=audio_config["win_length"], - # sample_rate=audio_config["sample_rate"], - # window="hamming_window", - # mel_fmin=0.0, - # mel_fmax=None, - # use_htk=True, - # do_amp_to_db=False, - # n_mels=audio_config["num_mels"], - # power=2.0, - # use_mel=True, - # mel_norm=None, - # ) - torchaudio.transforms.MelSpectrogram( - sample_rate=audio_config["sample_rate"], - n_fft=audio_config["fft_size"], - win_length=audio_config["win_length"], - hop_length=audio_config["hop_length"], - window_fn=torch.hamming_window, - n_mels=audio_config["num_mels"], - ) - ) + PreEmphasis(audio_config["preemphasis"]), + # TorchSTFT( + # n_fft=audio_config["fft_size"], + # hop_length=audio_config["hop_length"], + # win_length=audio_config["win_length"], + # sample_rate=audio_config["sample_rate"], + # window="hamming_window", + # mel_fmin=0.0, + # mel_fmax=None, + # use_htk=True, + # do_amp_to_db=False, + # n_mels=audio_config["num_mels"], + # power=2.0, + # use_mel=True, + # mel_norm=None, + # ) + torchaudio.transforms.MelSpectrogram( + sample_rate=audio_config["sample_rate"], + n_fft=audio_config["fft_size"], + win_length=audio_config["win_length"], + hop_length=audio_config["hop_length"], + window_fn=torch.hamming_window, + n_mels=audio_config["num_mels"], + ), + ) @torch.no_grad() def inference(self, x, l2_norm=True): @@ -104,7 +106,9 @@ class BaseEncoder(nn.Module): raise Exception("The %s not is a loss supported" % c.loss) return criterion - def load_checkpoint(self, config: Coqpit, checkpoint_path: str, eval: bool = False, use_cuda: bool = False, criterion=None): + def load_checkpoint( + self, config: Coqpit, checkpoint_path: str, eval: bool = False, use_cuda: bool = False, criterion=None + ): state = load_fsspec(checkpoint_path, map_location=torch.device("cpu")) try: self.load_state_dict(state["model"]) @@ -127,7 +131,12 @@ class BaseEncoder(nn.Module): print(" > Criterion load ignored because of:", error) # instance and load the criterion for the encoder classifier in inference time - if eval and criterion is None and "criterion" in state and getattr(config, 'map_classid_to_classname', None) is not None: + if ( + eval + and criterion is None + and "criterion" in state + and getattr(config, "map_classid_to_classname", None) is not None + ): criterion = self.get_criterion(config, len(config.map_classid_to_classname)) criterion.load_state_dict(state["criterion"]) diff --git a/TTS/encoder/models/resnet.py b/TTS/encoder/models/resnet.py index c4ba9537..84e9967f 100644 --- a/TTS/encoder/models/resnet.py +++ b/TTS/encoder/models/resnet.py @@ -4,6 +4,7 @@ from torch import nn # from TTS.utils.audio import TorchSTFT from TTS.encoder.models.base_encoder import BaseEncoder + class SELayer(nn.Module): def __init__(self, channel, reduction=8): super(SELayer, self).__init__() diff --git a/TTS/encoder/utils/samplers.py b/TTS/encoder/utils/samplers.py index 947f5da0..08256b34 100644 --- a/TTS/encoder/utils/samplers.py +++ b/TTS/encoder/utils/samplers.py @@ -1,4 +1,5 @@ import random + from torch.utils.data.sampler import Sampler, SubsetRandomSampler @@ -34,10 +35,21 @@ class PerfectBatchSampler(Sampler): drop_last (bool): if True, drops last incomplete batch. 
""" - def __init__(self, dataset_items, classes, batch_size, num_classes_in_batch, num_gpus=1, shuffle=True, drop_last=False, label_key="class_name"): + def __init__( + self, + dataset_items, + classes, + batch_size, + num_classes_in_batch, + num_gpus=1, + shuffle=True, + drop_last=False, + label_key="class_name", + ): super().__init__(dataset_items) - assert batch_size % (num_classes_in_batch * num_gpus) == 0, ( - 'Batch size must be divisible by number of classes times the number of data parallel devices (if enabled).') + assert ( + batch_size % (num_classes_in_batch * num_gpus) == 0 + ), "Batch size must be divisible by number of classes times the number of data parallel devices (if enabled)." label_indices = {} for idx, item in enumerate(dataset_items): @@ -93,7 +105,7 @@ class PerfectBatchSampler(Sampler): if groups % self._dp_devices == 0: yield batch else: - batch = batch[:(groups // self._dp_devices) * self._dp_devices * self._num_classes_in_batch] + batch = batch[: (groups // self._dp_devices) * self._dp_devices * self._num_classes_in_batch] if len(batch) > 0: yield batch diff --git a/TTS/tts/models/base_tts.py b/TTS/tts/models/base_tts.py index 222f8519..945c031f 100644 --- a/TTS/tts/models/base_tts.py +++ b/TTS/tts/models/base_tts.py @@ -7,15 +7,15 @@ import torch.distributed as dist from coqpit import Coqpit from torch import nn from torch.utils.data import DataLoader +from torch.utils.data.sampler import WeightedRandomSampler from trainer.torch import DistributedSampler, DistributedSamplerWrapper from TTS.model import BaseTrainerModel from TTS.tts.datasets.dataset import TTSDataset from TTS.tts.utils.languages import LanguageManager, get_language_balancer_weights -from TTS.tts.utils.speakers import SpeakerManager, get_speaker_manager, get_speaker_balancer_weights +from TTS.tts.utils.speakers import SpeakerManager, get_speaker_balancer_weights, get_speaker_manager from TTS.tts.utils.synthesis import synthesis from TTS.tts.utils.visual import plot_alignment, plot_spectrogram -from torch.utils.data.sampler import WeightedRandomSampler # pylint: skip-file @@ -258,7 +258,7 @@ class BaseTTS(BaseTrainerModel): # sampler for DDP if sampler is None: sampler = DistributedSampler(dataset) if num_gpus > 1 else None - else: # If a sampler is already defined use this sampler and DDP sampler together + else: # If a sampler is already defined use this sampler and DDP sampler together sampler = DistributedSamplerWrapper(sampler) if num_gpus > 1 else sampler return sampler diff --git a/TTS/tts/models/vits.py b/TTS/tts/models/vits.py index 818b9a54..afadbadd 100644 --- a/TTS/tts/models/vits.py +++ b/TTS/tts/models/vits.py @@ -994,8 +994,11 @@ class Vits(BaseTTS): outputs = {"model_outputs": o, "alignments": attn.squeeze(1), "z": z, "z_p": z_p, "m_p": m_p, "logs_p": logs_p} return outputs + @torch.no_grad() - def inference_voice_conversion(self, reference_wav, speaker_id=None, d_vector=None, reference_speaker_id=None, reference_d_vector=None): + def inference_voice_conversion( + self, reference_wav, speaker_id=None, d_vector=None, reference_speaker_id=None, reference_d_vector=None + ): """Inference for voice conversion Args: @@ -1006,7 +1009,13 @@ class Vits(BaseTTS): reference_d_vector (Tensor): d_vector embedding of the reference_wav speaker. 
Tensor of shape `[B, C]` """ # compute spectrograms - y = wav_to_spec(reference_wav, self.config.audio.fft_size, self.config.audio.hop_length, self.config.audio.win_length, center=False).transpose(1, 2) + y = wav_to_spec( + reference_wav, + self.config.audio.fft_size, + self.config.audio.hop_length, + self.config.audio.win_length, + center=False, + ).transpose(1, 2) y_lengths = torch.tensor([y.size(-1)]).to(y.device) speaker_cond_src = reference_speaker_id if reference_speaker_id is not None else reference_d_vector speaker_cond_tgt = speaker_id if speaker_id is not None else d_vector diff --git a/TTS/tts/utils/speakers.py b/TTS/tts/utils/speakers.py index 1a5da94a..0227412d 100644 --- a/TTS/tts/utils/speakers.py +++ b/TTS/tts/utils/speakers.py @@ -269,7 +269,9 @@ class SpeakerManager: """ self.speaker_encoder_config = load_config(config_path) self.speaker_encoder = setup_speaker_encoder_model(self.speaker_encoder_config) - self.speaker_encoder_criterion = self.speaker_encoder.load_checkpoint(self.speaker_encoder_config, model_path, eval=True, use_cuda=self.use_cuda) + self.speaker_encoder_criterion = self.speaker_encoder.load_checkpoint( + self.speaker_encoder_config, model_path, eval=True, use_cuda=self.use_cuda + ) self.speaker_encoder_ap = AudioProcessor(**self.speaker_encoder_config.audio) def compute_d_vector_from_clip(self, wav_file: Union[str, List[str]]) -> list: diff --git a/TTS/tts/utils/synthesis.py b/TTS/tts/utils/synthesis.py index 582fb4f1..f9e13251 100644 --- a/TTS/tts/utils/synthesis.py +++ b/TTS/tts/utils/synthesis.py @@ -206,6 +206,7 @@ def synthesis( } return return_dict + def transfer_voice( model, CONFIG, @@ -269,12 +270,7 @@ def transfer_voice( _func = model.module.inference_voice_conversion else: _func = model.inference_voice_conversion - model_outputs = _func( - reference_wav, - speaker_id, - d_vector, - reference_speaker_id, - reference_d_vector) + model_outputs = _func(reference_wav, speaker_id, d_vector, reference_speaker_id, reference_d_vector) # convert outputs to numpy # plot results diff --git a/TTS/utils/synthesizer.py b/TTS/utils/synthesizer.py index 687794b4..2ea23adb 100644 --- a/TTS/utils/synthesizer.py +++ b/TTS/utils/synthesizer.py @@ -119,7 +119,7 @@ class Synthesizer(object): if use_cuda: self.tts_model.cuda() - if self.encoder_checkpoint and hasattr(self.tts_model, "speaker_manager"): + if self.encoder_checkpoint and hasattr(self.tts_model, "speaker_manager"): self.tts_model.speaker_manager.init_speaker_encoder(self.encoder_checkpoint, self.encoder_config) def _set_speaker_encoder_paths_from_tts_config(self): @@ -199,8 +199,8 @@ class Synthesizer(object): if not text and not reference_wav: raise ValueError( - "You need to define either `text` (for sythesis) or a `reference_wav` (for voice conversion) to use the Coqui TTS API." - ) + "You need to define either `text` (for sythesis) or a `reference_wav` (for voice conversion) to use the Coqui TTS API." + ) if text: sens = self.split_into_sentences(text) @@ -214,7 +214,9 @@ class Synthesizer(object): if speaker_name and isinstance(speaker_name, str): if self.tts_config.use_d_vector_file: # get the average speaker embedding from the saved d_vectors. 
- speaker_embedding = self.tts_model.speaker_manager.get_mean_d_vector(speaker_name, num_samples=None, randomize=False) + speaker_embedding = self.tts_model.speaker_manager.get_mean_d_vector( + speaker_name, num_samples=None, randomize=False + ) speaker_embedding = np.array(speaker_embedding)[None, :] # [1 x embedding_dim] else: # get speaker idx from the speaker name @@ -315,25 +317,31 @@ class Synthesizer(object): if reference_speaker_name and isinstance(reference_speaker_name, str): if self.tts_config.use_d_vector_file: # get the speaker embedding from the saved d_vectors. - reference_speaker_embedding = self.tts_model.speaker_manager.get_d_vectors_by_speaker(reference_speaker_name)[0] - reference_speaker_embedding = np.array(reference_speaker_embedding)[None, :] # [1 x embedding_dim] + reference_speaker_embedding = self.tts_model.speaker_manager.get_d_vectors_by_speaker( + reference_speaker_name + )[0] + reference_speaker_embedding = np.array(reference_speaker_embedding)[ + None, : + ] # [1 x embedding_dim] else: # get speaker idx from the speaker name reference_speaker_id = self.tts_model.speaker_manager.speaker_ids[reference_speaker_name] else: - reference_speaker_embedding = self.tts_model.speaker_manager.compute_d_vector_from_clip(reference_wav) + reference_speaker_embedding = self.tts_model.speaker_manager.compute_d_vector_from_clip( + reference_wav + ) outputs = transfer_voice( - model=self.tts_model, - CONFIG=self.tts_config, - use_cuda=self.use_cuda, - reference_wav=reference_wav, - speaker_id=speaker_id, - d_vector=speaker_embedding, - use_griffin_lim=use_gl, - reference_speaker_id=reference_speaker_id, - reference_d_vector=reference_speaker_embedding - ) + model=self.tts_model, + CONFIG=self.tts_config, + use_cuda=self.use_cuda, + reference_wav=reference_wav, + speaker_id=speaker_id, + d_vector=speaker_embedding, + use_griffin_lim=use_gl, + reference_speaker_id=reference_speaker_id, + reference_d_vector=reference_speaker_embedding, + ) waveform = outputs if not use_gl: mel_postnet_spec = outputs[0].detach().cpu().numpy() diff --git a/recipes/ljspeech/hifigan/train_hifigan.py b/recipes/ljspeech/hifigan/train_hifigan.py index 6a739009..b4cbae63 100644 --- a/recipes/ljspeech/hifigan/train_hifigan.py +++ b/recipes/ljspeech/hifigan/train_hifigan.py @@ -41,11 +41,6 @@ model = GAN(config, ap) # init the trainer and 🚀 trainer = Trainer( - TrainerArgs(), - config, - output_path, - model=model, - train_samples=train_samples, - eval_samples=eval_samples + TrainerArgs(), config, output_path, model=model, train_samples=train_samples, eval_samples=eval_samples ) trainer.fit() diff --git a/recipes/ljspeech/multiband_melgan/train_multiband_melgan.py b/recipes/ljspeech/multiband_melgan/train_multiband_melgan.py index d5ca9a76..225f5a30 100644 --- a/recipes/ljspeech/multiband_melgan/train_multiband_melgan.py +++ b/recipes/ljspeech/multiband_melgan/train_multiband_melgan.py @@ -41,11 +41,6 @@ model = GAN(config, ap) # init the trainer and 🚀 trainer = Trainer( - TrainerArgs(), - config, - output_path, - model=model, - train_samples=train_samples, - eval_samples=eval_samples + TrainerArgs(), config, output_path, model=model, train_samples=train_samples, eval_samples=eval_samples ) trainer.fit() diff --git a/recipes/ljspeech/tacotron2-DDC/train_tacotron_ddc.py b/recipes/ljspeech/tacotron2-DDC/train_tacotron_ddc.py index a0ff8b02..04e6150e 100644 --- a/recipes/ljspeech/tacotron2-DDC/train_tacotron_ddc.py +++ b/recipes/ljspeech/tacotron2-DDC/train_tacotron_ddc.py @@ -84,11 +84,6 @@ model = 
Tacotron2(config, ap, tokenizer, speaker_manager=None) # init the trainer and 🚀 trainer = Trainer( - TrainerArgs(), - config, - output_path, - model=model, - train_samples=train_samples, - eval_samples=eval_samples + TrainerArgs(), config, output_path, model=model, train_samples=train_samples, eval_samples=eval_samples ) trainer.fit() diff --git a/recipes/ljspeech/univnet/train.py b/recipes/ljspeech/univnet/train.py index 592b9a76..81d2b889 100644 --- a/recipes/ljspeech/univnet/train.py +++ b/recipes/ljspeech/univnet/train.py @@ -40,11 +40,6 @@ model = GAN(config, ap) # init the trainer and 🚀 trainer = Trainer( - TrainerArgs(), - config, - output_path, - model=model, - train_samples=train_samples, - eval_samples=eval_samples + TrainerArgs(), config, output_path, model=model, train_samples=train_samples, eval_samples=eval_samples ) trainer.fit() diff --git a/recipes/multilingual/vits_tts/train_vits_tts.py b/recipes/multilingual/vits_tts/train_vits_tts.py index c4ed0dda..26eb46be 100644 --- a/recipes/multilingual/vits_tts/train_vits_tts.py +++ b/recipes/multilingual/vits_tts/train_vits_tts.py @@ -6,12 +6,11 @@ from trainer import Trainer, TrainerArgs from TTS.config.shared_configs import BaseAudioConfig from TTS.tts.configs.shared_configs import BaseDatasetConfig from TTS.tts.configs.vits_config import VitsConfig -from TTS.tts.models.vits import CharactersConfig from TTS.tts.datasets import load_tts_samples -from TTS.tts.models.vits import Vits, VitsArgs +from TTS.tts.models.vits import CharactersConfig, Vits, VitsArgs from TTS.tts.utils.languages import LanguageManager -from TTS.tts.utils.text.tokenizer import TTSTokenizer from TTS.tts.utils.speakers import SpeakerManager +from TTS.tts.utils.text.tokenizer import TTSTokenizer from TTS.utils.audio import AudioProcessor output_path = os.path.dirname(os.path.abspath(__file__)) @@ -131,11 +130,6 @@ model = Vits(config, ap, tokenizer, speaker_manager, language_manager) # init the trainer and 🚀 trainer = Trainer( - TrainerArgs(), - config, - output_path, - model=model, - train_samples=train_samples, - eval_samples=eval_samples + TrainerArgs(), config, output_path, model=model, train_samples=train_samples, eval_samples=eval_samples ) trainer.fit() diff --git a/tests/data_tests/test_samplers.py b/tests/data_tests/test_samplers.py index c888c629..42f1bfd5 100644 --- a/tests/data_tests/test_samplers.py +++ b/tests/data_tests/test_samplers.py @@ -1,14 +1,13 @@ import functools - import unittest import torch from TTS.config.shared_configs import BaseDatasetConfig +from TTS.encoder.utils.samplers import PerfectBatchSampler from TTS.tts.datasets import load_tts_samples from TTS.tts.utils.languages import get_language_balancer_weights from TTS.tts.utils.speakers import get_speaker_balancer_weights -from TTS.encoder.utils.samplers import PerfectBatchSampler # Fixing random state to avoid random fails torch.manual_seed(0) @@ -60,7 +59,9 @@ class TestSamplers(unittest.TestCase): assert not is_balanced(en, pt), "Random sampler is supposed to be unbalanced" def test_language_weighted_random_sampler(self): # pylint: disable=no-self-use - weighted_sampler = torch.utils.data.sampler.WeightedRandomSampler(get_language_balancer_weights(train_samples), len(train_samples)) + weighted_sampler = torch.utils.data.sampler.WeightedRandomSampler( + get_language_balancer_weights(train_samples), len(train_samples) + ) ids = functools.reduce(lambda a, b: a + b, [list(weighted_sampler) for i in range(100)]) en, pt = 0, 0 for index in ids: @@ -73,7 +74,9 @@ class 
TestSamplers(unittest.TestCase): def test_speaker_weighted_random_sampler(self): # pylint: disable=no-self-use - weighted_sampler = torch.utils.data.sampler.WeightedRandomSampler(get_speaker_balancer_weights(train_samples), len(train_samples)) + weighted_sampler = torch.utils.data.sampler.WeightedRandomSampler( + get_speaker_balancer_weights(train_samples), len(train_samples) + ) ids = functools.reduce(lambda a, b: a + b, [list(weighted_sampler) for i in range(100)]) spk1, spk2 = 0, 0 for index in ids: @@ -92,11 +95,12 @@ class TestSamplers(unittest.TestCase): sampler = PerfectBatchSampler( train_samples, classes, - batch_size=2 * 3, # total batch size + batch_size=2 * 3, # total batch size num_classes_in_batch=2, label_key="speaker_name", shuffle=False, - drop_last=True) + drop_last=True, + ) batchs = functools.reduce(lambda a, b: a + b, [list(sampler) for i in range(100)]) for batch in batchs: spk1, spk2 = 0, 0 @@ -116,11 +120,12 @@ class TestSamplers(unittest.TestCase): sampler = PerfectBatchSampler( train_samples, classes, - batch_size=2 * 3, # total batch size + batch_size=2 * 3, # total batch size num_classes_in_batch=2, label_key="speaker_name", shuffle=True, - drop_last=False) + drop_last=False, + ) batchs = functools.reduce(lambda a, b: a + b, [list(sampler) for i in range(100)]) for batch in batchs: spk1, spk2 = 0, 0
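For reference, the batch regrouping that train_encoder.py performs in both the train and eval loops ("perfect sampler produces [3,2,1,3,2,1] we need [3,3,2,2,1,1]") is just a view/transpose/reshape round trip. A minimal sketch of that step, assuming toy values in place of c.num_utter_per_class and c.num_classes_in_batch (the tensor contents below are made up purely for illustration):

import torch

# assumed illustration values, standing in for c.num_utter_per_class
# and c.num_classes_in_batch from the training config
num_utter_per_class = 2
num_classes_in_batch = 3

# labels as PerfectBatchSampler emits them: classes interleaved,
# one utterance per class, then the pattern repeats
labels = torch.tensor([3, 2, 1, 3, 2, 1])

# view as [num_utter_per_class, num_classes_in_batch], swap the axes,
# then flatten back: utterances of the same class become adjacent
grouped = torch.transpose(
    labels.view(num_utter_per_class, num_classes_in_batch), 0, 1
).reshape(labels.shape)

print(grouped)  # tensor([3, 3, 2, 2, 1, 1])

The inputs tensor gets the same treatment with a trailing -1 in the view, so the feature dimension rides along unchanged and each embedding stays aligned with its label before the loss reshapes the outputs per class.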