diff --git a/TTS/bin/compute_attention_masks.py b/TTS/bin/compute_attention_masks.py
index fc02144a..53246e07 100644
--- a/TTS/bin/compute_attention_masks.py
+++ b/TTS/bin/compute_attention_masks.py
@@ -16,6 +16,7 @@ from TTS.utils.io import load_config
 
 
 if __name__ == '__main__':
+    # pylint: disable=bad-continuation
     parser = argparse.ArgumentParser(
         description='''Extract attention masks from trained Tacotron/Tacotron2 models.
 These masks can be used for different purposes including training a TTS model with a Duration Predictor.\n\n'''
diff --git a/TTS/bin/synthesize.py b/TTS/bin/synthesize.py
index 9a06c866..e0d214d5 100755
--- a/TTS/bin/synthesize.py
+++ b/TTS/bin/synthesize.py
@@ -179,7 +179,6 @@ def main():
 
     # load models
     synthesizer = Synthesizer(model_path, config_path, vocoder_path, vocoder_config_path, args.use_cuda)
-    use_griffin_lim = vocoder_path is None
     print(" > Text: {}".format(args.text))
 
     # # handle multi-speaker setting
@@ -218,4 +217,4 @@ def main():
 
 
 if __name__ == "__main__":
-    main()
\ No newline at end of file
+    main()
diff --git a/TTS/bin/train_encoder.py b/TTS/bin/train_encoder.py
index 5201f548..12fba6e1 100644
--- a/TTS/bin/train_encoder.py
+++ b/TTS/bin/train_encoder.py
@@ -34,7 +34,9 @@ print(" > Using CUDA: ", use_cuda)
 print(" > Number of GPUs: ", num_gpus)
 
 
-def setup_loader(ap: AudioProcessor, is_val: bool=False, verbose: bool=False):
+def setup_loader(ap: AudioProcessor,
+                 is_val: bool = False,
+                 verbose: bool = False):
     if is_val:
         loader = None
     else:
@@ -254,8 +256,7 @@ if __name__ == '__main__':
     if args.restore_path:
         new_fields["restore_path"] = args.restore_path
     new_fields["github_branch"] = get_git_branch()
-    copy_model_files(c, args.config_path, OUT_PATH,
-                     new_fields)
+    copy_model_files(c, args.config_path, OUT_PATH, new_fields)
 
     LOG_DIR = OUT_PATH
     tb_logger = TensorboardLogger(LOG_DIR, model_name='Speaker_Encoder')
diff --git a/TTS/bin/train_glow_tts.py b/TTS/bin/train_glow_tts.py
index d03ab1ee..5cd23ce4 100644
--- a/TTS/bin/train_glow_tts.py
+++ b/TTS/bin/train_glow_tts.py
@@ -119,7 +119,7 @@ def format_data(data):
            avg_text_length, avg_spec_length, attn_mask, item_idx
 
 
-def data_depended_init(data_loader, model, ap):
+def data_depended_init(data_loader, model):
     """Data depended initialization for activation normalization."""
     if hasattr(model, 'module'):
         for f in model.module.decoder.flows:
@@ -138,7 +138,7 @@ def data_depended_init(data_loader, model, ap):
 
             # format data
             text_input, text_lengths, mel_input, mel_lengths, spekaer_embed,\
-                _, _, attn_mask, item_idx = format_data(data)
+                _, _, attn_mask, _ = format_data(data)
 
             # forward pass model
             _ = model.forward(
@@ -177,7 +177,7 @@ def train(data_loader, model, criterion, optimizer, scheduler,
 
         # format data
         text_input, text_lengths, mel_input, mel_lengths, speaker_c,\
-            avg_text_length, avg_spec_length, attn_mask, item_idx = format_data(data)
+            avg_text_length, avg_spec_length, attn_mask, _ = format_data(data)
 
         loader_time = time.time() - end_time
 
@@ -191,20 +191,20 @@
 
         # compute loss
         loss_dict = criterion(z, y_mean, y_log_scale, logdet, mel_lengths,
-                            o_dur_log, o_total_dur, text_lengths)
+                              o_dur_log, o_total_dur, text_lengths)
 
         # backward pass with loss scaling
         if c.mixed_precision:
             scaler.scale(loss_dict['loss']).backward()
             scaler.unscale_(optimizer)
             grad_norm = torch.nn.utils.clip_grad_norm_(model.parameters(),
-                                                    c.grad_clip)
+                                                       c.grad_clip)
             scaler.step(optimizer)
             scaler.update()
         else:
             loss_dict['loss'].backward()
             grad_norm = torch.nn.utils.clip_grad_norm_(model.parameters(),
-                                                    c.grad_clip)
+                                                       c.grad_clip)
             optimizer.step()
 
         # setup lr
@@ -332,7 +332,7 @@ def evaluate(data_loader, model, criterion, ap, global_step, epoch):
 
             # format data
             text_input, text_lengths, mel_input, mel_lengths, speaker_c,\
-                _, _, attn_mask, item_idx = format_data(data)
+                _, _, attn_mask, _ = format_data(data)
 
             # forward pass model
             z, logdet, y_mean, y_log_scale, alignments, o_dur_log, o_total_dur = model.forward(
@@ -550,13 +550,14 @@ def main(args):  # pylint: disable=redefined-outer-name
     eval_loader = setup_loader(ap, 1, is_val=True, verbose=True)
 
     global_step = args.restore_step
-    model = data_depended_init(train_loader, model, ap)
+    model = data_depended_init(train_loader, model)
     for epoch in range(0, c.epochs):
         c_logger.print_epoch_start(epoch, c.epochs)
         train_avg_loss_dict, global_step = train(train_loader, model,
                                                  criterion, optimizer,
                                                  scheduler, ap, global_step, epoch)
-        eval_avg_loss_dict = evaluate(eval_loader , model, criterion, ap, global_step, epoch)
+        eval_avg_loss_dict = evaluate(eval_loader, model, criterion, ap,
+                                      global_step, epoch)
         c_logger.print_epoch_end(epoch, eval_avg_loss_dict)
         target_loss = train_avg_loss_dict['avg_loss']
         if c.run_eval:
@@ -632,8 +633,7 @@ if __name__ == '__main__':
     if args.restore_path:
         new_fields["restore_path"] = args.restore_path
     new_fields["github_branch"] = get_git_branch()
-    copy_model_files(c, args.config_path,
-                     OUT_PATH, new_fields)
+    copy_model_files(c, args.config_path, OUT_PATH, new_fields)
 
     os.chmod(AUDIO_PATH, 0o775)
     os.chmod(OUT_PATH, 0o775)
diff --git a/TTS/bin/train_speedy_speech.py b/TTS/bin/train_speedy_speech.py
index a24cf8bc..667f5abd 100644
--- a/TTS/bin/train_speedy_speech.py
+++ b/TTS/bin/train_speedy_speech.py
@@ -175,13 +175,13 @@ def train(data_loader, model, criterion, optimizer, scheduler,
             scaler.scale(loss_dict['loss']).backward()
             scaler.unscale_(optimizer)
             grad_norm = torch.nn.utils.clip_grad_norm_(model.parameters(),
-                                                    c.grad_clip)
+                                                       c.grad_clip)
             scaler.step(optimizer)
             scaler.update()
         else:
             loss_dict['loss'].backward()
             grad_norm = torch.nn.utils.clip_grad_norm_(model.parameters(),
-                                                    c.grad_clip)
+                                                       c.grad_clip)
             optimizer.step()
 
         # setup lr
@@ -518,7 +518,8 @@ def main(args):  # pylint: disable=redefined-outer-name
         train_avg_loss_dict, global_step = train(train_loader, model,
                                                  criterion, optimizer,
                                                  scheduler, ap, global_step, epoch)
-        eval_avg_loss_dict = evaluate(eval_loader , model, criterion, ap, global_step, epoch)
+        eval_avg_loss_dict = evaluate(eval_loader, model, criterion, ap,
+                                      global_step, epoch)
         c_logger.print_epoch_end(epoch, eval_avg_loss_dict)
         target_loss = train_avg_loss_dict['avg_loss']
         if c.run_eval:
diff --git a/TTS/bin/train_tacotron.py b/TTS/bin/train_tacotron.py
index be609905..4640a3eb 100644
--- a/TTS/bin/train_tacotron.py
+++ b/TTS/bin/train_tacotron.py
@@ -178,10 +178,10 @@ def train(data_loader, model, criterion, optimizer, optimizer_st, scheduler,
 
             # compute loss
             loss_dict = criterion(postnet_output, decoder_output, mel_input,
-                                  linear_input, stop_tokens, stop_targets,
-                                  mel_lengths, decoder_backward_output,
-                                  alignments, alignment_lengths, alignments_backward,
-                                  text_lengths)
+                                  linear_input, stop_tokens, stop_targets,
+                                  mel_lengths, decoder_backward_output,
+                                  alignments, alignment_lengths,
+                                  alignments_backward, text_lengths)
 
         # check nan loss
         if torch.isnan(loss_dict['loss']).any():
@@ -199,7 +199,7 @@
 
             # stopnet optimizer step
             if c.separate_stopnet:
-                scaler_st.scale( loss_dict['stopnet_loss']).backward()
+                scaler_st.scale(loss_dict['stopnet_loss']).backward()
                 scaler.unscale_(optimizer_st)
                 optimizer_st, _ = adam_weight_decay(optimizer_st)
                 grad_norm_st, _ = check_update(model.decoder.stopnet, 1.0)
@@ -535,7 +535,6 @@ def main(args):  # pylint: disable=redefined-outer-name
 
     # setup criterion
     criterion = TacotronLoss(c, stopnet_pos_weight=c.stopnet_pos_weight, ga_sigma=0.4)
-
     if args.restore_path:
         checkpoint = torch.load(args.restore_path, map_location='cpu')
         try:
@@ -706,8 +705,7 @@ if __name__ == '__main__':
     if args.restore_path:
         new_fields["restore_path"] = args.restore_path
     new_fields["github_branch"] = get_git_branch()
-    copy_model_files(c, args.config_path,
-                     OUT_PATH, new_fields)
+    copy_model_files(c, args.config_path, OUT_PATH, new_fields)
 
     os.chmod(AUDIO_PATH, 0o775)
     os.chmod(OUT_PATH, 0o775)
diff --git a/TTS/bin/train_vocoder_gan.py b/TTS/bin/train_vocoder_gan.py
index 5f1e8c63..a1d1b322 100644
--- a/TTS/bin/train_vocoder_gan.py
+++ b/TTS/bin/train_vocoder_gan.py
@@ -33,9 +33,8 @@ use_cuda, num_gpus = setup_torch_training_env(True, True)
 
 
 def setup_loader(ap, is_val=False, verbose=False):
-    if is_val and not c.run_eval:
-        loader = None
-    else:
+    loader = None
+    if not is_val or c.run_eval:
         dataset = GANDataset(ap=ap,
                              items=eval_data if is_val else train_data,
                              seq_len=c.seq_len,
@@ -274,14 +273,14 @@ def train(model_G, criterion_G, optimizer_G, model_D, criterion_D, optimizer_D,
 
             # compute spectrograms
             figures = plot_results(y_hat_vis, y_G, ap, global_step,
-                                 'train')
+                                   'train')
             tb_logger.tb_train_figures(global_step, figures)
 
             # Sample audio
             sample_voice = y_hat_vis[0].squeeze(0).detach().cpu().numpy()
             tb_logger.tb_train_audios(global_step,
-                                    {'train/audio': sample_voice},
-                                    c.audio["sample_rate"])
+                                      {'train/audio': sample_voice},
+                                      c.audio["sample_rate"])
         end_time = time.time()
 
     # print epoch stats
@@ -430,11 +429,11 @@ def evaluate(model_G, criterion_G, model_D, criterion_D, ap, global_step, epoch)
 
         # Sample audio
         sample_voice = y_hat[0].squeeze(0).detach().cpu().numpy()
         tb_logger.tb_eval_audios(global_step, {'eval/audio': sample_voice},
-                               c.audio["sample_rate"])
+                                 c.audio["sample_rate"])
         tb_logger.tb_eval_stats(global_step, keep_avg.avg_values)
 
-    # synthesize a full voice 
+    # synthesize a full voice
     data_loader.return_segments = False
 
     return keep_avg.avg_values
@@ -639,8 +638,7 @@ if __name__ == '__main__':
     if args.restore_path:
         new_fields["restore_path"] = args.restore_path
     new_fields["github_branch"] = get_git_branch()
-    copy_model_files(c, args.config_path,
-                     OUT_PATH, new_fields)
+    copy_model_files(c, args.config_path, OUT_PATH, new_fields)
 
     os.chmod(AUDIO_PATH, 0o775)
     os.chmod(OUT_PATH, 0o775)
diff --git a/TTS/bin/train_vocoder_wavegrad.py b/TTS/bin/train_vocoder_wavegrad.py
index fe5fb3d7..c53612c2 100644
--- a/TTS/bin/train_vocoder_wavegrad.py
+++ b/TTS/bin/train_vocoder_wavegrad.py
@@ -34,16 +34,16 @@ def setup_loader(ap, is_val=False, verbose=False):
         loader = None
     else:
         dataset = WaveGradDataset(ap=ap,
-                                items=eval_data if is_val else train_data,
-                                seq_len=c.seq_len,
-                                hop_len=ap.hop_length,
-                                pad_short=c.pad_short,
-                                conv_pad=c.conv_pad,
-                                is_training=not is_val,
-                                return_segments=True,
-                                use_noise_augment=False,
-                                use_cache=c.use_cache,
-                                verbose=verbose)
+                                  items=eval_data if is_val else train_data,
+                                  seq_len=c.seq_len,
+                                  hop_len=ap.hop_length,
+                                  pad_short=c.pad_short,
+                                  conv_pad=c.conv_pad,
+                                  is_training=not is_val,
+                                  return_segments=True,
+                                  use_noise_augment=False,
+                                  use_cache=c.use_cache,
+                                  verbose=verbose)
         sampler = DistributedSampler(dataset) if num_gpus > 1 else None
         loader = DataLoader(dataset,
                             batch_size=c.batch_size,
@@ -54,7 +54,6 @@ def setup_loader(ap, is_val=False, verbose=False):
                             if is_val else c.num_loader_workers,
                             pin_memory=False)
-
     return loader
 
 
@@ -79,8 +78,8 @@ def format_test_data(data):
     return m, x
 
 
-def train(model, criterion, optimizer,
-          scheduler, scaler, ap, global_step, epoch):
+def train(model, criterion, optimizer, scheduler, scaler, ap, global_step,
+          epoch):
     data_loader = setup_loader(ap, is_val=False, verbose=(epoch == 0))
     model.train()
    epoch_time = 0
@@ -94,7 +93,8 @@ def train(model, criterion, optimizer,
     c_logger.print_train_start()
     # setup noise schedule
     noise_schedule = c['train_noise_schedule']
-    betas = np.linspace(noise_schedule['min_val'], noise_schedule['max_val'], noise_schedule['num_steps'])
+    betas = np.linspace(noise_schedule['min_val'], noise_schedule['max_val'],
+                        noise_schedule['num_steps'])
     if hasattr(model, 'module'):
         model.module.compute_noise_level(betas)
     else:
         model.compute_noise_level(betas)
@@ -120,7 +120,7 @@
 
         # compute losses
         loss = criterion(noise, noise_hat)
-        loss_wavegrad_dict = {'wavegrad_loss':loss}
+        loss_wavegrad_dict = {'wavegrad_loss': loss}
 
         # check nan loss
         if torch.isnan(loss).any():
@@ -133,13 +133,13 @@
             scaler.scale(loss).backward()
             scaler.unscale_(optimizer)
             grad_norm = torch.nn.utils.clip_grad_norm_(model.parameters(),
-                                                    c.clip_grad)
+                                                       c.clip_grad)
             scaler.step(optimizer)
             scaler.update()
         else:
             loss.backward()
             grad_norm = torch.nn.utils.clip_grad_norm_(model.parameters(),
-                                                    c.clip_grad)
+                                                       c.clip_grad)
             optimizer.step()
 
         # schedule update
@@ -205,7 +205,8 @@
                     epoch,
                     OUT_PATH,
                     model_losses=loss_dict,
-                    scaler=scaler.state_dict() if c.mixed_precision else None)
+                    scaler=scaler.state_dict()
+                    if c.mixed_precision else None)
 
         end_time = time.time()
 
@@ -246,14 +247,12 @@ def evaluate(model, criterion, ap, global_step, epoch):
         else:
             noise, x_noisy, noise_scale = model.compute_y_n(x)
-
         # forward pass
         noise_hat = model(x_noisy, m, noise_scale)
 
         # compute losses
         loss = criterion(noise, noise_hat)
-        loss_wavegrad_dict = {'wavegrad_loss':loss}
-
+        loss_wavegrad_dict = {'wavegrad_loss': loss}
 
         loss_dict = dict()
         for key, value in loss_wavegrad_dict.items():
@@ -284,7 +283,9 @@
 
         # setup noise schedule and inference
         noise_schedule = c['test_noise_schedule']
-        betas = np.linspace(noise_schedule['min_val'], noise_schedule['max_val'], noise_schedule['num_steps'])
+        betas = np.linspace(noise_schedule['min_val'],
+                            noise_schedule['max_val'],
+                            noise_schedule['num_steps'])
         if hasattr(model, 'module'):
             model.module.compute_noise_level(betas)
             # compute voice
@@ -315,7 +316,8 @@ def main(args):  # pylint: disable=redefined-outer-name
     print(f" > Loading wavs from: {c.data_path}")
     if c.feature_path is not None:
         print(f" > Loading features from: {c.feature_path}")
-        eval_data, train_data = load_wav_feat_data(c.data_path, c.feature_path, c.eval_split_size)
+        eval_data, train_data = load_wav_feat_data(c.data_path, c.feature_path,
+                                                   c.eval_split_size)
     else:
         eval_data, train_data = load_wav_data(c.data_path, c.eval_split_size)
@@ -395,26 +397,25 @@ def main(args):  # pylint: disable=redefined-outer-name
     global_step = args.restore_step
     for epoch in range(0, c.epochs):
         c_logger.print_epoch_start(epoch, c.epochs)
-        _, global_step = train(model, criterion, optimizer,
-                               scheduler, scaler, ap, global_step,
-                               epoch)
-        eval_avg_loss_dict = evaluate(model, criterion, ap,
-                                      global_step, epoch)
+        _, global_step = train(model, criterion, optimizer, scheduler, scaler,
+                               ap, global_step, epoch)
+        eval_avg_loss_dict = evaluate(model, criterion, ap, global_step, epoch)
         c_logger.print_epoch_end(epoch, eval_avg_loss_dict)
         target_loss = eval_avg_loss_dict[c.target_loss]
-        best_loss = save_best_model(target_loss,
-                                    best_loss,
-                                    model,
-                                    optimizer,
-                                    scheduler,
-                                    None,
-                                    None,
-                                    None,
-                                    global_step,
-                                    epoch,
-                                    OUT_PATH,
-                                    model_losses=eval_avg_loss_dict,
-                                    scaler=scaler.state_dict() if c.mixed_precision else None)
+        best_loss = save_best_model(
+            target_loss,
+            best_loss,
+            model,
+            optimizer,
+            scheduler,
+            None,
+            None,
+            None,
+            global_step,
+            epoch,
+            OUT_PATH,
+            model_losses=eval_avg_loss_dict,
+            scaler=scaler.state_dict() if c.mixed_precision else None)
 
 
 if __name__ == '__main__':
@@ -486,8 +487,7 @@ if __name__ == '__main__':
     if args.restore_path:
         new_fields["restore_path"] = args.restore_path
     new_fields["github_branch"] = get_git_branch()
-    copy_model_files(c, args.config_path,
-                     OUT_PATH, new_fields)
+    copy_model_files(c, args.config_path, OUT_PATH, new_fields)
 
     os.chmod(AUDIO_PATH, 0o775)
     os.chmod(OUT_PATH, 0o775)
diff --git a/TTS/bin/train_vocoder_wavernn.py b/TTS/bin/train_vocoder_wavernn.py
index 14d57837..6847e011 100644
--- a/TTS/bin/train_vocoder_wavernn.py
+++ b/TTS/bin/train_vocoder_wavernn.py
@@ -200,12 +200,9 @@ def train(model, optimizer, criterion, scheduler, scaler, ap, global_step, epoch
                 train_data[rand_idx], (tuple, list)) else train_data[rand_idx][0]
             wav = ap.load_wav(wav_path)
             ground_mel = ap.melspectrogram(wav)
-            sample_wav = model.inference(ground_mel,
-                                         c.batched,
-                                         c.target_samples,
-                                         c.overlap_samples,
-                                         use_cuda
-                                         )
+            sample_wav = model.inference(ground_mel, c.batched,
+                                         c.target_samples, c.overlap_samples,
+                                         use_cuda)
             predict_mel = ap.melspectrogram(sample_wav)
 
             # compute spectrograms
@@ -287,12 +284,8 @@ def evaluate(model, criterion, ap, global_step, epoch):
                 eval_data[rand_idx], (tuple, list)) else eval_data[rand_idx][0]
             wav = ap.load_wav(wav_path)
             ground_mel = ap.melspectrogram(wav)
-            sample_wav = model.inference(ground_mel,
-                                         c.batched,
-                                         c.target_samples,
-                                         c.overlap_samples,
-                                         use_cuda
-                                         )
+            sample_wav = model.inference(ground_mel, c.batched, c.target_samples,
+                                         c.overlap_samples, use_cuda)
             predict_mel = ap.melspectrogram(sample_wav)
 
             # Sample audio
diff --git a/TTS/bin/tune_wavegrad.py b/TTS/bin/tune_wavegrad.py
index 7461282d..436a2764 100644
--- a/TTS/bin/tune_wavegrad.py
+++ b/TTS/bin/tune_wavegrad.py
@@ -87,5 +87,3 @@ for base in tqdm(cartesian_product(base_values, repeat=args.num_iter), total=tot
         best_schedule = {'beta': beta}
         print(f" > Found a better schedule. - MSE: {mse.item()}")
 np.save(args.output_path, best_schedule)
-
-
diff --git a/TTS/speaker_encoder/dataset.py b/TTS/speaker_encoder/dataset.py
index 33cc4f36..6110ac4d 100644
--- a/TTS/speaker_encoder/dataset.py
+++ b/TTS/speaker_encoder/dataset.py
@@ -1,10 +1,9 @@
-import numpy
-import numpy as np
 import queue
-import torch
 import random
+
+import numpy as np
+import torch
 from torch.utils.data import Dataset
-from tqdm import tqdm
 
 
 class MyDataset(Dataset):
@@ -155,7 +154,7 @@ class MyDataset(Dataset):
 
         # add random gaussian noise
         if self.additive_noise > 0:
-            noises_ = [numpy.random.normal(0, self.additive_noise, size=len(w)) for w in wavs_]
+            noises_ = [np.random.normal(0, self.additive_noise, size=len(w)) for w in wavs_]
             wavs_ = [wavs_[i] + noises_[i] for i in range(len(wavs_))]
 
         # get a random subset of each of the wavs and convert to MFCC.
diff --git a/TTS/speaker_encoder/utils/generic_utils.py b/TTS/speaker_encoder/utils/generic_utils.py
index 021c7f45..47bf79cc 100644
--- a/TTS/speaker_encoder/utils/generic_utils.py
+++ b/TTS/speaker_encoder/utils/generic_utils.py
@@ -114,4 +114,3 @@ def check_config_speaker_encoder(c):
         check_argument('path', dataset_entry, restricted=True, val_type=str)
         check_argument('meta_file_train', dataset_entry, restricted=True, val_type=[str, list])
         check_argument('meta_file_val', dataset_entry, restricted=True, val_type=str)
-
diff --git a/TTS/tts/datasets/TTSDataset.py b/TTS/tts/datasets/TTSDataset.py
index 38dd2890..3b327cbc 100644
--- a/TTS/tts/datasets/TTSDataset.py
+++ b/TTS/tts/datasets/TTSDataset.py
@@ -90,7 +90,8 @@ class MyDataset(Dataset):
         return data
 
     @staticmethod
-    def _generate_and_cache_phoneme_sequence(text, cache_path, cleaners, language, tp, add_blank):
+    def _generate_and_cache_phoneme_sequence(text, cache_path, cleaners,
+                                             language, tp, add_blank):
         """generate a phoneme sequence from text.
         since the usage is for subsequent caching, we never add bos and eos
         chars here. Instead we add those dynamically later; based on the
@@ -98,13 +99,16 @@ class MyDataset(Dataset):
         phonemes = phoneme_to_sequence(text, [cleaners],
                                        language=language,
                                        enable_eos_bos=False,
-                                       tp=tp, add_blank=add_blank)
+                                       tp=tp,
+                                       add_blank=add_blank)
         phonemes = np.asarray(phonemes, dtype=np.int32)
         np.save(cache_path, phonemes)
         return phonemes
 
     @staticmethod
-    def _load_or_generate_phoneme_sequence(wav_file, text, phoneme_cache_path, enable_eos_bos, cleaners, language, tp, add_blank):
+    def _load_or_generate_phoneme_sequence(wav_file, text, phoneme_cache_path,
+                                           enable_eos_bos, cleaners, language,
+                                           tp, add_blank):
         file_name = os.path.splitext(os.path.basename(wav_file))[0]
         # different names for normal phonemes and with blank chars.
@@ -143,12 +147,16 @@ class MyDataset(Dataset):
 
         if not self.input_seq_computed:
             if self.use_phonemes:
-                text = self._load_or_generate_phoneme_sequence(wav_file, text, self.phoneme_cache_path, self.enable_eos_bos, self.cleaners, self.phoneme_language, self.tp, self.add_blank)
+                text = self._load_or_generate_phoneme_sequence(
+                    wav_file, text, self.phoneme_cache_path,
+                    self.enable_eos_bos, self.cleaners, self.phoneme_language,
+                    self.tp, self.add_blank)
             else:
                 text = np.asarray(text_to_sequence(text, [self.cleaners],
-                                                   tp=self.tp, add_blank=self.add_blank),
-                                  dtype=np.int32)
+                                                   tp=self.tp,
+                                                   add_blank=self.add_blank),
+                                  dtype=np.int32)
 
         assert text.size > 0, self.items[idx][1]
         assert wav.size > 0, self.items[idx][1]
@@ -177,7 +185,8 @@ class MyDataset(Dataset):
         item = args[0]
         func_args = args[1]
         text, wav_file, *_ = item
-        phonemes = MyDataset._load_or_generate_phoneme_sequence(wav_file, text, *func_args)
+        phonemes = MyDataset._load_or_generate_phoneme_sequence(
+            wav_file, text, *func_args)
         return phonemes
 
     def compute_input_seq(self, num_workers=0):
@@ -188,13 +197,18 @@ class MyDataset(Dataset):
             print(" | > Computing input sequences ...")
             for idx, item in enumerate(tqdm.tqdm(self.items)):
                 text, *_ = item
-                sequence = np.asarray(text_to_sequence(text, [self.cleaners],
-                                                       tp=self.tp, add_blank=self.add_blank),
-                                      dtype=np.int32)
+                sequence = np.asarray(text_to_sequence(
+                    text, [self.cleaners],
+                    tp=self.tp,
+                    add_blank=self.add_blank),
+                                      dtype=np.int32)
                 self.items[idx][0] = sequence
 
         else:
-            func_args = [self.phoneme_cache_path, self.enable_eos_bos, self.cleaners, self.phoneme_language, self.tp, self.add_blank]
+            func_args = [
+                self.phoneme_cache_path, self.enable_eos_bos, self.cleaners,
+                self.phoneme_language, self.tp, self.add_blank
+            ]
             if self.verbose:
                 print(" | > Computing phonemes ...")
@@ -203,7 +217,11 @@ class MyDataset(Dataset):
                     self.items[idx][0] = phonemes
             else:
                 with Pool(num_workers) as p:
-                    phonemes = list(tqdm.tqdm(p.imap(MyDataset._phoneme_worker, [[item, func_args] for item in self.items]), total=len(self.items)))
+                    phonemes = list(
+                        tqdm.tqdm(p.imap(MyDataset._phoneme_worker,
+                                         [[item, func_args]
+                                          for item in self.items]),
+                                  total=len(self.items)))
                     for idx, p in enumerate(phonemes):
                         self.items[idx][0] = p
diff --git a/TTS/tts/layers/common_layers.py b/TTS/tts/layers/common_layers.py
index 5da9b49d..a23bb3f9 100644
--- a/TTS/tts/layers/common_layers.py
+++ b/TTS/tts/layers/common_layers.py
@@ -124,4 +124,4 @@ class Prenet(nn.Module):
                 x = F.dropout(F.relu(linear(x)), p=0.5, training=self.training)
             else:
                 x = F.relu(linear(x))
-        return x
\ No newline at end of file
+        return x
diff --git a/TTS/utils/generic_utils.py b/TTS/utils/generic_utils.py
index 5890f04d..9a803351 100644
--- a/TTS/utils/generic_utils.py
+++ b/TTS/utils/generic_utils.py
@@ -6,8 +6,6 @@ import subprocess
 import sys
 from pathlib import Path
 
-import torch
-
 
 def get_git_branch():
     try:
diff --git a/hubconf.py b/hubconf.py
index 9de4f7b2..fc7003c9 100644
--- a/hubconf.py
+++ b/hubconf.py
@@ -33,4 +33,4 @@ def tts(model_name='tts_models/en/ljspeech/tacotron2-DCA', vocoder_name='vocoder
 
 if __name__ == '__main__':
     synthesizer = torch.hub.load('mozilla/TTS:hub_conf', 'tts', source='github')
-    synthesizer.tts("This is a test!")
\ No newline at end of file
+    synthesizer.tts("This is a test!")
diff --git a/tests/inputs/test_vocoder_wavernn_config.json b/tests/inputs/test_vocoder_wavernn_config.json
index 9df32fef..3c6d06f5 100644
--- a/tests/inputs/test_vocoder_wavernn_config.json
+++ b/tests/inputs/test_vocoder_wavernn_config.json
@@ -54,6 +54,9 @@
     "mulaw": false, // apply mulaw if mode is bits
     "padding": 2, // pad the input for resnet to see wider input length
 
+    // GENERATOR - for backward compatibility
+    "generator_model": "WaveRNN",
+
     // DATASET
     //"use_gta": true, // use computed gta features from the tts model
     "data_path": "tests/data/ljspeech/wavs/", // path containing training wav files