mirror of https://github.com/coqui-ai/TTS.git

linter updates

parent 8871c111d2
commit d49757faaa

@@ -16,6 +16,7 @@ from TTS.utils.io import load_config
 
 
 if __name__ == '__main__':
+    # pylint: disable=bad-continuation
     parser = argparse.ArgumentParser(
         description='''Extract attention masks from trained Tacotron/Tacotron2 models.
 These masks can be used for different purposes including training a TTS model with a Duration Predictor.\n\n'''

@@ -179,7 +179,6 @@ def main():
     # load models
     synthesizer = Synthesizer(model_path, config_path, vocoder_path, vocoder_config_path, args.use_cuda)
 
-    use_griffin_lim = vocoder_path is None
     print(" > Text: {}".format(args.text))
 
     # # handle multi-speaker setting

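Review note: the dropped `use_griffin_lim` was assigned but never read, which pylint reports as W0612 (unused-variable). A minimal sketch of the pattern the linter flags, with hypothetical values:

    def main():
        vocoder_path = None                      # hypothetical value
        use_griffin_lim = vocoder_path is None   # pylint W0612: unused-variable
        print("synthesizing ...")
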
@ -34,7 +34,9 @@ print(" > Using CUDA: ", use_cuda)
|
||||||
print(" > Number of GPUs: ", num_gpus)
|
print(" > Number of GPUs: ", num_gpus)
|
||||||
|
|
||||||
|
|
||||||
def setup_loader(ap: AudioProcessor, is_val: bool=False, verbose: bool=False):
|
def setup_loader(ap: AudioProcessor,
|
||||||
|
is_val: bool = False,
|
||||||
|
verbose: bool = False):
|
||||||
if is_val:
|
if is_val:
|
||||||
loader = None
|
loader = None
|
||||||
else:
|
else:
|
||||||
|
@@ -254,8 +256,7 @@ if __name__ == '__main__':
     if args.restore_path:
         new_fields["restore_path"] = args.restore_path
     new_fields["github_branch"] = get_git_branch()
-    copy_model_files(c, args.config_path, OUT_PATH,
-                     new_fields)
+    copy_model_files(c, args.config_path, OUT_PATH, new_fields)
 
     LOG_DIR = OUT_PATH
     tb_logger = TensorboardLogger(LOG_DIR, model_name='Speaker_Encoder')

@@ -119,7 +119,7 @@ def format_data(data):
            avg_text_length, avg_spec_length, attn_mask, item_idx
 
 
-def data_depended_init(data_loader, model, ap):
+def data_depended_init(data_loader, model):
     """Data depended initialization for activation normalization."""
     if hasattr(model, 'module'):
         for f in model.module.decoder.flows:

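Review note: `data_depended_init` never used the `ap` (AudioProcessor) argument, so dropping it silences pylint's unused-argument check (W0613); the call site in `main()` is updated in a later hunk of this commit:

    # before: the unused argument tripped pylint W0613
    model = data_depended_init(train_loader, model, ap)
    # after: the AudioProcessor is no longer threaded through
    model = data_depended_init(train_loader, model)
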
@@ -138,7 +138,7 @@ def data_depended_init(data_loader, model, ap):
 
         # format data
         text_input, text_lengths, mel_input, mel_lengths, spekaer_embed,\
-            _, _, attn_mask, item_idx = format_data(data)
+            _, _, attn_mask, _ = format_data(data)
 
         # forward pass model
         _ = model.forward(

@@ -177,7 +177,7 @@ def train(data_loader, model, criterion, optimizer, scheduler,
 
         # format data
         text_input, text_lengths, mel_input, mel_lengths, speaker_c,\
-            avg_text_length, avg_spec_length, attn_mask, item_idx = format_data(data)
+            avg_text_length, avg_spec_length, attn_mask, _ = format_data(data)
 
         loader_time = time.time() - end_time
 

@@ -332,7 +332,7 @@ def evaluate(data_loader, model, criterion, ap, global_step, epoch):
 
             # format data
             text_input, text_lengths, mel_input, mel_lengths, speaker_c,\
-                _, _, attn_mask, item_idx = format_data(data)
+                _, _, attn_mask, _ = format_data(data)
 
             # forward pass model
             z, logdet, y_mean, y_log_scale, alignments, o_dur_log, o_total_dur = model.forward(

@@ -550,13 +550,14 @@ def main(args): # pylint: disable=redefined-outer-name
     eval_loader = setup_loader(ap, 1, is_val=True, verbose=True)
 
     global_step = args.restore_step
-    model = data_depended_init(train_loader, model, ap)
+    model = data_depended_init(train_loader, model)
     for epoch in range(0, c.epochs):
         c_logger.print_epoch_start(epoch, c.epochs)
         train_avg_loss_dict, global_step = train(train_loader, model, criterion, optimizer,
                                                  scheduler, ap, global_step,
                                                  epoch)
-        eval_avg_loss_dict = evaluate(eval_loader , model, criterion, ap, global_step, epoch)
+        eval_avg_loss_dict = evaluate(eval_loader, model, criterion, ap,
+                                      global_step, epoch)
         c_logger.print_epoch_end(epoch, eval_avg_loss_dict)
         target_loss = train_avg_loss_dict['avg_loss']
         if c.run_eval:

@@ -632,8 +633,7 @@ if __name__ == '__main__':
     if args.restore_path:
         new_fields["restore_path"] = args.restore_path
     new_fields["github_branch"] = get_git_branch()
-    copy_model_files(c, args.config_path,
-                     OUT_PATH, new_fields)
+    copy_model_files(c, args.config_path, OUT_PATH, new_fields)
     os.chmod(AUDIO_PATH, 0o775)
     os.chmod(OUT_PATH, 0o775)
 

@@ -518,7 +518,8 @@ def main(args): # pylint: disable=redefined-outer-name
         train_avg_loss_dict, global_step = train(train_loader, model, criterion, optimizer,
                                                  scheduler, ap, global_step,
                                                  epoch)
-        eval_avg_loss_dict = evaluate(eval_loader , model, criterion, ap, global_step, epoch)
+        eval_avg_loss_dict = evaluate(eval_loader, model, criterion, ap,
+                                      global_step, epoch)
         c_logger.print_epoch_end(epoch, eval_avg_loss_dict)
         target_loss = train_avg_loss_dict['avg_loss']
         if c.run_eval:

@@ -180,8 +180,8 @@ def train(data_loader, model, criterion, optimizer, optimizer_st, scheduler,
         loss_dict = criterion(postnet_output, decoder_output, mel_input,
                               linear_input, stop_tokens, stop_targets,
                               mel_lengths, decoder_backward_output,
-                              alignments, alignment_lengths, alignments_backward,
-                              text_lengths)
+                              alignments, alignment_lengths,
+                              alignments_backward, text_lengths)
 
         # check nan loss
         if torch.isnan(loss_dict['loss']).any():

@@ -535,7 +535,6 @@ def main(args): # pylint: disable=redefined-outer-name
 
     # setup criterion
     criterion = TacotronLoss(c, stopnet_pos_weight=c.stopnet_pos_weight, ga_sigma=0.4)
-
     if args.restore_path:
         checkpoint = torch.load(args.restore_path, map_location='cpu')
         try:

@@ -706,8 +705,7 @@ if __name__ == '__main__':
     if args.restore_path:
         new_fields["restore_path"] = args.restore_path
     new_fields["github_branch"] = get_git_branch()
-    copy_model_files(c, args.config_path,
-                     OUT_PATH, new_fields)
+    copy_model_files(c, args.config_path, OUT_PATH, new_fields)
     os.chmod(AUDIO_PATH, 0o775)
     os.chmod(OUT_PATH, 0o775)
 

@@ -33,9 +33,8 @@ use_cuda, num_gpus = setup_torch_training_env(True, True)
 
 
 def setup_loader(ap, is_val=False, verbose=False):
-    if is_val and not c.run_eval:
-        loader = None
-    else:
+    loader = None
+    if not is_val or c.run_eval:
         dataset = GANDataset(ap=ap,
                              items=eval_data if is_val else train_data,
                              seq_len=c.seq_len,

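Review note: the `setup_loader` rewrite is behavior-preserving: `loader = None` becomes the default, and `not is_val or c.run_eval` is the De Morgan negation of the old `is_val and not c.run_eval`. A standalone truth-table check of the equivalence:

    # not (A and not B) == (not A) or B, for all A, B
    for is_val in (False, True):
        for run_eval in (False, True):
            old_branch = not (is_val and not run_eval)  # old: dataset built in the else-branch
            new_branch = (not is_val) or run_eval       # new: dataset built under this guard
            assert old_branch == new_branch
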
@@ -639,8 +638,7 @@ if __name__ == '__main__':
     if args.restore_path:
         new_fields["restore_path"] = args.restore_path
     new_fields["github_branch"] = get_git_branch()
-    copy_model_files(c, args.config_path,
-                     OUT_PATH, new_fields)
+    copy_model_files(c, args.config_path, OUT_PATH, new_fields)
     os.chmod(AUDIO_PATH, 0o775)
     os.chmod(OUT_PATH, 0o775)
 

@@ -54,7 +54,6 @@ def setup_loader(ap, is_val=False, verbose=False):
                             if is_val else c.num_loader_workers,
                             pin_memory=False)
-
 
     return loader
 
 

@@ -79,8 +78,8 @@ def format_test_data(data):
     return m, x
 
 
-def train(model, criterion, optimizer,
-          scheduler, scaler, ap, global_step, epoch):
+def train(model, criterion, optimizer, scheduler, scaler, ap, global_step,
+          epoch):
     data_loader = setup_loader(ap, is_val=False, verbose=(epoch == 0))
     model.train()
     epoch_time = 0

@@ -94,7 +93,8 @@ def train(model, criterion, optimizer,
     c_logger.print_train_start()
     # setup noise schedule
     noise_schedule = c['train_noise_schedule']
-    betas = np.linspace(noise_schedule['min_val'], noise_schedule['max_val'], noise_schedule['num_steps'])
+    betas = np.linspace(noise_schedule['min_val'], noise_schedule['max_val'],
+                        noise_schedule['num_steps'])
     if hasattr(model, 'module'):
         model.module.compute_noise_level(betas)
     else:

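Review note: `train_noise_schedule` parameterizes WaveGrad's diffusion noise as a linear beta schedule; `np.linspace` expands the three config values into a vector of `num_steps` betas, which the model turns into cumulative noise levels. A minimal sketch with illustrative values (the real ones come from the config):

    import numpy as np

    noise_schedule = {'min_val': 1e-6, 'max_val': 1e-2, 'num_steps': 1000}  # hypothetical
    betas = np.linspace(noise_schedule['min_val'], noise_schedule['max_val'],
                        noise_schedule['num_steps'])
    print(betas.shape)  # (1000,) -- fed to model.compute_noise_level(betas)
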
@@ -205,7 +205,8 @@ def train(model, criterion, optimizer,
                     epoch,
                     OUT_PATH,
                     model_losses=loss_dict,
-                    scaler=scaler.state_dict() if c.mixed_precision else None)
+                    scaler=scaler.state_dict()
+                    if c.mixed_precision else None)
 
     end_time = time.time()
 

@@ -246,7 +247,6 @@ def evaluate(model, criterion, ap, global_step, epoch):
             else:
                 noise, x_noisy, noise_scale = model.compute_y_n(x)
-
 
             # forward pass
             noise_hat = model(x_noisy, m, noise_scale)
 

@@ -254,7 +254,6 @@ def evaluate(model, criterion, ap, global_step, epoch):
             loss = criterion(noise, noise_hat)
             loss_wavegrad_dict = {'wavegrad_loss': loss}
-
 
             loss_dict = dict()
             for key, value in loss_wavegrad_dict.items():
                 if isinstance(value, (int, float)):

@@ -284,7 +283,9 @@ def evaluate(model, criterion, ap, global_step, epoch):
 
         # setup noise schedule and inference
         noise_schedule = c['test_noise_schedule']
-        betas = np.linspace(noise_schedule['min_val'], noise_schedule['max_val'], noise_schedule['num_steps'])
+        betas = np.linspace(noise_schedule['min_val'],
+                            noise_schedule['max_val'],
+                            noise_schedule['num_steps'])
         if hasattr(model, 'module'):
             model.module.compute_noise_level(betas)
             # compute voice

@@ -315,7 +316,8 @@ def main(args): # pylint: disable=redefined-outer-name
     print(f" > Loading wavs from: {c.data_path}")
     if c.feature_path is not None:
         print(f" > Loading features from: {c.feature_path}")
-        eval_data, train_data = load_wav_feat_data(c.data_path, c.feature_path, c.eval_split_size)
+        eval_data, train_data = load_wav_feat_data(c.data_path, c.feature_path,
+                                                   c.eval_split_size)
     else:
         eval_data, train_data = load_wav_data(c.data_path, c.eval_split_size)
 

@@ -395,14 +397,13 @@ def main(args): # pylint: disable=redefined-outer-name
     global_step = args.restore_step
     for epoch in range(0, c.epochs):
         c_logger.print_epoch_start(epoch, c.epochs)
-        _, global_step = train(model, criterion, optimizer,
-                               scheduler, scaler, ap, global_step,
-                               epoch)
-        eval_avg_loss_dict = evaluate(model, criterion, ap,
-                                      global_step, epoch)
+        _, global_step = train(model, criterion, optimizer, scheduler, scaler,
+                               ap, global_step, epoch)
+        eval_avg_loss_dict = evaluate(model, criterion, ap, global_step, epoch)
         c_logger.print_epoch_end(epoch, eval_avg_loss_dict)
         target_loss = eval_avg_loss_dict[c.target_loss]
-        best_loss = save_best_model(target_loss,
+        best_loss = save_best_model(
+            target_loss,
             best_loss,
             model,
             optimizer,

@@ -486,8 +487,7 @@ if __name__ == '__main__':
     if args.restore_path:
         new_fields["restore_path"] = args.restore_path
     new_fields["github_branch"] = get_git_branch()
-    copy_model_files(c, args.config_path,
-                     OUT_PATH, new_fields)
+    copy_model_files(c, args.config_path, OUT_PATH, new_fields)
     os.chmod(AUDIO_PATH, 0o775)
     os.chmod(OUT_PATH, 0o775)
 

@@ -200,12 +200,9 @@ def train(model, optimizer, criterion, scheduler, scaler, ap, global_step, epoch
             train_data[rand_idx], (tuple, list)) else train_data[rand_idx][0]
         wav = ap.load_wav(wav_path)
         ground_mel = ap.melspectrogram(wav)
-        sample_wav = model.inference(ground_mel,
-                                     c.batched,
-                                     c.target_samples,
-                                     c.overlap_samples,
-                                     use_cuda
-                                     )
+        sample_wav = model.inference(ground_mel, c.batched,
+                                     c.target_samples, c.overlap_samples,
+                                     use_cuda)
         predict_mel = ap.melspectrogram(sample_wav)
 
         # compute spectrograms

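Review note: `c.batched`, `c.target_samples`, and `c.overlap_samples` configure WaveRNN's batched inference, which (in typical WaveRNN implementations) splits the conditioning signal into overlapping segments, generates them in parallel, and cross-fades the overlaps when re-joining. A toy sketch of the segmentation arithmetic, with illustrative numbers:

    target, overlap, total = 11000, 550, 40000   # hypothetical sample counts
    starts = list(range(0, total, target))
    segments = [(s, min(s + target + overlap, total)) for s in starts]
    # adjacent segments share `overlap` samples that are cross-faded on merge
    print(segments[:2])  # [(0, 11550), (11000, 22550)]
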
@@ -287,12 +284,8 @@ def evaluate(model, criterion, ap, global_step, epoch):
                 eval_data[rand_idx], (tuple, list)) else eval_data[rand_idx][0]
             wav = ap.load_wav(wav_path)
             ground_mel = ap.melspectrogram(wav)
-            sample_wav = model.inference(ground_mel,
-                                         c.batched,
-                                         c.target_samples,
-                                         c.overlap_samples,
-                                         use_cuda
-                                         )
+            sample_wav = model.inference(ground_mel, c.batched, c.target_samples,
+                                         c.overlap_samples, use_cuda)
             predict_mel = ap.melspectrogram(sample_wav)
 
             # Sample audio

@@ -87,5 +87,3 @@ for base in tqdm(cartesian_product(base_values, repeat=args.num_iter), total=tot
             best_schedule = {'beta': beta}
             print(f" > Found a better schedule. - MSE: {mse.item()}")
             np.save(args.output_path, best_schedule)
-
-

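Review note: `best_schedule` is a plain dict, so `np.save` pickles it into a zero-dimensional object array; loading it back requires `allow_pickle=True` plus `.item()` to unwrap the dict. A round-trip sketch with an illustrative path:

    import numpy as np

    best_schedule = {'beta': [0.1, 0.2, 0.3]}          # illustrative schedule
    np.save('best_schedule.npy', best_schedule)        # stored as a 0-d object array
    loaded = np.load('best_schedule.npy', allow_pickle=True).item()
    assert loaded['beta'] == best_schedule['beta']
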
@@ -1,10 +1,9 @@
-import numpy
-import numpy as np
 import queue
-import torch
 import random
 
+import numpy as np
+import torch
 from torch.utils.data import Dataset
-from tqdm import tqdm
 
+
 class MyDataset(Dataset):

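Review note: the import shuffle applies the PEP 8 / isort grouping convention: standard-library imports first, third-party imports in a second block, one blank line between groups; the redundant `import numpy` and unused `from tqdm import tqdm` are deleted outright. The resulting header, for reference:

    import queue      # standard library
    import random

    import numpy as np               # third-party
    import torch
    from torch.utils.data import Dataset
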
@@ -155,7 +154,7 @@ class MyDataset(Dataset):
 
         # add random gaussian noise
         if self.additive_noise > 0:
-            noises_ = [numpy.random.normal(0, self.additive_noise, size=len(w)) for w in wavs_]
+            noises_ = [np.random.normal(0, self.additive_noise, size=len(w)) for w in wavs_]
             wavs_ = [wavs_[i] + noises_[i] for i in range(len(wavs_))]
 
         # get a random subset of each of the wavs and convert to MFCC.

@@ -114,4 +114,3 @@ def check_config_speaker_encoder(c):
         check_argument('path', dataset_entry, restricted=True, val_type=str)
         check_argument('meta_file_train', dataset_entry, restricted=True, val_type=[str, list])
         check_argument('meta_file_val', dataset_entry, restricted=True, val_type=str)
-

@@ -90,7 +90,8 @@ class MyDataset(Dataset):
         return data
 
     @staticmethod
-    def _generate_and_cache_phoneme_sequence(text, cache_path, cleaners, language, tp, add_blank):
+    def _generate_and_cache_phoneme_sequence(text, cache_path, cleaners,
+                                             language, tp, add_blank):
         """generate a phoneme sequence from text.
         since the usage is for subsequent caching, we never add bos and
         eos chars here. Instead we add those dynamically later; based on the

|
@ -98,13 +99,16 @@ class MyDataset(Dataset):
|
||||||
phonemes = phoneme_to_sequence(text, [cleaners],
|
phonemes = phoneme_to_sequence(text, [cleaners],
|
||||||
language=language,
|
language=language,
|
||||||
enable_eos_bos=False,
|
enable_eos_bos=False,
|
||||||
tp=tp, add_blank=add_blank)
|
tp=tp,
|
||||||
|
add_blank=add_blank)
|
||||||
phonemes = np.asarray(phonemes, dtype=np.int32)
|
phonemes = np.asarray(phonemes, dtype=np.int32)
|
||||||
np.save(cache_path, phonemes)
|
np.save(cache_path, phonemes)
|
||||||
return phonemes
|
return phonemes
|
||||||
|
|
||||||
@staticmethod
|
@staticmethod
|
||||||
def _load_or_generate_phoneme_sequence(wav_file, text, phoneme_cache_path, enable_eos_bos, cleaners, language, tp, add_blank):
|
def _load_or_generate_phoneme_sequence(wav_file, text, phoneme_cache_path,
|
||||||
|
enable_eos_bos, cleaners, language,
|
||||||
|
tp, add_blank):
|
||||||
file_name = os.path.splitext(os.path.basename(wav_file))[0]
|
file_name = os.path.splitext(os.path.basename(wav_file))[0]
|
||||||
|
|
||||||
# different names for normal phonemes and with blank chars.
|
# different names for normal phonemes and with blank chars.
|
||||||
|
@@ -143,11 +147,15 @@ class MyDataset(Dataset):
 
         if not self.input_seq_computed:
             if self.use_phonemes:
-                text = self._load_or_generate_phoneme_sequence(wav_file, text, self.phoneme_cache_path, self.enable_eos_bos, self.cleaners, self.phoneme_language, self.tp, self.add_blank)
+                text = self._load_or_generate_phoneme_sequence(
+                    wav_file, text, self.phoneme_cache_path,
+                    self.enable_eos_bos, self.cleaners, self.phoneme_language,
+                    self.tp, self.add_blank)
 
             else:
                 text = np.asarray(text_to_sequence(text, [self.cleaners],
-                                                   tp=self.tp, add_blank=self.add_blank),
+                                                   tp=self.tp,
+                                                   add_blank=self.add_blank),
                                   dtype=np.int32)
 
         assert text.size > 0, self.items[idx][1]

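Review note: `_load_or_generate_phoneme_sequence` is a compute-or-cache pattern: try the cached `.npy` first, otherwise run `phoneme_to_sequence` and persist the result for the next epoch. A simplified standalone sketch of the idea (names here are illustrative, not the module's API):

    import os
    import numpy as np

    def load_or_compute(cache_path, compute_fn):
        """Return the cached int32 sequence if present, else compute and cache it."""
        if os.path.isfile(cache_path):
            return np.load(cache_path)
        seq = np.asarray(compute_fn(), dtype=np.int32)
        np.save(cache_path, seq)
        return seq
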
@@ -177,7 +185,8 @@ class MyDataset(Dataset):
         item = args[0]
         func_args = args[1]
         text, wav_file, *_ = item
-        phonemes = MyDataset._load_or_generate_phoneme_sequence(wav_file, text, *func_args)
+        phonemes = MyDataset._load_or_generate_phoneme_sequence(
+            wav_file, text, *func_args)
         return phonemes
 
     def compute_input_seq(self, num_workers=0):

@@ -188,13 +197,18 @@ class MyDataset(Dataset):
                 print(" | > Computing input sequences ...")
             for idx, item in enumerate(tqdm.tqdm(self.items)):
                 text, *_ = item
-                sequence = np.asarray(text_to_sequence(text, [self.cleaners],
-                                                       tp=self.tp, add_blank=self.add_blank),
+                sequence = np.asarray(text_to_sequence(
+                    text, [self.cleaners],
+                    tp=self.tp,
+                    add_blank=self.add_blank),
                                       dtype=np.int32)
                 self.items[idx][0] = sequence
 
         else:
-            func_args = [self.phoneme_cache_path, self.enable_eos_bos, self.cleaners, self.phoneme_language, self.tp, self.add_blank]
+            func_args = [
+                self.phoneme_cache_path, self.enable_eos_bos, self.cleaners,
+                self.phoneme_language, self.tp, self.add_blank
+            ]
             if self.verbose:
                 print(" | > Computing phonemes ...")
             if num_workers == 0:

@@ -203,7 +217,11 @@ class MyDataset(Dataset):
                     self.items[idx][0] = phonemes
             else:
                 with Pool(num_workers) as p:
-                    phonemes = list(tqdm.tqdm(p.imap(MyDataset._phoneme_worker, [[item, func_args] for item in self.items]), total=len(self.items)))
+                    phonemes = list(
+                        tqdm.tqdm(p.imap(MyDataset._phoneme_worker,
+                                         [[item, func_args]
+                                          for item in self.items]),
+                                  total=len(self.items)))
                     for idx, p in enumerate(phonemes):
                         self.items[idx][0] = p
 

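Review note: the reformatted block is the standard recipe for a progress bar over a process pool: `Pool.imap` yields results lazily and in order, so wrapping it in `tqdm.tqdm(..., total=...)` advances the bar as workers finish. A self-contained sketch with a placeholder worker:

    from multiprocessing import Pool

    import tqdm

    def _worker(args):  # placeholder for MyDataset._phoneme_worker
        item, func_args = args
        return item * len(func_args)

    if __name__ == '__main__':
        items = list(range(100))
        func_args = ['tp', 'add_blank']  # placeholder worker arguments
        with Pool(4) as p:
            results = list(tqdm.tqdm(p.imap(_worker, [[i, func_args] for i in items]),
                                     total=len(items)))
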
@@ -6,8 +6,6 @@ import subprocess
 import sys
 from pathlib import Path
 
-import torch
-
 
 def get_git_branch():
     try:

@@ -54,6 +54,9 @@
     "mulaw": false,        // apply mulaw if mode is bits
     "padding": 2,          // pad the input for resnet to see wider input length
 
+    // GENERATOR - for backward compatibility
+    "generator_model": "WaveRNN",
+
     // DATASET
     //"use_gta": true,     // use computed gta features from the tts model
     "data_path": "tests/data/ljspeech/wavs/",        // path containing training wav files

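Review note: the added `generator_model` key gives older WaveRNN configs an explicit generator name, so loading code that dispatches on that field keeps working. A read-side fallback like the following (illustrative, not the library's API) achieves the same for configs that predate the key:

    def get_generator_name(config: dict) -> str:
        # older WaveRNN configs predate the explicit generator entry
        return config.get("generator_model", "WaveRNN")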