Bug fix in the attention module and a new notebook for experimenting with spectrogram reconstruction

pull/10/head
Eren Golge 2018-01-31 07:21:22 -08:00
parent 9a22f5d085
commit 1320d5344a
17 changed files with 928 additions and 51 deletions

285
PlayGround.ipynb Normal file

File diff suppressed because one or more lines are too long

View File

@ -1,8 +1,8 @@
{
"num_mels": 80,
"num_freq": 1024,
"num_freq": 1025,
"sample_rate": 20000,
"frame_length_ms": 50.0,
"frame_length_ms": 50,
"frame_shift_ms": 12.5,
"preemphasis": 0.97,
"min_level_db": -100,
@ -12,11 +12,11 @@
"text_cleaner": "english_cleaners",
"epochs": 2000,
"lr": 0.001,
"lr_patience": 2,
"lr": 0.003,
"lr_patience": 5,
"lr_decay": 0.5,
"batch_size": 256,
"griffinf_lim_iters": 60,
"griffin_lim_iters": 60,
"power": 1.5,
"r": 5,

BIN
core Normal file

Binary file not shown.

Binary file not shown.

View File

@ -20,6 +20,8 @@
"power": 1.5,
"r": 5,
"num_loader_workers": 16,
"save_step": 1,
"data_path": "/data/shared/KeithIto/LJSpeech-1.0",
"output_path": "result",

Binary file not shown.

Binary file not shown.

View File

@ -73,7 +73,8 @@ class AttentionWrapper(nn.Module):
alignment.data.masked_fill_(mask, self.score_mask_value)
# Normalize attention weight
alignment = F.softmax(alignment, dim=0)
alignment = F.softmax(alignment, dim=-1) ## TODO: might be buggy
print(alignment.size())
# Attention context vector
# (batch, 1, dim)

Binary file not shown.

View File

@ -2,7 +2,7 @@
import torch
from torch.autograd import Variable
from torch import nn
from utils.text.symbols import symbols
from TTS.utils.text.symbols import symbols
from TTS.layers.tacotron import Prenet, Encoder, Decoder, CBHG
class Tacotron(nn.Module):

354
notebooks/PlayGround.ipynb Normal file

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

51
notebooks/utils.py Normal file
View File

@ -0,0 +1,51 @@
import io
import librosa
import torch
import numpy as np
from TTS.utils.text import text_to_sequence
from matplotlib import pylab as plt
hop_length = 250
def create_speech(m, s, CONFIG, use_cuda, ap):
    """Synthesize a waveform for the sentence ``s`` using model ``m``.

    Args:
        m: model whose ``forward`` accepts the character-id variable and
            returns ``(mel_out, linear_out, alignments)``.
        s: input text to synthesize.
        CONFIG: configuration object; only ``CONFIG.text_cleaner`` is read.
        use_cuda: when truthy, the input variable is moved to the GPU.
        ap: audio processor providing ``_denormalize``, ``inv_spectrogram``,
            ``find_endpoint`` and ``save_wav``.

    Returns:
        Tuple ``(wav, alignment, spec)``: the endpoint-trimmed waveform, the
        alignment of the first batch item as a numpy array, and the
        denormalized linear spectrogram of the first batch item.
    """
    token_ids = np.array(text_to_sequence(s, [CONFIG.text_cleaner]))

    # Build the single-item batch once; only the device placement depends on
    # ``use_cuda``. ``volatile=True`` marks the variable as inference-only.
    chars_var = torch.autograd.Variable(
        torch.from_numpy(token_ids), volatile=True).unsqueeze(0)
    if use_cuda:
        chars_var = chars_var.cuda()

    _, linear_batch, alignment_batch = m.forward(chars_var)

    # Keep only the first (and only) item of the batch, as numpy arrays.
    linear_out = linear_batch[0].data.cpu().numpy()
    alignment = alignment_batch[0].cpu().data.numpy()

    spec = ap._denormalize(linear_out)
    wav = ap.inv_spectrogram(linear_out.T)
    endpoint = ap.find_endpoint(wav)
    wav = wav[:endpoint]

    # The encoded audio is written to an in-memory buffer that is not
    # returned. NOTE(review): kept because ``save_wav`` may touch ``wav``;
    # confirm before removing.
    buffer = io.BytesIO()
    ap.save_wav(wav, buffer)
    return wav, alignment, spec
def visualize(alignment, spectrogram, CONFIG):
    """Plot the attention alignment and the spectrogram in one figure.

    The figure has two stacked panels: the alignment matrix on top and the
    spectrogram (rendered with ``librosa.display.specshow``) below.

    Args:
        alignment: 2-D array; it is transposed before plotting so that the
            x axis is labelled as decoder steps and the y axis as encoder
            steps.
        spectrogram: 2-D array; it is transposed before being handed to
            ``specshow`` (presumably stored as (time, frequency) -- confirm
            against the caller).
        CONFIG: configuration object; only ``CONFIG.sample_rate`` is read.
    """
    # ``librosa.display`` is a submodule that must be imported explicitly;
    # a bare ``import librosa`` is not guaranteed to expose it, in which case
    # the ``specshow`` call below would raise AttributeError.
    import librosa.display

    label_fontsize = 16
    plt.figure(figsize=(16, 16))

    # Top panel: attention alignment.
    plt.subplot(2, 1, 1)
    plt.imshow(alignment.T, aspect="auto", origin="lower", interpolation=None)
    plt.xlabel("Decoder timestamp", fontsize=label_fontsize)
    plt.ylabel("Encoder timestamp", fontsize=label_fontsize)
    plt.colorbar()

    # Bottom panel: spectrogram on a linear frequency axis. ``hop_length`` is
    # the module-level constant, used to map frames to seconds.
    plt.subplot(2, 1, 2)
    librosa.display.specshow(spectrogram.T, sr=CONFIG.sample_rate,
                             hop_length=hop_length, x_axis="time",
                             y_axis="linear")
    plt.xlabel("Time", fontsize=label_fontsize)
    plt.ylabel("Hz", fontsize=label_fontsize)
    plt.tight_layout()
    plt.colorbar()

View File

@ -38,17 +38,11 @@ def main(args):
# Sentences for generation
sentences = [
"And it is worth mention in passing that, as an example of fine typography,",
# From July 8, 2017 New York Times:
'Scientists at the CERN laboratory say they have discovered a new particle.',
'Theres a way to measure the acute emotional intelligence that has never gone out of style.',
'President Trump met with other leaders at the Group of 20 conference.',
'The Senate\'s bill to repeal and replace the Affordable Care Act is now imperiled.',
# From Google's Tacotron example page:
'Generative adversarial network or variational auto-encoder.',
'The buses aren\'t the problem, they actually provide a solution.',
'Does the quick brown fox jump over the lazy dog?',
'Talib Kweli confirmed to AllHipHop that he will be releasing an album in the next year.',
"I try my best to translate text to speech. But I know I need more work",
"The new Firefox, Fast for good.",
"Technology is continually providing us with new ways to create and publish stories.",
"For these stories to achieve their full impact, it requires tool.",
"I am allien and I am here to destron your world."
]
# Synthesis and save to wav files

View File

@ -111,6 +111,8 @@ def main(args):
progbar = Progbar(len(dataset) / c.batch_size)
for i, data in enumerate(dataloader):
start_time = time.time()
text_input = data[0]
magnitude_input = data[1]
mel_input = data[2]
@ -128,42 +130,40 @@ def main(args):
if use_cuda:
text_input_var = Variable(torch.from_numpy(text_input).type(
torch.cuda.LongTensor), requires_grad=False).cuda()
torch.cuda.LongTensor)).cuda()
mel_input_var = Variable(torch.from_numpy(mel_input).type(
torch.cuda.FloatTensor), requires_grad=False).cuda()
torch.cuda.FloatTensor)).cuda()
mel_spec_var = Variable(torch.from_numpy(mel_input).type(
torch.cuda.FloatTensor), requires_grad=False).cuda()
torch.cuda.FloatTensor)).cuda()
linear_spec_var = Variable(torch.from_numpy(magnitude_input)
.type(torch.cuda.FloatTensor), requires_grad=False).cuda()
.type(torch.cuda.FloatTensor)).cuda()
else:
text_input_var = Variable(torch.from_numpy(text_input).type(
torch.LongTensor), requires_grad=False)
torch.LongTensor),)
mel_input_var = Variable(torch.from_numpy(mel_input).type(
torch.FloatTensor), requires_grad=False)
torch.FloatTensor))
mel_spec_var = Variable(torch.from_numpy(
mel_input).type(torch.FloatTensor), requires_grad=False)
mel_input).type(torch.FloatTensor))
linear_spec_var = Variable(torch.from_numpy(
magnitude_input).type(torch.FloatTensor),
requires_grad=False)
magnitude_input).type(torch.FloatTensor))
mel_output, linear_output, alignments =\
model.forward(text_input_var, mel_input_var)
mel_loss = criterion(mel_output, mel_spec_var)
linear_loss = torch.abs(linear_output - linear_spec_var)
linear_loss = 0.5 * \
torch.mean(linear_loss) + 0.5 * \
torch.mean(linear_loss[:, :n_priority_freq, :])
#linear_loss = torch.abs(linear_output - linear_spec_var)
#linear_loss = 0.5 * \
#torch.mean(linear_loss) + 0.5 * \
#torch.mean(linear_loss[:, :n_priority_freq, :])
linear_loss = 0.5 * criterion(linear_output, linear_spec_var) \
+ 0.5 * criterion(linear_output[:, :, :n_priority_freq],
linear_spec_var[: ,: ,:n_priority_freq])
loss = mel_loss + linear_loss
loss = loss.cuda()
start_time = time.time()
# loss = loss.cuda()
loss.backward()
nn.utils.clip_grad_norm(model.parameters(), 1.)
grad_norm = nn.utils.clip_grad_norm(model.parameters(), 1.)
optimizer.step()
step_time = time.time() - start_time
@ -171,7 +171,8 @@ def main(args):
progbar.update(i+1, values=[('total_loss', loss.data[0]),
('linear_loss', linear_loss.data[0]),
('mel_loss', mel_loss.data[0])])
('mel_loss', mel_loss.data[0]),
('grad_norm', grad_norm)])
tb.add_scalar('Train/TotalLoss', loss.data[0], current_step)
tb.add_scalar('Train/LinearLoss', linear_loss.data[0],

View File

@ -81,10 +81,10 @@ class AudioProcessor(object):
def inv_spectrogram(self, spectrogram):
'''Converts spectrogram to waveform using librosa'''
S = _denormalize(spectrogram)
S = _db_to_amp(S + self.ref_level_db) # Convert back to linear
S = self._denormalize(spectrogram)
S = self._db_to_amp(S + self.ref_level_db) # Convert back to linear
# Reconstruct phase
return inv_preemphasis(_griffin_lim(S ** self.power))
return self.apply_inv_preemphasis(self._griffin_lim(S ** self.power))
def _griffin_lim(self, S):
@ -93,18 +93,13 @@ class AudioProcessor(object):
'''
angles = np.exp(2j * np.pi * np.random.rand(*S.shape))
S_complex = np.abs(S).astype(np.complex)
y = _istft(S_complex * angles)
y = self._istft(S_complex * angles)
for i in range(self.griffin_lim_iters):
angles = np.exp(1j * np.angle(_stft(y)))
y = _istft(S_complex * angles)
angles = np.exp(1j * np.angle(self._stft(y)))
y = self._istft(S_complex * angles)
return y
def _istft(self, y):
_, hop_length, win_length = _stft_parameters()
return librosa.istft(y, hop_length=hop_length, win_length=win_length)
def melspectrogram(self, y):
D = self._stft(self.apply_preemphasis(y))
S = self._amp_to_db(self._linear_to_mel(np.abs(D)))
@ -115,11 +110,15 @@ class AudioProcessor(object):
n_fft, hop_length, win_length = self._stft_parameters()
return librosa.stft(y=y, n_fft=n_fft, hop_length=hop_length, win_length=win_length)
def _istft(self, y):
_, hop_length, win_length = self._stft_parameters()
return librosa.istft(y, hop_length=hop_length, win_length=win_length)
def find_endpoint(self, wav, threshold_db=-40, min_silence_sec=0.8):
window_length = int(self.sample_rate * min_silence_sec)
hop_length = int(window_length / 4)
threshold = _db_to_amp(threshold_db)
threshold = self._db_to_amp(threshold_db)
for x in range(hop_length, len(wav) - window_length, hop_length):
if np.max(wav[x:x + window_length]) < threshold:
return x + hop_length

View File

@ -3,7 +3,9 @@ import numpy as np
def pad_data(x, length):
_pad = 0
return np.pad(x, (0, length - x.shape[0]), mode='constant', constant_values=_pad)
return np.pad(x, (0, length - x.shape[0]),
mode='constant',
constant_values=_pad)
def prepare_data(inputs):