mirror of https://github.com/MycroftAI/mimic2.git
commit 93b73a4746

hparams.py
@@ -10,28 +10,30 @@ hparams = tf.contrib.training.HParams(
   # Audio:
   num_mels=80,
   num_freq=1025,
-  sample_rate=20000,
+  min_mel_freq=125,
+  max_mel_freq=7600,
+  sample_rate=22000,
   frame_length_ms=50,
   frame_shift_ms=12.5,
-  preemphasis=0.97,
   min_level_db=-100,
   ref_level_db=20,

   # Model:
   # TODO: add more configurable hparams
   outputs_per_step=5,
+  embedding_dim=512,

   # Training:
   batch_size=32,
   adam_beta1=0.9,
   adam_beta2=0.999,
-  initial_learning_rate=0.002,
-  decay_learning_rate=True,
-  use_cmudict=False,  # Use CMUDict during training to learn pronunciation of ARPAbet phonemes
+  initial_learning_rate=0.0015,
+  learning_rate_decay_halflife=100000,
+  use_cmudict=True,  # Use CMUDict during training to learn pronunciation of ARPAbet phonemes

   # Eval:
   max_iters=200,
-  griffin_lim_iters=60,
+  griffin_lim_iters=50,
   power=1.5,  # Power to raise magnitudes to prior to Griffin-Lim
 )
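The commit retunes the audio front end (22 kHz sample rate, a 125-7600 Hz mel range, pre-emphasis dropped) and the training defaults (lower initial learning rate, halflife-based decay, CMUDict enabled). Individual values can still be overridden at run time through the TF 1.x HParams API; a minimal sketch, assuming only that the hparams object above is importable from hparams.py:

from hparams import hparams

# Override a few defaults with a comma-separated "name=value" string
# (tf.contrib.training.HParams.parse, TF 1.x).
hparams.parse('batch_size=16,initial_learning_rate=0.001,use_cmudict=False')
print(hparams.batch_size, hparams.sample_rate)  # 16 22000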
@@ -39,7 +39,7 @@ class Tacotron():

       # Embeddings
       embedding_table = tf.get_variable(
-        'embedding', [len(symbols), 256], dtype=tf.float32,
+        'embedding', [len(symbols), hp.embedding_dim], dtype=tf.float32,
         initializer=tf.truncated_normal_initializer(stddev=0.5))
       embedded_inputs = tf.nn.embedding_lookup(embedding_table, inputs)  # [N, T_in, 256]

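In the Tacotron model, the embedding width is now read from hparams, so the character embedding defaults to 512 dimensions instead of the hard-coded 256 (the trailing shape comment still reads 256), and the lookup output becomes [N, T_in, hp.embedding_dim]. A minimal NumPy sketch of that lookup; the symbol count below is a made-up stand-in for len(symbols):

import numpy as np

num_symbols, embedding_dim = 150, 512      # hypothetical vocabulary size; new default dim
embedding_table = np.random.normal(0.0, 0.5, size=(num_symbols, embedding_dim))

inputs = np.array([[12, 45, 7, 3]])        # [N=1, T_in=4] symbol ids
embedded_inputs = embedding_table[inputs]  # fancy indexing == embedding lookup
print(embedded_inputs.shape)               # (1, 4, 512), i.e. [N, T_in, embedding_dim]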
@@ -127,10 +127,8 @@ class Tacotron():
     '''
     with tf.variable_scope('optimizer') as scope:
       hp = self._hparams
-      if hp.decay_learning_rate:
-        self.learning_rate = _learning_rate_decay(hp.initial_learning_rate, global_step)
-      else:
-        self.learning_rate = tf.convert_to_tensor(hp.initial_learning_rate)
+      self.learning_rate = tf.train.exponential_decay(
+        hp.initial_learning_rate, global_step, hp.learning_rate_decay_halflife, 0.5)
       optimizer = tf.train.AdamOptimizer(self.learning_rate, hp.adam_beta1, hp.adam_beta2)
       gradients, variables = zip(*optimizer.compute_gradients(self.loss))
       self.gradients = gradients
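The optimizer now uses a plain exponential decay with decay_rate 0.5, so the learning rate halves every learning_rate_decay_halflife steps (continuously, since staircase defaults to False in tf.train.exponential_decay). A quick TF-free sketch of what that evaluates to with the new defaults:

initial_lr, halflife = 0.0015, 100000

def decayed_lr(step):
    # Same curve as tf.train.exponential_decay(initial_lr, step, halflife, 0.5)
    return initial_lr * 0.5 ** (step / halflife)

for step in (0, 50000, 100000, 200000, 400000):
    print(step, round(decayed_lr(step), 7))
# 0: 0.0015 | 50k: ~0.0010607 | 100k: 0.00075 | 200k: 0.000375 | 400k: ~0.0000938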
@@ -141,10 +139,3 @@ class Tacotron():
       with tf.control_dependencies(tf.get_collection(tf.GraphKeys.UPDATE_OPS)):
         self.optimize = optimizer.apply_gradients(zip(clipped_gradients, variables),
           global_step=global_step)
-
-
-def _learning_rate_decay(init_lr, global_step):
-  # Noam scheme from tensor2tensor:
-  warmup_steps = 4000.0
-  step = tf.cast(global_step + 1, dtype=tf.float32)
-  return init_lr * warmup_steps**0.5 * tf.minimum(step * warmup_steps**-1.5, step**-0.5)
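For reference, the deleted helper implemented the Noam scheme from tensor2tensor: a linear warmup that peaks at roughly init_lr around warmup_steps, then a 1/sqrt(step) decay. A small pure-Python sketch of the removed curve with the old initial rate of 0.002:

def noam_lr(init_lr, global_step, warmup_steps=4000.0):
    # Pure-Python version of the removed _learning_rate_decay helper.
    step = float(global_step + 1)
    return init_lr * warmup_steps ** 0.5 * min(step * warmup_steps ** -1.5, step ** -0.5)

for s in (0, 1000, 3999, 20000, 100000):
    print(s, round(noam_lr(0.002, s), 6))
# ramps up to ~0.002 at step ~4000, then falls off as 1/sqrt(step) (~0.0004 by step 100k)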
@@ -33,7 +33,6 @@ class Synthesizer:
       self.model.input_lengths: np.asarray([len(seq)], dtype=np.int32)
     }
     wav = self.session.run(self.wav_output, feed_dict=feed_dict)
-    wav = audio.inv_preemphasis(wav)
     wav = wav[:audio.find_endpoint(wav)]
     out = io.BytesIO()
     audio.save_wav(wav, out)
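The Synthesizer no longer calls audio.inv_preemphasis because the commit removes pre-emphasis from the audio pipeline entirely: preemphasis=0.97 is gone from hparams, and the two filter helpers are deleted from the audio module in the next hunk. For context, this is what the removed pair computed; a self-contained sketch with the old coefficient hard-coded:

import numpy as np
from scipy import signal

def preemphasis(x, coef=0.97):
    # y[n] = x[n] - coef * x[n-1]: boosts high frequencies before spectral analysis.
    return signal.lfilter([1, -coef], [1], x)

def inv_preemphasis(x, coef=0.97):
    # Inverse IIR filter: undoes the boost on the synthesized waveform.
    return signal.lfilter([1], [1, -coef], x)

x = np.random.randn(22000)
assert np.allclose(inv_preemphasis(preemphasis(x)), x)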
@@ -16,16 +16,8 @@ def save_wav(wav, path):
   librosa.output.write_wav(path, wav.astype(np.int16), hparams.sample_rate)


-def preemphasis(x):
-  return signal.lfilter([1, -hparams.preemphasis], [1], x)
-
-
-def inv_preemphasis(x):
-  return signal.lfilter([1], [1, -hparams.preemphasis], x)
-
-
 def spectrogram(y):
-  D = _stft(preemphasis(y))
+  D = _stft(y)
   S = _amp_to_db(np.abs(D)) - hparams.ref_level_db
   return _normalize(S)

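spectrogram(y) (and melspectrogram below) now run the STFT on the raw waveform. A rough standalone sketch of the linear-spectrogram path; the window and hop sizes are derived here from the hparams (n_fft = (num_freq - 1) * 2, frame_length_ms, frame_shift_ms), and the [0, 1] normalization is an assumption, since _stft and _normalize are not part of this diff:

import numpy as np
import librosa

# Values taken from the updated hparams in this commit.
sample_rate, num_freq = 22000, 1025
frame_shift_ms, frame_length_ms = 12.5, 50
ref_level_db, min_level_db = 20, -100

n_fft = (num_freq - 1) * 2                               # 2048
hop_length = int(frame_shift_ms / 1000 * sample_rate)    # 275
win_length = int(frame_length_ms / 1000 * sample_rate)   # 1100

def amp_to_db(x):
    return 20 * np.log10(np.maximum(1e-5, x))

y = np.random.randn(sample_rate).astype(np.float32)      # 1 s of noise as a stand-in for audio
D = librosa.stft(y, n_fft=n_fft, hop_length=hop_length, win_length=win_length)
S = amp_to_db(np.abs(D)) - ref_level_db                   # note: no preemphasis(y) any more
S_norm = np.clip((S - min_level_db) / -min_level_db, 0, 1)  # assumed [0, 1] normalization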
@@ -33,21 +25,17 @@ def spectrogram(y):
 def inv_spectrogram(spectrogram):
   '''Converts spectrogram to waveform using librosa'''
   S = _db_to_amp(_denormalize(spectrogram) + hparams.ref_level_db)  # Convert back to linear
-  return inv_preemphasis(_griffin_lim(S ** hparams.power))          # Reconstruct phase
+  return _griffin_lim(S ** hparams.power)                           # Reconstruct phase


 def inv_spectrogram_tensorflow(spectrogram):
-  '''Builds computational graph to convert spectrogram to waveform using TensorFlow.
-
-  Unlike inv_spectrogram, this does NOT invert the preemphasis. The caller should call
-  inv_preemphasis on the output after running the graph.
-  '''
+  '''Builds computational graph to convert spectrogram to waveform using TensorFlow.'''
   S = _db_to_amp_tensorflow(_denormalize_tensorflow(spectrogram) + hparams.ref_level_db)
   return _griffin_lim_tensorflow(tf.pow(S, hparams.power))


 def melspectrogram(y):
-  D = _stft(preemphasis(y))
+  D = _stft(y)
   S = _amp_to_db(_linear_to_mel(np.abs(D))) - hparams.ref_level_db
   return _normalize(S)

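Inversion keeps the same recipe: denormalize, undo the dB scaling, raise the magnitudes to hparams.power (1.5, which sharpens spectral peaks before phase reconstruction), and run Griffin-Lim for griffin_lim_iters iterations, now 50. A generic Griffin-Lim sketch for context, not the repo's _griffin_lim; n_fft, hop_length, and win_length reuse the assumptions above, and S_mag would be a (1025, n_frames) magnitude array:

import numpy as np
import librosa

def griffin_lim_sketch(S_mag, n_iter=50, n_fft=2048, hop_length=275, win_length=1100):
    # Alternate ISTFT/STFT projections, keeping the known magnitude and re-estimating phase.
    angles = np.exp(2j * np.pi * np.random.rand(*S_mag.shape))
    complex_spec = S_mag * angles
    for _ in range(n_iter):
        y = librosa.istft(complex_spec, hop_length=hop_length, win_length=win_length)
        rebuilt = librosa.stft(y, n_fft=n_fft, hop_length=hop_length, win_length=win_length)
        angles = np.exp(1j * np.angle(rebuilt))
        complex_spec = S_mag * angles
    return librosa.istft(complex_spec, hop_length=hop_length, win_length=win_length)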
@@ -130,7 +118,8 @@ def _linear_to_mel(spectrogram):


 def _build_mel_basis():
   n_fft = (hparams.num_freq - 1) * 2
-  return librosa.filters.mel(hparams.sample_rate, n_fft, n_mels=hparams.num_mels)
+  return librosa.filters.mel(hparams.sample_rate, n_fft, n_mels=hparams.num_mels,
+                             fmin=hparams.min_mel_freq, fmax=hparams.max_mel_freq)


 def _amp_to_db(x):
   return 20 * np.log10(np.maximum(1e-5, x))
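The mel filterbank now honors the new min_mel_freq/max_mel_freq hparams, so the 80 mel bins cover 125-7600 Hz instead of the full Nyquist range. A small sketch of what _build_mel_basis produces with those defaults (values hard-coded here rather than read from hparams):

import numpy as np
import librosa

# Values taken from the updated hparams.
sample_rate, num_freq, num_mels = 22000, 1025, 80
min_mel_freq, max_mel_freq = 125, 7600

n_fft = (num_freq - 1) * 2
mel_basis = librosa.filters.mel(sr=sample_rate, n_fft=n_fft, n_mels=num_mels,
                                fmin=min_mel_freq, fmax=max_mel_freq)
print(mel_basis.shape)                 # (80, 1025): maps a linear spectrogram frame to 80 mel bins
linear_frame = np.abs(np.random.randn(num_freq))
mel_frame = mel_basis @ linear_frame   # one mel-spectrogram frame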