diff --git a/hparams.py b/hparams.py index 352c655..dc81a65 100644 --- a/hparams.py +++ b/hparams.py @@ -10,28 +10,30 @@ hparams = tf.contrib.training.HParams( # Audio: num_mels=80, num_freq=1025, - sample_rate=20000, + min_mel_freq=125, + max_mel_freq=7600, + sample_rate=22000, frame_length_ms=50, frame_shift_ms=12.5, - preemphasis=0.97, min_level_db=-100, ref_level_db=20, # Model: # TODO: add more configurable hparams outputs_per_step=5, + embedding_dim=512, # Training: batch_size=32, adam_beta1=0.9, adam_beta2=0.999, - initial_learning_rate=0.002, - decay_learning_rate=True, - use_cmudict=False, # Use CMUDict during training to learn pronunciation of ARPAbet phonemes + initial_learning_rate=0.0015, + learning_rate_decay_halflife=100000, + use_cmudict=True, # Use CMUDict during training to learn pronunciation of ARPAbet phonemes # Eval: max_iters=200, - griffin_lim_iters=60, + griffin_lim_iters=50, power=1.5, # Power to raise magnitudes to prior to Griffin-Lim ) diff --git a/models/tacotron.py b/models/tacotron.py index 46b3376..eb50589 100644 --- a/models/tacotron.py +++ b/models/tacotron.py @@ -39,7 +39,7 @@ class Tacotron(): # Embeddings embedding_table = tf.get_variable( - 'embedding', [len(symbols), 256], dtype=tf.float32, + 'embedding', [len(symbols), hp.embedding_dim], dtype=tf.float32, initializer=tf.truncated_normal_initializer(stddev=0.5)) embedded_inputs = tf.nn.embedding_lookup(embedding_table, inputs) # [N, T_in, 256] @@ -127,10 +127,8 @@ class Tacotron(): ''' with tf.variable_scope('optimizer') as scope: hp = self._hparams - if hp.decay_learning_rate: - self.learning_rate = _learning_rate_decay(hp.initial_learning_rate, global_step) - else: - self.learning_rate = tf.convert_to_tensor(hp.initial_learning_rate) + self.learning_rate = tf.train.exponential_decay( + hp.initial_learning_rate, global_step, hp.learning_rate_decay_halflife, 0.5) optimizer = tf.train.AdamOptimizer(self.learning_rate, hp.adam_beta1, hp.adam_beta2) gradients, variables = zip(*optimizer.compute_gradients(self.loss)) self.gradients = gradients @@ -141,10 +139,3 @@ class Tacotron(): with tf.control_dependencies(tf.get_collection(tf.GraphKeys.UPDATE_OPS)): self.optimize = optimizer.apply_gradients(zip(clipped_gradients, variables), global_step=global_step) - - -def _learning_rate_decay(init_lr, global_step): - # Noam scheme from tensor2tensor: - warmup_steps = 4000.0 - step = tf.cast(global_step + 1, dtype=tf.float32) - return init_lr * warmup_steps**0.5 * tf.minimum(step * warmup_steps**-1.5, step**-0.5) diff --git a/synthesizer.py b/synthesizer.py index ded61c8..3ccbaaa 100644 --- a/synthesizer.py +++ b/synthesizer.py @@ -33,7 +33,6 @@ class Synthesizer: self.model.input_lengths: np.asarray([len(seq)], dtype=np.int32) } wav = self.session.run(self.wav_output, feed_dict=feed_dict) - wav = audio.inv_preemphasis(wav) wav = wav[:audio.find_endpoint(wav)] out = io.BytesIO() audio.save_wav(wav, out) diff --git a/util/audio.py b/util/audio.py index 7b9cf04..4f8c8c9 100644 --- a/util/audio.py +++ b/util/audio.py @@ -16,16 +16,8 @@ def save_wav(wav, path): librosa.output.write_wav(path, wav.astype(np.int16), hparams.sample_rate) -def preemphasis(x): - return signal.lfilter([1, -hparams.preemphasis], [1], x) - - -def inv_preemphasis(x): - return signal.lfilter([1], [1, -hparams.preemphasis], x) - - def spectrogram(y): - D = _stft(preemphasis(y)) + D = _stft(y) S = _amp_to_db(np.abs(D)) - hparams.ref_level_db return _normalize(S) @@ -33,21 +25,17 @@ def spectrogram(y): def inv_spectrogram(spectrogram): '''Converts spectrogram to waveform using librosa''' S = _db_to_amp(_denormalize(spectrogram) + hparams.ref_level_db) # Convert back to linear - return inv_preemphasis(_griffin_lim(S ** hparams.power)) # Reconstruct phase + return _griffin_lim(S ** hparams.power) # Reconstruct phase def inv_spectrogram_tensorflow(spectrogram): - '''Builds computational graph to convert spectrogram to waveform using TensorFlow. - - Unlike inv_spectrogram, this does NOT invert the preemphasis. The caller should call - inv_preemphasis on the output after running the graph. - ''' + '''Builds computational graph to convert spectrogram to waveform using TensorFlow.''' S = _db_to_amp_tensorflow(_denormalize_tensorflow(spectrogram) + hparams.ref_level_db) return _griffin_lim_tensorflow(tf.pow(S, hparams.power)) def melspectrogram(y): - D = _stft(preemphasis(y)) + D = _stft(y) S = _amp_to_db(_linear_to_mel(np.abs(D))) - hparams.ref_level_db return _normalize(S) @@ -130,7 +118,8 @@ def _linear_to_mel(spectrogram): def _build_mel_basis(): n_fft = (hparams.num_freq - 1) * 2 - return librosa.filters.mel(hparams.sample_rate, n_fft, n_mels=hparams.num_mels) + return librosa.filters.mel(hparams.sample_rate, n_fft, n_mels=hparams.num_mels, + fmin=hparams.min_mel_freq, fmax=hparams.max_mel_freq) def _amp_to_db(x): return 20 * np.log10(np.maximum(1e-5, x))