mirror of https://github.com/MycroftAI/mimic2.git
Use vanilla exponential decay for learning rate
parent 1084310ed1
commit 70ba4aaf69
@@ -26,8 +26,8 @@ hparams = tf.contrib.training.HParams(
   batch_size=32,
   adam_beta1=0.9,
   adam_beta2=0.999,
-  initial_learning_rate=0.002,
-  decay_learning_rate=True,
+  initial_learning_rate=0.0015,
+  learning_rate_decay_halflife=100000,
   use_cmudict=False,  # Use CMUDict during training to learn pronunciation of ARPAbet phonemes
 
   # Eval:
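The renamed hyperparameter spells out the schedule: with initial_learning_rate=0.0015 and learning_rate_decay_halflife=100000, the learning rate is meant to halve every 100k steps. A minimal pure-Python sketch of that closed form (the helper name and sample steps are illustrative, not part of the repo):

```python
# Closed form implied by the new hparams: lr(step) = initial_lr * 0.5 ** (step / halflife).
# Hypothetical helper for illustration only.
def decayed_lr(step, initial_lr=0.0015, halflife=100000):
    return initial_lr * 0.5 ** (step / halflife)

for step in (0, 50000, 100000, 200000, 400000):
    print(step, round(decayed_lr(step), 6))
# 0 -> 0.0015, 100000 -> 0.00075, 200000 -> 0.000375, ...
```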
@@ -127,10 +127,8 @@ class Tacotron():
     '''
     with tf.variable_scope('optimizer') as scope:
       hp = self._hparams
-      if hp.decay_learning_rate:
-        self.learning_rate = _learning_rate_decay(hp.initial_learning_rate, global_step)
-      else:
-        self.learning_rate = tf.convert_to_tensor(hp.initial_learning_rate)
+      self.learning_rate = tf.train.exponential_decay(
+        hp.initial_learning_rate, global_step, hp.learning_rate_decay_halflife, 0.5)
       optimizer = tf.train.AdamOptimizer(self.learning_rate, hp.adam_beta1, hp.adam_beta2)
       gradients, variables = zip(*optimizer.compute_gradients(self.loss))
       self.gradients = gradients
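The model now always builds the schedule with tf.train.exponential_decay(lr, global_step, decay_steps, decay_rate), which evaluates to lr * decay_rate ** (global_step / decay_steps). Below is a self-contained TF1-style sketch of the same optimizer wiring in isolation, assuming the tf.compat.v1 shim so it runs under TF 2.x; the toy variable, loss, and clip norm are stand-ins rather than the repo's values:

```python
# Sketch of the optimizer block: exponential decay feeding Adam, with gradient
# clipping and the UPDATE_OPS dependency. Assumes TF 2.x with the compat.v1 shim;
# the variable, loss, and clip norm below are illustrative only.
import tensorflow.compat.v1 as tf
tf.disable_v2_behavior()

global_step = tf.train.get_or_create_global_step()
w = tf.get_variable('w', initializer=1.0)
loss = tf.square(w - 3.0)                      # stand-in for the Tacotron loss

# Vanilla exponential decay: rate halves every 100k steps (decay_rate=0.5).
learning_rate = tf.train.exponential_decay(0.0015, global_step, 100000, 0.5)
optimizer = tf.train.AdamOptimizer(learning_rate, 0.9, 0.999)

gradients, variables = zip(*optimizer.compute_gradients(loss))
clipped_gradients, _ = tf.clip_by_global_norm(gradients, 1.0)   # clip norm is illustrative
with tf.control_dependencies(tf.get_collection(tf.GraphKeys.UPDATE_OPS)):
    optimize = optimizer.apply_gradients(zip(clipped_gradients, variables),
                                         global_step=global_step)

with tf.Session() as sess:
    sess.run(tf.global_variables_initializer())
    sess.run(optimize)
    print(sess.run([global_step, learning_rate]))   # step 1, lr still ~0.0015
```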
@@ -141,10 +139,3 @@ class Tacotron():
       with tf.control_dependencies(tf.get_collection(tf.GraphKeys.UPDATE_OPS)):
         self.optimize = optimizer.apply_gradients(zip(clipped_gradients, variables),
           global_step=global_step)
-
-
-def _learning_rate_decay(init_lr, global_step):
-  # Noam scheme from tensor2tensor:
-  warmup_steps = 4000.0
-  step = tf.cast(global_step + 1, dtype=tf.float32)
-  return init_lr * warmup_steps**0.5 * tf.minimum(step * warmup_steps**-1.5, step**-0.5)
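The deleted _learning_rate_decay was the Noam warmup-then-inverse-square-root schedule from tensor2tensor. A quick sketch contrasting it with the new exponential half-life decay, using the formulas from the removed function and the new tf.train.exponential_decay call (the defaults mirror the old 0.002 and new 0.0015 initial rates; the sample steps are arbitrary):

```python
# Compare the removed Noam schedule with the new exponential half-life decay.
# Formulas come from the deleted _learning_rate_decay and the new
# tf.train.exponential_decay call; everything else here is illustrative.

def noam_lr(step, init_lr=0.002, warmup_steps=4000.0):
    step = float(step + 1)
    return init_lr * warmup_steps ** 0.5 * min(step * warmup_steps ** -1.5, step ** -0.5)

def exp_lr(step, init_lr=0.0015, halflife=100000):
    return init_lr * 0.5 ** (step / halflife)

for step in (1000, 4000, 50000, 200000):
    print(f'{step:>7d}  noam={noam_lr(step):.6f}  exp={exp_lr(step):.6f}')
```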