Merge pull request #41 from keithito/tf-griffin-lim

Add TensorFlow implementation of Griffin-Lim
2017-09-12 20:58:42 -07:00 · 2017-09-12 20:58:42 -07:00 · 522826dd77
parent ab5dae1e18 2460969dc9
commit 522826dd77
4 changed files with 83 additions and 29 deletions
--- a/README.md
+++ b/README.md
@@ -28,17 +28,23 @@ Pull requests are welcome!
 ## Quick Start

 ### Installing dependencies
-Make sure you have installed Python 3 and [TensorFlow](https://www.tensorflow.org/install/). Then:
-```
-pip install -r requirements.txt
-```
+
+1. Install Python 3.
+
+2. Install [TensorFlow 1.3](https://www.tensorflow.org/install/). Install with GPU support if it's
+   available for your platform.
+
+3. Install requirements:
+   ```
+   pip install -r requirements.txt
+   ```


 ### Using a pre-trained model

 1. **Download and unpack a model**:
   ```
-   curl http://data.keithito.com/data/speech/tacotron-20170720.tar.bz2 | tar xj -C /tmp
+   curl http://data.keithito.com/data/speech/tacotron-20170720.tar.bz2 | tar xjC /tmp
   ```

 2. **Run the demo server**:
--- a/requirements.txt
+++ b/requirements.txt
@@ -1,10 +1,10 @@
+# Note: this doesn't include tensorflow or tensorflow-gpu because the package you need to install
+# depends on your platform. It is assumed you have already installed tensorflow.
 falcon==1.2.0
 inflect==0.2.5
 librosa==0.5.1
 matplotlib==2.0.2
 numpy==1.13.0
 scipy==0.19.0
-tensorflow==1.2.0
-tensorflow-gpu==1.2.0
 tqdm==4.11.2
 Unidecode==0.4.20
--- a/synthesizer.py
+++ b/synthesizer.py
@@ -15,6 +15,7 @@ class Synthesizer:
    with tf.variable_scope('model') as scope:
      self.model = create_model(model_name, hparams)
      self.model.initialize(inputs, input_lengths)
+      self.wav_output = audio.inv_spectrogram_tensorflow(self.model.linear_outputs[0])

    print('Loading checkpoint: %s' % checkpoint_path)
    self.session = tf.Session()
@@ -30,7 +31,7 @@ class Synthesizer:
      self.model.inputs: [np.asarray(seq, dtype=np.int32)],
      self.model.input_lengths: np.asarray([len(seq)], dtype=np.int32)
    }
-    spec = self.session.run(self.model.linear_outputs[0], feed_dict=feed_dict)
+    wav = self.session.run(self.wav_output, feed_dict=feed_dict)
    out = io.BytesIO()
-    audio.save_wav(audio.inv_spectrogram(spec.T), out)
+    audio.save_wav(audio.inv_preemphasis(wav), out)
    return out.getvalue()
--- a/util/audio.py
+++ b/util/audio.py
@@ -2,6 +2,7 @@ import librosa
 import librosa.filters
 import math
 import numpy as np
+import tensorflow as tf
 from scipy import signal
 from hparams import hparams

@@ -15,50 +16,96 @@ def save_wav(wav, path):
  librosa.output.write_wav(path, wav.astype(np.int16), hparams.sample_rate)


+def preemphasis(x):
+  return signal.lfilter([1, -hparams.preemphasis], [1], x)
+
+
+def inv_preemphasis(x):
+  return signal.lfilter([1], [1, -hparams.preemphasis], x)
+
+
 def spectrogram(y):
-  D = _stft(_preemphasis(y))
+  D = _stft(preemphasis(y))
  S = _amp_to_db(np.abs(D)) - hparams.ref_level_db
  return _normalize(S)


 def inv_spectrogram(spectrogram):
+  '''Converts spectrogram to waveform using librosa'''
  S = _db_to_amp(_denormalize(spectrogram) + hparams.ref_level_db)  # Convert back to linear
-  return _inv_preemphasis(_griffin_lim(S ** hparams.power))         # Reconstruct phase
+  return inv_preemphasis(_griffin_lim(S ** hparams.power))          # Reconstruct phase
+
+
+def inv_spectrogram_tensorflow(spectrogram):
+  '''Builds computational graph to convert spectrogram to waveform using TensorFlow.
+
+  Unlike inv_spectrogram, this does NOT invert the preemphasis. The caller should call
+  inv_preemphasis on the output after running the graph.
+  '''
+  S = _db_to_amp_tensorflow(_denormalize_tensorflow(spectrogram) + hparams.ref_level_db)
+  return _griffin_lim_tensorflow(tf.pow(S, hparams.power))


 def melspectrogram(y):
-  D = _stft(_preemphasis(y))
+  D = _stft(preemphasis(y))
  S = _amp_to_db(_linear_to_mel(np.abs(D)))
  return _normalize(S)


-def inv_melspectrogram(melspectrogram):
-  S = _mel_to_linear(_db_to_amp(_denormalize(melspectrogram)))   # Convert back to linear
-  return _inv_preemphasis(_griffin_lim(S ** hparams.power))      # Reconstruct phase
-
-
-# Based on https://github.com/librosa/librosa/issues/434
 def _griffin_lim(S):
+  '''librosa implementation of Griffin-Lim
+  Based on https://github.com/librosa/librosa/issues/434
+  '''
  angles = np.exp(2j * np.pi * np.random.rand(*S.shape))
  S_complex = np.abs(S).astype(np.complex)
+  y = _istft(S_complex * angles)
  for i in range(hparams.griffin_lim_iters):
-    if i > 0:
-      angles = np.exp(1j * np.angle(_stft(y)))
+    angles = np.exp(1j * np.angle(_stft(y)))
    y = _istft(S_complex * angles)
  return y


+def _griffin_lim_tensorflow(S):
+  '''TensorFlow implementation of Griffin-Lim
+  Based on https://github.com/Kyubyong/tensorflow-exercises/blob/master/Audio_Processing.ipynb
+  '''
+  with tf.variable_scope('griffinlim'):
+    # TensorFlow's stft and istft operate on a batch of spectrograms; create batch of size 1
+    S = tf.expand_dims(S, 0)
+    S_complex = tf.identity(tf.cast(S, dtype=tf.complex64))
+    y = _istft_tensorflow(S_complex)
+    for i in range(hparams.griffin_lim_iters):
+      est = _stft_tensorflow(y)
+      angles = est / tf.cast(tf.maximum(1e-8, tf.abs(est)), tf.complex64)
+      y = _istft_tensorflow(S_complex * angles)
+    return tf.squeeze(y, 0)
+
+
 def _stft(y):
-  n_fft = (hparams.num_freq - 1) * 2
-  hop_length = int(hparams.frame_shift_ms / 1000 * hparams.sample_rate)
-  win_length = int(hparams.frame_length_ms / 1000 * hparams.sample_rate)
+  n_fft, hop_length, win_length = _stft_parameters()
  return librosa.stft(y=y, n_fft=n_fft, hop_length=hop_length, win_length=win_length)


 def _istft(y):
+  _, hop_length, win_length = _stft_parameters()
+  return librosa.istft(y, hop_length=hop_length, win_length=win_length)
+
+
+def _stft_tensorflow(signals):
+  n_fft, hop_length, win_length = _stft_parameters()
+  return tf.contrib.signal.stft(signals, win_length, hop_length, n_fft, pad_end=False)
+
+
+def _istft_tensorflow(stfts):
+  n_fft, hop_length, win_length = _stft_parameters()
+  return tf.contrib.signal.inverse_stft(stfts, win_length, hop_length, n_fft)
+
+
+def _stft_parameters():
+  n_fft = (hparams.num_freq - 1) * 2
  hop_length = int(hparams.frame_shift_ms / 1000 * hparams.sample_rate)
  win_length = int(hparams.frame_length_ms / 1000 * hparams.sample_rate)
-  return librosa.istft(y, hop_length=hop_length, win_length=win_length)
+  return n_fft, hop_length, win_length


 # Conversions:
@@ -88,14 +135,14 @@ def _amp_to_db(x):
 def _db_to_amp(x):
  return np.power(10.0, x * 0.05)

-def _preemphasis(x):
-  return signal.lfilter([1, -hparams.preemphasis], [1], x)
-
-def _inv_preemphasis(x):
-  return signal.lfilter([1], [1, -hparams.preemphasis], x)
+def _db_to_amp_tensorflow(x):
+  return tf.pow(tf.ones(tf.shape(x)) * 10.0, x * 0.05)

 def _normalize(S):
  return np.clip((S - hparams.min_level_db) / -hparams.min_level_db, 0, 1)

 def _denormalize(S):
  return (np.clip(S, 0, 1) * -hparams.min_level_db) + hparams.min_level_db
+
+def _denormalize_tensorflow(S):
+  return (tf.clip_by_value(S, 0, 1) * -hparams.min_level_db) + hparams.min_level_db