Merge pull request #41 from keithito/tf-griffin-lim

Add TensorFlow implementation of Griffin-Lim
pull/2/head v0.2.0
Keith Ito 2017-09-12 20:58:42 -07:00 committed by GitHub
commit 522826dd77
4 changed files with 83 additions and 29 deletions

View File

@ -28,17 +28,23 @@ Pull requests are welcome!
## Quick Start
### Installing dependencies
Make sure you have installed Python 3 and [TensorFlow](https://www.tensorflow.org/install/). Then:
```
pip install -r requirements.txt
```
1. Install Python 3.
2. Install [TensorFlow 1.3](https://www.tensorflow.org/install/). Install with GPU support if it's
available for your platform.
3. Install requirements:
```
pip install -r requirements.txt
```
### Using a pre-trained model
1. **Download and unpack a model**:
```
curl http://data.keithito.com/data/speech/tacotron-20170720.tar.bz2 | tar xj -C /tmp
curl http://data.keithito.com/data/speech/tacotron-20170720.tar.bz2 | tar xjC /tmp
```
2. **Run the demo server**:

View File

@ -1,10 +1,10 @@
# Note: this doesn't include tensorflow or tensorflow-gpu because the package you need to install
# depends on your platform. It is assumed you have already installed tensorflow.
falcon==1.2.0
inflect==0.2.5
librosa==0.5.1
matplotlib==2.0.2
numpy==1.13.0
scipy==0.19.0
tensorflow==1.2.0
tensorflow-gpu==1.2.0
tqdm==4.11.2
Unidecode==0.4.20

View File

@ -15,6 +15,7 @@ class Synthesizer:
with tf.variable_scope('model') as scope:
self.model = create_model(model_name, hparams)
self.model.initialize(inputs, input_lengths)
self.wav_output = audio.inv_spectrogram_tensorflow(self.model.linear_outputs[0])
print('Loading checkpoint: %s' % checkpoint_path)
self.session = tf.Session()
@ -30,7 +31,7 @@ class Synthesizer:
self.model.inputs: [np.asarray(seq, dtype=np.int32)],
self.model.input_lengths: np.asarray([len(seq)], dtype=np.int32)
}
spec = self.session.run(self.model.linear_outputs[0], feed_dict=feed_dict)
wav = self.session.run(self.wav_output, feed_dict=feed_dict)
out = io.BytesIO()
audio.save_wav(audio.inv_spectrogram(spec.T), out)
audio.save_wav(audio.inv_preemphasis(wav), out)
return out.getvalue()

View File

@ -2,6 +2,7 @@ import librosa
import librosa.filters
import math
import numpy as np
import tensorflow as tf
from scipy import signal
from hparams import hparams
@ -15,50 +16,96 @@ def save_wav(wav, path):
librosa.output.write_wav(path, wav.astype(np.int16), hparams.sample_rate)
def preemphasis(x):
    '''Filters x with the FIR filter y[n] = x[n] - hparams.preemphasis * x[n-1].'''
    numerator = [1, -hparams.preemphasis]
    denominator = [1]
    return signal.lfilter(numerator, denominator, x)
def inv_preemphasis(x):
    '''Filters x with the IIR filter y[n] = x[n] + hparams.preemphasis * y[n-1].

    The coefficient lists are those of preemphasis with numerator and denominator swapped.
    '''
    numerator = [1]
    denominator = [1, -hparams.preemphasis]
    return signal.lfilter(numerator, denominator, x)
# Converts waveform y to a normalized spectrogram: pre-emphasis, STFT, magnitude in dB
# shifted down by hparams.ref_level_db, then _normalize.
# NOTE(review): the two consecutive `D = ...` assignments differ only in the helper name
# (_preemphasis vs. preemphasis); this looks like the before/after of a rename captured
# without diff markers. As written, the second assignment overwrites the first -- confirm
# which one is meant to remain.
def spectrogram(y):
D = _stft(_preemphasis(y))
D = _stft(preemphasis(y))
S = _amp_to_db(np.abs(D)) - hparams.ref_level_db
return _normalize(S)
# Inverse of spectrogram(): denormalize, add hparams.ref_level_db back, convert dB to
# amplitude, raise to hparams.power, run Griffin-Lim, then undo the pre-emphasis filter.
# NOTE(review): there are two consecutive `return` statements that differ only in the helper
# name (_inv_preemphasis vs. inv_preemphasis); the second is unreachable as written. This
# looks like before/after lines of a rename captured without diff markers -- confirm which
# one is meant to remain.
def inv_spectrogram(spectrogram):
'''Converts spectrogram to waveform using librosa'''
S = _db_to_amp(_denormalize(spectrogram) + hparams.ref_level_db) # Convert back to linear
return _inv_preemphasis(_griffin_lim(S ** hparams.power)) # Reconstruct phase
return inv_preemphasis(_griffin_lim(S ** hparams.power)) # Reconstruct phase
def inv_spectrogram_tensorflow(spectrogram):
    '''Builds computational graph to convert spectrogram to waveform using TensorFlow.

    Unlike inv_spectrogram, this does NOT invert the preemphasis. The caller should call
    inv_preemphasis on the output after running the graph.
    '''
    # Undo normalization and the reference-level shift, then go from dB to amplitude.
    level_db = _denormalize_tensorflow(spectrogram) + hparams.ref_level_db
    magnitudes = _db_to_amp_tensorflow(level_db)
    # Raise the magnitudes to hparams.power before phase reconstruction.
    return _griffin_lim_tensorflow(tf.pow(magnitudes, hparams.power))
# Converts waveform y to a normalized mel spectrogram: pre-emphasis, STFT, magnitude passed
# through _linear_to_mel, converted to dB, then _normalize. Unlike spectrogram(), no
# hparams.ref_level_db offset is subtracted here.
# NOTE(review): the two consecutive `D = ...` assignments differ only in the helper name
# (_preemphasis vs. preemphasis); this looks like the before/after of a rename captured
# without diff markers. As written, the second assignment overwrites the first -- confirm
# which one is meant to remain.
def melspectrogram(y):
D = _stft(_preemphasis(y))
D = _stft(preemphasis(y))
S = _amp_to_db(_linear_to_mel(np.abs(D)))
return _normalize(S)
def inv_melspectrogram(melspectrogram):
    '''Converts a normalized mel spectrogram to a waveform.'''
    # Convert back to linear
    amplitudes = _db_to_amp(_denormalize(melspectrogram))
    linear = _mel_to_linear(amplitudes)
    # Reconstruct phase
    waveform = _griffin_lim(linear ** hparams.power)
    return _inv_preemphasis(waveform)
# Based on https://github.com/librosa/librosa/issues/434
# Iterative phase reconstruction: starts from random phases, then for
# hparams.griffin_lim_iters rounds re-estimates the phase from the STFT of the current
# waveform while keeping the magnitudes |S| fixed.
# NOTE(review): the loop body contains both an `if i > 0:` guard followed by the `angles`
# update and a second, identical `angles` update. Indentation is lost here, so it is not
# possible to tell which lines belong to the guard; this looks like old and new revisions
# of the loop interleaved without diff markers -- confirm against the real file.
def _griffin_lim(S):
'''librosa implementation of Griffin-Lim
Based on https://github.com/librosa/librosa/issues/434
'''
# Random initial phase: unit-modulus complex numbers with the same shape as S.
angles = np.exp(2j * np.pi * np.random.rand(*S.shape))
# NOTE(review): `np.complex` is a deprecated alias that newer NumPy releases removed;
# it works with the numpy==1.13.0 pinned in requirements but would need `complex` on upgrade.
S_complex = np.abs(S).astype(np.complex)
y = _istft(S_complex * angles)
for i in range(hparams.griffin_lim_iters):
if i > 0:
angles = np.exp(1j * np.angle(_stft(y)))
# Keep only the phase of the re-analyzed signal (unit-modulus complex).
angles = np.exp(1j * np.angle(_stft(y)))
y = _istft(S_complex * angles)
return y
def _griffin_lim_tensorflow(S):
    '''TensorFlow implementation of Griffin-Lim

    Based on https://github.com/Kyubyong/tensorflow-exercises/blob/master/Audio_Processing.ipynb

    Builds hparams.griffin_lim_iters rounds of STFT -> phase estimate -> inverse STFT into
    the graph, keeping the input magnitudes fixed, and returns the waveform tensor with the
    leading batch axis removed.
    '''
    with tf.variable_scope('griffinlim'):
        # TensorFlow's stft and istft operate on a batch of spectrograms; create batch of size 1
        batched = tf.expand_dims(S, 0)
        magnitudes = tf.identity(tf.cast(batched, dtype=tf.complex64))
        # First pass: invert the magnitudes as-is (no phase estimate yet).
        waveform = _istft_tensorflow(magnitudes)
        for _ in range(hparams.griffin_lim_iters):
            estimate = _stft_tensorflow(waveform)
            # Divide by the magnitude to keep only the phase; the 1e-8 floor avoids
            # division by zero.
            modulus = tf.maximum(1e-8, tf.abs(estimate))
            phase = estimate / tf.cast(modulus, tf.complex64)
            waveform = _istft_tensorflow(magnitudes * phase)
        return tf.squeeze(waveform, 0)
# Short-time Fourier transform of waveform y via librosa, using the FFT size, hop length
# and window length derived from hparams.
# NOTE(review): the three inline assignments below compute the same expressions that
# _stft_parameters() contains, and are then overwritten by the call to it. They look like
# pre-refactor lines captured without diff markers -- confirm they are meant to be gone.
def _stft(y):
n_fft = (hparams.num_freq - 1) * 2
hop_length = int(hparams.frame_shift_ms / 1000 * hparams.sample_rate)
win_length = int(hparams.frame_length_ms / 1000 * hparams.sample_rate)
n_fft, hop_length, win_length = _stft_parameters()
return librosa.stft(y=y, n_fft=n_fft, hop_length=hop_length, win_length=win_length)
def _istft(y):
    '''Inverse STFT via librosa using the hop and window lengths from _stft_parameters.'''
    # The FFT size is not passed to librosa.istft, so the first value is discarded.
    _unused_n_fft, hop, win = _stft_parameters()
    return librosa.istft(y, hop_length=hop, win_length=win)
def _stft_tensorflow(signals):
    '''Builds a tf.contrib.signal.stft op over signals using the parameters from _stft_parameters.'''
    fft_size, hop, win = _stft_parameters()
    return tf.contrib.signal.stft(
        signals,
        frame_length=win,
        frame_step=hop,
        fft_length=fft_size,
        pad_end=False)
def _istft_tensorflow(stfts):
    '''Builds a tf.contrib.signal.inverse_stft op over stfts using the parameters from _stft_parameters.'''
    fft_size, hop, win = _stft_parameters()
    return tf.contrib.signal.inverse_stft(
        stfts,
        frame_length=win,
        frame_step=hop,
        fft_length=fft_size)
# Derives the STFT settings from hparams and returns them as (n_fft, hop_length, win_length):
#   n_fft      = (hparams.num_freq - 1) * 2
#   hop_length = frame_shift_ms converted to samples at hparams.sample_rate (truncated to int)
#   win_length = frame_length_ms converted to samples at hparams.sample_rate (truncated to int)
# NOTE(review): the `return librosa.istft(y, ...)` line references `y`, which is not defined
# in this function, and it precedes the tuple return. It looks like a leftover line from an
# older `_istft` body captured without diff markers -- confirm it is meant to be gone.
def _stft_parameters():
n_fft = (hparams.num_freq - 1) * 2
hop_length = int(hparams.frame_shift_ms / 1000 * hparams.sample_rate)
win_length = int(hparams.frame_length_ms / 1000 * hparams.sample_rate)
return librosa.istft(y, hop_length=hop_length, win_length=win_length)
return n_fft, hop_length, win_length
# Conversions:
@ -88,14 +135,14 @@ def _amp_to_db(x):
def _db_to_amp(x):
return np.power(10.0, x * 0.05)
def _preemphasis(x):
    '''Filters x with the FIR filter y[n] = x[n] - hparams.preemphasis * x[n-1].'''
    numerator = [1, -hparams.preemphasis]
    denominator = [1]
    return signal.lfilter(numerator, denominator, x)
def _inv_preemphasis(x):
    '''Filters x with the IIR filter y[n] = x[n] + hparams.preemphasis * y[n-1].'''
    numerator = [1]
    denominator = [1, -hparams.preemphasis]
    return signal.lfilter(numerator, denominator, x)
def _db_to_amp_tensorflow(x):
    '''Builds a graph op converting decibel values to amplitude: 10 ** (x / 20).

    TensorFlow counterpart of _db_to_amp.
    '''
    # tf.pow broadcasts a scalar base against the exponent tensor, so there is no need to
    # materialize a tensor of tens with the same shape as x.
    return tf.pow(10.0, x * 0.05)
def _normalize(S):
    '''Maps dB values from [hparams.min_level_db, 0] onto [0, 1], clipping anything outside.'''
    floor_db = hparams.min_level_db
    scaled = (S - floor_db) / -floor_db
    return np.clip(scaled, 0, 1)
def _denormalize(S):
    '''Clips S to [0, 1] and maps it back onto [hparams.min_level_db, 0] dB.'''
    floor_db = hparams.min_level_db
    clipped = np.clip(S, 0, 1)
    return (clipped * -floor_db) + floor_db
def _denormalize_tensorflow(S):
    '''Builds graph ops that clip S to [0, 1] and map it back onto [hparams.min_level_db, 0] dB.

    TensorFlow counterpart of _denormalize.
    '''
    floor_db = hparams.min_level_db
    clipped = tf.clip_by_value(S, 0, 1)
    return (clipped * -floor_db) + floor_db