mirror of https://github.com/MycroftAI/mimic2.git
Merge pull request #41 from keithito/tf-griffin-lim
Add TensorFlow implementation of Griffin-Lim pull/2/head v0.2.0
commit
522826dd77
16
README.md
16
README.md
|
@@ -28,17 +28,23 @@ Pull requests are welcome!
|
|||
## Quick Start
|
||||
|
||||
### Installing dependencies
|
||||
Make sure you have installed Python 3 and [TensorFlow](https://www.tensorflow.org/install/). Then:
|
||||
```
|
||||
pip install -r requirements.txt
|
||||
```
|
||||
|
||||
1. Install Python 3.
|
||||
|
||||
2. Install [TensorFlow 1.3](https://www.tensorflow.org/install/). Install with GPU support if it's
|
||||
available for your platform.
|
||||
|
||||
3. Install requirements:
|
||||
```
|
||||
pip install -r requirements.txt
|
||||
```
|
||||
|
||||
|
||||
### Using a pre-trained model
|
||||
|
||||
1. **Download and unpack a model**:
|
||||
```
|
||||
curl http://data.keithito.com/data/speech/tacotron-20170720.tar.bz2 | tar xj -C /tmp
|
||||
curl http://data.keithito.com/data/speech/tacotron-20170720.tar.bz2 | tar xjC /tmp
|
||||
```
|
||||
|
||||
2. **Run the demo server**:
|
||||
|
|
|
@@ -1,10 +1,10 @@
|
|||
# Note: this doesn't include tensorflow or tensorflow-gpu because the package you need to install
|
||||
# depends on your platform. It is assumed you have already installed tensorflow.
|
||||
falcon==1.2.0
|
||||
inflect==0.2.5
|
||||
librosa==0.5.1
|
||||
matplotlib==2.0.2
|
||||
numpy==1.13.0
|
||||
scipy==0.19.0
|
||||
tensorflow==1.2.0
|
||||
tensorflow-gpu==1.2.0
|
||||
tqdm==4.11.2
|
||||
Unidecode==0.4.20
|
||||
|
|
|
@@ -15,6 +15,7 @@ class Synthesizer:
|
|||
with tf.variable_scope('model') as scope:
|
||||
self.model = create_model(model_name, hparams)
|
||||
self.model.initialize(inputs, input_lengths)
|
||||
self.wav_output = audio.inv_spectrogram_tensorflow(self.model.linear_outputs[0])
|
||||
|
||||
print('Loading checkpoint: %s' % checkpoint_path)
|
||||
self.session = tf.Session()
|
||||
|
@@ -30,7 +31,7 @@ class Synthesizer:
|
|||
self.model.inputs: [np.asarray(seq, dtype=np.int32)],
|
||||
self.model.input_lengths: np.asarray([len(seq)], dtype=np.int32)
|
||||
}
|
||||
spec = self.session.run(self.model.linear_outputs[0], feed_dict=feed_dict)
|
||||
wav = self.session.run(self.wav_output, feed_dict=feed_dict)
|
||||
out = io.BytesIO()
|
||||
audio.save_wav(audio.inv_spectrogram(spec.T), out)
|
||||
audio.save_wav(audio.inv_preemphasis(wav), out)
|
||||
return out.getvalue()
|
||||
|
|
|
@@ -2,6 +2,7 @@ import librosa
|
|||
import librosa.filters
|
||||
import math
|
||||
import numpy as np
|
||||
import tensorflow as tf
|
||||
from scipy import signal
|
||||
from hparams import hparams
|
||||
|
||||
|
@@ -15,50 +16,96 @@ def save_wav(wav, path):
|
|||
librosa.output.write_wav(path, wav.astype(np.int16), hparams.sample_rate)
|
||||
|
||||
|
||||
def preemphasis(x):
|
||||
return signal.lfilter([1, -hparams.preemphasis], [1], x)
|
||||
|
||||
|
||||
def inv_preemphasis(x):
|
||||
return signal.lfilter([1], [1, -hparams.preemphasis], x)
|
||||
|
||||
|
||||
def spectrogram(y):
|
||||
D = _stft(_preemphasis(y))
|
||||
D = _stft(preemphasis(y))
|
||||
S = _amp_to_db(np.abs(D)) - hparams.ref_level_db
|
||||
return _normalize(S)
|
||||
|
||||
|
||||
def inv_spectrogram(spectrogram):
|
||||
'''Converts spectrogram to waveform using librosa'''
|
||||
S = _db_to_amp(_denormalize(spectrogram) + hparams.ref_level_db) # Convert back to linear
|
||||
return _inv_preemphasis(_griffin_lim(S ** hparams.power)) # Reconstruct phase
|
||||
return inv_preemphasis(_griffin_lim(S ** hparams.power)) # Reconstruct phase
|
||||
|
||||
|
||||
def inv_spectrogram_tensorflow(spectrogram):
|
||||
'''Builds computational graph to convert spectrogram to waveform using TensorFlow.
|
||||
|
||||
Unlike inv_spectrogram, this does NOT invert the preemphasis. The caller should call
|
||||
inv_preemphasis on the output after running the graph.
|
||||
'''
|
||||
S = _db_to_amp_tensorflow(_denormalize_tensorflow(spectrogram) + hparams.ref_level_db)
|
||||
return _griffin_lim_tensorflow(tf.pow(S, hparams.power))
|
||||
|
||||
|
||||
def melspectrogram(y):
|
||||
D = _stft(_preemphasis(y))
|
||||
D = _stft(preemphasis(y))
|
||||
S = _amp_to_db(_linear_to_mel(np.abs(D)))
|
||||
return _normalize(S)
|
||||
|
||||
|
||||
def inv_melspectrogram(melspectrogram):
|
||||
S = _mel_to_linear(_db_to_amp(_denormalize(melspectrogram))) # Convert back to linear
|
||||
return _inv_preemphasis(_griffin_lim(S ** hparams.power)) # Reconstruct phase
|
||||
|
||||
|
||||
# Based on https://github.com/librosa/librosa/issues/434
|
||||
def _griffin_lim(S):
|
||||
'''librosa implementation of Griffin-Lim
|
||||
Based on https://github.com/librosa/librosa/issues/434
|
||||
'''
|
||||
angles = np.exp(2j * np.pi * np.random.rand(*S.shape))
|
||||
S_complex = np.abs(S).astype(np.complex)
|
||||
y = _istft(S_complex * angles)
|
||||
for i in range(hparams.griffin_lim_iters):
|
||||
if i > 0:
|
||||
angles = np.exp(1j * np.angle(_stft(y)))
|
||||
angles = np.exp(1j * np.angle(_stft(y)))
|
||||
y = _istft(S_complex * angles)
|
||||
return y
|
||||
|
||||
|
||||
def _griffin_lim_tensorflow(S):
|
||||
'''TensorFlow implementation of Griffin-Lim
|
||||
Based on https://github.com/Kyubyong/tensorflow-exercises/blob/master/Audio_Processing.ipynb
|
||||
'''
|
||||
with tf.variable_scope('griffinlim'):
|
||||
# TensorFlow's stft and istft operate on a batch of spectrograms; create batch of size 1
|
||||
S = tf.expand_dims(S, 0)
|
||||
S_complex = tf.identity(tf.cast(S, dtype=tf.complex64))
|
||||
y = _istft_tensorflow(S_complex)
|
||||
for i in range(hparams.griffin_lim_iters):
|
||||
est = _stft_tensorflow(y)
|
||||
angles = est / tf.cast(tf.maximum(1e-8, tf.abs(est)), tf.complex64)
|
||||
y = _istft_tensorflow(S_complex * angles)
|
||||
return tf.squeeze(y, 0)
|
||||
|
||||
|
||||
def _stft(y):
|
||||
n_fft = (hparams.num_freq - 1) * 2
|
||||
hop_length = int(hparams.frame_shift_ms / 1000 * hparams.sample_rate)
|
||||
win_length = int(hparams.frame_length_ms / 1000 * hparams.sample_rate)
|
||||
n_fft, hop_length, win_length = _stft_parameters()
|
||||
return librosa.stft(y=y, n_fft=n_fft, hop_length=hop_length, win_length=win_length)
|
||||
|
||||
|
||||
def _istft(y):
|
||||
_, hop_length, win_length = _stft_parameters()
|
||||
return librosa.istft(y, hop_length=hop_length, win_length=win_length)
|
||||
|
||||
|
||||
def _stft_tensorflow(signals):
|
||||
n_fft, hop_length, win_length = _stft_parameters()
|
||||
return tf.contrib.signal.stft(signals, win_length, hop_length, n_fft, pad_end=False)
|
||||
|
||||
|
||||
def _istft_tensorflow(stfts):
|
||||
n_fft, hop_length, win_length = _stft_parameters()
|
||||
return tf.contrib.signal.inverse_stft(stfts, win_length, hop_length, n_fft)
|
||||
|
||||
|
||||
def _stft_parameters():
|
||||
n_fft = (hparams.num_freq - 1) * 2
|
||||
hop_length = int(hparams.frame_shift_ms / 1000 * hparams.sample_rate)
|
||||
win_length = int(hparams.frame_length_ms / 1000 * hparams.sample_rate)
|
||||
return librosa.istft(y, hop_length=hop_length, win_length=win_length)
|
||||
return n_fft, hop_length, win_length
|
||||
|
||||
|
||||
# Conversions:
|
||||
|
@@ -88,14 +135,14 @@ def _amp_to_db(x):
|
|||
def _db_to_amp(x):
|
||||
return np.power(10.0, x * 0.05)
|
||||
|
||||
def _preemphasis(x):
|
||||
return signal.lfilter([1, -hparams.preemphasis], [1], x)
|
||||
|
||||
def _inv_preemphasis(x):
|
||||
return signal.lfilter([1], [1, -hparams.preemphasis], x)
|
||||
def _db_to_amp_tensorflow(x):
|
||||
return tf.pow(tf.ones(tf.shape(x)) * 10.0, x * 0.05)
|
||||
|
||||
def _normalize(S):
|
||||
return np.clip((S - hparams.min_level_db) / -hparams.min_level_db, 0, 1)
|
||||
|
||||
def _denormalize(S):
|
||||
return (np.clip(S, 0, 1) * -hparams.min_level_db) + hparams.min_level_db
|
||||
|
||||
def _denormalize_tensorflow(S):
|
||||
return (tf.clip_by_value(S, 0, 1) * -hparams.min_level_db) + hparams.min_level_db
|
||||
|
|
Loading…
Reference in New Issue