preprocess.py can now set hparams; added mailabs dataset (pull/15/head)
@ -0,0 +1,81 @@
from concurrent.futures import ProcessPoolExecutor
from functools import partial
import numpy as np
import os
from util import audio
def build_from_path(in_dir, out_dir, books, hparams, num_workers=1, tqdm=lambda x: x):
    '''Preprocesses the mailabs Speech dataset from a given input path into a given output directory.

    Args:
      in_dir: The directory that holds one sub-directory per book; each book directory is
        read for a `metadata.csv` file and a `wavs/` folder
      out_dir: The directory to write the output into
      books: Comma-separated names of the book sub-directories to process
      hparams: Hyperparameters object, passed through unchanged to _process_utterance
      num_workers: Optional number of worker processes to parallelize across
      tqdm: You can optionally pass tqdm to get a nice progress bar

    Returns:
      A list of tuples describing the training examples. This should be written to train.txt
    '''
    # Tolerate stray whitespace and empty entries (e.g. a trailing comma) in the book list.
    book_names = [entry.strip() for entry in books.split(',') if entry.strip()]
    print('preprocess these books', book_names)

    # We use ProcessPoolExecutor to parallelize across processes. This is just an optimization
    # and you can omit it and just call _process_utterance on each input if you want.
    # The `with` block makes sure the worker processes are shut down when we are done.
    with ProcessPoolExecutor(max_workers=num_workers) as executor:
        futures = []
        for book in book_names:
            book_dir = os.path.join(in_dir, book)
            with open(os.path.join(book_dir, 'metadata.csv'), encoding='utf-8') as f:
                for line in f:
                    line = line.strip()
                    if not line:
                        # Blank lines carry no utterance; skipping them avoids an IndexError.
                        continue
                    parts = line.split('|')
                    name = parts[0]
                    wav_path = os.path.join(book_dir, 'wavs', '%s.wav' % name)
                    # Third column: normalized version of the text, i.e. numbers converted
                    # to words.
                    text = parts[2]
                    futures.append(executor.submit(
                        _process_utterance, out_dir, name, wav_path, text, hparams))
        # Collect inside the `with` block so the progress bar tracks the running workers.
        return [future.result() for future in tqdm(futures)]
def _process_utterance(out_dir, name, wav_path, text, hparams):
    '''Preprocesses a single utterance audio/text pair.

    This writes the mel and linear scale spectrograms to disk and returns a tuple to write
    to the train.txt file.

    Args:
      out_dir: The directory to write the spectrograms into
      name: The utterance name to use in the spectrogram filenames
      wav_path: Path to the audio file containing the speech input
      text: The text spoken in the input audio file
      hparams: Hyperparameters object forwarded to every `audio` helper called here

    Returns:
      A (spectrogram_filename, mel_filename, n_frames, text) tuple to write to train.txt
    '''
    # Load the audio to a numpy array:
    wav = audio.load_wav(wav_path, hparams)

    # Trim silences before any spectrogram is computed:
    wav = audio.trim_silence(wav, hparams)

    # Compute the linear-scale spectrogram from the wav:
    spectrogram = audio.spectrogram(wav, hparams).astype(np.float32)
    # The frame count is read from the second axis of the (untransposed) spectrogram.
    n_frames = spectrogram.shape[1]

    # Compute a mel-scale spectrogram from the wav:
    mel_spectrogram = audio.melspectrogram(wav, hparams).astype(np.float32)

    # Write the spectrograms to disk (both are stored transposed):
    spectrogram_filename = 'mailabs-spec-{}.npy'.format(name)
    mel_filename = 'mailabs-mel-{}.npy'.format(name)
    np.save(os.path.join(out_dir, spectrogram_filename),
            spectrogram.T, allow_pickle=False)
    np.save(os.path.join(out_dir, mel_filename),
            mel_spectrogram.T, allow_pickle=False)

    # Return a tuple describing this training example:
    return (spectrogram_filename, mel_filename, n_frames, text)
@ -18,6 +18,11 @@ hparams = tf.contrib.training.HParams(
#MAILABS trim params
# Model:
# TODO: add more configurable hparams
@ -2,7 +2,7 @@ import argparse
import os
from multiprocessing import cpu_count
from tqdm import tqdm
from datasets import amy, blizzard, ljspeech, kusal
from datasets import amy, blizzard, ljspeech, kusal, mailabs
from hparams import hparams
@ -10,7 +10,8 @@ def preprocess_blizzard(args):
in_dir = os.path.join(args.base_dir, 'Blizzard2012')
out_dir = os.path.join(args.base_dir, args.output)
os.makedirs(out_dir, exist_ok=True)
metadata = blizzard.build_from_path(in_dir, out_dir, args.num_workers, tqdm=tqdm)
metadata = blizzard.build_from_path(
in_dir, out_dir, args.num_workers, tqdm=tqdm)
write_metadata(metadata, out_dir)
@ -18,7 +19,8 @@ def preprocess_ljspeech(args):
in_dir = os.path.join(args.base_dir, 'LJSpeech-1.0')
out_dir = os.path.join(args.base_dir, args.output)
os.makedirs(out_dir, exist_ok=True)
metadata = ljspeech.build_from_path(in_dir, out_dir, args.num_workers, tqdm=tqdm)
metadata = ljspeech.build_from_path(
in_dir, out_dir, args.num_workers, tqdm=tqdm)
write_metadata(metadata, out_dir)
@ -34,17 +36,29 @@ def preprocess_kusal(args):
in_dir = os.path.join(args.base_dir, 'kusal')
out_dir = os.path.join(args.base_dir, args.output)
os.makedirs(out_dir, exist_ok=True)
metadata = kusal.build_from_path(in_dir, out_dir, args.num_workers, tqdm=tqdm)
metadata = kusal.build_from_path(
in_dir, out_dir, args.num_workers, tqdm=tqdm)
write_metadata(metadata, out_dir)
def write_metadata(metadata, out_dir):
def preprocess_mailabs(args):
    '''Preprocesses the selected mailabs books and writes the training metadata.

    Reads `mailabs_books_dir`, `base_dir`, `output`, `books`, `hparams` and `num_workers`
    from the parsed command-line `args`.
    '''
    # The books directory is used as given; it is not joined onto base_dir.
    in_dir = args.mailabs_books_dir
    out_dir = os.path.join(args.base_dir, args.output)
    os.makedirs(out_dir, exist_ok=True)
    # tqdm is passed by keyword, matching the other preprocess_* entry points.
    metadata = mailabs.build_from_path(
        in_dir, out_dir, args.books, args.hparams, args.num_workers, tqdm=tqdm)
    write_metadata(metadata, out_dir, args.hparams)
def write_metadata(metadata, out_dir, hparams=hparams):
    '''Writes train.txt into out_dir and prints a short summary of the dataset.

    Args:
      metadata: Iterable of per-utterance tuples; element 2 is summed as the frame count
        and element 3 is measured with len() as the input
      out_dir: Directory that receives train.txt
      hparams: Hyperparameters object; only `frame_shift_ms` is read here
    '''
    # Materialize once: the records are walked several times below.
    metadata = list(metadata)
    with open(os.path.join(out_dir, 'train.txt'), 'w', encoding='utf-8') as f:
        f.writelines('|'.join(str(x) for x in m) + '\n' for m in metadata)

    frames = sum(m[2] for m in metadata)
    hours = frames * hparams.frame_shift_ms / (3600 * 1000)
    print('Wrote %d utterances, %d frames (%.2f hours)' %
          (len(metadata), frames, hours))
    # default=0 keeps an empty dataset from raising ValueError in max().
    print('Max input length:  %d' % max((len(m[3]) for m in metadata), default=0))
    print('Max output length: %d' % max((m[2] for m in metadata), default=0))
@ -53,9 +67,30 @@ def main():
parser = argparse.ArgumentParser()
parser.add_argument('--base_dir', default=os.path.expanduser('~/tacotron'))
parser.add_argument('--output', default='training')
parser.add_argument('--dataset', required=True, choices=['amy', 'blizzard', 'ljspeech', 'kusal'])
'--dataset', required=True, choices=['amy', 'blizzard', 'ljspeech', 'kusal', 'mailabs']
help='absolute directory to the books for the mlailabs')
help='comma-seperated and no space name of books i.e hunter_space,pink_fairy_book,etc.',
'--hparams', default='',
help='Hyperparameter overrides as a comma-separated list of name=value pairs')
parser.add_argument('--num_workers', type=int, default=cpu_count())
args = parser.parse_args()
if args.dataset == 'mailabs' and args.books is None:
parser.error("--books required if mailabs is chosen for dataset.")
if args.dataset == 'mailabs' and args.mailabs_books_dir is None:
"--mailabs_books_dir required if mailabs is chosen for dataset.")
args.hparams = hparams.parse(args.hparams)
if args.dataset == 'amy':
elif args.dataset == 'blizzard':
@ -64,6 +99,8 @@ def main():
elif args.dataset == 'kusal':
elif args.dataset == 'mailabs':
if __name__ == "__main__":
@ -73,7 +73,8 @@ def train(log_dir, args):
saver = tf.train.Saver(max_to_keep=5, keep_checkpoint_every_n_hours=2)
# Train!
with tf.Session() as sess:
gpu_options = tf.GPUOptions(per_process_gpu_memory_fraction=args.gpu_fraction)
with tf.Session(config=tf.ConfigProto(gpu_options=gpu_options)) as sess:
summary_writer = tf.summary.FileWriter(log_dir, sess.graph)
@ -140,6 +141,8 @@ def main():
parser.add_argument('--tf_log_level', type=int, default=1, help='Tensorflow C++ log level.')
parser.add_argument('--git', action='store_true', help='If set, verify that the client is clean.')
parser.add_argument('--gpu_assignment', default='0', help='Set the gpu the model should run on')
parser.add_argument('--gpu_fraction', type=float, default='1.0',
help='Set the fraction of gpu memory to allocate. 0 - 1')
args = parser.parse_args()
os.environ['CUDA_VISIBLE_DEVICES'] = args.gpu_assignment
os.environ['TF_CPP_MIN_LOG_LEVEL'] = str(args.tf_log_level)
@ -7,40 +7,50 @@ from scipy import signal
from hparams import hparams
def load_wav(path, hparams=hparams):
    '''Loads the audio file at `path`, resampled to `hparams.sample_rate`.

    Returns only the first element of librosa's result; the second element is discarded.
    '''
    samples, _ = librosa.core.load(path, sr=hparams.sample_rate)
    return samples
def save_wav(wav, path, hparams=hparams):
    '''Rescales `wav` to the 16-bit integer range and writes it to `path`.

    The caller's array is left untouched: scaling is done on a copy instead of in place.

    Args:
      wav: Waveform samples
      path: Destination file path
      hparams: Hyperparameters object; only `sample_rate` is read here
    '''
    wav = np.asarray(wav)
    # The 0.01 floor on the peak avoids a huge gain (or division by zero) for near-silent audio.
    peak = max(0.01, np.max(np.abs(wav)))
    scaled = wav * (32767 / peak)
    librosa.output.write_wav(path, scaled.astype(np.int16), hparams.sample_rate)
def spectrogram(y):
def trim_silence(wav, hparams=hparams):
return librosa.effects.trim(
wav, top_db=hparams.trim_top_db,
def spectrogram(y, hparams=hparams):
    '''Computes the normalized, dB-scaled linear spectrogram of waveform `y`.

    Args:
      y: Waveform samples
      hparams: Hyperparameters object; `ref_level_db` is read here and the object is
        forwarded to _normalize so that overrides are honoured
    '''
    # NOTE(review): _stft is called without hparams because its signature is not visible
    # here; confirm whether it should receive the override as well.
    D = _stft(y)
    S = _amp_to_db(np.abs(D)) - hparams.ref_level_db
    return _normalize(S, hparams)
def inv_spectrogram(spectrogram, hparams=hparams):
    '''Converts spectrogram to waveform using librosa

    Args:
      spectrogram: Normalized spectrogram, as produced by the forward transform
      hparams: Hyperparameters object; `ref_level_db` and `power` are read here and the
        object is forwarded to _denormalize and _griffin_lim so overrides are honoured
    '''
    # Convert back to linear
    S = _db_to_amp(_denormalize(spectrogram, hparams) + hparams.ref_level_db)
    # Reconstruct phase
    return _griffin_lim(S ** hparams.power, hparams)
def inv_spectrogram_tensorflow(spectrogram, hparams=hparams):
    '''Builds computational graph to convert spectrogram to waveform using TensorFlow.

    Args:
      spectrogram: Normalized spectrogram tensor
      hparams: Hyperparameters object; `ref_level_db` and `power` are read here and the
        object is forwarded to the TensorFlow helpers so overrides are honoured
    '''
    denormalized = _denormalize_tensorflow(spectrogram, hparams)
    S = _db_to_amp_tensorflow(denormalized + hparams.ref_level_db)
    return _griffin_lim_tensorflow(tf.pow(S, hparams.power), hparams)
def melspectrogram(y, hparams=hparams):
    '''Computes the normalized, dB-scaled mel spectrogram of waveform `y`.

    Args:
      y: Waveform samples
      hparams: Hyperparameters object; `ref_level_db` is read here and the object is
        forwarded to _normalize so that overrides are honoured
    '''
    # NOTE(review): _stft and _linear_to_mel are called without hparams (the latter takes
    # no such parameter), so they keep using whatever hparams they resolve themselves.
    D = _stft(y)
    S = _amp_to_db(_linear_to_mel(np.abs(D))) - hparams.ref_level_db
    return _normalize(S, hparams)
def find_endpoint(wav, threshold_db=-40, min_silence_sec=0.8):
def find_endpoint(wav, threshold_db=-40, min_silence_sec=0.8, hparams=hparams):
window_length = int(hparams.sample_rate * min_silence_sec)
hop_length = int(window_length / 4)
threshold = _db_to_amp(threshold_db)
@ -50,7 +60,7 @@ def find_endpoint(wav, threshold_db=-40, min_silence_sec=0.8):
return len(wav)
def _griffin_lim(S):
def _griffin_lim(S, hparams=hparams):
'''librosa implementation of Griffin-Lim
Based on https://github.com/librosa/librosa/issues/434
@ -63,7 +73,7 @@ def _griffin_lim(S):
return y
def _griffin_lim_tensorflow(S):
def _griffin_lim_tensorflow(S, hparams=hparams):
'''TensorFlow implementation of Griffin-Lim
Based on https://github.com/Kyubyong/tensorflow-exercises/blob/master/Audio_Processing.ipynb
@ -99,7 +109,7 @@ def _istft_tensorflow(stfts):
return tf.contrib.signal.inverse_stft(stfts, win_length, hop_length, n_fft)
def _stft_parameters():
def _stft_parameters(hparams=hparams):
n_fft = (hparams.num_freq - 1) * 2
hop_length = int(hparams.frame_shift_ms / 1000 * hparams.sample_rate)
win_length = int(hparams.frame_length_ms / 1000 * hparams.sample_rate)
@ -110,31 +120,39 @@ def _stft_parameters():
# Lazily-built mel filterbank, shared by every call to _linear_to_mel.
_mel_basis = None


def _linear_to_mel(spectrogram):
    '''Projects a linear-frequency spectrogram onto the cached mel basis.

    The basis is built on first use by calling _build_mel_basis() with no arguments and
    is reused afterwards.
    '''
    global _mel_basis
    if _mel_basis is None:
        _mel_basis = _build_mel_basis()
    return np.dot(_mel_basis, spectrogram)
def _build_mel_basis(hparams=hparams):
    '''Builds the mel filterbank matrix from the given hyperparameters.

    Reads `num_freq`, `sample_rate`, `num_mels`, `min_mel_freq` and `max_mel_freq`.
    '''
    n_fft = (hparams.num_freq - 1) * 2
    # Keyword arguments throughout: newer librosa releases reject positional sr / n_fft.
    return librosa.filters.mel(
        sr=hparams.sample_rate,
        n_fft=n_fft,
        n_mels=hparams.num_mels,
        fmin=hparams.min_mel_freq,
        fmax=hparams.max_mel_freq)
def _amp_to_db(x):
    '''Converts amplitude to decibels (20 * log10), flooring the input at 1e-5.'''
    # The floor keeps log10 away from zero, so the result never drops below -100 dB.
    floored = np.maximum(1e-5, x)
    return 20 * np.log10(floored)
def _db_to_amp(x):
    '''Converts decibels back to amplitude: 10 ** (x / 20).'''
    exponent = x * 0.05
    return np.power(10.0, exponent)
def _db_to_amp_tensorflow(x):
    '''TensorFlow counterpart of _db_to_amp: raises 10 to the power x * 0.05, element-wise.'''
    # The base is built as a tensor of tens with the same shape as x.
    base = tf.ones(tf.shape(x)) * 10.0
    return tf.pow(base, x * 0.05)
def _normalize(S, hparams=hparams):
    '''Rescales S relative to `hparams.min_level_db` and clips the result to [0, 1].'''
    floor_db = hparams.min_level_db
    return np.clip((S - floor_db) / -floor_db, 0, 1)
def _denormalize(S, hparams=hparams):
    '''Inverse of _normalize: clips S to [0, 1], then rescales using `hparams.min_level_db`.'''
    floor_db = hparams.min_level_db
    return (np.clip(S, 0, 1) * -floor_db) + floor_db
def _denormalize_tensorflow(S, hparams=hparams):
    '''TensorFlow counterpart of _denormalize: clips S to [0, 1], then rescales using
    `hparams.min_level_db`.'''
    floor_db = hparams.min_level_db
    return (tf.clip_by_value(S, 0, 1) * -floor_db) + floor_db
