Merge pull request #9 from MycroftAI/kusal

preprocess.py now can set hparams, added mailabs dataset
pull/15/head
Michael Nguyen 2018-06-27 14:14:14 -05:00 committed by GitHub
commit 0d77c0c135
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
5 changed files with 199 additions and 55 deletions

81
datasets/mailabs.py Normal file
View File

@ -0,0 +1,81 @@
from concurrent.futures import ProcessPoolExecutor
from functools import partial
import numpy as np
import os
from util import audio
def build_from_path(in_dir, out_dir, books, hparams, num_workers=1, tqdm=lambda x: x):
    '''Preprocesses the M-AILABS Speech dataset from a given input path into a given output directory.

    Every requested book is expected to live in its own sub-directory of in_dir and to contain a
    'metadata.csv' file (one 'name|text|normalized text' record per line) next to a 'wavs' folder.

    Args:
      in_dir: The directory that contains one sub-directory per book
      out_dir: The directory to write the output into
      books: Comma-separated names of the book sub-directories to process
      hparams: Hyperparameters forwarded unchanged to _process_utterance
      num_workers: Optional number of worker processes to parallelize across
      tqdm: You can optionally pass tqdm to get a nice progress bar

    Returns:
      A list of tuples describing the training examples. This should be written to train.txt

    Raises:
      ValueError: If a non-blank metadata line has fewer than three '|'-separated fields
    '''
    # Tolerate whitespace around names and stray/trailing commas: an empty book name would
    # otherwise resolve to in_dir itself.
    book_names = [book.strip() for book in books.split(',')]
    book_names = [book for book in book_names if book]
    print('preprocess these books', book_names)

    # Parse all metadata up front so malformed input is reported before any worker is started.
    jobs = []
    for book in book_names:
        book_dir = os.path.join(in_dir, book)
        metadata_path = os.path.join(book_dir, 'metadata.csv')
        with open(metadata_path, encoding='utf-8') as f:
            for line_number, line in enumerate(f, start=1):
                line = line.strip()
                if not line:
                    # Blank (e.g. trailing) lines carry no utterance.
                    continue
                parts = line.split('|')
                if len(parts) < 3:
                    raise ValueError(
                        '%s:%d: expected at least 3 "|"-separated fields, got %d'
                        % (metadata_path, line_number, len(parts)))
                name = parts[0]
                wav_path = os.path.join(book_dir, 'wavs', '%s.wav' % name)
                # Third field is the normalized version of the text, i.e. numbers converted to words.
                text = parts[2]
                jobs.append((name, wav_path, text))

    if not jobs:
        # Nothing to process, so do not spin up a process pool at all.
        return []

    # We use ProcessPoolExecutor to parallelize across processes. This is just an optimization and
    # you can omit it and just call _process_utterance on each input if you want.
    # The context manager guarantees the worker processes are shut down, even on error.
    with ProcessPoolExecutor(max_workers=num_workers) as executor:
        futures = [
            executor.submit(_process_utterance, out_dir, name, wav_path, text, hparams)
            for name, wav_path, text in jobs
        ]
        return [future.result() for future in tqdm(futures)]
def _process_utterance(out_dir, name, wav_path, text, hparams):
    '''Turns one audio/text pair into spectrogram files plus a metadata record.

    The linear-scale and mel-scale spectrograms are saved (transposed) as .npy files in out_dir,
    and the returned tuple is what gets written to train.txt for this utterance.

    Args:
      out_dir: The directory to write the spectrograms into
      name: Utterance identifier, embedded in both output filenames
      wav_path: Path to the audio file containing the speech input
      text: The text spoken in the input audio file
      hparams: Hyperparameters handed to every util.audio call

    Returns:
      A (spectrogram_filename, mel_filename, n_frames, text) tuple to write to train.txt
    '''
    # Read the waveform as a numpy array, then strip silence from it.
    samples = audio.trim_silence(audio.load_wav(wav_path, hparams), hparams)

    # Linear-scale and mel-scale features derived from the same trimmed samples.
    linear = audio.spectrogram(samples, hparams).astype(np.float32)
    mel = audio.melspectrogram(samples, hparams).astype(np.float32)

    linear_name = 'mailabs-spec-{}.npy'.format(name)
    mel_name = 'mailabs-mel-{}.npy'.format(name)

    # Persist both feature arrays, transposed, without pickling.
    for filename, features in ((linear_name, linear), (mel_name, mel)):
        np.save(os.path.join(out_dir, filename), features.T, allow_pickle=False)

    # n_frames is the second-axis length of the (untransposed) linear spectrogram.
    return (linear_name, mel_name, linear.shape[1], text)

View File

@ -3,38 +3,43 @@ import tensorflow as tf
# Default hyperparameters: # Default hyperparameters:
hparams = tf.contrib.training.HParams( hparams = tf.contrib.training.HParams(
# Comma-separated list of cleaners to run on text prior to training and eval. For non-English # Comma-separated list of cleaners to run on text prior to training and eval. For non-English
# text, you may want to use "basic_cleaners" or "transliteration_cleaners" See TRAINING_DATA.md. # text, you may want to use "basic_cleaners" or "transliteration_cleaners" See TRAINING_DATA.md.
cleaners='english_cleaners', cleaners='english_cleaners',
# Audio: # Audio:
num_mels=80, num_mels=80,
num_freq=1025, num_freq=1025,
min_mel_freq=125, min_mel_freq=125,
max_mel_freq=7600, max_mel_freq=7600,
sample_rate=22000, sample_rate=22000,
frame_length_ms=50, frame_length_ms=50,
frame_shift_ms=12.5, frame_shift_ms=12.5,
min_level_db=-100, min_level_db=-100,
ref_level_db=20, ref_level_db=20,
# Model: #MAILABS trim params
# TODO: add more configurable hparams trim_fft_size=1024,
outputs_per_step=5, trim_hop_size=256,
embedding_dim=512, trim_top_db=40,
# Training: # Model:
batch_size=32, # TODO: add more configurable hparams
adam_beta1=0.9, outputs_per_step=5,
adam_beta2=0.999, embedding_dim=512,
initial_learning_rate=0.0015,
learning_rate_decay_halflife=100000,
use_cmudict=True, # Use CMUDict during training to learn pronunciation of ARPAbet phonemes
# Eval: # Training:
max_iters=200, batch_size=32,
griffin_lim_iters=50, adam_beta1=0.9,
power=1.5, # Power to raise magnitudes to prior to Griffin-Lim adam_beta2=0.999,
initial_learning_rate=0.0015,
learning_rate_decay_halflife=100000,
use_cmudict=True, # Use CMUDict during training to learn pronunciation of ARPAbet phonemes
# Eval:
max_iters=200,
griffin_lim_iters=50,
power=1.5, # Power to raise magnitudes to prior to Griffin-Lim
) )

View File

@ -2,7 +2,7 @@ import argparse
import os import os
from multiprocessing import cpu_count from multiprocessing import cpu_count
from tqdm import tqdm from tqdm import tqdm
from datasets import amy, blizzard, ljspeech, kusal from datasets import amy, blizzard, ljspeech, kusal, mailabs
from hparams import hparams from hparams import hparams
@ -10,7 +10,8 @@ def preprocess_blizzard(args):
in_dir = os.path.join(args.base_dir, 'Blizzard2012') in_dir = os.path.join(args.base_dir, 'Blizzard2012')
out_dir = os.path.join(args.base_dir, args.output) out_dir = os.path.join(args.base_dir, args.output)
os.makedirs(out_dir, exist_ok=True) os.makedirs(out_dir, exist_ok=True)
metadata = blizzard.build_from_path(in_dir, out_dir, args.num_workers, tqdm=tqdm) metadata = blizzard.build_from_path(
in_dir, out_dir, args.num_workers, tqdm=tqdm)
write_metadata(metadata, out_dir) write_metadata(metadata, out_dir)
@ -18,7 +19,8 @@ def preprocess_ljspeech(args):
in_dir = os.path.join(args.base_dir, 'LJSpeech-1.0') in_dir = os.path.join(args.base_dir, 'LJSpeech-1.0')
out_dir = os.path.join(args.base_dir, args.output) out_dir = os.path.join(args.base_dir, args.output)
os.makedirs(out_dir, exist_ok=True) os.makedirs(out_dir, exist_ok=True)
metadata = ljspeech.build_from_path(in_dir, out_dir, args.num_workers, tqdm=tqdm) metadata = ljspeech.build_from_path(
in_dir, out_dir, args.num_workers, tqdm=tqdm)
write_metadata(metadata, out_dir) write_metadata(metadata, out_dir)
@ -34,17 +36,29 @@ def preprocess_kusal(args):
in_dir = os.path.join(args.base_dir, 'kusal') in_dir = os.path.join(args.base_dir, 'kusal')
out_dir = os.path.join(args.base_dir, args.output) out_dir = os.path.join(args.base_dir, args.output)
os.makedirs(out_dir, exist_ok=True) os.makedirs(out_dir, exist_ok=True)
metadata = kusal.build_from_path(in_dir, out_dir, args.num_workers, tqdm=tqdm) metadata = kusal.build_from_path(
in_dir, out_dir, args.num_workers, tqdm=tqdm)
write_metadata(metadata, out_dir) write_metadata(metadata, out_dir)
def write_metadata(metadata, out_dir): def preprocess_mailabs(args):
in_dir = os.path.join(args.mailabs_books_dir)
out_dir = os.path.join(args.base_dir, args.output)
os.makedirs(out_dir, exist_ok=True)
books = args.books
metadata = mailabs.build_from_path(
in_dir, out_dir, books, args.hparams, args.num_workers, tqdm)
write_metadata(metadata, out_dir, args.hparams)
def write_metadata(metadata, out_dir, hparams=hparams):
with open(os.path.join(out_dir, 'train.txt'), 'w', encoding='utf-8') as f: with open(os.path.join(out_dir, 'train.txt'), 'w', encoding='utf-8') as f:
for m in metadata: for m in metadata:
f.write('|'.join([str(x) for x in m]) + '\n') f.write('|'.join([str(x) for x in m]) + '\n')
frames = sum([m[2] for m in metadata]) frames = sum([m[2] for m in metadata])
hours = frames * hparams.frame_shift_ms / (3600 * 1000) hours = frames * hparams.frame_shift_ms / (3600 * 1000)
print('Wrote %d utterances, %d frames (%.2f hours)' % (len(metadata), frames, hours)) print('Wrote %d utterances, %d frames (%.2f hours)' %
(len(metadata), frames, hours))
print('Max input length: %d' % max(len(m[3]) for m in metadata)) print('Max input length: %d' % max(len(m[3]) for m in metadata))
print('Max output length: %d' % max(m[2] for m in metadata)) print('Max output length: %d' % max(m[2] for m in metadata))
@ -53,9 +67,30 @@ def main():
parser = argparse.ArgumentParser() parser = argparse.ArgumentParser()
parser.add_argument('--base_dir', default=os.path.expanduser('~/tacotron')) parser.add_argument('--base_dir', default=os.path.expanduser('~/tacotron'))
parser.add_argument('--output', default='training') parser.add_argument('--output', default='training')
parser.add_argument('--dataset', required=True, choices=['amy', 'blizzard', 'ljspeech', 'kusal']) parser.add_argument(
'--dataset', required=True, choices=['amy', 'blizzard', 'ljspeech', 'kusal', 'mailabs']
)
parser.add_argument('--mailabs_books_dir',
help='absolute directory to the books for the mlailabs')
parser.add_argument(
'--books',
help='comma-seperated and no space name of books i.e hunter_space,pink_fairy_book,etc.',
)
parser.add_argument(
'--hparams', default='',
help='Hyperparameter overrides as a comma-separated list of name=value pairs')
parser.add_argument('--num_workers', type=int, default=cpu_count()) parser.add_argument('--num_workers', type=int, default=cpu_count())
args = parser.parse_args() args = parser.parse_args()
if args.dataset == 'mailabs' and args.books is None:
parser.error("--books required if mailabs is chosen for dataset.")
if args.dataset == 'mailabs' and args.mailabs_books_dir is None:
parser.error(
"--mailabs_books_dir required if mailabs is chosen for dataset.")
args.hparams = hparams.parse(args.hparams)
if args.dataset == 'amy': if args.dataset == 'amy':
preprocess_amy(args) preprocess_amy(args)
elif args.dataset == 'blizzard': elif args.dataset == 'blizzard':
@ -64,6 +99,8 @@ def main():
preprocess_ljspeech(args) preprocess_ljspeech(args)
elif args.dataset == 'kusal': elif args.dataset == 'kusal':
preprocess_kusal(args) preprocess_kusal(args)
elif args.dataset == 'mailabs':
preprocess_mailabs(args)
if __name__ == "__main__": if __name__ == "__main__":

View File

@ -73,7 +73,8 @@ def train(log_dir, args):
saver = tf.train.Saver(max_to_keep=5, keep_checkpoint_every_n_hours=2) saver = tf.train.Saver(max_to_keep=5, keep_checkpoint_every_n_hours=2)
# Train! # Train!
with tf.Session() as sess: gpu_options = tf.GPUOptions(per_process_gpu_memory_fraction=args.gpu_fraction)
with tf.Session(config=tf.ConfigProto(gpu_options=gpu_options)) as sess:
try: try:
summary_writer = tf.summary.FileWriter(log_dir, sess.graph) summary_writer = tf.summary.FileWriter(log_dir, sess.graph)
sess.run(tf.global_variables_initializer()) sess.run(tf.global_variables_initializer())
@ -140,6 +141,8 @@ def main():
parser.add_argument('--tf_log_level', type=int, default=1, help='Tensorflow C++ log level.') parser.add_argument('--tf_log_level', type=int, default=1, help='Tensorflow C++ log level.')
parser.add_argument('--git', action='store_true', help='If set, verify that the client is clean.') parser.add_argument('--git', action='store_true', help='If set, verify that the client is clean.')
parser.add_argument('--gpu_assignment', default='0', help='Set the gpu the model should run on') parser.add_argument('--gpu_assignment', default='0', help='Set the gpu the model should run on')
parser.add_argument('--gpu_fraction', type=float, default='1.0',
help='Set the fraction of gpu memory to allocate. 0 - 1')
args = parser.parse_args() args = parser.parse_args()
os.environ['CUDA_VISIBLE_DEVICES'] = args.gpu_assignment os.environ['CUDA_VISIBLE_DEVICES'] = args.gpu_assignment
os.environ['TF_CPP_MIN_LOG_LEVEL'] = str(args.tf_log_level) os.environ['TF_CPP_MIN_LOG_LEVEL'] = str(args.tf_log_level)

View File

@ -7,50 +7,60 @@ from scipy import signal
from hparams import hparams from hparams import hparams
def load_wav(path): def load_wav(path, hparams=hparams):
return librosa.core.load(path, sr=hparams.sample_rate)[0] return librosa.core.load(path, sr=hparams.sample_rate)[0]
def save_wav(wav, path): def save_wav(wav, path, hparams=hparams):
wav *= 32767 / max(0.01, np.max(np.abs(wav))) wav *= 32767 / max(0.01, np.max(np.abs(wav)))
librosa.output.write_wav(path, wav.astype(np.int16), hparams.sample_rate) librosa.output.write_wav(path, wav.astype(np.int16), hparams.sample_rate)
def spectrogram(y): def trim_silence(wav, hparams=hparams):
return librosa.effects.trim(
wav, top_db=hparams.trim_top_db,
frame_length=hparams.trim_fft_size,
hop_length=hparams.trim_hop_size)[0]
def spectrogram(y, hparams=hparams):
D = _stft(y) D = _stft(y)
S = _amp_to_db(np.abs(D)) - hparams.ref_level_db S = _amp_to_db(np.abs(D)) - hparams.ref_level_db
return _normalize(S) return _normalize(S)
def inv_spectrogram(spectrogram): def inv_spectrogram(spectrogram, hparams=hparams):
'''Converts spectrogram to waveform using librosa''' '''Converts spectrogram to waveform using librosa'''
S = _db_to_amp(_denormalize(spectrogram) + hparams.ref_level_db) # Convert back to linear S = _db_to_amp(_denormalize(spectrogram) +
return _griffin_lim(S ** hparams.power) # Reconstruct phase hparams.ref_level_db) # Convert back to linear
# Reconstruct phase
return _griffin_lim(S ** hparams.power)
def inv_spectrogram_tensorflow(spectrogram): def inv_spectrogram_tensorflow(spectrogram, hparams=hparams):
'''Builds computational graph to convert spectrogram to waveform using TensorFlow.''' '''Builds computational graph to convert spectrogram to waveform using TensorFlow.'''
S = _db_to_amp_tensorflow(_denormalize_tensorflow(spectrogram) + hparams.ref_level_db) S = _db_to_amp_tensorflow(_denormalize_tensorflow(
spectrogram) + hparams.ref_level_db)
return _griffin_lim_tensorflow(tf.pow(S, hparams.power)) return _griffin_lim_tensorflow(tf.pow(S, hparams.power))
def melspectrogram(y): def melspectrogram(y, hparams=hparams):
D = _stft(y) D = _stft(y)
S = _amp_to_db(_linear_to_mel(np.abs(D))) - hparams.ref_level_db S = _amp_to_db(_linear_to_mel(np.abs(D))) - hparams.ref_level_db
return _normalize(S) return _normalize(S)
def find_endpoint(wav, threshold_db=-40, min_silence_sec=0.8): def find_endpoint(wav, threshold_db=-40, min_silence_sec=0.8, hparams=hparams):
window_length = int(hparams.sample_rate * min_silence_sec) window_length = int(hparams.sample_rate * min_silence_sec)
hop_length = int(window_length / 4) hop_length = int(window_length / 4)
threshold = _db_to_amp(threshold_db) threshold = _db_to_amp(threshold_db)
for x in range(hop_length, len(wav) - window_length, hop_length): for x in range(hop_length, len(wav) - window_length, hop_length):
if np.max(wav[x:x+window_length]) < threshold: if np.max(wav[x:x + window_length]) < threshold:
return x + hop_length return x + hop_length
return len(wav) return len(wav)
def _griffin_lim(S): def _griffin_lim(S, hparams=hparams):
'''librosa implementation of Griffin-Lim '''librosa implementation of Griffin-Lim
Based on https://github.com/librosa/librosa/issues/434 Based on https://github.com/librosa/librosa/issues/434
''' '''
@ -63,7 +73,7 @@ def _griffin_lim(S):
return y return y
def _griffin_lim_tensorflow(S): def _griffin_lim_tensorflow(S, hparams=hparams):
'''TensorFlow implementation of Griffin-Lim '''TensorFlow implementation of Griffin-Lim
Based on https://github.com/Kyubyong/tensorflow-exercises/blob/master/Audio_Processing.ipynb Based on https://github.com/Kyubyong/tensorflow-exercises/blob/master/Audio_Processing.ipynb
''' '''
@ -99,7 +109,7 @@ def _istft_tensorflow(stfts):
return tf.contrib.signal.inverse_stft(stfts, win_length, hop_length, n_fft) return tf.contrib.signal.inverse_stft(stfts, win_length, hop_length, n_fft)
def _stft_parameters(): def _stft_parameters(hparams=hparams):
n_fft = (hparams.num_freq - 1) * 2 n_fft = (hparams.num_freq - 1) * 2
hop_length = int(hparams.frame_shift_ms / 1000 * hparams.sample_rate) hop_length = int(hparams.frame_shift_ms / 1000 * hparams.sample_rate)
win_length = int(hparams.frame_length_ms / 1000 * hparams.sample_rate) win_length = int(hparams.frame_length_ms / 1000 * hparams.sample_rate)
@ -110,31 +120,39 @@ def _stft_parameters():
_mel_basis = None _mel_basis = None
def _linear_to_mel(spectrogram): def _linear_to_mel(spectrogram):
global _mel_basis global _mel_basis
if _mel_basis is None: if _mel_basis is None:
_mel_basis = _build_mel_basis() _mel_basis = _build_mel_basis()
return np.dot(_mel_basis, spectrogram) return np.dot(_mel_basis, spectrogram)
def _build_mel_basis():
def _build_mel_basis(hparams=hparams):
n_fft = (hparams.num_freq - 1) * 2 n_fft = (hparams.num_freq - 1) * 2
return librosa.filters.mel(hparams.sample_rate, n_fft, n_mels=hparams.num_mels, return librosa.filters.mel(hparams.sample_rate, n_fft, n_mels=hparams.num_mels,
fmin=hparams.min_mel_freq, fmax=hparams.max_mel_freq) fmin=hparams.min_mel_freq, fmax=hparams.max_mel_freq)
def _amp_to_db(x): def _amp_to_db(x):
return 20 * np.log10(np.maximum(1e-5, x)) return 20 * np.log10(np.maximum(1e-5, x))
def _db_to_amp(x): def _db_to_amp(x):
return np.power(10.0, x * 0.05) return np.power(10.0, x * 0.05)
def _db_to_amp_tensorflow(x): def _db_to_amp_tensorflow(x):
return tf.pow(tf.ones(tf.shape(x)) * 10.0, x * 0.05) return tf.pow(tf.ones(tf.shape(x)) * 10.0, x * 0.05)
def _normalize(S):
def _normalize(S, hparams=hparams):
return np.clip((S - hparams.min_level_db) / -hparams.min_level_db, 0, 1) return np.clip((S - hparams.min_level_db) / -hparams.min_level_db, 0, 1)
def _denormalize(S):
def _denormalize(S, hparams=hparams):
return (np.clip(S, 0, 1) * -hparams.min_level_db) + hparams.min_level_db return (np.clip(S, 0, 1) * -hparams.min_level_db) + hparams.min_level_db
def _denormalize_tensorflow(S):
def _denormalize_tensorflow(S, hparams=hparams):
return (tf.clip_by_value(S, 0, 1) * -hparams.min_level_db) + hparams.min_level_db return (tf.clip_by_value(S, 0, 1) * -hparams.min_level_db) + hparams.min_level_db