From 8f1f0155c550fb9725c6748366a111df1f379194 Mon Sep 17 00:00:00 2001
From: Michael Nguyen
Date: Wed, 27 Jun 2018 14:12:39 -0500
Subject: [PATCH] preprocess.py can now set hparams; added mailabs dataset

---
 datasets/mailabs.py | 84 +++++++++++++++++++++++++++++++++++++++++++
 hparams.py          | 61 ++++++++++++++++++----------------
 preprocess.py       | 51 ++++++++++++++++++++++++----
 train.py            |  5 ++-
 util/audio.py       | 56 ++++++++++++++++++++-----------
 5 files changed, 202 insertions(+), 55 deletions(-)
 create mode 100644 datasets/mailabs.py

diff --git a/datasets/mailabs.py b/datasets/mailabs.py
new file mode 100644
index 0000000..5eefa53
--- /dev/null
+++ b/datasets/mailabs.py
@@ -0,0 +1,84 @@
+from concurrent.futures import ProcessPoolExecutor
+from functools import partial
+import numpy as np
+import os
+from util import audio
+
+
+def build_from_path(in_dir, out_dir, books, hparams, num_workers=1, tqdm=lambda x: x):
+    '''Preprocesses the M-AILABS speech dataset from a given input path into a given output directory.
+
+    Args:
+      in_dir: The directory where you have downloaded the M-AILABS dataset
+      out_dir: The directory to write the output into
+      books: Comma-separated list of book names to preprocess
+      hparams: Hyperparameters to use for audio processing
+      num_workers: Optional number of worker processes to parallelize across
+      tqdm: You can optionally pass tqdm to get a nice progress bar
+
+    Returns:
+      A list of tuples describing the training examples. This should be written to train.txt
+    '''
+
+    # We use ProcessPoolExecutor to parallelize across processes. This is just an optimization and you
+    # can omit it and just call _process_utterance on each input if you want.
+    executor = ProcessPoolExecutor(max_workers=num_workers)
+    futures = []
+    books = books.strip().split(',')
+    print('Preprocessing books:', books)
+    for book in books:
+        book_dir = os.path.join(in_dir, book)
+        with open(os.path.join(book_dir, 'metadata.csv'), encoding='utf-8') as f:
+            for line in f:
+                parts = line.strip().split('|')
+                name = parts[0]
+                wav_path = os.path.join(book_dir, 'wavs', '%s.wav' % name)
+                # Normalized version of the text, i.e. numbers converted to words
+                text = parts[2]
+                futures.append(
+                    executor.submit(partial(
+                        _process_utterance, out_dir, name, wav_path, text, hparams)
+                    ))
+    return [future.result() for future in tqdm(futures)]
+
+
+def _process_utterance(out_dir, name, wav_path, text, hparams):
+    '''Preprocesses a single utterance audio/text pair.
+
+    This writes the mel and linear scale spectrograms to disk and returns a tuple to write
+    to the train.txt file.
+
+    Args:
+      out_dir: The directory to write the spectrograms into
+      name: The utterance name to use in the spectrogram filenames
+      wav_path: Path to the audio file containing the speech input
+      text: The text spoken in the input audio file
+      hparams: Hyperparameters to use for audio processing
+
+    Returns:
+      A (spectrogram_filename, mel_filename, n_frames, text) tuple to write to train.txt
+    '''
+
+    # Load the audio to a numpy array:
+    wav = audio.load_wav(wav_path, hparams)
+
+    # Trim leading and trailing silence:
+    wav = audio.trim_silence(wav, hparams)
+
+    # Compute the linear-scale spectrogram from the wav:
+    spectrogram = audio.spectrogram(wav, hparams).astype(np.float32)
+    n_frames = spectrogram.shape[1]
+
+    # Compute a mel-scale spectrogram from the wav:
+    mel_spectrogram = audio.melspectrogram(wav, hparams).astype(np.float32)
+
+    # Write the spectrograms to disk:
+    spectrogram_filename = 'mailabs-spec-{}.npy'.format(name)
+    mel_filename = 'mailabs-mel-{}.npy'.format(name)
+    np.save(os.path.join(out_dir, spectrogram_filename),
+            spectrogram.T, allow_pickle=False)
+    np.save(os.path.join(out_dir, mel_filename),
+            mel_spectrogram.T, allow_pickle=False)
+
+    # Return a tuple describing this training example:
+    return (spectrogram_filename, mel_filename, n_frames, text)
diff --git a/hparams.py b/hparams.py
index dc81a65..152ac06 100644
--- a/hparams.py
+++ b/hparams.py
@@ -3,38 +3,43 @@ import tensorflow as tf
 
 # Default hyperparameters:
 hparams = tf.contrib.training.HParams(
-  # Comma-separated list of cleaners to run on text prior to training and eval. For non-English
-  # text, you may want to use "basic_cleaners" or "transliteration_cleaners" See TRAINING_DATA.md.
-  cleaners='english_cleaners',
+    # Comma-separated list of cleaners to run on text prior to training and eval. For non-English
+    # text, you may want to use "basic_cleaners" or "transliteration_cleaners". See TRAINING_DATA.md.
+    cleaners='english_cleaners',
 
-  # Audio:
-  num_mels=80,
-  num_freq=1025,
-  min_mel_freq=125,
-  max_mel_freq=7600,
-  sample_rate=22000,
-  frame_length_ms=50,
-  frame_shift_ms=12.5,
-  min_level_db=-100,
-  ref_level_db=20,
+    # Audio:
+    num_mels=80,
+    num_freq=1025,
+    min_mel_freq=125,
+    max_mel_freq=7600,
+    sample_rate=22000,
+    frame_length_ms=50,
+    frame_shift_ms=12.5,
+    min_level_db=-100,
+    ref_level_db=20,
 
-  # Model:
-  # TODO: add more configurable hparams
-  outputs_per_step=5,
-  embedding_dim=512,
+    # M-AILABS silence trimming parameters:
+    trim_fft_size=1024,
+    trim_hop_size=256,
+    trim_top_db=40,
 
-  # Training:
-  batch_size=32,
-  adam_beta1=0.9,
-  adam_beta2=0.999,
-  initial_learning_rate=0.0015,
-  learning_rate_decay_halflife=100000,
-  use_cmudict=True,  # Use CMUDict during training to learn pronunciation of ARPAbet phonemes
+    # Model:
+    # TODO: add more configurable hparams
+    outputs_per_step=5,
+    embedding_dim=512,
 
-  # Eval:
-  max_iters=200,
-  griffin_lim_iters=50,
-  power=1.5,  # Power to raise magnitudes to prior to Griffin-Lim
+    # Training:
+    batch_size=32,
+    adam_beta1=0.9,
+    adam_beta2=0.999,
+    initial_learning_rate=0.0015,
+    learning_rate_decay_halflife=100000,
+    use_cmudict=True,  # Use CMUDict during training to learn pronunciation of ARPAbet phonemes
+
+    # Eval:
+    max_iters=200,
+    griffin_lim_iters=50,
+    power=1.5,  # Power to raise magnitudes to prior to Griffin-Lim
 )
diff --git a/preprocess.py b/preprocess.py
index 1fef213..56fe0c1 100644
--- a/preprocess.py
+++ b/preprocess.py
@@ -2,7 +2,7 @@ import argparse
 import os
 from multiprocessing import cpu_count
 from tqdm import tqdm
-from datasets import amy, blizzard, ljspeech, kusal
+from datasets import amy, blizzard, ljspeech, kusal, mailabs
 from hparams import hparams
 
 
@@ -10,7 +10,8 @@ def preprocess_blizzard(args):
   in_dir = os.path.join(args.base_dir, 'Blizzard2012')
   out_dir = os.path.join(args.base_dir, args.output)
   os.makedirs(out_dir, exist_ok=True)
-  metadata = blizzard.build_from_path(in_dir, out_dir, args.num_workers, tqdm=tqdm)
+  metadata = blizzard.build_from_path(
+      in_dir, out_dir, args.num_workers, tqdm=tqdm)
   write_metadata(metadata, out_dir)
 
 
@@ -18,7 +19,8 @@ def preprocess_ljspeech(args):
   in_dir = os.path.join(args.base_dir, 'LJSpeech-1.0')
   out_dir = os.path.join(args.base_dir, args.output)
   os.makedirs(out_dir, exist_ok=True)
-  metadata = ljspeech.build_from_path(in_dir, out_dir, args.num_workers, tqdm=tqdm)
+  metadata = ljspeech.build_from_path(
+      in_dir, out_dir, args.num_workers, tqdm=tqdm)
   write_metadata(metadata, out_dir)
 
 
@@ -34,17 +36,29 @@ def preprocess_kusal(args):
   in_dir = os.path.join(args.base_dir, 'kusal')
   out_dir = os.path.join(args.base_dir, args.output)
   os.makedirs(out_dir, exist_ok=True)
-  metadata = kusal.build_from_path(in_dir, out_dir, args.num_workers, tqdm=tqdm)
+  metadata = kusal.build_from_path(
+      in_dir, out_dir, args.num_workers, tqdm=tqdm)
   write_metadata(metadata, out_dir)
 
 
-def write_metadata(metadata, out_dir):
+def preprocess_mailabs(args):
+  in_dir = args.mailabs_books_dir
+  out_dir = os.path.join(args.base_dir, args.output)
+  os.makedirs(out_dir, exist_ok=True)
+  books = args.books
+  metadata = mailabs.build_from_path(
+      in_dir, out_dir, books, args.hparams, args.num_workers, tqdm)
+  write_metadata(metadata, out_dir, args.hparams)
+
+
+def write_metadata(metadata, out_dir, hparams=hparams):
   with open(os.path.join(out_dir, 'train.txt'), 'w', encoding='utf-8') as f:
     for m in metadata:
       f.write('|'.join([str(x) for x in m]) + '\n')
   frames = sum([m[2] for m in metadata])
   hours = frames * hparams.frame_shift_ms / (3600 * 1000)
-  print('Wrote %d utterances, %d frames (%.2f hours)' % (len(metadata), frames, hours))
+  print('Wrote %d utterances, %d frames (%.2f hours)' %
+        (len(metadata), frames, hours))
   print('Max input length: %d' % max(len(m[3]) for m in metadata))
   print('Max output length: %d' % max(m[2] for m in metadata))
 
 
@@ -53,9 +67,30 @@ def main():
   parser = argparse.ArgumentParser()
   parser.add_argument('--base_dir', default=os.path.expanduser('~/tacotron'))
   parser.add_argument('--output', default='training')
-  parser.add_argument('--dataset', required=True, choices=['amy', 'blizzard', 'ljspeech', 'kusal'])
+  parser.add_argument(
+      '--dataset', required=True, choices=['amy', 'blizzard', 'ljspeech', 'kusal', 'mailabs']
+  )
+  parser.add_argument('--mailabs_books_dir',
+                      help='absolute path to the directory containing the mailabs books')
+  parser.add_argument(
+      '--books',
+      help='comma-separated list of book names (no spaces), e.g. hunter_space,pink_fairy_book',
+  )
+  parser.add_argument(
+      '--hparams', default='',
+      help='Hyperparameter overrides as a comma-separated list of name=value pairs')
   parser.add_argument('--num_workers', type=int, default=cpu_count())
   args = parser.parse_args()
+
+  if args.dataset == 'mailabs' and args.books is None:
+    parser.error("--books is required when --dataset is mailabs.")
+
+  if args.dataset == 'mailabs' and args.mailabs_books_dir is None:
+    parser.error(
+        "--mailabs_books_dir is required when --dataset is mailabs.")
+
+  args.hparams = hparams.parse(args.hparams)
+
   if args.dataset == 'amy':
     preprocess_amy(args)
   elif args.dataset == 'blizzard':
@@ -64,6 +99,8 @@
     preprocess_ljspeech(args)
   elif args.dataset == 'kusal':
     preprocess_kusal(args)
+  elif args.dataset == 'mailabs':
+    preprocess_mailabs(args)
 
 
 if __name__ == "__main__":
diff --git a/train.py b/train.py
index 773e7d0..cd28c41 100644
--- a/train.py
+++ b/train.py
@@ -73,7 +73,8 @@ def train(log_dir, args):
   saver = tf.train.Saver(max_to_keep=5, keep_checkpoint_every_n_hours=2)
 
   # Train!
-  with tf.Session() as sess:
+  gpu_options = tf.GPUOptions(per_process_gpu_memory_fraction=args.gpu_fraction)
+  with tf.Session(config=tf.ConfigProto(gpu_options=gpu_options)) as sess:
     try:
       summary_writer = tf.summary.FileWriter(log_dir, sess.graph)
       sess.run(tf.global_variables_initializer())
@@ -140,6 +141,8 @@ def main():
   parser.add_argument('--tf_log_level', type=int, default=1, help='Tensorflow C++ log level.')
   parser.add_argument('--git', action='store_true', help='If set, verify that the client is clean.')
   parser.add_argument('--gpu_assignment', default='0', help='Set the gpu the model should run on')
+  parser.add_argument('--gpu_fraction', type=float, default=1.0,
+                      help='Fraction of GPU memory to allocate (0.0-1.0).')
   args = parser.parse_args()
   os.environ['CUDA_VISIBLE_DEVICES'] = args.gpu_assignment
   os.environ['TF_CPP_MIN_LOG_LEVEL'] = str(args.tf_log_level)
diff --git a/util/audio.py b/util/audio.py
index 4f8c8c9..89b38e3 100644
--- a/util/audio.py
+++ b/util/audio.py
@@ -7,50 +7,60 @@ from scipy import signal
 from hparams import hparams
 
 
-def load_wav(path):
+def load_wav(path, hparams=hparams):
   return librosa.core.load(path, sr=hparams.sample_rate)[0]
 
 
-def save_wav(wav, path):
+def save_wav(wav, path, hparams=hparams):
   wav *= 32767 / max(0.01, np.max(np.abs(wav)))
   librosa.output.write_wav(path, wav.astype(np.int16), hparams.sample_rate)
 
 
-def spectrogram(y):
+def trim_silence(wav, hparams=hparams):
+  return librosa.effects.trim(
+      wav, top_db=hparams.trim_top_db,
+      frame_length=hparams.trim_fft_size,
+      hop_length=hparams.trim_hop_size)[0]
+
+
+def spectrogram(y, hparams=hparams):
   D = _stft(y)
   S = _amp_to_db(np.abs(D)) - hparams.ref_level_db
   return _normalize(S)
 
 
-def inv_spectrogram(spectrogram):
+def inv_spectrogram(spectrogram, hparams=hparams):
   '''Converts spectrogram to waveform using librosa'''
-  S = _db_to_amp(_denormalize(spectrogram) + hparams.ref_level_db)  # Convert back to linear
-  return _griffin_lim(S ** hparams.power)  # Reconstruct phase
+  S = _db_to_amp(_denormalize(spectrogram) +
+                 hparams.ref_level_db)  # Convert back to linear
+  # Reconstruct phase
+  return _griffin_lim(S ** hparams.power)
 
 
-def inv_spectrogram_tensorflow(spectrogram):
+def inv_spectrogram_tensorflow(spectrogram, hparams=hparams):
   '''Builds computational graph to convert spectrogram to waveform using TensorFlow.'''
-  S = _db_to_amp_tensorflow(_denormalize_tensorflow(spectrogram) + hparams.ref_level_db)
+  S = _db_to_amp_tensorflow(_denormalize_tensorflow(
+      spectrogram) + hparams.ref_level_db)
   return _griffin_lim_tensorflow(tf.pow(S, hparams.power))
 
 
-def melspectrogram(y):
+def melspectrogram(y, hparams=hparams):
   D = _stft(y)
   S = _amp_to_db(_linear_to_mel(np.abs(D))) - hparams.ref_level_db
   return _normalize(S)
 
 
-def find_endpoint(wav, threshold_db=-40, min_silence_sec=0.8):
+def find_endpoint(wav, threshold_db=-40, min_silence_sec=0.8, hparams=hparams):
   window_length = int(hparams.sample_rate * min_silence_sec)
   hop_length = int(window_length / 4)
   threshold = _db_to_amp(threshold_db)
   for x in range(hop_length, len(wav) - window_length, hop_length):
-    if np.max(wav[x:x+window_length]) < threshold:
+    if np.max(wav[x:x + window_length]) < threshold:
       return x + hop_length
   return len(wav)
 
 
-def _griffin_lim(S):
+def _griffin_lim(S, hparams=hparams):
   '''librosa implementation of Griffin-Lim
   Based on
   https://github.com/librosa/librosa/issues/434
   '''
@@ -63,7 +73,7 @@ def _griffin_lim(S):
   return y
 
 
-def _griffin_lim_tensorflow(S):
+def _griffin_lim_tensorflow(S, hparams=hparams):
   '''TensorFlow implementation of Griffin-Lim
   Based on https://github.com/Kyubyong/tensorflow-exercises/blob/master/Audio_Processing.ipynb
   '''
@@ -99,7 +109,7 @@ def _istft_tensorflow(stfts):
   return tf.contrib.signal.inverse_stft(stfts, win_length, hop_length, n_fft)
 
 
-def _stft_parameters():
+def _stft_parameters(hparams=hparams):
   n_fft = (hparams.num_freq - 1) * 2
   hop_length = int(hparams.frame_shift_ms / 1000 * hparams.sample_rate)
   win_length = int(hparams.frame_length_ms / 1000 * hparams.sample_rate)
@@ -110,31 +120,39 @@
 
 _mel_basis = None
 
+
 def _linear_to_mel(spectrogram):
   global _mel_basis
   if _mel_basis is None:
     _mel_basis = _build_mel_basis()
   return np.dot(_mel_basis, spectrogram)
 
-def _build_mel_basis():
+
+def _build_mel_basis(hparams=hparams):
   n_fft = (hparams.num_freq - 1) * 2
   return librosa.filters.mel(hparams.sample_rate, n_fft, n_mels=hparams.num_mels,
-    fmin=hparams.min_mel_freq, fmax=hparams.max_mel_freq)
+                             fmin=hparams.min_mel_freq, fmax=hparams.max_mel_freq)
+
 
 def _amp_to_db(x):
   return 20 * np.log10(np.maximum(1e-5, x))
 
+
 def _db_to_amp(x):
   return np.power(10.0, x * 0.05)
 
+
 def _db_to_amp_tensorflow(x):
   return tf.pow(tf.ones(tf.shape(x)) * 10.0, x * 0.05)
 
-def _normalize(S):
+
+def _normalize(S, hparams=hparams):
   return np.clip((S - hparams.min_level_db) / -hparams.min_level_db, 0, 1)
 
-def _denormalize(S):
+
+def _denormalize(S, hparams=hparams):
   return (np.clip(S, 0, 1) * -hparams.min_level_db) + hparams.min_level_db
 
-def _denormalize_tensorflow(S):
+
+def _denormalize_tensorflow(S, hparams=hparams):
   return (tf.clip_by_value(S, 0, 1) * -hparams.min_level_db) + hparams.min_level_db
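
For reviewers, a minimal usage sketch of the new preprocessing path. It mirrors what preprocess.py now does for --dataset mailabs; the dataset path, worker count, and hparams override values below are illustrative, not part of the patch (the book names come from the --books help text):

    from tqdm import tqdm
    from datasets import mailabs
    from hparams import hparams

    # Override any default hyperparameter from a comma-separated name=value
    # string, exactly as the new --hparams flag does.
    hp = hparams.parse('trim_top_db=40,sample_rate=22000')

    # Equivalent CLI:
    #   python preprocess.py --dataset mailabs \
    #       --mailabs_books_dir /path/to/mailabs/books \
    #       --books hunter_space,pink_fairy_book
    metadata = mailabs.build_from_path(
        in_dir='/path/to/mailabs/books',   # hypothetical download location
        out_dir='training',
        books='hunter_space,pink_fairy_book',
        hparams=hp,
        num_workers=4,
        tqdm=tqdm)

    # Training can now cap GPU memory via the new flag, e.g.:
    #   python train.py --gpu_fraction 0.5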