took out hparams for preprocessing

pull/15/head
Michael Nguyen 2018-07-03 19:41:44 -05:00
parent bda7cdad7a
commit ea21e9d30e
3 changed files with 33 additions and 32 deletions

View File

@@ -1,3 +1,8 @@
+"""mailabs dataset is sampled at 16000 Hz with 0.5 seconds of silence
+at the start and end of the audio data. Make sure to change the
+sample_size hparams to match this.
+"""
+
 from concurrent.futures import ProcessPoolExecutor
 from functools import partial
 import numpy as np
@@ -5,11 +10,11 @@ import os
 from util import audio


-def build_from_path(in_dir, out_dir, books, hparams, num_workers=1, tqdm=lambda x: x):
+def build_from_path(in_dir, out_dir, books, num_workers=1, tqdm=lambda x: x):
     '''Preprocesses the mailabs Speech dataset from a given input path into a given output directory.

     Args:
-      in_dir: The directory where you have downloaded the LJ Speech dataset
+      in_dir: The directory where you have downloaded the mailabs Speech dataset
       out_dir: The directory to write the output into
       num_workers: Optional number of worker processes to parallelize across
       tqdm: You can optionally pass tqdm to get a nice progress bar
@@ -35,12 +40,12 @@ def build_from_path(in_dir, out_dir, books, hparams, num_workers=1, tqdm=lambda
             text = parts[2]
             futures.append(
                 executor.submit(partial(
-                    _process_utterance, out_dir, name, wav_path, text, hparams)
+                    _process_utterance, out_dir, name, wav_path, text)
                 ))
     return [future.result() for future in tqdm(futures)]


-def _process_utterance(out_dir, name, wav_path, text, hparams):
+def _process_utterance(out_dir, name, wav_path, text):
     '''Preprocesses a single utterance audio/text pair.

     This writes the mel and linear scale spectrograms to disk and returns a tuple to write
@@ -57,17 +62,17 @@ def _process_utterance(out_dir, name, wav_path, text, hparams):
     '''

     # Load the audio to a numpy array:
-    wav = audio.load_wav(wav_path, hparams)
+    wav = audio.load_wav(wav_path)

     # trim silences here
-    wav = audio.trim_silence(wav, hparams)
+    wav = audio.trim_silence(wav)

     # Compute the linear-scale spectrogram from the wav:
-    spectrogram = audio.spectrogram(wav, hparams).astype(np.float32)
+    spectrogram = audio.spectrogram(wav).astype(np.float32)
     n_frames = spectrogram.shape[1]

     # Compute a mel-scale spectrogram from the wav:
-    mel_spectrogram = audio.melspectrogram(wav, hparams).astype(np.float32)
+    mel_spectrogram = audio.melspectrogram(wav).astype(np.float32)

     # Write the spectrograms to disk:
     spectrogram_filename = 'mailabs-spec-{}.npy'.format(name)
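
For orientation, a minimal sketch of how the preprocessing entry point is called after this change. The paths and book names below are placeholders, not values from the repository, and books is assumed to be the comma-separated string taken from the --books flag:

# Illustrative only: build_from_path no longer takes an hparams argument; the
# audio helpers it calls read the shared module-level hparams instead.
from tqdm import tqdm
from datasets import mailabs

metadata = mailabs.build_from_path(
    '/path/to/mailabs',               # in_dir (placeholder)
    '/path/to/training_output',       # out_dir (placeholder)
    'hunter_space,pink_fairy_book',   # books, as taken from --books
    4,                                # num_workers
    tqdm)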

View File

@@ -3,7 +3,7 @@ import os
 from multiprocessing import cpu_count
 from tqdm import tqdm
 from datasets import amy, blizzard, ljspeech, kusal, mailabs
-from hparams import hparams
+from hparams import hparams, hparams_debug_string


 def preprocess_blizzard(args):
@@ -47,11 +47,11 @@ def preprocess_mailabs(args):
     os.makedirs(out_dir, exist_ok=True)
     books = args.books
     metadata = mailabs.build_from_path(
-        in_dir, out_dir, books, args.hparams, args.num_workers, tqdm)
-    write_metadata(metadata, out_dir, args.hparams)
+        in_dir, out_dir, books, args.num_workers, tqdm)
+    write_metadata(metadata, out_dir)


-def write_metadata(metadata, out_dir, hparams=hparams):
+def write_metadata(metadata, out_dir):
     with open(os.path.join(out_dir, 'train.txt'), 'w', encoding='utf-8') as f:
         for m in metadata:
             f.write('|'.join([str(x) for x in m]) + '\n')
@@ -76,9 +76,6 @@ def main():
         '--books',
         help='comma-seperated and no space name of books i.e hunter_space,pink_fairy_book,etc.',
     )
-    parser.add_argument(
-        '--hparams', default='',
-        help='Hyperparameter overrides as a comma-separated list of name=value pairs')
     parser.add_argument('--num_workers', type=int, default=cpu_count())

     args = parser.parse_args()
@@ -89,7 +86,7 @@ def main():
         parser.error(
             "--mailabs_books_dir required if mailabs is chosen for dataset.")

-    args.hparams = hparams.parse(args.hparams)
+    print(hparams_debug_string())

     if args.dataset == 'amy':
         preprocess_amy(args)
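
Because the --hparams override is gone from preprocessing, any value that must differ for a dataset (for example the 16 kHz sample rate the mailabs docstring mentions) now has to be set in hparams.py itself before running the preprocessor. A minimal sketch, assuming hparams is a tf.contrib.training.HParams object (which the removed hparams.parse() call suggests); the field name and value are illustrative:

# Sketch only: overrides now live in code rather than on the command line.
from hparams import hparams, hparams_debug_string

hparams.set_hparam('sample_rate', 16000)  # e.g. match the 16 kHz mailabs audio
print(hparams_debug_string())             # the values the preprocessor will now print and use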

View File

@@ -6,30 +6,29 @@ import tensorflow as tf
 from scipy import signal
 from hparams import hparams


-def load_wav(path, hparams=hparams):
+def load_wav(path):
     return librosa.core.load(path, sr=hparams.sample_rate)[0]


-def save_wav(wav, path, hparams=hparams):
+def save_wav(wav, path):
     wav *= 32767 / max(0.01, np.max(np.abs(wav)))
     librosa.output.write_wav(path, wav.astype(np.int16), hparams.sample_rate)


-def trim_silence(wav, hparams=hparams):
+def trim_silence(wav):
     return librosa.effects.trim(
         wav, top_db=hparams.trim_top_db,
         frame_length=hparams.trim_fft_size,
         hop_length=hparams.trim_hop_size)[0]


-def spectrogram(y, hparams=hparams):
+def spectrogram(y):
     D = _stft(y)
     S = _amp_to_db(np.abs(D)) - hparams.ref_level_db
     return _normalize(S)


-def inv_spectrogram(spectrogram, hparams=hparams):
+def inv_spectrogram(spectrogram):
     '''Converts spectrogram to waveform using librosa'''
     S = _db_to_amp(_denormalize(spectrogram) +
                    hparams.ref_level_db)  # Convert back to linear
@@ -37,20 +36,20 @@ def inv_spectrogram(spectrogram, hparams=hparams):
     return _griffin_lim(S ** hparams.power)


-def inv_spectrogram_tensorflow(spectrogram, hparams=hparams):
+def inv_spectrogram_tensorflow(spectrogram):
     '''Builds computational graph to convert spectrogram to waveform using TensorFlow.'''
     S = _db_to_amp_tensorflow(_denormalize_tensorflow(
         spectrogram) + hparams.ref_level_db)
     return _griffin_lim_tensorflow(tf.pow(S, hparams.power))


-def melspectrogram(y, hparams=hparams):
+def melspectrogram(y):
     D = _stft(y)
     S = _amp_to_db(_linear_to_mel(np.abs(D))) - hparams.ref_level_db
     return _normalize(S)


-def find_endpoint(wav, threshold_db=-40, min_silence_sec=0.8, hparams=hparams):
+def find_endpoint(wav, threshold_db=-40, min_silence_sec=0.8):
     window_length = int(hparams.sample_rate * min_silence_sec)
     hop_length = int(window_length / 4)
     threshold = _db_to_amp(threshold_db)
@@ -60,7 +59,7 @@ def find_endpoint(wav, threshold_db=-40, min_silence_sec=0.8, hparams=hparams):
     return len(wav)


-def _griffin_lim(S, hparams=hparams):
+def _griffin_lim(S):
     '''librosa implementation of Griffin-Lim
     Based on https://github.com/librosa/librosa/issues/434
     '''
@@ -73,7 +72,7 @@ def _griffin_lim(S, hparams=hparams):
     return y


-def _griffin_lim_tensorflow(S, hparams=hparams):
+def _griffin_lim_tensorflow(S):
     '''TensorFlow implementation of Griffin-Lim
     Based on https://github.com/Kyubyong/tensorflow-exercises/blob/master/Audio_Processing.ipynb
     '''
@@ -109,7 +108,7 @@ def _istft_tensorflow(stfts):
     return tf.contrib.signal.inverse_stft(stfts, win_length, hop_length, n_fft)


-def _stft_parameters(hparams=hparams):
+def _stft_parameters():
     n_fft = (hparams.num_freq - 1) * 2
     hop_length = int(hparams.frame_shift_ms / 1000 * hparams.sample_rate)
     win_length = int(hparams.frame_length_ms / 1000 * hparams.sample_rate)
@@ -128,7 +127,7 @@ def _linear_to_mel(spectrogram):
     return np.dot(_mel_basis, spectrogram)


-def _build_mel_basis(hparams=hparams):
+def _build_mel_basis():
     n_fft = (hparams.num_freq - 1) * 2
     return librosa.filters.mel(hparams.sample_rate, n_fft, n_mels=hparams.num_mels,
                                fmin=hparams.min_mel_freq, fmax=hparams.max_mel_freq)
@@ -146,13 +145,13 @@ def _db_to_amp_tensorflow(x):
     return tf.pow(tf.ones(tf.shape(x)) * 10.0, x * 0.05)


-def _normalize(S, hparams=hparams):
+def _normalize(S):
     return np.clip((S - hparams.min_level_db) / -hparams.min_level_db, 0, 1)


-def _denormalize(S, hparams=hparams):
+def _denormalize(S):
     return (np.clip(S, 0, 1) * -hparams.min_level_db) + hparams.min_level_db


-def _denormalize_tensorflow(S, hparams=hparams):
+def _denormalize_tensorflow(S):
     return (tf.clip_by_value(S, 0, 1) * -hparams.min_level_db) + hparams.min_level_db
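
A small sketch of the resulting call pattern for the audio helpers (the wav filename is a placeholder): with the hparams parameters removed, each function reads the hparams object imported at the top of util/audio.py.

# Minimal usage sketch; all settings come from the module-level hparams.
import numpy as np
from util import audio

wav = audio.load_wav('utterance.wav')                 # resampled to hparams.sample_rate
wav = audio.trim_silence(wav)                         # uses trim_top_db / trim_fft_size / trim_hop_size
linear = audio.spectrogram(wav).astype(np.float32)    # shape (num_freq, n_frames)
mel = audio.melspectrogram(wav).astype(np.float32)    # shape (num_mels, n_frames)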