mirror of https://github.com/MycroftAI/mimic2.git
took out hparams for preprocessing
parent
bda7cdad7a
commit
ea21e9d30e
|
@ -1,3 +1,8 @@
|
|||
"""mailabs dataset is sampled at 16 kHz (16000 Hz) with 0.5 seconds of silence
|
||||
at the start and end of the audio data. Make sure to change the
|
||||
sample_rate hparam to match this.
|
||||
"""
|
||||
|
||||
from concurrent.futures import ProcessPoolExecutor
|
||||
from functools import partial
|
||||
import numpy as np
|
||||
|
@ -5,11 +10,11 @@ import os
|
|||
from util import audio
|
||||
|
||||
|
||||
def build_from_path(in_dir, out_dir, books, hparams, num_workers=1, tqdm=lambda x: x):
|
||||
def build_from_path(in_dir, out_dir, books, num_workers=1, tqdm=lambda x: x):
|
||||
'''Preprocesses the mailabs Speech dataset from a given input path into a given output directory.
|
||||
|
||||
Args:
|
||||
in_dir: The directory where you have downloaded the LJ Speech dataset
|
||||
in_dir: The directory where you have downloaded the mailabs Speech dataset
|
||||
out_dir: The directory to write the output into
|
||||
num_workers: Optional number of worker processes to parallelize across
|
||||
tqdm: You can optionally pass tqdm to get a nice progress bar
|
||||
|
@ -35,12 +40,12 @@ def build_from_path(in_dir, out_dir, books, hparams, num_workers=1, tqdm=lambda
|
|||
text = parts[2]
|
||||
futures.append(
|
||||
executor.submit(partial(
|
||||
_process_utterance, out_dir, name, wav_path, text, hparams)
|
||||
_process_utterance, out_dir, name, wav_path, text)
|
||||
))
|
||||
return [future.result() for future in tqdm(futures)]
|
||||
|
||||
|
||||
def _process_utterance(out_dir, name, wav_path, text, hparams):
|
||||
def _process_utterance(out_dir, name, wav_path, text):
|
||||
'''Preprocesses a single utterance audio/text pair.
|
||||
|
||||
This writes the mel and linear scale spectrograms to disk and returns a tuple to write
|
||||
|
@ -57,17 +62,17 @@ def _process_utterance(out_dir, name, wav_path, text, hparams):
|
|||
'''
|
||||
|
||||
# Load the audio to a numpy array:
|
||||
wav = audio.load_wav(wav_path, hparams)
|
||||
wav = audio.load_wav(wav_path)
|
||||
|
||||
# trim silences here
|
||||
wav = audio.trim_silence(wav, hparams)
|
||||
wav = audio.trim_silence(wav)
|
||||
|
||||
# Compute the linear-scale spectrogram from the wav:
|
||||
spectrogram = audio.spectrogram(wav, hparams).astype(np.float32)
|
||||
spectrogram = audio.spectrogram(wav).astype(np.float32)
|
||||
n_frames = spectrogram.shape[1]
|
||||
|
||||
# Compute a mel-scale spectrogram from the wav:
|
||||
mel_spectrogram = audio.melspectrogram(wav, hparams).astype(np.float32)
|
||||
mel_spectrogram = audio.melspectrogram(wav).astype(np.float32)
|
||||
|
||||
# Write the spectrograms to disk:
|
||||
spectrogram_filename = 'mailabs-spec-{}.npy'.format(name)
|
||||
|
|
|
@ -3,7 +3,7 @@ import os
|
|||
from multiprocessing import cpu_count
|
||||
from tqdm import tqdm
|
||||
from datasets import amy, blizzard, ljspeech, kusal, mailabs
|
||||
from hparams import hparams
|
||||
from hparams import hparams, hparams_debug_string
|
||||
|
||||
|
||||
def preprocess_blizzard(args):
|
||||
|
@ -47,11 +47,11 @@ def preprocess_mailabs(args):
|
|||
os.makedirs(out_dir, exist_ok=True)
|
||||
books = args.books
|
||||
metadata = mailabs.build_from_path(
|
||||
in_dir, out_dir, books, args.hparams, args.num_workers, tqdm)
|
||||
write_metadata(metadata, out_dir, args.hparams)
|
||||
in_dir, out_dir, books, args.num_workers, tqdm)
|
||||
write_metadata(metadata, out_dir)
|
||||
|
||||
|
||||
def write_metadata(metadata, out_dir, hparams=hparams):
|
||||
def write_metadata(metadata, out_dir):
|
||||
with open(os.path.join(out_dir, 'train.txt'), 'w', encoding='utf-8') as f:
|
||||
for m in metadata:
|
||||
f.write('|'.join([str(x) for x in m]) + '\n')
|
||||
|
@ -76,9 +76,6 @@ def main():
|
|||
'--books',
|
||||
help='comma-seperated and no space name of books i.e hunter_space,pink_fairy_book,etc.',
|
||||
)
|
||||
parser.add_argument(
|
||||
'--hparams', default='',
|
||||
help='Hyperparameter overrides as a comma-separated list of name=value pairs')
|
||||
parser.add_argument('--num_workers', type=int, default=cpu_count())
|
||||
args = parser.parse_args()
|
||||
|
||||
|
@ -89,7 +86,7 @@ def main():
|
|||
parser.error(
|
||||
"--mailabs_books_dir required if mailabs is chosen for dataset.")
|
||||
|
||||
args.hparams = hparams.parse(args.hparams)
|
||||
print(hparams_debug_string())
|
||||
|
||||
if args.dataset == 'amy':
|
||||
preprocess_amy(args)
|
||||
|
|
|
@ -6,30 +6,29 @@ import tensorflow as tf
|
|||
from scipy import signal
|
||||
from hparams import hparams
|
||||
|
||||
|
||||
def load_wav(path, hparams=hparams):
|
||||
def load_wav(path):
|
||||
return librosa.core.load(path, sr=hparams.sample_rate)[0]
|
||||
|
||||
|
||||
def save_wav(wav, path, hparams=hparams):
|
||||
def save_wav(wav, path):
|
||||
wav *= 32767 / max(0.01, np.max(np.abs(wav)))
|
||||
librosa.output.write_wav(path, wav.astype(np.int16), hparams.sample_rate)
|
||||
|
||||
|
||||
def trim_silence(wav, hparams=hparams):
|
||||
def trim_silence(wav):
|
||||
return librosa.effects.trim(
|
||||
wav, top_db=hparams.trim_top_db,
|
||||
frame_length=hparams.trim_fft_size,
|
||||
hop_length=hparams.trim_hop_size)[0]
|
||||
|
||||
|
||||
def spectrogram(y, hparams=hparams):
|
||||
def spectrogram(y):
|
||||
D = _stft(y)
|
||||
S = _amp_to_db(np.abs(D)) - hparams.ref_level_db
|
||||
return _normalize(S)
|
||||
|
||||
|
||||
def inv_spectrogram(spectrogram, hparams=hparams):
|
||||
def inv_spectrogram(spectrogram):
|
||||
'''Converts spectrogram to waveform using librosa'''
|
||||
S = _db_to_amp(_denormalize(spectrogram) +
|
||||
hparams.ref_level_db) # Convert back to linear
|
||||
|
@ -37,20 +36,20 @@ def inv_spectrogram(spectrogram, hparams=hparams):
|
|||
return _griffin_lim(S ** hparams.power)
|
||||
|
||||
|
||||
def inv_spectrogram_tensorflow(spectrogram, hparams=hparams):
|
||||
def inv_spectrogram_tensorflow(spectrogram):
|
||||
'''Builds computational graph to convert spectrogram to waveform using TensorFlow.'''
|
||||
S = _db_to_amp_tensorflow(_denormalize_tensorflow(
|
||||
spectrogram) + hparams.ref_level_db)
|
||||
return _griffin_lim_tensorflow(tf.pow(S, hparams.power))
|
||||
|
||||
|
||||
def melspectrogram(y, hparams=hparams):
|
||||
def melspectrogram(y):
|
||||
D = _stft(y)
|
||||
S = _amp_to_db(_linear_to_mel(np.abs(D))) - hparams.ref_level_db
|
||||
return _normalize(S)
|
||||
|
||||
|
||||
def find_endpoint(wav, threshold_db=-40, min_silence_sec=0.8, hparams=hparams):
|
||||
def find_endpoint(wav, threshold_db=-40, min_silence_sec=0.8):
|
||||
window_length = int(hparams.sample_rate * min_silence_sec)
|
||||
hop_length = int(window_length / 4)
|
||||
threshold = _db_to_amp(threshold_db)
|
||||
|
@ -60,7 +59,7 @@ def find_endpoint(wav, threshold_db=-40, min_silence_sec=0.8, hparams=hparams):
|
|||
return len(wav)
|
||||
|
||||
|
||||
def _griffin_lim(S, hparams=hparams):
|
||||
def _griffin_lim(S):
|
||||
'''librosa implementation of Griffin-Lim
|
||||
Based on https://github.com/librosa/librosa/issues/434
|
||||
'''
|
||||
|
@ -73,7 +72,7 @@ def _griffin_lim(S, hparams=hparams):
|
|||
return y
|
||||
|
||||
|
||||
def _griffin_lim_tensorflow(S, hparams=hparams):
|
||||
def _griffin_lim_tensorflow(S):
|
||||
'''TensorFlow implementation of Griffin-Lim
|
||||
Based on https://github.com/Kyubyong/tensorflow-exercises/blob/master/Audio_Processing.ipynb
|
||||
'''
|
||||
|
@ -109,7 +108,7 @@ def _istft_tensorflow(stfts):
|
|||
return tf.contrib.signal.inverse_stft(stfts, win_length, hop_length, n_fft)
|
||||
|
||||
|
||||
def _stft_parameters(hparams=hparams):
|
||||
def _stft_parameters():
|
||||
n_fft = (hparams.num_freq - 1) * 2
|
||||
hop_length = int(hparams.frame_shift_ms / 1000 * hparams.sample_rate)
|
||||
win_length = int(hparams.frame_length_ms / 1000 * hparams.sample_rate)
|
||||
|
@ -128,7 +127,7 @@ def _linear_to_mel(spectrogram):
|
|||
return np.dot(_mel_basis, spectrogram)
|
||||
|
||||
|
||||
def _build_mel_basis(hparams=hparams):
|
||||
def _build_mel_basis():
|
||||
n_fft = (hparams.num_freq - 1) * 2
|
||||
return librosa.filters.mel(hparams.sample_rate, n_fft, n_mels=hparams.num_mels,
|
||||
fmin=hparams.min_mel_freq, fmax=hparams.max_mel_freq)
|
||||
|
@ -146,13 +145,13 @@ def _db_to_amp_tensorflow(x):
|
|||
return tf.pow(tf.ones(tf.shape(x)) * 10.0, x * 0.05)
|
||||
|
||||
|
||||
def _normalize(S, hparams=hparams):
|
||||
def _normalize(S):
|
||||
return np.clip((S - hparams.min_level_db) / -hparams.min_level_db, 0, 1)
|
||||
|
||||
|
||||
def _denormalize(S, hparams=hparams):
|
||||
def _denormalize(S):
|
||||
return (np.clip(S, 0, 1) * -hparams.min_level_db) + hparams.min_level_db
|
||||
|
||||
|
||||
def _denormalize_tensorflow(S, hparams=hparams):
|
||||
def _denormalize_tensorflow(S):
|
||||
return (tf.clip_by_value(S, 0, 1) * -hparams.min_level_db) + hparams.min_level_db
|
||||
|
|
Loading…
Reference in New Issue