mimic2/datasets/amy.py

58 lines
2.0 KiB
Python

from concurrent.futures import ProcessPoolExecutor
from functools import partial
import glob
import librosa
import numpy as np
import os
from hparams import hparams
from util import audio
def build_from_path(in_dir, out_dir, num_workers=1, tqdm=lambda x: x):
    '''Preprocesses the Amy dataset from a given input path into a given output directory.

    Args:
      in_dir: Directory holding an `audio` subdirectory of .wav files and a
        tab-separated `prompts.txt` file.
      out_dir: Directory handed to _process_utterance as the output location.
      num_workers: Maximum number of worker processes in the pool.
      tqdm: Optional wrapper applied to the list of futures before the results
        are collected (e.g. a progress bar); the default is the identity.

    Returns:
      A list with the result of _process_utterance for every prompt line that
      has exactly three tab-separated fields and whose first field matches the
      id of a .wav file, in the order the lines appear in prompts.txt.

    Raises:
      Whatever open() raises if prompts.txt cannot be read, and any exception
      raised inside a worker (re-raised by future.result()).
    '''
    # Map prompt id -> wav path. The id is the second-to-last '-'-separated
    # piece of the file name.
    paths = {}
    for path in glob.glob(os.path.join(in_dir, 'audio', '*.wav')):
        name_parts = os.path.basename(path).split('-')
        if len(name_parts) < 2:
            # No '-' in the name, so there is no id to extract; skip the file
            # instead of failing the whole run with an IndexError.
            continue
        paths[name_parts[-2]] = path

    # The context manager shuts the pool down on both normal return and error,
    # so worker processes do not outlive this call.
    with ProcessPoolExecutor(max_workers=num_workers) as executor:
        futures = []
        # Read the prompts file:
        with open(os.path.join(in_dir, 'prompts.txt'), encoding='utf-8') as f:
            for line in f:
                parts = line.strip().split('\t')
                # Only the first field (id) and third field (text) are used.
                if len(parts) == 3 and parts[0] in paths:
                    prompt_id, text = parts[0], parts[2]
                    futures.append(executor.submit(
                        _process_utterance, out_dir, prompt_id, paths[prompt_id], text))
        return [future.result() for future in tqdm(futures)]
def _process_utterance(out_dir, prompt_id, wav_path, text):
    '''Turns one recording into a linear and a mel spectrogram saved under out_dir.

    Args:
      out_dir: Directory the two .npy files are written to.
      prompt_id: Identifier interpolated into the output file names.
      wav_path: Path of the audio file to load.
      text: Transcript, passed through unchanged into the returned tuple.

    Returns:
      A (spectrogram_filename, mel_filename, n_frames, text) tuple, where
      n_frames is the size of axis 1 of the linear spectrogram before it is
      transposed for saving.
    '''
    samples = audio.load_wav(wav_path)

    # Cut a fixed number of samples (sample_rate * 0.1) off each end, then let
    # librosa trim whatever leading/trailing silence remains.
    edge = int(hparams.sample_rate * 0.1)
    samples = samples[edge:-edge]
    samples = librosa.effects.trim(
        samples, top_db=40, frame_length=1024, hop_length=256)[0]

    # Linear-scale and mel-scale spectrograms of the trimmed audio.
    linear = audio.spectrogram(samples).astype(np.float32)
    frame_count = linear.shape[1]
    mel = audio.melspectrogram(samples).astype(np.float32)

    # Both arrays are stored transposed.
    linear_name = 'amy-spec-%s.npy' % prompt_id
    mel_name = 'amy-mel-%s.npy' % prompt_id
    for file_name, features in ((linear_name, linear), (mel_name, mel)):
        np.save(os.path.join(out_dir, file_name), features.T, allow_pickle=False)

    # Describe this training example to the caller.
    return (linear_name, mel_name, frame_count, text)