# mimic2/datasets/datafeeder.py

import numpy as np
import os
import random
import tensorflow as tf
import threading
import time
import traceback
from text import cmudict, text_to_sequence
from util.infolog import log

_batches_per_group = 32  # each group loads this many batches at once, then sorts and shuffles them
_p_cmudict = 0.5         # probability of applying ARPAbet substitution to an utterance
_pad = 0                 # value used to pad both text sequences and target frames

class DataFeeder(threading.Thread):
  '''Feeds batches of data into a queue on a background thread.'''

  def __init__(self, coordinator, metadata_filename, hparams):
    super(DataFeeder, self).__init__()
    self._coord = coordinator
    self._hparams = hparams
    self._cleaner_names = [x.strip() for x in hparams.cleaners.split(',')]
    self._offset = 0

    # Load metadata:
    self._datadir = os.path.dirname(metadata_filename)
    with open(metadata_filename, encoding='utf-8') as f:
      self._metadata = [line.strip().split('|') for line in f]
    hours = sum(int(x[2]) for x in self._metadata) * hparams.frame_shift_ms / (3600 * 1000)
    log('Loaded metadata for %d examples (%.2f hours)' % (len(self._metadata), hours))

    # Create placeholders for inputs and targets. Don't specify batch size because we want to
    # be able to feed different sized batches at eval time.
    self._placeholders = [
      tf.placeholder(tf.int32, [None, None], 'inputs'),
      tf.placeholder(tf.int32, [None], 'input_lengths'),
      tf.placeholder(tf.float32, [None, None, hparams.num_mels], 'mel_targets'),
      tf.placeholder(tf.float32, [None, None, hparams.num_freq], 'linear_targets'),
    ]

    # Create queue for buffering data:
    queue = tf.FIFOQueue(8, [tf.int32, tf.int32, tf.float32, tf.float32], name='input_queue')
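    # enqueue() blocks once the queue holds 8 batches, so the background thread
    # stays only a few batches ahead of the consumer instead of buffering the
    # whole dataset in memory.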
    self._enqueue_op = queue.enqueue(self._placeholders)
    self.inputs, self.input_lengths, self.mel_targets, self.linear_targets = queue.dequeue()
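    # dequeue() returns tensors with unknown static shapes, so re-assert the
    # placeholder shapes for downstream graph construction: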
    self.inputs.set_shape(self._placeholders[0].shape)
    self.input_lengths.set_shape(self._placeholders[1].shape)
    self.mel_targets.set_shape(self._placeholders[2].shape)
    self.linear_targets.set_shape(self._placeholders[3].shape)

    # Load CMUDict: if enabled, some words in the training data are randomly
    # substituted with their ARPAbet equivalents, which lets you pass ARPAbet
    # to the model at synthesis time as well (useful for proper nouns, etc.).
    if hparams.use_cmudict:
      cmudict_path = os.path.join(self._datadir, 'cmudict-0.7b')
      if not os.path.isfile(cmudict_path):
        raise Exception(
          'If use_cmudict=True, you must download '
          'http://svn.code.sf.net/p/cmusphinx/code/trunk/cmudict/cmudict-0.7b to %s' % cmudict_path)
      self._cmudict = cmudict.CMUDict(cmudict_path, keep_ambiguous=False)
      log('Loaded CMUDict with %d unambiguous entries' % len(self._cmudict))
    else:
      self._cmudict = None

  def start_in_session(self, session):
    self._session = session
    self.start()
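
  # Typical use from a training script (a sketch; the names here follow the
  # usual train.py wiring, not code defined in this file):
  #
  #   coord = tf.train.Coordinator()
  #   feeder = DataFeeder(coord, metadata_filename, hparams)
  #   with tf.Session() as sess:
  #     feeder.start_in_session(sess)
  #     while not coord.should_stop():
  #       sess.run(train_op)  # the model graph reads feeder.inputs et al.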

  def run(self):
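    # Keep filling the queue until the coordinator signals shutdown; forward
    # any exception to the coordinator so training stops cleanly instead of
    # hanging on a dequeue that will never be fed.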
    try:
      while not self._coord.should_stop():
        self._enqueue_next_group()
    except Exception as e:
      traceback.print_exc()
      self._coord.request_stop(e)

  def _enqueue_next_group(self):
    start = time.time()

    # Read a group of examples:
    n = self._hparams.batch_size
    r = self._hparams.outputs_per_step
    examples = [self._get_next_example() for _ in range(n * _batches_per_group)]

    # Bucket examples based on similar output sequence length for efficiency:
    examples.sort(key=lambda x: x[-1])
    batches = [examples[i:i + n] for i in range(0, len(examples), n)]
    random.shuffle(batches)
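    # The sort above packs similar-length utterances into the same batch
    # (minimizing padding waste); shuffling the batch order then restores
    # randomness across the group so lengths don't increase monotonically
    # during training.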
    log('Generated %d batches of size %d in %.03f sec' % (len(batches), n, time.time() - start))
    for batch in batches:
      feed_dict = dict(zip(self._placeholders, _prepare_batch(batch, r)))
      self._session.run(self._enqueue_op, feed_dict=feed_dict)

  def _get_next_example(self):
    '''Loads a single example (input, mel_target, linear_target, cost) from disk.'''
    if self._offset >= len(self._metadata):
      self._offset = 0
      random.shuffle(self._metadata)
    meta = self._metadata[self._offset]
    self._offset += 1

    text = meta[3]
    if self._cmudict and random.random() < _p_cmudict:
      text = ' '.join([self._maybe_get_arpabet(word) for word in text.split(' ')])
    input_data = np.asarray(text_to_sequence(text, self._cleaner_names), dtype=np.int32)
    linear_target = np.load(os.path.join(self._datadir, meta[0]))
    mel_target = np.load(os.path.join(self._datadir, meta[1]))
    return (input_data, mel_target, linear_target, len(linear_target))
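
  # text_to_sequence treats text wrapped in curly braces as an ARPAbet phoneme
  # string, so substituted words are emitted as '{<phonemes>}'.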
  def _maybe_get_arpabet(self, word):
    arpabet = self._cmudict.lookup(word)
    return '{%s}' % arpabet[0] if arpabet is not None and random.random() < 0.5 else word
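

# The tuple returned by _prepare_batch matches the placeholder order defined
# in DataFeeder.__init__: (inputs, input_lengths, mel_targets, linear_targets).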
def _prepare_batch(batch, outputs_per_step):
  random.shuffle(batch)
  inputs = _prepare_inputs([x[0] for x in batch])
  input_lengths = np.asarray([len(x[0]) for x in batch], dtype=np.int32)
  mel_targets = _prepare_targets([x[1] for x in batch], outputs_per_step)
  linear_targets = _prepare_targets([x[2] for x in batch], outputs_per_step)
  return (inputs, input_lengths, mel_targets, linear_targets)


def _prepare_inputs(inputs):
  max_len = max(len(x) for x in inputs)
  return np.stack([_pad_input(x, max_len) for x in inputs])
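

# Targets get one extra frame of padding beyond the longest sequence, and the
# padded length is rounded up to a multiple of outputs_per_step because the
# decoder emits that many frames per step, so its total output length must be
# divisible by the reduction factor.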
def _prepare_targets(targets, alignment):
  max_len = max(len(t) for t in targets) + 1
  return np.stack([_pad_target(t, _round_up(max_len, alignment)) for t in targets])


def _pad_input(x, length):
  return np.pad(x, (0, length - x.shape[0]), mode='constant', constant_values=_pad)


def _pad_target(t, length):
  return np.pad(t, [(0, length - t.shape[0]), (0, 0)], mode='constant', constant_values=_pad)


def _round_up(x, multiple):
  remainder = x % multiple
  return x if remainder == 0 else x + multiple - remainder
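

# Example: with outputs_per_step=5 and a longest target of 33 frames,
# _prepare_targets pads every target to _round_up(33 + 1, 5) == 35 frames.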