mirror of https://github.com/MycroftAI/mimic2.git
Initial commit
parent
3ab028c4e6
commit
bc706654ad
|
@ -0,0 +1,5 @@
|
|||
__pycache__/
|
||||
.cache/
|
||||
*.pyc
|
||||
.DS_Store
|
||||
run*.sh
|
|
@ -0,0 +1,19 @@
|
|||
Copyright (c) 2017 Keith Ito
|
||||
|
||||
Permission is hereby granted, free of charge, to any person obtaining a copy
|
||||
of this software and associated documentation files (the "Software"), to deal
|
||||
in the Software without restriction, including without limitation the rights
|
||||
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||
copies of the Software, and to permit persons to whom the Software is
|
||||
furnished to do so, subject to the following conditions:
|
||||
|
||||
The above copyright notice and this permission notice shall be included in
|
||||
all copies or substantial portions of the Software.
|
||||
|
||||
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
||||
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
||||
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
||||
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
||||
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
||||
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
|
||||
THE SOFTWARE.
|
110
README.md
110
README.md
|
@ -1,2 +1,112 @@
|
|||
# tacotron
|
||||
|
||||
An implementation of Google's Tacotron speech synthesis model in TensorFlow.
|
||||
|
||||
|
||||
## Overview
|
||||
|
||||
Earlier this year, Google published a paper, [Tacotron: A Fully End-to-End Text-To-Speech Synthesis Model](https://arxiv.org/pdf/1703.10135.pdf),
|
||||
where they present a neural text-to-speech model that learns to synthesize speech directly from
|
||||
(text, audio) pairs.
|
||||
|
||||
Google [released](https://google.github.io/tacotron) some nice audio samples that their model
|
||||
generated but didn't provide their source code or training data. This is an attempt to
|
||||
implement the model described in their paper.
|
||||
|
||||
Output after training for 185K steps (~2 days):
|
||||
|
||||
* [Audio Samples](https://keithito.github.io/audio-samples/)
|
||||
|
||||
The quality isn't as good as what Google demoed. But hopefully it will get there someday :-).
|
||||
|
||||
|
||||
|
||||
## Quick Start
|
||||
|
||||
### Installing dependencies
|
||||
```
|
||||
pip install -r requirements.txt
|
||||
```
|
||||
|
||||
|
||||
### Using a pre-trained model
|
||||
|
||||
1. Download and unpack a model:
|
||||
```
|
||||
curl http://data.keithito.com/data/speech/tacotron-20170708.tar.bz2 | tar xj -C /tmp
|
||||
```
|
||||
|
||||
2. Run the demo server:
|
||||
```
|
||||
python3 demo_server.py --checkpoint /tmp/tacotron-20170708/model.ckpt
|
||||
```
|
||||
|
||||
3. Point your browser at [localhost:9000](http://localhost:9000) and type!
|
||||
|
||||
|
||||
|
||||
### Training
|
||||
|
||||
1. Download a speech dataset. The following are supported out of the box:
|
||||
* [LJ Speech](https://keithito.com/LJ-Speech-Dataset) (Public Domain)
|
||||
* [Blizzard 2012](http://www.cstr.ed.ac.uk/projects/blizzard/2012/phase_one) (Creative Commons Attribution Share-Alike)
|
||||
|
||||
You can use other datasets if you convert them to the right format. See
|
||||
[ljspeech.py](datasets/ljspeech.py) for an example.
|
||||
|
||||
|
||||
2. Unpack the dataset into `~/tacotron`. After unpacking, your tree should look like this for
|
||||
LJ Speech:
|
||||
```
|
||||
tacotron
|
||||
|- LJSpeech-1.0
|
||||
|- metadata.csv
|
||||
|- wavs
|
||||
```
|
||||
|
||||
or like this for Blizzard 2012:
|
||||
```
|
||||
tacotron
|
||||
|- Blizzard2012
|
||||
|- ATrampAbroad
|
||||
| |- sentence_index.txt
|
||||
| |- lab
|
||||
| |- wav
|
||||
|- TheManThatCorruptedHadleyburg
|
||||
|- sentence_index.txt
|
||||
|- lab
|
||||
|- wav
|
||||
```
|
||||
|
||||
3. Preprocess the data
|
||||
```
|
||||
python3 preprocess.py --dataset ljspeech
|
||||
```
|
||||
*Use --dataset blizzard for Blizzard data*
|
||||
|
||||
4. Train
|
||||
```
|
||||
python3 train.py
|
||||
```
|
||||
*Note: using [TCMalloc](http://goog-perftools.sourceforge.net/doc/tcmalloc.html) seems to
|
||||
improve training performance.*
|
||||
|
||||
5. Monitor with TensorBoard (optional)
|
||||
```
|
||||
tensorboard --logdir ~/tacotron/logs-tacotron
|
||||
```
|
||||
|
||||
The trainer dumps audio and alignments every 1000 steps. You can find these in
|
||||
`~/tacotron/logs-tacotron`. You can also pass a Slack webhook URL as the `--slack_url`
|
||||
flag, and it will send you progress updates.
|
||||
|
||||
|
||||
|
||||
## Other Implementations
|
||||
|
||||
* Alex Barron has some nice results from his implementation trained on the
|
||||
[Nancy Corpus](http://www.cstr.ed.ac.uk/projects/blizzard/2011/lessac_blizzard2011):
|
||||
https://github.com/barronalex/Tacotron
|
||||
|
||||
* Kyubyong Park has a very promising implementation trained on the World English Bible here:
|
||||
https://github.com/Kyubyong/tacotron
|
||||
|
|
|
@ -0,0 +1,73 @@
|
|||
from concurrent.futures import ProcessPoolExecutor
|
||||
from functools import partial
|
||||
import numpy as np
|
||||
import os
|
||||
from hparams import hparams
|
||||
from util import audio
|
||||
|
||||
|
||||
_max_out_length = 700
|
||||
_end_buffer = 0.05
|
||||
_min_confidence = 90
|
||||
|
||||
# Note: "A Tramp Abroad" & "The Man That Corrupted Hadleyburg" are higher quality than the others.
|
||||
books = [
|
||||
'ATrampAbroad',
|
||||
'TheManThatCorruptedHadleyburg',
|
||||
# 'LifeOnTheMississippi',
|
||||
# 'TheAdventuresOfTomSawyer',
|
||||
]
|
||||
|
||||
def build_from_path(in_dir, out_dir, num_workers=1, tqdm=lambda x: x):
    '''Preprocesses the Blizzard books listed in `books` into spectrogram files.

    Args:
      in_dir: Directory containing one sub-directory per book, each with a
        `sentence_index.txt` file plus `wav` and `lab` sub-directories.
      out_dir: Directory the per-utterance .npy files are written to.
      num_workers: Number of worker processes used for the audio processing.
      tqdm: Optional wrapper applied to the list of futures (e.g. a progress bar).

    Returns:
      A list of (spectrogram_filename, mel_filename, n_frames, text) tuples, one for
      each utterance that was not rejected by `_process_utterance`.
    '''
    futures = []
    index = 1
    # The context manager guarantees the worker processes are shut down, even if a task fails.
    with ProcessPoolExecutor(max_workers=num_workers) as executor:
        for book in books:
            with open(os.path.join(in_dir, book, 'sentence_index.txt')) as f:
                for line in f:
                    # Lines starting with '#' are comments. Compare by value: the original
                    # `is not '#'` relied on string interning.
                    if line.startswith('#'):
                        continue
                    parts = line.strip().split('\t')
                    # Keep only well-formed rows whose confidence column exceeds the threshold.
                    if len(parts) == 8 and float(parts[3]) > _min_confidence:
                        wav_path = os.path.join(in_dir, book, 'wav', '%s.wav' % parts[0])
                        labels_path = os.path.join(in_dir, book, 'lab', '%s.lab' % parts[0])
                        text = parts[5]
                        task = partial(_process_utterance, out_dir, index, wav_path, labels_path, text)
                        futures.append(executor.submit(task))
                        index += 1
        results = [future.result() for future in tqdm(futures)]
    # _process_utterance returns None for utterances it skips.
    return [r for r in results if r is not None]
|
||||
|
||||
|
||||
def _process_utterance(out_dir, index, wav_path, labels_path, text):
    '''Trims one utterance using its label file and writes its spectrograms to disk.

    Args:
      out_dir: Directory the .npy files are written to.
      index: Integer used to build the output filenames.
      wav_path: Path to the utterance's wav file.
      labels_path: Path to the label file parsed by `_parse_labels`.
      text: Transcript, passed through unchanged to the result.

    Returns:
      (spectrogram_filename, mel_filename, n_frames, text), or None when the trimmed
      audio is longer than `_max_out_length` frames allow.
    '''
    # Load the wav file and trim silence from the ends:
    wav = audio.load_wav(wav_path)
    start_offset, end_offset = _parse_labels(labels_path)
    start = int(start_offset * hparams.sample_rate)
    # When there is no trailing silence keep everything to the end. A stop of -1 (as used
    # previously) would silently drop the final sample.
    end = int(end_offset * hparams.sample_rate) if end_offset is not None else None
    wav = wav[start:end]

    # Reject utterances whose sample count exceeds the maximum output length.
    max_samples = _max_out_length * hparams.frame_shift_ms / 1000 * hparams.sample_rate
    if len(wav) > max_samples:
        return None

    spectrogram = audio.spectrogram(wav).astype(np.float32)
    n_frames = spectrogram.shape[1]
    mel_spectrogram = audio.melspectrogram(wav).astype(np.float32)
    spectrogram_filename = 'blizzard-spec-%05d.npy' % index
    mel_filename = 'blizzard-mel-%05d.npy' % index
    # The arrays are stored transposed relative to what the audio helpers return.
    np.save(os.path.join(out_dir, spectrogram_filename), spectrogram.T, allow_pickle=False)
    np.save(os.path.join(out_dir, mel_filename), mel_spectrogram.T, allow_pickle=False)
    return (spectrogram_filename, mel_filename, n_frames, text)
|
||||
|
||||
|
||||
def _parse_labels(path):
|
||||
labels = []
|
||||
with open(os.path.join(path)) as f:
|
||||
for line in f:
|
||||
parts = line.strip().split(' ')
|
||||
if len(parts) >= 3:
|
||||
labels.append((float(parts[0]), ' '.join(parts[2:])))
|
||||
start = 0
|
||||
end = None
|
||||
if labels[0][1] == 'sil':
|
||||
start = labels[0][0]
|
||||
if labels[-1][1] == 'sil':
|
||||
end = labels[-2][0] + _end_buffer
|
||||
return (start, end)
|
|
@ -0,0 +1,150 @@
|
|||
import numpy as np
|
||||
import os
|
||||
import random
|
||||
import tensorflow as tf
|
||||
import threading
|
||||
import time
|
||||
import traceback
|
||||
from util import cmudict, textinput
|
||||
from util.infolog import log
|
||||
|
||||
|
||||
_batches_per_group = 32
|
||||
_p_cmudict = 0.5
|
||||
_pad = 0
|
||||
|
||||
|
||||
class DataFeeder(threading.Thread):
    '''Background thread that keeps a TensorFlow FIFO queue filled with training batches.'''

    def __init__(self, coordinator, metadata_filename, hparams):
        '''Loads the metadata file and builds the placeholders, queue and dequeued tensors.'''
        super(DataFeeder, self).__init__()
        self._coord = coordinator
        self._hparams = hparams
        self._offset = 0

        # Metadata rows are '|'-separated; the third column is summed as a frame count.
        self._datadir = os.path.dirname(metadata_filename)
        with open(metadata_filename) as f:
            self._metadata = [row.strip().split('|') for row in f]
        total_frames = sum(int(row[2]) for row in self._metadata)
        hours = total_frames * hparams.frame_shift_ms / (3600 * 1000)
        log('Loaded metadata for %d examples (%.2f hours)' % (len(self._metadata), hours))

        # Batch size is left unspecified so that differently sized batches can be fed at
        # eval time.
        self._placeholders = [
            tf.placeholder(tf.int32, [None, None], 'inputs'),
            tf.placeholder(tf.int32, [None], 'input_lengths'),
            tf.placeholder(tf.float32, [None, None, hparams.num_mels], 'mel_targets'),
            tf.placeholder(tf.float32, [None, None, hparams.num_freq], 'linear_targets')
        ]

        # Buffering queue; the dequeued tensors take their static shapes from the placeholders.
        queue = tf.FIFOQueue(8, [tf.int32, tf.int32, tf.float32, tf.float32], name='input_queue')
        self._enqueue_op = queue.enqueue(self._placeholders)
        dequeued = queue.dequeue()
        self.inputs, self.input_lengths, self.mel_targets, self.linear_targets = dequeued
        for tensor, placeholder in zip(dequeued, self._placeholders):
            tensor.set_shape(placeholder.shape)

        # Optional CMUDict: when enabled, some training words are randomly replaced by
        # their ARPABet form so ARPABet can also be passed to the model for synthesis.
        self._cmudict = None
        if hparams.use_cmudict:
            cmudict_path = os.path.join(self._datadir, 'cmudict-0.7b')
            if not os.path.isfile(cmudict_path):
                raise Exception('If use_cmudict=True, you must download ' +
                    'http://svn.code.sf.net/p/cmusphinx/code/trunk/cmudict/cmudict-0.7b to %s' % cmudict_path)
            self._cmudict = cmudict.CMUDict(cmudict_path, keep_ambiguous=False)
            log('Loaded CMUDict with %d unambiguous entries' % len(self._cmudict))

    def start_in_session(self, session):
        '''Remembers the session used for enqueueing and starts the thread.'''
        self._session = session
        self.start()

    def run(self):
        '''Enqueues groups of batches until the coordinator asks to stop.'''
        try:
            while not self._coord.should_stop():
                self._enqueue_next_group()
        except Exception as e:
            # Print the failure and hand it to the coordinator.
            traceback.print_exc()
            self._coord.request_stop(e)

    def _enqueue_next_group(self):
        '''Loads a group of examples, buckets them into batches and enqueues each batch.'''
        started_at = time.time()

        batch_size = self._hparams.batch_size
        outputs_per_step = self._hparams.outputs_per_step
        examples = [self._get_next_example() for _ in range(batch_size * _batches_per_group)]

        # Sort by the last tuple element (target length) so each batch holds similar
        # lengths, then shuffle the order of the batches.
        examples.sort(key=lambda example: example[-1])
        batches = []
        for begin in range(0, len(examples), batch_size):
            batches.append(examples[begin:begin + batch_size])
        random.shuffle(batches)

        log('Generated %d batches of size %d in %.03f sec' % (len(batches), batch_size, time.time() - started_at))
        for batch in batches:
            prepared = _prepare_batch(batch, outputs_per_step)
            feed_dict = dict(zip(self._placeholders, prepared))
            self._session.run(self._enqueue_op, feed_dict=feed_dict)

    def _get_next_example(self):
        '''Loads a single example (input, mel_target, linear_target, cost) from disk'''
        # Reshuffle the metadata each time it has been fully consumed.
        if self._offset >= len(self._metadata):
            self._offset = 0
            random.shuffle(self._metadata)
        meta = self._metadata[self._offset]
        self._offset += 1

        text = meta[3]
        if self._cmudict and random.random() < _p_cmudict:
            words = text.split(' ')
            text = ' '.join([self._maybe_get_arpabet(word) for word in words])

        input_data = np.asarray(textinput.to_sequence(text), dtype=np.int32)
        linear_target = np.load(os.path.join(self._datadir, meta[0]))
        mel_target = np.load(os.path.join(self._datadir, meta[1]))
        return (input_data, mel_target, linear_target, len(linear_target))

    def _maybe_get_arpabet(self, word):
        '''Returns `word`, or half of the time its first CMUDict pronunciation in braces.'''
        pron = self._cmudict.lookup(word)
        if pron is not None and random.random() < 0.5:
            return '{%s}' % pron[0]
        return word
|
||||
|
||||
|
||||
def _prepare_batch(batch, outputs_per_step):
    '''Shuffles `batch` in place and returns its padded arrays.

    Each example is indexed as (input, mel_target, linear_target, ...). Returns
    (inputs, input_lengths, mel_targets, linear_targets).
    '''
    random.shuffle(batch)
    texts = [example[0] for example in batch]
    inputs = _prepare_inputs(texts)
    input_lengths = np.asarray([len(text) for text in texts], dtype=np.int32)
    mel_targets = _prepare_targets([example[1] for example in batch], outputs_per_step)
    linear_targets = _prepare_targets([example[2] for example in batch], outputs_per_step)
    return (inputs, input_lengths, mel_targets, linear_targets)
|
||||
|
||||
|
||||
def _prepare_inputs(inputs):
    '''Pads every input sequence to the longest length and stacks them into one array.'''
    longest = max(len(sequence) for sequence in inputs)
    padded = [_pad_input(sequence, longest) for sequence in inputs]
    return np.stack(padded)
|
||||
|
||||
|
||||
def _prepare_targets(targets, alignment):
    '''Pads targets to a common length and stacks them.

    The common length is the longest target plus one, rounded up to a multiple of
    `alignment`.
    '''
    longest = max(len(target) for target in targets) + 1
    padded = [_pad_target(target, _round_up(longest, alignment)) for target in targets]
    return np.stack(padded)
|
||||
|
||||
|
||||
def _pad_input(x, length):
    '''Right-pads the 1-D array `x` with `_pad` up to `length` entries.'''
    missing = length - x.shape[0]
    return np.pad(x, (0, missing), mode='constant', constant_values=_pad)
|
||||
|
||||
|
||||
def _pad_target(t, length):
    '''Pads the 2-D array `t` with `_pad` rows at the end of axis 0 up to `length` rows.'''
    missing = length - t.shape[0]
    return np.pad(t, [(0, missing), (0, 0)], mode='constant', constant_values=_pad)
|
||||
|
||||
|
||||
def _round_up(x, multiple):
|
||||
remainder = x % multiple
|
||||
return x if remainder == 0 else x + multiple - remainder
|
|
@ -0,0 +1,31 @@
|
|||
from concurrent.futures import ProcessPoolExecutor
|
||||
from functools import partial
|
||||
import numpy as np
|
||||
import os
|
||||
from util import audio
|
||||
|
||||
|
||||
def build_from_path(in_dir, out_dir, num_workers=1, tqdm=lambda x: x):
    '''Preprocesses the LJ Speech dataset found in `in_dir` into spectrogram files.

    Args:
      in_dir: Directory containing `metadata.csv` and a `wavs` sub-directory.
      out_dir: Directory the per-utterance .npy files are written to.
      num_workers: Number of worker processes used for the audio processing.
      tqdm: Optional wrapper applied to the list of futures (e.g. a progress bar).

    Returns:
      A list of (spectrogram_filename, mel_filename, n_frames, text) tuples, one per
      metadata row.
    '''
    futures = []
    index = 1
    # The context manager guarantees the worker processes are shut down once all results
    # have been collected (or a task has failed); previously the pool was never closed.
    with ProcessPoolExecutor(max_workers=num_workers) as executor:
        with open(os.path.join(in_dir, 'metadata.csv')) as f:
            for line in f:
                # Rows are '|'-separated: column 0 names the wav file, column 2 is the text.
                parts = line.strip().split('|')
                wav_path = os.path.join(in_dir, 'wavs', '%s.wav' % parts[0])
                text = parts[2]
                futures.append(executor.submit(partial(_process_utterance, out_dir, index, wav_path, text)))
                index += 1
        return [future.result() for future in tqdm(futures)]
|
||||
|
||||
|
||||
def _process_utterance(out_dir, index, wav_path, text):
    '''Computes and saves the linear and mel spectrograms of one wav file.

    Returns (spectrogram_filename, mel_filename, n_frames, text), where n_frames is the
    second dimension of the linear spectrogram.
    '''
    wav = audio.load_wav(wav_path)
    linear = audio.spectrogram(wav).astype(np.float32)
    mel = audio.melspectrogram(wav).astype(np.float32)

    linear_name = 'ljspeech-spec-%05d.npy' % index
    mel_name = 'ljspeech-mel-%05d.npy' % index
    # Both arrays are written transposed.
    for filename, data in ((linear_name, linear), (mel_name, mel)):
        np.save(os.path.join(out_dir, filename), data.T, allow_pickle=False)
    return (linear_name, mel_name, linear.shape[1], text)
|
|
@ -0,0 +1,96 @@
|
|||
import argparse
|
||||
import falcon
|
||||
from hparams import hparams, hparams_debug_string
|
||||
import os
|
||||
from synthesizer import Synthesizer
|
||||
|
||||
|
||||
# Single-page demo UI served at '/'. The script sends the entered text to /synthesize and
# plays the returned audio blob; on a non-OK response it shows the HTTP status text.
html_body = '''<html><title>Demo</title>
<style>
body {padding: 16px; font-family: sans-serif; font-size: 14px; color: #444}
input {font-size: 14px; padding: 8px 12px; outline: none; border: 1px solid #ddd}
input:focus {box-shadow: 0 1px 2px rgba(0,0,0,.15)}
p {padding: 12px}
button {background: #28d; padding: 9px 14px; margin-left: 8px; border: none; outline: none;
        color: #fff; font-size: 14px; border-radius: 4px; cursor: pointer;}
button:hover {box-shadow: 0 1px 2px rgba(0,0,0,.15); opacity: 0.9;}
button:active {background: #29f;}
button[disabled] {opacity: 0.4; cursor: default}
</style>
<body>
<form>
  <input id="text" type="text" size="40" placeholder="Enter Text">
  <button id="button" name="synthesize">Speak</button>
</form>
<p id="message"></p>
<audio id="audio" controls autoplay hidden></audio>
<script>
function q(selector) {return document.querySelector(selector)}
q('#text').focus()
q('#button').addEventListener('click', function(e) {
  var text = q('#text').value.trim()
  if (text) {
    q('#message').textContent = 'Synthesizing...'
    q('#button').disabled = true
    q('#audio').hidden = true
    synthesize(text)
  }
  e.preventDefault()
  return false
})
function synthesize(text) {
  fetch('/synthesize?text=' + encodeURIComponent(text), {cache: 'no-cache'})
    .then(function(res) {
      if (!res.ok) throw Error(res.statusText)
      return res.blob()
    }).then(function(blob) {
      q('#message').textContent = ''
      q('#button').disabled = false
      q('#audio').src = URL.createObjectURL(blob)
      q('#audio').hidden = false
    }).catch(function(err) {
      q('#message').textContent = 'Error: ' + err.message
      q('#button').disabled = false
    })
}
</script></body></html>
'''
|
||||
|
||||
|
||||
class UIResource:
    '''Falcon resource that serves the demo page.'''

    def on_get(self, req, res):
        '''Responds with `html_body` as text/html.'''
        res.body = html_body
        res.content_type = 'text/html'
|
||||
|
||||
|
||||
class SynthesisResource:
    '''Falcon resource that synthesizes the `text` query parameter.'''

    def on_get(self, req, res):
        '''Responds with the synthesizer output as audio/wav; 400 if `text` is missing or empty.'''
        text = req.params.get('text')
        if not text:
            raise falcon.HTTPBadRequest()
        res.data = synthesizer.synthesize(text)
        res.content_type = 'audio/wav'
|
||||
|
||||
|
||||
synthesizer = Synthesizer()
|
||||
api = falcon.API()
|
||||
api.add_route('/synthesize', SynthesisResource())
|
||||
api.add_route('/', UIResource())
|
||||
|
||||
|
||||
if __name__ == '__main__':
|
||||
from wsgiref import simple_server
|
||||
parser = argparse.ArgumentParser()
|
||||
parser.add_argument('--checkpoint', required=True, help='Full path to model checkpoint')
|
||||
parser.add_argument('--port', type=int, default=9000)
|
||||
parser.add_argument('--hparams', default='',
|
||||
help='Hyperparameter overrides as a comma-separated list of name=value pairs')
|
||||
args = parser.parse_args()
|
||||
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '2'
|
||||
hparams.max_iters = 100
|
||||
hparams.parse(args.hparams)
|
||||
print(hparams_debug_string())
|
||||
synthesizer.load(args.checkpoint)
|
||||
print('Serving on port %d' % args.port)
|
||||
simple_server.make_server('0.0.0.0', args.port, api).serve_forever()
|
||||
else:
|
||||
synthesizer.load(os.environ['CHECKPOINT'])
|
|
@ -0,0 +1,54 @@
|
|||
import argparse
|
||||
import os
|
||||
import re
|
||||
from hparams import hparams, hparams_debug_string
|
||||
from synthesizer import Synthesizer
|
||||
|
||||
|
||||
sentences = [
|
||||
# From July 8, 2017 New York Times:
|
||||
'Scientists at the CERN laboratory say they have discovered a new particle.',
|
||||
'There’s a way to measure the acute emotional intelligence that has never gone out of style.',
|
||||
'President Trump met with other leaders at the Group of 20 conference.',
|
||||
'The Senate\'s bill to repeal and replace the Affordable Care Act is now imperiled.',
|
||||
# From Google's Tacotron example page:
|
||||
'Generative adversarial network or variational auto-encoder.',
|
||||
'The buses aren\'t the problem, they actually provide a solution.',
|
||||
'Does the quick brown fox jump over the lazy dog?',
|
||||
'Talib Kweli confirmed to AllHipHop that he will be releasing an album in the next year.',
|
||||
]
|
||||
|
||||
|
||||
def get_output_base_path(checkpoint_path):
    '''Returns the path prefix for eval output files, next to the checkpoint.

    The prefix is `eval-<step>` when the path contains `.ckpt-<digits>`, otherwise `eval`.
    '''
    step_match = re.match(r'.*?\.ckpt\-([0-9]+)', checkpoint_path)
    if step_match:
        name = 'eval-%d' % int(step_match.group(1))
    else:
        name = 'eval'
    return os.path.join(os.path.dirname(checkpoint_path), name)
|
||||
|
||||
|
||||
def run_eval(args):
    '''Synthesizes every entry of `sentences` with the checkpoint in `args.checkpoint`.

    Output is written to `<base_path>-<i>.wav`, where the base path comes from
    `get_output_base_path`.
    '''
    print(hparams_debug_string())
    synth = Synthesizer()
    synth.load(args.checkpoint)
    base_path = get_output_base_path(args.checkpoint)
    for sentence_index, sentence in enumerate(sentences):
        wav_path = '%s-%d.wav' % (base_path, sentence_index)
        print('Synthesizing: %s' % wav_path)
        with open(wav_path, 'wb') as out_file:
            out_file.write(synth.synthesize(sentence))
|
||||
|
||||
|
||||
def main():
    '''Parses the command line, applies hyperparameter overrides and runs the evaluation.'''
    arg_parser = argparse.ArgumentParser()
    arg_parser.add_argument('--checkpoint', required=True, help='Path to model checkpoint')
    arg_parser.add_argument(
        '--hparams',
        default='',
        help='Hyperparameter overrides as a comma-separated list of name=value pairs')
    args = arg_parser.parse_args()

    os.environ['TF_CPP_MIN_LOG_LEVEL'] = '2'
    # Set before parsing so that a --hparams override of max_iters still takes effect.
    hparams.max_iters = 100
    hparams.parse(args.hparams)
    run_eval(args)
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
main()
|
|
@ -0,0 +1,41 @@
|
|||
import tensorflow as tf
|
||||
|
||||
|
||||
# Default hyperparameters:
|
||||
hparams = tf.contrib.training.HParams(
|
||||
# Text:
|
||||
force_lowercase=True,
|
||||
expand_abbreviations=True,
|
||||
use_cmudict=False,
|
||||
|
||||
# Audio:
|
||||
num_mels=80,
|
||||
num_freq=1025,
|
||||
sample_rate=20000,
|
||||
frame_length_ms=50,
|
||||
frame_shift_ms=12.5,
|
||||
preemphasis=0.97,
|
||||
min_level_db=-100,
|
||||
ref_level_db=20,
|
||||
|
||||
# Model:
|
||||
# TODO: add more configurable hparams
|
||||
outputs_per_step=5,
|
||||
|
||||
# Training:
|
||||
batch_size=32,
|
||||
adam_beta1=0.9,
|
||||
adam_beta2=0.999,
|
||||
initial_learning_rate=0.002,
|
||||
decay_learning_rate=True,
|
||||
|
||||
# Eval:
|
||||
max_iters=200,
|
||||
griffin_lim_iters=60
|
||||
)
|
||||
|
||||
|
||||
def hparams_debug_string():
    '''Returns a multi-line listing of all hyperparameters, sorted by name.'''
    lines = []
    for name, value in sorted(hparams.values().items()):
        lines.append(' %s: %s' % (name, value))
    return 'Hyperparameters:\n' + '\n'.join(lines)
|
|
@ -0,0 +1,8 @@
|
|||
from .tacotron import Tacotron
|
||||
|
||||
|
||||
def create_model(name, hparams):
    '''Builds the model registered under `name`; raises Exception for an unknown name.'''
    if name != 'tacotron':
        raise Exception('Unknown model: ' + name)
    return Tacotron(hparams)
|
|
@ -0,0 +1,67 @@
|
|||
import numpy as np
|
||||
import tensorflow as tf
|
||||
from tensorflow.contrib.seq2seq import Helper
|
||||
|
||||
|
||||
# Adapted from tf.contrib.seq2seq.GreedyEmbeddingHelper
|
||||
# Adapted from tf.contrib.seq2seq.GreedyEmbeddingHelper
class TacoTestHelper(Helper):
    '''Decoder helper that feeds each step's last output frame back in as the next input.'''

    def __init__(self, batch_size, output_dim, r):
        with tf.name_scope('TacoTestHelper'):
            self._batch_size = batch_size
            self._output_dim = output_dim
            # An output of output_dim * r zeros marks the end of a sequence.
            self._end_token = tf.tile([0.0], [output_dim * r])

    @property
    def batch_size(self):
        return self._batch_size

    def initialize(self, name=None):
        '''Returns (finished, first_inputs): nothing finished, all-zero <GO> frames.'''
        not_finished = tf.tile([False], [self._batch_size])
        return (not_finished, _go_frames(self._batch_size, self._output_dim))

    def sample(self, time, outputs, state, name=None):
        # Sample ids are unused, so return zeros for every batch entry.
        return tf.tile([0], [self._batch_size])

    def next_inputs(self, time, outputs, state, sample_ids, name=None):
        '''Stop on EOS. Otherwise, pass the last output as the next input and pass through state.'''
        with tf.name_scope('TacoTestHelper'):
            is_end = tf.equal(outputs, self._end_token)
            finished = tf.reduce_all(is_end, axis=1)
            # outputs is [N, output_dim * r]; the trailing output_dim values are the last frame.
            last_frame = outputs[:, -self._output_dim:]
            return (finished, last_frame, state)
|
||||
|
||||
|
||||
class TacoTrainingHelper(Helper):
    '''Decoder helper that feeds ground-truth target frames as decoder inputs.'''

    def __init__(self, inputs, targets, output_dim, r):
        # inputs is [N, T_in], targets is [N, T_out, D]
        with tf.name_scope('TacoTrainingHelper'):
            self._batch_size = tf.shape(inputs)[0]
            self._output_dim = output_dim

            # Only every r-th target frame is fed back as an input.
            self._targets = targets[:, r-1::r, :]

            # Every sequence uses the full number of steps: padding frames are deliberately
            # not masked.
            step_count = tf.shape(self._targets)[1]
            self._lengths = tf.tile([step_count], [self._batch_size])

    @property
    def batch_size(self):
        return self._batch_size

    def initialize(self, name=None):
        '''Returns (finished, first_inputs): nothing finished, all-zero <GO> frames.'''
        not_finished = tf.tile([False], [self._batch_size])
        return (not_finished, _go_frames(self._batch_size, self._output_dim))

    def sample(self, time, outputs, state, name=None):
        # Sample ids are unused, so return zeros for every batch entry.
        return tf.tile([0], [self._batch_size])

    def next_inputs(self, time, outputs, state, sample_ids, name=None):
        '''Returns the target frame for step `time` as next input; finished after the last step.'''
        with tf.name_scope(name or 'TacoTrainingHelper'):
            finished = (time + 1 >= self._lengths)
            target_frame = self._targets[:, time, :]
            return (finished, target_frame, state)
|
||||
|
||||
|
||||
def _go_frames(batch_size, output_dim):
    '''Returns all-zero <GO> frames for a given batch size and output dimension'''
    zero = [[0.0]]
    return tf.tile(zero, [batch_size, output_dim])
|
||||
|
|
@ -0,0 +1,101 @@
|
|||
import tensorflow as tf
|
||||
from tensorflow.contrib.rnn import GRUCell
|
||||
|
||||
|
||||
def prenet(inputs, is_training, layer_sizes=(256, 128), scope=None):
    '''Stack of ReLU dense layers, each followed by dropout.

    Args:
      inputs: Input tensor.
      is_training: Python bool; dropout (rate 0.5) is applied only when True.
      layer_sizes: Number of units of each dense layer, in order.
      scope: Variable scope name; defaults to 'prenet'.

    Returns:
      The output tensor of the last dropout layer.
    '''
    x = inputs
    drop_rate = 0.5 if is_training else 0.0
    with tf.variable_scope(scope or 'prenet'):
        for i, size in enumerate(layer_sizes):
            dense = tf.layers.dense(x, units=size, activation=tf.nn.relu, name='dense_%d' % (i+1))
            # tf.layers.dropout is a no-op unless training=True, so the flag must be passed
            # explicitly; with the default (False) the rate was silently ignored.
            x = tf.layers.dropout(dense, rate=drop_rate, training=is_training, name='dropout_%d' % (i+1))
    return x
|
||||
|
||||
|
||||
def encoder_cbhg(inputs, input_lengths, is_training):
    '''CBHG configured for the encoder: scope 'encoder_cbhg', K=16, projections [128, 128].'''
    encoder_projections = [128, 128]
    return cbhg(inputs, input_lengths, is_training, scope='encoder_cbhg', K=16,
                projections=encoder_projections)
|
||||
|
||||
|
||||
def post_cbhg(inputs, input_dim, is_training):
    '''CBHG for post-processing: scope 'post_cbhg', K=8, projections [256, input_dim].

    No sequence lengths are passed to the underlying RNN.
    '''
    post_projections = [256, input_dim]
    return cbhg(inputs, None, is_training, scope='post_cbhg', K=8,
                projections=post_projections)
|
||||
|
||||
|
||||
def cbhg(inputs, input_lengths, is_training, scope, K, projections):
    '''Convolution bank + highway network + bidirectional GRU.

    Args:
      inputs: Input tensor; it is added to the second projection as a residual.
      input_lengths: Passed as `sequence_length` to the bidirectional RNN (may be None).
      is_training: Forwarded to the batch normalization inside `conv1d`.
      scope: Variable scope for the whole module.
      K: Number of convolutions in the bank, with kernel sizes 1..K.
      projections: Channel counts of the two projection convolutions.

    Returns:
      The forward and backward RNN outputs concatenated on axis 2.
    '''
    with tf.variable_scope(scope):
        with tf.variable_scope('conv_bank'):
            # Convolution bank: outputs of all kernel sizes are stacked on the channel axis.
            bank = []
            for kernel_size in range(1, K + 1):
                bank.append(
                    conv1d(inputs, kernel_size, 128, tf.nn.relu, is_training, 'conv1d_%d' % kernel_size))
            conv_outputs = tf.concat(bank, axis=-1)

        # Stride-1 max pooling.
        pooled = tf.layers.max_pooling1d(conv_outputs, pool_size=2, strides=1, padding='same')

        # Two projection convolutions; the second one has no activation.
        first_projection = conv1d(pooled, 3, projections[0], tf.nn.relu, is_training, 'proj_1')
        second_projection = conv1d(first_projection, 3, projections[1], None, is_training, 'proj_2')

        # Residual connection with the module input.
        highway_input = second_projection + inputs

        # The highway layers are 128 units wide, so project when the width differs.
        if highway_input.shape[2] != 128:
            highway_input = tf.layers.dense(highway_input, 128)

        # 4-layer HighwayNet:
        for layer_number in range(1, 5):
            highway_input = highwaynet(highway_input, 'highway_%d' % layer_number)
        rnn_input = highway_input

        # Bidirectional GRU over the highway output.
        forward_cell = GRUCell(128)
        backward_cell = GRUCell(128)
        outputs, states = tf.nn.bidirectional_dynamic_rnn(
            forward_cell,
            backward_cell,
            rnn_input,
            sequence_length=input_lengths,
            dtype=tf.float32)
        # Concat forward and backward
        return tf.concat(outputs, axis=2)
|
||||
|
||||
|
||||
def highwaynet(inputs, scope):
    '''One 128-unit highway layer: H * T + inputs * (1 - T).'''
    with tf.variable_scope(scope):
        # Transformed input.
        H = tf.layers.dense(inputs, units=128, activation=tf.nn.relu, name='H')
        # Sigmoid gate; its bias is initialized to -1.0.
        gate_bias = tf.constant_initializer(-1.0)
        T = tf.layers.dense(
            inputs, units=128, activation=tf.nn.sigmoid, name='T', bias_initializer=gate_bias)
        carried = inputs * (1.0 - T)
        return H * T + carried
|
||||
|
||||
|
||||
def conv1d(inputs, kernel_size, channels, activation, is_training, scope):
    '''Same-padded 1-D convolution followed by batch normalization, under `scope`.'''
    with tf.variable_scope(scope):
        convolved = tf.layers.conv1d(
            inputs, filters=channels, kernel_size=kernel_size, activation=activation, padding='same')
        return tf.layers.batch_normalization(convolved, training=is_training)
|
|
@ -0,0 +1,55 @@
|
|||
import numpy as np
|
||||
import tensorflow as tf
|
||||
from tensorflow.contrib.rnn import RNNCell
|
||||
from .modules import prenet
|
||||
|
||||
|
||||
class DecoderPrenetWrapper(RNNCell):
    '''RNN cell wrapper that passes its inputs through a prenet before the wrapped cell.'''

    def __init__(self, cell, is_training):
        super(DecoderPrenetWrapper, self).__init__()
        self._cell = cell
        self._is_training = is_training

    @property
    def state_size(self):
        return self._cell.state_size

    @property
    def output_size(self):
        return self._cell.output_size

    def call(self, inputs, state):
        '''Runs `inputs` through the 'decoder_prenet' prenet, then through the wrapped cell.'''
        preprocessed = prenet(inputs, self._is_training, scope='decoder_prenet')
        return self._cell(preprocessed, state)

    def zero_state(self, batch_size, dtype):
        '''Delegates to the wrapped cell.'''
        return self._cell.zero_state(batch_size, dtype)
|
||||
|
||||
|
||||
|
||||
class ConcatOutputAndAttentionWrapper(RNNCell):
    '''Appends the attention context vector to the output of the wrapped RNN cell.

    The wrapped cell is expected to be an AttentionWrapper built with
    attention_layer_size=None and output_attention=False, so that its state carries an
    "attention" field holding the context vector.
    '''

    def __init__(self, cell):
        super(ConcatOutputAndAttentionWrapper, self).__init__()
        self._cell = cell

    @property
    def state_size(self):
        return self._cell.state_size

    @property
    def output_size(self):
        # Cell output followed by the attention context.
        attention_size = self._cell.state_size.attention
        return self._cell.output_size + attention_size

    def call(self, inputs, state):
        '''Returns (concat(cell output, new state's attention), new state).'''
        cell_output, next_state = self._cell(inputs, state)
        combined = tf.concat([cell_output, next_state.attention], axis=-1)
        return combined, next_state

    def zero_state(self, batch_size, dtype):
        '''Delegates to the wrapped cell.'''
        return self._cell.zero_state(batch_size, dtype)
|
|
@ -0,0 +1,149 @@
|
|||
import tensorflow as tf
|
||||
from tensorflow.contrib.rnn import GRUCell, MultiRNNCell, OutputProjectionWrapper, ResidualWrapper
|
||||
from tensorflow.contrib.seq2seq import BasicDecoder, BahdanauAttention, AttentionWrapper
|
||||
from util import textinput
|
||||
from util.infolog import log
|
||||
from .helpers import TacoTestHelper, TacoTrainingHelper
|
||||
from .modules import encoder_cbhg, post_cbhg, prenet
|
||||
from .rnn_wrappers import DecoderPrenetWrapper, ConcatOutputAndAttentionWrapper
|
||||
|
||||
|
||||
|
||||
class Tacotron():
    '''Tacotron graph builder: inference network, loss and optimizer.

    Call initialize() first, then add_loss() and add_optimizer() when training.
    '''

    def __init__(self, hparams):
        self._hparams = hparams

    def initialize(self, inputs, input_lengths, mel_targets=None, linear_targets=None):
        '''Initializes the model for inference.

        Sets "mel_outputs", "linear_outputs", and "alignments" fields.

        Args:
          inputs: int32 Tensor with shape [N, T_in] where N is batch size, T_in is number of
            steps in the input time series, and values are character IDs
          input_lengths: int32 Tensor with shape [N] where N is batch size and values are the lengths
            of each sequence in inputs.
          mel_targets: float32 Tensor with shape [N, T_out, M] where N is batch size, T_out is number
            of steps in the output time series, M is num_mels, and values are entries in the mel
            spectrogram. Only needed for training.
          linear_targets: float32 Tensor with shape [N, T_out, F] where N is batch_size, T_out is number
            of steps in the output time series, F is num_freq, and values are entries in the linear
            spectrogram. Only needed for training.
        '''
        with tf.variable_scope('inference'):
            # Training mode is inferred from the presence of linear targets.
            is_training = linear_targets is not None
            batch_size = tf.shape(inputs)[0]
            hp = self._hparams

            # Embeddings
            embedding_table = tf.get_variable(
                'embedding', [textinput.num_symbols(), 256], dtype=tf.float32,
                initializer=tf.truncated_normal_initializer(stddev=0.5))
            embedded_inputs = tf.nn.embedding_lookup(embedding_table, inputs)  # [N, T_in, 256]

            # Encoder
            prenet_outputs = prenet(embedded_inputs, is_training)  # [N, T_in, 128]
            encoder_outputs = encoder_cbhg(prenet_outputs, input_lengths, is_training)  # [N, T_in, 256]

            # Attention
            attention_cell = AttentionWrapper(
                DecoderPrenetWrapper(GRUCell(256), is_training),
                BahdanauAttention(256, encoder_outputs),
                alignment_history=True,
                output_attention=False)  # [N, T_in, 256]

            # Concatenate attention context vector and RNN cell output into a 512D vector.
            concat_cell = ConcatOutputAndAttentionWrapper(attention_cell)  # [N, T_in, 512]

            # Decoder (layers specified bottom to top):
            decoder_cell = MultiRNNCell([
                OutputProjectionWrapper(concat_cell, 256),
                ResidualWrapper(GRUCell(256)),
                ResidualWrapper(GRUCell(256))
            ], state_is_tuple=True)  # [N, T_in, 256]

            # Project onto r mel spectrograms (predict r outputs at each RNN step):
            output_cell = OutputProjectionWrapper(decoder_cell, hp.num_mels * hp.outputs_per_step)
            decoder_init_state = output_cell.zero_state(batch_size=batch_size, dtype=tf.float32)

            if is_training:
                helper = TacoTrainingHelper(inputs, mel_targets, hp.num_mels, hp.outputs_per_step)
            else:
                helper = TacoTestHelper(batch_size, hp.num_mels, hp.outputs_per_step)

            (decoder_outputs, _), final_decoder_state, _ = tf.contrib.seq2seq.dynamic_decode(
                BasicDecoder(output_cell, helper, decoder_init_state),
                maximum_iterations=hp.max_iters)  # [N, T_out/r, M*r]

            # Reshape outputs to be one output per entry
            mel_outputs = tf.reshape(decoder_outputs, [batch_size, -1, hp.num_mels])  # [N, T_out, M]

            # Add post-processing CBHG:
            post_outputs = post_cbhg(mel_outputs, hp.num_mels, is_training)  # [N, T_out, 256]
            linear_outputs = tf.layers.dense(post_outputs, hp.num_freq)  # [N, T_out, F]

            # Grab alignments from the final decoder state:
            alignments = tf.transpose(final_decoder_state[0].alignment_history.stack(), [1, 2, 0])

            self.inputs = inputs
            self.input_lengths = input_lengths
            self.mel_outputs = mel_outputs
            self.linear_outputs = linear_outputs
            self.alignments = alignments
            self.mel_targets = mel_targets
            self.linear_targets = linear_targets
            log('Initialized Tacotron model. Dimensions: ')
            log(' embedding: %d' % embedded_inputs.shape[-1])
            log(' prenet out: %d' % prenet_outputs.shape[-1])
            log(' encoder out: %d' % encoder_outputs.shape[-1])
            log(' attention out: %d' % attention_cell.output_size)
            log(' concat attn & out: %d' % concat_cell.output_size)
            log(' decoder cell out: %d' % decoder_cell.output_size)
            log(' decoder out (%d frames): %d' % (hp.outputs_per_step, decoder_outputs.shape[-1]))
            log(' decoder out (1 frame): %d' % mel_outputs.shape[-1])
            log(' postnet out: %d' % post_outputs.shape[-1])
            log(' linear out: %d' % linear_outputs.shape[-1])

    def add_loss(self):
        '''Adds loss to the model. Sets "loss" field. initialize must have been called.'''
        with tf.variable_scope('loss'):
            hp = self._hparams
            self.mel_loss = tf.reduce_mean(tf.abs(self.mel_targets - self.mel_outputs))
            l1 = tf.abs(self.linear_targets - self.linear_outputs)
            # Prioritize loss for frequencies under 3000 Hz.
            n_priority_freq = int(3000 / (hp.sample_rate * 0.5) * hp.num_freq)
            self.linear_loss = 0.5 * tf.reduce_mean(l1) + 0.5 * tf.reduce_mean(l1[:,:,0:n_priority_freq])
            self.loss = self.mel_loss + self.linear_loss

    def add_optimizer(self, global_step):
        '''Adds optimizer. Sets "gradients" and "optimize" fields. add_loss must have been called.

        "gradients" holds the raw (unclipped) gradients; "optimize" applies the gradients
        after clipping them to a global norm of 1.0.

        Args:
          global_step: int32 scalar Tensor representing current global step in training
        '''
        with tf.variable_scope('optimizer'):
            hp = self._hparams
            if hp.decay_learning_rate:
                self.learning_rate = _learning_rate_decay(hp.initial_learning_rate, global_step)
            else:
                self.learning_rate = tf.convert_to_tensor(hp.initial_learning_rate)
            optimizer = tf.train.AdamOptimizer(self.learning_rate, hp.adam_beta1, hp.adam_beta2)
            gradients, variables = zip(*optimizer.compute_gradients(self.loss))
            self.gradients = gradients
            clipped_gradients, _ = tf.clip_by_global_norm(gradients, 1.0)

            # Add dependency on UPDATE_OPS; otherwise batchnorm won't work correctly. See:
            # https://github.com/tensorflow/tensorflow/issues/1122
            with tf.control_dependencies(tf.get_collection(tf.GraphKeys.UPDATE_OPS)):
                # Apply the clipped gradients; the raw ones were previously applied here,
                # which made the clipping above a no-op.
                self.optimize = optimizer.apply_gradients(zip(clipped_gradients, variables),
                                                          global_step=global_step)
|
||||
|
||||
|
||||
def _learning_rate_decay(init_lr, global_step):
    '''Returns the learning rate for global_step under the Noam scheme from tensor2tensor.

    The rate is the smaller of a term that grows linearly with the step and a term that
    decays with the inverse square root of the step, using 4000 warmup steps.
    '''
    warmup_steps = 4000.0
    # Steps are counted from 1 so that the first step has a non-zero rate.
    step = tf.cast(global_step + 1, dtype=tf.float32)
    scale = init_lr * warmup_steps**0.5
    warmup_term = step * warmup_steps**-1.5
    decay_term = step**-0.5
    return scale * tf.minimum(warmup_term, decay_term)
|
|
@ -0,0 +1,50 @@
|
|||
import argparse
|
||||
import os
|
||||
from multiprocessing import cpu_count
|
||||
from tqdm import tqdm
|
||||
from datasets import blizzard, ljspeech
|
||||
from hparams import hparams
|
||||
|
||||
|
||||
def preprocess_blizzard(args):
    '''Preprocesses <base_dir>/Blizzard2012 into <base_dir>/<output> and writes its metadata.'''
    source_dir = os.path.join(args.base_dir, 'Blizzard2012')
    target_dir = os.path.join(args.base_dir, args.output)
    os.makedirs(target_dir, exist_ok=True)
    write_metadata(
        blizzard.build_from_path(source_dir, target_dir, args.num_workers, tqdm=tqdm),
        target_dir)
|
||||
|
||||
|
||||
def preprocess_ljspeech(args):
    '''Preprocesses <base_dir>/LJSpeech-1.0 into <base_dir>/<output> and writes its metadata.'''
    source_dir = os.path.join(args.base_dir, 'LJSpeech-1.0')
    target_dir = os.path.join(args.base_dir, args.output)
    os.makedirs(target_dir, exist_ok=True)
    write_metadata(
        ljspeech.build_from_path(source_dir, target_dir, args.num_workers, tqdm=tqdm),
        target_dir)
|
||||
|
||||
|
||||
def write_metadata(metadata, out_dir):
    '''Writes train.txt to out_dir and prints summary statistics.

    Each metadata entry becomes one line of its fields joined by "|". Entry field 2 is
    summed as a frame count and field 3 is measured with len() as the input.

    Args:
      metadata: sequence of indexable entries with at least four fields.
      out_dir: directory in which train.txt is created (overwritten if present).
    '''
    with open(os.path.join(out_dir, 'train.txt'), 'w') as f:
        for m in metadata:
            f.write('|'.join([str(x) for x in m]) + '\n')
    frames = sum([m[2] for m in metadata])
    hours = frames * hparams.frame_shift_ms / (3600 * 1000)
    print('Wrote %d utterances, %d frames (%.2f hours)' % (len(metadata), frames, hours))
    # default=0 keeps an empty dataset from raising ValueError after the file was written.
    print('Max input length: %d' % max((len(m[3]) for m in metadata), default=0))
    print('Max output length: %d' % max((m[2] for m in metadata), default=0))
|
||||
|
||||
|
||||
def main():
    '''Parses command-line options and runs the preprocessor for the chosen dataset.'''
    arg_parser = argparse.ArgumentParser()
    arg_parser.add_argument('--base_dir', default=os.path.expanduser('~/tacotron'))
    arg_parser.add_argument('--output', default='training')
    arg_parser.add_argument('--dataset', required=True, choices=['blizzard', 'ljspeech'])
    arg_parser.add_argument('--num_workers', type=int, default=cpu_count())
    args = arg_parser.parse_args()

    # argparse restricts --dataset to exactly these two keys.
    preprocessors = {
        'blizzard': preprocess_blizzard,
        'ljspeech': preprocess_ljspeech,
    }
    preprocessors[args.dataset](args)


if __name__ == "__main__":
    main()
|
|
@ -0,0 +1,9 @@
|
|||
falcon==1.2.0
|
||||
inflect==0.2.5
|
||||
librosa==0.5.1
|
||||
matplotlib==2.0.2
|
||||
numpy==1.13.0
|
||||
scipy==0.19.0
|
||||
tensorflow==1.2.0
|
||||
tensorflow-gpu==1.2.0
|
||||
tqdm==4.11.2
|
|
@ -0,0 +1,36 @@
|
|||
import io
|
||||
import numpy as np
|
||||
import tensorflow as tf
|
||||
from hparams import hparams
|
||||
from models import create_model
|
||||
from util import audio, textinput
|
||||
|
||||
|
||||
class Synthesizer:
    '''Loads a trained model from a checkpoint and converts text to WAV bytes.'''

    def load(self, checkpoint_path, model_name='tacotron'):
        '''Builds the inference graph for model_name and restores checkpoint_path into it.'''
        print('Constructing model: %s' % model_name)
        # Placeholders are fixed to a batch of one sequence.
        text_ids = tf.placeholder(tf.int32, [1, None], 'inputs')
        text_lengths = tf.placeholder(tf.int32, [1], 'input_lengths')
        with tf.variable_scope('model') as scope:
            self.model = create_model(model_name, hparams)
            self.model.initialize(text_ids, text_lengths)

        print('Loading checkpoint: %s' % checkpoint_path)
        self.session = tf.Session()
        self.session.run(tf.global_variables_initializer())
        tf.train.Saver().restore(self.session, checkpoint_path)

    def synthesize(self, text):
        '''Returns the WAV file contents, as bytes, of text spoken by the loaded model.'''
        sequence = textinput.to_sequence(
            text,
            force_lowercase=hparams.force_lowercase,
            expand_abbreviations=hparams.expand_abbreviations)
        feeds = {
            self.model.inputs: [np.asarray(sequence, dtype=np.int32)],
            self.model.input_lengths: np.asarray([len(sequence)], dtype=np.int32)
        }
        linear_spec = self.session.run(self.model.linear_outputs[0], feed_dict=feeds)
        wav_buffer = io.BytesIO()
        waveform = audio.inv_spectrogram(linear_spec.T)
        audio.save_wav(waveform, wav_buffer)
        return wav_buffer.getvalue()
|
|
@ -0,0 +1,41 @@
|
|||
import io
|
||||
from util import cmudict
|
||||
|
||||
|
||||
test_data = '''
|
||||
;;; # CMUdict -- Major Version: 0.07
|
||||
)PAREN P ER EH N
|
||||
'TIS T IH Z
|
||||
ADVERSE AE0 D V ER1 S
|
||||
ADVERSE(1) AE1 D V ER2 S
|
||||
ADVERSE(2) AE2 D V ER1 S
|
||||
ADVERSELY AE0 D V ER1 S L IY0
|
||||
ADVERSITY AE0 D V ER1 S IH0 T IY2
|
||||
BARBERSHOP B AA1 R B ER0 SH AA2 P
|
||||
YOU'LL Y UW1 L
|
||||
'''
|
||||
|
||||
|
||||
def test_cmudict():
    '''Checks entry count, symbol count and case-insensitive lookups on the fixture.'''
    c = cmudict.CMUDict(io.StringIO(test_data))
    assert len(c) == 6
    assert len(cmudict.valid_symbols) == 84
    expected_lookups = [
        ('ADVERSITY', ['AE0 D V ER1 S IH0 T IY2']),
        ('BarberShop', ['B AA1 R B ER0 SH AA2 P']),
        ("You'll", ['Y UW1 L']),
        ("'tis", ['T IH Z']),
        ('adverse', [
            'AE0 D V ER1 S',
            'AE1 D V ER2 S',
            'AE2 D V ER1 S',
        ]),
        # Unknown words and entries that were skipped while parsing give None.
        ('', None),
        ('foo', None),
        (')paren', None),
    ]
    for word, pronunciations in expected_lookups:
        assert c.lookup(word) == pronunciations
|
||||
|
||||
|
||||
def test_cmudict_no_keep_ambiguous():
    '''With keep_ambiguous=False, words with several pronunciations are dropped.'''
    source = io.StringIO(test_data)
    c = cmudict.CMUDict(source, keep_ambiguous=False)
    assert len(c) == 5
    assert c.lookup('ADVERSITY') == ['AE0 D V ER1 S IH0 T IY2']
    assert c.lookup('adverse') is None
|
|
@ -0,0 +1,51 @@
|
|||
from util.numbers import normalize
|
||||
|
||||
|
||||
def test_normalize_numbers():
    '''Plain, comma-grouped and decimal numbers are spelled out.'''
    cases = [
        ('1', 'one'),
        ('15', 'fifteen'),
        ('24', 'twenty-four'),
        ('100', 'one hundred'),
        ('101', 'one hundred one'),
        ('456', 'four hundred fifty-six'),
        ('1000', 'one thousand'),
        ('1800', 'eighteen hundred'),
        ('2,000', 'two thousand'),
        ('3000', 'three thousand'),
        ('18000', 'eighteen thousand'),
        ('24,000', 'twenty-four thousand'),
        ('124,001', 'one hundred twenty-four thousand one'),
        ('6.4 sec', 'six point four sec'),
    ]
    for text, spoken in cases:
        assert normalize(text) == spoken
|
||||
|
||||
|
||||
def test_normalize_ordinals():
    '''Ordinal suffixes (st/nd/rd/th) produce ordinal words.'''
    cases = [
        ('1st', 'first'),
        ('2nd', 'second'),
        ('9th', 'ninth'),
        ('243rd place', 'two hundred and forty-third place'),
    ]
    for text, spoken in cases:
        assert normalize(text) == spoken
|
||||
|
||||
|
||||
def test_normalize_dates():
    '''Four-digit numbers in the year range are read the way years are spoken.'''
    cases = [
        ('1400', 'fourteen hundred'),
        ('1901', 'nineteen oh one'),
        ('1999', 'nineteen ninety-nine'),
        ('2000', 'two thousand'),
        ('2004', 'two thousand four'),
        ('2010', 'twenty ten'),
        ('2012', 'twenty twelve'),
        ('2025', 'twenty twenty-five'),
        ('September 11, 2001', 'September eleven, two thousand one'),
        ('July 26, 1984.', 'July twenty-six, nineteen eighty-four.'),
    ]
    for text, spoken in cases:
        assert normalize(text) == spoken
|
||||
|
||||
|
||||
def test_normalize_money():
    '''Dollar and pound amounts are expanded into spoken currency.'''
    cases = [
        ('$0.00', 'zero dollars'),
        ('$1', 'one dollar'),
        ('$10', 'ten dollars'),
        ('$.01', 'one cent'),
        ('$0.25', 'twenty-five cents'),
        ('$5.00', 'five dollars'),
        ('$5.01', 'five dollars, one cent'),
        ('$135.99.', 'one hundred thirty-five dollars, ninety-nine cents.'),
        ('$40,000', 'forty thousand dollars'),
        ('for £2500!', 'for twenty-five hundred pounds!'),
    ]
    for text, spoken in cases:
        assert normalize(text) == spoken
|
|
@ -0,0 +1,59 @@
|
|||
from util.textinput import num_symbols, to_sequence, to_string
|
||||
|
||||
|
||||
def text_num_symbols():
    '''Checks the size of the symbol table.

    NOTE(review): the name starts with "text_", not "test_", so pytest's default
    collection will not run this function; confirm whether that is intended and
    whether the expected count is still current before renaming it.
    '''
    expected_count = 147
    assert num_symbols() == expected_count
|
||||
|
||||
|
||||
def test_to_sequence():
    '''Text maps to symbol IDs and every sequence ends with ID 1.'''
    assert to_sequence('') == [1]
    upper_h = to_sequence('H', force_lowercase=False)
    lower_h = to_sequence('H', force_lowercase=True)
    assert upper_h == [9, 1]
    assert lower_h == [35, 1]
    assert to_sequence('Hi.', force_lowercase=False) == [9, 36, 60, 1]
|
||||
|
||||
|
||||
def test_whitespace_nomalization():
    '''Leading and trailing whitespace is dropped in a round trip.'''
    cases = [
        ('', '~'),
        (' ', '~'),
        ('x', 'x~'),
        (' x ', 'x~'),
        (' x. y,z ', 'x. y,z~'),
        ('X: Y', 'X: Y~'),
    ]
    for text, restored in cases:
        assert round_trip(text) == restored
|
||||
|
||||
|
||||
def test_valid_chars():
    '''Supported characters survive a round trip; numbers and money are spelled out.'''
    cases = [
        ('x', 'x~'),
        ('Hello', 'Hello~'),
        ('3 apples and 44 bananas', 'three apples and forty-four bananas~'),
        ('$3.50 for gas.', 'three dollars, fifty cents for gas.~'),
        ('Hello, world!', 'Hello, world!~'),
        ("What (time-out)! He\'s going where?", "What (time-out)! He\'s going where?~"),
    ]
    for text, restored in cases:
        assert round_trip(text) == restored
|
||||
|
||||
|
||||
def test_invalid_chars():
    '''Unsupported characters do not appear in the round-tripped text.'''
    cases = [
        ('^', ' ~'),
        ('A~^B', 'A B~'),
        ('"Finally," she said, "it ended."', 'Finally, she said, it ended.~'),
    ]
    for text, restored in cases:
        assert round_trip(text) == restored
|
||||
|
||||
|
||||
def test_unicode():
    '''Accented letters come back as their unaccented ASCII counterparts.'''
    cases = [
        ('naïve café', 'naive cafe~'),
        ("raison d'être", "raison d'etre~"),
    ]
    for text, restored in cases:
        assert round_trip(text) == restored
|
||||
|
||||
|
||||
def test_arpabet():
    '''Curly-brace ARPAbet spans map to phoneme IDs and survive a round trip.'''
    assert to_sequence('{AE0 D}') == [70, 91, 1]
    unchanged_inputs = [
        '{AE0 D V ER1 S}',
        '{AE0 D V ER1 S} circumstances',
        'In {AE0 D V ER1 S} circumstances',
        '{AE0 D V ER1 S} {AE0 D S}',
        'X {AE0 D} Y {AE0 D} Z',
    ]
    for text in unchanged_inputs:
        # Each input comes back verbatim, followed by the "~" end marker.
        assert round_trip(text) == text + '~'
|
||||
|
||||
|
||||
def test_abbreviations():
    '''Known abbreviations are expanded; a look-alike word ending ("hammr.") is not.'''
    cases = [
        ('mr. rogers and dr. smith.', 'mister rogers and doctor smith.~'),
        ('hit it with a hammr.', 'hit it with a hammr.~'),
    ]
    for text, restored in cases:
        assert round_trip(text) == restored
|
||||
|
||||
|
||||
def round_trip(x):
    '''Encodes x to a symbol sequence (case kept, abbreviations expanded) and decodes it.'''
    sequence = to_sequence(x, force_lowercase=False, expand_abbreviations=True)
    return to_string(sequence)
|
|
@ -0,0 +1,152 @@
|
|||
import argparse
|
||||
from datetime import datetime
|
||||
import math
|
||||
import numpy as np
|
||||
import os
|
||||
import subprocess
|
||||
import time
|
||||
import tensorflow as tf
|
||||
import traceback
|
||||
|
||||
from datasets.datafeeder import DataFeeder
|
||||
from hparams import hparams, hparams_debug_string
|
||||
from models import create_model
|
||||
from util import audio, infolog, plot, textinput, ValueWindow
|
||||
# Module-level shorthand for infolog.log, used for all logging in this file.
log = infolog.log
|
||||
|
||||
|
||||
def get_git_commit():
    '''Logs and returns the first 10 characters of the current HEAD commit hash.

    check_output raises CalledProcessError if a git command exits non-zero, which
    "git diff-index --quiet HEAD" does when the working tree differs from HEAD.
    '''
    subprocess.check_output(['git', 'diff-index', '--quiet', 'HEAD'])  # Verify client is clean
    full_hash = subprocess.check_output(['git', 'rev-parse', 'HEAD']).decode().strip()
    commit = full_hash[:10]
    log('Git commit: %s' % commit)
    return commit
|
||||
|
||||
|
||||
def add_stats(model):
    '''Registers summaries for the model's tensors and returns the merged summary op.'''
    histograms = [
        ('linear_outputs', model.linear_outputs),
        ('linear_targets', model.linear_targets),
        ('mel_outputs', model.mel_outputs),
        ('mel_targets', model.mel_targets),
    ]
    scalars = [
        ('loss_mel', model.mel_loss),
        ('loss_linear', model.linear_loss),
        ('learning_rate', model.learning_rate),
        ('loss', model.loss),
    ]
    with tf.variable_scope('stats') as scope:
        for tag, tensor in histograms:
            tf.summary.histogram(tag, tensor)
        for tag, tensor in scalars:
            tf.summary.scalar(tag, tensor)
        # One norm per gradient tensor, summarized both as a histogram and by its maximum.
        gradient_norms = [tf.norm(grad) for grad in model.gradients]
        tf.summary.histogram('gradient_norm', gradient_norms)
        tf.summary.scalar('max_gradient_norm', tf.reduce_max(gradient_norms))
        return tf.summary.merge_all()
|
||||
|
||||
|
||||
def time_string():
    '''Returns the current local time formatted as "YYYY-MM-DD HH:MM".'''
    return '{:%Y-%m-%d %H:%M}'.format(datetime.now())
|
||||
|
||||
|
||||
def train(log_dir, args):
    '''Builds the data feeder and model, then runs the training loop until stopped.

    Checkpoints, audio samples, alignment plots and summaries are written under log_dir.

    Args:
      log_dir: directory for the checkpoint files, summaries and sample outputs.
      args: parsed command-line arguments; reads git, base_dir, input, model, restore_step,
        summary_interval and checkpoint_interval.
    '''
    commit = get_git_commit() if args.git else 'None'
    checkpoint_path = os.path.join(log_dir, 'model.ckpt')
    input_path = os.path.join(args.base_dir, args.input)
    log('Checkpoint path: %s' % checkpoint_path)
    log('Loading training data from: %s' % input_path)
    log('Using model: %s' % args.model)
    log(hparams_debug_string())

    # Set up DataFeeder:
    coord = tf.train.Coordinator()
    with tf.variable_scope('datafeeder') as scope:
        feeder = DataFeeder(coord, input_path, hparams)

    # Set up model:
    global_step = tf.Variable(0, name='global_step', trainable=False)
    with tf.variable_scope('model') as scope:
        model = create_model(args.model, hparams)
        model.initialize(feeder.inputs, feeder.input_lengths, feeder.mel_targets, feeder.linear_targets)
        model.add_loss()
        model.add_optimizer(global_step)
        stats = add_stats(model)

    # Bookkeeping:
    step = 0
    # Running averages over the last 100 steps of step duration and loss.
    time_window = ValueWindow(100)
    loss_window = ValueWindow(100)
    saver = tf.train.Saver(max_to_keep=5, keep_checkpoint_every_n_hours=2)

    # Train!
    with tf.Session() as sess:
        try:
            summary_writer = tf.summary.FileWriter(log_dir, sess.graph)
            sess.run(tf.global_variables_initializer())

            if args.restore_step:
                # Restore from a checkpoint if the user requested it.
                restore_path = '%s-%d' % (checkpoint_path, args.restore_step)
                saver.restore(sess, restore_path)
                log('Resuming from checkpoint: %s at commit: %s' % (restore_path, commit), slack=True)
            else:
                log('Starting new training run at commit: %s' % commit, slack=True)

            feeder.start_in_session(sess)

            while not coord.should_stop():
                start_time = time.time()
                # One optimization step; also fetches the global step and the loss value.
                step, loss, opt = sess.run([global_step, model.loss, model.optimize])
                time_window.append(time.time() - start_time)
                loss_window.append(loss)
                message = 'Step %-7d [%.03f sec/step, loss=%.05f, avg_loss=%.05f]' % (
                    step, time_window.average, loss, loss_window.average)
                # Only the messages on checkpoint steps are also sent to Slack.
                log(message, slack=(step % args.checkpoint_interval == 0))

                # Abort the run on a diverged loss; handled by the except clause below.
                if loss > 100 or math.isnan(loss):
                    log('Loss exploded to %.05f at step %d!' % (loss, step), slack=True)
                    raise Exception('Loss Exploded')

                if step % args.summary_interval == 0:
                    log('Writing summary at step: %d' % step)
                    summary_writer.add_summary(sess.run(stats), step)

                if step % args.checkpoint_interval == 0:
                    log('Saving checkpoint to: %s-%d' % (checkpoint_path, step))
                    saver.save(sess, checkpoint_path, global_step=step)
                    log('Saving audio and alignment...')
                    # Take the first example of a batch as the sample to render.
                    input_seq, spectrogram, alignment = sess.run([
                        model.inputs[0], model.linear_outputs[0], model.alignments[0]])
                    waveform = audio.inv_spectrogram(spectrogram.T)
                    audio.save_wav(waveform, os.path.join(log_dir, 'step-%d-audio.wav' % step))
                    plot.plot_alignment(alignment, os.path.join(log_dir, 'step-%d-align.png' % step),
                        info='%s, %s, %s, step=%d, loss=%.5f' % (args.model, commit, time_string(), step, loss))
                    log('Input: %s' % textinput.to_string(input_seq))

        except Exception as e:
            # Report the failure, print the traceback and ask the coordinator to stop.
            log('Exiting due to exception: %s' % e, slack=True)
            traceback.print_exc()
            coord.request_stop(e)
|
||||
|
||||
|
||||
def main():
    '''Parses command-line options, sets up logging and hyperparameters, and starts training.'''
    arg_parser = argparse.ArgumentParser()
    arg_parser.add_argument('--base_dir', default=os.path.expanduser('~/tacotron'))
    arg_parser.add_argument('--input', default='training/train.txt')
    arg_parser.add_argument('--model', default='tacotron')
    arg_parser.add_argument(
        '--name', help='Name of the run. Used for logging. Defaults to model name.')
    arg_parser.add_argument(
        '--hparams', default='',
        help='Hyperparameter overrides as a comma-separated list of name=value pairs')
    arg_parser.add_argument(
        '--restore_step', type=int, help='Global step to restore from checkpoint.')
    arg_parser.add_argument(
        '--summary_interval', type=int, default=100,
        help='Steps between running summary ops.')
    arg_parser.add_argument(
        '--checkpoint_interval', type=int, default=1000,
        help='Steps between writing checkpoints.')
    arg_parser.add_argument('--slack_url', help='Slack webhook URL to get periodic reports.')
    arg_parser.add_argument(
        '--tf_log_level', type=int, default=1, help='Tensorflow C++ log level.')
    arg_parser.add_argument(
        '--git', action='store_true', help='If set, verify that the client is clean.')
    args = arg_parser.parse_args()

    os.environ['TF_CPP_MIN_LOG_LEVEL'] = str(args.tf_log_level)
    # The run name falls back to the model name and determines the log directory.
    run_name = args.name or args.model
    log_dir = os.path.join(args.base_dir, 'logs-%s' % run_name)
    os.makedirs(log_dir, exist_ok=True)
    log_file = os.path.join(log_dir, 'train.log')
    infolog.init(log_file, run_name, args.slack_url)
    hparams.parse(args.hparams)
    train(log_dir, args)


if __name__ == "__main__":
    main()
|
|
@ -0,0 +1,22 @@
|
|||
class ValueWindow():
    '''Keeps the most recent window_size values and exposes their sum, count and average.'''

    def __init__(self, window_size=100):
        self._window_size = window_size
        self._values = []

    def append(self, x):
        '''Adds x, discarding the oldest values so that at most window_size remain.'''
        # Trim after appending. Slicing the old list with [-(window_size - 1):] first
        # turns into [-0:] (the whole list) for window_size == 1, so that window never
        # shrank. A size below 1 keeps a single value, as it did before.
        keep = max(1, self._window_size)
        self._values = (self._values + [x])[-keep:]

    @property
    def sum(self):
        return sum(self._values)

    @property
    def count(self):
        return len(self._values)

    @property
    def average(self):
        # max(1, count) makes the average of an empty window 0 instead of dividing by zero.
        return self.sum / max(1, self.count)

    def reset(self):
        '''Discards all stored values.'''
        self._values = []
|
|
@ -0,0 +1,101 @@
|
|||
import librosa
|
||||
import librosa.filters
|
||||
import math
|
||||
import numpy as np
|
||||
from scipy import signal
|
||||
from hparams import hparams
|
||||
|
||||
|
||||
def load_wav(path):
    '''Loads the audio at path with librosa at hparams.sample_rate and returns the samples.'''
    # librosa's load returns a pair; only its first element (the samples) is kept.
    samples, _ = librosa.core.load(path, sr=hparams.sample_rate)
    return samples
|
||||
|
||||
|
||||
def save_wav(wav, path):
    '''Writes wav to path as 16-bit integer audio at hparams.sample_rate.

    Args:
      wav: array of samples, scaled by 32767 before conversion to int16.
      path: destination passed to librosa.output.write_wav.
    '''
    # Clip to [-1, 1] first: anything outside that range does not fit in int16 after
    # scaling and would be corrupted by the cast. In-range samples are unaffected.
    wav = (np.clip(wav, -1.0, 1.0) * 32767).astype(np.int16)
    librosa.output.write_wav(path, wav, hparams.sample_rate)
|
||||
|
||||
|
||||
def spectrogram(y):
    '''Returns the normalized dB-scale magnitude spectrogram of waveform y.'''
    magnitudes = np.abs(_stft(_preemphasis(y)))
    # Levels are expressed relative to hparams.ref_level_db before normalization.
    decibels = _amp_to_db(magnitudes) - hparams.ref_level_db
    return _normalize(decibels)
|
||||
|
||||
|
||||
def inv_spectrogram(spectrogram):
    '''Converts a normalized spectrogram (as produced by spectrogram()) back to a waveform.'''
    decibels = _denormalize(spectrogram) + hparams.ref_level_db
    magnitudes = _db_to_amp(decibels)  # Convert back to linear
    # Magnitudes are raised to the power 1.2 before Griffin-Lim reconstructs the phase.
    waveform = _griffin_lim(magnitudes ** 1.2)
    return _inv_preemphasis(waveform)
|
||||
|
||||
|
||||
def melspectrogram(y):
    '''Returns the normalized dB-scale mel spectrogram of waveform y.'''
    magnitudes = np.abs(_stft(_preemphasis(y)))
    mel_decibels = _amp_to_db(_linear_to_mel(magnitudes))
    return _normalize(mel_decibels)
|
||||
|
||||
|
||||
def inv_melspectrogram(melspectrogram):
    '''Converts a normalized mel spectrogram (as produced by melspectrogram()) to a waveform.'''
    mel_amplitudes = _db_to_amp(_denormalize(melspectrogram))
    magnitudes = _mel_to_linear(mel_amplitudes)  # Convert back to linear
    # Magnitudes are raised to the power 1.2 before Griffin-Lim reconstructs the phase.
    waveform = _griffin_lim(magnitudes ** 1.2)
    return _inv_preemphasis(waveform)
|
||||
|
||||
|
||||
# Based on https://github.com/librosa/librosa/issues/434
|
||||
def _griffin_lim(S):
    '''Estimates a waveform from magnitude spectrogram S by iterative phase reconstruction.

    Starts from random phases and runs hparams.griffin_lim_iters iterations, each
    re-deriving the phases from the STFT of the previous waveform estimate.
    '''
    angles = np.exp(2j * np.pi * np.random.rand(*S.shape))
    # The builtin complex is the same dtype the deprecated np.complex alias referred to.
    S_complex = np.abs(S).astype(complex)
    for i in range(hparams.griffin_lim_iters):
        if i > 0:
            # Keep only the phase of the current estimate; magnitudes always come from S.
            angles = np.exp(1j * np.angle(_stft(y)))
        y = _istft(S_complex * angles)
    return y
|
||||
|
||||
|
||||
def _stft(y):
    '''Returns librosa's STFT of y with FFT size, hop and window taken from hparams.'''
    samples_per_ms = hparams.sample_rate
    hop = int(hparams.frame_shift_ms / 1000 * samples_per_ms)
    window = int(hparams.frame_length_ms / 1000 * samples_per_ms)
    # num_freq bins correspond to an FFT of size (num_freq - 1) * 2.
    fft_size = (hparams.num_freq - 1) * 2
    return librosa.stft(y=y, n_fft=fft_size, hop_length=hop, win_length=window)
|
||||
|
||||
|
||||
def _istft(y):
    '''Returns librosa's inverse STFT of y using the hop and window lengths from hparams.'''
    samples_per_ms = hparams.sample_rate
    hop = int(hparams.frame_shift_ms / 1000 * samples_per_ms)
    window = int(hparams.frame_length_ms / 1000 * samples_per_ms)
    return librosa.istft(y, hop_length=hop, win_length=window)
|
||||
|
||||
|
||||
# Conversions:
|
||||
|
||||
# Lazily built caches: _linear_to_mel fills _mel_basis on first use, and
# _mel_to_linear fills _inv_mel_basis with the pseudo-inverse of the mel basis.
_mel_basis = None
_inv_mel_basis = None
|
||||
|
||||
def _linear_to_mel(spectrogram):
    '''Projects a linear spectrogram onto the mel basis (built on first call, then cached).'''
    global _mel_basis
    basis = _mel_basis
    if basis is None:
        basis = _mel_basis = _build_mel_basis()
    return np.dot(basis, spectrogram)
|
||||
|
||||
def _mel_to_linear(mel_spectrogram):
    '''Maps a mel spectrogram back to linear bins via the pseudo-inverse of the mel basis.

    The pseudo-inverse is computed on first call and cached; results are floored at 1e-10.
    '''
    global _inv_mel_basis
    inverse = _inv_mel_basis
    if inverse is None:
        inverse = _inv_mel_basis = np.linalg.pinv(_build_mel_basis())
    linear = np.dot(inverse, mel_spectrogram)
    return np.maximum(1e-10, linear)
|
||||
|
||||
def _build_mel_basis():
    '''Builds the librosa mel filterbank for the sample rate, FFT size and num_mels in hparams.'''
    fft_size = (hparams.num_freq - 1) * 2
    return librosa.filters.mel(sr=hparams.sample_rate, n_fft=fft_size, n_mels=hparams.num_mels)
|
||||
|
||||
def _amp_to_db(x):
    '''Converts amplitudes to decibels (20 * log10), flooring the input at 1e-5.'''
    floored = np.maximum(1e-5, x)
    return 20 * np.log10(floored)
|
||||
|
||||
def _db_to_amp(x):
    '''Converts decibels to amplitudes: 10 ** (x * 0.05).'''
    exponent = x * 0.05
    return np.power(10.0, exponent)
|
||||
|
||||
def _preemphasis(x):
    '''Filters x with numerator [1, -hparams.preemphasis] and denominator [1].'''
    numerator = [1, -hparams.preemphasis]
    denominator = [1]
    return signal.lfilter(numerator, denominator, x)
|
||||
|
||||
def _inv_preemphasis(x):
    '''Undoes _preemphasis by filtering x with the coefficient lists swapped.'''
    numerator = [1]
    denominator = [1, -hparams.preemphasis]
    return signal.lfilter(numerator, denominator, x)
|
||||
|
||||
def _normalize(S):
    '''Rescales S from [hparams.min_level_db, 0] to [0, 1], clipping values outside.'''
    floor_db = hparams.min_level_db
    scaled = (S - floor_db) / -floor_db
    return np.clip(scaled, 0, 1)
|
||||
|
||||
def _denormalize(S):
    '''Clips S to [0, 1] and rescales it to [hparams.min_level_db, 0].'''
    floor_db = hparams.min_level_db
    clipped = np.clip(S, 0, 1)
    return (clipped * -floor_db) + floor_db
|
|
@ -0,0 +1,63 @@
|
|||
import re
|
||||
|
||||
|
||||
# ARPAbet symbols accepted in pronunciations.
valid_symbols = [
    'AA', 'AA0', 'AA1', 'AA2', 'AE', 'AE0', 'AE1', 'AE2', 'AH', 'AH0', 'AH1', 'AH2',
    'AO', 'AO0', 'AO1', 'AO2', 'AW', 'AW0', 'AW1', 'AW2', 'AY', 'AY0', 'AY1', 'AY2',
    'B', 'CH', 'D', 'DH', 'EH', 'EH0', 'EH1', 'EH2', 'ER', 'ER0', 'ER1', 'ER2', 'EY',
    'EY0', 'EY1', 'EY2', 'F', 'G', 'HH', 'IH', 'IH0', 'IH1', 'IH2', 'IY', 'IY0', 'IY1',
    'IY2', 'JH', 'K', 'L', 'M', 'N', 'NG', 'OW', 'OW0', 'OW1', 'OW2', 'OY', 'OY0',
    'OY1', 'OY2', 'P', 'R', 'S', 'SH', 'T', 'TH', 'UH', 'UH0', 'UH1', 'UH2', 'UW',
    'UW0', 'UW1', 'UW2', 'V', 'W', 'Y', 'Z', 'ZH'
]

_valid_symbol_set = set(valid_symbols)


class CMUDict:
    '''Thin wrapper around CMUDict data. http://www.speech.cs.cmu.edu/cgi-bin/cmudict'''

    def __init__(self, file_or_path, keep_ambiguous=True):
        '''Loads the dictionary.

        Args:
          file_or_path: a path (str), opened as latin-1 text, or an iterable of lines.
          keep_ambiguous: if False, words with more than one pronunciation are dropped.
        '''
        if isinstance(file_or_path, str):
            with open(file_or_path, encoding='latin-1') as f:
                entries = _parse_cmudict(f)
        else:
            entries = _parse_cmudict(file_or_path)
        if not keep_ambiguous:
            entries = {word: pron for word, pron in entries.items() if len(pron) == 1}
        self._entries = entries

    def __len__(self):
        return len(self._entries)

    def lookup(self, word):
        '''Returns list of ARPAbet pronunciations of the given word, or None if unknown.'''
        return self._entries.get(word.upper())


# Matches the "(1)", "(2)", ... suffix that marks an alternate pronunciation.
_alt_re = re.compile(r'\([0-9]+\)')


def _parse_cmudict(file):
    '''Parses dictionary lines into {word: [pronunciation, ...]}.

    Only lines starting with A-Z or an apostrophe are entries. Alternate-pronunciation
    suffixes are removed from the word, and entries whose pronunciation contains an
    unknown symbol are skipped.
    '''
    cmudict = {}
    for line in file:
        if line and ('A' <= line[0] <= 'Z' or line[0] == "'"):
            # Split once on the first run of whitespace: everything after the word is the
            # pronunciation. Splitting on every single space kept only the first phoneme.
            parts = line.split(None, 1)
            if len(parts) < 2:
                continue  # A word without a pronunciation; nothing to record.
            word = _alt_re.sub('', parts[0])
            pronunciation = _get_pronunciation(parts[1])
            if pronunciation:
                cmudict.setdefault(word, []).append(pronunciation)
    return cmudict


def _get_pronunciation(s):
    '''Returns s with single-space separators, or None if any symbol is not valid.'''
    parts = s.split()
    if not parts:
        return None
    for part in parts:
        if part not in _valid_symbol_set:
            return None
    return ' '.join(parts)
|
|
@ -0,0 +1,50 @@
|
|||
import atexit
|
||||
from datetime import datetime
|
||||
import json
|
||||
from threading import Thread
|
||||
from urllib.request import Request, urlopen
|
||||
|
||||
|
||||
# strftime format for log-file timestamps; log() trims the last three digits of %f.
_format = '%Y-%m-%d %H:%M:%S.%f'
# Module state set by init(): the open log file, the run name and the Slack webhook URL.
_file = None
_run_name = None
_slack_url = None
|
||||
|
||||
|
||||
def init(filename, run_name, slack_url=None):
    '''Opens filename for appending as the log file and records the run name and Slack URL.

    Any log file opened by an earlier call is closed first. A banner marking the start
    of a new training run is written to the file.
    '''
    global _file, _run_name, _slack_url
    _close_logfile()
    rule = '-----------------------------------------------------------------\n'
    _file = open(filename, 'a')
    _file.write('\n' + rule)
    _file.write('Starting new training run\n')
    _file.write(rule)
    _run_name = run_name
    _slack_url = slack_url
|
||||
|
||||
|
||||
def log(msg, slack=False):
    '''Prints msg, appends it with a timestamp to the log file, and optionally posts it to Slack.

    The file is written only after init() has opened one; the Slack post happens on a
    separate thread and only when slack is true and a Slack URL was configured.
    '''
    print(msg)
    if _file is not None:
        # Dropping the last three characters of the %f field leaves milliseconds.
        timestamp = datetime.now().strftime(_format)[:-3]
        _file.write('[%s] %s\n' % (timestamp, msg))
    if slack and _slack_url is not None:
        sender = Thread(target=_send_slack, args=(msg,))
        sender.start()
|
||||
|
||||
|
||||
def _close_logfile():
    '''Closes the current log file, if one is open, and clears the module reference.'''
    global _file
    if _file is None:
        return
    _file.close()
    _file = None
|
||||
|
||||
|
||||
def _send_slack(msg):
    '''Posts msg, prefixed with the run name in bold, to the configured Slack webhook as JSON.'''
    payload = {
        'username': 'tacotron',
        'icon_emoji': ':taco:',
        'text': '*%s*: %s' % (_run_name, msg)
    }
    body = json.dumps(payload).encode()
    request = Request(_slack_url)
    request.add_header('Content-Type', 'application/json')
    urlopen(request, body)
|
||||
|
||||
|
||||
# Close the log file, if one is open, when the interpreter exits.
atexit.register(_close_logfile)
|
|
@ -0,0 +1,69 @@
|
|||
import inflect
|
||||
import re
|
||||
|
||||
|
||||
# Shared inflect engine used to spell out numbers.
_inflect = inflect.engine()
# Digits and commas that start and end with a digit, e.g. "24,000".
_comma_number_re = re.compile(r'([0-9][0-9\,]+[0-9])')
# Digits, a decimal point, then more digits, e.g. "6.4".
_decimal_number_re = re.compile(r'([0-9]+\.[0-9]+)')
# A pound sign followed by an amount; the amount is captured.
_pounds_re = re.compile(r'£([0-9\,]*[0-9]+)')
# A dollar sign followed by an amount that may contain dots and commas; the amount is captured.
_dollars_re = re.compile(r'\$([0-9\.\,]*[0-9]+)')
# Digits followed by an ordinal suffix, e.g. "243rd".
_ordinal_re = re.compile(r'[0-9]+(st|nd|rd|th)')
# Any run of digits.
_number_re = re.compile(r'[0-9]+')
|
||||
|
||||
|
||||
def _remove_commas(m):
    '''Returns the first captured group of match m with every comma removed.'''
    return ''.join(m.group(1).split(','))
|
||||
|
||||
|
||||
def _expand_decimal_point(m):
    '''Returns the first captured group of match m with each "." replaced by " point ".'''
    return ' point '.join(m.group(1).split('.'))
|
||||
|
||||
|
||||
def _expand_dollars(m):
    """Spell out a dollar amount captured by a regex match.

    Args:
        m: Match object whose first group is the amount without the dollar
            sign (digits, optionally with '.' and ',').

    Returns:
        A phrase such as '5 dollars, 50 cents', '1 dollar', '1 cent' or
        'zero dollars'. Numbers stay as digits here; they are left for a later
        number-expansion step. If the amount has more than one decimal point,
        the captured text is returned unchanged followed by ' dollars'.
    """
    raw = m.group(1)
    # The capturing pattern admits thousands separators, which int() rejects,
    # so drop them before parsing.
    parts = raw.replace(',', '').split('.')
    if len(parts) > 2:
        return raw + ' dollars'  # Unexpected format
    dollars = int(parts[0]) if parts[0] else 0
    cents = int(parts[1]) if len(parts) > 1 and parts[1] else 0

    spoken = []
    if dollars:
        spoken.append('%s %s' % (dollars, 'dollar' if dollars == 1 else 'dollars'))
    if cents:
        spoken.append('%s %s' % (cents, 'cent' if cents == 1 else 'cents'))
    return ', '.join(spoken) if spoken else 'zero dollars'
|
||||
|
||||
|
||||
def _expand_ordinal(m):
    """Spell out the ordinal matched by `m` (e.g. '21st') using inflect."""
    ordinal = m.group(0)
    return _inflect.number_to_words(ordinal)
|
||||
|
||||
|
||||
def _expand_number(m):
    """Spell out the integer matched by `m`.

    Values strictly between 1000 and 3000 get special handling:
      * 2000 becomes 'two thousand';
      * 2001-2009 become 'two thousand <n>';
      * exact hundreds become '<n> hundred';
      * anything else is passed to inflect with group=2 and zero='oh', and the
        ', ' separators in the result are replaced by spaces.
    Every other value is spelled out by inflect with the 'and' word suppressed.
    """
    num = int(m.group(0))
    if not 1000 < num < 3000:
        return _inflect.number_to_words(num, andword='')
    if num == 2000:
        return 'two thousand'
    if 2000 < num < 2010:
        return 'two thousand ' + _inflect.number_to_words(num % 100)
    if num % 100 == 0:
        return _inflect.number_to_words(num // 100) + ' hundred'
    grouped = _inflect.number_to_words(num, andword='', zero='oh', group=2)
    return grouped.replace(', ', ' ')
|
||||
|
||||
|
||||
def normalize(text):
    """Rewrite the numeric expressions in `text` as words.

    The substitutions run in a fixed order: comma removal inside numbers,
    pound amounts, dollar amounts, decimal points, ordinals, and finally any
    remaining digit runs.
    """
    substitutions = (
        (_comma_number_re, _remove_commas),
        (_pounds_re, r'\1 pounds'),
        (_dollars_re, _expand_dollars),
        (_decimal_number_re, _expand_decimal_point),
        (_ordinal_re, _expand_ordinal),
        (_number_re, _expand_number),
    )
    for pattern, replacement in substitutions:
        text = pattern.sub(replacement, text)
    return text
|
|
@ -0,0 +1,20 @@
|
|||
import matplotlib
|
||||
matplotlib.use('Agg')
|
||||
import matplotlib.pyplot as plt
|
||||
|
||||
|
||||
def plot_alignment(alignment, path, info=None):
    """Render `alignment` as a heat map and save it as a PNG file.

    Args:
        alignment: Image data handed to `ax.imshow`. The axis labels imply
            rows are encoder timesteps and columns are decoder timesteps.
        path: Destination passed to `plt.savefig`.
        info: Optional text appended below the x-axis label.
    """
    fig, ax = plt.subplots()
    try:
        im = ax.imshow(
            alignment,
            aspect='auto',
            origin='lower',
            interpolation='none')
        fig.colorbar(im, ax=ax)
        xlabel = 'Decoder timestep'
        if info is not None:
            xlabel += '\n\n' + info
        plt.xlabel(xlabel)
        plt.ylabel('Encoder timestep')
        plt.tight_layout()
        plt.savefig(path, format='png')
    finally:
        # pyplot keeps every figure alive until it is closed; without this,
        # repeated calls accumulate open figures.
        plt.close(fig)
|
|
@ -0,0 +1,105 @@
|
|||
import re
|
||||
import unicodedata
|
||||
from util import cmudict, numbers
|
||||
|
||||
|
||||
# Input alphabet (63 symbols: letters, punctuation and space), plus ARPAbet
# (84 symbols per the original note; the list comes from cmudict.valid_symbols):
_pad = '_'
_eos = '~'
_uppercase = 'ABCDEFGHIJKLMNOPQRSTUVWXYZ'
_lowercase = 'abcdefghijklmnopqrstuvwxyz'
_punctuation = '!\'(),-.:;?'
_space = ' '

_valid_input_chars = _uppercase + _lowercase + _punctuation + _space
# Maps every character with code point < 256 that is not a valid input
# character to a space.
_trans_table = str.maketrans({chr(i): ' ' for i in range(256) if chr(i) not in _valid_input_chars})

# Plain-character symbols occupy the low IDs, starting with pad and eos.
_normal_symbols = _pad + _eos + _valid_input_chars
_num_normal_symbols = len(_normal_symbols)
_id_to_char = dict(enumerate(_normal_symbols))
_char_to_id = {c: i for i, c in enumerate(_normal_symbols)}

# ARPAbet symbols are numbered directly after the plain-character symbols.
_arpabet_to_id = {sym: i for i, sym in enumerate(cmudict.valid_symbols, _num_normal_symbols)}
_id_to_arpabet = dict(enumerate(cmudict.valid_symbols, _num_normal_symbols))
_num_symbols = _num_normal_symbols + len(cmudict.valid_symbols)

# Splits text into (text before, ARPAbet inside curly braces, text after).
_arpabet_re = re.compile(r'(.*?)\{([A-Z0-2 ]+?)\}(.*)')
_whitespace_re = re.compile(r'\s+')
|
||||
|
||||
|
||||
def num_symbols():
    '''Returns the total symbol count: plain characters plus ARPAbet symbols.'''
    return _num_symbols
|
||||
|
||||
|
||||
def to_sequence(text, force_lowercase=True, expand_abbreviations=True):
    '''Converts a string of text to a sequence of symbol IDs.

    The text is stripped, double quotes are removed, and it is reduced to
    ASCII (NFKD normalization, then dropping anything that still is not
    ASCII). Segments in curly braces are treated as ARPAbet symbols; all other
    text is converted character by character. The end-of-sequence ID is
    always appended.

    Args:
        text: Input string, optionally with "{...}" ARPAbet segments.
        force_lowercase: Lowercase the plain-text segments when true.
        expand_abbreviations: Expand known abbreviations in plain-text
            segments when true.

    Returns:
        List of integer symbol IDs ending with the EOS ID.
    '''
    remaining = text.strip().replace('"', '')
    remaining = unicodedata.normalize('NFKD', remaining).encode('ascii', 'ignore').decode()

    ids = []
    while remaining:
        braces = _arpabet_re.match(remaining)
        if braces is None:
            # No more ARPAbet segments: the rest is plain text.
            ids.extend(_text_to_sequence(remaining, force_lowercase, expand_abbreviations))
            break
        before, arpabet, remaining = braces.groups()
        ids.extend(_text_to_sequence(before, force_lowercase, expand_abbreviations))
        ids.extend(_arpabet_to_sequence(arpabet))
    ids.append(_char_to_id[_eos])
    return ids
|
||||
|
||||
|
||||
def to_string(sequence, remove_eos=False):
    '''Returns the string for a sequence of symbol IDs.

    Plain-character IDs map back to their characters. ARPAbet IDs are rendered
    inside curly braces, with adjacent ARPAbet symbols merged into one
    space-separated group. IDs at or above the symbol count are skipped.

    Args:
        sequence: Iterable of integer symbol IDs.
        remove_eos: When true, drop a trailing end-of-sequence character.

    Returns:
        The decoded string; empty when `sequence` yields no known IDs.
    '''
    parts = []
    for sym in sequence:
        if sym < _num_normal_symbols:
            parts.append(_id_to_char[sym])
        elif sym < _num_symbols:
            parts.append('{%s}' % _id_to_arpabet[sym])
    s = ''.join(parts).replace('}{', ' ')
    # Guard against an empty result: indexing s[-1] on '' raises IndexError.
    if remove_eos and s and s[-1] == _eos:
        s = s[:-1]
    return s
|
||||
|
||||
|
||||
def _text_to_sequence(text, force_lowercase, expand_abbreviations):
    '''Converts a plain-text segment (no ARPAbet) to a list of character IDs.

    Steps, in order: spell out numbers, replace characters in the translation
    table with spaces, optionally lowercase, optionally expand abbreviations,
    collapse whitespace runs to a single space, and look up each character.
    '''
    cleaned = numbers.normalize(text).translate(_trans_table)
    if force_lowercase:
        cleaned = cleaned.lower()
    if expand_abbreviations:
        cleaned = _expand_abbreviations(cleaned)
    cleaned = _whitespace_re.sub(' ', cleaned)
    return [_char_to_id[char] for char in cleaned]
|
||||
|
||||
|
||||
# (compiled pattern, expansion) pairs. Each pattern matches the abbreviation
# at a word boundary, followed by a literal period, ignoring case.
# NOTE(review): 'misess' looks like a misspelling of 'missus' — confirm before
# changing, since the expansion text is fed to the model as input.
_abbreviations = [
    (re.compile('\\b%s\\.' % abbreviation, re.IGNORECASE), expansion)
    for abbreviation, expansion in (
        ('mrs', 'misess'),
        ('mr', 'mister'),
        ('dr', 'doctor'),
        ('st', 'saint'),
        ('co', 'company'),
        ('jr', 'junior'),
        ('maj', 'major'),
        ('gen', 'general'),
        ('drs', 'doctors'),
        ('rev', 'reverend'),
        ('lt', 'lieutenant'),
        ('hon', 'honorable'),
        ('sgt', 'sergeant'),
        ('capt', 'captain'),
        ('esq', 'esquire'),
        ('ltd', 'limited'),
        ('col', 'colonel'),
        ('ft', 'fort'),
    )
]
|
||||
|
||||
def _expand_abbreviations(text):
    '''Replaces each known abbreviation in `text` with its expansion, in table order.'''
    expanded = text
    for pattern, spoken_form in _abbreviations:
        expanded = pattern.sub(spoken_form, expanded)
    return expanded
|
||||
|
||||
|
||||
def _arpabet_to_sequence(text):
    '''Converts whitespace-separated ARPAbet symbols to IDs, skipping unknown symbols.'''
    symbols = text.split()
    return [_arpabet_to_id[symbol] for symbol in symbols if symbol in _arpabet_to_id]
|
Loading…
Reference in New Issue