Add support for CSS10 datasets and improved docker image to reuse pip install

2018-11-25 21:18:45 +01:00 · 2018-11-25 21:18:45 +01:00 · 531ef65f87
parent 00b92b5c84
commit 531ef65f87
5 changed files with 75 additions and 6 deletions
--- a/.dockerignore
+++ b/.dockerignore
@ -0,0 +1 @@
+.git/
--- a/README.md
+++ b/README.md
@ -63,6 +63,7 @@ Contributions are accepted! We'd love the communities help in building a better
    * [LJ Speech](https://keithito.com/LJ-Speech-Dataset/) (Public Domain)
    * [Blizzard 2012](http://www.cstr.ed.ac.uk/projects/blizzard/2012/phase_one) (Creative Commons Attribution Share-Alike)
    * [M-ailabs](http://www.m-ailabs.bayern/en/the-mailabs-speech-dataset/)
+    * [CSS10: A Collection of Single Speaker Speech Datasets for 10 Languages](https://github.com/Kyubyong/css10)

   You can use other datasets if you convert them to the right format. See [TRAINING_DATA.md](TRAINING_DATA.md) for more info.

@ -90,7 +91,17 @@ Contributions are accepted! We'd love the communities help in building a better
             |- lab
             |- wav
   ```
-   
+
+   Alternatively, for the German CSS10 dataset, use the following layout (make sure to adjust `text/symbols.py` so that it covers the dataset's character set):
+   ```
+   tacotron
+     |- css10
+         |- achtgesichterambiwasse
+         |- meisterfloh
+         |- serapionsbruederauswahl
+         |- transcript.txt
+   ```
+
   For M-AILABS follow the directory structure from [here](http://www.m-ailabs.bayern/en/the-mailabs-speech-dataset/)

 3. **Preprocess the data**
--- a/cpu.Dockerfile
+++ b/cpu.Dockerfile
@ -1,8 +1,12 @@
 FROM tensorflow/tensorflow:1.8.0-py3

 RUN mkdir /root/mimic2
-COPY . /root/mimic2
 WORKDIR /root/mimic2
-RUN pip install  --no-cache-dir -r requirements.txt

-ENTRYPOINT [ "/bin/bash" ]
+COPY requirements.txt /root/mimic2/requirements.txt
+# Install the Python dependencies before the sources are copied, so this layer
+# comes from the build cache as long as requirements.txt is unchanged.
+RUN pip install --no-cache-dir --upgrade pip && pip install --no-cache-dir -r requirements.txt
+# apt-get (unlike apt) has a stable command-line interface for scripted use;
+# the package lists are removed afterwards to keep the layer small.
+RUN apt-get update && apt-get install -y ffmpeg && rm -rf /var/lib/apt/lists/*
+
+COPY . /root/mimic2
+
+ENTRYPOINT [ "/bin/bash" ]
--- a/datasets/css10.py
+++ b/datasets/css10.py
@ -0,0 +1,43 @@
+from concurrent.futures import ProcessPoolExecutor
+from functools import partial
+import numpy as np
+import os
+
+from util import audio
+
+
+def build_from_path(in_dir, out_dir, num_workers=1, tqdm=lambda x: x):
+    '''Preprocesses the css10 dataset from a given input path into a given output directory.
+
+    Args:
+      in_dir: The directory that contains `transcript.txt` and the audio files it references
+      out_dir: The directory to write the output into
+      num_workers: Number of worker processes used to parallelize the processing
+      tqdm: Optional wrapper around the list of futures, e.g. to display a progress bar
+
+    Returns:
+      A list with one entry per utterance, each being the value returned by _process_utterance
+    '''
+    # Using the executor as a context manager shuts the worker pool down once all
+    # results are collected, and also when reading the transcript or a task fails.
+    with ProcessPoolExecutor(max_workers=num_workers) as executor:
+        futures = []
+
+        # Read the transcript file ('|'-separated: relative audio path, then the text)
+        with open(os.path.join(in_dir, 'transcript.txt'), encoding='utf-8') as f:
+            for line in f:
+                line = line.strip()
+                if not line:
+                    # A blank line (e.g. a trailing newline) carries no utterance.
+                    continue
+                parts = line.split('|')
+                path = os.path.join(in_dir, parts[0])
+                text = parts[1]
+                # The second component of the relative path identifies the utterance
+                # (presumably the wav file name - confirm against the transcript).
+                prompt_id = parts[0].split('/')[1]
+                futures.append(executor.submit(_process_utterance, out_dir, prompt_id, path, text))
+
+        return [future.result() for future in tqdm(futures)]
+
+
+def _process_utterance(out_dir, prompt_id, wav_path, text):
+    '''Preprocesses a single utterance audio/text pair.
+
+    Writes the linear-scale and mel-scale spectrograms of the audio file to `out_dir`
+    and returns a tuple describing the training example.
+
+    Args:
+      out_dir: The directory to write the spectrograms into
+      prompt_id: Identifier of the utterance, used to name the spectrogram files
+      wav_path: Path to the audio file containing the speech input
+      text: The text spoken in the input audio file
+
+    Returns:
+      A (spectrogram_filename, mel_filename, n_frames, text) tuple
+    '''
+    # Load the audio to a numpy array:
+    wav = audio.load_wav(wav_path)
+
+    # Compute the linear-scale spectrogram from the wav:
+    spectrogram = audio.spectrogram(wav).astype(np.float32)
+
+    # Compute a mel-scale spectrogram from the wav:
+    mel_spectrogram = audio.melspectrogram(wav).astype(np.float32)
+
+    # Write the spectrograms to disk (stored transposed). Both files share the
+    # 'css10-' prefix so the spec/mel pair of an utterance is named consistently.
+    spectrogram_filename = 'css10-spec-%s.npy' % prompt_id
+    mel_filename = 'css10-mel-%s.npy' % prompt_id
+    np.save(os.path.join(out_dir, spectrogram_filename), spectrogram.T, allow_pickle=False)
+    np.save(os.path.join(out_dir, mel_filename), mel_spectrogram.T, allow_pickle=False)
+
+    # Return a tuple describing this training example:
+    n_frames = spectrogram.shape[1]
+    return (spectrogram_filename, mel_filename, n_frames, text)
--- a/preprocess.py
+++ b/preprocess.py
@ -2,7 +2,7 @@ import argparse
 import os
 from multiprocessing import cpu_count
 from tqdm import tqdm
-from datasets import amy, blizzard, ljspeech, kusal, mailabs
+from datasets import amy, blizzard, css10, ljspeech, kusal, mailabs
 from hparams import hparams, hparams_debug_string


@ -32,6 +32,14 @@ def preprocess_amy(args):
  write_metadata(metadata, out_dir)


+def preprocess_css10_de(args):
+  '''Preprocesses the CSS10 data in <base_dir>/css10 and writes the result to <base_dir>/<output>.'''
+  target_dir = os.path.join(args.base_dir, args.output)
+  os.makedirs(target_dir, exist_ok=True)
+  source_dir = os.path.join(args.base_dir, 'css10')
+  examples = css10.build_from_path(source_dir, target_dir, args.num_workers, tqdm=tqdm)
+  write_metadata(examples, target_dir)
+
+
 def preprocess_kusal(args):
  in_dir = os.path.join(args.base_dir, 'kusal')
  out_dir = os.path.join(args.base_dir, args.output)
@ -79,7 +87,7 @@ def main():
  parser.add_argument('--base_dir', default=os.path.expanduser('~/tacotron'))
  parser.add_argument('--output', default='training')
  parser.add_argument(
-      '--dataset', required=True, choices=['amy', 'blizzard', 'ljspeech', 'kusal', 'mailabs']
+      '--dataset', required=True, choices=['amy', 'blizzard', 'css10', 'ljspeech', 'kusal', 'mailabs']
  )
  parser.add_argument('--mailabs_books_dir',
                      help='absolute directory to the books for the mlailabs')
@ -103,6 +111,8 @@ def main():
    preprocess_amy(args)
  elif args.dataset == 'blizzard':
    preprocess_blizzard(args)
+  elif args.dataset == 'css10':
+    preprocess_css10_de(args)
  elif args.dataset == 'ljspeech':
    preprocess_ljspeech(args)
  elif args.dataset == 'kusal':