Merge pull request #8 from MycroftAI/kusal

added ability to parallelize preprocessing in kusal data
2018-06-21 12:24:47 -05:00 · 2018-06-21 12:24:47 -05:00 · 4077ab6e20
parent f015401666 bb88150a3d
commit 4077ab6e20
1 changed file with 3 additions and 17 deletions
--- a/datasets/kusal.py
+++ b/datasets/kusal.py
@@ -11,10 +11,8 @@ from util import audio

 def build_from_path(in_dir, out_dir, num_workers=1, tqdm=lambda x: x):
  '''Preprocesses the Amy dataset from a given input path into a given output directory.'''
-  # executor = ProcessPoolExecutor(max_workers=num_workers)
+  executor = ProcessPoolExecutor(max_workers=num_workers)
  futures = []
-  count = 0
-  len_files = 0
  # Read all of the .wav files:
  paths = {}
  for path in glob.glob(os.path.join(in_dir, 'audio', '*.wav')):
@@ -28,20 +26,8 @@ def build_from_path(in_dir, out_dir, num_workers=1, tqdm=lambda x: x):
      if len(parts) == 4 and parts[0] in paths:
        path = paths[parts[0]]
        text = parts[2]
-        # futures.append(executor.submit(partial(_process_utterance, out_dir, parts[0], path, text)))
-        futures.append(partial(_process_utterance, out_dir, parts[0], path, text))
-        len_files += 1
-  # return [future.result() for future in futures]
-    # return [future() for future in futures]
-  metadata = []
-  for future in tqdm(futures):
-    try:
-      data = future()
-      metadata.append(data)
-    except:
-      count += 1
-      print("failed to process" , count, "/", len_files)
-  return metadata
+        futures.append(executor.submit(partial(_process_utterance, out_dir, parts[0], path, text)))
+  return [future.result() for future in tqdm(futures)]

 def _process_utterance(out_dir, prompt_id, wav_path, text):
  # Load the audio to a numpy array: