From bb88150a3d74388da33c7bfa689d5fb7bea69668 Mon Sep 17 00:00:00 2001
From: Michael Nguyen <ppnguyen91@gmail.com>
Date: Thu, 21 Jun 2018 12:24:05 -0500
Subject: [PATCH] added ability to parallelize preprocessing in kusal data

---
 datasets/kusal.py | 20 +++-----------------
 1 file changed, 3 insertions(+), 17 deletions(-)

diff --git a/datasets/kusal.py b/datasets/kusal.py
index b870956..b718011 100644
--- a/datasets/kusal.py
+++ b/datasets/kusal.py
@@ -11,10 +11,8 @@ from util import audio
 
 def build_from_path(in_dir, out_dir, num_workers=1, tqdm=lambda x: x):
   '''Preprocesses the Amy dataset from a given input path into a given output directory.'''
-  # executor = ProcessPoolExecutor(max_workers=num_workers)
+  executor = ProcessPoolExecutor(max_workers=num_workers)
   futures = []
-  count = 0
-  len_files = 0
   # Read all of the .wav files:
   paths = {}
   for path in glob.glob(os.path.join(in_dir, 'audio', '*.wav')):
@@ -28,20 +26,8 @@ def build_from_path(in_dir, out_dir, num_workers=1, tqdm=lambda x: x):
       if len(parts) == 4 and parts[0] in paths:
         path = paths[parts[0]]
         text = parts[2]
-        # futures.append(executor.submit(partial(_process_utterance, out_dir, parts[0], path, text)))
-        futures.append(partial(_process_utterance, out_dir, parts[0], path, text))
-        len_files += 1
-  # return [future.result() for future in futures]
-    # return [future() for future in futures]
-  metadata = []
-  for future in tqdm(futures):
-    try:
-      data = future()
-      metadata.append(data)
-    except:
-      count += 1
-      print("failed to process" , count, "/", len_files)
-  return metadata
+        futures.append(executor.submit(partial(_process_utterance, out_dir, parts[0], path, text)))
+  return [future.result() for future in tqdm(futures)]
 
 def _process_utterance(out_dir, prompt_id, wav_path, text):
   # Load the audio to a numpy array: