diff --git a/datasets/preprocess.py b/datasets/preprocess.py index 9dd7a610..e5f4e1a2 100644 --- a/datasets/preprocess.py +++ b/datasets/preprocess.py @@ -75,21 +75,19 @@ def mailabs(root_path, meta_files=None): speaker_regex = re.compile("by_book/(male|female)/(?P<speaker_name>[^/]+)/") if meta_files is None: csv_files = glob(root_path+"/**/metadata.csv", recursive=True) - folders = [os.path.dirname(f) for f in csv_files] else: csv_files = meta_files - folders = [f.strip().split("by_book")[1][1:] for f in csv_files] # meta_files = [f.strip() for f in meta_files.split(",")] items = [] - for idx, csv_file in enumerate(csv_files): + for csv_file in csv_files: + txt_file = os.path.join(root_path, csv_file) + folder = os.path.dirname(txt_file) # determine speaker based on folder structure... - speaker_name_match = speaker_regex.search(csv_file) + speaker_name_match = speaker_regex.search(txt_file) if speaker_name_match is None: continue speaker_name = speaker_name_match.group("speaker_name") print(" | > {}".format(csv_file)) - folder = folders[idx] - txt_file = os.path.join(root_path, csv_file) with open(txt_file, 'r') as ttf: for line in ttf: cols = line.split('|')