find unique characters in a dataset

pull/367/head
Eren Gölge 2021-02-12 09:46:11 +00:00 committed by Eren Gölge
parent dad3565379
commit c6702b5b9f
1 changed files with 48 additions and 0 deletions

View File

@ -0,0 +1,48 @@
"""Find all the unique characters in a dataset"""
import os
import argparse
from argparse import RawTextHelpFormatter
from TTS.tts.datasets.preprocess import get_preprocessor_by_name
def main():
    """Print the unique characters found in a dataset's transcriptions.

    Parses ``--dataset`` and ``--meta_file`` from the command line, looks up
    the preprocessor registered under the dataset name, runs it on the
    metadata file (called with the file's directory and its base name), and
    prints the number of unique characters, the sorted unique characters and
    the sorted unique lower-cased characters.

    Both options are required: without them there is nothing to load, so
    argparse reports a usage error (exit status 2) instead of the script
    failing later with an unrelated ``TypeError`` on ``os.path.dirname(None)``.
    """
    parser = argparse.ArgumentParser(
        description=(
            "Find all the unique characters or phonemes in a dataset.\n\n"
            "Target dataset must be defined in TTS.tts.datasets.preprocess\n\n"
            "\n"
            "Example runs:\n"
            "python TTS/bin/find_unique_chars.py --dataset ljspeech"
            " --meta_file /path/to/LJSpeech/metadata.csv\n"
        ),
        formatter_class=RawTextHelpFormatter,
    )
    parser.add_argument(
        '--dataset',
        type=str,
        required=True,
        help='One of the target dataset names in TTS.tts.datasets.preprocess.',
    )
    parser.add_argument(
        '--meta_file',
        type=str,
        required=True,
        help='Path to the transcriptions file of the dataset.',
    )
    args = parser.parse_args()

    preprocessor = get_preprocessor_by_name(args.dataset)
    items = preprocessor(os.path.dirname(args.meta_file), os.path.basename(args.meta_file))

    # The first field of every item is used as the text to scan
    # (presumably the transcription -- confirm against the preprocessor).
    chars = set()
    lower_chars = set()
    for item in items:
        text = item[0]
        # Accumulate per item rather than joining with a separator, so no
        # character that is absent from the data (e.g. a joining space) is
        # reported, and lower-casing never sees text from a neighbouring item.
        chars.update(text)
        lower_chars.update(text.lower())

    print(f" > Number of unique characters: {len(chars)}")
    print(f" > Unique characters: {''.join(sorted(chars))}")
    print(f" > Unique lower characters: {''.join(sorted(lower_chars))}")
# Run the command-line tool only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == "__main__":
    main()