# Source path: TTS/bin/find_unique_chars.py
# Introduced by the patch "find unique characters in a dataset".
"""Find all the unique characters in a dataset"""
import argparse
import os
from argparse import RawTextHelpFormatter
from typing import Optional, Sequence

# Help text shown by ``--help``. It is rendered with RawTextHelpFormatter, so
# the embedded newlines and the indentation of the example are kept verbatim.
_DESCRIPTION = (
    "Find all the unique characters or phonemes in a dataset.\n\n"
    "Target dataset must be defined in TTS.tts.datasets.preprocess\n\n"
    "\n"
    "    Example runs:\n"
    "\n"
    "    python TTS/bin/find_unique_chars.py --dataset ljspeech --meta_file /path/to/LJSpeech/metadata.csv\n"
    "    "
)


def main(argv: Optional[Sequence[str]] = None) -> None:
    """Print the set of unique characters found in a dataset's transcriptions.

    The dataset's preprocessor is looked up by name, called with the directory
    and the file name of ``--meta_file``, and the first field of every returned
    item is treated as that item's text.

    Three lines are printed: the number of unique characters, the sorted
    unique characters, and the sorted unique characters of the lower-cased
    text.

    Args:
        argv: Command-line arguments without the program name. When ``None``
            (the default) argparse reads them from ``sys.argv``.

    Raises:
        SystemExit: Raised by argparse for ``--help`` (exit code 0) or when a
            required option is missing or invalid (exit code 2).
    """
    parser = argparse.ArgumentParser(
        description=_DESCRIPTION,
        formatter_class=RawTextHelpFormatter,
    )
    # Both options are mandatory: without them the preprocessor lookup or the
    # path handling below would fail with an unhelpful traceback.
    parser.add_argument(
        "--dataset",
        type=str,
        required=True,
        help="One of the target dataset names in TTS.tts.datasets.preprocess.",
    )
    parser.add_argument(
        "--meta_file",
        type=str,
        required=True,
        help="Path to the transcriptions file of the dataset.",
    )
    args = parser.parse_args(argv)

    # Imported here rather than at module level so that argument parsing
    # (including --help and usage errors) happens before the TTS package is
    # loaded.
    from TTS.tts.datasets.preprocess import get_preprocessor_by_name

    preprocessor = get_preprocessor_by_name(args.dataset)
    items = preprocessor(
        os.path.dirname(args.meta_file), os.path.basename(args.meta_file)
    )

    # Join with an empty separator: joining with " " would add a space to the
    # character set even when no transcription actually contains one.
    texts = "".join(item[0] for item in items)
    chars = set(texts)
    lower_chars = set(texts.lower())

    print(f" > Number of unique characters: {len(chars)}")
    print(f" > Unique characters: {''.join(sorted(chars))}")
    print(f" > Unique lower characters: {''.join(sorted(lower_chars))}")


if __name__ == "__main__":
    main()