From c6702b5b9fc8a97cdc4eedda23bfe4ad42420e9e Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Eren=20G=C3=B6lge?= <erogol@hotmail.com>
Date: Fri, 12 Feb 2021 09:46:11 +0000
Subject: [PATCH] find unique characters in a dataset

---
 TTS/bin/find_unique_chars.py | 48 ++++++++++++++++++++++++++++++++++++
 1 file changed, 48 insertions(+)
 create mode 100644 TTS/bin/find_unique_chars.py

diff --git a/TTS/bin/find_unique_chars.py b/TTS/bin/find_unique_chars.py
new file mode 100644
index 00000000..e6c35878
--- /dev/null
+++ b/TTS/bin/find_unique_chars.py
@@ -0,0 +1,48 @@
+"""Find all the unique characters in a dataset"""
+import os
+import argparse
+from argparse import RawTextHelpFormatter
+
+from TTS.tts.datasets.preprocess import get_preprocessor_by_name
+
+
+def main():
+    """Parse the CLI options, load the dataset items and print their unique characters."""
+    parser = argparse.ArgumentParser(description='''Find all the unique characters or phonemes in a dataset.\n\n'''
+    '''Target dataset must be defined in TTS.tts.datasets.preprocess\n\n'''
+    '''
+    Example runs:
+
+    python TTS/bin/find_unique_chars.py --dataset ljspeech --meta_file /path/to/LJSpeech/metadata.csv
+    ''',
+        formatter_class=RawTextHelpFormatter)
+
+    # Required, so argparse reports a missing option up front (os.path.dirname(None) would raise TypeError).
+    parser.add_argument(
+        '--dataset',
+        type=str,
+        required=True,
+        help='One of the target dataset names in TTS.tts.datasets.preprocess.'
+        )
+
+    parser.add_argument(
+        '--meta_file',
+        type=str,
+        required=True,
+        help='Path to the transcriptions file of the dataset.'
+    )
+
+    args = parser.parse_args()
+
+    preprocessor = get_preprocessor_by_name(args.dataset)
+    items = preprocessor(os.path.dirname(args.meta_file), os.path.basename(args.meta_file))  # (root dir, file name)
+    texts = " ".join([item[0] for item in items])  # presumably item[0] is the transcript; ' ' joins 2+ items
+    chars = set(texts)
+    lower_chars = set(texts.lower())
+    print(f" > Number of unique characters: {len(chars)}")
+    print(f" > Unique characters: {''.join(sorted(chars))}")
+    print(f" > Unique lower characters: {''.join(sorted(lower_chars))}")
+
+
+if __name__ == "__main__":  # run the CLI only when executed as a script, not on import
+    main()