mirror of https://github.com/coqui-ai/TTS.git
find unique characters in a dataset
parent
dad3565379
commit
c6702b5b9f
|
@ -0,0 +1,48 @@
|
|||
"""Find all the unique characters in a dataset"""
|
||||
import os
|
||||
import argparse
|
||||
from argparse import RawTextHelpFormatter
|
||||
|
||||
from TTS.tts.datasets.preprocess import get_preprocessor_by_name
|
||||
|
||||
|
||||
def main():
|
||||
parser = argparse.ArgumentParser(description='''Find all the unique characters or phonemes in a dataset.\n\n'''
|
||||
|
||||
'''Target dataset must be defined in TTS.tts.datasets.preprocess\n\n'''\
|
||||
|
||||
'''
|
||||
Example runs:
|
||||
|
||||
python TTS/bin/find_unique_chars.py --dataset ljspeech --meta_file /path/to/LJSpeech/metadata.csv
|
||||
''',
|
||||
formatter_class=RawTextHelpFormatter)
|
||||
|
||||
parser.add_argument(
|
||||
'--dataset',
|
||||
type=str,
|
||||
default='',
|
||||
help='One of the target dataset names in TTS.tts.datasets.preprocess.'
|
||||
)
|
||||
|
||||
parser.add_argument(
|
||||
'--meta_file',
|
||||
type=str,
|
||||
default=None,
|
||||
help='Path to the transcriptions file of the dataset.'
|
||||
)
|
||||
|
||||
args = parser.parse_args()
|
||||
|
||||
preprocessor = get_preprocessor_by_name(args.dataset)
|
||||
items = preprocessor(os.path.dirname(args.meta_file), os.path.basename(args.meta_file))
|
||||
texts = " ".join([item[0] for item in items])
|
||||
chars = set(texts)
|
||||
lower_chars = set(texts.lower())
|
||||
print(f" > Number of unique characters: {len(chars)}")
|
||||
print(f" > Unique characters: {''.join(sorted(chars))}")
|
||||
print(f" > Unique lower characters: {''.join(sorted(lower_chars))}")
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
main()
|
Loading…
Reference in New Issue