From 103c010eca4fc2c7d516eb09386dfd025f4a8679 Mon Sep 17 00:00:00 2001
From: loganhart420
Date: Thu, 16 Dec 2021 07:21:27 -0500
Subject: [PATCH] Add additional datasets

Add a `download_kaggle_dataset` helper and downloaders for the TWEB,
LibriTTS, Thorsten (de) and M-AILABS datasets. `download_vctk` gains an
optional `use_kaggle` flag to fetch the corpus from kaggle instead.

---
 TTS/utils/download.py    |  30 +++++++++
 TTS/utils/downloaders.py | 113 +++++++++++++++++++++++++++++++++++++--
 2 files changed, 139 insertions(+), 4 deletions(-)

diff --git a/TTS/utils/download.py b/TTS/utils/download.py
index 5cfb69cd..241a106b 100644
--- a/TTS/utils/download.py
+++ b/TTS/utils/download.py
@@ -7,6 +7,7 @@ import tarfile
 import urllib
 import urllib.request
 import zipfile
+from os.path import expanduser
 from typing import Any, Iterable, List, Optional
 
 from torch.utils.model_zoo import tqdm
@@ -183,3 +184,32 @@ def extract_archive(from_path: str, to_path: Optional[str] = None, overwrite: bo
             pass
 
     raise NotImplementedError(" > [!] only supports tar.gz, tgz, and zip achives.")
+
+
+def download_kaggle_dataset(dataset_path: str, dataset_name: str, output_path: str):
+    """Download a dataset from kaggle.
+
+    If importing or authenticating the kaggle client raises OSError, a hint about the api token is
+    printed and nothing is downloaded.
+
+    Args:
+        dataset_path (str):
+            This is the kaggle link to the dataset. for example vctk is 'mfekadu/english-multispeaker-corpus-for-voice-cloning'
+        dataset_name (str): Name of the folder the dataset will be saved in.
+        output_path (str): Path of the location you want the dataset folder to be saved to.
+    """
+    data_path = os.path.join(output_path, dataset_name)
+    try:
+        # NOTE(review): the kaggle package appears to look up the api token at import time, so the
+        # import stays inside the try block - confirm against the kaggle package docs.
+        import kaggle  # pylint: disable=import-outside-toplevel
+
+        kaggle.api.authenticate()
+    except OSError:
+        print(
+            f"""[!] in order to download kaggle datasets, you need to have a kaggle api token stored in your {os.path.join(expanduser('~'), '.kaggle/kaggle.json')}"""
+        )
+        return
+    # Download outside the try block so that unrelated I/O errors are not reported as a missing api token.
+    print(f"""\nDownloading {dataset_name}...""")
+    kaggle.api.dataset_download_files(dataset_path, path=data_path, unzip=True)
diff --git a/TTS/utils/downloaders.py b/TTS/utils/downloaders.py
index 89f2148f..104dc7b9 100644
--- a/TTS/utils/downloaders.py
+++ b/TTS/utils/downloaders.py
@@ -1,6 +1,6 @@
 import os
 
-from TTS.utils.download import download_url, extract_archive
+from TTS.utils.download import download_kaggle_dataset, download_url, extract_archive
 
 
 def download_ljspeech(path: str):
@@ -18,14 +18,119 @@ def download_ljspeech(path: str):
     extract_archive(archive)
 
 
-def download_vctk(path: str):
-    """Download and extract VCTK dataset
+def download_vctk(path: str, use_kaggle: bool = False):
+    """Download and extract VCTK dataset.
 
     Args:
         path (str): path to the directory where the dataset will be stored.
+
+        use_kaggle (bool, optional): Downloads vctk dataset from kaggle. Is generally faster. Defaults to False.
+    """
+    if use_kaggle:
+        download_kaggle_dataset("mfekadu/english-multispeaker-corpus-for-voice-cloning", "VCTK", path)
+    else:
+        os.makedirs(path, exist_ok=True)
+        url = "https://datashare.ed.ac.uk/bitstream/handle/10283/3443/VCTK-Corpus-0.92.zip"
+        download_url(url, path)
+        basename = os.path.basename(url)
+        archive = os.path.join(path, basename)
+        print(" > Extracting archive file...")
+        extract_archive(archive)
+
+
+def download_tweb(path: str):
+    """Download and extract Tweb dataset.
+
+    Args:
+        path (str): Path to the directory where the dataset will be stored.
+    """
+    download_kaggle_dataset("bryanpark/the-world-english-bible-speech-dataset", "TWEB", path)
+
+
+def download_libri_tts(path: str, subset: str = "all"):
+    """Download and extract libri tts dataset.
+
+    Args:
+        path (str): Path to the directory where the dataset will be stored.
+
+        subset (str, optional): Name of the subset to download. If you only want to download a certain
+            portion specify it here, e.g. 'libri-tts-clean-100'. Defaults to 'all'.
+
+    Raises:
+        KeyError: If ``subset`` is neither 'all' nor one of the known subset names.
+    """
+
+    subset_dict = {
+        "libri-tts-clean-100": "https://www.openslr.org/resources/60/train-clean-100.tar.gz",
+        "libri-tts-clean-360": "https://www.openslr.org/resources/60/train-clean-360.tar.gz",
+        "libri-tts-other-500": "https://www.openslr.org/resources/60/train-other-500.tar.gz",
+        "libri-tts-dev-clean": "https://www.openslr.org/resources/60/dev-clean.tar.gz",
+        "libri-tts-dev-other": "https://www.openslr.org/resources/60/dev-other.tar.gz",
+        "libri-tts-test-clean": "https://www.openslr.org/resources/60/test-clean.tar.gz",
+        "libri-tts-test-other": "https://www.openslr.org/resources/60/test-other.tar.gz",
+    }
+
+    # Validate before touching the filesystem so that a typo does not leave an empty directory behind.
+    if subset != "all" and subset not in subset_dict:
+        raise KeyError(f" [!] Unknown subset '{subset}'. Available subsets: all, {', '.join(subset_dict)}")
+
+    os.makedirs(path, exist_ok=True)
+    if subset == "all":
+        for sub, val in subset_dict.items():
+            print(f" > Downloading {sub}...")
+            download_url(val, path)
+            basename = os.path.basename(val)
+            archive = os.path.join(path, basename)
+            print(" > Extracting archive file...")
+            extract_archive(archive)
+        print(" > All subsets downloaded")
+    else:
+        url = subset_dict[subset]
+        download_url(url, path)
+        basename = os.path.basename(url)
+        archive = os.path.join(path, basename)
+        print(" > Extracting archive file...")
+        extract_archive(archive)
+
+
+def download_thorsten_de(path: str):
+    """Download and extract Thorsten german male voice dataset.
+
+    Args:
+        path (str): Path to the directory where the dataset will be stored.
     """
     os.makedirs(path, exist_ok=True)
-    url = "https://datashare.ed.ac.uk/bitstream/handle/10283/3443/VCTK-Corpus-0.92.zip"
+    url = "https://www.openslr.org/resources/95/thorsten-de_v02.tgz"
+    download_url(url, path)
+    basename = os.path.basename(url)
+    archive = os.path.join(path, basename)
+    print(" > Extracting archive file...")
+    extract_archive(archive)
+
+
+def download_mailabs(path: str, language: str = "english"):
+    """Download and extract Mailabs dataset.
+
+    Args:
+        path (str): Path to the directory where the dataset will be stored.
+
+        language (str): Language subset to download. One of 'english', 'german', 'french',
+            'italian' or 'spanish'. Defaults to english.
+
+    Raises:
+        KeyError: If ``language`` is not one of the supported languages.
+    """
+    language_dict = {
+        "english": "https://data.solak.de/data/Training/stt_tts/en_US.tgz",
+        "german": "https://data.solak.de/data/Training/stt_tts/de_DE.tgz",
+        "french": "https://data.solak.de/data/Training/stt_tts/fr_FR.tgz",
+        "italian": "https://data.solak.de/data/Training/stt_tts/it_IT.tgz",
+        "spanish": "https://data.solak.de/data/Training/stt_tts/es_ES.tgz",
+    }
+    if language not in language_dict:
+        raise KeyError(f" [!] Unknown language '{language}'. Available languages: {', '.join(language_dict)}")
+    os.makedirs(path, exist_ok=True)
+    url = language_dict[language]
     download_url(url, path)
     basename = os.path.basename(url)
     archive = os.path.join(path, basename)