diff --git a/config.json b/config.json
index 2a171ad1..c5434bf9 100644
--- a/config.json
+++ b/config.json
@@ -65,10 +65,6 @@
     "run_eval": true,
     "test_delay_epochs": 5,  //Until attention is aligned, testing only wastes computation time.
     "test_sentences_file": null,  // set a file to load sentences to be used for testing. If it is null then we use default english sentences.
-    "data_path": "/home/erogol/Data/LJSpeech-1.1/",  // DATASET-RELATED: can overwritten from command argument
-    "meta_file_train": "metadata_train.csv",  // DATASET-RELATED: metafile for training dataloader.
-    "meta_file_val": "metadata_val.csv",  // DATASET-RELATED: metafile for evaluation dataloader.
-    "dataset": "ljspeech",  // DATASET-RELATED: one of TTS.dataset.preprocessors depending on your target dataset. Use "tts_cache" for pre-computed dataset by extract_features.py
     "min_seq_len": 6,  // DATASET-RELATED: minimum text length to use in training
     "max_seq_len": 150,  // DATASET-RELATED: maximum text length
     "output_path": "../keep/",  // DATASET-RELATED: output path for all training outputs.
@@ -80,6 +76,17 @@
     "text_cleaner": "phoneme_cleaners",
     "use_speaker_embedding": false,  // use speaker embedding to enable multi-speaker learning.
     "style_wav_for_test": null,  // path to style wav file to be used in TacotronGST inference.
-    "use_gst": false  // TACOTRON ONLY: use global style tokens
+    "use_gst": false,  // TACOTRON ONLY: use global style tokens
+
+    "datasets":  // List of datasets. They all merged and they get different speaker_ids.
+        [
+            {
+                "name": "ljspeech",
+                "path": "/home/erogol/Data/LJSpeech-1.1/",
+                "meta_file_train": "metadata_train.csv",
+                "meta_file_val": "metadata_val.csv"
+            }
+        ]
+
 }
 
diff --git a/train.py b/train.py
index cbcfb1ec..b94f44fa 100644
--- a/train.py
+++ b/train.py
@@ -28,7 +28,7 @@ from TTS.utils.speakers import load_speaker_mapping, save_speaker_mapping, \
 from TTS.utils.synthesis import synthesis
 from TTS.utils.text.symbols import phonemes, symbols
 from TTS.utils.visual import plot_alignment, plot_spectrogram
-from TTS.datasets.preprocess import get_preprocessor_by_name
+from TTS.datasets.preprocess import load_meta_data
 from TTS.utils.radam import RAdam
 from TTS.utils.measures import alignment_diagonal_score
 
@@ -46,17 +46,7 @@ def setup_loader(ap, is_val=False, verbose=False):
     global meta_data_train
     global meta_data_eval
     if "meta_data_train" not in globals():
-        if c.meta_file_train is not None:
-            meta_data_train = get_preprocessor_by_name(
-                c.dataset)(c.data_path, c.meta_file_train)
-        else:
-            meta_data_train = get_preprocessor_by_name(c.dataset)(c.data_path)
-    if "meta_data_eval" not in globals() and c.run_eval:
-        if c.meta_file_val is not None:
-            meta_data_eval = get_preprocessor_by_name(
-                c.dataset)(c.data_path, c.meta_file_val)
-        else:
-            meta_data_eval, meta_data_train = split_dataset(meta_data_train)
+        meta_data_train, meta_data_eval = load_meta_data(c.datasets)
     if is_val and not c.run_eval:
         loader = None
     else:
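For context, here is a minimal sketch of what a `load_meta_data` helper along these lines could look like. It is not the implementation added by this PR: it assumes the pre-existing `get_preprocessor_by_name(name)` returns a callable taking `(root_path, meta_file)`, assumes each entry of the config's `"datasets"` list is a plain dict with the keys shown above, and stands in a simple slice-based split for the removed `split_dataset()` call.

```python
# Hypothetical sketch, NOT the load_meta_data shipped in TTS.datasets.preprocess.
from TTS.datasets.preprocess import get_preprocessor_by_name


def load_meta_data(datasets):
    """Merge train/eval metadata from every entry in the config's "datasets" list."""
    meta_data_train_all = []
    meta_data_eval_all = []
    for dataset in datasets:
        # Look up the per-dataset preprocessor by its "name" key (e.g. "ljspeech").
        preprocessor = get_preprocessor_by_name(dataset["name"])
        meta_train = preprocessor(dataset["path"], dataset["meta_file_train"])
        if dataset.get("meta_file_val"):
            meta_eval = preprocessor(dataset["path"], dataset["meta_file_val"])
        else:
            # No explicit eval metafile: hold out a slice of the training items.
            # The removed code used split_dataset() here; this is a stand-in.
            n_eval = max(1, int(len(meta_train) * 0.01))
            meta_eval, meta_train = meta_train[:n_eval], meta_train[n_eval:]
        meta_data_train_all += meta_train
        meta_data_eval_all += meta_eval
    return meta_data_train_all, meta_data_eval_all
```

With a helper of this shape, the single call `meta_data_train, meta_data_eval = load_meta_data(c.datasets)` in `setup_loader` covers both the explicit-metafile and the auto-split cases that the removed branches handled, while each dataset keeps its own block of entries so distinct speaker_ids can later be assigned per dataset when `use_speaker_embedding` is enabled.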