diff --git a/run_bash_tests.sh b/run_bash_tests.sh
index 16381611..feb9082b 100755
--- a/run_bash_tests.sh
+++ b/run_bash_tests.sh
@@ -2,13 +2,7 @@
 set -e
 TF_CPP_MIN_LOG_LEVEL=3
 # runtime bash based tests
+# TODO: move these to python
 ./tests/bash_tests/test_demo_server.sh && \
 ./tests/bash_tests/test_resample.sh && \
-./tests/bash_tests/test_tacotron_train.sh && \
-./tests/bash_tests/test_glow-tts_train.sh && \
-./tests/bash_tests/test_vocoder_gan_train.sh && \
-./tests/bash_tests/test_vocoder_wavernn_train.sh && \
-./tests/bash_tests/test_vocoder_wavegrad_train.sh && \
-./tests/bash_tests/test_speedy_speech_train.sh && \
-./tests/bash_tests/test_aligntts_train.sh && \
 ./tests/bash_tests/test_compute_statistics.sh
diff --git a/tests/inputs/test_config.json b/tests/inputs/test_config.json
index 2fb52bb6..8f8810d1 100644
--- a/tests/inputs/test_config.json
+++ b/tests/inputs/test_config.json
@@ -1,24 +1,24 @@
 {
     "audio":{
-        "audio_processor": "audio",  // to use dictate different audio processors, if available.
-        "num_mels": 80,              // size of the mel spec frame.
-        "fft_size": 1024,            // number of stft frequency levels. Size of the linear spectogram frame.
-        "sample_rate": 22050,        // wav sample-rate. If different than the original data, it is resampled.
-        "frame_length_ms": null,     // stft window length in ms.
-        "frame_shift_ms": null,      // stft window hop-lengh in ms.
+        "audio_processor": "audio",
+        "num_mels": 80,
+        "fft_size": 1024,
+        "sample_rate": 22050,
+        "frame_length_ms": null,
+        "frame_shift_ms": null,
         "hop_length": 256,
         "win_length": 1024,
-        "preemphasis": 0.97,         // pre-emphasis to reduce spec noise and make it more structured. If 0.0, no -pre-emphasis.
-        "min_level_db": -100,        // normalization range
-        "ref_level_db": 20,          // reference level db, theoretically 20db is the sound of air.
-        "power": 1.5,                // value to sharpen wav signals after GL algorithm.
-        "griffin_lim_iters": 30,     // #griffin-lim iterations. 30-60 is a good range. Larger the value, slower the generation.
-        "signal_norm": true,         // normalize the spec values in range [0, 1]
-        "symmetric_norm": true,      // move normalization to range [-1, 1]
-        "clip_norm": true,           // clip normalized values into the range.
-        "max_norm": 4,               // scale normalization to range [-max_norm, max_norm] or [0, max_norm]
-        "mel_fmin": 0,               // minimum freq level for mel-spec. ~50 for male and ~95 for female voices. Tune for dataset!!
-        "mel_fmax": 8000,            // maximum freq level for mel-spec. Tune for dataset!!
+        "preemphasis": 0.97,
+        "min_level_db": -100,
+        "ref_level_db": 20,
+        "power": 1.5,
+        "griffin_lim_iters": 30,
+        "signal_norm": true,
+        "symmetric_norm": true,
+        "clip_norm": true,
+        "max_norm": 4,
+        "mel_fmin": 0,
+        "mel_fmax": 8000,
         "do_trim_silence": false,
         "spec_gain": 20
     },
@@ -53,15 +53,15 @@
     "max_seq_len": 300,
     "log_dir": "tests/outputs/",

-    // MULTI-SPEAKER and GST
-    "use_speaker_embedding": false,     // use speaker embedding to enable multi-speaker learning.
-    "use_gst": true,                    // use global style tokens
-    "gst": {                            // gst parameter if gst is enabled
-        "gst_style_input": null,        // Condition the style input either on a
-                                        // -> wave file [path to wave] or
-                                        // -> dictionary using the style tokens {'token1': 'value', 'token2': 'value'} example {"0": 0.15, "1": 0.15, "5": -0.15}
-                                        // with the dictionary being len(dict) <= len(gst_num_style_tokens).
-        "gst_use_speaker_embedding": true,  // if true pass speaker embedding in attention input GST.
+
+    "use_speaker_embedding": false,
+    "use_gst": true,
+    "gst": {
+        "gst_style_input": null,
+
+
+
+        "gst_use_speaker_embedding": true,
         "gst_embedding_dim": 512,
         "gst_num_heads": 4,
         "gst_num_style_tokens": 10
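A note on the comment removal above: TTS.utils.io.load_config, which the tests below stop using, presumably stripped the // comments before handing the text to a JSON parser; Python's standard json module rejects them outright. A minimal illustration (the config snippet is hypothetical):

    import json

    json.loads('{"num_mels": 80}')  # plain JSON parses fine
    try:
        json.loads('{"num_mels": 80  // mel bins\n}')
    except json.JSONDecodeError as err:
        print(err)  # the // comment is a syntax error for strict JSON

With the tests moving to coqpit config classes, the .json inputs no longer need a comment-tolerant loader, so the comments go.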
+ + "use_speaker_embedding": false, + "use_gst": true, + "gst": { + "gst_style_input": null, + + + + "gst_use_speaker_embedding": true, "gst_embedding_dim": 512, "gst_num_heads": 4, "gst_num_style_tokens": 10 diff --git a/tests/inputs/test_speaker_encoder_config.json b/tests/inputs/test_speaker_encoder_config.json index f1174e76..4f3678e1 100644 --- a/tests/inputs/test_speaker_encoder_config.json +++ b/tests/inputs/test_speaker_encoder_config.json @@ -1,5 +1,6 @@ { + "model": "speaker_encoder", "run_name": "test_speaker_encoder", "run_description": "test speaker encoder.", "audio":{ @@ -42,8 +43,9 @@ "checkpoint": true, // If true, it saves checkpoints per "save_step" "save_step": 1000, // Number of training steps expected to save traning stats and checkpoints. "print_step": 20, // Number of steps to log traning on console. + "batch_size": 32, "output_path": "", // DATASET-RELATED: output path for all training outputs. - "model": { + "model_params": { "input_dim": 40, "proj_dim": 256, "lstm_dim": 768, diff --git a/tests/outputs/dummy_model_config.json b/tests/outputs/dummy_model_config.json index 3996e09a..b51bb3a8 100644 --- a/tests/outputs/dummy_model_config.json +++ b/tests/outputs/dummy_model_config.json @@ -87,7 +87,6 @@ // MULTI-SPEAKER and GST "use_speaker_embedding": false, // use speaker embedding to enable multi-speaker learning. - "use_gst": true, // use global style tokens "gst": { // gst parameter if gst is enabled "gst_style_input": null, // Condition the style input either on a // -> wave file [path to wave] or diff --git a/tests/test_audio.py b/tests/test_audio.py index 527defa8..7291a31f 100644 --- a/tests/test_audio.py +++ b/tests/test_audio.py @@ -3,21 +3,21 @@ import unittest from tests import get_tests_input_path, get_tests_output_path, get_tests_path from TTS.utils.audio import AudioProcessor -from TTS.utils.io import load_config +from TTS.config import BaseAudioConfig TESTS_PATH = get_tests_path() OUT_PATH = os.path.join(get_tests_output_path(), "audio_tests") WAV_FILE = os.path.join(get_tests_input_path(), "example_1.wav") os.makedirs(OUT_PATH, exist_ok=True) -conf = load_config(os.path.join(get_tests_input_path(), "test_config.json")) +conf = BaseAudioConfig(mel_fmax=8000) # pylint: disable=protected-access class TestAudio(unittest.TestCase): def __init__(self, *args, **kwargs): super().__init__(*args, **kwargs) - self.ap = AudioProcessor(**conf.audio) + self.ap = AudioProcessor(**conf) def test_audio_synthesis(self): """1. 
diff --git a/tests/test_glow_tts.py b/tests/test_glow_tts.py
index 77801b29..c1c8177b 100644
--- a/tests/test_glow_tts.py
+++ b/tests/test_glow_tts.py
@@ -17,7 +17,7 @@ torch.manual_seed(1)
 use_cuda = torch.cuda.is_available()
 device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

-c = load_config(os.path.join(get_tests_input_path(), "test_config.json"))
+c = GlowTTSConfig()

 ap = AudioProcessor(**c.audio)
 WAV_FILE = os.path.join(get_tests_input_path(), "example_1.wav")
diff --git a/tests/test_loader.py b/tests/test_loader.py
index 6174865b..ca2ac6eb 100644
--- a/tests/test_loader.py
+++ b/tests/test_loader.py
@@ -10,13 +10,17 @@ from tests import get_tests_input_path, get_tests_output_path
 from TTS.tts.datasets import TTSDataset
 from TTS.tts.datasets.preprocess import ljspeech
 from TTS.utils.audio import AudioProcessor
-from TTS.utils.io import load_config
+from TTS.tts.configs import BaseTTSConfig

 # pylint: disable=unused-variable

 OUTPATH = os.path.join(get_tests_output_path(), "loader_tests/")
 os.makedirs(OUTPATH, exist_ok=True)
-c = load_config(os.path.join(get_tests_input_path(), "test_config.json"))
+
+# create a dummy config for testing data loaders.
+c = BaseTTSConfig(text_cleaner='english_cleaners', num_loader_workers=0, batch_size=2)
+c.r = 5
+c.data_path = "tests/data/ljspeech/"
 ok_ljspeech = os.path.exists(c.data_path)

 DATA_EXIST = True
@@ -40,7 +44,7 @@ class TestTTSDataset(unittest.TestCase):
             compute_linear_spec=True,
             ap=self.ap,
             meta_data=items,
-            tp=c.characters if "characters" in c.keys() else None,
+            tp=c.characters,
             batch_group_size=bgs,
             min_seq_len=c.min_seq_len,
             max_seq_len=float("inf"),
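This is the recurring pattern of the patch: instead of parsing tests/inputs/test_config.json through load_config, each test instantiates the matching config class with defaults (GlowTTSConfig(), Tacotron2Config(), BaseTTSConfig(...)). A sketch of why this works, assuming the coqpit-style behavior the tests rely on, namely plain attribute access, dict-like ** unpacking, and .to_dict():

    from TTS.config import BaseAudioConfig
    from TTS.utils.audio import AudioProcessor

    conf = BaseAudioConfig(mel_fmax=8000)  # defaults, one field overridden
    conf.preemphasis = 0.0                 # fields are plain attributes
    ap = AudioProcessor(**conf)            # unpacks like a dict
    d = conf.to_dict()                     # serializable dict when needed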
diff --git a/tests/test_speaker_encoder_train.py b/tests/test_speaker_encoder_train.py
index 0bf04966..1258f550 100644
--- a/tests/test_speaker_encoder_train.py
+++ b/tests/test_speaker_encoder_train.py
@@ -2,7 +2,8 @@ import glob
 import os
 import shutil

-from tests import get_tests_output_path, run_cli
+from tests import get_tests_output_path, run_cli, get_device_id
+
 from TTS.speaker_encoder.speaker_encoder_config import SpeakerEncoderConfig
 from TTS.config.shared_configs import BaseAudioConfig

@@ -15,9 +16,9 @@ config = SpeakerEncoderConfig(
     num_speakers_in_batch=1,
     num_utters_per_speaker=10,
     num_loader_workers=0,
-    max_train_step=10,
+    max_train_step=2,
     print_step=1,
-    save_step=10,
+    save_step=1,
     print_eval=True,
     audio=BaseAudioConfig(num_mels=40)
 )
@@ -27,7 +28,7 @@ config.save_json(config_path)

 # train the model for one epoch
 command_train = (
-    f"CUDA_VISIBLE_DEVICES='' python TTS/bin/train_encoder.py --config_path {config_path} "
+    f"CUDA_VISIBLE_DEVICES='{get_device_id()}' python TTS/bin/train_encoder.py --config_path {config_path} "
     f"--coqpit.output_path {output_path} "
     "--coqpit.datasets.0.name ljspeech "
     "--coqpit.datasets.0.meta_file_train metadata.csv "
@@ -41,6 +42,6 @@ run_cli(command_train)
 continue_path = max(glob.glob(os.path.join(output_path, "*/")), key=os.path.getmtime)

 # restore the model and continue training for one more epoch
-command_train = f"CUDA_VISIBLE_DEVICES='' python TTS/bin/train_encoder.py --continue_path {continue_path} "
+command_train = f"CUDA_VISIBLE_DEVICES='{get_device_id()}' python TTS/bin/train_encoder.py --continue_path {continue_path} "
 run_cli(command_train)
 shutil.rmtree(continue_path)
diff --git a/tests/test_speaker_manager.py b/tests/test_speaker_manager.py
index f8c742d9..31082f86 100644
--- a/tests/test_speaker_manager.py
+++ b/tests/test_speaker_manager.py
@@ -26,11 +26,11 @@ class SpeakerManagerTest(unittest.TestCase):
     def test_speaker_embedding():
         # load config
         config = load_config(encoder_config_path)
-        config["audio"]["resample"] = True
+        config.audio.resample = True

         # create a dummy speaker encoder
-        model = SpeakerEncoder(**config.model)
-        save_checkpoint(model, None, None, get_tests_input_path(), 0, 0)
+        model = SpeakerEncoder(**config.model_params)
+        save_checkpoint(model, None, None, get_tests_input_path(), 0)

         # load audio processor and speaker encoder
         ap = AudioProcessor(**config.audio)
diff --git a/tests/test_synthesize.py b/tests/test_synthesize.py
index 526f7dc8..a8d5c31c 100644
--- a/tests/test_synthesize.py
+++ b/tests/test_synthesize.py
@@ -1,6 +1,7 @@
 import os

-from tests import get_tests_output_path, run_cli
+from tests import get_tests_output_path, run_cli, get_device_id
+

 def test_synthesize():
diff --git a/tests/test_synthesizer.py b/tests/test_synthesizer.py
index 46997dbb..f29509c7 100644
--- a/tests/test_synthesizer.py
+++ b/tests/test_synthesizer.py
@@ -15,8 +15,8 @@ class SynthesizerTest(unittest.TestCase):
         # pylint: disable=global-statement
         global symbols, phonemes
         config = load_config(os.path.join(get_tests_output_path(), "dummy_model_config.json"))
-        if "characters" in config.keys():
-            symbols, phonemes = make_symbols(**config.characters)
+        if config.has('characters') and config.characters:
+            symbols, phonemes = make_symbols(**config.characters.to_dict())

         num_chars = len(phonemes) if config.use_phonemes else len(symbols)
         model = setup_model(num_chars, 0, config)
@@ -25,11 +25,10 @@ class SynthesizerTest(unittest.TestCase):

     def test_in_out(self):
         self._create_random_model()
-        config = load_config(os.path.join(get_tests_input_path(), "server_config.json"))
         tts_root_path = get_tests_output_path()
-        config["tts_checkpoint"] = os.path.join(tts_root_path, config["tts_checkpoint"])
-        config["tts_config"] = os.path.join(tts_root_path, config["tts_config"])
-        synthesizer = Synthesizer(config["tts_checkpoint"], config["tts_config"], None, None)
+        tts_checkpoint = os.path.join(tts_root_path, 'checkpoint_10.pth.tar')
+        tts_config = os.path.join(tts_root_path, 'dummy_model_config.json')
+        synthesizer = Synthesizer(tts_checkpoint, tts_config, None, None)
         synthesizer.tts("Better this test works!!")

     def test_split_into_sentences(self):
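Every train test in this patch switches from hard-coding CUDA_VISIBLE_DEVICES='' (CPU only) to interpolating get_device_id() from the tests package, so the same commands exercise a GPU when one is present. The helper's definition is not part of this patch; a plausible sketch of what tests/__init__.py provides (name taken from the imports above, behavior assumed):

    import torch

    def get_device_id():
        """First CUDA device index if one is available, else '' to force CPU."""
        return "0" if torch.cuda.is_available() else ""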
"characters" in config.keys(): - symbols, phonemes = make_symbols(**config.characters) + if config.has('characters') and config.characters: + symbols, phonemes = make_symbols(**config.characters.to_dict()) num_chars = len(phonemes) if config.use_phonemes else len(symbols) model = setup_model(num_chars, 0, config) @@ -25,11 +25,10 @@ class SynthesizerTest(unittest.TestCase): def test_in_out(self): self._create_random_model() - config = load_config(os.path.join(get_tests_input_path(), "server_config.json")) tts_root_path = get_tests_output_path() - config["tts_checkpoint"] = os.path.join(tts_root_path, config["tts_checkpoint"]) - config["tts_config"] = os.path.join(tts_root_path, config["tts_config"]) - synthesizer = Synthesizer(config["tts_checkpoint"], config["tts_config"], None, None) + tts_checkpoint = os.path.join(tts_root_path, 'checkpoint_10.pth.tar') + tts_config = os.path.join(tts_root_path, 'dummy_model_config.json') + synthesizer = Synthesizer(tts_checkpoint, tts_config, None, None) synthesizer.tts("Better this test works!!") def test_split_into_sentences(self): diff --git a/tests/test_tacotron2_model.py b/tests/test_tacotron2_model.py index 3a08e4d0..22af3384 100644 --- a/tests/test_tacotron2_model.py +++ b/tests/test_tacotron2_model.py @@ -17,7 +17,7 @@ torch.manual_seed(1) use_cuda = torch.cuda.is_available() device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu") -c = load_config(os.path.join(get_tests_input_path(), "test_config.json")) +c = Tacotron2Config() ap = AudioProcessor(**c.audio) WAV_FILE = os.path.join(get_tests_input_path(), "example_1.wav") @@ -152,10 +152,8 @@ class TacotronGSTTrainTest(unittest.TestCase): num_chars=24, r=c.r, num_speakers=5, - gst=True, - gst_embedding_dim=c.gst["gst_embedding_dim"], - gst_num_heads=c.gst["gst_num_heads"], - gst_style_tokens=c.gst["gst_style_tokens"], + use_gst=True, + gst=c.gst ).to(device) model.train() model_ref = copy.deepcopy(model) @@ -216,10 +214,8 @@ class TacotronGSTTrainTest(unittest.TestCase): num_chars=24, r=c.r, num_speakers=5, - gst=True, - gst_embedding_dim=c.gst["gst_embedding_dim"], - gst_num_heads=c.gst["gst_num_heads"], - gst_style_tokens=c.gst["gst_style_tokens"], + use_gst=True, + gst =c.gst ).to(device) model.train() model_ref = copy.deepcopy(model) @@ -280,11 +276,8 @@ class SCGSTMultiSpeakeTacotronTrainTest(unittest.TestCase): r=c.r, num_speakers=5, speaker_embedding_dim=55, - gst=True, - gst_embedding_dim=c.gst["gst_embedding_dim"], - gst_num_heads=c.gst["gst_num_heads"], - gst_style_tokens=c.gst["gst_style_tokens"], - gst_use_speaker_embedding=c.gst["gst_use_speaker_embedding"], + use_gst=True, + gst=c.gst ).to(device) model.train() model_ref = copy.deepcopy(model) diff --git a/tests/test_tacotron2_tf_model.py b/tests/test_tacotron2_tf_model.py index aa9c1846..d8f88571 100644 --- a/tests/test_tacotron2_tf_model.py +++ b/tests/test_tacotron2_tf_model.py @@ -19,7 +19,7 @@ torch.manual_seed(1) use_cuda = torch.cuda.is_available() device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu") -c = load_config(os.path.join(get_tests_input_path(), "test_config.json")) +c = Tacotron2Config() class TacotronTFTrainTest(unittest.TestCase): diff --git a/tests/test_tacotron_model.py b/tests/test_tacotron_model.py index d6f5189e..8142e23a 100644 --- a/tests/test_tacotron_model.py +++ b/tests/test_tacotron_model.py @@ -18,7 +18,7 @@ torch.manual_seed(1) use_cuda = torch.cuda.is_available() device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu") -c = 
diff --git a/tests/test_tacotron_model.py b/tests/test_tacotron_model.py
index d6f5189e..8142e23a 100644
--- a/tests/test_tacotron_model.py
+++ b/tests/test_tacotron_model.py
@@ -18,7 +18,7 @@ torch.manual_seed(1)
 use_cuda = torch.cuda.is_available()
 device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

-c = load_config(os.path.join(get_tests_input_path(), "test_config.json"))
+c = TacotronConfig()

 ap = AudioProcessor(**c.audio)
 WAV_FILE = os.path.join(get_tests_input_path(), "example_1.wav")
@@ -175,10 +175,8 @@ class TacotronGSTTrainTest(unittest.TestCase):
         model = Tacotron(
             num_chars=32,
             num_speakers=5,
-            gst=True,
-            gst_embedding_dim=c.gst["gst_embedding_dim"],
-            gst_num_heads=c.gst["gst_num_heads"],
-            gst_style_tokens=c.gst["gst_style_tokens"],
+            use_gst=True,
+            gst=c.gst,
             postnet_output_dim=c.audio["fft_size"],
             decoder_output_dim=c.audio["num_mels"],
             r=c.r,
@@ -240,10 +238,8 @@ class TacotronGSTTrainTest(unittest.TestCase):
         model = Tacotron(
             num_chars=32,
             num_speakers=5,
-            gst=True,
-            gst_embedding_dim=c.gst["gst_embedding_dim"],
-            gst_num_heads=c.gst["gst_num_heads"],
-            gst_style_tokens=c.gst["gst_style_tokens"],
+            use_gst=True,
+            gst=c.gst,
             postnet_output_dim=c.audio["fft_size"],
             decoder_output_dim=c.audio["num_mels"],
             r=c.r,
@@ -306,11 +302,8 @@ class SCGSTMultiSpeakeTacotronTrainTest(unittest.TestCase):
             num_speakers=5,
             postnet_output_dim=c.audio["fft_size"],
             decoder_output_dim=c.audio["num_mels"],
-            gst=True,
-            gst_embedding_dim=c.gst["gst_embedding_dim"],
-            gst_num_heads=c.gst["gst_num_heads"],
-            gst_style_tokens=c.gst["gst_style_tokens"],
-            gst_use_speaker_embedding=c.gst["gst_use_speaker_embedding"],
+            use_gst=True,
+            gst=c.gst,
             r=c.r,
             memory_size=c.memory_size,
             speaker_embedding_dim=55,
diff --git a/tests/tts_tests/test_align_tts_train.py b/tests/tts_tests/test_align_tts_train.py
index aefc7dc3..c5fd098c 100644
--- a/tests/tts_tests/test_align_tts_train.py
+++ b/tests/tts_tests/test_align_tts_train.py
@@ -2,7 +2,8 @@ import glob
 import os
 import shutil

-from tests import get_tests_output_path, run_cli
+from tests import get_tests_output_path, run_cli, get_device_id
+
 from TTS.tts.configs import AlignTTSConfig

 config_path = os.path.join(get_tests_output_path(), "test_model_config.json")
@@ -30,7 +31,7 @@ config.save_json(config_path)

 # train the model for one epoch
 command_train = (
-    f"CUDA_VISIBLE_DEVICES='' python TTS/bin/train_align_tts.py --config_path {config_path} "
+    f"CUDA_VISIBLE_DEVICES='{get_device_id()}' python TTS/bin/train_align_tts.py --config_path {config_path} "
     f"--coqpit.output_path {output_path} "
     "--coqpit.datasets.0.name ljspeech "
     "--coqpit.datasets.0.meta_file_train metadata.csv "
@@ -43,6 +44,6 @@ run_cli(command_train)
 continue_path = max(glob.glob(os.path.join(output_path, "*/")), key=os.path.getmtime)

 # restore the model and continue training for one more epoch
-command_train = f"CUDA_VISIBLE_DEVICES='' python TTS/bin/train_align_tts.py --continue_path {continue_path} "
+command_train = f"CUDA_VISIBLE_DEVICES='{get_device_id()}' python TTS/bin/train_align_tts.py --continue_path {continue_path} "
 run_cli(command_train)
 shutil.rmtree(continue_path)
diff --git a/tests/tts_tests/test_glow_tts_train.py b/tests/tts_tests/test_glow_tts_train.py
index bb630aef..014fc5c4 100644
--- a/tests/tts_tests/test_glow_tts_train.py
+++ b/tests/tts_tests/test_glow_tts_train.py
@@ -2,7 +2,8 @@ import glob
 import os
 import shutil

-from tests import get_tests_output_path, run_cli
+from tests import get_tests_output_path, run_cli, get_device_id
+
 from TTS.tts.configs import GlowTTSConfig

 config_path = os.path.join(get_tests_output_path(), "test_model_config.json")
@@ -30,7 +31,7 @@ config.save_json(config_path)

 # train the model for one epoch
 command_train = (
-    f"CUDA_VISIBLE_DEVICES='' python TTS/bin/train_glow_tts.py --config_path {config_path} "
+    f"CUDA_VISIBLE_DEVICES='{get_device_id()}' python TTS/bin/train_glow_tts.py --config_path {config_path} "
     f"--coqpit.output_path {output_path} "
     "--coqpit.datasets.0.name ljspeech "
     "--coqpit.datasets.0.meta_file_train metadata.csv "
@@ -44,6 +45,6 @@ run_cli(command_train)
 continue_path = max(glob.glob(os.path.join(output_path, "*/")), key=os.path.getmtime)

 # restore the model and continue training for one more epoch
-command_train = f"CUDA_VISIBLE_DEVICES='' python TTS/bin/train_glow_tts.py --continue_path {continue_path} "
+command_train = f"CUDA_VISIBLE_DEVICES='{get_device_id()}' python TTS/bin/train_glow_tts.py --continue_path {continue_path} "
 run_cli(command_train)
 shutil.rmtree(continue_path)
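All of these *_train.py tests share one shape: build a config, save_json it, shell out to the training CLI, locate the newest run folder, resume from it, then clean up. run_cli comes from the tests package and is not defined in this patch; a sketch of the kind of helper assumed here:

    import os

    def run_cli(command):
        """Run a shell command and fail the calling test on a non-zero exit."""
        exit_status = os.system(command)
        assert exit_status == 0, f" [!] command failed: {command}"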
f"CUDA_VISIBLE_DEVICES='{get_device_id()}' python TTS/bin/train_glow_tts.py --config_path {config_path} " f"--coqpit.output_path {output_path} " "--coqpit.datasets.0.name ljspeech " "--coqpit.datasets.0.meta_file_train metadata.csv " @@ -44,6 +45,6 @@ run_cli(command_train) continue_path = max(glob.glob(os.path.join(output_path, "*/")), key=os.path.getmtime) # restore the model and continue training for one more epoch -command_train = f"CUDA_VISIBLE_DEVICES='' python TTS/bin/train_glow_tts.py --continue_path {continue_path} " +command_train = f"CUDA_VISIBLE_DEVICES='{get_device_id()}' python TTS/bin/train_glow_tts.py --continue_path {continue_path} " run_cli(command_train) shutil.rmtree(continue_path) diff --git a/tests/tts_tests/test_speedy_speech_train.py b/tests/tts_tests/test_speedy_speech_train.py index 1b356985..a2384cb2 100644 --- a/tests/tts_tests/test_speedy_speech_train.py +++ b/tests/tts_tests/test_speedy_speech_train.py @@ -2,7 +2,8 @@ import glob import os import shutil -from tests import get_tests_output_path, run_cli +from tests import get_tests_output_path, run_cli, get_device_id + from TTS.tts.configs import SpeedySpeechConfig config_path = os.path.join(get_tests_output_path(), "test_speedy_speech_config.json") @@ -30,7 +31,7 @@ config.save_json(config_path) # train the model for one epoch command_train = ( - f"CUDA_VISIBLE_DEVICES='' python TTS/bin/train_speedy_speech.py --config_path {config_path} " + f"CUDA_VISIBLE_DEVICES='{get_device_id()}' python TTS/bin/train_speedy_speech.py --config_path {config_path} " f"--coqpit.output_path {output_path} " "--coqpit.datasets.0.name ljspeech " "--coqpit.datasets.0.meta_file_train metadata.csv " @@ -44,6 +45,6 @@ run_cli(command_train) continue_path = max(glob.glob(os.path.join(output_path, "*/")), key=os.path.getmtime) # restore the model and continue training for one more epoch -command_train = f"CUDA_VISIBLE_DEVICES='' python TTS/bin/train_speedy_speech.py --continue_path {continue_path} " +command_train = f"CUDA_VISIBLE_DEVICES='{get_device_id()}' python TTS/bin/train_speedy_speech.py --continue_path {continue_path} " run_cli(command_train) shutil.rmtree(continue_path) diff --git a/tests/tts_tests/test_tacotron2_train.py b/tests/tts_tests/test_tacotron2_train.py index 2ac17502..5743d581 100644 --- a/tests/tts_tests/test_tacotron2_train.py +++ b/tests/tts_tests/test_tacotron2_train.py @@ -2,7 +2,8 @@ import glob import os import shutil -from tests import get_tests_output_path, run_cli +from tests import get_tests_output_path, run_cli, get_device_id + from TTS.tts.configs import Tacotron2Config config_path = os.path.join(get_tests_output_path(), "test_model_config.json") @@ -31,7 +32,7 @@ config.save_json(config_path) # train the model for one epoch command_train = ( - f"CUDA_VISIBLE_DEVICES='' python TTS/bin/train_tacotron.py --config_path {config_path} " + f"CUDA_VISIBLE_DEVICES='{get_device_id()}' python TTS/bin/train_tacotron.py --config_path {config_path} " f"--coqpit.output_path {output_path} " "--coqpit.datasets.0.name ljspeech " "--coqpit.datasets.0.meta_file_train metadata.csv " @@ -44,6 +45,6 @@ run_cli(command_train) continue_path = max(glob.glob(os.path.join(output_path, "*/")), key=os.path.getmtime) # restore the model and continue training for one more epoch -command_train = f"CUDA_VISIBLE_DEVICES='' python TTS/bin/train_tacotron.py --continue_path {continue_path} " +command_train = f"CUDA_VISIBLE_DEVICES='{get_device_id()}' python TTS/bin/train_tacotron.py --continue_path {continue_path} " run_cli(command_train) 
diff --git a/tests/tts_tests/test_tacotron_train.py b/tests/tts_tests/test_tacotron_train.py
index b45e4a64..cd00a6f4 100644
--- a/tests/tts_tests/test_tacotron_train.py
+++ b/tests/tts_tests/test_tacotron_train.py
@@ -2,7 +2,8 @@ import glob
 import os
 import shutil

-from tests import get_tests_output_path, run_cli
+from tests import get_tests_output_path, run_cli, get_device_id
+
 from TTS.tts.configs import TacotronConfig

 config_path = os.path.join(get_tests_output_path(), "test_model_config.json")
@@ -30,7 +31,7 @@ config.save_json(config_path)

 # train the model for one epoch
 command_train = (
-    f"CUDA_VISIBLE_DEVICES='' python TTS/bin/train_tacotron.py --config_path {config_path} "
+    f"CUDA_VISIBLE_DEVICES='{get_device_id()}' python TTS/bin/train_tacotron.py --config_path {config_path} "
     f"--coqpit.output_path {output_path} "
     "--coqpit.datasets.0.name ljspeech "
     "--coqpit.datasets.0.meta_file_train metadata.csv "
@@ -43,6 +44,6 @@ run_cli(command_train)
 continue_path = max(glob.glob(os.path.join(output_path, "*/")), key=os.path.getmtime)

 # restore the model and continue training for one more epoch
-command_train = f"CUDA_VISIBLE_DEVICES='' python TTS/bin/train_tacotron.py --continue_path {continue_path} "
+command_train = f"CUDA_VISIBLE_DEVICES='{get_device_id()}' python TTS/bin/train_tacotron.py --continue_path {continue_path} "
 run_cli(command_train)
 shutil.rmtree(continue_path)
diff --git a/tests/vocoder_tests/test_fullband_melgan_train.py b/tests/vocoder_tests/test_fullband_melgan_train.py
index 64355af9..d052cc76 100644
--- a/tests/vocoder_tests/test_fullband_melgan_train.py
+++ b/tests/vocoder_tests/test_fullband_melgan_train.py
@@ -2,7 +2,8 @@ import glob
 import os
 import shutil

-from tests import get_tests_output_path, run_cli
+from tests import get_tests_output_path, run_cli, get_device_id
+
 from TTS.vocoder.configs import FullbandMelganConfig

 config_path = os.path.join(get_tests_output_path(), "test_vocoder_config.json")
@@ -28,13 +29,13 @@ config.audio.trim_db = 60
 config.save_json(config_path)

 # train the model for one epoch
-command_train = f"CUDA_VISIBLE_DEVICES='' python TTS/bin/train_vocoder_gan.py --config_path {config_path} "
+command_train = f"CUDA_VISIBLE_DEVICES='{get_device_id()}' python TTS/bin/train_vocoder_gan.py --config_path {config_path} "
 run_cli(command_train)

 # Find latest folder
 continue_path = max(glob.glob(os.path.join(output_path, "*/")), key=os.path.getmtime)

 # restore the model and continue training for one more epoch
-command_train = f"CUDA_VISIBLE_DEVICES='' python TTS/bin/train_vocoder_gan.py --continue_path {continue_path} "
+command_train = f"CUDA_VISIBLE_DEVICES='{get_device_id()}' python TTS/bin/train_vocoder_gan.py --continue_path {continue_path} "
 run_cli(command_train)
 shutil.rmtree(continue_path)
diff --git a/tests/vocoder_tests/test_hifigan_train.py b/tests/vocoder_tests/test_hifigan_train.py
index fa431eb3..1e0e303c 100644
--- a/tests/vocoder_tests/test_hifigan_train.py
+++ b/tests/vocoder_tests/test_hifigan_train.py
@@ -2,7 +2,8 @@ import glob
 import os
 import shutil

-from tests import get_tests_output_path, run_cli
+from tests import get_tests_output_path, run_cli, get_device_id
+
 from TTS.vocoder.configs import HifiganConfig

 config_path = os.path.join(get_tests_output_path(), "test_vocoder_config.json")
@@ -29,13 +30,13 @@ config.audio.trim_db = 60
 config.save_json(config_path)

 # train the model for one epoch
-command_train = f"CUDA_VISIBLE_DEVICES='' python TTS/bin/train_vocoder_gan.py --config_path {config_path} "
+command_train = f"CUDA_VISIBLE_DEVICES='{get_device_id()}' python TTS/bin/train_vocoder_gan.py --config_path {config_path} "
 run_cli(command_train)

 # Find latest folder
 continue_path = max(glob.glob(os.path.join(output_path, "*/")), key=os.path.getmtime)

 # restore the model and continue training for one more epoch
-command_train = f"CUDA_VISIBLE_DEVICES='' python TTS/bin/train_vocoder_gan.py --continue_path {continue_path} "
+command_train = f"CUDA_VISIBLE_DEVICES='{get_device_id()}' python TTS/bin/train_vocoder_gan.py --continue_path {continue_path} "
 run_cli(command_train)
 shutil.rmtree(continue_path)
{config_path} " +command_train = f"CUDA_VISIBLE_DEVICES='{get_device_id()}' python TTS/bin/train_vocoder_gan.py --config_path {config_path} " run_cli(command_train) # Find latest folder continue_path = max(glob.glob(os.path.join(output_path, "*/")), key=os.path.getmtime) # restore the model and continue training for one more epoch -command_train = f"CUDA_VISIBLE_DEVICES='' python TTS/bin/train_vocoder_gan.py --continue_path {continue_path} " +command_train = f"CUDA_VISIBLE_DEVICES='{get_device_id()}' python TTS/bin/train_vocoder_gan.py --continue_path {continue_path} " run_cli(command_train) shutil.rmtree(continue_path) diff --git a/tests/vocoder_tests/test_melgan_train.py b/tests/vocoder_tests/test_melgan_train.py index b362ce86..bec7d5f5 100644 --- a/tests/vocoder_tests/test_melgan_train.py +++ b/tests/vocoder_tests/test_melgan_train.py @@ -2,9 +2,10 @@ import glob import os import shutil -from tests import get_tests_output_path, run_cli +from tests import get_tests_output_path, run_cli, get_device_id from TTS.vocoder.configs import MelganConfig + config_path = os.path.join(get_tests_output_path(), "test_vocoder_config.json") output_path = os.path.join(get_tests_output_path(), "train_outputs") @@ -28,13 +29,13 @@ config.audio.trim_db = 60 config.save_json(config_path) # train the model for one epoch -command_train = f"CUDA_VISIBLE_DEVICES='' python TTS/bin/train_vocoder_gan.py --config_path {config_path} " +command_train = f"CUDA_VISIBLE_DEVICES='{get_device_id()}' python TTS/bin/train_vocoder_gan.py --config_path {config_path} " run_cli(command_train) # Find latest folder continue_path = max(glob.glob(os.path.join(output_path, "*/")), key=os.path.getmtime) # restore the model and continue training for one more epoch -command_train = f"CUDA_VISIBLE_DEVICES='' python TTS/bin/train_vocoder_gan.py --continue_path {continue_path} " +command_train = f"CUDA_VISIBLE_DEVICES='{get_device_id()}' python TTS/bin/train_vocoder_gan.py --continue_path {continue_path} " run_cli(command_train) shutil.rmtree(continue_path) diff --git a/tests/vocoder_tests/test_multiband_melgan_train.py b/tests/vocoder_tests/test_multiband_melgan_train.py index bd2ae86f..583be8da 100644 --- a/tests/vocoder_tests/test_multiband_melgan_train.py +++ b/tests/vocoder_tests/test_multiband_melgan_train.py @@ -2,7 +2,8 @@ import glob import os import shutil -from tests import get_tests_output_path, run_cli +from tests import get_tests_output_path, run_cli, get_device_id + from TTS.vocoder.configs import MultibandMelganConfig config_path = os.path.join(get_tests_output_path(), "test_vocoder_config.json") @@ -28,13 +29,13 @@ config.audio.trim_db = 60 config.save_json(config_path) # train the model for one epoch -command_train = f"CUDA_VISIBLE_DEVICES='' python TTS/bin/train_vocoder_gan.py --config_path {config_path} " +command_train = f"CUDA_VISIBLE_DEVICES='{get_device_id()}' python TTS/bin/train_vocoder_gan.py --config_path {config_path} " run_cli(command_train) # Find latest folder continue_path = max(glob.glob(os.path.join(output_path, "*/")), key=os.path.getmtime) # restore the model and continue training for one more epoch -command_train = f"CUDA_VISIBLE_DEVICES='' python TTS/bin/train_vocoder_gan.py --continue_path {continue_path} " +command_train = f"CUDA_VISIBLE_DEVICES='{get_device_id()}' python TTS/bin/train_vocoder_gan.py --continue_path {continue_path} " run_cli(command_train) shutil.rmtree(continue_path) diff --git a/tests/vocoder_tests/test_parallel_wavegan_train.py 
diff --git a/tests/vocoder_tests/test_parallel_wavegan_train.py b/tests/vocoder_tests/test_parallel_wavegan_train.py
index 5d89d069..73cfa39c 100644
--- a/tests/vocoder_tests/test_parallel_wavegan_train.py
+++ b/tests/vocoder_tests/test_parallel_wavegan_train.py
@@ -2,7 +2,8 @@ import glob
 import os
 import shutil

-from tests import get_tests_output_path, run_cli
+from tests import get_tests_output_path, run_cli, get_device_id
+
 from TTS.vocoder.configs import ParallelWaveganConfig

 config_path = os.path.join(get_tests_output_path(), "test_vocoder_config.json")
@@ -28,13 +29,13 @@ config.audio.trim_db = 60
 config.save_json(config_path)

 # train the model for one epoch
-command_train = f"CUDA_VISIBLE_DEVICES='' python TTS/bin/train_vocoder_gan.py --config_path {config_path} "
+command_train = f"CUDA_VISIBLE_DEVICES='{get_device_id()}' python TTS/bin/train_vocoder_gan.py --config_path {config_path} "
 run_cli(command_train)

 # Find latest folder
 continue_path = max(glob.glob(os.path.join(output_path, "*/")), key=os.path.getmtime)

 # restore the model and continue training for one more epoch
-command_train = f"CUDA_VISIBLE_DEVICES='' python TTS/bin/train_vocoder_gan.py --continue_path {continue_path} "
+command_train = f"CUDA_VISIBLE_DEVICES='{get_device_id()}' python TTS/bin/train_vocoder_gan.py --continue_path {continue_path} "
 run_cli(command_train)
 shutil.rmtree(continue_path)
diff --git a/tests/vocoder_tests/test_vocoder_gan_datasets.py b/tests/vocoder_tests/test_vocoder_gan_datasets.py
index 7202d06e..cbf6da77 100644
--- a/tests/vocoder_tests/test_vocoder_gan_datasets.py
+++ b/tests/vocoder_tests/test_vocoder_gan_datasets.py
@@ -3,17 +3,17 @@ import os
 import numpy as np
 from torch.utils.data import DataLoader

-from tests import get_tests_input_path, get_tests_output_path, get_tests_path
+from tests import get_tests_output_path, get_tests_path
 from TTS.utils.audio import AudioProcessor
-from TTS.utils.io import load_config
 from TTS.vocoder.datasets.gan_dataset import GANDataset
 from TTS.vocoder.datasets.preprocess import load_wav_data
+from TTS.vocoder.configs import BaseGANVocoderConfig

 file_path = os.path.dirname(os.path.realpath(__file__))
 OUTPATH = os.path.join(get_tests_output_path(), "loader_tests/")
 os.makedirs(OUTPATH, exist_ok=True)

-C = load_config(os.path.join(get_tests_input_path(), "test_config.json"))
+C = BaseGANVocoderConfig()

 test_data_path = os.path.join(get_tests_path(), "data/ljspeech/")
 ok_ljspeech = os.path.exists(test_data_path)
@@ -46,6 +46,8 @@ def gan_dataset_case(

     def check_item(feat, wav):
         """Pass a single pair of features and waveform"""
+        feat = feat.numpy()
+        wav = wav.numpy()
         expected_feat_shape = (batch_size, ap.num_mels, seq_len // hop_len + conv_pad * 2)

         # check shapes
@@ -61,7 +63,7 @@ def gan_dataset_case(
         # the first 2 and the last 2 frames are skipped due to the padding
         # differences in stft
         max_diff = abs((feat - mel[:, : feat.shape[-1]])[:, 2:-2]).max()
-        assert max_diff <= 0, f" [!] {max_diff}"
+        assert max_diff <= 1e-6, f" [!] {max_diff}"

     # return random segments or return the whole audio
     if return_segments:
@@ -69,7 +71,6 @@ def gan_dataset_case(
         for item1, item2 in loader:
            feat1, wav1 = item1
            feat2, wav2 = item2
-           check_item(feat1, wav1)
            check_item(feat2, wav2)
            count_iter += 1
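Two small correctness fixes in the GAN dataset test above: the batch tensors are converted to numpy up front so the comparison against the AudioProcessor's numpy mel output works on one array type, and the old max_diff <= 0 assertion, which demanded bit-identical spectrograms, is relaxed to a float32-sized epsilon. An equivalent assertion in numpy's own idiom, with feat_trim and mel_trim standing in for the two already-trimmed arrays being compared:

    import numpy as np

    # same 1e-6 absolute-tolerance check, but with a readable failure report
    np.testing.assert_allclose(feat_trim, mel_trim, atol=1e-6, rtol=0)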
{max_diff}" # return random segments or return the whole audio if return_segments: @@ -69,7 +71,6 @@ def gan_dataset_case( for item1, item2 in loader: feat1, wav1 = item1 feat2, wav2 = item2 - check_item(feat1, wav1) check_item(feat2, wav2) count_iter += 1 diff --git a/tests/vocoder_tests/test_vocoder_losses.py b/tests/vocoder_tests/test_vocoder_losses.py index 915c5947..65b1fa86 100644 --- a/tests/vocoder_tests/test_vocoder_losses.py +++ b/tests/vocoder_tests/test_vocoder_losses.py @@ -14,8 +14,7 @@ os.makedirs(OUT_PATH, exist_ok=True) WAV_FILE = os.path.join(get_tests_input_path(), "example_1.wav") -C = load_config(os.path.join(get_tests_input_path(), "test_config.json")) -ap = AudioProcessor(**C.audio) +ap = AudioProcessor(**BaseAudioConfig().to_dict()) def test_torch_stft(): diff --git a/tests/vocoder_tests/test_vocoder_wavernn_datasets.py b/tests/vocoder_tests/test_vocoder_wavernn_datasets.py index 755d4772..588f529f 100644 --- a/tests/vocoder_tests/test_vocoder_wavernn_datasets.py +++ b/tests/vocoder_tests/test_vocoder_wavernn_datasets.py @@ -14,7 +14,7 @@ file_path = os.path.dirname(os.path.realpath(__file__)) OUTPATH = os.path.join(get_tests_output_path(), "loader_tests/") os.makedirs(OUTPATH, exist_ok=True) -C = load_config(os.path.join(get_tests_input_path(), "test_vocoder_wavernn_config.json")) +C = WavernnConfig() test_data_path = os.path.join(get_tests_path(), "data/ljspeech/") test_mel_feat_path = os.path.join(test_data_path, "mel") diff --git a/tests/vocoder_tests/test_wavegrad_train.py b/tests/vocoder_tests/test_wavegrad_train.py index c2269bbd..b52715c7 100644 --- a/tests/vocoder_tests/test_wavegrad_train.py +++ b/tests/vocoder_tests/test_wavegrad_train.py @@ -2,39 +2,44 @@ import glob import os import shutil -from tests import get_tests_output_path, run_cli +from tests import get_tests_output_path, run_cli, get_device_id + from TTS.vocoder.configs import WavegradConfig config_path = os.path.join(get_tests_output_path(), "test_vocoder_config.json") output_path = os.path.join(get_tests_output_path(), "train_outputs") -config = WavegradConfig( - batch_size=8, - eval_batch_size=8, - num_loader_workers=0, - num_val_loader_workers=0, - run_eval=True, - test_delay_epochs=-1, - epochs=1, - seq_len=8192, - eval_split_size=1, - print_step=1, - print_eval=True, - data_path="tests/data/ljspeech", - output_path=output_path, -) +config = WavegradConfig(batch_size=8, + eval_batch_size=8, + num_loader_workers=0, + num_val_loader_workers=0, + run_eval=True, + test_delay_epochs=-1, + epochs=1, + seq_len=8192, + eval_split_size=1, + print_step=1, + print_eval=True, + data_path="tests/data/ljspeech", + output_path=output_path, + test_noise_schedule={ + "min_val": 1e-6, + "max_val": 1e-2, + "num_steps": 2 + }) config.audio.do_trim_silence = True config.audio.trim_db = 60 config.save_json(config_path) # train the model for one epoch -command_train = f"CUDA_VISIBLE_DEVICES='' python TTS/bin/train_vocoder_wavegrad.py --config_path {config_path} " +command_train = f"CUDA_VISIBLE_DEVICES='{get_device_id()}' python TTS/bin/train_vocoder_wavegrad.py --config_path {config_path} " run_cli(command_train) # Find latest folder -continue_path = max(glob.glob(os.path.join(output_path, "*/")), key=os.path.getmtime) +continue_path = max(glob.glob(os.path.join(output_path, "*/")), + key=os.path.getmtime) # restore the model and continue training for one more epoch -command_train = f"CUDA_VISIBLE_DEVICES='' python TTS/bin/train_vocoder_wavegrad.py --continue_path {continue_path} " +command_train = 
f"CUDA_VISIBLE_DEVICES='{get_device_id()}' python TTS/bin/train_vocoder_wavegrad.py --continue_path {continue_path} " run_cli(command_train) shutil.rmtree(continue_path) diff --git a/tests/vocoder_tests/test_wavernn_train.py b/tests/vocoder_tests/test_wavernn_train.py index 1ac9d9eb..4597bb8f 100644 --- a/tests/vocoder_tests/test_wavernn_train.py +++ b/tests/vocoder_tests/test_wavernn_train.py @@ -2,7 +2,8 @@ import glob import os import shutil -from tests import get_tests_output_path, run_cli +from tests import get_tests_output_path, run_cli, get_device_id + from TTS.vocoder.configs import WavernnConfig config_path = os.path.join(get_tests_output_path(), "test_vocoder_config.json") @@ -16,7 +17,7 @@ config = WavernnConfig( run_eval=True, test_delay_epochs=-1, epochs=1, - seq_len=8192, + seq_len=256, # for shorter test time eval_split_size=1, print_step=1, print_eval=True, @@ -28,13 +29,14 @@ config.audio.trim_db = 60 config.save_json(config_path) # train the model for one epoch -command_train = f"CUDA_VISIBLE_DEVICES='' python TTS/bin/train_vocoder_wavernn.py --config_path {config_path} " +command_train = f"CUDA_VISIBLE_DEVICES='{get_device_id()}' python TTS/bin/train_vocoder_wavernn.py --config_path {config_path} " run_cli(command_train) # Find latest folder -continue_path = max(glob.glob(os.path.join(output_path, "*/")), key=os.path.getmtime) +continue_path = max(glob.glob(os.path.join(output_path, "*/")), + key=os.path.getmtime) # restore the model and continue training for one more epoch -command_train = f"CUDA_VISIBLE_DEVICES='' python TTS/bin/train_vocoder_wavernn.py --continue_path {continue_path} " +command_train = f"CUDA_VISIBLE_DEVICES='{get_device_id()}' python TTS/bin/train_vocoder_wavernn.py --continue_path {continue_path} " run_cli(command_train) shutil.rmtree(continue_path)