mirror of https://github.com/coqui-ai/TTS.git
refactoring tests after Coqpit
parent 87384c6008
commit 5aee30443f
@@ -2,13 +2,7 @@ set -e
 TF_CPP_MIN_LOG_LEVEL=3

+# runtime bash based tests
+# TODO: move these to python
 ./tests/bash_tests/test_demo_server.sh && \
 ./tests/bash_tests/test_resample.sh && \
-./tests/bash_tests/test_tacotron_train.sh && \
-./tests/bash_tests/test_glow-tts_train.sh && \
-./tests/bash_tests/test_vocoder_gan_train.sh && \
-./tests/bash_tests/test_vocoder_wavernn_train.sh && \
-./tests/bash_tests/test_vocoder_wavegrad_train.sh && \
-./tests/bash_tests/test_speedy_speech_train.sh && \
-./tests/bash_tests/test_aligntts_train.sh && \
 ./tests/bash_tests/test_compute_statistics.sh
@@ -1,24 +1,24 @@
 {
     "audio":{
-        "audio_processor": "audio", // to use dictate different audio processors, if available.
-        "num_mels": 80, // size of the mel spec frame.
-        "fft_size": 1024, // number of stft frequency levels. Size of the linear spectogram frame.
-        "sample_rate": 22050, // wav sample-rate. If different than the original data, it is resampled.
-        "frame_length_ms": null, // stft window length in ms.
-        "frame_shift_ms": null, // stft window hop-lengh in ms.
+        "audio_processor": "audio",
+        "num_mels": 80,
+        "fft_size": 1024,
+        "sample_rate": 22050,
+        "frame_length_ms": null,
+        "frame_shift_ms": null,
         "hop_length": 256,
         "win_length": 1024,
-        "preemphasis": 0.97, // pre-emphasis to reduce spec noise and make it more structured. If 0.0, no -pre-emphasis.
-        "min_level_db": -100, // normalization range
-        "ref_level_db": 20, // reference level db, theoretically 20db is the sound of air.
-        "power": 1.5, // value to sharpen wav signals after GL algorithm.
-        "griffin_lim_iters": 30, // #griffin-lim iterations. 30-60 is a good range. Larger the value, slower the generation.
-        "signal_norm": true, // normalize the spec values in range [0, 1]
-        "symmetric_norm": true, // move normalization to range [-1, 1]
-        "clip_norm": true, // clip normalized values into the range.
-        "max_norm": 4, // scale normalization to range [-max_norm, max_norm] or [0, max_norm]
-        "mel_fmin": 0, // minimum freq level for mel-spec. ~50 for male and ~95 for female voices. Tune for dataset!!
-        "mel_fmax": 8000, // maximum freq level for mel-spec. Tune for dataset!!
+        "preemphasis": 0.97,
+        "min_level_db": -100,
+        "ref_level_db": 20,
+        "power": 1.5,
+        "griffin_lim_iters": 30,
+        "signal_norm": true,
+        "symmetric_norm": true,
+        "clip_norm": true,
+        "max_norm": 4,
+        "mel_fmin": 0,
+        "mel_fmax": 8000,
         "do_trim_silence": false,
         "spec_gain": 20
     },
@@ -53,15 +53,15 @@
     "max_seq_len": 300,
     "log_dir": "tests/outputs/",

     // MULTI-SPEAKER and GST
-    "use_speaker_embedding": false, // use speaker embedding to enable multi-speaker learning.
-    "use_gst": true, // use global style tokens
-    "gst": { // gst parameter if gst is enabled
-        "gst_style_input": null, // Condition the style input either on a
-                                 // -> wave file [path to wave] or
-                                 // -> dictionary using the style tokens {'token1': 'value', 'token2': 'value'} example {"0": 0.15, "1": 0.15, "5": -0.15}
-                                 // with the dictionary being len(dict) <= len(gst_num_style_tokens).
-        "gst_use_speaker_embedding": true, // if true pass speaker embedding in attention input GST.
+    "use_speaker_embedding": false,
+    "use_gst": true,
+    "gst": {
+        "gst_style_input": null,
+        "gst_use_speaker_embedding": true,
         "gst_embedding_dim": 512,
         "gst_num_heads": 4,
         "gst_num_style_tokens": 10
@@ -1,5 +1,6 @@
 {
+    "model": "speaker_encoder",
     "run_name": "test_speaker_encoder",
     "run_description": "test speaker encoder.",
     "audio":{

@@ -42,8 +43,9 @@
     "checkpoint": true, // If true, it saves checkpoints per "save_step"
     "save_step": 1000, // Number of training steps expected to save traning stats and checkpoints.
     "print_step": 20, // Number of steps to log traning on console.
+    "batch_size": 32,
     "output_path": "", // DATASET-RELATED: output path for all training outputs.
     "model": {
         "model_params": {
             "input_dim": 40,
             "proj_dim": 256,
             "lstm_dim": 768,

@@ -87,7 +87,6 @@

     // MULTI-SPEAKER and GST
     "use_speaker_embedding": false, // use speaker embedding to enable multi-speaker learning.
     "use_gst": true, // use global style tokens
     "gst": { // gst parameter if gst is enabled
         "gst_style_input": null, // Condition the style input either on a
-                                 // -> wave file [path to wave] or
@@ -3,21 +3,21 @@ import unittest

 from tests import get_tests_input_path, get_tests_output_path, get_tests_path
 from TTS.utils.audio import AudioProcessor
-from TTS.utils.io import load_config
+from TTS.config import BaseAudioConfig

 TESTS_PATH = get_tests_path()
 OUT_PATH = os.path.join(get_tests_output_path(), "audio_tests")
 WAV_FILE = os.path.join(get_tests_input_path(), "example_1.wav")

 os.makedirs(OUT_PATH, exist_ok=True)
-conf = load_config(os.path.join(get_tests_input_path(), "test_config.json"))
+conf = BaseAudioConfig(mel_fmax=8000)


 # pylint: disable=protected-access
 class TestAudio(unittest.TestCase):
     def __init__(self, *args, **kwargs):
         super().__init__(*args, **kwargs)
-        self.ap = AudioProcessor(**conf.audio)
+        self.ap = AudioProcessor(**conf)

     def test_audio_synthesis(self):
         """1. load wav

@@ -163,12 +163,12 @@ class TestAudio(unittest.TestCase):

     def test_scaler(self):
         scaler_stats_path = os.path.join(get_tests_input_path(), "scale_stats.npy")
-        conf.audio["stats_path"] = scaler_stats_path
-        conf.audio["preemphasis"] = 0.0
-        conf.audio["do_trim_silence"] = True
-        conf.audio["signal_norm"] = True
+        conf.stats_path = scaler_stats_path
+        conf.preemphasis = 0.0
+        conf.do_trim_silence = True
+        conf.signal_norm = True

-        ap = AudioProcessor(**conf.audio)
+        ap = AudioProcessor(**conf)
         mel_mean, mel_std, linear_mean, linear_std, _ = ap.load_stats(scaler_stats_path)
         ap.setup_scaler(mel_mean, mel_std, linear_mean, linear_std)
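A note on the pattern above: a Coqpit config object behaves both as a dataclass and as a dict, so the test can unpack it straight into AudioProcessor instead of reaching into a nested conf.audio dictionary. A minimal sketch of the new usage, built only from names in this diff:

    from TTS.config import BaseAudioConfig
    from TTS.utils.audio import AudioProcessor

    # override a single field; every other audio field keeps its default
    conf = BaseAudioConfig(mel_fmax=8000)

    # Coqpit supports dict-style unpacking, so **conf passes sample_rate,
    # num_mels, hop_length, ... as keyword arguments
    ap = AudioProcessor(**conf)

    # fields stay writable after construction, as test_scaler relies on
    conf.do_trim_silence = True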
@@ -17,7 +17,7 @@ torch.manual_seed(1)
 use_cuda = torch.cuda.is_available()
 device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

-c = load_config(os.path.join(get_tests_input_path(), "test_config.json"))
+c = GlowTTSConfig()

 ap = AudioProcessor(**c.audio)
 WAV_FILE = os.path.join(get_tests_input_path(), "example_1.wav")
@@ -10,13 +10,17 @@ from tests import get_tests_input_path, get_tests_output_path
 from TTS.tts.datasets import TTSDataset
 from TTS.tts.datasets.preprocess import ljspeech
 from TTS.utils.audio import AudioProcessor
-from TTS.utils.io import load_config
+from TTS.tts.configs import BaseTTSConfig

 # pylint: disable=unused-variable

 OUTPATH = os.path.join(get_tests_output_path(), "loader_tests/")
 os.makedirs(OUTPATH, exist_ok=True)
-c = load_config(os.path.join(get_tests_input_path(), "test_config.json"))
+
+# create a dummy config for testing data loaders.
+c = BaseTTSConfig(text_cleaner='english_cleaners', num_loader_workers=0, batch_size=2)
+c.r = 5
+c.data_path = "tests/data/ljspeech/"
 ok_ljspeech = os.path.exists(c.data_path)

 DATA_EXIST = True

@@ -40,7 +44,7 @@ class TestTTSDataset(unittest.TestCase):
             compute_linear_spec=True,
             ap=self.ap,
             meta_data=items,
-            tp=c.characters if "characters" in c.keys() else None,
+            tp=c.characters,
             batch_group_size=bgs,
             min_seq_len=c.min_seq_len,
             max_seq_len=float("inf"),
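Two details in this hunk: constructor kwargs map one-to-one onto config fields, and `tp=c.characters` loses its `"characters" in c.keys()` guard because BaseTTSConfig evidently always defines the field. The dummy-config pattern in isolation, using only names from the diff:

    from TTS.tts.configs import BaseTTSConfig

    # create a dummy config for testing data loaders
    c = BaseTTSConfig(text_cleaner='english_cleaners', num_loader_workers=0, batch_size=2)
    c.r = 5                               # decoder reduction factor, set after construction
    c.data_path = "tests/data/ljspeech/"  # fixture dataset shipped with the tests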
@@ -2,7 +2,8 @@ import glob
 import os
 import shutil

-from tests import get_tests_output_path, run_cli
+from tests import get_tests_output_path, run_cli, get_device_id

 from TTS.speaker_encoder.speaker_encoder_config import SpeakerEncoderConfig
+from TTS.config.shared_configs import BaseAudioConfig

@@ -15,9 +16,9 @@ config = SpeakerEncoderConfig(
     num_speakers_in_batch=1,
     num_utters_per_speaker=10,
     num_loader_workers=0,
-    max_train_step=10,
+    max_train_step=2,
     print_step=1,
-    save_step=10,
+    save_step=1,
     print_eval=True,
+    audio=BaseAudioConfig(num_mels=40)
 )

@@ -27,7 +28,7 @@ config.save_json(config_path)

 # train the model for one epoch
 command_train = (
-    f"CUDA_VISIBLE_DEVICES='' python TTS/bin/train_encoder.py --config_path {config_path} "
+    f"CUDA_VISIBLE_DEVICES='{get_device_id()}' python TTS/bin/train_encoder.py --config_path {config_path} "
     f"--coqpit.output_path {output_path} "
     "--coqpit.datasets.0.name ljspeech "
     "--coqpit.datasets.0.meta_file_train metadata.csv "

@@ -41,6 +42,6 @@ run_cli(command_train)
 continue_path = max(glob.glob(os.path.join(output_path, "*/")), key=os.path.getmtime)

 # restore the model and continue training for one more epoch
-command_train = f"CUDA_VISIBLE_DEVICES='' python TTS/bin/train_encoder.py --continue_path {continue_path} "
+command_train = f"CUDA_VISIBLE_DEVICES='{get_device_id()}' python TTS/bin/train_encoder.py --continue_path {continue_path} "
 run_cli(command_train)
 shutil.rmtree(continue_path)
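These CLI smoke tests now route CUDA_VISIBLE_DEVICES through a shared get_device_id() helper imported from tests, rather than hard-coding an empty string, so the same command works on CPU-only CI and on a GPU machine. The helper's body is not part of this diff; a plausible sketch, assuming it simply selects GPU 0 when one is available:

    import torch

    def get_device_id():
        # Hypothetical sketch -- the real helper lives in tests/__init__.py
        # and is not shown in this diff. Returns a device id to place in
        # CUDA_VISIBLE_DEVICES, or '' to force CPU execution.
        return "0" if torch.cuda.is_available() else ""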
@@ -26,11 +26,11 @@ class SpeakerManagerTest(unittest.TestCase):
 def test_speaker_embedding():
     # load config
     config = load_config(encoder_config_path)
-    config["audio"]["resample"] = True
+    config.audio.resample = True

     # create a dummy speaker encoder
-    model = SpeakerEncoder(**config.model)
-    save_checkpoint(model, None, None, get_tests_input_path(), 0, 0)
+    model = SpeakerEncoder(**config.model_params)
+    save_checkpoint(model, None, None, get_tests_input_path(), 0)

     # load audio processor and speaker encoder
     ap = AudioProcessor(**config.audio)
@@ -1,6 +1,7 @@
 import os

-from tests import get_tests_output_path, run_cli
+from tests import get_tests_output_path, run_cli, get_device_id


 def test_synthesize():
@@ -15,8 +15,8 @@ class SynthesizerTest(unittest.TestCase):
         # pylint: disable=global-statement
         global symbols, phonemes
         config = load_config(os.path.join(get_tests_output_path(), "dummy_model_config.json"))
-        if "characters" in config.keys():
-            symbols, phonemes = make_symbols(**config.characters)
+        if config.has('characters') and config.characters:
+            symbols, phonemes = make_symbols(**config.characters.to_dict())

         num_chars = len(phonemes) if config.use_phonemes else len(symbols)
         model = setup_model(num_chars, 0, config)

@@ -25,11 +25,10 @@ class SynthesizerTest(unittest.TestCase):

     def test_in_out(self):
         self._create_random_model()
-        config = load_config(os.path.join(get_tests_input_path(), "server_config.json"))
         tts_root_path = get_tests_output_path()
-        config["tts_checkpoint"] = os.path.join(tts_root_path, config["tts_checkpoint"])
-        config["tts_config"] = os.path.join(tts_root_path, config["tts_config"])
-        synthesizer = Synthesizer(config["tts_checkpoint"], config["tts_config"], None, None)
+        tts_checkpoint = os.path.join(tts_root_path, 'checkpoint_10.pth.tar')
+        tts_config = os.path.join(tts_root_path, 'dummy_model_config.json')
+        synthesizer = Synthesizer(tts_checkpoint, tts_config, None, None)
         synthesizer.tts("Better this test works!!")

     def test_split_into_sentences(self):
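The hunks above lean on Coqpit's dual interface: a loaded config supports attribute access into nested sub-configs, a has() guard for optional fields, and to_dict() for call sites that expect a plain mapping. Collected in one fragment (load_config and make_symbols are the test module's existing imports):

    config = load_config(encoder_config_path)

    # attribute access replaces config["audio"]["resample"]
    config.audio.resample = True

    # guard an optional field, then hand a plain dict to make_symbols
    if config.has('characters') and config.characters:
        symbols, phonemes = make_symbols(**config.characters.to_dict())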
@@ -17,7 +17,7 @@ torch.manual_seed(1)
 use_cuda = torch.cuda.is_available()
 device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

-c = load_config(os.path.join(get_tests_input_path(), "test_config.json"))
+c = Tacotron2Config()

 ap = AudioProcessor(**c.audio)
 WAV_FILE = os.path.join(get_tests_input_path(), "example_1.wav")

@@ -152,10 +152,8 @@ class TacotronGSTTrainTest(unittest.TestCase):
             num_chars=24,
             r=c.r,
             num_speakers=5,
-            gst=True,
-            gst_embedding_dim=c.gst["gst_embedding_dim"],
-            gst_num_heads=c.gst["gst_num_heads"],
-            gst_style_tokens=c.gst["gst_style_tokens"],
+            use_gst=True,
+            gst=c.gst
         ).to(device)
         model.train()
         model_ref = copy.deepcopy(model)

@@ -216,10 +214,8 @@ class TacotronGSTTrainTest(unittest.TestCase):
             num_chars=24,
             r=c.r,
             num_speakers=5,
-            gst=True,
-            gst_embedding_dim=c.gst["gst_embedding_dim"],
-            gst_num_heads=c.gst["gst_num_heads"],
-            gst_style_tokens=c.gst["gst_style_tokens"],
+            use_gst=True,
+            gst =c.gst
         ).to(device)
         model.train()
         model_ref = copy.deepcopy(model)

@@ -280,11 +276,8 @@ class SCGSTMultiSpeakeTacotronTrainTest(unittest.TestCase):
             r=c.r,
             num_speakers=5,
             speaker_embedding_dim=55,
-            gst=True,
-            gst_embedding_dim=c.gst["gst_embedding_dim"],
-            gst_num_heads=c.gst["gst_num_heads"],
-            gst_style_tokens=c.gst["gst_style_tokens"],
-            gst_use_speaker_embedding=c.gst["gst_use_speaker_embedding"],
+            use_gst=True,
+            gst=c.gst
         ).to(device)
         model.train()
         model_ref = copy.deepcopy(model)
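The recurring change in these constructors: four or five loose gst_* keyword arguments collapse into a use_gst flag plus the whole gst sub-config. Before and after, condensed from the hunks above (assuming the Tacotron2 class these tests instantiate):

    # old: each GST field unpacked from the config by hand
    model = Tacotron2(num_chars=24, r=c.r, num_speakers=5,
                      gst=True,
                      gst_embedding_dim=c.gst["gst_embedding_dim"],
                      gst_num_heads=c.gst["gst_num_heads"],
                      gst_style_tokens=c.gst["gst_style_tokens"])

    # new: the GST sub-config travels as a single object
    model = Tacotron2(num_chars=24, r=c.r, num_speakers=5,
                      use_gst=True, gst=c.gst)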
@@ -19,7 +19,7 @@ torch.manual_seed(1)
 use_cuda = torch.cuda.is_available()
 device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

-c = load_config(os.path.join(get_tests_input_path(), "test_config.json"))
+c = Tacotron2Config()


 class TacotronTFTrainTest(unittest.TestCase):
@@ -18,7 +18,7 @@ torch.manual_seed(1)
 use_cuda = torch.cuda.is_available()
 device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

-c = load_config(os.path.join(get_tests_input_path(), "test_config.json"))
+c = TacotronConfig()

 ap = AudioProcessor(**c.audio)
 WAV_FILE = os.path.join(get_tests_input_path(), "example_1.wav")

@@ -175,10 +175,8 @@ class TacotronGSTTrainTest(unittest.TestCase):
         model = Tacotron(
             num_chars=32,
             num_speakers=5,
-            gst=True,
-            gst_embedding_dim=c.gst["gst_embedding_dim"],
-            gst_num_heads=c.gst["gst_num_heads"],
-            gst_style_tokens=c.gst["gst_style_tokens"],
+            use_gst=True,
+            gst=c.gst,
             postnet_output_dim=c.audio["fft_size"],
             decoder_output_dim=c.audio["num_mels"],
             r=c.r,

@@ -240,10 +238,8 @@ class TacotronGSTTrainTest(unittest.TestCase):
         model = Tacotron(
             num_chars=32,
             num_speakers=5,
-            gst=True,
-            gst_embedding_dim=c.gst["gst_embedding_dim"],
-            gst_num_heads=c.gst["gst_num_heads"],
-            gst_style_tokens=c.gst["gst_style_tokens"],
+            use_gst=True,
+            gst=c.gst,
             postnet_output_dim=c.audio["fft_size"],
             decoder_output_dim=c.audio["num_mels"],
             r=c.r,

@@ -306,11 +302,8 @@ class SCGSTMultiSpeakeTacotronTrainTest(unittest.TestCase):
             num_speakers=5,
             postnet_output_dim=c.audio["fft_size"],
             decoder_output_dim=c.audio["num_mels"],
-            gst=True,
-            gst_embedding_dim=c.gst["gst_embedding_dim"],
-            gst_num_heads=c.gst["gst_num_heads"],
-            gst_style_tokens=c.gst["gst_style_tokens"],
-            gst_use_speaker_embedding=c.gst["gst_use_speaker_embedding"],
+            use_gst=True,
+            gst=c.gst,
             r=c.r,
             memory_size=c.memory_size,
             speaker_embedding_dim=55,
@@ -2,7 +2,8 @@ import glob
 import os
 import shutil

-from tests import get_tests_output_path, run_cli
+from tests import get_tests_output_path, run_cli, get_device_id

 from TTS.tts.configs import AlignTTSConfig

 config_path = os.path.join(get_tests_output_path(), "test_model_config.json")

@@ -30,7 +31,7 @@ config.save_json(config_path)

 # train the model for one epoch
 command_train = (
-    f"CUDA_VISIBLE_DEVICES='' python TTS/bin/train_align_tts.py --config_path {config_path} "
+    f"CUDA_VISIBLE_DEVICES='{get_device_id()}' python TTS/bin/train_align_tts.py --config_path {config_path} "
     f"--coqpit.output_path {output_path} "
     "--coqpit.datasets.0.name ljspeech "
     "--coqpit.datasets.0.meta_file_train metadata.csv "

@@ -43,6 +44,6 @@ run_cli(command_train)
 continue_path = max(glob.glob(os.path.join(output_path, "*/")), key=os.path.getmtime)

 # restore the model and continue training for one more epoch
-command_train = f"CUDA_VISIBLE_DEVICES='' python TTS/bin/train_align_tts.py --continue_path {continue_path} "
+command_train = f"CUDA_VISIBLE_DEVICES='{get_device_id()}' python TTS/bin/train_align_tts.py --continue_path {continue_path} "
 run_cli(command_train)
 shutil.rmtree(continue_path)
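Worth calling out in these commands: the --coqpit.* flags are Coqpit's generic CLI override mechanism. They reach into nested config fields, including list elements by index (datasets.0.name), so one saved JSON config can be repointed at a dataset entirely from the command line. The pattern, lifted from the hunk above:

    run_cli(
        f"python TTS/bin/train_align_tts.py --config_path {config_path} "
        f"--coqpit.output_path {output_path} "
        "--coqpit.datasets.0.name ljspeech "
        "--coqpit.datasets.0.meta_file_train metadata.csv "
    )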
@@ -2,7 +2,8 @@ import glob
 import os
 import shutil

-from tests import get_tests_output_path, run_cli
+from tests import get_tests_output_path, run_cli, get_device_id

 from TTS.tts.configs import GlowTTSConfig

 config_path = os.path.join(get_tests_output_path(), "test_model_config.json")

@@ -30,7 +31,7 @@ config.save_json(config_path)

 # train the model for one epoch
 command_train = (
-    f"CUDA_VISIBLE_DEVICES='' python TTS/bin/train_glow_tts.py --config_path {config_path} "
+    f"CUDA_VISIBLE_DEVICES='{get_device_id()}' python TTS/bin/train_glow_tts.py --config_path {config_path} "
     f"--coqpit.output_path {output_path} "
     "--coqpit.datasets.0.name ljspeech "
     "--coqpit.datasets.0.meta_file_train metadata.csv "

@@ -44,6 +45,6 @@ run_cli(command_train)
 continue_path = max(glob.glob(os.path.join(output_path, "*/")), key=os.path.getmtime)

 # restore the model and continue training for one more epoch
-command_train = f"CUDA_VISIBLE_DEVICES='' python TTS/bin/train_glow_tts.py --continue_path {continue_path} "
+command_train = f"CUDA_VISIBLE_DEVICES='{get_device_id()}' python TTS/bin/train_glow_tts.py --continue_path {continue_path} "
 run_cli(command_train)
 shutil.rmtree(continue_path)
@@ -2,7 +2,8 @@ import glob
 import os
 import shutil

-from tests import get_tests_output_path, run_cli
+from tests import get_tests_output_path, run_cli, get_device_id

 from TTS.tts.configs import SpeedySpeechConfig

 config_path = os.path.join(get_tests_output_path(), "test_speedy_speech_config.json")

@@ -30,7 +31,7 @@ config.save_json(config_path)

 # train the model for one epoch
 command_train = (
-    f"CUDA_VISIBLE_DEVICES='' python TTS/bin/train_speedy_speech.py --config_path {config_path} "
+    f"CUDA_VISIBLE_DEVICES='{get_device_id()}' python TTS/bin/train_speedy_speech.py --config_path {config_path} "
     f"--coqpit.output_path {output_path} "
     "--coqpit.datasets.0.name ljspeech "
     "--coqpit.datasets.0.meta_file_train metadata.csv "

@@ -44,6 +45,6 @@ run_cli(command_train)
 continue_path = max(glob.glob(os.path.join(output_path, "*/")), key=os.path.getmtime)

 # restore the model and continue training for one more epoch
-command_train = f"CUDA_VISIBLE_DEVICES='' python TTS/bin/train_speedy_speech.py --continue_path {continue_path} "
+command_train = f"CUDA_VISIBLE_DEVICES='{get_device_id()}' python TTS/bin/train_speedy_speech.py --continue_path {continue_path} "
 run_cli(command_train)
 shutil.rmtree(continue_path)
@@ -2,7 +2,8 @@ import glob
 import os
 import shutil

-from tests import get_tests_output_path, run_cli
+from tests import get_tests_output_path, run_cli, get_device_id

 from TTS.tts.configs import Tacotron2Config

 config_path = os.path.join(get_tests_output_path(), "test_model_config.json")

@@ -31,7 +32,7 @@ config.save_json(config_path)

 # train the model for one epoch
 command_train = (
-    f"CUDA_VISIBLE_DEVICES='' python TTS/bin/train_tacotron.py --config_path {config_path} "
+    f"CUDA_VISIBLE_DEVICES='{get_device_id()}' python TTS/bin/train_tacotron.py --config_path {config_path} "
     f"--coqpit.output_path {output_path} "
     "--coqpit.datasets.0.name ljspeech "
     "--coqpit.datasets.0.meta_file_train metadata.csv "

@@ -44,6 +45,6 @@ run_cli(command_train)
 continue_path = max(glob.glob(os.path.join(output_path, "*/")), key=os.path.getmtime)

 # restore the model and continue training for one more epoch
-command_train = f"CUDA_VISIBLE_DEVICES='' python TTS/bin/train_tacotron.py --continue_path {continue_path} "
+command_train = f"CUDA_VISIBLE_DEVICES='{get_device_id()}' python TTS/bin/train_tacotron.py --continue_path {continue_path} "
 run_cli(command_train)
 shutil.rmtree(continue_path)
@@ -2,7 +2,8 @@ import glob
 import os
 import shutil

-from tests import get_tests_output_path, run_cli
+from tests import get_tests_output_path, run_cli, get_device_id

 from TTS.tts.configs import TacotronConfig

 config_path = os.path.join(get_tests_output_path(), "test_model_config.json")

@@ -30,7 +31,7 @@ config.save_json(config_path)

 # train the model for one epoch
 command_train = (
-    f"CUDA_VISIBLE_DEVICES='' python TTS/bin/train_tacotron.py --config_path {config_path} "
+    f"CUDA_VISIBLE_DEVICES='{get_device_id()}' python TTS/bin/train_tacotron.py --config_path {config_path} "
     f"--coqpit.output_path {output_path} "
     "--coqpit.datasets.0.name ljspeech "
     "--coqpit.datasets.0.meta_file_train metadata.csv "

@@ -43,6 +44,6 @@ run_cli(command_train)
 continue_path = max(glob.glob(os.path.join(output_path, "*/")), key=os.path.getmtime)

 # restore the model and continue training for one more epoch
-command_train = f"CUDA_VISIBLE_DEVICES='' python TTS/bin/train_tacotron.py --continue_path {continue_path} "
+command_train = f"CUDA_VISIBLE_DEVICES='{get_device_id()}' python TTS/bin/train_tacotron.py --continue_path {continue_path} "
 run_cli(command_train)
 shutil.rmtree(continue_path)
@@ -2,7 +2,8 @@ import glob
 import os
 import shutil

-from tests import get_tests_output_path, run_cli
+from tests import get_tests_output_path, run_cli, get_device_id

 from TTS.vocoder.configs import FullbandMelganConfig

 config_path = os.path.join(get_tests_output_path(), "test_vocoder_config.json")

@@ -28,13 +29,13 @@ config.audio.trim_db = 60
 config.save_json(config_path)

 # train the model for one epoch
-command_train = f"CUDA_VISIBLE_DEVICES='' python TTS/bin/train_vocoder_gan.py --config_path {config_path} "
+command_train = f"CUDA_VISIBLE_DEVICES='{get_device_id()}' python TTS/bin/train_vocoder_gan.py --config_path {config_path} "
 run_cli(command_train)

 # Find latest folder
 continue_path = max(glob.glob(os.path.join(output_path, "*/")), key=os.path.getmtime)

 # restore the model and continue training for one more epoch
-command_train = f"CUDA_VISIBLE_DEVICES='' python TTS/bin/train_vocoder_gan.py --continue_path {continue_path} "
+command_train = f"CUDA_VISIBLE_DEVICES='{get_device_id()}' python TTS/bin/train_vocoder_gan.py --continue_path {continue_path} "
 run_cli(command_train)
 shutil.rmtree(continue_path)
@@ -2,7 +2,8 @@ import glob
 import os
 import shutil

-from tests import get_tests_output_path, run_cli
+from tests import get_tests_output_path, run_cli, get_device_id

 from TTS.vocoder.configs import HifiganConfig

 config_path = os.path.join(get_tests_output_path(), "test_vocoder_config.json")

@@ -29,13 +30,13 @@ config.audio.trim_db = 60
 config.save_json(config_path)

 # train the model for one epoch
-command_train = f"CUDA_VISIBLE_DEVICES='' python TTS/bin/train_vocoder_gan.py --config_path {config_path} "
+command_train = f"CUDA_VISIBLE_DEVICES='{get_device_id()}' python TTS/bin/train_vocoder_gan.py --config_path {config_path} "
 run_cli(command_train)

 # Find latest folder
 continue_path = max(glob.glob(os.path.join(output_path, "*/")), key=os.path.getmtime)

 # restore the model and continue training for one more epoch
-command_train = f"CUDA_VISIBLE_DEVICES='' python TTS/bin/train_vocoder_gan.py --continue_path {continue_path} "
+command_train = f"CUDA_VISIBLE_DEVICES='{get_device_id()}' python TTS/bin/train_vocoder_gan.py --continue_path {continue_path} "
 run_cli(command_train)
 shutil.rmtree(continue_path)
@@ -2,9 +2,10 @@ import glob
 import os
 import shutil

-from tests import get_tests_output_path, run_cli
+from tests import get_tests_output_path, run_cli, get_device_id
 from TTS.vocoder.configs import MelganConfig


 config_path = os.path.join(get_tests_output_path(), "test_vocoder_config.json")
 output_path = os.path.join(get_tests_output_path(), "train_outputs")

@@ -28,13 +29,13 @@ config.audio.trim_db = 60
 config.save_json(config_path)

 # train the model for one epoch
-command_train = f"CUDA_VISIBLE_DEVICES='' python TTS/bin/train_vocoder_gan.py --config_path {config_path} "
+command_train = f"CUDA_VISIBLE_DEVICES='{get_device_id()}' python TTS/bin/train_vocoder_gan.py --config_path {config_path} "
 run_cli(command_train)

 # Find latest folder
 continue_path = max(glob.glob(os.path.join(output_path, "*/")), key=os.path.getmtime)

 # restore the model and continue training for one more epoch
-command_train = f"CUDA_VISIBLE_DEVICES='' python TTS/bin/train_vocoder_gan.py --continue_path {continue_path} "
+command_train = f"CUDA_VISIBLE_DEVICES='{get_device_id()}' python TTS/bin/train_vocoder_gan.py --continue_path {continue_path} "
 run_cli(command_train)
 shutil.rmtree(continue_path)
@@ -2,7 +2,8 @@ import glob
 import os
 import shutil

-from tests import get_tests_output_path, run_cli
+from tests import get_tests_output_path, run_cli, get_device_id

 from TTS.vocoder.configs import MultibandMelganConfig

 config_path = os.path.join(get_tests_output_path(), "test_vocoder_config.json")

@@ -28,13 +29,13 @@ config.audio.trim_db = 60
 config.save_json(config_path)

 # train the model for one epoch
-command_train = f"CUDA_VISIBLE_DEVICES='' python TTS/bin/train_vocoder_gan.py --config_path {config_path} "
+command_train = f"CUDA_VISIBLE_DEVICES='{get_device_id()}' python TTS/bin/train_vocoder_gan.py --config_path {config_path} "
 run_cli(command_train)

 # Find latest folder
 continue_path = max(glob.glob(os.path.join(output_path, "*/")), key=os.path.getmtime)

 # restore the model and continue training for one more epoch
-command_train = f"CUDA_VISIBLE_DEVICES='' python TTS/bin/train_vocoder_gan.py --continue_path {continue_path} "
+command_train = f"CUDA_VISIBLE_DEVICES='{get_device_id()}' python TTS/bin/train_vocoder_gan.py --continue_path {continue_path} "
 run_cli(command_train)
 shutil.rmtree(continue_path)
@@ -2,7 +2,8 @@ import glob
 import os
 import shutil

-from tests import get_tests_output_path, run_cli
+from tests import get_tests_output_path, run_cli, get_device_id

 from TTS.vocoder.configs import ParallelWaveganConfig

 config_path = os.path.join(get_tests_output_path(), "test_vocoder_config.json")

@@ -28,13 +29,13 @@ config.audio.trim_db = 60
 config.save_json(config_path)

 # train the model for one epoch
-command_train = f"CUDA_VISIBLE_DEVICES='' python TTS/bin/train_vocoder_gan.py --config_path {config_path} "
+command_train = f"CUDA_VISIBLE_DEVICES='{get_device_id()}' python TTS/bin/train_vocoder_gan.py --config_path {config_path} "
 run_cli(command_train)

 # Find latest folder
 continue_path = max(glob.glob(os.path.join(output_path, "*/")), key=os.path.getmtime)

 # restore the model and continue training for one more epoch
-command_train = f"CUDA_VISIBLE_DEVICES='' python TTS/bin/train_vocoder_gan.py --continue_path {continue_path} "
+command_train = f"CUDA_VISIBLE_DEVICES='{get_device_id()}' python TTS/bin/train_vocoder_gan.py --continue_path {continue_path} "
 run_cli(command_train)
 shutil.rmtree(continue_path)
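All of these train-test scripts share one smoke-test shape: save a tiny config, train one epoch through the CLI, locate the newest run folder, continue training from it, then clean up. Condensed into a single sketch (entry point, helpers, config_path, and output_path as defined in the vocoder hunks above):

    import glob
    import os
    import shutil

    from tests import get_tests_output_path, run_cli, get_device_id

    # 1) train for one epoch
    run_cli(f"CUDA_VISIBLE_DEVICES='{get_device_id()}' python TTS/bin/train_vocoder_gan.py "
            f"--config_path {config_path} ")

    # 2) the most recently modified run folder is the run just created
    continue_path = max(glob.glob(os.path.join(output_path, "*/")), key=os.path.getmtime)

    # 3) restore and continue for one more epoch, then remove the outputs
    run_cli(f"CUDA_VISIBLE_DEVICES='{get_device_id()}' python TTS/bin/train_vocoder_gan.py "
            f"--continue_path {continue_path} ")
    shutil.rmtree(continue_path)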
@@ -3,17 +3,17 @@ import os
 import numpy as np
 from torch.utils.data import DataLoader

-from tests import get_tests_input_path, get_tests_output_path, get_tests_path
+from tests import get_tests_output_path, get_tests_path
 from TTS.utils.audio import AudioProcessor
-from TTS.utils.io import load_config
 from TTS.vocoder.datasets.gan_dataset import GANDataset
 from TTS.vocoder.datasets.preprocess import load_wav_data
+from TTS.vocoder.configs import BaseGANVocoderConfig

 file_path = os.path.dirname(os.path.realpath(__file__))
 OUTPATH = os.path.join(get_tests_output_path(), "loader_tests/")
 os.makedirs(OUTPATH, exist_ok=True)

-C = load_config(os.path.join(get_tests_input_path(), "test_config.json"))
+C = BaseGANVocoderConfig()

 test_data_path = os.path.join(get_tests_path(), "data/ljspeech/")
 ok_ljspeech = os.path.exists(test_data_path)

@@ -46,6 +46,8 @@ def gan_dataset_case(
     def check_item(feat, wav):
         """Pass a single pair of features and waveform"""
+        feat = feat.numpy()
+        wav = wav.numpy()
         expected_feat_shape = (batch_size, ap.num_mels, seq_len // hop_len + conv_pad * 2)

         # check shapes

@@ -61,7 +63,7 @@ def gan_dataset_case(
         # the first 2 and the last 2 frames are skipped due to the padding
         # differences in stft
         max_diff = abs((feat - mel[:, : feat.shape[-1]])[:, 2:-2]).max()
-        assert max_diff <= 0, f" [!] {max_diff}"
+        assert max_diff <= 1e-6, f" [!] {max_diff}"

         # return random segments or return the whole audio
         if return_segments:

@@ -69,7 +71,6 @@ def gan_dataset_case(
         for item1, item2 in loader:
             feat1, wav1 = item1
             feat2, wav2 = item2
-
             check_item(feat1, wav1)
             check_item(feat2, wav2)
             count_iter += 1
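Two correctness fixes ride along in this file: check_item converts the torch tensors to numpy before comparing, and the feature-equality assertion gains a real tolerance; the old max_diff <= 0 only passed on a bit-exact match, which stft padding makes unrealistic. The check in isolation (feat and mel are the aligned mel spectrograms from the test):

    import numpy as np

    # the first/last 2 frames are trimmed: stft padding makes them differ
    max_diff = np.abs((feat - mel[:, : feat.shape[-1]])[:, 2:-2]).max()
    assert max_diff <= 1e-6, f" [!] {max_diff}"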
@@ -14,8 +14,7 @@ os.makedirs(OUT_PATH, exist_ok=True)

 WAV_FILE = os.path.join(get_tests_input_path(), "example_1.wav")

-C = load_config(os.path.join(get_tests_input_path(), "test_config.json"))
-ap = AudioProcessor(**C.audio)
+ap = AudioProcessor(**BaseAudioConfig().to_dict())


 def test_torch_stft():
@@ -14,7 +14,7 @@ file_path = os.path.dirname(os.path.realpath(__file__))
 OUTPATH = os.path.join(get_tests_output_path(), "loader_tests/")
 os.makedirs(OUTPATH, exist_ok=True)

-C = load_config(os.path.join(get_tests_input_path(), "test_vocoder_wavernn_config.json"))
+C = WavernnConfig()

 test_data_path = os.path.join(get_tests_path(), "data/ljspeech/")
 test_mel_feat_path = os.path.join(test_data_path, "mel")
@@ -2,39 +2,44 @@ import glob
 import os
 import shutil

-from tests import get_tests_output_path, run_cli
+from tests import get_tests_output_path, run_cli, get_device_id

 from TTS.vocoder.configs import WavegradConfig

 config_path = os.path.join(get_tests_output_path(), "test_vocoder_config.json")
 output_path = os.path.join(get_tests_output_path(), "train_outputs")

-config = WavegradConfig(
-    batch_size=8,
-    eval_batch_size=8,
-    num_loader_workers=0,
-    num_val_loader_workers=0,
-    run_eval=True,
-    test_delay_epochs=-1,
-    epochs=1,
-    seq_len=8192,
-    eval_split_size=1,
-    print_step=1,
-    print_eval=True,
-    data_path="tests/data/ljspeech",
-    output_path=output_path,
-)
+config = WavegradConfig(batch_size=8,
+                        eval_batch_size=8,
+                        num_loader_workers=0,
+                        num_val_loader_workers=0,
+                        run_eval=True,
+                        test_delay_epochs=-1,
+                        epochs=1,
+                        seq_len=8192,
+                        eval_split_size=1,
+                        print_step=1,
+                        print_eval=True,
+                        data_path="tests/data/ljspeech",
+                        output_path=output_path,
+                        test_noise_schedule={
+                            "min_val": 1e-6,
+                            "max_val": 1e-2,
+                            "num_steps": 2
+                        })
 config.audio.do_trim_silence = True
 config.audio.trim_db = 60
 config.save_json(config_path)

 # train the model for one epoch
-command_train = f"CUDA_VISIBLE_DEVICES='' python TTS/bin/train_vocoder_wavegrad.py --config_path {config_path} "
+command_train = f"CUDA_VISIBLE_DEVICES='{get_device_id()}' python TTS/bin/train_vocoder_wavegrad.py --config_path {config_path} "
 run_cli(command_train)

 # Find latest folder
-continue_path = max(glob.glob(os.path.join(output_path, "*/")), key=os.path.getmtime)
+continue_path = max(glob.glob(os.path.join(output_path, "*/")),
+                    key=os.path.getmtime)

 # restore the model and continue training for one more epoch
-command_train = f"CUDA_VISIBLE_DEVICES='' python TTS/bin/train_vocoder_wavegrad.py --continue_path {continue_path} "
+command_train = f"CUDA_VISIBLE_DEVICES='{get_device_id()}' python TTS/bin/train_vocoder_wavegrad.py --continue_path {continue_path} "
 run_cli(command_train)
 shutil.rmtree(continue_path)
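The functional change hiding in that reformat is test_noise_schedule: cutting num_steps to 2 makes the post-epoch WaveGrad sampling pass nearly free, which is all a CI smoke test needs (real inference schedules run far more steps). The override in isolation:

    from TTS.vocoder.configs import WavegradConfig

    config = WavegradConfig(
        epochs=1,
        # a 2-step schedule keeps eval-time sampling fast in CI
        test_noise_schedule={"min_val": 1e-6, "max_val": 1e-2, "num_steps": 2},
    )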
@@ -2,7 +2,8 @@ import glob
 import os
 import shutil

-from tests import get_tests_output_path, run_cli
+from tests import get_tests_output_path, run_cli, get_device_id

 from TTS.vocoder.configs import WavernnConfig

 config_path = os.path.join(get_tests_output_path(), "test_vocoder_config.json")

@@ -16,7 +17,7 @@ config = WavernnConfig(
     run_eval=True,
     test_delay_epochs=-1,
     epochs=1,
-    seq_len=8192,
+    seq_len=256,  # for shorter test time
     eval_split_size=1,
     print_step=1,
     print_eval=True,

@@ -28,13 +29,14 @@ config.audio.trim_db = 60
 config.save_json(config_path)

 # train the model for one epoch
-command_train = f"CUDA_VISIBLE_DEVICES='' python TTS/bin/train_vocoder_wavernn.py --config_path {config_path} "
+command_train = f"CUDA_VISIBLE_DEVICES='{get_device_id()}' python TTS/bin/train_vocoder_wavernn.py --config_path {config_path} "
 run_cli(command_train)

 # Find latest folder
-continue_path = max(glob.glob(os.path.join(output_path, "*/")), key=os.path.getmtime)
+continue_path = max(glob.glob(os.path.join(output_path, "*/")),
+                    key=os.path.getmtime)

 # restore the model and continue training for one more epoch
-command_train = f"CUDA_VISIBLE_DEVICES='' python TTS/bin/train_vocoder_wavernn.py --continue_path {continue_path} "
+command_train = f"CUDA_VISIBLE_DEVICES='{get_device_id()}' python TTS/bin/train_vocoder_wavernn.py --continue_path {continue_path} "
 run_cli(command_train)
 shutil.rmtree(continue_path)