Add style_wav_for_test config option to style GST inference tests

2019-07-24 12:17:08 +02:00 · 2019-07-24 12:17:08 +02:00 · 4a23354d3c
parent b1657d70b1
commit 4a23354d3c
2 changed files with 5 additions and 2 deletions
--- a/config_tacotron_gst.json
+++ b/config_tacotron_gst.json
@ -77,6 +77,7 @@
        "use_phonemes": true,           // use phonemes instead of raw characters. It is suggested for better pronunciation.
        "phoneme_language": "en-us",     // depending on your target language, pick one from  https://github.com/bootphon/phonemizer#languages
        "text_cleaner": "phoneme_cleaners",
-        "use_speaker_embedding": false // whether to use additional embeddings for separate speakers
+        "use_speaker_embedding": false, // whether to use additional embeddings for separate speakers
+        "style_wav_for_test": null // path to wav for styling the inference tests when using GST
    }
    
--- a/train.py
+++ b/train.py
@ -409,11 +409,13 @@ def evaluate(model, criterion, criterion_st, ap, current_step, epoch):
        test_figures = {}
        print(" | > Synthesizing test sentences")
        speaker_id = 0 if c.use_speaker_embedding else None
+        style_wav = c.get("style_wav_for_test")
        for idx, test_sentence in enumerate(test_sentences):
            try:
                wav, alignment, decoder_output, postnet_output, stop_tokens = synthesis(
                    model, test_sentence, c, use_cuda, ap,
-                    speaker_id=speaker_id)
+                    speaker_id=speaker_id,
+                    style_wav=style_wav)
                file_path = os.path.join(AUDIO_PATH, str(current_step))
                os.makedirs(file_path, exist_ok=True)
                file_path = os.path.join(file_path,