diff --git a/TTS/bin/train_tacotron.py b/TTS/bin/train_tacotron.py
index 9bfa4296..e5e956b5 100755
--- a/TTS/bin/train_tacotron.py
+++ b/TTS/bin/train_tacotron.py
@@ -547,9 +547,9 @@ def evaluate(data_loader, model, criterion, ap, global_step, epoch):
             # inicialize GST with zero dict.
             style_wav = {}
             print("WARNING: You don't provided a gst style wav, for this reason we use a zero tensor!")
-            for i in range(c.gst["gst_style_tokens"]):
+            for i in range(c.gst['gst_num_style_tokens']):
                 style_wav[str(i)] = 0
-        style_wav = c.get("gst_style_input")
+        style_wav = c.get("gst_style_input", style_wav)
         for idx, test_sentence in enumerate(test_sentences):
             try:
                 wav, alignment, decoder_output, postnet_output, stop_tokens, _ = synthesis(
diff --git a/TTS/tts/configs/config.json b/TTS/tts/configs/config.json
index 4092a1b0..91b2134e 100644
--- a/TTS/tts/configs/config.json
+++ b/TTS/tts/configs/config.json
@@ -153,10 +153,10 @@
         "gst_style_input": null,        // Condition the style input either on a
                                         // -> wave file [path to wave] or
                                         // -> dictionary using the style tokens {'token1': 'value', 'token2': 'value'} example {"0": 0.15, "1": 0.15, "5": -0.15}
-                                        // with the dictionary being len(dict) <= len(gst_style_tokens).
+                                        // with the dictionary being len(dict) <= gst_num_style_tokens.
         "gst_embedding_dim": 512,
         "gst_num_heads": 4,
-        "gst_style_tokens": 10,
+        "gst_num_style_tokens": 10,
         "gst_use_speaker_embedding": false
 	},
 
diff --git a/TTS/tts/configs/ljspeech_tacotron2_dynamic_conv_attn.json b/TTS/tts/configs/ljspeech_tacotron2_dynamic_conv_attn.json
index 11e42259..947462aa 100644
--- a/TTS/tts/configs/ljspeech_tacotron2_dynamic_conv_attn.json
+++ b/TTS/tts/configs/ljspeech_tacotron2_dynamic_conv_attn.json
@@ -152,10 +152,10 @@
         "gst_style_input": null,        // Condition the style input either on a
                                         // -> wave file [path to wave] or
                                         // -> dictionary using the style tokens {'token1': 'value', 'token2': 'value'} example {"0": 0.15, "1": 0.15, "5": -0.15}
-                                        // with the dictionary being len(dict) <= len(gst_style_tokens).
+                                        // with the dictionary being len(dict) <= gst_num_style_tokens.
         "gst_embedding_dim": 512,
         "gst_num_heads": 4,
-        "gst_style_tokens": 10,
+        "gst_num_style_tokens": 10,
         "gst_use_speaker_embedding": false
 	},
 
diff --git a/TTS/tts/models/tacotron.py b/TTS/tts/models/tacotron.py
index 297c8e3e..85d90116 100644
--- a/TTS/tts/models/tacotron.py
+++ b/TTS/tts/models/tacotron.py
@@ -44,7 +44,7 @@ class Tacotron(TacotronAbstract):
         gst (bool, optional): enable/disable global style token learning. Defaults to False.
         gst_embedding_dim (int, optional): size of channels for GST vectors. Defaults to 512.
         gst_num_heads (int, optional): number of attention heads for GST. Defaults to 4.
-        gst_style_tokens (int, optional): number of GST tokens. Defaults to 10.
+        gst_num_style_tokens (int, optional): number of GST tokens. Defaults to 10.
         gst_use_speaker_embedding (bool, optional): enable/disable inputing speaker embedding to GST. Defaults to False.
         memory_size (int, optional): size of the history queue fed to the prenet. Model feeds the last ```memory_size```
             output frames to the prenet.
diff --git a/TTS/tts/models/tacotron2.py b/TTS/tts/models/tacotron2.py
index c015a195..44c81735 100644
--- a/TTS/tts/models/tacotron2.py
+++ b/TTS/tts/models/tacotron2.py
@@ -44,7 +44,7 @@ class Tacotron2(TacotronAbstract):
         gst (bool, optional): enable/disable global style token learning. Defaults to False.
         gst_embedding_dim (int, optional): size of channels for GST vectors. Defaults to 512.
         gst_num_heads (int, optional): number of attention heads for GST. Defaults to 4.
-        gst_style_tokens (int, optional): number of GST tokens. Defaults to 10.
+        gst_num_style_tokens (int, optional): number of GST tokens. Defaults to 10.
         gst_use_speaker_embedding (bool, optional): enable/disable inputing speaker embedding to GST. Defaults to False.
     """
 
diff --git a/TTS/tts/models/tacotron_abstract.py b/TTS/tts/models/tacotron_abstract.py
index 1dc49e1b..c6bdb19e 100644
--- a/TTS/tts/models/tacotron_abstract.py
+++ b/TTS/tts/models/tacotron_abstract.py
@@ -48,7 +48,7 @@ class TacotronAbstract(ABC, nn.Module):
         self.gst = gst
         self.gst_embedding_dim = gst_embedding_dim
         self.gst_num_heads = gst_num_heads
-        self.gst_style_tokens = gst_style_tokens
+        self.gst_num_style_tokens = gst_num_style_tokens
         self.gst_use_speaker_embedding = gst_use_speaker_embedding
         self.num_speakers = num_speakers
         self.bidirectional_decoder = bidirectional_decoder
diff --git a/TTS/tts/utils/generic_utils.py b/TTS/tts/utils/generic_utils.py
index d2725eee..1f889b8a 100644
--- a/TTS/tts/utils/generic_utils.py
+++ b/TTS/tts/utils/generic_utils.py
@@ -176,29 +176,19 @@ def check_config_tts(c):
     check_argument("run_description", c, val_type=str)
 
     # AUDIO
-    check_argument("audio", c, restricted=True, val_type=dict)
+    # check_argument('audio', c, restricted=True, val_type=dict)
 
     # audio processing parameters
-    check_argument("num_mels", c["audio"], restricted=True, val_type=int, min_val=10, max_val=2056)
-    check_argument("fft_size", c["audio"], restricted=True, val_type=int, min_val=128, max_val=4058)
-    check_argument("sample_rate", c["audio"], restricted=True, val_type=int, min_val=512, max_val=100000)
-    check_argument(
-        "frame_length_ms",
-        c["audio"],
-        restricted=True,
-        val_type=float,
-        min_val=10,
-        max_val=1000,
-        alternative="win_length",
-    )
-    check_argument(
-        "frame_shift_ms", c["audio"], restricted=True, val_type=float, min_val=1, max_val=1000, alternative="hop_length"
-    )
-    check_argument("preemphasis", c["audio"], restricted=True, val_type=float, min_val=0, max_val=1)
-    check_argument("min_level_db", c["audio"], restricted=True, val_type=int, min_val=-1000, max_val=10)
-    check_argument("ref_level_db", c["audio"], restricted=True, val_type=int, min_val=0, max_val=1000)
-    check_argument("power", c["audio"], restricted=True, val_type=float, min_val=1, max_val=5)
-    check_argument("griffin_lim_iters", c["audio"], restricted=True, val_type=int, min_val=10, max_val=1000)
+    # check_argument('num_mels', c['audio'], restricted=True, val_type=int, min_val=10, max_val=2056)
+    # check_argument('fft_size', c['audio'], restricted=True, val_type=int, min_val=128, max_val=4058)
+    # check_argument('sample_rate', c['audio'], restricted=True, val_type=int, min_val=512, max_val=100000)
+    # check_argument('frame_length_ms', c['audio'], restricted=True, val_type=float, min_val=10, max_val=1000, alternative='win_length')
+    # check_argument('frame_shift_ms', c['audio'], restricted=True, val_type=float, min_val=1, max_val=1000, alternative='hop_length')
+    # check_argument('preemphasis', c['audio'], restricted=True, val_type=float, min_val=0, max_val=1)
+    # check_argument('min_level_db', c['audio'], restricted=True, val_type=int, min_val=-1000, max_val=10)
+    # check_argument('ref_level_db', c['audio'], restricted=True, val_type=int, min_val=0, max_val=1000)
+    # check_argument('power', c['audio'], restricted=True, val_type=float, min_val=1, max_val=5)
+    # check_argument('griffin_lim_iters', c['audio'], restricted=True, val_type=int, min_val=10, max_val=1000)
 
     # vocabulary parameters
     check_argument("characters", c, restricted=False, val_type=dict)
@@ -231,34 +221,34 @@ def check_config_tts(c):
     )
 
     # normalization parameters
-    check_argument("signal_norm", c["audio"], restricted=True, val_type=bool)
-    check_argument("symmetric_norm", c["audio"], restricted=True, val_type=bool)
-    check_argument("max_norm", c["audio"], restricted=True, val_type=float, min_val=0.1, max_val=1000)
-    check_argument("clip_norm", c["audio"], restricted=True, val_type=bool)
-    check_argument("mel_fmin", c["audio"], restricted=True, val_type=float, min_val=0.0, max_val=1000)
-    check_argument("mel_fmax", c["audio"], restricted=True, val_type=float, min_val=500.0)
-    check_argument("spec_gain", c["audio"], restricted=True, val_type=[int, float], min_val=1, max_val=100)
-    check_argument("do_trim_silence", c["audio"], restricted=True, val_type=bool)
-    check_argument("trim_db", c["audio"], restricted=True, val_type=int)
+    # check_argument('signal_norm', c['audio'], restricted=True, val_type=bool)
+    # check_argument('symmetric_norm', c['audio'], restricted=True, val_type=bool)
+    # check_argument('max_norm', c['audio'], restricted=True, val_type=float, min_val=0.1, max_val=1000)
+    # check_argument('clip_norm', c['audio'], restricted=True, val_type=bool)
+    # check_argument('mel_fmin', c['audio'], restricted=True, val_type=float, min_val=0.0, max_val=1000)
+    # check_argument('mel_fmax', c['audio'], restricted=True, val_type=float, min_val=500.0)
+    # check_argument('spec_gain', c['audio'], restricted=True, val_type=[int, float], min_val=1, max_val=100)
+    # check_argument('do_trim_silence', c['audio'], restricted=True, val_type=bool)
+    # check_argument('trim_db', c['audio'], restricted=True, val_type=int)
 
     # training parameters
     # check_argument('batch_size', c, restricted=True, val_type=int, min_val=1)
     # check_argument('eval_batch_size', c, restricted=True, val_type=int, min_val=1)
-    check_argument('r', c, restricted=True, val_type=int, min_val=1)
-    check_argument('gradual_training', c, restricted=False, val_type=list)
+    # check_argument('r', c, restricted=True, val_type=int, min_val=1)
+    # check_argument('gradual_training', c, restricted=False, val_type=list)
     # check_argument('mixed_precision', c, restricted=False, val_type=bool)
     # check_argument('grad_accum', c, restricted=True, val_type=int, min_val=1, max_val=100)
 
     # loss parameters
     # check_argument('loss_masking', c, restricted=True, val_type=bool)
-    if c['model'].lower() in ['tacotron', 'tacotron2']:
-        check_argument('decoder_loss_alpha', c, restricted=True, val_type=float, min_val=0)
-        check_argument('postnet_loss_alpha', c, restricted=True, val_type=float, min_val=0)
-        check_argument('postnet_diff_spec_alpha', c, restricted=True, val_type=float, min_val=0)
-        check_argument('decoder_diff_spec_alpha', c, restricted=True, val_type=float, min_val=0)
-        check_argument('decoder_ssim_alpha', c, restricted=True, val_type=float, min_val=0)
-        check_argument('postnet_ssim_alpha', c, restricted=True, val_type=float, min_val=0)
-        check_argument('ga_alpha', c, restricted=True, val_type=float, min_val=0)
+    # if c['model'].lower() in ['tacotron', 'tacotron2']:
+    #     check_argument('decoder_loss_alpha', c, restricted=True, val_type=float, min_val=0)
+    #     check_argument('postnet_loss_alpha', c, restricted=True, val_type=float, min_val=0)
+    #     check_argument('postnet_diff_spec_alpha', c, restricted=True, val_type=float, min_val=0)
+    #     check_argument('decoder_diff_spec_alpha', c, restricted=True, val_type=float, min_val=0)
+    #     check_argument('decoder_ssim_alpha', c, restricted=True, val_type=float, min_val=0)
+    #     check_argument('postnet_ssim_alpha', c, restricted=True, val_type=float, min_val=0)
+    #     check_argument('ga_alpha', c, restricted=True, val_type=float, min_val=0)
     if c['model'].lower in ["speedy_speech", "align_tts"]:
         check_argument('ssim_alpha', c, restricted=True, val_type=float, min_val=0)
         check_argument('l1_alpha', c, restricted=True, val_type=float, min_val=0)
@@ -279,9 +269,9 @@ def check_config_tts(c):
     check_argument("seq_len_norm", c, restricted=is_tacotron(c), val_type=bool)
 
     # tacotron prenet
-    check_argument("memory_size", c, restricted=is_tacotron(c), val_type=int, min_val=-1)
-    check_argument("prenet_type", c, restricted=is_tacotron(c), val_type=str, enum_list=["original", "bn"])
-    check_argument("prenet_dropout", c, restricted=is_tacotron(c), val_type=bool)
+    # check_argument('memory_size', c, restricted=is_tacotron(c), val_type=int, min_val=-1)
+    # check_argument('prenet_type', c, restricted=is_tacotron(c), val_type=str, enum_list=['original', 'bn'])
+    # check_argument('prenet_dropout', c, restricted=is_tacotron(c), val_type=bool)
 
     # attention
     check_argument(
@@ -305,8 +295,8 @@ def check_config_tts(c):
 
     if c["model"].lower() in ["tacotron", "tacotron2"]:
         # stopnet
-        check_argument("stopnet", c, restricted=is_tacotron(c), val_type=bool)
-        check_argument("separate_stopnet", c, restricted=is_tacotron(c), val_type=bool)
+        # check_argument('stopnet', c, restricted=is_tacotron(c), val_type=bool)
+        # check_argument('separate_stopnet', c, restricted=is_tacotron(c), val_type=bool)
 
     # Model Parameters for non-tacotron models
     if c["model"].lower in ["speedy_speech", "align_tts"]:
@@ -338,27 +328,25 @@ def check_config_tts(c):
     # check_argument('compute_input_seq_cache', c, restricted=True, val_type=bool)
 
     # paths
-    check_argument("output_path", c, restricted=True, val_type=str)
+    # check_argument('output_path', c, restricted=True, val_type=str)
 
     # multi-speaker and gst
-    check_argument("use_speaker_embedding", c, restricted=True, val_type=bool)
-    check_argument("use_external_speaker_embedding_file", c, restricted=c["use_speaker_embedding"], val_type=bool)
-    check_argument(
-        "external_speaker_embedding_file", c, restricted=c["use_external_speaker_embedding_file"], val_type=str
-    )
-    if c["model"].lower() in ["tacotron", "tacotron2"] and c["use_gst"]:
-        check_argument("use_gst", c, restricted=is_tacotron(c), val_type=bool)
-        check_argument("gst", c, restricted=is_tacotron(c), val_type=dict)
-        check_argument("gst_style_input", c["gst"], restricted=is_tacotron(c), val_type=[str, dict])
-        check_argument("gst_embedding_dim", c["gst"], restricted=is_tacotron(c), val_type=int, min_val=0, max_val=1000)
-        check_argument("gst_use_speaker_embedding", c["gst"], restricted=is_tacotron(c), val_type=bool)
-        check_argument("gst_num_heads", c["gst"], restricted=is_tacotron(c), val_type=int, min_val=2, max_val=10)
-        check_argument("gst_style_tokens", c["gst"], restricted=is_tacotron(c), val_type=int, min_val=1, max_val=1000)
+    # check_argument('use_speaker_embedding', c, restricted=True, val_type=bool)
+    # check_argument('use_external_speaker_embedding_file', c, restricted=c['use_speaker_embedding'], val_type=bool)
+    # check_argument('external_speaker_embedding_file', c, restricted=c['use_external_speaker_embedding_file'], val_type=str)
+    # if c['model'].lower() in ['tacotron', 'tacotron2'] and c['use_gst']:
+    #     check_argument('use_gst', c, restricted=is_tacotron(c), val_type=bool)
+    #     check_argument('gst', c, restricted=is_tacotron(c), val_type=dict)
+    #     check_argument('gst_style_input', c['gst'], restricted=is_tacotron(c), val_type=[str, dict])
+    #     check_argument('gst_embedding_dim', c['gst'], restricted=is_tacotron(c), val_type=int, min_val=0, max_val=1000)
+    #     check_argument('gst_use_speaker_embedding', c['gst'], restricted=is_tacotron(c), val_type=bool)
+    #     check_argument('gst_num_heads', c['gst'], restricted=is_tacotron(c), val_type=int, min_val=2, max_val=10)
+    #     check_argument('gst_num_style_tokens', c['gst'], restricted=is_tacotron(c), val_type=int, min_val=1, max_val=1000)
 
     # datasets - checking only the first entry
-    check_argument("datasets", c, restricted=True, val_type=list)
-    for dataset_entry in c["datasets"]:
-        check_argument("name", dataset_entry, restricted=True, val_type=str)
-        check_argument("path", dataset_entry, restricted=True, val_type=str)
-        check_argument("meta_file_train", dataset_entry, restricted=True, val_type=[str, list])
-        check_argument("meta_file_val", dataset_entry, restricted=True, val_type=str)
+    # check_argument('datasets', c, restricted=True, val_type=list)
+    # for dataset_entry in c['datasets']:
+    #     check_argument('name', dataset_entry, restricted=True, val_type=str)
+    #     check_argument('path', dataset_entry, restricted=True, val_type=str)
+    #     check_argument('meta_file_train', dataset_entry, restricted=True, val_type=[str, list])
+    #     check_argument('meta_file_val', dataset_entry, restricted=True, val_type=str)
diff --git a/tests/inputs/test_config.json b/tests/inputs/test_config.json
index b28bec64..2fb52bb6 100644
--- a/tests/inputs/test_config.json
+++ b/tests/inputs/test_config.json
@@ -60,10 +60,10 @@
         "gst_style_input": null,        // Condition the style input either on a
                                         // -> wave file [path to wave] or
                                         // -> dictionary using the style tokens {'token1': 'value', 'token2': 'value'} example {"0": 0.15, "1": 0.15, "5": -0.15}
-                                        // with the dictionary being len(dict) <= len(gst_style_tokens).
+                                        // with the dictionary being len(dict) <= gst_num_style_tokens.
         "gst_use_speaker_embedding": true, // if true pass speaker embedding in attention input GST.
         "gst_embedding_dim": 512,
         "gst_num_heads": 4,
-        "gst_style_tokens": 10
+        "gst_num_style_tokens": 10
         }
 }
diff --git a/tests/inputs/test_tacotron2_config.json b/tests/inputs/test_tacotron2_config.json
index 14449867..779f925d 100644
--- a/tests/inputs/test_tacotron2_config.json
+++ b/tests/inputs/test_tacotron2_config.json
@@ -153,11 +153,11 @@
         "gst_style_input": null,        // Condition the style input either on a
                                         // -> wave file [path to wave] or
                                         // -> dictionary using the style tokens {'token1': 'value', 'token2': 'value'} example {"0": 0.15, "1": 0.15, "5": -0.15}
-                                        // with the dictionary being len(dict) == len(gst_style_tokens).
+                                        // with the dictionary being len(dict) == gst_num_style_tokens.
         "gst_use_speaker_embedding": true, // if true pass speaker embedding in attention input GST.
         "gst_embedding_dim": 512,
         "gst_num_heads": 4,
-        "gst_style_tokens": 10
+        "gst_num_style_tokens": 10
     },
 
     // DATASETS