Remove minor bugs and make code trainable

2021-03-04 00:24:32 +05:30 · 2021-03-04 00:24:32 +05:30 · b533474e3b
parent ef6ff4e95c
commit b533474e3b
2 changed files with 10 additions and 10 deletions
--- a/TTS/vocoder/configs/modified_hifigan.json
+++ b/TTS/vocoder/configs/modified_hifigan.json
@@ -31,7 +31,7 @@
        "symmetric_norm": true, // move normalization to range [-1, 1]
        "max_norm": 4.0,        // scale normalization to range [-max_norm, max_norm] or [0, max_norm]
        "clip_norm": true,      // clip normalized values into the range.
-        "stats_path": "/home/erogol/Data/libritts/LibriTTS/scale_stats.npy"    // DO NOT USE WITH MULTI_SPEAKER MODEL. scaler stats file computed by 'compute_statistics.py'. If it is defined, mean-std based normalization is used and other normalization params are ignored.
+        "stats_path": null    // DO NOT USE WITH MULTI_SPEAKER MODEL. scaler stats file computed by 'compute_statistics.py'. If it is defined, mean-std based normalization is used and other normalization params are ignored.
    },

    // DISTRIBUTED TRAINING
@@ -44,11 +44,11 @@
    "use_pqmf": false,

    // LOSS PARAMETERS
-    "use_stft_loss": false,
+    "use_stft_loss": true,
    "use_subband_stft_loss": false,
    "use_mse_gan_loss": true,
    "use_hinge_gan_loss": false,
-    "use_feat_match_loss": false,  // use only with melgan discriminators
+    "use_feat_match_loss": true,  // use only with melgan discriminators

    // loss weights
    "stft_loss_weight": 0.5,
@@ -67,14 +67,14 @@
    "target_loss": "avg_G_loss",  // loss value to pick the best model to save after each epoch

    // DISCRIMINATOR
-    "discriminator_model": "hifigan_mpd_discriminator",
+    "discriminator_model": "multi_period_discriminator",
    "discriminator_model_params":{
        "peroids": [2, 3, 5, 7, 11],
        "base_channels": 16,
        "max_channels":512,
        "downsample_factors":[4, 4, 4]
    },
-    "steps_to_start_discriminator": 1,      // steps required to start GAN training.
+    "steps_to_start_discriminator": 0,      // steps required to start GAN training.

    // GENERATOR
    "generator_model": "hifigan_generator",
@@ -87,7 +87,7 @@
    },

    // DATASET
-    "data_path": "/home/erogol/Data/libritts/LibriTTS/train-clean-360/",
+    "data_path": "/workspace/LJSpeech-1.1/",
    "feature_path": null,
    "seq_len": 16384,
    "pad_short": 2000,
@@ -98,7 +98,7 @@
    "reinit_layers": [],    // give a list of layer names to restore from the given checkpoint. If not defined, it reloads all heuristically matching layers.

    // TRAINING
-    "batch_size": 48,       // Batch size for training. Values lower than 32 might make attention hard to learn. It is overwritten by 'gradual_training'.
+    "batch_size": 16,       // Batch size for training. Values lower than 32 might make attention hard to learn. It is overwritten by 'gradual_training'.

    // VALIDATION
    "run_eval": true,
@@ -136,7 +136,7 @@
    "eval_split_size": 10,

    // PATHS
-    "output_path": "/home/erogol/Models/"
+    "output_path": "/workspace/Models/"
 }


--- a/TTS/vocoder/models/hifigan_generator.py
+++ b/TTS/vocoder/models/hifigan_generator.py
@@ -3,11 +3,11 @@ from torch import nn
 from TTS.vocoder.layers.hifigan import MRF


-class Generator(nn.Module):
+class HifiganGenerator(nn.Module):

    def __init__(self, in_channels=80, out_channels=1, base_channels=512, upsample_kernel=[16, 16, 4, 4],
                 resblock_kernel_sizes=[3, 7, 11], resblock_dilation_sizes=[1, 3, 5]):
-        super(Generator, self).__init__()
+        super(HifiganGenerator, self).__init__()

        self.inference_padding = 2