#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""Compute per-bin mean and standard deviation of spectrogram features."""

import os
import argparse

import numpy as np
from tqdm import tqdm

from TTS.datasets.preprocess import load_meta_data
from TTS.utils.generic_utils import load_config
from TTS.utils.audio import AudioProcessor

# Audio-config keys removed as redundant once mean-var scaling is configured.
_REDUNDANT_AUDIO_KEYS = ('max_norm', 'min_level_db', 'symmetric_norm', 'clip_norm')


def _mean_and_std(total, square_total, num_frames, dtype):
    """Return ``(mean, std)`` per bin from running sums over ``num_frames`` frames.

    Args:
        total: float64 array holding the per-bin sum of the feature values.
        square_total: float64 array holding the per-bin sum of squared values.
        num_frames: number of frames the sums were accumulated over (> 0).
        dtype: dtype of the source features; the results are cast back to it
            when it is a floating type, otherwise they stay float64.
    """
    mean = total / num_frames
    # E[x^2] - E[x]^2 can dip just below zero through round-off; clamp it so
    # that the square root never produces NaN.
    variance = np.maximum(square_total / num_frames - mean ** 2, 0.0)
    if not np.issubdtype(dtype, np.floating):
        dtype = np.float64
    return mean.astype(dtype), np.sqrt(variance).astype(dtype)


def main():
    """Compute feature statistics for a dataset and save them to disk.

    Loads the TTS config given by ``--config_path``, computes linear and mel
    spectrograms for every training item with signal normalization disabled,
    and writes the per-bin mean and standard deviation, together with the
    audio config adjusted for mean-var scaling, to
    ``<out_path>/scale_stats.npy``.

    Raises:
        RuntimeError: if the dataset yields no spectrogram frames.
    """
    parser = argparse.ArgumentParser(
        description="Compute mean and variance of spectrogram features.")
    parser.add_argument("--config_path", type=str, required=True,
                        help="TTS config file path.")
    # Required: the previous default of None only failed inside os.path.join,
    # after the whole dataset had already been processed.
    parser.add_argument("--out_path", type=str, required=True,
                        help="directory to save the output file.")
    args = parser.parse_args()

    # Create the output directory up front so a bad path fails early.
    os.makedirs(args.out_path, exist_ok=True)
    output_file_path = os.path.join(args.out_path, "scale_stats.npy")

    # load config
    CONFIG = load_config(args.config_path)
    CONFIG.audio['signal_norm'] = False  # do not apply earlier normalization
    CONFIG.audio['stats_path'] = None  # discard pre-defined stats

    # load audio processor
    ap = AudioProcessor(**CONFIG.audio)

    # load the meta data of target dataset
    dataset_items = load_meta_data(CONFIG.datasets)[0]  # take only train data
    print(f" > There are {len(dataset_items)} files.")

    mel_sum = 0.0
    mel_square_sum = 0.0
    linear_sum = 0.0
    linear_square_sum = 0.0
    num_frames = 0
    mel_dtype = None
    linear_dtype = None
    for item in tqdm(dataset_items):
        # compute features (assumed to be 2-D numpy arrays with frames on axis 1)
        wav = ap.load_wav(item[1])
        linear = ap.spectrogram(wav)
        mel = ap.melspectrogram(wav)
        mel_dtype = mel.dtype
        linear_dtype = linear.dtype

        # Accumulate in float64: summing many frames in the features' own
        # (possibly single) precision loses digits that the variance needs.
        num_frames += mel.shape[1]
        mel_sum += mel.sum(axis=1, dtype=np.float64)
        linear_sum += linear.sum(axis=1, dtype=np.float64)
        mel_square_sum += (mel ** 2).sum(axis=1, dtype=np.float64)
        linear_square_sum += (linear ** 2).sum(axis=1, dtype=np.float64)

    if num_frames == 0:
        raise RuntimeError(
            " [!] No spectrogram frames were found; cannot compute statistics.")

    mel_mean, mel_scale = _mean_and_std(
        mel_sum, mel_square_sum, num_frames, mel_dtype)
    linear_mean, linear_scale = _mean_and_std(
        linear_sum, linear_square_sum, num_frames, linear_dtype)

    stats = {
        'mel_mean': mel_mean,
        'mel_std': mel_scale,
        'linear_mean': linear_mean,
        'linear_std': linear_scale,
    }

    # set default config values for mean-var scaling
    CONFIG.audio['stats_path'] = output_file_path
    CONFIG.audio['signal_norm'] = True
    # remove redundant values; tolerate configs that never defined them
    for key in _REDUNDANT_AUDIO_KEYS:
        CONFIG.audio.pop(key, None)
    stats['audio_config'] = CONFIG.audio
    np.save(output_file_path, stats, allow_pickle=True)


if __name__ == "__main__":
    main()