mirror of https://github.com/coqui-ai/TTS.git
fix linter
parent
3e9ca4b95d
commit
215a74b32e
|
@ -3,7 +3,6 @@ import argparse
|
||||||
import multiprocessing
|
import multiprocessing
|
||||||
from argparse import RawTextHelpFormatter
|
from argparse import RawTextHelpFormatter
|
||||||
|
|
||||||
import numpy
|
|
||||||
from tqdm.contrib.concurrent import process_map
|
from tqdm.contrib.concurrent import process_map
|
||||||
|
|
||||||
from TTS.config import load_config
|
from TTS.config import load_config
|
||||||
|
|
|
@ -6,12 +6,7 @@ import glob
|
||||||
import multiprocessing
|
import multiprocessing
|
||||||
import os
|
import os
|
||||||
import pathlib
|
import pathlib
|
||||||
import sys
|
|
||||||
import wave
|
import wave
|
||||||
from itertools import chain
|
|
||||||
|
|
||||||
import numpy as np
|
|
||||||
import tqdm
|
|
||||||
import webrtcvad
|
import webrtcvad
|
||||||
from tqdm.contrib.concurrent import process_map
|
from tqdm.contrib.concurrent import process_map
|
||||||
|
|
||||||
|
@ -47,8 +42,8 @@ def write_wave(path, audio, sample_rate):
|
||||||
class Frame(object):
|
class Frame(object):
|
||||||
"""Represents a "frame" of audio data."""
|
"""Represents a "frame" of audio data."""
|
||||||
|
|
||||||
def __init__(self, bytes, timestamp, duration):
|
def __init__(self, _bytes, timestamp, duration):
|
||||||
self.bytes = bytes
|
self.bytes =_bytes
|
||||||
self.timestamp = timestamp
|
self.timestamp = timestamp
|
||||||
self.duration = duration
|
self.duration = duration
|
||||||
|
|
||||||
|
@ -121,7 +116,7 @@ def vad_collector(sample_rate, frame_duration_ms, padding_duration_ms, vad, fram
|
||||||
# We want to yield all the audio we see from now until
|
# We want to yield all the audio we see from now until
|
||||||
# we are NOTTRIGGERED, but we have to start with the
|
# we are NOTTRIGGERED, but we have to start with the
|
||||||
# audio that's already in the ring buffer.
|
# audio that's already in the ring buffer.
|
||||||
for f, s in ring_buffer:
|
for f, _ in ring_buffer:
|
||||||
voiced_frames.append(f)
|
voiced_frames.append(f)
|
||||||
ring_buffer.clear()
|
ring_buffer.clear()
|
||||||
else:
|
else:
|
||||||
|
@ -146,11 +141,10 @@ def vad_collector(sample_rate, frame_duration_ms, padding_duration_ms, vad, fram
|
||||||
|
|
||||||
|
|
||||||
def remove_silence(filepath):
|
def remove_silence(filepath):
|
||||||
filename = os.path.basename(filepath)
|
|
||||||
output_path = filepath.replace(os.path.join(args.input_dir, ""), os.path.join(args.output_dir, ""))
|
output_path = filepath.replace(os.path.join(args.input_dir, ""), os.path.join(args.output_dir, ""))
|
||||||
# ignore if the file exists
|
# ignore if the file exists
|
||||||
if os.path.exists(output_path) and not args.force:
|
if os.path.exists(output_path) and not args.force:
|
||||||
return False
|
return
|
||||||
# create all directory structure
|
# create all directory structure
|
||||||
pathlib.Path(output_path).parent.mkdir(parents=True, exist_ok=True)
|
pathlib.Path(output_path).parent.mkdir(parents=True, exist_ok=True)
|
||||||
padding_duration_ms = 300 # default 300
|
padding_duration_ms = 300 # default 300
|
||||||
|
@ -166,7 +160,7 @@ def remove_silence(filepath):
|
||||||
if num_segments != 0:
|
if num_segments != 0:
|
||||||
for i, segment in reversed(list(enumerate(segments))):
|
for i, segment in reversed(list(enumerate(segments))):
|
||||||
if i >= 1:
|
if i >= 1:
|
||||||
if flag == False:
|
if not flag:
|
||||||
concat_segment = segment
|
concat_segment = segment
|
||||||
flag = True
|
flag = True
|
||||||
else:
|
else:
|
||||||
|
@ -176,11 +170,12 @@ def remove_silence(filepath):
|
||||||
segment = segment + concat_segment
|
segment = segment + concat_segment
|
||||||
write_wave(output_path, segment, sample_rate)
|
write_wave(output_path, segment, sample_rate)
|
||||||
print(output_path)
|
print(output_path)
|
||||||
return True
|
return
|
||||||
else:
|
else:
|
||||||
print("> Just Copying the file to:", output_path)
|
print("> Just Copying the file to:", output_path)
|
||||||
# if fail to remove silence just write the file
|
# if fail to remove silence just write the file
|
||||||
write_wave(output_path, audio, sample_rate)
|
write_wave(output_path, audio, sample_rate)
|
||||||
|
return
|
||||||
|
|
||||||
|
|
||||||
def preprocess_audios():
|
def preprocess_audios():
|
||||||
|
@ -198,11 +193,9 @@ def preprocess_audios():
|
||||||
|
|
||||||
|
|
||||||
if __name__ == "__main__":
|
if __name__ == "__main__":
|
||||||
"""
|
parser = argparse.ArgumentParser(
|
||||||
usage
|
description="python remove_silence.py -i=VCTK-Corpus-bk/ -o=../VCTK-Corpus-removed-silence -g=wav48/*/*.wav -a=2"
|
||||||
python remove_silence.py -i=VCTK-Corpus-bk/ -o=../VCTK-Corpus-removed-silence -g=wav48/*/*.wav -a=2
|
)
|
||||||
"""
|
|
||||||
parser = argparse.ArgumentParser()
|
|
||||||
parser.add_argument("-i", "--input_dir", type=str, default="../VCTK-Corpus", help="Dataset root dir")
|
parser.add_argument("-i", "--input_dir", type=str, default="../VCTK-Corpus", help="Dataset root dir")
|
||||||
parser.add_argument(
|
parser.add_argument(
|
||||||
"-o", "--output_dir", type=str, default="../VCTK-Corpus-removed-silence", help="Output Dataset dir"
|
"-o", "--output_dir", type=str, default="../VCTK-Corpus-removed-silence", help="Output Dataset dir"
|
||||||
|
|
|
@ -59,7 +59,7 @@ def mozilla_de(root_path, meta_file, **kwargs): # pylint: disable=unused-argume
|
||||||
return items
|
return items
|
||||||
|
|
||||||
|
|
||||||
def mailabs(root_path, meta_files=None):
|
def mailabs(root_path, meta_files=None, ununsed_speakers=None):
|
||||||
"""Normalizes M-AI-Labs meta data files to TTS format
|
"""Normalizes M-AI-Labs meta data files to TTS format
|
||||||
|
|
||||||
Args:
|
Args:
|
||||||
|
|
|
@ -1,5 +1,4 @@
|
||||||
import math
|
import math
|
||||||
import random
|
|
||||||
from dataclasses import dataclass, field
|
from dataclasses import dataclass, field
|
||||||
from itertools import chain
|
from itertools import chain
|
||||||
from typing import Dict, List, Tuple
|
from typing import Dict, List, Tuple
|
||||||
|
@ -747,7 +746,7 @@ class Vits(BaseTTS):
|
||||||
|
|
||||||
# inverse decoder and get the output
|
# inverse decoder and get the output
|
||||||
z_f_pred = self.flow(z_ft, y_mask, g=g, reverse=True)
|
z_f_pred = self.flow(z_ft, y_mask, g=g, reverse=True)
|
||||||
z_slice, slice_ids = rand_segment(z_f_pred, y_lengths, self.spec_segment_size)
|
z_slice, slice_ids = rand_segments(z_f_pred, y_lengths, self.spec_segment_size)
|
||||||
|
|
||||||
o = self.waveform_decoder(z_slice, g=g)
|
o = self.waveform_decoder(z_slice, g=g)
|
||||||
|
|
||||||
|
|
|
@ -7,6 +7,7 @@ import fsspec
|
||||||
import numpy as np
|
import numpy as np
|
||||||
import torch
|
import torch
|
||||||
from coqpit import Coqpit
|
from coqpit import Coqpit
|
||||||
|
from torch.utils.data.sampler import WeightedRandomSampler
|
||||||
|
|
||||||
from TTS.config import load_config
|
from TTS.config import load_config
|
||||||
from TTS.speaker_encoder.utils.generic_utils import setup_model
|
from TTS.speaker_encoder.utils.generic_utils import setup_model
|
||||||
|
|
|
@ -180,7 +180,7 @@ def plot_phonemes(train_path, cmu_dict_path, save_path):
|
||||||
|
|
||||||
plt.figure()
|
plt.figure()
|
||||||
plt.rcParams["figure.figsize"] = (50, 20)
|
plt.rcParams["figure.figsize"] = (50, 20)
|
||||||
barplot = sns.barplot(x, y)
|
barplot = sns.barplot(x=x, y=y)
|
||||||
if save_path:
|
if save_path:
|
||||||
fig = barplot.get_figure()
|
fig = barplot.get_figure()
|
||||||
fig.savefig(os.path.join(save_path, "phoneme_dist"))
|
fig.savefig(os.path.join(save_path, "phoneme_dist"))
|
||||||
|
|
Loading…
Reference in New Issue