Merge pull request #2495 from coqui-ai/api_voice_conversion

API voice conversion
pull/2499/head
Eren Gölge 2023-04-11 16:40:14 +02:00 committed by GitHub
commit 73d963718a
4 changed files with 49 additions and 11 deletions

View File

@@ -1,10 +1,13 @@
<img src="https://raw.githubusercontent.com/coqui-ai/TTS/main/images/coqui-log-green-TTS.png" height="56"/>
----
### 📣 Clone your voice with a single click on [🐸Coqui.ai](https://app.coqui.ai/auth/signin)
## 🐸Coqui.ai News
- 📣 The Coqui Studio API has landed in 🐸TTS. You can use the Studio voices in combination with 🐸TTS models. [Example](https://github.com/coqui-ai/TTS/edit/dev/README.md#-python-api) (see the sketch after this list)
- 📣 Voice generation with prompts - **Prompt to Voice** - is live on Coqui.ai!! [Blog Post](https://coqui.ai/blog/tts/prompt-to-voice)
- 📣 Clone your voice with a single click on [🐸Coqui.ai](https://app.coqui.ai/auth/signin)
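The routing added to `TTS.__init__` later in this PR treats any model name containing "coqui_studio" as a Studio voice. Below is a minimal sketch of what that enables, not part of this PR: it assumes `COQUI_STUDIO_TOKEN` is already exported and that Studio voices show up in `tts.models` under names containing "coqui_studio"; the filtering helper and output path are illustrative only.

```python
# A minimal sketch, not part of this PR: pick a Coqui Studio voice from the model list
# and synthesize with it. Assumes COQUI_STUDIO_TOKEN is set in the environment and that
# Studio voices are listed under names containing "coqui_studio".
from TTS.api import TTS

tts = TTS()
studio_voices = [name for name in tts.models if "coqui_studio" in name]  # assumed naming

if studio_voices:
    studio_tts = TTS(model_name=studio_voices[0], progress_bar=False, gpu=False)
    studio_tts.tts_to_file(text="Hello from a Studio voice!", file_path="studio_output.wav")
```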
<br>
## <img src="https://raw.githubusercontent.com/coqui-ai/TTS/main/images/coqui-log-green-TTS.png" height="56"/>
----
🐸TTS is a library for advanced Text-to-Speech generation. It's built on the latest research and designed to achieve the best trade-off among ease of training, speed, and quality.
🐸TTS comes with pretrained models and tools for measuring dataset quality, and is already used in **20+ languages** for products and research projects.
@@ -123,6 +126,9 @@ Underlined "TTS*" and "Judy*" are 🐸TTS models
- HiFiGAN: [paper](https://arxiv.org/abs/2010.05646)
- UnivNet: [paper](https://arxiv.org/abs/2106.07889)
### Voice Conversion
- FreeVC: [paper](https://arxiv.org/abs/2210.15418)
You can also help us implement more models.
## Install TTS

View File

@@ -85,7 +85,7 @@ class CS_API:
self.headers = {"Content-Type": "application/json", "Authorization": f"Bearer {self.api_token}"}
if not self.api_token:
raise ValueError(
"No API token found for 🐸Coqui Studio voices - https://coqui.ai.\n"
"No API token found for 🐸Coqui Studio voices - https://coqui.ai \n"
"Visit 🔗https://app.coqui.ai/account to get one.\n"
"Set it as an environment variable `export COQUI_STUDIO_TOKEN=<token>`\n"
""
@@ -273,8 +273,11 @@ class TTS:
self.csapi = None
self.model_name = None
if model_name:
self.load_tts_model_by_name(model_name, gpu)
if model_name is not None:
if "tts_models" in model_name or "coqui_studio" in model_name:
self.load_tts_model_by_name(model_name, gpu)
elif "voice_conversion_models" in model_name:
self.load_vc_model_by_name(model_name, gpu)
if model_path:
self.load_tts_model_by_path(
@@ -342,6 +345,7 @@ class TTS:
model_name (str): Model name to load. You can list models by ```tts.models```.
gpu (bool, optional): Enable/disable GPU. Some models might be too slow on CPU. Defaults to False.
"""
self.model_name = model_name
model_path, config_path, _, _ = self.download_model_by_name(model_name)
self.voice_converter = Synthesizer(vc_checkpoint=model_path, vc_config=config_path, use_cuda=gpu)
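A minimal sketch of the two loading paths wired up above; the model names are taken from the test at the end of this PR, so anything beyond them should be treated as illustrative.

```python
# Exercising both branches of the new model_name routing in TTS.__init__.
from TTS.api import TTS

# Names containing "tts_models" (or "coqui_studio") go through load_tts_model_by_name.
tts = TTS(model_name="tts_models/multilingual/multi-dataset/your_tts", progress_bar=False, gpu=False)

# Names containing "voice_conversion_models" go through load_vc_model_by_name, which
# wraps the downloaded checkpoint in a Synthesizer configured only for voice conversion.
vc = TTS(model_name="voice_conversion_models/multilingual/vctk/freevc24", progress_bar=False, gpu=False)
```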
@@ -565,19 +569,39 @@ class TTS:
def voice_conversion(
self,
sourve_wav: str,
source_wav: str,
target_wav: str,
):
"""Voice conversion with FreeVC. Convert source wav to target speaker.
Args:
source_wav (str):
Path to the source wav file.
target_wav (str):
Path to the target wav file.
"""
wav = self.voice_converter.voice_conversion(source_wav=source_wav, target_wav=target_wav)
return wav
def voice_conversion_to_file(
self,
source_wav: str,
target_wav: str,
file_path: str = "output.wav",
):
"""Voice conversion with FreeVC. Convert source wav to target speaker.
Args:
source_wav (str):
Path to the source wav file.
target_wav (str):
Path to the target wav file.
file_path (str, optional):
Output file path. Defaults to "output.wav".
"""
wav = self.synthesizer.voice_conversion(source_wav=sourve_wav, target_wav=target_wav)
return wav
wav = self.voice_conversion(source_wav=source_wav, target_wav=target_wav)
save_wav(wav=wav, path=file_path, sample_rate=self.voice_converter.vc_config.audio.output_sample_rate)
return file_path
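A short usage sketch of the two methods added above, mirroring the test at the bottom of this PR; "source.wav", "target.wav", and "converted.wav" are placeholder paths.

```python
# Convert the speech in source.wav to the speaker identity of target.wav with FreeVC.
from TTS.api import TTS

tts = TTS(model_name="voice_conversion_models/multilingual/vctk/freevc24", progress_bar=False, gpu=False)

tts.voice_conversion_to_file(
    source_wav="source.wav",    # placeholder: utterance whose content is kept
    target_wav="target.wav",    # placeholder: reference for the target speaker
    file_path="converted.wav",  # placeholder: where the converted audio is written
)
```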
def tts_with_vc(self, text: str, language: str = None, speaker_wav: str = None):
"""Convert text to speech with voice conversion.

View File

@@ -711,7 +711,7 @@ class AudioProcessor(object):
Args:
filename (str): Path to the wav file.
"""
return librosa.get_duration(path=filename)
return librosa.get_duration(filename=filename)
@staticmethod
def mulaw_encode(wav: np.ndarray, qc: int) -> np.ndarray:
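The change above swaps the keyword passed to `librosa.get_duration` (`path` vs `filename`); librosa 0.10 renamed the `filename` argument to `path`. Below is a hypothetical helper, not part of this PR, that tolerates either librosa version.

```python
# Hypothetical helper (not in this PR): work with both librosa keyword names.
import librosa


def get_wav_duration(wav_path: str) -> float:
    """Return the duration of an audio file in seconds."""
    try:
        return librosa.get_duration(path=wav_path)  # librosa >= 0.10
    except TypeError:
        return librosa.get_duration(filename=wav_path)  # older librosa releases
```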

View File

@@ -93,3 +93,11 @@ class TTSTest(unittest.TestCase):
tts = TTS()
tts.load_tts_model_by_name("tts_models/multilingual/multi-dataset/your_tts")
tts.tts_to_file("Hello world!", speaker_wav=cloning_test_wav_path, language="en", file_path=OUTPUT_PATH)
def test_voice_conversion(self): # pylint: disable=no-self-use
tts = TTS(model_name="voice_conversion_models/multilingual/vctk/freevc24", progress_bar=False, gpu=False)
tts.voice_conversion_to_file(
source_wav=cloning_test_wav_path,
target_wav=cloning_test_wav_path,
file_path=OUTPUT_PATH,
)