Use the model's `synthesize` method when it exists

2023-05-06 11:58:59 +02:00 · 2023-05-06 11:58:59 +02:00 · a4860eac9a
parent 90b33e398f
commit a4860eac9a
3 changed files with 7 additions and 5 deletions
--- a/TTS/api.py
+++ b/TTS/api.py
@@ -342,7 +342,9 @@ class TTS:

    def download_model_by_name(self, model_name: str):
        model_path, config_path, model_item = self.manager.download_model(model_name)
-        if model_path.split("--")[-1] == "tortoise-v2":
+        if isinstance(model_item["github_rls_url"], list):
+            # return model directory if there are multiple files
+            # we assume that the model knows how to load itself
            return None, None, None, None, model_path
        if model_item.get("default_vocoder") is None:
            return model_path, config_path, None, None
--- a/TTS/tts/models/tortoise.py
+++ b/TTS/tts/models/tortoise.py
@@ -450,7 +450,7 @@ class Tortoise(BaseTTS):
        with torch.no_grad():
            return self.rlg_auto(torch.tensor([0.0])), self.rlg_diffusion(torch.tensor([0.0]))

-    def synthesis(self, text, config, speaker_id="lj", **kwargs):
+    def synthesize(self, text, config, speaker_id="lj", **kwargs):
        voice_samples, conditioning_latents = load_voice(speaker_id)

        outputs = self.inference_with_config(
--- a/TTS/utils/synthesizer.py
+++ b/TTS/utils/synthesizer.py
@@ -333,15 +333,15 @@ class Synthesizer(object):
                )

        # compute a new d_vector from the given clip.
-        if speaker_wav is not None:
+        if speaker_wav is not None and self.tts_model.speaker_manager is not None:
            speaker_embedding = self.tts_model.speaker_manager.compute_embedding_from_clip(speaker_wav)

        use_gl = self.vocoder_model is None

        if not reference_wav:
            for sen in sens:
-                if self.tts_config.model == "tortoise":
-                    outputs = self.tts_model.synthesis(text=sen, config=self.tts_config, **kwargs)
+                if hasattr(self.tts_model, "synthesize"):
+                    outputs = self.tts_model.synthesize(text=sen, config=self.tts_config, **kwargs)
                else:
                    # synthesize voice
                    outputs = synthesis(