diff --git a/TTS/api.py b/TTS/api.py
index 554fec80..13f71c84 100644
--- a/TTS/api.py
+++ b/TTS/api.py
@@ -342,7 +342,9 @@ class TTS:
     def download_model_by_name(self, model_name: str):
         model_path, config_path, model_item = self.manager.download_model(model_name)
-        if model_path.split("--")[-1] == "tortoise-v2":
+        if isinstance(model_item["github_rls_url"], list):
+            # return model directory if there are multiple files
+            # we assume that the model knows how to load itself
             return None, None, None, None, model_path
         if model_item.get("default_vocoder") is None:
             return model_path, config_path, None, None
diff --git a/TTS/tts/models/tortoise.py b/TTS/tts/models/tortoise.py
index 16808f9b..3403d8a2 100644
--- a/TTS/tts/models/tortoise.py
+++ b/TTS/tts/models/tortoise.py
@@ -450,7 +450,7 @@ class Tortoise(BaseTTS):
         with torch.no_grad():
             return self.rlg_auto(torch.tensor([0.0])), self.rlg_diffusion(torch.tensor([0.0]))
 
-    def synthesis(self, text, config, speaker_id="lj", **kwargs):
+    def synthesize(self, text, config, speaker_id="lj", **kwargs):
         voice_samples, conditioning_latents = load_voice(speaker_id)
 
         outputs = self.inference_with_config(
diff --git a/TTS/utils/synthesizer.py b/TTS/utils/synthesizer.py
index 8b50e11d..a7a68eb2 100644
--- a/TTS/utils/synthesizer.py
+++ b/TTS/utils/synthesizer.py
@@ -333,15 +333,15 @@ class Synthesizer(object):
             )
 
         # compute a new d_vector from the given clip.
-        if speaker_wav is not None:
+        if speaker_wav is not None and self.tts_model.speaker_manager is not None:
             speaker_embedding = self.tts_model.speaker_manager.compute_embedding_from_clip(speaker_wav)
 
         use_gl = self.vocoder_model is None
 
         if not reference_wav:
             for sen in sens:
-                if self.tts_config.model == "tortoise":
-                    outputs = self.tts_model.synthesis(text=sen, config=self.tts_config, **kwargs)
+                if hasattr(self.tts_model, "synthesize"):
+                    outputs = self.tts_model.synthesize(text=sen, config=self.tts_config, **kwargs)
                 else:
                     # synthesize voice
                     outputs = synthesis(