Remove coqui studio integration from TTS

2023-12-11 22:11:46 +01:00 · 2023-12-11 22:11:46 +01:00 · 8c20a599d8
parent 5cd750ac7e
commit 8c20a599d8
11 changed files with 33 additions and 782 deletions
--- a/.github/workflows/api_tests.yml
+++ b/.github/workflows/api_tests.yml
@ -1,53 +0,0 @@
-name: api_tests
-
-on:
-  push:
-    branches:
-      - main
-jobs:
-  check_skip:
-    runs-on: ubuntu-latest
-    if: "! contains(github.event.head_commit.message, '[ci skip]')"
-    steps:
-      - run: echo "${{ github.event.head_commit.message }}"
-
-  test:
-    runs-on: ubuntu-latest
-    strategy:
-      fail-fast: false
-      matrix:
-        python-version: [3.9, "3.10", "3.11"]
-        experimental: [false]
-    steps:
-      - uses: actions/checkout@v3
-      - name: Set up Python ${{ matrix.python-version }}
-        uses: actions/setup-python@v4
-        with:
-          python-version: ${{ matrix.python-version }}
-          architecture: x64
-          cache: 'pip'
-          cache-dependency-path: 'requirements*'
-      - name: check OS
-        run: cat /etc/os-release
-      - name: set ENV
-        run: |
-          export TRAINER_TELEMETRY=0
-      - name: Install dependencies
-        run: |
-          sudo apt-get update
-          sudo apt-get install -y --no-install-recommends git make gcc
-          sudo apt-get install espeak-ng
-          make system-deps
-      - name: Install/upgrade Python setup deps
-        run: python3 -m pip install --upgrade pip setuptools wheel
-      - name: Replace scarf urls
-        run: |
-          sed -i 's/https:\/\/coqui.gateway.scarf.sh\//https:\/\/github.com\/coqui-ai\/TTS\/releases\/download\//g' TTS/.models.json
-      - name: Install TTS
-        run: |
-          python3 -m pip install .[all]
-          python3 setup.py egg_info
-      - name: Unit tests
-        run: make api_tests
-        env:
-          COQUI_STUDIO_TOKEN: ${{ secrets.COQUI_STUDIO_TOKEN }}
--- a/Makefile
+++ b/Makefile
@ -35,9 +35,6 @@ test_zoo:	## run zoo tests.
 inference_tests: ## run inference tests.
 	nose2 -F -v -B --with-coverage --coverage TTS tests.inference_tests

-api_tests: ## run api tests.
-	nose2 -F -v -B --with-coverage --coverage TTS tests.api_tests
-
 data_tests: ## run data tests.
 	nose2 -F -v -B --with-coverage --coverage TTS tests.data_tests

--- a/README.md
+++ b/README.md
@ -7,8 +7,6 @@
 - 📣 [🐶Bark](https://github.com/suno-ai/bark) is now available for inference with unconstrained voice cloning. [Docs](https://tts.readthedocs.io/en/dev/models/bark.html)
 - 📣 You can use [~1100 Fairseq models](https://github.com/facebookresearch/fairseq/tree/main/examples/mms) with 🐸TTS.
 - 📣 🐸TTS now supports 🐢Tortoise with faster inference. [Docs](https://tts.readthedocs.io/en/dev/models/tortoise.html)
-- 📣 **Coqui Studio API** is landed on 🐸TTS. - [Example](https://github.com/coqui-ai/TTS/blob/dev/README.md#-python-api)
-- 📣 [**Coqui Studio API**](https://docs.coqui.ai/docs) is live.
 - 📣 Voice generation with prompts - **Prompt to Voice** - is live on [**Coqui Studio**](https://app.coqui.ai/auth/signin)!! - [Blog Post](https://coqui.ai/blog/tts/prompt-to-voice)
 - 📣 Voice generation with fusion - **Voice fusion** - is live on [**Coqui Studio**](https://app.coqui.ai/auth/signin).
 - 📣 Voice cloning is live on [**Coqui Studio**](https://app.coqui.ai/auth/signin).
@ -253,29 +251,6 @@ tts.tts_with_vc_to_file(
 )
 ```

-#### Example using [🐸Coqui Studio](https://coqui.ai) voices.
-You access all of your cloned voices and built-in speakers in [🐸Coqui Studio](https://coqui.ai).
-To do this, you'll need an API token, which you can obtain from the [account page](https://coqui.ai/account).
-After obtaining the API token, you'll need to configure the COQUI_STUDIO_TOKEN environment variable.
-
-Once you have a valid API token in place, the studio speakers will be displayed as distinct models within the list.
-These models will follow the naming convention `coqui_studio/en/<studio_speaker_name>/coqui_studio`
-
-```python
-# XTTS model
-models = TTS(cs_api_model="XTTS").list_models()
-# Init TTS with the target studio speaker
-tts = TTS(model_name="coqui_studio/en/Torcull Diarmuid/coqui_studio", progress_bar=False)
-# Run TTS
-tts.tts_to_file(text="This is a test.", language="en", file_path=OUTPUT_PATH)
-
-# V1 model
-models = TTS(cs_api_model="V1").list_models()
-# Run TTS with emotion and speed control
-# Emotion control only works with V1 model
-tts.tts_to_file(text="This is a test.", file_path=OUTPUT_PATH, emotion="Happy", speed=1.5)
-```
-
 #### Example text to speech using **Fairseq models in ~1100 languages** 🤯.
 For Fairseq models, use the following name format: `tts_models/<lang-iso_code>/fairseq/vits`.
 You can find the language ISO codes [here](https://dl.fbaipublicfiles.com/mms/tts/all-tts-languages.html)
@ -353,10 +328,4 @ If you don't specify any models, then it uses LJSpeech based English model.

-- Run TTS and define speed factor to use for 🐸Coqui Studio models, between 0.0 and 2.0:
-
-  ```
-  $ tts --text "Text for TTS" --model_name "coqui_studio/<language>/<dataset>/<model_name>" --speed 1.2 --out_path output/path/speech.wav
-  ```
-
 - Run a TTS model with its default vocoder model:

  ```
--- a/TTS/api.py
+++ b/TTS/api.py
@ -6,7 +6,6 @@ from typing import Union
 import numpy as np
 from torch import nn

-from TTS.cs_api import CS_API
 from TTS.utils.audio.numpy_transforms import save_wav
 from TTS.utils.manage import ModelManager
 from TTS.utils.synthesizer import Synthesizer
@ -24,7 +23,6 @@ class TTS(nn.Module):
        vocoder_path: str = None,
        vocoder_config_path: str = None,
        progress_bar: bool = True,
-        cs_api_model: str = "XTTS",
        gpu=False,
    ):
        """🐸TTS python interface that allows to load and use the released models.
@ -60,9 +58,6 @@ class TTS(nn.Module):
            vocoder_path (str, optional): Path to the vocoder checkpoint. Defaults to None.
            vocoder_config_path (str, optional): Path to the vocoder config. Defaults to None.
            progress_bar (bool, optional): Whether to pring a progress bar while downloading a model. Defaults to True.
-            cs_api_model (str, optional): Name of the model to use for the Coqui Studio API. Available models are
-                "XTTS", "V1". You can also use `TTS.cs_api.CS_API" for more control.
-                Defaults to "XTTS".
            gpu (bool, optional): Enable/disable GPU. Some models might be too slow on CPU. Defaults to False.
        """
        super().__init__()
@ -70,14 +65,12 @@ class TTS(nn.Module):
        self.config = load_config(config_path) if config_path else None
        self.synthesizer = None
        self.voice_converter = None
-        self.csapi = None
-        self.cs_api_model = cs_api_model
        self.model_name = ""
        if gpu:
            warnings.warn("`gpu` will be deprecated. Please use `tts.to(device)` instead.")

        if model_name is not None and len(model_name) > 0:
-            if "tts_models" in model_name or "coqui_studio" in model_name:
+            if "tts_models" in model_name:
                self.load_tts_model_by_name(model_name, gpu)
            elif "voice_conversion_models" in model_name:
                self.load_vc_model_by_name(model_name, gpu)
@ -99,12 +92,6 @@ class TTS(nn.Module):
            return self.synthesizer.tts_model.speaker_manager.num_speakers > 1
        return False

-    @property
-    def is_coqui_studio(self):
-        if self.model_name is None:
-            return False
-        return "coqui_studio" in self.model_name
-
    @property
    def is_multi_lingual(self):
        # Not sure what sets this to None, but applied a fix to prevent crashing.
@ -136,14 +123,7 @@ class TTS(nn.Module):
        return Path(__file__).parent / ".models.json"

    def list_models(self):
-        try:
-            csapi = CS_API(model=self.cs_api_model)
-            models = csapi.list_speakers_as_tts_models()
-        except ValueError as e:
-            print(e)
-            models = []
-        manager = ModelManager(models_file=TTS.get_models_file_path(), progress_bar=False, verbose=False)
-        return manager.list_tts_models() + models
+        return ModelManager(models_file=TTS.get_models_file_path(), progress_bar=False, verbose=False).list_tts_models()

    def download_model_by_name(self, model_name: str):
        model_path, config_path, model_item = self.manager.download_model(model_name)
@ -186,30 +166,26 @@ class TTS(nn.Module):
        TODO: Add tests
        """
        self.synthesizer = None
-        self.csapi = None
        self.model_name = model_name

-        if "coqui_studio" in model_name:
-            self.csapi = CS_API()
-        else:
-            model_path, config_path, vocoder_path, vocoder_config_path, model_dir = self.download_model_by_name(
-                model_name
-            )
+        model_path, config_path, vocoder_path, vocoder_config_path, model_dir = self.download_model_by_name(
+            model_name
+        )

-            # init synthesizer
-            # None values are fetch from the model
-            self.synthesizer = Synthesizer(
-                tts_checkpoint=model_path,
-                tts_config_path=config_path,
-                tts_speakers_file=None,
-                tts_languages_file=None,
-                vocoder_checkpoint=vocoder_path,
-                vocoder_config=vocoder_config_path,
-                encoder_checkpoint=None,
-                encoder_config=None,
-                model_dir=model_dir,
-                use_cuda=gpu,
-            )
+        # init synthesizer
+        # None values are fetch from the model
+        self.synthesizer = Synthesizer(
+            tts_checkpoint=model_path,
+            tts_config_path=config_path,
+            tts_speakers_file=None,
+            tts_languages_file=None,
+            vocoder_checkpoint=vocoder_path,
+            vocoder_config=vocoder_config_path,
+            encoder_checkpoint=None,
+            encoder_config=None,
+            model_dir=model_dir,
+            use_cuda=gpu,
+        )

    def load_tts_model_by_path(
        self, model_path: str, config_path: str, vocoder_path: str = None, vocoder_config: str = None, gpu: bool = False
@ -246,77 +222,17 @@ class TTS(nn.Module):
        **kwargs,
    ) -> None:
        """Check if the arguments are valid for the model."""
-        if not self.is_coqui_studio:
-            # check for the coqui tts models
-            if self.is_multi_speaker and (speaker is None and speaker_wav is None):
-                raise ValueError("Model is multi-speaker but no `speaker` is provided.")
-            if self.is_multi_lingual and language is None:
-                raise ValueError("Model is multi-lingual but no `language` is provided.")
-            if not self.is_multi_speaker and speaker is not None and "voice_dir" not in kwargs:
-                raise ValueError("Model is not multi-speaker but `speaker` is provided.")
-            if not self.is_multi_lingual and language is not None:
-                raise ValueError("Model is not multi-lingual but `language` is provided.")
-            if not emotion is None and not speed is None:
-                raise ValueError("Emotion and speed can only be used with Coqui Studio models.")
-        else:
-            if emotion is None:
-                emotion = "Neutral"
-            if speed is None:
-                speed = 1.0
-            # check for the studio models
-            if speaker_wav is not None:
-                raise ValueError("Coqui Studio models do not support `speaker_wav` argument.")
-            if speaker is not None:
-                raise ValueError("Coqui Studio models do not support `speaker` argument.")
-            if language is not None and language != "en":
-                raise ValueError("Coqui Studio models currently support only `language=en` argument.")
-            if emotion not in ["Neutral", "Happy", "Sad", "Angry", "Dull"]:
-                raise ValueError(f"Emotion - `{emotion}` - must be one of `Neutral`, `Happy`, `Sad`, `Angry`, `Dull`.")
-
-    def tts_coqui_studio(
-        self,
-        text: str,
-        speaker_name: str = None,
-        language: str = None,
-        emotion: str = None,
-        speed: float = 1.0,
-        pipe_out=None,
-        file_path: str = None,
-    ) -> Union[np.ndarray, str]:
-        """Convert text to speech using Coqui Studio models. Use `CS_API` class if you are only interested in the API.
-
-        Args:
-            text (str):
-                Input text to synthesize.
-            speaker_name (str, optional):
-                Speaker name from Coqui Studio. Defaults to None.
-            language (str): Language of the text. If None, the default language of the speaker is used. Language is only
-                supported by `XTTS` model.
-            emotion (str, optional):
-                Emotion of the speaker. One of "Neutral", "Happy", "Sad", "Angry", "Dull". Emotions are only available
-                with "V1" model. Defaults to None.
-            speed (float, optional):
-                Speed of the speech. Defaults to 1.0.
-            pipe_out (BytesIO, optional):
-                Flag to stdout the generated TTS wav file for shell pipe.
-            file_path (str, optional):
-                Path to save the output file. When None it returns the `np.ndarray` of waveform. Defaults to None.
-
-        Returns:
-            Union[np.ndarray, str]: Waveform of the synthesized speech or path to the output file.
-        """
-        speaker_name = self.model_name.split("/")[2]
-        if file_path is not None:
-            return self.csapi.tts_to_file(
-                text=text,
-                speaker_name=speaker_name,
-                language=language,
-                speed=speed,
-                pipe_out=pipe_out,
-                emotion=emotion,
-                file_path=file_path,
-            )[0]
-        return self.csapi.tts(text=text, speaker_name=speaker_name, language=language, speed=speed, emotion=emotion)[0]
+        # check for the coqui tts models
+        if self.is_multi_speaker and (speaker is None and speaker_wav is None):
+            raise ValueError("Model is multi-speaker but no `speaker` is provided.")
+        if self.is_multi_lingual and language is None:
+            raise ValueError("Model is multi-lingual but no `language` is provided.")
+        if not self.is_multi_speaker and speaker is not None and "voice_dir" not in kwargs:
+            raise ValueError("Model is not multi-speaker but `speaker` is provided.")
+        if not self.is_multi_lingual and language is not None:
+            raise ValueError("Model is not multi-lingual but `language` is provided.")
+        if not emotion is None and not speed is None:
+            raise ValueError("Emotion and speed can only be used with Coqui Studio models, which have been discontinued.")

    def tts(
        self,
@ -357,10 +273,6 @@ class TTS(nn.Module):
        self._check_arguments(
            speaker=speaker, language=language, speaker_wav=speaker_wav, emotion=emotion, speed=speed, **kwargs
        )
-        if self.csapi is not None:
-            return self.tts_coqui_studio(
-                text=text, speaker_name=speaker, language=language, emotion=emotion, speed=speed
-            )
        wav = self.synthesizer.tts(
            text=text,
            speaker_name=speaker,
@ -419,16 +331,6 @@ class TTS(nn.Module):
        """
        self._check_arguments(speaker=speaker, language=language, speaker_wav=speaker_wav, **kwargs)

-        if self.csapi is not None:
-            return self.tts_coqui_studio(
-                text=text,
-                speaker_name=speaker,
-                language=language,
-                emotion=emotion,
-                speed=speed,
-                file_path=file_path,
-                pipe_out=pipe_out,
-            )
        wav = self.tts(
            text=text,
            speaker=speaker,
--- a/TTS/bin/synthesize.py
+++ b/TTS/bin/synthesize.py
@ -66,12 +66,6 @@ If you don't specify any models, then it uses LJSpeech based English model.
  $ tts --text "Text for TTS" --pipe_out --out_path output/path/speech.wav | aplay
  ```

-- Run TTS and define speed factor to use for 🐸Coqui Studio models, between 0.0 and 2.0:
-
-  ```
-  $ tts --text "Text for TTS" --model_name "coqui_studio/<language>/<dataset>/<model_name>" --speed 1.2 --out_path output/path/speech.wav
-  ```
-
 - Run a TTS model with its default vocoder model:

  ```
@ -222,25 +216,6 @@ def main():
        default=None,
    )
    parser.add_argument("--encoder_config_path", type=str, help="Path to speaker encoder config file.", default=None)
-
-    # args for coqui studio
-    parser.add_argument(
-        "--cs_model",
-        type=str,
-        help="Name of the 🐸Coqui Studio model. Available models are `XTTS`, `V1`.",
-    )
-    parser.add_argument(
-        "--emotion",
-        type=str,
-        help="Emotion to condition the model with. Only available for 🐸Coqui Studio `V1` model.",
-        default=None,
-    )
-    parser.add_argument(
-        "--language",
-        type=str,
-        help="Language to condition the model with. Only available for 🐸Coqui Studio `XTTS` model.",
-        default=None,
-    )
    parser.add_argument(
        "--pipe_out",
        help="stdout the generated TTS wav file for shell pipe.",
@ -249,13 +224,7 @@ def main():
        const=True,
        default=False,
    )
-    parser.add_argument(
-        "--speed",
-        type=float,
-        help="Speed factor to use for 🐸Coqui Studio models, between 0.0 and 2.0.",
-        default=None,
-    )
-
+
    # args for multi-speaker synthesis
    parser.add_argument("--speakers_file_path", type=str, help="JSON file for multi-speaker model.", default=None)
    parser.add_argument("--language_ids_file_path", type=str, help="JSON file for multi-lingual model.", default=None)
@ -389,7 +358,6 @@ def main():

        # CASE1 #list : list pre-trained TTS models
        if args.list_models:
-            manager.add_cs_api_models(api.list_models())
            manager.list_models()
            sys.exit()

@ -404,21 +372,6 @@ def main():
            manager.model_info_by_full_name(model_query_full_name)
            sys.exit()

-        # CASE3: TTS with coqui studio models
-        if "coqui_studio" in args.model_name:
-            print(" > Using 🐸Coqui Studio model: ", args.model_name)
-            api = TTS(model_name=args.model_name, cs_api_model=args.cs_model)
-            api.tts_to_file(
-                text=args.text,
-                emotion=args.emotion,
-                file_path=args.out_path,
-                language=args.language,
-                speed=args.speed,
-                pipe_out=pipe_out,
-            )
-            print(" > Saving output to ", args.out_path)
-            return
-
        if args.language_idx is None and args.language is not None:
            msg = (
                "--language is only supported for Coqui Studio models. "
@ -426,7 +379,7 @@ def main():
            )
            raise ValueError(msg)

-        # CASE4: load pre-trained model paths
+        # CASE3: load pre-trained model paths
        if args.model_name is not None and not args.model_path:
            model_path, config_path, model_item = manager.download_model(args.model_name)
            # tts model
@ -454,7 +407,7 @@ def main():
        if args.vocoder_name is not None and not args.vocoder_path:
            vocoder_path, vocoder_config_path, _ = manager.download_model(args.vocoder_name)

-        # CASE5: set custom model paths
+        # CASE4: set custom model paths
        if args.model_path is not None:
            tts_path = args.model_path
            tts_config_path = args.config_path
--- a/TTS/cs_api.py
+++ b/TTS/cs_api.py
@ -1,317 +0,0 @@
-import http.client
-import json
-import os
-import tempfile
-import urllib.request
-from typing import Tuple
-
-import numpy as np
-import requests
-from scipy.io import wavfile
-
-from TTS.utils.audio.numpy_transforms import save_wav
-
-
-class Speaker(object):
-    """Convert dict to object."""
-
-    def __init__(self, d, is_voice=False):
-        self.is_voice = is_voice
-        for k, v in d.items():
-            if isinstance(k, (list, tuple)):
-                setattr(self, k, [Speaker(x) if isinstance(x, dict) else x for x in v])
-            else:
-                setattr(self, k, Speaker(v) if isinstance(v, dict) else v)
-
-    def __repr__(self):
-        return str(self.__dict__)
-
-
-class CS_API:
-    """🐸Coqui Studio API Wrapper.
-
-    🐸Coqui Studio is the most advanced voice generation platform. You can generate new voices by voice cloning, voice
-    interpolation, or our unique prompt to voice technology. It also provides a set of built-in voices with different
-    characteristics. You can use these voices to generate new audio files or use them in your applications.
-    You can use all the built-in and your own 🐸Coqui Studio speakers with this API with an API token.
-    You can signup to 🐸Coqui Studio from https://app.coqui.ai/auth/signup and get an API token from
-    https://app.coqui.ai/account. We can either enter the token as an environment variable as
-    `export COQUI_STUDIO_TOKEN=<token>` or pass it as `CS_API(api_token=<toke>)`.
-    Visit https://app.coqui.ai/api for more information.
-
-
-    Args:
-        api_token (str): 🐸Coqui Studio API token. If not provided, it will be read from the environment variable
-            `COQUI_STUDIO_TOKEN`.
-        model (str): 🐸Coqui Studio model. It can be either `V1`, `XTTS`. Default is `XTTS`.
-
-
-    Example listing all available speakers:
-        >>> from TTS.api import CS_API
-        >>> tts = CS_API()
-        >>> tts.speakers
-
-    Example listing all emotions:
-        >>> # emotions are only available for `V1` model
-        >>> from TTS.api import CS_API
-        >>> tts = CS_API(model="V1")
-        >>> tts.emotions
-
-    Example with a built-in 🐸 speaker:
-        >>> from TTS.api import CS_API
-        >>> tts = CS_API()
-        >>> wav, sr = api.tts("Hello world", speaker_name=tts.speakers[0].name)
-        >>> filepath = tts.tts_to_file(text="Hello world!", speaker_name=tts.speakers[0].name, file_path="output.wav")
-
-    Example with multi-language model:
-        >>> from TTS.api import CS_API
-        >>> tts = CS_API(model="XTTS")
-        >>> wav, sr = api.tts("Hello world", speaker_name=tts.speakers[0].name, language="en")
-    """
-
-    MODEL_ENDPOINTS = {
-        "V1": {
-            "list_speakers": "https://app.coqui.ai/api/v2/speakers",
-            "synthesize": "https://app.coqui.ai/api/v2/samples",
-            "list_voices": "https://app.coqui.ai/api/v2/voices",
-        },
-        "XTTS": {
-            "list_speakers": "https://app.coqui.ai/api/v2/speakers",
-            "synthesize": "https://app.coqui.ai/api/v2/samples/xtts/render/",
-            "list_voices": "https://app.coqui.ai/api/v2/voices/xtts",
-        },
-    }
-
-    SUPPORTED_LANGUAGES = ["en", "es", "de", "fr", "it", "pt", "pl", "tr", "ru", "nl", "cs", "ar", "zh-cn", "ja"]
-
-    def __init__(self, api_token=None, model="XTTS"):
-        self.api_token = api_token
-        self.model = model
-        self.headers = None
-        self._speakers = None
-        self._check_token()
-
-    @staticmethod
-    def ping_api():
-        URL = "https://coqui.gateway.scarf.sh/tts/api"
-        _ = requests.get(URL)
-
-    @property
-    def speakers(self):
-        if self._speakers is None:
-            self._speakers = self.list_all_speakers()
-        return self._speakers
-
-    @property
-    def emotions(self):
-        """Return a list of available emotions.
-
-        TODO: Get this from the API endpoint.
-        """
-        if self.model == "V1":
-            return ["Neutral", "Happy", "Sad", "Angry", "Dull"]
-        else:
-            raise ValueError(f"❗ Emotions are not available for {self.model}.")
-
-    def _check_token(self):
-        if self.api_token is None:
-            self.api_token = os.environ.get("COQUI_STUDIO_TOKEN")
-            self.headers = {"Content-Type": "application/json", "Authorization": f"Bearer {self.api_token}"}
-        if not self.api_token:
-            raise ValueError(
-                "No API token found for 🐸Coqui Studio voices - https://coqui.ai \n"
-                "Visit 🔗https://app.coqui.ai/account to get one.\n"
-                "Set it as an environment variable `export COQUI_STUDIO_TOKEN=<token>`\n"
-                ""
-            )
-
-    def list_all_speakers(self):
-        """Return both built-in Coqui Studio speakers and custom voices created by the user."""
-        return self.list_speakers() + self.list_voices()
-
-    def list_speakers(self):
-        """List built-in Coqui Studio speakers."""
-        self._check_token()
-        conn = http.client.HTTPSConnection("app.coqui.ai")
-        url = self.MODEL_ENDPOINTS[self.model]["list_speakers"]
-        conn.request("GET", f"{url}?page=1&per_page=100", headers=self.headers)
-        res = conn.getresponse()
-        data = res.read()
-        return [Speaker(s) for s in json.loads(data)["result"]]
-
-    def list_voices(self):
-        """List custom voices created by the user."""
-        conn = http.client.HTTPSConnection("app.coqui.ai")
-        url = self.MODEL_ENDPOINTS[self.model]["list_voices"]
-        conn.request("GET", f"{url}?page=1&per_page=100", headers=self.headers)
-        res = conn.getresponse()
-        data = res.read()
-        return [Speaker(s, True) for s in json.loads(data)["result"]]
-
-    def list_speakers_as_tts_models(self):
-        """List speakers in ModelManager format."""
-        models = []
-        for speaker in self.speakers:
-            model = f"coqui_studio/multilingual/{speaker.name}/{self.model}"
-            models.append(model)
-        return models
-
-    def name_to_speaker(self, name):
-        for speaker in self.speakers:
-            if speaker.name == name:
-                return speaker
-        raise ValueError(f"Speaker {name} not found in {self.speakers}")
-
-    def id_to_speaker(self, speaker_id):
-        for speaker in self.speakers:
-            if speaker.id == speaker_id:
-                return speaker
-        raise ValueError(f"Speaker {speaker_id} not found.")
-
-    @staticmethod
-    def url_to_np(url):
-        tmp_file, _ = urllib.request.urlretrieve(url)
-        rate, data = wavfile.read(tmp_file)
-        return data, rate
-
-    @staticmethod
-    def _create_payload(model, text, speaker, speed, emotion, language):
-        payload = {}
-        # if speaker.is_voice:
-        payload["voice_id"] = speaker.id
-        # else:
-        payload["speaker_id"] = speaker.id
-
-        if model == "V1":
-            payload.update(
-                {
-                    "emotion": emotion,
-                    "name": speaker.name,
-                    "text": text,
-                    "speed": speed,
-                }
-            )
-        elif model == "XTTS":
-            payload.update(
-                {
-                    "name": speaker.name,
-                    "text": text,
-                    "speed": speed,
-                    "language": language,
-                }
-            )
-        else:
-            raise ValueError(f"❗ Unknown model {model}")
-        return payload
-
-    def _check_tts_args(self, text, speaker_name, speaker_id, emotion, speed, language):
-        assert text is not None, "❗ text is required for V1 model."
-        assert speaker_name is not None, "❗ speaker_name is required for V1 model."
-        if self.model == "V1":
-            if emotion is None:
-                emotion = "Neutral"
-            assert language is None, "❗ language is not supported for V1 model."
-        elif self.model == "XTTS":
-            assert emotion is None, f"❗ Emotions are not supported for XTTS model. Use V1 model."
-            assert language is not None, "❗ Language is required for XTTS model."
-            assert (
-                language in self.SUPPORTED_LANGUAGES
-            ), f"❗ Language {language} is not yet supported. Check https://docs.coqui.ai/reference/samples_xtts_create."
-        return text, speaker_name, speaker_id, emotion, speed, language
-
-    def tts(
-        self,
-        text: str,
-        speaker_name: str = None,
-        speaker_id=None,
-        emotion=None,
-        speed=1.0,
-        language=None,  # pylint: disable=unused-argument
-    ) -> Tuple[np.ndarray, int]:
-        """Synthesize speech from text.
-
-        Args:
-            text (str): Text to synthesize.
-            speaker_name (str): Name of the speaker. You can get the list of speakers with `list_speakers()` and
-                voices (user generated speakers) with `list_voices()`.
-            speaker_id (str): Speaker ID. If None, the speaker name is used.
-            emotion (str): Emotion of the speaker. One of "Neutral", "Happy", "Sad", "Angry", "Dull". Emotions are only
-                supported by `V1` model. Defaults to None.
-            speed (float): Speed of the speech. 1.0 is normal speed.
-            language (str): Language of the text. If None, the default language of the speaker is used. Language is only
-                supported by `XTTS` model. See https://docs.coqui.ai/reference/samples_xtts_create for supported languages.
-        """
-        self._check_token()
-        self.ping_api()
-
-        if speaker_name is None and speaker_id is None:
-            raise ValueError(" [!] Please provide either a `speaker_name` or a `speaker_id`.")
-        if speaker_id is None:
-            speaker = self.name_to_speaker(speaker_name)
-        else:
-            speaker = self.id_to_speaker(speaker_id)
-
-        text, speaker_name, speaker_id, emotion, speed, language = self._check_tts_args(
-            text, speaker_name, speaker_id, emotion, speed, language
-        )
-
-        conn = http.client.HTTPSConnection("app.coqui.ai")
-        payload = self._create_payload(self.model, text, speaker, speed, emotion, language)
-        url = self.MODEL_ENDPOINTS[self.model]["synthesize"]
-        conn.request("POST", url, json.dumps(payload), self.headers)
-        res = conn.getresponse()
-        data = res.read()
-        try:
-            wav, sr = self.url_to_np(json.loads(data)["audio_url"])
-        except KeyError as e:
-            raise ValueError(f" [!] 🐸 API returned error: {data}") from e
-        return wav, sr
-
-    def tts_to_file(
-        self,
-        text: str,
-        speaker_name: str,
-        speaker_id=None,
-        emotion=None,
-        speed=1.0,
-        pipe_out=None,
-        language=None,
-        file_path: str = None,
-    ) -> str:
-        """Synthesize speech from text and save it to a file.
-
-        Args:
-            text (str): Text to synthesize.
-            speaker_name (str): Name of the speaker. You can get the list of speakers with `list_speakers()` and
-                voices (user generated speakers) with `list_voices()`.
-            speaker_id (str): Speaker ID. If None, the speaker name is used.
-            emotion (str): Emotion of the speaker. One of "Neutral", "Happy", "Sad", "Angry", "Dull".
-            speed (float): Speed of the speech. 1.0 is normal speed.
-            pipe_out (BytesIO, optional): Flag to stdout the generated TTS wav file for shell pipe.
-            language (str): Language of the text. If None, the default language of the speaker is used. Language is only
-                supported by `XTTS` model. Currently supports en, de, es, fr, it, pt, pl. Defaults to "en".
-            file_path (str): Path to save the file. If None, a temporary file is created.
-        """
-        if file_path is None:
-            file_path = tempfile.mktemp(".wav")
-        wav, sr = self.tts(text, speaker_name, speaker_id, emotion, speed, language)
-        save_wav(wav=wav, path=file_path, sample_rate=sr, pipe_out=pipe_out)
-        return file_path
-
-
-if __name__ == "__main__":
-    import time
-
-    api = CS_API()
-    print(api.speakers)
-    print(api.list_speakers_as_tts_models())
-
-    ts = time.time()
-    wav, sr = api.tts(
-        "It took me quite a long time to develop a voice.", language="en", speaker_name=api.speakers[0].name
-    )
-    print(f" [i] XTTS took {time.time() - ts:.2f}s")
-
-    filepath = api.tts_to_file(
-        text="Hello world!", speaker_name=api.speakers[0].name, language="en", file_path="output.wav"
-    )
--- a/TTS/utils/manage.py
+++ b/TTS/utils/manage.py
@@ -68,28 +68,6 @@ class ModelManager(object):
        with open(file_path, "r", encoding="utf-8") as json_file:
            self.models_dict = json.load(json_file)

-    def add_cs_api_models(self, model_list: List[str]):
-        """Add list of Coqui Studio model names that are returned from the api
-
-        Each has the following format `<coqui_studio_model>/en/<speaker_name>/<coqui_studio_model>`
-        """
-
-        def _add_model(model_name: str):
-            if not "coqui_studio" in model_name:
-                return
-            model_type, lang, dataset, model = model_name.split("/")
-            if model_type not in self.models_dict:
-                self.models_dict[model_type] = {}
-            if lang not in self.models_dict[model_type]:
-                self.models_dict[model_type][lang] = {}
-            if dataset not in self.models_dict[model_type][lang]:
-                self.models_dict[model_type][lang][dataset] = {}
-            if model not in self.models_dict[model_type][lang][dataset]:
-                self.models_dict[model_type][lang][dataset][model] = {}
-
-        for model_name in model_list:
-            _add_model(model_name)
-
    def _list_models(self, model_type, model_count=0):
        if self.verbose:
            print("\n Name format: type/language/dataset/model")
--- a/docs/source/inference.md
+++ b/docs/source/inference.md
@@ -172,48 +172,6 @@ tts.tts_with_vc_to_file(
 )
 ```

-#### Example text to speech using [🐸Coqui Studio](https://coqui.ai) models.
-
-You can use all of your available speakers in the studio.
-[🐸Coqui Studio](https://coqui.ai) API token is required. You can get it from the [account page](https://coqui.ai/account).
-You should set the `COQUI_STUDIO_TOKEN` environment variable to use the API token.
-
-```python
-# If you have a valid API token set you will see the studio speakers as separate models in the list.
-# The name format is coqui_studio/en/<studio_speaker_name>/coqui_studio
-models = TTS().list_models()
-# Init TTS with the target studio speaker
-tts = TTS(model_name="coqui_studio/en/Torcull Diarmuid/coqui_studio", progress_bar=False)
-# Run TTS
-tts.tts_to_file(text="This is a test.", file_path=OUTPUT_PATH)
-# Run TTS with emotion and speed control
-tts.tts_to_file(text="This is a test.", file_path=OUTPUT_PATH, emotion="Happy", speed=1.5)
-```
-
-If you just need 🐸 Coqui Studio speakers, you can use `CS_API`. It is a wrapper around the 🐸 Coqui Studio API.
-
-```python
-from TTS.api import CS_API
-
-# Init 🐸 Coqui Studio API
-# you can either set the API token as an environment variable `COQUI_STUDIO_TOKEN` or pass it as an argument.
-
-# XTTS - Best quality and life-like speech in multiple languages. See https://docs.coqui.ai/reference/samples_xtts_create for supported languages.
-api = CS_API(api_token=<token>, model="XTTS")
-api.speakers  # all the speakers are available with all the models.
-api.list_speakers()
-api.list_voices()
-wav, sample_rate = api.tts(text="This is a test.", speaker=api.speakers[0].name, emotion="Happy", language="en", speed=1.5)
-
-# V1 - Fast and lightweight TTS in EN with emotion control.
-api = CS_API(api_token=<token>, model="V1")
-api.speakers
-api.emotions  # emotions are only for the V1 model.
-api.list_speakers()
-api.list_voices()
-wav, sample_rate = api.tts(text="This is a test.", speaker=api.speakers[0].name, emotion="Happy", speed=1.5)
-```
-
 #### Example text to speech using **Fairseq models in ~1100 languages** 🤯.
 For these models use the following name format: `tts_models/<lang-iso_code>/fairseq/vits`.

--- a/tests/api_tests/__init__.py
+++ b/tests/api_tests/__init__.py
--- a/tests/api_tests/test_python_api.py
+++ b/tests/api_tests/test_python_api.py
@@ -1,113 +0,0 @@
-import os
-import unittest
-
-from tests import get_tests_data_path, get_tests_output_path
-from TTS.api import CS_API, TTS
-
-OUTPUT_PATH = os.path.join(get_tests_output_path(), "test_python_api.wav")
-cloning_test_wav_path = os.path.join(get_tests_data_path(), "ljspeech/wavs/LJ001-0028.wav")
-
-
-is_coqui_available = os.environ.get("COQUI_STUDIO_TOKEN")
-
-
-if is_coqui_available:
-
-    class CS_APITest(unittest.TestCase):
-        def test_speakers(self):
-            tts = CS_API()
-            self.assertGreater(len(tts.speakers), 1)
-
-        def test_emotions(self):
-            tts = CS_API()
-            self.assertGreater(len(tts.emotions), 1)
-
-        def test_list_calls(self):
-            tts = CS_API()
-            self.assertGreater(len(tts.list_voices()), 1)
-            self.assertGreater(len(tts.list_speakers()), 1)
-            self.assertGreater(len(tts.list_all_speakers()), 1)
-            self.assertGreater(len(tts.list_speakers_as_tts_models()), 1)
-
-        def test_name_to_speaker(self):
-            tts = CS_API()
-            speaker_name = tts.list_speakers_as_tts_models()[0].split("/")[2]
-            speaker = tts.name_to_speaker(speaker_name)
-            self.assertEqual(speaker.name, speaker_name)
-
-        def test_tts(self):
-            tts = CS_API()
-            wav, sr = tts.tts(text="This is a test.", speaker_name=tts.list_speakers()[0].name)
-            self.assertEqual(sr, 44100)
-            self.assertGreater(len(wav), 1)
-
-    class TTSTest(unittest.TestCase):
-        def test_single_speaker_model(self):
-            tts = TTS(model_name="tts_models/de/thorsten/tacotron2-DDC", progress_bar=False, gpu=False)
-
-            error_raised = False
-            try:
-                tts.tts_to_file(text="Ich bin eine Testnachricht.", speaker="Thorsten", language="de")
-            except ValueError:
-                error_raised = True
-
-            tts.tts_to_file(text="Ich bin eine Testnachricht.", file_path=OUTPUT_PATH)
-
-            self.assertTrue(error_raised)
-            self.assertFalse(tts.is_multi_speaker)
-            self.assertFalse(tts.is_multi_lingual)
-            self.assertIsNone(tts.speakers)
-            self.assertIsNone(tts.languages)
-
-        def test_studio_model(self):
-            tts = TTS(model_name="coqui_studio/en/Zacharie Aimilios/coqui_studio")
-            tts.tts_to_file(text="This is a test.")
-
-            # check speed > 2.0 raises error
-            raised_error = False
-            try:
-                _ = tts.tts(text="This is a test.", speed=4.0, emotion="Sad")  # should raise error with speed > 2.0
-            except ValueError:
-                raised_error = True
-            self.assertTrue(raised_error)
-
-            # check emotion is invalid
-            raised_error = False
-            try:
-                _ = tts.tts(text="This is a test.", speed=2.0, emotion="No Emo")  # should raise error with speed > 2.0
-            except ValueError:
-                raised_error = True
-            self.assertTrue(raised_error)
-
-            # check valid call
-            wav = tts.tts(text="This is a test.", speed=2.0, emotion="Sad")
-            self.assertGreater(len(wav), 0)
-
-        def test_fairseq_model(self):  # pylint: disable=no-self-use
-            tts = TTS(model_name="tts_models/eng/fairseq/vits")
-            tts.tts_to_file(text="This is a test.")
-
-        def test_multi_speaker_multi_lingual_model(self):
-            tts = TTS()
-            tts.load_tts_model_by_name(tts.models[0])  # YourTTS
-            tts.tts_to_file(
-                text="Hello world!", speaker=tts.speakers[0], language=tts.languages[0], file_path=OUTPUT_PATH
-            )
-
-            self.assertTrue(tts.is_multi_speaker)
-            self.assertTrue(tts.is_multi_lingual)
-            self.assertGreater(len(tts.speakers), 1)
-            self.assertGreater(len(tts.languages), 1)
-
-        def test_voice_cloning(self):  # pylint: disable=no-self-use
-            tts = TTS()
-            tts.load_tts_model_by_name("tts_models/multilingual/multi-dataset/your_tts")
-            tts.tts_to_file("Hello world!", speaker_wav=cloning_test_wav_path, language="en", file_path=OUTPUT_PATH)
-
-        def test_voice_conversion(self):  # pylint: disable=no-self-use
-            tts = TTS(model_name="voice_conversion_models/multilingual/vctk/freevc24", progress_bar=False, gpu=False)
-            tts.voice_conversion_to_file(
-                source_wav=cloning_test_wav_path,
-                target_wav=cloning_test_wav_path,
-                file_path=OUTPUT_PATH,
-            )
--- a/tests/api_tests/test_synthesize_api.py
+++ b/tests/api_tests/test_synthesize_api.py
@@ -1,25 +0,0 @@
-import os
-
-from tests import get_tests_output_path, run_cli
-
-
-def test_synthesize():
-    """Test synthesize.py with diffent arguments."""
-    output_path = os.path.join(get_tests_output_path(), "output.wav")
-
-    # 🐸 Coqui studio model
-    run_cli(
-        'tts --model_name "coqui_studio/en/Torcull Diarmuid/coqui_studio" '
-        '--text "This is it" '
-        f'--out_path "{output_path}"'
-    )
-
-    # 🐸 Coqui studio model with speed arg.
-    run_cli(
-        'tts --model_name "coqui_studio/en/Torcull Diarmuid/coqui_studio" '
-        '--text "This is it but slow" --speed 0.1'
-        f'--out_path "{output_path}"'
-    )
-
-    # test pipe_out command
-    run_cli(f'tts --text "test." --pipe_out --out_path "{output_path}" | aplay')