
1. Create Conda Environment
# Create and enter an isolated environment pinned to the Python version this guide uses.
conda create -n coqui-ai-TTS python=3.10
conda activate coqui-ai-TTS
2. Install
- Use pretrained models only
pip install coqui-tts
- For training/development
# Editable install from source (idiap fork) — needed for training/development.
git clone https://github.com/idiap/coqui-ai-TTS
cd coqui-ai-TTS
# -e: changes to the checkout take effect without reinstalling.
pip install -e .
3. Models
- List available models
tts --list_models
- Change model storage location (temporary — applies only to the current shell session)
# Redirect model downloads for the current shell session only
# (lost when the shell exits — see the permanent setup below).
export XDG_DATA_HOME="/www/coqui/models"
export TTS_HOME="/www/coqui/models"
# Quote the expansion so the value survives word-splitting/globbing (ShellCheck SC2086).
echo "$XDG_DATA_HOME"
- Permanent setup (Linux/macOS)
# Open your shell's startup file (use the one your login shell actually reads).
nano ~/.bashrc # or ~/.zshrc
# add:
export XDG_DATA_HOME="/www/coqui/models"
export TTS_HOME="/www/coqui/models"
# Reload so the variables take effect in the current session too.
source ~/.bashrc # or . ~/.zshrc
4. Voice Cloning
Python API
import torch
from TTS.api import TTS

# Run on the GPU when one is present, otherwise fall back to CPU.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"

xtts = TTS("tts_models/multilingual/multi-dataset/xtts_v2").to(device)

# Clone voice: the reference audio in speaker_wav is associated with the
# name passed as `speaker`, so later calls can refer to it by name only.
xtts.tts_to_file(
    text="Hello world",
    speaker_wav=["my/cloning/audio.wav"],
    speaker="MySpeaker1",
    language="en",
)

# Reuse cloned voice — no reference audio needed this time.
xtts.tts_to_file(
    text="Hello again",
    speaker="MySpeaker1",
    language="en",
)
CLI
# Clone voice
# Clone voice
# --speaker_wav supplies the reference audio; --speaker_idx names the clone
# so the second command below can reuse it without the wav file.
tts --model_name "tts_models/multilingual/multi-dataset/xtts_v2" \
--text "你好世界" \
--language_idx "zh" \
--speaker_wav "my/cloning/audio.wav" \
--speaker_idx "MySpeaker1"
# Reuse cloned voice
tts --model_name "tts_models/multilingual/multi-dataset/xtts_v2" \
--text "你好世界" \
--language_idx "zh" \
--speaker_idx "MySpeaker1"
⚠️ For Chinese text input, first install the pinyin converter:
pip install pypinyin
5. Voice Conversion
Python API
# Voice conversion: re-render the speech in source_wav using the voice of target_wav.
import torch
from TTS.api import TTS

# Select the device dynamically instead of hard-coding "cuda",
# which raises on CPU-only machines (matches the XTTS example above).
device = "cuda" if torch.cuda.is_available() else "cpu"

tts = TTS("voice_conversion_models/multilingual/vctk/freevc24").to(device)
tts.voice_conversion_to_file(
    source_wav="my/source.wav",  # speech content to convert
    target_wav="my/target.wav",  # reference voice to imitate
    file_path="output.wav",
)
CLI
# Convert source.wav to sound like target.wav using the OpenVoice v2 model.
tts --model_name "voice_conversion_models/multilingual/multi-dataset/openvoice_v2" \
--source_wav "source.wav" \
--target_wav "target.wav" \
--out_path "output.wav"