Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

add cartesia synthesizer #17

Merged
merged 11 commits into from
Jun 12, 2024
3 changes: 3 additions & 0 deletions pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -40,6 +40,7 @@ miniaudio = { version = "^1.59", optional = true }
nltk = { version = "^3.8.1", optional = true }
pvkoala = { version = "^2.0.1", optional = true }
pydub = { version = "^0.25.1", optional = true }
cartesia = { version = "^0.1.1", optional = true }

# Transcribers
google-cloud-speech = { version = "^2.26.0", optional = true }
Expand Down Expand Up @@ -82,6 +83,7 @@ synthesizers = [
"nltk",
"pvkoala",
"pydub",
"cartesia",
]
transcribers = ["google-cloud-speech"]
telephony = ["twilio", "vonage"]
Expand All @@ -97,6 +99,7 @@ all = [
"vonage",
"langchain",
"langchain-community",
"cartesia"
]

[tool.mypy]
Expand Down
15 changes: 15 additions & 0 deletions vocode/streaming/models/synthesizer.py
Original file line number Diff line number Diff line change
Expand Up @@ -23,6 +23,7 @@ class SynthesizerType(str, Enum):
COQUI = "synthesizer_coqui"
BARK = "synthesizer_bark"
POLLY = "synthesizer_polly"
CARTESIA = "synthesizer_cartesia"


class SentimentConfig(BaseModel):
Expand Down Expand Up @@ -226,3 +227,17 @@ class PollySynthesizerConfig(SynthesizerConfig, type=SynthesizerType.POLLY.value
language_code: str = DEFAULT_POLLY_LANGUAGE_CODE
voice_id: str = DEFAULT_POLLY_VOICE_ID
sampling_rate: int = DEFAULT_POLLY_SAMPLING_RATE


# Default values used by CartesiaSynthesizerConfig for any field the caller
# does not supply explicitly.
DEFAULT_CARTESIA_MODEL_ID = 'upbeat-moon'
DEFAULT_CARTESIA_VOICE_ID = '5345cf08-6f37-424d-a5d9-8ae1101b9377'
DEFAULT_CARTESIA_AUDIO_ENCODING = AudioEncoding.LINEAR16
DEFAULT_CARTESIA_SAMPLING_RATE = SamplingRate.RATE_44100


class CartesiaSynthesizerConfig(SynthesizerConfig, type=SynthesizerType.CARTESIA.value): # type: ignore
    """Configuration model for the Cartesia synthesizer.

    Registered under ``SynthesizerType.CARTESIA``. Every field has a default,
    so the config can be constructed without arguments.
    """

    # Cartesia API key; None when the caller does not provide one.
    api_key: Optional[str] = None
    # Identifier of the Cartesia model to synthesize with.
    model_id: str = DEFAULT_CARTESIA_MODEL_ID
    # Identifier of the Cartesia voice to synthesize with.
    voice_id: str = DEFAULT_CARTESIA_VOICE_ID
    # Requested output audio encoding (LINEAR16 by default).
    audio_encoding: AudioEncoding = DEFAULT_CARTESIA_AUDIO_ENCODING
    # Requested output sampling rate (defaults to SamplingRate.RATE_44100).
    sampling_rate: int = DEFAULT_CARTESIA_SAMPLING_RATE
115 changes: 115 additions & 0 deletions vocode/streaming/synthesizer/cartesia_synthesizer.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,115 @@
import io
import wave
import hashlib

from vocode import getenv
from vocode.streaming.models.audio import AudioEncoding, SamplingRate
from vocode.streaming.models.message import BaseMessage
from vocode.streaming.models.synthesizer import CartesiaSynthesizerConfig
from vocode.streaming.synthesizer.base_synthesizer import BaseSynthesizer, SynthesisResult
from vocode.streaming.utils.create_task import asyncio_create_task_with_done_error_log


class CartesiaSynthesizer(BaseSynthesizer[CartesiaSynthesizerConfig]):
def __init__(
self,
synthesizer_config: CartesiaSynthesizerConfig,
):
super().__init__(synthesizer_config)

# Lazy import the cartesia module
try:
from cartesia.tts import AsyncCartesiaTTS
except ImportError as e:
raise ImportError(
f"Missing required dependancies for CartesiaSynthesizer"
) from e

self.cartesia_tts = AsyncCartesiaTTS

self.api_key = synthesizer_config.api_key or getenv("CARTESIA_API_KEY")
if not self.api_key:
raise ValueError("Missing Cartesia API key")

self.api_key = getenv("CARTESIA_API_KEY")

if synthesizer_config.audio_encoding == AudioEncoding.LINEAR16:
self.channel_width = 2
match synthesizer_config.sampling_rate:
case SamplingRate.RATE_44100:
self.sampling_rate = 44100
self.output_format = "pcm_44100"
case SamplingRate.RATE_22050:
self.sampling_rate = 22050
self.output_format = "pcm_22050"
case SamplingRate.RATE_16000:
self.sampling_rate = 16000
self.output_format = "pcm_16000"
case _:
raise ValueError(
f"Unsupported PCM sampling rate {synthesizer_config.sampling_rate}"
)
elif synthesizer_config.audio_encoding == AudioEncoding.MULAW:
# Cartesia has issues with MuLaw/8000. Use pcm/16000 and
# create_synthesis_result_from_wav will handle the conversion to mulaw/8000
self.channel_width = 2
self.output_format = "pcm_16000"
self.sampling_rate = 16000
else:
raise ValueError(
f"Unsupported audio encoding {synthesizer_config.audio_encoding}"
)

self.num_channels = 1
self.model_id = synthesizer_config.model_id
self.voice_id = synthesizer_config.voice_id
self.client = self.cartesia_tts(api_key=self.api_key)
self.voice_embedding = self.client.get_voice_embedding(voice_id=self.voice_id)


async def create_speech_uncached(
self,
message: BaseMessage,
chunk_size: int,
is_first_text_chunk: bool = False,
is_sole_text_chunk: bool = False,
) -> SynthesisResult:
generator = await self.client.generate(
transcript=message.text,
voice=self.voice_embedding,
stream=True,
model_id=self.model_id,
data_rtype='bytes',
output_format=self.output_format
)

audio_file = io.BytesIO()
with wave.open(audio_file, 'wb') as wav_file:
wav_file.setnchannels(self.num_channels)
wav_file.setsampwidth(self.channel_width)
wav_file.setframerate(self.sampling_rate)
async for chunk in generator:
wav_file.writeframes(chunk['audio'])
audio_file.seek(0)

result = self.create_synthesis_result_from_wav(
synthesizer_config=self.synthesizer_config,
file=audio_file,
message=message,
chunk_size=chunk_size,
)

return result

@classmethod
def get_voice_identifier(cls, synthesizer_config: CartesiaSynthesizerConfig):
hashed_api_key = hashlib.sha256(f"{synthesizer_config.api_key}".encode("utf-8")).hexdigest()
return ":".join(
(
"cartesia",
hashed_api_key,
str(synthesizer_config.voice_id),
str(synthesizer_config.model_id),
synthesizer_config.audio_encoding
)
)
4 changes: 4 additions & 0 deletions vocode/streaming/synthesizer/default_factory.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
from typing import Type
from vocode.streaming.models.synthesizer import (
AzureSynthesizerConfig,
CartesiaSynthesizerConfig,
ElevenLabsSynthesizerConfig,
PlayHtSynthesizerConfig,
RimeSynthesizerConfig,
Expand All @@ -10,6 +11,7 @@
from vocode.streaming.synthesizer.abstract_factory import AbstractSynthesizerFactory
from vocode.streaming.synthesizer.azure_synthesizer import AzureSynthesizer
from vocode.streaming.synthesizer.base_synthesizer import BaseSynthesizer
from vocode.streaming.synthesizer.cartesia_synthesizer import CartesiaSynthesizer
from vocode.streaming.synthesizer.eleven_labs_synthesizer import ElevenLabsSynthesizer
from vocode.streaming.synthesizer.eleven_labs_websocket_synthesizer import ElevenLabsWSSynthesizer
from vocode.streaming.synthesizer.play_ht_synthesizer import PlayHtSynthesizer
Expand All @@ -25,6 +27,8 @@ def create_synthesizer(
):
if isinstance(synthesizer_config, AzureSynthesizerConfig):
return AzureSynthesizer(synthesizer_config)
elif isinstance(synthesizer_config, CartesiaSynthesizerConfig):
return CartesiaSynthesizer(synthesizer_config)
elif isinstance(synthesizer_config, ElevenLabsSynthesizerConfig):
eleven_labs_synthesizer_class_type: Type[BaseSynthesizer] = ElevenLabsSynthesizer
if synthesizer_config.experimental_websocket:
Expand Down