ajar98 · ajar98 · Jun 12, 2024 · Jun 8, 2024 · Jun 8, 2024 · Jun 8, 2024
diff --git a/pyproject.toml b/pyproject.toml
@@ -40,6 +40,7 @@ miniaudio = { version = "^1.59", optional = true }
 nltk = { version = "^3.8.1", optional = true }
 pvkoala = { version = "^2.0.1", optional = true }
 pydub = { version = "^0.25.1", optional = true }
+cartesia = { version = "^0.1.1", optional = true }
 
 # Transcribers
 google-cloud-speech = { version = "^2.26.0", optional = true }
@@ -82,6 +83,7 @@ synthesizers = [
     "nltk",
     "pvkoala",
     "pydub",
+    "cartesia",
 ]
 transcribers = ["google-cloud-speech"]
 telephony = ["twilio", "vonage"]
@@ -97,6 +99,7 @@ all = [
     "vonage",
     "langchain",
     "langchain-community",
+    "cartesia"
 ]
 
 [tool.mypy]

diff --git a/vocode/streaming/models/synthesizer.py b/vocode/streaming/models/synthesizer.py
@@ -23,6 +23,7 @@ class SynthesizerType(str, Enum):
     COQUI = "synthesizer_coqui"
     BARK = "synthesizer_bark"
     POLLY = "synthesizer_polly"
+    CARTESIA = "synthesizer_cartesia"
 
 
 class SentimentConfig(BaseModel):
@@ -226,3 +227,17 @@ class PollySynthesizerConfig(SynthesizerConfig, type=SynthesizerType.POLLY.value
     language_code: str = DEFAULT_POLLY_LANGUAGE_CODE
     voice_id: str = DEFAULT_POLLY_VOICE_ID
     sampling_rate: int = DEFAULT_POLLY_SAMPLING_RATE
+
+
+DEFAULT_CARTESIA_MODEL_ID = 'upbeat-moon'
+DEFAULT_CARTESIA_VOICE_ID = '5345cf08-6f37-424d-a5d9-8ae1101b9377'
+DEFAULT_CARTESIA_AUDIO_ENCODING = AudioEncoding.LINEAR16
+DEFAULT_CARTESIA_SAMPLING_RATE = SamplingRate.RATE_44100
+
+
+class CartesiaSynthesizerConfig(SynthesizerConfig, type=SynthesizerType.CARTESIA.value):  # type: ignore
+    api_key: Optional[str] = None
+    model_id: str = DEFAULT_CARTESIA_MODEL_ID
+    voice_id: str = DEFAULT_CARTESIA_VOICE_ID
+    audio_encoding: AudioEncoding = DEFAULT_CARTESIA_AUDIO_ENCODING
+    sampling_rate: int = DEFAULT_CARTESIA_SAMPLING_RATE
diff --git a/vocode/streaming/synthesizer/cartesia_synthesizer.py b/vocode/streaming/synthesizer/cartesia_synthesizer.py
@@ -0,0 +1,115 @@
+import io
+import wave
+import hashlib
+
+from vocode import getenv
+from vocode.streaming.models.audio import AudioEncoding, SamplingRate
+from vocode.streaming.models.message import BaseMessage
+from vocode.streaming.models.synthesizer import CartesiaSynthesizerConfig
+from vocode.streaming.synthesizer.base_synthesizer import BaseSynthesizer, SynthesisResult
+from vocode.streaming.utils.create_task import asyncio_create_task_with_done_error_log
+
+
+class CartesiaSynthesizer(BaseSynthesizer[CartesiaSynthesizerConfig]):
+    def __init__(
+        self,
+        synthesizer_config: CartesiaSynthesizerConfig,
+    ):
+        super().__init__(synthesizer_config)
+
+        # Lazy import the cartesia module
+        try:
+            from cartesia.tts import AsyncCartesiaTTS
+        except ImportError as e:
+            raise ImportError(
+                f"Missing required dependancies for CartesiaSynthesizer"
+            ) from e
+
+        self.cartesia_tts = AsyncCartesiaTTS
+
+        self.api_key = synthesizer_config.api_key or getenv("CARTESIA_API_KEY")
+        if not self.api_key:
+            raise ValueError("Missing Cartesia API key")
+
+        self.api_key = getenv("CARTESIA_API_KEY")
+
+        if synthesizer_config.audio_encoding == AudioEncoding.LINEAR16:
+            self.channel_width = 2
+            match synthesizer_config.sampling_rate:
+                case SamplingRate.RATE_44100:
+                    self.sampling_rate = 44100
+                    self.output_format = "pcm_44100"
+                case SamplingRate.RATE_22050:
+                    self.sampling_rate = 22050
+                    self.output_format = "pcm_22050"
+                case SamplingRate.RATE_16000:
+                    self.sampling_rate = 16000
+                    self.output_format = "pcm_16000"
+                case _:
+                    raise ValueError(
+                        f"Unsupported PCM sampling rate {synthesizer_config.sampling_rate}"
+                    )
+        elif synthesizer_config.audio_encoding == AudioEncoding.MULAW:
+            # Cartesia has issues with MuLaw/8000. Use pcm/16000 and
+            # create_synthesis_result_from_wav will handle the conversion to mulaw/8000
+            self.channel_width = 2
+            self.output_format = "pcm_16000"
+            self.sampling_rate = 16000
+        else:
+            raise ValueError(
+                f"Unsupported audio encoding {synthesizer_config.audio_encoding}"
+            )
+
+        self.num_channels = 1
+        self.model_id = synthesizer_config.model_id
+        self.voice_id = synthesizer_config.voice_id
+        self.client = self.cartesia_tts(api_key=self.api_key)
+        self.voice_embedding = self.client.get_voice_embedding(voice_id=self.voice_id)
+
+
+    async def create_speech_uncached(
+        self,
+        message: BaseMessage,
+        chunk_size: int,
+        is_first_text_chunk: bool = False,
+        is_sole_text_chunk: bool = False,
+    ) -> SynthesisResult:
+        generator = await self.client.generate(
+            transcript=message.text,
+            voice=self.voice_embedding,
+            stream=True,
+            model_id=self.model_id,
+            data_rtype='bytes',
+            output_format=self.output_format
+        )
+
+        audio_file = io.BytesIO()
+        with wave.open(audio_file, 'wb') as wav_file:
+            wav_file.setnchannels(self.num_channels)
+            wav_file.setsampwidth(self.channel_width)
+            wav_file.setframerate(self.sampling_rate)
+            async for chunk in generator:
+                wav_file.writeframes(chunk['audio'])
+        audio_file.seek(0)
+
+        result = self.create_synthesis_result_from_wav(
+            synthesizer_config=self.synthesizer_config,
+            file=audio_file,
+            message=message,
+            chunk_size=chunk_size,
+        )
+
+        return result
+
+    @classmethod
+    def get_voice_identifier(cls, synthesizer_config: CartesiaSynthesizerConfig):
+        hashed_api_key = hashlib.sha256(f"{synthesizer_config.api_key}".encode("utf-8")).hexdigest()
+        return ":".join(
+            (
+                "cartesia",
+                hashed_api_key,
+                str(synthesizer_config.voice_id),
+                str(synthesizer_config.model_id),
+                synthesizer_config.audio_encoding
+            )
+        )
diff --git a/vocode/streaming/synthesizer/default_factory.py b/vocode/streaming/synthesizer/default_factory.py
@@ -1,6 +1,7 @@
 from typing import Type
 from vocode.streaming.models.synthesizer import (
     AzureSynthesizerConfig,
+    CartesiaSynthesizerConfig,
     ElevenLabsSynthesizerConfig,
     PlayHtSynthesizerConfig,
     RimeSynthesizerConfig,
@@ -10,6 +11,7 @@
 from vocode.streaming.synthesizer.abstract_factory import AbstractSynthesizerFactory
 from vocode.streaming.synthesizer.azure_synthesizer import AzureSynthesizer
 from vocode.streaming.synthesizer.base_synthesizer import BaseSynthesizer
+from vocode.streaming.synthesizer.cartesia_synthesizer import CartesiaSynthesizer
 from vocode.streaming.synthesizer.eleven_labs_synthesizer import ElevenLabsSynthesizer
 from vocode.streaming.synthesizer.eleven_labs_websocket_synthesizer import ElevenLabsWSSynthesizer
 from vocode.streaming.synthesizer.play_ht_synthesizer import PlayHtSynthesizer
@@ -25,6 +27,8 @@ def create_synthesizer(
     ):
         if isinstance(synthesizer_config, AzureSynthesizerConfig):
             return AzureSynthesizer(synthesizer_config)
+        elif isinstance(synthesizer_config, CartesiaSynthesizerConfig):
+            return CartesiaSynthesizer(synthesizer_config)
         elif isinstance(synthesizer_config, ElevenLabsSynthesizerConfig):
             eleven_labs_synthesizer_class_type: Type[BaseSynthesizer] = ElevenLabsSynthesizer
             if synthesizer_config.experimental_websocket: