ajar98 · ajar98 · Jun 12, 2024 · Jun 8, 2024 · Jun 8, 2024 · Jun 8, 2024
diff --git a/pyproject.toml b/pyproject.toml
@@ -40,6 +40,7 @@ miniaudio = { version = "^1.59", optional = true }
 nltk = { version = "^3.8.1", optional = true }
 pvkoala = { version = "^2.0.1", optional = true }
 pydub = { version = "^0.25.1", optional = true }
+cartesia = { version = "^0.1.1", optional = true }
 
 # Transcribers
 google-cloud-speech = { version = "^2.26.0", optional = true }
@@ -82,6 +83,7 @@ synthesizers = [
     "nltk",
     "pvkoala",
     "pydub",
+    "cartesia",
 ]
 transcribers = ["google-cloud-speech"]
 telephony = ["twilio", "vonage"]
@@ -97,6 +99,7 @@ all = [
     "vonage",
     "langchain",
     "langchain-community",
+    "cartesia"
 ]
 
 [tool.mypy]

diff --git a/vocode/streaming/models/synthesizer.py b/vocode/streaming/models/synthesizer.py
@@ -23,6 +23,7 @@ class SynthesizerType(str, Enum):
     COQUI = "synthesizer_coqui"
     BARK = "synthesizer_bark"
     POLLY = "synthesizer_polly"
+    CARTESIA = "synthesizer_cartesia"
 
 
 class SentimentConfig(BaseModel):
@@ -226,3 +227,17 @@ class PollySynthesizerConfig(SynthesizerConfig, type=SynthesizerType.POLLY.value
     language_code: str = DEFAULT_POLLY_LANGUAGE_CODE
     voice_id: str = DEFAULT_POLLY_VOICE_ID
     sampling_rate: int = DEFAULT_POLLY_SAMPLING_RATE
+
+
+DEFAULT_CARTESIA_MODEL_ID = 'upbeat-moon'  # default for CartesiaSynthesizerConfig.model_id
+DEFAULT_CARTESIA_VOICE_ID = '5345cf08-6f37-424d-a5d9-8ae1101b9377'  # default voice_id
+DEFAULT_CARTESIA_OUTPUT_FORMAT = 'pcm'  # default output_format
+DEFAULT_CARTESIA_SAMPLING_RATE = SamplingRate.RATE_44100  # NOTE(review): enum member, field is typed int — confirm
+
+
+class CartesiaSynthesizerConfig(SynthesizerConfig, type=SynthesizerType.CARTESIA.value):  # type: ignore
+    api_key: Optional[str] = None  # when unset, CartesiaSynthesizer reads CARTESIA_API_KEY
+    model_id: str = DEFAULT_CARTESIA_MODEL_ID  # passed as `model_id` to the client's generate call
+    voice_id: str = DEFAULT_CARTESIA_VOICE_ID  # used to look up the voice embedding
+    output_format: str = DEFAULT_CARTESIA_OUTPUT_FORMAT  # passed as `output_format` to generate
+    sampling_rate: int = DEFAULT_CARTESIA_SAMPLING_RATE  # frame rate of the WAV the synthesizer builds
diff --git a/vocode/streaming/synthesizer/cartesia_synthesizer.py b/vocode/streaming/synthesizer/cartesia_synthesizer.py
@@ -0,0 +1,73 @@
+import io
+import wave
+
+from vocode import getenv
+from vocode.streaming.models.message import BaseMessage
+from vocode.streaming.models.synthesizer import CartesiaSynthesizerConfig
+from vocode.streaming.synthesizer.base_synthesizer import BaseSynthesizer, SynthesisResult
+
+
+class CartesiaSynthesizer(BaseSynthesizer[CartesiaSynthesizerConfig]):
+    """Synthesizer that turns messages into speech via the Cartesia TTS client."""
+
+    def __init__(self, synthesizer_config: CartesiaSynthesizerConfig):
+        """Build the Cartesia client and resolve the configured voice.
+
+        Raises:
+            ImportError: if the `cartesia` package cannot be imported.
+            ValueError: if neither the config nor CARTESIA_API_KEY supplies a key.
+        """
+        super().__init__(synthesizer_config)
+
+        # Lazy import the cartesia module
+        try:
+            from cartesia.tts import AsyncCartesiaTTS
+        except ImportError as e:
+            raise ImportError("Missing required dependancies for CartesiaSynthesizer") from e
+
+        # Config key wins; CARTESIA_API_KEY is only the fallback (do not overwrite it below).
+        self.api_key = synthesizer_config.api_key or getenv("CARTESIA_API_KEY")
+        if not self.api_key:
+            raise ValueError("Missing Cartesia API key")
+
+        self.cartesia_tts = AsyncCartesiaTTS
+        self.model_id = synthesizer_config.model_id
+        self.voice_id = synthesizer_config.voice_id
+        self.sampling_rate = synthesizer_config.sampling_rate
+        self.output_format = synthesizer_config.output_format
+        self.client = self.cartesia_tts(api_key=self.api_key)
+        self.voice_embedding = self.client.get_voice_embedding(voice_id=self.voice_id)
+
+    async def create_speech(
+        self,
+        message: BaseMessage,
+        chunk_size: int,
+        is_first_text_chunk: bool = False,
+        is_sole_text_chunk: bool = False,
+    ) -> SynthesisResult:
+        """Synthesize `message.text` with Cartesia and return it as a chunked SynthesisResult."""
+        generator = await self.client.generate(
+            transcript=message.text,
+            voice=self.voice_embedding,
+            stream=True,
+            model_id=self.model_id,
+            data_rtype="bytes",
+            output_format=self.output_format,
+        )
+
+        # Buffer the streamed audio into an in-memory mono WAV before chunking it.
+        audio_file = io.BytesIO()
+        with wave.open(audio_file, "wb") as wav_file:
+            wav_file.setnchannels(1)
+            wav_file.setsampwidth(2)  # 2 bytes per sample, i.e. 16-bit audio
+            wav_file.setframerate(self.sampling_rate)
+            async for chunk in generator:
+                wav_file.writeframes(chunk["audio"])
+        audio_file.seek(0)
+
+        return self.create_synthesis_result_from_wav(
+            synthesizer_config=self.synthesizer_config,
+            file=audio_file,
+            message=message,
+            chunk_size=chunk_size,
+        )
diff --git a/vocode/streaming/synthesizer/default_factory.py b/vocode/streaming/synthesizer/default_factory.py
@@ -1,6 +1,7 @@
 from typing import Type
 from vocode.streaming.models.synthesizer import (
     AzureSynthesizerConfig,
+    CartesiaSynthesizerConfig,
     ElevenLabsSynthesizerConfig,
     PlayHtSynthesizerConfig,
     RimeSynthesizerConfig,
@@ -10,6 +11,7 @@
 from vocode.streaming.synthesizer.abstract_factory import AbstractSynthesizerFactory
 from vocode.streaming.synthesizer.azure_synthesizer import AzureSynthesizer
 from vocode.streaming.synthesizer.base_synthesizer import BaseSynthesizer
+from vocode.streaming.synthesizer.cartesia_synthesizer import CartesiaSynthesizer
 from vocode.streaming.synthesizer.eleven_labs_synthesizer import ElevenLabsSynthesizer
 from vocode.streaming.synthesizer.eleven_labs_websocket_synthesizer import ElevenLabsWSSynthesizer
 from vocode.streaming.synthesizer.play_ht_synthesizer import PlayHtSynthesizer
@@ -25,6 +27,8 @@ def create_synthesizer(
     ):
         if isinstance(synthesizer_config, AzureSynthesizerConfig):
             return AzureSynthesizer(synthesizer_config)
+        elif isinstance(synthesizer_config, CartesiaSynthesizerConfig):
+            return CartesiaSynthesizer(synthesizer_config)
         elif isinstance(synthesizer_config, ElevenLabsSynthesizerConfig):
             eleven_labs_synthesizer_class_type: Type[BaseSynthesizer] = ElevenLabsSynthesizer
             if synthesizer_config.experimental_websocket: