From 54ca2db3bb2962f8c7cf85f9338a5089b2a80b3f Mon Sep 17 00:00:00 2001 From: Navicstein Date: Wed, 12 Jun 2024 00:39:40 +0100 Subject: [PATCH 1/2] feat: add cartesia synthesizer --- poetry.lock | 97 +++++++++++-------- pyproject.toml | 2 + vocode/streaming/models/synthesizer.py | 35 +++++-- .../synthesizer/cartesia_synthesizer.py | 94 ++++++++++++++++++ .../streaming/synthesizer/default_factory.py | 3 + 5 files changed, 183 insertions(+), 48 deletions(-) create mode 100644 vocode/streaming/synthesizer/cartesia_synthesizer.py diff --git a/poetry.lock b/poetry.lock index e9786cffe..ce592237f 100644 --- a/poetry.lock +++ b/poetry.lock @@ -153,7 +153,7 @@ files = [ name = "anthropic" version = "0.28.0" description = "The official Python library for the anthropic API" -optional = true +optional = false python-versions = ">=3.7" files = [ {file = "anthropic-0.28.0-py3-none-any.whl", hash = "sha256:2b620b21aee3d20c5d8005483c34df239d53ae895687113b26b8a36892a7e20f"}, @@ -199,7 +199,7 @@ trio = ["trio (>=0.23)"] name = "asttokens" version = "2.4.1" description = "Annotate AST trees with source code positions" -optional = true +optional = false python-versions = "*" files = [ {file = "asttokens-2.4.1-py2.py3-none-any.whl", hash = "sha256:051ed49c3dcae8913ea7cd08e46a606dba30b79993209636c4875bc1d637bc24"}, @@ -247,7 +247,7 @@ tests-no-zope = ["attrs[tests-mypy]", "cloudpickle", "hypothesis", "pympler", "p name = "azure-cognitiveservices-speech" version = "1.37.0" description = "Microsoft Cognitive Services Speech SDK for Python" -optional = true +optional = false python-versions = ">=3.7" files = [ {file = "azure_cognitiveservices_speech-1.37.0-py3-none-macosx_10_14_x86_64.whl", hash = "sha256:906d6bf65176e93464e2d763dd074ca00c48cfe1a896371fcdcb155a500910f7"}, @@ -315,6 +315,28 @@ files = [ {file = "cachetools-5.3.3.tar.gz", hash = "sha256:ba29e2dfa0b8b556606f097407ed1aa62080ee108ab0dc5ec9d6a723a007d105"}, ] +[[package]] +name = "cartesia" +version = "0.1.1" +description = "The official Python library for the Cartesia API." 
+optional = true +python-versions = ">=3.8.0" +files = [ + {file = "cartesia-0.1.1-py2.py3-none-any.whl", hash = "sha256:7a7365f17e220247ee2af1efdb88e69b0aa332e390c85775bf356b5e7b882498"}, + {file = "cartesia-0.1.1.tar.gz", hash = "sha256:c584770f4698e6dc826a75b7b5fd39bfce749c88ad9786dca46edd9527710002"}, +] + +[package.dependencies] +aiohttp = "*" +httpx = "*" +pytest-asyncio = "*" +requests = "*" +websockets = "*" + +[package.extras] +all = ["numpy", "pytest (>=8.0.2)", "pytest-cov (>=4.1.0)", "setuptools", "twine", "wheel"] +dev = ["numpy", "pytest (>=8.0.2)", "pytest-cov (>=4.1.0)", "setuptools", "twine", "wheel"] + [[package]] name = "certifi" version = "2024.6.2" @@ -654,7 +676,7 @@ typing-inspect = ">=0.4.0,<1" name = "decorator" version = "5.1.1" description = "Decorators for Humans" -optional = true +optional = false python-versions = ">=3.5" files = [ {file = "decorator-5.1.1-py3-none-any.whl", hash = "sha256:b8c3f85900b9dc423225913c5aace94729fe1fa9763b38939a95226f02d37186"}, @@ -682,7 +704,7 @@ dev = ["PyTest", "PyTest-Cov", "bump2version (<1)", "sphinx (<2)", "tox"] name = "distro" version = "1.9.0" description = "Distro - an OS platform information API" -optional = true +optional = false python-versions = ">=3.6" files = [ {file = "distro-1.9.0-py3-none-any.whl", hash = "sha256:7bffd925d65168f85027d8da9af6bddab658135b840670a223589bc0c8ef02b2"}, @@ -713,7 +735,7 @@ wmi = ["wmi (>=1.5.1)"] name = "elevenlabs" version = "1.2.2" description = "" -optional = true +optional = false python-versions = "<4.0,>=3.8" files = [ {file = "elevenlabs-1.2.2-py3-none-any.whl", hash = "sha256:60b92b0e2aabdfba93a43569f207f8a2ad397492519b8e11a2eebb32807ddefa"}, @@ -761,7 +783,7 @@ test = ["pytest (>=6)"] name = "executing" version = "2.0.1" description = "Get the currently executing AST node of a frame, and other information" -optional = true +optional = false python-versions = ">=3.5" files = [ {file = "executing-2.0.1-py2.py3-none-any.whl", hash = "sha256:eac49ca94516ccc753f9fb5ce82603156e590b27525a8bc32cce8ae302eb61bc"}, @@ -842,7 +864,7 @@ standard = ["fastapi", "uvicorn[standard] (>=0.15.0)"] name = "filelock" version = "3.14.0" description = "A platform independent file lock." 
-optional = true +optional = false python-versions = ">=3.8" files = [ {file = "filelock-3.14.0-py3-none-any.whl", hash = "sha256:43339835842f110ca7ae60f1e1c160714c5a6afd15a2873419ab185334975c0f"}, @@ -944,7 +966,7 @@ files = [ name = "fsspec" version = "2024.6.0" description = "File-system specification" -optional = true +optional = false python-versions = ">=3.8" files = [ {file = "fsspec-2024.6.0-py3-none-any.whl", hash = "sha256:58d7122eb8a1a46f7f13453187bfea4972d66bf01618d37366521b1998034cee"}, @@ -1159,7 +1181,7 @@ test = ["objgraph", "psutil"] name = "grpcio" version = "1.64.1" description = "HTTP/2-based RPC framework" -optional = true +optional = false python-versions = ">=3.8" files = [ {file = "grpcio-1.64.1-cp310-cp310-linux_armv7l.whl", hash = "sha256:55697ecec192bc3f2f3cc13a295ab670f51de29884ca9ae6cd6247df55df2502"}, @@ -1337,7 +1359,7 @@ socks = ["socksio (==1.*)"] name = "huggingface-hub" version = "0.23.3" description = "Client library to download and publish models, datasets and other repos on the huggingface.co hub" -optional = true +optional = false python-versions = ">=3.8.0" files = [ {file = "huggingface_hub-0.23.3-py3-none-any.whl", hash = "sha256:22222c41223f1b7c209ae5511d2d82907325a0e3cdbce5f66949d43c598ff3bc"}, @@ -1412,7 +1434,7 @@ files = [ name = "ipython" version = "8.25.0" description = "IPython: Productive Interactive Computing" -optional = true +optional = false python-versions = ">=3.10" files = [ {file = "ipython-8.25.0-py3-none-any.whl", hash = "sha256:53eee7ad44df903a06655871cbab66d156a051fd86f3ec6750470ac9604ac1ab"}, @@ -1478,7 +1500,7 @@ typing-extensions = ">=3.7.4.3" name = "jedi" version = "0.19.1" description = "An autocompletion tool for Python that can be used for text editors." -optional = true +optional = false python-versions = ">=3.6" files = [ {file = "jedi-0.19.1-py2.py3-none-any.whl", hash = "sha256:e983c654fe5c02867aef4cdfce5a2fbb4a50adc0af145f70504238f18ef5e7e0"}, @@ -1514,7 +1536,7 @@ i18n = ["Babel (>=2.7)"] name = "jiter" version = "0.4.1" description = "Fast iterable JSON parser." -optional = true +optional = false python-versions = ">=3.8" files = [ {file = "jiter-0.4.1-cp310-cp310-macosx_10_12_x86_64.whl", hash = "sha256:3c2370cd8826b484f3fc6ed729cb58510ba24b4bc277c92323a57d35cf4df223"}, @@ -1905,7 +1927,7 @@ tests = ["pytest", "pytz", "simplejson"] name = "matplotlib-inline" version = "0.1.7" description = "Inline Matplotlib backend for Jupyter" -optional = true +optional = false python-versions = ">=3.8" files = [ {file = "matplotlib_inline-0.1.7-py3-none-any.whl", hash = "sha256:df192d39a4ff8f21b1895d72e6a13f5fcc5099f00fa84384e0ea28c2cc0653ca"}, @@ -2221,7 +2243,7 @@ files = [ name = "openai" version = "1.31.0" description = "The official Python library for the openai API" -optional = true +optional = false python-versions = ">=3.7.1" files = [ {file = "openai-1.31.0-py3-none-any.whl", hash = "sha256:82044ee3122113f2a468a1f308a8882324d09556ba5348687c535d3655ee331c"}, @@ -2355,7 +2377,7 @@ files = [ name = "parso" version = "0.8.4" description = "A Python Parser" -optional = true +optional = false python-versions = ">=3.6" files = [ {file = "parso-0.8.4-py2.py3-none-any.whl", hash = "sha256:a418670a20291dacd2dddc80c377c5c3791378ee1e8d12bffc35420643d43f18"}, @@ -2381,7 +2403,7 @@ files = [ name = "pexpect" version = "4.9.0" description = "Pexpect allows easy control of interactive console applications." 
-optional = true +optional = false python-versions = "*" files = [ {file = "pexpect-4.9.0-py2.py3-none-any.whl", hash = "sha256:7236d1e080e4936be2dc3e326cec0af72acf9212a7e1d060210e70a47e253523"}, @@ -2437,7 +2459,7 @@ testing = ["pytest", "pytest-benchmark"] name = "prompt-toolkit" version = "3.0.46" description = "Library for building powerful interactive command lines in Python" -optional = true +optional = false python-versions = ">=3.7.0" files = [ {file = "prompt_toolkit-3.0.46-py3-none-any.whl", hash = "sha256:45abe60a8300f3c618b23c16c4bb98c6fc80af8ce8b17c7ae92db48db3ee63c1"}, @@ -2468,7 +2490,7 @@ testing = ["google-api-core[grpc] (>=1.31.5)"] name = "protobuf" version = "4.25.3" description = "" -optional = true +optional = false python-versions = ">=3.8" files = [ {file = "protobuf-4.25.3-cp310-abi3-win32.whl", hash = "sha256:d4198877797a83cbfe9bffa3803602bbe1625dc30d8a097365dbc762e5790faa"}, @@ -2488,7 +2510,7 @@ files = [ name = "ptyprocess" version = "0.7.0" description = "Run a subprocess in a pseudo terminal" -optional = true +optional = false python-versions = "*" files = [ {file = "ptyprocess-0.7.0-py2.py3-none-any.whl", hash = "sha256:4b41f3967fce3af57cc7e94b888626c18bf37a083e3651ca8feeb66d492fef35"}, @@ -2499,7 +2521,7 @@ files = [ name = "pure-eval" version = "0.2.2" description = "Safely evaluate AST nodes without side effects" -optional = true +optional = false python-versions = "*" files = [ {file = "pure_eval-0.2.2-py3-none-any.whl", hash = "sha256:01eaab343580944bc56080ebe0a674b39ec44a945e6d09ba7db3cb8cec289350"}, @@ -2728,7 +2750,7 @@ windows-terminal = ["colorama (>=0.4.6)"] name = "pyht" version = "0.0.28" description = "" -optional = true +optional = false python-versions = "<4.0,>=3.8" files = [ {file = "pyht-0.0.28-py3-none-any.whl", hash = "sha256:ad8801acaa906eff5d6b39ce5ca76a08c154f705a2d0b2b6e841b219ef7875be"}, @@ -3008,7 +3030,7 @@ rpds-py = ">=0.7.0" name = "regex" version = "2024.5.15" description = "Alternative regular expression module, to replace re." 
-optional = true +optional = false python-versions = ">=3.8" files = [ {file = "regex-2024.5.15-cp310-cp310-macosx_10_9_universal2.whl", hash = "sha256:a81e3cfbae20378d75185171587cbf756015ccb14840702944f014e0d93ea09f"}, @@ -3257,7 +3279,7 @@ pyasn1 = ">=0.1.3" name = "sentry-sdk" version = "2.4.0" description = "Python client for Sentry (https://sentry.io)" -optional = true +optional = false python-versions = ">=3.6" files = [ {file = "sentry_sdk-2.4.0-py2.py3-none-any.whl", hash = "sha256:a42b70981cd4ed7da3c85d0360502d2ac932a15a4a420b360e1ebded2fc19a92"}, @@ -3319,7 +3341,7 @@ files = [ name = "six" version = "1.16.0" description = "Python 2 and 3 compatibility utilities" -optional = true +optional = false python-versions = ">=2.7, !=3.0.*, !=3.1.*, !=3.2.*" files = [ {file = "six-1.16.0-py2.py3-none-any.whl", hash = "sha256:8abb2f1d86890a2dfb989f9a77cfcfd3e47c2a354b01111771326f8aa26e0254"}, @@ -3459,7 +3481,7 @@ sqlcipher = ["sqlcipher3_binary"] name = "stack-data" version = "0.6.3" description = "Extract data from python stack frames and tracebacks for informative displays" -optional = true +optional = false python-versions = "*" files = [ {file = "stack_data-0.6.3-py3-none-any.whl", hash = "sha256:d5558e0c25a4cb0853cddad3d77da9891a08cb85dd9f9f91b9f8cd66e511e695"}, @@ -3510,7 +3532,7 @@ test = ["pytest", "tornado (>=4.5)", "typeguard"] name = "tiktoken" version = "0.7.0" description = "tiktoken is a fast BPE tokeniser for use with OpenAI's models" -optional = true +optional = false python-versions = ">=3.8" files = [ {file = "tiktoken-0.7.0-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:485f3cc6aba7c6b6ce388ba634fbba656d9ee27f766216f45146beb4ac18b25f"}, @@ -3562,7 +3584,7 @@ blobfile = ["blobfile (>=2)"] name = "tokenizers" version = "0.19.1" description = "" -optional = true +optional = false python-versions = ">=3.7" files = [ {file = "tokenizers-0.19.1-cp310-cp310-macosx_10_12_x86_64.whl", hash = "sha256:952078130b3d101e05ecfc7fc3640282d74ed26bcf691400f872563fca15ac97"}, @@ -3690,7 +3712,7 @@ files = [ name = "tqdm" version = "4.66.4" description = "Fast, Extensible Progress Meter" -optional = true +optional = false python-versions = ">=3.7" files = [ {file = "tqdm-4.66.4-py3-none-any.whl", hash = "sha256:b75ca56b413b030bc3f00af51fd2c1a1a5eac6a0c1cca83cbb37a5c52abce644"}, @@ -3710,7 +3732,7 @@ telegram = ["requests"] name = "traitlets" version = "5.14.3" description = "Traitlets Python configuration system" -optional = true +optional = false python-versions = ">=3.8" files = [ {file = "traitlets-5.14.3-py3-none-any.whl", hash = "sha256:b74e89e397b1ed28cc831db7aea759ba6640cb3de13090ca145426688ff1ac4f"}, @@ -4174,7 +4196,7 @@ anyio = ">=3.0.0" name = "wcwidth" version = "0.2.13" description = "Measures the displayed width of unicode strings in a terminal" -optional = true +optional = false python-versions = "*" files = [ {file = "wcwidth-0.2.13-py2.py3-none-any.whl", hash = "sha256:3da69048e4540d84af32131829ff948f1e022c1c6bdb8d6102117aac784f6859"}, @@ -4474,14 +4496,13 @@ doc = ["furo", "jaraco.packaging (>=9.3)", "jaraco.tidelift (>=1.4)", "rst.linke test = ["big-O", "importlib-resources", "jaraco.functools", "jaraco.itertools", "jaraco.test", "more-itertools", "pytest (>=6,!=8.1.*)", "pytest-checkdocs (>=2.4)", "pytest-cov", "pytest-enabler (>=2.2)", "pytest-ignore-flaky", "pytest-mypy", "pytest-ruff (>=0.2.1)"] [extras] -agents = ["anthropic", "openai", "tiktoken"] -all = ["anthropic", "azure-cognitiveservices-speech", "elevenlabs", "google-cloud-speech", 
"google-cloud-texttospeech", "langchain", "langchain-community", "miniaudio", "nltk", "openai", "pvkoala", "pydub", "pyht", "redis", "sentry-sdk", "tiktoken", "twilio", "vonage"] -misc = ["langchain", "langchain-community", "sentry-sdk"] -synthesizers = ["azure-cognitiveservices-speech", "elevenlabs", "google-cloud-texttospeech", "miniaudio", "nltk", "pvkoala", "pydub", "pyht"] -telephony = ["redis", "twilio", "vonage"] +all = ["google-cloud-speech", "google-cloud-texttospeech", "langchain", "langchain-community", "miniaudio", "nltk", "pvkoala", "pydub", "twilio", "vonage"] +misc = ["langchain", "langchain-community"] +synthesizers = ["cartesia", "google-cloud-texttospeech", "miniaudio", "nltk", "pvkoala", "pydub"] +telephony = ["twilio", "vonage"] transcribers = ["google-cloud-speech"] [metadata] lock-version = "2.0" python-versions = ">=3.10,<4.0" -content-hash = "0b362b102f8d84dd63633857a97c7ae7922358bfbb1e66b070e40c10643b7667" +content-hash = "3b1ebebe25e230979e87480dd05910cf9a8c39366cd0a70cfd5ea21af1d6b95a" diff --git a/pyproject.toml b/pyproject.toml index fe9c29741..244af4ef5 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -40,6 +40,7 @@ miniaudio = { version = "^1.59", optional = true } nltk = { version = "^3.8.1", optional = true } pvkoala = { version = "^2.0.1", optional = true } pydub = { version = "^0.25.1", optional = true } +cartesia = { version = "^0.1.1", optional = true } # Transcribers google-cloud-speech = { version = "^2.26.0", optional = true } @@ -82,6 +83,7 @@ synthesizers = [ "nltk", "pvkoala", "pydub", + "cartesia", ] transcribers = ["google-cloud-speech"] telephony = ["twilio", "vonage"] diff --git a/vocode/streaming/models/synthesizer.py b/vocode/streaming/models/synthesizer.py index 24616bb8b..a5b0e9be0 100644 --- a/vocode/streaming/models/synthesizer.py +++ b/vocode/streaming/models/synthesizer.py @@ -7,7 +7,10 @@ from .model import BaseModel, TypedModel from vocode.streaming.models.client_backend import OutputAudioConfig from vocode.streaming.output_device.base_output_device import BaseOutputDevice -from vocode.streaming.telephony.constants import DEFAULT_AUDIO_ENCODING, DEFAULT_SAMPLING_RATE +from vocode.streaming.telephony.constants import ( + DEFAULT_AUDIO_ENCODING, + DEFAULT_SAMPLING_RATE, +) class SynthesizerType(str, Enum): @@ -23,6 +26,7 @@ class SynthesizerType(str, Enum): COQUI = "synthesizer_coqui" BARK = "synthesizer_bark" POLLY = "synthesizer_polly" + CARTESIA = "synthesizer_cartesia" class SentimentConfig(BaseModel): @@ -49,14 +53,16 @@ def from_output_device(cls, output_device: BaseOutputDevice, **kwargs): return cls( sampling_rate=output_device.sampling_rate, audio_encoding=output_device.audio_encoding, - **kwargs + **kwargs, ) # TODO(EPD-186): switch to from_twilio_output_device and from_vonage_output_device @classmethod def from_telephone_output_device(cls, **kwargs): return cls( - sampling_rate=DEFAULT_SAMPLING_RATE, audio_encoding=DEFAULT_AUDIO_ENCODING, **kwargs + sampling_rate=DEFAULT_SAMPLING_RATE, + audio_encoding=DEFAULT_AUDIO_ENCODING, + **kwargs, ) @classmethod @@ -64,7 +70,7 @@ def from_output_audio_config(cls, output_audio_config: OutputAudioConfig, **kwar return cls( sampling_rate=output_audio_config.sampling_rate, audio_encoding=output_audio_config.audio_encoding, - **kwargs + **kwargs, ) @@ -97,7 +103,8 @@ class GoogleSynthesizerConfig(SynthesizerConfig, type=SynthesizerType.GOOGLE.val class ElevenLabsSynthesizerConfig( - SynthesizerConfig, type=SynthesizerType.ELEVEN_LABS.value # type: ignore + SynthesizerConfig, + 
type=SynthesizerType.ELEVEN_LABS.value, # type: ignore ): api_key: Optional[str] = None voice_id: Optional[str] = ELEVEN_LABS_ADAM_VOICE_ID @@ -117,18 +124,24 @@ def set_name(cls, voice_id): def stability_and_similarity_boost_check(cls, similarity_boost, values): stability = values.get("stability") if (stability is None) != (similarity_boost is None): - raise ValueError("Both stability and similarity_boost must be set or not set.") + raise ValueError( + "Both stability and similarity_boost must be set or not set." + ) return similarity_boost @validator("optimize_streaming_latency") def optimize_streaming_latency_check(cls, optimize_streaming_latency): - if optimize_streaming_latency is not None and not (0 <= optimize_streaming_latency <= 4): + if optimize_streaming_latency is not None and not ( + 0 <= optimize_streaming_latency <= 4 + ): raise ValueError("optimize_streaming_latency must be between 0 and 4.") return optimize_streaming_latency @validator("backchannel_amplitude_factor") def backchannel_amplitude_factor_check(cls, backchannel_amplitude_factor): - if backchannel_amplitude_factor is not None and not (0 < backchannel_amplitude_factor <= 1): + if backchannel_amplitude_factor is not None and not ( + 0 < backchannel_amplitude_factor <= 1 + ): raise ValueError( "backchannel_amplitude_factor must be between 0 (not inclusive) and 1." ) @@ -192,7 +205,8 @@ class PlayHtSynthesizerConfig(SynthesizerConfig, type=SynthesizerType.PLAY_HT.va class CoquiTTSSynthesizerConfig( - SynthesizerConfig, type=SynthesizerType.COQUI_TTS.value # type: ignore + SynthesizerConfig, + type=SynthesizerType.COQUI_TTS.value, # type: ignore ): tts_kwargs: dict = {} speaker: Optional[str] = None @@ -207,7 +221,8 @@ class GTTSSynthesizerConfig(SynthesizerConfig, type=SynthesizerType.GTTS.value): class StreamElementsSynthesizerConfig( - SynthesizerConfig, type=SynthesizerType.STREAM_ELEMENTS.value # type: ignore + SynthesizerConfig, + type=SynthesizerType.STREAM_ELEMENTS.value, # type: ignore ): voice: str = STREAM_ELEMENTS_SYNTHESIZER_DEFAULT_VOICE diff --git a/vocode/streaming/synthesizer/cartesia_synthesizer.py b/vocode/streaming/synthesizer/cartesia_synthesizer.py new file mode 100644 index 000000000..56ce102ee --- /dev/null +++ b/vocode/streaming/synthesizer/cartesia_synthesizer.py @@ -0,0 +1,94 @@ +import asyncio +from typing import Optional + +from cartesia import AsyncCartesiaTTS +from cartesia.tts import AudioOutputFormat + +from vocode import getenv +from vocode.streaming.models.audio import AudioEncoding +from vocode.streaming.models.message import BaseMessage +from vocode.streaming.models.synthesizer import SynthesizerConfig, SynthesizerType +from vocode.streaming.synthesizer.base_synthesizer import ( + BaseSynthesizer, + SynthesisResult, +) + +CARTESIA_DEFAULT_VOICE = "Barbershop Man" +CARTESIA_DATA_RTYPE = "bytes" +CARTESIA_DEFAULT_OUTPUT_FORMAT = AudioOutputFormat.PCM +CARTESIA_DEFAULT_MODEL_ID = "upbeat-moon" + + +class CartesiaSynthesizerConfig(SynthesizerConfig, type=SynthesizerType.CARTESIA.value): + voice_name: str = CARTESIA_DEFAULT_VOICE + output_format: str | AudioOutputFormat = CARTESIA_DEFAULT_OUTPUT_FORMAT + data_rtype: str = CARTESIA_DATA_RTYPE + model_id: str = CARTESIA_DEFAULT_MODEL_ID + api_key: str = str(getenv("CARTESIA_API_KEY")) + + +class CartesiaSynthesizer(BaseSynthesizer[CartesiaSynthesizerConfig]): + def __init__(self, synthesizer_config: CartesiaSynthesizerConfig): + super().__init__(synthesizer_config=synthesizer_config) + + if self.synthesizer_config.audio_encoding == 
AudioEncoding.MULAW: + self.synthesizer_config.output_format = AudioOutputFormat.MULAW_8000 + elif self.synthesizer_config.audio_encoding == AudioEncoding.LINEAR16: + self.synthesizer_config.output_format = AudioOutputFormat.PCM + + self.catesia_client = AsyncCartesiaTTS(api_key=self.synthesizer_config.api_key) + voices = self.catesia_client.get_voices() + voice_id = voices[self.synthesizer_config.voice_name]["id"] + self.voice = self.catesia_client.get_voice_embedding(voice_id=voice_id) + + async def create_speech_uncached( + self, + message: BaseMessage, + chunk_size: int, + is_first_text_chunk: bool = False, + is_sole_text_chunk: bool = False, + ) -> SynthesisResult: + self.total_chars += len(message.text) + chunk_queue: asyncio.Queue[Optional[bytes]] = asyncio.Queue() + + try: + chunk_generator = await self.catesia_client.generate( + transcript=message.text, + voice=self.voice, + stream=True, + data_rtype=self.synthesizer_config.data_rtype, + model_id=self.synthesizer_config.model_id, + output_format=self.synthesizer_config.output_format, + ) + async for data in chunk_generator: # type: ignore + chunk = data["audio"] + chunk_queue.put_nowait(chunk) + + except asyncio.CancelledError: + pass + finally: + await chunk_queue.put(None) + + return SynthesisResult( + chunk_generator=self.chunk_result_generator_from_queue(chunk_queue), + get_message_up_to=lambda seconds: self.get_message_cutoff_from_voice_speed( + message=message, seconds=seconds, words_per_minute=150 + ), + ) + + @classmethod + def get_voice_identifier(cls, synthesizer_config: CartesiaSynthesizerConfig): + output_format = ( + synthesizer_config.output_format + if isinstance(synthesizer_config.output_format, str) + else synthesizer_config.output_format.value + ) + + return ":".join( + ( + SynthesizerType.CARTESIA.value, + synthesizer_config.model_id, + synthesizer_config.audio_encoding, + output_format, + ) + ) diff --git a/vocode/streaming/synthesizer/default_factory.py b/vocode/streaming/synthesizer/default_factory.py index 7539cf1dd..6ed43f5d9 100644 --- a/vocode/streaming/synthesizer/default_factory.py +++ b/vocode/streaming/synthesizer/default_factory.py @@ -10,6 +10,7 @@ from vocode.streaming.synthesizer.abstract_factory import AbstractSynthesizerFactory from vocode.streaming.synthesizer.azure_synthesizer import AzureSynthesizer from vocode.streaming.synthesizer.base_synthesizer import BaseSynthesizer +from vocode.streaming.synthesizer.cartesia_synthesizer import CartesiaSynthesizer, CartesiaSynthesizerConfig from vocode.streaming.synthesizer.eleven_labs_synthesizer import ElevenLabsSynthesizer from vocode.streaming.synthesizer.eleven_labs_websocket_synthesizer import ElevenLabsWSSynthesizer from vocode.streaming.synthesizer.play_ht_synthesizer import PlayHtSynthesizer @@ -39,5 +40,7 @@ def create_synthesizer( return RimeSynthesizer(synthesizer_config) elif isinstance(synthesizer_config, StreamElementsSynthesizerConfig): return StreamElementsSynthesizer(synthesizer_config) + elif isinstance(synthesizer_config, CartesiaSynthesizerConfig): + return CartesiaSynthesizer(synthesizer_config) else: raise Exception("Invalid synthesizer config") From 5ae15fc09351cff87b870d03140d09565b0d5496 Mon Sep 17 00:00:00 2001 From: Navicstein Date: Wed, 12 Jun 2024 02:27:44 +0100 Subject: [PATCH 2/2] add filler word generation option --- .../synthesizer/cartesia_synthesizer.py | 81 ++++++++++++++++--- 1 file changed, 72 insertions(+), 9 deletions(-) diff --git a/vocode/streaming/synthesizer/cartesia_synthesizer.py 
b/vocode/streaming/synthesizer/cartesia_synthesizer.py index 56ce102ee..3aa7fb1f1 100644 --- a/vocode/streaming/synthesizer/cartesia_synthesizer.py +++ b/vocode/streaming/synthesizer/cartesia_synthesizer.py @@ -1,15 +1,22 @@ import asyncio -from typing import Optional +import io +import os +from typing import List, Optional, cast from cartesia import AsyncCartesiaTTS -from cartesia.tts import AudioOutputFormat +from cartesia.tts import AudioOutput, AudioOutputFormat +from loguru import logger +from pydub import AudioSegment from vocode import getenv from vocode.streaming.models.audio import AudioEncoding from vocode.streaming.models.message import BaseMessage from vocode.streaming.models.synthesizer import SynthesizerConfig, SynthesizerType from vocode.streaming.synthesizer.base_synthesizer import ( + FILLER_AUDIO_PATH, + FILLER_PHRASES, BaseSynthesizer, + FillerAudio, SynthesisResult, ) @@ -41,6 +48,12 @@ def __init__(self, synthesizer_config: CartesiaSynthesizerConfig): voice_id = voices[self.synthesizer_config.voice_name]["id"] self.voice = self.catesia_client.get_voice_embedding(voice_id=voice_id) + self.output_format = ( + synthesizer_config.output_format + if isinstance(synthesizer_config.output_format, str) + else synthesizer_config.output_format.value + ) + async def create_speech_uncached( self, message: BaseMessage, @@ -78,17 +91,67 @@ async def create_speech_uncached( @classmethod def get_voice_identifier(cls, synthesizer_config: CartesiaSynthesizerConfig): - output_format = ( - synthesizer_config.output_format - if isinstance(synthesizer_config.output_format, str) - else synthesizer_config.output_format.value - ) - + instance = cls(synthesizer_config) return ":".join( ( SynthesizerType.CARTESIA.value, synthesizer_config.model_id, synthesizer_config.audio_encoding, - output_format, + instance.output_format, ) ) + + async def get_phrase_filler_audios(self) -> List[FillerAudio]: + filler_phrase_audios = [] + for filler_phrase in FILLER_PHRASES: + cache_key = "-".join( + ( + str(filler_phrase.text), + str(self.output_format), + str(self.synthesizer_config.audio_encoding.value), + str(self.synthesizer_config.sampling_rate), + str(self.synthesizer_config.model_id), + str(self.synthesizer_config.voice_name), + ) + ) + filler_audio_path = os.path.join(FILLER_AUDIO_PATH, f"{cache_key}.bytes") + if os.path.exists(filler_audio_path): + audio_data = open(filler_audio_path, "rb").read() + else: + logger.debug(f"Generating filler audio for {filler_phrase.text}") + audio_data, sample_rate = await self.create_audio(filler_phrase.text) + + audio = AudioSegment.from_raw( + io.BytesIO(audio_data), # type: ignore + frame_rate=sample_rate, + channels=1, + sample_width=2, + ) + audio.export(filler_audio_path, format="wav") + filler_phrase_audios.append( + FillerAudio( + message=filler_phrase, + audio_data=audio_data, + synthesizer_config=self.synthesizer_config, + ) + ) + return filler_phrase_audios + + async def create_audio(self, text: str) -> tuple[bytes, int]: + data = await self.catesia_client.generate( + voice=self.voice, + stream=False, + data_rtype=self.synthesizer_config.data_rtype, + model_id=self.synthesizer_config.model_id, + output_format=self.synthesizer_config.output_format, + transcript=text, + ) + + data = cast(AudioOutput, data) + if isinstance(data["audio"], bytes): + return data["audio"], data["sampling_rate"] + raise ValueError( + f"Unexpected data type for filler audio: {type(data['audio'])}" + ) + + \ No newline at end of file
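
For reference, a minimal usage sketch of the synthesizer this series adds, built only from names defined in the patch (CartesiaSynthesizerConfig, CartesiaSynthesizer, from_telephone_output_device, create_speech_uncached). It assumes the package is installed with the `synthesizers` extra and that CARTESIA_API_KEY is exported, since the config reads the key via getenv(); the message text and chunk_size below are illustrative values, not values taken from the patch.

import asyncio

from vocode.streaming.models.message import BaseMessage
from vocode.streaming.synthesizer.cartesia_synthesizer import (
    CartesiaSynthesizer,
    CartesiaSynthesizerConfig,
)


async def main():
    # Telephony defaults (mulaw at 8 kHz); CartesiaSynthesizer.__init__ maps the
    # config's audio encoding onto Cartesia's MULAW_8000 / PCM output formats.
    config = CartesiaSynthesizerConfig.from_telephone_output_device(
        voice_name="Barbershop Man",  # default voice constant from the patch
        model_id="upbeat-moon",       # default model constant from the patch
    )
    synthesizer = CartesiaSynthesizer(config)

    result = await synthesizer.create_speech_uncached(
        message=BaseMessage(text="Hello from Cartesia!"),
        chunk_size=1024,
    )
    async for chunk in result.chunk_generator:
        ...  # hand each synthesized audio chunk to the output device


asyncio.run(main())

In a full vocode setup the config would normally be handed to create_synthesizer() in default_factory.py instead of constructing the class directly; this series teaches that factory to return a CartesiaSynthesizer for any CartesiaSynthesizerConfig.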