Skip to content

Commit 5dc841a

Browse files
rjheetaajar98
andauthoredJun 12, 2024··
add cartesia synthesizer (#17)
* add cartesia synthesizer * make Cartesia dependency optional, add it to the synthesizers extra group * lazy import cartesia * improved lazy loading, and added api_key as a config parameter * improvements to cartesia synth * use create_speech_uncached * use existing abstractions' default encoding and sample rates * Remove redundant api_key assignment Co-authored-by: Ajay Raj <[email protected]> * Remove default setting of sampling rate Co-authored-by: Ajay Raj <[email protected]> * Remove default setting of audio_encoding Co-authored-by: Ajay Raj <[email protected]> * remove default setting of sampling rate Co-authored-by: Ajay Raj <[email protected]> * Remove redundant setting of audio encoding; the output device handles this Co-authored-by: Ajay Raj <[email protected]> * build failed with poetry.lock file. re-updating it --------- Co-authored-by: Ajay Raj <[email protected]>
1 parent 221b69a commit 5dc841a

File tree

5 files changed

+192
-39
lines changed

5 files changed

+192
-39
lines changed
 

‎poetry.lock

+60-39
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

‎pyproject.toml

+3
Original file line numberDiff line numberDiff line change
@@ -40,6 +40,7 @@ google-cloud-texttospeech = { version = "^2.16.3", optional = true }
4040
miniaudio = { version = "^1.59", optional = true }
4141
pvkoala = { version = "^2.0.1", optional = true }
4242
pydub = { version = "^0.25.1", optional = true }
43+
cartesia = { version = "^0.1.1", optional = true }
4344

4445
# Transcribers
4546
google-cloud-speech = { version = "^2.26.0", optional = true }
@@ -81,6 +82,7 @@ synthesizers = [
8182
"miniaudio",
8283
"pvkoala",
8384
"pydub",
85+
"cartesia",
8486
]
8587
transcribers = ["google-cloud-speech"]
8688
telephony = ["twilio", "vonage"]
@@ -96,6 +98,7 @@ all = [
9698
"vonage",
9799
"langchain",
98100
"langchain-community",
101+
"cartesia"
99102
]
100103

101104
[tool.mypy]

‎vocode/streaming/models/synthesizer.py

+11
Original file line numberDiff line numberDiff line change
@@ -23,6 +23,7 @@ class SynthesizerType(str, Enum):
2323
COQUI = "synthesizer_coqui"
2424
BARK = "synthesizer_bark"
2525
POLLY = "synthesizer_polly"
26+
CARTESIA = "synthesizer_cartesia"
2627

2728

2829
class SentimentConfig(BaseModel):
@@ -226,3 +227,13 @@ class PollySynthesizerConfig(SynthesizerConfig, type=SynthesizerType.POLLY.value
226227
language_code: str = DEFAULT_POLLY_LANGUAGE_CODE
227228
voice_id: str = DEFAULT_POLLY_VOICE_ID
228229
sampling_rate: int = DEFAULT_POLLY_SAMPLING_RATE
230+
231+
232+
# Defaults applied when a Cartesia config does not name a model or voice.
DEFAULT_CARTESIA_MODEL_ID = "upbeat-moon"
DEFAULT_CARTESIA_VOICE_ID = "5345cf08-6f37-424d-a5d9-8ae1101b9377"


class CartesiaSynthesizerConfig(
    SynthesizerConfig,
    type=SynthesizerType.CARTESIA.value,  # type: ignore
):
    """Settings for the Cartesia synthesizer, on top of the base synthesizer config."""

    # Optional API key; left as None when the caller does not supply one.
    api_key: Optional[str] = None
    # Identifier of the Cartesia model to synthesize with.
    model_id: str = DEFAULT_CARTESIA_MODEL_ID
    # Identifier of the Cartesia voice to synthesize with.
    voice_id: str = DEFAULT_CARTESIA_VOICE_ID
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,114 @@
1+
import io
2+
import wave
3+
import hashlib
4+
5+
from vocode import getenv
6+
from vocode.streaming.models.audio import AudioEncoding, SamplingRate
7+
from vocode.streaming.models.message import BaseMessage
8+
from vocode.streaming.models.synthesizer import CartesiaSynthesizerConfig
9+
from vocode.streaming.synthesizer.base_synthesizer import BaseSynthesizer, SynthesisResult
10+
from vocode.streaming.utils.create_task import asyncio_create_task_with_done_error_log
11+
12+
13+
class CartesiaSynthesizer(BaseSynthesizer[CartesiaSynthesizerConfig]):
    """Synthesizer that turns messages into speech through Cartesia's async TTS client.

    Audio is requested from Cartesia as raw PCM bytes, wrapped in an in-memory
    WAV container, and handed to ``create_synthesis_result_from_wav`` together
    with the synthesizer config.
    """

    def __init__(
        self,
        synthesizer_config: CartesiaSynthesizerConfig,
    ):
        """Validate the configuration and build the Cartesia client.

        Args:
            synthesizer_config: Cartesia settings (API key, model, voice,
                audio encoding and sampling rate).

        Raises:
            ImportError: If the optional ``cartesia`` package is not installed.
            ValueError: If no API key is available, or the audio encoding /
                sampling rate combination is not supported.
        """
        super().__init__(synthesizer_config)

        # Lazy import: cartesia is an optional dependency, so only require it
        # when this synthesizer is actually instantiated.
        try:
            from cartesia.tts import AsyncCartesiaTTS
        except ImportError as e:
            raise ImportError(
                "Missing required dependencies for CartesiaSynthesizer; "
                "install the optional `cartesia` package"
            ) from e

        self.cartesia_tts = AsyncCartesiaTTS

        # An explicit key in the config wins; otherwise fall back to the environment.
        self.api_key = synthesizer_config.api_key or getenv("CARTESIA_API_KEY")
        if not self.api_key:
            raise ValueError("Missing Cartesia API key")

        self.sampling_rate, self.output_format = self._resolve_pcm_format(synthesizer_config)
        # Every supported format is written as 2-byte samples on a single channel.
        self.channel_width = 2
        self.num_channels = 1

        self.model_id = synthesizer_config.model_id
        self.voice_id = synthesizer_config.voice_id
        self.client = self.cartesia_tts(api_key=self.api_key)
        # NOTE(review): called without `await` on the async client, so this is
        # presumably a synchronous lookup performed at construction time —
        # confirm against the cartesia client.
        self.voice_embedding = self.client.get_voice_embedding(voice_id=self.voice_id)

    @staticmethod
    def _resolve_pcm_format(
        synthesizer_config: CartesiaSynthesizerConfig,
    ) -> tuple[int, str]:
        """Return the (sampling rate, Cartesia output format) pair to request.

        Raises:
            ValueError: If the encoding is neither LINEAR16 nor MULAW, or the
                LINEAR16 sampling rate is not 44100, 22050 or 16000.
        """
        encoding = synthesizer_config.audio_encoding

        if encoding == AudioEncoding.MULAW:
            # Cartesia has issues with MuLaw/8000. Use pcm/16000 and
            # create_synthesis_result_from_wav will handle the conversion to mulaw/8000
            return 16000, "pcm_16000"

        if encoding != AudioEncoding.LINEAR16:
            raise ValueError(
                f"Unsupported audio encoding {synthesizer_config.audio_encoding}"
            )

        supported_rates = (
            (SamplingRate.RATE_44100, 44100),
            (SamplingRate.RATE_22050, 22050),
            (SamplingRate.RATE_16000, 16000),
        )
        for supported_rate, rate_hz in supported_rates:
            if synthesizer_config.sampling_rate == supported_rate:
                return rate_hz, f"pcm_{rate_hz}"

        raise ValueError(
            f"Unsupported PCM sampling rate {synthesizer_config.sampling_rate}"
        )

    async def create_speech_uncached(
        self,
        message: BaseMessage,
        chunk_size: int,
        is_first_text_chunk: bool = False,
        is_sole_text_chunk: bool = False,
    ) -> SynthesisResult:
        """Synthesize ``message.text`` with Cartesia and return the result.

        The streamed audio chunks are collected into a single in-memory WAV
        file before the synthesis result is built, so the result is created
        only after Cartesia has finished streaming.

        Args:
            message: Message whose ``text`` is sent as the transcript.
            chunk_size: Forwarded to ``create_synthesis_result_from_wav``.
            is_first_text_chunk: Accepted for interface compatibility; unused here.
            is_sole_text_chunk: Accepted for interface compatibility; unused here.

        Returns:
            The result of ``create_synthesis_result_from_wav`` for the audio.
        """
        generator = await self.client.generate(
            transcript=message.text,
            voice=self.voice_embedding,
            stream=True,
            model_id=self.model_id,
            data_rtype="bytes",
            output_format=self.output_format,
        )

        audio_file = io.BytesIO()
        with wave.open(audio_file, "wb") as wav_file:
            wav_file.setnchannels(self.num_channels)
            wav_file.setsampwidth(self.channel_width)
            wav_file.setframerate(self.sampling_rate)
            async for chunk in generator:
                wav_file.writeframes(chunk["audio"])
        # Rewind so the consumer reads the WAV from its header.
        audio_file.seek(0)

        return self.create_synthesis_result_from_wav(
            synthesizer_config=self.synthesizer_config,
            file=audio_file,
            message=message,
            chunk_size=chunk_size,
        )

    @classmethod
    def get_voice_identifier(cls, synthesizer_config: CartesiaSynthesizerConfig):
        """Build a colon-separated identifier for this voice configuration.

        The identifier combines a SHA-256 hash of the configured API key with
        the voice id, model id and audio encoding.

        NOTE(review): only ``synthesizer_config.api_key`` is hashed; when it is
        None the literal string "None" is hashed, even if a key is supplied
        through the environment at construction time.
        """
        hashed_api_key = hashlib.sha256(
            f"{synthesizer_config.api_key}".encode("utf-8")
        ).hexdigest()
        return ":".join(
            (
                "cartesia",
                hashed_api_key,
                str(synthesizer_config.voice_id),
                str(synthesizer_config.model_id),
                synthesizer_config.audio_encoding,
            )
        )

‎vocode/streaming/synthesizer/default_factory.py

+4
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,7 @@
11
from typing import Type
22
from vocode.streaming.models.synthesizer import (
33
AzureSynthesizerConfig,
4+
CartesiaSynthesizerConfig,
45
ElevenLabsSynthesizerConfig,
56
PlayHtSynthesizerConfig,
67
RimeSynthesizerConfig,
@@ -10,6 +11,7 @@
1011
from vocode.streaming.synthesizer.abstract_factory import AbstractSynthesizerFactory
1112
from vocode.streaming.synthesizer.azure_synthesizer import AzureSynthesizer
1213
from vocode.streaming.synthesizer.base_synthesizer import BaseSynthesizer
14+
from vocode.streaming.synthesizer.cartesia_synthesizer import CartesiaSynthesizer
1315
from vocode.streaming.synthesizer.eleven_labs_synthesizer import ElevenLabsSynthesizer
1416
from vocode.streaming.synthesizer.eleven_labs_websocket_synthesizer import ElevenLabsWSSynthesizer
1517
from vocode.streaming.synthesizer.play_ht_synthesizer import PlayHtSynthesizer
@@ -25,6 +27,8 @@ def create_synthesizer(
2527
):
2628
if isinstance(synthesizer_config, AzureSynthesizerConfig):
2729
return AzureSynthesizer(synthesizer_config)
30+
elif isinstance(synthesizer_config, CartesiaSynthesizerConfig):
31+
return CartesiaSynthesizer(synthesizer_config)
2832
elif isinstance(synthesizer_config, ElevenLabsSynthesizerConfig):
2933
eleven_labs_synthesizer_class_type: Type[BaseSynthesizer] = ElevenLabsSynthesizer
3034
if synthesizer_config.experimental_websocket:

0 commit comments

Comments
 (0)
Please sign in to comment.