Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

add cartesia synthesizer #17

Merged
merged 11 commits into from
Jun 12, 2024
3 changes: 3 additions & 0 deletions pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -40,6 +40,7 @@ miniaudio = { version = "^1.59", optional = true }
nltk = { version = "^3.8.1", optional = true }
pvkoala = { version = "^2.0.1", optional = true }
pydub = { version = "^0.25.1", optional = true }
cartesia = { version = "^0.1.1", optional = true }

# Transcribers
google-cloud-speech = { version = "^2.26.0", optional = true }
Expand Down Expand Up @@ -82,6 +83,7 @@ synthesizers = [
"nltk",
"pvkoala",
"pydub",
"cartesia",
]
transcribers = ["google-cloud-speech"]
telephony = ["twilio", "vonage"]
Expand All @@ -97,6 +99,7 @@ all = [
"vonage",
"langchain",
"langchain-community",
"cartesia"
]

[tool.mypy]
Expand Down
15 changes: 15 additions & 0 deletions vocode/streaming/models/synthesizer.py
Original file line number Diff line number Diff line change
Expand Up @@ -23,6 +23,7 @@ class SynthesizerType(str, Enum):
COQUI = "synthesizer_coqui"
BARK = "synthesizer_bark"
POLLY = "synthesizer_polly"
CARTESIA = "synthesizer_cartesia"


class SentimentConfig(BaseModel):
Expand Down Expand Up @@ -226,3 +227,17 @@ class PollySynthesizerConfig(SynthesizerConfig, type=SynthesizerType.POLLY.value
language_code: str = DEFAULT_POLLY_LANGUAGE_CODE
voice_id: str = DEFAULT_POLLY_VOICE_ID
sampling_rate: int = DEFAULT_POLLY_SAMPLING_RATE


# Default values for the Cartesia synthesizer configuration fields
# (model, voice, output format and sampling rate).
DEFAULT_CARTESIA_MODEL_ID = 'upbeat-moon'
DEFAULT_CARTESIA_VOICE_ID = '5345cf08-6f37-424d-a5d9-8ae1101b9377'
DEFAULT_CARTESIA_OUTPUT_FORMAT = 'pcm'
DEFAULT_CARTESIA_SAMPLING_RATE = SamplingRate.RATE_44100


class CartesiaSynthesizerConfig(SynthesizerConfig, type=SynthesizerType.CARTESIA.value):  # type: ignore
    """Configuration for the Cartesia synthesizer.

    Every field has a default, so the config can be built with no arguments.
    """

    # Cartesia API key; defaults to None when not supplied.
    api_key: Optional[str] = None
    # Identifier of the Cartesia model to use.
    model_id: str = DEFAULT_CARTESIA_MODEL_ID
    # Identifier of the Cartesia voice to use.
    voice_id: str = DEFAULT_CARTESIA_VOICE_ID
    # NOTE(review): a collaborator asked that the audio encoding be expressed
    # through the SynthesizerConfig `audio_encoding` default and mapped to the
    # string the Cartesia API expects, rather than through a separate
    # `output_format` field. The field is kept here for backward compatibility.
    output_format: str = DEFAULT_CARTESIA_OUTPUT_FORMAT
    sampling_rate: int = DEFAULT_CARTESIA_SAMPLING_RATE
73 changes: 73 additions & 0 deletions vocode/streaming/synthesizer/cartesia_synthesizer.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,73 @@
import io
import wave

from vocode import getenv
from vocode.streaming.models.message import BaseMessage
from vocode.streaming.models.synthesizer import CartesiaSynthesizerConfig
from vocode.streaming.synthesizer.base_synthesizer import BaseSynthesizer, SynthesisResult


class CartesiaSynthesizer(BaseSynthesizer[CartesiaSynthesizerConfig]):
    """Synthesizer that produces speech through the Cartesia TTS client.

    The Cartesia client is imported lazily in ``__init__`` so the package is
    only required when this synthesizer is actually instantiated.
    """

    def __init__(
        self,
        synthesizer_config: CartesiaSynthesizerConfig,
    ):
        """Create the Cartesia client and resolve the configured voice.

        Args:
            synthesizer_config: Supplies the API key, model id, voice id,
                sampling rate and output format.

        Raises:
            ImportError: If the ``cartesia`` package cannot be imported.
            ValueError: If no API key is found in the config or in the
                ``CARTESIA_API_KEY`` environment variable.
        """
        super().__init__(synthesizer_config)

        # Lazy import the cartesia module
        try:
            from cartesia.tts import AsyncCartesiaTTS
        except ImportError as e:
            raise ImportError(
                "Missing required dependencies for CartesiaSynthesizer"
            ) from e

        # Prefer the key from the config and fall back to the environment.
        # The resolved value must not be overwritten afterwards, otherwise a
        # key passed via the config would be silently discarded.
        self.api_key = synthesizer_config.api_key or getenv("CARTESIA_API_KEY")
        if not self.api_key:
            raise ValueError("Missing Cartesia API key")

        self.cartesia_tts = AsyncCartesiaTTS
        self.model_id = synthesizer_config.model_id
        self.voice_id = synthesizer_config.voice_id
        # NOTE(review): the PR author flagged that a config created with
        # from_telephone_output_device() carries DEFAULT_SAMPLING_RATE (8000),
        # which results in slowed-down sounding audio. Presumably this is
        # because the rate is only written to the WAV header in create_speech
        # and is not part of the Cartesia request - TODO confirm and address.
        self.sampling_rate = synthesizer_config.sampling_rate
        self.output_format = synthesizer_config.output_format
        self.client = self.cartesia_tts(api_key=self.api_key)
        self.voice_embedding = self.client.get_voice_embedding(voice_id=self.voice_id)

    async def create_speech(
        self,
        message: BaseMessage,
        chunk_size: int,
        is_first_text_chunk: bool = False,
        is_sole_text_chunk: bool = False,
    ) -> SynthesisResult:
        """Synthesize ``message.text`` and return it as a synthesis result.

        The streamed audio chunks are collected into an in-memory WAV file,
        which is then handed to ``create_synthesis_result_from_wav``.

        Args:
            message: Message whose ``text`` is sent to Cartesia.
            chunk_size: Forwarded to ``create_synthesis_result_from_wav``.
            is_first_text_chunk: Not used by this implementation.
            is_sole_text_chunk: Not used by this implementation.

        Returns:
            The result of ``create_synthesis_result_from_wav`` for the
            generated audio.
        """
        generator = await self.client.generate(
            transcript=message.text,
            voice=self.voice_embedding,
            stream=True,
            model_id=self.model_id,
            data_rtype="bytes",
            output_format=self.output_format,
        )

        audio_file = io.BytesIO()

        with wave.open(audio_file, "wb") as wav_file:
            # Header describes mono audio with 2-byte samples; assumes this
            # matches the bytes Cartesia returns for the configured
            # output_format - TODO confirm.
            wav_file.setnchannels(1)
            wav_file.setsampwidth(2)
            wav_file.setframerate(self.sampling_rate)
            async for chunk in generator:
                wav_file.writeframes(chunk["audio"])

        # Rewind so the consumer reads the WAV data from the beginning.
        audio_file.seek(0)

        return self.create_synthesis_result_from_wav(
            synthesizer_config=self.synthesizer_config,
            file=audio_file,
            message=message,
            chunk_size=chunk_size,
        )
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Could you convert this to a create_speech_uncached implementation?

Check out our Eleven Labs implementation here as a good example of how to achieve this; it shouldn't require many changes to the existing code to get it up and running!

4 changes: 4 additions & 0 deletions vocode/streaming/synthesizer/default_factory.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
from typing import Type
from vocode.streaming.models.synthesizer import (
AzureSynthesizerConfig,
CartesiaSynthesizerConfig,
ElevenLabsSynthesizerConfig,
PlayHtSynthesizerConfig,
RimeSynthesizerConfig,
Expand All @@ -10,6 +11,7 @@
from vocode.streaming.synthesizer.abstract_factory import AbstractSynthesizerFactory
from vocode.streaming.synthesizer.azure_synthesizer import AzureSynthesizer
from vocode.streaming.synthesizer.base_synthesizer import BaseSynthesizer
from vocode.streaming.synthesizer.cartesia_synthesizer import CartesiaSynthesizer
from vocode.streaming.synthesizer.eleven_labs_synthesizer import ElevenLabsSynthesizer
from vocode.streaming.synthesizer.eleven_labs_websocket_synthesizer import ElevenLabsWSSynthesizer
from vocode.streaming.synthesizer.play_ht_synthesizer import PlayHtSynthesizer
Expand All @@ -25,6 +27,8 @@ def create_synthesizer(
):
if isinstance(synthesizer_config, AzureSynthesizerConfig):
return AzureSynthesizer(synthesizer_config)
elif isinstance(synthesizer_config, CartesiaSynthesizerConfig):
return CartesiaSynthesizer(synthesizer_config)
elif isinstance(synthesizer_config, ElevenLabsSynthesizerConfig):
eleven_labs_synthesizer_class_type: Type[BaseSynthesizer] = ElevenLabsSynthesizer
if synthesizer_config.experimental_websocket:
Expand Down