forked from vocodedev/vocode-core
-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathcartesia_synthesizer.py
73 lines (62 loc) · 2.49 KB
/
cartesia_synthesizer.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
import io
import wave
from vocode import getenv
from vocode.streaming.models.message import BaseMessage
from vocode.streaming.models.synthesizer import CartesiaSynthesizerConfig
from vocode.streaming.synthesizer.base_synthesizer import BaseSynthesizer, SynthesisResult
class CartesiaSynthesizer(BaseSynthesizer[CartesiaSynthesizerConfig]):
    """Synthesizer that produces speech through the Cartesia TTS client.

    The text of a message is sent to ``AsyncCartesiaTTS.generate`` and the
    audio chunks that come back are collected into an in-memory WAV file,
    which is then handed to ``create_synthesis_result_from_wav``.
    """

    def __init__(
        self,
        synthesizer_config: CartesiaSynthesizerConfig,
    ):
        """Build the Cartesia client and resolve the configured voice.

        Args:
            synthesizer_config: Supplies ``api_key``, ``model_id``,
                ``voice_id``, ``sampling_rate`` and ``output_format``.

        Raises:
            ImportError: If ``cartesia.tts`` cannot be imported.
            ValueError: If no API key is present in the config or in the
                ``CARTESIA_API_KEY`` environment variable.
        """
        super().__init__(synthesizer_config)

        # Lazy import the cartesia module so it is only required when this
        # synthesizer is actually instantiated.
        try:
            from cartesia.tts import AsyncCartesiaTTS
        except ImportError as e:
            raise ImportError(
                "Missing required dependancies for CartesiaSynthesizer"
            ) from e

        # The key from the config takes precedence; the environment variable
        # is only a fallback. It must not be overwritten afterwards, otherwise
        # a key supplied through the config would be silently discarded.
        self.api_key = synthesizer_config.api_key or getenv("CARTESIA_API_KEY")
        if not self.api_key:
            raise ValueError("Missing Cartesia API key")

        self.cartesia_tts = AsyncCartesiaTTS
        self.model_id = synthesizer_config.model_id
        self.voice_id = synthesizer_config.voice_id
        self.sampling_rate = synthesizer_config.sampling_rate
        self.output_format = synthesizer_config.output_format
        self.client = self.cartesia_tts(api_key=self.api_key)
        # Resolved once here and reused for every create_speech call.
        # NOTE(review): presumably a blocking lookup against the Cartesia
        # API -- confirm against the SDK.
        self.voice_embedding = self.client.get_voice_embedding(
            voice_id=self.voice_id
        )

    async def create_speech(
        self,
        message: BaseMessage,
        chunk_size: int,
        is_first_text_chunk: bool = False,
        is_sole_text_chunk: bool = False,
    ) -> SynthesisResult:
        """Synthesize ``message.text`` and return it as a synthesis result.

        Args:
            message: Message whose ``text`` is sent to Cartesia.
            chunk_size: Forwarded to ``create_synthesis_result_from_wav``.
            is_first_text_chunk: Accepted for interface compatibility; not
                used by this implementation.
            is_sole_text_chunk: Accepted for interface compatibility; not
                used by this implementation.

        Returns:
            The result of ``create_synthesis_result_from_wav`` for the
            generated audio.
        """
        generator = await self.client.generate(
            transcript=message.text,
            voice=self.voice_embedding,
            stream=True,
            model_id=self.model_id,
            data_rtype="bytes",
            output_format=self.output_format,
        )

        audio_file = io.BytesIO()
        with wave.open(audio_file, "wb") as wav_file:
            # The WAV header is fixed to mono, 2 bytes per sample.
            # NOTE(review): assumes the configured output_format yields
            # 16-bit mono PCM at self.sampling_rate -- confirm.
            wav_file.setnchannels(1)
            wav_file.setsampwidth(2)
            wav_file.setframerate(self.sampling_rate)
            # The whole stream is buffered before a result is produced.
            async for chunk in generator:
                wav_file.writeframes(chunk["audio"])

        # Closing the wave writer leaves the BytesIO open; rewind it so the
        # consumer reads from the start of the WAV data.
        audio_file.seek(0)

        return self.create_synthesis_result_from_wav(
            synthesizer_config=self.synthesizer_config,
            file=audio_file,
            message=message,
            chunk_size=chunk_size,
        )