1
+ import io
2
+ import wave
3
+ import hashlib
4
+
5
+ from vocode import getenv
6
+ from vocode .streaming .models .audio import AudioEncoding , SamplingRate
7
+ from vocode .streaming .models .message import BaseMessage
8
+ from vocode .streaming .models .synthesizer import CartesiaSynthesizerConfig
9
+ from vocode .streaming .synthesizer .base_synthesizer import BaseSynthesizer , SynthesisResult
10
+ from vocode .streaming .utils .create_task import asyncio_create_task_with_done_error_log
11
+
12
+
13
+ class CartesiaSynthesizer (BaseSynthesizer [CartesiaSynthesizerConfig ]):
14
+ def __init__ (
15
+ self ,
16
+ synthesizer_config : CartesiaSynthesizerConfig ,
17
+ ):
18
+ super ().__init__ (synthesizer_config )
19
+
20
+ # Lazy import the cartesia module
21
+ try :
22
+ from cartesia .tts import AsyncCartesiaTTS
23
+ except ImportError as e :
24
+ raise ImportError (
25
+ f"Missing required dependancies for CartesiaSynthesizer"
26
+ ) from e
27
+
28
+ self .cartesia_tts = AsyncCartesiaTTS
29
+
30
+ self .api_key = synthesizer_config .api_key or getenv ("CARTESIA_API_KEY" )
31
+ if not self .api_key :
32
+ raise ValueError ("Missing Cartesia API key" )
33
+
34
+
35
+ if synthesizer_config .audio_encoding == AudioEncoding .LINEAR16 :
36
+ self .channel_width = 2
37
+ match synthesizer_config .sampling_rate :
38
+ case SamplingRate .RATE_44100 :
39
+ self .sampling_rate = 44100
40
+ self .output_format = "pcm_44100"
41
+ case SamplingRate .RATE_22050 :
42
+ self .sampling_rate = 22050
43
+ self .output_format = "pcm_22050"
44
+ case SamplingRate .RATE_16000 :
45
+ self .sampling_rate = 16000
46
+ self .output_format = "pcm_16000"
47
+ case _:
48
+ raise ValueError (
49
+ f"Unsupported PCM sampling rate { synthesizer_config .sampling_rate } "
50
+ )
51
+ elif synthesizer_config .audio_encoding == AudioEncoding .MULAW :
52
+ # Cartesia has issues with MuLaw/8000. Use pcm/16000 and
53
+ # create_synthesis_result_from_wav will handle the conversion to mulaw/8000
54
+ self .channel_width = 2
55
+ self .output_format = "pcm_16000"
56
+ self .sampling_rate = 16000
57
+ else :
58
+ raise ValueError (
59
+ f"Unsupported audio encoding { synthesizer_config .audio_encoding } "
60
+ )
61
+
62
+ self .num_channels = 1
63
+ self .model_id = synthesizer_config .model_id
64
+ self .voice_id = synthesizer_config .voice_id
65
+ self .client = self .cartesia_tts (api_key = self .api_key )
66
+ self .voice_embedding = self .client .get_voice_embedding (voice_id = self .voice_id )
67
+
68
+
69
+ async def create_speech_uncached (
70
+ self ,
71
+ message : BaseMessage ,
72
+ chunk_size : int ,
73
+ is_first_text_chunk : bool = False ,
74
+ is_sole_text_chunk : bool = False ,
75
+ ) -> SynthesisResult :
76
+ generator = await self .client .generate (
77
+ transcript = message .text ,
78
+ voice = self .voice_embedding ,
79
+ stream = True ,
80
+ model_id = self .model_id ,
81
+ data_rtype = 'bytes' ,
82
+ output_format = self .output_format
83
+ )
84
+
85
+ audio_file = io .BytesIO ()
86
+ with wave .open (audio_file , 'wb' ) as wav_file :
87
+ wav_file .setnchannels (self .num_channels )
88
+ wav_file .setsampwidth (self .channel_width )
89
+ wav_file .setframerate (self .sampling_rate )
90
+ async for chunk in generator :
91
+ wav_file .writeframes (chunk ['audio' ])
92
+ audio_file .seek (0 )
93
+
94
+ result = self .create_synthesis_result_from_wav (
95
+ synthesizer_config = self .synthesizer_config ,
96
+ file = audio_file ,
97
+ message = message ,
98
+ chunk_size = chunk_size ,
99
+ )
100
+
101
+ return result
102
+
103
+ @classmethod
104
+ def get_voice_identifier (cls , synthesizer_config : CartesiaSynthesizerConfig ):
105
+ hashed_api_key = hashlib .sha256 (f"{ synthesizer_config .api_key } " .encode ("utf-8" )).hexdigest ()
106
+ return ":" .join (
107
+ (
108
+ "cartesia" ,
109
+ hashed_api_key ,
110
+ str (synthesizer_config .voice_id ),
111
+ str (synthesizer_config .model_id ),
112
+ synthesizer_config .audio_encoding
113
+ )
114
+ )
0 commit comments