From 54ca2db3bb2962f8c7cf85f9338a5089b2a80b3f Mon Sep 17 00:00:00 2001 From: Navicstein Date: Wed, 12 Jun 2024 00:39:40 +0100 Subject: [PATCH 1/2] feat: add cartesia synthesizer --- poetry.lock | 97 +++++++++++-------- pyproject.toml | 2 + vocode/streaming/models/synthesizer.py | 35 +++++-- .../synthesizer/cartesia_synthesizer.py | 94 ++++++++++++++++++ .../streaming/synthesizer/default_factory.py | 3 + 5 files changed, 183 insertions(+), 48 deletions(-) create mode 100644 vocode/streaming/synthesizer/cartesia_synthesizer.py diff --git a/poetry.lock b/poetry.lock index e9786cffe..ce592237f 100644 --- a/poetry.lock +++ b/poetry.lock @@ -153,7 +153,7 @@ files = [ name = "anthropic" version = "0.28.0" description = "The official Python library for the anthropic API" -optional = true +optional = false python-versions = ">=3.7" files = [ {file = "anthropic-0.28.0-py3-none-any.whl", hash = "sha256:2b620b21aee3d20c5d8005483c34df239d53ae895687113b26b8a36892a7e20f"}, @@ -199,7 +199,7 @@ trio = ["trio (>=0.23)"] name = "asttokens" version = "2.4.1" description = "Annotate AST trees with source code positions" -optional = true +optional = false python-versions = "*" files = [ {file = "asttokens-2.4.1-py2.py3-none-any.whl", hash = "sha256:051ed49c3dcae8913ea7cd08e46a606dba30b79993209636c4875bc1d637bc24"}, @@ -247,7 +247,7 @@ tests-no-zope = ["attrs[tests-mypy]", "cloudpickle", "hypothesis", "pympler", "p name = "azure-cognitiveservices-speech" version = "1.37.0" description = "Microsoft Cognitive Services Speech SDK for Python" -optional = true +optional = false python-versions = ">=3.7" files = [ {file = "azure_cognitiveservices_speech-1.37.0-py3-none-macosx_10_14_x86_64.whl", hash = "sha256:906d6bf65176e93464e2d763dd074ca00c48cfe1a896371fcdcb155a500910f7"}, @@ -315,6 +315,28 @@ files = [ {file = "cachetools-5.3.3.tar.gz", hash = "sha256:ba29e2dfa0b8b556606f097407ed1aa62080ee108ab0dc5ec9d6a723a007d105"}, ] +[[package]] +name = "cartesia" +version = "0.1.1" +description = "The official Python library for the Cartesia API." 
+optional = true +python-versions = ">=3.8.0" +files = [ + {file = "cartesia-0.1.1-py2.py3-none-any.whl", hash = "sha256:7a7365f17e220247ee2af1efdb88e69b0aa332e390c85775bf356b5e7b882498"}, + {file = "cartesia-0.1.1.tar.gz", hash = "sha256:c584770f4698e6dc826a75b7b5fd39bfce749c88ad9786dca46edd9527710002"}, +] + +[package.dependencies] +aiohttp = "*" +httpx = "*" +pytest-asyncio = "*" +requests = "*" +websockets = "*" + +[package.extras] +all = ["numpy", "pytest (>=8.0.2)", "pytest-cov (>=4.1.0)", "setuptools", "twine", "wheel"] +dev = ["numpy", "pytest (>=8.0.2)", "pytest-cov (>=4.1.0)", "setuptools", "twine", "wheel"] + [[package]] name = "certifi" version = "2024.6.2" @@ -654,7 +676,7 @@ typing-inspect = ">=0.4.0,<1" name = "decorator" version = "5.1.1" description = "Decorators for Humans" -optional = true +optional = false python-versions = ">=3.5" files = [ {file = "decorator-5.1.1-py3-none-any.whl", hash = "sha256:b8c3f85900b9dc423225913c5aace94729fe1fa9763b38939a95226f02d37186"}, @@ -682,7 +704,7 @@ dev = ["PyTest", "PyTest-Cov", "bump2version (<1)", "sphinx (<2)", "tox"] name = "distro" version = "1.9.0" description = "Distro - an OS platform information API" -optional = true +optional = false python-versions = ">=3.6" files = [ {file = "distro-1.9.0-py3-none-any.whl", hash = "sha256:7bffd925d65168f85027d8da9af6bddab658135b840670a223589bc0c8ef02b2"}, @@ -713,7 +735,7 @@ wmi = ["wmi (>=1.5.1)"] name = "elevenlabs" version = "1.2.2" description = "" -optional = true +optional = false python-versions = "<4.0,>=3.8" files = [ {file = "elevenlabs-1.2.2-py3-none-any.whl", hash = "sha256:60b92b0e2aabdfba93a43569f207f8a2ad397492519b8e11a2eebb32807ddefa"}, @@ -761,7 +783,7 @@ test = ["pytest (>=6)"] name = "executing" version = "2.0.1" description = "Get the currently executing AST node of a frame, and other information" -optional = true +optional = false python-versions = ">=3.5" files = [ {file = "executing-2.0.1-py2.py3-none-any.whl", hash = "sha256:eac49ca94516ccc753f9fb5ce82603156e590b27525a8bc32cce8ae302eb61bc"}, @@ -842,7 +864,7 @@ standard = ["fastapi", "uvicorn[standard] (>=0.15.0)"] name = "filelock" version = "3.14.0" description = "A platform independent file lock." 
-optional = true +optional = false python-versions = ">=3.8" files = [ {file = "filelock-3.14.0-py3-none-any.whl", hash = "sha256:43339835842f110ca7ae60f1e1c160714c5a6afd15a2873419ab185334975c0f"}, @@ -944,7 +966,7 @@ files = [ name = "fsspec" version = "2024.6.0" description = "File-system specification" -optional = true +optional = false python-versions = ">=3.8" files = [ {file = "fsspec-2024.6.0-py3-none-any.whl", hash = "sha256:58d7122eb8a1a46f7f13453187bfea4972d66bf01618d37366521b1998034cee"}, @@ -1159,7 +1181,7 @@ test = ["objgraph", "psutil"] name = "grpcio" version = "1.64.1" description = "HTTP/2-based RPC framework" -optional = true +optional = false python-versions = ">=3.8" files = [ {file = "grpcio-1.64.1-cp310-cp310-linux_armv7l.whl", hash = "sha256:55697ecec192bc3f2f3cc13a295ab670f51de29884ca9ae6cd6247df55df2502"}, @@ -1337,7 +1359,7 @@ socks = ["socksio (==1.*)"] name = "huggingface-hub" version = "0.23.3" description = "Client library to download and publish models, datasets and other repos on the huggingface.co hub" -optional = true +optional = false python-versions = ">=3.8.0" files = [ {file = "huggingface_hub-0.23.3-py3-none-any.whl", hash = "sha256:22222c41223f1b7c209ae5511d2d82907325a0e3cdbce5f66949d43c598ff3bc"}, @@ -1412,7 +1434,7 @@ files = [ name = "ipython" version = "8.25.0" description = "IPython: Productive Interactive Computing" -optional = true +optional = false python-versions = ">=3.10" files = [ {file = "ipython-8.25.0-py3-none-any.whl", hash = "sha256:53eee7ad44df903a06655871cbab66d156a051fd86f3ec6750470ac9604ac1ab"}, @@ -1478,7 +1500,7 @@ typing-extensions = ">=3.7.4.3" name = "jedi" version = "0.19.1" description = "An autocompletion tool for Python that can be used for text editors." -optional = true +optional = false python-versions = ">=3.6" files = [ {file = "jedi-0.19.1-py2.py3-none-any.whl", hash = "sha256:e983c654fe5c02867aef4cdfce5a2fbb4a50adc0af145f70504238f18ef5e7e0"}, @@ -1514,7 +1536,7 @@ i18n = ["Babel (>=2.7)"] name = "jiter" version = "0.4.1" description = "Fast iterable JSON parser." -optional = true +optional = false python-versions = ">=3.8" files = [ {file = "jiter-0.4.1-cp310-cp310-macosx_10_12_x86_64.whl", hash = "sha256:3c2370cd8826b484f3fc6ed729cb58510ba24b4bc277c92323a57d35cf4df223"}, @@ -1905,7 +1927,7 @@ tests = ["pytest", "pytz", "simplejson"] name = "matplotlib-inline" version = "0.1.7" description = "Inline Matplotlib backend for Jupyter" -optional = true +optional = false python-versions = ">=3.8" files = [ {file = "matplotlib_inline-0.1.7-py3-none-any.whl", hash = "sha256:df192d39a4ff8f21b1895d72e6a13f5fcc5099f00fa84384e0ea28c2cc0653ca"}, @@ -2221,7 +2243,7 @@ files = [ name = "openai" version = "1.31.0" description = "The official Python library for the openai API" -optional = true +optional = false python-versions = ">=3.7.1" files = [ {file = "openai-1.31.0-py3-none-any.whl", hash = "sha256:82044ee3122113f2a468a1f308a8882324d09556ba5348687c535d3655ee331c"}, @@ -2355,7 +2377,7 @@ files = [ name = "parso" version = "0.8.4" description = "A Python Parser" -optional = true +optional = false python-versions = ">=3.6" files = [ {file = "parso-0.8.4-py2.py3-none-any.whl", hash = "sha256:a418670a20291dacd2dddc80c377c5c3791378ee1e8d12bffc35420643d43f18"}, @@ -2381,7 +2403,7 @@ files = [ name = "pexpect" version = "4.9.0" description = "Pexpect allows easy control of interactive console applications." 
-optional = true +optional = false python-versions = "*" files = [ {file = "pexpect-4.9.0-py2.py3-none-any.whl", hash = "sha256:7236d1e080e4936be2dc3e326cec0af72acf9212a7e1d060210e70a47e253523"}, @@ -2437,7 +2459,7 @@ testing = ["pytest", "pytest-benchmark"] name = "prompt-toolkit" version = "3.0.46" description = "Library for building powerful interactive command lines in Python" -optional = true +optional = false python-versions = ">=3.7.0" files = [ {file = "prompt_toolkit-3.0.46-py3-none-any.whl", hash = "sha256:45abe60a8300f3c618b23c16c4bb98c6fc80af8ce8b17c7ae92db48db3ee63c1"}, @@ -2468,7 +2490,7 @@ testing = ["google-api-core[grpc] (>=1.31.5)"] name = "protobuf" version = "4.25.3" description = "" -optional = true +optional = false python-versions = ">=3.8" files = [ {file = "protobuf-4.25.3-cp310-abi3-win32.whl", hash = "sha256:d4198877797a83cbfe9bffa3803602bbe1625dc30d8a097365dbc762e5790faa"}, @@ -2488,7 +2510,7 @@ files = [ name = "ptyprocess" version = "0.7.0" description = "Run a subprocess in a pseudo terminal" -optional = true +optional = false python-versions = "*" files = [ {file = "ptyprocess-0.7.0-py2.py3-none-any.whl", hash = "sha256:4b41f3967fce3af57cc7e94b888626c18bf37a083e3651ca8feeb66d492fef35"}, @@ -2499,7 +2521,7 @@ files = [ name = "pure-eval" version = "0.2.2" description = "Safely evaluate AST nodes without side effects" -optional = true +optional = false python-versions = "*" files = [ {file = "pure_eval-0.2.2-py3-none-any.whl", hash = "sha256:01eaab343580944bc56080ebe0a674b39ec44a945e6d09ba7db3cb8cec289350"}, @@ -2728,7 +2750,7 @@ windows-terminal = ["colorama (>=0.4.6)"] name = "pyht" version = "0.0.28" description = "" -optional = true +optional = false python-versions = "<4.0,>=3.8" files = [ {file = "pyht-0.0.28-py3-none-any.whl", hash = "sha256:ad8801acaa906eff5d6b39ce5ca76a08c154f705a2d0b2b6e841b219ef7875be"}, @@ -3008,7 +3030,7 @@ rpds-py = ">=0.7.0" name = "regex" version = "2024.5.15" description = "Alternative regular expression module, to replace re." 
-optional = true +optional = false python-versions = ">=3.8" files = [ {file = "regex-2024.5.15-cp310-cp310-macosx_10_9_universal2.whl", hash = "sha256:a81e3cfbae20378d75185171587cbf756015ccb14840702944f014e0d93ea09f"}, @@ -3257,7 +3279,7 @@ pyasn1 = ">=0.1.3" name = "sentry-sdk" version = "2.4.0" description = "Python client for Sentry (https://sentry.io)" -optional = true +optional = false python-versions = ">=3.6" files = [ {file = "sentry_sdk-2.4.0-py2.py3-none-any.whl", hash = "sha256:a42b70981cd4ed7da3c85d0360502d2ac932a15a4a420b360e1ebded2fc19a92"}, @@ -3319,7 +3341,7 @@ files = [ name = "six" version = "1.16.0" description = "Python 2 and 3 compatibility utilities" -optional = true +optional = false python-versions = ">=2.7, !=3.0.*, !=3.1.*, !=3.2.*" files = [ {file = "six-1.16.0-py2.py3-none-any.whl", hash = "sha256:8abb2f1d86890a2dfb989f9a77cfcfd3e47c2a354b01111771326f8aa26e0254"}, @@ -3459,7 +3481,7 @@ sqlcipher = ["sqlcipher3_binary"] name = "stack-data" version = "0.6.3" description = "Extract data from python stack frames and tracebacks for informative displays" -optional = true +optional = false python-versions = "*" files = [ {file = "stack_data-0.6.3-py3-none-any.whl", hash = "sha256:d5558e0c25a4cb0853cddad3d77da9891a08cb85dd9f9f91b9f8cd66e511e695"}, @@ -3510,7 +3532,7 @@ test = ["pytest", "tornado (>=4.5)", "typeguard"] name = "tiktoken" version = "0.7.0" description = "tiktoken is a fast BPE tokeniser for use with OpenAI's models" -optional = true +optional = false python-versions = ">=3.8" files = [ {file = "tiktoken-0.7.0-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:485f3cc6aba7c6b6ce388ba634fbba656d9ee27f766216f45146beb4ac18b25f"}, @@ -3562,7 +3584,7 @@ blobfile = ["blobfile (>=2)"] name = "tokenizers" version = "0.19.1" description = "" -optional = true +optional = false python-versions = ">=3.7" files = [ {file = "tokenizers-0.19.1-cp310-cp310-macosx_10_12_x86_64.whl", hash = "sha256:952078130b3d101e05ecfc7fc3640282d74ed26bcf691400f872563fca15ac97"}, @@ -3690,7 +3712,7 @@ files = [ name = "tqdm" version = "4.66.4" description = "Fast, Extensible Progress Meter" -optional = true +optional = false python-versions = ">=3.7" files = [ {file = "tqdm-4.66.4-py3-none-any.whl", hash = "sha256:b75ca56b413b030bc3f00af51fd2c1a1a5eac6a0c1cca83cbb37a5c52abce644"}, @@ -3710,7 +3732,7 @@ telegram = ["requests"] name = "traitlets" version = "5.14.3" description = "Traitlets Python configuration system" -optional = true +optional = false python-versions = ">=3.8" files = [ {file = "traitlets-5.14.3-py3-none-any.whl", hash = "sha256:b74e89e397b1ed28cc831db7aea759ba6640cb3de13090ca145426688ff1ac4f"}, @@ -4174,7 +4196,7 @@ anyio = ">=3.0.0" name = "wcwidth" version = "0.2.13" description = "Measures the displayed width of unicode strings in a terminal" -optional = true +optional = false python-versions = "*" files = [ {file = "wcwidth-0.2.13-py2.py3-none-any.whl", hash = "sha256:3da69048e4540d84af32131829ff948f1e022c1c6bdb8d6102117aac784f6859"}, @@ -4474,14 +4496,13 @@ doc = ["furo", "jaraco.packaging (>=9.3)", "jaraco.tidelift (>=1.4)", "rst.linke test = ["big-O", "importlib-resources", "jaraco.functools", "jaraco.itertools", "jaraco.test", "more-itertools", "pytest (>=6,!=8.1.*)", "pytest-checkdocs (>=2.4)", "pytest-cov", "pytest-enabler (>=2.2)", "pytest-ignore-flaky", "pytest-mypy", "pytest-ruff (>=0.2.1)"] [extras] -agents = ["anthropic", "openai", "tiktoken"] -all = ["anthropic", "azure-cognitiveservices-speech", "elevenlabs", "google-cloud-speech", 
"google-cloud-texttospeech", "langchain", "langchain-community", "miniaudio", "nltk", "openai", "pvkoala", "pydub", "pyht", "redis", "sentry-sdk", "tiktoken", "twilio", "vonage"] -misc = ["langchain", "langchain-community", "sentry-sdk"] -synthesizers = ["azure-cognitiveservices-speech", "elevenlabs", "google-cloud-texttospeech", "miniaudio", "nltk", "pvkoala", "pydub", "pyht"] -telephony = ["redis", "twilio", "vonage"] +all = ["google-cloud-speech", "google-cloud-texttospeech", "langchain", "langchain-community", "miniaudio", "nltk", "pvkoala", "pydub", "twilio", "vonage"] +misc = ["langchain", "langchain-community"] +synthesizers = ["cartesia", "google-cloud-texttospeech", "miniaudio", "nltk", "pvkoala", "pydub"] +telephony = ["twilio", "vonage"] transcribers = ["google-cloud-speech"] [metadata] lock-version = "2.0" python-versions = ">=3.10,<4.0" -content-hash = "0b362b102f8d84dd63633857a97c7ae7922358bfbb1e66b070e40c10643b7667" +content-hash = "3b1ebebe25e230979e87480dd05910cf9a8c39366cd0a70cfd5ea21af1d6b95a" diff --git a/pyproject.toml b/pyproject.toml index fe9c29741..244af4ef5 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -40,6 +40,7 @@ miniaudio = { version = "^1.59", optional = true } nltk = { version = "^3.8.1", optional = true } pvkoala = { version = "^2.0.1", optional = true } pydub = { version = "^0.25.1", optional = true } +cartesia = { version = "^0.1.1", optional = true } # Transcribers google-cloud-speech = { version = "^2.26.0", optional = true } @@ -82,6 +83,7 @@ synthesizers = [ "nltk", "pvkoala", "pydub", + "cartesia", ] transcribers = ["google-cloud-speech"] telephony = ["twilio", "vonage"] diff --git a/vocode/streaming/models/synthesizer.py b/vocode/streaming/models/synthesizer.py index 24616bb8b..a5b0e9be0 100644 --- a/vocode/streaming/models/synthesizer.py +++ b/vocode/streaming/models/synthesizer.py @@ -7,7 +7,10 @@ from .model import BaseModel, TypedModel from vocode.streaming.models.client_backend import OutputAudioConfig from vocode.streaming.output_device.base_output_device import BaseOutputDevice -from vocode.streaming.telephony.constants import DEFAULT_AUDIO_ENCODING, DEFAULT_SAMPLING_RATE +from vocode.streaming.telephony.constants import ( + DEFAULT_AUDIO_ENCODING, + DEFAULT_SAMPLING_RATE, +) class SynthesizerType(str, Enum): @@ -23,6 +26,7 @@ class SynthesizerType(str, Enum): COQUI = "synthesizer_coqui" BARK = "synthesizer_bark" POLLY = "synthesizer_polly" + CARTESIA = "synthesizer_cartesia" class SentimentConfig(BaseModel): @@ -49,14 +53,16 @@ def from_output_device(cls, output_device: BaseOutputDevice, **kwargs): return cls( sampling_rate=output_device.sampling_rate, audio_encoding=output_device.audio_encoding, - **kwargs + **kwargs, ) # TODO(EPD-186): switch to from_twilio_output_device and from_vonage_output_device @classmethod def from_telephone_output_device(cls, **kwargs): return cls( - sampling_rate=DEFAULT_SAMPLING_RATE, audio_encoding=DEFAULT_AUDIO_ENCODING, **kwargs + sampling_rate=DEFAULT_SAMPLING_RATE, + audio_encoding=DEFAULT_AUDIO_ENCODING, + **kwargs, ) @classmethod @@ -64,7 +70,7 @@ def from_output_audio_config(cls, output_audio_config: OutputAudioConfig, **kwar return cls( sampling_rate=output_audio_config.sampling_rate, audio_encoding=output_audio_config.audio_encoding, - **kwargs + **kwargs, ) @@ -97,7 +103,8 @@ class GoogleSynthesizerConfig(SynthesizerConfig, type=SynthesizerType.GOOGLE.val class ElevenLabsSynthesizerConfig( - SynthesizerConfig, type=SynthesizerType.ELEVEN_LABS.value # type: ignore + SynthesizerConfig, + 
type=SynthesizerType.ELEVEN_LABS.value, # type: ignore ): api_key: Optional[str] = None voice_id: Optional[str] = ELEVEN_LABS_ADAM_VOICE_ID @@ -117,18 +124,24 @@ def set_name(cls, voice_id): def stability_and_similarity_boost_check(cls, similarity_boost, values): stability = values.get("stability") if (stability is None) != (similarity_boost is None): - raise ValueError("Both stability and similarity_boost must be set or not set.") + raise ValueError( + "Both stability and similarity_boost must be set or not set." + ) return similarity_boost @validator("optimize_streaming_latency") def optimize_streaming_latency_check(cls, optimize_streaming_latency): - if optimize_streaming_latency is not None and not (0 <= optimize_streaming_latency <= 4): + if optimize_streaming_latency is not None and not ( + 0 <= optimize_streaming_latency <= 4 + ): raise ValueError("optimize_streaming_latency must be between 0 and 4.") return optimize_streaming_latency @validator("backchannel_amplitude_factor") def backchannel_amplitude_factor_check(cls, backchannel_amplitude_factor): - if backchannel_amplitude_factor is not None and not (0 < backchannel_amplitude_factor <= 1): + if backchannel_amplitude_factor is not None and not ( + 0 < backchannel_amplitude_factor <= 1 + ): raise ValueError( "backchannel_amplitude_factor must be between 0 (not inclusive) and 1." ) @@ -192,7 +205,8 @@ class PlayHtSynthesizerConfig(SynthesizerConfig, type=SynthesizerType.PLAY_HT.va class CoquiTTSSynthesizerConfig( - SynthesizerConfig, type=SynthesizerType.COQUI_TTS.value # type: ignore + SynthesizerConfig, + type=SynthesizerType.COQUI_TTS.value, # type: ignore ): tts_kwargs: dict = {} speaker: Optional[str] = None @@ -207,7 +221,8 @@ class GTTSSynthesizerConfig(SynthesizerConfig, type=SynthesizerType.GTTS.value): class StreamElementsSynthesizerConfig( - SynthesizerConfig, type=SynthesizerType.STREAM_ELEMENTS.value # type: ignore + SynthesizerConfig, + type=SynthesizerType.STREAM_ELEMENTS.value, # type: ignore ): voice: str = STREAM_ELEMENTS_SYNTHESIZER_DEFAULT_VOICE diff --git a/vocode/streaming/synthesizer/cartesia_synthesizer.py b/vocode/streaming/synthesizer/cartesia_synthesizer.py new file mode 100644 index 000000000..56ce102ee --- /dev/null +++ b/vocode/streaming/synthesizer/cartesia_synthesizer.py @@ -0,0 +1,94 @@ +import asyncio +from typing import Optional + +from cartesia import AsyncCartesiaTTS +from cartesia.tts import AudioOutputFormat + +from vocode import getenv +from vocode.streaming.models.audio import AudioEncoding +from vocode.streaming.models.message import BaseMessage +from vocode.streaming.models.synthesizer import SynthesizerConfig, SynthesizerType +from vocode.streaming.synthesizer.base_synthesizer import ( + BaseSynthesizer, + SynthesisResult, +) + +CARTESIA_DEFAULT_VOICE = "Barbershop Man" +CARTESIA_DATA_RTYPE = "bytes" +CARTESIA_DEFAULT_OUTPUT_FORMAT = AudioOutputFormat.PCM +CARTESIA_DEFAULT_MODEL_ID = "upbeat-moon" + + +class CartesiaSynthesizerConfig(SynthesizerConfig, type=SynthesizerType.CARTESIA.value): + voice_name: str = CARTESIA_DEFAULT_VOICE + output_format: str | AudioOutputFormat = CARTESIA_DEFAULT_OUTPUT_FORMAT + data_rtype: str = CARTESIA_DATA_RTYPE + model_id: str = CARTESIA_DEFAULT_MODEL_ID + api_key: str = str(getenv("CARTESIA_API_KEY")) + + +class CartesiaSynthesizer(BaseSynthesizer[CartesiaSynthesizerConfig]): + def __init__(self, synthesizer_config: CartesiaSynthesizerConfig): + super().__init__(synthesizer_config=synthesizer_config) + + if self.synthesizer_config.audio_encoding == 
AudioEncoding.MULAW: + self.synthesizer_config.output_format = AudioOutputFormat.MULAW_8000 + elif self.synthesizer_config.audio_encoding == AudioEncoding.LINEAR16: + self.synthesizer_config.output_format = AudioOutputFormat.PCM + + self.catesia_client = AsyncCartesiaTTS(api_key=self.synthesizer_config.api_key) + voices = self.catesia_client.get_voices() + voice_id = voices[self.synthesizer_config.voice_name]["id"] + self.voice = self.catesia_client.get_voice_embedding(voice_id=voice_id) + + async def create_speech_uncached( + self, + message: BaseMessage, + chunk_size: int, + is_first_text_chunk: bool = False, + is_sole_text_chunk: bool = False, + ) -> SynthesisResult: + self.total_chars += len(message.text) + chunk_queue: asyncio.Queue[Optional[bytes]] = asyncio.Queue() + + try: + chunk_generator = await self.catesia_client.generate( + transcript=message.text, + voice=self.voice, + stream=True, + data_rtype=self.synthesizer_config.data_rtype, + model_id=self.synthesizer_config.model_id, + output_format=self.synthesizer_config.output_format, + ) + async for data in chunk_generator: # type: ignore + chunk = data["audio"] + chunk_queue.put_nowait(chunk) + + except asyncio.CancelledError: + pass + finally: + await chunk_queue.put(None) + + return SynthesisResult( + chunk_generator=self.chunk_result_generator_from_queue(chunk_queue), + get_message_up_to=lambda seconds: self.get_message_cutoff_from_voice_speed( + message=message, seconds=seconds, words_per_minute=150 + ), + ) + + @classmethod + def get_voice_identifier(cls, synthesizer_config: CartesiaSynthesizerConfig): + output_format = ( + synthesizer_config.output_format + if isinstance(synthesizer_config.output_format, str) + else synthesizer_config.output_format.value + ) + + return ":".join( + ( + SynthesizerType.CARTESIA.value, + synthesizer_config.model_id, + synthesizer_config.audio_encoding, + output_format, + ) + ) diff --git a/vocode/streaming/synthesizer/default_factory.py b/vocode/streaming/synthesizer/default_factory.py index 7539cf1dd..6ed43f5d9 100644 --- a/vocode/streaming/synthesizer/default_factory.py +++ b/vocode/streaming/synthesizer/default_factory.py @@ -10,6 +10,7 @@ from vocode.streaming.synthesizer.abstract_factory import AbstractSynthesizerFactory from vocode.streaming.synthesizer.azure_synthesizer import AzureSynthesizer from vocode.streaming.synthesizer.base_synthesizer import BaseSynthesizer +from vocode.streaming.synthesizer.cartesia_synthesizer import CartesiaSynthesizer, CartesiaSynthesizerConfig from vocode.streaming.synthesizer.eleven_labs_synthesizer import ElevenLabsSynthesizer from vocode.streaming.synthesizer.eleven_labs_websocket_synthesizer import ElevenLabsWSSynthesizer from vocode.streaming.synthesizer.play_ht_synthesizer import PlayHtSynthesizer @@ -39,5 +40,7 @@ def create_synthesizer( return RimeSynthesizer(synthesizer_config) elif isinstance(synthesizer_config, StreamElementsSynthesizerConfig): return StreamElementsSynthesizer(synthesizer_config) + elif isinstance(synthesizer_config, CartesiaSynthesizerConfig): + return CartesiaSynthesizer(synthesizer_config) else: raise Exception("Invalid synthesizer config") From 5ae15fc09351cff87b870d03140d09565b0d5496 Mon Sep 17 00:00:00 2001 From: Navicstein Date: Wed, 12 Jun 2024 02:27:44 +0100 Subject: [PATCH 2/2] add filler word generation option --- .../synthesizer/cartesia_synthesizer.py | 81 ++++++++++++++++--- 1 file changed, 72 insertions(+), 9 deletions(-) diff --git a/vocode/streaming/synthesizer/cartesia_synthesizer.py 
b/vocode/streaming/synthesizer/cartesia_synthesizer.py index 56ce102ee..3aa7fb1f1 100644 --- a/vocode/streaming/synthesizer/cartesia_synthesizer.py +++ b/vocode/streaming/synthesizer/cartesia_synthesizer.py @@ -1,15 +1,22 @@ import asyncio -from typing import Optional +import io +import os +from typing import List, Optional, cast from cartesia import AsyncCartesiaTTS -from cartesia.tts import AudioOutputFormat +from cartesia.tts import AudioOutput, AudioOutputFormat +from loguru import logger +from pydub import AudioSegment from vocode import getenv from vocode.streaming.models.audio import AudioEncoding from vocode.streaming.models.message import BaseMessage from vocode.streaming.models.synthesizer import SynthesizerConfig, SynthesizerType from vocode.streaming.synthesizer.base_synthesizer import ( + FILLER_AUDIO_PATH, + FILLER_PHRASES, BaseSynthesizer, + FillerAudio, SynthesisResult, ) @@ -41,6 +48,12 @@ def __init__(self, synthesizer_config: CartesiaSynthesizerConfig): voice_id = voices[self.synthesizer_config.voice_name]["id"] self.voice = self.catesia_client.get_voice_embedding(voice_id=voice_id) + self.output_format = ( + synthesizer_config.output_format + if isinstance(synthesizer_config.output_format, str) + else synthesizer_config.output_format.value + ) + async def create_speech_uncached( self, message: BaseMessage, @@ -78,17 +91,67 @@ async def create_speech_uncached( @classmethod def get_voice_identifier(cls, synthesizer_config: CartesiaSynthesizerConfig): - output_format = ( - synthesizer_config.output_format - if isinstance(synthesizer_config.output_format, str) - else synthesizer_config.output_format.value - ) - + instance = cls(synthesizer_config) return ":".join( ( SynthesizerType.CARTESIA.value, synthesizer_config.model_id, synthesizer_config.audio_encoding, - output_format, + instance.output_format, ) ) + + async def get_phrase_filler_audios(self) -> List[FillerAudio]: + filler_phrase_audios = [] + for filler_phrase in FILLER_PHRASES: + cache_key = "-".join( + ( + str(filler_phrase.text), + str(self.output_format), + str(self.synthesizer_config.audio_encoding.value), + str(self.synthesizer_config.sampling_rate), + str(self.synthesizer_config.model_id), + str(self.synthesizer_config.voice_name), + ) + ) + filler_audio_path = os.path.join(FILLER_AUDIO_PATH, f"{cache_key}.bytes") + if os.path.exists(filler_audio_path): + audio_data = open(filler_audio_path, "rb").read() + else: + logger.debug(f"Generating filler audio for {filler_phrase.text}") + audio_data, sample_rate = await self.create_audio(filler_phrase.text) + + audio = AudioSegment.from_raw( + io.BytesIO(audio_data), # type: ignore + frame_rate=sample_rate, + channels=1, + sample_width=2, + ) + audio.export(filler_audio_path, format="wav") + filler_phrase_audios.append( + FillerAudio( + message=filler_phrase, + audio_data=audio_data, + synthesizer_config=self.synthesizer_config, + ) + ) + return filler_phrase_audios + + async def create_audio(self, text: str) -> tuple[bytes, int]: + data = await self.catesia_client.generate( + voice=self.voice, + stream=False, + data_rtype=self.synthesizer_config.data_rtype, + model_id=self.synthesizer_config.model_id, + output_format=self.synthesizer_config.output_format, + transcript=text, + ) + + data = cast(AudioOutput, data) + if isinstance(data["audio"], bytes): + return data["audio"], data["sampling_rate"] + raise ValueError( + f"Unexpected data type for filler audio: {type(data['audio'])}" + ) + + \ No newline at end of file
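
For reference, a minimal usage sketch of the synthesizer this series adds, built only from names defined in the patch (CartesiaSynthesizerConfig, CartesiaSynthesizer, from_telephone_output_device, create_speech_uncached). It assumes the package is installed with the `synthesizers` extra and that CARTESIA_API_KEY is exported, since the config reads the key via getenv(); the message text and chunk_size below are illustrative values, not values taken from the patch.

import asyncio

from vocode.streaming.models.message import BaseMessage
from vocode.streaming.synthesizer.cartesia_synthesizer import (
    CartesiaSynthesizer,
    CartesiaSynthesizerConfig,
)


async def main():
    # Telephony defaults (mulaw at 8 kHz); CartesiaSynthesizer.__init__ maps the
    # config's audio encoding onto Cartesia's MULAW_8000 / PCM output formats.
    config = CartesiaSynthesizerConfig.from_telephone_output_device(
        voice_name="Barbershop Man",  # default voice constant from the patch
        model_id="upbeat-moon",       # default model constant from the patch
    )
    synthesizer = CartesiaSynthesizer(config)

    result = await synthesizer.create_speech_uncached(
        message=BaseMessage(text="Hello from Cartesia!"),
        chunk_size=1024,
    )
    async for chunk in result.chunk_generator:
        ...  # hand each synthesized audio chunk to the output device


asyncio.run(main())

In a full vocode setup the config would normally be handed to create_synthesizer() in default_factory.py instead of constructing the class directly; this series teaches that factory to return a CartesiaSynthesizer for any CartesiaSynthesizerConfig.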