Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merge In Recent Fixes #14

Merged
merged 14 commits into from
Jun 7, 2024
610 changes: 315 additions & 295 deletions poetry.lock

Large diffs are not rendered by default.

36 changes: 12 additions & 24 deletions pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -11,46 +11,46 @@ homepage = "https://github.com/vocodedev/vocode-python"
[tool.poetry.dependencies]
python = ">=3.10,<4.0"
aiohttp = "^3.9.5"
anthropic = "^0.28.0"
azure-cognitiveservices-speech = "^1.37.0"
elevenlabs = "^1.2.2"
fastapi = "^0.111.0"
janus = "^1.0.0"
jinja2 = "^3.1.4"
jsonschema = "^4.22.0"
loguru = "^0.7.2"
numpy = "^1.26.4"
openai = "^1.30.5"
opentelemetry-sdk = "^1.24.0"
phonenumbers = "^8.13.37"
pydantic = "^2.7.2"
pydantic-settings = "^2.3.0"
pyht = "^0.0.28"
redis = "^5.0.4"
requests = "^2.32.3"
sentry-sdk = { extras = ["fastapi"], version = "^2.3.1" }
sounddevice = "^0.4.7"
tiktoken = "^0.7.0"
uvicorn = "^0.30.0"
websockets = "^12.0"

# Agents
anthropic = { version = "^0.28.0", optional = true }
openai = { version = "^1.30.5", optional = true }
tiktoken = { version = "0.7.0", optional = true }

# Synthesizers
azure-cognitiveservices-speech = { version = "^1.37.0", optional = true }
elevenlabs = { version = "^1.2.2", optional = true }
google-cloud-texttospeech = { version = "^2.16.3", optional = true }
miniaudio = { version = "^1.59", optional = true }
nltk = { version = "^3.8.1", optional = true }
pvkoala = { version = "^2.0.1", optional = true }
pydub = { version = "^0.25.1", optional = true }
pyht = { version = "^0.0.28", optional = true }

# Transcribers
google-cloud-speech = { version = "^2.26.0", optional = true }

# Telephony
redis = { version = "^5.0.4", optional = true }
twilio = { version = "^9.1.0", optional = true }
vonage = { version = "^3.14.0", optional = true }

# Misc
langchain = { version = "^0.2.1", optional = true }
langchain-community = { version = "^0.2.1", optional = true }
sentry-sdk = { extras = ["fastapi"], version = "^2.3.1", optional = true }


[tool.poetry.group.lint.dependencies]
Expand All @@ -76,39 +76,27 @@ pytest-httpx = "^0.30.0"
pytest-mock = "^3.14.0"

[tool.poetry.extras]
agents = ["anthropic", "openai", "tiktoken"]
synthesizers = [
"azure-cognitiveservices-speech",
"elevenlabs",
"google-cloud-texttospeech",
"miniaudio",
"nltk",
"pvkoala",
"pydub",
"pyht",
]
transcribers = ["google-cloud-speech"]
telephony = ["twilio", "redis", "vonage"]
misc = ["langchain", "langchain-community", "sentry-sdk"]
telephony = ["twilio", "vonage"]
misc = ["langchain", "langchain-community"]
all = [
"anthropic",
"openai",
"tiktoken",
"azure-cognitiveservices-speech",
"elevenlabs",
"google-cloud-texttospeech",
"miniaudio",
"nltk",
"pvkoala",
"pydub",
"pyht",
"google-cloud-speech",
"twilio",
"redis",
"vonage",
"langchain",
"langchain-community",
"sentry-sdk",
]

[tool.mypy]
Expand Down
42 changes: 34 additions & 8 deletions quickstarts/streaming_conversation.py
Original file line number Diff line number Diff line change
@@ -1,26 +1,46 @@
import asyncio
import signal

from dotenv import load_dotenv
from pydantic_settings import BaseSettings, SettingsConfigDict

from vocode.helpers import create_streaming_microphone_input_and_speaker_output
from vocode.logging import configure_pretty_logging
from vocode.streaming.agent.chat_gpt_agent import ChatGPTAgent
from vocode.streaming.models.agent import ChatGPTAgentConfig
from vocode.streaming.models.message import BaseMessage
from vocode.streaming.models.synthesizer import AzureSynthesizerConfig
from vocode.streaming.models.transcriber import (
DeepgramTranscriberConfig,
PunctuationEndpointingConfig,
)
from vocode.streaming.streaming_conversation import StreamingConversation
from vocode.streaming.synthesizer.azure_synthesizer import AzureSynthesizer
from vocode.streaming.transcriber.deepgram_transcriber import DeepgramTranscriber

load_dotenv()
configure_pretty_logging()

from vocode.helpers import create_streaming_microphone_input_and_speaker_output
from vocode.streaming.models.message import BaseMessage
from vocode.streaming.streaming_conversation import StreamingConversation

configure_pretty_logging()
class Settings(BaseSettings):
    """
    Settings for the streaming conversation quickstart.
    These parameters can be configured with environment variables.
    """

    # Placeholder defaults — replace via environment variables or a .env file
    # before running the quickstart.
    openai_api_key: str = "ENTER_YOUR_OPENAI_API_KEY_HERE"
    azure_speech_key: str = "ENTER_YOUR_AZURE_KEY_HERE"
    deepgram_api_key: str = "ENTER_YOUR_DEEPGRAM_API_KEY_HERE"

    # Azure Speech region used by the synthesizer.
    azure_speech_region: str = "eastus"

    # This means a .env file can be used to overload these settings
    # ex: "OPENAI_API_KEY=my_key" will set openai_api_key over the default above
    model_config = SettingsConfigDict(
        env_file=".env",
        env_file_encoding="utf-8",
    )


# Module-level singleton read by main() below.
settings = Settings()


async def main():
Expand All @@ -38,15 +58,21 @@ async def main():
DeepgramTranscriberConfig.from_input_device(
microphone_input,
endpointing_config=PunctuationEndpointingConfig(),
)
api_key=settings.deepgram_api_key,
),
),
agent=ChatGPTAgent(
ChatGPTAgentConfig(
openai_api_key=settings.openai_api_key,
initial_message=BaseMessage(text="What up"),
prompt_preamble="""The AI is having a pleasant conversation about life""",
)
),
synthesizer=AzureSynthesizer(AzureSynthesizerConfig.from_output_device(speaker_output)),
synthesizer=AzureSynthesizer(
AzureSynthesizerConfig.from_output_device(speaker_output),
azure_speech_key=settings.azure_speech_key,
azure_speech_region=settings.azure_speech_region,
),
)
await conversation.start()
print("Conversation started, press Ctrl+C to end")
Expand Down
34 changes: 25 additions & 9 deletions quickstarts/turn_based_conversation.py
Original file line number Diff line number Diff line change
@@ -1,16 +1,32 @@
from dotenv import load_dotenv
from pydantic_settings import BaseSettings, SettingsConfigDict

from vocode import getenv
from vocode.helpers import create_turn_based_microphone_input_and_speaker_output
from vocode.turn_based.agent.chat_gpt_agent import ChatGPTAgent
from vocode.turn_based.synthesizer.azure_synthesizer import AzureSynthesizer
from vocode.turn_based.transcriber.whisper_transcriber import WhisperTranscriber
from vocode.turn_based.turn_based_conversation import TurnBasedConversation

load_dotenv()

# See https://api.elevenlabs.io/v1/voices
ADAM_VOICE_ID = "pNInz6obpgDQGcFmaJgB"
class Settings(BaseSettings):
    """
    Settings for the turn-based conversation quickstart.
    These parameters can be configured with environment variables.
    """

    # Placeholder defaults — replace via environment variables or a .env file
    # before running the quickstart.
    openai_api_key: str = "ENTER_YOUR_OPENAI_API_KEY_HERE"
    azure_speech_key: str = "ENTER_YOUR_AZURE_KEY_HERE"

    # Azure Speech region used by the synthesizer.
    azure_speech_region: str = "eastus"

    # This means a .env file can be used to overload these settings
    # ex: "OPENAI_API_KEY=my_key" will set openai_api_key over the default above
    model_config = SettingsConfigDict(
        env_file=".env",
        env_file_encoding="utf-8",
    )


# Module-level singleton read by the __main__ block below.
settings = Settings()

if __name__ == "__main__":
(
Expand All @@ -23,15 +39,15 @@
conversation = TurnBasedConversation(
input_device=microphone_input,
output_device=speaker_output,
transcriber=WhisperTranscriber(api_key=getenv("OPENAI_API_KEY")),
transcriber=WhisperTranscriber(api_key=settings.openai_api_key),
agent=ChatGPTAgent(
system_prompt="The AI is having a pleasant conversation about life",
initial_message="Hello!",
api_key=getenv("OPENAI_API_KEY"),
api_key=settings.openai_api_key,
),
synthesizer=AzureSynthesizer(
api_key=getenv("AZURE_SPEECH_KEY"),
region=getenv("AZURE_SPEECH_REGION"),
api_key=settings.azure_speech_key,
region=settings.azure_speech_region,
voice_name="en-US-SteffanNeural",
),
)
Expand Down
4 changes: 4 additions & 0 deletions vocode/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,9 +6,13 @@
import sentry_sdk
from loguru import logger

from vocode.meta import ensure_punkt_installed

environment = {}
logger.disable("vocode")

ensure_punkt_installed()


class ContextWrapper:
"""Context Variable Wrapper."""
Expand Down
15 changes: 15 additions & 0 deletions vocode/meta.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,15 @@
from loguru import logger


def ensure_punkt_installed():
    """Ensure NLTK's 'punkt' tokenizer data is present, downloading it on demand.

    nltk is imported lazily so the dependency is only required when this runs.
    """
    from nltk.data import find

    try:
        find("tokenizers/punkt")
    except LookupError:
        # 'punkt' is not on the local NLTK data path; fetch it once.
        from nltk import download

        logger.info("Downloading 'punkt' tokenizer...")
        download("punkt")
        logger.info("'punkt' tokenizer downloaded successfully.")
81 changes: 66 additions & 15 deletions vocode/streaming/agent/chat_gpt_agent.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,8 @@

import sentry_sdk
from loguru import logger
from openai import AsyncAzureOpenAI, AsyncOpenAI
from openai import DEFAULT_MAX_RETRIES as OPENAI_DEFAULT_MAX_RETRIES
from openai import AsyncAzureOpenAI, AsyncOpenAI, NotFoundError, RateLimitError

from vocode import sentry_span_tags
from vocode.streaming.action.abstract_factory import AbstractActionFactory
Expand All @@ -27,6 +28,24 @@
ChatGPTAgentConfigType = TypeVar("ChatGPTAgentConfigType", bound=ChatGPTAgentConfig)


def instantiate_openai_client(agent_config: ChatGPTAgentConfig, model_fallback: bool = False):
    """Build the async (Azure) OpenAI client described by *agent_config*.

    When *model_fallback* is True, retries are disabled (max_retries=0) so that
    a failing request surfaces immediately and the fallback model can be used.
    """
    retries = 0 if model_fallback else OPENAI_DEFAULT_MAX_RETRIES
    if agent_config.azure_params:
        azure = agent_config.azure_params
        return AsyncAzureOpenAI(
            azure_endpoint=azure.base_url,
            api_key=azure.api_key,
            api_version=azure.api_version,
            max_retries=retries,
        )
    if agent_config.openai_api_key is not None:
        logger.info("Using OpenAI API key override")
    return AsyncOpenAI(
        api_key=agent_config.openai_api_key or os.environ["OPENAI_API_KEY"],
        base_url="https://api.openai.com/v1",
        max_retries=retries,
    )


class ChatGPTAgent(RespondAgent[ChatGPTAgentConfigType]):
openai_client: Union[AsyncOpenAI, AsyncAzureOpenAI]

Expand All @@ -42,19 +61,9 @@ def __init__(
action_factory=action_factory,
**kwargs,
)
if agent_config.azure_params:
self.openai_client = AsyncAzureOpenAI(
azure_endpoint=agent_config.azure_params.base_url,
api_key=agent_config.azure_params.api_key,
api_version=agent_config.azure_params.api_version,
)
else:
if agent_config.openai_api_key is not None:
logger.info("Using OpenAI API key override")
self.openai_client = AsyncOpenAI(
api_key=agent_config.openai_api_key or os.environ["OPENAI_API_KEY"],
base_url="https://api.openai.com/v1",
)
self.openai_client = instantiate_openai_client(
agent_config, model_fallback=agent_config.llm_fallback is not None
)

if not self.openai_client.api_key:
raise ValueError("OPENAI_API_KEY must be set in environment or passed in")
Expand Down Expand Up @@ -109,9 +118,37 @@ def get_model_name_for_tokenizer(self):
else:
return self.agent_config.azure_params.openai_model_name

async def _create_openai_stream(self, chat_parameters: Dict[str, Any]) -> AsyncGenerator:
def apply_model_fallback(self, chat_parameters: Dict[str, Any]) -> None:
    """Switch this agent to its configured fallback LLM, in place.

    Mutates self.agent_config to point at the fallback model, rebuilds
    self.openai_client from the updated config, and rewrites
    chat_parameters["model"] so the retried request targets the fallback.
    No-op when no fallback is configured.
    """
    if self.agent_config.llm_fallback is None:
        return
    if self.agent_config.llm_fallback.provider == "openai":
        self.agent_config.model_name = self.agent_config.llm_fallback.model_name
        if isinstance(self.openai_client, AsyncAzureOpenAI):
            # Falling back from Azure to plain OpenAI: clear azure_params so
            # instantiate_openai_client builds an AsyncOpenAI client below.
            self.agent_config.azure_params = None
    else:
        if self.agent_config.azure_params:
            self.agent_config.azure_params.deployment_name = (
                self.agent_config.llm_fallback.model_name
            )
        if isinstance(self.openai_client, AsyncOpenAI):
            # TODO: handle OpenAI fallback to Azure
            pass

    # model_fallback=False: the fallback client keeps default retries, since
    # there is no further model to fall back to after this one.
    self.openai_client = instantiate_openai_client(self.agent_config, model_fallback=False)
    chat_parameters["model"] = self.agent_config.llm_fallback.model_name

async def _create_openai_stream_with_fallback(
self, chat_parameters: Dict[str, Any]
) -> AsyncGenerator:
try:
stream = await self.openai_client.chat.completions.create(**chat_parameters)
except (NotFoundError, RateLimitError) as e:
logger.error(
f"{'Model not found' if isinstance(e, NotFoundError) else 'Rate limit error'} for model_name: {chat_parameters.get('model')}. Applying fallback.",
exc_info=True,
)
self.apply_model_fallback(chat_parameters)
stream = await self.openai_client.chat.completions.create(**chat_parameters)
except Exception as e:
logger.error(
f"Error while hitting OpenAI with chat_parameters: {chat_parameters}",
Expand All @@ -120,6 +157,20 @@ async def _create_openai_stream(self, chat_parameters: Dict[str, Any]) -> AsyncG
raise e
return stream

async def _create_openai_stream(self, chat_parameters: Dict[str, Any]) -> AsyncGenerator:
    """Open a chat-completions stream, routing through the fallback path when enabled."""
    fallback_enabled = (
        self.agent_config.llm_fallback is not None and self.openai_client.max_retries == 0
    )
    if fallback_enabled:
        return await self._create_openai_stream_with_fallback(chat_parameters)
    try:
        return await self.openai_client.chat.completions.create(**chat_parameters)
    except Exception as e:
        logger.error(
            f"Error while hitting OpenAI with chat_parameters: {chat_parameters}",
            exc_info=True,
        )
        raise e

def should_backchannel(self, human_input: str) -> bool:
return (
not self.is_first_response()
Expand Down
1 change: 0 additions & 1 deletion vocode/streaming/agent/default_factory.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,6 @@
AnthropicAgentConfig,
ChatGPTAgentConfig,
EchoAgentConfig,
LlamacppAgentConfig,
RESTfulUserImplementedAgentConfig,
)

Expand Down
Loading
Loading