
Commit 2b27d48

[docs sprint] python quickstart + working with phone calls (#27)
* deprecate SpeakerOutput
* remove play.ht default voice id
* rename open source quickstarts page
* remove building block reference
* update python quickstart
* extra steps to deprecate speakeroutput
* finish telephony docs
* fix some references + language in how-to-use-it
* fix test
Parent: 3a85f3d

15 files changed: +113 −101 lines

apps/telephony_app/speller_agent.py (+4 −11)

```diff
@@ -1,4 +1,3 @@
-import typing
 from typing import Optional, Tuple
 
 from vocode.streaming.agent.abstract_factory import AbstractAgentFactory
@@ -65,16 +64,10 @@ def create_agent(self, agent_config: AgentConfig) -> BaseAgent:
             Exception: If the agent configuration type is not recognized.
         """
         # If the agent configuration type is CHAT_GPT, create a ChatGPTAgent.
-        if agent_config.type == AgentType.CHAT_GPT:
-            return ChatGPTAgent(
-                # Cast the agent configuration to ChatGPTAgentConfig as we are sure about the type here.
-                agent_config=typing.cast(ChatGPTAgentConfig, agent_config)
-            )
+        if isinstance(agent_config, ChatGPTAgentConfig):
+            return ChatGPTAgent(agent_config=agent_config)
         # If the agent configuration type is agent_speller, create a SpellerAgent.
-        elif agent_config.type == "agent_speller":
-            return SpellerAgent(
-                # Cast the agent configuration to SpellerAgentConfig as we are sure about the type here.
-                agent_config=typing.cast(SpellerAgentConfig, agent_config)
-            )
+        elif isinstance(agent_config, SpellerAgentConfig):
+            return SpellerAgent(agent_config=agent_config)
         # If the agent configuration type is not recognized, raise an exception.
         raise Exception("Invalid agent config")
```
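
The switch above from checking `agent_config.type` and then `typing.cast`-ing to plain `isinstance` checks lets a static type checker narrow the config type on its own. A standalone sketch of that narrowing, using toy dataclasses rather than vocode's real config classes:

```python
from dataclasses import dataclass


@dataclass
class AgentConfig:
    type: str = "agent_base"


@dataclass
class ChatGPTAgentConfig(AgentConfig):
    type: str = "agent_chat_gpt"
    prompt_preamble: str = ""


def describe(agent_config: AgentConfig) -> str:
    if isinstance(agent_config, ChatGPTAgentConfig):
        # agent_config is narrowed to ChatGPTAgentConfig here, so accessing
        # .prompt_preamble type-checks without a cast.
        return f"ChatGPT agent: {agent_config.prompt_preamble}"
    raise Exception("Invalid agent config")


print(describe(ChatGPTAgentConfig(prompt_preamble="Have a pleasant conversation")))
```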

docs/mint.json (+3 −17)

```diff
@@ -49,11 +49,7 @@
   "navigation": [
     {
       "group": "Getting Started",
-      "pages": [
-        "welcome",
-        "hosted-quickstart",
-        "open-source-quickstart"
-      ]
+      "pages": ["welcome", "hosted-quickstart", "open-source-quickstarts"]
     },
     {
       "group": "Vocode 101",
@@ -85,14 +81,6 @@
         "open-source/agent-factory"
       ]
     },
-    {
-      "group": "Python",
-      "pages": [
-        "open-source/transcriber-reference",
-        "open-source/agent-reference",
-        "open-source/synthesizer-reference"
-      ]
-    },
     {
       "group": "Numbers",
       "pages": [
@@ -115,9 +103,7 @@
     },
     {
       "group": "Usage",
-      "pages": [
-        "api-reference/usage/get-usage"
-      ]
+      "pages": ["api-reference/usage/get-usage"]
     },
     {
       "group": "Actions",
@@ -231,4 +217,4 @@
     "twitter": "https://twitter.com/vocodehq",
     "website": "https://www.vocode.dev/"
   }
-}
+}
```

docs/open-source-quickstart.mdx → docs/open-source-quickstarts.mdx (+6 −2)

```diff
@@ -1,12 +1,16 @@
 ---
-title: "Open Source Quickstart"
+title: "Open Source Quickstarts"
 description: "How to get Vocode up and running on your own machine"
 ---
 
 ## Start Developing
 
 <CardGroup>
-  <Card title="Python Quick Start" icon="circle-play" href="/open-source/python-quickstart">
+  <Card
+    title="Python Quick Start"
+    icon="circle-play"
+    href="/open-source/python-quickstart"
+  >
     Quickly get up and running with Vocode by following our Python quick start
     guide.
   </Card>
```

docs/open-source/how-to-use-it.mdx (+1 −2)

```diff
@@ -5,8 +5,7 @@ description: "Various ways to utilize Vocode."
 
 ## Understanding Our Open Source Libraries
 
-Vocode's Open Source supports both Python and React, with plans for future support for
-additional languages.
+Vocode's Open Source supports Python, and a client library in React.
 
 ### Getting Started with the Open Source Python Library
 
```
docs/open-source/python-quickstart.mdx (+54 −37)

````diff
@@ -8,14 +8,11 @@ description: "Get up and running using Python"
 Install the [vocode package](https://pypi.org/project/vocode/):
 
 ```bash
-pip install 'vocode[io]'
+pip install vocode
 ```
 
 # Getting started
 
-The `io` extra installs the packages necessary to run our voice conversations locally, but is not needed for other surfaces, e.g. [phone calls](/open-source/telephony).
-You may need to install [portaudio](https://formulae.brew.sh/formula/portaudio) and [ffmpeg](https://formulae.brew.sh/formula/ffmpeg) on your system.
-
 ## Working with system audio
 
 We provide helper methods to hook into your system audio.
@@ -33,74 +30,94 @@ If the default I/O devices are not being set properly, set `use_default_devices`
 Vocode provides a unified interface across various speech transcription, speech synthesis, and AI/NLU providers.
 To use these providers with Vocode, you'll need to grab credentials from these providers and set them in the Vocode environment.
 
-```python
-# these can also be set as environment variables
-vocode.setenv(
-    OPENAI_API_KEY="<your OpenAI key>",
-    DEEPGRAM_API_KEY="<your Deepgram key>",
-    AZURE_SPEECH_KEY="<your Azure key>",
-    AZURE_SPEECH_REGION="<your Azure region>",
-)
-```
+You can either set the following parameters as environment variables (e.g. by specifying them in a `.env` file and using a package like `python-dotenv` to load), or set them manually in the pydantic settings (see below).
 
 For AZURE_SPEECH_REGION you should use the URL format. For example, if you're using the "East US" region, the value should be "eastus". See [Azure Region list](https://learn.microsoft.com/en-us/azure/cognitive-services/speech-service/rest-text-to-speech?tabs=streaming#prebuilt-neural-voices).
 
 ## `StreamingConversation` example
 
+This can also be found in the [`quickstarts` directory](https://github.com/vocodedev/vocode-core/blob/main/quickstarts/streaming_conversation.py) of the repo.
+
 ```python
 import asyncio
 import signal
 
-import vocode
-from vocode.streaming.streaming_conversation import StreamingConversation
+from pydantic_settings import BaseSettings, SettingsConfigDict
+
 from vocode.helpers import create_streaming_microphone_input_and_speaker_output
-from vocode.streaming.models.transcriber import (
-    DeepgramTranscriberConfig,
-    PunctuationEndpointingConfig,
-)
+from vocode.logging import configure_pretty_logging
 from vocode.streaming.agent.chat_gpt_agent import ChatGPTAgent
 from vocode.streaming.models.agent import ChatGPTAgentConfig
 from vocode.streaming.models.message import BaseMessage
 from vocode.streaming.models.synthesizer import AzureSynthesizerConfig
+from vocode.streaming.models.transcriber import (
+    DeepgramTranscriberConfig,
+    PunctuationEndpointingConfig,
+)
+from vocode.streaming.streaming_conversation import StreamingConversation
 from vocode.streaming.synthesizer.azure_synthesizer import AzureSynthesizer
 from vocode.streaming.transcriber.deepgram_transcriber import DeepgramTranscriber
 
-# these can also be set as environment variables
-vocode.setenv(
-    OPENAI_API_KEY="<your OpenAI key>",
-    DEEPGRAM_API_KEY="<your Deepgram key>",
-    AZURE_SPEECH_KEY="<your Azure key>",
-    AZURE_SPEECH_REGION="<your Azure region>",
-)
+configure_pretty_logging()
+
+
+class Settings(BaseSettings):
+    """
+    Settings for the streaming conversation quickstart.
+    These parameters can be configured with environment variables.
+    """
+
+    openai_api_key: str = "ENTER_YOUR_OPENAI_API_KEY_HERE"
+    azure_speech_key: str = "ENTER_YOUR_AZURE_KEY_HERE"
+    deepgram_api_key: str = "ENTER_YOUR_DEEPGRAM_API_KEY_HERE"
+
+    azure_speech_region: str = "eastus"
+
+    # This means a .env file can be used to overload these settings
+    # ex: "OPENAI_API_KEY=my_key" will set openai_api_key over the default above
+    model_config = SettingsConfigDict(
+        env_file=".env",
+        env_file_encoding="utf-8",
+    )
+
+
+settings = Settings()
 
 
 async def main():
-    microphone_input, speaker_output = create_streaming_microphone_input_and_speaker_output(
-        use_default_devices=True,
+    (
+        microphone_input,
+        speaker_output,
+    ) = create_streaming_microphone_input_and_speaker_output(
+        use_default_devices=False,
+        use_blocking_speaker_output=True,  # this moves the playback to a separate thread, set to False to use the main thread
     )
 
     conversation = StreamingConversation(
         output_device=speaker_output,
         transcriber=DeepgramTranscriber(
             DeepgramTranscriberConfig.from_input_device(
-                microphone_input, endpointing_config=PunctuationEndpointingConfig()
-            )
+                microphone_input,
+                endpointing_config=PunctuationEndpointingConfig(),
+                api_key=settings.deepgram_api_key,
+            ),
         ),
         agent=ChatGPTAgent(
             ChatGPTAgentConfig(
-                initial_message=BaseMessage(text="Hello!"),
-                prompt_preamble="Have a pleasant conversation about life",
-            ),
+                openai_api_key=settings.openai_api_key,
+                initial_message=BaseMessage(text="What up"),
+                prompt_preamble="""The AI is having a pleasant conversation about life""",
+            )
         ),
         synthesizer=AzureSynthesizer(
-            AzureSynthesizerConfig.from_output_device(speaker_output)
+            AzureSynthesizerConfig.from_output_device(speaker_output),
+            azure_speech_key=settings.azure_speech_key,
+            azure_speech_region=settings.azure_speech_region,
        ),
     )
     await conversation.start()
     print("Conversation started, press Ctrl+C to end")
-    signal.signal(
-        signal.SIGINT, lambda _0, _1: asyncio.create_task(conversation.terminate())
-    )
+    signal.signal(signal.SIGINT, lambda _0, _1: asyncio.create_task(conversation.terminate()))
     while conversation.is_active():
         chunk = await microphone_input.get_audio()
         conversation.receive_audio(chunk)
````
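
The quickstart now reads credentials through `pydantic-settings` instead of `vocode.setenv`. A standalone sketch of how that override works, assuming pydantic-settings v2 is installed; the key value below is a placeholder:

```python
import os

from pydantic_settings import BaseSettings, SettingsConfigDict


class Settings(BaseSettings):
    openai_api_key: str = "ENTER_YOUR_OPENAI_API_KEY_HERE"

    # Values from a .env file or the process environment override the default above.
    model_config = SettingsConfigDict(env_file=".env", env_file_encoding="utf-8")


os.environ["OPENAI_API_KEY"] = "sk-placeholder"  # same effect as OPENAI_API_KEY=... in .env
print(Settings().openai_api_key)  # -> "sk-placeholder"
```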

docs/open-source/telephony.mdx (+22 −11)

````diff
@@ -117,8 +117,6 @@ Make sure the server we just set up is already running. Then, in `outbound_call.
 
 Replace the `to_phone` with the number you want to call and the `from_phone` with the number you want to call from. In order to make a call from the `from_phone`, you must have access to it via Twilio (either a number purchased via Twilio or verify the caller ID).
 
-> Note: To ensure legal compliance with robocall regulations in California, the following code snippet from the [Vocode library](https://github.com/vocodedev/vocode-python/blob/main/vocode/streaming/telephony/conversation/outbound_call.py#L83-L96) utilizes Twilio Line Intelligence to check if calls are made to mobile phones: For Canadian phone numbers, the Twilio Lookup API may not return carrier data due to the Canadian Local Number Portability Consortium (CLNPC) requirements. More information on this issue can be found in the [Twilio Support Article](https://support.twilio.com/hc/en-us/articles/360004563433-Twilio-Lookup-API-is-Not-Returning-Carrier-Data-for-Canadian-Phone-Numbers).
-
 Run the script with `poetry run python outbound_call.py`.
 
 ## Configuration
@@ -129,26 +127,39 @@ or `SynthesizerConfig` - the default transcriber is Deepgram and the default syn
 This example sets up an agent that spells every word that is sent to it - any text-in, text-out function can be turned into a voice conversation by subclassing `BaseAgent` and creating an `AgentFactory`.
 
 ```
+import typing
+from typing import Optional, Tuple
+
+from vocode.streaming.agent.abstract_factory import AbstractAgentFactory
+from vocode.streaming.agent.base_agent import BaseAgent, RespondAgent
+from vocode.streaming.agent.chat_gpt_agent import ChatGPTAgent
+from vocode.streaming.models.agent import AgentConfig, AgentType, ChatGPTAgentConfig
+
+
 class SpellerAgentConfig(AgentConfig, type="agent_speller"):
     pass
 
 
-class SpellerAgent(BaseAgent):
-    def __init__(self, agent_config: SpellerAgentConfig):
-        super().__init__(agent_config=agent_config)
-
+class SpellerAgent(RespondAgent[SpellerAgentConfig]):
     async def respond(
         self,
-        human_input,
+        human_input: str,
         conversation_id: str,
         is_interrupt: bool = False,
     ) -> Tuple[Optional[str], bool]:
         return "".join(c + " " for c in human_input), False
 
 
-class SpellerAgentFactory(AgentFactory):
+class SpellerAgentFactory(AbstractAgentFactory):
     def create_agent(self, agent_config: AgentConfig) -> BaseAgent:
-        return SpellerAgent(agent_config=agent_config)
+        # If the agent configuration type is CHAT_GPT, create a ChatGPTAgent.
+        if isinstance(agent_config, ChatGPTAgentConfig):
+            return ChatGPTAgent(agent_config=agent_config)
+        # If the agent configuration type is agent_speller, create a SpellerAgent.
+        elif isinstance(agent_config, SpellerAgentConfig):
+            return SpellerAgent(agent_config=agent_config)
+        # If the agent configuration type is not recognized, raise an exception.
+        raise Exception("Invalid agent config")
 ```
 
 An `AgentFactory` instance is passed into the `TelephonyServer` in `telephony_app.py`.
@@ -157,7 +168,7 @@ We provide a small set of agents with already created `AgentConfig`s, including,
 
 ### Accessing call information in your agent
 
-We store the `to` and `from` numbers in the [`ConfigManager`](https://github.com/vocodedev/vocode-python-sdk/blob/b37bf7a1172a917b641d0e70ba14756415e09b0b/apps/telephony_app/main.py#L20) - so
+We store the `to` and `from` numbers in the [`ConfigManager`](https://github.com/vocodedev/vocode-core/blob/53b01dab0b59f71961ee83dbcaf3653a6935c2e3/apps/telephony_app/main.py#L30) - so
 if you'd like to access them in your agent, you can instantiate the manager to hook into the same Redis instance:
 
 ```
@@ -168,7 +179,7 @@ class SpellerAgent(BaseAgent):
 
     async def respond(
         self,
-        human_input,
+        human_input: str,
         conversation_id: str,
         is_interrupt: bool = False,
     ) -> Tuple[Optional[str], bool]:
````
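
Since the docs frame any text-in, text-out function as a candidate for a `RespondAgent`, here is one more illustrative agent in the same shape as `SpellerAgent` above; the `agent_shouter` type string and class names are made up for this sketch and are not part of the commit:

```python
from typing import Optional, Tuple

from vocode.streaming.agent.base_agent import RespondAgent
from vocode.streaming.models.agent import AgentConfig


class ShoutingAgentConfig(AgentConfig, type="agent_shouter"):
    pass


class ShoutingAgent(RespondAgent[ShoutingAgentConfig]):
    async def respond(
        self,
        human_input: str,
        conversation_id: str,
        is_interrupt: bool = False,
    ) -> Tuple[Optional[str], bool]:
        # Upper-case the caller's words; the trailing False mirrors SpellerAgent's
        # second return value above.
        return human_input.upper(), False
```

To route calls to it, an `isinstance(agent_config, ShoutingAgentConfig)` branch would be added to the `SpellerAgentFactory.create_agent` dispatch shown earlier.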

docs/welcome.mdx (+3 −3)

```diff
@@ -26,7 +26,7 @@ operations flows. Get started with the [Hosted Service](/hosted-quickstart).
 ## Open Source Library
 
 Most of what we build is open source and free to use! Leverage all of the features mentioned in the introduction by taking a look
-at our open source repos. Get started with [Open Source](/open-source-quickstart).
+at our open source repos. Get started with [Open Source](/open-source-quickstarts).
 
 ## Quickstarts
 
@@ -35,9 +35,9 @@ at our open source repos. Get started with [Open Source](/open-source-quickstart
     Start using the hosted telephony service.
   </Card>
   <Card
-    title="Open Source Quick Start"
+    title="Open Source Quick Starts"
     icon="circle-play"
-    href="/open-source-quickstart"
+    href="/open-source-quickstarts"
   >
     Run Vocode self hosted.
   </Card>
```

playground/streaming/synthesizer/synthesize.py (+3 −2)

```diff
@@ -3,7 +3,7 @@
 from vocode.streaming.models.message import BaseMessage
 from vocode.streaming.models.synthesizer import AzureSynthesizerConfig
 from vocode.streaming.output_device.base_output_device import BaseOutputDevice
-from vocode.streaming.output_device.speaker_output import SpeakerOutput
+from vocode.streaming.output_device.blocking_speaker_output import BlockingSpeakerOutput
 from vocode.streaming.synthesizer.azure_synthesizer import AzureSynthesizer
 from vocode.streaming.synthesizer.base_synthesizer import BaseSynthesizer
 from vocode.streaming.utils import get_chunk_size_per_second
@@ -58,7 +58,8 @@ async def speak(
     return message_sent, cut_off
 
 async def main():
-    speaker_output = SpeakerOutput.from_default_device()
+    speaker_output = BlockingSpeakerOutput.from_default_device()
+    speaker_output.start()
     synthesizer = AzureSynthesizer(AzureSynthesizerConfig.from_output_device(speaker_output))
     try:
         while True:
```
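
Per the commit message, `SpeakerOutput` is being deprecated in favor of `BlockingSpeakerOutput`. A minimal migration sketch based only on the two changed lines above and the quickstart comment about playback moving to a separate thread:

```python
from vocode.streaming.output_device.blocking_speaker_output import BlockingSpeakerOutput

speaker_output = BlockingSpeakerOutput.from_default_device()
speaker_output.start()  # playback runs on its own thread, so start it before sending audio
```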

quickstarts/streaming_conversation.py (−1)

```diff
@@ -49,7 +49,6 @@ async def main():
         speaker_output,
     ) = create_streaming_microphone_input_and_speaker_output(
         use_default_devices=False,
-        use_blocking_speaker_output=True,  # this moves the playback to a separate thread, set to False to use the main thread
     )
 
     conversation = StreamingConversation(
```

tests/fakedata/conversation.py (+1)

```diff
@@ -26,6 +26,7 @@
 )
 
 DEFAULT_SYNTHESIZER_CONFIG = PlayHtSynthesizerConfig(
+    voice_id="test_voice_id",
     sampling_rate=DEFAULT_SAMPLING_RATE,
     audio_encoding=AudioEncoding.MULAW,
 )
```
