From f0d50c8984d64b61aa7676352eb5f209707de7d5 Mon Sep 17 00:00:00 2001 From: adnaans Date: Fri, 7 Jun 2024 11:53:46 -0700 Subject: [PATCH 01/10] playground docs updated and pyproject has nltk required --- docs/open-source/playground.mdx | 11 +++++++++-- pyproject.toml | 3 +-- 2 files changed, 10 insertions(+), 4 deletions(-) diff --git a/docs/open-source/playground.mdx b/docs/open-source/playground.mdx index 662e8f97e1..f6acb2da9f 100644 --- a/docs/open-source/playground.mdx +++ b/docs/open-source/playground.mdx @@ -8,6 +8,11 @@ test transcribers, agents, and synthesizers. To begin, clone the [repo](https://github.com/vocodedev/vocode-python). +Install the core packages by running the following: +``` +poetry install +``` + # Streaming ## Transcriber @@ -32,9 +37,11 @@ make chat ## Synthesizer -1. Update your synthesizer configuration in `playground/streaming/synthesizer/synthesize.py` +1. Install the synthesizer packages by running `poetry install --extras=synthesizers` + +2. Update your synthesizer configuration in `playground/streaming/synthesizer/synthesize.py` -2. Run the following script to synthesize text to speech and play it to your speaker: +3. Run the following script to synthesize text to speech and play it to your speaker: ``` make synthesize diff --git a/pyproject.toml b/pyproject.toml index fe9c297413..592d95b0d3 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -33,11 +33,11 @@ sounddevice = "^0.4.7" tiktoken = "^0.7.0" uvicorn = "^0.30.0" websockets = "^12.0" +nltk = "^3.8.1" # Synthesizers google-cloud-texttospeech = { version = "^2.16.3", optional = true } miniaudio = { version = "^1.59", optional = true } -nltk = { version = "^3.8.1", optional = true } pvkoala = { version = "^2.0.1", optional = true } pydub = { version = "^0.25.1", optional = true } @@ -79,7 +79,6 @@ pytest-mock = "^3.14.0" synthesizers = [ "google-cloud-texttospeech", "miniaudio", - "nltk", "pvkoala", "pydub", ] From 278c08e91f6ec8a20b45da21be4e08b71093a770 Mon Sep 17 00:00:00 2001 From: adnaans Date: Fri, 7 Jun 2024 12:27:16 -0700 Subject: [PATCH 02/10] update docs for turn based conversation and add quickstart to makefile --- Makefile | 3 + docs/open-source/turn-based-conversation.mdx | 73 ++++++++++---------- 2 files changed, 38 insertions(+), 38 deletions(-) diff --git a/Makefile b/Makefile index b58788087a..6e1833ce7f 100644 --- a/Makefile +++ b/Makefile @@ -9,6 +9,9 @@ transcribe: synthesize: poetry run python playground/streaming/synthesizer/synthesize.py +turn_based_conversation: + poetry run python quickstarts/turn_based_conversation.py + PYTHON_FILES=. lint: PYTHON_FILES=vocode/ quickstarts/ playground/ lint_diff typecheck_diff: PYTHON_FILES=$(shell git diff --name-only --diff-filter=d main | grep -E '\.py$$') diff --git a/docs/open-source/turn-based-conversation.mdx b/docs/open-source/turn-based-conversation.mdx index 18177ba936..d2e6a03e3d 100644 --- a/docs/open-source/turn-based-conversation.mdx +++ b/docs/open-source/turn-based-conversation.mdx @@ -9,55 +9,64 @@ A turn-based conversation is a communication system designed for applications wh This model differs from streaming conversations that try to mimic natural human discourse. Instead, it fits applications triggered by some kind of user input. For example, consider a voice memo application where the user records a message, and the agent generates a complete response. -A turn-based conversation system is perfect for applications that don't require real-time responses or constant back-and-forths. 
-This design reduces complexity and allows for a more controlled conversation flow. Each user input is treated as a discrete event,
+A turn-based conversation system is perfect for applications that don't require interruptions and have a controlled conversation flow. Each user input is treated as a discrete event,
 giving the system time to generate and deliver a full and meaningful response.

 ## Turn-based quickstart

-The code can be found [here](https://github.com/vocodedev/vocode-python/blob/main/quickstarts/turn_based_conversation.py)
+The example below demonstrates a turn-based conversation using a ChatGPT agent for text generation, WhisperTranscriber for speech-to-text,
+and AzureSynthesizer for text-to-speech. User interactions trigger the beginning and end of the recording, signaling the system when to listen and when to respond. You can run it with:
+```
+make turn_based_conversation
+```

-```python
-import logging
-from dotenv import load_dotenv
-from vocode import getenv
-from vocode.helpers import create_turn_based_microphone_input_and_speaker_output
-from vocode.turn_based.agent.chat_gpt_agent import ChatGPTAgent
-from vocode.turn_based.synthesizer.azure_synthesizer import AzureSynthesizer
-from vocode.turn_based.synthesizer.eleven_labs_synthesizer import ElevenLabsSynthesizer
-from vocode.turn_based.transcriber.whisper_transcriber import WhisperTranscriber
-from vocode.turn_based.turn_based_conversation import TurnBasedConversation
+*Remember to replace OPENAI_API_KEY and AZURE_SPEECH_KEY with your actual API keys and set the appropriate Azure region. These variables can also be set in a `.env` file and sourced in your terminal.
+You can also customize the voice, system prompt, and initial message as needed. The code can be found [here](https://github.com/vocodedev/vocode-python/blob/main/quickstarts/turn_based_conversation.py).*

-logging.basicConfig()
-logger = logging.getLogger(__name__)
-logger.setLevel(logging.INFO)
+```python
+class Settings(BaseSettings):
+    """
+    Settings for the turn-based conversation quickstart.
+    These parameters can be configured with environment variables.
+ """ + + openai_api_key: str = "ENTER_YOUR_OPENAI_API_KEY_HERE" + azure_speech_key: str = "ENTER_YOUR_AZURE_KEY_HERE" + + azure_speech_region: str = "eastus" + + # This means a .env file can be used to overload these settings + # ex: "OPENAI_API_KEY=my_key" will set openai_api_key over the default above + model_config = SettingsConfigDict( + env_file=".env", + env_file_encoding="utf-8", + ) -load_dotenv() -# See https://api.elevenlabs.io/v1/voices -ADAM_VOICE_ID = "pNInz6obpgDQGcFmaJgB" +settings = Settings() if __name__ == "__main__": ( microphone_input, speaker_output, - ) = create_turn_based_microphone_input_and_speaker_output(use_default_devices=False) + ) = create_turn_based_microphone_input_and_speaker_output( + use_default_devices=False, + ) conversation = TurnBasedConversation( input_device=microphone_input, output_device=speaker_output, - transcriber=WhisperTranscriber(api_key=getenv("OPENAI_API_KEY")), + transcriber=WhisperTranscriber(api_key=settings.openai_api_key), agent=ChatGPTAgent( system_prompt="The AI is having a pleasant conversation about life", initial_message="Hello!", - api_key=getenv("OPENAI_API_KEY"), + api_key=settings.openai_api_key, ), synthesizer=AzureSynthesizer( - api_key=getenv("AZURE_SPEECH_KEY"), - region=getenv("AZURE_SPEECH_REGION"), + api_key=settings.azure_speech_key, + region=settings.azure_speech_region, voice_name="en-US-SteffanNeural", ), - logger=logger, ) print("Starting conversation. Press Ctrl+C to exit.") while True: @@ -68,16 +77,4 @@ if __name__ == "__main__": conversation.end_speech_and_respond() except KeyboardInterrupt: break -``` - -This example demonstrates a turn-based conversation, using a ChatGPT agent for text generation, WhisperTranscriber for speech-to-text, -and AzureSynthesizer for text-to-speech. User interactions trigger the beginning and end of the recording, signaling the system when to listen and when to respond. - -Remember to replace OPENAI_API_KEY and AZURE_SPEECH_KEY with your actual API keys and set the appropriate Azure region. -You can also customize the voice, system prompt, and initial message as needed. - -## React turn-based quickstart - -🚧 Under construction - -If you want to work on a sample react app for this, reach out to us! +``` \ No newline at end of file From b27d169ef65a31cca7362943a489abb4bc3df104 Mon Sep 17 00:00:00 2001 From: adnaans Date: Fri, 7 Jun 2024 12:57:22 -0700 Subject: [PATCH 03/10] updates to language support docs --- docs/open-source/language-support.mdx | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/docs/open-source/language-support.mdx b/docs/open-source/language-support.mdx index 694887fb62..004c5f089f 100644 --- a/docs/open-source/language-support.mdx +++ b/docs/open-source/language-support.mdx @@ -22,11 +22,9 @@ synthesizer_config = AzureSynthesizerConfig( ) ``` -See the [full list of supported voices](https://docs.microsoft.com/en-us/azure/cognitive-services/speech-service/language-support?tabs=tts). - ## Transcription -The transcriber used in vocode is also configurable. By default, `DeepgramTranscriber` is used which supports [over 25 languages](https://developers.deepgram.com/docs/languages-overview). +The transcriber used in vocode is also configurable. By default, `DeepgramTranscriber` is used which supports [over 35 languages](https://developers.deepgram.com/docs/languages-overview). 

To configure a different language model, modify the language code passed to `TranscriberConfig` when initializing
the config object (`en-US` is the default):

```python
from vocode.streaming.models.transcriber import DeepgramTranscriberConfig

transcriber_config = DeepgramTranscriberConfig(
-    language="es" # Spanish
+    language="es",  # Spanish
+    model="nova-2"  # Most languages are supported on the Nova 2 model
)
```

-See the [Deepgram docs](https://developers.deepgram.com/docs/languages-overview) for the list of supported lamguages.
+***Note: Deepgram defaults to its Nova model, so you must explicitly pass `model="nova-2"` to use Nova-2.***

Other transcription services like Google Cloud Speech or Assembly AI could also be used by configuring the appropriate `TranscriberConfig`.

From 404227c06c1bcb6236a19208bff9157104386335 Mon Sep 17 00:00:00 2001
From: adnaans
Date: Fri, 7 Jun 2024 12:58:18 -0700
Subject: [PATCH 04/10] add streaming conversation quickstart to makefile

---
 Makefile | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/Makefile b/Makefile
index 6e1833ce7f..c7dcc8ccfc 100644
--- a/Makefile
+++ b/Makefile
@@ -12,6 +12,9 @@ synthesize:
 turn_based_conversation:
 	poetry run python quickstarts/turn_based_conversation.py

+streaming_conversation:
+	poetry run python quickstarts/streaming_conversation.py
+
 PYTHON_FILES=.
 lint: PYTHON_FILES=vocode/ quickstarts/ playground/
 lint_diff typecheck_diff: PYTHON_FILES=$(shell git diff --name-only --diff-filter=d main | grep -E '\.py$$')

From 127b6d792fe81eea8214356dbcbb74953bc19598 Mon Sep 17 00:00:00 2001
From: adnaans
Date: Fri, 7 Jun 2024 13:26:17 -0700
Subject: [PATCH 05/10] removed benchmark and tracing

---
 docs/open-source/tracing.mdx      |  95 ------
 playground/streaming/benchmark.py | 535 ------
 2 files changed, 630 deletions(-)
 delete mode 100644 docs/open-source/tracing.mdx
 delete mode 100644 playground/streaming/benchmark.py

diff --git a/docs/open-source/tracing.mdx b/docs/open-source/tracing.mdx
deleted file mode 100644
index 537e328857..0000000000
--- a/docs/open-source/tracing.mdx
+++ /dev/null
@@ -1,95 +0,0 @@
----
-title: "Tracing"
-description: "Time components of your Vocode conversations"
----
-
-# [Beta] Benchmarking script
-
-The benchmarking script is located at `playground/streaming/benchmark.py`. You can execute the benchmarking script using the CLI which will enable you to evaluate and compare
-transcribers, agents, and synthesizers. You can use it primarily to benchmark latency – but it can also be used to compare the quality of the different providers as well. The
-feature is in Beta and will continue to be improved upon – feel free to open an issue with any ideas.
-
-### Using the CLI
-
-To access the options of the benchmarking script, run
-
-```bash
-python playground/streaming/benchmark.py --help
-```
-
-This will display all available options.
-
-To conduct multiple trials and get averaged results, you can control `num_cycles`
-
-```bash
----{transcriber,agent,synthesizer}_num_cycles 3 # component specific
----all_num_cycles 3 # all components
-```
-
-To perform a comprehensive test across all supported transcribers, agents, and synthesizers, use the `--all` command.
-
-With the CLI, you can get the raw output, write them to a file, and create graphs.
-To access your results and visualize them, they will be stored in the `benchmark_results` directory by default. You can also change this location using the `--results_dir` and `--results_file` options.
If you want to create visual graphs, add the `--create_graphs` option when running your test. - -#### Example: comparing synthesizers - -To compare different synthesizers, use the `--synthesizers` flag followed by the names of the synthesizers you wish to compare. For instance, - -```bash -python playground/streaming/benchmark.py --synthesizers Google Azure --synthesizer_text "Your text here" -``` - -#### Example: comparing transcribers - -To compare different transcribers, you can use the `--transcribers` flag followed by the names of the transcribers you wish to compare. For example, - -```bash -python playground/streaming/benchmark.py --transcribers deepgram assemblyai --transcriber_audio sample.wav -``` - -You can specify `transcriber_use_mic` instead of `--transcriber_audio` to use your microphone as the audio source. - -#### Example: comparing agents - -To compare different agents, use the `--agents` flag followed by the names of the agents you want to compare. For example, - -```bash -python playground/streaming/benchmark.py --agents openai anthropic -``` - -You can set the prompt preamble with the `--agent_prompt_preamble` argument and the first input with the `--agent_first_input` option. - -# Tracing your application - -At the top of `quickstarts/streaming_conversation.py`, include the following code: - -```python -from opentelemetry import trace -from opentelemetry.sdk.trace import TracerProvider -from opentelemetry.sdk.trace.export import SimpleSpanProcessor, SpanExporter -from opentelemetry.sdk.resources import Resource - - -class PrintDurationSpanExporter(SpanExporter): - def __init__(self): - super().__init__() - self.spans = defaultdict(list) - - def export(self, spans): - for span in spans: - duration_ns = span.end_time - span.start_time - duration_s = duration_ns / 1e9 - self.spans[span.name].append(duration_s) - - def shutdown(self): - for name, durations in self.spans.items(): - print(f"{name}: {sum(durations) / len(durations)}") - - -trace.set_tracer_provider(TracerProvider(resource=Resource.create({}))) -trace.get_tracer_provider().add_span_processor( - SimpleSpanProcessor(PrintDurationSpanExporter()) -) -``` - -This will print out stats about the conversation after it ends. 
diff --git a/playground/streaming/benchmark.py b/playground/streaming/benchmark.py deleted file mode 100644 index 26b2dbf972..0000000000 --- a/playground/streaming/benchmark.py +++ /dev/null @@ -1,535 +0,0 @@ -raise DeprecationWarning("This playground script is deprecated and will be removed in the future.") - -import argparse -import asyncio -import json -import os -from collections import defaultdict - -import sounddevice as sd -from loguru import logger -from opentelemetry import metrics, trace -from opentelemetry.sdk.metrics import MeterProvider -from opentelemetry.sdk.metrics.export import InMemoryMetricReader -from opentelemetry.sdk.resources import Resource -from opentelemetry.sdk.trace import TracerProvider -from opentelemetry.sdk.trace.export import SimpleSpanProcessor -from opentelemetry.sdk.trace.export.in_memory_span_exporter import InMemorySpanExporter -from playground.streaming.tracing_utils import get_final_metrics -from tqdm import tqdm - -from vocode.streaming.agent import ChatGPTAgent -from vocode.streaming.agent.base_agent import TranscriptionAgentInput -from vocode.streaming.input_device.file_input_device import FileInputDevice -from vocode.streaming.input_device.microphone_input import MicrophoneInput -from vocode.streaming.models.agent import AzureOpenAIConfig, ChatGPTAgentConfig -from vocode.streaming.models.message import BaseMessage -from vocode.streaming.models.synthesizer import ( # BarkSynthesizerConfig,; CoquiSynthesizerConfig,; CoquiTTSSynthesizerConfig, - AzureSynthesizerConfig, - ElevenLabsSynthesizerConfig, - PlayHtSynthesizerConfig, - RimeSynthesizerConfig, -) -from vocode.streaming.models.transcriber import ( - AssemblyAITranscriberConfig, - DeepgramTranscriberConfig, - PunctuationEndpointingConfig, - Transcription, -) -from vocode.streaming.models.transcript import Transcript -from vocode.streaming.output_device.file_output_device import FileOutputDevice -from vocode.streaming.synthesizer.azure_synthesizer import AzureSynthesizer -from vocode.streaming.synthesizer.eleven_labs_websocket_synthesizer import ElevenLabsWSSynthesizer -from vocode.streaming.synthesizer.elevenlabs_synthesizer import ElevenLabsSynthesizer -from vocode.streaming.synthesizer.play_ht_synthesizer import PlayHtSynthesizer -from vocode.streaming.synthesizer.play_ht_synthesizer_v2 import PlayHtSynthesizerV2 -from vocode.streaming.synthesizer.rime_synthesizer import RimeSynthesizer -from vocode.streaming.transcriber.assembly_ai_transcriber import AssemblyAITranscriber -from vocode.streaming.transcriber.deepgram_transcriber import DeepgramTranscriber -from vocode.streaming.utils import get_chunk_size_per_second, remove_non_letters_digits - -tracer = trace.get_tracer(__name__) -meter = metrics.get_meter(__name__) - - -# Create the parser -parser = argparse.ArgumentParser( - description="Benchmark Vocode's transcribers, agents, and synthesizers.\n" - + "Example usage: python playground/streaming/benchmark.py --all --all_num_cycles 3 --create_graphs", -) - -synthesizer_classes = { - "elevenlabs": (ElevenLabsSynthesizer, ElevenLabsSynthesizerConfig), - "elevenlabsws": (ElevenLabsWSSynthesizer, ElevenLabsSynthesizerConfig), - "azure": (AzureSynthesizer, AzureSynthesizerConfig), - # "bark": (BarkSynthesizer, BarkSynthesizerConfig), - # "coqui": (CoquiSynthesizer, CoquiSynthesizerConfig), - # "coquitts": (CoquiTTSSynthesizer, CoquiTTSSynthesizerConfig), - # "google": (GoogleSynthesizer, GoogleSynthesizerConfig), - # "gtts": (GTTSSynthesizer, GTTSSynthesizerConfig), - "playht": 
(PlayHtSynthesizer, PlayHtSynthesizerConfig), - "playht2": (PlayHtSynthesizerV2, PlayHtSynthesizerConfig), - "rime": (RimeSynthesizer, RimeSynthesizerConfig), - # "streamelements": (StreamElementsSynthesizer, StreamElementsSynthesizerConfig), -} - - -# These synthesizers stream output so they need to be traced within this file. -STREAMING_SYNTHESIZERS = ["azure", "elevenlabs", "playht2", "elevenlabsws"] - - -TRANSCRIBER_CHOICES = ["deepgram", "assemblyai"] -AGENT_CHOICES = [ - "gpt_gpt-3.5-turbo", - "gpt_gpt-4", - "azuregpt_gpt-35-turbo", -] -SYNTHESIZER_CHOICES = list(synthesizer_classes) - -parser.add_argument( - "--transcribers", - type=str, - nargs="*", - default=[], - choices=TRANSCRIBER_CHOICES + ["all"], - help="The list of transcribers to benchmark", -) -parser.add_argument( - "--agents", - type=str, - nargs="*", - default=[], - choices=AGENT_CHOICES + ["all"], - help="The list of agents to benchmark. Each agent should be of the form _.", -) -parser.add_argument( - "--synthesizers", - type=str, - nargs="*", - default=[], - choices=SYNTHESIZER_CHOICES + ["all"], - help="The list of synthesizers to benchmark", -) -parser.add_argument( - "--transcriber_audio", - type=str, - default=f"{os.path.join(os.path.dirname(os.path.realpath(__file__)), 'test.wav')}", - help="Path to the audio file to transcribe", -) -parser.add_argument( - "--transcriber_use_mic", - action="store_true", - help="Use the microphone as the input device for the transcriber. " - + "Overrides --transcriber_audio. Be silent for ≈5 seconds to end transcription.", -) -parser.add_argument( - "--synthesizer_text", - type=str, - default="Alice was beginning to get very tired of sitting by her sister on the bank, and of having nothing to do: once or twice she had peeped into the book her sister was reading, but it had no pictures or conversations in it, “and what is the use of a book,” thought Alice “without pictures or conversations?”", - help="The text for synthesizers to synthesize", -) -parser.add_argument( - "--agent_prompt_preamble", - type=str, - default="The AI is having a very short and pleasant conversation about life", - help="The prompt preamble to use for the agent", -) -parser.add_argument( - "--agent_first_input", - type=str, - default="What is the meaning of life?", - help="The initial message sent to the agent (this is a transcribed sentence that the agent should respond to).", -) -parser.add_argument( - "--no_generate_responses", - action="store_true", - help="Disable streaming generated responses for agents", -) -parser.add_argument( - "--transcriber_num_cycles", - type=int, - default=1, - help="The number of transcriber runs to perform. Results are averaged over the runs.", -) -parser.add_argument( - "--synthesizer_num_cycles", - type=int, - default=1, - help="The number of synthesizer runs to perform. Results are averaged over the runs.", -) -parser.add_argument( - "--all_num_cycles", - type=int, - default=None, - help="The number of transcriber, agent, and synthesizer runs to perform. Overrides all other num_cycle arguments.", -) -parser.add_argument( - "--agent_num_cycles", - type=int, - default=1, - help="The number of agent runs to perform. Results are averaged over the runs.", -) -parser.add_argument( - "--all", - action="store_true", - help="Run all supported transcribers, agents, and synthesizers. Ignores other arguments.", -) -parser.add_argument( - "--create_graphs", - action="store_true", - help="Create graphs from the benchmark results. 
Requires matplotlib.", -) -parser.add_argument( - "--just_graphs", - action="store_true", - help="Skips computing statistics. Loads the last saved benchmark result " - + "JSON file and creates graphs from it.", -) -parser.add_argument( - "--results_file", - type=str, - default="benchmark_results.json", - help="The file to save the benchmark JSON results to", -) -parser.add_argument( - "--results_dir", - type=str, - default="benchmark_results", - help="The directory to save the text-to-speech output and JSON results to", -) -args = parser.parse_args() -if args.all: - print("--all is set! Running all supported transcribers, agents, and synthesizers.") - args.transcribers = TRANSCRIBER_CHOICES - args.agents = AGENT_CHOICES - args.synthesizers = SYNTHESIZER_CHOICES - -if "all" in args.transcribers: - args.transcribers = TRANSCRIBER_CHOICES -if "all" in args.agents: - args.agents = AGENT_CHOICES -if "all" in args.synthesizers: - args.synthesizers = SYNTHESIZER_CHOICES - -if args.all_num_cycles is not None: - args.transcriber_num_cycles = args.all_num_cycles - args.agent_num_cycles = args.all_num_cycles - args.synthesizer_num_cycles = args.all_num_cycles - -if args.create_graphs or args.just_graphs: - try: - import matplotlib.pyplot as plt - except ImportError: - print( - "ERROR: The --create_graphs flag requires matplotlib. Please " - + "install matplotlib and try again.", - ) - exit(1) - -if args.just_graphs: - print( - "--just_graphs is set! Skipping computing statistics and instead " - + "generating graphs from the last saved benchmark result JSON file.", - ) - -should_generate_responses = not args.no_generate_responses - -os.makedirs(args.results_dir, exist_ok=True) - - -def get_transcriber(transcriber_name, file_input): - if transcriber_name == "deepgram": - transcriber = DeepgramTranscriber( - DeepgramTranscriberConfig.from_input_device( - file_input, - endpointing_config=PunctuationEndpointingConfig(), - ), - ) - elif transcriber_name == "assemblyai": - transcriber = AssemblyAITranscriber( - AssemblyAITranscriberConfig.from_input_device( - file_input, - ), - ) - return transcriber - - -trace.set_tracer_provider(TracerProvider(resource=Resource.create({}))) -span_exporter = InMemorySpanExporter() -trace.get_tracer_provider().add_span_processor(SimpleSpanProcessor(span_exporter)) # type: ignore - -reader = InMemoryMetricReader() -provider = MeterProvider(metric_readers=[reader]) -metrics.set_meter_provider(provider) - - -async def run_agents(): - for agent_name in tqdm(args.agents, desc="Agents"): - company, model_name = agent_name.rsplit("_", 1) - length_meter = meter.create_counter( - remove_non_letters_digits(f"agent.agent_chat_{company}-{model_name}.total_characters"), - ) - for _ in tqdm(range(args.agent_num_cycles), desc="Agent Cycles"): - if company == "gpt": - agent = ChatGPTAgent( - ChatGPTAgentConfig( - initial_message=None, - prompt_preamble=args.agent_prompt_preamble, - allow_agent_to_be_cut_off=False, - model_name=model_name, - generate_responses=should_generate_responses, - ), - ) - elif company == "azuregpt": - agent = ChatGPTAgent( - ChatGPTAgentConfig( - initial_message=None, - prompt_preamble=args.agent_prompt_preamble, - allow_agent_to_be_cut_off=False, - azure_params=AzureOpenAIConfig(deployment_name=model_name), - generate_responses=should_generate_responses, - ), - ) - agent.attach_transcript(Transcript()) - agent_task = agent.start() # noqa: F841 - message = TranscriptionAgentInput( - transcription=Transcription( - message=args.agent_first_input, - confidence=1.0, - 
is_final=True, - ), - conversation_id=0, - ) - agent.consume_nonblocking( - agent.interruptible_event_factory.create_interruptible_event(message), - ) - - while True: - try: - message = await asyncio.wait_for(agent.output_queue.get(), timeout=15) - if isinstance(message.payload.message, BaseMessage): - length_meter.add(len(message.payload.message.text)) - logger.debug( - f"[Agent: {agent_name}] Response from API: {message.payload.message.text}", - ) - except asyncio.TimeoutError: - logger.debug(f"[Agent: {agent_name}] Agent queue is empty, stopping...") - break - - -async def run_synthesizers(): - def create_file_output_device(synthesizer_name, extra_info=""): - return FileOutputDevice( - os.path.join(args.results_dir, f"{synthesizer_name}{extra_info}.wav"), - ) - - for synthesizer_name in args.synthesizers: - file_output = create_file_output_device(synthesizer_name) - synthesizer_class, synthesizer_config_class = synthesizer_classes[synthesizer_name] - extra_config = {} - if synthesizer_name == "playht": - extra_config["voice_id"] = "larry" - elif synthesizer_name == "rime": - extra_config["speaker"] = "young_male-1" - elif synthesizer_name == "playht2": - extra_config["voice_id"] = ( - "s3://voice-cloning-zero-shot/d9ff78ba-d016-47f6-b0ef-dd630f59414e/female-cs/manifest.json" - ) - extra_config["version"] = "2" - elif synthesizer_name == "elevenlabs": - extra_config["experimental_streaming"] = True - elif synthesizer_name == "elevenlabsws": - extra_config["experimental_websocket"] = True - config = synthesizer_config_class.from_output_device(file_output, **extra_config) - synthesizer = synthesizer_class(config) - - chunk_size = 0.1 * get_chunk_size_per_second( - synthesizer.get_synthesizer_config().audio_encoding, - synthesizer.get_synthesizer_config().sampling_rate, - ) - - current_synthesizer_is_streaming = synthesizer_name in STREAMING_SYNTHESIZERS - for _ in tqdm( - range(args.synthesizer_num_cycles), - desc=f"Synthesizer Cycles ({synthesizer_name})", - ): - if current_synthesizer_is_streaming: - total_synthesis_span = tracer.start_span( - f"synthesizer.{synthesizer_name}.create_total", - ) - first_synthesis_span = tracer.start_span( - f"synthesizer.{synthesizer_name}.create_first", - ) - - try: - synthesis_result = await synthesizer.create_speech_uncached( - message=BaseMessage(text=args.synthesizer_text), - chunk_size=int(chunk_size), - ) - except asyncio.TimeoutError: - logger.error( - f"[Synthesizer: {synthesizer_name}] Timed out while synthesizing. Skipping {synthesizer_name}...", - ) - continue - except Exception as e: - logger.error( - f"[Synthesizer: {synthesizer_name}] Exception while synthesizing: {e}. 
Skipping {synthesizer_name}...", - ) - continue - chunk_generator = synthesis_result.chunk_generator - - with tqdm(desc=f"{synthesizer_name.title()} Synthesizing") as pbar: - first_chunk = True - while True: - pbar.update(1) - try: - chunk_result = await chunk_generator.__anext__() - if current_synthesizer_is_streaming and first_chunk: - first_chunk = False - first_synthesis_span.end() - file_output.consume_nonblocking(chunk_result.chunk) - except StopAsyncIteration: - break - if chunk_result.is_last_chunk: - break - - if current_synthesizer_is_streaming: - total_synthesis_span.end() - - await synthesizer.tear_down() - - -async def run_transcribers(): - sample_rate = 44100 - chunk_size = 2048 - sleep_time = chunk_size / sample_rate - if args.transcriber_use_mic: - input_device_info = sd.query_devices(kind="input") - input_device = MicrophoneInput(input_device_info) - else: - input_device = FileInputDevice( - args.transcriber_audio, - chunk_size=chunk_size, - silent_duration=0.01, - skip_initial_load=True, - ) - - for transcriber_cycle_idx in tqdm( - range(args.transcriber_num_cycles), - desc="Transcriber Cycles", - ): - for transcriber_name in tqdm(args.transcribers, desc="Transcribers"): - transcriber = get_transcriber(transcriber_name, input_device) - if not args.transcriber_use_mic: - input_device.load() - transcriber_task = transcriber.start() # noqa: F841 - - if args.transcriber_use_mic: - - async def record_audio_task(): - while True: - chunk = await input_device.get_audio() - transcriber.send_audio(chunk) - - send_audio = asyncio.create_task(record_audio_task()) - else: - - async def send_audio_task(): - while not input_device.is_done(): - chunk = await input_device.get_audio() - transcriber.send_audio(chunk) - await asyncio.sleep(sleep_time) - - send_audio = asyncio.create_task(send_audio_task()) - - # `get` from `transcriber.output_queue` until it's empty for 5 seconds - pbar = tqdm( - desc=f"{transcriber_name.title()} Transcribing", - total=input_device.duration if not args.transcriber_use_mic else None, - unit="chunk", - ) - while True: - try: - transcription = await asyncio.wait_for( # noqa: F841 - transcriber.output_queue.get(), - timeout=5, - ) - # update the progress bar status - pbar.update(round(transcriber.audio_cursor - pbar.n, 2)) - except asyncio.TimeoutError: - logger.debug( - f"[Transcriber: {transcriber_name}] Transcriber queue is empty, stopping transcription...", - ) - send_audio.cancel() - break - if not args.transcriber_use_mic: - pbar.update(pbar.total - pbar.n) - transcriber.terminate() - - -def create_graphs(final_results): - logger.info("Creating graphs from benchmark results...") - results_split = [] - for name, value in final_results.items(): - first_name = name.split(".", 1) - second_name = first_name[1].rsplit(".", 1) - results_split.append((first_name[0], *second_name, value)) - - graph_data = defaultdict(lambda: defaultdict(list)) - for category, name, metric, value in results_split: - graph_data[f"{category} - {metric}"]["labels"].append(name) - graph_data[f"{category} - {metric}"]["values"].append(value) - - graph_dir = os.path.join(args.results_dir, "graphs") - os.makedirs(graph_dir, exist_ok=True) - - for graph_title, data in graph_data.items(): - plt.title(graph_title) - plt.bar(data["labels"], data["values"]) - plt.xticks(rotation=45) - plt.tight_layout() - plt.savefig(os.path.join(graph_dir, f"{graph_title}.png")) - plt.clf() - - -async def main(): - result_file_path = os.path.join(args.results_dir, args.results_file) - if not args.just_graphs: 
- if args.agents: - await run_agents() - if args.transcribers: - await run_transcribers() - if args.synthesizers: - await run_synthesizers() - - trace_results = span_exporter.get_finished_spans() - final_spans = defaultdict(list) - for span in trace_results: - duration_ns = span.end_time - span.start_time - duration_s = duration_ns / 1e9 - final_spans[span.name].append(duration_s) - - scope_metrics = reader.get_metrics_data().resource_metrics[0].scope_metrics - final_metrics = get_final_metrics(scope_metrics, final_spans=final_spans) - - final_spans = {k: sum(v) / len(v) for k, v in final_spans.items() if len(v) > 0} - if len(scope_metrics) > 0: - final_results = {**final_spans, **final_metrics} - else: - final_results = final_spans - print(json.dumps(final_results, indent=4)) - if args.results_file: - with open(result_file_path, "w") as f: - json.dump(final_results, f, indent=4) - else: - with open(result_file_path, "r") as f: - final_results = json.load(f) - - if args.create_graphs or args.just_graphs: - create_graphs(final_results) - - print("Benchmarking complete!") - - -if __name__ == "__main__": - asyncio.run(main()) From 8f8d1ade2abcfb4dd57702a43ecc7983d1bec913 Mon Sep 17 00:00:00 2001 From: adnaans Date: Fri, 7 Jun 2024 15:13:38 -0700 Subject: [PATCH 06/10] updated documentation for agent factory --- docs/open-source/agent-factory.mdx | 18 ++++++++++-------- 1 file changed, 10 insertions(+), 8 deletions(-) diff --git a/docs/open-source/agent-factory.mdx b/docs/open-source/agent-factory.mdx index 7b0c25d7c5..9332c772eb 100644 --- a/docs/open-source/agent-factory.mdx +++ b/docs/open-source/agent-factory.mdx @@ -5,18 +5,17 @@ description: 'How to link a custom agent to your app' # Agent Factories -Agent factories specify which agents are available to your app. In order to connect an agent to your app, you must first define an agent factory. To do so, subclass the [`AgentFactory`](https://github.com/vocodedev/vocode-python/blob/main/vocode/streaming/agent/factory.py) class to specify how agents are created. Here you can import and use your own custom agents. +Agent factories specify which agents are available to your app. In order to connect an agent to your app, you must first define an agent factory. To do so, subclass the [`AbstractAgentFactory`](https://github.com/vocodedev/vocode-python/blob/main/vocode/streaming/agent/abstract_factory.py) class to specify how agents are created. Here you can import and use your own custom agents. ## Example -First define your `AgentFactory`: +First define your `AgentFactory`. In this example, we are creating a factory for a new type of agent called MyActionAgent: ```python -from vocode.streaming.agent.factory import AgentFactory +from vocode.streaming.agent.abstract_factory import AbstractAgentFactory -class MyAgentFactory(AgentFactory): - def __init__(self, agent_config: AgentConfig, action_factory: MyActionFactory): - self.agent_config = agent_config +class MyAgentFactory(AbstractAgentFactory): + def __init__(self, action_factory: MyActionFactory): self.action_factory = action_factory def create_agent( @@ -24,10 +23,13 @@ class MyAgentFactory(AgentFactory): ) -> BaseAgent: if agent_config.type == "MY_ACTION": return MyActionAgent( - agent_config=typing.cast(ActionAgentConfig, self.agent_config), + agent_config=agent_config, action_factory=self.action_factory ) - raise Exception("Invalid agent config") + elif agent_config.type == "other_agent_type": + ... 
+ else: + raise Exception("Invalid agent config") ``` Then, in your app, you can connect the agent to the app: From c5f697d8ebd5b7af43228fa977942bbd9998503c Mon Sep 17 00:00:00 2001 From: adnaans Date: Fri, 7 Jun 2024 15:14:35 -0700 Subject: [PATCH 07/10] update navbar --- docs/mint.json | 13 +++++++++---- 1 file changed, 9 insertions(+), 4 deletions(-) diff --git a/docs/mint.json b/docs/mint.json index bd97c3dbd9..5b56d3989f 100644 --- a/docs/mint.json +++ b/docs/mint.json @@ -49,7 +49,11 @@ "navigation": [ { "group": "Getting Started", - "pages": ["welcome", "hosted-quickstart", "open-source-quickstart"] + "pages": [ + "welcome", + "hosted-quickstart", + "open-source-quickstart" + ] }, { "group": "Vocode 101", @@ -75,7 +79,6 @@ "open-source/playground", "open-source/turn-based-conversation", "open-source/language-support", - "open-source/tracing", "open-source/agent-factory" ] }, @@ -109,7 +112,9 @@ }, { "group": "Usage", - "pages": ["api-reference/usage/get-usage"] + "pages": [ + "api-reference/usage/get-usage" + ] }, { "group": "Actions", @@ -223,4 +228,4 @@ "twitter": "https://twitter.com/vocodehq", "website": "https://www.vocode.dev/" } -} +} \ No newline at end of file From 7c10440198c5b944a567f9b35632f1b327d24a40 Mon Sep 17 00:00:00 2001 From: adnaans Date: Mon, 10 Jun 2024 12:24:45 -0700 Subject: [PATCH 08/10] update agent factory telephony server code --- docs/open-source/agent-factory.mdx | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/docs/open-source/agent-factory.mdx b/docs/open-source/agent-factory.mdx index 9332c772eb..553de9f7f0 100644 --- a/docs/open-source/agent-factory.mdx +++ b/docs/open-source/agent-factory.mdx @@ -35,10 +35,11 @@ class MyAgentFactory(AbstractAgentFactory): Then, in your app, you can connect the agent to the app: ```python +from vocode.streaming.telephony.server.base import TelephonyServer +from vocode.streaming.agent.my_agent_factory import MyAgentFactory telephony_server = TelephonyServer( - agent_factory=MyAgentFactory( - agent_config=agent_config, action_factory=action_factory), + agent_factory=MyAgentFactory() ... ) ``` From 9f1c1bff6544ff5a772f31fdf567f7b5a70c8616 Mon Sep 17 00:00:00 2001 From: adnaans Date: Mon, 10 Jun 2024 16:25:23 -0700 Subject: [PATCH 09/10] updated action factory in agent factory init --- docs/open-source/agent-factory.mdx | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/open-source/agent-factory.mdx b/docs/open-source/agent-factory.mdx index 553de9f7f0..c13a3f77d7 100644 --- a/docs/open-source/agent-factory.mdx +++ b/docs/open-source/agent-factory.mdx @@ -39,7 +39,7 @@ from vocode.streaming.telephony.server.base import TelephonyServer from vocode.streaming.agent.my_agent_factory import MyAgentFactory telephony_server = TelephonyServer( - agent_factory=MyAgentFactory() + agent_factory=MyAgentFactory(action_factory=action_factory) ... ) ``` From 109f58f202d35e10dab76f6df0b335a44776b08a Mon Sep 17 00:00:00 2001 From: adnaans Date: Mon, 10 Jun 2024 16:28:19 -0700 Subject: [PATCH 10/10] add init for action factory --- docs/open-source/agent-factory.mdx | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/docs/open-source/agent-factory.mdx b/docs/open-source/agent-factory.mdx index c13a3f77d7..58b17af184 100644 --- a/docs/open-source/agent-factory.mdx +++ b/docs/open-source/agent-factory.mdx @@ -13,6 +13,7 @@ First define your `AgentFactory`. 
In this example, we are creating a factory for ```python from vocode.streaming.agent.abstract_factory import AbstractAgentFactory +from vocode.streaming.action.my_action_factory import MyActionFactory class MyAgentFactory(AbstractAgentFactory): def __init__(self, action_factory: MyActionFactory): @@ -37,9 +38,10 @@ Then, in your app, you can connect the agent to the app: ```python from vocode.streaming.telephony.server.base import TelephonyServer from vocode.streaming.agent.my_agent_factory import MyAgentFactory +from vocode.streaming.action.my_action_factory import MyActionFactory telephony_server = TelephonyServer( - agent_factory=MyAgentFactory(action_factory=action_factory) + agent_factory=MyAgentFactory(action_factory=MyActionFactory()) ... ) ```
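
After the full series is applied, `agent-factory.mdx` dispatches on `agent_config.type == "MY_ACTION"` but never shows what `MyActionAgent`, its config, or `MyActionFactory` might look like. The sketch below fills in those placeholders for orientation only; it is not part of the patches. The `RespondAgent` base, the `type=` subclassing of `AgentConfig`, the `AbstractActionFactory` import path, and the `respond` signature are all assumptions modeled on vocode's built-in agents and may differ between versions.

```python
# Hypothetical sketch only -- not part of the patch series above. It fills in
# the MyActionAgent / MyActionFactory placeholders from agent-factory.mdx.
# The base classes and signatures are assumptions modeled on vocode's built-in
# agents; check them against the installed version before relying on this.
from typing import Optional, Tuple

from vocode.streaming.action.abstract_factory import AbstractActionFactory  # assumed path
from vocode.streaming.agent.base_agent import RespondAgent
from vocode.streaming.models.agent import AgentConfig


class MyActionAgentConfig(AgentConfig, type="MY_ACTION"):  # matches the factory's type check
    prompt_preamble: str = "The AI is having a pleasant conversation about life"


class MyActionFactory(AbstractActionFactory):
    """Builds the actions MyActionAgent is allowed to take (stubbed out here)."""

    def create_action(self, action_config):  # assumed abstract hook
        raise NotImplementedError("no actions are wired up in this sketch")


class MyActionAgent(RespondAgent[MyActionAgentConfig]):
    def __init__(self, agent_config: MyActionAgentConfig, action_factory: MyActionFactory):
        super().__init__(agent_config=agent_config)
        self.action_factory = action_factory

    async def respond(
        self,
        human_input: str,
        conversation_id: str,
        is_interrupt: bool = False,
    ) -> Tuple[Optional[str], bool]:
        # Minimal echo behavior: return (response_text, should_stop_conversation).
        return f"You said: {human_input}", False
```

With these definitions in place, the `MyAgentFactory` and `TelephonyServer` wiring shown in the final version of the docs above works unchanged.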