Do text-to-speech over completion chunks

So that the assistant can start talking while data is still being streamed.
paulovcmedeiros · Nov 16, 2023 · aa7ef16 · aa7ef16
2 parents 109c7ae + 7046c74
commit aa7ef16
Show file tree

Hide file tree

Showing 10 changed files with 149 additions and 75 deletions.
diff --git a/pyproject.toml b/pyproject.toml
@@ -4,7 +4,7 @@
   license = "MIT"
   name = "pyrobbot"
   readme = "README.md"
-  version = "0.3.0"
+  version = "0.3.1"
 
 [build-system]
   build-backend = "poetry.core.masonry.api"
@@ -31,6 +31,7 @@
   streamlit = "^1.28.0"
   tiktoken = "^0.5.1"
   # Text to speech
+  chime = "^0.7.0"
   gtts = "^2.4.0"
   pydub = "^0.25.1"
   pygame = "^2.5.2"
@@ -60,6 +61,7 @@
   ##################
   # Linter configs #
   ##################
+  pytest-xdist = "^3.4.0"
 
 [tool.black]
   line-length = 90
@@ -130,7 +132,7 @@
   ##################
 
 [tool.pytest.ini_options]
-  addopts = "-v --cache-clear --failed-first --cov-report=term-missing --cov-report=term:skip-covered --cov-report=xml:.coverage.xml --cov=./"
+  addopts = "-v --cache-clear -n auto --failed-first --cov-report=term-missing --cov-report=term:skip-covered --cov-report=xml:.coverage.xml --cov=./"
   log_cli_level = "INFO"
   testpaths = ["tests/smoke", "tests/unit"]
 

diff --git a/pyrobbot/argparse_wrapper.py b/pyrobbot/argparse_wrapper.py
@@ -26,6 +26,15 @@ def _populate_parser_from_pydantic_model(parser, model):
             for key in _argarse2pydantic
             if _argarse2pydantic[key](field_name) is not None
         }
+
+        if args_opts.get("type") == bool:
+            if args_opts.get("default") is True:
+                args_opts["action"] = "store_false"
+            else:
+                args_opts["action"] = "store_true"
+            args_opts.pop("default", None)
+            args_opts.pop("type", None)
+
         args_opts["required"] = field.is_required()
         if "help" in args_opts:
             args_opts["help"] = f"{args_opts['help']} (default: %(default)s)"

diff --git a/pyrobbot/chat.py b/pyrobbot/chat.py
@@ -3,7 +3,6 @@
 import json
 import shutil
 import uuid
-from collections import defaultdict
 
 from loguru import logger
 
@@ -23,9 +22,10 @@ class Chat(AlternativeConstructors):
     responses.
     """
 
-    _initial_greeting_translations = defaultdict(lambda: defaultdict(str))
+    _initial_greeting_translations = {}  # map language:translation
+    default_configs = ChatOptions()
 
-    def __init__(self, configs: ChatOptions = None):
+    def __init__(self, configs: ChatOptions = default_configs):
         """Initializes a chat instance.
 
         Args:
@@ -37,9 +37,6 @@ def __init__(self, configs: ChatOptions = None):
         self.id = str(uuid.uuid4())
         self.initial_openai_key_hash = GeneralConstants.openai_key_hash()
 
-        if configs is None:
-            configs = ChatOptions()
-
         self._passed_configs = configs
         for field in self._passed_configs.model_fields:
             setattr(self, field, self._passed_configs[field])
@@ -157,26 +154,24 @@ def load_history(self):
     @property
     def initial_greeting(self):
         """Return the initial greeting for the chat."""
+        default_greeting = f"Hi! I'm {self.assistant_name}. How can I assist you today?"
         try:
             passed_greeting = self._initial_greeting.strip()
         except AttributeError:
             passed_greeting = ""
 
         if not passed_greeting:
-            self._initial_greeting = (
-                f"Hello! I'm {self.assistant_name}. How can I assist you today?"
-            )
+            self._initial_greeting = default_greeting
 
-        translated_greeting = type(self)._initial_greeting_translations[  # noqa: SLF001
-            self._initial_greeting
-        ][self.language]
-        if not translated_greeting:
-            translated_greeting = self._translate(self._initial_greeting)
-            type(self)._initial_greeting_translations[  # noqa: SLF001
-                self._initial_greeting
-            ][self.language] = translated_greeting
+        if passed_greeting or self.language != "en":
+            translation_cache = type(self)._initial_greeting_translations  # noqa: SLF001
+            translated_greeting = translation_cache.get(self.language)
+            if not translated_greeting:
+                translated_greeting = self._translate(self._initial_greeting)
+                translation_cache[self.language] = translated_greeting
+            self._initial_greeting = translated_greeting
 
-        return translated_greeting
+        return self._initial_greeting
 
     @initial_greeting.setter
     def initial_greeting(self, value: str):
@@ -280,6 +275,7 @@ def _respond_prompt(self, prompt: str, role: str, **kwargs):
 
     def _translate(self, text):
         lang = self.language
+        logger.debug("Processing translation of '{}' to '{}'...", text, lang)
         translation_prompt = f"Translate the text between triple quotes to {lang}. "
         translation_prompt += "DO NOT WRITE ANYTHING ELSE. Only the translation. "
         translation_prompt += f"If the text is already in {lang}, then just repeat "

diff --git a/pyrobbot/chat_configs.py b/pyrobbot/chat_configs.py
@@ -149,7 +149,7 @@ class ChatOptions(OpenAiApiCallOptions):
     )
     private_mode: Optional[bool] = Field(
         default=None,
-        description="Toggle private mode. If set to `True`, the chat will not "
+        description="Toggle private mode. If this flag is used, the chat will not "
         + "be logged and the chat history will not be saved.",
     )
     api_connection_max_n_attempts: int = Field(
@@ -174,15 +174,19 @@ class VoiceAssistantConfigs(BaseConfigModel):
     openai_tts_voice: Literal[
         "alloy", "echo", "fable", "onyx", "nova", "shimmer"
     ] = Field(default="onyx", description="Voice to use for OpenAI's TTS")
+    exit_expressions: list[str] = Field(
+        default=["bye-bye", "ok bye-bye", "okay bye-bye"],
+        description="Expression(s) to use in order to exit the chat",
+    )
 
     inactivity_timeout_seconds: int = Field(
-        default=2,
+        default=1,
         gt=0,
         description="How much time user should be inactive "
         "for the assistant to stop listening",
     )
     speech_likelihood_threshold: float = Field(
-        default=0.85,
+        default=0.5,
         ge=0.0,
         le=1.0,
         description="Accept audio as speech if the likelihood is above this threshold",
@@ -195,6 +199,9 @@ class VoiceAssistantConfigs(BaseConfigModel):
     frame_duration: Literal[10, 20, 30] = Field(
         default=30, description="Frame duration for audio recording, in milliseconds."
     )
+    skip_initial_greeting: Optional[bool] = Field(
+        default=None, description="Skip initial greeting."
+    )
 
 
 class VoiceChatConfigs(ChatOptions, VoiceAssistantConfigs):

diff --git a/pyrobbot/command_definitions.py b/pyrobbot/command_definitions.py
@@ -8,7 +8,7 @@
 from .chat import Chat
 from .chat_configs import ChatOptions
 from .openai_utils import CannotConnectToApiError
-from .text_to_speech import VoiceChat
+from .voice_chat import VoiceChat
 
 
 def voice_chat(args):

diff --git a/pyrobbot/general_utils.py b/pyrobbot/general_utils.py
@@ -21,8 +21,7 @@ def from_dict(cls, configs: dict):
         Returns:
             cls: An instance of Chat initialized with the given configurations.
         """
-        dummy = cls()
-        return cls(configs=dummy.configs.model_validate(configs))
+        return cls(configs=cls.default_configs.model_validate(configs))
 
     @classmethod
     def from_cli_args(cls, cli_args):
@@ -37,11 +36,10 @@ def from_cli_args(cls, cli_args):
         Returns:
             cls: An instance of the class initialized with CLI-specified configurations.
         """
-        dummy = cls()
         chat_opts = {
             k: v
             for k, v in vars(cli_args).items()
-            if k in dummy.configs.model_fields and v is not None
+            if k in cls.default_configs.model_fields and v is not None
         }
         return cls.from_dict(chat_opts)