pyRobBot v0.2.0: Interact with GPT by voice

paulovcmedeiros · Nov 14, 2023 · 53ab989 · 53ab989
2 parents ecb51b1 + 63f7c9c
commit 53ab989
Show file tree

Hide file tree

Showing 8 changed files with 210 additions and 10 deletions.
diff --git a/README.md b/README.md
@@ -7,13 +7,20 @@
 [![Tests](https://github.com/paulovcmedeiros/pyRobBot/actions/workflows/tests.yaml/badge.svg)](https://github.com/paulovcmedeiros/pyRobBot/actions/workflows/tests.yaml)
 [![codecov](https://codecov.io/gh/paulovcmedeiros/pyRobBot/graph/badge.svg?token=XI8G1WH9O6)](https://codecov.io/gh/paulovcmedeiros/pyRobBot)
 
-# pyRobBot
+# pyRobBot: Talk and Chat with GPT LLMs
 
-A simple chatbot that uses the OpenAI API to get responses from [GPT LLMs](https://platform.openai.com/docs/models) via OpenAI API. Written in Python with a Web UI made with [Streamlit](https://streamlit.io). Can also be used directly from the terminal.
+An interface to OpenAI's [GPT large language models (LLMs)](https://platform.openai.com/docs/models) that implements:
+* A conventional chatbot that can be used either via web UI or terminal
+* A personal assistant that can actually interact with you by voice
 
-**See and try the [demo app on Streamlit](https://pyrobbot.streamlit.app)!**
+The package is written in Python. The web chatbot UI is made with [Streamlit](https://streamlit.io).
+
+**See and try the [demo web app on Streamlit](https://pyrobbot.streamlit.app)!**
 
 ## Features
+- [x] Text to speech and speech to text (`rob voice`)
+  - Talk to the GPT assistant!
+  - You can choose your preferred language (e.g., `rob voice --lang pt-br`)
 - [x] Web UI
   - Add/remove conversations dynamically
   - Automatic/editable conversation summary title
@@ -27,7 +34,6 @@ A simple chatbot that uses the OpenAI API to get responses from [GPT LLMs](https
 - [x] Autosave & retrieve chat history
 - [x] Chat context handling using [embeddings](https://platform.openai.com/docs/guides/embeddings)
 - [x] Estimated API token usage and associated costs
-- [x] Terminal UI (for a more "Wake up, Neo" experience")
 - [x] OpenAI API key is **never** stored on disk
 
 
@@ -62,6 +68,11 @@ and general `rob` options. For info about specific subcommands and the
 options that apply to them only, **please run `rob SUBCOMMAND -h`** (note
 that the `-h` goes after the subcommand in this case).
 
+### Chatting by Voice
+```shell
+rob voice
+```
+
 ### Using the Web UI
 ```shell
 rob
@@ -71,11 +82,10 @@ rob
 ```shell
 rob .
 ```
+
 ## Disclaimers
-This project's main purpose is to serve as a learning exercise for me (the author) and to serve as tool for and experimenting with OpenAI API and GPT LLMs. It does not aim to be the best or more robust OpenAI-powered chatbot out there.
+This project's main purpose is to serve as a learning exercise for me, as well as a tool for experimenting with the OpenAI API, GPT LLMs and text-to-voice/voice-to-text. It does not claim to be the best or most robust OpenAI-powered chatbot out there.
 
-Having said this, this project *does* aim to have a friendly user interface and to be easy to use and configure. So, please feel free to open an issue or submit a pull request if you find a bug or have a suggestion.
+Having said this, this project *does* aim to provide a friendly user interface that is easy to use and configure. Feel free to open an issue or submit a pull request if you find a bug or have a suggestion.
 
 Last but not least: this project is **not** affiliated with OpenAI in any way.
-
-
diff --git a/pyproject.toml b/pyproject.toml
@@ -4,7 +4,7 @@
   license = "MIT"
   name = "pyrobbot"
   readme = "README.md"
-  version = "0.1.6"
+  version = "0.2.0"
 
 [build-system]
   build-backend = "poetry.core.masonry.api"
@@ -30,6 +30,12 @@
   pydantic = "^2.4.2"
   streamlit = "^1.28.0"
   tiktoken = "^0.5.1"
+  # Text to speech
+  gtts = "^2.4.0"
+  pygame = "^2.5.2"
+  sounddevice = "^0.4.6"
+  soundfile = "^0.12.1"
+  speechrecognition = "^3.10.0"
 
 [tool.poetry.group.dev.dependencies]
   ipython = "^8.16.1"

diff --git a/pyrobbot/__init__.py b/pyrobbot/__init__.py
@@ -2,13 +2,18 @@
 """Unnoficial OpenAI API UI and CLI tool."""
 import hashlib
 import os
+import sys
 import tempfile
 import uuid
 from dataclasses import dataclass
 from importlib.metadata import metadata, version
 from pathlib import Path
 
 import openai
+from loguru import logger
+
+logger.remove()
+logger.add(sys.stderr, level="INFO")
 
 
 @dataclass

diff --git a/pyrobbot/argparse_wrapper.py b/pyrobbot/argparse_wrapper.py
@@ -5,7 +5,7 @@
 
 from . import GeneralConstants
 from .chat_configs import ChatOptions
-from .command_definitions import accounting, run_on_terminal, run_on_ui
+from .command_definitions import accounting, run_on_terminal, run_on_ui, run_over_voice
 
 
 def get_parsed_args(argv=None, default_command="ui"):
@@ -92,6 +92,19 @@ def get_parsed_args(argv=None, default_command="ui"):
     )
     parser_terminal.set_defaults(run_command=run_on_terminal)
 
+    parser_over_voice = subparsers.add_parser(
+        "voice",
+        aliases=["v"],
+        parents=[chat_options_parser],
+        help="Run the chat over voice.",
+    )
+    parser_over_voice.add_argument(
+        "--report-accounting-when-done",
+        action="store_true",
+        help="Report estimated costs when done with the chat.",
+    )
+    parser_over_voice.set_defaults(run_command=run_over_voice)
+
     parser_accounting = subparsers.add_parser(
         "accounting",
         aliases=["acc"],

diff --git a/pyrobbot/chat.py b/pyrobbot/chat.py
@@ -11,6 +11,7 @@
 from .chat_configs import ChatOptions
 from .chat_context import EmbeddingBasedChatContext, FullHistoryChatContext
 from .openai_utils import CannotConnectToApiError, make_api_chat_completion_call
+from .text_to_speech import LiveAssistant
 from .tokens import TokenUsageDatabase
 
 
@@ -274,6 +275,34 @@ def start(self):
             print(f"{self.api_connection_error_msg}\n")
             logger.error("Leaving chat: {}", error)
 
+    def start_talking(self):
+        """Run the chat as a spoken conversation.
+
+        The model is asked to translate `self.initial_greeting` into
+        `self.language_speech`; the result is spoken, and then the method loops:
+        listen to the user, ask the model for an answer, and speak that answer.
+        Iterations in which nothing was transcribed are skipped.
+
+        The conversation ends when the user interrupts it (Ctrl+C / EOF) or when
+        a `CannotConnectToApiError` is raised.
+        """
+        # ruff: noqa: T201
+        lang = self.language_speech
+        en_greeting = self.initial_greeting
+        translation_prompt = f"Translate the greeting in the next line to {lang}. "
+        translation_prompt += "Do NOT write anything else. Only the translation.\n"
+        translation_prompt += f"{en_greeting}"
+        try:
+            # The greeting needs an API call too, so it lives inside the `try`:
+            # a connection failure or Ctrl+C at this point is handled the same
+            # way as one happening in the middle of the conversation.
+            initial_greeting = "".join(
+                self.respond_system_prompt(prompt=translation_prompt)
+            )
+            assistant = LiveAssistant(language=lang)
+            assistant.speak(initial_greeting)
+            while True:
+                logger.info(f"{self.assistant_name}> Listening...")
+                question = assistant.listen()
+                if not question:
+                    # Nothing was heard/understood; go back to listening
+                    continue
+                logger.info(f"{self.assistant_name}> Let me think...")
+                answer = "".join(self.respond_user_prompt(prompt=question))
+                logger.info(f"{self.assistant_name}> Ok, here we go:")
+                assistant.speak(answer)
+        except (KeyboardInterrupt, EOFError):
+            print("", end="\r")
+            logger.info("Leaving chat.")
+        except CannotConnectToApiError as error:
+            print(f"{self.api_connection_error_msg}\n")
+            logger.error("Leaving chat: {}", error)
+
     def report_token_usage(self, report_current_chat=True, report_general: bool = False):
         """Report token usage and associated costs."""
         dfs = {}

diff --git a/pyrobbot/chat_configs.py b/pyrobbot/chat_configs.py
@@ -149,6 +149,9 @@ class ChatOptions(OpenAiApiCallOptions):
         gt=0,
         description="Maximum number of attempts to connect to the OpenAI API",
     )
+    language_speech: str = Field(
+        default="en", description="Language for text to speech/speech to text"
+    )
     private_mode: Optional[bool] = Field(
         default=None,
         description="Toggle private mode. If set to `True`, the chat will not "

diff --git a/pyrobbot/command_definitions.py b/pyrobbot/command_definitions.py
@@ -42,3 +42,11 @@ def run_on_ui(args):
         )
     except (KeyboardInterrupt, EOFError):
         logger.info("Exiting.")
+
+
+def run_over_voice(args):
+    """Run the chat over voice, optionally reporting token usage when done."""
+    voice_chat = Chat.from_cli_args(cli_args=args)
+    voice_chat.start_talking()
+    if not args.report_accounting_when_done:
+        return
+    voice_chat.report_token_usage(report_general=True)
diff --git a/pyrobbot/text_to_speech.py b/pyrobbot/text_to_speech.py
@@ -0,0 +1,126 @@
+"""Functions for converting text to speech and speech to text."""
+import io
+import queue
+from dataclasses import dataclass
+from datetime import datetime
+
+import numpy as np
+import pygame
+import scipy.io.wavfile as wav
+import sounddevice as sd
+import soundfile as sf
+import speech_recognition as sr
+from gtts import gTTS
+from loguru import logger
+from pygame import mixer
+
+
+@dataclass
+class LiveAssistant:
+    """Speak text aloud and transcribe speech captured from the microphone.
+
+    Text to speech is done with gTTS and played through the pygame mixer.
+    Speech to text records with sounddevice and transcribes the recording with
+    SpeechRecognition's Google recogniser.
+    """
+
+    # Language code handed to both gTTS and the speech recogniser
+    language: str = "en"
+    # Length of the recording made by `listen_time_limited`
+    recording_duration_seconds: int = 5
+    # Length of the window over which `listen` looks for silence
+    inactivity_timeout_seconds: int = 2
+    # Peak sample magnitude at or below which a window counts as silence.
+    # NOTE(review): presumably relative to float samples in [-1, 1] -- confirm
+    # against the dtype sounddevice delivers to the `listen` callback.
+    inactivity_sound_intensity_threshold: float = 0.02
+
+    def __post_init__(self):
+        # The mixer must be initialised before `speak` can create/play sounds
+        mixer.init()
+
+    def speak(self, text):
+        """Convert `text` to speech and play it, blocking until playback ends.
+
+        Does nothing if `text` is empty or whitespace-only.
+        """
+        if not text or not text.strip():
+            # gTTS refuses to synthesise empty text
+            logger.debug("No text to convert to speech.")
+            return
+
+        logger.debug("Converting text to speech...")
+        # Initialize gTTS with the text to convert
+        speech = gTTS(text, lang=self.language)
+
+        # Write the synthesised speech to an in-memory audio buffer
+        byte_io = io.BytesIO()
+        speech.write_to_fp(byte_io)
+        byte_io.seek(0)
+
+        logger.debug("Done converting text to speech.")
+
+        # Play the audio and wait until the mixer channel is idle again
+        sound = mixer.Sound(byte_io)
+        channel = sound.play()
+        while channel.get_busy():
+            pygame.time.wait(100)
+
+    def listen_time_limited(self):
+        """Record audio from the mic, for a limited timelength, and convert it to text.
+
+        Records for `self.recording_duration_seconds` and returns the
+        transcription (an empty string if nothing could be transcribed).
+        """
+        sample_rate = 44100  # Hz
+        n_frames = int(self.recording_duration_seconds * sample_rate)
+        # Record audio from the microphone
+        rec_as_array = sd.rec(
+            frames=n_frames, samplerate=sample_rate, channels=2, dtype="int16"
+        )
+        logger.debug("Recording Audio")
+        sd.wait()
+        logger.debug("Done Recording")
+
+        logger.debug("Converting audio to text...")
+        # Convert the recorded array to an in-memory wav file
+        byte_io = io.BytesIO()
+        wav.write(byte_io, rate=sample_rate, data=rec_as_array.astype(np.int16))
+        # Bound-method call: `self` must not be passed explicitly
+        text = self._audio_buffer_to_text(byte_io)
+        logger.debug("Done converting audio to text.")
+
+        return text
+
+    def listen(self):
+        """Record audio from the microphone, until user stops, and convert it to text.
+
+        Recording proceeds in windows of `self.inactivity_timeout_seconds`. It
+        stops after the first window whose peak sample magnitude does not exceed
+        `self.inactivity_sound_intensity_threshold`.
+
+        Returns the transcription, or an empty string if no window was louder
+        than the threshold or if the audio could not be transcribed.
+        """
+        # Adapted from
+        # <https://python-sounddevice.readthedocs.io/en/0.4.6/examples.html#
+        #  recording-with-arbitrary-duration>
+        logger.debug("The assistant is listening...")
+        sample_rate = 44100  # Hz
+        q = queue.Queue()
+
+        def callback(indata, frames, time, status):  # noqa: ARG001
+            """This is called (from a separate thread) for each audio block."""
+            q.put(indata.copy())
+
+        overall_max_intensity = 0.0
+        raw_buffer = io.BytesIO()
+        with sf.SoundFile(
+            raw_buffer,
+            mode="x",
+            samplerate=sample_rate,
+            channels=2,
+            format="wav",
+            subtype="PCM_16",
+        ) as audio_file, sd.InputStream(
+            samplerate=sample_rate, channels=2, callback=callback
+        ):
+            window_max_intensity = 0.0
+            last_checked = datetime.now()
+            while True:
+                new_data = q.get()
+                audio_file.write(new_data)
+                # Track the peak over the WHOLE window. Judging silence from
+                # just the block that arrives at check time would cut the user
+                # off whenever that one block falls between two words.
+                if new_data.size:
+                    block_peak = float(np.max(np.abs(new_data)))
+                    window_max_intensity = max(window_max_intensity, block_peak)
+
+                now = datetime.now()
+                # `total_seconds()` keeps the fractional part; `.seconds` would
+                # truncate and stretch the effective timeout by up to a second.
+                elapsed = (now - last_checked).total_seconds()
+                if elapsed < self.inactivity_timeout_seconds:
+                    continue
+
+                overall_max_intensity = max(overall_max_intensity, window_max_intensity)
+                if window_max_intensity <= self.inactivity_sound_intensity_threshold:
+                    # A full window of silence: the user is done talking
+                    break
+                last_checked = now
+                window_max_intensity = 0.0
+
+        if overall_max_intensity < self.inactivity_sound_intensity_threshold:
+            logger.debug("No sound detected")
+            return ""
+
+        logger.debug("Converting audio to text...")
+        text = self._audio_buffer_to_text(byte_io=raw_buffer)
+        logger.debug("Done converting audio to text.")
+
+        return text
+
+    def _audio_buffer_to_text(self, byte_io):
+        """Use SpeechRecognition to convert the audio to text.
+
+        `byte_io` is a seekable in-memory audio file. Returns the transcription,
+        or an empty string if the audio could not be understood or the
+        recognition request failed.
+        """
+        byte_io.seek(0)  # Reset the file pointer to the beginning of the file
+        r = sr.Recognizer()
+        with sr.AudioFile(byte_io) as source:
+            # `record` reads the whole file, whereas `listen` stops at the first
+            # short pause and would drop anything said after it.
+            audio_data = r.record(source)
+
+        try:
+            return r.recognize_google(audio_data, language=self.language)
+        except sr.exceptions.UnknownValueError:
+            logger.debug("Could not understand audio")
+            return ""
+        except sr.exceptions.RequestError as error:
+            # Recognition service unreachable or it rejected the request
+            logger.error("Speech recognition request failed: {}", error)
+            return ""