From 09e87a026b363c52ab991cd4d6b83aa8373c5109 Mon Sep 17 00:00:00 2001
From: Justin Hayes <52832301+justinh-rahb@users.noreply.github.com>
Date: Fri, 11 Aug 2023 23:10:09 -0400
Subject: [PATCH] Automatically get voice data (#54)
* Delete data directory
* Load /voices data dynamically
* Update index.md
* Update privacy.md
* Update setup.md
* Update README.md
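
The bundled data/voices.json is removed; the voice list is now fetched from the
ElevenLabs voices endpoint at request time, so newly added voices show up without
a redeploy. A minimal standalone sketch of the new lookup (it mirrors the
get_voices_data() helper added in main.py and assumes only that
ELEVENLABS_API_KEY is set in the environment):

    # Sketch only: names mirror the patch; ELEVENLABS_API_KEY is assumed to be set.
    import os
    import requests

    xi_api_key = os.getenv("ELEVENLABS_API_KEY")

    def get_voices_data():
        """Fetch the current voice list from the ElevenLabs API (replaces data/voices.json)."""
        headers = {"xi-api-key": xi_api_key, "Content-Type": "application/json"}
        try:
            response = requests.get("https://api.elevenlabs.io/v1/voices", headers=headers)
            response.raise_for_status()
            data = response.json()
            if "voices" not in data:
                return None, "Error: 'voices' key not found in the API response."
            # Map lowercase voice names to voice IDs, e.g. {"rachel": "21m00Tcm4TlvDq8ikWAM"}
            return {v["name"].lower(): v["voice_id"] for v in data["voices"]}, None
        except requests.RequestException as re:
            return None, f"API request error: {re}"

    voices, error = get_voices_data()
    if error is None:
        print("Available voices: " + ", ".join(voices))

In chat, /voices now lists whatever the endpoint returns, and /tts resolves the
voice ID at request time, so both commands stay in sync with the ElevenLabs account.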
---
README.md | 12 ++---
data/voices.json | 1 -
docs/index.md | 2 +-
docs/privacy.md | 4 +-
docs/setup.md | 4 +-
main.py | 117 +++++++++++++++++++++++++++++++++++------------
6 files changed, 100 insertions(+), 40 deletions(-)
delete mode 100644 data/voices.json
diff --git a/README.md b/README.md
index dfc4e08..c20900e 100644
--- a/README.md
+++ b/README.md
@@ -8,7 +8,7 @@
-**Chat²GPT** is a [ChatGPT](https://openai.com/chatgpt) chat bot for Google Chat 🤖💬. It's designed to amplify the experience in your Google Chat rooms by offering personalized user sessions for coherent dialogues, a manual reset capability, the power to generate images via OpenAI's [DALL·E 2 API](https://openai.com/dall-e-2), and dynamic interactions through mentions or direct messaging. Moreover, with the integration of Eleven Labs TTS API, Chat²GPT now brings voice interactions, letting users convert textual prompts into audio. User input and text output is moderated with OpenAI's [Moderation API](https://platform.openai.com/docs/guides/moderation).
+**Chat²GPT** is a [ChatGPT](https://openai.com/chatgpt) chat bot for Google Chat 🤖💬. It's designed to amplify the experience in your Google Chat rooms by offering personalized user sessions for coherent dialogues, a manual reset capability, the power to generate images via OpenAI's [DALL·E 2 API](https://openai.com/dall-e-2), and dynamic interactions through mentions or direct messaging. Moreover, with the integration of ElevenLabs' [Text-to-Speech API](https://docs.elevenlabs.io/api-reference/text-to-speech), Chat²GPT now brings voice interactions, letting users convert textual prompts into audio. User input and text output are moderated with OpenAI's [Moderation API](https://platform.openai.com/docs/guides/moderation).
## 📖 Table of Contents
- [🛠️ Setup](#%EF%B8%8F-setup)
@@ -75,8 +75,8 @@ In your GitHub repository:
- `TEMPERATURE`: This sets the temperature for the OpenAI API. Default: 0.8.
- `IMAGE_SIZE`: This sets the image size for the DALL-E API. Default: "512x512".
- `API_URL`: This sets the API endpoint for the chat completions API. Default: "https://api.openai.com/v1/chat/completions".
- - `ELEVENLABS_API_KEY`: Your Eleven Labs API key. Can be disabled by omitting this secret.
- - `ELEVENLABS_MODEL_NAME`: Eleven Labs model you're using. Default: "eleven_monolingual_v1".
+ - `ELEVENLABS_API_KEY`: Your ElevenLabs API key. TTS can be disabled by omitting this secret.
+ - `ELEVENLABS_MODEL_NAME`: The ElevenLabs model to use. Default: "eleven_monolingual_v1".
- `GCS_BUCKET_NAME`: Your chosen name for the GCS bucket meant for TTS audio file storage.
**5. GitHub Actions 🚀**
@@ -115,17 +115,17 @@ Remember, Chat²GPT is flexible, suitable for deployment on Google Cloud, FaaS (
- **Ephemeral Conversations:** Chat²GPT doesn't store or retain conversation history. Every session is temporary, ending when a conversation concludes or times out.
-- **Temporary Audio Storage:** Text-To-Speech audio files are stored temporarily in Google Cloud Storage to allow users enough time for downloading. To ensure data privacy and efficient storage utilization, these files are deleted with each app redeployment.
+- **Temporary Audio Storage:** Audio files are stored temporarily in Google Cloud Storage so users have enough time to download them. To ensure data privacy and efficient storage utilization, these files are deleted with each app redeployment.
- **Reactive Responses:** The bot only reacts to direct prompts, such as @mentions or direct messages, and doesn't "read the room".
- **Anonymous Sessions:** Users are tracked using anonymous ID numbers solely for session consistency. These IDs are cleared with each app redeployment.
-### AI APIs and User Awareness ℹ️
+## AI APIs and User Awareness ℹ️
- **OpenAI's Commitment:** We use OpenAI's APIs, which, as per OpenAI's policy, don't use user inputs for model training. More details are on [OpenAI's official site](https://openai.com/policies/api-data-usage-policies).
-- **Eleven Labs' Commitment:** We use Eleven Labs' APIs, which, as per Eleven Labs' policy, don't use user inputs for model training. More details are on [Eleven Labs' official site](https://elevenlabs.io/terms)).
+- **ElevenLabs' Commitment:** We use ElevenLabs' APIs, which, as per ElevenLabs' policy, don't use user inputs for model training. More details are on [ElevenLabs' official site](https://elevenlabs.io/terms).
- **User Awareness:** Discussing sensitive topics? Exercise caution, especially in group settings. Chat²GPT doesn't log conversations, but your organization or platform might.
diff --git a/data/voices.json b/data/voices.json
deleted file mode 100644
index 4a434ec..0000000
--- a/data/voices.json
+++ /dev/null
@@ -1 +0,0 @@
-[{"voice_id": "21m00Tcm4TlvDq8ikWAM", "name": "Rachel", "labels": {"accent": "american", "description": "calm", "age": "young", "gender": "female", "use case": "narration"}}, {"voice_id": "2EiwWnXFnvU5JabPnv8n", "name": "Clyde", "labels": {"accent": "american", "description": "war veteran", "age": "middle aged", "gender": "male", "use case": "video games"}}, {"voice_id": "AZnzlk1XvdvUeBnXmlld", "name": "Domi", "labels": {"accent": "american", "description": "strong", "age": "young", "gender": "female", "use case": "narration"}}, {"voice_id": "CYw3kZ02Hs0563khs1Fj", "name": "Dave", "labels": {"accent": "british-essex", "description": "conversational", "age": "young", "gender": "male", "use case": "video games"}}, {"voice_id": "D38z5RcWu1voky8WS1ja", "name": "Fin", "labels": {"accent": "irish", "description": "sailor", "age": "old", "gender": "male", "use case": "video games"}}, {"voice_id": "EXAVITQu4vr4xnSDxMaL", "name": "Bella", "labels": {"accent": "american", "description": "soft", "age": "young", "gender": "female", "use case": "narration"}}, {"voice_id": "ErXwobaYiN019PkySvjV", "name": "Antoni", "labels": {"accent": "american", "description": "well-rounded", "age": "young", "gender": "male", "use case": "narration"}}, {"voice_id": "GBv7mTt0atIp3Br8iCZE", "name": "Thomas", "labels": {"accent": "american", "description": "calm", "age": "young", "gender": "male", "use case": "meditation"}}, {"voice_id": "IKne3meq5aSn9XLyUdCD", "name": "Charlie", "labels": {"accent": "australian", "description": "casual", "age": "middle aged", "gender": "male", "use case": "conversational"}}, {"voice_id": "LcfcDJNUP1GQjkzn1xUU", "name": "Emily", "labels": {"accent": "american", "description": "calm", "age": "young", "gender": "female", "use case": "meditation"}}, {"voice_id": "MF3mGyEYCl7XYWbV9V6O", "name": "Elli", "labels": {"accent": "american", "description": "emotional", "age": "young", "gender": "female", "use case": "narration"}}, {"voice_id": "N2lVS1w4EtoT3dr4eOWO", "name": "Callum", "labels": {"accent": "american", "description": "hoarse", "age": "middle aged", "gender": "male", "use case": "video games"}}, {"voice_id": "ODq5zmih8GrVes37Dizd", "name": "Patrick", "labels": {"accent": "american", "description": "shouty", "age": "middle aged", "gender": "male", "use case": "video games"}}, {"voice_id": "SOYHLrjzK2X1ezoPC6cr", "name": "Harry", "labels": {"accent": "american", "description": "anxious", "age": "young", "gender": "male", "use case": "video games"}}, {"voice_id": "TX3LPaxmHKxFdv7VOQHJ", "name": "Liam", "labels": {"accent": "american", "age": "young", "gender": "male", "use case": "narration", "description ": "neutral"}}, {"voice_id": "ThT5KcBeYPX3keUQqHPh", "name": "Dorothy", "labels": {"accent": "british", "description": "pleasant", "age": "young", "gender": "female", "use case": "children's stories"}}, {"voice_id": "TxGEqnHWrfWFTfGW9XjX", "name": "Josh", "labels": {"accent": "american", "description": "deep", "age": "young", "gender": "male", "use case": "narration"}}, {"voice_id": "VR6AewLTigWG4xSOukaG", "name": "Arnold", "labels": {"accent": "american", "description": "crisp", "age": "middle aged", "gender": "male", "use case": "narration"}}, {"voice_id": "XB0fDUnXU5powFXDhCwa", "name": "Charlotte", "labels": {"accent": "english-swedish", "description": "seductive", "age": "middle aged", "gender": "female", "use case": "video games"}}, {"voice_id": "XrExE9yKIg1WjnnlVkGX", "name": "Matilda", "labels": {"accent": "american", "description": "warm", "age": "young", "gender": "female", 
"use case": "audiobook"}}, {"voice_id": "Yko7PKHZNXotIFUBG7I9", "name": "Matthew", "labels": {"accent": "british", "age": "middle aged", "gender": "male", "use case": "audiobook", "description ": "calm"}}, {"voice_id": "ZQe5CZNOzWyzPSCn5a3c", "name": "James", "labels": {"accent": "australian", "description": "calm ", "age": "old", "gender": "male", "use case": "news"}}, {"voice_id": "Zlb1dXrM653N07WRdFW3", "name": "Joseph", "labels": {"accent": "british", "age": "middle aged", "gender": "male", "use case": "news", "description ": "ground reporter "}}, {"voice_id": "bVMeCyTHy58xNoL34h3p", "name": "Jeremy", "labels": {"accent": "american-irish", "description": "excited", "age": "young", "gender": "male", "use case": "narration"}}, {"voice_id": "flq6f7yk4E4fJM5XTYuZ", "name": "Michael", "labels": {"accent": "american", "age": "old", "gender": "male", "use case": "audiobook", "description ": "orotund"}}, {"voice_id": "g5CIjZEefAph4nQFvHAz", "name": "Ethan", "labels": {"accent": "american", "age": "young", "gender": "male", "use case": "ASMR", "description ": "whisper"}}, {"voice_id": "jBpfuIE2acCO8z3wKNLl", "name": "Gigi", "labels": {"accent": "american", "description": "childlish", "age": "young", "gender": "female", "use case": "animation"}}, {"voice_id": "jsCqWAovK2LkecY7zXl4", "name": "Freya", "labels": {"accent": "american", "age": "young", "gender": "female", "description ": "overhyped", "usecase": "video games"}}, {"voice_id": "oWAxZDx7w5VEj9dCyTzz", "name": "Grace", "labels": {"accent": "american-southern", "age": "young", "gender": "female", "use case": "audiobook ", "description ": "gentle"}}, {"voice_id": "onwK4e9ZLuTAKqWW03F9", "name": "Daniel", "labels": {"accent": "british", "description": "deep", "age": "middle aged", "gender": "male", "use case": "news presenter"}}, {"voice_id": "pMsXgVXv3BLzUgSXRplE", "name": "Serena", "labels": {"accent": "american", "description": "pleasant", "age": "middle aged", "gender": "female", "use case": "interactive"}}, {"voice_id": "pNInz6obpgDQGcFmaJgB", "name": "Adam", "labels": {"accent": "american", "description": "deep", "age": "middle aged", "gender": "male", "use case": "narration"}}, {"voice_id": "piTKgcLEGmPE4e6mEKli", "name": "Nicole", "labels": {"accent": "american", "description": "whisper", "age": "young", "gender": "female", "use case": "audiobook"}}, {"voice_id": "t0jbNlBVZ17f02VDIeMI", "name": "Jessie", "labels": {"accent": "american", "description": "raspy ", "age": "old", "gender": "male", "use case": "video games"}}, {"voice_id": "wViXBPUzp2ZZixB1xQuM", "name": "Ryan", "labels": {"age": "middle aged", "description": "soldier", "gender": "male", "use case": "audiobook", "accent ": "american"}}, {"voice_id": "yoZ06aMxZJJ28mfd3POQ", "name": "Sam", "labels": {"accent": "american", "description": "raspy", "age": "young", "gender": "male", "use case": "narration"}}, {"voice_id": "z9fAnlkpzviPz146aGWa", "name": "Glinda", "labels": {"accent": "american", "description": "witch", "age": "middle aged", "gender": "female", "use case": "video games"}}, {"voice_id": "zcAOhNBS3c14rBihAFp1", "name": "Giovanni", "labels": {"accent": "english-italian", "description": "foreigner", "age": "young", "gender": "male", "use case": "audiobook"}}, {"voice_id": "zrHiDhphv9ZnVXBqCLjz", "name": "Mimi", "labels": {"accent": "english-swedish", "description": "childish", "age": "young", "gender": "female", "use case": "animation"}}]
\ No newline at end of file
diff --git a/docs/index.md b/docs/index.md
index 28afeaa..5d09dec 100644
--- a/docs/index.md
+++ b/docs/index.md
@@ -8,7 +8,7 @@ nav_order: 0
-**Chat²GPT** is a [ChatGPT](https://openai.com/chatgpt) chat bot for Google Chat 🤖💬. It's designed to amplify the experience in your Google Chat rooms by offering personalized user sessions for coherent dialogues, a manual reset capability, the power to generate images via OpenAI's [DALL·E 2 API](https://openai.com/dall-e-2), and dynamic interactions through mentions or direct messaging. Moreover, with the integration of Eleven Labs TTS API, Chat²GPT now brings voice interactions, letting users convert textual prompts into audio. User input and text output is moderated with OpenAI's [Moderation API](https://platform.openai.com/docs/guides/moderation).
+**Chat²GPT** is a [ChatGPT](https://openai.com/chatgpt) chat bot for Google Chat 🤖💬. It's designed to amplify the experience in your Google Chat rooms by offering personalized user sessions for coherent dialogues, a manual reset capability, the power to generate images via OpenAI's [DALL·E 2 API](https://openai.com/dall-e-2), and dynamic interactions through mentions or direct messaging. Moreover, with the integration of ElevenLabs' [Text-to-Speech API](https://docs.elevenlabs.io/api-reference/text-to-speech), Chat²GPT now brings voice interactions, letting users convert textual prompts into audio. User input and text output are moderated with OpenAI's [Moderation API](https://platform.openai.com/docs/guides/moderation).
## 📖 Table of Contents
- [🛠️ Setup](setup.html)
diff --git a/docs/privacy.md b/docs/privacy.md
index 7fffdef..1c2ffad 100644
--- a/docs/privacy.md
+++ b/docs/privacy.md
@@ -11,7 +11,7 @@ nav_order: 3
- **Ephemeral Conversations:** Chat²GPT doesn't store or retain conversation history. Every session is temporary, ending when a conversation concludes or times out.
-- **Temporary Audio Storage:** Text-To-Speech audio files are stored temporarily in Google Cloud Storage to allow users enough time for downloading. To ensure data privacy and efficient storage utilization, these files are deleted with each app redeployment.
+- **Temporary Audio Storage:** Audio files are stored temporarily in Google Cloud Storage so users have enough time to download them. To ensure data privacy and efficient storage utilization, these files are deleted with each app redeployment.
- **Reactive Responses:** The bot only reacts to direct prompts, such as @mentions or direct messages, and doesn't "read the room".
@@ -21,6 +21,6 @@ nav_order: 3
- **OpenAI's Commitment:** We use OpenAI's APIs, which, as per OpenAI's policy, don't use user inputs for model training. More details are on [OpenAI's official site](https://openai.com/policies/api-data-usage-policies).
-- **Eleven Labs' Commitment:** We use Eleven Labs' APIs, which, as per Eleven Labs' policy, don't use user inputs for model training. More details are on [Eleven Labs' official site](https://elevenlabs.io/terms)).
+- **ElevenLabs' Commitment:** We use ElevenLabs' APIs, which, as per ElevenLabs' policy, don't use user inputs for model training. More details are on [ElevenLabs' official site](https://elevenlabs.io/terms).
- **User Awareness:** Discussing sensitive topics? Exercise caution, especially in group settings. Chat²GPT doesn't log conversations, but your organization or platform might.
diff --git a/docs/setup.md b/docs/setup.md
index 0ea09ce..6632d20 100644
--- a/docs/setup.md
+++ b/docs/setup.md
@@ -58,8 +58,8 @@ In your GitHub repository:
- `TEMPERATURE`: This sets the temperature for the OpenAI API. Default: 0.8.
- `IMAGE_SIZE`: This sets the image size for the DALL-E API. Default: "512x512".
- `API_URL`: This sets the API endpoint for the chat completions API. Default: "https://api.openai.com/v1/chat/completions".
- - `ELEVENLABS_API_KEY`: Your Eleven Labs API key. Can be disabled by omitting this secret.
- - `ELEVENLABS_MODEL_NAME`: Eleven Labs model you're using. Default: "eleven_monolingual_v1".
+ - `ELEVENLABS_API_KEY`: Your ElevenLabs API key. TTS can be disabled by omitting this secret.
+ - `ELEVENLABS_MODEL_NAME`: The ElevenLabs model to use. Default: "eleven_monolingual_v1".
- `GCS_BUCKET_NAME`: Your chosen name for the GCS bucket meant for TTS audio file storage.
**5. GitHub Actions 🚀**
diff --git a/main.py b/main.py
index bec5627..de5d893 100644
--- a/main.py
+++ b/main.py
@@ -72,26 +72,21 @@
API_URL = os.getenv('API_URL') # Defaults to OpenAI API if not set
# Eleven Labs Text-to-Speech API
-ELEVENLABS_API_KEY = os.getenv('ELEVENLABS_API_KEY')
-ELEVENLABS_MODEL_NAME = os.getenv('ELEVENLABS_MODEL_NAME', 'eleven_monolingual_v1')
-
-with open("data/voices.json", "r") as file:
- voice_list = json.load(file)
-
-voices_data = {voice["name"].lower(): voice["voice_id"] for voice in voice_list}
-voice_names = list(voices_data.keys())
-
-GCS_BUCKET_NAME = os.getenv('GCS_BUCKET_NAME')
-
-# Decode the base64 service account JSON
-decoded_service_account_info = base64.b64decode(os.getenv('GCP_SA_KEY')).decode('utf-8')
-service_account_info = json.loads(decoded_service_account_info)
-
-# Create credentials from the decoded service account JSON
-credentials = Credentials.from_service_account_info(service_account_info)
-
-# Create a GCS client with the credentials
-storage_client = storage.Client(credentials=credentials)
+xi_api_key = os.getenv('ELEVENLABS_API_KEY')
+xi_model_name = os.getenv('ELEVENLABS_MODEL_NAME', 'eleven_monolingual_v1')
+
+bucket_name = os.getenv('GCS_BUCKET_NAME')
+
+if bucket_name:
+ # Decode the base64 service account JSON
+ decoded_service_account_info = base64.b64decode(os.getenv('GCP_SA_KEY')).decode('utf-8')
+ service_account_info = json.loads(decoded_service_account_info)
+
+ # Create credentials from the decoded service account JSON
+ credentials = Credentials.from_service_account_info(service_account_info)
+
+ # Create a GCS client with the credentials
+ storage_client = storage.Client(credentials=credentials)
# Define globals
user_sessions = {} # A dictionary to track the AIChat instances for each user
@@ -122,19 +117,68 @@ def num_tokens_from_string(string: str) -> int:
num_tokens = len(encoding.encode(string))
return num_tokens
+# Define the function for fetching voice data from the ElevenLabs API
+def get_voices_data():
+ BASE_URL = "https://api.elevenlabs.io/v1/voices"
+
+ endpoint = BASE_URL
+ headers = {
+ "xi-api-key": xi_api_key,
+ "Content-Type": "application/json"
+ }
+
+ try:
+ # Fetch data from the ElevenLabs voices API
+ response = requests.get(endpoint, headers=headers)
+ response.raise_for_status()
+
+ data = response.json()
+
+ # Ensure 'voices' key exists in the data
+ if 'voices' not in data:
+ return None, "Error: 'voices' key not found in the API response."
+
+ # Map the voice names (lowercased) to their voice IDs
+ voices_data = {
+ voice["name"].lower(): voice["voice_id"]
+ for voice in data["voices"]
+ }
+
+ return voices_data, None
+
+ except requests.RequestException as re:
+ return None, f"API request error: {str(re)}"
+ except Exception as e:
+ return None, f"Error fetching and filtering voice data: {str(e)}"
+
+
+def get_voice_id(voice_name):
+ voices_data, error = get_voices_data()
+ if error:
+ return None, error
+
+ voice_id = voices_data.get(voice_name.lower())
+ if not voice_id:
+ return None, f"Voice {voice_name} not found."
+
+ return voice_id, None
+
def text_to_speech(prompt, voice_name):
BASE_URL = "https://api.elevenlabs.io/v1/text-to-speech/"
- voice_id = voices_data[voice_name.lower()]
+ voice_id, error = get_voice_id(voice_name)
+ if error:
+ return None, error
+
endpoint = BASE_URL + voice_id
headers = {
- "xi-api-key": ELEVENLABS_API_KEY,
+ "xi-api-key": xi_api_key,
"Content-Type": "application/json"
}
payload = {
"text": prompt,
- "model_id": ELEVENLABS_MODEL_NAME,
+ "model_id": xi_model_name,
}
response = requests.post(endpoint, json=payload, headers=headers)
@@ -146,7 +190,7 @@ def text_to_speech(prompt, voice_name):
file_name = f"tts_{uuid.uuid4()}.mp3"
# Use the authenticated GCS client to upload
- bucket = storage_client.bucket(GCS_BUCKET_NAME)
+ bucket = storage_client.bucket(bucket_name)
blob = bucket.blob(file_name)
blob.upload_from_string(audio_data, content_type="audio/mpeg")
@@ -253,21 +297,38 @@ def handle_message(user_id, user_message):
except Exception as e:
return jsonify({'text': f"Sorry, I encountered an error generating the image: {str(e)}"})
- # Check if the user input starts with /voices
+ # Check if the user input is /voices
elif user_message.strip().lower() == '/voices':
+ if not xi_api_key:
+ return jsonify({'text': 'This function is disabled.'})
+
+ voices_data, error = get_voices_data()
+ if error:
+ return jsonify({'text': error})
+
+ voice_names_list = list(voices_data.keys())
+
# Join voice names with commas and spaces for readability
- voices_string = ', '.join(voice_names)
+ voices_string = ', '.join(voice_names_list)
return jsonify({'text': f"Available voices: {voices_string}"})
# Check if the user input starts with /tts
elif user_message.strip().lower().startswith('/tts'):
+ if not xi_api_key:
+ return jsonify({'text': 'This function is disabled.'})
parts = user_message.split(' ')
if len(parts) < 3: # Checking for /tts, voice, and message
return jsonify({'text': 'Please use the format /tts <voice> <message>.'})
voice = parts[1].lower()
- if voice not in voices_data: # Checking against voices_data now
+
+ voices_data_dict, error = get_voices_data()
+ if error:
+ return jsonify({'text': error})
+
+ if voice not in voices_data_dict:
return jsonify({'text': f"Sorry, I couldn't recognize the voice {voice}. Please choose a valid voice."})
+
prompt = ' '.join(parts[2:])
audio_url, error = text_to_speech(prompt, voice)
@@ -292,7 +353,7 @@ def handle_message(user_id, user_message):
'buttonList': {
'buttons': [
{
- 'text': 'Click to Play Audio',
+ 'text': 'Play ▶️',
'onClick': {
'openLink': {
'url': audio_url