#!/usr/bin/env python
# coding: utf-8
# Copyright (c) Jack Wu. All rights reserved.
# Licensed under the BSD license. See LICENSE.md file in the project root for full license information.
"""
Smart Speaker using Azure Speech SDK and OpenAI ChatGPT API
"""
import azure.cognitiveservices.speech as speechsdk
import openai
import asyncio
import json
from collections import namedtuple
import tiktoken
import time
EOF = object()
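
# The loader below expects a config.json shaped roughly like the sketch in
# this comment. The key names are taken from the fields this file accesses;
# the values are placeholders, not working credentials. Because load_config()
# maps JSON objects onto namedtuples, every key shown here must be present
# (use "" to leave an unused one empty).
#
# {
#     "AzureCognitiveServices": {
#         "Key": "<azure-speech-key>",
#         "Region": "<azure-region>",
#         "SpeechRecognitionLanguage": "en-US",
#         "SpeechSynthesisVoiceName": "<voice-name>",
#         "WakePhraseModel": "<path-to-keyword-model-file>",
#         "WakeWord": "<wake word>",
#         "StopWord": "<stop word>"
#     },
#     "OpenAI": {
#         "Key": "<openai-key-or-empty>",
#         "ApiBase": "<optional-base-url-or-empty>",
#         "Model": "<model-name>",
#         "MaxTokens": 4096
#     },
#     "AzureOpenAI": {
#         "Key": "<azure-openai-key-or-empty>",
#         "api_version": "<api-version>",
#         "Endpoint": "<azure-openai-endpoint>",
#         "Model": "<deployment-name>"
#     }
# }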
# Load config.json
def load_config():
    try:
        with open('config.json', encoding='utf-8') as f:
            config = json.load(f, object_hook=lambda d: namedtuple('X', d.keys())(*d.values()))
        if not config.AzureCognitiveServices.Key or not config.AzureCognitiveServices.Region \
                or (not config.OpenAI.Key and not config.AzureOpenAI.Key):
            raise ValueError("Missing required configuration.")
        return config
    except FileNotFoundError:
        print("Error: config file not found.")
        raise
    except Exception as e:
        print(f"Error loading config: {e}")
        raise
# If the token count exceeds the limit, drop the oldest history messages.
def truncate_conversation(conversation, max_tokens):
    total_tokens = 0
    truncated_conversation = []
    encoding = tiktoken.get_encoding("cl100k_base")
    for message in reversed(conversation):
        message_tokens = len(encoding.encode(message['content']))
        if total_tokens + message_tokens > max_tokens - 100:
            print(f'Token limit reached: {total_tokens + message_tokens}')
            break
        total_tokens += message_tokens
        truncated_conversation.append(message)
    # Assign through a slice so the truncation is visible to the caller.
    conversation[:] = reversed(truncated_conversation)
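
# Example (illustrative numbers): with max_tokens=4096, messages are kept
# newest-first until their running token total would exceed 4096 - 100; any
# older messages are dropped, so the model always sees the most recent turns.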
# Prompt OpenAI with a request and asynchronously put complete sentences on the queue.
async def ask_openai_async(client, model, prompt, max_tokens, conversation, queue, ending):
    # Append the user's question.
    conversation.append({"role": "user", "content": prompt})
    # Enforce the token limit by removing the earliest history messages.
    truncate_conversation(conversation, max_tokens)
    print(conversation)
    # Buffer for the current sentence.
    collected_messages = ""
    # Buffer for the whole GPT answer.
    full_answer = ""
    # Ask OpenAI with a streaming request.
    response = await client.chat.completions.create(model=model,
                                                    messages=conversation,
                                                    stream=True)
    # Iterate through the stream of events.
    async for chunk in response:
        if not chunk.choices:
            continue
        chunk_message = chunk.choices[0].delta.content  # extract the message
        if not chunk_message:
            continue
        collected_messages += chunk_message.replace('\n', ' ')  # save the message
        if collected_messages.endswith(ending):  # one complete sentence
            print(f"ChatGPT Message received: {collected_messages}")
            await queue.put(collected_messages)
            full_answer += collected_messages
            collected_messages = ""
    # Flush any trailing text that did not end with a sentence-ending mark.
    if collected_messages:
        await queue.put(collected_messages)
        full_answer += collected_messages
    # Save the full answer in the history for continuous conversations.
    conversation.append({"role": "assistant", "content": full_answer})
# Asynchronously read messages from the queue and synthesize speech.
async def text_to_speech_async(speech_synthesizer, queue):
    while True:
        text = await queue.get()
        if text is EOF:
            break
        # Azure text-to-speech output. The blocking .get() runs in a worker
        # thread so the event loop can keep streaming tokens from OpenAI.
        speech_synthesis_result = await asyncio.get_running_loop().run_in_executor(
            None, lambda: speech_synthesizer.speak_text_async(text).get())
        # Check the result.
        if speech_synthesis_result.reason == speechsdk.ResultReason.SynthesizingAudioCompleted:
            print("Speech synthesized to speaker for text [{}]".format(text))
        elif speech_synthesis_result.reason == speechsdk.ResultReason.Canceled:
            cancellation_details = speech_synthesis_result.cancellation_details
            print("Speech synthesis canceled: {}".format(cancellation_details.reason))
            if cancellation_details.reason == speechsdk.CancellationReason.Error:
                print("Error details: {}".format(cancellation_details.error_details))
# Detect the wake keyword; block until it is heard or recognition is canceled.
def detect_keyword(recognizer, model, keyword, audio_config):
    done = False

    def recognized_cb(evt):
        # Only a keyword phrase is recognized. The result cannot be 'NoMatch'
        # and there is no timeout. The recognizer runs until a keyword phrase
        # is detected or recognition is canceled (by stop_recognition_async()
        # or due to the end of an input file or stream).
        result = evt.result
        if result.reason == speechsdk.ResultReason.RecognizedKeyword:
            print("RECOGNIZED KEYWORD: {}".format(result.text))
        nonlocal done
        done = True

    def canceled_cb(evt):
        result = evt.result
        if result.reason == speechsdk.ResultReason.Canceled:
            print('CANCELED: {}'.format(result.cancellation_details.reason))
        nonlocal done
        done = True

    # Connect callbacks to the events fired by the keyword recognizer.
    recognizer.recognized.connect(recognized_cb)
    recognizer.canceled.connect(canceled_cb)
    # Start keyword recognition.
    recognizer.start_keyword_recognition(model)
    print('Say something starting with "{}" followed by whatever you want...'.format(keyword))
    while not done:
        time.sleep(.5)
    recognizer.recognized.disconnect_all()
    recognizer.canceled.disconnect_all()
    recognizer.stop_keyword_recognition()
    return done
# Create the async OpenAI client for whichever service is configured.
def create_async_client(config):
    if config.OpenAI.Key:
        client = openai.AsyncClient(api_key=config.OpenAI.Key)
        if config.OpenAI.ApiBase:
            client.base_url = config.OpenAI.ApiBase
        return client, config.OpenAI.Model
    elif config.AzureOpenAI.Key:
        client = openai.AsyncAzureOpenAI(api_key=config.AzureOpenAI.Key,
                                         api_version=config.AzureOpenAI.api_version,
                                         azure_endpoint=config.AzureOpenAI.Endpoint)
        return client, config.AzureOpenAI.Model
    # load_config() guarantees at least one key is present, so this is unreachable.
# Continuously listen for speech input, recognize it, and send the text to OpenAI.
async def chat_with_open_ai():
    # Load config.json.
    config = load_config()
    # Create the async client.
    client, gpt_model = create_async_client(config=config)
    # This example requires config.json.
    speech_config = speechsdk.SpeechConfig(subscription=config.AzureCognitiveServices.Key,
                                           region=config.AzureCognitiveServices.Region)
    audio_output_config = speechsdk.audio.AudioOutputConfig(use_default_speaker=True)
    audio_config = speechsdk.audio.AudioConfig(use_default_microphone=True)
    # Should be the locale for the speaker's language.
    speech_config.speech_recognition_language = config.AzureCognitiveServices.SpeechRecognitionLanguage
    speech_recognizer = speechsdk.SpeechRecognizer(speech_config=speech_config, audio_config=audio_config)
    ending_punctuations = (".", "?", "!", ";")
    if speech_config.speech_recognition_language == "zh-CN":
        ending_punctuations = ("。", "?", "!", ";", "”")
    # The language of the voice that responds on behalf of Azure OpenAI.
    speech_config.speech_synthesis_voice_name = config.AzureCognitiveServices.SpeechSynthesisVoiceName
    speech_synthesizer = speechsdk.SpeechSynthesizer(speech_config=speech_config, audio_config=audio_output_config)
    # The phrase your keyword recognition model triggers on.
    kws_model = speechsdk.KeywordRecognitionModel(config.AzureCognitiveServices.WakePhraseModel)
    conversation = []
    while True:
        print("OpenAI is listening. Say '{}' to start.".format(config.AzureCognitiveServices.WakeWord))
        try:
            # Wait for the wake keyword.
            if not detect_keyword(speech_recognizer, kws_model, config.AzureCognitiveServices.WakeWord, audio_config):
                continue
            # Get audio from the microphone and send it to the speech recognition service.
            speech_recognition_result = speech_recognizer.recognize_once_async().get()
            # If speech is recognized, send it to OpenAI and listen for the response.
            if speech_recognition_result.reason == speechsdk.ResultReason.RecognizedSpeech:
                if speech_recognition_result.text == config.AzureCognitiveServices.StopWord:
                    print("Conversation ended.")
                    break
                print("Recognized speech: {}".format(speech_recognition_result.text))
                # Queue that carries GPT sentences from the producer to the TTS consumer.
                queue = asyncio.Queue()
                # Async task that streams the answer from OpenAI.
                task_ask_gpt = asyncio.create_task(ask_openai_async(client,
                                                                    gpt_model,
                                                                    speech_recognition_result.text,
                                                                    config.OpenAI.MaxTokens,
                                                                    conversation,
                                                                    queue,
                                                                    ending_punctuations))
                # When the producer finishes, enqueue an EOF marker to stop the consumer.
                task_ask_gpt.add_done_callback(lambda _: queue.put_nowait(EOF))
                # Async task for text-to-speech.
                task_ask_tts = asyncio.create_task(text_to_speech_async(speech_synthesizer, queue))
                # Wait until both tasks complete.
                await asyncio.gather(task_ask_gpt, task_ask_tts)
            elif speech_recognition_result.reason == speechsdk.ResultReason.NoMatch:
                print("No speech could be recognized: {}".format(speech_recognition_result.no_match_details))
            elif speech_recognition_result.reason == speechsdk.ResultReason.Canceled:
                cancellation_details = speech_recognition_result.cancellation_details
                print("Speech Recognition canceled: {}".format(cancellation_details.reason))
                if cancellation_details.reason == speechsdk.CancellationReason.Error:
                    print("Error details: {}".format(cancellation_details.error_details))
        except EOFError:
            continue
if __name__ == '__main__':
    try:
        asyncio.run(chat_with_open_ai())
    except Exception as err:
        print("Encountered exception. {}".format(err))