import os
import cv2
import time
import glob
import base64
import numpy as np
import openai
from config import candidate_keys
global_index = 0
openai.api_key = candidate_keys[global_index]
# single request; on failure, rotate to the next key in candidate_keys
def func_get_completion(prompt, model="gpt-3.5-turbo-16k-0613"):
    global global_index
    try:
        messages = [{"role": "user", "content": prompt}]
        response = openai.ChatCompletion.create(
            model=model,
            messages=messages,
            temperature=0,
            max_tokens=1000,
        )
        return response['choices'][0]['message']['content']
    except Exception as e:
        print('Error:', e)  # rotate key to avoid per-key rate limits (e.g., requests per day)
        global_index = (global_index + 1) % len(candidate_keys)
        print(f'========== key index: {global_index} ==========')
        openai.api_key = candidate_keys[global_index]
        return ''
# retry wrapper: keep requesting until a non-empty response, up to maxtry attempts
def get_completion(prompt, model, maxtry=5):
    response = ''
    try_number = 0
    while len(response) == 0:
        try_number += 1
        if try_number > maxtry:
            print(f'failed after {maxtry} attempts')
            break
        response = func_get_completion(prompt, model)
    return response
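
# Usage sketch (hypothetical prompt; assumes config.candidate_keys holds valid
# OpenAI keys for the legacy openai<1.0 ChatCompletion API used above):
#
#   answer = get_completion("Reply with a single word: hello.", model="gpt-3.5-turbo-16k-0613")
#   print(answer)  # empty string if all retries fail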
# polish ChatGPT's output: strip "output"/"Output" prefixes, leading colons, and newlines
def func_postprocess_chatgpt(response):
    response = response.strip()
    if response.startswith("output"): response = response[len("output"):]
    if response.startswith("Output"): response = response[len("Output"):]
    response = response.strip()
    if response.startswith(":"): response = response[len(":"):]
    response = response.strip()
    response = response.replace('\n', '')
    response = response.strip()
    return response
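
# Example of the cleanup above (illustrative input, not a real API response):
#
#   func_postprocess_chatgpt("Output: {'name': 'img1', 'result': ['happy']}\n")
#   # -> "{'name': 'img1', 'result': ['happy']}"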
# ---------------------------------------------------------------------
## convert images/videos into the base64 format expected by GPT-4V
def func_image_to_base64(image_path, grey_flag=False):
    image = cv2.imread(image_path)
    if grey_flag:
        image = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)
    return func_opencv_to_base64(image)

def func_opencv_to_base64(image):
    _, buffer = cv2.imencode('.jpg', image)
    base64_image = base64.b64encode(buffer).decode('utf-8')
    return base64_image
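
# Round-trip sketch: the base64 string can be decoded back to an image with
# OpenCV, which is handy for checking what actually gets sent to the API
# (path is hypothetical):
#
#   b64 = func_image_to_base64('example.jpg')
#   img = cv2.imdecode(np.frombuffer(base64.b64decode(b64), np.uint8), cv2.IMREAD_COLOR)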
# load the transcript text stored in a .npy file
def func_npy_to_text(npy_path):
    text = np.load(npy_path).tolist()
    text = text.strip()
    text = text.replace('\n', '')  # remove newlines
    text = text.replace('\t', '')  # remove tabs
    text = text.strip()
    return text
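
# Example: the .npy files are expected to store a single transcript string,
# e.g. one saved via np.save (filename hypothetical):
#
#   np.save('sample.npy', 'hello\tworld\n')
#   func_npy_to_text('sample.npy')  # -> 'helloworld'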
# uniformly sample frames; supports two input types: a video file, or a directory of pre-extracted frames
def sample_frames_from_video(video_path, samplenum=3):
    if os.path.isdir(video_path):  # frames already extracted into video_path
        select_frames = sorted(glob.glob(video_path + '/*'))
        select_frames = select_frames[:samplenum]
        select_frames = [cv2.imread(item) for item in select_frames]
    else:  # original video file
        frames = []
        cap = cv2.VideoCapture(video_path)
        while True:
            ret, frame = cap.read()
            if not ret: break
            frames.append(frame)
        cap.release()
        # pad short videos by repeating the last frame, then sample uniformly
        while len(frames) < samplenum:
            frames.append(frames[-1])
        tgt_length = int(len(frames) / samplenum) * samplenum
        frames = frames[:tgt_length]
        indices = np.arange(0, len(frames), int(len(frames) / samplenum)).astype(int).tolist()
        print('sample indexes: ', indices)
        assert len(indices) == samplenum
        select_frames = [frames[index] for index in indices]
    assert len(select_frames) == samplenum, 'actual number of sampled frames is not equal to samplenum'
    return select_frames
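
# Usage sketch (hypothetical paths): both call forms return exactly
# `samplenum` BGR frames as numpy arrays:
#
#   frames = sample_frames_from_video('clip.mp4', samplenum=3)       # video file
#   frames = sample_frames_from_video('clip_frames', samplenum=3)    # frame directory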
# ---------------------------------------------------------------------
## Emotion
# ---------------------------------------------------------------------
# batch request: up to 20 images per call
def get_image_emotion_batch(image_paths, candidate_list, sleeptime=0, grey_flag=False, model='gpt-4-vision-preview'):
    prompt = [
        {
            "type": "text",
            "text": f"Please play the role of a facial expression classification expert. We provide {len(image_paths)} images. Please ignore the speaker's identity and focus on the facial expression. \
            For each image, please sort the provided categories from high to low according to the top 5 similarity with the input image. \
            Here are the optional categories: {candidate_list}. Please ignore the speaker's identity and focus on the facial expression. The output format should be {{'name':, 'result':}} for each image."
        }
    ]
    for ii, image_path in enumerate(image_paths):
        prompt.append(
            {
                "type": f"image-{ii+1}",
                "image": func_image_to_base64(image_path, grey_flag),
            }
        )
    print(prompt[0]['text'])  # debug
    for item in prompt: print(item['type'])  # debug
    time.sleep(sleeptime)
    response = get_completion(prompt, model)
    response = func_postprocess_chatgpt(response)
    print(response)
    return response
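
# Usage sketch (hypothetical paths and labels); the returned string is the
# model's raw ranking text, which still needs task-specific parsing downstream:
#
#   paths = ['face1.jpg', 'face2.jpg']
#   labels = ['happy', 'sad', 'angry', 'neutral', 'surprise']
#   result = get_image_emotion_batch(paths, labels, sleeptime=1)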
def get_evoke_emotion_batch(image_paths, candidate_list, sleeptime=0, model='gpt-4-vision-preview'):
    prompt = [
        {
            "type": "text",
            "text": f"Please play the role of an emotion recognition expert. We provide {len(image_paths)} images. \
            Please recognize the sentiments evoked by these images (i.e., guess how a viewer might emotionally feel after seeing these images). \
            If there is a person in the image, ignore that person's identity. \
            For each image, please sort the provided categories from high to low according to the similarity with the input image. \
            Here are the optional categories: {candidate_list}. If there is a person in the image, ignore that person's identity. \
            The output format should be {{'name':, 'result':}} for each image."
        }
    ]
    for ii, image_path in enumerate(image_paths):
        prompt.append(
            {
                "type": f"image-{ii+1}",
                "image": func_image_to_base64(image_path),
            }
        )
    print(prompt[0]['text'])  # debug
    for item in prompt: print(item['type'])  # debug
    time.sleep(sleeptime)
    response = get_completion(prompt, model)
    response = func_postprocess_chatgpt(response)
    print(response)
    return response
def get_micro_emotion_batch(image_paths, candidate_list, sleeptime=0, model='gpt-4-vision-preview'):
    prompt = [
        {
            "type": "text",
            "text": f"Please play the role of a micro-expression recognition expert. We provide {len(image_paths)} images. Please ignore the speaker's identity and focus on the facial expression. \
            For each image, please sort the provided categories from high to low according to the similarity with the input image. \
            The expression may not be obvious, please pay attention to the details of the face. \
            Here are the optional categories: {candidate_list}. Please ignore the speaker's identity and focus on the facial expression. The output format should be {{'name':, 'result':}} for each image."
        }
    ]
    for ii, image_path in enumerate(image_paths):
        prompt.append(
            {
                "type": f"image-{ii+1}",
                "image": func_image_to_base64(image_path),
            }
        )
    print(prompt[0]['text'])  # debug
    for item in prompt: print(item['type'])  # debug
    time.sleep(sleeptime)
    response = get_completion(prompt, model)
    response = func_postprocess_chatgpt(response)
    print(response)
    return response
# # batch request: up to 20 audios per call
# def get_audio_emotion_batch(image_paths, candidate_list, sleeptime=0, model='gpt-4-vision-preview'):
#     prompt = [
#         {
#             "type": "text",
#             "text": f"Please play the role of an audio expression classification expert. We provide {len(image_paths)} audios, each with an image of its Mel spectrogram. \
#             Please ignore the speaker's identity and recognize the speaker's expression from the provided Mel spectrogram. \
#             For each sample, please sort the provided categories from high to low according to the top 5 similarity with the input. \
#             Here are the optional categories: {candidate_list}. The output format should be {{'name':, 'result':}} for each audio."
#         }
#     ]
#     for ii, image_path in enumerate(image_paths):
#         prompt.append(
#             {
#                 "type": f"audio-{ii+1}",
#                 "image": func_image_to_base64(image_path),
#             }
#         )
#     print(prompt[0]['text'])  # debug
#     for item in prompt: print(item['type'])  # debug
#     time.sleep(sleeptime)
#     response = get_completion(prompt, model)
#     response = func_postprocess_chatgpt(response)
#     print(response)
#     return response
def get_text_emotion_batch(npy_paths, candidate_list, sleeptime=0, model='gpt-4-vision-preview'):
    prompt = [
        {
            "type": "text",
            "text": f"Please play the role of a textual emotion classification expert. We provide {len(npy_paths)} texts. \
            Please recognize the speaker's expression from the provided text. \
            For each text, please sort the provided categories from high to low according to the top 5 similarity with the input. \
            Here are the optional categories: {candidate_list}. The output format should be {{'name':, 'result':}} for each text."
        }
    ]
    for ii, npy_path in enumerate(npy_paths):
        prompt.append(
            {
                "type": "text",
                "text": f"{func_npy_to_text(npy_path)}",
            }
        )
    print(prompt[0]['text'])  # debug
    for item in prompt: print(item['type'])  # debug
    time.sleep(sleeptime)
    response = get_completion(prompt, model)
    response = func_postprocess_chatgpt(response)
    print(response)
    return response
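
# Usage sketch (hypothetical paths): each .npy file holds one transcript string:
#
#   result = get_text_emotion_batch(['utt1.npy', 'utt2.npy'],
#                                   ['happy', 'sad', 'angry', 'neutral'])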
# batch request: up to 20 videos per call, each represented by `samplenum` sampled frames
def get_video_emotion_batch(video_paths, candidate_list, sleeptime=0, samplenum=3, model='gpt-4-vision-preview'):
    prompt = [
        {
            "type": "text",
            "text": f"Please play the role of a video expression classification expert. We provide {len(video_paths)} videos, each with {samplenum} temporally uniformly sampled frames. Please ignore the speaker's identity and focus on their facial expression. \
            For each video, please sort the provided categories from high to low according to the top 5 similarity with the input video. \
            Here are the optional categories: {candidate_list}. Please ignore the speaker's identity and focus on the facial expression. The output format should be {{'name':, 'result':}} for each video."
        }
    ]
    for ii, video_path in enumerate(video_paths):
        video_frames = sample_frames_from_video(video_path, samplenum)
        for jj, video_frame in enumerate(video_frames):
            prompt.append(
                {
                    "type": f"video{ii+1}_image{jj+1}",
                    "image": func_opencv_to_base64(video_frame),
                }
            )
    print(prompt[0]['text'])  # debug
    for item in prompt: print(item['type'])  # debug
    time.sleep(sleeptime)
    response = get_completion(prompt, model)
    response = func_postprocess_chatgpt(response)
    print(response)
    return response
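
# Usage sketch (hypothetical path): each video contributes `samplenum`
# uniformly sampled frames to a single request:
#
#   result = get_video_emotion_batch(['clip1.mp4'], ['happy', 'sad', 'neutral'], samplenum=3)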
def get_multi_emotion_batch(video_paths, candidate_list, sleeptime=0, samplenum=3, model='gpt-4-vision-preview'):
    prompt = [
        {
            "type": "text",
            "text": f"Please play the role of a video expression classification expert. We provide {len(video_paths)} videos, each with the speaker's content and {samplenum} temporally uniformly sampled frames. \
            Please ignore the speaker's identity and focus on their emotions. \
            For each video, please sort the provided categories from high to low according to the top 5 similarity with the input video. \
            Here are the optional categories: {candidate_list}. Please ignore the speaker's identity and focus on their emotions. The output format should be {{'name':, 'result':}} for each video."
        }
    ]
    for ii, video_path in enumerate(video_paths):
        # convert video_path to the paired transcript path: .../video/xxx.mp4 -> .../text/xxx.npy
        split_paths = video_path.split('/')
        split_paths[-2] = 'text'
        split_paths[-1] = split_paths[-1].rsplit('.', 1)[0] + '.npy'
        text_path = "/".join(split_paths)
        assert os.path.exists(text_path)
        prompt.append(
            {
                "type": "text",
                "text": f"{func_npy_to_text(text_path)}",
            }
        )
        # read and attach the sampled frames
        video_frames = sample_frames_from_video(video_path, samplenum=samplenum)
        for jj, video_frame in enumerate(video_frames):
            prompt.append(
                {
                    "type": f"video{ii+1}_image{jj+1}",
                    "image": func_opencv_to_base64(video_frame),
                }
            )
    print(prompt[0]['text'])  # debug
    for item in prompt: print(item['type'])  # debug
    time.sleep(sleeptime)
    response = get_completion(prompt, model)
    response = func_postprocess_chatgpt(response)
    print(response)
    return response
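
# Usage sketch (hypothetical layout): for 'root/video/clip1.mp4', the function
# expects the paired transcript at 'root/text/clip1.npy':
#
#   result = get_multi_emotion_batch(['root/video/clip1.mp4'],
#                                    ['happy', 'sad', 'neutral'], samplenum=3)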