import json
import logging
import time

import torch
from openai import OpenAI, APIError
from torchmetrics import AUROC
from tqdm import tqdm

from config import args
from utils import print_exp

SYSTEM_PROMPT_ORACLE_EQUIVALENCY = (
    "You are an automated grading assistant helping a teacher grade student answers."
)

PROMPT_ANSWER_KEY_EQUIVALENCY = (
    "The problem is: <question>\n\n The correct answer for this problem is: <ground-truth>\n "
    "A student submitted the answer: <prediction>\n "
    "The student's answer must be correct and specific but not overcomplete "
    "(for example, if they provide two different answers, they did not get the question right). "
    "However, small differences in formatting should not be penalized (for example, 'New York City' is equivalent to 'NYC'). "
    "Did the student provide an equivalent answer to the ground truth? Please answer yes or no without any explanation: "
)

def openai_query(system_prompt, prompt, openai_model_name="gpt-4o-mini"):
    """Query the OpenAI chat API, retrying once per second on API errors."""
    client = OpenAI()
    sampled_response = None
    while sampled_response is None:
        try:
            response = client.chat.completions.create(
                model=openai_model_name,
                messages=[
                    {"role": "system", "content": system_prompt},
                    {"role": "user", "content": prompt},
                ],
            )
            sampled_response = response.choices[0].message.content
        except APIError:
            # logging.exception already records the traceback, so no explicit
            # exc_info argument is needed.
            logging.exception("OpenAI API Error.")
            time.sleep(1)
    return sampled_response
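

# Hypothetical usage sketch for openai_query (the question and answers below are
# made up for illustration; the placeholders come from PROMPT_ANSWER_KEY_EQUIVALENCY):
#
#     prompt = (
#         PROMPT_ANSWER_KEY_EQUIVALENCY.replace("<question>", "What is 2 + 2?")
#         .replace("<ground-truth>", "4")
#         .replace("<prediction>", "four")
#     )
#     reply = openai_query(SYSTEM_PROMPT_ORACLE_EQUIVALENCY, prompt)
#     # reply is expected to be "yes" or "no"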

def label_samples():
    """Label each LLM answer as correct/incorrect and append the records to a JSONL file."""
    with open(f"{args.output_path}/output_v1.json", 'r', encoding='utf-8') as f:
        json_data = [json.loads(line) for line in f]
    with open(f"{args.output_path}/output_v1_w_labels.json", "a", encoding="utf-8") as f:
        for line in tqdm(json_data, total=len(json_data)):
            sample_id = line['id']
            question = line['question']
            correct_answer = line['correct answer']
            llm_answer = line['llm answer']
            if args.dataset in ["gsm8k", "svamp", "ASDiv"]:
                # Math datasets have numeric answers, so substring matching is
                # enough; also check the integer form (e.g. "4.0" vs "4").
                label = (str(correct_answer) in llm_answer.lower()) or (
                    str(int(correct_answer)) in llm_answer.lower()
                )
            else:
                # Open-ended answers: ask an LLM grader whether the prediction
                # is equivalent to the ground truth.
                prompt = (
                    PROMPT_ANSWER_KEY_EQUIVALENCY.replace("<ground-truth>", str(correct_answer))
                    .replace("<prediction>", llm_answer)
                    .replace("<question>", question)
                )
                sampled_response = openai_query(
                    system_prompt=SYSTEM_PROMPT_ORACLE_EQUIVALENCY, prompt=prompt
                )
                label = "yes" in sampled_response.strip().lower()
            formatted_data = {
                "id": sample_id,
                "question": question,
                # "type": line['type'],
                "correct answer": correct_answer,
                "llm answer": llm_answer,
                "label": label,
                "llm response": line['llm response'],
                "llm answer token probability": line['llm answer token probability'],
                "step-wise keywords": line['step-wise keywords'],
                "keyword token probability": line['keyword token probability'],
                "keyword contribution": line['keyword contribution'],
            }
            f.write(json.dumps(formatted_data, ensure_ascii=False) + "\n")
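

# Each line written by label_samples is a JSON record like the following
# (field names match the code above; the values are illustrative only):
#
#     {"id": 0, "question": "...", "correct answer": "4", "llm answer": "4",
#      "label": true, "llm response": "...", "llm answer token probability": [...],
#      "step-wise keywords": [...], "keyword token probability": [...],
#      "keyword contribution": [...]}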

def compute_auroc():
    """Compute AUROC of the confidence scores against the correctness labels."""
    json_data_labels = []
    with open(f"{args.output_path}/output_v1_w_labels.json", 'r', encoding='utf-8') as f:
        for line in f:
            json_data_labels.append(json.loads(line))
    json_data_output = []
    with open(f"{args.output_path}/confidences/output_v1_{args.uq_engine}.json", 'r', encoding='utf-8') as f:
        for line in f:
            json_data_output.append(json.loads(line))
    # Map each question to its binary correctness label.
    label_dict = {}
    for line in json_data_labels:
        label_dict[line['question']] = 1 if line['label'] else 0
    all_confidences = []
    all_auroc_target = []
    for line in json_data_output:
        question = line['question']
        all_confidences.append(line['confidence'])
        all_auroc_target.append(label_dict[question])
    # AUROC measures how well confidence separates correct from incorrect answers.
    auroc = AUROC(task="binary")
    auroc_value = auroc(torch.tensor(all_confidences), torch.tensor(all_auroc_target))
    print(f"AUROC: {auroc_value}")
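

# Minimal sketch of the AUROC call on toy data (values illustrative): a score of
# 1.0 means every correct answer received a higher confidence than every
# incorrect one, while 0.5 is no better than chance.
#
#     >>> metric = AUROC(task="binary")
#     >>> metric(torch.tensor([0.9, 0.2, 0.8]), torch.tensor([1, 0, 1]))
#     tensor(1.)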

if __name__ == '__main__':
    print_exp(args)
    label_samples()
    compute_auroc()