-
Notifications
You must be signed in to change notification settings - Fork 2
/
utils.py
76 lines (62 loc) · 2.66 KB
/
utils.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
import pandas as pd
import openai
import os
from dotenv import load_dotenv
# Load variables from a .env file first so the key lookup below can see them.
load_dotenv()
openai.api_key = os.getenv('OPENAI_API_KEY')
def call_gpt(prompt, endpoint, max_tokens=200):
    """Send ``prompt`` as a single user message and return the reply text.

    ``endpoint`` is forwarded as the ``model`` argument of the chat
    completion request and ``max_tokens`` as its ``max_tokens`` argument.
    The return value is the ``message.content`` of the first choice in the
    response.
    """
    user_message = {"role": "user", "content": prompt}
    response = openai.chat.completions.create(
        model=endpoint,
        messages=[user_message],
        max_tokens=max_tokens,
    )
    first_choice = response.choices[0]
    return first_choice.message.content
def truncate_text(text, max_tokens):
    """Return ``text`` cut down to its first ``max_tokens`` words.

    "Tokens" here are whitespace-separated words as produced by
    ``str.split()``, not model tokens.

    Args:
        text: The string to shorten.
        max_tokens: Maximum number of words to keep. Zero or a negative
            value keeps no words.

    Returns:
        ``text`` unchanged when it has at most ``max_tokens`` words.
        Otherwise the first ``max_tokens`` words joined by single spaces,
        so the original whitespace (newlines, tabs, repeated spaces) is
        not preserved in the truncated result.
    """
    # A negative budget would otherwise reach the slice below, where
    # tokens[:-k] drops words from the END and keeps nearly the whole text.
    budget = max(max_tokens, 0)
    tokens = text.split()
    if len(tokens) <= budget:
        return text
    return ' '.join(tokens[:budget])
def _safe_ratio(numerator, denominator):
    """Return ``numerator / denominator``, or 0.0 when the denominator is zero."""
    return numerator / denominator if denominator else 0.0


def human_rater_performance(
    ratings_file_path='./Data/ratings-00000.tsv',
    note_status_history_file_path='./Data/noteStatusHistory-00000.tsv',
):
    """
    Score individual user ratings against the current status of each note.

    Every rating row whose note is currently rated helpful or not helpful is
    treated as a binary prediction: a ``helpfulnessLevel`` of ``'HELPFUL'``
    is a positive prediction, any other value (including a missing one) is a
    negative prediction. The note's current status is the ground truth.
    Ratings for notes with any other status, or for notes absent from the
    status file, are ignored.

    Args:
        ratings_file_path: Tab-separated file with ``noteId`` and
            ``helpfulnessLevel`` columns.
        note_status_history_file_path: Tab-separated file with ``noteId``
            and ``currentStatus`` columns.

    Returns:
        Dict with the keys ``'Accuracy'``, ``'Precision'``, ``'Recall'``,
        ``'Specificity'`` and ``'F1'``. A metric whose denominator is zero
        (for example when no rating matches a decided note) is reported as
        0.0 instead of raising ``ZeroDivisionError``.

    Side effects:
        Prints the confusion counts as ``tp fp tn fn``.
    """
    # NOTE: only one ratings file is read. An earlier revision loaded the
    # same path a second time into a frame that was never used.
    ratings_df = pd.read_csv(
        ratings_file_path, sep='\t', usecols=['noteId', 'helpfulnessLevel']
    )
    note_status_df = pd.read_csv(
        note_status_history_file_path, sep='\t', usecols=['noteId', 'currentStatus']
    )

    # Map note id to the current status: 0 if not helpful, 1 if helpful.
    # Any other status maps to NaN and is dropped.
    status_to_label = {
        'CURRENTLY_RATED_NOT_HELPFUL': 0,
        'CURRENTLY_RATED_HELPFUL': 1,
    }
    labels = note_status_df['currentStatus'].map(status_to_label)
    decided = labels.notna()
    # Building the dict in row order means a later row for the same note id
    # overrides an earlier one.
    note_id_to_status = dict(
        zip(note_status_df.loc[decided, 'noteId'], labels[decided].astype(int))
    )

    # Keep only the ratings whose note has a decided status.
    actual = ratings_df['noteId'].map(note_id_to_status)
    scored = actual.notna()
    actual_helpful = actual[scored] == 1
    rated_helpful = ratings_df.loc[scored, 'helpfulnessLevel'] == 'HELPFUL'

    tp = int((actual_helpful & rated_helpful).sum())
    fn = int((actual_helpful & ~rated_helpful).sum())
    fp = int((~actual_helpful & rated_helpful).sum())
    tn = int((~actual_helpful & ~rated_helpful).sum())
    print(tp, fp, tn, fn)

    accuracy = _safe_ratio(tp + tn, tp + tn + fp + fn)
    precision = _safe_ratio(tp, tp + fp)
    recall = _safe_ratio(tp, tp + fn)
    specificity = _safe_ratio(tn, tn + fp)
    f1 = _safe_ratio(2 * precision * recall, precision + recall)
    return {
        'Accuracy': accuracy,
        'Precision': precision,
        'Recall': recall,
        'Specificity': specificity,
        'F1': f1,
    }