-
Notifications
You must be signed in to change notification settings - Fork 3
/
Copy pathgpt.py
251 lines (221 loc) · 11.5 KB
/
gpt.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
import base64
import requests
import os
import json
from collections import defaultdict
from pathlib import Path
from typing import Optional
# OpenAI API key, read from the environment once at import time.
# It is None when OPENAI_API_KEY is unset; query() interpolates it into the
# Authorization header as-is.
api_key = os.environ.get("OPENAI_API_KEY")
def ask_classes_and_descriptions(text, term, termlist, out_file_path: Optional[str|Path] = None, abstract_id: Optional[int] = None):
    """Get GPT results based on the labels, classes and descriptions of the terms.

    Args:
        text: Abstract in which ``term`` occurs; interpolated into the prompt.
        term: Query term to classify against the candidates.
        termlist: Mapping of curie -> annotation dict. Each annotation must
            provide the keys ``"label"``, ``"biolink_type"`` and
            ``"description"``. The annotation dicts are updated in place:
            ``"curie"`` is always set, and ``"synonym_Type"`` is set for
            candidates the model matched and removed for all others.
        out_file_path: Optional path of a JSON log file. When given, a record
            with the abstract id, term, prompt and parsed model output is
            appended to the JSON list stored there (the file is created if it
            does not exist).
        abstract_id: Identifier stored in the log record only.

    Returns:
        A ``defaultdict(list)`` mapping each synonym type reported by the
        model to the list of matching annotation dicts; candidates the model
        did not report are grouped under ``"unrelated"``.

    Raises:
        KeyError: If an annotation lacks a required key, or a model result
            lacks ``"synonym"``, ``"vocabulary class"`` or ``"synonymType"``.
    """
    # Several curies can share one (label, class) pair, so group them and send
    # each distinct pair to the model only once.
    labels = defaultdict(list)
    descriptions = defaultdict(list)
    for curie, annotation in termlist.items():
        key = (annotation["label"], annotation["biolink_type"])
        labels[key].append(curie)
        descriptions[key].append(annotation["description"])
    synonym_list = [(x[0], x[1], d) for x, d in descriptions.items()]

    # NOTE(review): the prompt defines exact/narrow/broad/related synonyms but
    # its closing instruction limits synonymType to "exact" or "narrow" --
    # confirm which is intended. The prompt text is left unchanged here.
    prompt = f""" You are an expert in biomedical vocabularies and ontologies. I will provide you with the abstract to a scientific paper, as well as
a query term: biomedical entity that occurs in that abstract. I will also provide you a list of possible synonyms for the query term, along
with their class as defined within their vocabulary, such as Gene or Disease. This will help you distinguish between
entities with the same name such as HIV, which could refer to either a particular virus (class OrganismTaxon) or a disease (class Disease). It can also
help distinguish between a disease hyperlipidemia (class Disease) versus hyperlipidemia as a symptom of another disease (class PhenotpyicFeature).
For some entities, I will also provide a description of the entity along with the name and class.
Please determine whether the query term, as it is used in the abstract, is an exact synonym of any of the terms in the list. There should be at most one
exact synonym of the query term. If there are no exact synonyms for the query term in the list, please look for narrow, broad, or related synonyms,
The synonym is narrow if the query term is a more specific form of one of the list terms. For example, the query term "Type 2 Diabetes" would be a
narrow synonym of "Diabetes" because it is not an exact synonym, but a more specific form.
The synonym is broad if the query term is a more general form of the list term. For instance, the query term "brain injury" would be a broad synonym
of "Cerebellar Injury" because it is more generic.
The synonym is related if it is neither exact, narrow, or broad, but is still a similar enough term. For instance the query term "Pain" would be
a related synonym of "Pain Disorder".
It is also possible that there are neither exact nor narrow synonyms of the query term in the list.
Provide your answers in the following JSON structure:
[
{{
"synonym": ...,
"vocabulary class": ...,
"synonymType": ...
}}
]
where the value for synonym is the element from the synonym list, vocabulary class is the
class that I input associated with that synonym, and synonymType is either "exact" or "narrow".
abstract: {text}
query_term: {term}
possible_synonyms_classes_and_descriptions: {synonym_list}
"""
    results = query(prompt)

    if out_file_path is not None:
        record = {
            'abstract_id': abstract_id,
            'term': term,
            'prompt': prompt,
            'output': results,
        }
        # The log is one JSON list; read it back, append, and rewrite it.
        if os.path.isfile(out_file_path):
            with open(out_file_path, "r", encoding="utf-8") as f:
                out = json.load(f)
            out.append(record)
        else:
            out = [record]
        with open(out_file_path, "w", encoding="utf-8") as f:
            json.dump(out, f, indent=4)

    # Collect this call's verdicts separately so that a "synonym_Type" left on
    # an annotation by an earlier call cannot leak into this call's grouping.
    assigned = {}
    for result in results:
        syn = result['synonym']
        cls = result['vocabulary class']
        syntype = result['synonymType']
        # .get() avoids inserting empty entries for labels the model made up.
        for curie in labels.get((syn, cls), ()):
            assigned[curie] = syntype

    grouped_by_syntype = defaultdict(list)
    for curie, annotation in termlist.items():
        annotation["curie"] = curie
        if curie in assigned:
            annotation["synonym_Type"] = assigned[curie]
        else:
            annotation.pop("synonym_Type", None)
        grouped_by_syntype[assigned.get(curie, "unrelated")].append(annotation)
    return grouped_by_syntype
def ask_classes(text, term, termlist, out_file_path: Optional[str|Path] = None, abstract_id: Optional[int] = None):
    """Get GPT results based on the labels and vocabulary classes of the terms.

    Args:
        text: Abstract in which ``term`` occurs; interpolated into the prompt.
        term: Query term to classify against the candidates.
        termlist: Mapping of curie -> annotation dict. Each annotation must
            provide the keys ``"label"`` and ``"biolink_type"``. The
            annotation dicts are updated in place: ``"curie"`` is always set,
            and ``"synonym_Type"`` is set for candidates the model matched and
            removed for all others.
        out_file_path: Optional path of a JSON log file. When given, a record
            with the abstract id, term, prompt and parsed model output is
            appended to the JSON list stored there (the file is created if it
            does not exist).
        abstract_id: Identifier stored in the log record only.

    Returns:
        A ``defaultdict(list)`` mapping each synonym type reported by the
        model to the list of matching annotation dicts; candidates the model
        did not report are grouped under ``"unrelated"``.

    Raises:
        KeyError: If an annotation lacks a required key, or a model result
            lacks ``"synonym"``, ``"vocabulary class"`` or ``"synonymType"``.
    """
    # Several curies can share one (label, class) pair, so group them and send
    # each distinct pair to the model only once.
    labels = defaultdict(list)
    for curie, annotation in termlist.items():
        labels[(annotation["label"], annotation["biolink_type"])].append(curie)
    synonym_list = list(labels.keys())

    # NOTE(review): the prompt defines exact/narrow/broad/related synonyms but
    # its closing instruction limits synonymType to "exact" or "narrow" --
    # confirm which is intended. The prompt text is left unchanged here.
    prompt = f""" You are an expert in biomedical vocabularies and ontologies. I will provide you with the abstract to a scientific paper, as well as
a query term: biomedical entity that occurs in that abstract. I will also provide you a list of possible synonyms for the query term, along
with their class as defined within their vocabulary, such as Gene or Disease. This will help you distinguish between
entities with the same name such as HIV, which could refer to either a particular virus (class OrganismTaxon) or a disease (class Disease). It can also
help distinguish between a disease hyperlipidemia (class Disease) versus hyperlipidemia as a symptom of another disease (class PhenotpyicFeature).
Please determine whether the query term, as it is used in the abstract, is an exact synonym of any of the terms in the list. There should be at most one
exact synonym of the query term. If there are no exact synonyms for the query term in the list, please look for narrow, broad, or related synonyms,
The synonym is narrow if the query term is a more specific form of one of the list terms. For example, the query term "Type 2 Diabetes" would be a
narrow synonym of "Diabetes" because it is not an exact synonym, but a more specific form.
The synonym is broad if the query term is a more general form of the list term. For instance, the query term "brain injury" would be a broad synonym
of "Cerebellar Injury" because it is more generic.
The synonym is related if it is neither exact, narrow, or broad, but is still a similar enough term. For instance the query term "Pain" would be
a related synonym of "Pain Disorder".
It is also possible that there are neither exact nor narrow synonyms of the query term in the list.
Provide your answers in the following JSON structure:
[
{{
"synonym": ...,
"vocabulary class": ...,
"synonymType": ...
}}
]
where the value for synonym is the element from the synonym list, vocabulary class is the
class that I input associated with that synonym, and synonymType is either "exact" or "narrow".
abstract: {text}
query_term: {term}
possible_synonyms_and_classes: {synonym_list}
"""
    results = query(prompt)

    if out_file_path is not None:
        record = {
            'abstract_id': abstract_id,
            'term': term,
            'prompt': prompt,
            'output': results,
        }
        # The log is one JSON list; read it back, append, and rewrite it.
        if os.path.isfile(out_file_path):
            with open(out_file_path, "r", encoding="utf-8") as f:
                out = json.load(f)
            out.append(record)
        else:
            out = [record]
        with open(out_file_path, "w", encoding="utf-8") as f:
            json.dump(out, f, indent=4)

    # Collect this call's verdicts separately so that a "synonym_Type" left on
    # an annotation by an earlier call cannot leak into this call's grouping.
    assigned = {}
    for result in results:
        syn = result['synonym']
        cls = result['vocabulary class']
        syntype = result['synonymType']
        # .get() avoids inserting empty entries for labels the model made up.
        for curie in labels.get((syn, cls), ()):
            assigned[curie] = syntype

    grouped_by_syntype = defaultdict(list)
    for curie, annotation in termlist.items():
        annotation["curie"] = curie
        if curie in assigned:
            annotation["synonym_Type"] = assigned[curie]
        else:
            annotation.pop("synonym_Type", None)
        grouped_by_syntype[assigned.get(curie, "unrelated")].append(annotation)
    return grouped_by_syntype
def ask_labels(text, term, termlist, out_file_path: Optional[str|Path] = None, abstract_id: Optional[int] = None):
    """Get GPT results based only on the labels of the terms.

    Args:
        text: Abstract in which ``term`` occurs; interpolated into the prompt.
        term: Query term to classify against the candidates.
        termlist: Mapping of curie -> annotation dict. Each annotation must
            provide the key ``"label"``. The annotation dicts are updated in
            place: ``"curie"`` is always set, and ``"synonym_Type"`` is set
            for candidates the model matched and removed for all others.
        out_file_path: Optional path of a JSON log file. When given, a record
            with the abstract id, term, prompt and parsed model output is
            appended to the JSON list stored there (the file is created if it
            does not exist).
        abstract_id: Identifier stored in the log record only.

    Returns:
        A ``defaultdict(list)`` mapping each synonym type reported by the
        model to the list of matching annotation dicts; candidates the model
        did not report are grouped under ``"unrelated"``.

    Raises:
        KeyError: If an annotation lacks ``"label"``, or a model result lacks
            ``"synonym"`` or ``"synonymType"``.
    """
    # Several curies can share one label, so group them and send each distinct
    # label to the model only once.
    labels = defaultdict(list)
    for curie, annotation in termlist.items():
        labels[annotation["label"]].append(curie)
    synonym_list = list(labels.keys())

    # NOTE(review): the prompt defines exact/narrow/broad/related synonyms but
    # its closing instruction limits synonymType to "exact" or "narrow" --
    # confirm which is intended. The prompt text is left unchanged here.
    prompt = f""" You are an expert in biomedical vocabularies and ontologies. I will provide you with the abstract to a scientific paper, as well as
a query term: biomedical entity that occurs in that abstract. I will also provide you a list of possible synonyms for the query term. Please
determine whether the query term, as it is used in the abstract, is an exact synonym of any of the terms in the list. There should be at most one
exact synonym of the query term. If there are no exact synonyms for the query term in the list, please look for narrow, broad, or related synonyms,
The synonym is narrow if the query term is a more specific form of one of the list terms. For example, the query term "Type 2 Diabetes" would be a
narrow synonym of "Diabetes" because it is not an exact synonym, but a more specific form.
The synonym is broad if the query term is a more general form of the list term. For instance, the query term "brain injury" would be a broad synonym
of "Cerebellar Injury" because it is more generic.
The synonym is related if it is neither exact, narrow, or broad, but is still a similar enough term. For instance the query term "Pain" would be
a related synonym of "Pain Disorder".
It is also possible that there are neither exact nor narrow synonyms of the query term in the list.
Provide your answers in the following JSON structure:
[
{{
"synonym": ...,
"synonymType": ...
}}
]
where the value for synonym is the element from the synonym list, and synonymType is either "exact" or "narrow".
abstract: {text}
query_term: {term}
possible_synonyms: {synonym_list}
"""
    results = query(prompt)

    if out_file_path is not None:
        record = {
            'abstract_id': abstract_id,
            'term': term,
            'prompt': prompt,
            'output': results,
        }
        # The log is one JSON list; read it back, append, and rewrite it.
        if os.path.isfile(out_file_path):
            with open(out_file_path, "r", encoding="utf-8") as f:
                out = json.load(f)
            out.append(record)
        else:
            out = [record]
        with open(out_file_path, "w", encoding="utf-8") as f:
            json.dump(out, f, indent=4)

    # Collect this call's verdicts separately so that a "synonym_Type" left on
    # an annotation by an earlier call cannot leak into this call's grouping.
    assigned = {}
    for result in results:
        syn = result['synonym']
        syntype = result['synonymType']
        # .get() avoids inserting empty entries for labels the model made up.
        for curie in labels.get(syn, ()):
            assigned[curie] = syntype

    grouped_by_syntype = defaultdict(list)
    for curie, annotation in termlist.items():
        annotation["curie"] = curie
        if curie in assigned:
            annotation["synonym_Type"] = assigned[curie]
        else:
            annotation.pop("synonym_Type", None)
        grouped_by_syntype[assigned.get(curie, "unrelated")].append(annotation)
    return grouped_by_syntype
def query(prompt, timeout=300):
    """Send ``prompt`` to the OpenAI chat completions API and parse the reply.

    The prompt is sent as a single user message to the
    ``gpt-4-0125-preview`` model. The reply is expected to contain a JSON
    list; everything from the first ``[`` to the last ``]`` of the message
    content is parsed and returned, so surrounding prose or code fences in the
    reply are ignored.

    Args:
        prompt: Full prompt text.
        timeout: Seconds to wait for the HTTP request before giving up
            (passed to ``requests.post``). Without a timeout a stalled
            connection would block forever.

    Returns:
        The object decoded from the bracketed JSON span of the reply.

    Raises:
        requests.RequestException: On connection failure, timeout, or a
            non-2xx HTTP status.
        ValueError: If the reply contains no ``[...]`` span or the span is not
            valid JSON (``json.JSONDecodeError`` is a ``ValueError``).
    """
    headers = {
        "Content-Type": "application/json",
        "Authorization": f"Bearer {api_key}"
    }
    payload = {
        "model": "gpt-4-0125-preview",
        "messages": [
            {
                "role": "user",
                "content": [
                    {
                        "type": "text",
                        "text": prompt
                    }
                ]
            }
        ]
    }
    response = requests.post(
        "https://api.openai.com/v1/chat/completions",
        headers=headers,
        json=payload,
        timeout=timeout,
    )
    # Report HTTP-level failures (bad key, rate limit, ...) directly instead
    # of failing later with an opaque KeyError on the missing "choices" field.
    response.raise_for_status()
    content = response.json()["choices"][0]["message"]["content"]

    start = content.find("[")
    end = content.rfind("]")
    if start == -1 or end < start:
        raise ValueError(f"No JSON list found in model response: {content!r}")
    return json.loads(content[start:end + 1])