-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathprompt_generator.py
313 lines (270 loc) · 16.2 KB
/
prompt_generator.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
import numpy as np
import random
from utils import to_char
from utils import smiles2iupac
# System prompt variants.
# Within this file only system_message_7 (bound to SYSTEM_MESSAGE_NOCONF),
# system_message_8 (bound to SYSTEM_MESSAGE), system_message_9 and
# system_message_10 (both selected by `name` in prompt_chat / prompt_in_context)
# are referenced; the remaining variants are not used by any function here.
# NOTE(review): several variants spell "associated" as "accociated". The text is
# sent verbatim to the model, so it is left untouched here.

# Letter-only answer; candidates carry a bracketed score where lower is better.
system_message_1 = """
Please choose the correct answer in the choices according to the context, the question and the answer candidates.
Each candidate is accociated with a confidence score within a bracket, where a lower score is better.
Your answer should be one uppercase character, representing your choice.
"""
# Same as system_message_1, plus a description of the NAME<SMILES> compound format.
system_message_iupac_smiles = """
Please choose the correct answer in the choices according to the context, the question and the answer candidates.
Each candidate is accociated with a confidence score within a bracket, where a lower score is better.
Your answer should be one uppercase character, representing your choice.
The compounds are given in the form of NAME<SMILES>, where NAME indicates the IUPAC name of the candidate, and SMILES stands for the smiles string of the candidate.
"""
# Expert persona, letter-only answer, announces three examples, IUPAC names only.
system_message_2 = """
You are an expert at correctly answering molecular reaction multiple-choice questions.
I will write a question, and then a short list of answers you can choose from, and then you must choose the correct answer and reply with that answer and only that answer.
Your answer should be one uppercase character, representing your choice.
I will give you three examples. The compounds are given in the form of IUPAC names.
"""
# Expert persona, answer plus a 1-9 confidence score, IUPAC names only.
system_message_3 = """
You are an expert at correctly answering molecular reaction multiple-choice questions.
I will write a question, and then a short list of answers you can choose from, and then you must choose the correct answer and reply with that answer and your confidence score.
The confidence score should be between 1 and 9, the higher it is, the more confident you are in your choice.
You should provide the score according to your confidence and familiarity with the problem
I will give you three examples with potential confidence score.
The compounds are given in the form of IUPAC names.
"""
# Scored candidates (lower is better) plus a 1-9 confidence reply, IUPAC names only.
system_message_4 = """
Please choose the correct answer in the choices according to the context, the question and the answer candidates.
Each candidate is accociated with a confidence score within a bracket, where a lower score is better.
I will write a question, and then a short list of answers you can choose from, and then you must choose the correct answer and reply with that answer and your confidence score.
The confidence score should be between 1 and 9, the higher it is, the more confident you are in your choice.
You should provide the score according to your confidence and familiarity with the problem
The compounds are given in the form of IUPAC names.
"""
# Like system_message_4 but compounds are written as IUPAC<SMILES>.
system_message_5 = """
Please choose the correct answer in the choices according to the context, the question and the answer candidates.
Each candidate is accociated with a confidence score within a bracket, where a lower score is better.
I will write a question, and then a short list of answers you can choose from, and then you must choose the correct answer and reply with that answer and your confidence score.
The confidence score should be between 1 and 9, the higher it is, the more confident you are in your choice.
You should provide the score according to your confidence and familiarity with the problem
The compounds are given in the form of IUPAC<SMILES>, where IUPAC indicates the IUPAC name of the candidate, and SMILES stands for the smiles string of the candidate.
"""
# Answer plus 1-9 confidence, IUPAC<SMILES> compounds, no candidate scores mentioned.
system_message_6 = """
Please choose the correct answer in the choices according to the context, the question, and the answer candidates.
I will write a question, and then a short list of answers you can choose from, and then you must choose the correct answer and reply with that answer and your confidence score.
The confidence score should be between 1 and 9. The higher it is, the more confident you are in your choice.
You should provide the score according to your confidence and familiarity with the problem.
The compounds are given in the form of IUPAC<SMILES>, where IUPAC indicates the IUPAC name of the candidate, and SMILES stands for the smiles string of the candidate.
"""
# ReactionGPT persona, answer only (no confidence score), IUPAC<SMILES> compounds.
system_message_7 = """
You are ReactionGPT, an expert at correctly answering multiple choice questions about chemical reaction.
Please choose the correct answer in the choices according to the context, the question, and the answer candidates.
I will write a question, and then a short list of answers you can choose from, and then you must choose the correct answer and reply with that answer.
The compounds are given in the form of IUPAC<SMILES>, where IUPAC indicates the IUPAC name of the candidate, and SMILES stands for the smiles string of the candidate.
"""
# ReactionGPT persona, answer plus 1-9 confidence, IUPAC<SMILES> compounds.
system_message_8 = """
You are ReactionGPT, an expert at correctly answering multiple choice questions about chemical reaction.
I will write a question, and then a short list of answers you can choose from, and then you must choose the correct answer and reply with that answer and your confidence score.
Please choose the correct answer in the choices according to the context, the question, and the answer candidates.
The confidence score should be between 1 and 9. The higher it is, the more confident you are in your choice.
You should provide the score according to your confidence and familiarity with the problem.
The compounds are given in the form of IUPAC<SMILES>, where IUPAC indicates the IUPAC name of the candidate, and SMILES stands for the smiles string of the candidate.
"""
# system_message_8 plus a note on SMILES-only compounds and a sentence stating the
# reaction is a multicomponent imidazopyridine synthesis.
system_message_9 = """
You are ReactionGPT, an expert at correctly answering multiple choice questions about chemical reaction.
I will write a question, and then a short list of answers you can choose from, and then you must choose the correct answer and reply with that answer and your confidence score.
Please choose the correct answer in the choices according to the context, the question, and the answer candidates.
The confidence score should be between 1 and 9. The higher it is, the more confident you are in your choice.
You should provide the score according to your confidence and familiarity with the problem.
The compounds are given in the form of IUPAC<SMILES>, where IUPAC indicates the IUPAC name of the candidate, and SMILES stands for the SMILES string of the candidate.
For those compounds without available IUPAC names, only the smiles string will be provided.
The given reaction is a multicomponent reaction that leads to the synthesis of diverse imidazopyridines.
"""
# system_message_8 plus a note on SMILES-only compounds and a sentence stating the
# reaction is a nickel-catalyzed cross-coupling.
system_message_10 = """
You are ReactionGPT, an expert at correctly answering multiple choice questions about chemical reaction.
I will write a question, and then a short list of answers you can choose from, and then you must choose the correct answer and reply with that answer and your confidence score.
Please choose the correct answer in the choices according to the context, the question, and the answer candidates.
The confidence score should be between 1 and 9. The higher it is, the more confident you are in your choice.
You should provide the score according to your confidence and familiarity with the problem.
The compounds are given in the form of IUPAC<SMILES>, where IUPAC indicates the IUPAC name of the candidate, and SMILES stands for the SMILES string of the candidate.
For those compounds without available IUPAC names, only the smiles string will be provided.
The given reaction is a Nickel-catalyzed cross-couplings reaction, which forms carbon-carbon (C-C) and carbon-heteroatom (C-X, where X can be O, N, S, etc.) bonds.
"""
# System prompt for the JSON-reply mode (used by prompt_json and
# prompt_json_in_context). The "Response Format" template must itself be valid
# JSON, because the prompt asks the model for output that json.loads can parse:
# every key is quoted, entries are comma-separated, and there is no trailing
# comma. The keys under "thoughts" match the worked example in in_context_json
# ("reactant", "reasoning"), so the template and the example agree.
system_message_json = '''
You are ReactionGPT, an expert at correctly answering multiple choice questions about chemical reaction.
Please choose the correct answer in the choices according to the context, the question, and the answer candidates.
I will write a question, and then a short list of answers you can choose from, and then you must choose the correct answer, and provide your confidence score about the answer.
The confidence score should be between 1 and 9. The higher it is, the more confident you are in your choice.
You should provide the score according to your confidence and familiarity with the problem.
Besides the answer and the confidence score, you should provide your thoughts about your answer, including your thought, your reasoning, and you self-criticism.
The compounds are given in the form of IUPAC<SMILES>, where IUPAC indicates the IUPAC name of the candidate, and SMILES stands for the smiles string of the candidate.
You should only respond in JSON format as described below
Response Format:
{
"thoughts": {
"text": "thought",
"reactant": "roles and properties of reactants",
"reaction type": "a 3 component reaction approach towards diverse imidazopyridines",
"product": "properties of the possible products",
"reasoning": "reasoning behind the choice",
"criticism": "constructive self-criticism"
},
"answer": {
"choice": "one capital letter, representing your choice",
"confidence": "one integer, representing your confidence score"
}
}
Ensure the response can be parsed by Python json.loads
'''
# use_catalyst=0
# Default system prompt for prompt_chat / prompt_in_context when conf is true
# and `name` selects no reaction-specific prompt.
SYSTEM_MESSAGE = system_message_8
# When truthy, prompt_single prepends a sentence describing the reaction class
# (nickel-catalyzed cross-coupling) to questions built with test=True.
query_info=0
# System prompt used by prompt_chat / prompt_in_context when conf is false.
SYSTEM_MESSAGE_NOCONF = system_message_7
# catalyst_name = None
# Catalyst text in name<SMILES> form; passed to prompt_single for the final
# query when '10b9' occurs in the `name` argument.
Imidazo_Catalyst = '2,2,2-trifluoroacetic acid<C(=O)(C(F)(F)F)O>'
# if use_catalyst:
# catalyst_name = Imidazo_Catalyst
# One worked example (a user question followed by the assistant's JSON reply)
# that prompt_json inserts as chat turns and prompt_json_in_context flattens
# into plain text.
# NOTE(review): the sample "reasoning" rules out options B and A, yet the sample
# "choice" is "A" -- confirm which answer this example is meant to demonstrate.
in_context_json = [
# Example question with three candidate products.
{"role": "user", "content":
'''
Question: Which of the following is the major product of the organic reaction between 5-methylpyridin-2-amine<CC1=CN=C(C=C1)N>, isocyanocyclohexane<[C-]#[N+]C1CCCCC1> and 2,3-dihydroxybenzaldehyde<C(c1cccc(c1O)O)=O>?
A. tert-butyl 3-(4-chlorophenyl)-1,2-thiazole-4-carboxylate<CC(C)(C)OC(=O)c1csnc1-c1ccc(Cl)cc1>
B. tert-butyl-dimethyl-oct-7-ynoxysilane<C#CCCCCCCO[Si](C)(C)C(C)(C)C>
C. 5,6,7,8-tetrahydrothieno[3,2-b]thiepin-8-ol<OC1CCCSc2ccsc21>
'''
},
# Example reply: a JSON object with "thoughts" and "answer" sections.
{"role": "assistant", "content":
'''
{
"thoughts": {
"text": "The given reaction is a multicomponent reaction that leads to the synthesis of diverse imidazopyridines. The given compounds include a pyridine, an isocyanide, and an aldehyde which suggests that the reaction might involve a multi-component reaction strategy such as the Ugi reaction or the Passerini reaction.",
"reactant": "5-methylpyridin-2-amine, isocyanocyclohexane, and 2,3-dihydroxybenzaldehyde are the reactants. 5-methylpyridin-2-amine is a pyridine derivative, isocyanocyclohexane is an isocyanide, and 2,3-dihydroxybenzaldehyde is an aldehyde. ",
"reaction type": "The given reaction is a 3 component reaction approach towards diverse imidazopyridines.",
"product": "The product should have imidazopyridine as the main functional group.",
"reasoning": "There is no silicon element in the reactants, so the answer can't be B; there's also no sulfur elements in the reactants, so the answer can't be A.",
"criticism": "Since I know the reaction is about imidazopyridine synthesis, thus I'm prety confident about my answer."
},
"answer": {
"choice": "A",
"confidence": 9
}
}
'''
}
]
def prompt_json(context, func=lambda x:x, useD=False, conf=True):
    """Build the chat message list for the JSON-reply mode.

    Args:
        context: sequence of 5-item records; only the last record (the query)
            is read, and only its first three fields are used.
        func: forwarded to prompt_single.
        useD: forwarded to prompt_single.
        conf: accepted for signature parity with the other builders; not used.

    Returns:
        A list holding the JSON system prompt, the turns of in_context_json,
        and one user turn containing the query rendered with test=True.
    """
    reactant, options, dist, _answer, _confidence = context[-1]
    query_text = prompt_single(reactant, options, dist, None, func=func, test=True, useD=useD)
    return [
        {"role": "system", "content": system_message_json},
        *in_context_json,
        {"role": "user", "content": query_text},
    ]
def prompt_json_in_context(context, func=lambda x:x, useD=False, conf=True):
    """Build a single-string prompt for the JSON-reply mode.

    Args:
        context: sequence of 5-item records; only the last record (the query)
            is read, and only its first three fields are used.
        func: forwarded to prompt_single.
        useD: forwarded to prompt_single.
        conf: accepted for signature parity with the other builders; not used.

    Returns:
        The JSON system prompt, the worked example, a bridging sentence and the
        query, separated by blank lines.
    """
    # The example turns are flattened to text, one turn per line group.
    example_text = '\n'.join(turn['content'] for turn in in_context_json)
    reactant, options, dist, _answer, _confidence = context[-1]
    query_text = prompt_single(reactant, options, dist, None, func=func, test=True, useD=useD)
    sections = (
        system_message_json,
        example_text,
        'Please answer the following question based on the example above.',
        query_text,
    )
    return '\n\n'.join(sections)
def prompt_chat(context, func=lambda x:x, useD=False, conf=True, name=''):
    """Build a few-shot chat message list.

    Args:
        context: sequence of (reactant, options, dist, answer, confidence)
            records. All but the last become example turns; the last is the
            query.
        func: forwarded to prompt_single.
        useD: forwarded to prompt_single.
        conf: when true, example replies are "<letter> <confidence>" and the
            name-selected system prompt is used; otherwise replies are the
            letter alone and SYSTEM_MESSAGE_NOCONF is used.
        name: identifier matched by substring: '10b9' selects
            system_message_9 and adds the catalyst to the query, '9b8a'
            selects system_message_10.

    Returns:
        A list of {"role", "content"} dicts ending with the query as a user turn.
    """
    if '10b9' in name:
        system_message, catalyst_name = system_message_9, Imidazo_Catalyst
    elif '9b8a' in name:
        system_message, catalyst_name = system_message_10, None
    else:
        system_message, catalyst_name = SYSTEM_MESSAGE, None

    opening = system_message if conf else SYSTEM_MESSAGE_NOCONF
    messages = [{"role": "system", "content": opening}]

    # Solved examples: the question as a user turn, the answer as an assistant turn.
    for reactant, options, dist, answer, confidence in context[:-1]:
        example_question = prompt_single(reactant, options, dist, None, func=func, test=False, useD=useD)
        if conf:
            example_reply = f"{to_char(answer)} {confidence}"
        else:
            example_reply = f"{to_char(answer)}"
        messages.append({"role": "user", "content": example_question})
        messages.append({"role": "assistant", "content": example_reply})

    # The catalyst, if any, is only mentioned in the final query.
    reactant, options, dist, _answer, _confidence = context[-1]
    query_text = prompt_single(reactant, options, dist, None, func=func, test=True, useD=useD, catalyst=catalyst_name)
    messages.append({"role": "user", "content": query_text})
    return messages
def prompt_in_context(context, func=lambda x:x, useD=False, conf=True, name=''):
    """Build a few-shot prompt as one plain-text string.

    Args:
        context: sequence of (reactant, options, dist, answer, confidence)
            records. All but the last become numbered examples; the last is
            the query.
        func: forwarded to prompt_single.
        useD: forwarded to prompt_single.
        conf: when true, each example ends with "<letter> <confidence>" and the
            name-selected system prompt is used; otherwise examples end with
            the letter alone and SYSTEM_MESSAGE_NOCONF is used.
        name: identifier matched by substring: '10b9' selects
            system_message_9 and adds the catalyst to the query, '9b8a'
            selects system_message_10.

    Returns:
        System prompt, examples, a bridging sentence and the query, separated
        by blank lines.
    """
    if '10b9' in name:
        system_message, catalyst_name = system_message_9, Imidazo_Catalyst
    elif '9b8a' in name:
        system_message, catalyst_name = system_message_10, None
    else:
        system_message, catalyst_name = SYSTEM_MESSAGE, None

    sections = [system_message if conf else SYSTEM_MESSAGE_NOCONF]

    for number, record in enumerate(context[:-1], start=1):
        reactant, options, dist, answer, confidence = record
        sections.append(f'Example No.{number}:')
        example_text = prompt_single(reactant, options, dist, None, func=func, test=False, useD=useD)
        # prompt_single leaves the answer slot open; fill it in here.
        if conf:
            example_text += f"{to_char(answer)} {confidence}"
        else:
            example_text += f"{to_char(answer)}"
        sections.append(example_text)

    sections.append('Please answer the following question based on the examples above.')

    # The catalyst, if any, is only mentioned in the final query.
    reactant, options, dist, _answer, _confidence = context[-1]
    sections.append(
        prompt_single(reactant, options, dist, None, func=func, test=True, useD=useD, catalyst=catalyst_name)
    )
    return '\n\n'.join(sections)
def _join_with_and(parts):
    # Render two or more items as 'a, b and c' ('a and b' for exactly two).
    return ' '.join([', '.join(parts[:-1]), 'and', parts[-1]])


def prompt_single(reactant, options, dist, answer, func=lambda x:x, test=False, useD=False, catalyst=None):
    """Render one multiple-choice question as text.

    Args:
        reactant: ``(smiles, iupac)`` pair. Several reactants share one pair,
            separated by ``.`` in the SMILES string and ``|`` in the names.
        options: sequence of ``(smiles, iupac)`` pairs with the same packing.
        dist: zipped with ``options``; its values are not printed, but a
            shorter ``dist`` truncates the option list.
        answer: index of the correct option, or ``None`` to leave the answer
            slot empty. A negative index is rendered as the letter following
            the last option (the "None of above" slot when ``useD`` is set).
        func: callable mapping a SMILES string to a display name; used for
            reactants whose names are missing or identical to the SMILES.
        test: when true the answer slot is always left empty, and the
            reaction-class sentence is prepended if ``query_info`` is truthy.
        useD: append a final "None of above" option.
        catalyst: optional catalyst text; only used in the multi-reactant form.

    Returns:
        The question line, the option lines and the answer line joined by
        newlines.
    """
    question = ''
    if query_info and test:
        question += 'The given reaction is a Nickel-catalyzed cross-couplings reaction, which forms carbon-carbon (C-C) and carbon-heteroatom (C-X, where X can be O, N, S, etc.) bonds.\n'
    question += 'Question: Which of the following is the major product of the organic reaction '

    reactant_smiles, reactant_iupac = reactant
    if '.' in reactant_smiles:
        names = reactant_iupac.split('|')
        parts = reactant_smiles.split('.')
        # Fall back to func when the names do not line up with the SMILES
        # parts or are just the SMILES repeated.
        if len(names) != len(parts) or names == parts:
            names = [func(part) for part in parts]
        # A reactant whose name equals its SMILES is shown once, not as x<x>.
        labelled = [n if n == s else f'{n}<{s}>' for n, s in zip(names, parts)]
        listing = _join_with_and(labelled)
        if catalyst:
            question += f'between {listing}, with {catalyst} as catalyst?'
        else:
            question += f'between {listing}?'
    else:
        if reactant_smiles == reactant_iupac:
            reactant_iupac = func(reactant_smiles)
        # func maps SMILES to a name, so it is applied to the raw SMILES only
        # and never to the already formatted name<SMILES> text.
        question += f'of {reactant_iupac}<{reactant_smiles}> in water?'

    option_lines = []
    for idx, (opt, _score) in enumerate(zip(options, dist)):
        smiles, iupac = opt
        if '.' in smiles:
            names = iupac.split('|')
            parts = smiles.split('.')
            if len(names) != len(parts) or parts == names:
                # NOTE(review): smiles2iupac receives a list here and a single
                # string below -- confirm it supports both.
                names = smiles2iupac(parts)
            text = _join_with_and([f'{n}<{s}>' for n, s in zip(names, parts)])
        else:
            if smiles == iupac:
                iupac = smiles2iupac(smiles)
            text = f'{iupac}<{smiles}>'
        option_lines.append(f'{to_char(idx)}. {text}\n')
    choice = ''.join(option_lines)
    if useD:
        choice += f'{to_char(len(options))}. None of above'

    # Compare against None explicitly: index 0 is a valid answer and must not
    # be mistaken for "no answer supplied".
    if test or answer is None:
        answer_line = 'Answer: '
    else:
        answer_index = answer if answer >= 0 else len(options)
        answer_line = f'Answer: {to_char(answer_index)}'
    return '\n'.join((question, choice, answer_line))