-
Notifications
You must be signed in to change notification settings - Fork 1
/
generate_rephrase.py
357 lines (291 loc) · 17.5 KB
/
generate_rephrase.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
# Generates the rephrased data for detecting syntax-specific contamination
from constat.openai import OpenAIQuery
from ast import literal_eval
import pandas as pd
from transformers import set_seed
import numpy as np
import json
import os
import asyncio
from loguru import logger
import numpy as np
import re
# Seed NumPy's global RNG and call transformers' set_seed with a fixed value
# (presumably so repeated runs of this script behave the same — confirm).
np.random.seed(0)
set_seed(0)
# Root directory for all inputs/outputs; each benchmark uses its own sub-folder
# (gsm8k, mmlu, arc, hellaswag) below this path.
base_path = 'data/contamination'
def parse_response(response, is_multiple_choice, contains_answer, q_word, a_word, question_contains_options=True):
    """
    Split a model reply into its question, answer and (optionally) answer options.

    The reply is expected to follow this layout (a trailing ':' after either
    header word is tolerated, as is an all-lowercase header word):

        ### {q_word}
        [question text, optionally followed by labelled options]
        ### {a_word}
        [answer text, or the labelled options]

    Everything after the *last* question header is treated as the question
    section. When an answer is expected, that section is split at the *first*
    answer header, so a repeated answer header ends up inside the answer text
    instead of raising.

    Option lines are recognised by a leading 'A:', 'A.' or 'A)'; the label
    prefix (A-E followed by ':', '.' or ')') is stripped from every option.

    Args:
        response (str): Raw reply text to parse.
        is_multiple_choice (bool): Whether options should be extracted.
        contains_answer (bool): Whether the reply has an answer section.
        q_word (str): Header word introducing the question section.
        a_word (str): Header word introducing the answer section.
        question_contains_options (bool, optional): If True the options are read
            from the question section, otherwise from the answer section.
            Defaults to True.

    Returns:
        tuple: (question, answer, options). `answer` is None when
        `contains_answer` is False and `options` is None when
        `is_multiple_choice` is False. On any parse failure a warning is logged
        and (None, None, None) is returned.
    """
    failure = (None, None, None)
    option_starts = ('A:', 'A.', 'A)')
    q_small = q_word.lower()
    a_small = a_word.lower()

    # Normalise "### Word:" headers to "### Word" so one lookup covers both spellings.
    for header_word in (q_word, q_small, a_word, a_small):
        response = response.replace(f'### {header_word}:', f'### {header_word}')

    if f'### {q_word}' in response:
        question = response.split(f'### {q_word}')[-1].strip()
    elif f'### {q_small}' in response:
        question = response.split(f'### {q_small}')[-1].strip()
    else:
        logger.warning(f'Failed to parse response (no question): {response}')
        return failure

    answer = None
    options = None

    if contains_answer:
        if f'### {a_word}' in question:
            answer_header = f'### {a_word}'
        elif f'### {a_small}' in question:
            answer_header = f'### {a_small}'
        else:
            logger.warning(f'Failed to parse response (no answer): {response}')
            return failure
        # maxsplit=1: a repeated answer header must not break the two-way unpacking.
        question, answer = question.split(answer_header, 1)
        question = question.strip()
        answer = answer.strip()

    if is_multiple_choice:
        if question_contains_options:
            lines = question.split('\n')
            if len(lines) <= 2:
                logger.warning(f'Failed to parse response (no options): {response}')
                return failure
            # The question may span several lines; options begin at the first
            # line (after the first) that carries the 'A' label.
            first_option = next(
                (idx for idx in range(1, len(lines)) if lines[idx].startswith(option_starts)),
                None,
            )
            if first_option is None:
                logger.warning(f'Failed to parse response (no options): {response}')
                return failure
            question = '\n'.join(lines[:first_option])
            options = lines[first_option:]
        elif answer is None:
            # Options were requested from the answer section, but no answer was parsed.
            logger.warning(f'Failed to parse response (no options): {response}')
            return failure
        else:
            options = answer.split('\n')

        question = question.strip()
        # Drop the leading label: A-E followed by ':', ')' or '.'.
        options = [re.sub(r'^[A-E]\s*[:\)\.]\s*', '', option.strip()) for option in options]

    return question, answer, options
def generate(input_prompts, system_prompt, store_file, is_multiple_choice, contains_answer=False,
             temperature=0.7, max_tokens=1024, question_word='Question', answer_word='Answer',
             question_contains_options=True):
    """
    Query the model with the given prompts and parse the replies into a DataFrame.

    Each input prompt is sent as the user message together with `system_prompt`
    to 'gpt-4-turbo'. The raw responses are cached in a JSON file whose path is
    `store_file` with '.csv' replaced by '.json'; if that file already exists
    the model is not queried and the cached responses are parsed instead.

    Responses that `parse_response` cannot parse are dropped, so the returned
    frame can have fewer rows than `input_prompts`.

    Args:
        input_prompts (list): User prompts, one per example.
        system_prompt (str): System prompt shared by all queries.
        store_file (str): Target CSV path; only used here to derive the JSON cache path.
        is_multiple_choice (bool): Forwarded to `parse_response`.
        contains_answer (bool, optional): Whether the responses contain an answer
            section (forwarded to `parse_response`). Defaults to False.
        temperature (float, optional): Sampling temperature. Defaults to 0.7.
        max_tokens (int, optional): Maximum tokens per response. Defaults to 1024.
        question_word (str, optional): Header word of the question section. Defaults to 'Question'.
        answer_word (str, optional): Header word of the answer section. Defaults to 'Answer'.
        question_contains_options (bool, optional): Whether options are listed in
            the question section. Defaults to True.

    Returns:
        pandas.DataFrame: Columns 'question', 'options' and 'answer', one row per
        successfully parsed response (empty if none could be parsed).
    """
    querier = OpenAIQuery(model='gpt-4-turbo', max_tokens=max_tokens, temperature=temperature,
                          error_stop=100, timeout=120, read_cost=0.01, write_cost=0.03)
    queries = [
        [
            {'role': 'system', 'content': system_prompt},
            {'role': 'user', 'content': user_prompt},
        ] for user_prompt in input_prompts
    ]

    store_file_json = store_file.replace('.csv', '.json')
    if not os.path.isfile(store_file_json):
        generated_responses, cost = asyncio.run(querier.run_string_prompts(queries))
        print(cost)
        # Context manager guarantees the cache is flushed and closed.
        with open(store_file_json, 'w') as cache_file:
            json.dump(generated_responses, cache_file)
    else:
        with open(store_file_json) as cache_file:
            generated_responses = json.load(cache_file)

    responses = [response['message']['content'] for response in generated_responses]
    parsed_responses = [parse_response(response, is_multiple_choice, contains_answer, question_word,
                                       answer_word, question_contains_options) for response in responses]
    # parse_response signals failure with a None question; drop those rows.
    parsed_responses = [parsed for parsed in parsed_responses if parsed[0] is not None]

    # Build the columns explicitly rather than via zip(*...), which cannot be
    # unpacked into three names when no response was parsed.
    df = pd.DataFrame({
        'question': [question for question, _, _ in parsed_responses],
        'options': [options for _, _, options in parsed_responses],
        'answer': [answer for _, answer, _ in parsed_responses],
    })
    return df
def generate_gsm8k(contains_answer=False, contamination=True, temperature=0.7, max_tokens=1024):
    """
    Rephrase the GSM8K samples and write them to a CSV file.

    Reads 'contamination.csv' or 'no_contamination.csv' (depending on
    `contamination`) from the 'gsm8k' folder under `base_path`, asks the model
    to rephrase either the question alone or the question together with its
    answer, and writes the result to 'rephrase[_answer][_no]_cont.csv' in the
    same folder.

    Args:
        contains_answer (bool, optional): If True, the answer is sent to the model
            and rephrased as well; otherwise the original answers are kept.
        contamination (bool, optional): Selects which input CSV is read.
        temperature (float, optional): Sampling temperature passed to `generate`.
        max_tokens (int, optional): Token limit passed to `generate`.
    """
    folder = os.path.join(base_path, 'gsm8k')
    source_name = 'contamination.csv' if contamination else 'no_contamination.csv'
    data = pd.read_csv(os.path.join(folder, source_name))

    if contains_answer:
        system_prompt = '''You are a helpful assistant. The user will give you a question and answer from the gsm8k dataset. Rewrite the question and answer. Make significant changes to the used vocabulary, length and structure. Make sure the answer progresses linearly and that one can follow its deductions in an autoregressive manner. Still employ the '####' sign to indicate the output, and the '<< >>' signs to indicate computations. Ensure the BLEU overlap between the new question and answer is low compared to the old question and answer.
Format your reply as:
### Question
[New Rephrased Question]
### Answer
[New Answer]'''
    else:
        system_prompt = '''Significantly rephrase the given question, but make sure the answer is still the same. Do not include the answer in your response.
Format your reply as:
### Question
[New rephrased question]'''

    # Output name encodes both switches: rephrase[_answer][_no]_cont.csv
    file_name = 'rephrase'
    file_name += '_answer' if contains_answer else ''
    file_name += '' if contamination else '_no'
    target_csv = os.path.join(folder, f'{file_name}_cont.csv')

    original_questions = data['question'].tolist()
    if contains_answer:
        original_answers = data['answer'].tolist()
        prompts = [
            f'### Question\n{question}\n### Answer\n{answer}'
            for question, answer in zip(original_questions, original_answers)
        ]
    else:
        prompts = [f'### Question\n{question}' for question in original_questions]

    df = generate(prompts, system_prompt, target_csv,
                  is_multiple_choice=False, contains_answer=contains_answer,
                  temperature=temperature, max_tokens=max_tokens)

    if not contains_answer:
        # Only the question was rephrased, so reuse the original answers.
        # NOTE: generate drops unparseable responses; this list assignment
        # requires that none were dropped.
        df['answer'] = data['answer'].tolist()
    df.to_csv(target_csv, index=False)
def generate_mmlu(contains_answer=False, contamination=False, temperature=0.7, max_tokens=1024):
    """
    Rephrase the MMLU samples (question plus labelled options) and write them to CSV.

    Reads 'contamination.csv' or 'no_contamination.csv' from the 'mmlu' folder
    under `base_path`, sends each question with its options labelled 'A:', 'B:',
    ... to the model, and writes the parsed result, together with the original
    'answer' and 'subject' columns, to 'rephrase[_answer][_no]_cont.csv'.

    Args:
        contains_answer (bool, optional): If True, the text of the correct option
            is appended to the prompt and the system prompt asks for a
            reasoning section before the rephrased question.
        contamination (bool, optional): Selects which input CSV is read.
        temperature (float, optional): Sampling temperature passed to `generate`.
        max_tokens (int, optional): Token limit passed to `generate`.
    """
    folder = os.path.join(base_path, 'mmlu')
    source_name = 'contamination.csv' if contamination else 'no_contamination.csv'
    data = pd.read_csv(os.path.join(folder, source_name), converters={'choices': literal_eval})

    if contains_answer:
        system_prompt = '''Significantly rephrase the given question and options, but make sure that the correct option still has the same label. Label the multiple choice answers with A:, B:, C:, D:, E:. Do not include the answer in your response. Wrong options can be completely changed (and thus do not need to be simple rephrases).
Format your reply as:
### Reasoning
[Reasoning about how to rephrase question and options]
### Question
[New rephrased question]'''
    else:
        system_prompt = '''Significantly rephrase the given question and options, but make sure that all possible options still have the same label. Label the multiple choice answers with A:, B:, C:, D:, E:. Do not include the answer in your response.
Format your reply as:
### Question
[New rephrased question]'''

    # Output name encodes both switches: rephrase[_answer][_no]_cont.csv
    file_name = 'rephrase'
    file_name += '_answer' if contains_answer else ''
    file_name += '' if contamination else '_no'
    target_csv = os.path.join(folder, f'{file_name}_cont.csv')

    original_questions = data['question'].tolist()
    all_choices = data['choices'].tolist()

    prompts = []
    for question, choices in zip(original_questions, all_choices):
        # chr(65) == 'A': label the options A:, B:, C:, ...
        labelled = '\n'.join(f'{chr(65 + idx)}: {option}' for idx, option in enumerate(choices))
        prompts.append(f'### Question\n{question}\n{labelled}')

    if contains_answer:
        correct_indices = data['answer'].tolist()
        prompts = [
            f'{prompt}\n### Answer\n{choices[answer]}'
            for prompt, answer, choices in zip(prompts, correct_indices, all_choices)
        ]

    # The reply only holds the rephrased question + options, hence contains_answer=False.
    df = generate(prompts, system_prompt, target_csv,
                  is_multiple_choice=True, contains_answer=False,
                  temperature=temperature, max_tokens=max_tokens)

    df['choices'] = df['options']
    # NOTE: generate drops unparseable responses; this list assignment
    # requires that none were dropped.
    df['answer'] = data['answer'].tolist()
    df['subject'] = data['subject']
    df.to_csv(target_csv, index=False)
def generate_arc(contains_answer=False, contamination=False, temperature=0.7, max_tokens=1024):
    """
    Rephrase the ARC samples (question plus labelled options) and write them to CSV.

    Reads 'contamination.csv' or 'no_contamination.csv' from the 'arc' folder
    under `base_path`, sends each question with its options labelled 'A:', 'B:',
    ... to the model, and writes the parsed result to
    'rephrase[_answer][_no]_cont.csv'. In the output, 'choices' is a dict with
    'text' and letter 'label' lists, and numeric answer keys are converted to
    letters ('1' -> 'A', ...).

    Args:
        contains_answer (bool, optional): If True, the text of the correct option
            is appended to the prompt and the system prompt asks for a
            reasoning section before the rephrased question.
        contamination (bool, optional): Selects which input CSV is read.
        temperature (float, optional): Sampling temperature passed to `generate`.
        max_tokens (int, optional): Token limit passed to `generate`.
    """
    def correct_option(answer, choices):
        # Answer keys are either letters ('A', 'B', ...) or 1-based digits.
        letter_index = ord(answer) - 65
        if 0 <= letter_index < len(choices):
            return choices[letter_index]
        return choices[int(answer) - 1]

    def as_letter(key):
        # '1' -> 'A', '2' -> 'B', ...; letter keys are kept as they are.
        return chr(65 + int(key) - 1) if key.isdigit() else key

    folder = os.path.join(base_path, 'arc')
    source_name = 'contamination.csv' if contamination else 'no_contamination.csv'
    data = pd.read_csv(os.path.join(folder, source_name), converters={'choices': literal_eval})

    if contains_answer:
        system_prompt = '''Significantly rephrase the given question and options, but make sure that the correct option still has the same label. Label the multiple choice answers with A:, B:, C:, D:, E:. Do not include the answer in your response. Wrong options can be completely changed (and thus do not need to be simple rephrases).
Format your reply as:
### Reasoning
[Reasoning about how to rephrase question and options]
### Question
[New rephrased question]'''
    else:
        system_prompt = '''Significantly rephrase the given question and options, but make sure that all possible options still have the same label. Label the multiple choice answers with A:, B:, C:, D:, E:. Do not include the answer in your response.
Format your reply as:
### Question
[New rephrased question]'''

    # Output name encodes both switches: rephrase[_answer][_no]_cont.csv
    file_name = 'rephrase'
    file_name += '_answer' if contains_answer else ''
    file_name += '' if contamination else '_no'
    target_csv = os.path.join(folder, f'{file_name}_cont.csv')

    original_questions = data['question'].tolist()
    all_choices = [entry['text'] for entry in data['choices'].tolist()]

    prompts = []
    for question, choices in zip(original_questions, all_choices):
        labelled = '\n'.join(f'{chr(65 + idx)}: {option}' for idx, option in enumerate(choices))
        prompts.append(f'### Question\n{question}\n{labelled}')

    if contains_answer:
        answer_keys = data['answerKey'].tolist()
        prompts = [
            f'{prompt}\n### Answer\n{correct_option(answer, choices)}'
            for prompt, answer, choices in zip(prompts, answer_keys, all_choices)
        ]

    # The reply only holds the rephrased question + options, hence contains_answer=False.
    df = generate(prompts, system_prompt, target_csv,
                  is_multiple_choice=True, contains_answer=False,
                  temperature=temperature, max_tokens=max_tokens)

    df['choices'] = [
        {'text': choices, 'label': [chr(65 + idx) for idx, _ in enumerate(choices)]}
        for choices in df['options'].tolist()
    ]
    # NOTE: generate drops unparseable responses; this list assignment
    # requires that none were dropped.
    df['answerKey'] = data['answerKey'].tolist()
    df['answerKey'] = df['answerKey'].apply(as_letter)
    df.to_csv(target_csv, index=False)
def generate_hellaswag(contains_answer=False, contamination=False, temperature=0.7, max_tokens=1024):
    """
    Rephrase the HellaSwag samples (context plus continuations) and write them to CSV.

    Reads 'contamination.csv' or 'no_contamination.csv' from the 'hellaswag'
    folder under `base_path`. The context sent to the model is
    '<activity_label>: <ctx_a> <ctx_b>' passed through `str.capitalize`. The
    parsed context is split at the first ':' back into 'activity_label' and
    'ctx_a' ('ctx_b' is left empty), and the result is written to
    'rephrase[_answer][_no]_cont.csv'.

    Args:
        contains_answer (bool, optional): If True, only the correct continuation
            is sent and rephrased; otherwise all continuations are sent,
            labelled 'A:', 'B:', ...
        contamination (bool, optional): Selects which input CSV is read.
        temperature (float, optional): Sampling temperature passed to `generate`.
        max_tokens (int, optional): Token limit passed to `generate`.
    """
    folder = os.path.join(base_path, 'hellaswag')
    source_name = 'contamination.csv' if contamination else 'no_contamination.csv'
    data = pd.read_csv(os.path.join(folder, source_name), converters={'endings': literal_eval})

    if contains_answer:
        system_prompt = '''Significantly rephrase the given context and continuation. Keep the ':' separating the activity label from the context in the context. Make sure the continuation can still follow the rephrased context.
Format your reply as:
### Reasoning
[Reasoning about how to rephrase context and continuation]
### Context
[New rephrased question]
### Continuation
[New rephrased continuation]'''
    else:
        system_prompt = '''Significantly rephrase the given context and several possible continuations and make sure that all possible continuations still have the same label. Label the continuations with A:, B:, C:, D:, E:. Keep the ':' separating the activity label from the context in the context. Make sure that each continuation can still follow the rephrased context.
Format your reply as:
### Context
[New rephrased question]
### Continuation
[New rephrased continuation]'''

    # Output name encodes both switches: rephrase[_answer][_no]_cont.csv
    file_name = 'rephrase'
    file_name += '_answer' if contains_answer else ''
    file_name += '' if contamination else '_no'
    target_csv = os.path.join(folder, f'{file_name}_cont.csv')

    contexts = [
        f'{activity_label}: {ctx_a} {ctx_b}'.capitalize()
        for activity_label, ctx_a, ctx_b in zip(
            data['activity_label'].tolist(), data['ctx_a'].tolist(), data['ctx_b'].tolist()
        )
    ]
    all_endings = data['endings'].tolist()

    if contains_answer:
        correct_labels = data['label'].tolist()
        prompts = [
            f'### Context\n{context}\n### Continuation\n{endings[label]}'
            for context, label, endings in zip(contexts, correct_labels, all_endings)
        ]
    else:
        prompts = []
        for context, endings in zip(contexts, all_endings):
            labelled = '\n'.join(f'{chr(65 + idx)}: {ending}' for idx, ending in enumerate(endings))
            prompts.append(f'### Context\n{context}\n### Continuation\n{labelled}')

    # The continuation section always exists in the reply; it holds the options
    # only when all continuations were sent.
    df = generate(prompts, system_prompt, target_csv,
                  is_multiple_choice=not contains_answer, contains_answer=True,
                  temperature=temperature, max_tokens=max_tokens,
                  question_word='Context', answer_word='Continuation',
                  question_contains_options=False)

    # Split the rephrased context at the first ':' into activity label and context body.
    df['activity_label'] = df['question'].apply(
        lambda text: text.partition(':')[0] if ':' in text else ''
    )
    df['ctx_a'] = df['question'].apply(
        lambda text: text.partition(':')[2] if ':' in text else text
    )
    df['ctx_b'] = ''
    df['endings'] = df['options']
    # NOTE: generate drops unparseable responses; this list assignment
    # requires that none were dropped.
    df['label'] = data['label'].tolist()
    df.to_csv(target_csv, index=False)
if __name__ == '__main__':
    # Each entry: generator function and its (contains_answer, contamination)
    # settings, listed in the order in which they are run.
    run_plan = [
        (generate_gsm8k, [(False, True), (True, True), (False, False), (True, False)]),
        (generate_mmlu, [(False, True), (False, False), (True, True), (True, False)]),
        (generate_arc, [(False, True), (False, False), (True, True), (True, False)]),
        (generate_hellaswag, [(False, False), (True, False), (False, True), (True, True)]),
    ]
    for generate_fn, settings in run_plan:
        for with_answer, contaminated in settings:
            generate_fn(with_answer, contaminated)