-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathsynthesize_gpt.py
233 lines (201 loc) · 13.1 KB
/
synthesize_gpt.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
import pandas as pd
import os
# from utils import *
from tqdm import tqdm
import random
from openai import OpenAI
import api_secrets
# Copy the API key from the local `api_secrets` module into the process
# environment; the client below reads it back from there.
os.environ['OPENAI_API_KEY'] = api_secrets.openai_api_key
# Module-level OpenAI client used by rewrite_sentence() and
# detect_reason_rewrite() for every chat-completion request in this file.
client = OpenAI(
    api_key=os.environ["OPENAI_API_KEY"],
    organization=api_secrets.openai_org
)
def formulate_prompt(context, sentence):
    """Build the chat messages for the full detect / reason / rewrite task.

    The system message carries the task description (coherence label, seven
    reason flags, rewrite rules) followed by five worked examples. The user
    message carries the actual ``C: ... S: ...`` pair to annotate.

    Args:
        context: text interpolated after ``C:`` in the user message.
        sentence: text interpolated after ``S:`` in the user message.

    Returns:
        A two-element list of ``{"role", "content"}`` dicts: system, then user.
    """
    # NOTE: the prompt text below is runtime data; the physical line breaks
    # inside the literal are part of the string sent to the model.
    system_message = {"role": "system", "content": """
You are an English teacher aiming to improve coherence in student writing. You are about to synthesize data for the coherence detection task. Concretely, for each data point, you will be given: a sentence S and a context C, which comprises all preceding sentences up to and immediately before sentence S in an essay written by an English second language learner. Then, you should follow the following steps to create a complete data point: \n1) For sentence S and context C, determine if sentence S is coherent with context C. You need to output 1 for [Coherence] if the sentence S is coherent when appended to the context C; otherwise, output 0; \n2) Then, if you output 1 in the previous step, output "Done" and finish; otherwise, move on to the following steps; \n3) You need to output 1 for [Reason 1] if the sentence S does not connect semantically with the context C; otherwise, output 0; \n4) You need to output 1 for [Reason 2] if the new sentence S discusses an entity that has not been introduced in C yet, or the new sentence S discusses an entity that is ambiguous in C; otherwise, output 0; \n5) You need to output 1 for [Reason 3] if the relation between sentence S and previous ones in C doesn't make sense due to a missing discourse marker; otherwise, output 0; \n6) You need to output 1 for [Reason 4] if the new sentence S contradicts or is inconsistent with previously presented information in C; otherwise, output 0; \n7) You need to output 1 for [Reason 5] if the new sentence S introduces information that is completely irrelevant to the context C; otherwise, output 0; \n8) You need to output 1 for [Reason 6] if the new sentence S introduces information that is either tangential or slightly irrelevant to the context C; otherwise, output 0; \n9) You need to output 1 for [Reason 7] if the comment (rheme/focus) of the sentence does not agree with the topic of the sentence; otherwise, output 0 \n10) [Rewrite] You should modify sentence S as minimally as possible to improve 
its coherence based on the following suggestions for each reason you might select above: \n- [Reason 1]: add reference words or repeated words or substitutions that can semantically connect sentence S to the context C; \n- [Reason 2]: link the newly introduced entity or ambiguous entity in S to the given context C \n- [Reason 3]: add or change a discourse marker that ties the sentence S with the given context C \n- [Reason 4]: align the newly introduced information with previously introduced information so that the new information in S does not contradict the context C \n- [Reason 5]: modify the sentence S so that it is relevant to the context C established by the writer \n- [Reason 6]: only output "DELETE" for deleting the sentence S \n- [Reason 7]: rewrite sentence S so that the comment of sentence S agrees with the topic of sentence S \n\nPlease disregard any incoherences in context C. You should output 1 for [Coherence] only if: \na) sentence S semantically connects to context C, and \nb) all entities discussed in the new sentence S have been introduced in C, and \nc) sentence S demonstrates reasonable discourse relation with previous ones, and \nd) sentence S contains a meaning consistent with previously presented data in C, and \ne) sentence S contains a meaning relevant to previously presented data in C. \n\nHere are some examples:\nC: I believe that young people nowadays do not give enough time to helping their communities. \nS: This, i believe is caused by the environment we live in. \n- [Coherence]: 1 \n- Done \n\nC: Then, I wanna indicate that young people can study many things that are interesting or exciting things for young people. \nS: About students, they can learn various fields that students want to study. 
\n- [Coherence]: 0 \n- [Reason 1]: 1 \n- [Reason 2]: 0 \n- [Reason 3]: 1 \n- [Reason 4]: 0 \n- [Reason 5]: 0 \n- [Reason 6]: 0 \n- [Reason 7]: 0 \n- [Rewrite]: For example when they study, they can learn various fields that they want to study.\n\nC: There are three main reasons that my ideas support effectively, like action, study and knowledge. \nS: First of all, I wanna introduce young people's active points in comparison with older people. \n- [Coherence]: 0 \n- [Reason 1]: 0 \n- [Reason 2]: 0 \n- [Reason 3]: 0 \n- [Reason 4]: 1 \n- [Reason 5]: 0 \n- [Reason 6]: 0 \n- [Reason 7]: 0 \n- [Rewrite]: First of all, I wanna introduce young people's actions in comparison with older people's. \n\nC: These publicity agents use a lot of techniques to make the products look better, for example they use specialized software like photoshop to increase the size of the product or make it brighter, or maybe an artificial imitation of the product that does not necessarily have the same texture of look. \nS: Even though one can observe this situation mostly in food products.\n- [Coherence]: 0\n- [Reason 1]: 0\n- [Reason 2]: 0 \n- [Reason 3]: 0 \n- [Reason 4]: 0 \n- [Reason 5]: 0 \n- [Reason 6]: 1 \n- [Reason 7]: 0 \n- [Rewrite]: DELETE \n\nC: I, however, think in terms of physical and mental factors young people are superior to older people. \nS: For example, in the case of sports young people can run and jump, and they can train their muscles that are used in each sport such as transitional sports or silence sports. \n- [Coherence]: 0 \n- [Reason 1]: 0 \n- [Reason 2]: 0 \n- [Reason 3]: 0 \n- [Reason 4]: 0 \n- [Reason 5]: 0 \n- [Reason 6]: 0 \n- [Reason 7]: 1 \n- [Rewrite]: For example, in the case of sports young people can run and jump, and they can train their muscles for sports more than older people can. \n\nNow, please generate:
"""}
    user_message = {"role": "user", "content": f"""
C: {context} \nS: {sentence} \n
"""}
    return [system_message, user_message]
def formulate_prompt_test(context, sentence):
    """Build the chat messages for the rewrite-only task.

    The system message tells the model that S is incoherent with C and asks
    for a rewritten S; the user message supplies the pair and ends with a
    ``Rewrite:`` cue.

    Args:
        context: text interpolated after ``C:`` in the user message.
        sentence: text interpolated after ``S:`` in the user message.

    Returns:
        A two-element list of ``{"role", "content"}`` dicts: system, then user.
    """
    # The physical line breaks inside both literals are part of the prompt.
    instructions = """
You are about to perform the task of sentence rewriting for the sentences written by second-language English learners. In this task, given a context C and a sentence S, where S is incoherent with C, you need to rewrite sentence S to make it coherent with C. Now, please generate:
"""
    request = f"""
C: {context} \nS: {sentence} \n Rewrite:"""
    return [
        {"role": "system", "content": instructions},
        {"role": "user", "content": request},
    ]
def rewrite_sentence(data_df):
    """Request a model rewrite for every row of ``data_df``.

    Each row is read through its ``essay_id``, ``topic``, ``context`` and
    ``sentence`` columns and sent to the chat model with the rewrite-only
    prompt. The raw model reply is printed and stored as the rewrite. If the
    request raises, the error and the offending row are printed and the
    rewrite for that row stays an empty string.

    Args:
        data_df: DataFrame providing the four columns named above.

    Returns:
        Five parallel lists: essay ids, topics, contexts, sentences, rewrites.
    """
    collected = {
        name: []
        for name in ('essay_id', 'topic', 'context', 'sentence', 'rewrite')
    }
    for idx in tqdm(range(len(data_df))):
        record = data_df.iloc[idx]
        essay_id = record['essay_id']
        topic = record['topic']
        context = record['context']
        sentence = record['sentence']
        messages = formulate_prompt_test(context, sentence)
        # Stays empty when the request below fails.
        rewrite = ''
        try:
            completion = client.chat.completions.create(
                model="gpt-4o",
                messages=messages,
                max_tokens=2048,
                temperature=0
            )
            reply = completion.choices[0].message.content
            print(reply)
            rewrite = reply
        except Exception as err:
            print(err)
            print('Error with index ', idx)
            print('Context: ', context)
            print('Sentence: ', sentence)
        # One value per output column, in the column order declared above.
        row_values = (essay_id, topic, context, sentence, rewrite)
        for name, value in zip(collected, row_values):
            collected[name].append(value)
    return tuple(collected.values())
def _field_value(line):
    """Return the stripped text after the first ': ' on a model output line.

    Splitting only once keeps values that themselves contain ': ' (for
    example a rewritten sentence with a colon) intact. Raises IndexError when
    the line has no ': ' separator.
    """
    return line.split(': ', 1)[1].strip()


def detect_reason_rewrite(data_df, test=False, save_every=1000):
    """Query the chat model for every row and parse reasons and a rewrite.

    Each row of ``data_df`` is read through its ``essay_id``, ``topic``,
    ``context`` and ``sentence`` columns. The model reply is parsed
    positionally, one ``<tag>: <value>`` field per non-blank line:

    * ``test`` falsy: line 0 is the coherence label; when it is ``'0'`` the
      next seven lines are the reason flags and the line after them is the
      rewrite.
    * ``test`` truthy: the label is taken from the row's ``label`` column and
      lines 0-6 / 7 are expected to be the reason flags / the rewrite.

    Any exception raised while requesting or parsing is printed together with
    the row, and the fields not yet parsed for that row stay empty strings.

    Args:
        data_df: DataFrame with the columns named above (plus ``label`` when
            ``test`` is truthy).
        test: selects the prompt builder and the parsing layout, see above.
        save_every: a checkpoint CSV ``data/train/syn_train_<i>.csv`` holding
            all rows processed so far is written whenever the zero-based row
            index ``i`` is a multiple of this value (this includes ``i == 0``).
            ``0`` or ``None`` disables checkpointing.

    Returns:
        Thirteen parallel lists: essay ids, topics, contexts, sentences,
        labels, reason 1 .. reason 7, rewrites.
    """
    # Keys double as the checkpoint CSV header; insertion order is the
    # order of the returned lists.
    columns = {
        'essay_id': [],
        'topic': [],
        'context': [],
        'sentence': [],
        'label': [],
        'R1': [],
        'R2': [],
        'R3': [],
        'R4': [],
        'R5': [],
        'R6': [],
        'R7': [],
        'Rewrite': [],
    }
    reason_keys = ('R1', 'R2', 'R3', 'R4', 'R5', 'R6', 'R7')
    for i in tqdm(range(len(data_df))):
        row = data_df.iloc[i]
        essay_id = row['essay_id']
        topic = row['topic']
        context = row['context']
        sentence = row['sentence']
        # NOTE(review): formulate_prompt_test only asks the model for a
        # rewrite, while the test-mode parsing below expects seven reason
        # lines followed by a rewrite line -- confirm which prompt is
        # intended for test mode.
        if test:
            prompt = formulate_prompt_test(context, sentence)
        else:
            prompt = formulate_prompt(context, sentence)
        # Every field defaults to an empty string so a failed request or a
        # malformed reply still yields a (partially) blank row.
        label = ''
        reasons = [''] * len(reason_keys)
        rewrite = ''
        try:
            response = client.chat.completions.create(
                model="gpt-4o",
                messages=prompt,
                max_tokens=4096,
                temperature=0
            )
            output = response.choices[0].message.content
            print(output)
            # Blank lines would shift the positional indices, so drop them.
            lines = [ln for ln in output.split('\n') if ln.strip()]
            if test:
                label = row['label']
                first_reason = 0
                has_reasons = True
            else:
                label = _field_value(lines[0])
                first_reason = 1
                # A coherent sentence ('1' followed by "Done") has no
                # reasons or rewrite to parse.
                has_reasons = label == '0'
            if has_reasons:
                for k in range(len(reason_keys)):
                    reasons[k] = _field_value(lines[first_reason + k])
                rewrite = _field_value(lines[first_reason + len(reason_keys)])
        except Exception as e:
            print(e)
            print('Error with index ', i)
            print('Context: ', context)
            print('Sentence: ', sentence)
        columns['essay_id'].append(essay_id)
        columns['topic'].append(topic)
        columns['context'].append(context)
        columns['sentence'].append(sentence)
        columns['label'].append(label)
        for key, value in zip(reason_keys, reasons):
            columns[key].append(value)
        columns['Rewrite'].append(rewrite)
        # Periodic checkpoint so a crash late in a long run does not lose
        # the rows already paid for. A falsy save_every disables it (and
        # avoids a modulo-by-zero).
        if save_every and i % save_every == 0:
            f_name = 'data/train/syn_train_{}.csv'.format(i)
            # The write happens outside the try above; make sure a missing
            # directory cannot abort the whole run.
            os.makedirs(os.path.dirname(f_name), exist_ok=True)
            pd.DataFrame(columns).to_csv(f_name, index=False)
    return tuple(columns.values())
def generate_by_detect_reason_rewrite(inc_file, output_file):
    """Run detect_reason_rewrite over a CSV and save the combined result.

    Args:
        inc_file: path of the input CSV, loaded with ``pd.read_csv``.
        output_file: path the resulting CSV is written to (without index).

    The input is processed with ``test=True`` and a checkpoint interval of
    50; the returned lists become the columns of the output file.
    """
    # Output header, in the same order as the lists returned by
    # detect_reason_rewrite.
    column_names = (
        'essay_id', 'topic', 'context', 'sentence', 'label',
        'R1', 'R2', 'R3', 'R4', 'R5', 'R6', 'R7',
        'Rewrite',
    )
    source_df = pd.read_csv(inc_file)
    results = detect_reason_rewrite(source_df, test=True, save_every=50)
    df_all = pd.DataFrame(dict(zip(column_names, results)))
    df_all.to_csv(output_file, index=False)
if __name__ == '__main__':
    # Script entry point: run the detect / reason / rewrite pass over the raw
    # sample file and write the combined table to data/train/syn_train.csv.
    generate_by_detect_reason_rewrite('./data/raw/sample_759_inc.csv', 'data/train/syn_train.csv')
    # Disabled alternative run: rewrite-only generation via rewrite_sentence()
    # on a test split, saved to its own CSV.
    # data_df = pd.read_csv('data/test/test_rewrite_213_no_delete.csv')
    # essay_ids, topics, contexts, sentences, rewrites = rewrite_sentence(data_df)
    # # save the results to a csv file
    # df_all = pd.DataFrame({
    # 'essay_id': essay_ids,
    # 'topic': topics,
    # 'context': contexts,
    # 'sentence': sentences,
    # 'rewrite': rewrites
    # })
    # f_name = 'data/test/test_rewrite_213_gpt_turbo_ref.csv'
    # df_all.to_csv(f_name, index=False)