
Commit e1161d3

committed
modify the template; complete the reasoning test for 3 LLMs using ChatLogic; test the Llama2-7B fine-tuned version; create the ablation study template for 3 LLMs
1 parent 5dc2433 commit e1161d3

10 files changed: +278 -15 lines changed

Diff for: Ablation Study/Ablation_study_Llama2.py

Whitespace-only changes.

Diff for: Ablation Study/Ablation_study_gpt3.5.py

Whitespace-only changes.

Diff for: Ablation Study/Ablation_study_gpt4.py

Whitespace-only changes.
Diff for: (new file; file name not captured in this view)

@@ -0,0 +1,37 @@
import csv
import json
import re

json_files = [
    "../../PARARULE_plus_step2_Animal_sample.json",
    "../../PARARULE_plus_step3_Animal_sample.json",
    "../../PARARULE_plus_step4_Animal_sample.json",
    "../../PARARULE_plus_step5_Animal_sample.json",
    "../../PARARULE_plus_step2_People_sample.json",
    "../../PARARULE_plus_step3_People_sample.json",
    "../../PARARULE_plus_step4_People_sample.json",
    "../../PARARULE_plus_step5_People_sample.json"
]

def remove_spaces(text):
    # Replace multiple spaces with a single space
    text = re.sub(r' +', ' ', text)
    # Remove leading and trailing spaces from each line
    text = re.sub(r'^ +| +$', '', text, flags=re.MULTILINE)
    return text

with open("Llama2-7B-finetune-prompt.csv", "w", newline="", encoding="utf-8") as csv_file:
    csv_writer = csv.writer(csv_file)
    for json_file in json_files:
        step = '_'.join(json_file.split("_")[2:4])
        with open(json_file, "r", encoding="utf-8") as f:
            data = json.load(f)
        for entry in data:
            context = entry["context"]
            question = entry["question"]
            label = entry["label"]
            # build the instruction-style prompt for this example
            prompt = f"""instruction: Based on the closed world assumption, please help me complete a multi-step logical reasoning task (judge true or not). Please help me answer whether the question is correct or not based on the facts and rules formed by these natural language propositions. You should just return me one number as the final answer (1 for true and 0 for wrong) without providing any reasoning process. The input contains all propositions, each sentence is an independent proposition, and the question, and the output is the answer to the question.,\
            input: Propositions: {context}, Question: {question}, output:"""
            csv_writer.writerow([remove_spaces(prompt)])
            csv_writer.writerow([label])
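Note that the writer above emits two CSV rows per example: the cleaned prompt on one row and its 0/1 label on the next, so any consumer has to read the rows back in pairs. Below is a minimal sketch of such a reader, assuming exactly that two-row layout; the pairing loop is an illustration, not part of the commit.

import csv

# Sketch: recover (prompt, label) pairs from the two-rows-per-example CSV written above.
pairs = []
with open("Llama2-7B-finetune-prompt.csv", newline="", encoding="utf-8") as f:
    rows = [row[0] for row in csv.reader(f) if row]
for prompt_text, label_text in zip(rows[0::2], rows[1::2]):
    pairs.append((prompt_text, int(label_text)))
print(f"loaded {len(pairs)} prompt/label pairs")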
Diff for: (new file; file name not captured in this view)

@@ -0,0 +1,85 @@
from peft import PeftModel
from transformers import AutoTokenizer, LlamaForCausalLM
import torch
import csv
import json
import re

# run this code on the VM machine (server from the lab)

# json_files = [
#     "PARARULE_plus_step2_Animal_sample.json"
# ]
# "PARARULE_plus_step3_Animal_sample.json",
# "PARARULE_plus_step4_Animal_sample.json",
# "PARARULE_plus_step5_Animal_sample.json"
# "../PARARULE_plus_step2_People_sample.json",
# "../PARARULE_plus_step3_People_sample.json",
# "../PARARULE_plus_step4_People_sample.json",
# "../PARARULE_plus_step5_People_sample.json"

device = torch.device('cuda:0')
# load the original LLM
model_path = "meta-llama/Llama-2-7b-hf"
model = LlamaForCausalLM.from_pretrained(model_path, trust_remote_code=True).half().to(device)
tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)
# attach the LoRA adapter produced by fine-tuning
model = PeftModel.from_pretrained(model, "lora-alpaca").half()
prompt = "instruction: Based on the closed world assumption, please help me complete a multi-step logical reasoning task (judge true or not). Please help me answer whether the question is correct or not based on the facts and rules formed by these natural language propositions. You should just return me one number as the final answer (1 for true and 0 for wrong) without providing any reasoning process. The input contains all propositions, each sentence is an independent proposition, and the question, and the output is the answer to the question., input: Propositions: The dinosaur is lazy. The dinosaur is rough. The wolf is heavy. The wolf is fierce. The dinosaur visits the squirrel. The wolf likes the rabbit. The squirrel is quiet. The rabbit is quiet. The rabbit is furry. The rabbit is small. If something is not quiet then it attacks the squirrel. If something attacks the squirrel then it is dull. If something is not smart then it is heavy. If something is not strong then it is beautiful. If something is furry then it is small. If something is small and not big then it is lovely. If something is heavy and not smart then it is awful. If something is lazy and rough then it is big. If something is big and not small then it is fierce. All beautiful animals are cute., Question: The dinosaur is awful., output:"
inputs = tokenizer(prompt, return_tensors="pt").to(device)
generate_ids = model.generate(input_ids=inputs.input_ids, max_length=2048)
print(tokenizer.batch_decode(generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0])

# model.generate(tokenizer, text, history=[])

# # load LoRA hyperparameters into the original LLM
# model = PeftModel.from_pretrained(model, "lora-alpaca").half()
# model.generate(tokenizer, text, history=[])

# from transformers import Pipeline, LlamaForCausalLM, AutoTokenizer
# import torch
# import csv
# import json

# # Initialize the model and tokenizer
# model_path = "meta-llama/Llama-2-7b-hf"
# model = LlamaForCausalLM.from_pretrained(model_path, trust_remote_code=True).half()
# tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)

# # define pipeline
# generate_pipeline = Pipeline(model=model, tokenizer=tokenizer)

# # process multiple json files
# json_files = [
#     "PARARULE_plus_step2_Animal_sample.json",
#     "PARARULE_plus_step3_Animal_sample.json",
#     "PARARULE_plus_step4_Animal_sample.json",
#     "PARARULE_plus_step5_Animal_sample.json"
# ]

# # initialize the csv file
# with open("Llama2-7B-finetune-animal.csv", "w", newline="", encoding="utf-8") as csv_file:
#     csv_writer = csv.writer(csv_file)
#     csv_writer.writerow(["step", "return", "label"])  # Write header

#     # process each json file
#     for json_file in json_files:
#         step = '_'.join(json_file.split("_")[2:4])
#         with open(json_file, "r", encoding="utf-8") as f:
#             data = json.load(f)
#         for entry in data:
#             context = entry["context"]
#             question = entry["question"]
#             label = entry["label"]
#             prompt = f"""instruction: Based on the closed world assumption, please help me complete a multi-step logical reasoning task (judge true or not). Please help me answer whether the question is correct or not based on the facts and rules formed by these natural language propositions. You should just return me one number as the final answer (1 for true and 0 for wrong) without providing any reasoning process. The input contains all propositions, each sentence is an independent proposition, and the question, and the output is the answer to the question.,\
#             input: Propositions: {context}, Question: {question}, output:"""

#             # generate the context
#             generated_text = generate_pipeline(prompt, max_length=2048)

#             # write into csv file
#             csv_writer.writerow([step, generated_text[0]['generated_text'], label])
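Because batch_decode returns the prompt followed by the model's continuation, the 0/1 answer still has to be pulled off the tail of the decoded string. A hedged sketch of one way to do that follows; parse_binary_answer is a hypothetical helper and not part of the committed script.

import re

def parse_binary_answer(decoded):
    # Hypothetical helper: look at the text after the final "output:" marker
    # and return the first 0/1 digit found there, or None if the model gave neither.
    tail = decoded.rsplit("output:", 1)[-1]
    match = re.search(r"[01]", tail)
    return int(match.group()) if match else None

# Example: parse_binary_answer("... Question: The dinosaur is awful., output: 1") -> 1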

Diff for: complete_reasoning_3.5.py

+3 -2

@@ -72,7 +72,7 @@ def Regeneration(demo, code, text, model = "gpt-3.5-turbo"):


 correct_num = 0
-for i in range(0, 1):
+for i in range(0, 40):
     try:
         # first time generate the code from propositions
         result_string = extract_string(Generation(templates.templates["agent_engineer"], data[i]['context'],
@@ -90,7 +90,8 @@ def Regeneration(demo, code, text, model = "gpt-3.5-turbo"):
         print(f"tag: {tag}")
         print(f"tag_final: {tag_final}")
         # if it passes the comparison
-        if "1" in tag_final:
+        if "true" in tag_final:
+            print("no need to regenerate")
             flag = 0
             with open(PY_filename, 'w') as file:
                 file.write("{}".format(result_string))

Diff for: complete_reasoning_4.py

+143

@@ -0,0 +1,143 @@
import json
import call_openai_API
import templates
import openai
import subprocess
import csv
import os

# Initialize the OpenAI API client
openai.api_key = api_key = os.getenv("OPENAI_API_KEY")
# Define the file names
JSON_filename = 'PARARULE_plus_step2_People_sample.json'
PY_filename = 'pyDatalog_processing.py'


def extract_string(input_string):
    left_boundary = 'import'
    right_boundary = ')'

    start_index = input_string.find(left_boundary)
    end_index = input_string.rfind(right_boundary, start_index)

    if start_index != -1 and end_index != -1:
        extracted_string = input_string[start_index:end_index + 1]
        return extracted_string.strip()

    return None


def Judgement(demo, question, model):
    result_string = call_openai_API.ai_generation_check(demo, question, model = "gpt-4")
    return result_string


# Complete Communication with ChatGPT
def Generation(demo, context, question, requirements, model = "gpt-4"):
    result_string = call_openai_API.ai_function_generation(demo, context, question, requirements, model)
    return result_string

def BackConvertion(demo, code, model = "gpt-4"):
    result_string = call_openai_API.ai_function_backconvertion(demo, code, model)
    return result_string

# Communication(templates.templates["agent_engineer"], PARARULE_Plus.PARARULE_Plus_dataset['train'][200]['context'], PARARULE_Plus.PARARULE_Plus_dataset['train'][200]['question'], templates.templates["no_extra_content"], "gpt-3.5-turbo")

def Adjustment(demo, code, error_message, model = "gpt-4"):
    result_string = call_openai_API.ai_generation_adjustment(demo, code, error_message, model)
    return result_string

def Extraction(demo, text, model = "gpt-4"):
    result_string = call_openai_API.ai_function_extraction(demo, text, model)
    return result_string

def Comparison(demo, original, generated, model = "gpt-4"):
    result_string = call_openai_API.ai_function_comparison(demo, original, generated, model)
    return result_string


def Regeneration(demo, code, text, model = "gpt-4"):
    result_string = call_openai_API.ai_function_regeneration(demo, code, text, model)
    return result_string


with open(JSON_filename, 'r') as file:
    data = json.load(file)

correct_num = 0
for i in range(0, 40):
    try:
        # first time generate the code from propositions
        result_string = extract_string(Generation(templates.templates["agent_engineer"], data[i]['context'],
                                                  data[i]['question'],
                                                  templates.templates["no_extra_content"]))
        # print(result_string)

        # convert the code back to propositions
        propositions_generated = BackConvertion(templates.templates["agent_engineer_neg"], result_string)

        # Comparison
        # zero-shot CoT is here
        tag = Comparison(templates.templates["check_error_part1"], f"Propositions:{data[i]['context']}, Question:{data[i]['question']}", propositions_generated)
        tag_final = Extraction(templates.templates["check_error_part2"], tag)
        print(f"tag: {tag}")
        print(f"tag_final: {tag_final}")
        # if it passes the comparison
        if "true" in tag_final:
            print("no need to regenerate")
            flag = 0
            with open(PY_filename, 'w') as file:
                file.write("{}".format(result_string))
            output = subprocess.check_output(['python', PY_filename], universal_newlines=True)
            print(f"output: {output}")
            while (output.strip() != '1' and output.strip() != '0'):
                result_string = extract_string(Adjustment(templates.templates["adjustment_agent"],
                                                          result_string, output))
                with open(PY_filename, 'w') as file:
                    file.write("{}".format(result_string))
                print("reprocessing...")
                output = subprocess.check_output(['python', PY_filename], universal_newlines=True)
                print("New output:" + output)
                print(type(output))
                flag += 1
                if (flag == 3):
                    break
        else:
            print("enter the regeneration part")
            # regeneration
            result_string = extract_string(Regeneration(templates.templates["regeneration"], f"Propositions:{data[i]['context']}, Question:{data[i]['question']}", result_string, tag_final))
            print(f"regeneration result: {result_string}")
            with open(PY_filename, 'w') as file:
                file.write("{}".format(result_string))
            output = subprocess.check_output(['python', PY_filename], universal_newlines=True)
            flag = 0
            while (output.strip() != '1' and output.strip() != '0'):
                result_string = extract_string(Adjustment(templates.templates["adjustment_agent"],
                                                          result_string, output))
                with open(PY_filename, 'w') as file:
                    file.write("{}".format(result_string))
                print("reprocessing...")
                output = subprocess.check_output(['python', PY_filename], universal_newlines=True)
                print("New output:" + output)
                print(type(output))
                flag += 1
                if (flag == 3):
                    break

        # check correctness
        if (output.strip() != '1' and output.strip() != '0'):
            # no valid 0/1 answer after the retries, so skip this example
            continue
        if int(output.strip()) == data[i]['label']:
            correct_num += 1
        else:
            continue
    except Exception as e:
        continue
print(f"correct_num: {correct_num}")

Diff for: complete_reasoning_Llama2.py

+3 -2

@@ -94,7 +94,7 @@ def batch_process(text):
 # csv_writer.writerow([step, responses, label])

 correct_num = 0
-for i in range(0, 1):
+for i in range(0, 40):
     try:

         # first time generate the code from propositions
@@ -111,7 +111,8 @@ def batch_process(text):
         tag_final = batch_process(f"""{templates.templates['check_error_part2']}, the following is the analysis processing: {tag}""")

         # if it passes the comparison
-        if "1" in tag_final:
+        if "true" in tag_final:
+
             flag = 0
             with open(PY_filename, 'w') as file:
                 file.write("{}".format(result_string))

Diff for: pyDatalog_processing.py

+4 -9

@@ -2,8 +2,7 @@
 from pyDatalog import pyDatalog
 try:
     # Declare the pyDatalog variables
-    pyDatalog.create_terms('X, strong, huge, big, short, little, quiet, wealthy, smart, dull, rough, sad, thin, bad, kind, nice, poor, small')
-
+    pyDatalog.create_terms('X, strong, huge, big, short, little, thin, dull, rough, bad, quiet, wealthy, kind, small, nice, poor, smart, sad')
     # Define the facts
     +strong('Dave')
     +huge('Dave')
@@ -16,25 +15,21 @@
     +dull('Harry')
     +rough('Harry')
     +sad('Harry')
-
     # Define the rules
     quiet(X) <= strong(X)
     thin(X) <= short(X) & little(X)
     bad(X) <= dull(X) & rough(X)
     kind(X) <= quiet(X) & wealthy(X)
-
-    # Define the constraints
     small(X) <= thin(X)
     wealthy(X) <= quiet(X)
     nice(X) <= kind(X)
     poor(X) <= bad(X)
-
     # Query the knowledge base
-    result = ~small('Gary')
+    result = small('Gary')
     if result:
-        print(1)
-    else:
         print(0)
+    else:
+        print(1)
 except Exception as e:
     traceback_info = traceback.format_exc()
     print(traceback_info)
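For context on the flipped query: in pyDatalog a ground query such as small('Gary') is truthy when the atom can be derived from the facts and rules and falsy otherwise, so the rewritten block prints 0 when "Gary is small" can be derived and 1 when it cannot. A self-contained sketch of that behaviour, assuming only the pyDatalog package:

from pyDatalog import pyDatalog

pyDatalog.create_terms('X, thin, small')
+thin('Gary')           # fact: Gary is thin
small(X) <= thin(X)     # rule: anything thin is small

result = small('Gary')  # derivable, so the query result is truthy
if result:
    print(0)            # mirrors the updated pyDatalog_processing.py
else:
    print(1)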

Diff for: templates.py

+3 -2

@@ -119,8 +119,9 @@ def remove_spaces(text):
     2. The number of propositions in the two texts must be consistent (including propositions and rules), otherwise it will be regarded as a difference.\n \
     3. If there is a difference, please tell me the difference.\n \
     Please think about this question step by step. """),
-    "check_error_part2": remove_spaces("""Based on your analysis process of text comparison, please give me a final conclusion. We only consider differences in content expression of texts and ignore differences in expression or structure. \
-    If there is no difference between the two texts, please return only the number 1 to me. If there is a difference, please return to me the content of the difference."""),
+    "check_error_part2": remove_spaces("""Based on your analysis process of text comparison, please give me a final conclusion. We only consider differences in content expression of texts and ignore differences in expression or structure. If there is no difference between the two texts, please return only the word "true" to me. If there is a difference, please return to me the content of the difference without anything else.
+    your expected output should be like: \n \
+    Bob is huge.(original) vs Bob is big.(generated):they are different."""),
     "regeneration": remove_spaces("""I interacted with you and completed the following actions:\n \
     1. I asked you to help me convert logical reasoning problems described in natural language into pydatalog code.\n \
     2. After the first step is completed, I asked you to convert the pydatalog code you generated back into a logical reasoning problem described in natural language (note that in this step, I did not provide you with the context of the first step's behavior)\n \
