
Commit e1161d3

committed
modify the template; complete the reasoning test for 3 LLMs using ChatLogic; test the Llama2-7B fine-tuned version; create the ablation study template for 3 LLMs
1 parent 5dc2433 commit e1161d3

10 files changed: +278 -15 lines changed

Diff for: Ablation Study/Ablation_study_Llama2.py

Whitespace-only changes.

Diff for: Ablation Study/Ablation_study_gpt3.5.py

Whitespace-only changes.

Diff for: Ablation Study/Ablation_study_gpt4.py

Whitespace-only changes.
Diff for: (new file; file name not captured in this view)

@@ -0,0 +1,37 @@
import csv
import json
import re

json_files = [
    "../../PARARULE_plus_step2_Animal_sample.json",
    "../../PARARULE_plus_step3_Animal_sample.json",
    "../../PARARULE_plus_step4_Animal_sample.json",
    "../../PARARULE_plus_step5_Animal_sample.json",
    "../../PARARULE_plus_step2_People_sample.json",
    "../../PARARULE_plus_step3_People_sample.json",
    "../../PARARULE_plus_step4_People_sample.json",
    "../../PARARULE_plus_step5_People_sample.json"
]

def remove_spaces(text):
    # Replace multiple spaces with a single space
    text = re.sub(r' +', ' ', text)
    # Remove leading and trailing spaces from each line
    text = re.sub(r'^ +| +$', '', text, flags=re.MULTILINE)
    return text

with open("Llama2-7B-finetune-prompt.csv", "w", newline="", encoding="utf-8") as csv_file:
    csv_writer = csv.writer(csv_file)
    for json_file in json_files:
        step = '_'.join(json_file.split("_")[2:4])
        with open(json_file, "r", encoding="utf-8") as f:
            data = json.load(f)
        for entry in data:
            context = entry["context"]
            question = entry["question"]
            label = entry["label"]
            # build the instruction-style prompt for this example
            prompt = f"""instruction: Based on the closed world assumption, please help me complete a multi-step logical reasoning task (judge true or not). Please help me answer whether the question is correct or not based on the facts and rules formed by these natural language propositions. You should just return me one number as the final answer (1 for true and 0 for wrong) without providing any reasoning process. The input contains all propositions, each sentence is an independent proposition, and the question, and the output is the answer to the question.,\
            input: Propositions: {context}, Question: {question}, output:"""
            csv_writer.writerow([remove_spaces(prompt)])
            csv_writer.writerow([label])
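Note that the writer above emits two CSV rows per example: the cleaned prompt on one row and its 0/1 label on the next, so any consumer has to read the rows back in pairs. Below is a minimal sketch of such a reader, assuming exactly that two-row layout; the pairing loop is an illustration, not part of the commit.

import csv

# Sketch: recover (prompt, label) pairs from the two-rows-per-example CSV written above.
pairs = []
with open("Llama2-7B-finetune-prompt.csv", newline="", encoding="utf-8") as f:
    rows = [row[0] for row in csv.reader(f) if row]
for prompt_text, label_text in zip(rows[0::2], rows[1::2]):
    pairs.append((prompt_text, int(label_text)))
print(f"loaded {len(pairs)} prompt/label pairs")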
Diff for: (new file; file name not captured in this view)

@@ -0,0 +1,85 @@
from peft import PeftModel
from transformers import AutoTokenizer, LlamaForCausalLM
import torch
import csv
import json
import re

# run this code on the VM machine (server from the lab)

# json_files = [
#     "PARARULE_plus_step2_Animal_sample.json"
# ]
# "PARARULE_plus_step3_Animal_sample.json",
# "PARARULE_plus_step4_Animal_sample.json",
# "PARARULE_plus_step5_Animal_sample.json"
# "../PARARULE_plus_step2_People_sample.json",
# "../PARARULE_plus_step3_People_sample.json",
# "../PARARULE_plus_step4_People_sample.json",
# "../PARARULE_plus_step5_People_sample.json"

device = torch.device('cuda:0')
# load the original LLM
model_path = "meta-llama/Llama-2-7b-hf"
model = LlamaForCausalLM.from_pretrained(model_path, trust_remote_code=True).half().to(device)
tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)
# attach the LoRA adapter produced by fine-tuning
model = PeftModel.from_pretrained(model, "lora-alpaca").half()
prompt = "instruction: Based on the closed world assumption, please help me complete a multi-step logical reasoning task (judge true or not). Please help me answer whether the question is correct or not based on the facts and rules formed by these natural language propositions. You should just return me one number as the final answer (1 for true and 0 for wrong) without providing any reasoning process. The input contains all propositions, each sentence is an independent proposition, and the question, and the output is the answer to the question., input: Propositions: The dinosaur is lazy. The dinosaur is rough. The wolf is heavy. The wolf is fierce. The dinosaur visits the squirrel. The wolf likes the rabbit. The squirrel is quiet. The rabbit is quiet. The rabbit is furry. The rabbit is small. If something is not quiet then it attacks the squirrel. If something attacks the squirrel then it is dull. If something is not smart then it is heavy. If something is not strong then it is beautiful. If something is furry then it is small. If something is small and not big then it is lovely. If something is heavy and not smart then it is awful. If something is lazy and rough then it is big. If something is big and not small then it is fierce. All beautiful animals are cute., Question: The dinosaur is awful., output:"
inputs = tokenizer(prompt, return_tensors="pt").to(device)
generate_ids = model.generate(input_ids=inputs.input_ids, max_length=2048)
print(tokenizer.batch_decode(generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0])

# model.generate(tokenizer, text, history=[])

# # load LoRA hyperparameters into the original LLM
# model = PeftModel.from_pretrained(model, "lora-alpaca").half()
# model.generate(tokenizer, text, history=[])

# from transformers import Pipeline, LlamaForCausalLM, AutoTokenizer
# import torch
# import csv
# import json

# # Initialize the model and tokenizer
# model_path = "meta-llama/Llama-2-7b-hf"
# model = LlamaForCausalLM.from_pretrained(model_path, trust_remote_code=True).half()
# tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)

# # define pipeline
# generate_pipeline = Pipeline(model=model, tokenizer=tokenizer)

# # process multiple json files
# json_files = [
#     "PARARULE_plus_step2_Animal_sample.json",
#     "PARARULE_plus_step3_Animal_sample.json",
#     "PARARULE_plus_step4_Animal_sample.json",
#     "PARARULE_plus_step5_Animal_sample.json"
# ]

# # initialize the csv file
# with open("Llama2-7B-finetune-animal.csv", "w", newline="", encoding="utf-8") as csv_file:
#     csv_writer = csv.writer(csv_file)
#     csv_writer.writerow(["step", "return", "label"])  # Write header

#     # process each json file
#     for json_file in json_files:
#         step = '_'.join(json_file.split("_")[2:4])
#         with open(json_file, "r", encoding="utf-8") as f:
#             data = json.load(f)
#         for entry in data:
#             context = entry["context"]
#             question = entry["question"]
#             label = entry["label"]
#             prompt = f"""instruction: Based on the closed world assumption, please help me complete a multi-step logical reasoning task (judge true or not). Please help me answer whether the question is correct or not based on the facts and rules formed by these natural language propositions. You should just return me one number as the final answer (1 for true and 0 for wrong) without providing any reasoning process. The input contains all propositions, each sentence is an independent proposition, and the question, and the output is the answer to the question.,\
#             input: Propositions: {context}, Question: {question}, output:"""

#             # generate the context
#             generated_text = generate_pipeline(prompt, max_length=2048)

#             # write into csv file
#             csv_writer.writerow([step, generated_text[0]['generated_text'], label])
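Because batch_decode returns the prompt followed by the model's continuation, the 0/1 answer still has to be pulled off the tail of the decoded string. A hedged sketch of one way to do that follows; parse_binary_answer is a hypothetical helper and not part of the committed script.

import re

def parse_binary_answer(decoded):
    # Hypothetical helper: look at the text after the final "output:" marker
    # and return the first 0/1 digit found there, or None if the model gave neither.
    tail = decoded.rsplit("output:", 1)[-1]
    match = re.search(r"[01]", tail)
    return int(match.group()) if match else None

# Example: parse_binary_answer("... Question: The dinosaur is awful., output: 1") -> 1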

Diff for: complete_reasoning_3.5.py

+3 -2

@@ -72,7 +72,7 @@ def Regeneration(demo, code, text, model = "gpt-3.5-turbo"):


 correct_num = 0
-for i in range(0, 1):
+for i in range(0, 40):
     try:
         # first time generate the code from propositions
         result_string = extract_string(Generation(templates.templates["agent_engineer"], data[i]['context'],
@@ -90,7 +90,8 @@ def Regeneration(demo, code, text, model = "gpt-3.5-turbo"):
         print(f"tag: {tag}")
         print(f"tag_final: {tag_final}")
         # if it passes the comparison
-        if "1" in tag_final:
+        if "true" in tag_final:
+            print("no need to regenerate")
             flag = 0
             with open(PY_filename, 'w') as file:
                 file.write("{}".format(result_string))

Diff for: complete_reasoning_4.py

+143

@@ -0,0 +1,143 @@
import json
import call_openai_API
import templates
import openai
import subprocess
import csv
import os

# Initialize the OpenAI API client
openai.api_key = api_key = os.getenv("OPENAI_API_KEY")
# Define the file names
JSON_filename = 'PARARULE_plus_step2_People_sample.json'
PY_filename = 'pyDatalog_processing.py'


def extract_string(input_string):
    left_boundary = 'import'
    right_boundary = ')'

    start_index = input_string.find(left_boundary)
    end_index = input_string.rfind(right_boundary, start_index)

    if start_index != -1 and end_index != -1:
        extracted_string = input_string[start_index:end_index + 1]
        return extracted_string.strip()

    return None


def Judgement(demo, question, model):
    result_string = call_openai_API.ai_generation_check(demo, question, model = "gpt-4")
    return result_string


# Complete Communication with ChatGPT
def Generation(demo, context, question, requirements, model = "gpt-4"):
    result_string = call_openai_API.ai_function_generation(demo, context, question, requirements, model)
    return result_string

def BackConvertion(demo, code, model = "gpt-4"):
    result_string = call_openai_API.ai_function_backconvertion(demo, code, model)
    return result_string

# Communication(templates.templates["agent_engineer"], PARARULE_Plus.PARARULE_Plus_dataset['train'][200]['context'], PARARULE_Plus.PARARULE_Plus_dataset['train'][200]['question'], templates.templates["no_extra_content"], "gpt-3.5-turbo")

def Adjustment(demo, code, error_message, model = "gpt-4"):
    result_string = call_openai_API.ai_generation_adjustment(demo, code, error_message, model)
    return result_string

def Extraction(demo, text, model = "gpt-4"):
    result_string = call_openai_API.ai_function_extraction(demo, text, model)
    return result_string

def Comparison(demo, original, generated, model = "gpt-4"):
    result_string = call_openai_API.ai_function_comparison(demo, original, generated, model)
    return result_string


def Regeneration(demo, code, text, model = "gpt-4"):
    result_string = call_openai_API.ai_function_regeneration(demo, code, text, model)
    return result_string


with open(JSON_filename, 'r') as file:
    data = json.load(file)

correct_num = 0
for i in range(0, 40):
    try:
        # first time generate the code from propositions
        result_string = extract_string(Generation(templates.templates["agent_engineer"], data[i]['context'],
                                                  data[i]['question'],
                                                  templates.templates["no_extra_content"]))
        # print(result_string)

        # convert the code back to propositions
        propositions_generated = BackConvertion(templates.templates["agent_engineer_neg"], result_string)

        # Comparison
        # zero-shot CoT is here
        tag = Comparison(templates.templates["check_error_part1"], f"Propositions:{data[i]['context']}, Question:{data[i]['question']}", propositions_generated)
        tag_final = Extraction(templates.templates["check_error_part2"], tag)
        print(f"tag: {tag}")
        print(f"tag_final: {tag_final}")
        # if it passes the comparison
        if "true" in tag_final:
            print("no need to regenerate")
            flag = 0
            with open(PY_filename, 'w') as file:
                file.write("{}".format(result_string))
            output = subprocess.check_output(['python', PY_filename], universal_newlines=True)
            print(f"output: {output}")
            while (output.strip() != '1' and output.strip() != '0'):
                result_string = extract_string(Adjustment(templates.templates["adjustment_agent"],
                                                          result_string, output))
                with open(PY_filename, 'w') as file:
                    file.write("{}".format(result_string))
                print("reprocessing...")
                output = subprocess.check_output(['python', PY_filename], universal_newlines=True)
                print("New output:" + output)
                print(type(output))
                flag += 1
                if (flag == 3):
                    break
        else:
            print("enter the regeneration part")
            # regeneration
            result_string = extract_string(Regeneration(templates.templates["regeneration"], f"Propositions:{data[i]['context']}, Question:{data[i]['question']}", result_string, tag_final))
            print(f"regeneration result: {result_string}")
            with open(PY_filename, 'w') as file:
                file.write("{}".format(result_string))
            output = subprocess.check_output(['python', PY_filename], universal_newlines=True)
            flag = 0
            while (output.strip() != '1' and output.strip() != '0'):
                result_string = extract_string(Adjustment(templates.templates["adjustment_agent"],
                                                          result_string, output))
                with open(PY_filename, 'w') as file:
                    file.write("{}".format(result_string))
                print("reprocessing...")
                output = subprocess.check_output(['python', PY_filename], universal_newlines=True)
                print("New output:" + output)
                print(type(output))
                flag += 1
                if (flag == 3):
                    break

        # check correctness
        if (output.strip() != '1' and output.strip() != '0'):
            # no valid 0/1 answer after the retries, so skip this example
            continue
        if int(output.strip()) == data[i]['label']:
            correct_num += 1
        else:
            continue
    except Exception as e:
        continue
print(f"correct_num: {correct_num}")

Diff for: complete_reasoning_Llama2.py

+3 -2

@@ -94,7 +94,7 @@ def batch_process(text):
 # csv_writer.writerow([step, responses, label])

 correct_num = 0
-for i in range(0, 1):
+for i in range(0, 40):
     try:

         # first time generate the code from propositions
@@ -111,7 +111,8 @@ def batch_process(text):
         tag_final = batch_process(f"""{templates.templates['check_error_part2']}, the following is the analysis processing: {tag}""")

         # if it passes the comparison
-        if "1" in tag_final:
+        if "true" in tag_final:
+
             flag = 0
             with open(PY_filename, 'w') as file:
                 file.write("{}".format(result_string))

Diff for: pyDatalog_processing.py

+4 -9

@@ -2,8 +2,7 @@
 from pyDatalog import pyDatalog
 try:
     # Declare the pyDatalog variables
-    pyDatalog.create_terms('X, strong, huge, big, short, little, quiet, wealthy, smart, dull, rough, sad, thin, bad, kind, nice, poor, small')
-
+    pyDatalog.create_terms('X, strong, huge, big, short, little, thin, dull, rough, bad, quiet, wealthy, kind, small, nice, poor, smart, sad')
     # Define the facts
     +strong('Dave')
     +huge('Dave')
@@ -16,25 +15,21 @@
     +dull('Harry')
     +rough('Harry')
     +sad('Harry')
-
     # Define the rules
     quiet(X) <= strong(X)
     thin(X) <= short(X) & little(X)
     bad(X) <= dull(X) & rough(X)
     kind(X) <= quiet(X) & wealthy(X)
-
-    # Define the constraints
     small(X) <= thin(X)
     wealthy(X) <= quiet(X)
     nice(X) <= kind(X)
     poor(X) <= bad(X)
-
     # Query the knowledge base
-    result = ~small('Gary')
+    result = small('Gary')
     if result:
-        print(1)
-    else:
         print(0)
+    else:
+        print(1)
 except Exception as e:
     traceback_info = traceback.format_exc()
     print(traceback_info)
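For context on the flipped query: in pyDatalog a ground query such as small('Gary') is truthy when the atom can be derived from the facts and rules and falsy otherwise, so the rewritten block prints 0 when "Gary is small" can be derived and 1 when it cannot. A self-contained sketch of that behaviour, assuming only the pyDatalog package:

from pyDatalog import pyDatalog

pyDatalog.create_terms('X, thin, small')
+thin('Gary')           # fact: Gary is thin
small(X) <= thin(X)     # rule: anything thin is small

result = small('Gary')  # derivable, so the query result is truthy
if result:
    print(0)            # mirrors the updated pyDatalog_processing.py
else:
    print(1)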

Diff for: templates.py

+3 -2

@@ -119,8 +119,9 @@ def remove_spaces(text):
     2. The number of propositions in the two texts must be consistent (including propositions and rules), otherwise it will be regarded as a difference.\n \
     3. If there is a difference, please tell me the difference.\n \
     Please think about this question step by step. """),
-    "check_error_part2": remove_spaces("""Based on your analysis process of text comparison, please give me a final conclusion. We only consider differences in content expression of texts and ignore differences in expression or structure. \
-    If there is no difference between the two texts, please return only the number 1 to me. If there is a difference, please return to me the content of the difference."""),
+    "check_error_part2": remove_spaces("""Based on your analysis process of text comparison, please give me a final conclusion. We only consider differences in content expression of texts and ignore differences in expression or structure. If there is no difference between the two texts, please return only the word "true" to me. If there is a difference, please return to me the content of the difference without anything else.
+    your expected output should be like: \n \
+    Bob is huge.(original) vs Bob is big.(generated):they are different."""),
     "regeneration": remove_spaces("""I interacted with you and completed the following actions:\n \
     1. I asked you to help me convert logical reasoning problems described in natural language into pydatalog code.\n \
     2. After the first step is completed, I asked you to convert the pydatalog code you generated back into a logical reasoning problem described in natural language (note that in this step, I did not provide you with the context of the first step's behavior)\n \
