Test #7

Merged · 5 commits · May 16, 2024
10 changes: 4 additions & 6 deletions README_cta.md
@@ -1,19 +1,17 @@
## What?
The goal is to combine the code saved in the plan (plan.json) generated by DI with its corresponding score, and then let the LLM analyze relevant experiences from the code and save them locally.
## How to use?
Make the following modifications within the main() function:

* Step 1: Replace `score` with the new score list
* Step 2: Replace `path` with your path
* Step 3: Replace `original_dir` with your folder path (the path where DI output is automatically saved)

```python
async def main():
    score = [0.1241, 0.1302, 0.1313, 0.1295, 0.1292, 0.1242, 0.1375, 0.1786, 0.1567, 0.1295]  # Replace with the new score list
    path = "your_path"  # Replace with your path
    original_dir = (
        Path(path) / "metaGPT/MetaGPT/data/output_1"
    )  # Replace with your folder path (the path where DI output is automatically saved)
```
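For reference, a minimal sketch of how the modified `main()` might then be invoked (assuming the standard `asyncio` entry-point pattern; the entry script itself is whichever file contains `main()`):

```python
import asyncio

if __name__ == "__main__":
    # Hypothetical entry point: runs the modified main() once the
    # score list, path, and original_dir above have been replaced.
    asyncio.run(main())
```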
100 changes: 100 additions & 0 deletions machine_learning_with_tools_insights.py
@@ -0,0 +1,100 @@
import asyncio
import json
import random

from pathlib import Path
from metagpt.roles.di.data_interpreter import DataInterpreter
from metagpt.logs import logger
from metagpt.utils.recovery_util import save_history


async def random_sample_tools(tools, num_samples):
return random.sample(tools, num_samples)

def load_baseline_code(file_path):
    # Read the baseline solution code into a single string.
    with open(file_path, "r") as file:
        return file.read()

def load_json_data(json_dir):
with open(json_dir, "r") as file:
json_data = json.load(file)
return json_data

def load_analysis(file_path):
    # Extract only the per-stage insights from a summarize_{i}.json file.
    new_data = []
    json_data = load_json_data(file_path)
    data = json_data[0]["Analysis"]
    _data = {
        "Data Preprocessing": {"Insights": data["Data Preprocessing"]["Insights"]},
        "Feature Engineering": {"Insights": data["Feature Engineering"]["Insights"]},
        "Model Training": {"Insights": data["Model Training"]["Insights"]},
    }
    new_data.append(_data)
    return new_data
# new_file_path = f"/Users/aurora/Desktop/summarize/Second_summarize_DI_2/summarize_new_{i}.json"
# with open(new_file_path, "w") as file:
# json.dump(new_data, file)
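# For reference, each summarize_{i}.json produced by "random_sample_analysis.py" is
# expected to look roughly like this (shape inferred from that script's
# format_output and prompt; the field contents here are illustrative only):
# [
#   {
#     "Analysis": {
#       "Data Preprocessing": {"Source": [...], "Reference IDs": [...], "Insights": "..."},
#       "Feature Engineering": {"Source": [...], "Reference IDs": [...], "Insights": "..."},
#       "Model Training": {"Source": [...], "Reference IDs": [...], "Insights": "..."}
#     }
#   }
# ]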

# async def main(requirement: str):
# preprocess_tools = ["FillMissingValue","MinMaxScale","StandardScale","MaxAbsScale","RobustScale","OrdinalEncode","OneHotEncode","LabelEncode"]
# feature_engineering_tools = ["PolynomialExpansion","CatCount","TargetMeanEncoder","KFoldTargetMeanEncoder","CatCross","GroupStat","SplitBins","ExtractTimeComps","GeneralSelection","TreeBasedSelection","VarianceBasedSelection"]
# tools = []
# tools.extend(await random_sample_tools(preprocess_tools, 2))
# tools.extend(await random_sample_tools(feature_engineering_tools, 2))
# print("The tools are:",tools)
# role = DataInterpreter(use_reflection=True, tools = tools)
# rsp = await role.run(requirement)
# logger.info(rsp)
# save_history(role=role)
# async def clear_history():

async def main(requirement: str):
role = DataInterpreter(use_reflection=True, tools=["<all>"])
rsp = await role.run(requirement)
logger.info(rsp)
save_history(role=role)

if __name__ == "__main__":
    # House Price
    data_path = "/Users/aurora/Desktop/ml_benchmark/05_house-prices-advanced-regression-techniques"
    train_path = f"{data_path}/split_train.csv"
    eval_path = f"{data_path}/split_eval.csv"
    user_requirement = (
        "This is a house price dataset; your goal is to predict the sale price of a property based on its features. "
        "Perform data analysis, data preprocessing, feature engineering, and modeling to predict the target. "
        "Report RMSE between the logarithm of the predicted value and the logarithm of the observed sale prices on the eval data. "
        "The target column is 'SalePrice'. Don't transform the skewed target column. No plots are needed."
    )
    code_path = "/Users/aurora/Desktop/metaGPT_new/MetaGPT/examples/di/first_best_code_house_price copy.py"  # baseline code path
    path = "/Users/aurora/Desktop"
    directory = Path(path) / "Summarize_new_test/Second_summarize_DI_0"
    i = 4
    file_path = Path(directory) / f"summarize_{i}.json"  # summarize_res_file in "random_sample_analysis.py"
    analysis = load_analysis(file_path)
    baseline_code = load_baseline_code(code_path)
    query = (
        user_requirement
        + "\nHere are some insights derived from high-performance code:"
        + str(analysis)
        + "\nPlease generate new complete code referencing the provided baseline code below:\n"
        + baseline_code
        + f"\nTrain data path: '{train_path}', eval data path: '{eval_path}'."
    )
    print(query)
    asyncio.run(main(query))

# Titanic
# if __name__ == "__main__":
#     # data_path = "your/path/to/titanic"
#     data_path = "/Users/aurora/Desktop/ml_benchmark/04_titanic"
#     train_path = f"{data_path}/split_train.csv"
#     eval_path = f"{data_path}/split_eval.csv"
#     requirement = f"This is a titanic passenger survival dataset; your goal is to predict passenger survival outcome. The target column is Survived. Perform data analysis, data preprocessing, feature engineering, and modeling to predict the target. Please recommend at least five models, evaluate their performance separately, and finally report the optimal result. Report accuracy on the eval data. Train data path: '{train_path}', eval data path: '{eval_path}'."

# ICR
# if __name__ == "__main__":
#     # data_path = "your/path/to/icr"
#     data_path = "/Users/aurora/Desktop/ml_benchmark/07_icr-identify-age-related-conditions"
#     train_path = f"{data_path}/split_train.csv"
#     eval_path = f"{data_path}/split_eval.csv"
#     requirement = f"The ICR dataset is a medical dataset with over fifty anonymized health characteristics linked to three age-related conditions. Your goal is to predict whether a subject has or has not been diagnosed with one of these conditions. The target column is Class. Perform data analysis, data preprocessing, feature engineering, and modeling to predict the target. Report F1 Score on the eval data. Train data path: '{train_path}', eval data path: '{eval_path}'."

159 changes: 159 additions & 0 deletions random_sample_analysis.py
@@ -0,0 +1,159 @@
import random
import asyncio
import json
import re
import os

from pathlib import Path
from metagpt.llm import LLM
from metagpt.logs import logger
from metagpt.schema import Message

STRUCTUAL_PROMPT = """
[Original Analysis]
Below are 20 randomly sampled experiences from a previous dataset, each labeled with an experience, a corresponding score, and a unique ID:
{analysis}

**Task**:
For each stage (Data Preprocessing, Feature Engineering, Model Training):
- Reasoning: Analyze the provided experiences based on the variation in their scores to identify which experiences are most likely to improve model's performance.
- Reference: Connect each key point with the corresponding experience ID.
- Insights: Based on the given reasons and referenced experience, provide some as specific and actionable as possible insights you believe can enhance the model's performance. Your insights must be listed in bullet points, with a minimum of 3 points(e.g.,1.).

**Instructions for Output**:
Organize the output into three sections corresponding to each stage of data handling:
- Data Preprocessing
- Feature Engineering
- Model Training

**Expected Output Format**:
```json
{{
    "Data Preprocessing": {{
        "Source": ["List all experience IDs related to data preprocessing."],
        "Reference IDs": ["List of IDs that you mainly reference or choose from this stage's source."],
        "Reasoning (Yes)": "Reasons for selecting these experiences.",
        "Not Reference IDs": ["List of IDs that you did not reference or choose from this stage's source."],
        "Reasoning (No)": "Reasons for not selecting these experiences.",
        "Insights": "Based on the reasons and experiences, propose insights that are as specific and actionable as possible for improving the model's performance."
    }},
    "Feature Engineering": {{
        "Source": ["List all experience IDs related to feature engineering."],
        "Reference IDs": ["List of IDs that you mainly reference or choose from this stage's source."],
        "Reasoning (Yes)": "Reasons for selecting these experiences.",
        "Not Reference IDs": ["List of IDs that you did not reference or choose from this stage's source."],
        "Reasoning (No)": "Reasons for not selecting these experiences.",
        "Insights": "Based on the reasons and experiences, propose insights that are as specific and actionable as possible for improving the model's performance. If you need to create new features, please provide specific feature names."
    }},
    "Model Training": {{
        "Source": ["List all experience IDs related to model training."],
        "Reference IDs": ["List of IDs that you mainly reference or choose from this stage's source."],
        "Reasoning (Yes)": "Reasons for selecting these experiences.",
        "Not Reference IDs": ["List of IDs that you did not reference or choose from this stage's source."],
        "Reasoning (No)": "Reasons for not selecting these experiences.",
        "Insights": "Based on the reasons and experiences, propose insights that are as specific and actionable as possible for improving the model's performance."
    }}
}}
```
"""

REFLECTION_SYSTEM_MSG = "As a Kaggle grandmaster participating in a competition, you need to analyze your experiences and propose evolutionary points that are more likely to improve the performance of your model."

async def _random_sample(analysis, num_samples):
return random.sample(analysis, num_samples)

async def load_json_data(json_dir):
with open(json_dir, "r") as file:
json_data = json.load(file)
return json_data

async def save_rsp_to_file(rsp, filename):
with open(filename, "w") as file:
json.dump(rsp, file)


async def clean_json_from_rsp(text):
    # Extract the JSON payload from a fenced ```json ... ``` block in the LLM response.
    pattern = r"```json(.*?)```"
    matches = re.findall(pattern, text, re.DOTALL)
    if matches:
        json_str = "\n".join(matches)  # avoid shadowing the imported json module
        return json_str
    else:
        return ""

async def format_output(rsp):
rsp_list = []
new_data = []
rsp_list.append(rsp)
for item in rsp_list:
item_dict = json.loads(item)
data = {
"Analysis": item_dict,
}
new_data.append(data)
return new_data
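
# Illustrative example (hypothetical input, abbreviated):
#   await format_output('{"Data Preprocessing": {"Insights": "..."}}')
#   returns [{"Analysis": {"Data Preprocessing": {"Insights": "..."}}}]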

async def get_id_list(random_sample, id_list_file):
    # Record which experience IDs were drawn in this random sample.
    id_list = [sample["id"] for sample in random_sample]
    random_sample_id = {"Random_ID": id_list}
    print("List of IDs:", random_sample_id)
    await save_rsp_to_file(random_sample_id, id_list_file)

async def get_analysis_pool(json_file):
    # Flatten every analysis entry into a single pool, tagging each with its score and a unique ID.
    data_list = []
    count = 0
    json_data = await load_json_data(json_file)
    for i in range(len(json_data)):
        analysis_data = json_data[i]["Analysis"]["Analysis"]
        for j in range(len(analysis_data)):
            data = {
                "Analysis": analysis_data[j],
                "Score": json_data[i]["Analysis"]["metric"],
                "low_is_better": json_data[i]["Analysis"]["lower_is_better"],
                "id": count,
            }
            count += 1
            data_list.append(data)
    rsp_file = Path(os.path.dirname(json_file)) / "analysis_pool_sample.json"
    await save_rsp_to_file(data_list, rsp_file)
    return data_list
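
# For reference, response_data_format.json (produced by "code_to_analysis.py") is
# expected to look roughly like this (shape inferred from the accesses above;
# the field contents here are illustrative only):
# [
#   {
#     "Analysis": {
#       "Analysis": ["experience 1", "experience 2", ...],
#       "metric": 0.1241,
#       "lower_is_better": true
#     }
#   },
#   ...
# ]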

async def summarize_insights(data):
    # Ask the LLM to distill stage-by-stage insights from the sampled experiences.
    llm = LLM()
    structural_prompt = STRUCTURAL_PROMPT.format(analysis=data)
    context = llm.format_msg([Message(content=structural_prompt, role="assistant")])
    llm_response = await llm.aask(context, system_msgs=[REFLECTION_SYSTEM_MSG])
    logger.info(llm_response)
    rsp = await clean_json_from_rsp(llm_response)
    format_rsp = await format_output(rsp)
    return format_rsp

async def main():
    path = "/Users/aurora/Desktop"  # Replace with your path
    directory = Path(path) / "Summarize_test/Second_summarize_DI"
    os.makedirs(directory, exist_ok=True)
    json_dir = Path(path) / "Analysis/Second_analysis_DI/response_data_format.json"  # "final_path" in "code_to_analysis.py"
    data = await get_analysis_pool(json_dir)
    for i in range(5):
        summarize_res_file = Path(directory) / f"summarize_{i}.json"
        random_sample = await _random_sample(data, 20)
        summarize_insights_rsp = await summarize_insights(random_sample)
        await save_rsp_to_file(summarize_insights_rsp, summarize_res_file)
        print("The summarized results have been saved at:", summarize_res_file)

if __name__ == "__main__":
asyncio.run(main())

