Test #7

Merged · 5 commits · May 16, 2024
10 changes: 4 additions & 6 deletions README_cta.md
@@ -1,19 +1,17 @@
## What?
The goal is to combine the code saved in the plan (plan.json) generated by DI with its corresponding score, and then let the LLM analyze relevant experiences from the code and save them locally.
## How to use?
Make the following modifications within the main() function:

* Step 1: Replace `score` with the new score list
* Step 2: Replace `path` with your path
* Step 3: Replace `original_dir` with your folder path (the path where DI output is automatically saved)

```python
async def main():
    score = [0.1241, 0.1302, 0.1313, 0.1295, 0.1292, 0.1242, 0.1375, 0.1786, 0.1567, 0.1295]  # Replace with the new score list
    path = "your_path"  # Replace with your path
    original_dir = (
        Path(path) / "metaGPT/MetaGPT/data/output_1"
    )  # Replace with your folder path (the path where DI output is automatically saved)
```
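For reference, a minimal sketch of how the modified `main()` might then be invoked (assuming the standard `asyncio` entry-point pattern; the entry script itself is whichever file contains `main()`):

```python
import asyncio

if __name__ == "__main__":
    # Hypothetical entry point: runs the modified main() once the
    # score list, path, and original_dir above have been replaced.
    asyncio.run(main())
```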
100 changes: 100 additions & 0 deletions machine_learning_with_tools_insights.py
@@ -0,0 +1,100 @@
import asyncio
import json
import random

from pathlib import Path
from metagpt.roles.di.data_interpreter import DataInterpreter
from metagpt.logs import logger
from metagpt.utils.recovery_util import save_history


async def random_sample_tools(tools, num_samples):
return random.sample(tools, num_samples)

def load_baseline_code(file_path):
    # Read the baseline solution code into a single string.
    with open(file_path, "r") as file:
        return file.read()

def load_json_data(json_dir):
with open(json_dir, "r") as file:
json_data = json.load(file)
return json_data

def load_analysis(file_path):
    # Extract only the per-stage insights from a summarize_{i}.json file.
    new_data = []
    json_data = load_json_data(file_path)
    data = json_data[0]["Analysis"]
    _data = {
        "Data Preprocessing": {"Insights": data["Data Preprocessing"]["Insights"]},
        "Feature Engineering": {"Insights": data["Feature Engineering"]["Insights"]},
        "Model Training": {"Insights": data["Model Training"]["Insights"]},
    }
    new_data.append(_data)
    return new_data
# new_file_path = f"/Users/aurora/Desktop/summarize/Second_summarize_DI_2/summarize_new_{i}.json"
# with open(new_file_path, "w") as file:
# json.dump(new_data, file)
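# For reference, each summarize_{i}.json produced by "random_sample_analysis.py" is
# expected to look roughly like this (shape inferred from that script's
# format_output and prompt; the field contents here are illustrative only):
# [
#   {
#     "Analysis": {
#       "Data Preprocessing": {"Source": [...], "Reference IDs": [...], "Insights": "..."},
#       "Feature Engineering": {"Source": [...], "Reference IDs": [...], "Insights": "..."},
#       "Model Training": {"Source": [...], "Reference IDs": [...], "Insights": "..."}
#     }
#   }
# ]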

# async def main(requirement: str):
# preprocess_tools = ["FillMissingValue","MinMaxScale","StandardScale","MaxAbsScale","RobustScale","OrdinalEncode","OneHotEncode","LabelEncode"]
# feature_engineering_tools = ["PolynomialExpansion","CatCount","TargetMeanEncoder","KFoldTargetMeanEncoder","CatCross","GroupStat","SplitBins","ExtractTimeComps","GeneralSelection","TreeBasedSelection","VarianceBasedSelection"]
# tools = []
# tools.extend(await random_sample_tools(preprocess_tools, 2))
# tools.extend(await random_sample_tools(feature_engineering_tools, 2))
# print("The tools are:",tools)
# role = DataInterpreter(use_reflection=True, tools = tools)
# rsp = await role.run(requirement)
# logger.info(rsp)
# save_history(role=role)
# async def clear_history():

async def main(requirement: str):
role = DataInterpreter(use_reflection=True, tools=["<all>"])
rsp = await role.run(requirement)
logger.info(rsp)
save_history(role=role)

if __name__ == "__main__":
    # House Price
    data_path = "/Users/aurora/Desktop/ml_benchmark/05_house-prices-advanced-regression-techniques"
    train_path = f"{data_path}/split_train.csv"
    eval_path = f"{data_path}/split_eval.csv"
    user_requirement = (
        "This is a house price dataset; your goal is to predict the sale price of a property based on its features. "
        "Perform data analysis, data preprocessing, feature engineering, and modeling to predict the target. "
        "Report RMSE between the logarithm of the predicted value and the logarithm of the observed sale prices on the eval data. "
        "The target column is 'SalePrice'. Don't transform the skewed target column. No plots are needed."
    )
    code_path = "/Users/aurora/Desktop/metaGPT_new/MetaGPT/examples/di/first_best_code_house_price copy.py"  # baseline code path
    path = "/Users/aurora/Desktop"
    directory = Path(path) / "Summarize_new_test/Second_summarize_DI_0"
    i = 4
    file_path = Path(directory) / f"summarize_{i}.json"  # summarize_res_file in "random_sample_analysis.py"
    analysis = load_analysis(file_path)
    baseline_code = load_baseline_code(code_path)
    query = (
        user_requirement
        + "\nHere are some insights derived from high-performance code:"
        + str(analysis)
        + "\nPlease generate new complete code referencing the provided baseline code below:\n"
        + baseline_code
        + f"\nTrain data path: '{train_path}', eval data path: '{eval_path}'."
    )
    print(query)
    asyncio.run(main(query))

# Titanic
# if __name__ == "__main__":
#     # data_path = "your/path/to/titanic"
#     data_path = "/Users/aurora/Desktop/ml_benchmark/04_titanic"
#     train_path = f"{data_path}/split_train.csv"
#     eval_path = f"{data_path}/split_eval.csv"
#     requirement = f"This is a titanic passenger survival dataset; your goal is to predict passenger survival outcome. The target column is Survived. Perform data analysis, data preprocessing, feature engineering, and modeling to predict the target. Please recommend at least five models, evaluate their performance separately, and finally report the optimal result. Report accuracy on the eval data. Train data path: '{train_path}', eval data path: '{eval_path}'."

# ICR
# if __name__ == "__main__":
#     # data_path = "your/path/to/icr"
#     data_path = "/Users/aurora/Desktop/ml_benchmark/07_icr-identify-age-related-conditions"
#     train_path = f"{data_path}/split_train.csv"
#     eval_path = f"{data_path}/split_eval.csv"
#     requirement = f"The ICR dataset is a medical dataset with over fifty anonymized health characteristics linked to three age-related conditions. Your goal is to predict whether a subject has or has not been diagnosed with one of these conditions. The target column is Class. Perform data analysis, data preprocessing, feature engineering, and modeling to predict the target. Report F1 Score on the eval data. Train data path: '{train_path}', eval data path: '{eval_path}'."

159 changes: 159 additions & 0 deletions random_sample_analysis.py
@@ -0,0 +1,159 @@
import random
import asyncio
import json
import re
import os

from pathlib import Path
from metagpt.llm import LLM
from metagpt.logs import logger
from metagpt.schema import Message

STRUCTUAL_PROMPT = """
[Original Analysis]
Below are 20 randomly sampled experiences from a previous dataset, each labeled with an experience, a corresponding score, and a unique ID:
{analysis}

**Task**:
For each stage (Data Preprocessing, Feature Engineering, Model Training):
- Reasoning: Analyze the provided experiences based on the variation in their scores to identify which experiences are most likely to improve model's performance.
- Reference: Connect each key point with the corresponding experience ID.
- Insights: Based on the given reasons and referenced experience, provide some as specific and actionable as possible insights you believe can enhance the model's performance. Your insights must be listed in bullet points, with a minimum of 3 points(e.g.,1.).

**Instructions for Output**:
Organize the output into three sections corresponding to each stage of data handling:
- Data Preprocessing
- Feature Engineering
- Model Training

**Expected Output Format**:
```json
{{
    "Data Preprocessing": {{
        "Source": ["List all experience IDs related to data preprocessing."],
        "Reference IDs": ["List of IDs that you mainly reference or choose from this stage's source."],
        "Reasoning (Yes)": "Reasons for selecting these experiences.",
        "Not Reference IDs": ["List of IDs that you did not reference or choose from this stage's source."],
        "Reasoning (No)": "Reasons for not selecting these experiences.",
        "Insights": "Based on the reasons and experiences, propose insights that are as specific and actionable as possible for improving the model's performance."
    }},
    "Feature Engineering": {{
        "Source": ["List all experience IDs related to feature engineering."],
        "Reference IDs": ["List of IDs that you mainly reference or choose from this stage's source."],
        "Reasoning (Yes)": "Reasons for selecting these experiences.",
        "Not Reference IDs": ["List of IDs that you did not reference or choose from this stage's source."],
        "Reasoning (No)": "Reasons for not selecting these experiences.",
        "Insights": "Based on the reasons and experiences, propose insights that are as specific and actionable as possible for improving the model's performance. If you need to create new features, please provide specific feature names."
    }},
    "Model Training": {{
        "Source": ["List all experience IDs related to model training."],
        "Reference IDs": ["List of IDs that you mainly reference or choose from this stage's source."],
        "Reasoning (Yes)": "Reasons for selecting these experiences.",
        "Not Reference IDs": ["List of IDs that you did not reference or choose from this stage's source."],
        "Reasoning (No)": "Reasons for not selecting these experiences.",
        "Insights": "Based on the reasons and experiences, propose insights that are as specific and actionable as possible for improving the model's performance."
    }}
}}
```
"""

REFLECTION_SYSTEM_MSG = "As a Kaggle grandmaster participating in a competition, you need to analyze your experiences and propose evolutionary points that are more likely to improve the performance of your model."

async def _random_sample(analysis, num_samples):
return random.sample(analysis, num_samples)

async def load_json_data(json_dir):
with open(json_dir, "r") as file:
json_data = json.load(file)
return json_data

async def save_rsp_to_file(rsp, filename):
with open(filename, "w") as file:
json.dump(rsp, file)


async def clean_json_from_rsp(text):
    # Extract the JSON payload from a fenced ```json ... ``` block in the LLM response.
    pattern = r"```json(.*?)```"
    matches = re.findall(pattern, text, re.DOTALL)
    if matches:
        json_str = "\n".join(matches)  # avoid shadowing the imported json module
        return json_str
    else:
        return ""

async def format_output(rsp):
rsp_list = []
new_data = []
rsp_list.append(rsp)
for item in rsp_list:
item_dict = json.loads(item)
data = {
"Analysis": item_dict,
}
new_data.append(data)
return new_data
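
# Illustrative example (hypothetical input, abbreviated):
#   await format_output('{"Data Preprocessing": {"Insights": "..."}}')
#   returns [{"Analysis": {"Data Preprocessing": {"Insights": "..."}}}]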

async def get_id_list(random_sample, id_list_file):
    # Record which experience IDs were drawn in this random sample.
    id_list = [sample["id"] for sample in random_sample]
    random_sample_id = {"Random_ID": id_list}
    print("List of IDs:", random_sample_id)
    await save_rsp_to_file(random_sample_id, id_list_file)

async def get_analysis_pool(json_file):
    # Flatten every analysis entry into a single pool, tagging each with its score and a unique ID.
    data_list = []
    count = 0
    json_data = await load_json_data(json_file)
    for i in range(len(json_data)):
        analysis_data = json_data[i]["Analysis"]["Analysis"]
        for j in range(len(analysis_data)):
            data = {
                "Analysis": analysis_data[j],
                "Score": json_data[i]["Analysis"]["metric"],
                "low_is_better": json_data[i]["Analysis"]["lower_is_better"],
                "id": count,
            }
            count += 1
            data_list.append(data)
    rsp_file = Path(os.path.dirname(json_file)) / "analysis_pool_sample.json"
    await save_rsp_to_file(data_list, rsp_file)
    return data_list
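
# For reference, response_data_format.json (produced by "code_to_analysis.py") is
# expected to look roughly like this (shape inferred from the accesses above;
# the field contents here are illustrative only):
# [
#   {
#     "Analysis": {
#       "Analysis": ["experience 1", "experience 2", ...],
#       "metric": 0.1241,
#       "lower_is_better": true
#     }
#   },
#   ...
# ]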

async def summarize_insights(data):
    # Ask the LLM to distill stage-by-stage insights from the sampled experiences.
    llm = LLM()
    structural_prompt = STRUCTURAL_PROMPT.format(analysis=data)
    context = llm.format_msg([Message(content=structural_prompt, role="assistant")])
    llm_response = await llm.aask(context, system_msgs=[REFLECTION_SYSTEM_MSG])
    logger.info(llm_response)
    rsp = await clean_json_from_rsp(llm_response)
    format_rsp = await format_output(rsp)
    return format_rsp

async def main():
    path = "/Users/aurora/Desktop"  # Replace with your path
    directory = Path(path) / "Summarize_test/Second_summarize_DI"
    os.makedirs(directory, exist_ok=True)
    json_dir = Path(path) / "Analysis/Second_analysis_DI/response_data_format.json"  # "final_path" in "code_to_analysis.py"
    data = await get_analysis_pool(json_dir)
    for i in range(5):
        summarize_res_file = Path(directory) / f"summarize_{i}.json"
        random_sample = await _random_sample(data, 20)
        summarize_insights_rsp = await summarize_insights(random_sample)
        await save_rsp_to_file(summarize_insights_rsp, summarize_res_file)
        print("The summarized results have been saved at:", summarize_res_file)

if __name__ == "__main__":
asyncio.run(main())

