diff --git a/rdagent/app/data_science/conf.py b/rdagent/app/data_science/conf.py index 156956dbe..d31a93fac 100644 --- a/rdagent/app/data_science/conf.py +++ b/rdagent/app/data_science/conf.py @@ -7,7 +7,7 @@ class DataScienceBasePropSetting(KaggleBasePropSetting): # Main components ## Scen - scen: str = "rdagent.scenarios.data_science.scen.DataScienceScen" + scen: str = "rdagent.scenarios.data_science.scen.KaggleScen" """Scenario class for data mining model""" ## proposal diff --git a/rdagent/app/data_science/loop.py b/rdagent/app/data_science/loop.py index dbfff1425..17f468634 100644 --- a/rdagent/app/data_science/loop.py +++ b/rdagent/app/data_science/loop.py @@ -1,5 +1,6 @@ from typing import Any +from pathlib import Path import fire from rdagent.app.data_science.conf import DS_RD_SETTING @@ -111,7 +112,12 @@ def main(path=None, step_n=None, competition=None): DS_RD_SETTING.competition = competition if DS_RD_SETTING.competition: - download_data(competition=DS_RD_SETTING.competition, settings=DS_RD_SETTING) + if DS_RD_SETTING.scen.endswith("KaggleScen"): + download_data(competition=DS_RD_SETTING.competition, settings=DS_RD_SETTING) + else: + if not Path(f"{DS_RD_SETTING.local_data_path}/{competition}").exists(): + logger.error(f"Please prepare data for competition {competition} first.") + return else: logger.error("Please specify competition name.") if path is None: diff --git a/rdagent/components/coder/data_science/ensemble/test.py b/rdagent/components/coder/data_science/ensemble/test.py index 04f75941d..1a7a3d08b 100644 --- a/rdagent/components/coder/data_science/ensemble/test.py +++ b/rdagent/components/coder/data_science/ensemble/test.py @@ -8,7 +8,7 @@ from rdagent.components.coder.data_science.ensemble import EnsembleCoSTEER from rdagent.components.coder.data_science.ensemble.exp import EnsembleTask from rdagent.scenarios.data_science.experiment.experiment import DSExperiment -from rdagent.scenarios.data_science.scen import DataScienceScen +from 
rdagent.scenarios.data_science.scen import KaggleScen # Add the competition folder to path COMPETITION_PATH = ( @@ -31,7 +31,7 @@ def load_ensemble_spec(): def develop_one_competition(competition: str): # Initialize scenario and coder - scen = DataScienceScen(competition=competition) + scen = KaggleScen(competition=competition) ensemble_coder = EnsembleCoSTEER(scen) # Load ensemble specification ensemble_spec = load_ensemble_spec() diff --git a/rdagent/components/coder/data_science/feature/test.py b/rdagent/components/coder/data_science/feature/test.py index 96cd5e590..74732e756 100644 --- a/rdagent/components/coder/data_science/feature/test.py +++ b/rdagent/components/coder/data_science/feature/test.py @@ -9,11 +9,11 @@ from rdagent.components.coder.data_science.feature import FeatureCoSTEER from rdagent.components.coder.data_science.feature.exp import FeatureTask from rdagent.scenarios.data_science.experiment.experiment import DSExperiment -from rdagent.scenarios.data_science.scen import DataScienceScen +from rdagent.scenarios.data_science.scen import KaggleScen def develop_one_competition(competition: str): # -> experiment - scen = DataScienceScen(competition=competition) + scen = KaggleScen(competition=competition) feature_coder = FeatureCoSTEER(scen) with open("./rdagent/scenarios/kaggle/tpl_ex/aerial-cactus-identification/spec/feature.md", "r") as file: diff --git a/rdagent/components/coder/data_science/model/test.py b/rdagent/components/coder/data_science/model/test.py index 201535187..ad7995789 100644 --- a/rdagent/components/coder/data_science/model/test.py +++ b/rdagent/components/coder/data_science/model/test.py @@ -12,12 +12,12 @@ from rdagent.components.coder.data_science.model.exp import ModelTask from rdagent.core.experiment import FBWorkspace from rdagent.scenarios.data_science.experiment.experiment import DSExperiment -from rdagent.scenarios.data_science.scen import DataScienceScen +from rdagent.scenarios.data_science.scen import KaggleScen # Take 
tasks, spec.md and feat as input, generate a feedback as output def develop_one_competition(competition: str): - scen = DataScienceScen(competition=competition) + scen = KaggleScen(competition=competition) model_coder = ModelCoSTEER(scen) # Create the task diff --git a/rdagent/components/coder/data_science/raw_data_loader/test.py b/rdagent/components/coder/data_science/raw_data_loader/test.py index 5aacc8b8c..2cd68a790 100644 --- a/rdagent/components/coder/data_science/raw_data_loader/test.py +++ b/rdagent/components/coder/data_science/raw_data_loader/test.py @@ -9,11 +9,11 @@ from rdagent.components.coder.data_science.raw_data_loader import DataLoaderCoSTEER from rdagent.components.coder.data_science.raw_data_loader.exp import DataLoaderTask from rdagent.scenarios.data_science.experiment.experiment import DSExperiment -from rdagent.scenarios.data_science.scen import DataScienceScen +from rdagent.scenarios.data_science.scen import KaggleScen def develop_one_competition(competition: str): # -> experiment - scen = DataScienceScen(competition=competition) + scen = KaggleScen(competition=competition) data_loader_coder = DataLoaderCoSTEER(scen) # Create the experiment diff --git a/rdagent/components/coder/data_science/workflow/test.py b/rdagent/components/coder/data_science/workflow/test.py index 5e9e7a1b4..f2c2dcade 100644 --- a/rdagent/components/coder/data_science/workflow/test.py +++ b/rdagent/components/coder/data_science/workflow/test.py @@ -12,11 +12,11 @@ from rdagent.components.coder.data_science.workflow.exp import WorkflowTask from rdagent.core.experiment import FBWorkspace from rdagent.scenarios.data_science.experiment.experiment import DSExperiment -from rdagent.scenarios.data_science.scen import DataScienceScen +from rdagent.scenarios.data_science.scen import KaggleScen def develop_one_competition(competition: str): - scen = DataScienceScen(competition=competition) + scen = KaggleScen(competition=competition) workflow_coder = WorkflowCoSTEER(scen) wt = 
class KaggleScen(DataScienceScen):
    """Data science scenario driven by a Kaggle competition.

    Overrides ``_get_description`` and ``_get_direction`` so that the
    competition description comes from ``crawl_descriptions`` and the metric
    direction is derived from ``leaderboard_scores``.

    Note: this class does not share an interface with the earlier Kaggle
    scenario. That scenario carried too much scenario-unrelated code to be
    reused easily, so this one deliberately starts from a simpler base.
    """

    def _get_description(self):
        """Return the description produced by ``crawl_descriptions`` for this competition."""
        data_root = DS_RD_SETTING.local_data_path
        return crawl_descriptions(self.competition, data_root)

    def _get_direction(self):
        """Return ``"maximize"`` or ``"minimize"`` based on the leaderboard.

        The first and last entries returned by ``leaderboard_scores`` are
        compared as floats: a strictly larger first entry means "maximize",
        anything else means "minimize".
        """
        # NOTE(review): presumably the leaderboard is ordered best-first -- confirm
        # against ``leaderboard_scores``.
        scores = leaderboard_scores(self.competition)
        first_score = float(scores[0])
        last_score = float(scores[-1])
        if first_score > last_score:
            return "maximize"
        return "minimize"

    @property
    def rich_style_description(self) -> str:
        """Render the ``rich_style_description`` prompt with a link to the competition page."""
        template = T(".prompts:rich_style_description")
        competition_link = f"[{self.competition}](https://www.kaggle.com/competitions/{self.competition})"
        return template.r(name="Kaggle", competition=competition_link)
+ - Automated implementation and testing of models/features. + + #### [Objective](#_summary) + + To automatically optimize performance metrics within the validation set, ultimately discovering the most efficient features and models through autonomous research and development. \ No newline at end of file diff --git a/rdagent/scenarios/data_science/scen/scen.py b/rdagent/scenarios/data_science/scen/scen.py index 3fbdc8fda..a702c36ac 100644 --- a/rdagent/scenarios/data_science/scen/scen.py +++ b/rdagent/scenarios/data_science/scen/scen.py @@ -1,32 +1,33 @@ import json +from pathlib import Path from rdagent.app.data_science.conf import DS_RD_SETTING from rdagent.core.scenario import Scenario from rdagent.oai.llm_utils import APIBackend -from rdagent.scenarios.kaggle.kaggle_crawler import ( - crawl_descriptions, - leaderboard_scores, -) from rdagent.utils.agent.tpl import T +from rdagent.log import rdagent_logger as logger class DataScienceScen(Scenario): """Data Science Scenario - It is based on kaggle now. - - But it is not use the same interface with previous kaggle version. - - Ideally, we should reuse previous kaggle scenario. - But we found that too much scenario unrelated code in kaggle scenario and hard to reuse. - So we start from a simple one.... 
""" def __init__(self, competition: str) -> None: self.competition = competition - self.raw_description = crawl_descriptions(competition, DS_RD_SETTING.local_data_path) + self.raw_description = self._get_description() + self.metric_direction = self._get_direction() + self._analysis_competition_description() - leaderboard = leaderboard_scores(competition) - self.metric_direction = "maximize" if float(leaderboard[0]) > float(leaderboard[-1]) else "minimize" + def _get_description(self): + if (fp := Path(f"{DS_RD_SETTING.local_data_path}/{self.competition}.json")).exists(): + logger.info(f"Found {self.competition}.json, loading from local file.") + with fp.open("r") as f: + return json.load(f) + else: + logger.error(f"Cannot find {self.competition}.json in {DS_RD_SETTING.local_data_path}, please check the file.") - self._analysis_competition_description() + def _get_direction(self): + return self.raw_description.get("metric_direction", "minimize") def _analysis_competition_description(self): sys_prompt = T(".prompts:competition_description_template.system").r() @@ -75,31 +76,10 @@ def background(self) -> str: @property def rich_style_description(self) -> str: - return f""" -### Kaggle Agent: Automated Feature Engineering & Model Tuning Evolution - -#### [Overview](#_summary) - -In this scenario, our automated system proposes hypothesis, choose action, implements code, conducts validation, and utilizes feedback in a continuous, iterative process. - -#### Kaggle Competition info - -Current Competition: [{self.competition}](https://www.kaggle.com/competitions/{self.competition}) - -#### [Automated R&D](#_rdloops) - -- **[R (Research)](#_research)** -- Iteration of ideas and hypotheses. -- Continuous learning and knowledge construction. - -- **[D (Development)](#_development)** -- Evolving code generation, model refinement, and features generation. -- Automated implementation and testing of models/features. 
- -#### [Objective](#_summary) - -To automatically optimize performance metrics within the validation set or Kaggle Leaderboard, ultimately discovering the most efficient features and models through autonomous research and development. -""" + return T(".prompts:rich_style_description").r( + name="Data Science", + competition=self.competition, + ) def get_scenario_all_desc(self) -> str: return T(".prompts:scenario_description").r(