Skip to content

Commit

Permalink
support local folder (#511)
Browse files Browse the repository at this point in the history
* support local folder

* remove unnecessary random

* KaggleScen Subclass

* small fix

* use template for style description

* update default scen to kaggle
  • Loading branch information
qew21 authored Dec 25, 2024
1 parent 01ad2e9 commit db1455b
Show file tree
Hide file tree
Showing 12 changed files with 100 additions and 53 deletions.
2 changes: 1 addition & 1 deletion rdagent/app/data_science/conf.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,7 @@ class DataScienceBasePropSetting(KaggleBasePropSetting):

# Main components
## Scen
scen: str = "rdagent.scenarios.data_science.scen.DataScienceScen"
scen: str = "rdagent.scenarios.data_science.scen.KaggleScen"
"""Scenario class for data mining model"""

## proposal
Expand Down
8 changes: 7 additions & 1 deletion rdagent/app/data_science/loop.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
from typing import Any

from pathlib import Path
import fire

from rdagent.app.data_science.conf import DS_RD_SETTING
Expand Down Expand Up @@ -111,7 +112,12 @@ def main(path=None, step_n=None, competition=None):
DS_RD_SETTING.competition = competition

if DS_RD_SETTING.competition:
download_data(competition=DS_RD_SETTING.competition, settings=DS_RD_SETTING)
if DS_RD_SETTING.scen.endswith("KaggleScen"):
download_data(competition=DS_RD_SETTING.competition, settings=DS_RD_SETTING)
else:
if not Path(f"{DS_RD_SETTING.local_data_path}/{competition}").exists():
logger.error(f"Please prepare data for competition {competition} first.")
return
else:
logger.error("Please specify competition name.")
if path is None:
Expand Down
4 changes: 2 additions & 2 deletions rdagent/components/coder/data_science/ensemble/test.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,7 @@
from rdagent.components.coder.data_science.ensemble import EnsembleCoSTEER
from rdagent.components.coder.data_science.ensemble.exp import EnsembleTask
from rdagent.scenarios.data_science.experiment.experiment import DSExperiment
from rdagent.scenarios.data_science.scen import DataScienceScen
from rdagent.scenarios.data_science.scen import KaggleScen

# Add the competition folder to path
COMPETITION_PATH = (
Expand All @@ -31,7 +31,7 @@ def load_ensemble_spec():

def develop_one_competition(competition: str):
# Initialize scenario and coder
scen = DataScienceScen(competition=competition)
scen = KaggleScen(competition=competition)
ensemble_coder = EnsembleCoSTEER(scen)
# Load ensemble specification
ensemble_spec = load_ensemble_spec()
Expand Down
4 changes: 2 additions & 2 deletions rdagent/components/coder/data_science/feature/test.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,11 +9,11 @@
from rdagent.components.coder.data_science.feature import FeatureCoSTEER
from rdagent.components.coder.data_science.feature.exp import FeatureTask
from rdagent.scenarios.data_science.experiment.experiment import DSExperiment
from rdagent.scenarios.data_science.scen import DataScienceScen
from rdagent.scenarios.data_science.scen import KaggleScen


def develop_one_competition(competition: str): # -> experiment
scen = DataScienceScen(competition=competition)
scen = KaggleScen(competition=competition)
feature_coder = FeatureCoSTEER(scen)

with open("./rdagent/scenarios/kaggle/tpl_ex/aerial-cactus-identification/spec/feature.md", "r") as file:
Expand Down
4 changes: 2 additions & 2 deletions rdagent/components/coder/data_science/model/test.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,12 +12,12 @@
from rdagent.components.coder.data_science.model.exp import ModelTask
from rdagent.core.experiment import FBWorkspace
from rdagent.scenarios.data_science.experiment.experiment import DSExperiment
from rdagent.scenarios.data_science.scen import DataScienceScen
from rdagent.scenarios.data_science.scen import KaggleScen


# Take tasks, spec.md and feat as input, generate a feedback as output
def develop_one_competition(competition: str):
scen = DataScienceScen(competition=competition)
scen = KaggleScen(competition=competition)
model_coder = ModelCoSTEER(scen)

# Create the task
Expand Down
4 changes: 2 additions & 2 deletions rdagent/components/coder/data_science/raw_data_loader/test.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,11 +9,11 @@
from rdagent.components.coder.data_science.raw_data_loader import DataLoaderCoSTEER
from rdagent.components.coder.data_science.raw_data_loader.exp import DataLoaderTask
from rdagent.scenarios.data_science.experiment.experiment import DSExperiment
from rdagent.scenarios.data_science.scen import DataScienceScen
from rdagent.scenarios.data_science.scen import KaggleScen


def develop_one_competition(competition: str): # -> experiment
scen = DataScienceScen(competition=competition)
scen = KaggleScen(competition=competition)
data_loader_coder = DataLoaderCoSTEER(scen)

# Create the experiment
Expand Down
4 changes: 2 additions & 2 deletions rdagent/components/coder/data_science/workflow/test.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,11 +12,11 @@
from rdagent.components.coder.data_science.workflow.exp import WorkflowTask
from rdagent.core.experiment import FBWorkspace
from rdagent.scenarios.data_science.experiment.experiment import DSExperiment
from rdagent.scenarios.data_science.scen import DataScienceScen
from rdagent.scenarios.data_science.scen import KaggleScen


def develop_one_competition(competition: str):
scen = DataScienceScen(competition=competition)
scen = KaggleScen(competition=competition)
workflow_coder = WorkflowCoSTEER(scen)

wt = WorkflowTask(
Expand Down
2 changes: 1 addition & 1 deletion rdagent/core/experiment.py
Original file line number Diff line number Diff line change
Expand Up @@ -223,7 +223,7 @@ def __init__(
based_experiments: Sequence[ASpecificWSForExperiment] = [],
hypothesis: Optional["Hypothesis"] = None,
) -> None:
self.hypothesis: Optional["Hypothesis"] = hypothesis # Experiment is opptionally generated by hypothesis
self.hypothesis: Optional["Hypothesis"] = hypothesis # Experiment is optionally generated by hypothesis
self.sub_tasks: Sequence[ASpecificTask] = sub_tasks
self.sub_workspace_list: list[ASpecificWSForSubTasks | None] = [None] * len(self.sub_tasks)
# TODO:
Expand Down
3 changes: 2 additions & 1 deletion rdagent/scenarios/data_science/scen/__init__.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,4 @@
from .scen import DataScienceScen
from .kaggle import KaggleScen

__all__ = ["DataScienceScen"]
__all__ = ["DataScienceScen", "KaggleScen"]
35 changes: 35 additions & 0 deletions rdagent/scenarios/data_science/scen/kaggle.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,35 @@
import json

from rdagent.app.data_science.conf import DS_RD_SETTING
from rdagent.core.scenario import Scenario
from rdagent.oai.llm_utils import APIBackend
from rdagent.scenarios.data_science.scen import DataScienceScen
from rdagent.scenarios.kaggle.kaggle_crawler import (
crawl_descriptions,
leaderboard_scores,
)
from rdagent.utils.agent.tpl import T


class KaggleScen(DataScienceScen):
    """Data-science scenario whose competition metadata comes from Kaggle.

    Specialises ``DataScienceScen`` by overriding two hooks:

    - ``_get_description`` obtains the raw description through
      ``crawl_descriptions``.
    - ``_get_direction`` derives the metric direction from the scores
      returned by ``leaderboard_scores``.

    It also renders the rich-style description with Kaggle-specific values.
    """

    def _get_description(self):
        """Return the competition description produced by ``crawl_descriptions``."""
        competition = self.competition
        return crawl_descriptions(competition, DS_RD_SETTING.local_data_path)

    def _get_direction(self):
        """Return ``"maximize"`` or ``"minimize"`` based on the leaderboard.

        The first leaderboard score is compared with the last one; a strictly
        larger first score means the metric is maximised, anything else
        (including a tie) yields ``"minimize"``.
        NOTE(review): presumably the leaderboard is ordered best-first --
        confirm against ``leaderboard_scores``.
        """
        scores = leaderboard_scores(self.competition)
        first_score = float(scores[0])
        last_score = float(scores[-1])
        if first_score > last_score:
            return "maximize"
        return "minimize"

    @property
    def rich_style_description(self) -> str:
        """Render the ``rich_style_description`` prompt template for Kaggle.

        The competition name is passed as a markdown link to its Kaggle page.
        """
        template = T(".prompts:rich_style_description")
        competition_link = f"[{self.competition}](https://www.kaggle.com/competitions/{self.competition})"
        return template.r(name="Kaggle", competition=competition_link)

27 changes: 26 additions & 1 deletion rdagent/scenarios/data_science/scen/prompts.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -34,4 +34,29 @@ competition_background: |-
The data type used in this competition is {{ data_type }}.
Briefly, the competition involves: {{ brief_description }}.
The dataset used in this competition is: {{ data_description }}.
Your goal in this competition is to: {{target_description }}.
Your goal in this competition is to: {{target_description }}.
rich_style_description: |-
### {{ name }} Agent: Automated Feature Engineering & Model Tuning Evolution
#### [Overview](#_summary)
In this scenario, our automated system proposes hypotheses, chooses actions, implements code, conducts validation, and utilizes feedback in a continuous, iterative process.
#### {{ name }} Competition info
Current Competition: {{ competition }}
#### [Automated R&D](#_rdloops)
- **[R (Research)](#_research)**
- Iteration of ideas and hypotheses.
- Continuous learning and knowledge construction.
- **[D (Development)](#_development)**
- Evolving code generation, model refinement, and features generation.
- Automated implementation and testing of models/features.
#### [Objective](#_summary)
To automatically optimize performance metrics within the validation set, ultimately discovering the most efficient features and models through autonomous research and development.
56 changes: 18 additions & 38 deletions rdagent/scenarios/data_science/scen/scen.py
Original file line number Diff line number Diff line change
@@ -1,32 +1,33 @@
import json
from pathlib import Path

from rdagent.app.data_science.conf import DS_RD_SETTING
from rdagent.core.scenario import Scenario
from rdagent.oai.llm_utils import APIBackend
from rdagent.scenarios.kaggle.kaggle_crawler import (
crawl_descriptions,
leaderboard_scores,
)
from rdagent.utils.agent.tpl import T
from rdagent.log import rdagent_logger as logger


class DataScienceScen(Scenario):
"""Data Science Scenario
It is based on kaggle now.
- But it is not use the same interface with previous kaggle version.
- Ideally, we should reuse previous kaggle scenario.
But we found that too much scenario unrelated code in kaggle scenario and hard to reuse.
So we start from a simple one....
"""

def __init__(self, competition: str) -> None:
self.competition = competition
self.raw_description = crawl_descriptions(competition, DS_RD_SETTING.local_data_path)
self.raw_description = self._get_description()
self.metric_direction = self._get_direction()
self._analysis_competition_description()

leaderboard = leaderboard_scores(competition)
self.metric_direction = "maximize" if float(leaderboard[0]) > float(leaderboard[-1]) else "minimize"
def _get_description(self):
    """Load the competition description from a local JSON file.

    The file is expected at ``<local_data_path>/<competition>.json``, where
    ``local_data_path`` comes from ``DS_RD_SETTING``.

    Returns:
        The object parsed from the JSON file.

    Raises:
        FileNotFoundError: If the description file does not exist. The
            problem is logged first; raising here avoids handing ``None``
            back to the caller, which would otherwise fail later with an
            unrelated ``AttributeError`` when the description is read.
    """
    if (fp := Path(f"{DS_RD_SETTING.local_data_path}/{self.competition}.json")).exists():
        logger.info(f"Found {self.competition}.json, loading from local file.")
        # JSON text is UTF-8 by specification; do not depend on the platform default encoding.
        with fp.open("r", encoding="utf-8") as f:
            return json.load(f)

    logger.error(f"Cannot find {self.competition}.json in {DS_RD_SETTING.local_data_path}, please check the file.")
    raise FileNotFoundError(
        f"Cannot find {self.competition}.json in {DS_RD_SETTING.local_data_path}, please check the file."
    )

self._analysis_competition_description()
def _get_direction(self):
return self.raw_description.get("metric_direction", "minimize")

def _analysis_competition_description(self):
sys_prompt = T(".prompts:competition_description_template.system").r()
Expand Down Expand Up @@ -75,31 +76,10 @@ def background(self) -> str:

@property
def rich_style_description(self) -> str:
return f"""
### Kaggle Agent: Automated Feature Engineering & Model Tuning Evolution
#### [Overview](#_summary)
In this scenario, our automated system proposes hypothesis, choose action, implements code, conducts validation, and utilizes feedback in a continuous, iterative process.
#### Kaggle Competition info
Current Competition: [{self.competition}](https://www.kaggle.com/competitions/{self.competition})
#### [Automated R&D](#_rdloops)
- **[R (Research)](#_research)**
- Iteration of ideas and hypotheses.
- Continuous learning and knowledge construction.
- **[D (Development)](#_development)**
- Evolving code generation, model refinement, and features generation.
- Automated implementation and testing of models/features.
#### [Objective](#_summary)
To automatically optimize performance metrics within the validation set or Kaggle Leaderboard, ultimately discovering the most efficient features and models through autonomous research and development.
"""
return T(".prompts:rich_style_description").r(
name="Data Science",
competition=self.competition,
)

def get_scenario_all_desc(self) -> str:
return T(".prompts:scenario_description").r(
Expand Down

0 comments on commit db1455b

Please sign in to comment.