Skip to content

Commit

Permalink
KaggleScen Subclass
Browse files Browse the repository at this point in the history
  • Loading branch information
qew21 committed Dec 25, 2024
1 parent b2ce0c3 commit 7f6d653
Show file tree
Hide file tree
Showing 10 changed files with 92 additions and 35 deletions.
8 changes: 7 additions & 1 deletion rdagent/app/data_science/loop.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
import subprocess
from typing import Any, Literal

from pathlib import Path
import fire

from rdagent.app.data_science.conf import DS_RD_SETTING
Expand Down Expand Up @@ -125,7 +126,12 @@ def main(path=None, step_n=None, competition=None):
DS_RD_SETTING.competition = competition

if DS_RD_SETTING.competition:
download_data(competition=DS_RD_SETTING.competition, settings=DS_RD_SETTING)
if DS_RD_SETTING.scen.endswith("KaggleScen"):
download_data(competition=DS_RD_SETTING.competition, settings=DS_RD_SETTING)
else:
if not Path(f"{DS_RD_SETTING.local_data_path}/{competition}").exists():
logger.error(f"Please prepare data for competition {competition} first.")
return
else:
logger.error("Please specify competition name.")
if path is None:
Expand Down
4 changes: 2 additions & 2 deletions rdagent/components/coder/data_science/ensemble/test.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,7 @@
from rdagent.components.coder.data_science.ensemble import EnsembleCoSTEER
from rdagent.components.coder.data_science.ensemble.exp import EnsembleTask
from rdagent.scenarios.data_science.experiment.experiment import DSExperiment
from rdagent.scenarios.data_science.scen import DataScienceScen
from rdagent.scenarios.data_science.scen import KaggleScen

# Add the competition folder to path
COMPETITION_PATH = (
Expand All @@ -31,7 +31,7 @@ def load_ensemble_spec():

def develop_one_competition(competition: str):
# Initialize scenario and coder
scen = DataScienceScen(competition=competition)
scen = KaggleScen(competition=competition)
ensemble_coder = EnsembleCoSTEER(scen)
# Load ensemble specification
ensemble_spec = load_ensemble_spec()
Expand Down
4 changes: 2 additions & 2 deletions rdagent/components/coder/data_science/feature/test.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,11 +9,11 @@
from rdagent.components.coder.data_science.feature import FeatureCoSTEER
from rdagent.components.coder.data_science.feature.exp import FeatureTask
from rdagent.scenarios.data_science.experiment.experiment import DSExperiment
from rdagent.scenarios.data_science.scen import DataScienceScen
from rdagent.scenarios.data_science.scen import KaggleScen


def develop_one_competition(competition: str): # -> experiment
scen = DataScienceScen(competition=competition)
scen = KaggleScen(competition=competition)
feature_coder = FeatureCoSTEER(scen)

with open("./rdagent/scenarios/kaggle/tpl_ex/aerial-cactus-identification/spec/feature.md", "r") as file:
Expand Down
4 changes: 2 additions & 2 deletions rdagent/components/coder/data_science/model/test.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,12 +12,12 @@
from rdagent.components.coder.data_science.model.exp import ModelTask
from rdagent.core.experiment import FBWorkspace
from rdagent.scenarios.data_science.experiment.experiment import DSExperiment
from rdagent.scenarios.data_science.scen import DataScienceScen
from rdagent.scenarios.data_science.scen import KaggleScen


# Take tasks, spec.md and feat as input, generate a feedback as output
def develop_one_competition(competition: str):
scen = DataScienceScen(competition=competition)
scen = KaggleScen(competition=competition)
model_coder = ModelCoSTEER(scen)

# Create the task
Expand Down
4 changes: 2 additions & 2 deletions rdagent/components/coder/data_science/raw_data_loader/test.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,11 +9,11 @@
from rdagent.components.coder.data_science.raw_data_loader import DataLoaderCoSTEER
from rdagent.components.coder.data_science.raw_data_loader.exp import DataLoaderTask
from rdagent.scenarios.data_science.experiment.experiment import DSExperiment
from rdagent.scenarios.data_science.scen import DataScienceScen
from rdagent.scenarios.data_science.scen import KaggleScen


def develop_one_competition(competition: str): # -> experiment
scen = DataScienceScen(competition=competition)
scen = KaggleScen(competition=competition)
data_loader_coder = DataLoaderCoSTEER(scen)

# Create the experiment
Expand Down
4 changes: 2 additions & 2 deletions rdagent/components/coder/data_science/workflow/test.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,11 +12,11 @@
from rdagent.components.coder.data_science.workflow.exp import WorkflowTask
from rdagent.core.experiment import FBWorkspace
from rdagent.scenarios.data_science.experiment.experiment import DSExperiment
from rdagent.scenarios.data_science.scen import DataScienceScen
from rdagent.scenarios.data_science.scen import KaggleScen


def develop_one_competition(competition: str):
scen = DataScienceScen(competition=competition)
scen = KaggleScen(competition=competition)
workflow_coder = WorkflowCoSTEER(scen)

wt = WorkflowTask(
Expand Down
3 changes: 2 additions & 1 deletion rdagent/scenarios/data_science/scen/__init__.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,4 @@
from .scen import DataScienceScen
from .kaggle import KaggleScen

__all__ = ["DataScienceScen"]
__all__ = ["DataScienceScen", "KaggleScen"]
55 changes: 55 additions & 0 deletions rdagent/scenarios/data_science/scen/kaggle.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,55 @@
import json

from rdagent.app.data_science.conf import DS_RD_SETTING
from rdagent.core.scenario import Scenario
from rdagent.oai.llm_utils import APIBackend
from rdagent.scenarios.data_science.scen import DataScienceScen
from rdagent.scenarios.kaggle.kaggle_crawler import (
crawl_descriptions,
leaderboard_scores,
)
from rdagent.utils.agent.tpl import T


class KaggleScen(DataScienceScen):
    """Data science scenario whose competition metadata comes from Kaggle.

    It overrides the two hooks of ``DataScienceScen`` so that the description
    is obtained through ``crawl_descriptions`` and the metric direction is
    inferred from ``leaderboard_scores``.

    Notes:
    - It does not use the same interface as the previous Kaggle version.
    - Ideally, the previous Kaggle scenario would be reused, but it contains
      too much scenario-unrelated code and is hard to reuse, so this starts
      from a simple implementation.
    """

    def _get_description(self):
        """Return the description produced by ``crawl_descriptions`` for this competition."""
        return crawl_descriptions(self.competition, DS_RD_SETTING.local_data_path)

    def _get_direction(self):
        """Infer the metric direction from the competition leaderboard.

        Returns:
            ``"maximize"`` when the first leaderboard score is strictly greater
            than the last one, otherwise ``"minimize"``. An empty leaderboard
            also yields ``"minimize"``, the same default the base scenario
            uses when no direction is recorded.
        """
        leaderboard = leaderboard_scores(self.competition)
        if not leaderboard:
            # Nothing to compare; avoid IndexError on leaderboard[0] / leaderboard[-1].
            return "minimize"
        # presumably the leaderboard is ordered best-first — verify against the crawler
        first_score = float(leaderboard[0])
        last_score = float(leaderboard[-1])
        return "maximize" if first_score > last_score else "minimize"

    @property
    def rich_style_description(self) -> str:
        """Return the Markdown summary of this scenario, including a link to the competition page."""
        return f"""
### Kaggle Agent: Automated Feature Engineering & Model Tuning Evolution
#### [Overview](#_summary)
In this scenario, our automated system proposes hypothesis, choose action, implements code, conducts validation, and utilizes feedback in a continuous, iterative process.
#### Kaggle Competition info
Current Competition: [{self.competition}](https://www.kaggle.com/competitions/{self.competition})
#### [Automated R&D](#_rdloops)
- **[R (Research)](#_research)**
- Iteration of ideas and hypotheses.
- Continuous learning and knowledge construction.
- **[D (Development)](#_development)**
- Evolving code generation, model refinement, and features generation.
- Automated implementation and testing of models/features.
#### [Objective](#_summary)
To automatically optimize performance metrics within the validation set or Kaggle Leaderboard, ultimately discovering the most efficient features and models through autonomous research and development.
"""
33 changes: 16 additions & 17 deletions rdagent/scenarios/data_science/scen/scen.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,30 +3,29 @@
from rdagent.app.data_science.conf import DS_RD_SETTING
from rdagent.core.scenario import Scenario
from rdagent.oai.llm_utils import APIBackend
from rdagent.scenarios.kaggle.kaggle_crawler import (
crawl_descriptions,
leaderboard_scores,
)
from rdagent.utils.agent.tpl import T


class DataScienceScen(Scenario):
"""Data Science Scenario
It is based on kaggle now.
- But it does not use the same interface as the previous Kaggle version.
- Ideally, we should reuse the previous Kaggle scenario.
But we found too much scenario-unrelated code in the Kaggle scenario, which makes it hard to reuse.
So we start from a simple one.
"""

def __init__(self, competition: str) -> None:
self.competition = competition
self.raw_description = crawl_descriptions(competition, DS_RD_SETTING.local_data_path)
self.raw_description = self._get_description()
self.metric_direction = self._get_direction()
self._analysis_competition_description()

leaderboard = leaderboard_scores(competition)
self.metric_direction = "maximize" if float(leaderboard[0]) > float(leaderboard[-1]) else "minimize"
def _get_description(self):
    """Load the competition description from a local JSON file.

    The file is looked up at ``<DS_RD_SETTING.local_data_path>/<competition>.json``.

    Returns:
        The parsed JSON content when the file exists; ``None`` (after logging
        an error) when it does not.
    """
    # NOTE(review): imports for Path and logger are not visible in this hunk — confirm the module provides them.
    fp = Path(f"{DS_RD_SETTING.local_data_path}/{self.competition}.json")
    if not fp.exists():
        # The competition name lives on the instance; a bare `competition` is undefined here.
        logger.error(f"Cannot find {self.competition}.json, please check the file.")
        return None
    logger.info(f"Found {self.competition}.json, loading from local file.")
    with fp.open("r") as f:
        return json.load(f)

self._analysis_competition_description()
def _get_direction(self):
return self.raw_description.get("metric_direction", "minimize")

def _analysis_competition_description(self):
sys_prompt = T(".prompts:competition_description_template.system").r()
Expand Down Expand Up @@ -76,15 +75,15 @@ def background(self) -> str:
@property
def rich_style_description(self) -> str:
return f"""
### Kaggle Agent: Automated Feature Engineering & Model Tuning Evolution
### Data Science Agent: Automated Feature Engineering & Model Tuning Evolution
#### [Overview](#_summary)
In this scenario, our automated system proposes hypothesis, choose action, implements code, conducts validation, and utilizes feedback in a continuous, iterative process.
#### Kaggle Competition info
#### Data Science Competition info
Current Competition: [{self.competition}](https://www.kaggle.com/competitions/{self.competition})
Current Competition: [{self.competition}]
#### [Automated R&D](#_rdloops)
Expand All @@ -98,7 +97,7 @@ def rich_style_description(self) -> str:
#### [Objective](#_summary)
To automatically optimize performance metrics within the validation set or Kaggle Leaderboard, ultimately discovering the most efficient features and models through autonomous research and development.
To automatically optimize performance metrics within the validation set, ultimately discovering the most efficient features and models through autonomous research and development.
"""

def get_scenario_all_desc(self) -> str:
Expand Down
8 changes: 2 additions & 6 deletions rdagent/scenarios/kaggle/kaggle_crawler.py
Original file line number Diff line number Diff line change
Expand Up @@ -143,7 +143,7 @@ def download_data(competition: str, settings: ExtendedBaseSettings = KAGGLE_IMPL
raise FileNotFoundError(f"{labels_path} does not exist")
else:
zipfile_path = f"{local_path}/zip_files"
if not Path(f"{zipfile_path}/{competition}.zip").exists() and not Path(f"{local_path}/{competition}").exists():
if not Path(f"{zipfile_path}/{competition}.zip").exists():
try:
subprocess.run(
["kaggle", "competitions", "download", "-c", competition, "-p", zipfile_path],
Expand Down Expand Up @@ -173,11 +173,7 @@ def leaderboard_scores(competition: str) -> list[float]:

api = KaggleApi()
api.authenticate()
try:
ll = api.competition_leaderboard_view(competition)
except Exception as e:
logger.error(f"Error: {e}")
return [i / 100 for i in range(100)]
ll = api.competition_leaderboard_view(competition)
return [float(x.score) for x in ll]


Expand Down

0 comments on commit 7f6d653

Please sign in to comment.