diff --git a/rdagent/app/data_science/loop.py b/rdagent/app/data_science/loop.py
index e790789d8..cb31e89a3 100644
--- a/rdagent/app/data_science/loop.py
+++ b/rdagent/app/data_science/loop.py
@@ -1,6 +1,7 @@
 import subprocess
 from typing import Any, Literal
+from pathlib import Path
 
 import fire
 
 from rdagent.app.data_science.conf import DS_RD_SETTING
@@ -125,7 +126,12 @@ def main(path=None, step_n=None, competition=None):
         DS_RD_SETTING.competition = competition
 
     if DS_RD_SETTING.competition:
-        download_data(competition=DS_RD_SETTING.competition, settings=DS_RD_SETTING)
+        if DS_RD_SETTING.scen.endswith("KaggleScen"):
+            download_data(competition=DS_RD_SETTING.competition, settings=DS_RD_SETTING)
+        else:
+            if not Path(f"{DS_RD_SETTING.local_data_path}/{DS_RD_SETTING.competition}").exists():
+                logger.error(f"Please prepare data for competition {DS_RD_SETTING.competition} first.")
+                return
     else:
         logger.error("Please specify competition name.")
     if path is None:
diff --git a/rdagent/components/coder/data_science/ensemble/test.py b/rdagent/components/coder/data_science/ensemble/test.py
index 04f75941d..1a7a3d08b 100644
--- a/rdagent/components/coder/data_science/ensemble/test.py
+++ b/rdagent/components/coder/data_science/ensemble/test.py
@@ -8,7 +8,7 @@
 from rdagent.components.coder.data_science.ensemble import EnsembleCoSTEER
 from rdagent.components.coder.data_science.ensemble.exp import EnsembleTask
 from rdagent.scenarios.data_science.experiment.experiment import DSExperiment
-from rdagent.scenarios.data_science.scen import DataScienceScen
+from rdagent.scenarios.data_science.scen import KaggleScen
 
 # Add the competition folder to path
 COMPETITION_PATH = (
@@ -31,7 +31,7 @@ def load_ensemble_spec():
 
 def develop_one_competition(competition: str):
     # Initialize scenario and coder
-    scen = DataScienceScen(competition=competition)
+    scen = KaggleScen(competition=competition)
     ensemble_coder = EnsembleCoSTEER(scen)
     # Load ensemble specification
     ensemble_spec = load_ensemble_spec()
diff --git a/rdagent/components/coder/data_science/feature/test.py b/rdagent/components/coder/data_science/feature/test.py
index 96cd5e590..74732e756 100644
--- a/rdagent/components/coder/data_science/feature/test.py
+++ b/rdagent/components/coder/data_science/feature/test.py
@@ -9,11 +9,11 @@
 from rdagent.components.coder.data_science.feature import FeatureCoSTEER
 from rdagent.components.coder.data_science.feature.exp import FeatureTask
 from rdagent.scenarios.data_science.experiment.experiment import DSExperiment
-from rdagent.scenarios.data_science.scen import DataScienceScen
+from rdagent.scenarios.data_science.scen import KaggleScen
 
 
 def develop_one_competition(competition: str):  # -> experiment
-    scen = DataScienceScen(competition=competition)
+    scen = KaggleScen(competition=competition)
     feature_coder = FeatureCoSTEER(scen)
 
     with open("./rdagent/scenarios/kaggle/tpl_ex/aerial-cactus-identification/spec/feature.md", "r") as file:
diff --git a/rdagent/components/coder/data_science/model/test.py b/rdagent/components/coder/data_science/model/test.py
index 201535187..ad7995789 100644
--- a/rdagent/components/coder/data_science/model/test.py
+++ b/rdagent/components/coder/data_science/model/test.py
@@ -12,12 +12,12 @@
 from rdagent.components.coder.data_science.model.exp import ModelTask
 from rdagent.core.experiment import FBWorkspace
 from rdagent.scenarios.data_science.experiment.experiment import DSExperiment
-from rdagent.scenarios.data_science.scen import DataScienceScen
+from rdagent.scenarios.data_science.scen import KaggleScen
 
 
 # Take tasks, spec.md and feat as input, generate a feedback as output
 def develop_one_competition(competition: str):
-    scen = DataScienceScen(competition=competition)
+    scen = KaggleScen(competition=competition)
     model_coder = ModelCoSTEER(scen)
 
     # Create the task
diff --git a/rdagent/components/coder/data_science/raw_data_loader/test.py b/rdagent/components/coder/data_science/raw_data_loader/test.py
index 5aacc8b8c..2cd68a790 100644
--- a/rdagent/components/coder/data_science/raw_data_loader/test.py
+++ b/rdagent/components/coder/data_science/raw_data_loader/test.py
@@ -9,11 +9,11 @@
 from rdagent.components.coder.data_science.raw_data_loader import DataLoaderCoSTEER
 from rdagent.components.coder.data_science.raw_data_loader.exp import DataLoaderTask
 from rdagent.scenarios.data_science.experiment.experiment import DSExperiment
-from rdagent.scenarios.data_science.scen import DataScienceScen
+from rdagent.scenarios.data_science.scen import KaggleScen
 
 
 def develop_one_competition(competition: str):  # -> experiment
-    scen = DataScienceScen(competition=competition)
+    scen = KaggleScen(competition=competition)
     data_loader_coder = DataLoaderCoSTEER(scen)
 
     # Create the experiment
diff --git a/rdagent/components/coder/data_science/workflow/test.py b/rdagent/components/coder/data_science/workflow/test.py
index 5e9e7a1b4..f2c2dcade 100644
--- a/rdagent/components/coder/data_science/workflow/test.py
+++ b/rdagent/components/coder/data_science/workflow/test.py
@@ -12,11 +12,11 @@
 from rdagent.components.coder.data_science.workflow.exp import WorkflowTask
 from rdagent.core.experiment import FBWorkspace
 from rdagent.scenarios.data_science.experiment.experiment import DSExperiment
-from rdagent.scenarios.data_science.scen import DataScienceScen
+from rdagent.scenarios.data_science.scen import KaggleScen
 
 
 def develop_one_competition(competition: str):
-    scen = DataScienceScen(competition=competition)
+    scen = KaggleScen(competition=competition)
     workflow_coder = WorkflowCoSTEER(scen)
 
     wt = WorkflowTask(
diff --git a/rdagent/scenarios/data_science/scen/__init__.py b/rdagent/scenarios/data_science/scen/__init__.py
index 29324c02a..8aaf93146 100644
--- a/rdagent/scenarios/data_science/scen/__init__.py
+++ b/rdagent/scenarios/data_science/scen/__init__.py
@@ -1,3 +1,4 @@
 from .scen import DataScienceScen
+from .kaggle import KaggleScen
 
-__all__ = ["DataScienceScen"]
+__all__ = ["DataScienceScen", "KaggleScen"]
diff --git a/rdagent/scenarios/data_science/scen/kaggle.py b/rdagent/scenarios/data_science/scen/kaggle.py
new file mode 100644
index 000000000..f693ad75f
--- /dev/null
+++ b/rdagent/scenarios/data_science/scen/kaggle.py
@@ -0,0 +1,55 @@
+import json
+
+from rdagent.app.data_science.conf import DS_RD_SETTING
+from rdagent.core.scenario import Scenario
+from rdagent.oai.llm_utils import APIBackend
+from rdagent.scenarios.data_science.scen import DataScienceScen
+from rdagent.scenarios.kaggle.kaggle_crawler import (
+    crawl_descriptions,
+    leaderboard_scores,
+)
+from rdagent.utils.agent.tpl import T
+
+
+class KaggleScen(DataScienceScen):
+    """Kaggle Scenario
+    It is based on kaggle now.
+    - But it is not use the same interface with previous kaggle version.
+    - Ideally, we should reuse previous kaggle scenario.
+    But we found that too much scenario unrelated code in kaggle scenario and hard to reuse.
+    So we start from a simple one....
+    """
+    def _get_description(self):
+        return crawl_descriptions(self.competition, DS_RD_SETTING.local_data_path)
+
+    def _get_direction(self):
+        leaderboard = leaderboard_scores(self.competition)
+        return "maximize" if float(leaderboard[0]) > float(leaderboard[-1]) else "minimize"
+
+    @property
+    def rich_style_description(self) -> str:
+        return f"""
+### Kaggle Agent: Automated Feature Engineering & Model Tuning Evolution
+
+#### [Overview](#_summary)
+
+In this scenario, our automated system proposes hypothesis, choose action, implements code, conducts validation, and utilizes feedback in a continuous, iterative process.
+
+#### Kaggle Competition info
+
+Current Competition: [{self.competition}](https://www.kaggle.com/competitions/{self.competition})
+
+#### [Automated R&D](#_rdloops)
+
+- **[R (Research)](#_research)**
+- Iteration of ideas and hypotheses.
+- Continuous learning and knowledge construction.
+
+- **[D (Development)](#_development)**
+- Evolving code generation, model refinement, and features generation.
+- Automated implementation and testing of models/features.
+
+#### [Objective](#_summary)
+
+To automatically optimize performance metrics within the validation set or Kaggle Leaderboard, ultimately discovering the most efficient features and models through autonomous research and development.
+"""
diff --git a/rdagent/scenarios/data_science/scen/scen.py b/rdagent/scenarios/data_science/scen/scen.py
index 3fbdc8fda..2e73ff99b 100644
--- a/rdagent/scenarios/data_science/scen/scen.py
+++ b/rdagent/scenarios/data_science/scen/scen.py
@@ -3,30 +3,33 @@
+from pathlib import Path
+
 from rdagent.app.data_science.conf import DS_RD_SETTING
 from rdagent.core.scenario import Scenario
+from rdagent.log import rdagent_logger as logger
 from rdagent.oai.llm_utils import APIBackend
-from rdagent.scenarios.kaggle.kaggle_crawler import (
-    crawl_descriptions,
-    leaderboard_scores,
-)
 from rdagent.utils.agent.tpl import T
 
 
 class DataScienceScen(Scenario):
     """Data Science Scenario
-    It is based on kaggle now.
-    - But it is not use the same interface with previous kaggle version.
-    - Ideally, we should reuse previous kaggle scenario.
-    But we found that too much scenario unrelated code in kaggle scenario and hard to reuse.
-    So we start from a simple one....
     """
 
     def __init__(self, competition: str) -> None:
         self.competition = competition
-        self.raw_description = crawl_descriptions(competition, DS_RD_SETTING.local_data_path)
+        self.raw_description = self._get_description()
+        self.metric_direction = self._get_direction()
+        self._analysis_competition_description()
 
-        leaderboard = leaderboard_scores(competition)
-        self.metric_direction = "maximize" if float(leaderboard[0]) > float(leaderboard[-1]) else "minimize"
+    def _get_description(self):
+        if (fp := Path(f"{DS_RD_SETTING.local_data_path}/{self.competition}.json")).exists():
+            logger.info(f"Found {self.competition}.json, loading from local file.")
+            with fp.open("r") as f:
+                return json.load(f)
+        else:
+            logger.error(f"Cannot find {self.competition}.json, please check the file.")
+            raise FileNotFoundError(f"Cannot find {fp}")
 
-        self._analysis_competition_description()
+    def _get_direction(self):
+        return self.raw_description.get("metric_direction", "minimize")
 
     def _analysis_competition_description(self):
         sys_prompt = T(".prompts:competition_description_template.system").r()
@@ -76,15 +79,15 @@ def background(self) -> str:
     @property
     def rich_style_description(self) -> str:
         return f"""
-### Kaggle Agent: Automated Feature Engineering & Model Tuning Evolution
+### Data Science Agent: Automated Feature Engineering & Model Tuning Evolution
 
 #### [Overview](#_summary)
 
 In this scenario, our automated system proposes hypothesis, choose action, implements code, conducts validation, and utilizes feedback in a continuous, iterative process.
 
-#### Kaggle Competition info
+#### Data Science Competition info
 
-Current Competition: [{self.competition}](https://www.kaggle.com/competitions/{self.competition})
+Current Competition: [{self.competition}]
 
 #### [Automated R&D](#_rdloops)
 
@@ -98,7 +101,7 @@ def rich_style_description(self) -> str:
 
 #### [Objective](#_summary)
 
-To automatically optimize performance metrics within the validation set or Kaggle Leaderboard, ultimately discovering the most efficient features and models through autonomous research and development.
+To automatically optimize performance metrics within the validation set, ultimately discovering the most efficient features and models through autonomous research and development.
 """
 
     def get_scenario_all_desc(self) -> str:
diff --git a/rdagent/scenarios/kaggle/kaggle_crawler.py b/rdagent/scenarios/kaggle/kaggle_crawler.py
index 7da4dde6c..069ca655a 100644
--- a/rdagent/scenarios/kaggle/kaggle_crawler.py
+++ b/rdagent/scenarios/kaggle/kaggle_crawler.py
@@ -143,7 +143,7 @@ def download_data(competition: str, settings: ExtendedBaseSettings = KAGGLE_IMPL
             raise FileNotFoundError(f"{labels_path} does not exist")
     else:
         zipfile_path = f"{local_path}/zip_files"
-        if not Path(f"{zipfile_path}/{competition}.zip").exists() and not Path(f"{local_path}/{competition}").exists():
+        if not Path(f"{zipfile_path}/{competition}.zip").exists():
             try:
                 subprocess.run(
                     ["kaggle", "competitions", "download", "-c", competition, "-p", zipfile_path],
@@ -173,11 +173,7 @@ def leaderboard_scores(competition: str) -> list[float]:
 
     api = KaggleApi()
     api.authenticate()
-    try:
-        ll = api.competition_leaderboard_view(competition)
-    except Exception as e:
-        logger.error(f"Error: {e}")
-        return [i / 100 for i in range(100)]
+    ll = api.competition_leaderboard_view(competition)
     return [float(x.score) for x in ll]
 
 