From 989161b0396bcc053bbdcc2a20663f0daa6a69c7 Mon Sep 17 00:00:00 2001 From: WinstonLiye <1957922024@qq.com> Date: Mon, 25 Nov 2024 08:45:21 +0000 Subject: [PATCH 1/5] fix a bug in KGFBWorkspace --- .../scenarios/kaggle/experiment/workspace.py | 81 +++++++++++++------ 1 file changed, 58 insertions(+), 23 deletions(-) diff --git a/rdagent/scenarios/kaggle/experiment/workspace.py b/rdagent/scenarios/kaggle/experiment/workspace.py index 1bfaaf86..f9bd39b2 100644 --- a/rdagent/scenarios/kaggle/experiment/workspace.py +++ b/rdagent/scenarios/kaggle/experiment/workspace.py @@ -10,7 +10,7 @@ from rdagent.log import rdagent_logger as logger from rdagent.utils.env import KGDockerEnv -KG_FEATURE_PREPROCESS_SCRIPT = """import pickle +KG_FEATURE_PREPROCESS_SCRIPT_v1 = """import pickle from fea_share_preprocess import preprocess_script @@ -24,6 +24,18 @@ pickle.dump(others, open("others.pkl", "wb")) """ +KG_FEATURE_PREPROCESS_SCRIPT_v2 = """import pickle + +from load_data import load_from_raw_data + +X, y, X_test, others = load_from_raw_data() + +pickle.dump(X, open("X.pkl", "wb")) +pickle.dump(y, open("y.pkl", "wb")) +pickle.dump(X_test, open("X_test.pkl", "wb")) +pickle.dump(others, open("others.pkl", "wb")) +""" + class KGFBWorkspace(FBWorkspace): def __init__(self, template_folder_path: Path, *args, **kwargs) -> None: @@ -45,29 +57,52 @@ def generate_preprocess_data( kgde = KGDockerEnv(KAGGLE_IMPLEMENT_SETTING.competition) kgde.prepare() - execute_log, results = kgde.dump_python_code_run_and_get_results( - code=KG_FEATURE_PREPROCESS_SCRIPT, - local_path=str(self.workspace_path), - dump_file_names=[ - "X_train.pkl", - "X_valid.pkl", - "y_train.pkl", - "y_valid.pkl", - "X_test.pkl", - "others.pkl", - ], - running_extra_volume=( - {KAGGLE_IMPLEMENT_SETTING.local_data_path + "/" + KAGGLE_IMPLEMENT_SETTING.competition: "/kaggle/input"} - if KAGGLE_IMPLEMENT_SETTING.competition - else None - ), - ) - if results is None: - logger.error("Feature preprocess failed.") - raise 
Exception("Feature preprocess failed.") + if KAGGLE_IMPLEMENT_SETTING.template_path != "rdagent/scenarios/kaggle/tpl_ex": + execute_log, results = kgde.dump_python_code_run_and_get_results( + code=KG_FEATURE_PREPROCESS_SCRIPT_v1, + local_path=str(self.workspace_path), + dump_file_names=[ + "X_train.pkl", + "X_valid.pkl", + "y_train.pkl", + "y_valid.pkl", + "X_test.pkl", + "others.pkl", + ], + running_extra_volume=( + {KAGGLE_IMPLEMENT_SETTING.local_data_path + "/" + KAGGLE_IMPLEMENT_SETTING.competition: "/kaggle/input"} + if KAGGLE_IMPLEMENT_SETTING.competition + else None + ), + ) + if results is None: + logger.error("Feature preprocess failed.") + raise Exception("Feature preprocess failed.") + else: + X_train, X_valid, y_train, y_valid, X_test, others = results + return X_train, X_valid, y_train, y_valid, X_test, *others else: - X_train, X_valid, y_train, y_valid, X_test, others = results - return X_train, X_valid, y_train, y_valid, X_test, *others + execute_log, results = kgde.dump_python_code_run_and_get_results( + code=KG_FEATURE_PREPROCESS_SCRIPT_v2, + local_path=str(self.workspace_path), + dump_file_names=[ + "X.pkl", + "y.pkl", + "X_test.pkl", + "others.pkl", + ], + running_extra_volume=( + {KAGGLE_IMPLEMENT_SETTING.local_data_path + "/" + KAGGLE_IMPLEMENT_SETTING.competition: "/kaggle/input"} + if KAGGLE_IMPLEMENT_SETTING.competition + else None + ), + ) + if results is None: + logger.error("Feature preprocess failed.") + raise Exception("Feature preprocess failed.") + else: + X, y, X_test, others = results + return X, y, X_test, others def execute(self, run_env: dict = {}, *args, **kwargs) -> str: logger.info(f"Running the experiment in {self.workspace_path}") From 01162b9019fb0701ae32686eb048a6d4b6ed8a90 Mon Sep 17 00:00:00 2001 From: WinstonLiye <1957922024@qq.com> Date: Mon, 25 Nov 2024 08:54:28 +0000 Subject: [PATCH 2/5] fix a bug in source_data --- rdagent/scenarios/kaggle/experiment/scenario.py | 15 +++++++++++++++ 1 file changed, 15 insertions(+) 
diff --git a/rdagent/scenarios/kaggle/experiment/scenario.py b/rdagent/scenarios/kaggle/experiment/scenario.py index b5a7f84e..5a24655a 100644 --- a/rdagent/scenarios/kaggle/experiment/scenario.py +++ b/rdagent/scenarios/kaggle/experiment/scenario.py @@ -148,6 +148,21 @@ def background(self) -> str: def source_data(self) -> str: data_folder = Path(KAGGLE_IMPLEMENT_SETTING.local_data_path) / self.competition + if KAGGLE_IMPLEMENT_SETTING.template_path == "rdagent/scenarios/kaggle/tpl_ex": + if not (data_folder / "X.pkl").exists(): + preprocess_experiment = KGFactorExperiment([]) + X, y, X_test, others = preprocess_experiment.experiment_workspace.generate_preprocess_data() + + data_folder.mkdir(exist_ok=True, parents=True) + pickle.dump(X, open(data_folder / "X.pkl", "wb")) + pickle.dump(y, open(data_folder / "y.pkl", "wb")) + pickle.dump(X_test, open(data_folder / "X_test.pkl", "wb")) + pickle.dump(others, open(data_folder / "others.pkl", "wb")) + + X = pd.read_pickle(data_folder / "X.pkl") + self.input_shape = X.shape + return str(self.input_shape) + if not (data_folder / "X_valid.pkl").exists(): preprocess_experiment = KGFactorExperiment([]) ( From de4d413feff26011a3c4c6e15d65bb96b05e94f1 Mon Sep 17 00:00:00 2001 From: WinstonLiye <1957922024@qq.com> Date: Mon, 25 Nov 2024 09:11:09 +0000 Subject: [PATCH 3/5] fix a bug in the result storing of factor costeer --- rdagent/components/coder/factor_coder/factor.py | 5 ++++- ...late.txt => factor_execution_template_v1.txt} | 0 .../factor_execution_template_v2.txt | 16 ++++++++++++++++ 3 files changed, 20 insertions(+), 1 deletion(-) rename rdagent/components/coder/factor_coder/{factor_execution_template.txt => factor_execution_template_v1.txt} (100%) create mode 100644 rdagent/components/coder/factor_coder/factor_execution_template_v2.txt diff --git a/rdagent/components/coder/factor_coder/factor.py b/rdagent/components/coder/factor_coder/factor.py index edb7a9ce..1467bdc0 100644 --- 
a/rdagent/components/coder/factor_coder/factor.py +++ b/rdagent/components/coder/factor_coder/factor.py @@ -147,7 +147,10 @@ def execute(self, data_type: str = "Debug") -> Tuple[str, pd.DataFrame]: execution_code_path = code_path elif self.target_task.version == 2: execution_code_path = self.workspace_path / f"{uuid.uuid4()}.py" - execution_code_path.write_text((Path(__file__).parent / "factor_execution_template.txt").read_text()) + if KAGGLE_IMPLEMENT_SETTING.template_path == "rdagent/scenarios/kaggle/tpl_ex": + execution_code_path.write_text((Path(__file__).parent / "factor_execution_template_v2.txt").read_text()) + else: + execution_code_path.write_text((Path(__file__).parent / "factor_execution_template_v1.txt").read_text()) try: subprocess.check_output( diff --git a/rdagent/components/coder/factor_coder/factor_execution_template.txt b/rdagent/components/coder/factor_coder/factor_execution_template_v1.txt similarity index 100% rename from rdagent/components/coder/factor_coder/factor_execution_template.txt rename to rdagent/components/coder/factor_coder/factor_execution_template_v1.txt diff --git a/rdagent/components/coder/factor_coder/factor_execution_template_v2.txt b/rdagent/components/coder/factor_coder/factor_execution_template_v2.txt new file mode 100644 index 00000000..397171b6 --- /dev/null +++ b/rdagent/components/coder/factor_coder/factor_execution_template_v2.txt @@ -0,0 +1,16 @@ +import os +import h5py +import numpy as np +import pandas as pd +from feat01 import feat_eng + +if os.path.exists("X.pkl"): + X = pd.read_pickle("X.pkl") + y = pd.read_pickle("y.pkl") +else: + raise FileNotFoundError("No valid data found.") + +X, y, p = feat_eng(X, y) + +with h5py.File('result.h5', 'w') as hf: + hf.create_dataset('default', data=X) From 2d588b7b20202be399cfad4f98db22b7226d89e9 Mon Sep 17 00:00:00 2001 From: Bowen Xian Date: Mon, 25 Nov 2024 09:27:23 +0000 Subject: [PATCH 4/5] fix train.py to main.py for refactored kaggle template --- 
rdagent/components/coder/factor_coder/factor.py | 8 ++++++-- rdagent/scenarios/kaggle/experiment/scenario.py | 7 ++++++- rdagent/scenarios/kaggle/experiment/workspace.py | 12 ++++++++++-- 3 files changed, 22 insertions(+), 5 deletions(-) diff --git a/rdagent/components/coder/factor_coder/factor.py b/rdagent/components/coder/factor_coder/factor.py index 1467bdc0..9e4fcd95 100644 --- a/rdagent/components/coder/factor_coder/factor.py +++ b/rdagent/components/coder/factor_coder/factor.py @@ -148,9 +148,13 @@ def execute(self, data_type: str = "Debug") -> Tuple[str, pd.DataFrame]: elif self.target_task.version == 2: execution_code_path = self.workspace_path / f"{uuid.uuid4()}.py" if KAGGLE_IMPLEMENT_SETTING.template_path == "rdagent/scenarios/kaggle/tpl_ex": - execution_code_path.write_text((Path(__file__).parent / "factor_execution_template_v2.txt").read_text()) + execution_code_path.write_text( + (Path(__file__).parent / "factor_execution_template_v2.txt").read_text() + ) else: - execution_code_path.write_text((Path(__file__).parent / "factor_execution_template_v1.txt").read_text()) + execution_code_path.write_text( + (Path(__file__).parent / "factor_execution_template_v1.txt").read_text() + ) try: subprocess.check_output( diff --git a/rdagent/scenarios/kaggle/experiment/scenario.py b/rdagent/scenarios/kaggle/experiment/scenario.py index 5a24655a..b06b4542 100644 --- a/rdagent/scenarios/kaggle/experiment/scenario.py +++ b/rdagent/scenarios/kaggle/experiment/scenario.py @@ -125,7 +125,12 @@ def background(self) -> str: background_template = prompt_dict["kg_background"] train_script = ( - Path(__file__).parent / f"{KAGGLE_IMPLEMENT_SETTING.competition}_template" / "train.py" + Path(__file__).resolve() + / Path(KAGGLE_IMPLEMENT_SETTING.template_path).resolve() + / KAGGLE_IMPLEMENT_SETTING.competition + / "train.py" + if KAGGLE_IMPLEMENT_SETTING.template_path != "rdagent/scenarios/kaggle/tpl_ex" + else "main.py" ).read_text() background_prompt = ( diff --git 
a/rdagent/scenarios/kaggle/experiment/workspace.py b/rdagent/scenarios/kaggle/experiment/workspace.py index f9bd39b2..13d717e9 100644 --- a/rdagent/scenarios/kaggle/experiment/workspace.py +++ b/rdagent/scenarios/kaggle/experiment/workspace.py @@ -70,7 +70,11 @@ def generate_preprocess_data( "others.pkl", ], running_extra_volume=( - {KAGGLE_IMPLEMENT_SETTING.local_data_path + "/" + KAGGLE_IMPLEMENT_SETTING.competition: "/kaggle/input"} + { + KAGGLE_IMPLEMENT_SETTING.local_data_path + + "/" + + KAGGLE_IMPLEMENT_SETTING.competition: "/kaggle/input" + } if KAGGLE_IMPLEMENT_SETTING.competition else None ), @@ -92,7 +96,11 @@ def generate_preprocess_data( "others.pkl", ], running_extra_volume=( - {KAGGLE_IMPLEMENT_SETTING.local_data_path + "/" + KAGGLE_IMPLEMENT_SETTING.competition: "/kaggle/input"} + { + KAGGLE_IMPLEMENT_SETTING.local_data_path + + "/" + + KAGGLE_IMPLEMENT_SETTING.competition: "/kaggle/input" + } if KAGGLE_IMPLEMENT_SETTING.competition else None ), From a6a097d48e4ed2943f9b86c7546b8094444dc4dd Mon Sep 17 00:00:00 2001 From: WinstonLiye <1957922024@qq.com> Date: Mon, 25 Nov 2024 09:40:53 +0000 Subject: [PATCH 5/5] fix a bug in factor_execution_template_v2 --- .../coder/factor_coder/factor_execution_template_v2.txt | 6 +++--- rdagent/scenarios/kaggle/experiment/scenario.py | 4 +--- 2 files changed, 4 insertions(+), 6 deletions(-) diff --git a/rdagent/components/coder/factor_coder/factor_execution_template_v2.txt b/rdagent/components/coder/factor_coder/factor_execution_template_v2.txt index 397171b6..656df8e8 100644 --- a/rdagent/components/coder/factor_coder/factor_execution_template_v2.txt +++ b/rdagent/components/coder/factor_coder/factor_execution_template_v2.txt @@ -1,5 +1,5 @@ import os -import h5py + import numpy as np import pandas as pd from feat01 import feat_eng @@ -12,5 +12,5 @@ else: X, y, p = feat_eng(X, y) -with h5py.File('result.h5', 'w') as hf: - hf.create_dataset('default', data=X) +X = pd.DataFrame(X) +X.to_hdf("result.h5", key="data", 
mode="w") diff --git a/rdagent/scenarios/kaggle/experiment/scenario.py b/rdagent/scenarios/kaggle/experiment/scenario.py index b06b4542..dbb4accd 100644 --- a/rdagent/scenarios/kaggle/experiment/scenario.py +++ b/rdagent/scenarios/kaggle/experiment/scenario.py @@ -128,9 +128,7 @@ def background(self) -> str: Path(__file__).resolve() / Path(KAGGLE_IMPLEMENT_SETTING.template_path).resolve() / KAGGLE_IMPLEMENT_SETTING.competition - / "train.py" - if KAGGLE_IMPLEMENT_SETTING.template_path != "rdagent/scenarios/kaggle/tpl_ex" - else "main.py" + / ("train.py" if KAGGLE_IMPLEMENT_SETTING.template_path != "rdagent/scenarios/kaggle/tpl_ex" else "main.py") ).read_text() background_prompt = (