diff --git a/rdagent/components/coder/factor_coder/factor.py b/rdagent/components/coder/factor_coder/factor.py
index edb7a9ce..9e4fcd95 100644
--- a/rdagent/components/coder/factor_coder/factor.py
+++ b/rdagent/components/coder/factor_coder/factor.py
@@ -147,7 +147,14 @@ def execute(self, data_type: str = "Debug") -> Tuple[str, pd.DataFrame]:
                 execution_code_path = code_path
             elif self.target_task.version == 2:
                 execution_code_path = self.workspace_path / f"{uuid.uuid4()}.py"
-                execution_code_path.write_text((Path(__file__).parent / "factor_execution_template.txt").read_text())
+                if KAGGLE_IMPLEMENT_SETTING.template_path == "rdagent/scenarios/kaggle/tpl_ex":
+                    execution_code_path.write_text(
+                        (Path(__file__).parent / "factor_execution_template_v2.txt").read_text()
+                    )
+                else:
+                    execution_code_path.write_text(
+                        (Path(__file__).parent / "factor_execution_template_v1.txt").read_text()
+                    )
 
             try:
                 subprocess.check_output(
diff --git a/rdagent/components/coder/factor_coder/factor_execution_template.txt b/rdagent/components/coder/factor_coder/factor_execution_template_v1.txt
similarity index 100%
rename from rdagent/components/coder/factor_coder/factor_execution_template.txt
rename to rdagent/components/coder/factor_coder/factor_execution_template_v1.txt
diff --git a/rdagent/components/coder/factor_coder/factor_execution_template_v2.txt b/rdagent/components/coder/factor_coder/factor_execution_template_v2.txt
new file mode 100644
index 00000000..656df8e8
--- /dev/null
+++ b/rdagent/components/coder/factor_coder/factor_execution_template_v2.txt
@@ -0,0 +1,16 @@
+import os
+
+import numpy as np
+import pandas as pd
+from feat01 import feat_eng
+
+if os.path.exists("X.pkl"):
+    X = pd.read_pickle("X.pkl")
+    y = pd.read_pickle("y.pkl")
+else:
+    raise FileNotFoundError("No valid data found.")
+
+X, y, p = feat_eng(X, y)
+
+X = pd.DataFrame(X)
+X.to_hdf("result.h5", key="data", mode="w")
diff --git a/rdagent/scenarios/kaggle/experiment/scenario.py b/rdagent/scenarios/kaggle/experiment/scenario.py
index b5a7f84e..dbb4accd 100644
--- a/rdagent/scenarios/kaggle/experiment/scenario.py
+++ b/rdagent/scenarios/kaggle/experiment/scenario.py
@@ -125,7 +125,10 @@ def background(self) -> str:
         background_template = prompt_dict["kg_background"]
 
         train_script = (
-            Path(__file__).parent / f"{KAGGLE_IMPLEMENT_SETTING.competition}_template" / "train.py"
+            Path(__file__).resolve()
+            / Path(KAGGLE_IMPLEMENT_SETTING.template_path).resolve()
+            / KAGGLE_IMPLEMENT_SETTING.competition
+            / ("train.py" if KAGGLE_IMPLEMENT_SETTING.template_path != "rdagent/scenarios/kaggle/tpl_ex" else "main.py")
         ).read_text()
 
         background_prompt = (
@@ -148,6 +151,21 @@ def background(self) -> str:
     def source_data(self) -> str:
         data_folder = Path(KAGGLE_IMPLEMENT_SETTING.local_data_path) / self.competition
 
+        if KAGGLE_IMPLEMENT_SETTING.template_path == "rdagent/scenarios/kaggle/tpl_ex":
+            if not (data_folder / "X.pkl").exists():
+                preprocess_experiment = KGFactorExperiment([])
+                X, y, X_test, others = preprocess_experiment.experiment_workspace.generate_preprocess_data()
+
+                data_folder.mkdir(exist_ok=True, parents=True)
+                pickle.dump(X, open(data_folder / "X.pkl", "wb"))
+                pickle.dump(y, open(data_folder / "y.pkl", "wb"))
+                pickle.dump(X_test, open(data_folder / "X_test.pkl", "wb"))
+                pickle.dump(others, open(data_folder / "others.pkl", "wb"))
+
+            X = pd.read_pickle(data_folder / "X.pkl")
+            self.input_shape = X.shape
+            return str(self.input_shape)
+
         if not (data_folder / "X_valid.pkl").exists():
             preprocess_experiment = KGFactorExperiment([])
             (
diff --git a/rdagent/scenarios/kaggle/experiment/workspace.py b/rdagent/scenarios/kaggle/experiment/workspace.py
index 1bfaaf86..13d717e9 100644
--- a/rdagent/scenarios/kaggle/experiment/workspace.py
+++ b/rdagent/scenarios/kaggle/experiment/workspace.py
@@ -10,7 +10,7 @@
 from rdagent.log import rdagent_logger as logger
 from rdagent.utils.env import KGDockerEnv
 
-KG_FEATURE_PREPROCESS_SCRIPT = """import pickle
+KG_FEATURE_PREPROCESS_SCRIPT_v1 = """import pickle
 
 from fea_share_preprocess import preprocess_script
 
@@ -24,6 +24,18 @@
 pickle.dump(others, open("others.pkl", "wb"))
 """
 
+KG_FEATURE_PREPROCESS_SCRIPT_v2 = """import pickle
+
+from load_data import load_from_raw_data
+
+X, y, X_test, others = load_from_raw_data()
+
+pickle.dump(X, open("X.pkl", "wb"))
+pickle.dump(y, open("y.pkl", "wb"))
+pickle.dump(X_test, open("X_test.pkl", "wb"))
+pickle.dump(others, open("others.pkl", "wb"))
+"""
+
 
 class KGFBWorkspace(FBWorkspace):
     def __init__(self, template_folder_path: Path, *args, **kwargs) -> None:
@@ -45,29 +57,60 @@ def generate_preprocess_data(
         kgde = KGDockerEnv(KAGGLE_IMPLEMENT_SETTING.competition)
         kgde.prepare()
 
-        execute_log, results = kgde.dump_python_code_run_and_get_results(
-            code=KG_FEATURE_PREPROCESS_SCRIPT,
-            local_path=str(self.workspace_path),
-            dump_file_names=[
-                "X_train.pkl",
-                "X_valid.pkl",
-                "y_train.pkl",
-                "y_valid.pkl",
-                "X_test.pkl",
-                "others.pkl",
-            ],
-            running_extra_volume=(
-                {KAGGLE_IMPLEMENT_SETTING.local_data_path + "/" + KAGGLE_IMPLEMENT_SETTING.competition: "/kaggle/input"}
-                if KAGGLE_IMPLEMENT_SETTING.competition
-                else None
-            ),
-        )
-        if results is None:
-            logger.error("Feature preprocess failed.")
-            raise Exception("Feature preprocess failed.")
+        if KAGGLE_IMPLEMENT_SETTING.template_path != "rdagent/scenarios/kaggle/tpl_ex":
+            execute_log, results = kgde.dump_python_code_run_and_get_results(
+                code=KG_FEATURE_PREPROCESS_SCRIPT_v1,
+                local_path=str(self.workspace_path),
+                dump_file_names=[
+                    "X_train.pkl",
+                    "X_valid.pkl",
+                    "y_train.pkl",
+                    "y_valid.pkl",
+                    "X_test.pkl",
+                    "others.pkl",
+                ],
+                running_extra_volume=(
+                    {
+                        KAGGLE_IMPLEMENT_SETTING.local_data_path
+                        + "/"
+                        + KAGGLE_IMPLEMENT_SETTING.competition: "/kaggle/input"
+                    }
+                    if KAGGLE_IMPLEMENT_SETTING.competition
+                    else None
+                ),
+            )
+            if results is None:
+                logger.error("Feature preprocess failed.")
+                raise Exception("Feature preprocess failed.")
+            else:
+                X_train, X_valid, y_train, y_valid, X_test, others = results
+                return X_train, X_valid, y_train, y_valid, X_test, *others
         else:
-            X_train, X_valid, y_train, y_valid, X_test, others = results
-            return X_train, X_valid, y_train, y_valid, X_test, *others
+            execute_log, results = kgde.dump_python_code_run_and_get_results(
+                code=KG_FEATURE_PREPROCESS_SCRIPT_v2,
+                local_path=str(self.workspace_path),
+                dump_file_names=[
+                    "X.pkl",
+                    "y.pkl",
+                    "X_test.pkl",
+                    "others.pkl",
+                ],
+                running_extra_volume=(
+                    {
+                        KAGGLE_IMPLEMENT_SETTING.local_data_path
+                        + "/"
+                        + KAGGLE_IMPLEMENT_SETTING.competition: "/kaggle/input"
+                    }
+                    if KAGGLE_IMPLEMENT_SETTING.competition
+                    else None
+                ),
+            )
+            if results is None:
+                logger.error("Feature preprocess failed.")
+                raise Exception("Feature preprocess failed.")
+            else:
+                X, y, X_test, others = results
+                return X, y, X_test, others
 
     def execute(self, run_env: dict = {}, *args, **kwargs) -> str:
         logger.info(f"Running the experiment in {self.workspace_path}")