From 989161b0396bcc053bbdcc2a20663f0daa6a69c7 Mon Sep 17 00:00:00 2001
From: WinstonLiye <1957922024@qq.com>
Date: Mon, 25 Nov 2024 08:45:21 +0000
Subject: [PATCH 1/5] fix a bug in KGFBWorkspace

---
 .../scenarios/kaggle/experiment/workspace.py  | 81 +++++++++++++------
 1 file changed, 58 insertions(+), 23 deletions(-)

diff --git a/rdagent/scenarios/kaggle/experiment/workspace.py b/rdagent/scenarios/kaggle/experiment/workspace.py
index 1bfaaf86..f9bd39b2 100644
--- a/rdagent/scenarios/kaggle/experiment/workspace.py
+++ b/rdagent/scenarios/kaggle/experiment/workspace.py
@@ -10,7 +10,7 @@
 from rdagent.log import rdagent_logger as logger
 from rdagent.utils.env import KGDockerEnv
 
-KG_FEATURE_PREPROCESS_SCRIPT = """import pickle
+KG_FEATURE_PREPROCESS_SCRIPT_v1 = """import pickle
 
 from fea_share_preprocess import preprocess_script
 
@@ -24,6 +24,18 @@
 pickle.dump(others, open("others.pkl", "wb"))
 """
 
+KG_FEATURE_PREPROCESS_SCRIPT_v2 = """import pickle
+
+from load_data import load_from_raw_data
+
+X, y, X_test, others = load_from_raw_data()
+
+pickle.dump(X, open("X.pkl", "wb"))
+pickle.dump(y, open("y.pkl", "wb"))
+pickle.dump(X_test, open("X_test.pkl", "wb"))
+pickle.dump(others, open("others.pkl", "wb"))
+"""
+
 
 class KGFBWorkspace(FBWorkspace):
     def __init__(self, template_folder_path: Path, *args, **kwargs) -> None:
@@ -45,29 +57,52 @@ def generate_preprocess_data(
         kgde = KGDockerEnv(KAGGLE_IMPLEMENT_SETTING.competition)
         kgde.prepare()
 
-        execute_log, results = kgde.dump_python_code_run_and_get_results(
-            code=KG_FEATURE_PREPROCESS_SCRIPT,
-            local_path=str(self.workspace_path),
-            dump_file_names=[
-                "X_train.pkl",
-                "X_valid.pkl",
-                "y_train.pkl",
-                "y_valid.pkl",
-                "X_test.pkl",
-                "others.pkl",
-            ],
-            running_extra_volume=(
-                {KAGGLE_IMPLEMENT_SETTING.local_data_path + "/" + KAGGLE_IMPLEMENT_SETTING.competition: "/kaggle/input"}
-                if KAGGLE_IMPLEMENT_SETTING.competition
-                else None
-            ),
-        )
-        if results is None:
-            logger.error("Feature preprocess failed.")
-            raise Exception("Feature preprocess failed.")
+        if KAGGLE_IMPLEMENT_SETTING.template_path != "rdagent/scenarios/kaggle/tpl_ex":
+            execute_log, results = kgde.dump_python_code_run_and_get_results(
+                code=KG_FEATURE_PREPROCESS_SCRIPT_v1,
+                local_path=str(self.workspace_path),
+                dump_file_names=[
+                    "X_train.pkl",
+                    "X_valid.pkl",
+                    "y_train.pkl",
+                    "y_valid.pkl",
+                    "X_test.pkl",
+                    "others.pkl",
+                ],
+                running_extra_volume=(
+                    {KAGGLE_IMPLEMENT_SETTING.local_data_path + "/" + KAGGLE_IMPLEMENT_SETTING.competition: "/kaggle/input"}
+                    if KAGGLE_IMPLEMENT_SETTING.competition
+                    else None
+                ),
+            )
+            if results is None:
+                logger.error("Feature preprocess failed.")
+                raise Exception("Feature preprocess failed.")
+            else:
+                X_train, X_valid, y_train, y_valid, X_test, others = results
+                return X_train, X_valid, y_train, y_valid, X_test, *others
         else:
-            X_train, X_valid, y_train, y_valid, X_test, others = results
-            return X_train, X_valid, y_train, y_valid, X_test, *others
+            execute_log, results = kgde.dump_python_code_run_and_get_results(
+                code=KG_FEATURE_PREPROCESS_SCRIPT_v2,
+                local_path=str(self.workspace_path),
+                dump_file_names=[
+                    "X.pkl",
+                    "y.pkl",
+                    "X_test.pkl",
+                    "others.pkl",
+                ],
+                running_extra_volume=(
+                    {KAGGLE_IMPLEMENT_SETTING.local_data_path + "/" + KAGGLE_IMPLEMENT_SETTING.competition: "/kaggle/input"}
+                    if KAGGLE_IMPLEMENT_SETTING.competition
+                    else None
+                ),
+            )
+            if results is None:
+                logger.error("Feature preprocess failed.")
+                raise Exception("Feature preprocess failed.")
+            else:
+                X, y, X_test, others = results
+                return X, y, X_test, others
 
     def execute(self, run_env: dict = {}, *args, **kwargs) -> str:
         logger.info(f"Running the experiment in {self.workspace_path}")

From 01162b9019fb0701ae32686eb048a6d4b6ed8a90 Mon Sep 17 00:00:00 2001
From: WinstonLiye <1957922024@qq.com>
Date: Mon, 25 Nov 2024 08:54:28 +0000
Subject: [PATCH 2/5] fix a bug in source_data

---
 rdagent/scenarios/kaggle/experiment/scenario.py | 15 +++++++++++++++
 1 file changed, 15 insertions(+)

diff --git a/rdagent/scenarios/kaggle/experiment/scenario.py b/rdagent/scenarios/kaggle/experiment/scenario.py
index b5a7f84e..5a24655a 100644
--- a/rdagent/scenarios/kaggle/experiment/scenario.py
+++ b/rdagent/scenarios/kaggle/experiment/scenario.py
@@ -148,6 +148,21 @@ def background(self) -> str:
     def source_data(self) -> str:
         data_folder = Path(KAGGLE_IMPLEMENT_SETTING.local_data_path) / self.competition
 
+        if KAGGLE_IMPLEMENT_SETTING.template_path == "rdagent/scenarios/kaggle/tpl_ex":
+            if not (data_folder / "X.pkl").exists():
+                preprocess_experiment = KGFactorExperiment([])
+                X, y, X_test, others = preprocess_experiment.experiment_workspace.generate_preprocess_data()
+
+                data_folder.mkdir(exist_ok=True, parents=True)
+                pickle.dump(X, open(data_folder / "X.pkl", "wb"))
+                pickle.dump(y, open(data_folder / "y.pkl", "wb"))
+                pickle.dump(X_test, open(data_folder / "X_test.pkl", "wb"))
+                pickle.dump(others, open(data_folder / "others.pkl", "wb"))
+
+            X = pd.read_pickle(data_folder / "X.pkl")
+            self.input_shape = X.shape
+            return str(self.input_shape)
+
         if not (data_folder / "X_valid.pkl").exists():
             preprocess_experiment = KGFactorExperiment([])
             (

From de4d413feff26011a3c4c6e15d65bb96b05e94f1 Mon Sep 17 00:00:00 2001
From: WinstonLiye <1957922024@qq.com>
Date: Mon, 25 Nov 2024 09:11:09 +0000
Subject: [PATCH 3/5] fix a bug in the result storing of factor costeer

---
 rdagent/components/coder/factor_coder/factor.py  |  5 ++++-
 ...late.txt => factor_execution_template_v1.txt} |  0
 .../factor_execution_template_v2.txt             | 16 ++++++++++++++++
 3 files changed, 20 insertions(+), 1 deletion(-)
 rename rdagent/components/coder/factor_coder/{factor_execution_template.txt => factor_execution_template_v1.txt} (100%)
 create mode 100644 rdagent/components/coder/factor_coder/factor_execution_template_v2.txt

diff --git a/rdagent/components/coder/factor_coder/factor.py b/rdagent/components/coder/factor_coder/factor.py
index edb7a9ce..1467bdc0 100644
--- a/rdagent/components/coder/factor_coder/factor.py
+++ b/rdagent/components/coder/factor_coder/factor.py
@@ -147,7 +147,10 @@ def execute(self, data_type: str = "Debug") -> Tuple[str, pd.DataFrame]:
                 execution_code_path = code_path
             elif self.target_task.version == 2:
                 execution_code_path = self.workspace_path / f"{uuid.uuid4()}.py"
-                execution_code_path.write_text((Path(__file__).parent / "factor_execution_template.txt").read_text())
+                if KAGGLE_IMPLEMENT_SETTING.template_path == "rdagent/scenarios/kaggle/tpl_ex":
+                    execution_code_path.write_text((Path(__file__).parent / "factor_execution_template_v2.txt").read_text())
+                else:
+                    execution_code_path.write_text((Path(__file__).parent / "factor_execution_template_v1.txt").read_text())
 
             try:
                 subprocess.check_output(
diff --git a/rdagent/components/coder/factor_coder/factor_execution_template.txt b/rdagent/components/coder/factor_coder/factor_execution_template_v1.txt
similarity index 100%
rename from rdagent/components/coder/factor_coder/factor_execution_template.txt
rename to rdagent/components/coder/factor_coder/factor_execution_template_v1.txt
diff --git a/rdagent/components/coder/factor_coder/factor_execution_template_v2.txt b/rdagent/components/coder/factor_coder/factor_execution_template_v2.txt
new file mode 100644
index 00000000..397171b6
--- /dev/null
+++ b/rdagent/components/coder/factor_coder/factor_execution_template_v2.txt
@@ -0,0 +1,16 @@
+import os
+import h5py
+import numpy as np
+import pandas as pd
+from feat01 import feat_eng
+
+if os.path.exists("X.pkl"):
+    X = pd.read_pickle("X.pkl")
+    y = pd.read_pickle("y.pkl")
+else:
+    raise FileNotFoundError("No valid data found.")
+
+X, y, p = feat_eng(X, y)
+
+with h5py.File('result.h5', 'w') as hf:
+    hf.create_dataset('default', data=X)

From 2d588b7b20202be399cfad4f98db22b7226d89e9 Mon Sep 17 00:00:00 2001
From: Bowen Xian <xianbowen@outlook.com>
Date: Mon, 25 Nov 2024 09:27:23 +0000
Subject: [PATCH 4/5] fix train.py to main.py for refactored kaggle template

---
 rdagent/components/coder/factor_coder/factor.py  |  8 ++++++--
 rdagent/scenarios/kaggle/experiment/scenario.py  |  7 ++++++-
 rdagent/scenarios/kaggle/experiment/workspace.py | 12 ++++++++++--
 3 files changed, 22 insertions(+), 5 deletions(-)

diff --git a/rdagent/components/coder/factor_coder/factor.py b/rdagent/components/coder/factor_coder/factor.py
index 1467bdc0..9e4fcd95 100644
--- a/rdagent/components/coder/factor_coder/factor.py
+++ b/rdagent/components/coder/factor_coder/factor.py
@@ -148,9 +148,13 @@ def execute(self, data_type: str = "Debug") -> Tuple[str, pd.DataFrame]:
             elif self.target_task.version == 2:
                 execution_code_path = self.workspace_path / f"{uuid.uuid4()}.py"
                 if KAGGLE_IMPLEMENT_SETTING.template_path == "rdagent/scenarios/kaggle/tpl_ex":
-                    execution_code_path.write_text((Path(__file__).parent / "factor_execution_template_v2.txt").read_text())
+                    execution_code_path.write_text(
+                        (Path(__file__).parent / "factor_execution_template_v2.txt").read_text()
+                    )
                 else:
-                    execution_code_path.write_text((Path(__file__).parent / "factor_execution_template_v1.txt").read_text())
+                    execution_code_path.write_text(
+                        (Path(__file__).parent / "factor_execution_template_v1.txt").read_text()
+                    )
 
             try:
                 subprocess.check_output(
diff --git a/rdagent/scenarios/kaggle/experiment/scenario.py b/rdagent/scenarios/kaggle/experiment/scenario.py
index 5a24655a..b06b4542 100644
--- a/rdagent/scenarios/kaggle/experiment/scenario.py
+++ b/rdagent/scenarios/kaggle/experiment/scenario.py
@@ -125,7 +125,12 @@ def background(self) -> str:
         background_template = prompt_dict["kg_background"]
 
         train_script = (
-            Path(__file__).parent / f"{KAGGLE_IMPLEMENT_SETTING.competition}_template" / "train.py"
+            Path(__file__).resolve()
+            / Path(KAGGLE_IMPLEMENT_SETTING.template_path).resolve()
+            / KAGGLE_IMPLEMENT_SETTING.competition
+            / "train.py"
+            if KAGGLE_IMPLEMENT_SETTING.template_path != "rdagent/scenarios/kaggle/tpl_ex"
+            else "main.py"
         ).read_text()
 
         background_prompt = (
diff --git a/rdagent/scenarios/kaggle/experiment/workspace.py b/rdagent/scenarios/kaggle/experiment/workspace.py
index f9bd39b2..13d717e9 100644
--- a/rdagent/scenarios/kaggle/experiment/workspace.py
+++ b/rdagent/scenarios/kaggle/experiment/workspace.py
@@ -70,7 +70,11 @@ def generate_preprocess_data(
                     "others.pkl",
                 ],
                 running_extra_volume=(
-                    {KAGGLE_IMPLEMENT_SETTING.local_data_path + "/" + KAGGLE_IMPLEMENT_SETTING.competition: "/kaggle/input"}
+                    {
+                        KAGGLE_IMPLEMENT_SETTING.local_data_path
+                        + "/"
+                        + KAGGLE_IMPLEMENT_SETTING.competition: "/kaggle/input"
+                    }
                     if KAGGLE_IMPLEMENT_SETTING.competition
                     else None
                 ),
@@ -92,7 +96,11 @@ def generate_preprocess_data(
                     "others.pkl",
                 ],
                 running_extra_volume=(
-                    {KAGGLE_IMPLEMENT_SETTING.local_data_path + "/" + KAGGLE_IMPLEMENT_SETTING.competition: "/kaggle/input"}
+                    {
+                        KAGGLE_IMPLEMENT_SETTING.local_data_path
+                        + "/"
+                        + KAGGLE_IMPLEMENT_SETTING.competition: "/kaggle/input"
+                    }
                     if KAGGLE_IMPLEMENT_SETTING.competition
                     else None
                 ),

From a6a097d48e4ed2943f9b86c7546b8094444dc4dd Mon Sep 17 00:00:00 2001
From: WinstonLiye <1957922024@qq.com>
Date: Mon, 25 Nov 2024 09:40:53 +0000
Subject: [PATCH 5/5] fix a bug in factor_execution_template_v2

---
 .../coder/factor_coder/factor_execution_template_v2.txt     | 6 +++---
 rdagent/scenarios/kaggle/experiment/scenario.py             | 4 +---
 2 files changed, 4 insertions(+), 6 deletions(-)

diff --git a/rdagent/components/coder/factor_coder/factor_execution_template_v2.txt b/rdagent/components/coder/factor_coder/factor_execution_template_v2.txt
index 397171b6..656df8e8 100644
--- a/rdagent/components/coder/factor_coder/factor_execution_template_v2.txt
+++ b/rdagent/components/coder/factor_coder/factor_execution_template_v2.txt
@@ -1,5 +1,5 @@
 import os
-import h5py
+
 import numpy as np
 import pandas as pd
 from feat01 import feat_eng
@@ -12,5 +12,5 @@ else:
 
 X, y, p = feat_eng(X, y)
 
-with h5py.File('result.h5', 'w') as hf:
-    hf.create_dataset('default', data=X)
+X = pd.DataFrame(X)  # to_hdf below is a DataFrame method, so coerce feat_eng's output first
+X.to_hdf("result.h5", key="data", mode="w")
diff --git a/rdagent/scenarios/kaggle/experiment/scenario.py b/rdagent/scenarios/kaggle/experiment/scenario.py
index b06b4542..dbb4accd 100644
--- a/rdagent/scenarios/kaggle/experiment/scenario.py
+++ b/rdagent/scenarios/kaggle/experiment/scenario.py
@@ -128,9 +128,7 @@ def background(self) -> str:
             Path(__file__).resolve()
             / Path(KAGGLE_IMPLEMENT_SETTING.template_path).resolve()
             / KAGGLE_IMPLEMENT_SETTING.competition
-            / "train.py"
-            if KAGGLE_IMPLEMENT_SETTING.template_path != "rdagent/scenarios/kaggle/tpl_ex"
-            else "main.py"
+            / ("train.py" if KAGGLE_IMPLEMENT_SETTING.template_path != "rdagent/scenarios/kaggle/tpl_ex" else "main.py")
         ).read_text()
 
         background_prompt = (