From d7c3e6746d6ba18f3189b3dccadc6c8a2c9da7d9 Mon Sep 17 00:00:00 2001
From: pcvishak
Date: Wed, 9 Nov 2022 23:43:38 +0530
Subject: [PATCH] Adds code for NGBoost Algorithm

Signed-off-by: pcvishak
---
 TabSurvey/models/__init__.py             |  7 ++-
 TabSurvey/models/tree_models.py          | 81 ++++++++++++++++++++++++-
 TabSurvey/tabzilla_alg_handler.py        | 10 ++--
 TabSurvey/testall.sh                     |  5 +-
 conda_envs/gbdt.yml                      |  5 +-
 scripts/experiments/cpu_experiments.sh   |  1 +
 scripts/experiments/gpu_experiments_a.sh |  1 +
 scripts/experiments/gpu_experiments_b.sh |  1 +
 scripts/tests/all_algs_two_datasets.sh   |  1 +
 9 files changed, 101 insertions(+), 11 deletions(-)

diff --git a/TabSurvey/models/__init__.py b/TabSurvey/models/__init__.py
index 9f2033d..50df78c 100644
--- a/TabSurvey/models/__init__.py
+++ b/TabSurvey/models/__init__.py
@@ -1,6 +1,6 @@
 all_models = ["LinearModel", "KNN", "DecisionTree", "RandomForest", "XGBoost", "CatBoost", "LightGBM", "ModelTree",
               "MLP", "TabNet", "VIME", "TabTransformer", "NODE", "DeepGBM", "RLN", "DNFNet", "STG", "NAM", "DeepFM",
-              "SAINT", "DANet"]
+              "SAINT", "DANet", "NGBoost"]
 
 
 def str2model(model):
@@ -93,5 +93,10 @@ def str2model(model):
         from models.danet import DANet
         return DANet
 
+    elif model == "NGBoost":
+        from models.tree_models import NGBoost
+        return NGBoost
+
+
     else:
         raise NotImplementedError("Model \"" + model + "\" not yet implemented")
diff --git a/TabSurvey/models/tree_models.py b/TabSurvey/models/tree_models.py
index d98ad3e..03535f2 100755
--- a/TabSurvey/models/tree_models.py
+++ b/TabSurvey/models/tree_models.py
@@ -4,7 +4,9 @@
 import lightgbm as lgb
 import numpy as np
 import xgboost as xgb
-
+from ngboost import NGBRegressor
+from ngboost import NGBClassifier
+from ngboost.distns import k_categorical
 from models.basemodel import BaseModel
 
 """
@@ -103,6 +105,81 @@ def default_parameters(cls):
         }
         return params
 
+class NGBoost(BaseModel):
+
+    # TabZilla: add default number of boosting rounds
+    # default_epochs = 500
+
+    def __init__(self, params, args):
+        super().__init__(params, args)
+        self.params["verbosity"] = 1
+
+        if args.objective == "regression":
+            self.params["objective"] = "regression"
+            self.params["metric"] = "mse"
+        elif args.objective == "classification":
+            self.params["objective"] = "multiclass"
+            self.params["num_class"] = args.num_classes
+            self.params["metric"] = "multiclass"
+        elif args.objective == "binary":
+            self.params["objective"] = "binary"
+            self.params["metric"] = "auc"
+
+    def fit(self, X, y, X_val=None, y_val=None):
+        # Pass only NGBoost constructor hyperparameters; self.params also carries
+        # bookkeeping keys (objective/metric/num_class/verbosity) that the
+        # NGBRegressor/NGBClassifier constructors do not accept.
+        hyperparams = {
+            key: self.params[key]
+            for key in ("learning_rate", "n_estimators", "minibatch_frac", "col_sample")
+            if key in self.params
+        }
+        if self.args.objective == "regression":
+            self.model = NGBRegressor(**hyperparams).fit(X, y, X_val=X_val, Y_val=y_val)
+        elif self.args.objective == "classification":
+            self.model = NGBClassifier(Dist=k_categorical(self.args.num_classes), **hyperparams).fit(X, y, X_val=X_val, Y_val=y_val)
+        else:
+            self.model = NGBClassifier(Dist=k_categorical(2), **hyperparams).fit(X, y, X_val=X_val, Y_val=y_val)
+        return [], []
+    def predict(self, X):
+        return super().predict(X)
+    def predict_proba(self, X):
+        # NGBClassifier.predict returns hard class labels; predict_proba is needed
+        # so downstream AUC / log-loss / argmax logic receives class probabilities.
+        probabilities = self.model.predict_proba(X)
+        self.prediction_probabilities = probabilities
+        return self.prediction_probabilities
+
+    @classmethod
+    def define_trial_parameters(cls, trial, args):
+        params = {
+            "learning_rate": trial.suggest_float("learning_rate", 0.01, 0.3, log=True),
+            "n_estimators": trial.suggest_int("n_estimators", 100, 250, log=True),
+            "minibatch_frac": trial.suggest_float("minibatch_frac", 0.4, 0.8, log=True),
+            "col_sample": trial.suggest_float("col_sample", 0.3, 0.7, log=True),
+        }
+        return params
+    @classmethod
+    def get_random_parameters(cls, seed):
+        rs = np.random.RandomState(seed)
+        params = {
+            "learning_rate": 3.0 * np.power(10, rs.uniform(-2, -1)),
+            "n_estimators": int(np.round(50 * rs.uniform(1,5))),
+            "minibatch_frac": rs.uniform(0.4, 0.8),
+            "col_sample": rs.uniform(0.3, 0.7)
+        }
+        return params
+
+    @classmethod
+    def default_parameters(cls):
+        params = {
+            "learning_rate": 0.08,
+            "n_estimators": 100,
+            "minibatch_frac": 0.5,
+            "col_sample": 0.5,
+        }
+        return params
+
 """
 CatBoost (https://catboost.ai/)
 
@@ -274,3 +351,5 @@ def default_parameters(cls):
         "learning_rate": 0.08,
     }
     return params
+
+
diff --git a/TabSurvey/tabzilla_alg_handler.py b/TabSurvey/tabzilla_alg_handler.py
index 69625b8..5ed2ded 100644
--- a/TabSurvey/tabzilla_alg_handler.py
+++ b/TabSurvey/tabzilla_alg_handler.py
@@ -83,6 +83,11 @@ def get_CatBoost():
     return model
 
 
+@register_model("gbdt")
+def get_NGBoost():
+    from models.tree_models import NGBoost as model
+
+    return model
 
 @register_model("gbdt")
 def get_LightGBM():
@@ -90,7 +95,6 @@ def get_LightGBM():
     return model
 
 
-
 @register_model("gbdt")
 def get_ModelTree():
     from models.modeltree import ModelTree as model
@@ -178,10 +182,6 @@ def get_DANet():
     return model
 
 
-@register_model("torch")
-def get_Hopular():
-    from models.hopular_model import HopularModel as model
-
 
 ##############################################################
 # tensorflow models
diff --git a/TabSurvey/testall.sh b/TabSurvey/testall.sh
index 76432a5..eeedc85 100644
--- a/TabSurvey/testall.sh
+++ b/TabSurvey/testall.sh
@@ -9,9 +9,9 @@ TORCH_ENV="torch"
 KERAS_ENV="tensorflow"
 
 # "LinearModel" "KNN" "DecisionTree" "RandomForest"
-# "XGBoost" "CatBoost" "LightGBM"
+# "XGBoost" "CatBoost" "NGBoost" "LightGBM"
 # "MLP" "TabNet" "VIME"
-# MODELS=( "LinearModel" "KNN" "DecisionTree" "RandomForest" "XGBoost" "CatBoost" "LightGBM" "MLP" "TabNet" "VIME")
+# MODELS=( "LinearModel" "KNN" "DecisionTree" "RandomForest" "XGBoost" "NGBoost" "CatBoost" "LightGBM" "MLP" "TabNet" "VIME")
 
 declare -A MODELS
 MODELS=( ["LinearModel"]=$SKLEARN_ENV
@@ -20,6 +20,7 @@ MODELS=( ["LinearModel"]=$SKLEARN_ENV
          ["DecisionTree"]=$SKLEARN_ENV
          ["RandomForest"]=$SKLEARN_ENV
          ["XGBoost"]=$GBDT_ENV
+         ["NGBoost"]=$GBDT_ENV
          ["CatBoost"]=$GBDT_ENV
          ["LightGBM"]=$GBDT_ENV
          ["MLP"]=$TORCH_ENV
diff --git a/conda_envs/gbdt.yml b/conda_envs/gbdt.yml
index 60121ea..7a72030 100644
--- a/conda_envs/gbdt.yml
+++ b/conda_envs/gbdt.yml
@@ -99,15 +99,16 @@ dependencies:
       - kiwisolver==1.4.3
       - lightgbm==3.3.1
       - matplotlib==3.5.2
-      - modeltrees==0.1.1
+      #- modeltrees==0.1.1
      - numpy==1.23.0
      - pandas==1.4.3
      - pillow==9.1.1
      - plotly==5.9.0
-      - python-graphviz==0.20
+      #- python-graphviz==0.20
      - scikit-learn==1.1.1
      - scipy==1.8.1
      - tenacity==8.0.1
      - threadpoolctl==3.1.0
      - xgboost==1.5.0
+      - ngboost==0.3.13
 prefix: /opt/conda/envs/gbdt
diff --git a/scripts/experiments/cpu_experiments.sh b/scripts/experiments/cpu_experiments.sh
index 87bfefd..69b7643 100644
--- a/scripts/experiments/cpu_experiments.sh
+++ b/scripts/experiments/cpu_experiments.sh
@@ -17,6 +17,7 @@ MODELS_ENVS=(
   RandomForest:$SKLEARN_ENV
   # XGBoost:$GBDT_ENV
   # CatBoost:$GBDT_ENV
+  # NGBoost:$GBDT_ENV
   LightGBM:$GBDT_ENV
   # MLP:$TORCH_ENV
   # ModelTree:$GBDT_ENV <- bug
diff --git a/scripts/experiments/gpu_experiments_a.sh b/scripts/experiments/gpu_experiments_a.sh
index 215fb3f..060ec89 100644
--- a/scripts/experiments/gpu_experiments_a.sh
+++ b/scripts/experiments/gpu_experiments_a.sh
@@ -18,6 +18,7 @@ MODELS_ENVS=(
   # DecisionTree:$SKLEARN_ENV
   # RandomForest:$SKLEARN_ENV
   XGBoost:$GBDT_ENV
+  NGBoost:$GBDT_ENV
   CatBoost:$GBDT_ENV
   # LightGBM:$GBDT_ENV
   MLP:$TORCH_ENV
diff --git a/scripts/experiments/gpu_experiments_b.sh b/scripts/experiments/gpu_experiments_b.sh
index 89a9c46..4e373ce 100644
--- a/scripts/experiments/gpu_experiments_b.sh
+++ b/scripts/experiments/gpu_experiments_b.sh
@@ -19,6 +19,7 @@ MODELS_ENVS=(
   # DecisionTree:$SKLEARN_ENV
   # RandomForest:$SKLEARN_ENV
   # XGBoost:$GBDT_ENV
+  # NGBoost:$GBDT_ENV
   # CatBoost:$GBDT_ENV
   # LightGBM:$GBDT_ENV
   # MLP:$TORCH_ENV
diff --git a/scripts/tests/all_algs_two_datasets.sh b/scripts/tests/all_algs_two_datasets.sh
index 8073003..bdfc219 100755
--- a/scripts/tests/all_algs_two_datasets.sh
+++ b/scripts/tests/all_algs_two_datasets.sh
@@ -16,6 +16,7 @@ MODELS_ENVS=(
   DecisionTree:$SKLEARN_ENV
   RandomForest:$SKLEARN_ENV
   XGBoost:$GBDT_ENV
+  NGBoost:$GBDT_ENV
   CatBoost:$GBDT_ENV
   LightGBM:$GBDT_ENV
   MLP:$TORCH_ENV