From b2155ddb062a270b11f4d9c63724200e03827040 Mon Sep 17 00:00:00 2001 From: neeratyoy Date: Wed, 23 Jun 2021 20:32:38 +0200 Subject: [PATCH 001/147] Adding sample RF space for tabular collection design --- hpobench/benchmarks/ml/rf_benchmark.py | 391 +++++++++++++++++++++++++ 1 file changed, 391 insertions(+) create mode 100644 hpobench/benchmarks/ml/rf_benchmark.py diff --git a/hpobench/benchmarks/ml/rf_benchmark.py b/hpobench/benchmarks/ml/rf_benchmark.py new file mode 100644 index 00000000..35684c00 --- /dev/null +++ b/hpobench/benchmarks/ml/rf_benchmark.py @@ -0,0 +1,391 @@ +import time +import openml +import numpy as np +import pandas as pd +import ConfigSpace as CS +from typing import Union, Dict + +from sklearn.impute import SimpleImputer +from sklearn.pipeline import make_pipeline +from sklearn.utils import check_random_state +from sklearn.compose import ColumnTransformer +from sklearn.preprocessing import OneHotEncoder +from sklearn.preprocessing import StandardScaler +from sklearn.ensemble import RandomForestClassifier +from sklearn.model_selection import train_test_split +from sklearn.metrics import accuracy_score, make_scorer + +import hpobench.util.rng_helper as rng_helper +from hpobench.abstract_benchmark import AbstractBenchmark + + +class RandomForestBenchmark(AbstractBenchmark): + _issue_tasks = [3917, 3945] + + def __init__( + self, + task_id: Union[int, None] = None, + seed: Union[int, None] = None, # Union[np.random.RandomState, int, None] = None, + valid_size: float = 0.33, + fidelity_choice: int = 1, + benchmark_type: str = "raw" + ): + self.seed = seed if seed is not None else np.random.randint(1, 10 ** 6) + self.rng = check_random_state(self.seed) + super(RandomForestBenchmark, self).__init__(rng=seed) + + self.benchmark_type = benchmark_type + self.task_id = task_id + self.valid_size = valid_size + self.accuracy_scorer = make_scorer(accuracy_score) + + # Data variables + self.train_X = None + self.valid_X = None + self.test_X = None + self.train_y = None + self.valid_y = None + self.test_y = None + self.train_idx = None + self.test_idx = None + self.task = None + self.dataset = None + self.preprocessor = None + self.lower_bound_train_size = None + self.load_data_from_openml() + + # Observation and fidelity spaces + self.fidelity_choice = fidelity_choice + self.z_cs = self.get_fidelity_space(self.seed, self.fidelity_choice) + self.x_cs = self.get_configuration_space(self.seed) + + @staticmethod + def get_configuration_space(seed=None): + """Parameter space to be optimized --- contains the hyperparameters + """ + cs = CS.ConfigurationSpace(seed=seed) + + cs.add_hyperparameters([ + CS.UniformIntegerHyperparameter( + 'max_depth', lower=1, upper=15, default_value=2, log=False + ), + CS.UniformIntegerHyperparameter( + 'min_samples_split', lower=2, upper=128, default_value=2, log=True + ), + CS.UniformFloatHyperparameter( + 'max_features', lower=0.1, upper=0.9, default_value=0.5, log=False + ), + CS.UniformIntegerHyperparameter( + 'min_samples_leaf', lower=1, upper=64, default_value=1, log=True + ), + ]) + return cs + + @staticmethod + def get_fidelity_space(seed=None, fidelity_choice=1): + """Fidelity space available --- specifies the fidelity dimensions + + If fidelity_choice is 0 + Fidelity space is the maximal fidelity, akin to a black-box function + If fidelity_choice is 1 + Fidelity space is a single fidelity, in this case the number of trees (n_estimators) + If fidelity_choice is 2 + Fidelity space is a single fidelity, in this case the fraction of dataset 
(subsample) + If fidelity_choice is >2 + Fidelity space is multi-multi fidelity, all possible fidelities + """ + z_cs = CS.ConfigurationSpace(seed=seed) + subsample_lower_bound = np.max((0.1, (0.1 or self.lower_bound_train_size))) + if fidelity_choice == 0: + # only subsample as fidelity + ntrees = CS.Constant('n_estimators', value=100) + subsample = CS.Constant('subsample', value=1) + elif fidelity_choice == 1: + # only n_estimators as fidelity + ntrees = CS.UniformIntegerHyperparameter( + 'n_estimators', lower=2, upper=100, default_value=10, log=False + ) + subsample = CS.Constant('subsample', value=1) + elif fidelity_choice == 2: + # only subsample as fidelity + ntrees = CS.Constant('n_estimators', value=100) + subsample = CS.UniformFloatHyperparameter( + 'subsample', lower=subsample_lower_bound, upper=1, default_value=0.33, log=False + ) + else: + # both n_estimators and subsample as fidelities + ntrees = CS.UniformIntegerHyperparameter( + 'n_estimators', lower=2, upper=100, default_value=10, log=False + ) + subsample = CS.UniformFloatHyperparameter( + 'subsample', lower=subsample_lower_bound, upper=1, default_value=0.33, log=False + ) + z_cs.add_hyperparameters([ntrees, subsample]) + return z_cs + + def get_config(self, size=None): + """Samples configuration(s) from the (hyper) parameter space + """ + if size is None: # return only one config + return self.x_cs.sample_configuration() + return [self.x_cs.sample_configuration() for i in range(size)] + + def get_fidelity(self, size=None): + """Samples candidate fidelities from the fidelity space + """ + if size is None: # return only one config + return self.z_cs.sample_configuration() + return [self.z_cs.sample_configuration() for i in range(size)] + + def _convert_labels(self, labels): + """Converts boolean labels (if exists) to strings + """ + label_types = list(map(lambda x: isinstance(x, bool), labels)) + if np.all(label_types): + _labels = list(map(lambda x: str(x), labels)) + if isinstance(labels, pd.Series): + labels = pd.Series(_labels, index=labels.index) + elif isinstance(labels, np.array): + labels = np.array(labels) + return labels + + def load_data_from_openml(self, valid_size=None, verbose=False): + """Fetches data from OpenML and initializes the train-validation-test data splits + + The validation set is fixed till this function is called again or explicitly altered + """ + # fetches task + self.task = openml.tasks.get_task(self.task_id, download_data=False) + # fetches dataset + self.dataset = openml.datasets.get_dataset(self.task.dataset_id, download_data=False) + if verbose: + print(self.task, '\n') + print(self.dataset, '\n') + + # loads full data + X, y, categorical_ind, feature_names = self.dataset.get_data( + target=self.task.target_name, dataset_format="dataframe" + ) + categorical_ind = np.array(categorical_ind) + (cat_idx,) = np.where(categorical_ind) + (cont_idx,) = np.where(~categorical_ind) + + # splitting dataset into train and test (10% test) + # train-test split is fixed for a task and its associated dataset + self.train_idx, self.test_idx = self.task.get_train_test_split_indices() + train_x = X.iloc[self.train_idx] + train_y = y.iloc[self.train_idx] + self.test_X = X.iloc[self.test_idx] + self.test_y = y.iloc[self.test_idx] + + # splitting training into training and validation + # validation set is fixed till this function is called again or explicitly altered + valid_size = self.valid_size if valid_size is None else valid_size + self.train_X, self.valid_X, self.train_y, self.valid_y = train_test_split( + 
train_x, train_y, test_size=valid_size, + shuffle=True, stratify=train_y, random_state=self.rng + ) + + # preprocessor to handle missing values, categorical columns encodings, + # and scaling numeric columns + self.preprocessor = make_pipeline( + ColumnTransformer([ + ( + "cat", + make_pipeline(SimpleImputer(strategy="most_frequent"), + OneHotEncoder(sparse=False, handle_unknown="ignore")), + cat_idx.tolist(), + ), + ( + "cont", + make_pipeline(SimpleImputer(strategy="median"), + StandardScaler()), + cont_idx.tolist(), + ) + ]) + ) + if verbose: + print("Shape of data pre-preprocessing: {}".format(train_X.shape)) + + # preprocessor fit only on the training set + self.train_X = self.preprocessor.fit_transform(self.train_X) + # applying preprocessor built on the training set, across validation and test splits + self.valid_X = self.preprocessor.transform(self.valid_X) + self.test_X = self.preprocessor.transform(self.test_X) + # converting boolean labels to strings + self.train_y = self._convert_labels(self.train_y) + self.valid_y = self._convert_labels(self.valid_y) + self.test_y = self._convert_labels(self.test_y) + + # Similar to (https://arxiv.org/pdf/1605.07079.pdf) + # use 10 times the number of classes as lower bound for the dataset fraction + n_classes = len(self.task.class_labels) # np.unique(self.train_y).shape[0] + self.lower_bound_train_size = (10 * n_classes) / self.train_X.shape[0] + + if verbose: + print("Shape of data post-preprocessing: {}".format(train_X.shape), "\n") + + if verbose: + print("\nTraining data (X, y): ({}, {})".format(self.train_X.shape, self.train_y.shape)) + print("Validation data (X, y): ({}, {})".format(self.valid_X.shape, self.valid_y.shape)) + print("Test data (X, y): ({}, {})".format(self.test_X.shape, self.test_y.shape)) + print("\nData loading complete!\n") + return + + def shuffle_data_idx(self, train_id=None, ng=None): + rng = self.rng if rng is None else rng + train_idx = self.train_idx if train_idx is None else train_idx + rng.shuffle(train_idx) + return train_idx + + def _raw_objective(self, config, fidelity, shuffle, rng, eval="valid"): + start = time.time() + + # initializing model + model = RandomForestClassifier( + **config.get_dictionary(), + n_estimators=fidelity['n_estimators'], # a fidelity being used during initialization + bootstrap=True, + random_state=self.rng + ) + + # preparing data + if eval == "valid": + train_X = self.train_X + train_y = self.train_y + train_idx = self.train_idx + else: + train_X = np.vstack((self.train_X, self.valid_X)) + train_y = pd.concat((self.train_y, self.valid_y)) + train_idx = np.arange(len(train_X)) + + # shuffling data + if shuffle: + train_idx = self.shuffle_data_idx(train_idx, rng) + train_X = train_X.iloc[train_idx] + train_y = train_y.iloc[train_idx] + + # subsample here + # application of the other fidelity to the dataset that the model interfaces + train_idx = self.rng.choice( + np.arange(len(train_X)), size=int( + fidelity['subsample'] * len(train_X) + ) + ) + # fitting the model with subsampled data + model.fit(train_X[train_idx], train_y.iloc[train_idx]) + # computing statistics on training data + train_loss = 1 - self.accuracy_scorer(model, train_X, train_y) + + model_fit_time = time.time() - start + return model, model_fit_time, train_loss + + def objective( + self, + configuration: Union[CS.Configuration, Dict], + fidelity: Union[CS.Configuration, Dict, None] = None, + shuffle: bool = False, + rng: Union[np.random.RandomState, int, None] = None, + **kwargs + ) -> Dict: + """Function that 
evaluates a 'config' on a 'fidelity' on the validation set + """ + if self.benchmark_type == "raw": + model, model_fit_time, train_loss = self._raw_objective( + configuration, fidelity, shuffle, rng + ) + else: + #TODO: add cases for `tabular` and `surrogate` benchmarks + pass + + start = time.time() + val_loss = 1 - self.accuracy_scorer(model, self.valid_X, self.valid_y) + eval_time = time.time() - start + + info = { + 'train_loss': train_loss, + 'val_loss': val_loss, + 'cost': model_fit_time + eval_time, + 'training_cost': model_fit_time, + 'evaluation_cost': eval_time, + # storing as dictionary and not ConfigSpace saves tremendous memory + 'fidelity': fidelity.get_dictionary(), + 'config': configuration.get_dictionary() + } + + return { + 'function_value': info['val_loss'], + 'cost': info['cost'], + 'info': info + } + + def objective_test( + self, + configuration: Union[CS.Configuration, Dict], + fidelity: Union[CS.Configuration, Dict, None] = None, + shuffle: bool = False, + rng: Union[np.random.RandomState, int, None] = None, + **kwargs + ) -> Dict: + """Function that evaluates a 'config' on a 'fidelity' on the test set + """ + if self.benchmark_type == "raw": + model, model_fit_time, train_loss = self._raw_objective( + configuration, fidelity, shuffle, rng, eval="test" + ) + else: + #TODO: add cases for `tabular` and `surrogate` benchmarks + pass + + start = time.time() + val_loss = 1 - self.accuracy_scorer(model, self.test_X, self.test_y) + eval_time = time.time() - start + + info = { + 'train_loss': train_loss, + 'val_loss': val_loss, + 'cost': model_fit_time + eval_time, + 'training_cost': model_fit_time, + 'evaluation_cost': eval_time, + # storing as dictionary and not ConfigSpace saves tremendous memory + 'fidelity': fidelity.get_dictionary(), + 'config': configuration.get_dictionary() + } + + return { + 'function_value': info['val_loss'], + 'cost': info['cost'], + 'info': info + } + + # pylint: disable=arguments-differ + @AbstractBenchmark.check_parameters + def objective_function( + self, + configuration: Union[CS.Configuration, Dict], + fidelity: Union[CS.Configuration, Dict, None] = None, + shuffle: bool = False, + rng: Union[np.random.RandomState, int, None] = None, + **kwargs + ) -> Dict: + """Function that evaluates a 'config' on a 'fidelity' on the validation set + """ + return dict() + + # pylint: disable=arguments-differ + @AbstractBenchmark.check_parameters + def objective_function_test( + self, + configuration: Union[CS.Configuration, Dict], + fidelity: Union[CS.Configuration, Dict, None] = None, + shuffle: bool = False, + rng: Union[np.random.RandomState, int, None] = None, + **kwargs + ) -> Dict: + """Function that evaluates a 'config' on a 'fidelity' on the test set + """ + return dict() + + def get_meta_information(self): + """ Returns the meta information for the benchmark """ + pass From ce405e6bc43ea926c3786e1564a1e9b61d3754a3 Mon Sep 17 00:00:00 2001 From: neeratyoy Date: Wed, 23 Jun 2021 20:57:37 +0200 Subject: [PATCH 002/147] Placeholder SVM benchmark to interface tabular data collection --- hpobench/benchmarks/ml/svm_benchmark_2.py | 371 ++++++++++++++++++++++ 1 file changed, 371 insertions(+) create mode 100644 hpobench/benchmarks/ml/svm_benchmark_2.py diff --git a/hpobench/benchmarks/ml/svm_benchmark_2.py b/hpobench/benchmarks/ml/svm_benchmark_2.py new file mode 100644 index 00000000..6e8ec6c9 --- /dev/null +++ b/hpobench/benchmarks/ml/svm_benchmark_2.py @@ -0,0 +1,371 @@ +import time +import openml +import numpy as np +import pandas as pd +import 
ConfigSpace as CS +from typing import Union, Dict + +from sklearn.svm import SVC +from sklearn.impute import SimpleImputer +from sklearn.utils import check_random_state +from sklearn.compose import ColumnTransformer +from sklearn.preprocessing import OneHotEncoder +from sklearn.preprocessing import StandardScaler +from sklearn.model_selection import train_test_split +from sklearn.pipeline import make_pipeline, Pipeline +from sklearn.metrics import accuracy_score, make_scorer + +import hpobench.util.rng_helper as rng_helper +from hpobench.abstract_benchmark import AbstractBenchmark + + +class SVMBenchmark(AbstractBenchmark): + _issue_tasks = [3917, 3945] + + def __init__( + self, + task_id: Union[int, None] = None, + seed: Union[int, None] = None, # Union[np.random.RandomState, int, None] = None, + valid_size: float = 0.33, + fidelity_choice: int = 1, + benchmark_type: str = "raw" + ): + self.seed = seed if seed is not None else np.random.randint(1, 10 ** 6) + self.rng = check_random_state(self.seed) + super(SVMBenchmark, self).__init__(rng=seed) + + self.benchmark_type = benchmark_type + self.task_id = task_id + self.valid_size = valid_size + self.accuracy_scorer = make_scorer(accuracy_score) + #TODO: check the cache_size parameter from sklearn docs + self.cache_size = 200 + + # Data variables + self.train_X = None + self.valid_X = None + self.test_X = None + self.train_y = None + self.valid_y = None + self.test_y = None + self.train_idx = None + self.test_idx = None + self.task = None + self.dataset = None + self.preprocessor = None + self.lower_bound_train_size = None + self.load_data_from_openml() + + # Observation and fidelity spaces + self.fidelity_choice = fidelity_choice + self.z_cs = self.get_fidelity_space(self.seed, self.fidelity_choice) + self.x_cs = self.get_configuration_space(self.seed) + + @staticmethod + def get_configuration_space(seed=None): + """Parameter space to be optimized --- contains the hyperparameters + """ + cs = CS.ConfigurationSpace(seed=seed) + + cs.add_hyperparameters([ + CS.UniformFloatHyperparameter( + 'C', lower=-10., upper=10., default_value=0., log=False + ), + CS.UniformFloatHyperparameter( + 'gamma', lower=-10., upper=10., default_value=1., log=False + ), + ]) + return cs + + @staticmethod + def get_fidelity_space(seed=None, fidelity_choice=None): + """Fidelity space available --- specifies the fidelity dimensions + + For SVM, only a single fidelity exists, i.e., subsample fraction. 
+ if fidelity_choice == 0 + uses the entire data (subsample=1), reflecting the black-box setup + else + parameterizes the fraction of data to subsample + + """ + z_cs = CS.ConfigurationSpace(seed=seed) + subsample_lower_bound = np.max((0.1, (0.1 or self.lower_bound_train_size))) + if fidelity_choice == 0: + subsample = CS.Constant('subsample', value=1) + else: + subsample = CS.UniformFloatHyperparameter( + 'subsample', lower=subsample_lower_bound, upper=1, default_value=0.33, log=False + ) + z_cs.add_hyperparameter(subsample) + return z_cs + + def get_config(self, size=None): + """Samples configuration(s) from the (hyper) parameter space + """ + if size is None: # return only one config + return self.x_cs.sample_configuration() + return [self.x_cs.sample_configuration() for i in range(size)] + + def get_fidelity(self, size=None): + """Samples candidate fidelities from the fidelity space + """ + if size is None: # return only one config + return self.z_cs.sample_configuration() + return [self.z_cs.sample_configuration() for i in range(size)] + + def _convert_labels(self, labels): + """Converts boolean labels (if exists) to strings + """ + label_types = list(map(lambda x: isinstance(x, bool), labels)) + if np.all(label_types): + _labels = list(map(lambda x: str(x), labels)) + if isinstance(labels, pd.Series): + labels = pd.Series(_labels, index=labels.index) + elif isinstance(labels, np.array): + labels = np.array(labels) + return labels + + def load_data_from_openml(self, valid_size=None, verbose=False): + """Fetches data from OpenML and initializes the train-validation-test data splits + + The validation set is fixed till this function is called again or explicitly altered + """ + # fetches task + self.task = openml.tasks.get_task(self.task_id, download_data=False) + # fetches dataset + self.dataset = openml.datasets.get_dataset(self.task.dataset_id, download_data=False) + if verbose: + print(self.task, '\n') + print(self.dataset, '\n') + + # loads full data + X, y, categorical_ind, feature_names = self.dataset.get_data( + target=self.task.target_name, dataset_format="dataframe" + ) + categorical_ind = np.array(categorical_ind) + (cat_idx,) = np.where(categorical_ind) + (cont_idx,) = np.where(~categorical_ind) + + # splitting dataset into train and test (10% test) + # train-test split is fixed for a task and its associated dataset + self.train_idx, self.test_idx = self.task.get_train_test_split_indices() + train_x = X.iloc[self.train_idx] + train_y = y.iloc[self.train_idx] + self.test_X = X.iloc[self.test_idx] + self.test_y = y.iloc[self.test_idx] + + # splitting training into training and validation + # validation set is fixed till this function is called again or explicitly altered + valid_size = self.valid_size if valid_size is None else valid_size + self.train_X, self.valid_X, self.train_y, self.valid_y = train_test_split( + train_x, train_y, test_size=valid_size, + shuffle=True, stratify=train_y, random_state=self.rng + ) + + # preprocessor to handle missing values, categorical columns encodings, + # and scaling numeric columns + self.preprocessor = make_pipeline( + ColumnTransformer([ + ( + "cat", + make_pipeline(SimpleImputer(strategy="most_frequent"), + OneHotEncoder(sparse=False, handle_unknown="ignore")), + cat_idx.tolist(), + ), + ( + "cont", + make_pipeline(SimpleImputer(strategy="median"), + StandardScaler()), + cont_idx.tolist(), + ) + ]) + ) + if verbose: + print("Shape of data pre-preprocessing: {}".format(train_X.shape)) + + # preprocessor fit only on the training set + 
self.train_X = self.preprocessor.fit_transform(self.train_X) + # applying preprocessor built on the training set, across validation and test splits + self.valid_X = self.preprocessor.transform(self.valid_X) + self.test_X = self.preprocessor.transform(self.test_X) + # converting boolean labels to strings + self.train_y = self._convert_labels(self.train_y) + self.valid_y = self._convert_labels(self.valid_y) + self.test_y = self._convert_labels(self.test_y) + + # Similar to (https://arxiv.org/pdf/1605.07079.pdf) + # use 10 times the number of classes as lower bound for the dataset fraction + n_classes = len(self.task.class_labels) # np.unique(self.train_y).shape[0] + self.lower_bound_train_size = (10 * n_classes) / self.train_X.shape[0] + + if verbose: + print("Shape of data post-preprocessing: {}".format(train_X.shape), "\n") + + if verbose: + print("\nTraining data (X, y): ({}, {})".format(self.train_X.shape, self.train_y.shape)) + print("Validation data (X, y): ({}, {})".format(self.valid_X.shape, self.valid_y.shape)) + print("Test data (X, y): ({}, {})".format(self.test_X.shape, self.test_y.shape)) + print("\nData loading complete!\n") + return + + def shuffle_data_idx(self, train_id=None, ng=None): + rng = self.rng if rng is None else rng + train_idx = self.train_idx if train_idx is None else train_idx + rng.shuffle(train_idx) + return train_idx + + def _raw_objective(self, config, fidelity, shuffle, rng, eval="valid"): + start = time.time() + + # initializing model + rng = self.rng if rng is None else rng + config = config.get_dictionary() + for k, v in config.items(): + config[k] = np.exp(float(v)) + model = SVC( + **config, + random_state=rng, + cache_size=self.cache_size + + ) + + # preparing data + if eval == "valid": + train_X = self.train_X + train_y = self.train_y + train_idx = self.train_idx + else: + train_X = np.vstack((self.train_X, self.valid_X)) + train_y = pd.concat((self.train_y, self.valid_y)) + train_idx = np.arange(len(train_X)) + + # shuffling data + if shuffle: + train_idx = self.shuffle_data_idx(train_idx, rng) + train_X = train_X.iloc[train_idx] + train_y = train_y.iloc[train_idx] + + # subsample here + # application of the other fidelity to the dataset that the model interfaces + train_idx = self.rng.choice( + np.arange(len(train_X)), size=int( + fidelity['subsample'] * len(train_X) + ) + ) + # fitting the model with subsampled data + model.fit(train_X[train_idx], train_y.iloc[train_idx]) + # computing statistics on training data + train_loss = 1 - self.accuracy_scorer(model, train_X, train_y) + + model_fit_time = time.time() - start + return model, model_fit_time, train_loss + + def objective( + self, + configuration: Union[CS.Configuration, Dict], + fidelity: Union[CS.Configuration, Dict, None] = None, + shuffle: bool = False, + rng: Union[np.random.RandomState, int, None] = None, + **kwargs + ) -> Dict: + """Function that evaluates a 'config' on a 'fidelity' on the validation set + """ + if self.benchmark_type == "raw": + model, model_fit_time, train_loss = self._raw_objective( + configuration, fidelity, shuffle, rng + ) + else: + #TODO: add cases for `tabular` and `surrogate` benchmarks + pass + + start = time.time() + val_loss = 1 - self.accuracy_scorer(model, self.valid_X, self.valid_y) + eval_time = time.time() - start + + info = { + 'train_loss': train_loss, + 'val_loss': val_loss, + 'cost': model_fit_time + eval_time, + 'training_cost': model_fit_time, + 'evaluation_cost': eval_time, + # storing as dictionary and not ConfigSpace saves tremendous memory + 
'fidelity': fidelity.get_dictionary(), + 'config': configuration.get_dictionary() + } + + return { + 'function_value': info['val_loss'], + 'cost': info['cost'], + 'info': info + } + + def objective_test( + self, + configuration: Union[CS.Configuration, Dict], + fidelity: Union[CS.Configuration, Dict, None] = None, + shuffle: bool = False, + rng: Union[np.random.RandomState, int, None] = None, + **kwargs + ) -> Dict: + """Function that evaluates a 'config' on a 'fidelity' on the test set + """ + if self.benchmark_type == "raw": + model, model_fit_time, train_loss = self._raw_objective( + configuration, fidelity, shuffle, rng, eval="test" + ) + else: + #TODO: add cases for `tabular` and `surrogate` benchmarks + pass + + start = time.time() + val_loss = 1 - self.accuracy_scorer(model, self.test_X, self.test_y) + eval_time = time.time() - start + + info = { + 'train_loss': train_loss, + 'val_loss': val_loss, + 'cost': model_fit_time + eval_time, + 'training_cost': model_fit_time, + 'evaluation_cost': eval_time, + # storing as dictionary and not ConfigSpace saves tremendous memory + 'fidelity': fidelity.get_dictionary(), + 'config': configuration.get_dictionary() + } + + return { + 'function_value': info['val_loss'], + 'cost': info['cost'], + 'info': info + } + + # pylint: disable=arguments-differ + @AbstractBenchmark.check_parameters + def objective_function( + self, + configuration: Union[CS.Configuration, Dict], + fidelity: Union[CS.Configuration, Dict, None] = None, + shuffle: bool = False, + rng: Union[np.random.RandomState, int, None] = None, + **kwargs + ) -> Dict: + """Function that evaluates a 'config' on a 'fidelity' on the validation set + """ + return dict() + + # pylint: disable=arguments-differ + @AbstractBenchmark.check_parameters + def objective_function_test( + self, + configuration: Union[CS.Configuration, Dict], + fidelity: Union[CS.Configuration, Dict, None] = None, + shuffle: bool = False, + rng: Union[np.random.RandomState, int, None] = None, + **kwargs + ) -> Dict: + """Function that evaluates a 'config' on a 'fidelity' on the test set + """ + return dict() + + def get_meta_information(self): + """ Returns the meta information for the benchmark """ + pass From 2ef3af8019bf6ab2531610b8299f57bcb1148ef7 Mon Sep 17 00:00:00 2001 From: neeratyoy Date: Thu, 24 Jun 2021 14:38:56 +0200 Subject: [PATCH 003/147] Writing common ML benchmark class for tabular collection --- .../benchmarks/ml/ml_benchmark_template.py | 347 ++++++++++++++++++ hpobench/benchmarks/ml/rf_benchmark.py | 300 +-------------- hpobench/benchmarks/ml/svm_benchmark_2.py | 315 +--------------- 3 files changed, 376 insertions(+), 586 deletions(-) create mode 100644 hpobench/benchmarks/ml/ml_benchmark_template.py diff --git a/hpobench/benchmarks/ml/ml_benchmark_template.py b/hpobench/benchmarks/ml/ml_benchmark_template.py new file mode 100644 index 00000000..0891f0fe --- /dev/null +++ b/hpobench/benchmarks/ml/ml_benchmark_template.py @@ -0,0 +1,347 @@ +import time +import openml +import numpy as np +import pandas as pd +import ConfigSpace as CS +from typing import Union, Dict + +from sklearn.impute import SimpleImputer +from sklearn.pipeline import make_pipeline +from sklearn.utils import check_random_state +from sklearn.compose import ColumnTransformer +from sklearn.preprocessing import OneHotEncoder +from sklearn.preprocessing import StandardScaler +from sklearn.model_selection import train_test_split +from sklearn.metrics import accuracy_score, make_scorer + +import hpobench.util.rng_helper as rng_helper +from 
hpobench.abstract_benchmark import AbstractBenchmark + + +class Benchmark(AbstractBenchmark): + _issue_tasks = [3917, 3945] + + def __init__( + self, + task_id: Union[int, None] = None, + seed: Union[int, None] = None, # Union[np.random.RandomState, int, None] = None, + valid_size: float = 0.33, + fidelity_choice: int = 1, + benchmark_type: str = "raw" + ): + self.seed = seed if seed is not None else np.random.randint(1, 10 ** 6) + self.rng = check_random_state(self.seed) + super(Benchmark, self).__init__(rng=seed) + + self.benchmark_type = benchmark_type + self.task_id = task_id + self.valid_size = valid_size + self.accuracy_scorer = make_scorer(accuracy_score) + + # Data variables + self.train_X = None + self.valid_X = None + self.test_X = None + self.train_y = None + self.valid_y = None + self.test_y = None + self.train_idx = None + self.test_idx = None + self.task = None + self.dataset = None + self.preprocessor = None + self.lower_bound_train_size = None + self.load_data_from_openml() + + # Observation and fidelity spaces + self.fidelity_choice = fidelity_choice + self.z_cs = self.get_fidelity_space(self.seed, self.fidelity_choice) + self.x_cs = self.get_configuration_space(self.seed) + + @staticmethod + def get_configuration_space(seed=None): + """Parameter space to be optimized --- contains the hyperparameters + """ + raise NotImplementedError() + + @staticmethod + def get_fidelity_space(seed=None, fidelity_choice=1): + """Fidelity space available --- specifies the fidelity dimensions + + If fidelity_choice is 0 + Fidelity space is the maximal fidelity, akin to a black-box function + If fidelity_choice is 1 + Fidelity space is a single fidelity, in this case the number of trees (n_estimators) + If fidelity_choice is 2 + Fidelity space is a single fidelity, in this case the fraction of dataset (subsample) + If fidelity_choice is >2 + Fidelity space is multi-multi fidelity, all possible fidelities + """ + raise NotImplementedError() + + def get_config(self, size=None): + """Samples configuration(s) from the (hyper) parameter space + """ + if size is None: # return only one config + return self.x_cs.sample_configuration() + return [self.x_cs.sample_configuration() for i in range(size)] + + def get_fidelity(self, size=None): + """Samples candidate fidelities from the fidelity space + """ + if size is None: # return only one config + return self.z_cs.sample_configuration() + return [self.z_cs.sample_configuration() for i in range(size)] + + def _convert_labels(self, labels): + """Converts boolean labels (if exists) to strings + """ + label_types = list(map(lambda x: isinstance(x, bool), labels)) + if np.all(label_types): + _labels = list(map(lambda x: str(x), labels)) + if isinstance(labels, pd.Series): + labels = pd.Series(_labels, index=labels.index) + elif isinstance(labels, np.array): + labels = np.array(labels) + return labels + + def load_data_from_openml(self, valid_size=None, verbose=False): + """Fetches data from OpenML and initializes the train-validation-test data splits + + The validation set is fixed till this function is called again or explicitly altered + """ + # fetches task + self.task = openml.tasks.get_task(self.task_id, download_data=False) + # fetches dataset + self.dataset = openml.datasets.get_dataset(self.task.dataset_id, download_data=False) + if verbose: + print(self.task, '\n') + print(self.dataset, '\n') + + # loads full data + X, y, categorical_ind, feature_names = self.dataset.get_data( + target=self.task.target_name, dataset_format="dataframe" + ) + 
categorical_ind = np.array(categorical_ind) + (cat_idx,) = np.where(categorical_ind) + (cont_idx,) = np.where(~categorical_ind) + + # splitting dataset into train and test (10% test) + # train-test split is fixed for a task and its associated dataset + self.train_idx, self.test_idx = self.task.get_train_test_split_indices() + train_x = X.iloc[self.train_idx] + train_y = y.iloc[self.train_idx] + self.test_X = X.iloc[self.test_idx] + self.test_y = y.iloc[self.test_idx] + + # splitting training into training and validation + # validation set is fixed till this function is called again or explicitly altered + valid_size = self.valid_size if valid_size is None else valid_size + self.train_X, self.valid_X, self.train_y, self.valid_y = train_test_split( + train_x, train_y, test_size=valid_size, + shuffle=True, stratify=train_y, random_state=self.rng + ) + + # preprocessor to handle missing values, categorical columns encodings, + # and scaling numeric columns + self.preprocessor = make_pipeline( + ColumnTransformer([ + ( + "cat", + make_pipeline(SimpleImputer(strategy="most_frequent"), + OneHotEncoder(sparse=False, handle_unknown="ignore")), + cat_idx.tolist(), + ), + ( + "cont", + make_pipeline(SimpleImputer(strategy="median"), + StandardScaler()), + cont_idx.tolist(), + ) + ]) + ) + if verbose: + print("Shape of data pre-preprocessing: {}".format(train_X.shape)) + + # preprocessor fit only on the training set + self.train_X = self.preprocessor.fit_transform(self.train_X) + # applying preprocessor built on the training set, across validation and test splits + self.valid_X = self.preprocessor.transform(self.valid_X) + self.test_X = self.preprocessor.transform(self.test_X) + # converting boolean labels to strings + self.train_y = self._convert_labels(self.train_y) + self.valid_y = self._convert_labels(self.valid_y) + self.test_y = self._convert_labels(self.test_y) + + # Similar to (https://arxiv.org/pdf/1605.07079.pdf) + # use 10 times the number of classes as lower bound for the dataset fraction + n_classes = len(self.task.class_labels) # np.unique(self.train_y).shape[0] + self.lower_bound_train_size = (10 * n_classes) / self.train_X.shape[0] + + if verbose: + print("Shape of data post-preprocessing: {}".format(train_X.shape), "\n") + + if verbose: + print("\nTraining data (X, y): ({}, {})".format(self.train_X.shape, self.train_y.shape)) + print("Validation data (X, y): ({}, {})".format(self.valid_X.shape, self.valid_y.shape)) + print("Test data (X, y): ({}, {})".format(self.test_X.shape, self.test_y.shape)) + print("\nData loading complete!\n") + return + + def shuffle_data_idx(self, train_id=None, ng=None): + rng = self.rng if rng is None else rng + train_idx = self.train_idx if train_idx is None else train_idx + rng.shuffle(train_idx) + return train_idx + + def init_model(self, config, fidelity=None, rng=None): + """ Function that returns the model initialized based on the configuration and fidelity + """ + raise NotImplementedError() + + def _raw_objective(self, config, fidelity, shuffle, rng, eval="valid"): + start = time.time() + + # initializing model + model = self.init_model(config, fidelity, rng) + + # preparing data + if eval == "valid": + train_X = self.train_X + train_y = self.train_y + train_idx = self.train_idx + else: + train_X = np.vstack((self.train_X, self.valid_X)) + train_y = pd.concat((self.train_y, self.valid_y)) + train_idx = np.arange(len(train_X)) + + # shuffling data + if shuffle: + train_idx = self.shuffle_data_idx(train_idx, rng) + train_X = train_X.iloc[train_idx] + 
train_y = train_y.iloc[train_idx] + + # subsample here: + # application of the other fidelity to the dataset that the model interfaces + train_idx = self.rng.choice( + np.arange(len(train_X)), size=int( + fidelity['subsample'] * len(train_X) + ) + ) + # fitting the model with subsampled data + model.fit(train_X[train_idx], train_y.iloc[train_idx]) + # computing statistics on training data + train_loss = 1 - self.accuracy_scorer(model, train_X, train_y) + + model_fit_time = time.time() - start + return model, model_fit_time, train_loss + + def objective( + self, + configuration: Union[CS.Configuration, Dict], + fidelity: Union[CS.Configuration, Dict, None] = None, + shuffle: bool = False, + rng: Union[np.random.RandomState, int, None] = None, + **kwargs + ) -> Dict: + """Function that evaluates a 'config' on a 'fidelity' on the validation set + """ + if self.benchmark_type == "raw": + model, model_fit_time, train_loss = self._raw_objective( + configuration, fidelity, shuffle, rng + ) + else: + #TODO: add cases for `tabular` and `surrogate` benchmarks + pass + + start = time.time() + val_loss = 1 - self.accuracy_scorer(model, self.valid_X, self.valid_y) + eval_time = time.time() - start + + info = { + 'train_loss': train_loss, + 'val_loss': val_loss, + 'cost': model_fit_time + eval_time, + 'training_cost': model_fit_time, + 'evaluation_cost': eval_time, + # storing as dictionary and not ConfigSpace saves tremendous memory + 'fidelity': fidelity.get_dictionary(), + 'config': configuration.get_dictionary() + } + + return { + 'function_value': info['val_loss'], + 'cost': info['cost'], + 'info': info + } + + def objective_test( + self, + configuration: Union[CS.Configuration, Dict], + fidelity: Union[CS.Configuration, Dict, None] = None, + shuffle: bool = False, + rng: Union[np.random.RandomState, int, None] = None, + **kwargs + ) -> Dict: + """Function that evaluates a 'config' on a 'fidelity' on the test set + """ + if self.benchmark_type == "raw": + model, model_fit_time, train_loss = self._raw_objective( + configuration, fidelity, shuffle, rng, eval="test" + ) + else: + #TODO: add cases for `tabular` and `surrogate` benchmarks + pass + + start = time.time() + val_loss = 1 - self.accuracy_scorer(model, self.test_X, self.test_y) + eval_time = time.time() - start + + info = { + 'train_loss': train_loss, + 'val_loss': val_loss, + 'cost': model_fit_time + eval_time, + 'training_cost': model_fit_time, + 'evaluation_cost': eval_time, + # storing as dictionary and not ConfigSpace saves tremendous memory + 'fidelity': fidelity.get_dictionary(), + 'config': configuration.get_dictionary() + } + + return { + 'function_value': info['val_loss'], + 'cost': info['cost'], + 'info': info + } + + # pylint: disable=arguments-differ + @AbstractBenchmark.check_parameters + def objective_function( + self, + configuration: Union[CS.Configuration, Dict], + fidelity: Union[CS.Configuration, Dict, None] = None, + shuffle: bool = False, + rng: Union[np.random.RandomState, int, None] = None, + **kwargs + ) -> Dict: + """Function that evaluates a 'config' on a 'fidelity' on the validation set + """ + return dict() + + # pylint: disable=arguments-differ + @AbstractBenchmark.check_parameters + def objective_function_test( + self, + configuration: Union[CS.Configuration, Dict], + fidelity: Union[CS.Configuration, Dict, None] = None, + shuffle: bool = False, + rng: Union[np.random.RandomState, int, None] = None, + **kwargs + ) -> Dict: + """Function that evaluates a 'config' on a 'fidelity' on the test set + """ + return 
dict() + + def get_meta_information(self): + """ Returns the meta information for the benchmark """ + pass diff --git a/hpobench/benchmarks/ml/rf_benchmark.py b/hpobench/benchmarks/ml/rf_benchmark.py index 35684c00..be08b938 100644 --- a/hpobench/benchmarks/ml/rf_benchmark.py +++ b/hpobench/benchmarks/ml/rf_benchmark.py @@ -17,11 +17,10 @@ import hpobench.util.rng_helper as rng_helper from hpobench.abstract_benchmark import AbstractBenchmark +from hpobench.benchmarks.ml.ml_benchmark_template import Benchmark -class RandomForestBenchmark(AbstractBenchmark): - _issue_tasks = [3917, 3945] - +class RandomForestBenchmark(Benchmark): def __init__( self, task_id: Union[int, None] = None, @@ -30,34 +29,10 @@ def __init__( fidelity_choice: int = 1, benchmark_type: str = "raw" ): - self.seed = seed if seed is not None else np.random.randint(1, 10 ** 6) - self.rng = check_random_state(self.seed) - super(RandomForestBenchmark, self).__init__(rng=seed) - - self.benchmark_type = benchmark_type - self.task_id = task_id - self.valid_size = valid_size - self.accuracy_scorer = make_scorer(accuracy_score) - - # Data variables - self.train_X = None - self.valid_X = None - self.test_X = None - self.train_y = None - self.valid_y = None - self.test_y = None - self.train_idx = None - self.test_idx = None - self.task = None - self.dataset = None - self.preprocessor = None - self.lower_bound_train_size = None - self.load_data_from_openml() - - # Observation and fidelity spaces - self.fidelity_choice = fidelity_choice - self.z_cs = self.get_fidelity_space(self.seed, self.fidelity_choice) - self.x_cs = self.get_configuration_space(self.seed) + super(RandomForestBenchmark, self).__init__( + task_id, seed, valid_size, fidelity_choice, benchmark_type + ) + pass @staticmethod def get_configuration_space(seed=None): @@ -123,269 +98,14 @@ def get_fidelity_space(seed=None, fidelity_choice=1): z_cs.add_hyperparameters([ntrees, subsample]) return z_cs - def get_config(self, size=None): - """Samples configuration(s) from the (hyper) parameter space - """ - if size is None: # return only one config - return self.x_cs.sample_configuration() - return [self.x_cs.sample_configuration() for i in range(size)] - - def get_fidelity(self, size=None): - """Samples candidate fidelities from the fidelity space + def init_model(self, config, fidelity=None, rng=None): + """ Function that returns the model initialized based on the configuration and fidelity """ - if size is None: # return only one config - return self.z_cs.sample_configuration() - return [self.z_cs.sample_configuration() for i in range(size)] - - def _convert_labels(self, labels): - """Converts boolean labels (if exists) to strings - """ - label_types = list(map(lambda x: isinstance(x, bool), labels)) - if np.all(label_types): - _labels = list(map(lambda x: str(x), labels)) - if isinstance(labels, pd.Series): - labels = pd.Series(_labels, index=labels.index) - elif isinstance(labels, np.array): - labels = np.array(labels) - return labels - - def load_data_from_openml(self, valid_size=None, verbose=False): - """Fetches data from OpenML and initializes the train-validation-test data splits - - The validation set is fixed till this function is called again or explicitly altered - """ - # fetches task - self.task = openml.tasks.get_task(self.task_id, download_data=False) - # fetches dataset - self.dataset = openml.datasets.get_dataset(self.task.dataset_id, download_data=False) - if verbose: - print(self.task, '\n') - print(self.dataset, '\n') - - # loads full data - X, y, 
categorical_ind, feature_names = self.dataset.get_data( - target=self.task.target_name, dataset_format="dataframe" - ) - categorical_ind = np.array(categorical_ind) - (cat_idx,) = np.where(categorical_ind) - (cont_idx,) = np.where(~categorical_ind) - - # splitting dataset into train and test (10% test) - # train-test split is fixed for a task and its associated dataset - self.train_idx, self.test_idx = self.task.get_train_test_split_indices() - train_x = X.iloc[self.train_idx] - train_y = y.iloc[self.train_idx] - self.test_X = X.iloc[self.test_idx] - self.test_y = y.iloc[self.test_idx] - - # splitting training into training and validation - # validation set is fixed till this function is called again or explicitly altered - valid_size = self.valid_size if valid_size is None else valid_size - self.train_X, self.valid_X, self.train_y, self.valid_y = train_test_split( - train_x, train_y, test_size=valid_size, - shuffle=True, stratify=train_y, random_state=self.rng - ) - - # preprocessor to handle missing values, categorical columns encodings, - # and scaling numeric columns - self.preprocessor = make_pipeline( - ColumnTransformer([ - ( - "cat", - make_pipeline(SimpleImputer(strategy="most_frequent"), - OneHotEncoder(sparse=False, handle_unknown="ignore")), - cat_idx.tolist(), - ), - ( - "cont", - make_pipeline(SimpleImputer(strategy="median"), - StandardScaler()), - cont_idx.tolist(), - ) - ]) - ) - if verbose: - print("Shape of data pre-preprocessing: {}".format(train_X.shape)) - - # preprocessor fit only on the training set - self.train_X = self.preprocessor.fit_transform(self.train_X) - # applying preprocessor built on the training set, across validation and test splits - self.valid_X = self.preprocessor.transform(self.valid_X) - self.test_X = self.preprocessor.transform(self.test_X) - # converting boolean labels to strings - self.train_y = self._convert_labels(self.train_y) - self.valid_y = self._convert_labels(self.valid_y) - self.test_y = self._convert_labels(self.test_y) - - # Similar to (https://arxiv.org/pdf/1605.07079.pdf) - # use 10 times the number of classes as lower bound for the dataset fraction - n_classes = len(self.task.class_labels) # np.unique(self.train_y).shape[0] - self.lower_bound_train_size = (10 * n_classes) / self.train_X.shape[0] - - if verbose: - print("Shape of data post-preprocessing: {}".format(train_X.shape), "\n") - - if verbose: - print("\nTraining data (X, y): ({}, {})".format(self.train_X.shape, self.train_y.shape)) - print("Validation data (X, y): ({}, {})".format(self.valid_X.shape, self.valid_y.shape)) - print("Test data (X, y): ({}, {})".format(self.test_X.shape, self.test_y.shape)) - print("\nData loading complete!\n") - return - - def shuffle_data_idx(self, train_id=None, ng=None): rng = self.rng if rng is None else rng - train_idx = self.train_idx if train_idx is None else train_idx - rng.shuffle(train_idx) - return train_idx - - def _raw_objective(self, config, fidelity, shuffle, rng, eval="valid"): - start = time.time() - - # initializing model model = RandomForestClassifier( **config.get_dictionary(), n_estimators=fidelity['n_estimators'], # a fidelity being used during initialization bootstrap=True, - random_state=self.rng - ) - - # preparing data - if eval == "valid": - train_X = self.train_X - train_y = self.train_y - train_idx = self.train_idx - else: - train_X = np.vstack((self.train_X, self.valid_X)) - train_y = pd.concat((self.train_y, self.valid_y)) - train_idx = np.arange(len(train_X)) - - # shuffling data - if shuffle: - train_idx = 
self.shuffle_data_idx(train_idx, rng) - train_X = train_X.iloc[train_idx] - train_y = train_y.iloc[train_idx] - - # subsample here - # application of the other fidelity to the dataset that the model interfaces - train_idx = self.rng.choice( - np.arange(len(train_X)), size=int( - fidelity['subsample'] * len(train_X) - ) + random_state=rng ) - # fitting the model with subsampled data - model.fit(train_X[train_idx], train_y.iloc[train_idx]) - # computing statistics on training data - train_loss = 1 - self.accuracy_scorer(model, train_X, train_y) - - model_fit_time = time.time() - start - return model, model_fit_time, train_loss - - def objective( - self, - configuration: Union[CS.Configuration, Dict], - fidelity: Union[CS.Configuration, Dict, None] = None, - shuffle: bool = False, - rng: Union[np.random.RandomState, int, None] = None, - **kwargs - ) -> Dict: - """Function that evaluates a 'config' on a 'fidelity' on the validation set - """ - if self.benchmark_type == "raw": - model, model_fit_time, train_loss = self._raw_objective( - configuration, fidelity, shuffle, rng - ) - else: - #TODO: add cases for `tabular` and `surrogate` benchmarks - pass - - start = time.time() - val_loss = 1 - self.accuracy_scorer(model, self.valid_X, self.valid_y) - eval_time = time.time() - start - - info = { - 'train_loss': train_loss, - 'val_loss': val_loss, - 'cost': model_fit_time + eval_time, - 'training_cost': model_fit_time, - 'evaluation_cost': eval_time, - # storing as dictionary and not ConfigSpace saves tremendous memory - 'fidelity': fidelity.get_dictionary(), - 'config': configuration.get_dictionary() - } - - return { - 'function_value': info['val_loss'], - 'cost': info['cost'], - 'info': info - } - - def objective_test( - self, - configuration: Union[CS.Configuration, Dict], - fidelity: Union[CS.Configuration, Dict, None] = None, - shuffle: bool = False, - rng: Union[np.random.RandomState, int, None] = None, - **kwargs - ) -> Dict: - """Function that evaluates a 'config' on a 'fidelity' on the test set - """ - if self.benchmark_type == "raw": - model, model_fit_time, train_loss = self._raw_objective( - configuration, fidelity, shuffle, rng, eval="test" - ) - else: - #TODO: add cases for `tabular` and `surrogate` benchmarks - pass - - start = time.time() - val_loss = 1 - self.accuracy_scorer(model, self.test_X, self.test_y) - eval_time = time.time() - start - - info = { - 'train_loss': train_loss, - 'val_loss': val_loss, - 'cost': model_fit_time + eval_time, - 'training_cost': model_fit_time, - 'evaluation_cost': eval_time, - # storing as dictionary and not ConfigSpace saves tremendous memory - 'fidelity': fidelity.get_dictionary(), - 'config': configuration.get_dictionary() - } - - return { - 'function_value': info['val_loss'], - 'cost': info['cost'], - 'info': info - } - - # pylint: disable=arguments-differ - @AbstractBenchmark.check_parameters - def objective_function( - self, - configuration: Union[CS.Configuration, Dict], - fidelity: Union[CS.Configuration, Dict, None] = None, - shuffle: bool = False, - rng: Union[np.random.RandomState, int, None] = None, - **kwargs - ) -> Dict: - """Function that evaluates a 'config' on a 'fidelity' on the validation set - """ - return dict() - - # pylint: disable=arguments-differ - @AbstractBenchmark.check_parameters - def objective_function_test( - self, - configuration: Union[CS.Configuration, Dict], - fidelity: Union[CS.Configuration, Dict, None] = None, - shuffle: bool = False, - rng: Union[np.random.RandomState, int, None] = None, - **kwargs - ) -> 
Dict: - """Function that evaluates a 'config' on a 'fidelity' on the test set - """ - return dict() - - def get_meta_information(self): - """ Returns the meta information for the benchmark """ - pass + return model diff --git a/hpobench/benchmarks/ml/svm_benchmark_2.py b/hpobench/benchmarks/ml/svm_benchmark_2.py index 6e8ec6c9..13076040 100644 --- a/hpobench/benchmarks/ml/svm_benchmark_2.py +++ b/hpobench/benchmarks/ml/svm_benchmark_2.py @@ -17,11 +17,10 @@ import hpobench.util.rng_helper as rng_helper from hpobench.abstract_benchmark import AbstractBenchmark +from hpobench.benchmarks.ml.ml_benchmark_template import Benchmark -class SVMBenchmark(AbstractBenchmark): - _issue_tasks = [3917, 3945] - +class SVMBenchmark(Benchmark): def __init__( self, task_id: Union[int, None] = None, @@ -30,50 +29,33 @@ def __init__( fidelity_choice: int = 1, benchmark_type: str = "raw" ): - self.seed = seed if seed is not None else np.random.randint(1, 10 ** 6) - self.rng = check_random_state(self.seed) - super(SVMBenchmark, self).__init__(rng=seed) - - self.benchmark_type = benchmark_type - self.task_id = task_id - self.valid_size = valid_size - self.accuracy_scorer = make_scorer(accuracy_score) - #TODO: check the cache_size parameter from sklearn docs + super(SVMBenchmark, self).__init__( + task_id, seed, valid_size, fidelity_choice, benchmark_type + ) self.cache_size = 200 - # Data variables - self.train_X = None - self.valid_X = None - self.test_X = None - self.train_y = None - self.valid_y = None - self.test_y = None - self.train_idx = None - self.test_idx = None - self.task = None - self.dataset = None - self.preprocessor = None - self.lower_bound_train_size = None - self.load_data_from_openml() - - # Observation and fidelity spaces - self.fidelity_choice = fidelity_choice - self.z_cs = self.get_fidelity_space(self.seed, self.fidelity_choice) - self.x_cs = self.get_configuration_space(self.seed) - @staticmethod def get_configuration_space(seed=None): """Parameter space to be optimized --- contains the hyperparameters """ cs = CS.ConfigurationSpace(seed=seed) + # cs.add_hyperparameters([ + # CS.UniformFloatHyperparameter( + # 'C', lower=-10., upper=10., default_value=0., log=False + # ), + # CS.UniformFloatHyperparameter( + # 'gamma', lower=-10., upper=10., default_value=1., log=False + # ), + # ]) + # from https://github.com/automl/auto-sklearn/blob/master/autosklearn/pipeline/components/classification/libsvm_svc.p cs.add_hyperparameters([ CS.UniformFloatHyperparameter( - 'C', lower=-10., upper=10., default_value=0., log=False + "C", 0.03125, 32768, log=True, default_value=1.0 ), CS.UniformFloatHyperparameter( - 'gamma', lower=-10., upper=10., default_value=1., log=False - ), + "gamma", 3.0517578125e-05, 8, log=True, default_value=0.1 + ) ]) return cs @@ -99,273 +81,14 @@ def get_fidelity_space(seed=None, fidelity_choice=None): z_cs.add_hyperparameter(subsample) return z_cs - def get_config(self, size=None): - """Samples configuration(s) from the (hyper) parameter space - """ - if size is None: # return only one config - return self.x_cs.sample_configuration() - return [self.x_cs.sample_configuration() for i in range(size)] - - def get_fidelity(self, size=None): - """Samples candidate fidelities from the fidelity space - """ - if size is None: # return only one config - return self.z_cs.sample_configuration() - return [self.z_cs.sample_configuration() for i in range(size)] - - def _convert_labels(self, labels): - """Converts boolean labels (if exists) to strings - """ - label_types = list(map(lambda x: 
isinstance(x, bool), labels)) - if np.all(label_types): - _labels = list(map(lambda x: str(x), labels)) - if isinstance(labels, pd.Series): - labels = pd.Series(_labels, index=labels.index) - elif isinstance(labels, np.array): - labels = np.array(labels) - return labels - - def load_data_from_openml(self, valid_size=None, verbose=False): - """Fetches data from OpenML and initializes the train-validation-test data splits - - The validation set is fixed till this function is called again or explicitly altered - """ - # fetches task - self.task = openml.tasks.get_task(self.task_id, download_data=False) - # fetches dataset - self.dataset = openml.datasets.get_dataset(self.task.dataset_id, download_data=False) - if verbose: - print(self.task, '\n') - print(self.dataset, '\n') - - # loads full data - X, y, categorical_ind, feature_names = self.dataset.get_data( - target=self.task.target_name, dataset_format="dataframe" - ) - categorical_ind = np.array(categorical_ind) - (cat_idx,) = np.where(categorical_ind) - (cont_idx,) = np.where(~categorical_ind) - - # splitting dataset into train and test (10% test) - # train-test split is fixed for a task and its associated dataset - self.train_idx, self.test_idx = self.task.get_train_test_split_indices() - train_x = X.iloc[self.train_idx] - train_y = y.iloc[self.train_idx] - self.test_X = X.iloc[self.test_idx] - self.test_y = y.iloc[self.test_idx] - - # splitting training into training and validation - # validation set is fixed till this function is called again or explicitly altered - valid_size = self.valid_size if valid_size is None else valid_size - self.train_X, self.valid_X, self.train_y, self.valid_y = train_test_split( - train_x, train_y, test_size=valid_size, - shuffle=True, stratify=train_y, random_state=self.rng - ) - - # preprocessor to handle missing values, categorical columns encodings, - # and scaling numeric columns - self.preprocessor = make_pipeline( - ColumnTransformer([ - ( - "cat", - make_pipeline(SimpleImputer(strategy="most_frequent"), - OneHotEncoder(sparse=False, handle_unknown="ignore")), - cat_idx.tolist(), - ), - ( - "cont", - make_pipeline(SimpleImputer(strategy="median"), - StandardScaler()), - cont_idx.tolist(), - ) - ]) - ) - if verbose: - print("Shape of data pre-preprocessing: {}".format(train_X.shape)) - - # preprocessor fit only on the training set - self.train_X = self.preprocessor.fit_transform(self.train_X) - # applying preprocessor built on the training set, across validation and test splits - self.valid_X = self.preprocessor.transform(self.valid_X) - self.test_X = self.preprocessor.transform(self.test_X) - # converting boolean labels to strings - self.train_y = self._convert_labels(self.train_y) - self.valid_y = self._convert_labels(self.valid_y) - self.test_y = self._convert_labels(self.test_y) - - # Similar to (https://arxiv.org/pdf/1605.07079.pdf) - # use 10 times the number of classes as lower bound for the dataset fraction - n_classes = len(self.task.class_labels) # np.unique(self.train_y).shape[0] - self.lower_bound_train_size = (10 * n_classes) / self.train_X.shape[0] - - if verbose: - print("Shape of data post-preprocessing: {}".format(train_X.shape), "\n") - - if verbose: - print("\nTraining data (X, y): ({}, {})".format(self.train_X.shape, self.train_y.shape)) - print("Validation data (X, y): ({}, {})".format(self.valid_X.shape, self.valid_y.shape)) - print("Test data (X, y): ({}, {})".format(self.test_X.shape, self.test_y.shape)) - print("\nData loading complete!\n") - return - - def 
shuffle_data_idx(self, train_id=None, ng=None): - rng = self.rng if rng is None else rng - train_idx = self.train_idx if train_idx is None else train_idx - rng.shuffle(train_idx) - return train_idx - - def _raw_objective(self, config, fidelity, shuffle, rng, eval="valid"): - start = time.time() - + def init_model(self, config, fidelity=None, rng=None): # initializing model rng = self.rng if rng is None else rng config = config.get_dictionary() - for k, v in config.items(): - config[k] = np.exp(float(v)) model = SVC( **config, random_state=rng, cache_size=self.cache_size ) - - # preparing data - if eval == "valid": - train_X = self.train_X - train_y = self.train_y - train_idx = self.train_idx - else: - train_X = np.vstack((self.train_X, self.valid_X)) - train_y = pd.concat((self.train_y, self.valid_y)) - train_idx = np.arange(len(train_X)) - - # shuffling data - if shuffle: - train_idx = self.shuffle_data_idx(train_idx, rng) - train_X = train_X.iloc[train_idx] - train_y = train_y.iloc[train_idx] - - # subsample here - # application of the other fidelity to the dataset that the model interfaces - train_idx = self.rng.choice( - np.arange(len(train_X)), size=int( - fidelity['subsample'] * len(train_X) - ) - ) - # fitting the model with subsampled data - model.fit(train_X[train_idx], train_y.iloc[train_idx]) - # computing statistics on training data - train_loss = 1 - self.accuracy_scorer(model, train_X, train_y) - - model_fit_time = time.time() - start - return model, model_fit_time, train_loss - - def objective( - self, - configuration: Union[CS.Configuration, Dict], - fidelity: Union[CS.Configuration, Dict, None] = None, - shuffle: bool = False, - rng: Union[np.random.RandomState, int, None] = None, - **kwargs - ) -> Dict: - """Function that evaluates a 'config' on a 'fidelity' on the validation set - """ - if self.benchmark_type == "raw": - model, model_fit_time, train_loss = self._raw_objective( - configuration, fidelity, shuffle, rng - ) - else: - #TODO: add cases for `tabular` and `surrogate` benchmarks - pass - - start = time.time() - val_loss = 1 - self.accuracy_scorer(model, self.valid_X, self.valid_y) - eval_time = time.time() - start - - info = { - 'train_loss': train_loss, - 'val_loss': val_loss, - 'cost': model_fit_time + eval_time, - 'training_cost': model_fit_time, - 'evaluation_cost': eval_time, - # storing as dictionary and not ConfigSpace saves tremendous memory - 'fidelity': fidelity.get_dictionary(), - 'config': configuration.get_dictionary() - } - - return { - 'function_value': info['val_loss'], - 'cost': info['cost'], - 'info': info - } - - def objective_test( - self, - configuration: Union[CS.Configuration, Dict], - fidelity: Union[CS.Configuration, Dict, None] = None, - shuffle: bool = False, - rng: Union[np.random.RandomState, int, None] = None, - **kwargs - ) -> Dict: - """Function that evaluates a 'config' on a 'fidelity' on the test set - """ - if self.benchmark_type == "raw": - model, model_fit_time, train_loss = self._raw_objective( - configuration, fidelity, shuffle, rng, eval="test" - ) - else: - #TODO: add cases for `tabular` and `surrogate` benchmarks - pass - - start = time.time() - val_loss = 1 - self.accuracy_scorer(model, self.test_X, self.test_y) - eval_time = time.time() - start - - info = { - 'train_loss': train_loss, - 'val_loss': val_loss, - 'cost': model_fit_time + eval_time, - 'training_cost': model_fit_time, - 'evaluation_cost': eval_time, - # storing as dictionary and not ConfigSpace saves tremendous memory - 'fidelity': fidelity.get_dictionary(), 
- 'config': configuration.get_dictionary() - } - - return { - 'function_value': info['val_loss'], - 'cost': info['cost'], - 'info': info - } - - # pylint: disable=arguments-differ - @AbstractBenchmark.check_parameters - def objective_function( - self, - configuration: Union[CS.Configuration, Dict], - fidelity: Union[CS.Configuration, Dict, None] = None, - shuffle: bool = False, - rng: Union[np.random.RandomState, int, None] = None, - **kwargs - ) -> Dict: - """Function that evaluates a 'config' on a 'fidelity' on the validation set - """ - return dict() - - # pylint: disable=arguments-differ - @AbstractBenchmark.check_parameters - def objective_function_test( - self, - configuration: Union[CS.Configuration, Dict], - fidelity: Union[CS.Configuration, Dict, None] = None, - shuffle: bool = False, - rng: Union[np.random.RandomState, int, None] = None, - **kwargs - ) -> Dict: - """Function that evaluates a 'config' on a 'fidelity' on the test set - """ - return dict() - - def get_meta_information(self): - """ Returns the meta information for the benchmark """ - pass + return model From 61b6963ba0a7ed67a36e22941d64c01dd72f4d46 Mon Sep 17 00:00:00 2001 From: neeratyoy Date: Thu, 24 Jun 2021 18:19:33 +0200 Subject: [PATCH 004/147] Adding placeholder for HistGradientBoostedClassifier --- hpobench/benchmarks/ml/histgb_benchmark.py | 125 +++++++++++++++++++++ hpobench/benchmarks/ml/rf_benchmark.py | 5 +- 2 files changed, 127 insertions(+), 3 deletions(-) create mode 100644 hpobench/benchmarks/ml/histgb_benchmark.py diff --git a/hpobench/benchmarks/ml/histgb_benchmark.py b/hpobench/benchmarks/ml/histgb_benchmark.py new file mode 100644 index 00000000..11e7af4a --- /dev/null +++ b/hpobench/benchmarks/ml/histgb_benchmark.py @@ -0,0 +1,125 @@ +import time +import openml +import numpy as np +import pandas as pd +import ConfigSpace as CS +from copy import deepcopy +from typing import Union, Dict + +from sklearn.impute import SimpleImputer +from sklearn.pipeline import make_pipeline +from sklearn.utils import check_random_state +from sklearn.compose import ColumnTransformer +from sklearn.preprocessing import OneHotEncoder +from sklearn.preprocessing import StandardScaler +from sklearn.model_selection import train_test_split +from sklearn.metrics import accuracy_score, make_scorer + +# https://scikit-learn.org/stable/modules/generated/sklearn.ensemble.HistGradientBoostingClassifier.html +from sklearn.experimental import enable_hist_gradient_boosting # noqa +from sklearn.ensemble import HistGradientBoostingClassifier + +import hpobench.util.rng_helper as rng_helper +from hpobench.benchmarks.ml.ml_benchmark_template import Benchmark + + +class HistGBBenchmark(Benchmark): + def __init__( + self, + task_id: Union[int, None] = None, + seed: Union[int, None] = None, # Union[np.random.RandomState, int, None] = None, + valid_size: float = 0.33, + fidelity_choice: int = 1, + benchmark_type: str = "raw" + ): + super(HistGBBenchmark, self).__init__( + task_id, seed, valid_size, fidelity_choice, benchmark_type + ) + pass + + @staticmethod + def get_configuration_space(seed=None): + """Parameter space to be optimized --- contains the hyperparameters + """ + cs = CS.ConfigurationSpace(seed=seed) + + cs.add_hyperparameters([ + CS.UniformIntegerHyperparameter( + 'max_depth', lower=1, upper=15, default_value=2, log=False + ), + CS.UniformIntegerHyperparameter( + 'min_samples_leaf', lower=1, upper=64, default_value=1, log=True + ), + CS.UniformFloatHyperparameter( + 'learning_rate', lower=1e-5, upper=1e-1, default_value=0.1, 
log=False + ), + #TODO: find best way to encode l2 reg. since log params cannot have 0 as exact bound + # scales the regularization parameter by using it as a power of 10 + # such that the range of the parameter becomes {0, 1e-7, 1e-6, ..., 1e-1} + # where 10 ** 0 is enforced to be 0 (no regularization) + CS.UniformIntegerHyperparameter( + 'l2_regularization', lower=-7, upper=0, default_value=0, log=False + ) # value of 1 indicates 0 regularization + ]) + return cs + + @staticmethod + def get_fidelity_space(seed=None, fidelity_choice=1): + """Fidelity space available --- specifies the fidelity dimensions + + If fidelity_choice is 0 + Fidelity space is the maximal fidelity, akin to a black-box function + If fidelity_choice is 1 + Fidelity space is a single fidelity, in this case the number of trees (n_estimators) + If fidelity_choice is 2 + Fidelity space is a single fidelity, in this case the fraction of dataset (subsample) + If fidelity_choice is >2 + Fidelity space is multi-multi fidelity, all possible fidelities + """ + z_cs = CS.ConfigurationSpace(seed=seed) + subsample_lower_bound = np.max((0.1, (0.1 or self.lower_bound_train_size))) + if fidelity_choice == 0: + # only subsample as fidelity + ntrees = CS.Constant('n_estimators', value=100) + subsample = CS.Constant('subsample', value=1) + elif fidelity_choice == 1: + # only n_estimators as fidelity + ntrees = CS.UniformIntegerHyperparameter( + 'n_estimators', lower=2, upper=100, default_value=10, log=False + ) + subsample = CS.Constant('subsample', value=1) + elif fidelity_choice == 2: + # only subsample as fidelity + ntrees = CS.Constant('n_estimators', value=100) + subsample = CS.UniformFloatHyperparameter( + 'subsample', lower=subsample_lower_bound, upper=1, default_value=1, log=False + ) + else: + # both n_estimators and subsample as fidelities + ntrees = CS.UniformIntegerHyperparameter( + 'n_estimators', lower=2, upper=100, default_value=10, log=False + ) + subsample = CS.UniformFloatHyperparameter( + 'subsample', lower=subsample_lower_bound, upper=1, default_value=1, log=False + ) + z_cs.add_hyperparameters([ntrees, subsample]) + return z_cs + + def init_model(self, config, fidelity=None, rng=None): + """ Function that returns the model initialized based on the configuration and fidelity + """ + rng = self.rng if rng is None else rng + config = deepcopy(config).get_dictionary() + l2 = config.pop("l2_regularization") + l2 = 0 if l2 == 1 else 10 ** l2 + # TODO: decide on encoding of learning rate + #TODO: allow non-encoded categoricals? + #TODO: early stopping set to False? 
+ model = HistGradientBoostingClassifier( + **config, + l2_regularization=l2, + max_iter=fidelity['n_estimators'], # a fidelity being used during initialization + early_stopping=False, + random_state=rng + ) + return model diff --git a/hpobench/benchmarks/ml/rf_benchmark.py b/hpobench/benchmarks/ml/rf_benchmark.py index be08b938..960b8271 100644 --- a/hpobench/benchmarks/ml/rf_benchmark.py +++ b/hpobench/benchmarks/ml/rf_benchmark.py @@ -16,7 +16,6 @@ from sklearn.metrics import accuracy_score, make_scorer import hpobench.util.rng_helper as rng_helper -from hpobench.abstract_benchmark import AbstractBenchmark from hpobench.benchmarks.ml.ml_benchmark_template import Benchmark @@ -85,7 +84,7 @@ def get_fidelity_space(seed=None, fidelity_choice=1): # only subsample as fidelity ntrees = CS.Constant('n_estimators', value=100) subsample = CS.UniformFloatHyperparameter( - 'subsample', lower=subsample_lower_bound, upper=1, default_value=0.33, log=False + 'subsample', lower=subsample_lower_bound, upper=1, default_value=1, log=False ) else: # both n_estimators and subsample as fidelities @@ -93,7 +92,7 @@ def get_fidelity_space(seed=None, fidelity_choice=1): 'n_estimators', lower=2, upper=100, default_value=10, log=False ) subsample = CS.UniformFloatHyperparameter( - 'subsample', lower=subsample_lower_bound, upper=1, default_value=0.33, log=False + 'subsample', lower=subsample_lower_bound, upper=1, default_value=1, log=False ) z_cs.add_hyperparameters([ntrees, subsample]) return z_cs From a5d0217b258edff81b3b8082807c610e621ebbe6 Mon Sep 17 00:00:00 2001 From: neeratyoy Date: Thu, 24 Jun 2021 18:21:34 +0200 Subject: [PATCH 005/147] Minor code cleaning --- hpobench/benchmarks/ml/svm_benchmark_2.py | 1 - 1 file changed, 1 deletion(-) diff --git a/hpobench/benchmarks/ml/svm_benchmark_2.py b/hpobench/benchmarks/ml/svm_benchmark_2.py index 13076040..ec174748 100644 --- a/hpobench/benchmarks/ml/svm_benchmark_2.py +++ b/hpobench/benchmarks/ml/svm_benchmark_2.py @@ -16,7 +16,6 @@ from sklearn.metrics import accuracy_score, make_scorer import hpobench.util.rng_helper as rng_helper -from hpobench.abstract_benchmark import AbstractBenchmark from hpobench.benchmarks.ml.ml_benchmark_template import Benchmark From 3def203e10a499a0e5fe2ab90643e5e28a739826 Mon Sep 17 00:00:00 2001 From: neeratyoy Date: Sat, 26 Jun 2021 17:44:58 +0200 Subject: [PATCH 006/147] Reformatting output dict + option to add more metrics --- hpobench/benchmarks/ml/histgb_benchmark.py | 3 +- .../benchmarks/ml/ml_benchmark_template.py | 47 ++++++++++++++++--- hpobench/benchmarks/ml/svm_benchmark_2.py | 1 - 3 files changed, 42 insertions(+), 9 deletions(-) diff --git a/hpobench/benchmarks/ml/histgb_benchmark.py b/hpobench/benchmarks/ml/histgb_benchmark.py index 11e7af4a..769838ae 100644 --- a/hpobench/benchmarks/ml/histgb_benchmark.py +++ b/hpobench/benchmarks/ml/histgb_benchmark.py @@ -50,8 +50,9 @@ def get_configuration_space(seed=None): CS.UniformIntegerHyperparameter( 'min_samples_leaf', lower=1, upper=64, default_value=1, log=True ), + #TODO: fix lr value range error in map_to_config() CS.UniformFloatHyperparameter( - 'learning_rate', lower=1e-5, upper=1e-1, default_value=0.1, log=False + 'learning_rate', lower=1e-5, upper=1e-1, default_value=0.1, log=True ), #TODO: find best way to encode l2 reg. 
since log params cannot have 0 as exact bound # scales the regularization parameter by using it as a power of 10 diff --git a/hpobench/benchmarks/ml/ml_benchmark_template.py b/hpobench/benchmarks/ml/ml_benchmark_template.py index 0891f0fe..2b95c097 100644 --- a/hpobench/benchmarks/ml/ml_benchmark_template.py +++ b/hpobench/benchmarks/ml/ml_benchmark_template.py @@ -12,12 +12,31 @@ from sklearn.preprocessing import OneHotEncoder from sklearn.preprocessing import StandardScaler from sklearn.model_selection import train_test_split -from sklearn.metrics import accuracy_score, make_scorer +from sklearn.metrics import make_scorer, roc_auc_score, accuracy_score, f1_score, \ + top_k_accuracy_score, balanced_accuracy_score import hpobench.util.rng_helper as rng_helper from hpobench.abstract_benchmark import AbstractBenchmark +metrics = dict( + #TODO: decide on metrics generalized for different datasets + acc=accuracy_score, + bal_acc=balanced_accuracy_score, + f1=f1_score, + # roc=roc_auc_score, + # topk=top_k_accuracy_score +) +metrics_kwargs = dict( + #TODO: decide on metric parameters + acc=dict(), + bal_acc=dict(), + f1=dict(average="weighted"), + # roc=dict(average="weighted"), + # topk=dict() +) + + class Benchmark(AbstractBenchmark): _issue_tasks = [3917, 3945] @@ -36,7 +55,10 @@ def __init__( self.benchmark_type = benchmark_type self.task_id = task_id self.valid_size = valid_size - self.accuracy_scorer = make_scorer(accuracy_score) + self.scorers = dict() + for k, v in metrics.items(): + self.scorers[k] = make_scorer(v, **metrics_kwargs[k]) + # self.scorers = make_scorer(accuracy_score) # Data variables self.train_X = None @@ -231,7 +253,10 @@ def _raw_objective(self, config, fidelity, shuffle, rng, eval="valid"): # fitting the model with subsampled data model.fit(train_X[train_idx], train_y.iloc[train_idx]) # computing statistics on training data - train_loss = 1 - self.accuracy_scorer(model, train_X, train_y) + scores = dict() + for k, v in self.scorers.items(): + scores[k] = v(model, train_X, train_y) + train_loss = 1 - scores["acc"] # self.accuracy_scorer(model, train_X, train_y) model_fit_time = time.time() - start return model, model_fit_time, train_loss @@ -255,7 +280,10 @@ def objective( pass start = time.time() - val_loss = 1 - self.accuracy_scorer(model, self.valid_X, self.valid_y) + scores = dict() + for k, v in self.scorers.items(): + scores[k] = v(model, self.valid_X, self.valid_y) + val_loss = 1 - scores["acc"] # self.accuracy_scorer(model, self.valid_X, self.valid_y) eval_time = time.time() - start info = { @@ -264,6 +292,7 @@ def objective( 'cost': model_fit_time + eval_time, 'training_cost': model_fit_time, 'evaluation_cost': eval_time, + 'scores': scores, # storing as dictionary and not ConfigSpace saves tremendous memory 'fidelity': fidelity.get_dictionary(), 'config': configuration.get_dictionary() @@ -294,22 +323,26 @@ def objective_test( pass start = time.time() - val_loss = 1 - self.accuracy_scorer(model, self.test_X, self.test_y) + scores = dict() + for k, v in self.scorers.items(): + scores[k] = v(model, self.test_X, self.test_y) + test_loss = 1 - scores["acc"] # self.accuracy_scorer(model, self.test_X, self.test_y) eval_time = time.time() - start info = { 'train_loss': train_loss, - 'val_loss': val_loss, + 'test_loss': test_loss, 'cost': model_fit_time + eval_time, 'training_cost': model_fit_time, 'evaluation_cost': eval_time, + 'scores': scores, # storing as dictionary and not ConfigSpace saves tremendous memory 'fidelity': fidelity.get_dictionary(), 'config': 
configuration.get_dictionary() } return { - 'function_value': info['val_loss'], + 'function_value': info['test_loss'], 'cost': info['cost'], 'info': info } diff --git a/hpobench/benchmarks/ml/svm_benchmark_2.py b/hpobench/benchmarks/ml/svm_benchmark_2.py index ec174748..62da5bbc 100644 --- a/hpobench/benchmarks/ml/svm_benchmark_2.py +++ b/hpobench/benchmarks/ml/svm_benchmark_2.py @@ -88,6 +88,5 @@ def init_model(self, config, fidelity=None, rng=None): **config, random_state=rng, cache_size=self.cache_size - ) return model From 750cc7d1138ba9dd6f91ba9023565a3720093f8d Mon Sep 17 00:00:00 2001 From: neeratyoy Date: Mon, 28 Jun 2021 15:46:40 +0200 Subject: [PATCH 007/147] Removing redundant import --- hpobench/benchmarks/ml/histgb_benchmark.py | 1 - hpobench/benchmarks/ml/ml_benchmark_template.py | 1 - hpobench/benchmarks/ml/rf_benchmark.py | 1 - hpobench/benchmarks/ml/svm_benchmark_2.py | 1 - 4 files changed, 4 deletions(-) diff --git a/hpobench/benchmarks/ml/histgb_benchmark.py b/hpobench/benchmarks/ml/histgb_benchmark.py index 769838ae..0a0461a3 100644 --- a/hpobench/benchmarks/ml/histgb_benchmark.py +++ b/hpobench/benchmarks/ml/histgb_benchmark.py @@ -19,7 +19,6 @@ from sklearn.experimental import enable_hist_gradient_boosting # noqa from sklearn.ensemble import HistGradientBoostingClassifier -import hpobench.util.rng_helper as rng_helper from hpobench.benchmarks.ml.ml_benchmark_template import Benchmark diff --git a/hpobench/benchmarks/ml/ml_benchmark_template.py b/hpobench/benchmarks/ml/ml_benchmark_template.py index 2b95c097..55772ffc 100644 --- a/hpobench/benchmarks/ml/ml_benchmark_template.py +++ b/hpobench/benchmarks/ml/ml_benchmark_template.py @@ -15,7 +15,6 @@ from sklearn.metrics import make_scorer, roc_auc_score, accuracy_score, f1_score, \ top_k_accuracy_score, balanced_accuracy_score -import hpobench.util.rng_helper as rng_helper from hpobench.abstract_benchmark import AbstractBenchmark diff --git a/hpobench/benchmarks/ml/rf_benchmark.py b/hpobench/benchmarks/ml/rf_benchmark.py index 960b8271..96e3f48c 100644 --- a/hpobench/benchmarks/ml/rf_benchmark.py +++ b/hpobench/benchmarks/ml/rf_benchmark.py @@ -15,7 +15,6 @@ from sklearn.model_selection import train_test_split from sklearn.metrics import accuracy_score, make_scorer -import hpobench.util.rng_helper as rng_helper from hpobench.benchmarks.ml.ml_benchmark_template import Benchmark diff --git a/hpobench/benchmarks/ml/svm_benchmark_2.py b/hpobench/benchmarks/ml/svm_benchmark_2.py index 62da5bbc..2747f380 100644 --- a/hpobench/benchmarks/ml/svm_benchmark_2.py +++ b/hpobench/benchmarks/ml/svm_benchmark_2.py @@ -15,7 +15,6 @@ from sklearn.pipeline import make_pipeline, Pipeline from sklearn.metrics import accuracy_score, make_scorer -import hpobench.util.rng_helper as rng_helper from hpobench.benchmarks.ml.ml_benchmark_template import Benchmark From e7665e68fdab26e2f88d927ae87810d961232372 Mon Sep 17 00:00:00 2001 From: neeratyoy Date: Wed, 30 Jun 2021 18:23:55 +0200 Subject: [PATCH 008/147] Decoupling storage of costs for each metric --- .../benchmarks/ml/ml_benchmark_template.py | 53 ++++++++++--------- 1 file changed, 29 insertions(+), 24 deletions(-) diff --git a/hpobench/benchmarks/ml/ml_benchmark_template.py b/hpobench/benchmarks/ml/ml_benchmark_template.py index 55772ffc..7692e447 100644 --- a/hpobench/benchmarks/ml/ml_benchmark_template.py +++ b/hpobench/benchmarks/ml/ml_benchmark_template.py @@ -221,8 +221,6 @@ def init_model(self, config, fidelity=None, rng=None): raise NotImplementedError() def _raw_objective(self, 
config, fidelity, shuffle, rng, eval="valid"): - start = time.time() - # initializing model model = self.init_model(config, fidelity, rng) @@ -250,15 +248,18 @@ def _raw_objective(self, config, fidelity, shuffle, rng, eval="valid"): ) ) # fitting the model with subsampled data + start = time.time() model.fit(train_X[train_idx], train_y.iloc[train_idx]) + model_fit_time = time.time() - start # computing statistics on training data scores = dict() + score_cost = dict() for k, v in self.scorers.items(): + _start = time.time() scores[k] = v(model, train_X, train_y) + score_cost[k] = time.time() - _start train_loss = 1 - scores["acc"] # self.accuracy_scorer(model, train_X, train_y) - - model_fit_time = time.time() - start - return model, model_fit_time, train_loss + return model, model_fit_time, train_loss, scores, score_cost def objective( self, @@ -271,27 +272,29 @@ def objective( """Function that evaluates a 'config' on a 'fidelity' on the validation set """ if self.benchmark_type == "raw": - model, model_fit_time, train_loss = self._raw_objective( + model, model_fit_time, train_loss, train_scores, train_score_cost = self._raw_objective( configuration, fidelity, shuffle, rng ) else: #TODO: add cases for `tabular` and `surrogate` benchmarks - pass + pass + info['train_costs']['acc'] - start = time.time() scores = dict() + score_cost = dict() for k, v in self.scorers.items(): + _start = time.time() scores[k] = v(model, self.valid_X, self.valid_y) + score_cost[k] = time.time() - _start val_loss = 1 - scores["acc"] # self.accuracy_scorer(model, self.valid_X, self.valid_y) - eval_time = time.time() - start info = { 'train_loss': train_loss, 'val_loss': val_loss, - 'cost': model_fit_time + eval_time, - 'training_cost': model_fit_time, - 'evaluation_cost': eval_time, - 'scores': scores, + 'model_cost': model_fit_time, + 'train_scores': train_scores, + 'train_costs': train_score_cost, + 'eval_scores': scores, + 'eval_costs': score_cost, # storing as dictionary and not ConfigSpace saves tremendous memory 'fidelity': fidelity.get_dictionary(), 'config': configuration.get_dictionary() @@ -299,7 +302,7 @@ def objective( return { 'function_value': info['val_loss'], - 'cost': info['cost'], + 'cost': model_fit_time + info['train_costs']['acc'] + info['eval_costs']['acc'], 'info': info } @@ -314,27 +317,29 @@ def objective_test( """Function that evaluates a 'config' on a 'fidelity' on the test set """ if self.benchmark_type == "raw": - model, model_fit_time, train_loss = self._raw_objective( + model, model_fit_time, train_loss, train_scores, train_score_cost = self._raw_objective( configuration, fidelity, shuffle, rng, eval="test" ) else: #TODO: add cases for `tabular` and `surrogate` benchmarks pass - start = time.time() scores = dict() + score_cost = dict() for k, v in self.scorers.items(): + _start = time.time() scores[k] = v(model, self.test_X, self.test_y) - test_loss = 1 - scores["acc"] # self.accuracy_scorer(model, self.test_X, self.test_y) - eval_time = time.time() - start + score_cost[k] = time.time() - _start + test_loss = 1 - scores["acc"] # self.accuracy_scorer(model, self.valid_X, self.valid_y) info = { 'train_loss': train_loss, - 'test_loss': test_loss, - 'cost': model_fit_time + eval_time, - 'training_cost': model_fit_time, - 'evaluation_cost': eval_time, - 'scores': scores, + 'val_loss': test_loss, + 'model_cost': model_fit_time, + 'train_scores': train_scores, + 'train_costs': train_score_cost, + 'eval_scores': scores, + 'eval_costs': score_cost, # storing as dictionary and not ConfigSpace 
saves tremendous memory 'fidelity': fidelity.get_dictionary(), 'config': configuration.get_dictionary() @@ -342,7 +347,7 @@ def objective_test( return { 'function_value': info['test_loss'], - 'cost': info['cost'], + 'cost': model_fit_time + info['train_costs']['acc'] + info['eval_costs']['acc'], 'info': info } From 47fe4cdd6e466589449427cbe6a91a7da28479d0 Mon Sep 17 00:00:00 2001 From: neeratyoy Date: Thu, 1 Jul 2021 16:28:27 +0200 Subject: [PATCH 009/147] Including test scores in objective --- hpobench/benchmarks/ml/histgb_benchmark.py | 7 +- .../benchmarks/ml/ml_benchmark_template.py | 68 +++++++++---------- hpobench/benchmarks/ml/rf_benchmark.py | 7 +- hpobench/benchmarks/ml/svm_benchmark_2.py | 16 +---- 4 files changed, 39 insertions(+), 59 deletions(-) diff --git a/hpobench/benchmarks/ml/histgb_benchmark.py b/hpobench/benchmarks/ml/histgb_benchmark.py index 0a0461a3..ac273c57 100644 --- a/hpobench/benchmarks/ml/histgb_benchmark.py +++ b/hpobench/benchmarks/ml/histgb_benchmark.py @@ -28,12 +28,9 @@ def __init__( task_id: Union[int, None] = None, seed: Union[int, None] = None, # Union[np.random.RandomState, int, None] = None, valid_size: float = 0.33, - fidelity_choice: int = 1, - benchmark_type: str = "raw" + fidelity_choice: int = 1 ): - super(HistGBBenchmark, self).__init__( - task_id, seed, valid_size, fidelity_choice, benchmark_type - ) + super(HistGBBenchmark, self).__init__(task_id, seed, valid_size, fidelity_choice) pass @staticmethod diff --git a/hpobench/benchmarks/ml/ml_benchmark_template.py b/hpobench/benchmarks/ml/ml_benchmark_template.py index 7692e447..cc543b50 100644 --- a/hpobench/benchmarks/ml/ml_benchmark_template.py +++ b/hpobench/benchmarks/ml/ml_benchmark_template.py @@ -44,14 +44,12 @@ def __init__( task_id: Union[int, None] = None, seed: Union[int, None] = None, # Union[np.random.RandomState, int, None] = None, valid_size: float = 0.33, - fidelity_choice: int = 1, - benchmark_type: str = "raw" + fidelity_choice: int = 1 ): self.seed = seed if seed is not None else np.random.randint(1, 10 ** 6) self.rng = check_random_state(self.seed) super(Benchmark, self).__init__(rng=seed) - self.benchmark_type = benchmark_type self.task_id = task_id self.valid_size = valid_size self.scorers = dict() @@ -258,7 +256,7 @@ def _raw_objective(self, config, fidelity, shuffle, rng, eval="valid"): _start = time.time() scores[k] = v(model, train_X, train_y) score_cost[k] = time.time() - _start - train_loss = 1 - scores["acc"] # self.accuracy_scorer(model, train_X, train_y) + train_loss = 1 - scores["acc"] return model, model_fit_time, train_loss, scores, score_cost def objective( @@ -271,21 +269,24 @@ def objective( ) -> Dict: """Function that evaluates a 'config' on a 'fidelity' on the validation set """ - if self.benchmark_type == "raw": - model, model_fit_time, train_loss, train_scores, train_score_cost = self._raw_objective( - configuration, fidelity, shuffle, rng - ) - else: - #TODO: add cases for `tabular` and `surrogate` benchmarks - pass + info['train_costs']['acc'] + model, model_fit_time, train_loss, train_scores, train_score_cost = self._raw_objective( + configuration, fidelity, shuffle, rng + ) + val_scores = dict() + val_score_cost = dict() + for k, v in self.scorers.items(): + _start = time.time() + val_scores[k] = v(model, self.valid_X, self.valid_y) + val_score_cost[k] = time.time() - _start + val_loss = 1 - val_scores["acc"] - scores = dict() - score_cost = dict() + test_scores = dict() + test_score_cost = dict() for k, v in self.scorers.items(): _start = time.time() - 
scores[k] = v(model, self.valid_X, self.valid_y) - score_cost[k] = time.time() - _start - val_loss = 1 - scores["acc"] # self.accuracy_scorer(model, self.valid_X, self.valid_y) + test_scores[k] = v(model, self.test_X, self.test_y) + test_score_cost[k] = time.time() - _start + val_loss = 1 - test_scores["acc"] info = { 'train_loss': train_loss, @@ -293,8 +294,10 @@ def objective( 'model_cost': model_fit_time, 'train_scores': train_scores, 'train_costs': train_score_cost, - 'eval_scores': scores, - 'eval_costs': score_cost, + 'val_scores': val_scores, + 'val_costs': val_score_cost, + 'test_scores': test_scores, + 'test_costs': test_score_cost, # storing as dictionary and not ConfigSpace saves tremendous memory 'fidelity': fidelity.get_dictionary(), 'config': configuration.get_dictionary() @@ -302,7 +305,7 @@ def objective( return { 'function_value': info['val_loss'], - 'cost': model_fit_time + info['train_costs']['acc'] + info['eval_costs']['acc'], + 'cost': model_fit_time + info['train_costs']['acc'] + info['val_costs']['acc'], 'info': info } @@ -316,21 +319,16 @@ def objective_test( ) -> Dict: """Function that evaluates a 'config' on a 'fidelity' on the test set """ - if self.benchmark_type == "raw": - model, model_fit_time, train_loss, train_scores, train_score_cost = self._raw_objective( - configuration, fidelity, shuffle, rng, eval="test" - ) - else: - #TODO: add cases for `tabular` and `surrogate` benchmarks - pass - - scores = dict() - score_cost = dict() + model, model_fit_time, train_loss, train_scores, train_score_cost = self._raw_objective( + configuration, fidelity, shuffle, rng, eval="test" + ) + test_scores = dict() + test_score_cost = dict() for k, v in self.scorers.items(): _start = time.time() - scores[k] = v(model, self.test_X, self.test_y) - score_cost[k] = time.time() - _start - test_loss = 1 - scores["acc"] # self.accuracy_scorer(model, self.valid_X, self.valid_y) + test_scores[k] = v(model, self.test_X, self.test_y) + test_score_cost[k] = time.time() - _start + test_loss = 1 - test_scores["acc"] info = { 'train_loss': train_loss, @@ -338,8 +336,8 @@ def objective_test( 'model_cost': model_fit_time, 'train_scores': train_scores, 'train_costs': train_score_cost, - 'eval_scores': scores, - 'eval_costs': score_cost, + 'test_scores': test_scores, + 'test_costs': test_score_cost, # storing as dictionary and not ConfigSpace saves tremendous memory 'fidelity': fidelity.get_dictionary(), 'config': configuration.get_dictionary() @@ -347,7 +345,7 @@ def objective_test( return { 'function_value': info['test_loss'], - 'cost': model_fit_time + info['train_costs']['acc'] + info['eval_costs']['acc'], + 'cost': model_fit_time + info['train_costs']['acc'] + info['test_costs']['acc'], 'info': info } diff --git a/hpobench/benchmarks/ml/rf_benchmark.py b/hpobench/benchmarks/ml/rf_benchmark.py index 96e3f48c..7426a37a 100644 --- a/hpobench/benchmarks/ml/rf_benchmark.py +++ b/hpobench/benchmarks/ml/rf_benchmark.py @@ -24,12 +24,9 @@ def __init__( task_id: Union[int, None] = None, seed: Union[int, None] = None, # Union[np.random.RandomState, int, None] = None, valid_size: float = 0.33, - fidelity_choice: int = 1, - benchmark_type: str = "raw" + fidelity_choice: int = 1 ): - super(RandomForestBenchmark, self).__init__( - task_id, seed, valid_size, fidelity_choice, benchmark_type - ) + super(RandomForestBenchmark, self).__init__(task_id, seed, valid_size, fidelity_choice) pass @staticmethod diff --git a/hpobench/benchmarks/ml/svm_benchmark_2.py b/hpobench/benchmarks/ml/svm_benchmark_2.py index 
2747f380..12d22afa 100644 --- a/hpobench/benchmarks/ml/svm_benchmark_2.py +++ b/hpobench/benchmarks/ml/svm_benchmark_2.py @@ -24,12 +24,9 @@ def __init__( task_id: Union[int, None] = None, seed: Union[int, None] = None, # Union[np.random.RandomState, int, None] = None, valid_size: float = 0.33, - fidelity_choice: int = 1, - benchmark_type: str = "raw" + fidelity_choice: int = 1 ): - super(SVMBenchmark, self).__init__( - task_id, seed, valid_size, fidelity_choice, benchmark_type - ) + super(SVMBenchmark, self).__init__(task_id, seed, valid_size, fidelity_choice) self.cache_size = 200 @staticmethod @@ -37,15 +34,6 @@ def get_configuration_space(seed=None): """Parameter space to be optimized --- contains the hyperparameters """ cs = CS.ConfigurationSpace(seed=seed) - - # cs.add_hyperparameters([ - # CS.UniformFloatHyperparameter( - # 'C', lower=-10., upper=10., default_value=0., log=False - # ), - # CS.UniformFloatHyperparameter( - # 'gamma', lower=-10., upper=10., default_value=1., log=False - # ), - # ]) # from https://github.com/automl/auto-sklearn/blob/master/autosklearn/pipeline/components/classification/libsvm_svc.p cs.add_hyperparameters([ CS.UniformFloatHyperparameter( From 2d085ecd2d3fd083a3168b6bd861c06bbd8bfd32 Mon Sep 17 00:00:00 2001 From: neeratyoy Date: Thu, 1 Jul 2021 16:45:43 +0200 Subject: [PATCH 010/147] Documenting the structure of information in each fn eval. --- hpobench/benchmarks/ml/README.md | 29 +++++++++++++++++++++++++++++ 1 file changed, 29 insertions(+) create mode 100644 hpobench/benchmarks/ml/README.md diff --git a/hpobench/benchmarks/ml/README.md b/hpobench/benchmarks/ml/README.md new file mode 100644 index 00000000..46ad4e08 --- /dev/null +++ b/hpobench/benchmarks/ml/README.md @@ -0,0 +1,29 @@ +Each function evalution returns a dictionary with the following information: + +``` +└───function_value: 1 - accuracy (acc.) on validation set +└───cost: time to fit model + time to evaluate acc. training set + time to evaluate acc. validation set +└───info: dictionary (dict) with miscellaneous information +| └───train_loss: 1 - accuracy (acc.) on training set +| └───val_loss: 1 - accuracy (acc.) on validation set +| └───model_cost: time taken to fit the model +| └───train_scores: performance on all metrics over the training set (dict) +| | └───f1: F1-score +| | └───acc: Accuracy +| | └───bal_acc: Balanced accuracy +| └───train_costs: time taken to compute performance on all metrics over the training set (dict) +| | └───f1: F1-score +| | └───acc: Accuracy +| | └───bal_acc: Balanced accuracy +| └───valid_scores: performance on all metrics over the validation set (dict) +| | └───... +| └───valid_costs: time taken to compute performance on all metrics over the validation set (dict) +| | └───... +| └───test_scores: performance on all metrics over the test set +| | └───... +| └───test_costs: time taken to compute performance on all metrics over the test set (dict) +| | └───... +``` + +*NOTE*: the keys `function_value`, `cost`, `info` need to exist when creating a new objective +function, while `info` can house any kind of auxilliary information required. 
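A minimal usage sketch of how these fields come back to the caller, assuming the `RandomForestBenchmark` entry point and an OpenML task id that resolves (the validation objective is called `objective` at this point in the series and `objective_function` after a later rename):

```python
from hpobench.benchmarks.ml.rf_benchmark import RandomForestBenchmark

# task_id is only an example value; any valid OpenML classification task id works
benchmark = RandomForestBenchmark(task_id=167149, seed=1, fidelity_choice=1)
config = benchmark.get_config()       # sample from the hyperparameter space
fidelity = benchmark.get_fidelity()   # sample from the fidelity space

result = benchmark.objective(config, fidelity)   # `objective_function` after the later rename
print(result['function_value'])       # 1 - accuracy on the validation set
print(result['cost'])                 # model fit time + train acc. scoring + valid acc. scoring
print(result['info']['train_scores']) # {'acc': ..., 'bal_acc': ..., 'f1': ...}
```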
\ No newline at end of file From 2da9d5c02a2e413f4ab78a81bf68a6bc32495f4e Mon Sep 17 00:00:00 2001 From: neeratyoy Date: Fri, 2 Jul 2021 17:04:26 +0200 Subject: [PATCH 011/147] Some decisions on lower bound for subsample fidelity --- hpobench/benchmarks/ml/ml_benchmark_template.py | 5 +++-- hpobench/benchmarks/ml/svm_benchmark_2.py | 10 ++++++---- 2 files changed, 9 insertions(+), 6 deletions(-) diff --git a/hpobench/benchmarks/ml/ml_benchmark_template.py b/hpobench/benchmarks/ml/ml_benchmark_template.py index cc543b50..3ad61b54 100644 --- a/hpobench/benchmarks/ml/ml_benchmark_template.py +++ b/hpobench/benchmarks/ml/ml_benchmark_template.py @@ -194,8 +194,9 @@ def load_data_from_openml(self, valid_size=None, verbose=False): # Similar to (https://arxiv.org/pdf/1605.07079.pdf) # use 10 times the number of classes as lower bound for the dataset fraction - n_classes = len(self.task.class_labels) # np.unique(self.train_y).shape[0] + n_classes = len(self.task.class_labels) self.lower_bound_train_size = (10 * n_classes) / self.train_X.shape[0] + self.lower_bound_train_size = np.max((1 / 512, self.lower_bound_train_size)) if verbose: print("Shape of data post-preprocessing: {}".format(train_X.shape), "\n") @@ -332,7 +333,7 @@ def objective_test( info = { 'train_loss': train_loss, - 'val_loss': test_loss, + 'test_loss': test_loss, 'model_cost': model_fit_time, 'train_scores': train_scores, 'train_costs': train_score_cost, diff --git a/hpobench/benchmarks/ml/svm_benchmark_2.py b/hpobench/benchmarks/ml/svm_benchmark_2.py index 12d22afa..845f40e0 100644 --- a/hpobench/benchmarks/ml/svm_benchmark_2.py +++ b/hpobench/benchmarks/ml/svm_benchmark_2.py @@ -45,8 +45,8 @@ def get_configuration_space(seed=None): ]) return cs - @staticmethod - def get_fidelity_space(seed=None, fidelity_choice=None): + @classmethod + def get_fidelity_space(cls, seed=None, fidelity_choice=None): """Fidelity space available --- specifies the fidelity dimensions For SVM, only a single fidelity exists, i.e., subsample fraction. 
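To make the new lower bound on the training subsample concrete, a small numeric sketch (dataset sizes are made up for illustration):

```python
import numpy as np

# at least 10 observations per class as the training-fraction floor, but never below 1/512
def lower_bound_train_size(n_classes, n_train):
    return np.max((1 / 512, (10 * n_classes) / n_train))

print(lower_bound_train_size(n_classes=10, n_train=4_000))    # 0.025    -> class-based bound dominates
print(lower_bound_train_size(n_classes=2,  n_train=100_000))  # ~0.00195 -> clipped at 1/512
```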
@@ -57,12 +57,14 @@ def get_fidelity_space(seed=None, fidelity_choice=None): """ z_cs = CS.ConfigurationSpace(seed=seed) - subsample_lower_bound = np.max((0.1, (0.1 or self.lower_bound_train_size))) + if fidelity_choice == 0: subsample = CS.Constant('subsample', value=1) else: + # TODO: dynamically adapt based on 1/512 and lower_bound_train_size and set log=True + lower = 0.1 subsample = CS.UniformFloatHyperparameter( - 'subsample', lower=subsample_lower_bound, upper=1, default_value=0.33, log=False + 'subsample', lower=lower, upper=1, default_value=0.33, log=False ) z_cs.add_hyperparameter(subsample) return z_cs From 751d2e91658f4c7efc0acdc9676545162d870f84 Mon Sep 17 00:00:00 2001 From: neeratyoy Date: Tue, 6 Jul 2021 20:20:27 +0200 Subject: [PATCH 012/147] AbstractBenchmark update for fidelity option + including XGBoost --- README.md | 7 +- examples/local/xgboost_local.py | 2 +- hpobench/abstract_benchmark.py | 7 +- hpobench/benchmarks/ml/histgb_benchmark.py | 4 +- .../benchmarks/ml/ml_benchmark_template.py | 82 +-- hpobench/benchmarks/ml/rf_benchmark.py | 4 +- hpobench/benchmarks/ml/svm_benchmark.py | 391 +++----------- hpobench/benchmarks/ml/svm_benchmark_2.py | 81 --- hpobench/benchmarks/ml/svm_benchmark_old.py | 350 ++++++++++++ hpobench/benchmarks/ml/xgboost_benchmark.py | 511 ++++-------------- .../benchmarks/ml/xgboost_benchmark_old.py | 426 +++++++++++++++ tests/test_utils.py | 2 +- tests/test_whitebox.py | 2 +- 13 files changed, 1004 insertions(+), 865 deletions(-) delete mode 100644 hpobench/benchmarks/ml/svm_benchmark_2.py create mode 100644 hpobench/benchmarks/ml/svm_benchmark_old.py create mode 100644 hpobench/benchmarks/ml/xgboost_benchmark_old.py diff --git a/README.md b/README.md index 001eb1f4..998f2ad2 100644 --- a/README.md +++ b/README.md @@ -35,11 +35,14 @@ Further requirements are: [ConfigSpace](https://github.com/automl/ConfigSpace), This can be arbitrarily complex and further information can be found in the docstring of the benchmark. 
A simple example is the XGBoost benchmark which can be installed with `pip install .[xgboost]` + ```python -from hpobench.benchmarks.ml.xgboost_benchmark import XGBoostBenchmark +from hpobench.benchmarks.ml.xgboost_benchmark_old import XGBoostBenchmark + b = XGBoostBenchmark(task_id=167149) config = b.get_configuration_space(seed=1).sample_configuration() -result_dict = b.objective_function(configuration=config, fidelity={"n_estimators": 128, "dataset_fraction": 0.5}, rng=1) +result_dict = b.objective_function(configuration=config, + fidelity={"n_estimators": 128, "dataset_fraction": 0.5}, rng=1) ``` diff --git a/examples/local/xgboost_local.py b/examples/local/xgboost_local.py index 47c1f77f..4f3b3ad3 100644 --- a/examples/local/xgboost_local.py +++ b/examples/local/xgboost_local.py @@ -10,7 +10,7 @@ import argparse from time import time -from hpobench.benchmarks.ml.xgboost_benchmark import XGBoostBenchmark as Benchmark +from hpobench.benchmarks.ml.xgboost_benchmark_old import XGBoostBenchmark as Benchmark from hpobench.util.openml_data_manager import get_openmlcc18_taskids diff --git a/hpobench/abstract_benchmark.py b/hpobench/abstract_benchmark.py index 5d7bc994..abbbcb22 100644 --- a/hpobench/abstract_benchmark.py +++ b/hpobench/abstract_benchmark.py @@ -226,12 +226,17 @@ def get_configuration_space(seed: Union[int, None] = None) -> ConfigSpace.Config @staticmethod @abc.abstractmethod - def get_fidelity_space(seed: Union[int, None] = None) -> ConfigSpace.ConfigurationSpace: + def get_fidelity_space( + seed: Union[int, None] = None, fidelity_choice: Union[int, None] = None + ) -> ConfigSpace.ConfigurationSpace: """ Defines the available fidelity parameters as a "fidelity space" for each benchmark. Parameters ---------- seed: int, None Seed for the fidelity space. 
+ fidelity_choice: int, None + integer value to choose the type of fidelity space + Returns ------- ConfigSpace.ConfigurationSpace diff --git a/hpobench/benchmarks/ml/histgb_benchmark.py b/hpobench/benchmarks/ml/histgb_benchmark.py index ac273c57..21ed4ec0 100644 --- a/hpobench/benchmarks/ml/histgb_benchmark.py +++ b/hpobench/benchmarks/ml/histgb_benchmark.py @@ -19,10 +19,10 @@ from sklearn.experimental import enable_hist_gradient_boosting # noqa from sklearn.ensemble import HistGradientBoostingClassifier -from hpobench.benchmarks.ml.ml_benchmark_template import Benchmark +from hpobench.benchmarks.ml.ml_benchmark_template import MLBenchmark -class HistGBBenchmark(Benchmark): +class HistGBBenchmark(MLBenchmark): def __init__( self, task_id: Union[int, None] = None, diff --git a/hpobench/benchmarks/ml/ml_benchmark_template.py b/hpobench/benchmarks/ml/ml_benchmark_template.py index 3ad61b54..e0ab59bc 100644 --- a/hpobench/benchmarks/ml/ml_benchmark_template.py +++ b/hpobench/benchmarks/ml/ml_benchmark_template.py @@ -36,7 +36,7 @@ ) -class Benchmark(AbstractBenchmark): +class MLBenchmark(AbstractBenchmark): _issue_tasks = [3917, 3945] def __init__( @@ -48,7 +48,7 @@ def __init__( ): self.seed = seed if seed is not None else np.random.randint(1, 10 ** 6) self.rng = check_random_state(self.seed) - super(Benchmark, self).__init__(rng=seed) + super(MLBenchmark, self).__init__(rng=seed) self.task_id = task_id self.valid_size = valid_size @@ -84,7 +84,7 @@ def get_configuration_space(seed=None): raise NotImplementedError() @staticmethod - def get_fidelity_space(seed=None, fidelity_choice=1): + def get_fidelity_space(seed=None, fidelity_choice=None): """Fidelity space available --- specifies the fidelity dimensions If fidelity_choice is 0 @@ -194,8 +194,8 @@ def load_data_from_openml(self, valid_size=None, verbose=False): # Similar to (https://arxiv.org/pdf/1605.07079.pdf) # use 10 times the number of classes as lower bound for the dataset fraction - n_classes = len(self.task.class_labels) - self.lower_bound_train_size = (10 * n_classes) / self.train_X.shape[0] + self.n_classes = len(self.task.class_labels) + self.lower_bound_train_size = (10 * self.n_classes) / self.train_X.shape[0] self.lower_bound_train_size = np.max((1 / 512, self.lower_bound_train_size)) if verbose: @@ -219,7 +219,7 @@ def init_model(self, config, fidelity=None, rng=None): """ raise NotImplementedError() - def _raw_objective(self, config, fidelity, shuffle, rng, eval="valid"): + def _train_objective(self, config, fidelity, shuffle, rng, eval="valid"): # initializing model model = self.init_model(config, fidelity, rng) @@ -260,7 +260,7 @@ def _raw_objective(self, config, fidelity, shuffle, rng, eval="valid"): train_loss = 1 - scores["acc"] return model, model_fit_time, train_loss, scores, score_cost - def objective( + def objective_function( self, configuration: Union[CS.Configuration, Dict], fidelity: Union[CS.Configuration, Dict, None] = None, @@ -270,7 +270,7 @@ def objective( ) -> Dict: """Function that evaluates a 'config' on a 'fidelity' on the validation set """ - model, model_fit_time, train_loss, train_scores, train_score_cost = self._raw_objective( + model, model_fit_time, train_loss, train_scores, train_score_cost = self._train_objective( configuration, fidelity, shuffle, rng ) val_scores = dict() @@ -310,7 +310,7 @@ def objective( 'info': info } - def objective_test( + def objective_function_test( self, configuration: Union[CS.Configuration, Dict], fidelity: Union[CS.Configuration, Dict, None] = None, @@ -320,7 
+320,7 @@ def objective_test( ) -> Dict: """Function that evaluates a 'config' on a 'fidelity' on the test set """ - model, model_fit_time, train_loss, train_scores, train_score_cost = self._raw_objective( + model, model_fit_time, train_loss, train_scores, train_score_cost = self._train_objective( configuration, fidelity, shuffle, rng, eval="test" ) test_scores = dict() @@ -350,34 +350,40 @@ def objective_test( 'info': info } - # pylint: disable=arguments-differ - @AbstractBenchmark.check_parameters - def objective_function( - self, - configuration: Union[CS.Configuration, Dict], - fidelity: Union[CS.Configuration, Dict, None] = None, - shuffle: bool = False, - rng: Union[np.random.RandomState, int, None] = None, - **kwargs - ) -> Dict: - """Function that evaluates a 'config' on a 'fidelity' on the validation set - """ - return dict() - - # pylint: disable=arguments-differ - @AbstractBenchmark.check_parameters - def objective_function_test( - self, - configuration: Union[CS.Configuration, Dict], - fidelity: Union[CS.Configuration, Dict, None] = None, - shuffle: bool = False, - rng: Union[np.random.RandomState, int, None] = None, - **kwargs - ) -> Dict: - """Function that evaluates a 'config' on a 'fidelity' on the test set - """ - return dict() + # # pylint: disable=arguments-differ + # @AbstractBenchmark.check_parameters + # def objective_function( + # self, + # configuration: Union[CS.Configuration, Dict], + # fidelity: Union[CS.Configuration, Dict, None] = None, + # shuffle: bool = False, + # rng: Union[np.random.RandomState, int, None] = None, + # **kwargs + # ) -> Dict: + # """Function that evaluates a 'config' on a 'fidelity' on the validation set + # """ + # return dict() + # + # # pylint: disable=arguments-differ + # @AbstractBenchmark.check_parameters + # def objective_function_test( + # self, + # configuration: Union[CS.Configuration, Dict], + # fidelity: Union[CS.Configuration, Dict, None] = None, + # shuffle: bool = False, + # rng: Union[np.random.RandomState, int, None] = None, + # **kwargs + # ) -> Dict: + # """Function that evaluates a 'config' on a 'fidelity' on the test set + # """ + # return dict() def get_meta_information(self): """ Returns the meta information for the benchmark """ - pass + return {'name': 'Support Vector Machine', + 'shape of train data': self.x_train.shape, + 'shape of test data': self.x_test.shape, + 'shape of valid data': self.x_valid.shape, + 'initial random seed': self.rng, + 'task_id': self.task_id + } diff --git a/hpobench/benchmarks/ml/rf_benchmark.py b/hpobench/benchmarks/ml/rf_benchmark.py index 7426a37a..b815e1bd 100644 --- a/hpobench/benchmarks/ml/rf_benchmark.py +++ b/hpobench/benchmarks/ml/rf_benchmark.py @@ -15,10 +15,10 @@ from sklearn.model_selection import train_test_split from sklearn.metrics import accuracy_score, make_scorer -from hpobench.benchmarks.ml.ml_benchmark_template import Benchmark +from hpobench.benchmarks.ml.ml_benchmark_template import MLBenchmark -class RandomForestBenchmark(Benchmark): +class RandomForestBenchmark(MLBenchmark): def __init__( self, task_id: Union[int, None] = None, diff --git a/hpobench/benchmarks/ml/svm_benchmark.py b/hpobench/benchmarks/ml/svm_benchmark.py index 0a765e45..1d0e2d00 100644 --- a/hpobench/benchmarks/ml/svm_benchmark.py +++ b/hpobench/benchmarks/ml/svm_benchmark.py @@ -1,350 +1,81 @@ -""" - -Changelog: -========== -0.0.2: -* Standardize the structure of the meta information - -0.0.1: -* First implementation - -""" - -import logging import time -from typing import Union, Tuple, Dict, 
List - -import ConfigSpace as CS +import openml import numpy as np -from scipy import sparse -from sklearn import pipeline -from sklearn import svm -from sklearn.compose import ColumnTransformer +import pandas as pd +import ConfigSpace as CS +from typing import Union, Dict + +from sklearn.svm import SVC from sklearn.impute import SimpleImputer +from sklearn.utils import check_random_state +from sklearn.compose import ColumnTransformer +from sklearn.preprocessing import OneHotEncoder +from sklearn.preprocessing import StandardScaler +from sklearn.model_selection import train_test_split +from sklearn.pipeline import make_pipeline, Pipeline from sklearn.metrics import accuracy_score, make_scorer -from sklearn.preprocessing import OneHotEncoder, MinMaxScaler - -import hpobench.util.rng_helper as rng_helper -from hpobench.abstract_benchmark import AbstractBenchmark -from hpobench.util.openml_data_manager import OpenMLHoldoutDataManager - -__version__ = '0.0.2' - -logger = logging.getLogger('SVMBenchmark') - - -class SupportVectorMachine(AbstractBenchmark): - """ - Hyperparameter optimization task to optimize the regularization - parameter C and the kernel parameter gamma of a support vector machine. - Both hyperparameters are optimized on a log scale in [-10, 10]. - The X_test data set is only used for a final offline evaluation of - a configuration. For that the validation and training data is - concatenated to form the whole training data set. - """ - - def __init__(self, task_id: Union[int, None] = None, - rng: Union[np.random.RandomState, int, None] = None): - """ - Parameters - ---------- - task_id : int, None - rng : np.random.RandomState, int, None - """ - super(SupportVectorMachine, self).__init__(rng=rng) - - self.task_id = task_id - self.cache_size = 200 # Cache for the SVC in MB - self.accuracy_scorer = make_scorer(accuracy_score) - - self.x_train, self.y_train, self.x_valid, self.y_valid, self.x_test, self.y_test, variable_types = \ - self.get_data() - self.categorical_data = np.array([var_type == 'categorical' for var_type in variable_types]) - - # Sort data (Categorical + numerical) so that categorical and continous are not mixed. - categorical_idx = np.argwhere(self.categorical_data) - continuous_idx = np.argwhere(~self.categorical_data) - sorting = np.concatenate([categorical_idx, continuous_idx]).squeeze() - self.categorical_data = self.categorical_data[sorting] - self.x_train = self.x_train[:, sorting] - self.x_valid = self.x_valid[:, sorting] - self.x_test = self.x_test[:, sorting] - - nan_columns = np.all(np.isnan(self.x_train), axis=0) - self.categorical_data = self.categorical_data[~nan_columns] - self.x_train, self.x_valid, self.x_test, self.categories = \ - OpenMLHoldoutDataManager.replace_nans_in_cat_columns(self.x_train, self.x_valid, self.x_test, - is_categorical=self.categorical_data) - - self.train_idx = self.rng.choice(a=np.arange(len(self.x_train)), - size=len(self.x_train), - replace=False) - - # Similar to [Fast Bayesian Optimization of Machine Learning Hyperparameters on Large Datasets] - # (https://arxiv.org/pdf/1605.07079.pdf), - # use 10 time the number of classes as lower bound for the dataset fraction - n_classes = np.unique(self.y_train).shape[0] - self.lower_bound_train_size = (10 * n_classes) / self.x_train.shape[0] - - def get_data(self) -> Tuple[np.ndarray, np.ndarray, np.ndarray, np.ndarray, np.ndarray, np.ndarray, List]: - """ Loads the data given a task or another source. 
""" - - assert self.task_id is not None, NotImplementedError('No task-id given. Please either specify a task-id or ' - 'overwrite the get_data method.') - - data_manager = OpenMLHoldoutDataManager(openml_task_id=self.task_id, rng=self.rng) - x_train, y_train, x_val, y_val, x_test, y_test = data_manager.load() - - return x_train, y_train, x_val, y_val, x_test, y_test, data_manager.variable_types - - def shuffle_data(self, rng=None): - """ Reshuffle the training data. If 'rng' is None, the training idx are shuffled according to the - class-random-state""" - random_state = rng_helper.get_rng(rng, self.rng) - random_state.shuffle(self.train_idx) - - # pylint: disable=arguments-differ - @AbstractBenchmark.check_parameters - def objective_function(self, configuration: Union[CS.Configuration, Dict], - fidelity: Union[CS.Configuration, Dict, None] = None, - shuffle: bool = False, - rng: Union[np.random.RandomState, int, None] = None, **kwargs) -> Dict: - """ - Trains a SVM model given a hyperparameter configuration and - evaluates the model on the validation set. - - Parameters - ---------- - configuration : Dict, CS.Configuration - Configuration for the SVM model - fidelity: Dict, None - Fidelity parameters for the SVM model, check get_fidelity_space(). Uses default (max) value if None. - shuffle : bool - If ``True``, shuffle the training idx. If no parameter ``rng`` is given, use the class random state. - Defaults to ``False``. - rng : np.random.RandomState, int, None, - Random seed for benchmark. By default the class level random seed. - - To prevent overfitting on a single seed, it is possible to pass a - parameter ``rng`` as 'int' or 'np.random.RandomState' to this function. - If this parameter is not given, the default random state is used. - kwargs - - Returns - ------- - Dict - - function_value : validation loss - cost : time to train and evaluate the model - info : Dict - train_loss : training loss - fidelity : used fidelities in this evaluation - """ - start_time = time.time() - - self.rng = rng_helper.get_rng(rng=rng, self_rng=self.rng) - - if shuffle: - self.shuffle_data(self.rng) - # Split of dataset subset - if self.lower_bound_train_size > fidelity['dataset_fraction']: - train_size = self.lower_bound_train_size - logger.warning(f'The given data set fraction is lower than the lower bound (10 * number of classes.) 
' - f'Increase the fidelity from {fidelity["dataset_fraction"]:.8f} to ' - f'{self.lower_bound_train_size:.8f}') - else: - train_size = fidelity['dataset_fraction'] - - train_size = int(train_size * len(self.train_idx)) - train_idx = self.train_idx[:train_size] - - # Transform hyperparameters to linear scale - hp_c = np.exp(float(configuration['C'])) - hp_gamma = np.exp(float(configuration['gamma'])) - - # Train support vector machine - model = self.get_pipeline(hp_c, hp_gamma) - model.fit(self.x_train[train_idx], self.y_train[train_idx]) - - # Compute validation error - train_loss = 1 - self.accuracy_scorer(model, self.x_train[train_idx], self.y_train[train_idx]) - val_loss = 1 - self.accuracy_scorer(model, self.x_valid, self.y_valid) - - cost = time.time() - start_time - - return {'function_value': float(val_loss), - "cost": cost, - 'info': {'train_loss': float(train_loss), - 'fidelity': fidelity}} - - # pylint: disable=arguments-differ - @AbstractBenchmark.check_parameters - def objective_function_test(self, configuration: Union[CS.Configuration, Dict], - fidelity: Union[CS.Configuration, Dict, None] = None, - shuffle: bool = False, - rng: Union[np.random.RandomState, int, None] = None, **kwargs) -> Dict: - """ - Trains a SVM model with a given configuration on both the X_train - and validation data set and evaluates the model on the X_test data set. - - Parameters - ---------- - configuration : Dict, CS.Configuration - Configuration for the SVM Model - fidelity: Dict, None - Fidelity parameters, check get_fidelity_space(). Uses default (max) value if None. - shuffle : bool - If ``True``, shuffle the training idx. If no parameter ``rng`` is given, use the class random state. - Defaults to ``False``. - rng : np.random.RandomState, int, None, - Random seed for benchmark. By default the class level random seed. - To prevent overfitting on a single seed, it is possible to pass a - parameter ``rng`` as 'int' or 'np.random.RandomState' to this function. - If this parameter is not given, the default random state is used. 
- kwargs - - Returns - ------- - Dict - - function_value : X_test loss - cost : time to X_train and evaluate the model - info : Dict - train_valid_loss: Loss on the train+valid data set - fidelity : used fidelities in this evaluation - """ - assert np.isclose(fidelity['dataset_fraction'], 1), \ - f'Data set fraction must be 1 but was {fidelity["dataset_fraction"]}' - - self.rng = rng_helper.get_rng(rng=rng, self_rng=self.rng) - - if shuffle: - self.shuffle_data(self.rng) - - start_time = time.time() - - # Concatenate training and validation dataset - if isinstance(self.x_train, sparse.csr.csr_matrix) or isinstance(self.x_valid, sparse.csr.csr_matrix): - data = sparse.vstack((self.x_train, self.x_valid)) - else: - data = np.concatenate((self.x_train, self.x_valid)) - targets = np.concatenate((self.y_train, self.y_valid)) - - # Transform hyperparameters to linear scale - hp_c = np.exp(float(configuration['C'])) - hp_gamma = np.exp(float(configuration['gamma'])) - - model = self.get_pipeline(hp_c, hp_gamma) - model.fit(data, targets) +from hpobench.benchmarks.ml.ml_benchmark_template import MLBenchmark - # Compute validation error - train_valid_loss = 1 - self.accuracy_scorer(model, data, targets) - # Compute test error - test_loss = 1 - self.accuracy_scorer(model, self.x_test, self.y_test) - - cost = time.time() - start_time - - return {'function_value': float(test_loss), - "cost": cost, - 'info': {'train_valid_loss': float(train_valid_loss), - 'fidelity': fidelity}} - - def get_pipeline(self, C: float, gamma: float) -> pipeline.Pipeline: - """ Create the scikit-learn (training-)pipeline """ - - model = pipeline.Pipeline([ - ('preprocess_impute', - ColumnTransformer([ - ("categorical", "passthrough", self.categorical_data), - ("continuous", SimpleImputer(strategy="mean"), ~self.categorical_data)])), - ('preprocess_one_hot', - ColumnTransformer([ - ("categorical", OneHotEncoder(categories=self.categories, sparse=False), self.categorical_data), - ("continuous", MinMaxScaler(feature_range=(0, 1)), ~self.categorical_data)])), - ('svm', - svm.SVC(gamma=gamma, C=C, random_state=self.rng, cache_size=self.cache_size)) - ]) - return model +class SVMBenchmark(MLBenchmark): + def __init__( + self, + task_id: Union[int, None] = None, + seed: Union[int, None] = None, # Union[np.random.RandomState, int, None] = None, + valid_size: float = 0.33, + fidelity_choice: int = 1 + ): + super(SVMBenchmark, self).__init__(task_id, seed, valid_size, fidelity_choice) + self.cache_size = 200 @staticmethod - def get_configuration_space(seed: Union[int, None] = None) -> CS.ConfigurationSpace: - """ - Creates a ConfigSpace.ConfigurationSpace containing all parameters for - the SVM Model - - For a detailed explanation of the hyperparameters: - https://scikit-learn.org/stable/modules/generated/sklearn.svm.SVC.html - - Parameters - ---------- - seed : int, None - Fixing the seed for the ConfigSpace.ConfigurationSpace - - Returns - ------- - ConfigSpace.ConfigurationSpace + def get_configuration_space(seed=None): + """Parameter space to be optimized --- contains the hyperparameters """ - - seed = seed if seed is not None else np.random.randint(1, 100000) cs = CS.ConfigurationSpace(seed=seed) - + # from https://github.com/automl/auto-sklearn/blob/master/autosklearn/pipeline/components/classification/libsvm_svc.p cs.add_hyperparameters([ - CS.UniformFloatHyperparameter('C', lower=-10., upper=10., default_value=0., log=False), - CS.UniformFloatHyperparameter('gamma', lower=-10., upper=10., default_value=1., log=False), + 
CS.UniformFloatHyperparameter( + "C", 0.03125, 32768, log=True, default_value=1.0 + ), + CS.UniformFloatHyperparameter( + "gamma", 3.0517578125e-05, 8, log=True, default_value=0.1 + ) ]) - # cs.generate_all_continuous_from_bounds(SupportVectorMachine.get_meta_information()['bounds']) return cs @staticmethod - def get_fidelity_space(seed: Union[int, None] = None) -> CS.ConfigurationSpace: - """ - Creates a ConfigSpace.ConfigurationSpace containing all fidelity parameters for - the SupportVector Benchmark - - Fidelities - ---------- - dataset_fraction: float - [0.1, 1] - fraction of training data set to use + def get_fidelity_space(seed=None, fidelity_choice=None): + """Fidelity space available --- specifies the fidelity dimensions - Parameters - ---------- - seed : int, None - Fixing the seed for the ConfigSpace.ConfigurationSpace + For SVM, only a single fidelity exists, i.e., subsample fraction. + if fidelity_choice == 0 + uses the entire data (subsample=1), reflecting the black-box setup + else + parameterizes the fraction of data to subsample - Returns - ------- - ConfigSpace.ConfigurationSpace """ - seed = seed if seed is not None else np.random.randint(1, 100000) - fidel_space = CS.ConfigurationSpace(seed=seed) + z_cs = CS.ConfigurationSpace(seed=seed) - fidel_space.add_hyperparameters([ - CS.UniformFloatHyperparameter("dataset_fraction", lower=0.0, upper=1.0, default_value=1.0, log=False), - ]) - return fidel_space - - def get_meta_information(self): - """ Returns the meta information for the benchmark """ - return {'name': 'Support Vector Machine', - 'references': ["@InProceedings{pmlr-v54-klein17a", - "author = {Aaron Klein and Stefan Falkner and Simon Bartels and Philipp Hennig and " - "Frank Hutter}, " - "title = {{Fast Bayesian Optimization of Machine Learning Hyperparameters on " - "Large Datasets}}" - "pages = {528--536}, year = {2017}," - "editor = {Aarti Singh and Jerry Zhu}," - "volume = {54}," - "series = {Proceedings of Machine Learning Research}," - "address = {Fort Lauderdale, FL, USA}," - "month = {20--22 Apr}," - "publisher = {PMLR}," - "pdf = {http://proceedings.mlr.press/v54/klein17a/klein17a.pdf}, " - "url = {http://proceedings.mlr.press/v54/klein17a.html}, " - ], - 'code': 'https://github.com/automl/HPOlib1.5/blob/container/hpolib/benchmarks/ml/svm_benchmark.py', - 'shape of train data': self.x_train.shape, - 'shape of test data': self.x_test.shape, - 'shape of valid data': self.x_valid.shape, - 'initial random seed': self.rng, - 'task_id': self.task_id - } + if fidelity_choice == 0: + subsample = CS.Constant('subsample', value=1) + else: + # TODO: dynamically adapt based on 1/512 and lower_bound_train_size and set log=True + lower = 0.1 + subsample = CS.UniformFloatHyperparameter( + 'subsample', lower=lower, upper=1, default_value=0.33, log=False + ) + z_cs.add_hyperparameter(subsample) + return z_cs + + def init_model(self, config, fidelity=None, rng=None): + # initializing model + rng = self.rng if rng is None else rng + config = config.get_dictionary() + model = SVC( + **config, + random_state=rng, + cache_size=self.cache_size + ) + return model diff --git a/hpobench/benchmarks/ml/svm_benchmark_2.py b/hpobench/benchmarks/ml/svm_benchmark_2.py deleted file mode 100644 index 845f40e0..00000000 --- a/hpobench/benchmarks/ml/svm_benchmark_2.py +++ /dev/null @@ -1,81 +0,0 @@ -import time -import openml -import numpy as np -import pandas as pd -import ConfigSpace as CS -from typing import Union, Dict - -from sklearn.svm import SVC -from sklearn.impute import 
SimpleImputer -from sklearn.utils import check_random_state -from sklearn.compose import ColumnTransformer -from sklearn.preprocessing import OneHotEncoder -from sklearn.preprocessing import StandardScaler -from sklearn.model_selection import train_test_split -from sklearn.pipeline import make_pipeline, Pipeline -from sklearn.metrics import accuracy_score, make_scorer - -from hpobench.benchmarks.ml.ml_benchmark_template import Benchmark - - -class SVMBenchmark(Benchmark): - def __init__( - self, - task_id: Union[int, None] = None, - seed: Union[int, None] = None, # Union[np.random.RandomState, int, None] = None, - valid_size: float = 0.33, - fidelity_choice: int = 1 - ): - super(SVMBenchmark, self).__init__(task_id, seed, valid_size, fidelity_choice) - self.cache_size = 200 - - @staticmethod - def get_configuration_space(seed=None): - """Parameter space to be optimized --- contains the hyperparameters - """ - cs = CS.ConfigurationSpace(seed=seed) - # from https://github.com/automl/auto-sklearn/blob/master/autosklearn/pipeline/components/classification/libsvm_svc.p - cs.add_hyperparameters([ - CS.UniformFloatHyperparameter( - "C", 0.03125, 32768, log=True, default_value=1.0 - ), - CS.UniformFloatHyperparameter( - "gamma", 3.0517578125e-05, 8, log=True, default_value=0.1 - ) - ]) - return cs - - @classmethod - def get_fidelity_space(cls, seed=None, fidelity_choice=None): - """Fidelity space available --- specifies the fidelity dimensions - - For SVM, only a single fidelity exists, i.e., subsample fraction. - if fidelity_choice == 0 - uses the entire data (subsample=1), reflecting the black-box setup - else - parameterizes the fraction of data to subsample - - """ - z_cs = CS.ConfigurationSpace(seed=seed) - - if fidelity_choice == 0: - subsample = CS.Constant('subsample', value=1) - else: - # TODO: dynamically adapt based on 1/512 and lower_bound_train_size and set log=True - lower = 0.1 - subsample = CS.UniformFloatHyperparameter( - 'subsample', lower=lower, upper=1, default_value=0.33, log=False - ) - z_cs.add_hyperparameter(subsample) - return z_cs - - def init_model(self, config, fidelity=None, rng=None): - # initializing model - rng = self.rng if rng is None else rng - config = config.get_dictionary() - model = SVC( - **config, - random_state=rng, - cache_size=self.cache_size - ) - return model diff --git a/hpobench/benchmarks/ml/svm_benchmark_old.py b/hpobench/benchmarks/ml/svm_benchmark_old.py new file mode 100644 index 00000000..0a765e45 --- /dev/null +++ b/hpobench/benchmarks/ml/svm_benchmark_old.py @@ -0,0 +1,350 @@ +""" + +Changelog: +========== +0.0.2: +* Standardize the structure of the meta information + +0.0.1: +* First implementation + +""" + +import logging +import time +from typing import Union, Tuple, Dict, List + +import ConfigSpace as CS +import numpy as np +from scipy import sparse +from sklearn import pipeline +from sklearn import svm +from sklearn.compose import ColumnTransformer +from sklearn.impute import SimpleImputer +from sklearn.metrics import accuracy_score, make_scorer +from sklearn.preprocessing import OneHotEncoder, MinMaxScaler + +import hpobench.util.rng_helper as rng_helper +from hpobench.abstract_benchmark import AbstractBenchmark +from hpobench.util.openml_data_manager import OpenMLHoldoutDataManager + +__version__ = '0.0.2' + +logger = logging.getLogger('SVMBenchmark') + + +class SupportVectorMachine(AbstractBenchmark): + """ + Hyperparameter optimization task to optimize the regularization + parameter C and the kernel parameter gamma of a support 
vector machine. + Both hyperparameters are optimized on a log scale in [-10, 10]. + The X_test data set is only used for a final offline evaluation of + a configuration. For that the validation and training data is + concatenated to form the whole training data set. + """ + + def __init__(self, task_id: Union[int, None] = None, + rng: Union[np.random.RandomState, int, None] = None): + """ + Parameters + ---------- + task_id : int, None + rng : np.random.RandomState, int, None + """ + super(SupportVectorMachine, self).__init__(rng=rng) + + self.task_id = task_id + self.cache_size = 200 # Cache for the SVC in MB + self.accuracy_scorer = make_scorer(accuracy_score) + + self.x_train, self.y_train, self.x_valid, self.y_valid, self.x_test, self.y_test, variable_types = \ + self.get_data() + self.categorical_data = np.array([var_type == 'categorical' for var_type in variable_types]) + + # Sort data (Categorical + numerical) so that categorical and continous are not mixed. + categorical_idx = np.argwhere(self.categorical_data) + continuous_idx = np.argwhere(~self.categorical_data) + sorting = np.concatenate([categorical_idx, continuous_idx]).squeeze() + self.categorical_data = self.categorical_data[sorting] + self.x_train = self.x_train[:, sorting] + self.x_valid = self.x_valid[:, sorting] + self.x_test = self.x_test[:, sorting] + + nan_columns = np.all(np.isnan(self.x_train), axis=0) + self.categorical_data = self.categorical_data[~nan_columns] + self.x_train, self.x_valid, self.x_test, self.categories = \ + OpenMLHoldoutDataManager.replace_nans_in_cat_columns(self.x_train, self.x_valid, self.x_test, + is_categorical=self.categorical_data) + + self.train_idx = self.rng.choice(a=np.arange(len(self.x_train)), + size=len(self.x_train), + replace=False) + + # Similar to [Fast Bayesian Optimization of Machine Learning Hyperparameters on Large Datasets] + # (https://arxiv.org/pdf/1605.07079.pdf), + # use 10 time the number of classes as lower bound for the dataset fraction + n_classes = np.unique(self.y_train).shape[0] + self.lower_bound_train_size = (10 * n_classes) / self.x_train.shape[0] + + def get_data(self) -> Tuple[np.ndarray, np.ndarray, np.ndarray, np.ndarray, np.ndarray, np.ndarray, List]: + """ Loads the data given a task or another source. """ + + assert self.task_id is not None, NotImplementedError('No task-id given. Please either specify a task-id or ' + 'overwrite the get_data method.') + + data_manager = OpenMLHoldoutDataManager(openml_task_id=self.task_id, rng=self.rng) + x_train, y_train, x_val, y_val, x_test, y_test = data_manager.load() + + return x_train, y_train, x_val, y_val, x_test, y_test, data_manager.variable_types + + def shuffle_data(self, rng=None): + """ Reshuffle the training data. If 'rng' is None, the training idx are shuffled according to the + class-random-state""" + random_state = rng_helper.get_rng(rng, self.rng) + random_state.shuffle(self.train_idx) + + # pylint: disable=arguments-differ + @AbstractBenchmark.check_parameters + def objective_function(self, configuration: Union[CS.Configuration, Dict], + fidelity: Union[CS.Configuration, Dict, None] = None, + shuffle: bool = False, + rng: Union[np.random.RandomState, int, None] = None, **kwargs) -> Dict: + """ + Trains a SVM model given a hyperparameter configuration and + evaluates the model on the validation set. + + Parameters + ---------- + configuration : Dict, CS.Configuration + Configuration for the SVM model + fidelity: Dict, None + Fidelity parameters for the SVM model, check get_fidelity_space(). 
Uses default (max) value if None. + shuffle : bool + If ``True``, shuffle the training idx. If no parameter ``rng`` is given, use the class random state. + Defaults to ``False``. + rng : np.random.RandomState, int, None, + Random seed for benchmark. By default the class level random seed. + + To prevent overfitting on a single seed, it is possible to pass a + parameter ``rng`` as 'int' or 'np.random.RandomState' to this function. + If this parameter is not given, the default random state is used. + kwargs + + Returns + ------- + Dict - + function_value : validation loss + cost : time to train and evaluate the model + info : Dict + train_loss : training loss + fidelity : used fidelities in this evaluation + """ + start_time = time.time() + + self.rng = rng_helper.get_rng(rng=rng, self_rng=self.rng) + + if shuffle: + self.shuffle_data(self.rng) + + # Split of dataset subset + if self.lower_bound_train_size > fidelity['dataset_fraction']: + train_size = self.lower_bound_train_size + logger.warning(f'The given data set fraction is lower than the lower bound (10 * number of classes.) ' + f'Increase the fidelity from {fidelity["dataset_fraction"]:.8f} to ' + f'{self.lower_bound_train_size:.8f}') + else: + train_size = fidelity['dataset_fraction'] + + train_size = int(train_size * len(self.train_idx)) + train_idx = self.train_idx[:train_size] + + # Transform hyperparameters to linear scale + hp_c = np.exp(float(configuration['C'])) + hp_gamma = np.exp(float(configuration['gamma'])) + + # Train support vector machine + model = self.get_pipeline(hp_c, hp_gamma) + model.fit(self.x_train[train_idx], self.y_train[train_idx]) + + # Compute validation error + train_loss = 1 - self.accuracy_scorer(model, self.x_train[train_idx], self.y_train[train_idx]) + val_loss = 1 - self.accuracy_scorer(model, self.x_valid, self.y_valid) + + cost = time.time() - start_time + + return {'function_value': float(val_loss), + "cost": cost, + 'info': {'train_loss': float(train_loss), + 'fidelity': fidelity}} + + # pylint: disable=arguments-differ + @AbstractBenchmark.check_parameters + def objective_function_test(self, configuration: Union[CS.Configuration, Dict], + fidelity: Union[CS.Configuration, Dict, None] = None, + shuffle: bool = False, + rng: Union[np.random.RandomState, int, None] = None, **kwargs) -> Dict: + """ + Trains a SVM model with a given configuration on both the X_train + and validation data set and evaluates the model on the X_test data set. + + Parameters + ---------- + configuration : Dict, CS.Configuration + Configuration for the SVM Model + fidelity: Dict, None + Fidelity parameters, check get_fidelity_space(). Uses default (max) value if None. + shuffle : bool + If ``True``, shuffle the training idx. If no parameter ``rng`` is given, use the class random state. + Defaults to ``False``. + rng : np.random.RandomState, int, None, + Random seed for benchmark. By default the class level random seed. + To prevent overfitting on a single seed, it is possible to pass a + parameter ``rng`` as 'int' or 'np.random.RandomState' to this function. + If this parameter is not given, the default random state is used. 
+ kwargs + + Returns + ------- + Dict - + function_value : X_test loss + cost : time to X_train and evaluate the model + info : Dict + train_valid_loss: Loss on the train+valid data set + fidelity : used fidelities in this evaluation + """ + assert np.isclose(fidelity['dataset_fraction'], 1), \ + f'Data set fraction must be 1 but was {fidelity["dataset_fraction"]}' + + self.rng = rng_helper.get_rng(rng=rng, self_rng=self.rng) + + if shuffle: + self.shuffle_data(self.rng) + + start_time = time.time() + + # Concatenate training and validation dataset + if isinstance(self.x_train, sparse.csr.csr_matrix) or isinstance(self.x_valid, sparse.csr.csr_matrix): + data = sparse.vstack((self.x_train, self.x_valid)) + else: + data = np.concatenate((self.x_train, self.x_valid)) + targets = np.concatenate((self.y_train, self.y_valid)) + + # Transform hyperparameters to linear scale + hp_c = np.exp(float(configuration['C'])) + hp_gamma = np.exp(float(configuration['gamma'])) + + model = self.get_pipeline(hp_c, hp_gamma) + model.fit(data, targets) + + # Compute validation error + train_valid_loss = 1 - self.accuracy_scorer(model, data, targets) + + # Compute test error + test_loss = 1 - self.accuracy_scorer(model, self.x_test, self.y_test) + + cost = time.time() - start_time + + return {'function_value': float(test_loss), + "cost": cost, + 'info': {'train_valid_loss': float(train_valid_loss), + 'fidelity': fidelity}} + + def get_pipeline(self, C: float, gamma: float) -> pipeline.Pipeline: + """ Create the scikit-learn (training-)pipeline """ + + model = pipeline.Pipeline([ + ('preprocess_impute', + ColumnTransformer([ + ("categorical", "passthrough", self.categorical_data), + ("continuous", SimpleImputer(strategy="mean"), ~self.categorical_data)])), + ('preprocess_one_hot', + ColumnTransformer([ + ("categorical", OneHotEncoder(categories=self.categories, sparse=False), self.categorical_data), + ("continuous", MinMaxScaler(feature_range=(0, 1)), ~self.categorical_data)])), + ('svm', + svm.SVC(gamma=gamma, C=C, random_state=self.rng, cache_size=self.cache_size)) + ]) + return model + + @staticmethod + def get_configuration_space(seed: Union[int, None] = None) -> CS.ConfigurationSpace: + """ + Creates a ConfigSpace.ConfigurationSpace containing all parameters for + the SVM Model + + For a detailed explanation of the hyperparameters: + https://scikit-learn.org/stable/modules/generated/sklearn.svm.SVC.html + + Parameters + ---------- + seed : int, None + Fixing the seed for the ConfigSpace.ConfigurationSpace + + Returns + ------- + ConfigSpace.ConfigurationSpace + """ + + seed = seed if seed is not None else np.random.randint(1, 100000) + cs = CS.ConfigurationSpace(seed=seed) + + cs.add_hyperparameters([ + CS.UniformFloatHyperparameter('C', lower=-10., upper=10., default_value=0., log=False), + CS.UniformFloatHyperparameter('gamma', lower=-10., upper=10., default_value=1., log=False), + ]) + # cs.generate_all_continuous_from_bounds(SupportVectorMachine.get_meta_information()['bounds']) + return cs + + @staticmethod + def get_fidelity_space(seed: Union[int, None] = None) -> CS.ConfigurationSpace: + """ + Creates a ConfigSpace.ConfigurationSpace containing all fidelity parameters for + the SupportVector Benchmark + + Fidelities + ---------- + dataset_fraction: float - [0.1, 1] + fraction of training data set to use + + Parameters + ---------- + seed : int, None + Fixing the seed for the ConfigSpace.ConfigurationSpace + + Returns + ------- + ConfigSpace.ConfigurationSpace + """ + seed = seed if seed is not None 
else np.random.randint(1, 100000) + fidel_space = CS.ConfigurationSpace(seed=seed) + + fidel_space.add_hyperparameters([ + CS.UniformFloatHyperparameter("dataset_fraction", lower=0.0, upper=1.0, default_value=1.0, log=False), + ]) + return fidel_space + + def get_meta_information(self): + """ Returns the meta information for the benchmark """ + return {'name': 'Support Vector Machine', + 'references': ["@InProceedings{pmlr-v54-klein17a", + "author = {Aaron Klein and Stefan Falkner and Simon Bartels and Philipp Hennig and " + "Frank Hutter}, " + "title = {{Fast Bayesian Optimization of Machine Learning Hyperparameters on " + "Large Datasets}}" + "pages = {528--536}, year = {2017}," + "editor = {Aarti Singh and Jerry Zhu}," + "volume = {54}," + "series = {Proceedings of Machine Learning Research}," + "address = {Fort Lauderdale, FL, USA}," + "month = {20--22 Apr}," + "publisher = {PMLR}," + "pdf = {http://proceedings.mlr.press/v54/klein17a/klein17a.pdf}, " + "url = {http://proceedings.mlr.press/v54/klein17a.html}, " + ], + 'code': 'https://github.com/automl/HPOlib1.5/blob/container/hpolib/benchmarks/ml/svm_benchmark.py', + 'shape of train data': self.x_train.shape, + 'shape of test data': self.x_test.shape, + 'shape of valid data': self.x_valid.shape, + 'initial random seed': self.rng, + 'task_id': self.task_id + } diff --git a/hpobench/benchmarks/ml/xgboost_benchmark.py b/hpobench/benchmarks/ml/xgboost_benchmark.py index e43b3529..b038e4c9 100644 --- a/hpobench/benchmarks/ml/xgboost_benchmark.py +++ b/hpobench/benchmarks/ml/xgboost_benchmark.py @@ -1,426 +1,125 @@ -""" - -Changelog: -========== -0.0.2: -* Change the search space definiton to match the paper: (https://arxiv.org/pdf/1802.09596.pdf) - eta: [1e-5, 1] (def: 0.3) -> [2**-10, 1] (def: 0.3) - min_child_weight: [0,05, 10] (def: 1) -> [1, 2**7] (def: 1) - colsample_bytree: [0,05, 1] (def: 1) -> [0.01, 1] (def: 1) - colsample_bylevel: [0,05, 1] (def: 1) -> [0.01, 1] (def: 1) - reg_lambda: [1e-5, 2] (def: 1) -> [2**-10, 2**10] (def: 1) - reg_alpha: [1e-5, 2] (def: 1e-5) -> [2**-10, 2**10] (def: 1) - max_depth: - -> [1, 15] (def: 6) - subsample_per_it: - -> [0.01, 1] (def: 1) - [booster: - -> [gbtree, gblinear, dart] (def: gbtree)] *) - - *) This parameter is only in the XGBoostExtendedBenchmark. Not in the XGBoostBenchmark class. - -* Increase the fidelity `n_estimators` - n_estimators [2, 128] (def: 128) -> [1, 256] (def: 256) - -* Add class to optimize also the used booster method: (gbtree, gblinear or dart) - We have introduced a new class, which adds the used booster as parameter to the configuration space. To read more - about booster, please take a look in the official XGBoost-documentation (https://xgboost.readthedocs.io/en/latest). - - -0.0.1: -* First implementation of a XGBoost Benchmark. 
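# Illustrative usage sketch (not part of the patch) for the SupportVectorMachine
# benchmark defined in svm_benchmark_old.py above. The OpenML task id is only an
# example, borrowed from the repository tests; any valid task id can be used.
from hpobench.benchmarks.ml.svm_benchmark_old import SupportVectorMachine

benchmark = SupportVectorMachine(task_id=167149, rng=0)
config = benchmark.get_configuration_space(seed=0).sample_configuration()
# validation-set evaluation on half of the training data
result = benchmark.objective_function(configuration=config,
                                      fidelity={'dataset_fraction': 0.5})
print(result['function_value'], result['cost'], result['info']['train_loss'])
# the test evaluation expects the full data set (dataset_fraction defaults to 1.0)
test_result = benchmark.objective_function_test(configuration=config)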
- - -""" - -import logging import time -from typing import Union, Tuple, Dict, List - -import ConfigSpace as CS +import openml import numpy as np +import pandas as pd +import ConfigSpace as CS +from typing import Union, Dict + import xgboost as xgb -from sklearn import pipeline -from sklearn.compose import ColumnTransformer from sklearn.impute import SimpleImputer -from sklearn.metrics import accuracy_score, make_scorer +from sklearn.pipeline import make_pipeline +from sklearn.utils import check_random_state +from sklearn.compose import ColumnTransformer from sklearn.preprocessing import OneHotEncoder +from sklearn.preprocessing import StandardScaler +from sklearn.model_selection import train_test_split +from sklearn.metrics import accuracy_score, make_scorer -import hpobench.util.rng_helper as rng_helper -from hpobench.abstract_benchmark import AbstractBenchmark -from hpobench.util.openml_data_manager import OpenMLHoldoutDataManager - -__version__ = '0.0.2' - -logger = logging.getLogger('XGBBenchmark') - - -class XGBoostBenchmark(AbstractBenchmark): - - def __init__(self, task_id: Union[int, None] = None, n_threads: int = 1, - rng: Union[np.random.RandomState, int, None] = None): - """ - - Parameters - ---------- - task_id : int, None - n_threads : int, None - rng : np.random.RandomState, int, None - """ - - super(XGBoostBenchmark, self).__init__(rng=rng) - self.n_threads = n_threads - self.task_id = task_id - self.accuracy_scorer = make_scorer(accuracy_score) - - self.x_train, self.y_train, self.x_valid, self.y_valid, self.x_test, self.y_test, variable_types = \ - self.get_data() - self.categorical_data = np.array([var_type == 'categorical' for var_type in variable_types]) - - # XGB needs sorted data. Data should be (Categorical + numerical) not mixed. - categorical_idx = np.argwhere(self.categorical_data) - continuous_idx = np.argwhere(~self.categorical_data) - sorting = np.concatenate([categorical_idx, continuous_idx]).squeeze() - self.categorical_data = self.categorical_data[sorting] - self.x_train = self.x_train[:, sorting] - self.x_valid = self.x_valid[:, sorting] - self.x_test = self.x_test[:, sorting] - - nan_columns = np.all(np.isnan(self.x_train), axis=0) - self.categorical_data = self.categorical_data[~nan_columns] - - self.x_train, self.x_valid, self.x_test, self.categories = \ - OpenMLHoldoutDataManager.replace_nans_in_cat_columns(self.x_train, self.x_valid, self.x_test, - is_categorical=self.categorical_data) - - # Determine the number of categories in the labels. - # In case of binary classification ``self.num_class`` has to be 1 for xgboost. - self.num_class = len(np.unique(np.concatenate([self.y_train, self.y_test, self.y_valid]))) - self.num_class = 1 if self.num_class == 2 else self.num_class - - self.train_idx = self.rng.choice(a=np.arange(len(self.x_train)), - size=len(self.x_train), - replace=False) - - # Similar to [Fast Bayesian Optimization of Machine Learning Hyperparameters on Large Datasets] - # (https://arxiv.org/pdf/1605.07079.pdf), - # use 10 time the number of classes as lower bound for the dataset fraction - n_classes = np.unique(self.y_train).shape[0] - self.lower_bound_train_size = (10 * n_classes) / self.x_train.shape[0] - - def get_data(self) -> Tuple[np.ndarray, np.ndarray, np.ndarray, np.ndarray, np.ndarray, np.ndarray, List]: - """ Loads the data given a task or another source. """ - - assert self.task_id is not None, NotImplementedError('No task-id given. 
Please either specify a task-id or ' - 'overwrite the get_data method.') - - data_manager = OpenMLHoldoutDataManager(openml_task_id=self.task_id, rng=self.rng) - x_train, y_train, x_val, y_val, x_test, y_test = data_manager.load() - - return x_train, y_train, x_val, y_val, x_test, y_test, data_manager.variable_types - - def shuffle_data(self, rng=None): - """ Reshuffle the training data. If 'rng' is None, the training idx are shuffled according to the - class-random-state""" - random_state = rng_helper.get_rng(rng, self.rng) - random_state.shuffle(self.train_idx) - - # pylint: disable=arguments-differ - @AbstractBenchmark.check_parameters - def objective_function(self, configuration: Union[CS.Configuration, Dict], - fidelity: Union[CS.Configuration, Dict, None] = None, - shuffle: bool = False, - rng: Union[np.random.RandomState, int, None] = None, **kwargs) -> Dict: - """ - Trains a XGBoost model given a hyperparameter configuration and - evaluates the model on the validation set. - - Parameters - ---------- - configuration : Dict, CS.Configuration - Configuration for the XGBoost model - fidelity: Dict, None - Fidelity parameters for the XGBoost model, check get_fidelity_space(). Uses default (max) value if None. - shuffle : bool - If ``True``, shuffle the training idx. If no parameter ``rng`` is given, use the class random state. - Defaults to ``False``. - rng : np.random.RandomState, int, None, - Random seed for benchmark. By default the class level random seed. - - To prevent overfitting on a single seed, it is possible to pass a - parameter ``rng`` as 'int' or 'np.random.RandomState' to this function. - If this parameter is not given, the default random state is used. - kwargs - - Returns - ------- - Dict - - function_value : validation loss - cost : time to train and evaluate the model - info : Dict - train_loss : trainings loss - fidelity : used fidelities in this evaluation - """ - self.rng = rng_helper.get_rng(rng=rng, self_rng=self.rng) - - if shuffle: - self.shuffle_data(self.rng) - - start = time.time() - - if self.lower_bound_train_size > fidelity['dataset_fraction']: - train_data_fraction = self.lower_bound_train_size - logger.warning(f'The given data set fraction is lower than the lower bound (10 * number of classes.) ' - f'Increase the fidelity from {fidelity["dataset_fraction"]:.8f} to ' - f'{self.lower_bound_train_size:.8f}') - else: - train_data_fraction = fidelity['dataset_fraction'] - - train_idx = self.train_idx[:int(len(self.train_idx) * train_data_fraction)] - - model = self._get_pipeline(n_estimators=fidelity["n_estimators"], **configuration) - model.fit(X=self.x_train[train_idx], y=self.y_train[train_idx]) - - train_loss = 1 - self.accuracy_scorer(model, self.x_train[train_idx], self.y_train[train_idx]) - val_loss = 1 - self.accuracy_scorer(model, self.x_valid, self.y_valid) - cost = time.time() - start - - return {'function_value': float(val_loss), - 'cost': cost, - 'info': {'train_loss': float(train_loss), - 'fidelity': fidelity} - } - - # pylint: disable=arguments-differ - @AbstractBenchmark.check_parameters - def objective_function_test(self, configuration: Union[CS.Configuration, Dict], - fidelity: Union[CS.Configuration, Dict, None] = None, - shuffle: bool = False, - rng: Union[np.random.RandomState, int, None] = None, **kwargs) -> Dict: - """ - Trains a XGBoost model with a given configuration on both the train - and validation data set and evaluates the model on the test data set. 
- - Parameters - ---------- - configuration : Dict, CS.Configuration - Configuration for the XGBoost Model - fidelity: Dict, None - Fidelity parameters, check get_fidelity_space(). Uses default (max) value if None. - shuffle : bool - If ``True``, shuffle the training idx. If no parameter ``rng`` is given, use the class random state. - Defaults to ``False``. - rng : np.random.RandomState, int, None, - Random seed for benchmark. By default the class level random seed. - To prevent overfitting on a single seed, it is possible to pass a - parameter ``rng`` as 'int' or 'np.random.RandomState' to this function. - If this parameter is not given, the default random state is used. - kwargs - - Returns - ------- - Dict - - function_value : test loss - cost : time to train and evaluate the model - info : Dict - fidelity : used fidelities in this evaluation - """ - default_dataset_fraction = self.get_fidelity_space().get_hyperparameter('dataset_fraction').default_value - if fidelity['dataset_fraction'] != default_dataset_fraction: - raise NotImplementedError(f'Test error can not be computed for dataset_fraction <= ' - f'{default_dataset_fraction}') - - self.rng = rng_helper.get_rng(rng=rng, self_rng=self.rng) - - if shuffle: - self.shuffle_data(self.rng) - - start = time.time() - - # Impute potential nan values with the feature- - data = np.concatenate((self.x_train, self.x_valid)) - targets = np.concatenate((self.y_train, self.y_valid)) - - model = self._get_pipeline(n_estimators=fidelity['n_estimators'], **configuration) - model.fit(X=data, y=targets) +from hpobench.benchmarks.ml.ml_benchmark_template import MLBenchmark - test_loss = 1 - self.accuracy_scorer(model, self.x_test, self.y_test) - cost = time.time() - start - return {'function_value': float(test_loss), - 'cost': cost, - 'info': {'fidelity': fidelity}} +class XGBoostBenchmark(MLBenchmark): + def __init__( + self, + task_id: Union[int, None] = None, + seed: Union[int, None] = None, # Union[np.random.RandomState, int, None] = None, + valid_size: float = 0.33, + fidelity_choice: int = 1 + ): + super(XGBoostBenchmark, self).__init__(task_id, seed, valid_size, fidelity_choice) + pass @staticmethod - def get_configuration_space(seed: Union[int, None] = None) -> CS.ConfigurationSpace: + def get_configuration_space(seed=None): + """Parameter space to be optimized --- contains the hyperparameters """ - Creates a ConfigSpace.ConfigurationSpace containing all parameters for - the XGBoost Model - - Parameters - ---------- - seed : int, None - Fixing the seed for the ConfigSpace.ConfigurationSpace - - Returns - ------- - ConfigSpace.ConfigurationSpace - """ - seed = seed if seed is not None else np.random.randint(1, 100000) cs = CS.ConfigurationSpace(seed=seed) cs.add_hyperparameters([ - CS.UniformFloatHyperparameter('eta', lower=2**-10, upper=1., default_value=0.3, log=True), - CS.UniformIntegerHyperparameter('max_depth', lower=1, upper=15, default_value=6, log=False), - CS.UniformFloatHyperparameter('min_child_weight', lower=1., upper=2**7., default_value=1., log=True), - CS.UniformFloatHyperparameter('colsample_bytree', lower=0.01, upper=1., default_value=1.), - CS.UniformFloatHyperparameter('colsample_bylevel', lower=0.01, upper=1., default_value=1.), - CS.UniformFloatHyperparameter('reg_lambda', lower=2**-10, upper=2**10, default_value=1, log=True), - CS.UniformFloatHyperparameter('reg_alpha', lower=2**-10, upper=2**10, default_value=1, log=True), - CS.UniformFloatHyperparameter('subsample_per_it', lower=0.1, upper=1, default_value=1, log=False) 
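# Worked example (assumed numbers) of the dataset_fraction lower bound applied in
# objective_function above: following Klein et al. (2017), the fraction is never
# allowed to fall below 10 * n_classes / n_train.
n_classes, n_train = 10, 2000                         # assumed task statistics
lower_bound_train_size = (10 * n_classes) / n_train   # = 0.05
requested_fraction = 0.02                             # fidelity['dataset_fraction']
train_fraction = max(requested_fraction, lower_bound_train_size)  # raised to 0.05
train_size = int(train_fraction * n_train)            # 100 training points are used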
+ CS.UniformFloatHyperparameter( + 'eta', lower=2**-10, upper=1., default_value=0.3, log=True + ), # learning rate + CS.UniformIntegerHyperparameter( + 'max_depth', lower=1, upper=15, default_value=6, log=False + ), + CS.UniformFloatHyperparameter( + 'min_child_weight', lower=1., upper=2**7., default_value=1., log=True + ), + CS.UniformFloatHyperparameter( + 'colsample_bytree', lower=0.01, upper=1., default_value=1. + ), + # CS.UniformFloatHyperparameter( + # 'colsample_bylevel', lower=0.01, upper=1., default_value=1. + # ), + CS.UniformFloatHyperparameter( + 'reg_lambda', lower=2**-10, upper=2**10, default_value=1, log=True + ), + # CS.UniformFloatHyperparameter( + # 'reg_alpha', lower=2**-10, upper=2**10, default_value=1, log=True + # ), + # CS.UniformFloatHyperparameter( + # 'subsample_per_it', lower=0.1, upper=1, default_value=1, log=False + # ) ]) - return cs @staticmethod - def get_fidelity_space(seed: Union[int, None] = None) -> CS.ConfigurationSpace: + def get_fidelity_space(seed=None, fidelity_choice=1): + """Fidelity space available --- specifies the fidelity dimensions + + If fidelity_choice is 0 + Fidelity space is the maximal fidelity, akin to a black-box function + If fidelity_choice is 1 + Fidelity space is a single fidelity, in this case the number of trees (n_estimators) + If fidelity_choice is 2 + Fidelity space is a single fidelity, in this case the fraction of dataset (subsample) + If fidelity_choice is >2 + Fidelity space is multi-multi fidelity, all possible fidelities """ - Creates a ConfigSpace.ConfigurationSpace containing all fidelity parameters for - the XGBoost Benchmark - - Parameters - ---------- - seed : int, None - Fixing the seed for the ConfigSpace.ConfigurationSpace - - Returns - ------- - ConfigSpace.ConfigurationSpace + z_cs = CS.ConfigurationSpace(seed=seed) + subsample_lower_bound = np.max((0.1, (0.1 or self.lower_bound_train_size))) + if fidelity_choice == 0: + # only subsample as fidelity + ntrees = CS.Constant('n_estimators', value=100) + subsample = CS.Constant('subsample', value=1) + elif fidelity_choice == 1: + # only n_estimators as fidelity + ntrees = CS.UniformIntegerHyperparameter( + 'n_estimators', lower=2, upper=100, default_value=10, log=False + ) + subsample = CS.Constant('subsample', value=1) + elif fidelity_choice == 2: + # only subsample as fidelity + ntrees = CS.Constant('n_estimators', value=100) + subsample = CS.UniformFloatHyperparameter( + 'subsample', lower=subsample_lower_bound, upper=1, default_value=1, log=False + ) + else: + # both n_estimators and subsample as fidelities + ntrees = CS.UniformIntegerHyperparameter( + 'n_estimators', lower=2, upper=100, default_value=10, log=False + ) + subsample = CS.UniformFloatHyperparameter( + 'subsample', lower=subsample_lower_bound, upper=1, default_value=1, log=False + ) + z_cs.add_hyperparameters([ntrees, subsample]) + return z_cs + + def init_model(self, config, fidelity=None, rng=None): + """ Function that returns the model initialized based on the configuration and fidelity """ - seed = seed if seed is not None else np.random.randint(1, 100000) - fidel_space = CS.ConfigurationSpace(seed=seed) - - fidel_space.add_hyperparameters([ - CS.UniformFloatHyperparameter("dataset_fraction", lower=0.0, upper=1.0, default_value=1.0, log=False), - CS.UniformIntegerHyperparameter("n_estimators", lower=1, upper=256, default_value=256, log=False) - ]) - - return fidel_space - - def get_meta_information(self) -> Dict: - """ Returns the meta information for the benchmark """ - return {'name': 
'XGBoost', - 'references': ['@article{probst2019tunability,' - 'title={Tunability: Importance of hyperparameters of machine learning algorithms.},' - 'author={Probst, Philipp and Boulesteix, Anne-Laure and Bischl, Bernd},' - 'journal={J. Mach. Learn. Res.},' - 'volume={20},' - 'number={53},' - 'pages={1--32},' - 'year={2019}' - '}'], - 'code': 'https://github.com/automl/HPOlib1.5/blob/development/hpolib/benchmarks/ml/' - 'xgboost_benchmark.py', - 'shape of train data': self.x_train.shape, - 'shape of test data': self.x_test.shape, - 'shape of valid data': self.x_valid.shape, - 'initial random seed': self.rng, - 'task_id': self.task_id - } - - def _get_pipeline(self, max_depth: int, eta: float, min_child_weight: int, - colsample_bytree: float, colsample_bylevel: float, reg_lambda: int, reg_alpha: int, - n_estimators: int, subsample_per_it: float) \ - -> pipeline.Pipeline: - """ Create the scikit-learn (training-)pipeline """ - objective = 'binary:logistic' if self.num_class <= 2 else 'multi:softmax' - - clf = pipeline.Pipeline([ - ('preprocess_impute', - ColumnTransformer([ - ("categorical", "passthrough", self.categorical_data), - ("continuous", SimpleImputer(strategy="mean"), ~self.categorical_data)])), - ('preprocess_one_hot', - ColumnTransformer([ - ("categorical", OneHotEncoder(categories=self.categories, sparse=False), self.categorical_data), - ("continuous", "passthrough", ~self.categorical_data)])), - ('xgb', - xgb.XGBClassifier( - max_depth=max_depth, - learning_rate=eta, - min_child_weight=min_child_weight, - colsample_bytree=colsample_bytree, - colsample_bylevel=colsample_bylevel, - reg_alpha=reg_alpha, - reg_lambda=reg_lambda, - n_estimators=n_estimators, - objective=objective, - n_jobs=self.n_threads, - random_state=self.rng.randint(1, 100000), - num_class=self.num_class, - subsample=subsample_per_it)) - ]) - return clf - - -class XGBoostExtendedBenchmark(XGBoostBenchmark): - """ - Similar to XGBoostBenchmark but enables also the optimization of the used booster. - """ - - def __init__(self, task_id: Union[int, None] = None, n_threads: int = 1, - rng: Union[np.random.RandomState, int, None] = None): - super(XGBoostExtendedBenchmark, self).__init__(task_id=task_id, n_threads=n_threads, rng=rng) - - @staticmethod - def get_configuration_space(seed: Union[int, None] = None) -> CS.ConfigurationSpace: - cs = XGBoostBenchmark.get_configuration_space(seed) - hp_booster = CS.CategoricalHyperparameter('booster', choices=['gbtree', 'gblinear', 'dart'], - default_value='gbtree') - cs.add_hyperparameter(hp_booster) - - # XGBoost with 'gblinear' can not use some - # parameters. Exclude them from the configuration space by introducing a condition. - hps = ['colsample_bylevel', 'colsample_bytree', 'max_depth', 'min_child_weight', 'subsample_per_it'] - - # The NotEqualsCondition means: "Make parameter X active if hp_booster is not equal to gblinear." 
- conditions = [CS.NotEqualsCondition(cs.get_hyperparameter(hp), hp_booster, 'gblinear') for hp in hps] - cs.add_conditions(conditions) - return cs - - # noinspection PyMethodOverriding - # pylint: disable=arguments-differ - def _get_pipeline(self, n_estimators: int, booster: str, reg_lambda: int, reg_alpha: int, eta: float, - min_child_weight: int = None, max_depth: int = None, colsample_bytree: float = None, - colsample_bylevel: float = None, subsample_per_it: float = None) \ - -> pipeline.Pipeline: - """ Create the scikit-learn (training-)pipeline """ - objective = 'binary:logistic' if self.num_class <= 2 else 'multi:softmax' - - configuration = dict(booster=booster, - max_depth=max_depth, - learning_rate=eta, - min_child_weight=min_child_weight, - colsample_bytree=colsample_bytree, - colsample_bylevel=colsample_bylevel, - reg_alpha=reg_alpha, - reg_lambda=reg_lambda, - n_estimators=n_estimators, - objective=objective, - n_jobs=self.n_threads, - random_state=self.rng.randint(1, 100000), - num_class=self.num_class, - subsample=subsample_per_it) - - configuration = {k: v for k, v in configuration.items() if v is not None} - - clf = pipeline.Pipeline([ - ('preprocess_impute', - ColumnTransformer([ - ("categorical", "passthrough", self.categorical_data), - ("continuous", SimpleImputer(strategy="mean"), ~self.categorical_data)])), - ('preprocess_one_hot', - ColumnTransformer([ - ("categorical", OneHotEncoder(categories=self.categories, sparse=False), self.categorical_data), - ("continuous", "passthrough", ~self.categorical_data)])), - ('xgb', - xgb.XGBClassifier(**configuration)) - ]) - return clf + rng = rng if (rng is None and isinstance(rng, int)) else self.seed + extra_args = dict( + n_estimators=fidelity['n_estimators'], + objective="binary:logistic", + random_state=rng, + subsample=1 + ) + if self.n_classes > 2: + extra_args["objective"] = "multi:softmax" + extra_args.update({"num_class": self.n_classes}) + model = xgb.XGBClassifier( + **config.get_dictionary(), + **extra_args + ) + return model diff --git a/hpobench/benchmarks/ml/xgboost_benchmark_old.py b/hpobench/benchmarks/ml/xgboost_benchmark_old.py new file mode 100644 index 00000000..fb380c89 --- /dev/null +++ b/hpobench/benchmarks/ml/xgboost_benchmark_old.py @@ -0,0 +1,426 @@ +""" + +Changelog: +========== +0.0.2: +* Change the search space definiton to match the paper: (https://arxiv.org/pdf/1802.09596.pdf) + eta: [1e-5, 1] (def: 0.3) -> [2**-10, 1] (def: 0.3) + min_child_weight: [0,05, 10] (def: 1) -> [1, 2**7] (def: 1) + colsample_bytree: [0,05, 1] (def: 1) -> [0.01, 1] (def: 1) + colsample_bylevel: [0,05, 1] (def: 1) -> [0.01, 1] (def: 1) + reg_lambda: [1e-5, 2] (def: 1) -> [2**-10, 2**10] (def: 1) + reg_alpha: [1e-5, 2] (def: 1e-5) -> [2**-10, 2**10] (def: 1) + max_depth: - -> [1, 15] (def: 6) + subsample_per_it: - -> [0.01, 1] (def: 1) + [booster: - -> [gbtree, gblinear, dart] (def: gbtree)] *) + + *) This parameter is only in the XGBoostExtendedBenchmark. Not in the XGBoostBenchmark class. + +* Increase the fidelity `n_estimators` + n_estimators [2, 128] (def: 128) -> [1, 256] (def: 256) + +* Add class to optimize also the used booster method: (gbtree, gblinear or dart) + We have introduced a new class, which adds the used booster as parameter to the configuration space. To read more + about booster, please take a look in the official XGBoost-documentation (https://xgboost.readthedocs.io/en/latest). + + +0.0.1: +* First implementation of a XGBoost Benchmark. 
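# Minimal sketch (illustrative only) of the conditional booster space described in
# the changelog above: hyperparameters that 'gblinear' cannot use are made active
# only for the other boosters via a NotEqualsCondition.
import ConfigSpace as CS

cs = CS.ConfigurationSpace(seed=0)
booster = CS.CategoricalHyperparameter(
    'booster', choices=['gbtree', 'gblinear', 'dart'], default_value='gbtree'
)
max_depth = CS.UniformIntegerHyperparameter('max_depth', lower=1, upper=15, default_value=6)
cs.add_hyperparameters([booster, max_depth])
# max_depth is only active while the sampled booster is not 'gblinear'
cs.add_condition(CS.NotEqualsCondition(max_depth, booster, 'gblinear'))
# configurations sampled with booster == 'gblinear' therefore contain no 'max_depth'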
+ + +""" + +import logging +import time +from typing import Union, Tuple, Dict, List + +import ConfigSpace as CS +import numpy as np +import xgboost as xgb +from sklearn import pipeline +from sklearn.compose import ColumnTransformer +from sklearn.impute import SimpleImputer +from sklearn.metrics import accuracy_score, make_scorer +from sklearn.preprocessing import OneHotEncoder + +import hpobench.util.rng_helper as rng_helper +from hpobench.abstract_benchmark import AbstractBenchmark +from hpobench.util.openml_data_manager import OpenMLHoldoutDataManager + +__version__ = '0.0.2' + +logger = logging.getLogger('XGBBenchmark') + + +class XGBoostBenchmark(AbstractBenchmark): + + def __init__(self, task_id: Union[int, None] = None, n_threads: int = 1, + rng: Union[np.random.RandomState, int, None] = None): + """ + + Parameters + ---------- + task_id : int, None + n_threads : int, None + rng : np.random.RandomState, int, None + """ + + super(XGBoostBenchmark, self).__init__(rng=rng) + self.n_threads = n_threads + self.task_id = task_id + self.accuracy_scorer = make_scorer(accuracy_score) + + self.x_train, self.y_train, self.x_valid, self.y_valid, self.x_test, self.y_test, variable_types = \ + self.get_data() + self.categorical_data = np.array([var_type == 'categorical' for var_type in variable_types]) + + # XGB needs sorted data. Data should be (Categorical + numerical) not mixed. + categorical_idx = np.argwhere(self.categorical_data) + continuous_idx = np.argwhere(~self.categorical_data) + sorting = np.concatenate([categorical_idx, continuous_idx]).squeeze() + self.categorical_data = self.categorical_data[sorting] + self.x_train = self.x_train[:, sorting] + self.x_valid = self.x_valid[:, sorting] + self.x_test = self.x_test[:, sorting] + + nan_columns = np.all(np.isnan(self.x_train), axis=0) + self.categorical_data = self.categorical_data[~nan_columns] + + self.x_train, self.x_valid, self.x_test, self.categories = \ + OpenMLHoldoutDataManager.replace_nans_in_cat_columns(self.x_train, self.x_valid, self.x_test, + is_categorical=self.categorical_data) + + # Determine the number of categories in the labels. + # In case of binary classification ``self.num_class`` has to be 1 for xgboost. + self.num_class = len(np.unique(np.concatenate([self.y_train, self.y_test, self.y_valid]))) + self.num_class = 1 if self.num_class == 2 else self.num_class + + self.train_idx = self.rng.choice(a=np.arange(len(self.x_train)), + size=len(self.x_train), + replace=False) + + # Similar to [Fast Bayesian Optimization of Machine Learning Hyperparameters on Large Datasets] + # (https://arxiv.org/pdf/1605.07079.pdf), + # use 10 time the number of classes as lower bound for the dataset fraction + n_classes = np.unique(self.y_train).shape[0] + self.lower_bound_train_size = (10 * n_classes) / self.x_train.shape[0] + + def get_data(self) -> Tuple[np.ndarray, np.ndarray, np.ndarray, np.ndarray, np.ndarray, np.ndarray, List]: + """ Loads the data given a task or another source. """ + + assert self.task_id is not None, NotImplementedError('No task-id given. Please either specify a task-id or ' + 'overwrite the get_data method.') + + data_manager = OpenMLHoldoutDataManager(openml_task_id=self.task_id, rng=self.rng) + x_train, y_train, x_val, y_val, x_test, y_test = data_manager.load() + + return x_train, y_train, x_val, y_val, x_test, y_test, data_manager.variable_types + + def shuffle_data(self, rng=None): + """ Reshuffle the training data. 
If 'rng' is None, the training idx are shuffled according to the + class-random-state""" + random_state = rng_helper.get_rng(rng, self.rng) + random_state.shuffle(self.train_idx) + + # pylint: disable=arguments-differ + @AbstractBenchmark.check_parameters + def objective_function(self, configuration: Union[CS.Configuration, Dict], + fidelity: Union[CS.Configuration, Dict, None] = None, + shuffle: bool = False, + rng: Union[np.random.RandomState, int, None] = None, **kwargs) -> Dict: + """ + Trains a XGBoost model given a hyperparameter configuration and + evaluates the model on the validation set. + + Parameters + ---------- + configuration : Dict, CS.Configuration + Configuration for the XGBoost model + fidelity: Dict, None + Fidelity parameters for the XGBoost model, check get_fidelity_space(). Uses default (max) value if None. + shuffle : bool + If ``True``, shuffle the training idx. If no parameter ``rng`` is given, use the class random state. + Defaults to ``False``. + rng : np.random.RandomState, int, None, + Random seed for benchmark. By default the class level random seed. + + To prevent overfitting on a single seed, it is possible to pass a + parameter ``rng`` as 'int' or 'np.random.RandomState' to this function. + If this parameter is not given, the default random state is used. + kwargs + + Returns + ------- + Dict - + function_value : validation loss + cost : time to train and evaluate the model + info : Dict + train_loss : trainings loss + fidelity : used fidelities in this evaluation + """ + self.rng = rng_helper.get_rng(rng=rng, self_rng=self.rng) + + if shuffle: + self.shuffle_data(self.rng) + + start = time.time() + + if self.lower_bound_train_size > fidelity['dataset_fraction']: + train_data_fraction = self.lower_bound_train_size + logger.warning(f'The given data set fraction is lower than the lower bound (10 * number of classes.) ' + f'Increase the fidelity from {fidelity["dataset_fraction"]:.8f} to ' + f'{self.lower_bound_train_size:.8f}') + else: + train_data_fraction = fidelity['dataset_fraction'] + + train_idx = self.train_idx[:int(len(self.train_idx) * train_data_fraction)] + + model = self._get_pipeline(n_estimators=fidelity["n_estimators"], **configuration) + model.fit(X=self.x_train[train_idx], y=self.y_train[train_idx]) + + train_loss = 1 - self.accuracy_scorer(model, self.x_train[train_idx], self.y_train[train_idx]) + val_loss = 1 - self.accuracy_scorer(model, self.x_valid, self.y_valid) + cost = time.time() - start + + return {'function_value': float(val_loss), + 'cost': cost, + 'info': {'train_loss': float(train_loss), + 'fidelity': fidelity} + } + + # pylint: disable=arguments-differ + @AbstractBenchmark.check_parameters + def objective_function_test(self, configuration: Union[CS.Configuration, Dict], + fidelity: Union[CS.Configuration, Dict, None] = None, + shuffle: bool = False, + rng: Union[np.random.RandomState, int, None] = None, **kwargs) -> Dict: + """ + Trains a XGBoost model with a given configuration on both the train + and validation data set and evaluates the model on the test data set. + + Parameters + ---------- + configuration : Dict, CS.Configuration + Configuration for the XGBoost Model + fidelity: Dict, None + Fidelity parameters, check get_fidelity_space(). Uses default (max) value if None. + shuffle : bool + If ``True``, shuffle the training idx. If no parameter ``rng`` is given, use the class random state. + Defaults to ``False``. + rng : np.random.RandomState, int, None, + Random seed for benchmark. 
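# Illustrative usage sketch (not part of the patch) for the XGBoostBenchmark defined
# in this file; the task id mirrors tests/test_whitebox.py and is only an example.
from hpobench.benchmarks.ml.xgboost_benchmark_old import XGBoostBenchmark

benchmark = XGBoostBenchmark(task_id=167199, rng=0)
config = benchmark.get_configuration_space(seed=0).sample_configuration()
# reduced fidelity: half of the training data and 64 boosting rounds
result = benchmark.objective_function(configuration=config,
                                      fidelity={'dataset_fraction': 0.5, 'n_estimators': 64})
print(result['function_value'], result['info']['train_loss'], result['cost'])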
By default the class level random seed. + To prevent overfitting on a single seed, it is possible to pass a + parameter ``rng`` as 'int' or 'np.random.RandomState' to this function. + If this parameter is not given, the default random state is used. + kwargs + + Returns + ------- + Dict - + function_value : test loss + cost : time to train and evaluate the model + info : Dict + fidelity : used fidelities in this evaluation + """ + default_dataset_fraction = self.get_fidelity_space().get_hyperparameter('dataset_fraction').default_value + if fidelity['dataset_fraction'] != default_dataset_fraction: + raise NotImplementedError(f'Test error can not be computed for dataset_fraction <= ' + f'{default_dataset_fraction}') + + self.rng = rng_helper.get_rng(rng=rng, self_rng=self.rng) + + if shuffle: + self.shuffle_data(self.rng) + + start = time.time() + + # Impute potential nan values with the feature- + data = np.concatenate((self.x_train, self.x_valid)) + targets = np.concatenate((self.y_train, self.y_valid)) + + model = self._get_pipeline(n_estimators=fidelity['n_estimators'], **configuration) + model.fit(X=data, y=targets) + + test_loss = 1 - self.accuracy_scorer(model, self.x_test, self.y_test) + cost = time.time() - start + + return {'function_value': float(test_loss), + 'cost': cost, + 'info': {'fidelity': fidelity}} + + @staticmethod + def get_configuration_space(seed: Union[int, None] = None) -> CS.ConfigurationSpace: + """ + Creates a ConfigSpace.ConfigurationSpace containing all parameters for + the XGBoost Model + + Parameters + ---------- + seed : int, None + Fixing the seed for the ConfigSpace.ConfigurationSpace + + Returns + ------- + ConfigSpace.ConfigurationSpace + """ + seed = seed if seed is not None else np.random.randint(1, 100000) + cs = CS.ConfigurationSpace(seed=seed) + + cs.add_hyperparameters([ + CS.UniformFloatHyperparameter('eta', lower=2**-10, upper=1., default_value=0.3, log=True), + CS.UniformIntegerHyperparameter('max_depth', lower=1, upper=15, default_value=6, log=False), + CS.UniformFloatHyperparameter('min_child_weight', lower=1., upper=2**7., default_value=1., log=True), + CS.UniformFloatHyperparameter('colsample_bytree', lower=0.01, upper=1., default_value=1.), + CS.UniformFloatHyperparameter('colsample_bylevel', lower=0.01, upper=1., default_value=1.), + CS.UniformFloatHyperparameter('reg_lambda', lower=2**-10, upper=2**10, default_value=1, log=True), + CS.UniformFloatHyperparameter('reg_alpha', lower=2**-10, upper=2**10, default_value=1, log=True), + CS.UniformFloatHyperparameter('subsample_per_it', lower=0.1, upper=1, default_value=1, log=False) + ]) + + return cs + + @staticmethod + def get_fidelity_space(seed: Union[int, None] = None) -> CS.ConfigurationSpace: + """ + Creates a ConfigSpace.ConfigurationSpace containing all fidelity parameters for + the XGBoost Benchmark + + Parameters + ---------- + seed : int, None + Fixing the seed for the ConfigSpace.ConfigurationSpace + + Returns + ------- + ConfigSpace.ConfigurationSpace + """ + seed = seed if seed is not None else np.random.randint(1, 100000) + fidel_space = CS.ConfigurationSpace(seed=seed) + + fidel_space.add_hyperparameters([ + CS.UniformFloatHyperparameter("dataset_fraction", lower=0.0, upper=1.0, default_value=1.0, log=False), + CS.UniformIntegerHyperparameter("n_estimators", lower=1, upper=256, default_value=256, log=False) + ]) + + return fidel_space + + def get_meta_information(self) -> Dict: + """ Returns the meta information for the benchmark """ + return {'name': 'XGBoost', + 'references': 
['@article{probst2019tunability,' + 'title={Tunability: Importance of hyperparameters of machine learning algorithms.},' + 'author={Probst, Philipp and Boulesteix, Anne-Laure and Bischl, Bernd},' + 'journal={J. Mach. Learn. Res.},' + 'volume={20},' + 'number={53},' + 'pages={1--32},' + 'year={2019}' + '}'], + 'code': 'https://github.com/automl/HPOlib1.5/blob/development/hpolib/benchmarks/ml/' + 'xgboost_benchmark_old.py', + 'shape of train data': self.x_train.shape, + 'shape of test data': self.x_test.shape, + 'shape of valid data': self.x_valid.shape, + 'initial random seed': self.rng, + 'task_id': self.task_id + } + + def _get_pipeline(self, max_depth: int, eta: float, min_child_weight: int, + colsample_bytree: float, colsample_bylevel: float, reg_lambda: int, reg_alpha: int, + n_estimators: int, subsample_per_it: float) \ + -> pipeline.Pipeline: + """ Create the scikit-learn (training-)pipeline """ + objective = 'binary:logistic' if self.num_class <= 2 else 'multi:softmax' + + clf = pipeline.Pipeline([ + ('preprocess_impute', + ColumnTransformer([ + ("categorical", "passthrough", self.categorical_data), + ("continuous", SimpleImputer(strategy="mean"), ~self.categorical_data)])), + ('preprocess_one_hot', + ColumnTransformer([ + ("categorical", OneHotEncoder(categories=self.categories, sparse=False), self.categorical_data), + ("continuous", "passthrough", ~self.categorical_data)])), + ('xgb', + xgb.XGBClassifier( + max_depth=max_depth, + learning_rate=eta, + min_child_weight=min_child_weight, + colsample_bytree=colsample_bytree, + colsample_bylevel=colsample_bylevel, + reg_alpha=reg_alpha, + reg_lambda=reg_lambda, + n_estimators=n_estimators, + objective=objective, + n_jobs=self.n_threads, + random_state=self.rng.randint(1, 100000), + num_class=self.num_class, + subsample=subsample_per_it)) + ]) + return clf + + +class XGBoostExtendedBenchmark(XGBoostBenchmark): + """ + Similar to XGBoostBenchmark but enables also the optimization of the used booster. + """ + + def __init__(self, task_id: Union[int, None] = None, n_threads: int = 1, + rng: Union[np.random.RandomState, int, None] = None): + super(XGBoostExtendedBenchmark, self).__init__(task_id=task_id, n_threads=n_threads, rng=rng) + + @staticmethod + def get_configuration_space(seed: Union[int, None] = None) -> CS.ConfigurationSpace: + cs = XGBoostBenchmark.get_configuration_space(seed) + hp_booster = CS.CategoricalHyperparameter('booster', choices=['gbtree', 'gblinear', 'dart'], + default_value='gbtree') + cs.add_hyperparameter(hp_booster) + + # XGBoost with 'gblinear' can not use some + # parameters. Exclude them from the configuration space by introducing a condition. + hps = ['colsample_bylevel', 'colsample_bytree', 'max_depth', 'min_child_weight', 'subsample_per_it'] + + # The NotEqualsCondition means: "Make parameter X active if hp_booster is not equal to gblinear." 
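+        # In practice this means a configuration sampled with booster == 'gblinear'
+        # will not contain any of the hyperparameters listed in `hps` above.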
+ conditions = [CS.NotEqualsCondition(cs.get_hyperparameter(hp), hp_booster, 'gblinear') for hp in hps] + cs.add_conditions(conditions) + return cs + + # noinspection PyMethodOverriding + # pylint: disable=arguments-differ + def _get_pipeline(self, n_estimators: int, booster: str, reg_lambda: int, reg_alpha: int, eta: float, + min_child_weight: int = None, max_depth: int = None, colsample_bytree: float = None, + colsample_bylevel: float = None, subsample_per_it: float = None) \ + -> pipeline.Pipeline: + """ Create the scikit-learn (training-)pipeline """ + objective = 'binary:logistic' if self.num_class <= 2 else 'multi:softmax' + + configuration = dict(booster=booster, + max_depth=max_depth, + learning_rate=eta, + min_child_weight=min_child_weight, + colsample_bytree=colsample_bytree, + colsample_bylevel=colsample_bylevel, + reg_alpha=reg_alpha, + reg_lambda=reg_lambda, + n_estimators=n_estimators, + objective=objective, + n_jobs=self.n_threads, + random_state=self.rng.randint(1, 100000), + num_class=self.num_class, + subsample=subsample_per_it) + + configuration = {k: v for k, v in configuration.items() if v is not None} + + clf = pipeline.Pipeline([ + ('preprocess_impute', + ColumnTransformer([ + ("categorical", "passthrough", self.categorical_data), + ("continuous", SimpleImputer(strategy="mean"), ~self.categorical_data)])), + ('preprocess_one_hot', + ColumnTransformer([ + ("categorical", OneHotEncoder(categories=self.categories, sparse=False), self.categorical_data), + ("continuous", "passthrough", ~self.categorical_data)])), + ('xgb', + xgb.XGBClassifier(**configuration)) + ]) + return clf diff --git a/tests/test_utils.py b/tests/test_utils.py index 885ce606..9bc5ff3b 100644 --- a/tests/test_utils.py +++ b/tests/test_utils.py @@ -64,7 +64,7 @@ def test_rng_serialization(): def test_rng_serialization_xgb(): import json from hpobench.util.container_utils import BenchmarkEncoder, BenchmarkDecoder - from hpobench.benchmarks.ml.xgboost_benchmark import XGBoostBenchmark + from hpobench.benchmarks.ml.xgboost_benchmark_old import XGBoostBenchmark b = XGBoostBenchmark(task_id=167149, rng=0) meta = b.get_meta_information() diff --git a/tests/test_whitebox.py b/tests/test_whitebox.py index 7e4c32aa..c3f5e0ff 100644 --- a/tests/test_whitebox.py +++ b/tests/test_whitebox.py @@ -14,7 +14,7 @@ def test_whitebox_without_container_xgb(): - from hpobench.benchmarks.ml.xgboost_benchmark import XGBoostBenchmark as Benchmark + from hpobench.benchmarks.ml.xgboost_benchmark_old import XGBoostBenchmark as Benchmark b = Benchmark(task_id=167199, rng=0) cs = b.get_configuration_space(seed=0) From 3f84afbe466b83910d95f0b074ccb0d1046f35ed Mon Sep 17 00:00:00 2001 From: neeratyoy Date: Wed, 23 Jun 2021 20:32:38 +0200 Subject: [PATCH 013/147] Adding sample RF space for tabular collection design --- hpobench/benchmarks/ml/rf_benchmark.py | 391 +++++++++++++++++++++++++ 1 file changed, 391 insertions(+) create mode 100644 hpobench/benchmarks/ml/rf_benchmark.py diff --git a/hpobench/benchmarks/ml/rf_benchmark.py b/hpobench/benchmarks/ml/rf_benchmark.py new file mode 100644 index 00000000..35684c00 --- /dev/null +++ b/hpobench/benchmarks/ml/rf_benchmark.py @@ -0,0 +1,391 @@ +import time +import openml +import numpy as np +import pandas as pd +import ConfigSpace as CS +from typing import Union, Dict + +from sklearn.impute import SimpleImputer +from sklearn.pipeline import make_pipeline +from sklearn.utils import check_random_state +from sklearn.compose import ColumnTransformer +from sklearn.preprocessing import 
OneHotEncoder +from sklearn.preprocessing import StandardScaler +from sklearn.ensemble import RandomForestClassifier +from sklearn.model_selection import train_test_split +from sklearn.metrics import accuracy_score, make_scorer + +import hpobench.util.rng_helper as rng_helper +from hpobench.abstract_benchmark import AbstractBenchmark + + +class RandomForestBenchmark(AbstractBenchmark): + _issue_tasks = [3917, 3945] + + def __init__( + self, + task_id: Union[int, None] = None, + seed: Union[int, None] = None, # Union[np.random.RandomState, int, None] = None, + valid_size: float = 0.33, + fidelity_choice: int = 1, + benchmark_type: str = "raw" + ): + self.seed = seed if seed is not None else np.random.randint(1, 10 ** 6) + self.rng = check_random_state(self.seed) + super(RandomForestBenchmark, self).__init__(rng=seed) + + self.benchmark_type = benchmark_type + self.task_id = task_id + self.valid_size = valid_size + self.accuracy_scorer = make_scorer(accuracy_score) + + # Data variables + self.train_X = None + self.valid_X = None + self.test_X = None + self.train_y = None + self.valid_y = None + self.test_y = None + self.train_idx = None + self.test_idx = None + self.task = None + self.dataset = None + self.preprocessor = None + self.lower_bound_train_size = None + self.load_data_from_openml() + + # Observation and fidelity spaces + self.fidelity_choice = fidelity_choice + self.z_cs = self.get_fidelity_space(self.seed, self.fidelity_choice) + self.x_cs = self.get_configuration_space(self.seed) + + @staticmethod + def get_configuration_space(seed=None): + """Parameter space to be optimized --- contains the hyperparameters + """ + cs = CS.ConfigurationSpace(seed=seed) + + cs.add_hyperparameters([ + CS.UniformIntegerHyperparameter( + 'max_depth', lower=1, upper=15, default_value=2, log=False + ), + CS.UniformIntegerHyperparameter( + 'min_samples_split', lower=2, upper=128, default_value=2, log=True + ), + CS.UniformFloatHyperparameter( + 'max_features', lower=0.1, upper=0.9, default_value=0.5, log=False + ), + CS.UniformIntegerHyperparameter( + 'min_samples_leaf', lower=1, upper=64, default_value=1, log=True + ), + ]) + return cs + + @staticmethod + def get_fidelity_space(seed=None, fidelity_choice=1): + """Fidelity space available --- specifies the fidelity dimensions + + If fidelity_choice is 0 + Fidelity space is the maximal fidelity, akin to a black-box function + If fidelity_choice is 1 + Fidelity space is a single fidelity, in this case the number of trees (n_estimators) + If fidelity_choice is 2 + Fidelity space is a single fidelity, in this case the fraction of dataset (subsample) + If fidelity_choice is >2 + Fidelity space is multi-multi fidelity, all possible fidelities + """ + z_cs = CS.ConfigurationSpace(seed=seed) + subsample_lower_bound = np.max((0.1, (0.1 or self.lower_bound_train_size))) + if fidelity_choice == 0: + # only subsample as fidelity + ntrees = CS.Constant('n_estimators', value=100) + subsample = CS.Constant('subsample', value=1) + elif fidelity_choice == 1: + # only n_estimators as fidelity + ntrees = CS.UniformIntegerHyperparameter( + 'n_estimators', lower=2, upper=100, default_value=10, log=False + ) + subsample = CS.Constant('subsample', value=1) + elif fidelity_choice == 2: + # only subsample as fidelity + ntrees = CS.Constant('n_estimators', value=100) + subsample = CS.UniformFloatHyperparameter( + 'subsample', lower=subsample_lower_bound, upper=1, default_value=0.33, log=False + ) + else: + # both n_estimators and subsample as fidelities + ntrees = 
CS.UniformIntegerHyperparameter( + 'n_estimators', lower=2, upper=100, default_value=10, log=False + ) + subsample = CS.UniformFloatHyperparameter( + 'subsample', lower=subsample_lower_bound, upper=1, default_value=0.33, log=False + ) + z_cs.add_hyperparameters([ntrees, subsample]) + return z_cs + + def get_config(self, size=None): + """Samples configuration(s) from the (hyper) parameter space + """ + if size is None: # return only one config + return self.x_cs.sample_configuration() + return [self.x_cs.sample_configuration() for i in range(size)] + + def get_fidelity(self, size=None): + """Samples candidate fidelities from the fidelity space + """ + if size is None: # return only one config + return self.z_cs.sample_configuration() + return [self.z_cs.sample_configuration() for i in range(size)] + + def _convert_labels(self, labels): + """Converts boolean labels (if exists) to strings + """ + label_types = list(map(lambda x: isinstance(x, bool), labels)) + if np.all(label_types): + _labels = list(map(lambda x: str(x), labels)) + if isinstance(labels, pd.Series): + labels = pd.Series(_labels, index=labels.index) + elif isinstance(labels, np.array): + labels = np.array(labels) + return labels + + def load_data_from_openml(self, valid_size=None, verbose=False): + """Fetches data from OpenML and initializes the train-validation-test data splits + + The validation set is fixed till this function is called again or explicitly altered + """ + # fetches task + self.task = openml.tasks.get_task(self.task_id, download_data=False) + # fetches dataset + self.dataset = openml.datasets.get_dataset(self.task.dataset_id, download_data=False) + if verbose: + print(self.task, '\n') + print(self.dataset, '\n') + + # loads full data + X, y, categorical_ind, feature_names = self.dataset.get_data( + target=self.task.target_name, dataset_format="dataframe" + ) + categorical_ind = np.array(categorical_ind) + (cat_idx,) = np.where(categorical_ind) + (cont_idx,) = np.where(~categorical_ind) + + # splitting dataset into train and test (10% test) + # train-test split is fixed for a task and its associated dataset + self.train_idx, self.test_idx = self.task.get_train_test_split_indices() + train_x = X.iloc[self.train_idx] + train_y = y.iloc[self.train_idx] + self.test_X = X.iloc[self.test_idx] + self.test_y = y.iloc[self.test_idx] + + # splitting training into training and validation + # validation set is fixed till this function is called again or explicitly altered + valid_size = self.valid_size if valid_size is None else valid_size + self.train_X, self.valid_X, self.train_y, self.valid_y = train_test_split( + train_x, train_y, test_size=valid_size, + shuffle=True, stratify=train_y, random_state=self.rng + ) + + # preprocessor to handle missing values, categorical columns encodings, + # and scaling numeric columns + self.preprocessor = make_pipeline( + ColumnTransformer([ + ( + "cat", + make_pipeline(SimpleImputer(strategy="most_frequent"), + OneHotEncoder(sparse=False, handle_unknown="ignore")), + cat_idx.tolist(), + ), + ( + "cont", + make_pipeline(SimpleImputer(strategy="median"), + StandardScaler()), + cont_idx.tolist(), + ) + ]) + ) + if verbose: + print("Shape of data pre-preprocessing: {}".format(train_X.shape)) + + # preprocessor fit only on the training set + self.train_X = self.preprocessor.fit_transform(self.train_X) + # applying preprocessor built on the training set, across validation and test splits + self.valid_X = self.preprocessor.transform(self.valid_X) + self.test_X = 
self.preprocessor.transform(self.test_X) + # converting boolean labels to strings + self.train_y = self._convert_labels(self.train_y) + self.valid_y = self._convert_labels(self.valid_y) + self.test_y = self._convert_labels(self.test_y) + + # Similar to (https://arxiv.org/pdf/1605.07079.pdf) + # use 10 times the number of classes as lower bound for the dataset fraction + n_classes = len(self.task.class_labels) # np.unique(self.train_y).shape[0] + self.lower_bound_train_size = (10 * n_classes) / self.train_X.shape[0] + + if verbose: + print("Shape of data post-preprocessing: {}".format(train_X.shape), "\n") + + if verbose: + print("\nTraining data (X, y): ({}, {})".format(self.train_X.shape, self.train_y.shape)) + print("Validation data (X, y): ({}, {})".format(self.valid_X.shape, self.valid_y.shape)) + print("Test data (X, y): ({}, {})".format(self.test_X.shape, self.test_y.shape)) + print("\nData loading complete!\n") + return + + def shuffle_data_idx(self, train_id=None, ng=None): + rng = self.rng if rng is None else rng + train_idx = self.train_idx if train_idx is None else train_idx + rng.shuffle(train_idx) + return train_idx + + def _raw_objective(self, config, fidelity, shuffle, rng, eval="valid"): + start = time.time() + + # initializing model + model = RandomForestClassifier( + **config.get_dictionary(), + n_estimators=fidelity['n_estimators'], # a fidelity being used during initialization + bootstrap=True, + random_state=self.rng + ) + + # preparing data + if eval == "valid": + train_X = self.train_X + train_y = self.train_y + train_idx = self.train_idx + else: + train_X = np.vstack((self.train_X, self.valid_X)) + train_y = pd.concat((self.train_y, self.valid_y)) + train_idx = np.arange(len(train_X)) + + # shuffling data + if shuffle: + train_idx = self.shuffle_data_idx(train_idx, rng) + train_X = train_X.iloc[train_idx] + train_y = train_y.iloc[train_idx] + + # subsample here + # application of the other fidelity to the dataset that the model interfaces + train_idx = self.rng.choice( + np.arange(len(train_X)), size=int( + fidelity['subsample'] * len(train_X) + ) + ) + # fitting the model with subsampled data + model.fit(train_X[train_idx], train_y.iloc[train_idx]) + # computing statistics on training data + train_loss = 1 - self.accuracy_scorer(model, train_X, train_y) + + model_fit_time = time.time() - start + return model, model_fit_time, train_loss + + def objective( + self, + configuration: Union[CS.Configuration, Dict], + fidelity: Union[CS.Configuration, Dict, None] = None, + shuffle: bool = False, + rng: Union[np.random.RandomState, int, None] = None, + **kwargs + ) -> Dict: + """Function that evaluates a 'config' on a 'fidelity' on the validation set + """ + if self.benchmark_type == "raw": + model, model_fit_time, train_loss = self._raw_objective( + configuration, fidelity, shuffle, rng + ) + else: + #TODO: add cases for `tabular` and `surrogate` benchmarks + pass + + start = time.time() + val_loss = 1 - self.accuracy_scorer(model, self.valid_X, self.valid_y) + eval_time = time.time() - start + + info = { + 'train_loss': train_loss, + 'val_loss': val_loss, + 'cost': model_fit_time + eval_time, + 'training_cost': model_fit_time, + 'evaluation_cost': eval_time, + # storing as dictionary and not ConfigSpace saves tremendous memory + 'fidelity': fidelity.get_dictionary(), + 'config': configuration.get_dictionary() + } + + return { + 'function_value': info['val_loss'], + 'cost': info['cost'], + 'info': info + } + + def objective_test( + self, + configuration: 
Union[CS.Configuration, Dict], + fidelity: Union[CS.Configuration, Dict, None] = None, + shuffle: bool = False, + rng: Union[np.random.RandomState, int, None] = None, + **kwargs + ) -> Dict: + """Function that evaluates a 'config' on a 'fidelity' on the test set + """ + if self.benchmark_type == "raw": + model, model_fit_time, train_loss = self._raw_objective( + configuration, fidelity, shuffle, rng, eval="test" + ) + else: + #TODO: add cases for `tabular` and `surrogate` benchmarks + pass + + start = time.time() + val_loss = 1 - self.accuracy_scorer(model, self.test_X, self.test_y) + eval_time = time.time() - start + + info = { + 'train_loss': train_loss, + 'val_loss': val_loss, + 'cost': model_fit_time + eval_time, + 'training_cost': model_fit_time, + 'evaluation_cost': eval_time, + # storing as dictionary and not ConfigSpace saves tremendous memory + 'fidelity': fidelity.get_dictionary(), + 'config': configuration.get_dictionary() + } + + return { + 'function_value': info['val_loss'], + 'cost': info['cost'], + 'info': info + } + + # pylint: disable=arguments-differ + @AbstractBenchmark.check_parameters + def objective_function( + self, + configuration: Union[CS.Configuration, Dict], + fidelity: Union[CS.Configuration, Dict, None] = None, + shuffle: bool = False, + rng: Union[np.random.RandomState, int, None] = None, + **kwargs + ) -> Dict: + """Function that evaluates a 'config' on a 'fidelity' on the validation set + """ + return dict() + + # pylint: disable=arguments-differ + @AbstractBenchmark.check_parameters + def objective_function_test( + self, + configuration: Union[CS.Configuration, Dict], + fidelity: Union[CS.Configuration, Dict, None] = None, + shuffle: bool = False, + rng: Union[np.random.RandomState, int, None] = None, + **kwargs + ) -> Dict: + """Function that evaluates a 'config' on a 'fidelity' on the test set + """ + return dict() + + def get_meta_information(self): + """ Returns the meta information for the benchmark """ + pass From 09b296a4a1ef18e0a11b0e0dfa25ebb6345c6427 Mon Sep 17 00:00:00 2001 From: neeratyoy Date: Wed, 23 Jun 2021 20:57:37 +0200 Subject: [PATCH 014/147] Placeholder SVM benchmark to interface tabular data collection --- hpobench/benchmarks/ml/svm_benchmark_2.py | 371 ++++++++++++++++++++++ 1 file changed, 371 insertions(+) create mode 100644 hpobench/benchmarks/ml/svm_benchmark_2.py diff --git a/hpobench/benchmarks/ml/svm_benchmark_2.py b/hpobench/benchmarks/ml/svm_benchmark_2.py new file mode 100644 index 00000000..6e8ec6c9 --- /dev/null +++ b/hpobench/benchmarks/ml/svm_benchmark_2.py @@ -0,0 +1,371 @@ +import time +import openml +import numpy as np +import pandas as pd +import ConfigSpace as CS +from typing import Union, Dict + +from sklearn.svm import SVC +from sklearn.impute import SimpleImputer +from sklearn.utils import check_random_state +from sklearn.compose import ColumnTransformer +from sklearn.preprocessing import OneHotEncoder +from sklearn.preprocessing import StandardScaler +from sklearn.model_selection import train_test_split +from sklearn.pipeline import make_pipeline, Pipeline +from sklearn.metrics import accuracy_score, make_scorer + +import hpobench.util.rng_helper as rng_helper +from hpobench.abstract_benchmark import AbstractBenchmark + + +class SVMBenchmark(AbstractBenchmark): + _issue_tasks = [3917, 3945] + + def __init__( + self, + task_id: Union[int, None] = None, + seed: Union[int, None] = None, # Union[np.random.RandomState, int, None] = None, + valid_size: float = 0.33, + fidelity_choice: int = 1, + benchmark_type: str = 
"raw" + ): + self.seed = seed if seed is not None else np.random.randint(1, 10 ** 6) + self.rng = check_random_state(self.seed) + super(SVMBenchmark, self).__init__(rng=seed) + + self.benchmark_type = benchmark_type + self.task_id = task_id + self.valid_size = valid_size + self.accuracy_scorer = make_scorer(accuracy_score) + #TODO: check the cache_size parameter from sklearn docs + self.cache_size = 200 + + # Data variables + self.train_X = None + self.valid_X = None + self.test_X = None + self.train_y = None + self.valid_y = None + self.test_y = None + self.train_idx = None + self.test_idx = None + self.task = None + self.dataset = None + self.preprocessor = None + self.lower_bound_train_size = None + self.load_data_from_openml() + + # Observation and fidelity spaces + self.fidelity_choice = fidelity_choice + self.z_cs = self.get_fidelity_space(self.seed, self.fidelity_choice) + self.x_cs = self.get_configuration_space(self.seed) + + @staticmethod + def get_configuration_space(seed=None): + """Parameter space to be optimized --- contains the hyperparameters + """ + cs = CS.ConfigurationSpace(seed=seed) + + cs.add_hyperparameters([ + CS.UniformFloatHyperparameter( + 'C', lower=-10., upper=10., default_value=0., log=False + ), + CS.UniformFloatHyperparameter( + 'gamma', lower=-10., upper=10., default_value=1., log=False + ), + ]) + return cs + + @staticmethod + def get_fidelity_space(seed=None, fidelity_choice=None): + """Fidelity space available --- specifies the fidelity dimensions + + For SVM, only a single fidelity exists, i.e., subsample fraction. + if fidelity_choice == 0 + uses the entire data (subsample=1), reflecting the black-box setup + else + parameterizes the fraction of data to subsample + + """ + z_cs = CS.ConfigurationSpace(seed=seed) + subsample_lower_bound = np.max((0.1, (0.1 or self.lower_bound_train_size))) + if fidelity_choice == 0: + subsample = CS.Constant('subsample', value=1) + else: + subsample = CS.UniformFloatHyperparameter( + 'subsample', lower=subsample_lower_bound, upper=1, default_value=0.33, log=False + ) + z_cs.add_hyperparameter(subsample) + return z_cs + + def get_config(self, size=None): + """Samples configuration(s) from the (hyper) parameter space + """ + if size is None: # return only one config + return self.x_cs.sample_configuration() + return [self.x_cs.sample_configuration() for i in range(size)] + + def get_fidelity(self, size=None): + """Samples candidate fidelities from the fidelity space + """ + if size is None: # return only one config + return self.z_cs.sample_configuration() + return [self.z_cs.sample_configuration() for i in range(size)] + + def _convert_labels(self, labels): + """Converts boolean labels (if exists) to strings + """ + label_types = list(map(lambda x: isinstance(x, bool), labels)) + if np.all(label_types): + _labels = list(map(lambda x: str(x), labels)) + if isinstance(labels, pd.Series): + labels = pd.Series(_labels, index=labels.index) + elif isinstance(labels, np.array): + labels = np.array(labels) + return labels + + def load_data_from_openml(self, valid_size=None, verbose=False): + """Fetches data from OpenML and initializes the train-validation-test data splits + + The validation set is fixed till this function is called again or explicitly altered + """ + # fetches task + self.task = openml.tasks.get_task(self.task_id, download_data=False) + # fetches dataset + self.dataset = openml.datasets.get_dataset(self.task.dataset_id, download_data=False) + if verbose: + print(self.task, '\n') + print(self.dataset, '\n') + + # 
loads full data + X, y, categorical_ind, feature_names = self.dataset.get_data( + target=self.task.target_name, dataset_format="dataframe" + ) + categorical_ind = np.array(categorical_ind) + (cat_idx,) = np.where(categorical_ind) + (cont_idx,) = np.where(~categorical_ind) + + # splitting dataset into train and test (10% test) + # train-test split is fixed for a task and its associated dataset + self.train_idx, self.test_idx = self.task.get_train_test_split_indices() + train_x = X.iloc[self.train_idx] + train_y = y.iloc[self.train_idx] + self.test_X = X.iloc[self.test_idx] + self.test_y = y.iloc[self.test_idx] + + # splitting training into training and validation + # validation set is fixed till this function is called again or explicitly altered + valid_size = self.valid_size if valid_size is None else valid_size + self.train_X, self.valid_X, self.train_y, self.valid_y = train_test_split( + train_x, train_y, test_size=valid_size, + shuffle=True, stratify=train_y, random_state=self.rng + ) + + # preprocessor to handle missing values, categorical columns encodings, + # and scaling numeric columns + self.preprocessor = make_pipeline( + ColumnTransformer([ + ( + "cat", + make_pipeline(SimpleImputer(strategy="most_frequent"), + OneHotEncoder(sparse=False, handle_unknown="ignore")), + cat_idx.tolist(), + ), + ( + "cont", + make_pipeline(SimpleImputer(strategy="median"), + StandardScaler()), + cont_idx.tolist(), + ) + ]) + ) + if verbose: + print("Shape of data pre-preprocessing: {}".format(train_X.shape)) + + # preprocessor fit only on the training set + self.train_X = self.preprocessor.fit_transform(self.train_X) + # applying preprocessor built on the training set, across validation and test splits + self.valid_X = self.preprocessor.transform(self.valid_X) + self.test_X = self.preprocessor.transform(self.test_X) + # converting boolean labels to strings + self.train_y = self._convert_labels(self.train_y) + self.valid_y = self._convert_labels(self.valid_y) + self.test_y = self._convert_labels(self.test_y) + + # Similar to (https://arxiv.org/pdf/1605.07079.pdf) + # use 10 times the number of classes as lower bound for the dataset fraction + n_classes = len(self.task.class_labels) # np.unique(self.train_y).shape[0] + self.lower_bound_train_size = (10 * n_classes) / self.train_X.shape[0] + + if verbose: + print("Shape of data post-preprocessing: {}".format(train_X.shape), "\n") + + if verbose: + print("\nTraining data (X, y): ({}, {})".format(self.train_X.shape, self.train_y.shape)) + print("Validation data (X, y): ({}, {})".format(self.valid_X.shape, self.valid_y.shape)) + print("Test data (X, y): ({}, {})".format(self.test_X.shape, self.test_y.shape)) + print("\nData loading complete!\n") + return + + def shuffle_data_idx(self, train_id=None, ng=None): + rng = self.rng if rng is None else rng + train_idx = self.train_idx if train_idx is None else train_idx + rng.shuffle(train_idx) + return train_idx + + def _raw_objective(self, config, fidelity, shuffle, rng, eval="valid"): + start = time.time() + + # initializing model + rng = self.rng if rng is None else rng + config = config.get_dictionary() + for k, v in config.items(): + config[k] = np.exp(float(v)) + model = SVC( + **config, + random_state=rng, + cache_size=self.cache_size + + ) + + # preparing data + if eval == "valid": + train_X = self.train_X + train_y = self.train_y + train_idx = self.train_idx + else: + train_X = np.vstack((self.train_X, self.valid_X)) + train_y = pd.concat((self.train_y, self.valid_y)) + train_idx = 
np.arange(len(train_X)) + + # shuffling data + if shuffle: + train_idx = self.shuffle_data_idx(train_idx, rng) + train_X = train_X.iloc[train_idx] + train_y = train_y.iloc[train_idx] + + # subsample here + # application of the other fidelity to the dataset that the model interfaces + train_idx = self.rng.choice( + np.arange(len(train_X)), size=int( + fidelity['subsample'] * len(train_X) + ) + ) + # fitting the model with subsampled data + model.fit(train_X[train_idx], train_y.iloc[train_idx]) + # computing statistics on training data + train_loss = 1 - self.accuracy_scorer(model, train_X, train_y) + + model_fit_time = time.time() - start + return model, model_fit_time, train_loss + + def objective( + self, + configuration: Union[CS.Configuration, Dict], + fidelity: Union[CS.Configuration, Dict, None] = None, + shuffle: bool = False, + rng: Union[np.random.RandomState, int, None] = None, + **kwargs + ) -> Dict: + """Function that evaluates a 'config' on a 'fidelity' on the validation set + """ + if self.benchmark_type == "raw": + model, model_fit_time, train_loss = self._raw_objective( + configuration, fidelity, shuffle, rng + ) + else: + #TODO: add cases for `tabular` and `surrogate` benchmarks + pass + + start = time.time() + val_loss = 1 - self.accuracy_scorer(model, self.valid_X, self.valid_y) + eval_time = time.time() - start + + info = { + 'train_loss': train_loss, + 'val_loss': val_loss, + 'cost': model_fit_time + eval_time, + 'training_cost': model_fit_time, + 'evaluation_cost': eval_time, + # storing as dictionary and not ConfigSpace saves tremendous memory + 'fidelity': fidelity.get_dictionary(), + 'config': configuration.get_dictionary() + } + + return { + 'function_value': info['val_loss'], + 'cost': info['cost'], + 'info': info + } + + def objective_test( + self, + configuration: Union[CS.Configuration, Dict], + fidelity: Union[CS.Configuration, Dict, None] = None, + shuffle: bool = False, + rng: Union[np.random.RandomState, int, None] = None, + **kwargs + ) -> Dict: + """Function that evaluates a 'config' on a 'fidelity' on the test set + """ + if self.benchmark_type == "raw": + model, model_fit_time, train_loss = self._raw_objective( + configuration, fidelity, shuffle, rng, eval="test" + ) + else: + #TODO: add cases for `tabular` and `surrogate` benchmarks + pass + + start = time.time() + val_loss = 1 - self.accuracy_scorer(model, self.test_X, self.test_y) + eval_time = time.time() - start + + info = { + 'train_loss': train_loss, + 'val_loss': val_loss, + 'cost': model_fit_time + eval_time, + 'training_cost': model_fit_time, + 'evaluation_cost': eval_time, + # storing as dictionary and not ConfigSpace saves tremendous memory + 'fidelity': fidelity.get_dictionary(), + 'config': configuration.get_dictionary() + } + + return { + 'function_value': info['val_loss'], + 'cost': info['cost'], + 'info': info + } + + # pylint: disable=arguments-differ + @AbstractBenchmark.check_parameters + def objective_function( + self, + configuration: Union[CS.Configuration, Dict], + fidelity: Union[CS.Configuration, Dict, None] = None, + shuffle: bool = False, + rng: Union[np.random.RandomState, int, None] = None, + **kwargs + ) -> Dict: + """Function that evaluates a 'config' on a 'fidelity' on the validation set + """ + return dict() + + # pylint: disable=arguments-differ + @AbstractBenchmark.check_parameters + def objective_function_test( + self, + configuration: Union[CS.Configuration, Dict], + fidelity: Union[CS.Configuration, Dict, None] = None, + shuffle: bool = False, + rng: 
Union[np.random.RandomState, int, None] = None, + **kwargs + ) -> Dict: + """Function that evaluates a 'config' on a 'fidelity' on the test set + """ + return dict() + + def get_meta_information(self): + """ Returns the meta information for the benchmark """ + pass From af4f593c835764268dd7f07344ac5287eeb9d891 Mon Sep 17 00:00:00 2001 From: neeratyoy Date: Thu, 24 Jun 2021 14:38:56 +0200 Subject: [PATCH 015/147] Writing common ML benchmark class for tabular collection --- .../benchmarks/ml/ml_benchmark_template.py | 347 ++++++++++++++++++ hpobench/benchmarks/ml/rf_benchmark.py | 300 +-------------- hpobench/benchmarks/ml/svm_benchmark_2.py | 315 +--------------- 3 files changed, 376 insertions(+), 586 deletions(-) create mode 100644 hpobench/benchmarks/ml/ml_benchmark_template.py diff --git a/hpobench/benchmarks/ml/ml_benchmark_template.py b/hpobench/benchmarks/ml/ml_benchmark_template.py new file mode 100644 index 00000000..0891f0fe --- /dev/null +++ b/hpobench/benchmarks/ml/ml_benchmark_template.py @@ -0,0 +1,347 @@ +import time +import openml +import numpy as np +import pandas as pd +import ConfigSpace as CS +from typing import Union, Dict + +from sklearn.impute import SimpleImputer +from sklearn.pipeline import make_pipeline +from sklearn.utils import check_random_state +from sklearn.compose import ColumnTransformer +from sklearn.preprocessing import OneHotEncoder +from sklearn.preprocessing import StandardScaler +from sklearn.model_selection import train_test_split +from sklearn.metrics import accuracy_score, make_scorer + +import hpobench.util.rng_helper as rng_helper +from hpobench.abstract_benchmark import AbstractBenchmark + + +class Benchmark(AbstractBenchmark): + _issue_tasks = [3917, 3945] + + def __init__( + self, + task_id: Union[int, None] = None, + seed: Union[int, None] = None, # Union[np.random.RandomState, int, None] = None, + valid_size: float = 0.33, + fidelity_choice: int = 1, + benchmark_type: str = "raw" + ): + self.seed = seed if seed is not None else np.random.randint(1, 10 ** 6) + self.rng = check_random_state(self.seed) + super(Benchmark, self).__init__(rng=seed) + + self.benchmark_type = benchmark_type + self.task_id = task_id + self.valid_size = valid_size + self.accuracy_scorer = make_scorer(accuracy_score) + + # Data variables + self.train_X = None + self.valid_X = None + self.test_X = None + self.train_y = None + self.valid_y = None + self.test_y = None + self.train_idx = None + self.test_idx = None + self.task = None + self.dataset = None + self.preprocessor = None + self.lower_bound_train_size = None + self.load_data_from_openml() + + # Observation and fidelity spaces + self.fidelity_choice = fidelity_choice + self.z_cs = self.get_fidelity_space(self.seed, self.fidelity_choice) + self.x_cs = self.get_configuration_space(self.seed) + + @staticmethod + def get_configuration_space(seed=None): + """Parameter space to be optimized --- contains the hyperparameters + """ + raise NotImplementedError() + + @staticmethod + def get_fidelity_space(seed=None, fidelity_choice=1): + """Fidelity space available --- specifies the fidelity dimensions + + If fidelity_choice is 0 + Fidelity space is the maximal fidelity, akin to a black-box function + If fidelity_choice is 1 + Fidelity space is a single fidelity, in this case the number of trees (n_estimators) + If fidelity_choice is 2 + Fidelity space is a single fidelity, in this case the fraction of dataset (subsample) + If fidelity_choice is >2 + Fidelity space is multi-multi fidelity, all possible fidelities + """ + 
raise NotImplementedError() + + def get_config(self, size=None): + """Samples configuration(s) from the (hyper) parameter space + """ + if size is None: # return only one config + return self.x_cs.sample_configuration() + return [self.x_cs.sample_configuration() for i in range(size)] + + def get_fidelity(self, size=None): + """Samples candidate fidelities from the fidelity space + """ + if size is None: # return only one config + return self.z_cs.sample_configuration() + return [self.z_cs.sample_configuration() for i in range(size)] + + def _convert_labels(self, labels): + """Converts boolean labels (if exists) to strings + """ + label_types = list(map(lambda x: isinstance(x, bool), labels)) + if np.all(label_types): + _labels = list(map(lambda x: str(x), labels)) + if isinstance(labels, pd.Series): + labels = pd.Series(_labels, index=labels.index) + elif isinstance(labels, np.array): + labels = np.array(labels) + return labels + + def load_data_from_openml(self, valid_size=None, verbose=False): + """Fetches data from OpenML and initializes the train-validation-test data splits + + The validation set is fixed till this function is called again or explicitly altered + """ + # fetches task + self.task = openml.tasks.get_task(self.task_id, download_data=False) + # fetches dataset + self.dataset = openml.datasets.get_dataset(self.task.dataset_id, download_data=False) + if verbose: + print(self.task, '\n') + print(self.dataset, '\n') + + # loads full data + X, y, categorical_ind, feature_names = self.dataset.get_data( + target=self.task.target_name, dataset_format="dataframe" + ) + categorical_ind = np.array(categorical_ind) + (cat_idx,) = np.where(categorical_ind) + (cont_idx,) = np.where(~categorical_ind) + + # splitting dataset into train and test (10% test) + # train-test split is fixed for a task and its associated dataset + self.train_idx, self.test_idx = self.task.get_train_test_split_indices() + train_x = X.iloc[self.train_idx] + train_y = y.iloc[self.train_idx] + self.test_X = X.iloc[self.test_idx] + self.test_y = y.iloc[self.test_idx] + + # splitting training into training and validation + # validation set is fixed till this function is called again or explicitly altered + valid_size = self.valid_size if valid_size is None else valid_size + self.train_X, self.valid_X, self.train_y, self.valid_y = train_test_split( + train_x, train_y, test_size=valid_size, + shuffle=True, stratify=train_y, random_state=self.rng + ) + + # preprocessor to handle missing values, categorical columns encodings, + # and scaling numeric columns + self.preprocessor = make_pipeline( + ColumnTransformer([ + ( + "cat", + make_pipeline(SimpleImputer(strategy="most_frequent"), + OneHotEncoder(sparse=False, handle_unknown="ignore")), + cat_idx.tolist(), + ), + ( + "cont", + make_pipeline(SimpleImputer(strategy="median"), + StandardScaler()), + cont_idx.tolist(), + ) + ]) + ) + if verbose: + print("Shape of data pre-preprocessing: {}".format(train_X.shape)) + + # preprocessor fit only on the training set + self.train_X = self.preprocessor.fit_transform(self.train_X) + # applying preprocessor built on the training set, across validation and test splits + self.valid_X = self.preprocessor.transform(self.valid_X) + self.test_X = self.preprocessor.transform(self.test_X) + # converting boolean labels to strings + self.train_y = self._convert_labels(self.train_y) + self.valid_y = self._convert_labels(self.valid_y) + self.test_y = self._convert_labels(self.test_y) + + # Similar to (https://arxiv.org/pdf/1605.07079.pdf) + # 
use 10 times the number of classes as lower bound for the dataset fraction + n_classes = len(self.task.class_labels) # np.unique(self.train_y).shape[0] + self.lower_bound_train_size = (10 * n_classes) / self.train_X.shape[0] + + if verbose: + print("Shape of data post-preprocessing: {}".format(train_X.shape), "\n") + + if verbose: + print("\nTraining data (X, y): ({}, {})".format(self.train_X.shape, self.train_y.shape)) + print("Validation data (X, y): ({}, {})".format(self.valid_X.shape, self.valid_y.shape)) + print("Test data (X, y): ({}, {})".format(self.test_X.shape, self.test_y.shape)) + print("\nData loading complete!\n") + return + + def shuffle_data_idx(self, train_id=None, ng=None): + rng = self.rng if rng is None else rng + train_idx = self.train_idx if train_idx is None else train_idx + rng.shuffle(train_idx) + return train_idx + + def init_model(self, config, fidelity=None, rng=None): + """ Function that returns the model initialized based on the configuration and fidelity + """ + raise NotImplementedError() + + def _raw_objective(self, config, fidelity, shuffle, rng, eval="valid"): + start = time.time() + + # initializing model + model = self.init_model(config, fidelity, rng) + + # preparing data + if eval == "valid": + train_X = self.train_X + train_y = self.train_y + train_idx = self.train_idx + else: + train_X = np.vstack((self.train_X, self.valid_X)) + train_y = pd.concat((self.train_y, self.valid_y)) + train_idx = np.arange(len(train_X)) + + # shuffling data + if shuffle: + train_idx = self.shuffle_data_idx(train_idx, rng) + train_X = train_X.iloc[train_idx] + train_y = train_y.iloc[train_idx] + + # subsample here: + # application of the other fidelity to the dataset that the model interfaces + train_idx = self.rng.choice( + np.arange(len(train_X)), size=int( + fidelity['subsample'] * len(train_X) + ) + ) + # fitting the model with subsampled data + model.fit(train_X[train_idx], train_y.iloc[train_idx]) + # computing statistics on training data + train_loss = 1 - self.accuracy_scorer(model, train_X, train_y) + + model_fit_time = time.time() - start + return model, model_fit_time, train_loss + + def objective( + self, + configuration: Union[CS.Configuration, Dict], + fidelity: Union[CS.Configuration, Dict, None] = None, + shuffle: bool = False, + rng: Union[np.random.RandomState, int, None] = None, + **kwargs + ) -> Dict: + """Function that evaluates a 'config' on a 'fidelity' on the validation set + """ + if self.benchmark_type == "raw": + model, model_fit_time, train_loss = self._raw_objective( + configuration, fidelity, shuffle, rng + ) + else: + #TODO: add cases for `tabular` and `surrogate` benchmarks + pass + + start = time.time() + val_loss = 1 - self.accuracy_scorer(model, self.valid_X, self.valid_y) + eval_time = time.time() - start + + info = { + 'train_loss': train_loss, + 'val_loss': val_loss, + 'cost': model_fit_time + eval_time, + 'training_cost': model_fit_time, + 'evaluation_cost': eval_time, + # storing as dictionary and not ConfigSpace saves tremendous memory + 'fidelity': fidelity.get_dictionary(), + 'config': configuration.get_dictionary() + } + + return { + 'function_value': info['val_loss'], + 'cost': info['cost'], + 'info': info + } + + def objective_test( + self, + configuration: Union[CS.Configuration, Dict], + fidelity: Union[CS.Configuration, Dict, None] = None, + shuffle: bool = False, + rng: Union[np.random.RandomState, int, None] = None, + **kwargs + ) -> Dict: + """Function that evaluates a 'config' on a 'fidelity' on the test set + """ + if 
self.benchmark_type == "raw": + model, model_fit_time, train_loss = self._raw_objective( + configuration, fidelity, shuffle, rng, eval="test" + ) + else: + #TODO: add cases for `tabular` and `surrogate` benchmarks + pass + + start = time.time() + val_loss = 1 - self.accuracy_scorer(model, self.test_X, self.test_y) + eval_time = time.time() - start + + info = { + 'train_loss': train_loss, + 'val_loss': val_loss, + 'cost': model_fit_time + eval_time, + 'training_cost': model_fit_time, + 'evaluation_cost': eval_time, + # storing as dictionary and not ConfigSpace saves tremendous memory + 'fidelity': fidelity.get_dictionary(), + 'config': configuration.get_dictionary() + } + + return { + 'function_value': info['val_loss'], + 'cost': info['cost'], + 'info': info + } + + # pylint: disable=arguments-differ + @AbstractBenchmark.check_parameters + def objective_function( + self, + configuration: Union[CS.Configuration, Dict], + fidelity: Union[CS.Configuration, Dict, None] = None, + shuffle: bool = False, + rng: Union[np.random.RandomState, int, None] = None, + **kwargs + ) -> Dict: + """Function that evaluates a 'config' on a 'fidelity' on the validation set + """ + return dict() + + # pylint: disable=arguments-differ + @AbstractBenchmark.check_parameters + def objective_function_test( + self, + configuration: Union[CS.Configuration, Dict], + fidelity: Union[CS.Configuration, Dict, None] = None, + shuffle: bool = False, + rng: Union[np.random.RandomState, int, None] = None, + **kwargs + ) -> Dict: + """Function that evaluates a 'config' on a 'fidelity' on the test set + """ + return dict() + + def get_meta_information(self): + """ Returns the meta information for the benchmark """ + pass diff --git a/hpobench/benchmarks/ml/rf_benchmark.py b/hpobench/benchmarks/ml/rf_benchmark.py index 35684c00..be08b938 100644 --- a/hpobench/benchmarks/ml/rf_benchmark.py +++ b/hpobench/benchmarks/ml/rf_benchmark.py @@ -17,11 +17,10 @@ import hpobench.util.rng_helper as rng_helper from hpobench.abstract_benchmark import AbstractBenchmark +from hpobench.benchmarks.ml.ml_benchmark_template import Benchmark -class RandomForestBenchmark(AbstractBenchmark): - _issue_tasks = [3917, 3945] - +class RandomForestBenchmark(Benchmark): def __init__( self, task_id: Union[int, None] = None, @@ -30,34 +29,10 @@ def __init__( fidelity_choice: int = 1, benchmark_type: str = "raw" ): - self.seed = seed if seed is not None else np.random.randint(1, 10 ** 6) - self.rng = check_random_state(self.seed) - super(RandomForestBenchmark, self).__init__(rng=seed) - - self.benchmark_type = benchmark_type - self.task_id = task_id - self.valid_size = valid_size - self.accuracy_scorer = make_scorer(accuracy_score) - - # Data variables - self.train_X = None - self.valid_X = None - self.test_X = None - self.train_y = None - self.valid_y = None - self.test_y = None - self.train_idx = None - self.test_idx = None - self.task = None - self.dataset = None - self.preprocessor = None - self.lower_bound_train_size = None - self.load_data_from_openml() - - # Observation and fidelity spaces - self.fidelity_choice = fidelity_choice - self.z_cs = self.get_fidelity_space(self.seed, self.fidelity_choice) - self.x_cs = self.get_configuration_space(self.seed) + super(RandomForestBenchmark, self).__init__( + task_id, seed, valid_size, fidelity_choice, benchmark_type + ) + pass @staticmethod def get_configuration_space(seed=None): @@ -123,269 +98,14 @@ def get_fidelity_space(seed=None, fidelity_choice=1): z_cs.add_hyperparameters([ntrees, subsample]) return z_cs - 
def get_config(self, size=None): - """Samples configuration(s) from the (hyper) parameter space - """ - if size is None: # return only one config - return self.x_cs.sample_configuration() - return [self.x_cs.sample_configuration() for i in range(size)] - - def get_fidelity(self, size=None): - """Samples candidate fidelities from the fidelity space + def init_model(self, config, fidelity=None, rng=None): + """ Function that returns the model initialized based on the configuration and fidelity """ - if size is None: # return only one config - return self.z_cs.sample_configuration() - return [self.z_cs.sample_configuration() for i in range(size)] - - def _convert_labels(self, labels): - """Converts boolean labels (if exists) to strings - """ - label_types = list(map(lambda x: isinstance(x, bool), labels)) - if np.all(label_types): - _labels = list(map(lambda x: str(x), labels)) - if isinstance(labels, pd.Series): - labels = pd.Series(_labels, index=labels.index) - elif isinstance(labels, np.array): - labels = np.array(labels) - return labels - - def load_data_from_openml(self, valid_size=None, verbose=False): - """Fetches data from OpenML and initializes the train-validation-test data splits - - The validation set is fixed till this function is called again or explicitly altered - """ - # fetches task - self.task = openml.tasks.get_task(self.task_id, download_data=False) - # fetches dataset - self.dataset = openml.datasets.get_dataset(self.task.dataset_id, download_data=False) - if verbose: - print(self.task, '\n') - print(self.dataset, '\n') - - # loads full data - X, y, categorical_ind, feature_names = self.dataset.get_data( - target=self.task.target_name, dataset_format="dataframe" - ) - categorical_ind = np.array(categorical_ind) - (cat_idx,) = np.where(categorical_ind) - (cont_idx,) = np.where(~categorical_ind) - - # splitting dataset into train and test (10% test) - # train-test split is fixed for a task and its associated dataset - self.train_idx, self.test_idx = self.task.get_train_test_split_indices() - train_x = X.iloc[self.train_idx] - train_y = y.iloc[self.train_idx] - self.test_X = X.iloc[self.test_idx] - self.test_y = y.iloc[self.test_idx] - - # splitting training into training and validation - # validation set is fixed till this function is called again or explicitly altered - valid_size = self.valid_size if valid_size is None else valid_size - self.train_X, self.valid_X, self.train_y, self.valid_y = train_test_split( - train_x, train_y, test_size=valid_size, - shuffle=True, stratify=train_y, random_state=self.rng - ) - - # preprocessor to handle missing values, categorical columns encodings, - # and scaling numeric columns - self.preprocessor = make_pipeline( - ColumnTransformer([ - ( - "cat", - make_pipeline(SimpleImputer(strategy="most_frequent"), - OneHotEncoder(sparse=False, handle_unknown="ignore")), - cat_idx.tolist(), - ), - ( - "cont", - make_pipeline(SimpleImputer(strategy="median"), - StandardScaler()), - cont_idx.tolist(), - ) - ]) - ) - if verbose: - print("Shape of data pre-preprocessing: {}".format(train_X.shape)) - - # preprocessor fit only on the training set - self.train_X = self.preprocessor.fit_transform(self.train_X) - # applying preprocessor built on the training set, across validation and test splits - self.valid_X = self.preprocessor.transform(self.valid_X) - self.test_X = self.preprocessor.transform(self.test_X) - # converting boolean labels to strings - self.train_y = self._convert_labels(self.train_y) - self.valid_y = 
self._convert_labels(self.valid_y) - self.test_y = self._convert_labels(self.test_y) - - # Similar to (https://arxiv.org/pdf/1605.07079.pdf) - # use 10 times the number of classes as lower bound for the dataset fraction - n_classes = len(self.task.class_labels) # np.unique(self.train_y).shape[0] - self.lower_bound_train_size = (10 * n_classes) / self.train_X.shape[0] - - if verbose: - print("Shape of data post-preprocessing: {}".format(train_X.shape), "\n") - - if verbose: - print("\nTraining data (X, y): ({}, {})".format(self.train_X.shape, self.train_y.shape)) - print("Validation data (X, y): ({}, {})".format(self.valid_X.shape, self.valid_y.shape)) - print("Test data (X, y): ({}, {})".format(self.test_X.shape, self.test_y.shape)) - print("\nData loading complete!\n") - return - - def shuffle_data_idx(self, train_id=None, ng=None): rng = self.rng if rng is None else rng - train_idx = self.train_idx if train_idx is None else train_idx - rng.shuffle(train_idx) - return train_idx - - def _raw_objective(self, config, fidelity, shuffle, rng, eval="valid"): - start = time.time() - - # initializing model model = RandomForestClassifier( **config.get_dictionary(), n_estimators=fidelity['n_estimators'], # a fidelity being used during initialization bootstrap=True, - random_state=self.rng - ) - - # preparing data - if eval == "valid": - train_X = self.train_X - train_y = self.train_y - train_idx = self.train_idx - else: - train_X = np.vstack((self.train_X, self.valid_X)) - train_y = pd.concat((self.train_y, self.valid_y)) - train_idx = np.arange(len(train_X)) - - # shuffling data - if shuffle: - train_idx = self.shuffle_data_idx(train_idx, rng) - train_X = train_X.iloc[train_idx] - train_y = train_y.iloc[train_idx] - - # subsample here - # application of the other fidelity to the dataset that the model interfaces - train_idx = self.rng.choice( - np.arange(len(train_X)), size=int( - fidelity['subsample'] * len(train_X) - ) + random_state=rng ) - # fitting the model with subsampled data - model.fit(train_X[train_idx], train_y.iloc[train_idx]) - # computing statistics on training data - train_loss = 1 - self.accuracy_scorer(model, train_X, train_y) - - model_fit_time = time.time() - start - return model, model_fit_time, train_loss - - def objective( - self, - configuration: Union[CS.Configuration, Dict], - fidelity: Union[CS.Configuration, Dict, None] = None, - shuffle: bool = False, - rng: Union[np.random.RandomState, int, None] = None, - **kwargs - ) -> Dict: - """Function that evaluates a 'config' on a 'fidelity' on the validation set - """ - if self.benchmark_type == "raw": - model, model_fit_time, train_loss = self._raw_objective( - configuration, fidelity, shuffle, rng - ) - else: - #TODO: add cases for `tabular` and `surrogate` benchmarks - pass - - start = time.time() - val_loss = 1 - self.accuracy_scorer(model, self.valid_X, self.valid_y) - eval_time = time.time() - start - - info = { - 'train_loss': train_loss, - 'val_loss': val_loss, - 'cost': model_fit_time + eval_time, - 'training_cost': model_fit_time, - 'evaluation_cost': eval_time, - # storing as dictionary and not ConfigSpace saves tremendous memory - 'fidelity': fidelity.get_dictionary(), - 'config': configuration.get_dictionary() - } - - return { - 'function_value': info['val_loss'], - 'cost': info['cost'], - 'info': info - } - - def objective_test( - self, - configuration: Union[CS.Configuration, Dict], - fidelity: Union[CS.Configuration, Dict, None] = None, - shuffle: bool = False, - rng: Union[np.random.RandomState, int, None] = 
None, - **kwargs - ) -> Dict: - """Function that evaluates a 'config' on a 'fidelity' on the test set - """ - if self.benchmark_type == "raw": - model, model_fit_time, train_loss = self._raw_objective( - configuration, fidelity, shuffle, rng, eval="test" - ) - else: - #TODO: add cases for `tabular` and `surrogate` benchmarks - pass - - start = time.time() - val_loss = 1 - self.accuracy_scorer(model, self.test_X, self.test_y) - eval_time = time.time() - start - - info = { - 'train_loss': train_loss, - 'val_loss': val_loss, - 'cost': model_fit_time + eval_time, - 'training_cost': model_fit_time, - 'evaluation_cost': eval_time, - # storing as dictionary and not ConfigSpace saves tremendous memory - 'fidelity': fidelity.get_dictionary(), - 'config': configuration.get_dictionary() - } - - return { - 'function_value': info['val_loss'], - 'cost': info['cost'], - 'info': info - } - - # pylint: disable=arguments-differ - @AbstractBenchmark.check_parameters - def objective_function( - self, - configuration: Union[CS.Configuration, Dict], - fidelity: Union[CS.Configuration, Dict, None] = None, - shuffle: bool = False, - rng: Union[np.random.RandomState, int, None] = None, - **kwargs - ) -> Dict: - """Function that evaluates a 'config' on a 'fidelity' on the validation set - """ - return dict() - - # pylint: disable=arguments-differ - @AbstractBenchmark.check_parameters - def objective_function_test( - self, - configuration: Union[CS.Configuration, Dict], - fidelity: Union[CS.Configuration, Dict, None] = None, - shuffle: bool = False, - rng: Union[np.random.RandomState, int, None] = None, - **kwargs - ) -> Dict: - """Function that evaluates a 'config' on a 'fidelity' on the test set - """ - return dict() - - def get_meta_information(self): - """ Returns the meta information for the benchmark """ - pass + return model diff --git a/hpobench/benchmarks/ml/svm_benchmark_2.py b/hpobench/benchmarks/ml/svm_benchmark_2.py index 6e8ec6c9..13076040 100644 --- a/hpobench/benchmarks/ml/svm_benchmark_2.py +++ b/hpobench/benchmarks/ml/svm_benchmark_2.py @@ -17,11 +17,10 @@ import hpobench.util.rng_helper as rng_helper from hpobench.abstract_benchmark import AbstractBenchmark +from hpobench.benchmarks.ml.ml_benchmark_template import Benchmark -class SVMBenchmark(AbstractBenchmark): - _issue_tasks = [3917, 3945] - +class SVMBenchmark(Benchmark): def __init__( self, task_id: Union[int, None] = None, @@ -30,50 +29,33 @@ def __init__( fidelity_choice: int = 1, benchmark_type: str = "raw" ): - self.seed = seed if seed is not None else np.random.randint(1, 10 ** 6) - self.rng = check_random_state(self.seed) - super(SVMBenchmark, self).__init__(rng=seed) - - self.benchmark_type = benchmark_type - self.task_id = task_id - self.valid_size = valid_size - self.accuracy_scorer = make_scorer(accuracy_score) - #TODO: check the cache_size parameter from sklearn docs + super(SVMBenchmark, self).__init__( + task_id, seed, valid_size, fidelity_choice, benchmark_type + ) self.cache_size = 200 - # Data variables - self.train_X = None - self.valid_X = None - self.test_X = None - self.train_y = None - self.valid_y = None - self.test_y = None - self.train_idx = None - self.test_idx = None - self.task = None - self.dataset = None - self.preprocessor = None - self.lower_bound_train_size = None - self.load_data_from_openml() - - # Observation and fidelity spaces - self.fidelity_choice = fidelity_choice - self.z_cs = self.get_fidelity_space(self.seed, self.fidelity_choice) - self.x_cs = self.get_configuration_space(self.seed) - @staticmethod 
def get_configuration_space(seed=None): """Parameter space to be optimized --- contains the hyperparameters """ cs = CS.ConfigurationSpace(seed=seed) + # cs.add_hyperparameters([ + # CS.UniformFloatHyperparameter( + # 'C', lower=-10., upper=10., default_value=0., log=False + # ), + # CS.UniformFloatHyperparameter( + # 'gamma', lower=-10., upper=10., default_value=1., log=False + # ), + # ]) + # from https://github.com/automl/auto-sklearn/blob/master/autosklearn/pipeline/components/classification/libsvm_svc.p cs.add_hyperparameters([ CS.UniformFloatHyperparameter( - 'C', lower=-10., upper=10., default_value=0., log=False + "C", 0.03125, 32768, log=True, default_value=1.0 ), CS.UniformFloatHyperparameter( - 'gamma', lower=-10., upper=10., default_value=1., log=False - ), + "gamma", 3.0517578125e-05, 8, log=True, default_value=0.1 + ) ]) return cs @@ -99,273 +81,14 @@ def get_fidelity_space(seed=None, fidelity_choice=None): z_cs.add_hyperparameter(subsample) return z_cs - def get_config(self, size=None): - """Samples configuration(s) from the (hyper) parameter space - """ - if size is None: # return only one config - return self.x_cs.sample_configuration() - return [self.x_cs.sample_configuration() for i in range(size)] - - def get_fidelity(self, size=None): - """Samples candidate fidelities from the fidelity space - """ - if size is None: # return only one config - return self.z_cs.sample_configuration() - return [self.z_cs.sample_configuration() for i in range(size)] - - def _convert_labels(self, labels): - """Converts boolean labels (if exists) to strings - """ - label_types = list(map(lambda x: isinstance(x, bool), labels)) - if np.all(label_types): - _labels = list(map(lambda x: str(x), labels)) - if isinstance(labels, pd.Series): - labels = pd.Series(_labels, index=labels.index) - elif isinstance(labels, np.array): - labels = np.array(labels) - return labels - - def load_data_from_openml(self, valid_size=None, verbose=False): - """Fetches data from OpenML and initializes the train-validation-test data splits - - The validation set is fixed till this function is called again or explicitly altered - """ - # fetches task - self.task = openml.tasks.get_task(self.task_id, download_data=False) - # fetches dataset - self.dataset = openml.datasets.get_dataset(self.task.dataset_id, download_data=False) - if verbose: - print(self.task, '\n') - print(self.dataset, '\n') - - # loads full data - X, y, categorical_ind, feature_names = self.dataset.get_data( - target=self.task.target_name, dataset_format="dataframe" - ) - categorical_ind = np.array(categorical_ind) - (cat_idx,) = np.where(categorical_ind) - (cont_idx,) = np.where(~categorical_ind) - - # splitting dataset into train and test (10% test) - # train-test split is fixed for a task and its associated dataset - self.train_idx, self.test_idx = self.task.get_train_test_split_indices() - train_x = X.iloc[self.train_idx] - train_y = y.iloc[self.train_idx] - self.test_X = X.iloc[self.test_idx] - self.test_y = y.iloc[self.test_idx] - - # splitting training into training and validation - # validation set is fixed till this function is called again or explicitly altered - valid_size = self.valid_size if valid_size is None else valid_size - self.train_X, self.valid_X, self.train_y, self.valid_y = train_test_split( - train_x, train_y, test_size=valid_size, - shuffle=True, stratify=train_y, random_state=self.rng - ) - - # preprocessor to handle missing values, categorical columns encodings, - # and scaling numeric columns - self.preprocessor = 
make_pipeline( - ColumnTransformer([ - ( - "cat", - make_pipeline(SimpleImputer(strategy="most_frequent"), - OneHotEncoder(sparse=False, handle_unknown="ignore")), - cat_idx.tolist(), - ), - ( - "cont", - make_pipeline(SimpleImputer(strategy="median"), - StandardScaler()), - cont_idx.tolist(), - ) - ]) - ) - if verbose: - print("Shape of data pre-preprocessing: {}".format(train_X.shape)) - - # preprocessor fit only on the training set - self.train_X = self.preprocessor.fit_transform(self.train_X) - # applying preprocessor built on the training set, across validation and test splits - self.valid_X = self.preprocessor.transform(self.valid_X) - self.test_X = self.preprocessor.transform(self.test_X) - # converting boolean labels to strings - self.train_y = self._convert_labels(self.train_y) - self.valid_y = self._convert_labels(self.valid_y) - self.test_y = self._convert_labels(self.test_y) - - # Similar to (https://arxiv.org/pdf/1605.07079.pdf) - # use 10 times the number of classes as lower bound for the dataset fraction - n_classes = len(self.task.class_labels) # np.unique(self.train_y).shape[0] - self.lower_bound_train_size = (10 * n_classes) / self.train_X.shape[0] - - if verbose: - print("Shape of data post-preprocessing: {}".format(train_X.shape), "\n") - - if verbose: - print("\nTraining data (X, y): ({}, {})".format(self.train_X.shape, self.train_y.shape)) - print("Validation data (X, y): ({}, {})".format(self.valid_X.shape, self.valid_y.shape)) - print("Test data (X, y): ({}, {})".format(self.test_X.shape, self.test_y.shape)) - print("\nData loading complete!\n") - return - - def shuffle_data_idx(self, train_id=None, ng=None): - rng = self.rng if rng is None else rng - train_idx = self.train_idx if train_idx is None else train_idx - rng.shuffle(train_idx) - return train_idx - - def _raw_objective(self, config, fidelity, shuffle, rng, eval="valid"): - start = time.time() - + def init_model(self, config, fidelity=None, rng=None): # initializing model rng = self.rng if rng is None else rng config = config.get_dictionary() - for k, v in config.items(): - config[k] = np.exp(float(v)) model = SVC( **config, random_state=rng, cache_size=self.cache_size ) - - # preparing data - if eval == "valid": - train_X = self.train_X - train_y = self.train_y - train_idx = self.train_idx - else: - train_X = np.vstack((self.train_X, self.valid_X)) - train_y = pd.concat((self.train_y, self.valid_y)) - train_idx = np.arange(len(train_X)) - - # shuffling data - if shuffle: - train_idx = self.shuffle_data_idx(train_idx, rng) - train_X = train_X.iloc[train_idx] - train_y = train_y.iloc[train_idx] - - # subsample here - # application of the other fidelity to the dataset that the model interfaces - train_idx = self.rng.choice( - np.arange(len(train_X)), size=int( - fidelity['subsample'] * len(train_X) - ) - ) - # fitting the model with subsampled data - model.fit(train_X[train_idx], train_y.iloc[train_idx]) - # computing statistics on training data - train_loss = 1 - self.accuracy_scorer(model, train_X, train_y) - - model_fit_time = time.time() - start - return model, model_fit_time, train_loss - - def objective( - self, - configuration: Union[CS.Configuration, Dict], - fidelity: Union[CS.Configuration, Dict, None] = None, - shuffle: bool = False, - rng: Union[np.random.RandomState, int, None] = None, - **kwargs - ) -> Dict: - """Function that evaluates a 'config' on a 'fidelity' on the validation set - """ - if self.benchmark_type == "raw": - model, model_fit_time, train_loss = self._raw_objective( - 
configuration, fidelity, shuffle, rng - ) - else: - #TODO: add cases for `tabular` and `surrogate` benchmarks - pass - - start = time.time() - val_loss = 1 - self.accuracy_scorer(model, self.valid_X, self.valid_y) - eval_time = time.time() - start - - info = { - 'train_loss': train_loss, - 'val_loss': val_loss, - 'cost': model_fit_time + eval_time, - 'training_cost': model_fit_time, - 'evaluation_cost': eval_time, - # storing as dictionary and not ConfigSpace saves tremendous memory - 'fidelity': fidelity.get_dictionary(), - 'config': configuration.get_dictionary() - } - - return { - 'function_value': info['val_loss'], - 'cost': info['cost'], - 'info': info - } - - def objective_test( - self, - configuration: Union[CS.Configuration, Dict], - fidelity: Union[CS.Configuration, Dict, None] = None, - shuffle: bool = False, - rng: Union[np.random.RandomState, int, None] = None, - **kwargs - ) -> Dict: - """Function that evaluates a 'config' on a 'fidelity' on the test set - """ - if self.benchmark_type == "raw": - model, model_fit_time, train_loss = self._raw_objective( - configuration, fidelity, shuffle, rng, eval="test" - ) - else: - #TODO: add cases for `tabular` and `surrogate` benchmarks - pass - - start = time.time() - val_loss = 1 - self.accuracy_scorer(model, self.test_X, self.test_y) - eval_time = time.time() - start - - info = { - 'train_loss': train_loss, - 'val_loss': val_loss, - 'cost': model_fit_time + eval_time, - 'training_cost': model_fit_time, - 'evaluation_cost': eval_time, - # storing as dictionary and not ConfigSpace saves tremendous memory - 'fidelity': fidelity.get_dictionary(), - 'config': configuration.get_dictionary() - } - - return { - 'function_value': info['val_loss'], - 'cost': info['cost'], - 'info': info - } - - # pylint: disable=arguments-differ - @AbstractBenchmark.check_parameters - def objective_function( - self, - configuration: Union[CS.Configuration, Dict], - fidelity: Union[CS.Configuration, Dict, None] = None, - shuffle: bool = False, - rng: Union[np.random.RandomState, int, None] = None, - **kwargs - ) -> Dict: - """Function that evaluates a 'config' on a 'fidelity' on the validation set - """ - return dict() - - # pylint: disable=arguments-differ - @AbstractBenchmark.check_parameters - def objective_function_test( - self, - configuration: Union[CS.Configuration, Dict], - fidelity: Union[CS.Configuration, Dict, None] = None, - shuffle: bool = False, - rng: Union[np.random.RandomState, int, None] = None, - **kwargs - ) -> Dict: - """Function that evaluates a 'config' on a 'fidelity' on the test set - """ - return dict() - - def get_meta_information(self): - """ Returns the meta information for the benchmark """ - pass + return model From df2462dd53a323cd88afca0e9a63862900b69f97 Mon Sep 17 00:00:00 2001 From: neeratyoy Date: Thu, 24 Jun 2021 18:19:33 +0200 Subject: [PATCH 016/147] Adding placeholder for HistGradientBoostedClassifier --- hpobench/benchmarks/ml/histgb_benchmark.py | 125 +++++++++++++++++++++ hpobench/benchmarks/ml/rf_benchmark.py | 5 +- 2 files changed, 127 insertions(+), 3 deletions(-) create mode 100644 hpobench/benchmarks/ml/histgb_benchmark.py diff --git a/hpobench/benchmarks/ml/histgb_benchmark.py b/hpobench/benchmarks/ml/histgb_benchmark.py new file mode 100644 index 00000000..11e7af4a --- /dev/null +++ b/hpobench/benchmarks/ml/histgb_benchmark.py @@ -0,0 +1,125 @@ +import time +import openml +import numpy as np +import pandas as pd +import ConfigSpace as CS +from copy import deepcopy +from typing import Union, Dict + +from 
sklearn.impute import SimpleImputer +from sklearn.pipeline import make_pipeline +from sklearn.utils import check_random_state +from sklearn.compose import ColumnTransformer +from sklearn.preprocessing import OneHotEncoder +from sklearn.preprocessing import StandardScaler +from sklearn.model_selection import train_test_split +from sklearn.metrics import accuracy_score, make_scorer + +# https://scikit-learn.org/stable/modules/generated/sklearn.ensemble.HistGradientBoostingClassifier.html +from sklearn.experimental import enable_hist_gradient_boosting # noqa +from sklearn.ensemble import HistGradientBoostingClassifier + +import hpobench.util.rng_helper as rng_helper +from hpobench.benchmarks.ml.ml_benchmark_template import Benchmark + + +class HistGBBenchmark(Benchmark): + def __init__( + self, + task_id: Union[int, None] = None, + seed: Union[int, None] = None, # Union[np.random.RandomState, int, None] = None, + valid_size: float = 0.33, + fidelity_choice: int = 1, + benchmark_type: str = "raw" + ): + super(HistGBBenchmark, self).__init__( + task_id, seed, valid_size, fidelity_choice, benchmark_type + ) + pass + + @staticmethod + def get_configuration_space(seed=None): + """Parameter space to be optimized --- contains the hyperparameters + """ + cs = CS.ConfigurationSpace(seed=seed) + + cs.add_hyperparameters([ + CS.UniformIntegerHyperparameter( + 'max_depth', lower=1, upper=15, default_value=2, log=False + ), + CS.UniformIntegerHyperparameter( + 'min_samples_leaf', lower=1, upper=64, default_value=1, log=True + ), + CS.UniformFloatHyperparameter( + 'learning_rate', lower=1e-5, upper=1e-1, default_value=0.1, log=False + ), + #TODO: find best way to encode l2 reg. since log params cannot have 0 as exact bound + # scales the regularization parameter by using it as a power of 10 + # such that the range of the parameter becomes {0, 1e-7, 1e-6, ..., 1e-1} + # where 10 ** 0 is enforced to be 0 (no regularization) + CS.UniformIntegerHyperparameter( + 'l2_regularization', lower=-7, upper=0, default_value=0, log=False + ) # value of 1 indicates 0 regularization + ]) + return cs + + @staticmethod + def get_fidelity_space(seed=None, fidelity_choice=1): + """Fidelity space available --- specifies the fidelity dimensions + + If fidelity_choice is 0 + Fidelity space is the maximal fidelity, akin to a black-box function + If fidelity_choice is 1 + Fidelity space is a single fidelity, in this case the number of trees (n_estimators) + If fidelity_choice is 2 + Fidelity space is a single fidelity, in this case the fraction of dataset (subsample) + If fidelity_choice is >2 + Fidelity space is multi-multi fidelity, all possible fidelities + """ + z_cs = CS.ConfigurationSpace(seed=seed) + subsample_lower_bound = np.max((0.1, (0.1 or self.lower_bound_train_size))) + if fidelity_choice == 0: + # only subsample as fidelity + ntrees = CS.Constant('n_estimators', value=100) + subsample = CS.Constant('subsample', value=1) + elif fidelity_choice == 1: + # only n_estimators as fidelity + ntrees = CS.UniformIntegerHyperparameter( + 'n_estimators', lower=2, upper=100, default_value=10, log=False + ) + subsample = CS.Constant('subsample', value=1) + elif fidelity_choice == 2: + # only subsample as fidelity + ntrees = CS.Constant('n_estimators', value=100) + subsample = CS.UniformFloatHyperparameter( + 'subsample', lower=subsample_lower_bound, upper=1, default_value=1, log=False + ) + else: + # both n_estimators and subsample as fidelities + ntrees = CS.UniformIntegerHyperparameter( + 'n_estimators', lower=2, upper=100, 
default_value=10, log=False + ) + subsample = CS.UniformFloatHyperparameter( + 'subsample', lower=subsample_lower_bound, upper=1, default_value=1, log=False + ) + z_cs.add_hyperparameters([ntrees, subsample]) + return z_cs + + def init_model(self, config, fidelity=None, rng=None): + """ Function that returns the model initialized based on the configuration and fidelity + """ + rng = self.rng if rng is None else rng + config = deepcopy(config).get_dictionary() + l2 = config.pop("l2_regularization") + l2 = 0 if l2 == 1 else 10 ** l2 + # TODO: decide on encoding of learning rate + #TODO: allow non-encoded categoricals? + #TODO: early stopping set to False? + model = HistGradientBoostingClassifier( + **config, + l2_regularization=l2, + max_iter=fidelity['n_estimators'], # a fidelity being used during initialization + early_stopping=False, + random_state=rng + ) + return model diff --git a/hpobench/benchmarks/ml/rf_benchmark.py b/hpobench/benchmarks/ml/rf_benchmark.py index be08b938..960b8271 100644 --- a/hpobench/benchmarks/ml/rf_benchmark.py +++ b/hpobench/benchmarks/ml/rf_benchmark.py @@ -16,7 +16,6 @@ from sklearn.metrics import accuracy_score, make_scorer import hpobench.util.rng_helper as rng_helper -from hpobench.abstract_benchmark import AbstractBenchmark from hpobench.benchmarks.ml.ml_benchmark_template import Benchmark @@ -85,7 +84,7 @@ def get_fidelity_space(seed=None, fidelity_choice=1): # only subsample as fidelity ntrees = CS.Constant('n_estimators', value=100) subsample = CS.UniformFloatHyperparameter( - 'subsample', lower=subsample_lower_bound, upper=1, default_value=0.33, log=False + 'subsample', lower=subsample_lower_bound, upper=1, default_value=1, log=False ) else: # both n_estimators and subsample as fidelities @@ -93,7 +92,7 @@ def get_fidelity_space(seed=None, fidelity_choice=1): 'n_estimators', lower=2, upper=100, default_value=10, log=False ) subsample = CS.UniformFloatHyperparameter( - 'subsample', lower=subsample_lower_bound, upper=1, default_value=0.33, log=False + 'subsample', lower=subsample_lower_bound, upper=1, default_value=1, log=False ) z_cs.add_hyperparameters([ntrees, subsample]) return z_cs From 4d1d2d6a8e0a9de88dcf21af6bf07196eeadb69a Mon Sep 17 00:00:00 2001 From: neeratyoy Date: Thu, 24 Jun 2021 18:21:34 +0200 Subject: [PATCH 017/147] Minor code cleaning --- hpobench/benchmarks/ml/svm_benchmark_2.py | 1 - 1 file changed, 1 deletion(-) diff --git a/hpobench/benchmarks/ml/svm_benchmark_2.py b/hpobench/benchmarks/ml/svm_benchmark_2.py index 13076040..ec174748 100644 --- a/hpobench/benchmarks/ml/svm_benchmark_2.py +++ b/hpobench/benchmarks/ml/svm_benchmark_2.py @@ -16,7 +16,6 @@ from sklearn.metrics import accuracy_score, make_scorer import hpobench.util.rng_helper as rng_helper -from hpobench.abstract_benchmark import AbstractBenchmark from hpobench.benchmarks.ml.ml_benchmark_template import Benchmark From 299e59247715518734379d8085d6b03df441baad Mon Sep 17 00:00:00 2001 From: neeratyoy Date: Sat, 26 Jun 2021 17:44:58 +0200 Subject: [PATCH 018/147] Reformatting output dict + option to add more metrics --- hpobench/benchmarks/ml/histgb_benchmark.py | 3 +- .../benchmarks/ml/ml_benchmark_template.py | 47 ++++++++++++++++--- hpobench/benchmarks/ml/svm_benchmark_2.py | 1 - 3 files changed, 42 insertions(+), 9 deletions(-) diff --git a/hpobench/benchmarks/ml/histgb_benchmark.py b/hpobench/benchmarks/ml/histgb_benchmark.py index 11e7af4a..769838ae 100644 --- a/hpobench/benchmarks/ml/histgb_benchmark.py +++ b/hpobench/benchmarks/ml/histgb_benchmark.py @@ -50,8 
+50,9 @@ def get_configuration_space(seed=None): CS.UniformIntegerHyperparameter( 'min_samples_leaf', lower=1, upper=64, default_value=1, log=True ), + #TODO: fix lr value range error in map_to_config() CS.UniformFloatHyperparameter( - 'learning_rate', lower=1e-5, upper=1e-1, default_value=0.1, log=False + 'learning_rate', lower=1e-5, upper=1e-1, default_value=0.1, log=True ), #TODO: find best way to encode l2 reg. since log params cannot have 0 as exact bound # scales the regularization parameter by using it as a power of 10 diff --git a/hpobench/benchmarks/ml/ml_benchmark_template.py b/hpobench/benchmarks/ml/ml_benchmark_template.py index 0891f0fe..2b95c097 100644 --- a/hpobench/benchmarks/ml/ml_benchmark_template.py +++ b/hpobench/benchmarks/ml/ml_benchmark_template.py @@ -12,12 +12,31 @@ from sklearn.preprocessing import OneHotEncoder from sklearn.preprocessing import StandardScaler from sklearn.model_selection import train_test_split -from sklearn.metrics import accuracy_score, make_scorer +from sklearn.metrics import make_scorer, roc_auc_score, accuracy_score, f1_score, \ + top_k_accuracy_score, balanced_accuracy_score import hpobench.util.rng_helper as rng_helper from hpobench.abstract_benchmark import AbstractBenchmark +metrics = dict( + #TODO: decide on metrics generalized for different datasets + acc=accuracy_score, + bal_acc=balanced_accuracy_score, + f1=f1_score, + # roc=roc_auc_score, + # topk=top_k_accuracy_score +) +metrics_kwargs = dict( + #TODO: decide on metric parameters + acc=dict(), + bal_acc=dict(), + f1=dict(average="weighted"), + # roc=dict(average="weighted"), + # topk=dict() +) + + class Benchmark(AbstractBenchmark): _issue_tasks = [3917, 3945] @@ -36,7 +55,10 @@ def __init__( self.benchmark_type = benchmark_type self.task_id = task_id self.valid_size = valid_size - self.accuracy_scorer = make_scorer(accuracy_score) + self.scorers = dict() + for k, v in metrics.items(): + self.scorers[k] = make_scorer(v, **metrics_kwargs[k]) + # self.scorers = make_scorer(accuracy_score) # Data variables self.train_X = None @@ -231,7 +253,10 @@ def _raw_objective(self, config, fidelity, shuffle, rng, eval="valid"): # fitting the model with subsampled data model.fit(train_X[train_idx], train_y.iloc[train_idx]) # computing statistics on training data - train_loss = 1 - self.accuracy_scorer(model, train_X, train_y) + scores = dict() + for k, v in self.scorers.items(): + scores[k] = v(model, train_X, train_y) + train_loss = 1 - scores["acc"] # self.accuracy_scorer(model, train_X, train_y) model_fit_time = time.time() - start return model, model_fit_time, train_loss @@ -255,7 +280,10 @@ def objective( pass start = time.time() - val_loss = 1 - self.accuracy_scorer(model, self.valid_X, self.valid_y) + scores = dict() + for k, v in self.scorers.items(): + scores[k] = v(model, self.valid_X, self.valid_y) + val_loss = 1 - scores["acc"] # self.accuracy_scorer(model, self.valid_X, self.valid_y) eval_time = time.time() - start info = { @@ -264,6 +292,7 @@ def objective( 'cost': model_fit_time + eval_time, 'training_cost': model_fit_time, 'evaluation_cost': eval_time, + 'scores': scores, # storing as dictionary and not ConfigSpace saves tremendous memory 'fidelity': fidelity.get_dictionary(), 'config': configuration.get_dictionary() @@ -294,22 +323,26 @@ def objective_test( pass start = time.time() - val_loss = 1 - self.accuracy_scorer(model, self.test_X, self.test_y) + scores = dict() + for k, v in self.scorers.items(): + scores[k] = v(model, self.test_X, self.test_y) + test_loss = 1 - 
scores["acc"] # self.accuracy_scorer(model, self.test_X, self.test_y) eval_time = time.time() - start info = { 'train_loss': train_loss, - 'val_loss': val_loss, + 'test_loss': test_loss, 'cost': model_fit_time + eval_time, 'training_cost': model_fit_time, 'evaluation_cost': eval_time, + 'scores': scores, # storing as dictionary and not ConfigSpace saves tremendous memory 'fidelity': fidelity.get_dictionary(), 'config': configuration.get_dictionary() } return { - 'function_value': info['val_loss'], + 'function_value': info['test_loss'], 'cost': info['cost'], 'info': info } diff --git a/hpobench/benchmarks/ml/svm_benchmark_2.py b/hpobench/benchmarks/ml/svm_benchmark_2.py index ec174748..62da5bbc 100644 --- a/hpobench/benchmarks/ml/svm_benchmark_2.py +++ b/hpobench/benchmarks/ml/svm_benchmark_2.py @@ -88,6 +88,5 @@ def init_model(self, config, fidelity=None, rng=None): **config, random_state=rng, cache_size=self.cache_size - ) return model From c46321d4d08cdacdd3b5b9a3831154e3a0a6eaab Mon Sep 17 00:00:00 2001 From: neeratyoy Date: Mon, 28 Jun 2021 15:46:40 +0200 Subject: [PATCH 019/147] Removing redundant import --- hpobench/benchmarks/ml/histgb_benchmark.py | 1 - hpobench/benchmarks/ml/ml_benchmark_template.py | 1 - hpobench/benchmarks/ml/rf_benchmark.py | 1 - hpobench/benchmarks/ml/svm_benchmark_2.py | 1 - 4 files changed, 4 deletions(-) diff --git a/hpobench/benchmarks/ml/histgb_benchmark.py b/hpobench/benchmarks/ml/histgb_benchmark.py index 769838ae..0a0461a3 100644 --- a/hpobench/benchmarks/ml/histgb_benchmark.py +++ b/hpobench/benchmarks/ml/histgb_benchmark.py @@ -19,7 +19,6 @@ from sklearn.experimental import enable_hist_gradient_boosting # noqa from sklearn.ensemble import HistGradientBoostingClassifier -import hpobench.util.rng_helper as rng_helper from hpobench.benchmarks.ml.ml_benchmark_template import Benchmark diff --git a/hpobench/benchmarks/ml/ml_benchmark_template.py b/hpobench/benchmarks/ml/ml_benchmark_template.py index 2b95c097..55772ffc 100644 --- a/hpobench/benchmarks/ml/ml_benchmark_template.py +++ b/hpobench/benchmarks/ml/ml_benchmark_template.py @@ -15,7 +15,6 @@ from sklearn.metrics import make_scorer, roc_auc_score, accuracy_score, f1_score, \ top_k_accuracy_score, balanced_accuracy_score -import hpobench.util.rng_helper as rng_helper from hpobench.abstract_benchmark import AbstractBenchmark diff --git a/hpobench/benchmarks/ml/rf_benchmark.py b/hpobench/benchmarks/ml/rf_benchmark.py index 960b8271..96e3f48c 100644 --- a/hpobench/benchmarks/ml/rf_benchmark.py +++ b/hpobench/benchmarks/ml/rf_benchmark.py @@ -15,7 +15,6 @@ from sklearn.model_selection import train_test_split from sklearn.metrics import accuracy_score, make_scorer -import hpobench.util.rng_helper as rng_helper from hpobench.benchmarks.ml.ml_benchmark_template import Benchmark diff --git a/hpobench/benchmarks/ml/svm_benchmark_2.py b/hpobench/benchmarks/ml/svm_benchmark_2.py index 62da5bbc..2747f380 100644 --- a/hpobench/benchmarks/ml/svm_benchmark_2.py +++ b/hpobench/benchmarks/ml/svm_benchmark_2.py @@ -15,7 +15,6 @@ from sklearn.pipeline import make_pipeline, Pipeline from sklearn.metrics import accuracy_score, make_scorer -import hpobench.util.rng_helper as rng_helper from hpobench.benchmarks.ml.ml_benchmark_template import Benchmark From 17f663477beb30d5fbe76bd72d7e2ecdc169525a Mon Sep 17 00:00:00 2001 From: neeratyoy Date: Wed, 30 Jun 2021 18:23:55 +0200 Subject: [PATCH 020/147] Decoupling storage of costs for each metric --- .../benchmarks/ml/ml_benchmark_template.py | 53 ++++++++++--------- 1 file 
changed, 29 insertions(+), 24 deletions(-) diff --git a/hpobench/benchmarks/ml/ml_benchmark_template.py b/hpobench/benchmarks/ml/ml_benchmark_template.py index 55772ffc..7692e447 100644 --- a/hpobench/benchmarks/ml/ml_benchmark_template.py +++ b/hpobench/benchmarks/ml/ml_benchmark_template.py @@ -221,8 +221,6 @@ def init_model(self, config, fidelity=None, rng=None): raise NotImplementedError() def _raw_objective(self, config, fidelity, shuffle, rng, eval="valid"): - start = time.time() - # initializing model model = self.init_model(config, fidelity, rng) @@ -250,15 +248,18 @@ def _raw_objective(self, config, fidelity, shuffle, rng, eval="valid"): ) ) # fitting the model with subsampled data + start = time.time() model.fit(train_X[train_idx], train_y.iloc[train_idx]) + model_fit_time = time.time() - start # computing statistics on training data scores = dict() + score_cost = dict() for k, v in self.scorers.items(): + _start = time.time() scores[k] = v(model, train_X, train_y) + score_cost[k] = time.time() - _start train_loss = 1 - scores["acc"] # self.accuracy_scorer(model, train_X, train_y) - - model_fit_time = time.time() - start - return model, model_fit_time, train_loss + return model, model_fit_time, train_loss, scores, score_cost def objective( self, @@ -271,27 +272,29 @@ def objective( """Function that evaluates a 'config' on a 'fidelity' on the validation set """ if self.benchmark_type == "raw": - model, model_fit_time, train_loss = self._raw_objective( + model, model_fit_time, train_loss, train_scores, train_score_cost = self._raw_objective( configuration, fidelity, shuffle, rng ) else: #TODO: add cases for `tabular` and `surrogate` benchmarks - pass + pass + info['train_costs']['acc'] - start = time.time() scores = dict() + score_cost = dict() for k, v in self.scorers.items(): + _start = time.time() scores[k] = v(model, self.valid_X, self.valid_y) + score_cost[k] = time.time() - _start val_loss = 1 - scores["acc"] # self.accuracy_scorer(model, self.valid_X, self.valid_y) - eval_time = time.time() - start info = { 'train_loss': train_loss, 'val_loss': val_loss, - 'cost': model_fit_time + eval_time, - 'training_cost': model_fit_time, - 'evaluation_cost': eval_time, - 'scores': scores, + 'model_cost': model_fit_time, + 'train_scores': train_scores, + 'train_costs': train_score_cost, + 'eval_scores': scores, + 'eval_costs': score_cost, # storing as dictionary and not ConfigSpace saves tremendous memory 'fidelity': fidelity.get_dictionary(), 'config': configuration.get_dictionary() @@ -299,7 +302,7 @@ def objective( return { 'function_value': info['val_loss'], - 'cost': info['cost'], + 'cost': model_fit_time + info['train_costs']['acc'] + info['eval_costs']['acc'], 'info': info } @@ -314,27 +317,29 @@ def objective_test( """Function that evaluates a 'config' on a 'fidelity' on the test set """ if self.benchmark_type == "raw": - model, model_fit_time, train_loss = self._raw_objective( + model, model_fit_time, train_loss, train_scores, train_score_cost = self._raw_objective( configuration, fidelity, shuffle, rng, eval="test" ) else: #TODO: add cases for `tabular` and `surrogate` benchmarks pass - start = time.time() scores = dict() + score_cost = dict() for k, v in self.scorers.items(): + _start = time.time() scores[k] = v(model, self.test_X, self.test_y) - test_loss = 1 - scores["acc"] # self.accuracy_scorer(model, self.test_X, self.test_y) - eval_time = time.time() - start + score_cost[k] = time.time() - _start + test_loss = 1 - scores["acc"] # self.accuracy_scorer(model, self.valid_X, 
self.valid_y) info = { 'train_loss': train_loss, - 'test_loss': test_loss, - 'cost': model_fit_time + eval_time, - 'training_cost': model_fit_time, - 'evaluation_cost': eval_time, - 'scores': scores, + 'val_loss': test_loss, + 'model_cost': model_fit_time, + 'train_scores': train_scores, + 'train_costs': train_score_cost, + 'eval_scores': scores, + 'eval_costs': score_cost, # storing as dictionary and not ConfigSpace saves tremendous memory 'fidelity': fidelity.get_dictionary(), 'config': configuration.get_dictionary() @@ -342,7 +347,7 @@ def objective_test( return { 'function_value': info['test_loss'], - 'cost': info['cost'], + 'cost': model_fit_time + info['train_costs']['acc'] + info['eval_costs']['acc'], 'info': info } From 7de891f5c82b6b12973bca7df148766e50286783 Mon Sep 17 00:00:00 2001 From: neeratyoy Date: Thu, 1 Jul 2021 16:28:27 +0200 Subject: [PATCH 021/147] Including test scores in objective --- hpobench/benchmarks/ml/histgb_benchmark.py | 7 +- .../benchmarks/ml/ml_benchmark_template.py | 68 +++++++++---------- hpobench/benchmarks/ml/rf_benchmark.py | 7 +- hpobench/benchmarks/ml/svm_benchmark_2.py | 16 +---- 4 files changed, 39 insertions(+), 59 deletions(-) diff --git a/hpobench/benchmarks/ml/histgb_benchmark.py b/hpobench/benchmarks/ml/histgb_benchmark.py index 0a0461a3..ac273c57 100644 --- a/hpobench/benchmarks/ml/histgb_benchmark.py +++ b/hpobench/benchmarks/ml/histgb_benchmark.py @@ -28,12 +28,9 @@ def __init__( task_id: Union[int, None] = None, seed: Union[int, None] = None, # Union[np.random.RandomState, int, None] = None, valid_size: float = 0.33, - fidelity_choice: int = 1, - benchmark_type: str = "raw" + fidelity_choice: int = 1 ): - super(HistGBBenchmark, self).__init__( - task_id, seed, valid_size, fidelity_choice, benchmark_type - ) + super(HistGBBenchmark, self).__init__(task_id, seed, valid_size, fidelity_choice) pass @staticmethod diff --git a/hpobench/benchmarks/ml/ml_benchmark_template.py b/hpobench/benchmarks/ml/ml_benchmark_template.py index 7692e447..cc543b50 100644 --- a/hpobench/benchmarks/ml/ml_benchmark_template.py +++ b/hpobench/benchmarks/ml/ml_benchmark_template.py @@ -44,14 +44,12 @@ def __init__( task_id: Union[int, None] = None, seed: Union[int, None] = None, # Union[np.random.RandomState, int, None] = None, valid_size: float = 0.33, - fidelity_choice: int = 1, - benchmark_type: str = "raw" + fidelity_choice: int = 1 ): self.seed = seed if seed is not None else np.random.randint(1, 10 ** 6) self.rng = check_random_state(self.seed) super(Benchmark, self).__init__(rng=seed) - self.benchmark_type = benchmark_type self.task_id = task_id self.valid_size = valid_size self.scorers = dict() @@ -258,7 +256,7 @@ def _raw_objective(self, config, fidelity, shuffle, rng, eval="valid"): _start = time.time() scores[k] = v(model, train_X, train_y) score_cost[k] = time.time() - _start - train_loss = 1 - scores["acc"] # self.accuracy_scorer(model, train_X, train_y) + train_loss = 1 - scores["acc"] return model, model_fit_time, train_loss, scores, score_cost def objective( @@ -271,21 +269,24 @@ def objective( ) -> Dict: """Function that evaluates a 'config' on a 'fidelity' on the validation set """ - if self.benchmark_type == "raw": - model, model_fit_time, train_loss, train_scores, train_score_cost = self._raw_objective( - configuration, fidelity, shuffle, rng - ) - else: - #TODO: add cases for `tabular` and `surrogate` benchmarks - pass + info['train_costs']['acc'] + model, model_fit_time, train_loss, train_scores, train_score_cost = self._raw_objective( + 
configuration, fidelity, shuffle, rng + ) + val_scores = dict() + val_score_cost = dict() + for k, v in self.scorers.items(): + _start = time.time() + val_scores[k] = v(model, self.valid_X, self.valid_y) + val_score_cost[k] = time.time() - _start + val_loss = 1 - val_scores["acc"] - scores = dict() - score_cost = dict() + test_scores = dict() + test_score_cost = dict() for k, v in self.scorers.items(): _start = time.time() - scores[k] = v(model, self.valid_X, self.valid_y) - score_cost[k] = time.time() - _start - val_loss = 1 - scores["acc"] # self.accuracy_scorer(model, self.valid_X, self.valid_y) + test_scores[k] = v(model, self.test_X, self.test_y) + test_score_cost[k] = time.time() - _start + val_loss = 1 - test_scores["acc"] info = { 'train_loss': train_loss, @@ -293,8 +294,10 @@ def objective( 'model_cost': model_fit_time, 'train_scores': train_scores, 'train_costs': train_score_cost, - 'eval_scores': scores, - 'eval_costs': score_cost, + 'val_scores': val_scores, + 'val_costs': val_score_cost, + 'test_scores': test_scores, + 'test_costs': test_score_cost, # storing as dictionary and not ConfigSpace saves tremendous memory 'fidelity': fidelity.get_dictionary(), 'config': configuration.get_dictionary() @@ -302,7 +305,7 @@ def objective( return { 'function_value': info['val_loss'], - 'cost': model_fit_time + info['train_costs']['acc'] + info['eval_costs']['acc'], + 'cost': model_fit_time + info['train_costs']['acc'] + info['val_costs']['acc'], 'info': info } @@ -316,21 +319,16 @@ def objective_test( ) -> Dict: """Function that evaluates a 'config' on a 'fidelity' on the test set """ - if self.benchmark_type == "raw": - model, model_fit_time, train_loss, train_scores, train_score_cost = self._raw_objective( - configuration, fidelity, shuffle, rng, eval="test" - ) - else: - #TODO: add cases for `tabular` and `surrogate` benchmarks - pass - - scores = dict() - score_cost = dict() + model, model_fit_time, train_loss, train_scores, train_score_cost = self._raw_objective( + configuration, fidelity, shuffle, rng, eval="test" + ) + test_scores = dict() + test_score_cost = dict() for k, v in self.scorers.items(): _start = time.time() - scores[k] = v(model, self.test_X, self.test_y) - score_cost[k] = time.time() - _start - test_loss = 1 - scores["acc"] # self.accuracy_scorer(model, self.valid_X, self.valid_y) + test_scores[k] = v(model, self.test_X, self.test_y) + test_score_cost[k] = time.time() - _start + test_loss = 1 - test_scores["acc"] info = { 'train_loss': train_loss, @@ -338,8 +336,8 @@ def objective_test( 'model_cost': model_fit_time, 'train_scores': train_scores, 'train_costs': train_score_cost, - 'eval_scores': scores, - 'eval_costs': score_cost, + 'test_scores': test_scores, + 'test_costs': test_score_cost, # storing as dictionary and not ConfigSpace saves tremendous memory 'fidelity': fidelity.get_dictionary(), 'config': configuration.get_dictionary() @@ -347,7 +345,7 @@ def objective_test( return { 'function_value': info['test_loss'], - 'cost': model_fit_time + info['train_costs']['acc'] + info['eval_costs']['acc'], + 'cost': model_fit_time + info['train_costs']['acc'] + info['test_costs']['acc'], 'info': info } diff --git a/hpobench/benchmarks/ml/rf_benchmark.py b/hpobench/benchmarks/ml/rf_benchmark.py index 96e3f48c..7426a37a 100644 --- a/hpobench/benchmarks/ml/rf_benchmark.py +++ b/hpobench/benchmarks/ml/rf_benchmark.py @@ -24,12 +24,9 @@ def __init__( task_id: Union[int, None] = None, seed: Union[int, None] = None, # Union[np.random.RandomState, int, None] = None, valid_size: 
float = 0.33, - fidelity_choice: int = 1, - benchmark_type: str = "raw" + fidelity_choice: int = 1 ): - super(RandomForestBenchmark, self).__init__( - task_id, seed, valid_size, fidelity_choice, benchmark_type - ) + super(RandomForestBenchmark, self).__init__(task_id, seed, valid_size, fidelity_choice) pass @staticmethod diff --git a/hpobench/benchmarks/ml/svm_benchmark_2.py b/hpobench/benchmarks/ml/svm_benchmark_2.py index 2747f380..12d22afa 100644 --- a/hpobench/benchmarks/ml/svm_benchmark_2.py +++ b/hpobench/benchmarks/ml/svm_benchmark_2.py @@ -24,12 +24,9 @@ def __init__( task_id: Union[int, None] = None, seed: Union[int, None] = None, # Union[np.random.RandomState, int, None] = None, valid_size: float = 0.33, - fidelity_choice: int = 1, - benchmark_type: str = "raw" + fidelity_choice: int = 1 ): - super(SVMBenchmark, self).__init__( - task_id, seed, valid_size, fidelity_choice, benchmark_type - ) + super(SVMBenchmark, self).__init__(task_id, seed, valid_size, fidelity_choice) self.cache_size = 200 @staticmethod @@ -37,15 +34,6 @@ def get_configuration_space(seed=None): """Parameter space to be optimized --- contains the hyperparameters """ cs = CS.ConfigurationSpace(seed=seed) - - # cs.add_hyperparameters([ - # CS.UniformFloatHyperparameter( - # 'C', lower=-10., upper=10., default_value=0., log=False - # ), - # CS.UniformFloatHyperparameter( - # 'gamma', lower=-10., upper=10., default_value=1., log=False - # ), - # ]) # from https://github.com/automl/auto-sklearn/blob/master/autosklearn/pipeline/components/classification/libsvm_svc.p cs.add_hyperparameters([ CS.UniformFloatHyperparameter( From ec316c3a8ffc89c3224192ea66ed938c74d2ec53 Mon Sep 17 00:00:00 2001 From: neeratyoy Date: Thu, 1 Jul 2021 16:45:43 +0200 Subject: [PATCH 022/147] Documenting the structure of information in each fn eval. --- hpobench/benchmarks/ml/README.md | 29 +++++++++++++++++++++++++++++ 1 file changed, 29 insertions(+) create mode 100644 hpobench/benchmarks/ml/README.md diff --git a/hpobench/benchmarks/ml/README.md b/hpobench/benchmarks/ml/README.md new file mode 100644 index 00000000..46ad4e08 --- /dev/null +++ b/hpobench/benchmarks/ml/README.md @@ -0,0 +1,29 @@ +Each function evalution returns a dictionary with the following information: + +``` +└───function_value: 1 - accuracy (acc.) on validation set +└───cost: time to fit model + time to evaluate acc. training set + time to evaluate acc. validation set +└───info: dictionary (dict) with miscellaneous information +| └───train_loss: 1 - accuracy (acc.) on training set +| └───val_loss: 1 - accuracy (acc.) on validation set +| └───model_cost: time taken to fit the model +| └───train_scores: performance on all metrics over the training set (dict) +| | └───f1: F1-score +| | └───acc: Accuracy +| | └───bal_acc: Balanced accuracy +| └───train_costs: time taken to compute performance on all metrics over the training set (dict) +| | └───f1: F1-score +| | └───acc: Accuracy +| | └───bal_acc: Balanced accuracy +| └───valid_scores: performance on all metrics over the validation set (dict) +| | └───... +| └───valid_costs: time taken to compute performance on all metrics over the validation set (dict) +| | └───... +| └───test_scores: performance on all metrics over the test set +| | └───... +| └───test_costs: time taken to compute performance on all metrics over the test set (dict) +| | └───... +``` + +*NOTE*: the keys `function_value`, `cost`, `info` need to exist when creating a new objective +function, while `info` can house any kind of auxilliary information required. 
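+
+A minimal usage sketch of how these keys are typically consumed (illustrative only: it assumes the `RandomForestBenchmark` from this package, OpenML task 167149 as used elsewhere in the repository, and the `objective` entry point as defined at this revision):
+
+```python
+from hpobench.benchmarks.ml.rf_benchmark import RandomForestBenchmark
+
+benchmark = RandomForestBenchmark(task_id=167149, seed=1, fidelity_choice=1)
+
+# sample a hyperparameter configuration and a fidelity from their respective spaces
+config = benchmark.get_configuration_space(seed=1).sample_configuration()
+fidelity = benchmark.get_fidelity_space(seed=1, fidelity_choice=1).sample_configuration()
+
+# evaluate on the validation split and inspect the returned dictionary
+result = benchmark.objective(config, fidelity)
+print(result['function_value'])        # 1 - validation accuracy
+print(result['cost'])                  # fit time + accuracy evaluation time
+print(result['info']['train_scores'])  # all training-set metrics (acc, bal_acc, f1)
+```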
\ No newline at end of file From e7f69b9e87731952d507d9db28cef42957be5a5b Mon Sep 17 00:00:00 2001 From: neeratyoy Date: Fri, 2 Jul 2021 17:04:26 +0200 Subject: [PATCH 023/147] Some decisions on lower bound for subsample fidelity --- hpobench/benchmarks/ml/ml_benchmark_template.py | 5 +++-- hpobench/benchmarks/ml/svm_benchmark_2.py | 10 ++++++---- 2 files changed, 9 insertions(+), 6 deletions(-) diff --git a/hpobench/benchmarks/ml/ml_benchmark_template.py b/hpobench/benchmarks/ml/ml_benchmark_template.py index cc543b50..3ad61b54 100644 --- a/hpobench/benchmarks/ml/ml_benchmark_template.py +++ b/hpobench/benchmarks/ml/ml_benchmark_template.py @@ -194,8 +194,9 @@ def load_data_from_openml(self, valid_size=None, verbose=False): # Similar to (https://arxiv.org/pdf/1605.07079.pdf) # use 10 times the number of classes as lower bound for the dataset fraction - n_classes = len(self.task.class_labels) # np.unique(self.train_y).shape[0] + n_classes = len(self.task.class_labels) self.lower_bound_train_size = (10 * n_classes) / self.train_X.shape[0] + self.lower_bound_train_size = np.max((1 / 512, self.lower_bound_train_size)) if verbose: print("Shape of data post-preprocessing: {}".format(train_X.shape), "\n") @@ -332,7 +333,7 @@ def objective_test( info = { 'train_loss': train_loss, - 'val_loss': test_loss, + 'test_loss': test_loss, 'model_cost': model_fit_time, 'train_scores': train_scores, 'train_costs': train_score_cost, diff --git a/hpobench/benchmarks/ml/svm_benchmark_2.py b/hpobench/benchmarks/ml/svm_benchmark_2.py index 12d22afa..845f40e0 100644 --- a/hpobench/benchmarks/ml/svm_benchmark_2.py +++ b/hpobench/benchmarks/ml/svm_benchmark_2.py @@ -45,8 +45,8 @@ def get_configuration_space(seed=None): ]) return cs - @staticmethod - def get_fidelity_space(seed=None, fidelity_choice=None): + @classmethod + def get_fidelity_space(cls, seed=None, fidelity_choice=None): """Fidelity space available --- specifies the fidelity dimensions For SVM, only a single fidelity exists, i.e., subsample fraction. 
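+        # Sketch (an assumption, mirroring the lower bound added in ml_benchmark_template.py):
+        #   lower_bound_train_size = max(1 / 512, (10 * n_classes) / n_train_samples)
+        # e.g. a 10-class task with 2000 training points gives max(~0.00195, 0.05) = 0.05,
+        # which would replace the hard-coded lower=0.1 below once the TODO is resolved.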
@@ -57,12 +57,14 @@ def get_fidelity_space(seed=None, fidelity_choice=None): """ z_cs = CS.ConfigurationSpace(seed=seed) - subsample_lower_bound = np.max((0.1, (0.1 or self.lower_bound_train_size))) + if fidelity_choice == 0: subsample = CS.Constant('subsample', value=1) else: + # TODO: dynamically adapt based on 1/512 and lower_bound_train_size and set log=True + lower = 0.1 subsample = CS.UniformFloatHyperparameter( - 'subsample', lower=subsample_lower_bound, upper=1, default_value=0.33, log=False + 'subsample', lower=lower, upper=1, default_value=0.33, log=False ) z_cs.add_hyperparameter(subsample) return z_cs From edb3e7fedd5010bab9a65ba2e5b21e708cf8c4e3 Mon Sep 17 00:00:00 2001 From: neeratyoy Date: Tue, 6 Jul 2021 20:20:27 +0200 Subject: [PATCH 024/147] AbstractBenchmark update for fidelity option + including XGBoost --- README.md | 7 +- examples/local/xgboost_local.py | 2 +- hpobench/abstract_benchmark.py | 7 +- hpobench/benchmarks/ml/histgb_benchmark.py | 4 +- .../benchmarks/ml/ml_benchmark_template.py | 82 +-- hpobench/benchmarks/ml/rf_benchmark.py | 4 +- hpobench/benchmarks/ml/svm_benchmark.py | 395 +++----------- hpobench/benchmarks/ml/svm_benchmark_2.py | 81 --- hpobench/benchmarks/ml/svm_benchmark_old.py | 350 ++++++++++++ hpobench/benchmarks/ml/xgboost_benchmark.py | 515 ++++-------------- .../benchmarks/ml/xgboost_benchmark_old.py | 426 +++++++++++++++ tests/test_utils.py | 2 +- tests/test_whitebox.py | 2 +- 13 files changed, 1004 insertions(+), 873 deletions(-) delete mode 100644 hpobench/benchmarks/ml/svm_benchmark_2.py create mode 100644 hpobench/benchmarks/ml/svm_benchmark_old.py create mode 100644 hpobench/benchmarks/ml/xgboost_benchmark_old.py diff --git a/README.md b/README.md index ff34f75a..a015792b 100644 --- a/README.md +++ b/README.md @@ -35,11 +35,14 @@ Further requirements are: [ConfigSpace](https://github.com/automl/ConfigSpace), This can be arbitrarily complex and further information can be found in the docstring of the benchmark. 
A simple example is the XGBoost benchmark which can be installed with `pip install .[xgboost]` + ```python -from hpobench.benchmarks.ml.xgboost_benchmark import XGBoostBenchmark +from hpobench.benchmarks.ml.xgboost_benchmark_old import XGBoostBenchmark + b = XGBoostBenchmark(task_id=167149) config = b.get_configuration_space(seed=1).sample_configuration() -result_dict = b.objective_function(configuration=config, fidelity={"n_estimators": 128, "dataset_fraction": 0.5}, rng=1) +result_dict = b.objective_function(configuration=config, + fidelity={"n_estimators": 128, "dataset_fraction": 0.5}, rng=1) ``` diff --git a/examples/local/xgboost_local.py b/examples/local/xgboost_local.py index 47c1f77f..4f3b3ad3 100644 --- a/examples/local/xgboost_local.py +++ b/examples/local/xgboost_local.py @@ -10,7 +10,7 @@ import argparse from time import time -from hpobench.benchmarks.ml.xgboost_benchmark import XGBoostBenchmark as Benchmark +from hpobench.benchmarks.ml.xgboost_benchmark_old import XGBoostBenchmark as Benchmark from hpobench.util.openml_data_manager import get_openmlcc18_taskids diff --git a/hpobench/abstract_benchmark.py b/hpobench/abstract_benchmark.py index 5d7bc994..abbbcb22 100644 --- a/hpobench/abstract_benchmark.py +++ b/hpobench/abstract_benchmark.py @@ -226,12 +226,17 @@ def get_configuration_space(seed: Union[int, None] = None) -> ConfigSpace.Config @staticmethod @abc.abstractmethod - def get_fidelity_space(seed: Union[int, None] = None) -> ConfigSpace.ConfigurationSpace: + def get_fidelity_space( + seed: Union[int, None] = None, fidelity_choice: Union[int, None] = None + ) -> ConfigSpace.ConfigurationSpace: """ Defines the available fidelity parameters as a "fidelity space" for each benchmark. Parameters ---------- seed: int, None Seed for the fidelity space. 
+ fidelity_choice: int, None + integer value to choose the type of fidelity space + Returns ------- ConfigSpace.ConfigurationSpace diff --git a/hpobench/benchmarks/ml/histgb_benchmark.py b/hpobench/benchmarks/ml/histgb_benchmark.py index ac273c57..21ed4ec0 100644 --- a/hpobench/benchmarks/ml/histgb_benchmark.py +++ b/hpobench/benchmarks/ml/histgb_benchmark.py @@ -19,10 +19,10 @@ from sklearn.experimental import enable_hist_gradient_boosting # noqa from sklearn.ensemble import HistGradientBoostingClassifier -from hpobench.benchmarks.ml.ml_benchmark_template import Benchmark +from hpobench.benchmarks.ml.ml_benchmark_template import MLBenchmark -class HistGBBenchmark(Benchmark): +class HistGBBenchmark(MLBenchmark): def __init__( self, task_id: Union[int, None] = None, diff --git a/hpobench/benchmarks/ml/ml_benchmark_template.py b/hpobench/benchmarks/ml/ml_benchmark_template.py index 3ad61b54..e0ab59bc 100644 --- a/hpobench/benchmarks/ml/ml_benchmark_template.py +++ b/hpobench/benchmarks/ml/ml_benchmark_template.py @@ -36,7 +36,7 @@ ) -class Benchmark(AbstractBenchmark): +class MLBenchmark(AbstractBenchmark): _issue_tasks = [3917, 3945] def __init__( @@ -48,7 +48,7 @@ def __init__( ): self.seed = seed if seed is not None else np.random.randint(1, 10 ** 6) self.rng = check_random_state(self.seed) - super(Benchmark, self).__init__(rng=seed) + super(MLBenchmark, self).__init__(rng=seed) self.task_id = task_id self.valid_size = valid_size @@ -84,7 +84,7 @@ def get_configuration_space(seed=None): raise NotImplementedError() @staticmethod - def get_fidelity_space(seed=None, fidelity_choice=1): + def get_fidelity_space(seed=None, fidelity_choice=None): """Fidelity space available --- specifies the fidelity dimensions If fidelity_choice is 0 @@ -194,8 +194,8 @@ def load_data_from_openml(self, valid_size=None, verbose=False): # Similar to (https://arxiv.org/pdf/1605.07079.pdf) # use 10 times the number of classes as lower bound for the dataset fraction - n_classes = len(self.task.class_labels) - self.lower_bound_train_size = (10 * n_classes) / self.train_X.shape[0] + self.n_classes = len(self.task.class_labels) + self.lower_bound_train_size = (10 * self.n_classes) / self.train_X.shape[0] self.lower_bound_train_size = np.max((1 / 512, self.lower_bound_train_size)) if verbose: @@ -219,7 +219,7 @@ def init_model(self, config, fidelity=None, rng=None): """ raise NotImplementedError() - def _raw_objective(self, config, fidelity, shuffle, rng, eval="valid"): + def _train_objective(self, config, fidelity, shuffle, rng, eval="valid"): # initializing model model = self.init_model(config, fidelity, rng) @@ -260,7 +260,7 @@ def _raw_objective(self, config, fidelity, shuffle, rng, eval="valid"): train_loss = 1 - scores["acc"] return model, model_fit_time, train_loss, scores, score_cost - def objective( + def objective_function( self, configuration: Union[CS.Configuration, Dict], fidelity: Union[CS.Configuration, Dict, None] = None, @@ -270,7 +270,7 @@ def objective( ) -> Dict: """Function that evaluates a 'config' on a 'fidelity' on the validation set """ - model, model_fit_time, train_loss, train_scores, train_score_cost = self._raw_objective( + model, model_fit_time, train_loss, train_scores, train_score_cost = self._train_objective( configuration, fidelity, shuffle, rng ) val_scores = dict() @@ -310,7 +310,7 @@ def objective( 'info': info } - def objective_test( + def objective_function_test( self, configuration: Union[CS.Configuration, Dict], fidelity: Union[CS.Configuration, Dict, None] = None, @@ -320,7 
+320,7 @@ def objective_test( ) -> Dict: """Function that evaluates a 'config' on a 'fidelity' on the test set """ - model, model_fit_time, train_loss, train_scores, train_score_cost = self._raw_objective( + model, model_fit_time, train_loss, train_scores, train_score_cost = self._train_objective( configuration, fidelity, shuffle, rng, eval="test" ) test_scores = dict() @@ -350,34 +350,40 @@ def objective_test( 'info': info } - # pylint: disable=arguments-differ - @AbstractBenchmark.check_parameters - def objective_function( - self, - configuration: Union[CS.Configuration, Dict], - fidelity: Union[CS.Configuration, Dict, None] = None, - shuffle: bool = False, - rng: Union[np.random.RandomState, int, None] = None, - **kwargs - ) -> Dict: - """Function that evaluates a 'config' on a 'fidelity' on the validation set - """ - return dict() - - # pylint: disable=arguments-differ - @AbstractBenchmark.check_parameters - def objective_function_test( - self, - configuration: Union[CS.Configuration, Dict], - fidelity: Union[CS.Configuration, Dict, None] = None, - shuffle: bool = False, - rng: Union[np.random.RandomState, int, None] = None, - **kwargs - ) -> Dict: - """Function that evaluates a 'config' on a 'fidelity' on the test set - """ - return dict() + # # pylint: disable=arguments-differ + # @AbstractBenchmark.check_parameters + # def objective_function( + # self, + # configuration: Union[CS.Configuration, Dict], + # fidelity: Union[CS.Configuration, Dict, None] = None, + # shuffle: bool = False, + # rng: Union[np.random.RandomState, int, None] = None, + # **kwargs + # ) -> Dict: + # """Function that evaluates a 'config' on a 'fidelity' on the validation set + # """ + # return dict() + # + # # pylint: disable=arguments-differ + # @AbstractBenchmark.check_parameters + # def objective_function_test( + # self, + # configuration: Union[CS.Configuration, Dict], + # fidelity: Union[CS.Configuration, Dict, None] = None, + # shuffle: bool = False, + # rng: Union[np.random.RandomState, int, None] = None, + # **kwargs + # ) -> Dict: + # """Function that evaluates a 'config' on a 'fidelity' on the test set + # """ + # return dict() def get_meta_information(self): """ Returns the meta information for the benchmark """ - pass + return {'name': 'Support Vector Machine', + 'shape of train data': self.x_train.shape, + 'shape of test data': self.x_test.shape, + 'shape of valid data': self.x_valid.shape, + 'initial random seed': self.rng, + 'task_id': self.task_id + } diff --git a/hpobench/benchmarks/ml/rf_benchmark.py b/hpobench/benchmarks/ml/rf_benchmark.py index 7426a37a..b815e1bd 100644 --- a/hpobench/benchmarks/ml/rf_benchmark.py +++ b/hpobench/benchmarks/ml/rf_benchmark.py @@ -15,10 +15,10 @@ from sklearn.model_selection import train_test_split from sklearn.metrics import accuracy_score, make_scorer -from hpobench.benchmarks.ml.ml_benchmark_template import Benchmark +from hpobench.benchmarks.ml.ml_benchmark_template import MLBenchmark -class RandomForestBenchmark(Benchmark): +class RandomForestBenchmark(MLBenchmark): def __init__( self, task_id: Union[int, None] = None, diff --git a/hpobench/benchmarks/ml/svm_benchmark.py b/hpobench/benchmarks/ml/svm_benchmark.py index 9aad5e44..1d0e2d00 100644 --- a/hpobench/benchmarks/ml/svm_benchmark.py +++ b/hpobench/benchmarks/ml/svm_benchmark.py @@ -1,354 +1,81 @@ -""" - -Changelog: -========== -0.0.3 -* New container release due to a general change in the communication between container and HPOBench. 
- Works with HPOBench >= v0.0.8 - -0.0.2: -* Standardize the structure of the meta information - -0.0.1: -* First implementation - -""" - -import logging import time -from typing import Union, Tuple, Dict, List - -import ConfigSpace as CS +import openml import numpy as np -from scipy import sparse -from sklearn import pipeline -from sklearn import svm -from sklearn.compose import ColumnTransformer +import pandas as pd +import ConfigSpace as CS +from typing import Union, Dict + +from sklearn.svm import SVC from sklearn.impute import SimpleImputer +from sklearn.utils import check_random_state +from sklearn.compose import ColumnTransformer +from sklearn.preprocessing import OneHotEncoder +from sklearn.preprocessing import StandardScaler +from sklearn.model_selection import train_test_split +from sklearn.pipeline import make_pipeline, Pipeline from sklearn.metrics import accuracy_score, make_scorer -from sklearn.preprocessing import OneHotEncoder, MinMaxScaler - -import hpobench.util.rng_helper as rng_helper -from hpobench.abstract_benchmark import AbstractBenchmark -from hpobench.util.openml_data_manager import OpenMLHoldoutDataManager - -__version__ = '0.0.3' - -logger = logging.getLogger('SVMBenchmark') - - -class SupportVectorMachine(AbstractBenchmark): - """ - Hyperparameter optimization task to optimize the regularization - parameter C and the kernel parameter gamma of a support vector machine. - Both hyperparameters are optimized on a log scale in [-10, 10]. - The X_test data set is only used for a final offline evaluation of - a configuration. For that the validation and training data is - concatenated to form the whole training data set. - """ - - def __init__(self, task_id: Union[int, None] = None, - rng: Union[np.random.RandomState, int, None] = None): - """ - Parameters - ---------- - task_id : int, None - rng : np.random.RandomState, int, None - """ - super(SupportVectorMachine, self).__init__(rng=rng) - self.task_id = task_id - self.cache_size = 200 # Cache for the SVC in MB - self.accuracy_scorer = make_scorer(accuracy_score) +from hpobench.benchmarks.ml.ml_benchmark_template import MLBenchmark - self.x_train, self.y_train, self.x_valid, self.y_valid, self.x_test, self.y_test, variable_types = \ - self.get_data() - self.categorical_data = np.array([var_type == 'categorical' for var_type in variable_types]) - - # Sort data (Categorical + numerical) so that categorical and continous are not mixed. 
- categorical_idx = np.argwhere(self.categorical_data) - continuous_idx = np.argwhere(~self.categorical_data) - sorting = np.concatenate([categorical_idx, continuous_idx]).squeeze() - self.categorical_data = self.categorical_data[sorting] - self.x_train = self.x_train[:, sorting] - self.x_valid = self.x_valid[:, sorting] - self.x_test = self.x_test[:, sorting] - - nan_columns = np.all(np.isnan(self.x_train), axis=0) - self.categorical_data = self.categorical_data[~nan_columns] - self.x_train, self.x_valid, self.x_test, self.categories = \ - OpenMLHoldoutDataManager.replace_nans_in_cat_columns(self.x_train, self.x_valid, self.x_test, - is_categorical=self.categorical_data) - - self.train_idx = self.rng.choice(a=np.arange(len(self.x_train)), - size=len(self.x_train), - replace=False) - - # Similar to [Fast Bayesian Optimization of Machine Learning Hyperparameters on Large Datasets] - # (https://arxiv.org/pdf/1605.07079.pdf), - # use 10 time the number of classes as lower bound for the dataset fraction - n_classes = np.unique(self.y_train).shape[0] - self.lower_bound_train_size = (10 * n_classes) / self.x_train.shape[0] - - def get_data(self) -> Tuple[np.ndarray, np.ndarray, np.ndarray, np.ndarray, np.ndarray, np.ndarray, List]: - """ Loads the data given a task or another source. """ - - assert self.task_id is not None, NotImplementedError('No task-id given. Please either specify a task-id or ' - 'overwrite the get_data method.') - - data_manager = OpenMLHoldoutDataManager(openml_task_id=self.task_id, rng=self.rng) - x_train, y_train, x_val, y_val, x_test, y_test = data_manager.load() - - return x_train, y_train, x_val, y_val, x_test, y_test, data_manager.variable_types - - def shuffle_data(self, rng=None): - """ Reshuffle the training data. If 'rng' is None, the training idx are shuffled according to the - class-random-state""" - random_state = rng_helper.get_rng(rng, self.rng) - random_state.shuffle(self.train_idx) - - # pylint: disable=arguments-differ - @AbstractBenchmark.check_parameters - def objective_function(self, configuration: Union[CS.Configuration, Dict], - fidelity: Union[CS.Configuration, Dict, None] = None, - shuffle: bool = False, - rng: Union[np.random.RandomState, int, None] = None, **kwargs) -> Dict: - """ - Trains a SVM model given a hyperparameter configuration and - evaluates the model on the validation set. - - Parameters - ---------- - configuration : Dict, CS.Configuration - Configuration for the SVM model - fidelity: Dict, None - Fidelity parameters for the SVM model, check get_fidelity_space(). Uses default (max) value if None. - shuffle : bool - If ``True``, shuffle the training idx. If no parameter ``rng`` is given, use the class random state. - Defaults to ``False``. - rng : np.random.RandomState, int, None, - Random seed for benchmark. By default the class level random seed. - - To prevent overfitting on a single seed, it is possible to pass a - parameter ``rng`` as 'int' or 'np.random.RandomState' to this function. - If this parameter is not given, the default random state is used. 
- kwargs - - Returns - ------- - Dict - - function_value : validation loss - cost : time to train and evaluate the model - info : Dict - train_loss : training loss - fidelity : used fidelities in this evaluation - """ - start_time = time.time() - - self.rng = rng_helper.get_rng(rng=rng, self_rng=self.rng) - - if shuffle: - self.shuffle_data(self.rng) - - # Split of dataset subset - if self.lower_bound_train_size > fidelity['dataset_fraction']: - train_size = self.lower_bound_train_size - logger.warning(f'The given data set fraction is lower than the lower bound (10 * number of classes.) ' - f'Increase the fidelity from {fidelity["dataset_fraction"]:.8f} to ' - f'{self.lower_bound_train_size:.8f}') - else: - train_size = fidelity['dataset_fraction'] - train_size = int(train_size * len(self.train_idx)) - train_idx = self.train_idx[:train_size] - - # Transform hyperparameters to linear scale - hp_c = np.exp(float(configuration['C'])) - hp_gamma = np.exp(float(configuration['gamma'])) - - # Train support vector machine - model = self.get_pipeline(hp_c, hp_gamma) - model.fit(self.x_train[train_idx], self.y_train[train_idx]) - - # Compute validation error - train_loss = 1 - self.accuracy_scorer(model, self.x_train[train_idx], self.y_train[train_idx]) - val_loss = 1 - self.accuracy_scorer(model, self.x_valid, self.y_valid) - - cost = time.time() - start_time - - return {'function_value': float(val_loss), - "cost": cost, - 'info': {'train_loss': float(train_loss), - 'fidelity': fidelity}} - - # pylint: disable=arguments-differ - @AbstractBenchmark.check_parameters - def objective_function_test(self, configuration: Union[CS.Configuration, Dict], - fidelity: Union[CS.Configuration, Dict, None] = None, - shuffle: bool = False, - rng: Union[np.random.RandomState, int, None] = None, **kwargs) -> Dict: - """ - Trains a SVM model with a given configuration on both the X_train - and validation data set and evaluates the model on the X_test data set. - - Parameters - ---------- - configuration : Dict, CS.Configuration - Configuration for the SVM Model - fidelity: Dict, None - Fidelity parameters, check get_fidelity_space(). Uses default (max) value if None. - shuffle : bool - If ``True``, shuffle the training idx. If no parameter ``rng`` is given, use the class random state. - Defaults to ``False``. - rng : np.random.RandomState, int, None, - Random seed for benchmark. By default the class level random seed. - To prevent overfitting on a single seed, it is possible to pass a - parameter ``rng`` as 'int' or 'np.random.RandomState' to this function. - If this parameter is not given, the default random state is used. 
- kwargs - - Returns - ------- - Dict - - function_value : X_test loss - cost : time to X_train and evaluate the model - info : Dict - train_valid_loss: Loss on the train+valid data set - fidelity : used fidelities in this evaluation - """ - assert np.isclose(fidelity['dataset_fraction'], 1), \ - f'Data set fraction must be 1 but was {fidelity["dataset_fraction"]}' - - self.rng = rng_helper.get_rng(rng=rng, self_rng=self.rng) - - if shuffle: - self.shuffle_data(self.rng) - - start_time = time.time() - - # Concatenate training and validation dataset - if isinstance(self.x_train, sparse.csr.csr_matrix) or isinstance(self.x_valid, sparse.csr.csr_matrix): - data = sparse.vstack((self.x_train, self.x_valid)) - else: - data = np.concatenate((self.x_train, self.x_valid)) - targets = np.concatenate((self.y_train, self.y_valid)) - - # Transform hyperparameters to linear scale - hp_c = np.exp(float(configuration['C'])) - hp_gamma = np.exp(float(configuration['gamma'])) - - model = self.get_pipeline(hp_c, hp_gamma) - model.fit(data, targets) - - # Compute validation error - train_valid_loss = 1 - self.accuracy_scorer(model, data, targets) - - # Compute test error - test_loss = 1 - self.accuracy_scorer(model, self.x_test, self.y_test) - - cost = time.time() - start_time - - return {'function_value': float(test_loss), - "cost": cost, - 'info': {'train_valid_loss': float(train_valid_loss), - 'fidelity': fidelity}} - - def get_pipeline(self, C: float, gamma: float) -> pipeline.Pipeline: - """ Create the scikit-learn (training-)pipeline """ - - model = pipeline.Pipeline([ - ('preprocess_impute', - ColumnTransformer([ - ("categorical", "passthrough", self.categorical_data), - ("continuous", SimpleImputer(strategy="mean"), ~self.categorical_data)])), - ('preprocess_one_hot', - ColumnTransformer([ - ("categorical", OneHotEncoder(categories=self.categories, sparse=False), self.categorical_data), - ("continuous", MinMaxScaler(feature_range=(0, 1)), ~self.categorical_data)])), - ('svm', - svm.SVC(gamma=gamma, C=C, random_state=self.rng, cache_size=self.cache_size)) - ]) - return model +class SVMBenchmark(MLBenchmark): + def __init__( + self, + task_id: Union[int, None] = None, + seed: Union[int, None] = None, # Union[np.random.RandomState, int, None] = None, + valid_size: float = 0.33, + fidelity_choice: int = 1 + ): + super(SVMBenchmark, self).__init__(task_id, seed, valid_size, fidelity_choice) + self.cache_size = 200 @staticmethod - def get_configuration_space(seed: Union[int, None] = None) -> CS.ConfigurationSpace: - """ - Creates a ConfigSpace.ConfigurationSpace containing all parameters for - the SVM Model - - For a detailed explanation of the hyperparameters: - https://scikit-learn.org/stable/modules/generated/sklearn.svm.SVC.html - - Parameters - ---------- - seed : int, None - Fixing the seed for the ConfigSpace.ConfigurationSpace - - Returns - ------- - ConfigSpace.ConfigurationSpace + def get_configuration_space(seed=None): + """Parameter space to be optimized --- contains the hyperparameters """ - - seed = seed if seed is not None else np.random.randint(1, 100000) cs = CS.ConfigurationSpace(seed=seed) - + # from https://github.com/automl/auto-sklearn/blob/master/autosklearn/pipeline/components/classification/libsvm_svc.p cs.add_hyperparameters([ - CS.UniformFloatHyperparameter('C', lower=-10., upper=10., default_value=0., log=False), - CS.UniformFloatHyperparameter('gamma', lower=-10., upper=10., default_value=1., log=False), + CS.UniformFloatHyperparameter( + "C", 0.03125, 32768, log=True, 
default_value=1.0 + ), + CS.UniformFloatHyperparameter( + "gamma", 3.0517578125e-05, 8, log=True, default_value=0.1 + ) ]) - # cs.generate_all_continuous_from_bounds(SupportVectorMachine.get_meta_information()['bounds']) return cs @staticmethod - def get_fidelity_space(seed: Union[int, None] = None) -> CS.ConfigurationSpace: - """ - Creates a ConfigSpace.ConfigurationSpace containing all fidelity parameters for - the SupportVector Benchmark - - Fidelities - ---------- - dataset_fraction: float - [0.1, 1] - fraction of training data set to use + def get_fidelity_space(seed=None, fidelity_choice=None): + """Fidelity space available --- specifies the fidelity dimensions - Parameters - ---------- - seed : int, None - Fixing the seed for the ConfigSpace.ConfigurationSpace + For SVM, only a single fidelity exists, i.e., subsample fraction. + if fidelity_choice == 0 + uses the entire data (subsample=1), reflecting the black-box setup + else + parameterizes the fraction of data to subsample - Returns - ------- - ConfigSpace.ConfigurationSpace """ - seed = seed if seed is not None else np.random.randint(1, 100000) - fidel_space = CS.ConfigurationSpace(seed=seed) + z_cs = CS.ConfigurationSpace(seed=seed) - fidel_space.add_hyperparameters([ - CS.UniformFloatHyperparameter("dataset_fraction", lower=0.0, upper=1.0, default_value=1.0, log=False), - ]) - return fidel_space - - def get_meta_information(self): - """ Returns the meta information for the benchmark """ - return {'name': 'Support Vector Machine', - 'references': ["@InProceedings{pmlr-v54-klein17a", - "author = {Aaron Klein and Stefan Falkner and Simon Bartels and Philipp Hennig and " - "Frank Hutter}, " - "title = {{Fast Bayesian Optimization of Machine Learning Hyperparameters on " - "Large Datasets}}" - "pages = {528--536}, year = {2017}," - "editor = {Aarti Singh and Jerry Zhu}," - "volume = {54}," - "series = {Proceedings of Machine Learning Research}," - "address = {Fort Lauderdale, FL, USA}," - "month = {20--22 Apr}," - "publisher = {PMLR}," - "pdf = {http://proceedings.mlr.press/v54/klein17a/klein17a.pdf}, " - "url = {http://proceedings.mlr.press/v54/klein17a.html}, " - ], - 'code': 'https://github.com/automl/HPOlib1.5/blob/container/hpolib/benchmarks/ml/svm_benchmark.py', - 'shape of train data': self.x_train.shape, - 'shape of test data': self.x_test.shape, - 'shape of valid data': self.x_valid.shape, - 'initial random seed': self.rng, - 'task_id': self.task_id - } + if fidelity_choice == 0: + subsample = CS.Constant('subsample', value=1) + else: + # TODO: dynamically adapt based on 1/512 and lower_bound_train_size and set log=True + lower = 0.1 + subsample = CS.UniformFloatHyperparameter( + 'subsample', lower=lower, upper=1, default_value=0.33, log=False + ) + z_cs.add_hyperparameter(subsample) + return z_cs + + def init_model(self, config, fidelity=None, rng=None): + # initializing model + rng = self.rng if rng is None else rng + config = config.get_dictionary() + model = SVC( + **config, + random_state=rng, + cache_size=self.cache_size + ) + return model diff --git a/hpobench/benchmarks/ml/svm_benchmark_2.py b/hpobench/benchmarks/ml/svm_benchmark_2.py deleted file mode 100644 index 845f40e0..00000000 --- a/hpobench/benchmarks/ml/svm_benchmark_2.py +++ /dev/null @@ -1,81 +0,0 @@ -import time -import openml -import numpy as np -import pandas as pd -import ConfigSpace as CS -from typing import Union, Dict - -from sklearn.svm import SVC -from sklearn.impute import SimpleImputer -from sklearn.utils import check_random_state -from 
sklearn.compose import ColumnTransformer -from sklearn.preprocessing import OneHotEncoder -from sklearn.preprocessing import StandardScaler -from sklearn.model_selection import train_test_split -from sklearn.pipeline import make_pipeline, Pipeline -from sklearn.metrics import accuracy_score, make_scorer - -from hpobench.benchmarks.ml.ml_benchmark_template import Benchmark - - -class SVMBenchmark(Benchmark): - def __init__( - self, - task_id: Union[int, None] = None, - seed: Union[int, None] = None, # Union[np.random.RandomState, int, None] = None, - valid_size: float = 0.33, - fidelity_choice: int = 1 - ): - super(SVMBenchmark, self).__init__(task_id, seed, valid_size, fidelity_choice) - self.cache_size = 200 - - @staticmethod - def get_configuration_space(seed=None): - """Parameter space to be optimized --- contains the hyperparameters - """ - cs = CS.ConfigurationSpace(seed=seed) - # from https://github.com/automl/auto-sklearn/blob/master/autosklearn/pipeline/components/classification/libsvm_svc.p - cs.add_hyperparameters([ - CS.UniformFloatHyperparameter( - "C", 0.03125, 32768, log=True, default_value=1.0 - ), - CS.UniformFloatHyperparameter( - "gamma", 3.0517578125e-05, 8, log=True, default_value=0.1 - ) - ]) - return cs - - @classmethod - def get_fidelity_space(cls, seed=None, fidelity_choice=None): - """Fidelity space available --- specifies the fidelity dimensions - - For SVM, only a single fidelity exists, i.e., subsample fraction. - if fidelity_choice == 0 - uses the entire data (subsample=1), reflecting the black-box setup - else - parameterizes the fraction of data to subsample - - """ - z_cs = CS.ConfigurationSpace(seed=seed) - - if fidelity_choice == 0: - subsample = CS.Constant('subsample', value=1) - else: - # TODO: dynamically adapt based on 1/512 and lower_bound_train_size and set log=True - lower = 0.1 - subsample = CS.UniformFloatHyperparameter( - 'subsample', lower=lower, upper=1, default_value=0.33, log=False - ) - z_cs.add_hyperparameter(subsample) - return z_cs - - def init_model(self, config, fidelity=None, rng=None): - # initializing model - rng = self.rng if rng is None else rng - config = config.get_dictionary() - model = SVC( - **config, - random_state=rng, - cache_size=self.cache_size - ) - return model diff --git a/hpobench/benchmarks/ml/svm_benchmark_old.py b/hpobench/benchmarks/ml/svm_benchmark_old.py new file mode 100644 index 00000000..0a765e45 --- /dev/null +++ b/hpobench/benchmarks/ml/svm_benchmark_old.py @@ -0,0 +1,350 @@ +""" + +Changelog: +========== +0.0.2: +* Standardize the structure of the meta information + +0.0.1: +* First implementation + +""" + +import logging +import time +from typing import Union, Tuple, Dict, List + +import ConfigSpace as CS +import numpy as np +from scipy import sparse +from sklearn import pipeline +from sklearn import svm +from sklearn.compose import ColumnTransformer +from sklearn.impute import SimpleImputer +from sklearn.metrics import accuracy_score, make_scorer +from sklearn.preprocessing import OneHotEncoder, MinMaxScaler + +import hpobench.util.rng_helper as rng_helper +from hpobench.abstract_benchmark import AbstractBenchmark +from hpobench.util.openml_data_manager import OpenMLHoldoutDataManager + +__version__ = '0.0.2' + +logger = logging.getLogger('SVMBenchmark') + + +class SupportVectorMachine(AbstractBenchmark): + """ + Hyperparameter optimization task to optimize the regularization + parameter C and the kernel parameter gamma of a support vector machine. 
+ Both hyperparameters are optimized on a log scale in [-10, 10]. + The X_test data set is only used for a final offline evaluation of + a configuration. For that the validation and training data is + concatenated to form the whole training data set. + """ + + def __init__(self, task_id: Union[int, None] = None, + rng: Union[np.random.RandomState, int, None] = None): + """ + Parameters + ---------- + task_id : int, None + rng : np.random.RandomState, int, None + """ + super(SupportVectorMachine, self).__init__(rng=rng) + + self.task_id = task_id + self.cache_size = 200 # Cache for the SVC in MB + self.accuracy_scorer = make_scorer(accuracy_score) + + self.x_train, self.y_train, self.x_valid, self.y_valid, self.x_test, self.y_test, variable_types = \ + self.get_data() + self.categorical_data = np.array([var_type == 'categorical' for var_type in variable_types]) + + # Sort data (Categorical + numerical) so that categorical and continous are not mixed. + categorical_idx = np.argwhere(self.categorical_data) + continuous_idx = np.argwhere(~self.categorical_data) + sorting = np.concatenate([categorical_idx, continuous_idx]).squeeze() + self.categorical_data = self.categorical_data[sorting] + self.x_train = self.x_train[:, sorting] + self.x_valid = self.x_valid[:, sorting] + self.x_test = self.x_test[:, sorting] + + nan_columns = np.all(np.isnan(self.x_train), axis=0) + self.categorical_data = self.categorical_data[~nan_columns] + self.x_train, self.x_valid, self.x_test, self.categories = \ + OpenMLHoldoutDataManager.replace_nans_in_cat_columns(self.x_train, self.x_valid, self.x_test, + is_categorical=self.categorical_data) + + self.train_idx = self.rng.choice(a=np.arange(len(self.x_train)), + size=len(self.x_train), + replace=False) + + # Similar to [Fast Bayesian Optimization of Machine Learning Hyperparameters on Large Datasets] + # (https://arxiv.org/pdf/1605.07079.pdf), + # use 10 time the number of classes as lower bound for the dataset fraction + n_classes = np.unique(self.y_train).shape[0] + self.lower_bound_train_size = (10 * n_classes) / self.x_train.shape[0] + + def get_data(self) -> Tuple[np.ndarray, np.ndarray, np.ndarray, np.ndarray, np.ndarray, np.ndarray, List]: + """ Loads the data given a task or another source. """ + + assert self.task_id is not None, NotImplementedError('No task-id given. Please either specify a task-id or ' + 'overwrite the get_data method.') + + data_manager = OpenMLHoldoutDataManager(openml_task_id=self.task_id, rng=self.rng) + x_train, y_train, x_val, y_val, x_test, y_test = data_manager.load() + + return x_train, y_train, x_val, y_val, x_test, y_test, data_manager.variable_types + + def shuffle_data(self, rng=None): + """ Reshuffle the training data. If 'rng' is None, the training idx are shuffled according to the + class-random-state""" + random_state = rng_helper.get_rng(rng, self.rng) + random_state.shuffle(self.train_idx) + + # pylint: disable=arguments-differ + @AbstractBenchmark.check_parameters + def objective_function(self, configuration: Union[CS.Configuration, Dict], + fidelity: Union[CS.Configuration, Dict, None] = None, + shuffle: bool = False, + rng: Union[np.random.RandomState, int, None] = None, **kwargs) -> Dict: + """ + Trains a SVM model given a hyperparameter configuration and + evaluates the model on the validation set. + + Parameters + ---------- + configuration : Dict, CS.Configuration + Configuration for the SVM model + fidelity: Dict, None + Fidelity parameters for the SVM model, check get_fidelity_space(). 
Uses default (max) value if None. + shuffle : bool + If ``True``, shuffle the training idx. If no parameter ``rng`` is given, use the class random state. + Defaults to ``False``. + rng : np.random.RandomState, int, None, + Random seed for benchmark. By default the class level random seed. + + To prevent overfitting on a single seed, it is possible to pass a + parameter ``rng`` as 'int' or 'np.random.RandomState' to this function. + If this parameter is not given, the default random state is used. + kwargs + + Returns + ------- + Dict - + function_value : validation loss + cost : time to train and evaluate the model + info : Dict + train_loss : training loss + fidelity : used fidelities in this evaluation + """ + start_time = time.time() + + self.rng = rng_helper.get_rng(rng=rng, self_rng=self.rng) + + if shuffle: + self.shuffle_data(self.rng) + + # Split of dataset subset + if self.lower_bound_train_size > fidelity['dataset_fraction']: + train_size = self.lower_bound_train_size + logger.warning(f'The given data set fraction is lower than the lower bound (10 * number of classes.) ' + f'Increase the fidelity from {fidelity["dataset_fraction"]:.8f} to ' + f'{self.lower_bound_train_size:.8f}') + else: + train_size = fidelity['dataset_fraction'] + + train_size = int(train_size * len(self.train_idx)) + train_idx = self.train_idx[:train_size] + + # Transform hyperparameters to linear scale + hp_c = np.exp(float(configuration['C'])) + hp_gamma = np.exp(float(configuration['gamma'])) + + # Train support vector machine + model = self.get_pipeline(hp_c, hp_gamma) + model.fit(self.x_train[train_idx], self.y_train[train_idx]) + + # Compute validation error + train_loss = 1 - self.accuracy_scorer(model, self.x_train[train_idx], self.y_train[train_idx]) + val_loss = 1 - self.accuracy_scorer(model, self.x_valid, self.y_valid) + + cost = time.time() - start_time + + return {'function_value': float(val_loss), + "cost": cost, + 'info': {'train_loss': float(train_loss), + 'fidelity': fidelity}} + + # pylint: disable=arguments-differ + @AbstractBenchmark.check_parameters + def objective_function_test(self, configuration: Union[CS.Configuration, Dict], + fidelity: Union[CS.Configuration, Dict, None] = None, + shuffle: bool = False, + rng: Union[np.random.RandomState, int, None] = None, **kwargs) -> Dict: + """ + Trains a SVM model with a given configuration on both the X_train + and validation data set and evaluates the model on the X_test data set. + + Parameters + ---------- + configuration : Dict, CS.Configuration + Configuration for the SVM Model + fidelity: Dict, None + Fidelity parameters, check get_fidelity_space(). Uses default (max) value if None. + shuffle : bool + If ``True``, shuffle the training idx. If no parameter ``rng`` is given, use the class random state. + Defaults to ``False``. + rng : np.random.RandomState, int, None, + Random seed for benchmark. By default the class level random seed. + To prevent overfitting on a single seed, it is possible to pass a + parameter ``rng`` as 'int' or 'np.random.RandomState' to this function. + If this parameter is not given, the default random state is used. 
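+
+            For example, ``objective_function(config, fidelity, rng=1)`` and
+            ``objective_function(config, fidelity, rng=np.random.RandomState(1))`` are
+            both valid calls (illustrative only; ``config`` and ``fidelity`` are assumed
+            to be drawn from the configuration and fidelity spaces).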
+ kwargs + + Returns + ------- + Dict - + function_value : X_test loss + cost : time to X_train and evaluate the model + info : Dict + train_valid_loss: Loss on the train+valid data set + fidelity : used fidelities in this evaluation + """ + assert np.isclose(fidelity['dataset_fraction'], 1), \ + f'Data set fraction must be 1 but was {fidelity["dataset_fraction"]}' + + self.rng = rng_helper.get_rng(rng=rng, self_rng=self.rng) + + if shuffle: + self.shuffle_data(self.rng) + + start_time = time.time() + + # Concatenate training and validation dataset + if isinstance(self.x_train, sparse.csr.csr_matrix) or isinstance(self.x_valid, sparse.csr.csr_matrix): + data = sparse.vstack((self.x_train, self.x_valid)) + else: + data = np.concatenate((self.x_train, self.x_valid)) + targets = np.concatenate((self.y_train, self.y_valid)) + + # Transform hyperparameters to linear scale + hp_c = np.exp(float(configuration['C'])) + hp_gamma = np.exp(float(configuration['gamma'])) + + model = self.get_pipeline(hp_c, hp_gamma) + model.fit(data, targets) + + # Compute validation error + train_valid_loss = 1 - self.accuracy_scorer(model, data, targets) + + # Compute test error + test_loss = 1 - self.accuracy_scorer(model, self.x_test, self.y_test) + + cost = time.time() - start_time + + return {'function_value': float(test_loss), + "cost": cost, + 'info': {'train_valid_loss': float(train_valid_loss), + 'fidelity': fidelity}} + + def get_pipeline(self, C: float, gamma: float) -> pipeline.Pipeline: + """ Create the scikit-learn (training-)pipeline """ + + model = pipeline.Pipeline([ + ('preprocess_impute', + ColumnTransformer([ + ("categorical", "passthrough", self.categorical_data), + ("continuous", SimpleImputer(strategy="mean"), ~self.categorical_data)])), + ('preprocess_one_hot', + ColumnTransformer([ + ("categorical", OneHotEncoder(categories=self.categories, sparse=False), self.categorical_data), + ("continuous", MinMaxScaler(feature_range=(0, 1)), ~self.categorical_data)])), + ('svm', + svm.SVC(gamma=gamma, C=C, random_state=self.rng, cache_size=self.cache_size)) + ]) + return model + + @staticmethod + def get_configuration_space(seed: Union[int, None] = None) -> CS.ConfigurationSpace: + """ + Creates a ConfigSpace.ConfigurationSpace containing all parameters for + the SVM Model + + For a detailed explanation of the hyperparameters: + https://scikit-learn.org/stable/modules/generated/sklearn.svm.SVC.html + + Parameters + ---------- + seed : int, None + Fixing the seed for the ConfigSpace.ConfigurationSpace + + Returns + ------- + ConfigSpace.ConfigurationSpace + """ + + seed = seed if seed is not None else np.random.randint(1, 100000) + cs = CS.ConfigurationSpace(seed=seed) + + cs.add_hyperparameters([ + CS.UniformFloatHyperparameter('C', lower=-10., upper=10., default_value=0., log=False), + CS.UniformFloatHyperparameter('gamma', lower=-10., upper=10., default_value=1., log=False), + ]) + # cs.generate_all_continuous_from_bounds(SupportVectorMachine.get_meta_information()['bounds']) + return cs + + @staticmethod + def get_fidelity_space(seed: Union[int, None] = None) -> CS.ConfigurationSpace: + """ + Creates a ConfigSpace.ConfigurationSpace containing all fidelity parameters for + the SupportVector Benchmark + + Fidelities + ---------- + dataset_fraction: float - [0.1, 1] + fraction of training data set to use + + Parameters + ---------- + seed : int, None + Fixing the seed for the ConfigSpace.ConfigurationSpace + + Returns + ------- + ConfigSpace.ConfigurationSpace + """ + seed = seed if seed is not None 
else np.random.randint(1, 100000) + fidel_space = CS.ConfigurationSpace(seed=seed) + + fidel_space.add_hyperparameters([ + CS.UniformFloatHyperparameter("dataset_fraction", lower=0.0, upper=1.0, default_value=1.0, log=False), + ]) + return fidel_space + + def get_meta_information(self): + """ Returns the meta information for the benchmark """ + return {'name': 'Support Vector Machine', + 'references': ["@InProceedings{pmlr-v54-klein17a", + "author = {Aaron Klein and Stefan Falkner and Simon Bartels and Philipp Hennig and " + "Frank Hutter}, " + "title = {{Fast Bayesian Optimization of Machine Learning Hyperparameters on " + "Large Datasets}}" + "pages = {528--536}, year = {2017}," + "editor = {Aarti Singh and Jerry Zhu}," + "volume = {54}," + "series = {Proceedings of Machine Learning Research}," + "address = {Fort Lauderdale, FL, USA}," + "month = {20--22 Apr}," + "publisher = {PMLR}," + "pdf = {http://proceedings.mlr.press/v54/klein17a/klein17a.pdf}, " + "url = {http://proceedings.mlr.press/v54/klein17a.html}, " + ], + 'code': 'https://github.com/automl/HPOlib1.5/blob/container/hpolib/benchmarks/ml/svm_benchmark.py', + 'shape of train data': self.x_train.shape, + 'shape of test data': self.x_test.shape, + 'shape of valid data': self.x_valid.shape, + 'initial random seed': self.rng, + 'task_id': self.task_id + } diff --git a/hpobench/benchmarks/ml/xgboost_benchmark.py b/hpobench/benchmarks/ml/xgboost_benchmark.py index e956d5a4..b038e4c9 100644 --- a/hpobench/benchmarks/ml/xgboost_benchmark.py +++ b/hpobench/benchmarks/ml/xgboost_benchmark.py @@ -1,430 +1,125 @@ -""" - -Changelog: -========== -0.0.3 -* New container release due to a general change in the communication between container and HPOBench. - Works with HPOBench >= v0.0.8 - -0.0.2: -* Change the search space definiton to match the paper: (https://arxiv.org/pdf/1802.09596.pdf) - eta: [1e-5, 1] (def: 0.3) -> [2**-10, 1] (def: 0.3) - min_child_weight: [0,05, 10] (def: 1) -> [1, 2**7] (def: 1) - colsample_bytree: [0,05, 1] (def: 1) -> [0.01, 1] (def: 1) - colsample_bylevel: [0,05, 1] (def: 1) -> [0.01, 1] (def: 1) - reg_lambda: [1e-5, 2] (def: 1) -> [2**-10, 2**10] (def: 1) - reg_alpha: [1e-5, 2] (def: 1e-5) -> [2**-10, 2**10] (def: 1) - max_depth: - -> [1, 15] (def: 6) - subsample_per_it: - -> [0.01, 1] (def: 1) - [booster: - -> [gbtree, gblinear, dart] (def: gbtree)] *) - - *) This parameter is only in the XGBoostExtendedBenchmark. Not in the XGBoostBenchmark class. - -* Increase the fidelity `n_estimators` - n_estimators [2, 128] (def: 128) -> [1, 256] (def: 256) - -* Add class to optimize also the used booster method: (gbtree, gblinear or dart) - We have introduced a new class, which adds the used booster as parameter to the configuration space. To read more - about booster, please take a look in the official XGBoost-documentation (https://xgboost.readthedocs.io/en/latest). - - -0.0.1: -* First implementation of a XGBoost Benchmark. 
- - -""" - -import logging import time -from typing import Union, Tuple, Dict, List - -import ConfigSpace as CS +import openml import numpy as np +import pandas as pd +import ConfigSpace as CS +from typing import Union, Dict + import xgboost as xgb -from sklearn import pipeline -from sklearn.compose import ColumnTransformer from sklearn.impute import SimpleImputer -from sklearn.metrics import accuracy_score, make_scorer +from sklearn.pipeline import make_pipeline +from sklearn.utils import check_random_state +from sklearn.compose import ColumnTransformer from sklearn.preprocessing import OneHotEncoder +from sklearn.preprocessing import StandardScaler +from sklearn.model_selection import train_test_split +from sklearn.metrics import accuracy_score, make_scorer -import hpobench.util.rng_helper as rng_helper -from hpobench.abstract_benchmark import AbstractBenchmark -from hpobench.util.openml_data_manager import OpenMLHoldoutDataManager - -__version__ = '0.0.3' - -logger = logging.getLogger('XGBBenchmark') - - -class XGBoostBenchmark(AbstractBenchmark): - - def __init__(self, task_id: Union[int, None] = None, n_threads: int = 1, - rng: Union[np.random.RandomState, int, None] = None): - """ - - Parameters - ---------- - task_id : int, None - n_threads : int, None - rng : np.random.RandomState, int, None - """ - - super(XGBoostBenchmark, self).__init__(rng=rng) - self.n_threads = n_threads - self.task_id = task_id - self.accuracy_scorer = make_scorer(accuracy_score) - - self.x_train, self.y_train, self.x_valid, self.y_valid, self.x_test, self.y_test, variable_types = \ - self.get_data() - self.categorical_data = np.array([var_type == 'categorical' for var_type in variable_types]) - - # XGB needs sorted data. Data should be (Categorical + numerical) not mixed. - categorical_idx = np.argwhere(self.categorical_data) - continuous_idx = np.argwhere(~self.categorical_data) - sorting = np.concatenate([categorical_idx, continuous_idx]).squeeze() - self.categorical_data = self.categorical_data[sorting] - self.x_train = self.x_train[:, sorting] - self.x_valid = self.x_valid[:, sorting] - self.x_test = self.x_test[:, sorting] - - nan_columns = np.all(np.isnan(self.x_train), axis=0) - self.categorical_data = self.categorical_data[~nan_columns] - - self.x_train, self.x_valid, self.x_test, self.categories = \ - OpenMLHoldoutDataManager.replace_nans_in_cat_columns(self.x_train, self.x_valid, self.x_test, - is_categorical=self.categorical_data) - - # Determine the number of categories in the labels. - # In case of binary classification ``self.num_class`` has to be 1 for xgboost. - self.num_class = len(np.unique(np.concatenate([self.y_train, self.y_test, self.y_valid]))) - self.num_class = 1 if self.num_class == 2 else self.num_class - - self.train_idx = self.rng.choice(a=np.arange(len(self.x_train)), - size=len(self.x_train), - replace=False) - - # Similar to [Fast Bayesian Optimization of Machine Learning Hyperparameters on Large Datasets] - # (https://arxiv.org/pdf/1605.07079.pdf), - # use 10 time the number of classes as lower bound for the dataset fraction - n_classes = np.unique(self.y_train).shape[0] - self.lower_bound_train_size = (10 * n_classes) / self.x_train.shape[0] - - def get_data(self) -> Tuple[np.ndarray, np.ndarray, np.ndarray, np.ndarray, np.ndarray, np.ndarray, List]: - """ Loads the data given a task or another source. """ - - assert self.task_id is not None, NotImplementedError('No task-id given. 
Please either specify a task-id or ' - 'overwrite the get_data method.') - - data_manager = OpenMLHoldoutDataManager(openml_task_id=self.task_id, rng=self.rng) - x_train, y_train, x_val, y_val, x_test, y_test = data_manager.load() - - return x_train, y_train, x_val, y_val, x_test, y_test, data_manager.variable_types - - def shuffle_data(self, rng=None): - """ Reshuffle the training data. If 'rng' is None, the training idx are shuffled according to the - class-random-state""" - random_state = rng_helper.get_rng(rng, self.rng) - random_state.shuffle(self.train_idx) - - # pylint: disable=arguments-differ - @AbstractBenchmark.check_parameters - def objective_function(self, configuration: Union[CS.Configuration, Dict], - fidelity: Union[CS.Configuration, Dict, None] = None, - shuffle: bool = False, - rng: Union[np.random.RandomState, int, None] = None, **kwargs) -> Dict: - """ - Trains a XGBoost model given a hyperparameter configuration and - evaluates the model on the validation set. - - Parameters - ---------- - configuration : Dict, CS.Configuration - Configuration for the XGBoost model - fidelity: Dict, None - Fidelity parameters for the XGBoost model, check get_fidelity_space(). Uses default (max) value if None. - shuffle : bool - If ``True``, shuffle the training idx. If no parameter ``rng`` is given, use the class random state. - Defaults to ``False``. - rng : np.random.RandomState, int, None, - Random seed for benchmark. By default the class level random seed. - - To prevent overfitting on a single seed, it is possible to pass a - parameter ``rng`` as 'int' or 'np.random.RandomState' to this function. - If this parameter is not given, the default random state is used. - kwargs - - Returns - ------- - Dict - - function_value : validation loss - cost : time to train and evaluate the model - info : Dict - train_loss : trainings loss - fidelity : used fidelities in this evaluation - """ - self.rng = rng_helper.get_rng(rng=rng, self_rng=self.rng) - - if shuffle: - self.shuffle_data(self.rng) - - start = time.time() - - if self.lower_bound_train_size > fidelity['dataset_fraction']: - train_data_fraction = self.lower_bound_train_size - logger.warning(f'The given data set fraction is lower than the lower bound (10 * number of classes.) ' - f'Increase the fidelity from {fidelity["dataset_fraction"]:.8f} to ' - f'{self.lower_bound_train_size:.8f}') - else: - train_data_fraction = fidelity['dataset_fraction'] - - train_idx = self.train_idx[:int(len(self.train_idx) * train_data_fraction)] - - model = self._get_pipeline(n_estimators=fidelity["n_estimators"], **configuration) - model.fit(X=self.x_train[train_idx], y=self.y_train[train_idx]) - - train_loss = 1 - self.accuracy_scorer(model, self.x_train[train_idx], self.y_train[train_idx]) - val_loss = 1 - self.accuracy_scorer(model, self.x_valid, self.y_valid) - cost = time.time() - start - - return {'function_value': float(val_loss), - 'cost': cost, - 'info': {'train_loss': float(train_loss), - 'fidelity': fidelity} - } - - # pylint: disable=arguments-differ - @AbstractBenchmark.check_parameters - def objective_function_test(self, configuration: Union[CS.Configuration, Dict], - fidelity: Union[CS.Configuration, Dict, None] = None, - shuffle: bool = False, - rng: Union[np.random.RandomState, int, None] = None, **kwargs) -> Dict: - """ - Trains a XGBoost model with a given configuration on both the train - and validation data set and evaluates the model on the test data set. 
- - Parameters - ---------- - configuration : Dict, CS.Configuration - Configuration for the XGBoost Model - fidelity: Dict, None - Fidelity parameters, check get_fidelity_space(). Uses default (max) value if None. - shuffle : bool - If ``True``, shuffle the training idx. If no parameter ``rng`` is given, use the class random state. - Defaults to ``False``. - rng : np.random.RandomState, int, None, - Random seed for benchmark. By default the class level random seed. - To prevent overfitting on a single seed, it is possible to pass a - parameter ``rng`` as 'int' or 'np.random.RandomState' to this function. - If this parameter is not given, the default random state is used. - kwargs - - Returns - ------- - Dict - - function_value : test loss - cost : time to train and evaluate the model - info : Dict - fidelity : used fidelities in this evaluation - """ - default_dataset_fraction = self.get_fidelity_space().get_hyperparameter('dataset_fraction').default_value - if fidelity['dataset_fraction'] != default_dataset_fraction: - raise NotImplementedError(f'Test error can not be computed for dataset_fraction <= ' - f'{default_dataset_fraction}') - - self.rng = rng_helper.get_rng(rng=rng, self_rng=self.rng) - - if shuffle: - self.shuffle_data(self.rng) - - start = time.time() - - # Impute potential nan values with the feature- - data = np.concatenate((self.x_train, self.x_valid)) - targets = np.concatenate((self.y_train, self.y_valid)) - - model = self._get_pipeline(n_estimators=fidelity['n_estimators'], **configuration) - model.fit(X=data, y=targets) +from hpobench.benchmarks.ml.ml_benchmark_template import MLBenchmark - test_loss = 1 - self.accuracy_scorer(model, self.x_test, self.y_test) - cost = time.time() - start - return {'function_value': float(test_loss), - 'cost': cost, - 'info': {'fidelity': fidelity}} +class XGBoostBenchmark(MLBenchmark): + def __init__( + self, + task_id: Union[int, None] = None, + seed: Union[int, None] = None, # Union[np.random.RandomState, int, None] = None, + valid_size: float = 0.33, + fidelity_choice: int = 1 + ): + super(XGBoostBenchmark, self).__init__(task_id, seed, valid_size, fidelity_choice) + pass @staticmethod - def get_configuration_space(seed: Union[int, None] = None) -> CS.ConfigurationSpace: + def get_configuration_space(seed=None): + """Parameter space to be optimized --- contains the hyperparameters """ - Creates a ConfigSpace.ConfigurationSpace containing all parameters for - the XGBoost Model - - Parameters - ---------- - seed : int, None - Fixing the seed for the ConfigSpace.ConfigurationSpace - - Returns - ------- - ConfigSpace.ConfigurationSpace - """ - seed = seed if seed is not None else np.random.randint(1, 100000) cs = CS.ConfigurationSpace(seed=seed) cs.add_hyperparameters([ - CS.UniformFloatHyperparameter('eta', lower=2**-10, upper=1., default_value=0.3, log=True), - CS.UniformIntegerHyperparameter('max_depth', lower=1, upper=15, default_value=6, log=False), - CS.UniformFloatHyperparameter('min_child_weight', lower=1., upper=2**7., default_value=1., log=True), - CS.UniformFloatHyperparameter('colsample_bytree', lower=0.01, upper=1., default_value=1.), - CS.UniformFloatHyperparameter('colsample_bylevel', lower=0.01, upper=1., default_value=1.), - CS.UniformFloatHyperparameter('reg_lambda', lower=2**-10, upper=2**10, default_value=1, log=True), - CS.UniformFloatHyperparameter('reg_alpha', lower=2**-10, upper=2**10, default_value=1, log=True), - CS.UniformFloatHyperparameter('subsample_per_it', lower=0.1, upper=1, default_value=1, log=False) 
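+            # Ranges below follow the earlier XGBoost space documented in the 0.0.2 changelog
+            # (retained in xgboost_benchmark_old.py, after https://arxiv.org/pdf/1802.09596.pdf);
+            # only a subset of those hyperparameters is kept active here.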
+ CS.UniformFloatHyperparameter( + 'eta', lower=2**-10, upper=1., default_value=0.3, log=True + ), # learning rate + CS.UniformIntegerHyperparameter( + 'max_depth', lower=1, upper=15, default_value=6, log=False + ), + CS.UniformFloatHyperparameter( + 'min_child_weight', lower=1., upper=2**7., default_value=1., log=True + ), + CS.UniformFloatHyperparameter( + 'colsample_bytree', lower=0.01, upper=1., default_value=1. + ), + # CS.UniformFloatHyperparameter( + # 'colsample_bylevel', lower=0.01, upper=1., default_value=1. + # ), + CS.UniformFloatHyperparameter( + 'reg_lambda', lower=2**-10, upper=2**10, default_value=1, log=True + ), + # CS.UniformFloatHyperparameter( + # 'reg_alpha', lower=2**-10, upper=2**10, default_value=1, log=True + # ), + # CS.UniformFloatHyperparameter( + # 'subsample_per_it', lower=0.1, upper=1, default_value=1, log=False + # ) ]) - return cs @staticmethod - def get_fidelity_space(seed: Union[int, None] = None) -> CS.ConfigurationSpace: + def get_fidelity_space(seed=None, fidelity_choice=1): + """Fidelity space available --- specifies the fidelity dimensions + + If fidelity_choice is 0 + Fidelity space is the maximal fidelity, akin to a black-box function + If fidelity_choice is 1 + Fidelity space is a single fidelity, in this case the number of trees (n_estimators) + If fidelity_choice is 2 + Fidelity space is a single fidelity, in this case the fraction of dataset (subsample) + If fidelity_choice is >2 + Fidelity space is multi-multi fidelity, all possible fidelities """ - Creates a ConfigSpace.ConfigurationSpace containing all fidelity parameters for - the XGBoost Benchmark - - Parameters - ---------- - seed : int, None - Fixing the seed for the ConfigSpace.ConfigurationSpace - - Returns - ------- - ConfigSpace.ConfigurationSpace + z_cs = CS.ConfigurationSpace(seed=seed) + subsample_lower_bound = np.max((0.1, (0.1 or self.lower_bound_train_size))) + if fidelity_choice == 0: + # only subsample as fidelity + ntrees = CS.Constant('n_estimators', value=100) + subsample = CS.Constant('subsample', value=1) + elif fidelity_choice == 1: + # only n_estimators as fidelity + ntrees = CS.UniformIntegerHyperparameter( + 'n_estimators', lower=2, upper=100, default_value=10, log=False + ) + subsample = CS.Constant('subsample', value=1) + elif fidelity_choice == 2: + # only subsample as fidelity + ntrees = CS.Constant('n_estimators', value=100) + subsample = CS.UniformFloatHyperparameter( + 'subsample', lower=subsample_lower_bound, upper=1, default_value=1, log=False + ) + else: + # both n_estimators and subsample as fidelities + ntrees = CS.UniformIntegerHyperparameter( + 'n_estimators', lower=2, upper=100, default_value=10, log=False + ) + subsample = CS.UniformFloatHyperparameter( + 'subsample', lower=subsample_lower_bound, upper=1, default_value=1, log=False + ) + z_cs.add_hyperparameters([ntrees, subsample]) + return z_cs + + def init_model(self, config, fidelity=None, rng=None): + """ Function that returns the model initialized based on the configuration and fidelity """ - seed = seed if seed is not None else np.random.randint(1, 100000) - fidel_space = CS.ConfigurationSpace(seed=seed) - - fidel_space.add_hyperparameters([ - CS.UniformFloatHyperparameter("dataset_fraction", lower=0.0, upper=1.0, default_value=1.0, log=False), - CS.UniformIntegerHyperparameter("n_estimators", lower=1, upper=256, default_value=256, log=False) - ]) - - return fidel_space - - def get_meta_information(self) -> Dict: - """ Returns the meta information for the benchmark """ - return {'name': 
'XGBoost', - 'references': ['@article{probst2019tunability,' - 'title={Tunability: Importance of hyperparameters of machine learning algorithms.},' - 'author={Probst, Philipp and Boulesteix, Anne-Laure and Bischl, Bernd},' - 'journal={J. Mach. Learn. Res.},' - 'volume={20},' - 'number={53},' - 'pages={1--32},' - 'year={2019}' - '}'], - 'code': 'https://github.com/automl/HPOlib1.5/blob/development/hpolib/benchmarks/ml/' - 'xgboost_benchmark.py', - 'shape of train data': self.x_train.shape, - 'shape of test data': self.x_test.shape, - 'shape of valid data': self.x_valid.shape, - 'initial random seed': self.rng, - 'task_id': self.task_id - } - - def _get_pipeline(self, max_depth: int, eta: float, min_child_weight: int, - colsample_bytree: float, colsample_bylevel: float, reg_lambda: int, reg_alpha: int, - n_estimators: int, subsample_per_it: float) \ - -> pipeline.Pipeline: - """ Create the scikit-learn (training-)pipeline """ - objective = 'binary:logistic' if self.num_class <= 2 else 'multi:softmax' - - clf = pipeline.Pipeline([ - ('preprocess_impute', - ColumnTransformer([ - ("categorical", "passthrough", self.categorical_data), - ("continuous", SimpleImputer(strategy="mean"), ~self.categorical_data)])), - ('preprocess_one_hot', - ColumnTransformer([ - ("categorical", OneHotEncoder(categories=self.categories, sparse=False), self.categorical_data), - ("continuous", "passthrough", ~self.categorical_data)])), - ('xgb', - xgb.XGBClassifier( - max_depth=max_depth, - learning_rate=eta, - min_child_weight=min_child_weight, - colsample_bytree=colsample_bytree, - colsample_bylevel=colsample_bylevel, - reg_alpha=reg_alpha, - reg_lambda=reg_lambda, - n_estimators=n_estimators, - objective=objective, - n_jobs=self.n_threads, - random_state=self.rng.randint(1, 100000), - num_class=self.num_class, - subsample=subsample_per_it)) - ]) - return clf - - -class XGBoostExtendedBenchmark(XGBoostBenchmark): - """ - Similar to XGBoostBenchmark but enables also the optimization of the used booster. - """ - - def __init__(self, task_id: Union[int, None] = None, n_threads: int = 1, - rng: Union[np.random.RandomState, int, None] = None): - super(XGBoostExtendedBenchmark, self).__init__(task_id=task_id, n_threads=n_threads, rng=rng) - - @staticmethod - def get_configuration_space(seed: Union[int, None] = None) -> CS.ConfigurationSpace: - cs = XGBoostBenchmark.get_configuration_space(seed) - hp_booster = CS.CategoricalHyperparameter('booster', choices=['gbtree', 'gblinear', 'dart'], - default_value='gbtree') - cs.add_hyperparameter(hp_booster) - - # XGBoost with 'gblinear' can not use some - # parameters. Exclude them from the configuration space by introducing a condition. - hps = ['colsample_bylevel', 'colsample_bytree', 'max_depth', 'min_child_weight', 'subsample_per_it'] - - # The NotEqualsCondition means: "Make parameter X active if hp_booster is not equal to gblinear." 
- conditions = [CS.NotEqualsCondition(cs.get_hyperparameter(hp), hp_booster, 'gblinear') for hp in hps] - cs.add_conditions(conditions) - return cs - - # noinspection PyMethodOverriding - # pylint: disable=arguments-differ - def _get_pipeline(self, n_estimators: int, booster: str, reg_lambda: int, reg_alpha: int, eta: float, - min_child_weight: int = None, max_depth: int = None, colsample_bytree: float = None, - colsample_bylevel: float = None, subsample_per_it: float = None) \ - -> pipeline.Pipeline: - """ Create the scikit-learn (training-)pipeline """ - objective = 'binary:logistic' if self.num_class <= 2 else 'multi:softmax' - - configuration = dict(booster=booster, - max_depth=max_depth, - learning_rate=eta, - min_child_weight=min_child_weight, - colsample_bytree=colsample_bytree, - colsample_bylevel=colsample_bylevel, - reg_alpha=reg_alpha, - reg_lambda=reg_lambda, - n_estimators=n_estimators, - objective=objective, - n_jobs=self.n_threads, - random_state=self.rng.randint(1, 100000), - num_class=self.num_class, - subsample=subsample_per_it) - - configuration = {k: v for k, v in configuration.items() if v is not None} - - clf = pipeline.Pipeline([ - ('preprocess_impute', - ColumnTransformer([ - ("categorical", "passthrough", self.categorical_data), - ("continuous", SimpleImputer(strategy="mean"), ~self.categorical_data)])), - ('preprocess_one_hot', - ColumnTransformer([ - ("categorical", OneHotEncoder(categories=self.categories, sparse=False), self.categorical_data), - ("continuous", "passthrough", ~self.categorical_data)])), - ('xgb', - xgb.XGBClassifier(**configuration)) - ]) - return clf + rng = rng if (rng is None and isinstance(rng, int)) else self.seed + extra_args = dict( + n_estimators=fidelity['n_estimators'], + objective="binary:logistic", + random_state=rng, + subsample=1 + ) + if self.n_classes > 2: + extra_args["objective"] = "multi:softmax" + extra_args.update({"num_class": self.n_classes}) + model = xgb.XGBClassifier( + **config.get_dictionary(), + **extra_args + ) + return model diff --git a/hpobench/benchmarks/ml/xgboost_benchmark_old.py b/hpobench/benchmarks/ml/xgboost_benchmark_old.py new file mode 100644 index 00000000..fb380c89 --- /dev/null +++ b/hpobench/benchmarks/ml/xgboost_benchmark_old.py @@ -0,0 +1,426 @@ +""" + +Changelog: +========== +0.0.2: +* Change the search space definiton to match the paper: (https://arxiv.org/pdf/1802.09596.pdf) + eta: [1e-5, 1] (def: 0.3) -> [2**-10, 1] (def: 0.3) + min_child_weight: [0,05, 10] (def: 1) -> [1, 2**7] (def: 1) + colsample_bytree: [0,05, 1] (def: 1) -> [0.01, 1] (def: 1) + colsample_bylevel: [0,05, 1] (def: 1) -> [0.01, 1] (def: 1) + reg_lambda: [1e-5, 2] (def: 1) -> [2**-10, 2**10] (def: 1) + reg_alpha: [1e-5, 2] (def: 1e-5) -> [2**-10, 2**10] (def: 1) + max_depth: - -> [1, 15] (def: 6) + subsample_per_it: - -> [0.01, 1] (def: 1) + [booster: - -> [gbtree, gblinear, dart] (def: gbtree)] *) + + *) This parameter is only in the XGBoostExtendedBenchmark. Not in the XGBoostBenchmark class. + +* Increase the fidelity `n_estimators` + n_estimators [2, 128] (def: 128) -> [1, 256] (def: 256) + +* Add class to optimize also the used booster method: (gbtree, gblinear or dart) + We have introduced a new class, which adds the used booster as parameter to the configuration space. To read more + about booster, please take a look in the official XGBoost-documentation (https://xgboost.readthedocs.io/en/latest). + + +0.0.1: +* First implementation of a XGBoost Benchmark. 
+ + +""" + +import logging +import time +from typing import Union, Tuple, Dict, List + +import ConfigSpace as CS +import numpy as np +import xgboost as xgb +from sklearn import pipeline +from sklearn.compose import ColumnTransformer +from sklearn.impute import SimpleImputer +from sklearn.metrics import accuracy_score, make_scorer +from sklearn.preprocessing import OneHotEncoder + +import hpobench.util.rng_helper as rng_helper +from hpobench.abstract_benchmark import AbstractBenchmark +from hpobench.util.openml_data_manager import OpenMLHoldoutDataManager + +__version__ = '0.0.2' + +logger = logging.getLogger('XGBBenchmark') + + +class XGBoostBenchmark(AbstractBenchmark): + + def __init__(self, task_id: Union[int, None] = None, n_threads: int = 1, + rng: Union[np.random.RandomState, int, None] = None): + """ + + Parameters + ---------- + task_id : int, None + n_threads : int, None + rng : np.random.RandomState, int, None + """ + + super(XGBoostBenchmark, self).__init__(rng=rng) + self.n_threads = n_threads + self.task_id = task_id + self.accuracy_scorer = make_scorer(accuracy_score) + + self.x_train, self.y_train, self.x_valid, self.y_valid, self.x_test, self.y_test, variable_types = \ + self.get_data() + self.categorical_data = np.array([var_type == 'categorical' for var_type in variable_types]) + + # XGB needs sorted data. Data should be (Categorical + numerical) not mixed. + categorical_idx = np.argwhere(self.categorical_data) + continuous_idx = np.argwhere(~self.categorical_data) + sorting = np.concatenate([categorical_idx, continuous_idx]).squeeze() + self.categorical_data = self.categorical_data[sorting] + self.x_train = self.x_train[:, sorting] + self.x_valid = self.x_valid[:, sorting] + self.x_test = self.x_test[:, sorting] + + nan_columns = np.all(np.isnan(self.x_train), axis=0) + self.categorical_data = self.categorical_data[~nan_columns] + + self.x_train, self.x_valid, self.x_test, self.categories = \ + OpenMLHoldoutDataManager.replace_nans_in_cat_columns(self.x_train, self.x_valid, self.x_test, + is_categorical=self.categorical_data) + + # Determine the number of categories in the labels. + # In case of binary classification ``self.num_class`` has to be 1 for xgboost. + self.num_class = len(np.unique(np.concatenate([self.y_train, self.y_test, self.y_valid]))) + self.num_class = 1 if self.num_class == 2 else self.num_class + + self.train_idx = self.rng.choice(a=np.arange(len(self.x_train)), + size=len(self.x_train), + replace=False) + + # Similar to [Fast Bayesian Optimization of Machine Learning Hyperparameters on Large Datasets] + # (https://arxiv.org/pdf/1605.07079.pdf), + # use 10 time the number of classes as lower bound for the dataset fraction + n_classes = np.unique(self.y_train).shape[0] + self.lower_bound_train_size = (10 * n_classes) / self.x_train.shape[0] + + def get_data(self) -> Tuple[np.ndarray, np.ndarray, np.ndarray, np.ndarray, np.ndarray, np.ndarray, List]: + """ Loads the data given a task or another source. """ + + assert self.task_id is not None, NotImplementedError('No task-id given. Please either specify a task-id or ' + 'overwrite the get_data method.') + + data_manager = OpenMLHoldoutDataManager(openml_task_id=self.task_id, rng=self.rng) + x_train, y_train, x_val, y_val, x_test, y_test = data_manager.load() + + return x_train, y_train, x_val, y_val, x_test, y_test, data_manager.variable_types + + def shuffle_data(self, rng=None): + """ Reshuffle the training data. 
If 'rng' is None, the training idx are shuffled according to the + class-random-state""" + random_state = rng_helper.get_rng(rng, self.rng) + random_state.shuffle(self.train_idx) + + # pylint: disable=arguments-differ + @AbstractBenchmark.check_parameters + def objective_function(self, configuration: Union[CS.Configuration, Dict], + fidelity: Union[CS.Configuration, Dict, None] = None, + shuffle: bool = False, + rng: Union[np.random.RandomState, int, None] = None, **kwargs) -> Dict: + """ + Trains a XGBoost model given a hyperparameter configuration and + evaluates the model on the validation set. + + Parameters + ---------- + configuration : Dict, CS.Configuration + Configuration for the XGBoost model + fidelity: Dict, None + Fidelity parameters for the XGBoost model, check get_fidelity_space(). Uses default (max) value if None. + shuffle : bool + If ``True``, shuffle the training idx. If no parameter ``rng`` is given, use the class random state. + Defaults to ``False``. + rng : np.random.RandomState, int, None, + Random seed for benchmark. By default the class level random seed. + + To prevent overfitting on a single seed, it is possible to pass a + parameter ``rng`` as 'int' or 'np.random.RandomState' to this function. + If this parameter is not given, the default random state is used. + kwargs + + Returns + ------- + Dict - + function_value : validation loss + cost : time to train and evaluate the model + info : Dict + train_loss : trainings loss + fidelity : used fidelities in this evaluation + """ + self.rng = rng_helper.get_rng(rng=rng, self_rng=self.rng) + + if shuffle: + self.shuffle_data(self.rng) + + start = time.time() + + if self.lower_bound_train_size > fidelity['dataset_fraction']: + train_data_fraction = self.lower_bound_train_size + logger.warning(f'The given data set fraction is lower than the lower bound (10 * number of classes.) ' + f'Increase the fidelity from {fidelity["dataset_fraction"]:.8f} to ' + f'{self.lower_bound_train_size:.8f}') + else: + train_data_fraction = fidelity['dataset_fraction'] + + train_idx = self.train_idx[:int(len(self.train_idx) * train_data_fraction)] + + model = self._get_pipeline(n_estimators=fidelity["n_estimators"], **configuration) + model.fit(X=self.x_train[train_idx], y=self.y_train[train_idx]) + + train_loss = 1 - self.accuracy_scorer(model, self.x_train[train_idx], self.y_train[train_idx]) + val_loss = 1 - self.accuracy_scorer(model, self.x_valid, self.y_valid) + cost = time.time() - start + + return {'function_value': float(val_loss), + 'cost': cost, + 'info': {'train_loss': float(train_loss), + 'fidelity': fidelity} + } + + # pylint: disable=arguments-differ + @AbstractBenchmark.check_parameters + def objective_function_test(self, configuration: Union[CS.Configuration, Dict], + fidelity: Union[CS.Configuration, Dict, None] = None, + shuffle: bool = False, + rng: Union[np.random.RandomState, int, None] = None, **kwargs) -> Dict: + """ + Trains a XGBoost model with a given configuration on both the train + and validation data set and evaluates the model on the test data set. + + Parameters + ---------- + configuration : Dict, CS.Configuration + Configuration for the XGBoost Model + fidelity: Dict, None + Fidelity parameters, check get_fidelity_space(). Uses default (max) value if None. + shuffle : bool + If ``True``, shuffle the training idx. If no parameter ``rng`` is given, use the class random state. + Defaults to ``False``. + rng : np.random.RandomState, int, None, + Random seed for benchmark. 
By default the class level random seed. + To prevent overfitting on a single seed, it is possible to pass a + parameter ``rng`` as 'int' or 'np.random.RandomState' to this function. + If this parameter is not given, the default random state is used. + kwargs + + Returns + ------- + Dict - + function_value : test loss + cost : time to train and evaluate the model + info : Dict + fidelity : used fidelities in this evaluation + """ + default_dataset_fraction = self.get_fidelity_space().get_hyperparameter('dataset_fraction').default_value + if fidelity['dataset_fraction'] != default_dataset_fraction: + raise NotImplementedError(f'Test error can not be computed for dataset_fraction <= ' + f'{default_dataset_fraction}') + + self.rng = rng_helper.get_rng(rng=rng, self_rng=self.rng) + + if shuffle: + self.shuffle_data(self.rng) + + start = time.time() + + # Impute potential nan values with the feature- + data = np.concatenate((self.x_train, self.x_valid)) + targets = np.concatenate((self.y_train, self.y_valid)) + + model = self._get_pipeline(n_estimators=fidelity['n_estimators'], **configuration) + model.fit(X=data, y=targets) + + test_loss = 1 - self.accuracy_scorer(model, self.x_test, self.y_test) + cost = time.time() - start + + return {'function_value': float(test_loss), + 'cost': cost, + 'info': {'fidelity': fidelity}} + + @staticmethod + def get_configuration_space(seed: Union[int, None] = None) -> CS.ConfigurationSpace: + """ + Creates a ConfigSpace.ConfigurationSpace containing all parameters for + the XGBoost Model + + Parameters + ---------- + seed : int, None + Fixing the seed for the ConfigSpace.ConfigurationSpace + + Returns + ------- + ConfigSpace.ConfigurationSpace + """ + seed = seed if seed is not None else np.random.randint(1, 100000) + cs = CS.ConfigurationSpace(seed=seed) + + cs.add_hyperparameters([ + CS.UniformFloatHyperparameter('eta', lower=2**-10, upper=1., default_value=0.3, log=True), + CS.UniformIntegerHyperparameter('max_depth', lower=1, upper=15, default_value=6, log=False), + CS.UniformFloatHyperparameter('min_child_weight', lower=1., upper=2**7., default_value=1., log=True), + CS.UniformFloatHyperparameter('colsample_bytree', lower=0.01, upper=1., default_value=1.), + CS.UniformFloatHyperparameter('colsample_bylevel', lower=0.01, upper=1., default_value=1.), + CS.UniformFloatHyperparameter('reg_lambda', lower=2**-10, upper=2**10, default_value=1, log=True), + CS.UniformFloatHyperparameter('reg_alpha', lower=2**-10, upper=2**10, default_value=1, log=True), + CS.UniformFloatHyperparameter('subsample_per_it', lower=0.1, upper=1, default_value=1, log=False) + ]) + + return cs + + @staticmethod + def get_fidelity_space(seed: Union[int, None] = None) -> CS.ConfigurationSpace: + """ + Creates a ConfigSpace.ConfigurationSpace containing all fidelity parameters for + the XGBoost Benchmark + + Parameters + ---------- + seed : int, None + Fixing the seed for the ConfigSpace.ConfigurationSpace + + Returns + ------- + ConfigSpace.ConfigurationSpace + """ + seed = seed if seed is not None else np.random.randint(1, 100000) + fidel_space = CS.ConfigurationSpace(seed=seed) + + fidel_space.add_hyperparameters([ + CS.UniformFloatHyperparameter("dataset_fraction", lower=0.0, upper=1.0, default_value=1.0, log=False), + CS.UniformIntegerHyperparameter("n_estimators", lower=1, upper=256, default_value=256, log=False) + ]) + + return fidel_space + + def get_meta_information(self) -> Dict: + """ Returns the meta information for the benchmark """ + return {'name': 'XGBoost', + 'references': 
['@article{probst2019tunability,' + 'title={Tunability: Importance of hyperparameters of machine learning algorithms.},' + 'author={Probst, Philipp and Boulesteix, Anne-Laure and Bischl, Bernd},' + 'journal={J. Mach. Learn. Res.},' + 'volume={20},' + 'number={53},' + 'pages={1--32},' + 'year={2019}' + '}'], + 'code': 'https://github.com/automl/HPOlib1.5/blob/development/hpolib/benchmarks/ml/' + 'xgboost_benchmark_old.py', + 'shape of train data': self.x_train.shape, + 'shape of test data': self.x_test.shape, + 'shape of valid data': self.x_valid.shape, + 'initial random seed': self.rng, + 'task_id': self.task_id + } + + def _get_pipeline(self, max_depth: int, eta: float, min_child_weight: int, + colsample_bytree: float, colsample_bylevel: float, reg_lambda: int, reg_alpha: int, + n_estimators: int, subsample_per_it: float) \ + -> pipeline.Pipeline: + """ Create the scikit-learn (training-)pipeline """ + objective = 'binary:logistic' if self.num_class <= 2 else 'multi:softmax' + + clf = pipeline.Pipeline([ + ('preprocess_impute', + ColumnTransformer([ + ("categorical", "passthrough", self.categorical_data), + ("continuous", SimpleImputer(strategy="mean"), ~self.categorical_data)])), + ('preprocess_one_hot', + ColumnTransformer([ + ("categorical", OneHotEncoder(categories=self.categories, sparse=False), self.categorical_data), + ("continuous", "passthrough", ~self.categorical_data)])), + ('xgb', + xgb.XGBClassifier( + max_depth=max_depth, + learning_rate=eta, + min_child_weight=min_child_weight, + colsample_bytree=colsample_bytree, + colsample_bylevel=colsample_bylevel, + reg_alpha=reg_alpha, + reg_lambda=reg_lambda, + n_estimators=n_estimators, + objective=objective, + n_jobs=self.n_threads, + random_state=self.rng.randint(1, 100000), + num_class=self.num_class, + subsample=subsample_per_it)) + ]) + return clf + + +class XGBoostExtendedBenchmark(XGBoostBenchmark): + """ + Similar to XGBoostBenchmark but enables also the optimization of the used booster. + """ + + def __init__(self, task_id: Union[int, None] = None, n_threads: int = 1, + rng: Union[np.random.RandomState, int, None] = None): + super(XGBoostExtendedBenchmark, self).__init__(task_id=task_id, n_threads=n_threads, rng=rng) + + @staticmethod + def get_configuration_space(seed: Union[int, None] = None) -> CS.ConfigurationSpace: + cs = XGBoostBenchmark.get_configuration_space(seed) + hp_booster = CS.CategoricalHyperparameter('booster', choices=['gbtree', 'gblinear', 'dart'], + default_value='gbtree') + cs.add_hyperparameter(hp_booster) + + # XGBoost with 'gblinear' can not use some + # parameters. Exclude them from the configuration space by introducing a condition. + hps = ['colsample_bylevel', 'colsample_bytree', 'max_depth', 'min_child_weight', 'subsample_per_it'] + + # The NotEqualsCondition means: "Make parameter X active if hp_booster is not equal to gblinear." 
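+        # For instance, CS.NotEqualsCondition(cs.get_hyperparameter('max_depth'), hp_booster, 'gblinear')
+        # keeps 'max_depth' inactive (and hence unused) whenever the sampled booster is 'gblinear'.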
+ conditions = [CS.NotEqualsCondition(cs.get_hyperparameter(hp), hp_booster, 'gblinear') for hp in hps] + cs.add_conditions(conditions) + return cs + + # noinspection PyMethodOverriding + # pylint: disable=arguments-differ + def _get_pipeline(self, n_estimators: int, booster: str, reg_lambda: int, reg_alpha: int, eta: float, + min_child_weight: int = None, max_depth: int = None, colsample_bytree: float = None, + colsample_bylevel: float = None, subsample_per_it: float = None) \ + -> pipeline.Pipeline: + """ Create the scikit-learn (training-)pipeline """ + objective = 'binary:logistic' if self.num_class <= 2 else 'multi:softmax' + + configuration = dict(booster=booster, + max_depth=max_depth, + learning_rate=eta, + min_child_weight=min_child_weight, + colsample_bytree=colsample_bytree, + colsample_bylevel=colsample_bylevel, + reg_alpha=reg_alpha, + reg_lambda=reg_lambda, + n_estimators=n_estimators, + objective=objective, + n_jobs=self.n_threads, + random_state=self.rng.randint(1, 100000), + num_class=self.num_class, + subsample=subsample_per_it) + + configuration = {k: v for k, v in configuration.items() if v is not None} + + clf = pipeline.Pipeline([ + ('preprocess_impute', + ColumnTransformer([ + ("categorical", "passthrough", self.categorical_data), + ("continuous", SimpleImputer(strategy="mean"), ~self.categorical_data)])), + ('preprocess_one_hot', + ColumnTransformer([ + ("categorical", OneHotEncoder(categories=self.categories, sparse=False), self.categorical_data), + ("continuous", "passthrough", ~self.categorical_data)])), + ('xgb', + xgb.XGBClassifier(**configuration)) + ]) + return clf diff --git a/tests/test_utils.py b/tests/test_utils.py index 885ce606..9bc5ff3b 100644 --- a/tests/test_utils.py +++ b/tests/test_utils.py @@ -64,7 +64,7 @@ def test_rng_serialization(): def test_rng_serialization_xgb(): import json from hpobench.util.container_utils import BenchmarkEncoder, BenchmarkDecoder - from hpobench.benchmarks.ml.xgboost_benchmark import XGBoostBenchmark + from hpobench.benchmarks.ml.xgboost_benchmark_old import XGBoostBenchmark b = XGBoostBenchmark(task_id=167149, rng=0) meta = b.get_meta_information() diff --git a/tests/test_whitebox.py b/tests/test_whitebox.py index 7e4c32aa..c3f5e0ff 100644 --- a/tests/test_whitebox.py +++ b/tests/test_whitebox.py @@ -14,7 +14,7 @@ def test_whitebox_without_container_xgb(): - from hpobench.benchmarks.ml.xgboost_benchmark import XGBoostBenchmark as Benchmark + from hpobench.benchmarks.ml.xgboost_benchmark_old import XGBoostBenchmark as Benchmark b = Benchmark(task_id=167199, rng=0) cs = b.get_configuration_space(seed=0) From 9e907e6ef5c5b6f60699dc0f8dffd0a6607a4134 Mon Sep 17 00:00:00 2001 From: neeratyoy Date: Thu, 8 Jul 2021 19:18:18 +0200 Subject: [PATCH 025/147] Option to load data splits from disk --- hpobench/benchmarks/ml/histgb_benchmark.py | 5 +- .../benchmarks/ml/ml_benchmark_template.py | 53 ++++++++----------- hpobench/benchmarks/ml/rf_benchmark.py | 7 ++- hpobench/benchmarks/ml/svm_benchmark.py | 5 +- hpobench/benchmarks/ml/xgboost_benchmark.py | 7 ++- 5 files changed, 37 insertions(+), 40 deletions(-) diff --git a/hpobench/benchmarks/ml/histgb_benchmark.py b/hpobench/benchmarks/ml/histgb_benchmark.py index 21ed4ec0..0edcd3fa 100644 --- a/hpobench/benchmarks/ml/histgb_benchmark.py +++ b/hpobench/benchmarks/ml/histgb_benchmark.py @@ -28,9 +28,10 @@ def __init__( task_id: Union[int, None] = None, seed: Union[int, None] = None, # Union[np.random.RandomState, int, None] = None, valid_size: float = 0.33, - fidelity_choice: int = 
1 + fidelity_choice: int = 1, + data_path: Union[str, None] = None ): - super(HistGBBenchmark, self).__init__(task_id, seed, valid_size, fidelity_choice) + super(HistGBBenchmark, self).__init__(task_id, seed, valid_size, fidelity_choice, data_path) pass @staticmethod diff --git a/hpobench/benchmarks/ml/ml_benchmark_template.py b/hpobench/benchmarks/ml/ml_benchmark_template.py index e0ab59bc..24cccbd4 100644 --- a/hpobench/benchmarks/ml/ml_benchmark_template.py +++ b/hpobench/benchmarks/ml/ml_benchmark_template.py @@ -1,3 +1,4 @@ +import os import time import openml import numpy as np @@ -44,10 +45,13 @@ def __init__( task_id: Union[int, None] = None, seed: Union[int, None] = None, # Union[np.random.RandomState, int, None] = None, valid_size: float = 0.33, - fidelity_choice: int = 1 + fidelity_choice: int = 1, + data_path: Union[str, None] = None, + global_seed: int = 1 ): self.seed = seed if seed is not None else np.random.randint(1, 10 ** 6) self.rng = check_random_state(self.seed) + self.global_seed = global_seed # used for fixed training-validation splits super(MLBenchmark, self).__init__(rng=seed) self.task_id = task_id @@ -55,7 +59,7 @@ def __init__( self.scorers = dict() for k, v in metrics.items(): self.scorers[k] = make_scorer(v, **metrics_kwargs[k]) - # self.scorers = make_scorer(accuracy_score) + self.data_path = data_path # Data variables self.train_X = None @@ -129,6 +133,19 @@ def load_data_from_openml(self, valid_size=None, verbose=False): The validation set is fixed till this function is called again or explicitly altered """ + if self.data_path is not None and os.path.isdir(self.data_path): + data_path = os.path.join(self.data_path, str(self.task_id)) + data_str = os.path.join(data_path, "{}_{}.parquet.gzip") + required_file_list = [ + ("train", "x"), ("train", "y"), + ("valid", "x"), ("valid", "y"), + ("test", "x"), ("test", "y") + ] + for files in required_file_list: + if not os.path.isfile(data_str.format("train", "x")): + raise FileNotFoundError("{} not found!".format(data_str.format(*files))) + return + # fetches task self.task = openml.tasks.get_task(self.task_id, download_data=False) # fetches dataset @@ -146,7 +163,7 @@ def load_data_from_openml(self, valid_size=None, verbose=False): (cont_idx,) = np.where(~categorical_ind) # splitting dataset into train and test (10% test) - # train-test split is fixed for a task and its associated dataset + # train-test split is fixed for a task and its associated dataset (from OpenML) self.train_idx, self.test_idx = self.task.get_train_test_split_indices() train_x = X.iloc[self.train_idx] train_y = y.iloc[self.train_idx] @@ -158,7 +175,7 @@ def load_data_from_openml(self, valid_size=None, verbose=False): valid_size = self.valid_size if valid_size is None else valid_size self.train_X, self.valid_X, self.train_y, self.valid_y = train_test_split( train_x, train_y, test_size=valid_size, - shuffle=True, stratify=train_y, random_state=self.rng + shuffle=True, stratify=train_y, random_state=check_random_state(self.global_seed) ) # preprocessor to handle missing values, categorical columns encodings, @@ -350,34 +367,6 @@ def objective_function_test( 'info': info } - # # pylint: disable=arguments-differ - # @AbstractBenchmark.check_parameters - # def objective_function( - # self, - # configuration: Union[CS.Configuration, Dict], - # fidelity: Union[CS.Configuration, Dict, None] = None, - # shuffle: bool = False, - # rng: Union[np.random.RandomState, int, None] = None, - # **kwargs - # ) -> Dict: - # """Function that evaluates a 'config' 
on a 'fidelity' on the validation set - # """ - # return dict() - # - # # pylint: disable=arguments-differ - # @AbstractBenchmark.check_parameters - # def objective_function_test( - # self, - # configuration: Union[CS.Configuration, Dict], - # fidelity: Union[CS.Configuration, Dict, None] = None, - # shuffle: bool = False, - # rng: Union[np.random.RandomState, int, None] = None, - # **kwargs - # ) -> Dict: - # """Function that evaluates a 'config' on a 'fidelity' on the test set - # """ - # return dict() - def get_meta_information(self): """ Returns the meta information for the benchmark """ return {'name': 'Support Vector Machine', diff --git a/hpobench/benchmarks/ml/rf_benchmark.py b/hpobench/benchmarks/ml/rf_benchmark.py index b815e1bd..3850399c 100644 --- a/hpobench/benchmarks/ml/rf_benchmark.py +++ b/hpobench/benchmarks/ml/rf_benchmark.py @@ -24,9 +24,12 @@ def __init__( task_id: Union[int, None] = None, seed: Union[int, None] = None, # Union[np.random.RandomState, int, None] = None, valid_size: float = 0.33, - fidelity_choice: int = 1 + fidelity_choice: int = 1, + data_path: Union[str, None] = None ): - super(RandomForestBenchmark, self).__init__(task_id, seed, valid_size, fidelity_choice) + super(RandomForestBenchmark, self).__init__( + task_id, seed, valid_size, fidelity_choice, data_path + ) pass @staticmethod diff --git a/hpobench/benchmarks/ml/svm_benchmark.py b/hpobench/benchmarks/ml/svm_benchmark.py index 1d0e2d00..190671ca 100644 --- a/hpobench/benchmarks/ml/svm_benchmark.py +++ b/hpobench/benchmarks/ml/svm_benchmark.py @@ -24,9 +24,10 @@ def __init__( task_id: Union[int, None] = None, seed: Union[int, None] = None, # Union[np.random.RandomState, int, None] = None, valid_size: float = 0.33, - fidelity_choice: int = 1 + fidelity_choice: int = 1, + data_path: Union[str, None] = None ): - super(SVMBenchmark, self).__init__(task_id, seed, valid_size, fidelity_choice) + super(SVMBenchmark, self).__init__(task_id, seed, valid_size, fidelity_choice, data_path) self.cache_size = 200 @staticmethod diff --git a/hpobench/benchmarks/ml/xgboost_benchmark.py b/hpobench/benchmarks/ml/xgboost_benchmark.py index b038e4c9..4c93d2ef 100644 --- a/hpobench/benchmarks/ml/xgboost_benchmark.py +++ b/hpobench/benchmarks/ml/xgboost_benchmark.py @@ -24,9 +24,12 @@ def __init__( task_id: Union[int, None] = None, seed: Union[int, None] = None, # Union[np.random.RandomState, int, None] = None, valid_size: float = 0.33, - fidelity_choice: int = 1 + fidelity_choice: int = 1, + data_path: Union[str, None] = None ): - super(XGBoostBenchmark, self).__init__(task_id, seed, valid_size, fidelity_choice) + super(XGBoostBenchmark, self).__init__( + task_id, seed, valid_size, fidelity_choice, data_path + ) pass @staticmethod From f0d4f36ca01b8c9141841b41ec65d1f570e9c6f4 Mon Sep 17 00:00:00 2001 From: neeratyoy Date: Mon, 12 Jul 2021 15:40:54 +0200 Subject: [PATCH 026/147] Reordering data load to work for different cases --- .../benchmarks/ml/ml_benchmark_template.py | 21 +++++++++++-------- 1 file changed, 12 insertions(+), 9 deletions(-) diff --git a/hpobench/benchmarks/ml/ml_benchmark_template.py b/hpobench/benchmarks/ml/ml_benchmark_template.py index 24cccbd4..dacb64db 100644 --- a/hpobench/benchmarks/ml/ml_benchmark_template.py +++ b/hpobench/benchmarks/ml/ml_benchmark_template.py @@ -133,6 +133,16 @@ def load_data_from_openml(self, valid_size=None, verbose=False): The validation set is fixed till this function is called again or explicitly altered """ + # fetches task + self.task = 
openml.tasks.get_task(self.task_id, download_data=False) + self.n_classes = len(self.task.class_labels) + # fetches dataset + self.dataset = openml.datasets.get_dataset(self.task.dataset_id, download_data=False) + if verbose: + print(self.task, '\n') + print(self.dataset, '\n') + + # check if the path to data splits is valid if self.data_path is not None and os.path.isdir(self.data_path): data_path = os.path.join(self.data_path, str(self.task_id)) data_str = os.path.join(data_path, "{}_{}.parquet.gzip") @@ -144,16 +154,9 @@ def load_data_from_openml(self, valid_size=None, verbose=False): for files in required_file_list: if not os.path.isfile(data_str.format("train", "x")): raise FileNotFoundError("{} not found!".format(data_str.format(*files))) + # ignore the remaining data loaders and preprocessors as valid data splits available return - # fetches task - self.task = openml.tasks.get_task(self.task_id, download_data=False) - # fetches dataset - self.dataset = openml.datasets.get_dataset(self.task.dataset_id, download_data=False) - if verbose: - print(self.task, '\n') - print(self.dataset, '\n') - # loads full data X, y, categorical_ind, feature_names = self.dataset.get_data( target=self.task.target_name, dataset_format="dataframe" @@ -211,7 +214,7 @@ def load_data_from_openml(self, valid_size=None, verbose=False): # Similar to (https://arxiv.org/pdf/1605.07079.pdf) # use 10 times the number of classes as lower bound for the dataset fraction - self.n_classes = len(self.task.class_labels) + self.lower_bound_train_size = (10 * self.n_classes) / self.train_X.shape[0] self.lower_bound_train_size = np.max((1 / 512, self.lower_bound_train_size)) From dbeae7c07b8ab59883be01cdc591b8810e8ae434 Mon Sep 17 00:00:00 2001 From: neeratyoy Date: Wed, 14 Jul 2021 20:21:52 +0200 Subject: [PATCH 027/147] Updating source of SVM HP range --- hpobench/benchmarks/ml/svm_benchmark.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/hpobench/benchmarks/ml/svm_benchmark.py b/hpobench/benchmarks/ml/svm_benchmark.py index 190671ca..e6d9e0f7 100644 --- a/hpobench/benchmarks/ml/svm_benchmark.py +++ b/hpobench/benchmarks/ml/svm_benchmark.py @@ -35,13 +35,13 @@ def get_configuration_space(seed=None): """Parameter space to be optimized --- contains the hyperparameters """ cs = CS.ConfigurationSpace(seed=seed) - # from https://github.com/automl/auto-sklearn/blob/master/autosklearn/pipeline/components/classification/libsvm_svc.p + # https://www.csie.ntu.edu.tw/~cjlin/papers/guide/guide.pdf (Section 3.2) cs.add_hyperparameters([ CS.UniformFloatHyperparameter( - "C", 0.03125, 32768, log=True, default_value=1.0 + "C", 2**-5, 2**15, log=True, default_value=1.0 ), CS.UniformFloatHyperparameter( - "gamma", 3.0517578125e-05, 8, log=True, default_value=0.1 + "gamma", 2**-15, 2**3, log=True, default_value=0.1 ) ]) return cs From f277a2e7532e7678a4bd7da0d65a1da5707c4798 Mon Sep 17 00:00:00 2001 From: neeratyoy Date: Wed, 14 Jul 2021 22:52:52 +0200 Subject: [PATCH 028/147] Adding Tabular Benchmark class --- hpobench/benchmarks/ml/__init__.py | 4 ++ hpobench/benchmarks/ml/histgb_benchmark.py | 14 +---- hpobench/benchmarks/ml/rf_benchmark.py | 14 +---- hpobench/benchmarks/ml/svm_benchmark.py | 14 +---- hpobench/benchmarks/ml/tabular_benchmark.py | 70 +++++++++++++++++++++ hpobench/benchmarks/ml/xgboost_benchmark.py | 11 ---- 6 files changed, 77 insertions(+), 50 deletions(-) create mode 100644 hpobench/benchmarks/ml/tabular_benchmark.py diff --git a/hpobench/benchmarks/ml/__init__.py 
b/hpobench/benchmarks/ml/__init__.py index e69de29b..54cf8d51 100644 --- a/hpobench/benchmarks/ml/__init__.py +++ b/hpobench/benchmarks/ml/__init__.py @@ -0,0 +1,4 @@ +from .svm_benchmark import SVMBenchmark +from .rf_benchmark import RandomForestBenchmark +from .xgboost_benchmark import XGBoostBenchmark +from .histgb_benchmark import HistGBBenchmark \ No newline at end of file diff --git a/hpobench/benchmarks/ml/histgb_benchmark.py b/hpobench/benchmarks/ml/histgb_benchmark.py index 0edcd3fa..a803aeea 100644 --- a/hpobench/benchmarks/ml/histgb_benchmark.py +++ b/hpobench/benchmarks/ml/histgb_benchmark.py @@ -1,19 +1,7 @@ -import time -import openml import numpy as np -import pandas as pd import ConfigSpace as CS from copy import deepcopy -from typing import Union, Dict - -from sklearn.impute import SimpleImputer -from sklearn.pipeline import make_pipeline -from sklearn.utils import check_random_state -from sklearn.compose import ColumnTransformer -from sklearn.preprocessing import OneHotEncoder -from sklearn.preprocessing import StandardScaler -from sklearn.model_selection import train_test_split -from sklearn.metrics import accuracy_score, make_scorer +from typing import Union # https://scikit-learn.org/stable/modules/generated/sklearn.ensemble.HistGradientBoostingClassifier.html from sklearn.experimental import enable_hist_gradient_boosting # noqa diff --git a/hpobench/benchmarks/ml/rf_benchmark.py b/hpobench/benchmarks/ml/rf_benchmark.py index 3850399c..e6cfa8ba 100644 --- a/hpobench/benchmarks/ml/rf_benchmark.py +++ b/hpobench/benchmarks/ml/rf_benchmark.py @@ -1,19 +1,7 @@ -import time -import openml import numpy as np -import pandas as pd import ConfigSpace as CS -from typing import Union, Dict - -from sklearn.impute import SimpleImputer -from sklearn.pipeline import make_pipeline -from sklearn.utils import check_random_state -from sklearn.compose import ColumnTransformer -from sklearn.preprocessing import OneHotEncoder -from sklearn.preprocessing import StandardScaler +from typing import Union from sklearn.ensemble import RandomForestClassifier -from sklearn.model_selection import train_test_split -from sklearn.metrics import accuracy_score, make_scorer from hpobench.benchmarks.ml.ml_benchmark_template import MLBenchmark diff --git a/hpobench/benchmarks/ml/svm_benchmark.py b/hpobench/benchmarks/ml/svm_benchmark.py index e6d9e0f7..fc541567 100644 --- a/hpobench/benchmarks/ml/svm_benchmark.py +++ b/hpobench/benchmarks/ml/svm_benchmark.py @@ -1,19 +1,7 @@ -import time -import openml -import numpy as np -import pandas as pd import ConfigSpace as CS -from typing import Union, Dict +from typing import Union, List, Dict from sklearn.svm import SVC -from sklearn.impute import SimpleImputer -from sklearn.utils import check_random_state -from sklearn.compose import ColumnTransformer -from sklearn.preprocessing import OneHotEncoder -from sklearn.preprocessing import StandardScaler -from sklearn.model_selection import train_test_split -from sklearn.pipeline import make_pipeline, Pipeline -from sklearn.metrics import accuracy_score, make_scorer from hpobench.benchmarks.ml.ml_benchmark_template import MLBenchmark diff --git a/hpobench/benchmarks/ml/tabular_benchmark.py b/hpobench/benchmarks/ml/tabular_benchmark.py new file mode 100644 index 00000000..782ee254 --- /dev/null +++ b/hpobench/benchmarks/ml/tabular_benchmark.py @@ -0,0 +1,70 @@ +import os +import glom +import numpy as np +import ConfigSpace as CS +import pickle5 as pickle +from typing import Union, List + + +class TabularBenchmark: + 
def __init__(self, table_path: str, seed: Union[int, None]=None): + assert os.path.isfile(table_path), "Not a valid path: {}".format(table_path) + table = self._load_table(table_path) + self.seed = seed if seed is not None else np.random.randint(1, 10 ** 6) + self.exp_args = table['exp_args'] + self.config_spaces = table['config_spaces'] + self.x_cs = self.get_hyperparameter_space(seed=self.seed) + self.z_cs = self.get_fidelity_space(seed=self.seed) + self.table = table['data'] + self.global_minimums = table['global_min'] + + def _load_table(self, path): + with open(path, "rb") as f: + table = pickle.load(f) + return table + + def get_hyperparameter_space(self, seed=None, original=False): + cs = CS.ConfigurationSpace(seed=seed) + if original: + _cs = self.config_spaces['x'] + _cs = self.config_spaces['x_discrete'] + for hp in _cs.get_hyperparameters(): + cs.add_hyperparameter(hp) + return cs + + def get_fidelity_space(self, seed=None, original=False): + cs = CS.ConfigurationSpace(seed=seed) + if original: + _cs = self.config_spaces['z'] + _cs = self.config_spaces['z_discrete'] + for hp in _cs.get_hyperparameters(): + cs.add_hyperparameter(hp) + return cs + + def sample_hyperparamer(self, n: int = 1) -> Union[CS.Configuration, List]: + return self.x_cs.sample_configuration(n) + + def sample_fidelity(self, n: int = 1) -> Union[CS.Configuration, List]: + return self.z_cs.sample_configuration(n) + + def get_global_min(self, metric: str = "acc"): + """ Retrieves the minimum (1 - metric) for train, validation and test splits + """ + assert metric in self.global_minimums.keys(), \ + "Not a valid metric: {}".format(list(self.global_minimums.keys())) + return self.global_minimums[metric] + + def objective_function(self, config, fidelity): + self.x_cs.check_configuration(config) + self.z_cs.check_configuration(fidelity) + key_path = [] + for name in np.sort(self.x_cs.get_hyperparameter_names()): + key_path.append(config[str(name)]) + for name in np.sort(self.z_cs.get_hyperparameter_names()): + key_path.append(fidelity[str(name)]) + val = glom.glom(self.table, glom.Path(*key_path), default=None) + if val is None: + raise ValueError( + "Invalid config-fidelity or not recorded in table!\n{}\n{}".format(config, fidelity) + ) + return val diff --git a/hpobench/benchmarks/ml/xgboost_benchmark.py b/hpobench/benchmarks/ml/xgboost_benchmark.py index 4c93d2ef..5efe56ed 100644 --- a/hpobench/benchmarks/ml/xgboost_benchmark.py +++ b/hpobench/benchmarks/ml/xgboost_benchmark.py @@ -1,19 +1,8 @@ -import time -import openml import numpy as np -import pandas as pd import ConfigSpace as CS from typing import Union, Dict import xgboost as xgb -from sklearn.impute import SimpleImputer -from sklearn.pipeline import make_pipeline -from sklearn.utils import check_random_state -from sklearn.compose import ColumnTransformer -from sklearn.preprocessing import OneHotEncoder -from sklearn.preprocessing import StandardScaler -from sklearn.model_selection import train_test_split -from sklearn.metrics import accuracy_score, make_scorer from hpobench.benchmarks.ml.ml_benchmark_template import MLBenchmark From 60d564683175259b074fad355f8fd4dafe40a61a Mon Sep 17 00:00:00 2001 From: neeratyoy Date: Thu, 15 Jul 2021 21:42:58 +0200 Subject: [PATCH 029/147] Adding TabularBenchmark interface + easy import --- hpobench/benchmarks/ml/__init__.py | 1 + hpobench/benchmarks/ml/tabular_benchmark.py | 63 +++++++++++++++++++-- 2 files changed, 60 insertions(+), 4 deletions(-) diff --git a/hpobench/benchmarks/ml/__init__.py 
b/hpobench/benchmarks/ml/__init__.py index 54cf8d51..1e64edc9 100644 --- a/hpobench/benchmarks/ml/__init__.py +++ b/hpobench/benchmarks/ml/__init__.py @@ -1,3 +1,4 @@ +from .tabular_benchmark import TabularBenchmark from .svm_benchmark import SVMBenchmark from .rf_benchmark import RandomForestBenchmark from .xgboost_benchmark import XGBoostBenchmark diff --git a/hpobench/benchmarks/ml/tabular_benchmark.py b/hpobench/benchmarks/ml/tabular_benchmark.py index 782ee254..eff31523 100644 --- a/hpobench/benchmarks/ml/tabular_benchmark.py +++ b/hpobench/benchmarks/ml/tabular_benchmark.py @@ -3,14 +3,17 @@ import numpy as np import ConfigSpace as CS import pickle5 as pickle -from typing import Union, List +from copy import deepcopy +from typing import Union, List, Dict +from hpobench.benchmarks.ml.ml_benchmark_template import metrics class TabularBenchmark: - def __init__(self, table_path: str, seed: Union[int, None]=None): + def __init__(self, table_path: str, seed: Union[int, None] = None): assert os.path.isfile(table_path), "Not a valid path: {}".format(table_path) table = self._load_table(table_path) self.seed = seed if seed is not None else np.random.randint(1, 10 ** 6) + self.rng = np.random.RandomState(self.seed) self.exp_args = table['exp_args'] self.config_spaces = table['config_spaces'] self.x_cs = self.get_hyperparameter_space(seed=self.seed) @@ -54,7 +57,14 @@ def get_global_min(self, metric: str = "acc"): "Not a valid metric: {}".format(list(self.global_minimums.keys())) return self.global_minimums[metric] - def objective_function(self, config, fidelity): + def _objective( + self, + config: CS.Configuration, + fidelity: CS.Configuration, + seed: Union[int, None] = None, + metric: Union[str, None] = "acc", + eval: Union[str] = "val" + ) -> Dict: self.x_cs.check_configuration(config) self.z_cs.check_configuration(fidelity) key_path = [] @@ -67,4 +77,49 @@ def objective_function(self, config, fidelity): raise ValueError( "Invalid config-fidelity or not recorded in table!\n{}\n{}".format(config, fidelity) ) - return val + seeds = list(val.keys()) + assert metric in list(metrics.keys()), \ + "metric not found among: {{{}}}".format(", ".join(list(metrics.keys()))) + score_key = "{}_scores".format(eval) + cost_key = "{}_scores".format(eval) + if seed is None: + result = dict(function_value=0.0, cost=0.0, info=dict()) + loss = [] + costs = 0.0 + info = dict() + for seed in seeds: + result = deepcopy(val[seed]) + loss.append(1 - result["info"][score_key][metric]) + costs += result["info"]["model_cost"] + result["info"][cost_key][metric] + info[seed] = result["info"] + loss = np.mean(loss) + result["function_value"] = loss + result["cost"] = costs + result["info"] = info + else: + assert seed in list(val.keys()), \ + "seed not found among: {{{}}}".format(", ".join([str(s) for s in seeds])) + result = deepcopy(val[seed]) + result["function_value"] = 1 - result["info"][score_key][metric] + result["cost"] = result["info"]["model_cost"] + result["info"][cost_key][metric] + return result + + def objective_function( + self, + config: CS.Configuration, + fidelity: CS.Configuration, + seed: Union[int, None] = None, + metric: Union[str, None] = "acc" + ) -> Dict: + result = self._objective(config, fidelity, seed, metric, eval="val") + return result + + def objective_function_test( + self, + config: CS.Configuration, + fidelity: CS.Configuration, + seed: Union[int, None] = None, + metric: Union[str, None] = "acc" + ) -> Dict: + result = self._objective(config, fidelity, seed, metric, eval="test") + return 
result From c4100fd55c5c1da7e57a8b6057e9f318fe107e2e Mon Sep 17 00:00:00 2001 From: neeratyoy Date: Fri, 16 Jul 2021 21:39:03 +0200 Subject: [PATCH 030/147] Adding LR space --- hpobench/benchmarks/ml/__init__.py | 3 +- hpobench/benchmarks/ml/histgb_benchmark.py | 5 +- hpobench/benchmarks/ml/lr_benchmark.py | 83 +++++++++++++++++++++ hpobench/benchmarks/ml/rf_benchmark.py | 5 +- hpobench/benchmarks/ml/svm_benchmark.py | 4 +- hpobench/benchmarks/ml/xgboost_benchmark.py | 5 +- 6 files changed, 92 insertions(+), 13 deletions(-) create mode 100644 hpobench/benchmarks/ml/lr_benchmark.py diff --git a/hpobench/benchmarks/ml/__init__.py b/hpobench/benchmarks/ml/__init__.py index 1e64edc9..d31a8bed 100644 --- a/hpobench/benchmarks/ml/__init__.py +++ b/hpobench/benchmarks/ml/__init__.py @@ -2,4 +2,5 @@ from .svm_benchmark import SVMBenchmark from .rf_benchmark import RandomForestBenchmark from .xgboost_benchmark import XGBoostBenchmark -from .histgb_benchmark import HistGBBenchmark \ No newline at end of file +from .histgb_benchmark import HistGBBenchmark +from .lr_benchmark import LRBenchmark \ No newline at end of file diff --git a/hpobench/benchmarks/ml/histgb_benchmark.py b/hpobench/benchmarks/ml/histgb_benchmark.py index a803aeea..b2bb238f 100644 --- a/hpobench/benchmarks/ml/histgb_benchmark.py +++ b/hpobench/benchmarks/ml/histgb_benchmark.py @@ -63,7 +63,6 @@ def get_fidelity_space(seed=None, fidelity_choice=1): Fidelity space is multi-multi fidelity, all possible fidelities """ z_cs = CS.ConfigurationSpace(seed=seed) - subsample_lower_bound = np.max((0.1, (0.1 or self.lower_bound_train_size))) if fidelity_choice == 0: # only subsample as fidelity ntrees = CS.Constant('n_estimators', value=100) @@ -78,7 +77,7 @@ def get_fidelity_space(seed=None, fidelity_choice=1): # only subsample as fidelity ntrees = CS.Constant('n_estimators', value=100) subsample = CS.UniformFloatHyperparameter( - 'subsample', lower=subsample_lower_bound, upper=1, default_value=1, log=False + 'subsample', lower=0.1, upper=1, default_value=1, log=False ) else: # both n_estimators and subsample as fidelities @@ -86,7 +85,7 @@ def get_fidelity_space(seed=None, fidelity_choice=1): 'n_estimators', lower=2, upper=100, default_value=10, log=False ) subsample = CS.UniformFloatHyperparameter( - 'subsample', lower=subsample_lower_bound, upper=1, default_value=1, log=False + 'subsample', lower=0.1, upper=1, default_value=1, log=False ) z_cs.add_hyperparameters([ntrees, subsample]) return z_cs diff --git a/hpobench/benchmarks/ml/lr_benchmark.py b/hpobench/benchmarks/ml/lr_benchmark.py new file mode 100644 index 00000000..c9fd4c9b --- /dev/null +++ b/hpobench/benchmarks/ml/lr_benchmark.py @@ -0,0 +1,83 @@ +import ConfigSpace as CS +from typing import Union, List, Dict + +from sklearn.linear_model import SGDClassifier + +from hpobench.benchmarks.ml.ml_benchmark_template import MLBenchmark + + +class LRBenchmark(MLBenchmark): + def __init__( + self, + task_id: Union[int, None] = None, + seed: Union[int, None] = None, # Union[np.random.RandomState, int, None] = None, + valid_size: float = 0.33, + fidelity_choice: int = 1, + data_path: Union[str, None] = None + ): + super(LRBenchmark, self).__init__(task_id, seed, valid_size, fidelity_choice, data_path) + self.cache_size = 200 + + @staticmethod + def get_configuration_space(seed=None): + """Parameter space to be optimized --- contains the hyperparameters + """ + cs = CS.ConfigurationSpace(seed=seed) + cs.add_hyperparameters([ + CS.UniformFloatHyperparameter( + "alpha", 10**-5, 10**4, 
log=True, default_value=1.0 + ), + CS.UniformFloatHyperparameter( + "eta0", 2**-10, 1, log=True, default_value=0.3 + ) + ]) + return cs + + @staticmethod + def get_fidelity_space(seed=None, fidelity_choice=None): + """Fidelity space available --- specifies the fidelity dimensions + + For SVM, only a single fidelity exists, i.e., subsample fraction. + if fidelity_choice == 0 + uses the entire data (subsample=1), reflecting the black-box setup + else + parameterizes the fraction of data to subsample + + """ + z_cs = CS.ConfigurationSpace(seed=seed) + + if fidelity_choice == 0: + iter = CS.Constant('iter', value=1000) + subsample = CS.Constant('subsample', value=1) + elif fidelity_choice == 1: + iter = CS.UniformIntegerHyperparameter( + 'iter', lower=100, upper=10000, default_value=100, log=False + ) + subsample = CS.Constant('subsample', value=1) + elif fidelity_choice == 2: + iter = CS.Constant('iter', value=1000) + subsample = CS.UniformFloatHyperparameter( + 'subsample', lower=0.1, upper=1, default_value=1, log=False + ) + else: + iter = CS.UniformIntegerHyperparameter( + 'iter', lower=100, upper=10000, default_value=100, log=False + ) + subsample = CS.UniformFloatHyperparameter( + 'subsample', lower=0.1, upper=1, default_value=1, log=False + ) + z_cs.add_hyperparameters([iter, subsample]) + return z_cs + + def init_model(self, config, fidelity=None, rng=None): + # initializing model + rng = self.rng if rng is None else rng + config = config.get_dictionary() + model = SGDClassifier( + **config, + loss="log", + max_iter=fidelity["iter"], + learning_rate="invscaling", + random_state=rng + ) + return model diff --git a/hpobench/benchmarks/ml/rf_benchmark.py b/hpobench/benchmarks/ml/rf_benchmark.py index e6cfa8ba..1972a226 100644 --- a/hpobench/benchmarks/ml/rf_benchmark.py +++ b/hpobench/benchmarks/ml/rf_benchmark.py @@ -56,7 +56,6 @@ def get_fidelity_space(seed=None, fidelity_choice=1): Fidelity space is multi-multi fidelity, all possible fidelities """ z_cs = CS.ConfigurationSpace(seed=seed) - subsample_lower_bound = np.max((0.1, (0.1 or self.lower_bound_train_size))) if fidelity_choice == 0: # only subsample as fidelity ntrees = CS.Constant('n_estimators', value=100) @@ -71,7 +70,7 @@ def get_fidelity_space(seed=None, fidelity_choice=1): # only subsample as fidelity ntrees = CS.Constant('n_estimators', value=100) subsample = CS.UniformFloatHyperparameter( - 'subsample', lower=subsample_lower_bound, upper=1, default_value=1, log=False + 'subsample', lower=0.1, upper=1, default_value=1, log=False ) else: # both n_estimators and subsample as fidelities @@ -79,7 +78,7 @@ def get_fidelity_space(seed=None, fidelity_choice=1): 'n_estimators', lower=2, upper=100, default_value=10, log=False ) subsample = CS.UniformFloatHyperparameter( - 'subsample', lower=subsample_lower_bound, upper=1, default_value=1, log=False + 'subsample', lower=0.1, upper=1, default_value=1, log=False ) z_cs.add_hyperparameters([ntrees, subsample]) return z_cs diff --git a/hpobench/benchmarks/ml/svm_benchmark.py b/hpobench/benchmarks/ml/svm_benchmark.py index fc541567..784a91b9 100644 --- a/hpobench/benchmarks/ml/svm_benchmark.py +++ b/hpobench/benchmarks/ml/svm_benchmark.py @@ -50,10 +50,8 @@ def get_fidelity_space(seed=None, fidelity_choice=None): if fidelity_choice == 0: subsample = CS.Constant('subsample', value=1) else: - # TODO: dynamically adapt based on 1/512 and lower_bound_train_size and set log=True - lower = 0.1 subsample = CS.UniformFloatHyperparameter( - 'subsample', lower=lower, upper=1, default_value=0.33, 
log=False + 'subsample', lower=0.1, upper=1, default_value=0.33, log=False ) z_cs.add_hyperparameter(subsample) return z_cs diff --git a/hpobench/benchmarks/ml/xgboost_benchmark.py b/hpobench/benchmarks/ml/xgboost_benchmark.py index 5efe56ed..f4680854 100644 --- a/hpobench/benchmarks/ml/xgboost_benchmark.py +++ b/hpobench/benchmarks/ml/xgboost_benchmark.py @@ -69,7 +69,6 @@ def get_fidelity_space(seed=None, fidelity_choice=1): Fidelity space is multi-multi fidelity, all possible fidelities """ z_cs = CS.ConfigurationSpace(seed=seed) - subsample_lower_bound = np.max((0.1, (0.1 or self.lower_bound_train_size))) if fidelity_choice == 0: # only subsample as fidelity ntrees = CS.Constant('n_estimators', value=100) @@ -84,7 +83,7 @@ def get_fidelity_space(seed=None, fidelity_choice=1): # only subsample as fidelity ntrees = CS.Constant('n_estimators', value=100) subsample = CS.UniformFloatHyperparameter( - 'subsample', lower=subsample_lower_bound, upper=1, default_value=1, log=False + 'subsample', lower=0.1, upper=1, default_value=1, log=False ) else: # both n_estimators and subsample as fidelities @@ -92,7 +91,7 @@ def get_fidelity_space(seed=None, fidelity_choice=1): 'n_estimators', lower=2, upper=100, default_value=10, log=False ) subsample = CS.UniformFloatHyperparameter( - 'subsample', lower=subsample_lower_bound, upper=1, default_value=1, log=False + 'subsample', lower=0.1, upper=1, default_value=1, log=False ) z_cs.add_hyperparameters([ntrees, subsample]) return z_cs From 9c6dcdb4f926bd38be91d6f8cddb5e39e93db808 Mon Sep 17 00:00:00 2001 From: neeratyoy Date: Mon, 19 Jul 2021 14:55:31 +0200 Subject: [PATCH 031/147] Standardizing fidelity space definitions --- hpobench/benchmarks/ml/histgb_benchmark.py | 42 +++++++++++--------- hpobench/benchmarks/ml/lr_benchmark.py | 43 ++++++++++++--------- hpobench/benchmarks/ml/rf_benchmark.py | 42 +++++++++++--------- hpobench/benchmarks/ml/svm_benchmark.py | 15 ++++--- hpobench/benchmarks/ml/xgboost_benchmark.py | 42 +++++++++++--------- 5 files changed, 104 insertions(+), 80 deletions(-) diff --git a/hpobench/benchmarks/ml/histgb_benchmark.py b/hpobench/benchmarks/ml/histgb_benchmark.py index b2bb238f..93b5d908 100644 --- a/hpobench/benchmarks/ml/histgb_benchmark.py +++ b/hpobench/benchmarks/ml/histgb_benchmark.py @@ -63,30 +63,34 @@ def get_fidelity_space(seed=None, fidelity_choice=1): Fidelity space is multi-multi fidelity, all possible fidelities """ z_cs = CS.ConfigurationSpace(seed=seed) - if fidelity_choice == 0: - # only subsample as fidelity - ntrees = CS.Constant('n_estimators', value=100) - subsample = CS.Constant('subsample', value=1) - elif fidelity_choice == 1: - # only n_estimators as fidelity - ntrees = CS.UniformIntegerHyperparameter( + fidelity1 = dict( + fixed=CS.Constant('n_estimators', value=100), + variable=CS.UniformIntegerHyperparameter( 'n_estimators', lower=2, upper=100, default_value=10, log=False ) - subsample = CS.Constant('subsample', value=1) - elif fidelity_choice == 2: - # only subsample as fidelity - ntrees = CS.Constant('n_estimators', value=100) - subsample = CS.UniformFloatHyperparameter( + ) + fidelity2 = dict( + fixed=CS.Constant('subsample', value=1), + variable=CS.UniformFloatHyperparameter( 'subsample', lower=0.1, upper=1, default_value=1, log=False ) + ) + if fidelity_choice == 0: + # black-box setting (full fidelity) + ntrees = fidelity1["fixed"] + subsample = fidelity2["fixed"] + elif fidelity_choice == 1: + # gray-box setting (multi-fidelity) - ntrees + ntrees = fidelity1["variable"] + subsample = 
fidelity2["fixed"] + elif fidelity_choice == 2: + # gray-box setting (multi-fidelity) - data subsample + ntrees = fidelity1["fixed"] + subsample = fidelity2["variable"] else: - # both n_estimators and subsample as fidelities - ntrees = CS.UniformIntegerHyperparameter( - 'n_estimators', lower=2, upper=100, default_value=10, log=False - ) - subsample = CS.UniformFloatHyperparameter( - 'subsample', lower=0.1, upper=1, default_value=1, log=False - ) + # gray-box setting (multi-multi-fidelity) - ntrees + data subsample + ntrees = fidelity1["variable"] + subsample = fidelity2["variable"] z_cs.add_hyperparameters([ntrees, subsample]) return z_cs diff --git a/hpobench/benchmarks/ml/lr_benchmark.py b/hpobench/benchmarks/ml/lr_benchmark.py index c9fd4c9b..cdbdf33d 100644 --- a/hpobench/benchmarks/ml/lr_benchmark.py +++ b/hpobench/benchmarks/ml/lr_benchmark.py @@ -16,7 +16,7 @@ def __init__( data_path: Union[str, None] = None ): super(LRBenchmark, self).__init__(task_id, seed, valid_size, fidelity_choice, data_path) - self.cache_size = 200 + self.cache_size = 500 @staticmethod def get_configuration_space(seed=None): @@ -45,27 +45,34 @@ def get_fidelity_space(seed=None, fidelity_choice=None): """ z_cs = CS.ConfigurationSpace(seed=seed) - - if fidelity_choice == 0: - iter = CS.Constant('iter', value=1000) - subsample = CS.Constant('subsample', value=1) - elif fidelity_choice == 1: - iter = CS.UniformIntegerHyperparameter( - 'iter', lower=100, upper=10000, default_value=100, log=False + fidelity1 = dict( + fixed=CS.Constant('iter', value=1000), + variable=CS.UniformIntegerHyperparameter( + 'iter', lower=10, upper=1000, default_value=100, log=False ) - subsample = CS.Constant('subsample', value=1) - elif fidelity_choice == 2: - iter = CS.Constant('iter', value=1000) - subsample = CS.UniformFloatHyperparameter( + ) + fidelity2 = dict( + fixed=CS.Constant('subsample', value=1), + variable=CS.UniformFloatHyperparameter( 'subsample', lower=0.1, upper=1, default_value=1, log=False ) + ) + if fidelity_choice == 0: + # black-box setting (full fidelity) + iter = fidelity1["fixed"] + subsample = fidelity2["fixed"] + elif fidelity_choice == 1: + # gray-box setting (multi-fidelity) - iterations + iter = fidelity1["variable"] + subsample = fidelity2["fixed"] + elif fidelity_choice == 2: + # gray-box setting (multi-fidelity) - data subsample + iter = fidelity1["fixed"] + subsample = fidelity2["variable"] else: - iter = CS.UniformIntegerHyperparameter( - 'iter', lower=100, upper=10000, default_value=100, log=False - ) - subsample = CS.UniformFloatHyperparameter( - 'subsample', lower=0.1, upper=1, default_value=1, log=False - ) + # gray-box setting (multi-multi-fidelity) - iterations + data subsample + iter = fidelity1["variable"] + subsample = fidelity2["variable"] z_cs.add_hyperparameters([iter, subsample]) return z_cs diff --git a/hpobench/benchmarks/ml/rf_benchmark.py b/hpobench/benchmarks/ml/rf_benchmark.py index 1972a226..9fa8416e 100644 --- a/hpobench/benchmarks/ml/rf_benchmark.py +++ b/hpobench/benchmarks/ml/rf_benchmark.py @@ -56,30 +56,34 @@ def get_fidelity_space(seed=None, fidelity_choice=1): Fidelity space is multi-multi fidelity, all possible fidelities """ z_cs = CS.ConfigurationSpace(seed=seed) - if fidelity_choice == 0: - # only subsample as fidelity - ntrees = CS.Constant('n_estimators', value=100) - subsample = CS.Constant('subsample', value=1) - elif fidelity_choice == 1: - # only n_estimators as fidelity - ntrees = CS.UniformIntegerHyperparameter( + fidelity1 = dict( + 
fixed=CS.Constant('n_estimators', value=100), + variable=CS.UniformIntegerHyperparameter( 'n_estimators', lower=2, upper=100, default_value=10, log=False ) - subsample = CS.Constant('subsample', value=1) - elif fidelity_choice == 2: - # only subsample as fidelity - ntrees = CS.Constant('n_estimators', value=100) - subsample = CS.UniformFloatHyperparameter( + ) + fidelity2 = dict( + fixed=CS.Constant('subsample', value=1), + variable=CS.UniformFloatHyperparameter( 'subsample', lower=0.1, upper=1, default_value=1, log=False ) + ) + if fidelity_choice == 0: + # black-box setting (full fidelity) + ntrees = fidelity1["fixed"] + subsample = fidelity2["fixed"] + elif fidelity_choice == 1: + # gray-box setting (multi-fidelity) - ntrees + ntrees = fidelity1["variable"] + subsample = fidelity2["fixed"] + elif fidelity_choice == 2: + # gray-box setting (multi-fidelity) - data subsample + ntrees = fidelity1["fixed"] + subsample = fidelity2["variable"] else: - # both n_estimators and subsample as fidelities - ntrees = CS.UniformIntegerHyperparameter( - 'n_estimators', lower=2, upper=100, default_value=10, log=False - ) - subsample = CS.UniformFloatHyperparameter( - 'subsample', lower=0.1, upper=1, default_value=1, log=False - ) + # gray-box setting (multi-multi-fidelity) - ntrees + data subsample + ntrees = fidelity1["variable"] + subsample = fidelity2["variable"] z_cs.add_hyperparameters([ntrees, subsample]) return z_cs diff --git a/hpobench/benchmarks/ml/svm_benchmark.py b/hpobench/benchmarks/ml/svm_benchmark.py index 784a91b9..267620b4 100644 --- a/hpobench/benchmarks/ml/svm_benchmark.py +++ b/hpobench/benchmarks/ml/svm_benchmark.py @@ -46,13 +46,18 @@ def get_fidelity_space(seed=None, fidelity_choice=None): """ z_cs = CS.ConfigurationSpace(seed=seed) - - if fidelity_choice == 0: - subsample = CS.Constant('subsample', value=1) - else: - subsample = CS.UniformFloatHyperparameter( + fidelity = dict( + fixed=CS.Constant('subsample', value=1), + variable=CS.UniformFloatHyperparameter( 'subsample', lower=0.1, upper=1, default_value=0.33, log=False ) + ) + if fidelity_choice == 0: + # black-box setting (full fidelity) + subsample = fidelity["fixed"] + else: + # gray-box setting (multi-fidelity) - data subsample + subsample = fidelity["variable"] z_cs.add_hyperparameter(subsample) return z_cs diff --git a/hpobench/benchmarks/ml/xgboost_benchmark.py b/hpobench/benchmarks/ml/xgboost_benchmark.py index f4680854..dc4a4621 100644 --- a/hpobench/benchmarks/ml/xgboost_benchmark.py +++ b/hpobench/benchmarks/ml/xgboost_benchmark.py @@ -69,30 +69,34 @@ def get_fidelity_space(seed=None, fidelity_choice=1): Fidelity space is multi-multi fidelity, all possible fidelities """ z_cs = CS.ConfigurationSpace(seed=seed) - if fidelity_choice == 0: - # only subsample as fidelity - ntrees = CS.Constant('n_estimators', value=100) - subsample = CS.Constant('subsample', value=1) - elif fidelity_choice == 1: - # only n_estimators as fidelity - ntrees = CS.UniformIntegerHyperparameter( + fidelity1 = dict( + fixed=CS.Constant('n_estimators', value=100), + variable=CS.UniformIntegerHyperparameter( 'n_estimators', lower=2, upper=100, default_value=10, log=False ) - subsample = CS.Constant('subsample', value=1) - elif fidelity_choice == 2: - # only subsample as fidelity - ntrees = CS.Constant('n_estimators', value=100) - subsample = CS.UniformFloatHyperparameter( + ) + fidelity2 = dict( + fixed=CS.Constant('subsample', value=1), + variable=CS.UniformFloatHyperparameter( 'subsample', lower=0.1, upper=1, default_value=1, log=False ) + ) + 
if fidelity_choice == 0: + # black-box setting (full fidelity) + ntrees = fidelity1["fixed"] + subsample = fidelity2["fixed"] + elif fidelity_choice == 1: + # gray-box setting (multi-fidelity) - ntrees + ntrees = fidelity1["variable"] + subsample = fidelity2["fixed"] + elif fidelity_choice == 2: + # gray-box setting (multi-fidelity) - data subsample + ntrees = fidelity1["fixed"] + subsample = fidelity2["variable"] else: - # both n_estimators and subsample as fidelities - ntrees = CS.UniformIntegerHyperparameter( - 'n_estimators', lower=2, upper=100, default_value=10, log=False - ) - subsample = CS.UniformFloatHyperparameter( - 'subsample', lower=0.1, upper=1, default_value=1, log=False - ) + # gray-box setting (multi-multi-fidelity) - ntrees + data subsample + ntrees = fidelity1["variable"] + subsample = fidelity2["variable"] z_cs.add_hyperparameters([ntrees, subsample]) return z_cs From 74b6919b30db1fb32887a352fedf4494c84e6cfc Mon Sep 17 00:00:00 2001 From: neeratyoy Date: Mon, 19 Jul 2021 18:39:08 +0200 Subject: [PATCH 032/147] Standardizing HPs + Adding NN space --- hpobench/benchmarks/ml/__init__.py | 3 +- hpobench/benchmarks/ml/histgb_benchmark.py | 24 +-- hpobench/benchmarks/ml/nn_benchmark.py | 174 ++++++++++++++++++++ hpobench/benchmarks/ml/rf_benchmark.py | 2 +- hpobench/benchmarks/ml/xgboost_benchmark.py | 4 +- 5 files changed, 185 insertions(+), 22 deletions(-) create mode 100644 hpobench/benchmarks/ml/nn_benchmark.py diff --git a/hpobench/benchmarks/ml/__init__.py b/hpobench/benchmarks/ml/__init__.py index d31a8bed..37d5cd33 100644 --- a/hpobench/benchmarks/ml/__init__.py +++ b/hpobench/benchmarks/ml/__init__.py @@ -3,4 +3,5 @@ from .rf_benchmark import RandomForestBenchmark from .xgboost_benchmark import XGBoostBenchmark from .histgb_benchmark import HistGBBenchmark -from .lr_benchmark import LRBenchmark \ No newline at end of file +from .lr_benchmark import LRBenchmark +from .nn_benchmark import NNBenchmark \ No newline at end of file diff --git a/hpobench/benchmarks/ml/histgb_benchmark.py b/hpobench/benchmarks/ml/histgb_benchmark.py index 93b5d908..ba2a4112 100644 --- a/hpobench/benchmarks/ml/histgb_benchmark.py +++ b/hpobench/benchmarks/ml/histgb_benchmark.py @@ -35,17 +35,12 @@ def get_configuration_space(seed=None): CS.UniformIntegerHyperparameter( 'min_samples_leaf', lower=1, upper=64, default_value=1, log=True ), - #TODO: fix lr value range error in map_to_config() CS.UniformFloatHyperparameter( - 'learning_rate', lower=1e-5, upper=1e-1, default_value=0.1, log=True + 'learning_rate', lower=2**-10, upper=1, default_value=0.3, log=True ), - #TODO: find best way to encode l2 reg. 
since log params cannot have 0 as exact bound - # scales the regularization parameter by using it as a power of 10 - # such that the range of the parameter becomes {0, 1e-7, 1e-6, ..., 1e-1} - # where 10 ** 0 is enforced to be 0 (no regularization) - CS.UniformIntegerHyperparameter( - 'l2_regularization', lower=-7, upper=0, default_value=0, log=False - ) # value of 1 indicates 0 regularization + CS.UniformFloatHyperparameter( + 'l2_regularization', lower=2**-10, upper=2**10, default_value=0.1, log=True + ) ]) return cs @@ -66,7 +61,7 @@ def get_fidelity_space(seed=None, fidelity_choice=1): fidelity1 = dict( fixed=CS.Constant('n_estimators', value=100), variable=CS.UniformIntegerHyperparameter( - 'n_estimators', lower=2, upper=100, default_value=10, log=False + 'n_estimators', lower=1, upper=128, default_value=10, log=False ) ) fidelity2 = dict( @@ -98,15 +93,8 @@ def init_model(self, config, fidelity=None, rng=None): """ Function that returns the model initialized based on the configuration and fidelity """ rng = self.rng if rng is None else rng - config = deepcopy(config).get_dictionary() - l2 = config.pop("l2_regularization") - l2 = 0 if l2 == 1 else 10 ** l2 - # TODO: decide on encoding of learning rate - #TODO: allow non-encoded categoricals? - #TODO: early stopping set to False? model = HistGradientBoostingClassifier( - **config, - l2_regularization=l2, + **config.get_dictionary(), max_iter=fidelity['n_estimators'], # a fidelity being used during initialization early_stopping=False, random_state=rng diff --git a/hpobench/benchmarks/ml/nn_benchmark.py b/hpobench/benchmarks/ml/nn_benchmark.py new file mode 100644 index 00000000..6a2deb73 --- /dev/null +++ b/hpobench/benchmarks/ml/nn_benchmark.py @@ -0,0 +1,174 @@ +import numpy as np +import ConfigSpace as CS +from copy import deepcopy +from typing import Union, Tuple +from sklearn.neural_network import MLPClassifier + +from hpobench.benchmarks.ml.ml_benchmark_template import MLBenchmark + + +class NNBenchmark(MLBenchmark): + def __init__( + self, + task_id: Union[int, None] = None, + seed: Union[int, None] = None, # Union[np.random.RandomState, int, None] = None, + valid_size: float = 0.33, + fidelity_choice: int = 1, + data_path: Union[str, None] = None + ): + super(NNBenchmark, self).__init__( + task_id, seed, valid_size, fidelity_choice, data_path + ) + # fixing layers in the architecture + self.n_layers = 5 + pass + + @staticmethod + def get_configuration_space(seed=None): + """Parameter space to be optimized --- contains the hyperparameters + """ + cs = CS.ConfigurationSpace(seed=seed) + + cs.add_hyperparameters([ + CS.CategoricalHyperparameter( + 'shape', default_value="funnel", + choices=["funnel", "long_funnel", "rhombus", "diamond", "hexagon", + "brick", "triangle", "stairs"] + ), + CS.OrdinalHyperparameter( + 'max_hidden_dim', sequence=[64, 128, 256, 512, 1024], default_value=128 + ), + CS.UniformIntegerHyperparameter( + 'batch_size', lower=4, upper=128, default_value=16, log=True + ), + CS.UniformFloatHyperparameter( + 'learning_rate_init', lower=2**-10, upper=1, default_value=0.3, log=True + ), + CS.UniformFloatHyperparameter( + 'momentum', lower=0, upper=1, default_value=0.9, log=False + ), + ]) + return cs + + @staticmethod + def get_fidelity_space(seed=None, fidelity_choice=1): + """Fidelity space available --- specifies the fidelity dimensions + + If fidelity_choice is 0 + Fidelity space is the maximal fidelity, akin to a black-box function + If fidelity_choice is 1 + Fidelity space is a single fidelity, in this case the 
number of epochs (max_iter) + If fidelity_choice is 2 + Fidelity space is a single fidelity, in this case the fraction of dataset (subsample) + If fidelity_choice is >2 + Fidelity space is multi-multi fidelity, all possible fidelities + """ + z_cs = CS.ConfigurationSpace(seed=seed) + fidelity1 = dict( + fixed=CS.Constant('iter', value=100), + variable=CS.UniformIntegerHyperparameter( + 'iter', lower=3, upper=30, default_value=50, log=False + ) + ) + fidelity2 = dict( + fixed=CS.Constant('subsample', value=1), + variable=CS.UniformFloatHyperparameter( + 'subsample', lower=0.1, upper=1, default_value=1, log=False + ) + ) + if fidelity_choice == 0: + # black-box setting (full fidelity) + iter = fidelity1["fixed"] + subsample = fidelity2["fixed"] + elif fidelity_choice == 1: + # gray-box setting (multi-fidelity) - epochs/iteration + iter = fidelity1["variable"] + subsample = fidelity2["fixed"] + elif fidelity_choice == 2: + # gray-box setting (multi-fidelity) - data subsample + iter = fidelity1["fixed"] + subsample = fidelity2["variable"] + else: + # gray-box setting (multi-multi-fidelity) - epochs + data subsample + iter = fidelity1["variable"] + subsample = fidelity2["variable"] + z_cs.add_hyperparameters([iter, subsample]) + return z_cs + + def _get_architecture(self, shape: str, max_hidden_size: int) -> Tuple: + # https://mikkokotila.github.io/slate/#shapes + arch = [] + if shape == "funnel": + for i in range(self.n_layers): + arch.append(max_hidden_size) + max_hidden_size = np.ceil(max_hidden_size / 2).astype(int) + elif shape == "long_funnel": + brick_arch_len = np.ceil(self.n_layers / 2).astype(int) + for i in range(brick_arch_len): + arch.append(max_hidden_size) + for i in range(self.n_layers - brick_arch_len): + max_hidden_size = np.ceil(max_hidden_size / 2).astype(int) + arch.append(max_hidden_size) + elif shape == "rhombus": + arch.append(max_hidden_size) + rhombus_len = self.n_layers // 2 + _arch = [] + for i in range(rhombus_len): + max_hidden_size = np.ceil(max_hidden_size / 2).astype(int) + _arch.append(max_hidden_size) + arch = np.flip(_arch).tolist() + arch + _arch + elif shape == "diamond": + # open rhombus + arch.append(max_hidden_size) + rhombus_len = self.n_layers // 2 + second_max_hidden_size = np.ceil(max_hidden_size / 2).astype(int) + _arch = [] + for i in range(rhombus_len): + max_hidden_size = np.ceil(max_hidden_size / 2).astype(int) + _arch.append(max_hidden_size) + arch = [second_max_hidden_size] * rhombus_len + arch + _arch + elif shape == "hexagon": + if self.n_layers % 2 == 0: + arch.append(max_hidden_size) + half_len = np.ceil(self.n_layers / 2).astype(int) + _arch = [] + for i in range(half_len): + _arch.append(max_hidden_size) + max_hidden_size = np.ceil(max_hidden_size / 2).astype(int) + arch = _arch[::-1] + arch + _arch[:-1] + elif shape == "triangle": + # reverse funnel + for i in range(self.n_layers): + arch.append(max_hidden_size) + max_hidden_size = np.ceil(max_hidden_size / 2).astype(int) + arch = arch[::-1] + elif shape == "stairs": + for i in range(1, self.n_layers+1): + arch.append(max_hidden_size) + if i % 2 == 0 or self.n_layers < 4: + max_hidden_size = np.ceil(max_hidden_size / 2).astype(int) + else: + # default to brick design + arch = tuple([max_hidden_size] * self.n_layers) + arch = tuple(arch) + return arch + + def init_model(self, config, fidelity=None, rng=None): + """ Function that returns the model initialized based on the configuration and fidelity + """ + rng = self.rng if rng is None else rng + config = deepcopy(config.get_dictionary()) 
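# --- Editor's illustration (not part of the patch above) -------------------
# The `_get_architecture` helper above turns a shape name and a maximum
# hidden width into the `hidden_layer_sizes` tuple that `init_model` hands to
# MLPClassifier. A minimal standalone sketch of the "funnel" branch, assuming
# the five-layer setting fixed in `__init__` (widths halve, with ceiling,
# from one layer to the next):
import numpy as np

def funnel_architecture(max_hidden_dim: int, n_layers: int = 5) -> tuple:
    """Mirror of the 'funnel' branch: each layer is half as wide as the last."""
    arch, width = [], max_hidden_dim
    for _ in range(n_layers):
        arch.append(width)
        width = int(np.ceil(width / 2))
    return tuple(arch)

# funnel_architecture(128) -> (128, 64, 32, 16, 8); the "triangle" branch
# reverses this order, and "brick" (the default branch) repeats
# max_hidden_dim for all five layers.
# ----------------------------------------------------------------------------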
+ shape = config["shape"] + max_hidden_dim = config["max_hidden_dim"] + config.pop("shape") + config.pop("max_hidden_dim") + model = MLPClassifier( + **config, + hidden_layer_sizes=self._get_architecture(shape, max_hidden_dim), + activation="relu", + solver="sgd", + learning_rate="invscaling", + max_iter=fidelity['iter'], # a fidelity being used during initialization + random_state=rng + ) + return model diff --git a/hpobench/benchmarks/ml/rf_benchmark.py b/hpobench/benchmarks/ml/rf_benchmark.py index 9fa8416e..a57b7726 100644 --- a/hpobench/benchmarks/ml/rf_benchmark.py +++ b/hpobench/benchmarks/ml/rf_benchmark.py @@ -59,7 +59,7 @@ def get_fidelity_space(seed=None, fidelity_choice=1): fidelity1 = dict( fixed=CS.Constant('n_estimators', value=100), variable=CS.UniformIntegerHyperparameter( - 'n_estimators', lower=2, upper=100, default_value=10, log=False + 'n_estimators', lower=1, upper=128, default_value=10, log=False ) ) fidelity2 = dict( diff --git a/hpobench/benchmarks/ml/xgboost_benchmark.py b/hpobench/benchmarks/ml/xgboost_benchmark.py index dc4a4621..4c77a92e 100644 --- a/hpobench/benchmarks/ml/xgboost_benchmark.py +++ b/hpobench/benchmarks/ml/xgboost_benchmark.py @@ -32,7 +32,7 @@ def get_configuration_space(seed=None): 'eta', lower=2**-10, upper=1., default_value=0.3, log=True ), # learning rate CS.UniformIntegerHyperparameter( - 'max_depth', lower=1, upper=15, default_value=6, log=False + 'max_depth', lower=1, upper=15, default_value=2, log=False ), CS.UniformFloatHyperparameter( 'min_child_weight', lower=1., upper=2**7., default_value=1., log=True @@ -72,7 +72,7 @@ def get_fidelity_space(seed=None, fidelity_choice=1): fidelity1 = dict( fixed=CS.Constant('n_estimators', value=100), variable=CS.UniformIntegerHyperparameter( - 'n_estimators', lower=2, upper=100, default_value=10, log=False + 'n_estimators', lower=1, upper=128, default_value=10, log=False ) ) fidelity2 = dict( From 785055eccb6480586f512f6d2ace1625ba32591e Mon Sep 17 00:00:00 2001 From: neeratyoy Date: Mon, 19 Jul 2021 18:40:24 +0200 Subject: [PATCH 033/147] Small placeholder for testing --- hpobench/benchmarks/ml/nn_benchmark.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/hpobench/benchmarks/ml/nn_benchmark.py b/hpobench/benchmarks/ml/nn_benchmark.py index 6a2deb73..0063cdc9 100644 --- a/hpobench/benchmarks/ml/nn_benchmark.py +++ b/hpobench/benchmarks/ml/nn_benchmark.py @@ -67,7 +67,7 @@ def get_fidelity_space(seed=None, fidelity_choice=1): fidelity1 = dict( fixed=CS.Constant('iter', value=100), variable=CS.UniformIntegerHyperparameter( - 'iter', lower=3, upper=30, default_value=50, log=False + 'iter', lower=3, upper=30, default_value=30, log=False ) ) fidelity2 = dict( From 0159a35ff4e2532bb9011326927eba77009b160a Mon Sep 17 00:00:00 2001 From: neeratyoy Date: Tue, 20 Jul 2021 14:34:33 +0200 Subject: [PATCH 034/147] Updating NN HP space + Helper function for TabularBenchmark --- hpobench/benchmarks/ml/nn_benchmark.py | 13 +++++++------ hpobench/benchmarks/ml/tabular_benchmark.py | 18 ++++++++++++++++++ 2 files changed, 25 insertions(+), 6 deletions(-) diff --git a/hpobench/benchmarks/ml/nn_benchmark.py b/hpobench/benchmarks/ml/nn_benchmark.py index 0063cdc9..89aa115f 100644 --- a/hpobench/benchmarks/ml/nn_benchmark.py +++ b/hpobench/benchmarks/ml/nn_benchmark.py @@ -38,15 +38,15 @@ def get_configuration_space(seed=None): CS.OrdinalHyperparameter( 'max_hidden_dim', sequence=[64, 128, 256, 512, 1024], default_value=128 ), + CS.UniformFloatHyperparameter( + 'alpha', lower=10**-5, upper=10**4, 
default_value=10**-3, log=True + ), CS.UniformIntegerHyperparameter( - 'batch_size', lower=4, upper=128, default_value=16, log=True + 'batch_size', lower=4, upper=256, default_value=32, log=True ), CS.UniformFloatHyperparameter( 'learning_rate_init', lower=2**-10, upper=1, default_value=0.3, log=True - ), - CS.UniformFloatHyperparameter( - 'momentum', lower=0, upper=1, default_value=0.9, log=False - ), + ) ]) return cs @@ -67,7 +67,7 @@ def get_fidelity_space(seed=None, fidelity_choice=1): fidelity1 = dict( fixed=CS.Constant('iter', value=100), variable=CS.UniformIntegerHyperparameter( - 'iter', lower=3, upper=30, default_value=30, log=False + 'iter', lower=3, upper=150, default_value=30, log=False ) ) fidelity2 = dict( @@ -168,6 +168,7 @@ def init_model(self, config, fidelity=None, rng=None): activation="relu", solver="sgd", learning_rate="invscaling", + momentum=0.9, max_iter=fidelity['iter'], # a fidelity being used during initialization random_state=rng ) diff --git a/hpobench/benchmarks/ml/tabular_benchmark.py b/hpobench/benchmarks/ml/tabular_benchmark.py index eff31523..ff5fcd8e 100644 --- a/hpobench/benchmarks/ml/tabular_benchmark.py +++ b/hpobench/benchmarks/ml/tabular_benchmark.py @@ -26,6 +26,18 @@ def _load_table(self, path): table = pickle.load(f) return table + def _get_model_name(self): + return self.exp_args["space"] + + def _total_number_of_configurations(self, space: str="hyperparameters") -> int: + """ Returns the number of unique configurations in the parameter/fidelity space + """ + count = 1 + cs = self.x_cs if space == "hyperparameters" else self.z_cs + for hp in cs.get_hyperparameters(): + count *= len(hp.sequence) + return count + def get_hyperparameter_space(self, seed=None, original=False): cs = CS.ConfigurationSpace(seed=seed) if original: @@ -57,6 +69,12 @@ def get_global_min(self, metric: str = "acc"): "Not a valid metric: {}".format(list(self.global_minimums.keys())) return self.global_minimums[metric] + def get_max_fidelity(self) -> Dict: + max_fidelity = dict() + for hp in self.z_cs.get_hyperparameters(): + max_fidelity[hp.name] = np.sort(hp.sequence)[-1] + return max_fidelity + def _objective( self, config: CS.Configuration, From e9e097af3d094495c81abbe94c2984597e63273a Mon Sep 17 00:00:00 2001 From: neeratyoy Date: Tue, 20 Jul 2021 20:11:49 +0200 Subject: [PATCH 035/147] Adding fidelity range retrieval utility to TabularBenchmark --- hpobench/benchmarks/ml/tabular_benchmark.py | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/hpobench/benchmarks/ml/tabular_benchmark.py b/hpobench/benchmarks/ml/tabular_benchmark.py index ff5fcd8e..e0be2fc0 100644 --- a/hpobench/benchmarks/ml/tabular_benchmark.py +++ b/hpobench/benchmarks/ml/tabular_benchmark.py @@ -75,6 +75,13 @@ def get_max_fidelity(self) -> Dict: max_fidelity[hp.name] = np.sort(hp.sequence)[-1] return max_fidelity + def get_fidelity_range(self): + fidelities = [] + for hp in self.z_cs.get_hyperparameters(): + if not isinstance(hp, CS.Constant) and len(hp.sequence) > 1: + fidelities.append((hp.name, hp.sequence[0], hp.sequence[-1])) + return fidelities + def _objective( self, config: CS.Configuration, From 47971098ea3a9221d6839ff031be1970a3318933 Mon Sep 17 00:00:00 2001 From: neeratyoy Date: Wed, 21 Jul 2021 16:14:12 +0200 Subject: [PATCH 036/147] Enforcing subsample lower bound check inside objective --- hpobench/benchmarks/ml/ml_benchmark_template.py | 14 +++++++++----- 1 file changed, 9 insertions(+), 5 deletions(-) diff --git a/hpobench/benchmarks/ml/ml_benchmark_template.py 
b/hpobench/benchmarks/ml/ml_benchmark_template.py index dacb64db..cdde98ad 100644 --- a/hpobench/benchmarks/ml/ml_benchmark_template.py +++ b/hpobench/benchmarks/ml/ml_benchmark_template.py @@ -177,8 +177,8 @@ def load_data_from_openml(self, valid_size=None, verbose=False): # validation set is fixed till this function is called again or explicitly altered valid_size = self.valid_size if valid_size is None else valid_size self.train_X, self.valid_X, self.train_y, self.valid_y = train_test_split( - train_x, train_y, test_size=valid_size, - shuffle=True, stratify=train_y, random_state=check_random_state(self.global_seed) + train_x, train_y, test_size=valid_size, shuffle=True, stratify=train_y, + random_state=check_random_state(self.global_seed) # uses global seed for fixed splits ) # preprocessor to handle missing values, categorical columns encodings, @@ -200,7 +200,7 @@ def load_data_from_openml(self, valid_size=None, verbose=False): ]) ) if verbose: - print("Shape of data pre-preprocessing: {}".format(train_X.shape)) + print("Shape of data pre-preprocessing: {}".format(self.train_X.shape)) # preprocessor fit only on the training set self.train_X = self.preprocessor.fit_transform(self.train_X) @@ -219,7 +219,7 @@ def load_data_from_openml(self, valid_size=None, verbose=False): self.lower_bound_train_size = np.max((1 / 512, self.lower_bound_train_size)) if verbose: - print("Shape of data post-preprocessing: {}".format(train_X.shape), "\n") + print("Shape of data post-preprocessing: {}".format(self.train_X.shape), "\n") if verbose: print("\nTraining data (X, y): ({}, {})".format(self.train_X.shape, self.train_y.shape)) @@ -261,9 +261,13 @@ def _train_objective(self, config, fidelity, shuffle, rng, eval="valid"): # subsample here: # application of the other fidelity to the dataset that the model interfaces + if self.lower_bound_train_size is None: + self.lower_bound_train_size = (10 * self.n_classes) / self.train_X.shape[0] + self.lower_bound_train_size = np.max((1 / 512, self.lower_bound_train_size)) + subsample = np.max((fidelity['subsample'], self.lower_bound_train_size)) train_idx = self.rng.choice( np.arange(len(train_X)), size=int( - fidelity['subsample'] * len(train_X) + subsample * len(train_X) ) ) # fitting the model with subsampled data From dbb73278d067a163ef7c032c0c85095ec2091e8a Mon Sep 17 00:00:00 2001 From: neeratyoy Date: Thu, 22 Jul 2021 00:55:00 +0200 Subject: [PATCH 037/147] Bug fix + adding precicion as metric --- .../benchmarks/ml/ml_benchmark_template.py | 43 +++++++++---------- hpobench/benchmarks/ml/svm_benchmark.py | 2 +- 2 files changed, 21 insertions(+), 24 deletions(-) diff --git a/hpobench/benchmarks/ml/ml_benchmark_template.py b/hpobench/benchmarks/ml/ml_benchmark_template.py index cdde98ad..bc169077 100644 --- a/hpobench/benchmarks/ml/ml_benchmark_template.py +++ b/hpobench/benchmarks/ml/ml_benchmark_template.py @@ -13,27 +13,23 @@ from sklearn.preprocessing import OneHotEncoder from sklearn.preprocessing import StandardScaler from sklearn.model_selection import train_test_split -from sklearn.metrics import make_scorer, roc_auc_score, accuracy_score, f1_score, \ - top_k_accuracy_score, balanced_accuracy_score +from sklearn.metrics import make_scorer, accuracy_score, balanced_accuracy_score, \ + precision_score, f1_score from hpobench.abstract_benchmark import AbstractBenchmark metrics = dict( - #TODO: decide on metrics generalized for different datasets acc=accuracy_score, bal_acc=balanced_accuracy_score, f1=f1_score, - # roc=roc_auc_score, - # 
topk=top_k_accuracy_score + precision=precision_score, ) metrics_kwargs = dict( - #TODO: decide on metric parameters acc=dict(), bal_acc=dict(), - f1=dict(average="weighted"), - # roc=dict(average="weighted"), - # topk=dict() + f1=dict(average="macro", zero_division=0), + precision=dict(average="macro", zero_division=0), ) @@ -174,11 +170,11 @@ def load_data_from_openml(self, valid_size=None, verbose=False): self.test_y = y.iloc[self.test_idx] # splitting training into training and validation - # validation set is fixed till this function is called again or explicitly altered + # validation set is fixed as per the global seed independent of the benchmark seed valid_size = self.valid_size if valid_size is None else valid_size self.train_X, self.valid_X, self.train_y, self.valid_y = train_test_split( train_x, train_y, test_size=valid_size, shuffle=True, stratify=train_y, - random_state=check_random_state(self.global_seed) # uses global seed for fixed splits + random_state=check_random_state(self.global_seed) ) # preprocessor to handle missing values, categorical columns encodings, @@ -214,7 +210,6 @@ def load_data_from_openml(self, valid_size=None, verbose=False): # Similar to (https://arxiv.org/pdf/1605.07079.pdf) # use 10 times the number of classes as lower bound for the dataset fraction - self.lower_bound_train_size = (10 * self.n_classes) / self.train_X.shape[0] self.lower_bound_train_size = np.max((1 / 512, self.lower_bound_train_size)) @@ -228,7 +223,7 @@ def load_data_from_openml(self, valid_size=None, verbose=False): print("\nData loading complete!\n") return - def shuffle_data_idx(self, train_id=None, ng=None): + def shuffle_data_idx(self, train_idx=None, rng=None): rng = self.rng if rng is None else rng train_idx = self.train_idx if train_idx is None else train_idx rng.shuffle(train_idx) @@ -311,11 +306,12 @@ def objective_function( _start = time.time() test_scores[k] = v(model, self.test_X, self.test_y) test_score_cost[k] = time.time() - _start - val_loss = 1 - test_scores["acc"] + test_loss = 1 - test_scores["acc"] info = { 'train_loss': train_loss, 'val_loss': val_loss, + 'test_loss': test_loss, 'model_cost': model_fit_time, 'train_scores': train_scores, 'train_costs': train_score_cost, @@ -330,7 +326,7 @@ def objective_function( return { 'function_value': info['val_loss'], - 'cost': model_fit_time + info['train_costs']['acc'] + info['val_costs']['acc'], + 'cost': model_fit_time + info['val_costs']['acc'], 'info': info } @@ -370,16 +366,17 @@ def objective_function_test( return { 'function_value': info['test_loss'], - 'cost': model_fit_time + info['train_costs']['acc'] + info['test_costs']['acc'], + 'cost': model_fit_time + info['test_costs']['acc'], 'info': info } def get_meta_information(self): """ Returns the meta information for the benchmark """ - return {'name': 'Support Vector Machine', - 'shape of train data': self.x_train.shape, - 'shape of test data': self.x_test.shape, - 'shape of valid data': self.x_valid.shape, - 'initial random seed': self.rng, - 'task_id': self.task_id - } + return { + 'name': 'Support Vector Machine', + 'shape of train data': self.train_X.shape, + 'shape of test data': self.test_X.shape, + 'shape of valid data': self.valid_X.shape, + 'initial random seed': self.seed, + 'task_id': self.task_id + } diff --git a/hpobench/benchmarks/ml/svm_benchmark.py b/hpobench/benchmarks/ml/svm_benchmark.py index 267620b4..61e9840d 100644 --- a/hpobench/benchmarks/ml/svm_benchmark.py +++ b/hpobench/benchmarks/ml/svm_benchmark.py @@ -1,5 +1,5 @@ import ConfigSpace 
as CS -from typing import Union, List, Dict +from typing import Union from sklearn.svm import SVC From 7d5ca578bafdc2f97c0d27550110e3d2474c39df Mon Sep 17 00:00:00 2001 From: neeratyoy Date: Thu, 22 Jul 2021 21:25:02 +0200 Subject: [PATCH 038/147] Fixing param spaces and model building for LR, SVM --- hpobench/benchmarks/ml/lr_benchmark.py | 21 +++++++++++---------- hpobench/benchmarks/ml/svm_benchmark.py | 8 ++++---- 2 files changed, 15 insertions(+), 14 deletions(-) diff --git a/hpobench/benchmarks/ml/lr_benchmark.py b/hpobench/benchmarks/ml/lr_benchmark.py index cdbdf33d..de791aa6 100644 --- a/hpobench/benchmarks/ml/lr_benchmark.py +++ b/hpobench/benchmarks/ml/lr_benchmark.py @@ -25,10 +25,10 @@ def get_configuration_space(seed=None): cs = CS.ConfigurationSpace(seed=seed) cs.add_hyperparameters([ CS.UniformFloatHyperparameter( - "alpha", 10**-5, 10**4, log=True, default_value=1.0 + "alpha", 1e-5, 1, log=True, default_value=1e-3 ), CS.UniformFloatHyperparameter( - "eta0", 2**-10, 1, log=True, default_value=0.3 + "eta0", 1e-5, 1, log=True, default_value=1e-2 ) ]) return cs @@ -48,13 +48,13 @@ def get_fidelity_space(seed=None, fidelity_choice=None): fidelity1 = dict( fixed=CS.Constant('iter', value=1000), variable=CS.UniformIntegerHyperparameter( - 'iter', lower=10, upper=1000, default_value=100, log=False + 'iter', lower=10, upper=1000, default_value=1000, log=False ) ) fidelity2 = dict( - fixed=CS.Constant('subsample', value=1), + fixed=CS.Constant('subsample', value=1.0), variable=CS.UniformFloatHyperparameter( - 'subsample', lower=0.1, upper=1, default_value=1, log=False + 'subsample', lower=0.1, upper=1.0, default_value=1.0, log=False ) ) if fidelity_choice == 0: @@ -79,12 +79,13 @@ def get_fidelity_space(seed=None, fidelity_choice=None): def init_model(self, config, fidelity=None, rng=None): # initializing model rng = self.rng if rng is None else rng - config = config.get_dictionary() + # https://scikit-learn.org/stable/modules/sgd.html model = SGDClassifier( - **config, - loss="log", + **config.get_dictionary(), + loss="log", # performs Logistic Regression max_iter=fidelity["iter"], - learning_rate="invscaling", - random_state=rng + learning_rate="adaptive", + tol=None, + random_state=rng, ) return model diff --git a/hpobench/benchmarks/ml/svm_benchmark.py b/hpobench/benchmarks/ml/svm_benchmark.py index 61e9840d..fe1afb66 100644 --- a/hpobench/benchmarks/ml/svm_benchmark.py +++ b/hpobench/benchmarks/ml/svm_benchmark.py @@ -23,13 +23,13 @@ def get_configuration_space(seed=None): """Parameter space to be optimized --- contains the hyperparameters """ cs = CS.ConfigurationSpace(seed=seed) - # https://www.csie.ntu.edu.tw/~cjlin/papers/guide/guide.pdf (Section 3.2) + # https://jmlr.org/papers/volume20/18-444/18-444.pdf (Table 1) cs.add_hyperparameters([ CS.UniformFloatHyperparameter( - "C", 2**-5, 2**15, log=True, default_value=1.0 + "C", 2**-10, 2**10, log=True, default_value=1.0 ), CS.UniformFloatHyperparameter( - "gamma", 2**-15, 2**3, log=True, default_value=0.1 + "gamma", 2**-10, 2**10, log=True, default_value=0.1 ) ]) return cs @@ -49,7 +49,7 @@ def get_fidelity_space(seed=None, fidelity_choice=None): fidelity = dict( fixed=CS.Constant('subsample', value=1), variable=CS.UniformFloatHyperparameter( - 'subsample', lower=0.1, upper=1, default_value=0.33, log=False + 'subsample', lower=0.1, upper=1.0, default_value=1.0, log=False ) ) if fidelity_choice == 0: From a6d94bbddb14560840b219d44999f97ea6e2e67b Mon Sep 17 00:00:00 2001 From: neeratyoy Date: Mon, 26 Jul 2021 21:58:01 +0200 
Subject: [PATCH 039/147] TabularBenchmark edit to read compressed files and query a dataframe --- .../benchmarks/ml/ml_benchmark_template.py | 3 + hpobench/benchmarks/ml/tabular_benchmark.py | 119 +++++++++++------- 2 files changed, 75 insertions(+), 47 deletions(-) diff --git a/hpobench/benchmarks/ml/ml_benchmark_template.py b/hpobench/benchmarks/ml/ml_benchmark_template.py index bc169077..b8a66790 100644 --- a/hpobench/benchmarks/ml/ml_benchmark_template.py +++ b/hpobench/benchmarks/ml/ml_benchmark_template.py @@ -353,10 +353,13 @@ def objective_function_test( info = { 'train_loss': train_loss, + 'val_loss': None, 'test_loss': test_loss, 'model_cost': model_fit_time, 'train_scores': train_scores, 'train_costs': train_score_cost, + 'val_scores': dict(), + 'val_costs': dict(), 'test_scores': test_scores, 'test_costs': test_score_cost, # storing as dictionary and not ConfigSpace saves tremendous memory diff --git a/hpobench/benchmarks/ml/tabular_benchmark.py b/hpobench/benchmarks/ml/tabular_benchmark.py index e0be2fc0..528f2eef 100644 --- a/hpobench/benchmarks/ml/tabular_benchmark.py +++ b/hpobench/benchmarks/ml/tabular_benchmark.py @@ -1,6 +1,9 @@ import os import glom +import json +import pickle import numpy as np +import pandas as pd import ConfigSpace as CS import pickle5 as pickle from copy import deepcopy @@ -9,25 +12,38 @@ class TabularBenchmark: - def __init__(self, table_path: str, seed: Union[int, None] = None): - assert os.path.isfile(table_path), "Not a valid path: {}".format(table_path) - table = self._load_table(table_path) + def __init__(self, path: str, model: str, task_id: int, seed: Union[int, None] = None): + assert os.path.isdir(path), "Not a valid path: {}".format(path) + self.data_path = os.path.join(path, "{}_{}_data.parquet.gzip".format(model, task_id)) + assert os.path.isfile(self.data_path) + self.config_path = os.path.join(path, "{}_{}_configs.pkl".format(model, task_id)) + assert os.path.isfile(self.config_path) + self.exp_args_path = os.path.join(path, "{}_{}.json".format(model, task_id)) + assert os.path.isfile(self.exp_args_path) + self.seed = seed if seed is not None else np.random.randint(1, 10 ** 6) self.rng = np.random.RandomState(self.seed) - self.exp_args = table['exp_args'] - self.config_spaces = table['config_spaces'] + self.table = self._load_parquet(self.data_path) + self.exp_args = self._load_json(self.exp_args_path) + self.config_spaces = self._load_pickle(self.config_path) + self.x_cs = self.get_hyperparameter_space(seed=self.seed) self.z_cs = self.get_fidelity_space(seed=self.seed) - self.table = table['data'] - self.global_minimums = table['global_min'] + self.global_minimums = self.exp_args["global_min"] - def _load_table(self, path): + def _load_pickle(self, path): with open(path, "rb") as f: - table = pickle.load(f) - return table + data = pickle.load(f) + return data - def _get_model_name(self): - return self.exp_args["space"] + def _load_parquet(self, path): + data = pd.read_parquet(path) + return data + + def _load_json(self, path): + with open(path, "r") as f: + data = json.load(f) + return data def _total_number_of_configurations(self, space: str="hyperparameters") -> int: """ Returns the number of unique configurations in the parameter/fidelity space @@ -38,6 +54,9 @@ def _total_number_of_configurations(self, space: str="hyperparameters") -> int: count *= len(hp.sequence) return count + def _seeds_used(self): + return self.table.seed.unique().tolist() + def get_hyperparameter_space(self, seed=None, original=False): cs = 
CS.ConfigurationSpace(seed=seed) if original: @@ -82,51 +101,57 @@ def get_fidelity_range(self): fidelities.append((hp.name, hp.sequence[0], hp.sequence[-1])) return fidelities + def _search_dataframe(self, row_dict, df): + # https://stackoverflow.com/a/46165056/8363967 + mask = np.array([True] * df.shape[0]) + for i, param in enumerate(df.drop("result", axis=1).columns): + mask *= df[param].values == row_dict[param] + idx = np.where(mask) + if len(idx) != 1: + return None + idx = idx[0][0] + result = df.iloc[idx]["result"] + return result + def _objective( self, config: CS.Configuration, fidelity: CS.Configuration, seed: Union[int, None] = None, metric: Union[str, None] = "acc", - eval: Union[str] = "val" + evaluation: Union[str] = "" ) -> Dict: self.x_cs.check_configuration(config) self.z_cs.check_configuration(fidelity) - key_path = [] - for name in np.sort(self.x_cs.get_hyperparameter_names()): - key_path.append(config[str(name)]) - for name in np.sort(self.z_cs.get_hyperparameter_names()): - key_path.append(fidelity[str(name)]) - val = glom.glom(self.table, glom.Path(*key_path), default=None) - if val is None: - raise ValueError( - "Invalid config-fidelity or not recorded in table!\n{}\n{}".format(config, fidelity) - ) - seeds = list(val.keys()) assert metric in list(metrics.keys()), \ "metric not found among: {{{}}}".format(", ".join(list(metrics.keys()))) - score_key = "{}_scores".format(eval) - cost_key = "{}_scores".format(eval) - if seed is None: - result = dict(function_value=0.0, cost=0.0, info=dict()) - loss = [] - costs = 0.0 - info = dict() - for seed in seeds: - result = deepcopy(val[seed]) - loss.append(1 - result["info"][score_key][metric]) - costs += result["info"]["model_cost"] + result["info"][cost_key][metric] - info[seed] = result["info"] - loss = np.mean(loss) - result["function_value"] = loss - result["cost"] = costs - result["info"] = info + score_key = "{}_scores".format(evaluation) + cost_key = "{}_scores".format(evaluation) + + key_path = dict() + for name in np.sort(self.x_cs.get_hyperparameter_names()): + key_path[str(name)] = config[str(name)] + for name in np.sort(self.z_cs.get_hyperparameter_names()): + key_path[str(name)] = fidelity[str(name)] + + if seed is not None: + assert seed in self._seeds_used() + seeds = [seed] else: - assert seed in list(val.keys()), \ - "seed not found among: {{{}}}".format(", ".join([str(s) for s in seeds])) - result = deepcopy(val[seed]) - result["function_value"] = 1 - result["info"][score_key][metric] - result["cost"] = result["info"]["model_cost"] + result["info"][cost_key][metric] + seeds = self._seeds_used() + + loss = [] + costs = 0.0 + info = dict() + for seed in seeds: + key_path["seed"] = seed + res = self._search_dataframe(key_path, self.table) + loss.append(1 - res["info"][score_key][metric]) + costs += res["info"]["model_cost"] + res["info"][cost_key][metric] + info[seed] = res["info"] + key_path.pop("seed") + loss = np.mean(loss) + result = dict(function_value=loss, cost=costs, info=info) return result def objective_function( @@ -136,7 +161,7 @@ def objective_function( seed: Union[int, None] = None, metric: Union[str, None] = "acc" ) -> Dict: - result = self._objective(config, fidelity, seed, metric, eval="val") + result = self._objective(config, fidelity, seed, metric, evaluation="val") return result def objective_function_test( @@ -146,5 +171,5 @@ def objective_function_test( seed: Union[int, None] = None, metric: Union[str, None] = "acc" ) -> Dict: - result = self._objective(config, fidelity, seed, metric, 
eval="test") + result = self._objective(config, fidelity, seed, metric, evaluation="test") return result From 93b69081e38dbb9ddae4a1a9e2505ea5ca8c6bbf Mon Sep 17 00:00:00 2001 From: neeratyoy Date: Tue, 27 Jul 2021 02:41:16 +0200 Subject: [PATCH 040/147] Not evaluating training set to save time --- hpobench/benchmarks/ml/ml_benchmark_template.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/hpobench/benchmarks/ml/ml_benchmark_template.py b/hpobench/benchmarks/ml/ml_benchmark_template.py index b8a66790..d5cd6229 100644 --- a/hpobench/benchmarks/ml/ml_benchmark_template.py +++ b/hpobench/benchmarks/ml/ml_benchmark_template.py @@ -274,7 +274,7 @@ def _train_objective(self, config, fidelity, shuffle, rng, eval="valid"): score_cost = dict() for k, v in self.scorers.items(): _start = time.time() - scores[k] = v(model, train_X, train_y) + scores[k] = 0 # v(model, train_X, train_y) score_cost[k] = time.time() - _start train_loss = 1 - scores["acc"] return model, model_fit_time, train_loss, scores, score_cost From 8164eb0e21748d80532f2c74967536f5a36870fe Mon Sep 17 00:00:00 2001 From: neeratyoy Date: Tue, 27 Jul 2021 21:26:36 +0200 Subject: [PATCH 041/147] Fidelity change for trees + NN space change --- hpobench/benchmarks/ml/histgb_benchmark.py | 2 +- .../benchmarks/ml/ml_benchmark_template.py | 15 +-- hpobench/benchmarks/ml/nn_benchmark.py | 95 +++---------------- hpobench/benchmarks/ml/rf_benchmark.py | 15 +-- hpobench/benchmarks/ml/xgboost_benchmark.py | 4 +- 5 files changed, 35 insertions(+), 96 deletions(-) diff --git a/hpobench/benchmarks/ml/histgb_benchmark.py b/hpobench/benchmarks/ml/histgb_benchmark.py index ba2a4112..1860ff48 100644 --- a/hpobench/benchmarks/ml/histgb_benchmark.py +++ b/hpobench/benchmarks/ml/histgb_benchmark.py @@ -61,7 +61,7 @@ def get_fidelity_space(seed=None, fidelity_choice=1): fidelity1 = dict( fixed=CS.Constant('n_estimators', value=100), variable=CS.UniformIntegerHyperparameter( - 'n_estimators', lower=1, upper=128, default_value=10, log=False + 'n_estimators', lower=16, upper=512, default_value=512, log=False ) ) fidelity2 = dict( diff --git a/hpobench/benchmarks/ml/ml_benchmark_template.py b/hpobench/benchmarks/ml/ml_benchmark_template.py index d5cd6229..83b39957 100644 --- a/hpobench/benchmarks/ml/ml_benchmark_template.py +++ b/hpobench/benchmarks/ml/ml_benchmark_template.py @@ -234,7 +234,7 @@ def init_model(self, config, fidelity=None, rng=None): """ raise NotImplementedError() - def _train_objective(self, config, fidelity, shuffle, rng, eval="valid"): + def _train_objective(self, config, fidelity, shuffle, rng, evaluation="valid"): # initializing model model = self.init_model(config, fidelity, rng) @@ -273,9 +273,12 @@ def _train_objective(self, config, fidelity, shuffle, rng, eval="valid"): scores = dict() score_cost = dict() for k, v in self.scorers.items(): - _start = time.time() - scores[k] = 0 # v(model, train_X, train_y) - score_cost[k] = time.time() - _start + scores[k] = 0.0 + score_cost[k] = 0.0 + if evaluation == "test": + _start = time.time() + scores[k] = v(model, train_X, train_y) + score_cost[k] = time.time() - _start train_loss = 1 - scores["acc"] return model, model_fit_time, train_loss, scores, score_cost @@ -290,7 +293,7 @@ def objective_function( """Function that evaluates a 'config' on a 'fidelity' on the validation set """ model, model_fit_time, train_loss, train_scores, train_score_cost = self._train_objective( - configuration, fidelity, shuffle, rng + configuration, fidelity, shuffle, rng, evaluation="val" ) 
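# --- Editor's note (illustrative sketch, not part of this patch series) ----------------
# A minimal, self-contained version of the evaluation-gated scoring introduced above:
# training-set metrics are skipped (left at 0.0) unless the call serves the test
# objective, and each scorer is timed individually. The name `toy_train_objective`
# and the synthetic data are assumptions for illustration only.
import time
from sklearn.datasets import make_classification
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import make_scorer, accuracy_score, balanced_accuracy_score

def toy_train_objective(model, train_X, train_y, evaluation="valid"):
    scorers = dict(acc=make_scorer(accuracy_score),
                   bal_acc=make_scorer(balanced_accuracy_score))
    start = time.time()
    model.fit(train_X, train_y)
    model_fit_time = time.time() - start
    scores, score_cost = dict(), dict()
    for k, scorer in scorers.items():
        scores[k], score_cost[k] = 0.0, 0.0
        if evaluation == "test":               # train scores are only paid for on the test path
            _start = time.time()
            scores[k] = scorer(model, train_X, train_y)
            score_cost[k] = time.time() - _start
    train_loss = 1 - scores["acc"]             # stays 1.0 on the validation path, by design
    return model, model_fit_time, train_loss, scores, score_cost

X, y = make_classification(n_samples=200, n_features=10, random_state=0)
toy_train_objective(RandomForestClassifier(n_estimators=10, random_state=0), X, y)
# ----------------------------------------------------------------------------------------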
val_scores = dict() val_score_cost = dict() @@ -341,7 +344,7 @@ def objective_function_test( """Function that evaluates a 'config' on a 'fidelity' on the test set """ model, model_fit_time, train_loss, train_scores, train_score_cost = self._train_objective( - configuration, fidelity, shuffle, rng, eval="test" + configuration, fidelity, shuffle, rng, evaluation="test" ) test_scores = dict() test_score_cost = dict() diff --git a/hpobench/benchmarks/ml/nn_benchmark.py b/hpobench/benchmarks/ml/nn_benchmark.py index 89aa115f..0bec70e4 100644 --- a/hpobench/benchmarks/ml/nn_benchmark.py +++ b/hpobench/benchmarks/ml/nn_benchmark.py @@ -19,8 +19,6 @@ def __init__( super(NNBenchmark, self).__init__( task_id, seed, valid_size, fidelity_choice, data_path ) - # fixing layers in the architecture - self.n_layers = 5 pass @staticmethod @@ -30,22 +28,16 @@ def get_configuration_space(seed=None): cs = CS.ConfigurationSpace(seed=seed) cs.add_hyperparameters([ - CS.CategoricalHyperparameter( - 'shape', default_value="funnel", - choices=["funnel", "long_funnel", "rhombus", "diamond", "hexagon", - "brick", "triangle", "stairs"] - ), - CS.OrdinalHyperparameter( - 'max_hidden_dim', sequence=[64, 128, 256, 512, 1024], default_value=128 - ), - CS.UniformFloatHyperparameter( - 'alpha', lower=10**-5, upper=10**4, default_value=10**-3, log=True - ), + CS.UniformIntegerHyperparameter('depth', default_value=3, lower=1, upper=3), + CS.UniformIntegerHyperparameter('width', default_value=64, lower=16, upper=256), CS.UniformIntegerHyperparameter( 'batch_size', lower=4, upper=256, default_value=32, log=True ), CS.UniformFloatHyperparameter( - 'learning_rate_init', lower=2**-10, upper=1, default_value=0.3, log=True + 'alpha', lower=10**-8, upper=1, default_value=10**-3, log=True + ), + CS.UniformFloatHyperparameter( + 'learning_rate_init', lower=10**-5, upper=1, default_value=10**-3, log=True ) ]) return cs @@ -67,7 +59,7 @@ def get_fidelity_space(seed=None, fidelity_choice=1): fidelity1 = dict( fixed=CS.Constant('iter', value=100), variable=CS.UniformIntegerHyperparameter( - 'iter', lower=3, upper=150, default_value=30, log=False + 'iter', lower=3, upper=243, default_value=243, log=False ) ) fidelity2 = dict( @@ -95,80 +87,21 @@ def get_fidelity_space(seed=None, fidelity_choice=1): z_cs.add_hyperparameters([iter, subsample]) return z_cs - def _get_architecture(self, shape: str, max_hidden_size: int) -> Tuple: - # https://mikkokotila.github.io/slate/#shapes - arch = [] - if shape == "funnel": - for i in range(self.n_layers): - arch.append(max_hidden_size) - max_hidden_size = np.ceil(max_hidden_size / 2).astype(int) - elif shape == "long_funnel": - brick_arch_len = np.ceil(self.n_layers / 2).astype(int) - for i in range(brick_arch_len): - arch.append(max_hidden_size) - for i in range(self.n_layers - brick_arch_len): - max_hidden_size = np.ceil(max_hidden_size / 2).astype(int) - arch.append(max_hidden_size) - elif shape == "rhombus": - arch.append(max_hidden_size) - rhombus_len = self.n_layers // 2 - _arch = [] - for i in range(rhombus_len): - max_hidden_size = np.ceil(max_hidden_size / 2).astype(int) - _arch.append(max_hidden_size) - arch = np.flip(_arch).tolist() + arch + _arch - elif shape == "diamond": - # open rhombus - arch.append(max_hidden_size) - rhombus_len = self.n_layers // 2 - second_max_hidden_size = np.ceil(max_hidden_size / 2).astype(int) - _arch = [] - for i in range(rhombus_len): - max_hidden_size = np.ceil(max_hidden_size / 2).astype(int) - _arch.append(max_hidden_size) - arch = [second_max_hidden_size] * 
rhombus_len + arch + _arch - elif shape == "hexagon": - if self.n_layers % 2 == 0: - arch.append(max_hidden_size) - half_len = np.ceil(self.n_layers / 2).astype(int) - _arch = [] - for i in range(half_len): - _arch.append(max_hidden_size) - max_hidden_size = np.ceil(max_hidden_size / 2).astype(int) - arch = _arch[::-1] + arch + _arch[:-1] - elif shape == "triangle": - # reverse funnel - for i in range(self.n_layers): - arch.append(max_hidden_size) - max_hidden_size = np.ceil(max_hidden_size / 2).astype(int) - arch = arch[::-1] - elif shape == "stairs": - for i in range(1, self.n_layers+1): - arch.append(max_hidden_size) - if i % 2 == 0 or self.n_layers < 4: - max_hidden_size = np.ceil(max_hidden_size / 2).astype(int) - else: - # default to brick design - arch = tuple([max_hidden_size] * self.n_layers) - arch = tuple(arch) - return arch - def init_model(self, config, fidelity=None, rng=None): """ Function that returns the model initialized based on the configuration and fidelity """ rng = self.rng if rng is None else rng config = deepcopy(config.get_dictionary()) - shape = config["shape"] - max_hidden_dim = config["max_hidden_dim"] - config.pop("shape") - config.pop("max_hidden_dim") + depth = config["depth"] + width = config["width"] + config.pop("depth") + config.pop("width") + hidden_layers = [width] * depth model = MLPClassifier( **config, - hidden_layer_sizes=self._get_architecture(shape, max_hidden_dim), + hidden_layer_sizes=hidden_layers, activation="relu", - solver="sgd", - learning_rate="invscaling", - momentum=0.9, + solver="adam", max_iter=fidelity['iter'], # a fidelity being used during initialization random_state=rng ) diff --git a/hpobench/benchmarks/ml/rf_benchmark.py b/hpobench/benchmarks/ml/rf_benchmark.py index a57b7726..0cebcdf5 100644 --- a/hpobench/benchmarks/ml/rf_benchmark.py +++ b/hpobench/benchmarks/ml/rf_benchmark.py @@ -28,16 +28,19 @@ def get_configuration_space(seed=None): cs.add_hyperparameters([ CS.UniformIntegerHyperparameter( - 'max_depth', lower=1, upper=15, default_value=2, log=False + 'max_depth', lower=1, upper=30, default_value=10, log=False ), - CS.UniformIntegerHyperparameter( - 'min_samples_split', lower=2, upper=128, default_value=2, log=True + CS.UniformFloatHyperparameter( + 'min_samples_split', lower=0.05, upper=0.9, default_value=0.9, log=True ), + # CS.UniformIntegerHyperparameter( + # 'min_samples_split', lower=2, upper=20, default_value=2, log=False + # ), CS.UniformFloatHyperparameter( - 'max_features', lower=0.1, upper=0.9, default_value=0.5, log=False + 'max_features', lower=0.1, upper=1.0, default_value=0.5, log=False ), CS.UniformIntegerHyperparameter( - 'min_samples_leaf', lower=1, upper=64, default_value=1, log=True + 'min_samples_leaf', lower=1, upper=20, default_value=1, log=False ), ]) return cs @@ -59,7 +62,7 @@ def get_fidelity_space(seed=None, fidelity_choice=1): fidelity1 = dict( fixed=CS.Constant('n_estimators', value=100), variable=CS.UniformIntegerHyperparameter( - 'n_estimators', lower=1, upper=128, default_value=10, log=False + 'n_estimators', lower=16, upper=512, default_value=512, log=False ) ) fidelity2 = dict( diff --git a/hpobench/benchmarks/ml/xgboost_benchmark.py b/hpobench/benchmarks/ml/xgboost_benchmark.py index 4c77a92e..e8b25999 100644 --- a/hpobench/benchmarks/ml/xgboost_benchmark.py +++ b/hpobench/benchmarks/ml/xgboost_benchmark.py @@ -32,7 +32,7 @@ def get_configuration_space(seed=None): 'eta', lower=2**-10, upper=1., default_value=0.3, log=True ), # learning rate CS.UniformIntegerHyperparameter( - 
'max_depth', lower=1, upper=15, default_value=2, log=False + 'max_depth', lower=1, upper=30, default_value=10, log=False ), CS.UniformFloatHyperparameter( 'min_child_weight', lower=1., upper=2**7., default_value=1., log=True @@ -72,7 +72,7 @@ def get_fidelity_space(seed=None, fidelity_choice=1): fidelity1 = dict( fixed=CS.Constant('n_estimators', value=100), variable=CS.UniformIntegerHyperparameter( - 'n_estimators', lower=1, upper=128, default_value=10, log=False + 'n_estimators', lower=16, upper=512, default_value=512, log=False ) ) fidelity2 = dict( From 6916c9cfebfed82e32586228e1ebaea559162436 Mon Sep 17 00:00:00 2001 From: neeratyoy Date: Fri, 30 Jul 2021 01:14:53 +0200 Subject: [PATCH 042/147] Final RF space --- hpobench/benchmarks/ml/rf_benchmark.py | 6 +----- 1 file changed, 1 insertion(+), 5 deletions(-) diff --git a/hpobench/benchmarks/ml/rf_benchmark.py b/hpobench/benchmarks/ml/rf_benchmark.py index 0cebcdf5..cca69d0b 100644 --- a/hpobench/benchmarks/ml/rf_benchmark.py +++ b/hpobench/benchmarks/ml/rf_benchmark.py @@ -25,17 +25,13 @@ def get_configuration_space(seed=None): """Parameter space to be optimized --- contains the hyperparameters """ cs = CS.ConfigurationSpace(seed=seed) - cs.add_hyperparameters([ CS.UniformIntegerHyperparameter( - 'max_depth', lower=1, upper=30, default_value=10, log=False + 'max_depth', lower=1, upper=50, default_value=10, log=True ), CS.UniformFloatHyperparameter( 'min_samples_split', lower=0.05, upper=0.9, default_value=0.9, log=True ), - # CS.UniformIntegerHyperparameter( - # 'min_samples_split', lower=2, upper=20, default_value=2, log=False - # ), CS.UniformFloatHyperparameter( 'max_features', lower=0.1, upper=1.0, default_value=0.5, log=False ), From 8e5912bd19c9c5b6c859d1063943a697fbfb260a Mon Sep 17 00:00:00 2001 From: neeratyoy Date: Fri, 30 Jul 2021 01:59:44 +0200 Subject: [PATCH 043/147] Final XGB space --- hpobench/benchmarks/ml/xgboost_benchmark.py | 18 +++++------------- 1 file changed, 5 insertions(+), 13 deletions(-) diff --git a/hpobench/benchmarks/ml/xgboost_benchmark.py b/hpobench/benchmarks/ml/xgboost_benchmark.py index e8b25999..221d9d59 100644 --- a/hpobench/benchmarks/ml/xgboost_benchmark.py +++ b/hpobench/benchmarks/ml/xgboost_benchmark.py @@ -32,26 +32,17 @@ def get_configuration_space(seed=None): 'eta', lower=2**-10, upper=1., default_value=0.3, log=True ), # learning rate CS.UniformIntegerHyperparameter( - 'max_depth', lower=1, upper=30, default_value=10, log=False + 'max_depth', lower=6, upper=50, default_value=10, log=True ), CS.UniformFloatHyperparameter( 'min_child_weight', lower=1., upper=2**7., default_value=1., log=True ), CS.UniformFloatHyperparameter( - 'colsample_bytree', lower=0.01, upper=1., default_value=1. + 'colsample_bytree', lower=0.05, upper=1., default_value=1. ), - # CS.UniformFloatHyperparameter( - # 'colsample_bylevel', lower=0.01, upper=1., default_value=1. 
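# --- Editor's note (illustrative sketch, not part of this patch series) ----------------
# The XGBoost search space at this point in the series can be rebuilt standalone with
# ConfigSpace, e.g. to eyeball ranges and sampled values. The numbers mirror the hunk
# above; a later commit in this series drops min_child_weight and re-adjusts
# max_depth / colsample_bytree, so treat the class definition as authoritative.
import ConfigSpace as CS

xgb_cs = CS.ConfigurationSpace(seed=1)
xgb_cs.add_hyperparameters([
    CS.UniformFloatHyperparameter('eta', lower=2**-10, upper=1., default_value=0.3, log=True),
    CS.UniformIntegerHyperparameter('max_depth', lower=6, upper=50, default_value=10, log=True),
    CS.UniformFloatHyperparameter('min_child_weight', lower=1., upper=2**7., default_value=1., log=True),
    CS.UniformFloatHyperparameter('colsample_bytree', lower=0.05, upper=1., default_value=1.),
    CS.UniformFloatHyperparameter('reg_lambda', lower=2**-10, upper=2**10, default_value=1, log=True),
])
for cfg in xgb_cs.sample_configuration(3):
    print(cfg.get_dictionary())
# ----------------------------------------------------------------------------------------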
- # ), CS.UniformFloatHyperparameter( 'reg_lambda', lower=2**-10, upper=2**10, default_value=1, log=True - ), - # CS.UniformFloatHyperparameter( - # 'reg_alpha', lower=2**-10, upper=2**10, default_value=1, log=True - # ), - # CS.UniformFloatHyperparameter( - # 'subsample_per_it', lower=0.1, upper=1, default_value=1, log=False - # ) + ) ]) return cs @@ -72,7 +63,7 @@ def get_fidelity_space(seed=None, fidelity_choice=1): fidelity1 = dict( fixed=CS.Constant('n_estimators', value=100), variable=CS.UniformIntegerHyperparameter( - 'n_estimators', lower=16, upper=512, default_value=512, log=False + 'n_estimators', lower=50, upper=2000, default_value=1000, log=False ) ) fidelity2 = dict( @@ -105,6 +96,7 @@ def init_model(self, config, fidelity=None, rng=None): """ rng = rng if (rng is None and isinstance(rng, int)) else self.seed extra_args = dict( + booster="gbtree", n_estimators=fidelity['n_estimators'], objective="binary:logistic", random_state=rng, From 6968ac365483ac51985954d3092100395ef687bf Mon Sep 17 00:00:00 2001 From: neeratyoy Date: Fri, 30 Jul 2021 02:36:29 +0200 Subject: [PATCH 044/147] Final HistGB space --- hpobench/benchmarks/ml/histgb_benchmark.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/hpobench/benchmarks/ml/histgb_benchmark.py b/hpobench/benchmarks/ml/histgb_benchmark.py index 1860ff48..b431c056 100644 --- a/hpobench/benchmarks/ml/histgb_benchmark.py +++ b/hpobench/benchmarks/ml/histgb_benchmark.py @@ -30,13 +30,13 @@ def get_configuration_space(seed=None): cs.add_hyperparameters([ CS.UniformIntegerHyperparameter( - 'max_depth', lower=1, upper=15, default_value=2, log=False + 'max_depth', lower=6, upper=30, default_value=6, log=True ), CS.UniformIntegerHyperparameter( - 'min_samples_leaf', lower=1, upper=64, default_value=1, log=True + 'max_leaf_node', lower=2, upper=64, default_value=32, log=True ), CS.UniformFloatHyperparameter( - 'learning_rate', lower=2**-10, upper=1, default_value=0.3, log=True + 'eta', lower=2**-10, upper=1, default_value=0.1, log=True ), CS.UniformFloatHyperparameter( 'l2_regularization', lower=2**-10, upper=2**10, default_value=0.1, log=True @@ -61,7 +61,7 @@ def get_fidelity_space(seed=None, fidelity_choice=1): fidelity1 = dict( fixed=CS.Constant('n_estimators', value=100), variable=CS.UniformIntegerHyperparameter( - 'n_estimators', lower=16, upper=512, default_value=512, log=False + 'n_estimators', lower=100, upper=1000, default_value=1000, log=False ) ) fidelity2 = dict( From 79dd1f346cb0e6a16632cc6859a9a37e8dd598f7 Mon Sep 17 00:00:00 2001 From: neeratyoy Date: Mon, 2 Aug 2021 18:51:45 +0200 Subject: [PATCH 045/147] Finalizing RF, XGB, NN --- hpobench/benchmarks/ml/nn_benchmark.py | 6 ++++-- hpobench/benchmarks/ml/rf_benchmark.py | 13 +++++++++---- hpobench/benchmarks/ml/xgboost_benchmark.py | 8 +++----- 3 files changed, 16 insertions(+), 11 deletions(-) diff --git a/hpobench/benchmarks/ml/nn_benchmark.py b/hpobench/benchmarks/ml/nn_benchmark.py index 0bec70e4..2c92b371 100644 --- a/hpobench/benchmarks/ml/nn_benchmark.py +++ b/hpobench/benchmarks/ml/nn_benchmark.py @@ -28,8 +28,10 @@ def get_configuration_space(seed=None): cs = CS.ConfigurationSpace(seed=seed) cs.add_hyperparameters([ - CS.UniformIntegerHyperparameter('depth', default_value=3, lower=1, upper=3), - CS.UniformIntegerHyperparameter('width', default_value=64, lower=16, upper=256), + CS.UniformIntegerHyperparameter('depth', default_value=3, lower=1, upper=3, log=False), + CS.UniformIntegerHyperparameter( + 'width', default_value=64, lower=16, upper=1024, 
log=True + ), CS.UniformIntegerHyperparameter( 'batch_size', lower=4, upper=256, default_value=32, log=True ), diff --git a/hpobench/benchmarks/ml/rf_benchmark.py b/hpobench/benchmarks/ml/rf_benchmark.py index cca69d0b..70e02bdb 100644 --- a/hpobench/benchmarks/ml/rf_benchmark.py +++ b/hpobench/benchmarks/ml/rf_benchmark.py @@ -1,6 +1,7 @@ import numpy as np import ConfigSpace as CS from typing import Union +from copy import deepcopy from sklearn.ensemble import RandomForestClassifier from hpobench.benchmarks.ml.ml_benchmark_template import MLBenchmark @@ -29,11 +30,12 @@ def get_configuration_space(seed=None): CS.UniformIntegerHyperparameter( 'max_depth', lower=1, upper=50, default_value=10, log=True ), - CS.UniformFloatHyperparameter( - 'min_samples_split', lower=0.05, upper=0.9, default_value=0.9, log=True + CS.UniformIntegerHyperparameter( + 'min_samples_split', lower=2, upper=128, default_value=32, log=True ), + # the use of a float max_features is different than the sklearn usage CS.UniformFloatHyperparameter( - 'max_features', lower=0.1, upper=1.0, default_value=0.5, log=False + 'max_features', lower=0, upper=1.0, default_value=0.5, log=False ), CS.UniformIntegerHyperparameter( 'min_samples_leaf', lower=1, upper=20, default_value=1, log=False @@ -90,8 +92,11 @@ def init_model(self, config, fidelity=None, rng=None): """ Function that returns the model initialized based on the configuration and fidelity """ rng = self.rng if rng is None else rng + config = deepcopy(config.get_dictionary()) + n_features = self.train_X.shape[1] + config["max_features"] = int(np.rint(np.power(n_features, config["max_features"]))) model = RandomForestClassifier( - **config.get_dictionary(), + **config, n_estimators=fidelity['n_estimators'], # a fidelity being used during initialization bootstrap=True, random_state=rng diff --git a/hpobench/benchmarks/ml/xgboost_benchmark.py b/hpobench/benchmarks/ml/xgboost_benchmark.py index 221d9d59..0fe3f07c 100644 --- a/hpobench/benchmarks/ml/xgboost_benchmark.py +++ b/hpobench/benchmarks/ml/xgboost_benchmark.py @@ -32,13 +32,10 @@ def get_configuration_space(seed=None): 'eta', lower=2**-10, upper=1., default_value=0.3, log=True ), # learning rate CS.UniformIntegerHyperparameter( - 'max_depth', lower=6, upper=50, default_value=10, log=True + 'max_depth', lower=1, upper=50, default_value=10, log=True ), CS.UniformFloatHyperparameter( - 'min_child_weight', lower=1., upper=2**7., default_value=1., log=True - ), - CS.UniformFloatHyperparameter( - 'colsample_bytree', lower=0.05, upper=1., default_value=1. 
+ 'colsample_bytree', lower=0.1, upper=1., default_value=1., log=False ), CS.UniformFloatHyperparameter( 'reg_lambda', lower=2**-10, upper=2**10, default_value=1, log=True @@ -105,6 +102,7 @@ def init_model(self, config, fidelity=None, rng=None): if self.n_classes > 2: extra_args["objective"] = "multi:softmax" extra_args.update({"num_class": self.n_classes}) + model = xgb.XGBClassifier( **config.get_dictionary(), **extra_args From ca1e0d4c090ae51e7b30d2308d5f6229c9b970cf Mon Sep 17 00:00:00 2001 From: neeratyoy Date: Mon, 2 Aug 2021 23:44:00 +0200 Subject: [PATCH 046/147] TabularBenchmark edit to process only table and metadata --- hpobench/benchmarks/ml/tabular_benchmark.py | 44 ++++++++++----------- 1 file changed, 21 insertions(+), 23 deletions(-) diff --git a/hpobench/benchmarks/ml/tabular_benchmark.py b/hpobench/benchmarks/ml/tabular_benchmark.py index 528f2eef..9566d130 100644 --- a/hpobench/benchmarks/ml/tabular_benchmark.py +++ b/hpobench/benchmarks/ml/tabular_benchmark.py @@ -1,13 +1,11 @@ import os -import glom import json -import pickle import numpy as np import pandas as pd import ConfigSpace as CS -import pickle5 as pickle -from copy import deepcopy +from ConfigSpace.read_and_write import json as json_cs from typing import Union, List, Dict + from hpobench.benchmarks.ml.ml_benchmark_template import metrics @@ -16,25 +14,18 @@ def __init__(self, path: str, model: str, task_id: int, seed: Union[int, None] = assert os.path.isdir(path), "Not a valid path: {}".format(path) self.data_path = os.path.join(path, "{}_{}_data.parquet.gzip".format(model, task_id)) assert os.path.isfile(self.data_path) - self.config_path = os.path.join(path, "{}_{}_configs.pkl".format(model, task_id)) - assert os.path.isfile(self.config_path) - self.exp_args_path = os.path.join(path, "{}_{}.json".format(model, task_id)) - assert os.path.isfile(self.exp_args_path) + self.metadata_path = os.path.join(path, "{}_{}_metadata.json".format(model, task_id)) + assert os.path.isfile(self.metadata_path) self.seed = seed if seed is not None else np.random.randint(1, 10 ** 6) self.rng = np.random.RandomState(self.seed) self.table = self._load_parquet(self.data_path) - self.exp_args = self._load_json(self.exp_args_path) - self.config_spaces = self._load_pickle(self.config_path) - + self.metadata = self._load_json(self.metadata_path) + self.exp_args = self.metadata["exp_args"] + self.config_spaces = self.metadata["config_spaces"] + self.global_minimums = self.metadata["global_min"] self.x_cs = self.get_hyperparameter_space(seed=self.seed) self.z_cs = self.get_fidelity_space(seed=self.seed) - self.global_minimums = self.exp_args["global_min"] - - def _load_pickle(self, path): - with open(path, "rb") as f: - data = pickle.load(f) - return data def _load_parquet(self, path): data = pd.read_parquet(path) @@ -45,6 +36,13 @@ def _load_json(self, path): data = json.load(f) return data + def _preprocess_configspace(self, config_space): + """ Converts floats to np.float32 """ + for hp in config_space.get_hyperparameters(): + hp.sequence = tuple(np.array(hp.sequence).astype(np.float32)) + hp.default_value = np.float32(hp.default_value) + return config_space + def _total_number_of_configurations(self, space: str="hyperparameters") -> int: """ Returns the number of unique configurations in the parameter/fidelity space """ @@ -59,18 +57,18 @@ def _seeds_used(self): def get_hyperparameter_space(self, seed=None, original=False): cs = CS.ConfigurationSpace(seed=seed) - if original: - _cs = self.config_spaces['x'] - _cs = 
self.config_spaces['x_discrete'] + load_name = "x" if original else "x_discrete" + _cs = json_cs.read(self.config_spaces[load_name]) for hp in _cs.get_hyperparameters(): cs.add_hyperparameter(hp) + if not original: + cs = self._preprocess_configspace(cs) return cs def get_fidelity_space(self, seed=None, original=False): cs = CS.ConfigurationSpace(seed=seed) - if original: - _cs = self.config_spaces['z'] - _cs = self.config_spaces['z_discrete'] + load_name = "z" if original else "z_discrete" + _cs = json_cs.read(self.config_spaces[load_name]) for hp in _cs.get_hyperparameters(): cs.add_hyperparameter(hp) return cs From 0d70d366cfc0095ccd12819600cc263e95b6cf80 Mon Sep 17 00:00:00 2001 From: PhMueller Date: Wed, 11 Aug 2021 13:44:37 +0200 Subject: [PATCH 047/147] TabularBenchmark - Rearrange the benchmark. - Move data parts to a data manager. It is able to download the data from the web, if it is not present on the local machine. - Enforce the API structure --- extra_requirements/ml.json | 3 + hpobench/abstract_benchmark.py | 4 +- hpobench/benchmarks/ml/tabular_benchmark.py | 183 ++++++++++++-------- hpobench/dependencies/ml/__init__.py | 0 hpobench/util/data_manager.py | 99 +++++++++++ tests/test_data_manager.py | 13 ++ 6 files changed, 225 insertions(+), 77 deletions(-) create mode 100644 extra_requirements/ml.json create mode 100644 hpobench/dependencies/ml/__init__.py diff --git a/extra_requirements/ml.json b/extra_requirements/ml.json new file mode 100644 index 00000000..8a68761f --- /dev/null +++ b/extra_requirements/ml.json @@ -0,0 +1,3 @@ +{ + "ml_tabular_benchmarks": ["pandas>=1.0.0"] +} \ No newline at end of file diff --git a/hpobench/abstract_benchmark.py b/hpobench/abstract_benchmark.py index abbbcb22..5f141f6a 100644 --- a/hpobench/abstract_benchmark.py +++ b/hpobench/abstract_benchmark.py @@ -36,8 +36,8 @@ def __init__(self, rng: Union[int, np.random.RandomState, None] = None, **kwargs """ self.rng = rng_helper.get_rng(rng=rng) - self.configuration_space = self.get_configuration_space() - self.fidelity_space = self.get_fidelity_space() + self.configuration_space = self.get_configuration_space(self.rng.randint(0, 10000)) + self.fidelity_space = self.get_fidelity_space(self.rng.randint(0, 10000)) @abc.abstractmethod def objective_function(self, configuration: Union[ConfigSpace.Configuration, Dict], diff --git a/hpobench/benchmarks/ml/tabular_benchmark.py b/hpobench/benchmarks/ml/tabular_benchmark.py index 9566d130..b86eb426 100644 --- a/hpobench/benchmarks/ml/tabular_benchmark.py +++ b/hpobench/benchmarks/ml/tabular_benchmark.py @@ -1,40 +1,72 @@ -import os -import json -import numpy as np -import pandas as pd +from pathlib import Path +from typing import Union, List, Dict + +import ConfigSpace import ConfigSpace as CS +import numpy as np from ConfigSpace.read_and_write import json as json_cs -from typing import Union, List, Dict -from hpobench.benchmarks.ml.ml_benchmark_template import metrics +from hpobench.abstract_benchmark import AbstractBenchmark +from hpobench.dependencies.ml.ml_benchmark_template import metrics +from hpobench.util.data_manager import TabularDataManager + +class BaseTabularBenchmark(AbstractBenchmark): -class TabularBenchmark: - def __init__(self, path: str, model: str, task_id: int, seed: Union[int, None] = None): - assert os.path.isdir(path), "Not a valid path: {}".format(path) - self.data_path = os.path.join(path, "{}_{}_data.parquet.gzip".format(model, task_id)) - assert os.path.isfile(self.data_path) - self.metadata_path = os.path.join(path, 
"{}_{}_metadata.json".format(model, task_id)) - assert os.path.isfile(self.metadata_path) + def __init__(self, model: str, task_id: int, data_dir: Union[Path, str, None] = None, + rng: Union[int, np.random.RandomState, None] = None, **kwargs): + + super(BaseTabularBenchmark, self).__init__(rng=rng, **kwargs) + + self.task_id = task_id + self.model = model + + self.table, self.metadata = TabularDataManager(model, task_id, data_dir) - self.seed = seed if seed is not None else np.random.randint(1, 10 ** 6) - self.rng = np.random.RandomState(self.seed) - self.table = self._load_parquet(self.data_path) - self.metadata = self._load_json(self.metadata_path) self.exp_args = self.metadata["exp_args"] self.config_spaces = self.metadata["config_spaces"] self.global_minimums = self.metadata["global_min"] - self.x_cs = self.get_hyperparameter_space(seed=self.seed) - self.z_cs = self.get_fidelity_space(seed=self.seed) - def _load_parquet(self, path): - data = pd.read_parquet(path) - return data + @AbstractBenchmark.check_parameters + def objective_function(self, + configuration: Union[ConfigSpace.Configuration, Dict], + fidelity: Union[Dict, ConfigSpace.Configuration, None] = None, + rng: Union[np.random.RandomState, int, None] = None, + seed: Union[int, None] = None, + metric: Union[str, None] = 'acc', + **kwargs) -> Dict: - def _load_json(self, path): - with open(path, "r") as f: - data = json.load(f) - return data + result = self._objective(configuration, fidelity, seed, metric, evaluation="val") + return result + + @AbstractBenchmark.check_parameters + def objective_function_test(self, + configuration: Union[ConfigSpace.Configuration, Dict], + fidelity: Union[Dict, ConfigSpace.Configuration, None] = None, + rng: Union[np.random.RandomState, int, None] = None, + seed: Union[int, None] = None, + metric: Union[str, None] = 'acc', + **kwargs) -> Dict: + + result = self._objective(configuration, fidelity, seed, metric, evaluation="test") + return result + + # pylint: disable=arguments-differ + def get_configuration_space(self, seed: Union[int, None] = None) -> ConfigSpace.ConfigurationSpace: + raise NotImplementedError + + # pylint: disable=arguments-differ + def get_fidelity_space(self, seed: Union[int, None] = None) -> ConfigSpace.ConfigurationSpace: + raise NotImplementedError + + # pylint: disable=arguments-differ + def get_meta_information(self) -> Dict: + """ Returns the meta information for the benchmark """ + return {'name': 'BaseTabularBenchmark', + 'references': [], + 'task_id': self.task_id, + 'model': self.model + } def _preprocess_configspace(self, config_space): """ Converts floats to np.float32 """ @@ -47,7 +79,7 @@ def _total_number_of_configurations(self, space: str="hyperparameters") -> int: """ Returns the number of unique configurations in the parameter/fidelity space """ count = 1 - cs = self.x_cs if space == "hyperparameters" else self.z_cs + cs = self.configuration_space if space == "hyperparameters" else self.fidelity_space for hp in cs.get_hyperparameters(): count *= len(hp.sequence) return count @@ -55,29 +87,11 @@ def _total_number_of_configurations(self, space: str="hyperparameters") -> int: def _seeds_used(self): return self.table.seed.unique().tolist() - def get_hyperparameter_space(self, seed=None, original=False): - cs = CS.ConfigurationSpace(seed=seed) - load_name = "x" if original else "x_discrete" - _cs = json_cs.read(self.config_spaces[load_name]) - for hp in _cs.get_hyperparameters(): - cs.add_hyperparameter(hp) - if not original: - cs = 
self._preprocess_configspace(cs) - return cs - - def get_fidelity_space(self, seed=None, original=False): - cs = CS.ConfigurationSpace(seed=seed) - load_name = "z" if original else "z_discrete" - _cs = json_cs.read(self.config_spaces[load_name]) - for hp in _cs.get_hyperparameters(): - cs.add_hyperparameter(hp) - return cs - def sample_hyperparamer(self, n: int = 1) -> Union[CS.Configuration, List]: - return self.x_cs.sample_configuration(n) + return self.configuration_space.sample_configuration(n) def sample_fidelity(self, n: int = 1) -> Union[CS.Configuration, List]: - return self.z_cs.sample_configuration(n) + return self.fidelity_space.sample_configuration(n) def get_global_min(self, metric: str = "acc"): """ Retrieves the minimum (1 - metric) for train, validation and test splits @@ -88,13 +102,13 @@ def get_global_min(self, metric: str = "acc"): def get_max_fidelity(self) -> Dict: max_fidelity = dict() - for hp in self.z_cs.get_hyperparameters(): + for hp in self.fidelity_space.get_hyperparameters(): max_fidelity[hp.name] = np.sort(hp.sequence)[-1] return max_fidelity def get_fidelity_range(self): fidelities = [] - for hp in self.z_cs.get_hyperparameters(): + for hp in self.fidelity_space.get_hyperparameters(): if not isinstance(hp, CS.Constant) and len(hp.sequence) > 1: fidelities.append((hp.name, hp.sequence[0], hp.sequence[-1])) return fidelities @@ -119,17 +133,16 @@ def _objective( metric: Union[str, None] = "acc", evaluation: Union[str] = "" ) -> Dict: - self.x_cs.check_configuration(config) - self.z_cs.check_configuration(fidelity) - assert metric in list(metrics.keys()), \ - "metric not found among: {{{}}}".format(", ".join(list(metrics.keys()))) - score_key = "{}_scores".format(evaluation) - cost_key = "{}_scores".format(evaluation) + + metric_str = ', '.join(list(metrics.keys)) + assert metric in list(metrics.keys()), f"metric not found among: {metric_str}" + score_key = f"{evaluation}_scores" + cost_key = f"{evaluation}_scores" key_path = dict() - for name in np.sort(self.x_cs.get_hyperparameter_names()): + for name in np.sort(self.configuration_space.get_hyperparameter_names()): key_path[str(name)] = config[str(name)] - for name in np.sort(self.z_cs.get_hyperparameter_names()): + for name in np.sort(self.fidelity_space.get_hyperparameter_names()): key_path[str(name)] = fidelity[str(name)] if seed is not None: @@ -152,22 +165,42 @@ def _objective( result = dict(function_value=loss, cost=costs, info=info) return result - def objective_function( - self, - config: CS.Configuration, - fidelity: CS.Configuration, - seed: Union[int, None] = None, - metric: Union[str, None] = "acc" - ) -> Dict: - result = self._objective(config, fidelity, seed, metric, evaluation="val") - return result - def objective_function_test( - self, - config: CS.Configuration, - fidelity: CS.Configuration, - seed: Union[int, None] = None, - metric: Union[str, None] = "acc" - ) -> Dict: - result = self._objective(config, fidelity, seed, metric, evaluation="test") - return result +class TabularBenchmark(BaseTabularBenchmark): + def __init__(self, model: str, task_id: int, data_dir: Union[Path, str, None] = None, + rng: Union[int, np.random.RandomState, None] = None, **kwargs): + super(TabularBenchmark, self).__init__(model, task_id, data_dir, rng, **kwargs) + + # pylint: disable=arguments-differ + def get_configuration_space(self, seed: Union[int, None] = None) -> CS.ConfigurationSpace: + cs = json_cs.read(self.config_spaces['x_discrete']) + cs = self._preprocess_configspace(cs) + cs.seed(seed) + return cs 
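# --- Editor's note (illustrative sketch, not part of this patch series) ----------------
# The metadata JSON is expected to hold each space as a string written by ConfigSpace's
# read_and_write module; a minimal round-trip of that mechanism is shown below. The
# ordinal grid is a made-up stand-in for the discretized spaces stored under
# 'x_discrete' / 'z_discrete'.
import ConfigSpace as CS
from ConfigSpace.read_and_write import json as json_cs

cs = CS.ConfigurationSpace(seed=1)
cs.add_hyperparameter(CS.OrdinalHyperparameter('max_depth', sequence=[1, 2, 5, 10, 50]))
serialized = json_cs.write(cs)        # str -- what a metadata entry would look like
restored = json_cs.read(serialized)   # what get_configuration_space() reads back above
restored.seed(123)
print(restored.sample_configuration())
# ----------------------------------------------------------------------------------------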
+ + # pylint: disable=arguments-differ + def get_fidelity_space(self, seed: Union[int, None] = None) -> ConfigSpace.ConfigurationSpace: + cs = json_cs.read(self.config_spaces['z_discrete']) + cs.seed(seed=seed) + return cs + + +class OriginalTabularBenchmark(BaseTabularBenchmark): + def __init__(self, model: str, task_id: int, data_dir: Union[Path, str, None] = None, + rng: Union[int, np.random.RandomState, None] = None, **kwargs): + super(OriginalTabularBenchmark, self).__init__(model, task_id, data_dir, rng, **kwargs) + + # pylint: disable=arguments-differ + def get_configuration_space(self, seed: Union[int, None] = None) -> ConfigSpace.ConfigurationSpace: + cs = json_cs.read(self.config_spaces['x']) + cs.seed(seed) + return cs + + # pylint: disable=arguments-differ + def get_fidelity_space(self, seed: Union[int, None] = None) -> ConfigSpace.ConfigurationSpace: + cs = json_cs.read(self.config_spaces['z']) + cs.seed(seed=seed) + return cs + + +__all__ = [TabularBenchmark, OriginalTabularBenchmark] diff --git a/hpobench/dependencies/ml/__init__.py b/hpobench/dependencies/ml/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/hpobench/util/data_manager.py b/hpobench/util/data_manager.py index 6e401215..371cbd3c 100644 --- a/hpobench/util/data_manager.py +++ b/hpobench/util/data_manager.py @@ -32,6 +32,12 @@ except ImportError: print("oslo_concurrency not installed, can't download datasets for nasbench201 (not needed for containers)") +try: + import pandas as pd +except ImportError: + print("pandas is not installed, can't download datasets for the ml.tabular_benchmarks (not needed for containers)") + + import hpobench @@ -66,6 +72,50 @@ def create_save_directory(self, save_dir: Path): self.logger.debug(f'Create directory {save_dir}') save_dir.mkdir(parents=True, exist_ok=True) + @lockutils.synchronized('not_thread_process_safe', external=True, + lock_path=f'{hpobench.config_file.cache_dir}/lock_download_file', delay=0.5) + def _download_file_with_progressbar(self, data_url: str, data_file: Path): + data_file = Path(data_file) + + if data_file.exists(): + self.logger.info('Data File already exists. 
Skip downloading.') + return + + self.logger.info(f"Download the file from {data_url} to {data_file}") + data_file.parent.mkdir(parents=True, exist_ok=True) + + from tqdm import tqdm + r = requests.get(data_url, stream=True) + with open(data_file, 'wb') as f: + total_length = int(r.headers.get('content-length')) + for chunk in tqdm(r.iter_content(chunk_size=1024), + unit_divisor=1024, unit='kB', total=int(total_length / 1024) + 1): + if chunk: + _ = f.write(chunk) + f.flush() + self.logger.info("Finished downloading") + + @lockutils.synchronized('not_thread_process_safe', external=True, + lock_path=f'{hpobench.config_file.cache_dir}/lock_unzip_file', delay=0.5) + def _untar_data(self, compressed_file: Path, save_dir: Union[Path, None] = None): + self.logger.debug('Extract the compressed data') + with tarfile.open(compressed_file, 'r') as fh: + if save_dir is None: + save_dir = compressed_file.parent + fh.extractall(save_dir) + self.logger.debug(f'Successfully extracted the data to {save_dir}') + + @lockutils.synchronized('not_thread_process_safe', external=True, + lock_path=f'{hpobench.config_file.cache_dir}/lock_unzip_file', delay=0.5) + def _unzip_data(self, compressed_file: Path, save_dir: Union[Path, None] = None): + self.logger.debug('Extract the compressed data') + with ZipFile(compressed_file, 'r') as fh: + if save_dir is None: + save_dir = compressed_file.parent + fh.extractall(save_dir) + self.logger.debug(f'Successfully extracted the data to {save_dir}') + + class HoldoutDataManager(DataManager): """ Base Class for loading and managing the Holdout data sets. @@ -874,3 +924,52 @@ def _load(self) -> Tuple[np.ndarray, np.ndarray, np.ndarray, np.ndarray, np.ndar X_tst, y_tst = data[n_trn + n_val:, 1:], data[n_trn + n_val:, 0] return X_trn, y_trn, X_val, y_val, X_tst, y_tst + + +class TabularDataManager(DataManager): + def __init__(self, model: str, task_id: [int, str], data_dir: [str, Path, None] = None): + super(TabularDataManager, self).__init__() + + assert model in ['lr', 'svm'] + + self.model = model + self.task_id = str(task_id) + + url_svm = 'https://figshare.com/s/5a0929ad9b2ccd8dda58' + url_lr = 'https://ndownloader.figshare.com/files/29027112?private_link=d644493a93dbab4b4ee1' + + self.url_to_use = url_svm if model == 'svm' else url_lr + + if data_dir is None: + data_dir = hpobench.config_file.data_dir / "TabularData" + + self._save_dir = Path(data_dir) + self.create_save_directory(self._save_dir) + + self.parquet_file = self._save_dir / self.task_id / f'{self.model}_{self.task_id}_data.parquet.gzip' + self.metadata_file = self._save_dir / self.task_id / f'{self.model}_{self.task_id}_metadata.json' + + def load(self): + # Can we directly load the files? + if self.parquet_file.exists() and self.metadata_file.exists(): + table = self._load_parquet(self.parquet_file) + metadata = self._load_json(self.metadata_file) + return table, metadata + + # We have to download the entire zip file and etract then extract the parquet file. 
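# --- Editor's note (illustrative sketch, not part of this patch series) ----------------
# The download-once-then-read-locally pattern used here, reduced to plain requests,
# zipfile and pandas. The URL, function name and directory layout are placeholders
# (the real values live in TabularDataManager above); it assumes the archive already
# contains the per-task folder holding the parquet and metadata files.
import json
import zipfile
from pathlib import Path

import pandas as pd
import requests

def fetch_table(url: str, save_dir: Path, model: str, task_id: str):
    save_dir.mkdir(parents=True, exist_ok=True)
    parquet = save_dir / task_id / f"{model}_{task_id}_data.parquet.gzip"
    metadata = save_dir / task_id / f"{model}_{task_id}_metadata.json"
    if not (parquet.exists() and metadata.exists()):   # only hit the network on a cold cache
        archive = save_dir / f"{model}.zip"
        with requests.get(url, stream=True) as r, open(archive, "wb") as f:
            for chunk in r.iter_content(chunk_size=1024):
                f.write(chunk)
        with zipfile.ZipFile(archive, "r") as zf:
            zf.extractall(save_dir)
    return pd.read_parquet(parquet), json.loads(metadata.read_text())

# e.g. table, meta = fetch_table("https://example.org/lr.zip", Path("/tmp/TabularData"), "lr", "3")
# ----------------------------------------------------------------------------------------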
+ self._download_file_with_progressbar(self.url_to_use, self._save_dir / f'{self.model}.zip') + self._unzip_data(self._save_dir / f'{self.model}.zip', self._save_dir) + table = self._load_parquet(self.parquet_file) + metadata = self._load_json(self.metadata_file) + return table, metadata + + @staticmethod + def _load_parquet(path): + data = pd.read_parquet(path) + return data + + @staticmethod + def _load_json(path): + with open(path, "r") as f: + data = json.load(f) + return data diff --git a/tests/test_data_manager.py b/tests/test_data_manager.py index 3ea3ecc4..fd57b627 100644 --- a/tests/test_data_manager.py +++ b/tests/test_data_manager.py @@ -99,3 +99,16 @@ def test_boston_data(): assert 0 < len(x_test) == len(y_test) assert 0 < len(x_valid) == len(y_valid) assert len(y_valid) < len(x_train) == len(y_train) + + +def test_tabular_datamanager(): + from hpobench.util.data_manager import TabularDataManager + dm = TabularDataManager(model='lr', + task_id='3') + + table, meta_data = dm.load() + + assert (hpobench.config_file.data_dir / "TabularData" / str(3) / f'lr_3_data.parquet.gzip').exists() + assert (hpobench.config_file.data_dir / "TabularData" / str(3) / f'lr_3_metadata.json').exists() + + table_2, meta_data_2 = dm.load() From 12ebce825d44aa00fe1d0b50d572abc7fd4fc6c8 Mon Sep 17 00:00:00 2001 From: PhMueller Date: Wed, 11 Aug 2021 13:47:12 +0200 Subject: [PATCH 048/147] Pycodestyle --- hpobench/benchmarks/ml/tabular_benchmark.py | 21 ++++++++++----------- hpobench/util/data_manager.py | 1 - 2 files changed, 10 insertions(+), 12 deletions(-) diff --git a/hpobench/benchmarks/ml/tabular_benchmark.py b/hpobench/benchmarks/ml/tabular_benchmark.py index b86eb426..dd07ec02 100644 --- a/hpobench/benchmarks/ml/tabular_benchmark.py +++ b/hpobench/benchmarks/ml/tabular_benchmark.py @@ -1,7 +1,6 @@ from pathlib import Path from typing import Union, List, Dict -import ConfigSpace import ConfigSpace as CS import numpy as np from ConfigSpace.read_and_write import json as json_cs @@ -29,8 +28,8 @@ def __init__(self, model: str, task_id: int, data_dir: Union[Path, str, None] = @AbstractBenchmark.check_parameters def objective_function(self, - configuration: Union[ConfigSpace.Configuration, Dict], - fidelity: Union[Dict, ConfigSpace.Configuration, None] = None, + configuration: Union[CS.Configuration, Dict], + fidelity: Union[Dict, CS.Configuration, None] = None, rng: Union[np.random.RandomState, int, None] = None, seed: Union[int, None] = None, metric: Union[str, None] = 'acc', @@ -41,8 +40,8 @@ def objective_function(self, @AbstractBenchmark.check_parameters def objective_function_test(self, - configuration: Union[ConfigSpace.Configuration, Dict], - fidelity: Union[Dict, ConfigSpace.Configuration, None] = None, + configuration: Union[CS.Configuration, Dict], + fidelity: Union[Dict, CS.Configuration, None] = None, rng: Union[np.random.RandomState, int, None] = None, seed: Union[int, None] = None, metric: Union[str, None] = 'acc', @@ -52,11 +51,11 @@ def objective_function_test(self, return result # pylint: disable=arguments-differ - def get_configuration_space(self, seed: Union[int, None] = None) -> ConfigSpace.ConfigurationSpace: + def get_configuration_space(self, seed: Union[int, None] = None) -> CS.ConfigurationSpace: raise NotImplementedError # pylint: disable=arguments-differ - def get_fidelity_space(self, seed: Union[int, None] = None) -> ConfigSpace.ConfigurationSpace: + def get_fidelity_space(self, seed: Union[int, None] = None) -> CS.ConfigurationSpace: raise NotImplementedError # pylint: 
disable=arguments-differ @@ -75,7 +74,7 @@ def _preprocess_configspace(self, config_space): hp.default_value = np.float32(hp.default_value) return config_space - def _total_number_of_configurations(self, space: str="hyperparameters") -> int: + def _total_number_of_configurations(self, space: str = "hyperparameters") -> int: """ Returns the number of unique configurations in the parameter/fidelity space """ count = 1 @@ -179,7 +178,7 @@ def get_configuration_space(self, seed: Union[int, None] = None) -> CS.Configura return cs # pylint: disable=arguments-differ - def get_fidelity_space(self, seed: Union[int, None] = None) -> ConfigSpace.ConfigurationSpace: + def get_fidelity_space(self, seed: Union[int, None] = None) -> CS.ConfigurationSpace: cs = json_cs.read(self.config_spaces['z_discrete']) cs.seed(seed=seed) return cs @@ -191,13 +190,13 @@ def __init__(self, model: str, task_id: int, data_dir: Union[Path, str, None] = super(OriginalTabularBenchmark, self).__init__(model, task_id, data_dir, rng, **kwargs) # pylint: disable=arguments-differ - def get_configuration_space(self, seed: Union[int, None] = None) -> ConfigSpace.ConfigurationSpace: + def get_configuration_space(self, seed: Union[int, None] = None) -> CS.ConfigurationSpace: cs = json_cs.read(self.config_spaces['x']) cs.seed(seed) return cs # pylint: disable=arguments-differ - def get_fidelity_space(self, seed: Union[int, None] = None) -> ConfigSpace.ConfigurationSpace: + def get_fidelity_space(self, seed: Union[int, None] = None) -> CS.ConfigurationSpace: cs = json_cs.read(self.config_spaces['z']) cs.seed(seed=seed) return cs diff --git a/hpobench/util/data_manager.py b/hpobench/util/data_manager.py index 371cbd3c..3063b2e7 100644 --- a/hpobench/util/data_manager.py +++ b/hpobench/util/data_manager.py @@ -116,7 +116,6 @@ def _unzip_data(self, compressed_file: Path, save_dir: Union[Path, None] = None) self.logger.debug(f'Successfully extracted the data to {save_dir}') - class HoldoutDataManager(DataManager): """ Base Class for loading and managing the Holdout data sets. 
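# --- Editor's note (illustrative sketch, not part of this patch series) ----------------
# A toy version of the row lookup behind TabularBenchmark._search_dataframe as finalized
# in the patches above: every non-"result" column is matched exactly via a boolean mask,
# and the stored "result" dict of the single matching row is returned. The two-row table
# below is fabricated purely for illustration.
import numpy as np
import pandas as pd

table = pd.DataFrame([
    {"max_depth": 5, "subsample": 0.1, "seed": 1, "result": {"info": {"val_scores": {"acc": 0.70}}}},
    {"max_depth": 5, "subsample": 1.0, "seed": 1, "result": {"info": {"val_scores": {"acc": 0.90}}}},
])

def search_dataframe(row_dict, df):
    mask = np.array([True] * df.shape[0])
    for param in df.drop("result", axis=1).columns:   # match on every key column
        mask *= df[param].values == row_dict[param]
    idx = np.where(mask)[0]                           # positions of matching rows
    return df.iloc[idx[0]]["result"] if len(idx) == 1 else None

print(search_dataframe({"max_depth": 5, "subsample": 1.0, "seed": 1}, table))
# ----------------------------------------------------------------------------------------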
From 873781e878d349497c6db34d309bf6c3bb1b817f Mon Sep 17 00:00:00 2001 From: PhMueller Date: Wed, 11 Aug 2021 13:48:13 +0200 Subject: [PATCH 049/147] Flake8 --- hpobench/util/data_manager.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/hpobench/util/data_manager.py b/hpobench/util/data_manager.py index 3063b2e7..9e6f8fb9 100644 --- a/hpobench/util/data_manager.py +++ b/hpobench/util/data_manager.py @@ -13,19 +13,19 @@ import abc import gzip +import json import logging import pickle import tarfile -import requests - from io import BytesIO from pathlib import Path +from time import time from typing import Tuple, Dict, Any, Union from urllib.request import urlretrieve, urlopen from zipfile import ZipFile -from time import time import numpy as np +import requests try: from oslo_concurrency import lockutils From 532a905ba496f773cd5057969637f3f14a4563bf Mon Sep 17 00:00:00 2001 From: PhMueller Date: Wed, 11 Aug 2021 17:39:07 +0200 Subject: [PATCH 050/147] Adapt ML Benchmark Template to fit with current API --- hpobench/abstract_benchmark.py | 7 +- hpobench/dependencies/ml/data_manager.py | 163 ++++++++++++ .../ml/ml_benchmark_template.py | 245 +++++------------- 3 files changed, 231 insertions(+), 184 deletions(-) create mode 100644 hpobench/dependencies/ml/data_manager.py rename hpobench/{benchmarks => dependencies}/ml/ml_benchmark_template.py (52%) diff --git a/hpobench/abstract_benchmark.py b/hpobench/abstract_benchmark.py index 5f141f6a..c9db4216 100644 --- a/hpobench/abstract_benchmark.py +++ b/hpobench/abstract_benchmark.py @@ -226,17 +226,12 @@ def get_configuration_space(seed: Union[int, None] = None) -> ConfigSpace.Config @staticmethod @abc.abstractmethod - def get_fidelity_space( - seed: Union[int, None] = None, fidelity_choice: Union[int, None] = None - ) -> ConfigSpace.ConfigurationSpace: + def get_fidelity_space(seed: Union[int, None] = None) -> ConfigSpace.ConfigurationSpace: """ Defines the available fidelity parameters as a "fidelity space" for each benchmark. Parameters ---------- seed: int, None Seed for the fidelity space. 
- fidelity_choice: int, None - integer value to choose the type of fidelity space - Returns ------- ConfigSpace.ConfigurationSpace diff --git a/hpobench/dependencies/ml/data_manager.py b/hpobench/dependencies/ml/data_manager.py new file mode 100644 index 00000000..9cc7f5f7 --- /dev/null +++ b/hpobench/dependencies/ml/data_manager.py @@ -0,0 +1,163 @@ +import openml +import numpy as np +import pandas as pd +from typing import Union +from pathlib import Path + +from sklearn.impute import SimpleImputer +from sklearn.pipeline import make_pipeline +from sklearn.utils import check_random_state +from sklearn.compose import ColumnTransformer +from sklearn.preprocessing import OneHotEncoder +from sklearn.preprocessing import StandardScaler +from sklearn.model_selection import train_test_split + + +from hpobench.util.data_manager import DataManager + + +from hpobench import config_file + + +class OpenMLDataManager(DataManager): + + def __init__(self, task_id: int, + valid_size: Union[float, None] = 0.33, + data_path: Union[str, Path, None] = None, + global_seed: Union[int, None] = 1): + + self.task_id = task_id + self.global_seed = global_seed + + self.valid_size = valid_size + + self.train_X = None + self.valid_X = None + self.test_X = None + self.train_y = None + self.valid_y = None + self.test_y = None + self.train_idx = None + self.test_idx = None + self.task = None + self.dataset = None + self.preprocessor = None + self.lower_bound_train_size = None + self.n_classes = None + + if data_path is None: + data_path = config_file.data_dir / "OpenML" + + self.data_path = data_path + super(OpenMLDataManager, self).__init__() + + def load(self, valid_size=None, verbose=False): + """Fetches data from OpenML and initializes the train-validation-test data splits + + The validation set is fixed till this function is called again or explicitly altered + """ + # fetches task + self.task = openml.tasks.get_task(self.task_id, download_data=False) + self.n_classes = len(self.task.class_labels) + + # fetches dataset + self.dataset = openml.datasets.get_dataset(self.task.dataset_id, download_data=False) + if verbose: + self.logger.debug(self.task) + self.logger.debug(self.dataset) + + # check if the path to data splits is valid + if self.data_path is not None and self.data_path.is_dir(): + data_path = self.data_path / str(self.task_id) + required_file_list = [ + ("train", "x"), ("train", "y"), + ("valid", "x"), ("valid", "y"), + ("test", "x"), ("test", "y") + ] + for files in required_file_list: + data_str = "{}_{}.parquet.gzip".format(*files) + if (data_path / data_str).exists(): + raise FileNotFoundError("{} not found!".format(data_str.format(*files))) + # ignore the remaining data loaders and preprocessors as valid data splits available + return + + # loads full data + X, y, categorical_ind, feature_names = self.dataset.get_data( + target=self.task.target_name, dataset_format="dataframe" + ) + categorical_ind = np.array(categorical_ind) + (cat_idx,) = np.where(categorical_ind) + (cont_idx,) = np.where(~categorical_ind) + + # splitting dataset into train and test (10% test) + # train-test split is fixed for a task and its associated dataset (from OpenML) + self.train_idx, self.test_idx = self.task.get_train_test_split_indices() + train_x = X.iloc[self.train_idx] + train_y = y.iloc[self.train_idx] + self.test_X = X.iloc[self.test_idx] + self.test_y = y.iloc[self.test_idx] + + # splitting training into training and validation + # validation set is fixed as per the global seed independent of the benchmark seed + 
valid_size = self.valid_size if valid_size is None else valid_size + self.train_X, self.valid_X, self.train_y, self.valid_y = train_test_split( + train_x, train_y, test_size=valid_size, shuffle=True, stratify=train_y, + random_state=check_random_state(self.global_seed) + ) + + # preprocessor to handle missing values, categorical columns encodings, + # and scaling numeric columns + self.preprocessor = make_pipeline( + ColumnTransformer([ + ( + "cat", + make_pipeline(SimpleImputer(strategy="most_frequent"), + OneHotEncoder(sparse=False, handle_unknown="ignore")), + cat_idx.tolist(), + ), + ( + "cont", + make_pipeline(SimpleImputer(strategy="median"), + StandardScaler()), + cont_idx.tolist(), + ) + ]) + ) + if verbose: + self.logger.debug("Shape of data pre-preprocessing: {}".format(self.train_X.shape)) + + # preprocessor fit only on the training set + self.train_X = self.preprocessor.fit_transform(self.train_X) + # applying preprocessor built on the training set, across validation and test splits + self.valid_X = self.preprocessor.transform(self.valid_X) + self.test_X = self.preprocessor.transform(self.test_X) + # converting boolean labels to strings + self.train_y = self._convert_labels(self.train_y) + self.valid_y = self._convert_labels(self.valid_y) + self.test_y = self._convert_labels(self.test_y) + + # Similar to (https://arxiv.org/pdf/1605.07079.pdf) + # use 10 times the number of classes as lower bound for the dataset fraction + self.lower_bound_train_size = (10 * self.n_classes) / self.train_X.shape[0] + self.lower_bound_train_size = np.max((1 / 512, self.lower_bound_train_size)) + + if verbose: + self.logger.debug("Shape of data post-preprocessing: {}".format(self.train_X.shape), "\n") + self.logger.debug("\nTraining data (X, y): ({}, {})".format(self.train_X.shape, self.train_y.shape)) + self.logger.debug("Validation data (X, y): ({}, {})".format(self.valid_X.shape, self.valid_y.shape)) + self.logger.debug("Test data (X, y): ({}, {})".format(self.test_X.shape, self.test_y.shape)) + self.logger.debug("\nData loading complete!\n") + return + + @staticmethod + def _convert_labels(labels): + """Converts boolean labels (if exists) to strings + """ + label_types = list(map(lambda x: isinstance(x, bool), labels)) + if np.all(label_types): + _labels = list(map(lambda x: str(x), labels)) + if isinstance(labels, pd.Series): + labels = pd.Series(_labels, index=labels.index) + elif isinstance(labels, np.array): + labels = np.array(labels) + return labels diff --git a/hpobench/benchmarks/ml/ml_benchmark_template.py b/hpobench/dependencies/ml/ml_benchmark_template.py similarity index 52% rename from hpobench/benchmarks/ml/ml_benchmark_template.py rename to hpobench/dependencies/ml/ml_benchmark_template.py index 83b39957..54029736 100644 --- a/hpobench/benchmarks/ml/ml_benchmark_template.py +++ b/hpobench/dependencies/ml/ml_benchmark_template.py @@ -5,19 +5,13 @@ import pandas as pd import ConfigSpace as CS from typing import Union, Dict +from pathlib import Path -from sklearn.impute import SimpleImputer -from sklearn.pipeline import make_pipeline -from sklearn.utils import check_random_state -from sklearn.compose import ColumnTransformer -from sklearn.preprocessing import OneHotEncoder -from sklearn.preprocessing import StandardScaler -from sklearn.model_selection import train_test_split from sklearn.metrics import make_scorer, accuracy_score, balanced_accuracy_score, \ precision_score, f1_score from hpobench.abstract_benchmark import AbstractBenchmark - +from 
hpobench.dependencies.ml.data_manager import OpenMLDataManager metrics = dict( acc=accuracy_score, @@ -25,6 +19,7 @@ f1=f1_score, precision=precision_score, ) + metrics_kwargs = dict( acc=dict(), bal_acc=dict(), @@ -39,16 +34,19 @@ class MLBenchmark(AbstractBenchmark): def __init__( self, task_id: Union[int, None] = None, - seed: Union[int, None] = None, # Union[np.random.RandomState, int, None] = None, + rng: Union[np.random.RandomState, int, None] = None, valid_size: float = 0.33, - fidelity_choice: int = 1, - data_path: Union[str, None] = None, + data_path: Union[str, Path, None] = None, global_seed: int = 1 ): - self.seed = seed if seed is not None else np.random.randint(1, 10 ** 6) - self.rng = check_random_state(self.seed) + super(MLBenchmark, self).__init__(rng=rng) + + if isinstance(rng, int): + self.seed = rng + else: + self.seed = self.rng.randint(1, 10**6) + self.global_seed = global_seed # used for fixed training-validation splits - super(MLBenchmark, self).__init__(rng=seed) self.task_id = task_id self.valid_size = valid_size @@ -57,25 +55,27 @@ def __init__( self.scorers[k] = make_scorer(v, **metrics_kwargs[k]) self.data_path = data_path + dm = OpenMLDataManager(task_id, valid_size, data_path, global_seed) + dm.load() + # Data variables - self.train_X = None - self.valid_X = None - self.test_X = None - self.train_y = None - self.valid_y = None - self.test_y = None - self.train_idx = None - self.test_idx = None - self.task = None - self.dataset = None - self.preprocessor = None - self.lower_bound_train_size = None - self.load_data_from_openml() + self.train_X = dm.train_X + self.valid_X = dm.valid_X + self.test_X = dm.test_X + self.train_y = dm.train_y + self.valid_y = dm.valid_y + self.test_y = dm.test_y + self.train_idx = dm.train_idx + self.test_idx = dm.test_idx + self.task = dm.task + self.dataset = dm.dataset + self.preprocessor = dm.preprocessor + self.lower_bound_train_size = dm.lower_bound_train_size + self.n_classes = dm.n_classes # Observation and fidelity spaces - self.fidelity_choice = fidelity_choice - self.z_cs = self.get_fidelity_space(self.seed, self.fidelity_choice) - self.x_cs = self.get_configuration_space(self.seed) + self.fidelity_space = self.get_fidelity_space(self.seed) + self.configuration_space = self.get_configuration_space(self.seed) @staticmethod def get_configuration_space(seed=None): @@ -84,7 +84,7 @@ def get_configuration_space(seed=None): raise NotImplementedError() @staticmethod - def get_fidelity_space(seed=None, fidelity_choice=None): + def get_fidelity_space(seed=None): """Fidelity space available --- specifies the fidelity dimensions If fidelity_choice is 0 @@ -98,130 +98,35 @@ def get_fidelity_space(seed=None, fidelity_choice=None): """ raise NotImplementedError() + def get_meta_information(self): + """ Returns the meta information for the benchmark """ + return { + 'name': 'Support Vector Machine', + 'shape of train data': self.train_X.shape, + 'shape of test data': self.test_X.shape, + 'shape of valid data': self.valid_X.shape, + 'initial random seed': self.seed, + 'task_id': self.task_id + } + + def init_model(self, config, fidelity=None, rng=None): + """ Function that returns the model initialized based on the configuration and fidelity + """ + raise NotImplementedError() + def get_config(self, size=None): """Samples configuration(s) from the (hyper) parameter space """ if size is None: # return only one config - return self.x_cs.sample_configuration() - return [self.x_cs.sample_configuration() for i in range(size)] + return 
self.configuration_space.sample_configuration() + return [self.configuration_space.sample_configuration() for i in range(size)] def get_fidelity(self, size=None): """Samples candidate fidelities from the fidelity space """ if size is None: # return only one config - return self.z_cs.sample_configuration() - return [self.z_cs.sample_configuration() for i in range(size)] - - def _convert_labels(self, labels): - """Converts boolean labels (if exists) to strings - """ - label_types = list(map(lambda x: isinstance(x, bool), labels)) - if np.all(label_types): - _labels = list(map(lambda x: str(x), labels)) - if isinstance(labels, pd.Series): - labels = pd.Series(_labels, index=labels.index) - elif isinstance(labels, np.array): - labels = np.array(labels) - return labels - - def load_data_from_openml(self, valid_size=None, verbose=False): - """Fetches data from OpenML and initializes the train-validation-test data splits - - The validation set is fixed till this function is called again or explicitly altered - """ - # fetches task - self.task = openml.tasks.get_task(self.task_id, download_data=False) - self.n_classes = len(self.task.class_labels) - # fetches dataset - self.dataset = openml.datasets.get_dataset(self.task.dataset_id, download_data=False) - if verbose: - print(self.task, '\n') - print(self.dataset, '\n') - - # check if the path to data splits is valid - if self.data_path is not None and os.path.isdir(self.data_path): - data_path = os.path.join(self.data_path, str(self.task_id)) - data_str = os.path.join(data_path, "{}_{}.parquet.gzip") - required_file_list = [ - ("train", "x"), ("train", "y"), - ("valid", "x"), ("valid", "y"), - ("test", "x"), ("test", "y") - ] - for files in required_file_list: - if not os.path.isfile(data_str.format("train", "x")): - raise FileNotFoundError("{} not found!".format(data_str.format(*files))) - # ignore the remaining data loaders and preprocessors as valid data splits available - return - - # loads full data - X, y, categorical_ind, feature_names = self.dataset.get_data( - target=self.task.target_name, dataset_format="dataframe" - ) - categorical_ind = np.array(categorical_ind) - (cat_idx,) = np.where(categorical_ind) - (cont_idx,) = np.where(~categorical_ind) - - # splitting dataset into train and test (10% test) - # train-test split is fixed for a task and its associated dataset (from OpenML) - self.train_idx, self.test_idx = self.task.get_train_test_split_indices() - train_x = X.iloc[self.train_idx] - train_y = y.iloc[self.train_idx] - self.test_X = X.iloc[self.test_idx] - self.test_y = y.iloc[self.test_idx] - - # splitting training into training and validation - # validation set is fixed as per the global seed independent of the benchmark seed - valid_size = self.valid_size if valid_size is None else valid_size - self.train_X, self.valid_X, self.train_y, self.valid_y = train_test_split( - train_x, train_y, test_size=valid_size, shuffle=True, stratify=train_y, - random_state=check_random_state(self.global_seed) - ) - - # preprocessor to handle missing values, categorical columns encodings, - # and scaling numeric columns - self.preprocessor = make_pipeline( - ColumnTransformer([ - ( - "cat", - make_pipeline(SimpleImputer(strategy="most_frequent"), - OneHotEncoder(sparse=False, handle_unknown="ignore")), - cat_idx.tolist(), - ), - ( - "cont", - make_pipeline(SimpleImputer(strategy="median"), - StandardScaler()), - cont_idx.tolist(), - ) - ]) - ) - if verbose: - print("Shape of data pre-preprocessing: {}".format(self.train_X.shape)) - - # 
preprocessor fit only on the training set - self.train_X = self.preprocessor.fit_transform(self.train_X) - # applying preprocessor built on the training set, across validation and test splits - self.valid_X = self.preprocessor.transform(self.valid_X) - self.test_X = self.preprocessor.transform(self.test_X) - # converting boolean labels to strings - self.train_y = self._convert_labels(self.train_y) - self.valid_y = self._convert_labels(self.valid_y) - self.test_y = self._convert_labels(self.test_y) - - # Similar to (https://arxiv.org/pdf/1605.07079.pdf) - # use 10 times the number of classes as lower bound for the dataset fraction - self.lower_bound_train_size = (10 * self.n_classes) / self.train_X.shape[0] - self.lower_bound_train_size = np.max((1 / 512, self.lower_bound_train_size)) - - if verbose: - print("Shape of data post-preprocessing: {}".format(self.train_X.shape), "\n") - - if verbose: - print("\nTraining data (X, y): ({}, {})".format(self.train_X.shape, self.train_y.shape)) - print("Validation data (X, y): ({}, {})".format(self.valid_X.shape, self.valid_y.shape)) - print("Test data (X, y): ({}, {})".format(self.test_X.shape, self.test_y.shape)) - print("\nData loading complete!\n") - return + return self.fidelity_space.sample_configuration() + return [self.fidelity_space.sample_configuration() for i in range(size)] def shuffle_data_idx(self, train_idx=None, rng=None): rng = self.rng if rng is None else rng @@ -229,11 +134,6 @@ def shuffle_data_idx(self, train_idx=None, rng=None): rng.shuffle(train_idx) return train_idx - def init_model(self, config, fidelity=None, rng=None): - """ Function that returns the model initialized based on the configuration and fidelity - """ - raise NotImplementedError() - def _train_objective(self, config, fidelity, shuffle, rng, evaluation="valid"): # initializing model model = self.init_model(config, fidelity, rng) @@ -282,14 +182,14 @@ def _train_objective(self, config, fidelity, shuffle, rng, evaluation="valid"): train_loss = 1 - scores["acc"] return model, model_fit_time, train_loss, scores, score_cost - def objective_function( - self, - configuration: Union[CS.Configuration, Dict], - fidelity: Union[CS.Configuration, Dict, None] = None, - shuffle: bool = False, - rng: Union[np.random.RandomState, int, None] = None, - **kwargs - ) -> Dict: + # pylint: disable=arguments-differ + @AbstractBenchmark.check_parameters + def objective_function(self, + configuration: Union[CS.Configuration, Dict], + fidelity: Union[CS.Configuration, Dict, None] = None, + shuffle: bool = False, + rng: Union[np.random.RandomState, int, None] = None, + **kwargs) -> Dict: """Function that evaluates a 'config' on a 'fidelity' on the validation set """ model, model_fit_time, train_loss, train_scores, train_score_cost = self._train_objective( @@ -333,14 +233,14 @@ def objective_function( 'info': info } - def objective_function_test( - self, - configuration: Union[CS.Configuration, Dict], - fidelity: Union[CS.Configuration, Dict, None] = None, - shuffle: bool = False, - rng: Union[np.random.RandomState, int, None] = None, - **kwargs - ) -> Dict: + # pylint: disable=arguments-differ + @AbstractBenchmark.check_parameters + def objective_function_test(self, + configuration: Union[CS.Configuration, Dict], + fidelity: Union[CS.Configuration, Dict, None] = None, + shuffle: bool = False, + rng: Union[np.random.RandomState, int, None] = None, + **kwargs) -> Dict: """Function that evaluates a 'config' on a 'fidelity' on the test set """ model, model_fit_time, train_loss, train_scores, 
train_score_cost = self._train_objective( @@ -375,14 +275,3 @@ def objective_function_test( 'cost': model_fit_time + info['test_costs']['acc'], 'info': info } - - def get_meta_information(self): - """ Returns the meta information for the benchmark """ - return { - 'name': 'Support Vector Machine', - 'shape of train data': self.train_X.shape, - 'shape of test data': self.test_X.shape, - 'shape of valid data': self.valid_X.shape, - 'initial random seed': self.seed, - 'task_id': self.task_id - } From 9dbd61c6255f2ac1ecde3e9f5250d78d9698bba5 Mon Sep 17 00:00:00 2001 From: PhMueller Date: Wed, 11 Aug 2021 17:55:58 +0200 Subject: [PATCH 051/147] Corret Datamanager. But how to download the task data to disk? --- hpobench/dependencies/ml/data_manager.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/hpobench/dependencies/ml/data_manager.py b/hpobench/dependencies/ml/data_manager.py index 9cc7f5f7..d65ee62c 100644 --- a/hpobench/dependencies/ml/data_manager.py +++ b/hpobench/dependencies/ml/data_manager.py @@ -76,7 +76,7 @@ def load(self, valid_size=None, verbose=False): ] for files in required_file_list: data_str = "{}_{}.parquet.gzip".format(*files) - if (data_path / data_str).exists(): + if not (data_path / data_str).exists(): raise FileNotFoundError("{} not found!".format(data_str.format(*files))) # ignore the remaining data loaders and preprocessors as valid data splits available return From 0304146fa91d818178cb3b98bea9b445edfe0d2f Mon Sep 17 00:00:00 2001 From: PhMueller Date: Wed, 11 Aug 2021 17:57:20 +0200 Subject: [PATCH 052/147] Finalize HistGB Benchmarks - split them into multiple benchmarks according to their fidelity spaces. --- hpobench/benchmarks/ml/__init__.py | 7 -- hpobench/benchmarks/ml/histgb_benchmark.py | 123 ++++++++++++++------- 2 files changed, 82 insertions(+), 48 deletions(-) diff --git a/hpobench/benchmarks/ml/__init__.py b/hpobench/benchmarks/ml/__init__.py index 37d5cd33..e69de29b 100644 --- a/hpobench/benchmarks/ml/__init__.py +++ b/hpobench/benchmarks/ml/__init__.py @@ -1,7 +0,0 @@ -from .tabular_benchmark import TabularBenchmark -from .svm_benchmark import SVMBenchmark -from .rf_benchmark import RandomForestBenchmark -from .xgboost_benchmark import XGBoostBenchmark -from .histgb_benchmark import HistGBBenchmark -from .lr_benchmark import LRBenchmark -from .nn_benchmark import NNBenchmark \ No newline at end of file diff --git a/hpobench/benchmarks/ml/histgb_benchmark.py b/hpobench/benchmarks/ml/histgb_benchmark.py index b431c056..332dadc4 100644 --- a/hpobench/benchmarks/ml/histgb_benchmark.py +++ b/hpobench/benchmarks/ml/histgb_benchmark.py @@ -1,31 +1,25 @@ -import numpy as np import ConfigSpace as CS -from copy import deepcopy +import numpy as np from typing import Union # https://scikit-learn.org/stable/modules/generated/sklearn.ensemble.HistGradientBoostingClassifier.html from sklearn.experimental import enable_hist_gradient_boosting # noqa from sklearn.ensemble import HistGradientBoostingClassifier -from hpobench.benchmarks.ml.ml_benchmark_template import MLBenchmark +from hpobench.dependencies.ml.ml_benchmark_template import MLBenchmark class HistGBBenchmark(MLBenchmark): - def __init__( - self, - task_id: Union[int, None] = None, - seed: Union[int, None] = None, # Union[np.random.RandomState, int, None] = None, - valid_size: float = 0.33, - fidelity_choice: int = 1, - data_path: Union[str, None] = None - ): - super(HistGBBenchmark, self).__init__(task_id, seed, valid_size, fidelity_choice, data_path) - pass + def __init__(self, + task_id: 
Union[int, None] = None, + rng: Union[np.random.RandomState, int, None] = None, + valid_size: float = 0.33, + data_path: Union[str, None] = None): + super(HistGBBenchmark, self).__init__(task_id, rng, valid_size, data_path) @staticmethod - def get_configuration_space(seed=None): - """Parameter space to be optimized --- contains the hyperparameters - """ + def get_configuration_space(seed: Union[int, None] = None) -> CS.ConfigurationSpace: + """Parameter space to be optimized --- contains the hyperparameters""" cs = CS.ConfigurationSpace(seed=seed) cs.add_hyperparameters([ @@ -45,21 +39,24 @@ def get_configuration_space(seed=None): return cs @staticmethod - def get_fidelity_space(seed=None, fidelity_choice=1): + def _get_fidelity_choices(ntrees_choice: str, subsample_choice: str): """Fidelity space available --- specifies the fidelity dimensions - If fidelity_choice is 0 + If SearchSpace is 0 Fidelity space is the maximal fidelity, akin to a black-box function - If fidelity_choice is 1 + If SearchSpace is 1 Fidelity space is a single fidelity, in this case the number of trees (n_estimators) - If fidelity_choice is 2 + If SearchSpace is 2 Fidelity space is a single fidelity, in this case the fraction of dataset (subsample) - If fidelity_choice is >2 + If SearchSpace is >2 Fidelity space is multi-multi fidelity, all possible fidelities """ - z_cs = CS.ConfigurationSpace(seed=seed) + assert ntrees_choice in ['fixed', 'variable'] + assert subsample_choice in ['fixed', 'variable'] + fidelity1 = dict( - fixed=CS.Constant('n_estimators', value=100), + # TODO: this value was 100 in the original code. Please check if 100 or 1000. + fixed=CS.Constant('n_estimators', value=1000), variable=CS.UniformIntegerHyperparameter( 'n_estimators', lower=100, upper=1000, default_value=1000, log=False ) @@ -70,24 +67,24 @@ def get_fidelity_space(seed=None, fidelity_choice=1): 'subsample', lower=0.1, upper=1, default_value=1, log=False ) ) - if fidelity_choice == 0: - # black-box setting (full fidelity) - ntrees = fidelity1["fixed"] - subsample = fidelity2["fixed"] - elif fidelity_choice == 1: - # gray-box setting (multi-fidelity) - ntrees - ntrees = fidelity1["variable"] - subsample = fidelity2["fixed"] - elif fidelity_choice == 2: - # gray-box setting (multi-fidelity) - data subsample - ntrees = fidelity1["fixed"] - subsample = fidelity2["variable"] - else: - # gray-box setting (multi-multi-fidelity) - ntrees + data subsample - ntrees = fidelity1["variable"] - subsample = fidelity2["variable"] - z_cs.add_hyperparameters([ntrees, subsample]) - return z_cs + ntrees = fidelity1[ntrees_choice] + subsample = fidelity2[subsample_choice] + return ntrees, subsample + + @staticmethod + def get_fidelity_space(seed: Union[int, None] = None) -> CS.ConfigurationSpace: + """Fidelity space available --- specifies the fidelity dimensions + + If fidelity_choice is 0 + Fidelity space is the maximal fidelity, akin to a black-box function + If fidelity_choice is 1 + Fidelity space is a single fidelity, in this case the number of trees (n_estimators) + If fidelity_choice is 2 + Fidelity space is a single fidelity, in this case the fraction of dataset (subsample) + If fidelity_choice is >2 + Fidelity space is multi-multi fidelity, all possible fidelities + """ + raise NotImplementedError() def init_model(self, config, fidelity=None, rng=None): """ Function that returns the model initialized based on the configuration and fidelity @@ -100,3 +97,47 @@ def init_model(self, config, fidelity=None, rng=None): random_state=rng ) return model 
+ + +class HistGBSearchSpace0Benchmark(HistGBBenchmark): + def get_fidelity_space(self, seed: Union[int, None] = None) -> CS.ConfigurationSpace: + fidelity_space = CS.ConfigurationSpace(seed=seed) + fidelity_space.add_hyperparameters( + # black-box setting (full fidelity) + HistGBBenchmark._get_fidelity_choices(ntrees_choice='fixed', subsample_choice='fixed') + ) + return fidelity_space + + +class HistGBSearchSpace1Benchmark(HistGBBenchmark): + def get_fidelity_space(self, seed: Union[int, None] = None) -> CS.ConfigurationSpace: + fidelity_space = CS.ConfigurationSpace(seed=seed) + fidelity_space.add_hyperparameters( + # gray-box setting (multi-fidelity) - ntrees + HistGBBenchmark._get_fidelity_choices(ntrees_choice='variable', subsample_choice='fixed') + ) + return fidelity_space + + +class HistGBSearchSpace2Benchmark(HistGBBenchmark): + def get_fidelity_space(self, seed: Union[int, None] = None) -> CS.ConfigurationSpace: + fidelity_space = CS.ConfigurationSpace(seed=seed) + fidelity_space.add_hyperparameters( + # gray-box setting (multi-fidelity) - subsample + HistGBBenchmark._get_fidelity_choices(ntrees_choice='fixed', subsample_choice='variable') + ) + return fidelity_space + + +class HistGBSearchSpace3Benchmark(HistGBBenchmark): + def get_fidelity_space(self, seed: Union[int, None] = None) -> CS.ConfigurationSpace: + fidelity_space = CS.ConfigurationSpace(seed=seed) + fidelity_space.add_hyperparameters( + # gray-box setting (multi-multi-fidelity) - ntrees + data subsample + HistGBBenchmark._get_fidelity_choices(ntrees_choice='variable', subsample_choice='variable') + ) + return fidelity_space + + +__all__ = [HistGBSearchSpace0Benchmark, HistGBSearchSpace1Benchmark, + HistGBSearchSpace2Benchmark, HistGBSearchSpace3Benchmark] From 3e95d191f379baecb759e3dff81a62d8838a5359 Mon Sep 17 00:00:00 2001 From: PhMueller Date: Mon, 16 Aug 2021 18:11:35 +0200 Subject: [PATCH 053/147] Write OpenML Datamanager --- hpobench/dependencies/ml/data_manager.py | 73 ++++++++++++++----- .../dependencies/ml/ml_benchmark_template.py | 7 +- 2 files changed, 60 insertions(+), 20 deletions(-) diff --git a/hpobench/dependencies/ml/data_manager.py b/hpobench/dependencies/ml/data_manager.py index d65ee62c..244cd0cf 100644 --- a/hpobench/dependencies/ml/data_manager.py +++ b/hpobench/dependencies/ml/data_manager.py @@ -11,7 +11,7 @@ from sklearn.preprocessing import OneHotEncoder from sklearn.preprocessing import StandardScaler from sklearn.model_selection import train_test_split - +from oslo_concurrency import lockutils from hpobench.util.data_manager import DataManager @@ -48,9 +48,13 @@ def __init__(self, task_id: int, if data_path is None: data_path = config_file.data_dir / "OpenML" - self.data_path = data_path + self.data_path = Path(data_path) + openml.config.set_cache_directory(str(self.data_path)) + super(OpenMLDataManager, self).__init__() + @lockutils.synchronized('not_thread_process_safe', external=True, + lock_path=f'{config_file.cache_dir}/openml_dm_lock', delay=0.2) def load(self, valid_size=None, verbose=False): """Fetches data from OpenML and initializes the train-validation-test data splits @@ -66,25 +70,42 @@ def load(self, valid_size=None, verbose=False): self.logger.debug(self.task) self.logger.debug(self.dataset) - # check if the path to data splits is valid - if self.data_path is not None and self.data_path.is_dir(): - data_path = self.data_path / str(self.task_id) - required_file_list = [ - ("train", "x"), ("train", "y"), - ("valid", "x"), ("valid", "y"), - ("test", "x"), ("test", "y") - ] - 
for files in required_file_list: - data_str = "{}_{}.parquet.gzip".format(*files) - if not (data_path / data_str).exists(): - raise FileNotFoundError("{} not found!".format(data_str.format(*files))) - # ignore the remaining data loaders and preprocessors as valid data splits available + data_set_path = self.data_path / "org/openml/www/datasets" / str(self.task.dataset_id) + successfully_loaded = self.try_to_load_data(data_set_path) + if successfully_loaded: + self.logger.info(f'Successfully loaded the preprocessed splits from ' + f'{data_set_path}') return + # If the data is not available, download it. + self.__download_data(verbose=verbose, valid_size=valid_size) + + # Save the preprocessed splits to file for later usage. + self.generate_openml_splits(data_set_path) + + return + + def try_to_load_data(self, data_path: Path) -> bool: + path_str = "{}_{}.parquet.gzip" + try: + self.train_X = pd.read_parquet(data_path / path_str.format("train", "x")).to_numpy() + self.train_y = pd.read_parquet(data_path / path_str.format("train", "y")).squeeze(axis=1) + self.valid_X = pd.read_parquet(data_path / path_str.format("valid", "x")).to_numpy() + self.valid_y = pd.read_parquet(data_path / path_str.format("valid", "y")).squeeze(axis=1) + self.test_X = pd.read_parquet(data_path / path_str.format("test", "x")).to_numpy() + self.test_y = pd.read_parquet(data_path / path_str.format("test", "y")).squeeze(axis=1) + except FileNotFoundError: + return False + return True + + def __download_data(self, valid_size: Union[int, float, None], verbose: bool): + self.logger.info(f'Start to download the OpenML dataset') + # loads full data - X, y, categorical_ind, feature_names = self.dataset.get_data( - target=self.task.target_name, dataset_format="dataframe" - ) + X, y, categorical_ind, feature_names = self.dataset.get_data(target=self.task.target_name, + dataset_format="dataframe") + assert Path(self.dataset.data_file).exists(), f'The datafile {self.dataset.data_file} does not exists.' 
+ categorical_ind = np.array(categorical_ind) (cat_idx,) = np.where(categorical_ind) (cont_idx,) = np.where(~categorical_ind) @@ -147,7 +168,21 @@ def load(self, valid_size=None, verbose=False): self.logger.debug("Validation data (X, y): ({}, {})".format(self.valid_X.shape, self.valid_y.shape)) self.logger.debug("Test data (X, y): ({}, {})".format(self.test_X.shape, self.test_y.shape)) self.logger.debug("\nData loading complete!\n") - return + + def generate_openml_splits(self, data_path): + """ Store the created splits to file for later use… """ + self.logger.info(f'Save the splits to {data_path}') + + path_str = "{}_{}.parquet.gzip" + colnames = np.arange(self.train_X.shape[1]).astype(str) + label_name = str(self.task.target_name) + + pd.DataFrame(self.train_X, columns=colnames).to_parquet(data_path / path_str.format("train", "x")) + self.train_y.to_frame(label_name).to_parquet(data_path / path_str.format("train", "y")) + pd.DataFrame(self.valid_X, columns=colnames).to_parquet(data_path / path_str.format("valid", "x")) + self.valid_y.to_frame(label_name).to_parquet(data_path / path_str.format("valid", "y")) + pd.DataFrame(self.test_X, columns=colnames).to_parquet(data_path / path_str.format("test", "x")) + self.test_y.to_frame(label_name).to_parquet(data_path / path_str.format("test", "y")) @staticmethod def _convert_labels(labels): diff --git a/hpobench/dependencies/ml/ml_benchmark_template.py b/hpobench/dependencies/ml/ml_benchmark_template.py index 54029736..c773e830 100644 --- a/hpobench/dependencies/ml/ml_benchmark_template.py +++ b/hpobench/dependencies/ml/ml_benchmark_template.py @@ -53,7 +53,12 @@ def __init__( self.scorers = dict() for k, v in metrics.items(): self.scorers[k] = make_scorer(v, **metrics_kwargs[k]) - self.data_path = data_path + + if data_path is None: + from hpobench import config_file + data_path = config_file.data_dir / "OpenML" + + self.data_path = Path(data_path) dm = OpenMLDataManager(task_id, valid_size, data_path, global_seed) dm.load() From f3fbd584249707292be1c354c565fcfea6f03bac Mon Sep 17 00:00:00 2001 From: PhMueller Date: Mon, 16 Aug 2021 18:52:17 +0200 Subject: [PATCH 054/147] Unify interface for the other ml benchmarks. 
--- hpobench/benchmarks/ml/histgb_benchmark.py | 37 +++---- hpobench/benchmarks/ml/lr_benchmark.py | 104 +++++++++++++------ hpobench/benchmarks/ml/nn_benchmark.py | 105 ++++++++++++------- hpobench/benchmarks/ml/rf_benchmark.py | 109 +++++++++++++------- hpobench/benchmarks/ml/svm_benchmark.py | 67 +++++++----- hpobench/benchmarks/ml/xgboost_benchmark.py | 108 ++++++++++++------- 6 files changed, 337 insertions(+), 193 deletions(-) diff --git a/hpobench/benchmarks/ml/histgb_benchmark.py b/hpobench/benchmarks/ml/histgb_benchmark.py index 332dadc4..929f2bbf 100644 --- a/hpobench/benchmarks/ml/histgb_benchmark.py +++ b/hpobench/benchmarks/ml/histgb_benchmark.py @@ -1,10 +1,11 @@ +from typing import Union, Tuple + import ConfigSpace as CS import numpy as np -from typing import Union - +from ConfigSpace.hyperparameters import Hyperparameter +from sklearn.ensemble import HistGradientBoostingClassifier # https://scikit-learn.org/stable/modules/generated/sklearn.ensemble.HistGradientBoostingClassifier.html from sklearn.experimental import enable_hist_gradient_boosting # noqa -from sklearn.ensemble import HistGradientBoostingClassifier from hpobench.dependencies.ml.ml_benchmark_template import MLBenchmark @@ -39,18 +40,23 @@ def get_configuration_space(seed: Union[int, None] = None) -> CS.ConfigurationSp return cs @staticmethod - def _get_fidelity_choices(ntrees_choice: str, subsample_choice: str): + def get_fidelity_space(seed: Union[int, None] = None) -> CS.ConfigurationSpace: """Fidelity space available --- specifies the fidelity dimensions - If SearchSpace is 0 + If fidelity_choice is 0 Fidelity space is the maximal fidelity, akin to a black-box function - If SearchSpace is 1 + If fidelity_choice is 1 Fidelity space is a single fidelity, in this case the number of trees (n_estimators) - If SearchSpace is 2 + If fidelity_choice is 2 Fidelity space is a single fidelity, in this case the fraction of dataset (subsample) - If SearchSpace is >2 + If fidelity_choice is >2 Fidelity space is multi-multi fidelity, all possible fidelities """ + raise NotImplementedError() + + @staticmethod + def _get_fidelity_choices(ntrees_choice: str, subsample_choice: str) -> Tuple[Hyperparameter, Hyperparameter]: + assert ntrees_choice in ['fixed', 'variable'] assert subsample_choice in ['fixed', 'variable'] @@ -71,21 +77,6 @@ def _get_fidelity_choices(ntrees_choice: str, subsample_choice: str): subsample = fidelity2[subsample_choice] return ntrees, subsample - @staticmethod - def get_fidelity_space(seed: Union[int, None] = None) -> CS.ConfigurationSpace: - """Fidelity space available --- specifies the fidelity dimensions - - If fidelity_choice is 0 - Fidelity space is the maximal fidelity, akin to a black-box function - If fidelity_choice is 1 - Fidelity space is a single fidelity, in this case the number of trees (n_estimators) - If fidelity_choice is 2 - Fidelity space is a single fidelity, in this case the fraction of dataset (subsample) - If fidelity_choice is >2 - Fidelity space is multi-multi fidelity, all possible fidelities - """ - raise NotImplementedError() - def init_model(self, config, fidelity=None, rng=None): """ Function that returns the model initialized based on the configuration and fidelity """ diff --git a/hpobench/benchmarks/ml/lr_benchmark.py b/hpobench/benchmarks/ml/lr_benchmark.py index de791aa6..a7e1f857 100644 --- a/hpobench/benchmarks/ml/lr_benchmark.py +++ b/hpobench/benchmarks/ml/lr_benchmark.py @@ -1,22 +1,22 @@ -import ConfigSpace as CS -from typing import Union, List, Dict +from 
typing import Union, Tuple +import ConfigSpace as CS +import numpy as np +from ConfigSpace.hyperparameters import Hyperparameter from sklearn.linear_model import SGDClassifier -from hpobench.benchmarks.ml.ml_benchmark_template import MLBenchmark +from hpobench.dependencies.ml.ml_benchmark_template import MLBenchmark -class LRBenchmark(MLBenchmark): - def __init__( - self, - task_id: Union[int, None] = None, - seed: Union[int, None] = None, # Union[np.random.RandomState, int, None] = None, - valid_size: float = 0.33, - fidelity_choice: int = 1, - data_path: Union[str, None] = None - ): - super(LRBenchmark, self).__init__(task_id, seed, valid_size, fidelity_choice, data_path) - self.cache_size = 500 +class LRBaseBenchmark(MLBenchmark): + def __init__(self, + task_id: Union[int, None] = None, + rng: Union[np.random.RandomState, int, None] = None, + valid_size: float = 0.33, + data_path: Union[str, None] = None): + + super(LRBaseBenchmark, self).__init__(task_id, rng, valid_size, data_path) + self.cache_size = 500 # TODO: Do we need this? @staticmethod def get_configuration_space(seed=None): @@ -33,8 +33,11 @@ def get_configuration_space(seed=None): ]) return cs + def get_fidelity_space(self, seed: Union[int, None] = None) -> CS.ConfigurationSpace: + raise NotImplementedError() + @staticmethod - def get_fidelity_space(seed=None, fidelity_choice=None): + def _get_fidelity_choices(iter_choice: str, subsample_choice: str) -> Tuple[Hyperparameter, Hyperparameter]: """Fidelity space available --- specifies the fidelity dimensions For SVM, only a single fidelity exists, i.e., subsample fraction. @@ -44,7 +47,10 @@ def get_fidelity_space(seed=None, fidelity_choice=None): parameterizes the fraction of data to subsample """ - z_cs = CS.ConfigurationSpace(seed=seed) + + assert iter_choice in ['fixed', 'variable'] + assert subsample_choice in ['fixed', 'variable'] + fidelity1 = dict( fixed=CS.Constant('iter', value=1000), variable=CS.UniformIntegerHyperparameter( @@ -57,24 +63,10 @@ def get_fidelity_space(seed=None, fidelity_choice=None): 'subsample', lower=0.1, upper=1.0, default_value=1.0, log=False ) ) - if fidelity_choice == 0: - # black-box setting (full fidelity) - iter = fidelity1["fixed"] - subsample = fidelity2["fixed"] - elif fidelity_choice == 1: - # gray-box setting (multi-fidelity) - iterations - iter = fidelity1["variable"] - subsample = fidelity2["fixed"] - elif fidelity_choice == 2: - # gray-box setting (multi-fidelity) - data subsample - iter = fidelity1["fixed"] - subsample = fidelity2["variable"] - else: - # gray-box setting (multi-multi-fidelity) - iterations + data subsample - iter = fidelity1["variable"] - subsample = fidelity2["variable"] - z_cs.add_hyperparameters([iter, subsample]) - return z_cs + + iter = fidelity1[iter_choice] + subsample = fidelity2[subsample_choice] + return iter, subsample def init_model(self, config, fidelity=None, rng=None): # initializing model @@ -89,3 +81,47 @@ def init_model(self, config, fidelity=None, rng=None): random_state=rng, ) return model + + +class LRSearchSpace0Benchmark(LRBaseBenchmark): + def get_fidelity_space(self, seed: Union[int, None] = None) -> CS.ConfigurationSpace: + fidelity_space = CS.ConfigurationSpace(seed=seed) + fidelity_space.add_hyperparameters( + # black-box setting (full fidelity) + LRBaseBenchmark._get_fidelity_choices(iter_choice='fixed', subsample_choice='fixed') + ) + return fidelity_space + + +class LRSearchSpace1Benchmark(LRBaseBenchmark): + def get_fidelity_space(self, seed: Union[int, None] = None) -> 
CS.ConfigurationSpace: + fidelity_space = CS.ConfigurationSpace(seed=seed) + fidelity_space.add_hyperparameters( + # gray-box setting (multi-fidelity) - iterations + LRBaseBenchmark._get_fidelity_choices(iter_choice='variable', subsample_choice='fixed') + ) + return fidelity_space + + +class LRSearchSpace2Benchmark(LRBaseBenchmark): + def get_fidelity_space(self, seed: Union[int, None] = None) -> CS.ConfigurationSpace: + fidelity_space = CS.ConfigurationSpace(seed=seed) + fidelity_space.add_hyperparameters( + # gray-box setting (multi-fidelity) - data subsample + LRBaseBenchmark._get_fidelity_choices(iter_choice='fixed', subsample_choice='variable') + ) + return fidelity_space + + +class LRSearchSpace3Benchmark(LRBaseBenchmark): + def get_fidelity_space(self, seed: Union[int, None] = None) -> CS.ConfigurationSpace: + fidelity_space = CS.ConfigurationSpace(seed=seed) + fidelity_space.add_hyperparameters( + # gray-box setting (multi-multi-fidelity) - iterations + data subsample + LRBaseBenchmark._get_fidelity_choices(iter_choice='variable', subsample_choice='variable') + ) + return fidelity_space + + +__all__ = [LRSearchSpace0Benchmark, LRSearchSpace1Benchmark, + LRSearchSpace2Benchmark, LRSearchSpace3Benchmark] diff --git a/hpobench/benchmarks/ml/nn_benchmark.py b/hpobench/benchmarks/ml/nn_benchmark.py index 2c92b371..fd6e0a51 100644 --- a/hpobench/benchmarks/ml/nn_benchmark.py +++ b/hpobench/benchmarks/ml/nn_benchmark.py @@ -1,25 +1,21 @@ -import numpy as np -import ConfigSpace as CS from copy import deepcopy from typing import Union, Tuple + +import ConfigSpace as CS +import numpy as np +from ConfigSpace.hyperparameters import Hyperparameter from sklearn.neural_network import MLPClassifier -from hpobench.benchmarks.ml.ml_benchmark_template import MLBenchmark +from hpobench.dependencies.ml.ml_benchmark_template import MLBenchmark -class NNBenchmark(MLBenchmark): - def __init__( - self, - task_id: Union[int, None] = None, - seed: Union[int, None] = None, # Union[np.random.RandomState, int, None] = None, - valid_size: float = 0.33, - fidelity_choice: int = 1, - data_path: Union[str, None] = None - ): - super(NNBenchmark, self).__init__( - task_id, seed, valid_size, fidelity_choice, data_path - ) - pass +class NNBaseBenchmark(MLBenchmark): + def __init__(self, + task_id: Union[int, None] = None, + rng: Union[np.random.RandomState, int, None] = None, + valid_size: float = 0.33, + data_path: Union[str, None] = None): + super(NNBaseBenchmark, self).__init__(task_id, rng, valid_size, data_path) @staticmethod def get_configuration_space(seed=None): @@ -28,7 +24,9 @@ def get_configuration_space(seed=None): cs = CS.ConfigurationSpace(seed=seed) cs.add_hyperparameters([ - CS.UniformIntegerHyperparameter('depth', default_value=3, lower=1, upper=3, log=False), + CS.UniformIntegerHyperparameter( + 'depth', default_value=3, lower=1, upper=3, log=False + ), CS.UniformIntegerHyperparameter( 'width', default_value=64, lower=16, upper=1024, log=True ), @@ -45,7 +43,7 @@ def get_configuration_space(seed=None): return cs @staticmethod - def get_fidelity_space(seed=None, fidelity_choice=1): + def get_fidelity_space(seed: Union[int, None] = None) -> CS.ConfigurationSpace: """Fidelity space available --- specifies the fidelity dimensions If fidelity_choice is 0 @@ -57,7 +55,11 @@ def get_fidelity_space(seed=None, fidelity_choice=1): If fidelity_choice is >2 Fidelity space is multi-multi fidelity, all possible fidelities """ - z_cs = CS.ConfigurationSpace(seed=seed) + raise NotImplementedError() + + @staticmethod 
+ def _get_fidelity_choices(iter_choice: str, subsample_choice: str) -> Tuple[Hyperparameter, Hyperparameter]: + fidelity1 = dict( fixed=CS.Constant('iter', value=100), variable=CS.UniformIntegerHyperparameter( @@ -70,24 +72,9 @@ def get_fidelity_space(seed=None, fidelity_choice=1): 'subsample', lower=0.1, upper=1, default_value=1, log=False ) ) - if fidelity_choice == 0: - # black-box setting (full fidelity) - iter = fidelity1["fixed"] - subsample = fidelity2["fixed"] - elif fidelity_choice == 1: - # gray-box setting (multi-fidelity) - epochs/iteration - iter = fidelity1["variable"] - subsample = fidelity2["fixed"] - elif fidelity_choice == 2: - # gray-box setting (multi-fidelity) - data subsample - iter = fidelity1["fixed"] - subsample = fidelity2["variable"] - else: - # gray-box setting (multi-multi-fidelity) - epochs + data subsample - iter = fidelity1["variable"] - subsample = fidelity2["variable"] - z_cs.add_hyperparameters([iter, subsample]) - return z_cs + iter = fidelity1[iter_choice] + subsample = fidelity2[subsample_choice] + return iter, subsample def init_model(self, config, fidelity=None, rng=None): """ Function that returns the model initialized based on the configuration and fidelity @@ -108,3 +95,47 @@ def init_model(self, config, fidelity=None, rng=None): random_state=rng ) return model + + +class NNSearchSpace0Benchmark(NNBaseBenchmark): + def get_fidelity_space(self, seed: Union[int, None] = None) -> CS.ConfigurationSpace: + fidelity_space = CS.ConfigurationSpace(seed=seed) + fidelity_space.add_hyperparameters( + # black-box setting (full fidelity) + NNSearchSpace0Benchmark._get_fidelity_choices(iter_choice='fixed', subsample_choice='fixed') + ) + return fidelity_space + + +class NNSearchSpace1Benchmark(NNBaseBenchmark): + def get_fidelity_space(self, seed: Union[int, None] = None) -> CS.ConfigurationSpace: + fidelity_space = CS.ConfigurationSpace(seed=seed) + fidelity_space.add_hyperparameters( + # gray-box setting (multi-fidelity) - iterations + NNSearchSpace1Benchmark._get_fidelity_choices(iter_choice='variable', subsample_choice='fixed') + ) + return fidelity_space + + +class NNSearchSpace2Benchmark(NNBaseBenchmark): + def get_fidelity_space(self, seed: Union[int, None] = None) -> CS.ConfigurationSpace: + fidelity_space = CS.ConfigurationSpace(seed=seed) + fidelity_space.add_hyperparameters( + # gray-box setting (multi-fidelity) - subsample + NNSearchSpace2Benchmark._get_fidelity_choices(iter_choice='fixed', subsample_choice='variable') + ) + return fidelity_space + + +class NNSearchSpace3Benchmark(NNBaseBenchmark): + def get_fidelity_space(self, seed: Union[int, None] = None) -> CS.ConfigurationSpace: + fidelity_space = CS.ConfigurationSpace(seed=seed) + fidelity_space.add_hyperparameters( + # gray-box setting (multi-multi-fidelity) - iterations + data subsample + NNSearchSpace3Benchmark._get_fidelity_choices(iter_choice='variable', subsample_choice='variable') + ) + return fidelity_space + + +__all__ = [NNSearchSpace0Benchmark, NNSearchSpace1Benchmark, + NNSearchSpace2Benchmark, NNSearchSpace3Benchmark] diff --git a/hpobench/benchmarks/ml/rf_benchmark.py b/hpobench/benchmarks/ml/rf_benchmark.py index 70e02bdb..0ae819a6 100644 --- a/hpobench/benchmarks/ml/rf_benchmark.py +++ b/hpobench/benchmarks/ml/rf_benchmark.py @@ -1,25 +1,21 @@ -import numpy as np -import ConfigSpace as CS -from typing import Union from copy import deepcopy +from typing import Union, Tuple + +import ConfigSpace as CS +import numpy as np +from ConfigSpace.hyperparameters import Hyperparameter 
from sklearn.ensemble import RandomForestClassifier -from hpobench.benchmarks.ml.ml_benchmark_template import MLBenchmark +from hpobench.dependencies.ml.ml_benchmark_template import MLBenchmark -class RandomForestBenchmark(MLBenchmark): - def __init__( - self, - task_id: Union[int, None] = None, - seed: Union[int, None] = None, # Union[np.random.RandomState, int, None] = None, - valid_size: float = 0.33, - fidelity_choice: int = 1, - data_path: Union[str, None] = None - ): - super(RandomForestBenchmark, self).__init__( - task_id, seed, valid_size, fidelity_choice, data_path - ) - pass +class RandomForestBaseBenchmark(MLBenchmark): + def __init__(self, + task_id: Union[int, None] = None, + rng: Union[np.random.RandomState, int, None] = None, + valid_size: float = 0.33, + data_path: Union[str, None] = None): + super(RandomForestBaseBenchmark, self).__init__(task_id, rng, valid_size, data_path) @staticmethod def get_configuration_space(seed=None): @@ -44,7 +40,7 @@ def get_configuration_space(seed=None): return cs @staticmethod - def get_fidelity_space(seed=None, fidelity_choice=1): + def get_fidelity_space(seed: Union[int, None] = None) -> CS.ConfigurationSpace: """Fidelity space available --- specifies the fidelity dimensions If fidelity_choice is 0 @@ -56,37 +52,30 @@ def get_fidelity_space(seed=None, fidelity_choice=1): If fidelity_choice is >2 Fidelity space is multi-multi fidelity, all possible fidelities """ - z_cs = CS.ConfigurationSpace(seed=seed) + raise NotImplementedError() + + @staticmethod + def _get_fidelity_choices(n_estimators_choice: str, subsample_choice: str) -> Tuple[Hyperparameter, Hyperparameter]: + + assert n_estimators_choice in ['fixed', 'variable'] + assert subsample_choice in ['fixed', 'variable'] + fidelity1 = dict( - fixed=CS.Constant('n_estimators', value=100), + fixed=CS.Constant('n_estimators', value=100), # TODO: is the default value here 100 or 512? 
variable=CS.UniformIntegerHyperparameter( 'n_estimators', lower=16, upper=512, default_value=512, log=False ) ) + fidelity2 = dict( fixed=CS.Constant('subsample', value=1), variable=CS.UniformFloatHyperparameter( 'subsample', lower=0.1, upper=1, default_value=1, log=False ) ) - if fidelity_choice == 0: - # black-box setting (full fidelity) - ntrees = fidelity1["fixed"] - subsample = fidelity2["fixed"] - elif fidelity_choice == 1: - # gray-box setting (multi-fidelity) - ntrees - ntrees = fidelity1["variable"] - subsample = fidelity2["fixed"] - elif fidelity_choice == 2: - # gray-box setting (multi-fidelity) - data subsample - ntrees = fidelity1["fixed"] - subsample = fidelity2["variable"] - else: - # gray-box setting (multi-multi-fidelity) - ntrees + data subsample - ntrees = fidelity1["variable"] - subsample = fidelity2["variable"] - z_cs.add_hyperparameters([ntrees, subsample]) - return z_cs + n_estimators = fidelity1[n_estimators_choice] + subsample = fidelity2[subsample_choice] + return n_estimators, subsample def init_model(self, config, fidelity=None, rng=None): """ Function that returns the model initialized based on the configuration and fidelity @@ -102,3 +91,47 @@ def init_model(self, config, fidelity=None, rng=None): random_state=rng ) return model + + +class RandomForestSearchSpace0Benchmark(RandomForestBaseBenchmark): + def get_fidelity_space(self, seed: Union[int, None] = None) -> CS.ConfigurationSpace: + fidelity_space = CS.ConfigurationSpace(seed=seed) + fidelity_space.add_hyperparameters( + # black-box setting (full fidelity) + RandomForestBaseBenchmark._get_fidelity_choices(n_estimators_choice='fixed', subsample_choice='fixed') + ) + return fidelity_space + + +class RandomForestSearchSpace1Benchmark(RandomForestBaseBenchmark): + def get_fidelity_space(self, seed: Union[int, None] = None) -> CS.ConfigurationSpace: + fidelity_space = CS.ConfigurationSpace(seed=seed) + fidelity_space.add_hyperparameters( + # gray-box setting (multi-fidelity) - ntrees + RandomForestBaseBenchmark._get_fidelity_choices(n_estimators_choice='variable', subsample_choice='fixed') + ) + return fidelity_space + + +class RandomForestSearchSpace2Benchmark(RandomForestBaseBenchmark): + def get_fidelity_space(self, seed: Union[int, None] = None) -> CS.ConfigurationSpace: + fidelity_space = CS.ConfigurationSpace(seed=seed) + fidelity_space.add_hyperparameters( + # gray-box setting (multi-fidelity) - data subsample + RandomForestBaseBenchmark._get_fidelity_choices(n_estimators_choice='fixed', subsample_choice='variable') + ) + return fidelity_space + + +class RandomForestSearchSpace3Benchmark(RandomForestBaseBenchmark): + def get_fidelity_space(self, seed: Union[int, None] = None) -> CS.ConfigurationSpace: + fidelity_space = CS.ConfigurationSpace(seed=seed) + fidelity_space.add_hyperparameters( + # gray-box setting (multi-multi-fidelity) - ntrees + data subsample + RandomForestBaseBenchmark._get_fidelity_choices(n_estimators_choice='variable', subsample_choice='variable') + ) + return fidelity_space + + +__all__ = [RandomForestSearchSpace0Benchmark, RandomForestSearchSpace1Benchmark, + RandomForestSearchSpace2Benchmark, RandomForestSearchSpace3Benchmark] diff --git a/hpobench/benchmarks/ml/svm_benchmark.py b/hpobench/benchmarks/ml/svm_benchmark.py index fe1afb66..a61515f5 100644 --- a/hpobench/benchmarks/ml/svm_benchmark.py +++ b/hpobench/benchmarks/ml/svm_benchmark.py @@ -1,21 +1,21 @@ -import ConfigSpace as CS from typing import Union +import ConfigSpace as CS +import numpy as np +from 
ConfigSpace.hyperparameters import Hyperparameter from sklearn.svm import SVC -from hpobench.benchmarks.ml.ml_benchmark_template import MLBenchmark +from hpobench.dependencies.ml.ml_benchmark_template import MLBenchmark -class SVMBenchmark(MLBenchmark): - def __init__( - self, - task_id: Union[int, None] = None, - seed: Union[int, None] = None, # Union[np.random.RandomState, int, None] = None, - valid_size: float = 0.33, - fidelity_choice: int = 1, - data_path: Union[str, None] = None - ): - super(SVMBenchmark, self).__init__(task_id, seed, valid_size, fidelity_choice, data_path) +class SVMBaseBenchmark(MLBenchmark): + def __init__(self, + task_id: Union[int, None] = None, + rng: Union[np.random.RandomState, int, None] = None, + valid_size: float = 0.33, + data_path: Union[str, None] = None): + super(SVMBaseBenchmark, self).__init__(task_id, rng, valid_size, data_path) + self.cache_size = 200 @staticmethod @@ -35,31 +35,32 @@ def get_configuration_space(seed=None): return cs @staticmethod - def get_fidelity_space(seed=None, fidelity_choice=None): + def get_fidelity_space(seed: Union[int, None] = None) -> CS.ConfigurationSpace: """Fidelity space available --- specifies the fidelity dimensions For SVM, only a single fidelity exists, i.e., subsample fraction. if fidelity_choice == 0 uses the entire data (subsample=1), reflecting the black-box setup else - parameterizes the fraction of data to subsample + parameterize the fraction of data to subsample """ - z_cs = CS.ConfigurationSpace(seed=seed) + raise NotImplementedError() + + @staticmethod + def _get_fidelity_choices(subsample_choice: str) -> Hyperparameter: + + assert subsample_choice in ['fixed', 'variable'] + fidelity = dict( fixed=CS.Constant('subsample', value=1), variable=CS.UniformFloatHyperparameter( 'subsample', lower=0.1, upper=1.0, default_value=1.0, log=False ) ) - if fidelity_choice == 0: - # black-box setting (full fidelity) - subsample = fidelity["fixed"] - else: - # gray-box setting (multi-fidelity) - data subsample - subsample = fidelity["variable"] - z_cs.add_hyperparameter(subsample) - return z_cs + subsample = fidelity[subsample_choice] + + return subsample def init_model(self, config, fidelity=None, rng=None): # initializing model @@ -71,3 +72,23 @@ def init_model(self, config, fidelity=None, rng=None): cache_size=self.cache_size ) return model + + +class SVMSearchSpace0Benchmark(SVMBaseBenchmark): + def get_fidelity_space(self, seed: Union[int, None] = None) -> CS.ConfigurationSpace: + fidelity_space = CS.ConfigurationSpace(seed=seed) + fidelity_space.add_hyperparameter( + # uses the entire data (subsample=1), reflecting the black-box setup + SVMBaseBenchmark._get_fidelity_choices(subsample_choice='fixed') + ) + return fidelity_space + + +class SVMSearchSpace1Benchmark(SVMBaseBenchmark): + def get_fidelity_space(self, seed: Union[int, None] = None) -> CS.ConfigurationSpace: + fidelity_space = CS.ConfigurationSpace(seed=seed) + fidelity_space.add_hyperparameter( + # parameterize the fraction of data to subsample + SVMBaseBenchmark._get_fidelity_choices(subsample_choice='fixed') + ) + return fidelity_space diff --git a/hpobench/benchmarks/ml/xgboost_benchmark.py b/hpobench/benchmarks/ml/xgboost_benchmark.py index 0fe3f07c..ca395f92 100644 --- a/hpobench/benchmarks/ml/xgboost_benchmark.py +++ b/hpobench/benchmarks/ml/xgboost_benchmark.py @@ -1,25 +1,20 @@ -import numpy as np -import ConfigSpace as CS -from typing import Union, Dict +from typing import Union, Tuple +import ConfigSpace as CS +import numpy as np import 
xgboost as xgb +from ConfigSpace.hyperparameters import Hyperparameter -from hpobench.benchmarks.ml.ml_benchmark_template import MLBenchmark +from hpobench.dependencies.ml.ml_benchmark_template import MLBenchmark -class XGBoostBenchmark(MLBenchmark): - def __init__( - self, - task_id: Union[int, None] = None, - seed: Union[int, None] = None, # Union[np.random.RandomState, int, None] = None, - valid_size: float = 0.33, - fidelity_choice: int = 1, - data_path: Union[str, None] = None - ): - super(XGBoostBenchmark, self).__init__( - task_id, seed, valid_size, fidelity_choice, data_path - ) - pass +class XGBoostBaseBenchmark(MLBenchmark): + def __init__(self, + task_id: Union[int, None] = None, + rng: Union[np.random.RandomState, int, None] = None, + valid_size: float = 0.33, + data_path: Union[str, None] = None): + super(XGBoostBaseBenchmark, self).__init__(task_id, rng, valid_size, data_path) @staticmethod def get_configuration_space(seed=None): @@ -44,7 +39,7 @@ def get_configuration_space(seed=None): return cs @staticmethod - def get_fidelity_space(seed=None, fidelity_choice=1): + def get_fidelity_space(seed: Union[int, None] = None) -> CS.ConfigurationSpace: """Fidelity space available --- specifies the fidelity dimensions If fidelity_choice is 0 @@ -56,9 +51,16 @@ def get_fidelity_space(seed=None, fidelity_choice=1): If fidelity_choice is >2 Fidelity space is multi-multi fidelity, all possible fidelities """ - z_cs = CS.ConfigurationSpace(seed=seed) + raise NotImplementedError() + + @staticmethod + def _get_fidelity_choices(n_estimators_choice: str, subsample_choice: str) -> Tuple[Hyperparameter, Hyperparameter]: + + assert n_estimators_choice in ['fixed', 'variable'] + assert subsample_choice in ['fixed', 'variable'] + fidelity1 = dict( - fixed=CS.Constant('n_estimators', value=100), + fixed=CS.Constant('n_estimators', value=100), # TODO: Should this be 1000 or 100? 
variable=CS.UniformIntegerHyperparameter( 'n_estimators', lower=50, upper=2000, default_value=1000, log=False ) @@ -69,24 +71,10 @@ def get_fidelity_space(seed=None, fidelity_choice=1): 'subsample', lower=0.1, upper=1, default_value=1, log=False ) ) - if fidelity_choice == 0: - # black-box setting (full fidelity) - ntrees = fidelity1["fixed"] - subsample = fidelity2["fixed"] - elif fidelity_choice == 1: - # gray-box setting (multi-fidelity) - ntrees - ntrees = fidelity1["variable"] - subsample = fidelity2["fixed"] - elif fidelity_choice == 2: - # gray-box setting (multi-fidelity) - data subsample - ntrees = fidelity1["fixed"] - subsample = fidelity2["variable"] - else: - # gray-box setting (multi-multi-fidelity) - ntrees + data subsample - ntrees = fidelity1["variable"] - subsample = fidelity2["variable"] - z_cs.add_hyperparameters([ntrees, subsample]) - return z_cs + + n_estimators = fidelity1[n_estimators_choice] + subsample = fidelity2[subsample_choice] + return n_estimators, subsample def init_model(self, config, fidelity=None, rng=None): """ Function that returns the model initialized based on the configuration and fidelity @@ -108,3 +96,47 @@ def init_model(self, config, fidelity=None, rng=None): **extra_args ) return model + + +class XGBoostSearchSpace0Benchmark(XGBoostBaseBenchmark): + def get_fidelity_space(self, seed: Union[int, None] = None) -> CS.ConfigurationSpace: + fidelity_space = CS.ConfigurationSpace(seed=seed) + fidelity_space.add_hyperparameters( + # black-box setting (full fidelity) + XGBoostBaseBenchmark._get_fidelity_choices(n_estimators_choice='fixed', subsample_choice='fixed') + ) + return fidelity_space + + +class XGBoostSearchSpace1Benchmark(XGBoostBaseBenchmark): + def get_fidelity_space(self, seed: Union[int, None] = None) -> CS.ConfigurationSpace: + fidelity_space = CS.ConfigurationSpace(seed=seed) + fidelity_space.add_hyperparameters( + # gray-box setting (multi-fidelity) - ntrees + XGBoostBaseBenchmark._get_fidelity_choices(n_estimators_choice='variable', subsample_choice='fixed') + ) + return fidelity_space + + +class XGBoostSearchSpace2Benchmark(XGBoostBaseBenchmark): + def get_fidelity_space(self, seed: Union[int, None] = None) -> CS.ConfigurationSpace: + fidelity_space = CS.ConfigurationSpace(seed=seed) + fidelity_space.add_hyperparameters( + # gray-box setting (multi-fidelity) - data subsample + XGBoostBaseBenchmark._get_fidelity_choices(n_estimators_choice='fixed', subsample_choice='variable') + ) + return fidelity_space + + +class XGBoostSearchSpace3Benchmark(XGBoostBaseBenchmark): + def get_fidelity_space(self, seed: Union[int, None] = None) -> CS.ConfigurationSpace: + fidelity_space = CS.ConfigurationSpace(seed=seed) + fidelity_space.add_hyperparameters( + # gray-box setting (multi-multi-fidelity) - ntrees + data subsample + XGBoostBaseBenchmark._get_fidelity_choices(n_estimators_choice='variable', subsample_choice='variable') + ) + return fidelity_space + + +__all__ = [XGBoostSearchSpace0Benchmark, XGBoostSearchSpace1Benchmark, + XGBoostSearchSpace2Benchmark, XGBoostSearchSpace3Benchmark] From e57fbcbb52bdba3097f7456673e6c1da04b5e168 Mon Sep 17 00:00:00 2001 From: PhMueller Date: Mon, 16 Aug 2021 18:52:24 +0200 Subject: [PATCH 055/147] Flake + Pep --- hpobench/dependencies/ml/data_manager.py | 2 +- hpobench/dependencies/ml/ml_benchmark_template.py | 12 +++++------- 2 files changed, 6 insertions(+), 8 deletions(-) diff --git a/hpobench/dependencies/ml/data_manager.py b/hpobench/dependencies/ml/data_manager.py index 244cd0cf..55210933 100644 --- 
a/hpobench/dependencies/ml/data_manager.py +++ b/hpobench/dependencies/ml/data_manager.py @@ -99,7 +99,7 @@ def try_to_load_data(self, data_path: Path) -> bool: return True def __download_data(self, valid_size: Union[int, float, None], verbose: bool): - self.logger.info(f'Start to download the OpenML dataset') + self.logger.info('Start to download the OpenML dataset') # loads full data X, y, categorical_ind, feature_names = self.dataset.get_data(target=self.task.target_name, diff --git a/hpobench/dependencies/ml/ml_benchmark_template.py b/hpobench/dependencies/ml/ml_benchmark_template.py index c773e830..41be449a 100644 --- a/hpobench/dependencies/ml/ml_benchmark_template.py +++ b/hpobench/dependencies/ml/ml_benchmark_template.py @@ -1,12 +1,10 @@ -import os import time -import openml -import numpy as np -import pandas as pd -import ConfigSpace as CS -from typing import Union, Dict from pathlib import Path +from typing import Union, Dict +import ConfigSpace as CS +import numpy as np +import pandas as pd from sklearn.metrics import make_scorer, accuracy_score, balanced_accuracy_score, \ precision_score, f1_score @@ -89,7 +87,7 @@ def get_configuration_space(seed=None): raise NotImplementedError() @staticmethod - def get_fidelity_space(seed=None): + def get_fidelity_space(seed: Union[int, None] = None) -> CS.ConfigurationSpace: """Fidelity space available --- specifies the fidelity dimensions If fidelity_choice is 0 From f6131ea29e49a6a6ec3511ecc4d368f346299ed0 Mon Sep 17 00:00:00 2001 From: PhMueller Date: Mon, 16 Aug 2021 19:14:38 +0200 Subject: [PATCH 056/147] Add Container Interface --- hpobench/benchmarks/{ml => ml_mmfb}/README.md | 0 hpobench/benchmarks/ml_mmfb/__init__.py | 0 .../{ml => ml_mmfb}/histgb_benchmark.py | 0 .../{ml => ml_mmfb}/lr_benchmark.py | 0 .../{ml => ml_mmfb}/nn_benchmark.py | 0 .../{ml => ml_mmfb}/rf_benchmark.py | 0 .../{ml => ml_mmfb}/svm_benchmark.py | 0 .../{ml => ml_mmfb}/tabular_benchmark.py | 0 .../{ml => ml_mmfb}/xgboost_benchmark.py | 0 .../container/benchmarks/ml_mmfb/__init__.py | 0 .../benchmarks/ml_mmfb/histgb_benchmark.py | 42 +++++++++++++++++++ .../benchmarks/ml_mmfb/lr_benchmark.py | 42 +++++++++++++++++++ .../benchmarks/ml_mmfb/nn_benchmark.py | 42 +++++++++++++++++++ .../benchmarks/ml_mmfb/rf_benchmark.py | 42 +++++++++++++++++++ .../benchmarks/ml_mmfb/svm_benchmark.py | 25 +++++++++++ .../benchmarks/ml_mmfb/tabular_benchmark.py | 25 +++++++++++ .../benchmarks/ml_mmfb/xgboost_benchmark.py | 42 +++++++++++++++++++ 17 files changed, 260 insertions(+) rename hpobench/benchmarks/{ml => ml_mmfb}/README.md (100%) create mode 100644 hpobench/benchmarks/ml_mmfb/__init__.py rename hpobench/benchmarks/{ml => ml_mmfb}/histgb_benchmark.py (100%) rename hpobench/benchmarks/{ml => ml_mmfb}/lr_benchmark.py (100%) rename hpobench/benchmarks/{ml => ml_mmfb}/nn_benchmark.py (100%) rename hpobench/benchmarks/{ml => ml_mmfb}/rf_benchmark.py (100%) rename hpobench/benchmarks/{ml => ml_mmfb}/svm_benchmark.py (100%) rename hpobench/benchmarks/{ml => ml_mmfb}/tabular_benchmark.py (100%) rename hpobench/benchmarks/{ml => ml_mmfb}/xgboost_benchmark.py (100%) create mode 100644 hpobench/container/benchmarks/ml_mmfb/__init__.py create mode 100644 hpobench/container/benchmarks/ml_mmfb/histgb_benchmark.py create mode 100644 hpobench/container/benchmarks/ml_mmfb/lr_benchmark.py create mode 100644 hpobench/container/benchmarks/ml_mmfb/nn_benchmark.py create mode 100644 hpobench/container/benchmarks/ml_mmfb/rf_benchmark.py create mode 100644 
hpobench/container/benchmarks/ml_mmfb/svm_benchmark.py create mode 100644 hpobench/container/benchmarks/ml_mmfb/tabular_benchmark.py create mode 100644 hpobench/container/benchmarks/ml_mmfb/xgboost_benchmark.py diff --git a/hpobench/benchmarks/ml/README.md b/hpobench/benchmarks/ml_mmfb/README.md similarity index 100% rename from hpobench/benchmarks/ml/README.md rename to hpobench/benchmarks/ml_mmfb/README.md diff --git a/hpobench/benchmarks/ml_mmfb/__init__.py b/hpobench/benchmarks/ml_mmfb/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/hpobench/benchmarks/ml/histgb_benchmark.py b/hpobench/benchmarks/ml_mmfb/histgb_benchmark.py similarity index 100% rename from hpobench/benchmarks/ml/histgb_benchmark.py rename to hpobench/benchmarks/ml_mmfb/histgb_benchmark.py diff --git a/hpobench/benchmarks/ml/lr_benchmark.py b/hpobench/benchmarks/ml_mmfb/lr_benchmark.py similarity index 100% rename from hpobench/benchmarks/ml/lr_benchmark.py rename to hpobench/benchmarks/ml_mmfb/lr_benchmark.py diff --git a/hpobench/benchmarks/ml/nn_benchmark.py b/hpobench/benchmarks/ml_mmfb/nn_benchmark.py similarity index 100% rename from hpobench/benchmarks/ml/nn_benchmark.py rename to hpobench/benchmarks/ml_mmfb/nn_benchmark.py diff --git a/hpobench/benchmarks/ml/rf_benchmark.py b/hpobench/benchmarks/ml_mmfb/rf_benchmark.py similarity index 100% rename from hpobench/benchmarks/ml/rf_benchmark.py rename to hpobench/benchmarks/ml_mmfb/rf_benchmark.py diff --git a/hpobench/benchmarks/ml/svm_benchmark.py b/hpobench/benchmarks/ml_mmfb/svm_benchmark.py similarity index 100% rename from hpobench/benchmarks/ml/svm_benchmark.py rename to hpobench/benchmarks/ml_mmfb/svm_benchmark.py diff --git a/hpobench/benchmarks/ml/tabular_benchmark.py b/hpobench/benchmarks/ml_mmfb/tabular_benchmark.py similarity index 100% rename from hpobench/benchmarks/ml/tabular_benchmark.py rename to hpobench/benchmarks/ml_mmfb/tabular_benchmark.py diff --git a/hpobench/benchmarks/ml/xgboost_benchmark.py b/hpobench/benchmarks/ml_mmfb/xgboost_benchmark.py similarity index 100% rename from hpobench/benchmarks/ml/xgboost_benchmark.py rename to hpobench/benchmarks/ml_mmfb/xgboost_benchmark.py diff --git a/hpobench/container/benchmarks/ml_mmfb/__init__.py b/hpobench/container/benchmarks/ml_mmfb/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/hpobench/container/benchmarks/ml_mmfb/histgb_benchmark.py b/hpobench/container/benchmarks/ml_mmfb/histgb_benchmark.py new file mode 100644 index 00000000..77ed4bbb --- /dev/null +++ b/hpobench/container/benchmarks/ml_mmfb/histgb_benchmark.py @@ -0,0 +1,42 @@ +#!/usr/bin/python3 +# -*- coding: utf-8 -*- + +""" Benchmark for the HistGB Benchmarks from hpobench/benchmarks/ml_mmfb/histgb_benchmark.py """ + +from hpobench.container.client_abstract_benchmark import AbstractBenchmarkClient + + +class HistGBSearchSpace0Benchmark(AbstractBenchmarkClient): + def __init__(self, **kwargs): + kwargs['benchmark_name'] = kwargs.get('benchmark_name', 'HistGBSearchSpace0Benchmark') + kwargs['container_name'] = kwargs.get('container_name', 'ml_mmfb') + kwargs['latest'] = kwargs.get('container_tag', '0.0.1') + super(HistGBSearchSpace0Benchmark, self).__init__(**kwargs) + + +class HistGBSearchSpace1Benchmark(AbstractBenchmarkClient): + def __init__(self, **kwargs): + kwargs['benchmark_name'] = kwargs.get('benchmark_name', 'HistGBSearchSpace1Benchmark') + kwargs['container_name'] = kwargs.get('container_name', 'ml_mmfb') + kwargs['latest'] = kwargs.get('container_tag', '0.0.1') + 
super(HistGBSearchSpace1Benchmark, self).__init__(**kwargs) + + +class HistGBSearchSpace2Benchmark(AbstractBenchmarkClient): + def __init__(self, **kwargs): + kwargs['benchmark_name'] = kwargs.get('benchmark_name', 'HistGBSearchSpace2Benchmark') + kwargs['container_name'] = kwargs.get('container_name', 'ml_mmfb') + kwargs['latest'] = kwargs.get('container_tag', '0.0.1') + super(HistGBSearchSpace2Benchmark, self).__init__(**kwargs) + + +class HistGBSearchSpace3Benchmark(AbstractBenchmarkClient): + def __init__(self, **kwargs): + kwargs['benchmark_name'] = kwargs.get('benchmark_name', 'HistGBSearchSpace3Benchmark') + kwargs['container_name'] = kwargs.get('container_name', 'ml_mmfb') + kwargs['latest'] = kwargs.get('container_tag', '0.0.1') + super(HistGBSearchSpace3Benchmark, self).__init__(**kwargs) + + +__all__ = [HistGBSearchSpace0Benchmark, HistGBSearchSpace1Benchmark, + HistGBSearchSpace2Benchmark, HistGBSearchSpace3Benchmark] diff --git a/hpobench/container/benchmarks/ml_mmfb/lr_benchmark.py b/hpobench/container/benchmarks/ml_mmfb/lr_benchmark.py new file mode 100644 index 00000000..fd1b4015 --- /dev/null +++ b/hpobench/container/benchmarks/ml_mmfb/lr_benchmark.py @@ -0,0 +1,42 @@ +#!/usr/bin/python3 +# -*- coding: utf-8 -*- + +""" Benchmark for the learning rate Benchmarks from hpobench/benchmarks/ml_mmfb/lr_benchmarks.py """ + +from hpobench.container.client_abstract_benchmark import AbstractBenchmarkClient + + +class LRSearchSpace0Benchmark(AbstractBenchmarkClient): + def __init__(self, **kwargs): + kwargs['benchmark_name'] = kwargs.get('benchmark_name', 'LRSearchSpace0Benchmark') + kwargs['container_name'] = kwargs.get('container_name', 'ml_mmfb') + kwargs['latest'] = kwargs.get('container_tag', '0.0.1') + super(LRSearchSpace0Benchmark, self).__init__(**kwargs) + + +class LRSearchSpace1Benchmark(AbstractBenchmarkClient): + def __init__(self, **kwargs): + kwargs['benchmark_name'] = kwargs.get('benchmark_name', 'LRSearchSpace1Benchmark') + kwargs['container_name'] = kwargs.get('container_name', 'ml_mmfb') + kwargs['latest'] = kwargs.get('container_tag', '0.0.1') + super(LRSearchSpace1Benchmark, self).__init__(**kwargs) + + +class LRSearchSpace2Benchmark(AbstractBenchmarkClient): + def __init__(self, **kwargs): + kwargs['benchmark_name'] = kwargs.get('benchmark_name', 'LRSearchSpace2Benchmark') + kwargs['container_name'] = kwargs.get('container_name', 'ml_mmfb') + kwargs['latest'] = kwargs.get('container_tag', '0.0.1') + super(LRSearchSpace2Benchmark, self).__init__(**kwargs) + + +class LRSearchSpace3Benchmark(AbstractBenchmarkClient): + def __init__(self, **kwargs): + kwargs['benchmark_name'] = kwargs.get('benchmark_name', 'LRSearchSpace3Benchmark') + kwargs['container_name'] = kwargs.get('container_name', 'ml_mmfb') + kwargs['latest'] = kwargs.get('container_tag', '0.0.1') + super(LRSearchSpace3Benchmark, self).__init__(**kwargs) + + +__all__ = [LRSearchSpace0Benchmark, LRSearchSpace1Benchmark, + LRSearchSpace2Benchmark, LRSearchSpace3Benchmark] diff --git a/hpobench/container/benchmarks/ml_mmfb/nn_benchmark.py b/hpobench/container/benchmarks/ml_mmfb/nn_benchmark.py new file mode 100644 index 00000000..818fb606 --- /dev/null +++ b/hpobench/container/benchmarks/ml_mmfb/nn_benchmark.py @@ -0,0 +1,42 @@ +#!/usr/bin/python3 +# -*- coding: utf-8 -*- + +""" Benchmark for the Neural Network Benchmarks from hpobench/benchmarks/ml_mmfb/nn_benchmark.py """ + +from hpobench.container.client_abstract_benchmark import AbstractBenchmarkClient + + +class 
NNSearchSpace0Benchmark(AbstractBenchmarkClient): + def __init__(self, **kwargs): + kwargs['benchmark_name'] = kwargs.get('benchmark_name', 'NNSearchSpace0Benchmark') + kwargs['container_name'] = kwargs.get('container_name', 'ml_mmfb') + kwargs['latest'] = kwargs.get('container_tag', '0.0.1') + super(NNSearchSpace0Benchmark, self).__init__(**kwargs) + + +class NNSearchSpace1Benchmark(AbstractBenchmarkClient): + def __init__(self, **kwargs): + kwargs['benchmark_name'] = kwargs.get('benchmark_name', 'NNSearchSpace1Benchmark') + kwargs['container_name'] = kwargs.get('container_name', 'ml_mmfb') + kwargs['latest'] = kwargs.get('container_tag', '0.0.1') + super(NNSearchSpace1Benchmark, self).__init__(**kwargs) + + +class NNSearchSpace2Benchmark(AbstractBenchmarkClient): + def __init__(self, **kwargs): + kwargs['benchmark_name'] = kwargs.get('benchmark_name', 'NNSearchSpace2Benchmark') + kwargs['container_name'] = kwargs.get('container_name', 'ml_mmfb') + kwargs['latest'] = kwargs.get('container_tag', '0.0.1') + super(NNSearchSpace2Benchmark, self).__init__(**kwargs) + + +class NNSearchSpace3Benchmark(AbstractBenchmarkClient): + def __init__(self, **kwargs): + kwargs['benchmark_name'] = kwargs.get('benchmark_name', 'NNSearchSpace3Benchmark') + kwargs['container_name'] = kwargs.get('container_name', 'ml_mmfb') + kwargs['latest'] = kwargs.get('container_tag', '0.0.1') + super(NNSearchSpace3Benchmark, self).__init__(**kwargs) + + +__all__ = [NNSearchSpace0Benchmark, NNSearchSpace1Benchmark, + NNSearchSpace2Benchmark, NNSearchSpace3Benchmark] diff --git a/hpobench/container/benchmarks/ml_mmfb/rf_benchmark.py b/hpobench/container/benchmarks/ml_mmfb/rf_benchmark.py new file mode 100644 index 00000000..3c7ced83 --- /dev/null +++ b/hpobench/container/benchmarks/ml_mmfb/rf_benchmark.py @@ -0,0 +1,42 @@ +#!/usr/bin/python3 +# -*- coding: utf-8 -*- + +""" Benchmark for the Random Forest Benchmarks from hpobench/benchmarks/ml_mmfb/rf_benchmark.py """ + +from hpobench.container.client_abstract_benchmark import AbstractBenchmarkClient + + +class RandomForestSearchSpace0Benchmark(AbstractBenchmarkClient): + def __init__(self, **kwargs): + kwargs['benchmark_name'] = kwargs.get('benchmark_name', 'RandomForestSearchSpace0Benchmark') + kwargs['container_name'] = kwargs.get('container_name', 'ml_mmfb') + kwargs['latest'] = kwargs.get('container_tag', '0.0.1') + super(RandomForestSearchSpace0Benchmark, self).__init__(**kwargs) + + +class RandomForestSearchSpace1Benchmark(AbstractBenchmarkClient): + def __init__(self, **kwargs): + kwargs['benchmark_name'] = kwargs.get('benchmark_name', 'RandomForestSearchSpace1Benchmark') + kwargs['container_name'] = kwargs.get('container_name', 'ml_mmfb') + kwargs['latest'] = kwargs.get('container_tag', '0.0.1') + super(RandomForestSearchSpace1Benchmark, self).__init__(**kwargs) + + +class RandomForestSearchSpace2Benchmark(AbstractBenchmarkClient): + def __init__(self, **kwargs): + kwargs['benchmark_name'] = kwargs.get('benchmark_name', 'RandomForestSearchSpace2Benchmark') + kwargs['container_name'] = kwargs.get('container_name', 'ml_mmfb') + kwargs['latest'] = kwargs.get('container_tag', '0.0.1') + super(RandomForestSearchSpace2Benchmark, self).__init__(**kwargs) + + +class RandomForestSearchSpace3Benchmark(AbstractBenchmarkClient): + def __init__(self, **kwargs): + kwargs['benchmark_name'] = kwargs.get('benchmark_name', 'RandomForestSearchSpace3Benchmark') + kwargs['container_name'] = kwargs.get('container_name', 'ml_mmfb') + kwargs['latest'] = kwargs.get('container_tag', '0.0.1') 
+ super(RandomForestSearchSpace3Benchmark, self).__init__(**kwargs) + + +__all__ = [RandomForestSearchSpace0Benchmark, RandomForestSearchSpace1Benchmark, + RandomForestSearchSpace2Benchmark, RandomForestSearchSpace3Benchmark] diff --git a/hpobench/container/benchmarks/ml_mmfb/svm_benchmark.py b/hpobench/container/benchmarks/ml_mmfb/svm_benchmark.py new file mode 100644 index 00000000..b2c46e75 --- /dev/null +++ b/hpobench/container/benchmarks/ml_mmfb/svm_benchmark.py @@ -0,0 +1,25 @@ +#!/usr/bin/python3 +# -*- coding: utf-8 -*- + +""" Benchmark for the SVM Benchmarks from hpobench/benchmarks/ml_mmfb/svm_benchmark.py """ + +from hpobench.container.client_abstract_benchmark import AbstractBenchmarkClient + + +class SVMSearchSpace0Benchmark(AbstractBenchmarkClient): + def __init__(self, **kwargs): + kwargs['benchmark_name'] = kwargs.get('benchmark_name', 'SVMSearchSpace0Benchmark') + kwargs['container_name'] = kwargs.get('container_name', 'ml_mmfb') + kwargs['latest'] = kwargs.get('container_tag', '0.0.1') + super(SVMSearchSpace0Benchmark, self).__init__(**kwargs) + + +class SVMSearchSpace1Benchmark(AbstractBenchmarkClient): + def __init__(self, **kwargs): + kwargs['benchmark_name'] = kwargs.get('benchmark_name', 'SVMSearchSpace1Benchmark') + kwargs['container_name'] = kwargs.get('container_name', 'ml_mmfb') + kwargs['latest'] = kwargs.get('container_tag', '0.0.1') + super(SVMSearchSpace1Benchmark, self).__init__(**kwargs) + + +__all__ = [SVMSearchSpace0Benchmark, SVMSearchSpace1Benchmark] diff --git a/hpobench/container/benchmarks/ml_mmfb/tabular_benchmark.py b/hpobench/container/benchmarks/ml_mmfb/tabular_benchmark.py new file mode 100644 index 00000000..f4a855d5 --- /dev/null +++ b/hpobench/container/benchmarks/ml_mmfb/tabular_benchmark.py @@ -0,0 +1,25 @@ +#!/usr/bin/python3 +# -*- coding: utf-8 -*- + +""" Benchmark for the Tabular Benchmarks from hpobench/benchmarks/ml_mmfb/tabular_benchmark.py """ + +from hpobench.container.client_abstract_benchmark import AbstractBenchmarkClient + + +class TabularBenchmark(AbstractBenchmarkClient): + def __init__(self, **kwargs): + kwargs['benchmark_name'] = kwargs.get('benchmark_name', 'TabularBenchmark') + kwargs['container_name'] = kwargs.get('container_name', 'ml_tabular_benchmarks') + kwargs['latest'] = kwargs.get('container_tag', '0.0.1') + super(TabularBenchmark, self).__init__(**kwargs) + + +class OriginalTabularBenchmark(AbstractBenchmarkClient): + def __init__(self, **kwargs): + kwargs['benchmark_name'] = kwargs.get('benchmark_name', 'OriginalTabularBenchmark') + kwargs['container_name'] = kwargs.get('container_name', 'ml_tabular_benchmarks') + kwargs['latest'] = kwargs.get('container_tag', '0.0.1') + super(OriginalTabularBenchmark, self).__init__(**kwargs) + + +__all__ = [TabularBenchmark, OriginalTabularBenchmark] diff --git a/hpobench/container/benchmarks/ml_mmfb/xgboost_benchmark.py b/hpobench/container/benchmarks/ml_mmfb/xgboost_benchmark.py new file mode 100644 index 00000000..72438d37 --- /dev/null +++ b/hpobench/container/benchmarks/ml_mmfb/xgboost_benchmark.py @@ -0,0 +1,42 @@ +#!/usr/bin/python3 +# -*- coding: utf-8 -*- + +""" Benchmark for the XGB Benchmarks from hpobench/benchmarks/ml_mmfb/xgboost_benchmark.py """ + +from hpobench.container.client_abstract_benchmark import AbstractBenchmarkClient + + +class XGBoostSearchSpace0Benchmark(AbstractBenchmarkClient): + def __init__(self, **kwargs): + kwargs['benchmark_name'] = kwargs.get('benchmark_name', 'XGBoostSearchSpace0Benchmark') + kwargs['container_name'] = 
kwargs.get('container_name', 'ml_mmfb') + kwargs['latest'] = kwargs.get('container_tag', '0.0.1') + super(XGBoostSearchSpace0Benchmark, self).__init__(**kwargs) + + +class XGBoostSearchSpace1Benchmark(AbstractBenchmarkClient): + def __init__(self, **kwargs): + kwargs['benchmark_name'] = kwargs.get('benchmark_name', 'XGBoostSearchSpace1Benchmark') + kwargs['container_name'] = kwargs.get('container_name', 'ml_mmfb') + kwargs['latest'] = kwargs.get('container_tag', '0.0.1') + super(XGBoostSearchSpace1Benchmark, self).__init__(**kwargs) + + +class XGBoostSearchSpace2Benchmark(AbstractBenchmarkClient): + def __init__(self, **kwargs): + kwargs['benchmark_name'] = kwargs.get('benchmark_name', 'XGBoostSearchSpace2Benchmark') + kwargs['container_name'] = kwargs.get('container_name', 'ml_mmfb') + kwargs['latest'] = kwargs.get('container_tag', '0.0.1') + super(XGBoostSearchSpace2Benchmark, self).__init__(**kwargs) + + +class XGBoostSearchSpace3Benchmark(AbstractBenchmarkClient): + def __init__(self, **kwargs): + kwargs['benchmark_name'] = kwargs.get('benchmark_name', 'XGBoostSearchSpace3Benchmark') + kwargs['container_name'] = kwargs.get('container_name', 'ml_mmfb') + kwargs['latest'] = kwargs.get('container_tag', '0.0.1') + super(XGBoostSearchSpace3Benchmark, self).__init__(**kwargs) + + +__all__ = [XGBoostSearchSpace0Benchmark, XGBoostSearchSpace1Benchmark, + XGBoostSearchSpace2Benchmark, XGBoostSearchSpace3Benchmark] From 36bc391c2e06c25718ae3f5ac43c957a30054b9f Mon Sep 17 00:00:00 2001 From: PhMueller Date: Mon, 16 Aug 2021 21:56:06 +0200 Subject: [PATCH 057/147] Mark `task_id` as required. --- extra_requirements/ml.json | 3 --- extra_requirements/ml_mfbb.json | 4 ++++ hpobench/benchmarks/ml_mmfb/entry_point.py | 12 ++++++++++++ hpobench/benchmarks/ml_mmfb/histgb_benchmark.py | 2 +- hpobench/benchmarks/ml_mmfb/lr_benchmark.py | 2 +- hpobench/benchmarks/ml_mmfb/nn_benchmark.py | 2 +- hpobench/benchmarks/ml_mmfb/rf_benchmark.py | 2 +- hpobench/benchmarks/ml_mmfb/svm_benchmark.py | 2 +- hpobench/benchmarks/ml_mmfb/xgboost_benchmark.py | 2 +- hpobench/dependencies/ml/ml_benchmark_template.py | 2 +- 10 files changed, 23 insertions(+), 10 deletions(-) delete mode 100644 extra_requirements/ml.json create mode 100644 extra_requirements/ml_mfbb.json create mode 100644 hpobench/benchmarks/ml_mmfb/entry_point.py diff --git a/extra_requirements/ml.json b/extra_requirements/ml.json deleted file mode 100644 index 8a68761f..00000000 --- a/extra_requirements/ml.json +++ /dev/null @@ -1,3 +0,0 @@ -{ - "ml_tabular_benchmarks": ["pandas>=1.0.0"] -} \ No newline at end of file diff --git a/extra_requirements/ml_mfbb.json b/extra_requirements/ml_mfbb.json new file mode 100644 index 00000000..68b4a557 --- /dev/null +++ b/extra_requirements/ml_mfbb.json @@ -0,0 +1,4 @@ +{ + "ml_tabular_benchmarks": ["pandas==1.2.4"], + "ml_mfbb": ["pandas==1.2.4","sklearn==0.24.2"] +} \ No newline at end of file diff --git a/hpobench/benchmarks/ml_mmfb/entry_point.py b/hpobench/benchmarks/ml_mmfb/entry_point.py new file mode 100644 index 00000000..4ec917a6 --- /dev/null +++ b/hpobench/benchmarks/ml_mmfb/entry_point.py @@ -0,0 +1,12 @@ +from hpobench.benchmarks.ml_mmfb.histgb_benchmark import HistGBSearchSpace0Benchmark, HistGBSearchSpace1Benchmark, \ + HistGBSearchSpace2Benchmark, HistGBSearchSpace3Benchmark +from hpobench.benchmarks.ml_mmfb.lr_benchmark import LRSearchSpace0Benchmark, LRSearchSpace1Benchmark, \ + LRSearchSpace2Benchmark, LRSearchSpace3Benchmark +from hpobench.benchmarks.ml_mmfb.nn_benchmark import 
NNSearchSpace0Benchmark, NNSearchSpace1Benchmark, \ + NNSearchSpace2Benchmark, NNSearchSpace3Benchmark +from hpobench.benchmarks.ml_mmfb.rf_benchmark import RandomForestSearchSpace0Benchmark, \ + RandomForestSearchSpace1Benchmark, RandomForestSearchSpace2Benchmark, RandomForestSearchSpace3Benchmark +from hpobench.benchmarks.ml_mmfb.svm_benchmark import SVMSearchSpace0Benchmark, SVMSearchSpace1Benchmark +from hpobench.benchmarks.ml_mmfb.tabular_benchmark import TabularBenchmark, OriginalTabularBenchmark +from hpobench.benchmarks.ml_mmfb.xgboost_benchmark import XGBoostSearchSpace0Benchmark, XGBoostSearchSpace1Benchmark,\ + XGBoostSearchSpace2Benchmark, XGBoostSearchSpace3Benchmark diff --git a/hpobench/benchmarks/ml_mmfb/histgb_benchmark.py b/hpobench/benchmarks/ml_mmfb/histgb_benchmark.py index 929f2bbf..442476ed 100644 --- a/hpobench/benchmarks/ml_mmfb/histgb_benchmark.py +++ b/hpobench/benchmarks/ml_mmfb/histgb_benchmark.py @@ -12,7 +12,7 @@ class HistGBBenchmark(MLBenchmark): def __init__(self, - task_id: Union[int, None] = None, + task_id: int, rng: Union[np.random.RandomState, int, None] = None, valid_size: float = 0.33, data_path: Union[str, None] = None): diff --git a/hpobench/benchmarks/ml_mmfb/lr_benchmark.py b/hpobench/benchmarks/ml_mmfb/lr_benchmark.py index a7e1f857..ff85629d 100644 --- a/hpobench/benchmarks/ml_mmfb/lr_benchmark.py +++ b/hpobench/benchmarks/ml_mmfb/lr_benchmark.py @@ -10,7 +10,7 @@ class LRBaseBenchmark(MLBenchmark): def __init__(self, - task_id: Union[int, None] = None, + task_id: int, rng: Union[np.random.RandomState, int, None] = None, valid_size: float = 0.33, data_path: Union[str, None] = None): diff --git a/hpobench/benchmarks/ml_mmfb/nn_benchmark.py b/hpobench/benchmarks/ml_mmfb/nn_benchmark.py index fd6e0a51..6c3344f9 100644 --- a/hpobench/benchmarks/ml_mmfb/nn_benchmark.py +++ b/hpobench/benchmarks/ml_mmfb/nn_benchmark.py @@ -11,7 +11,7 @@ class NNBaseBenchmark(MLBenchmark): def __init__(self, - task_id: Union[int, None] = None, + task_id: int, rng: Union[np.random.RandomState, int, None] = None, valid_size: float = 0.33, data_path: Union[str, None] = None): diff --git a/hpobench/benchmarks/ml_mmfb/rf_benchmark.py b/hpobench/benchmarks/ml_mmfb/rf_benchmark.py index 0ae819a6..fca50eb7 100644 --- a/hpobench/benchmarks/ml_mmfb/rf_benchmark.py +++ b/hpobench/benchmarks/ml_mmfb/rf_benchmark.py @@ -11,7 +11,7 @@ class RandomForestBaseBenchmark(MLBenchmark): def __init__(self, - task_id: Union[int, None] = None, + task_id: int, rng: Union[np.random.RandomState, int, None] = None, valid_size: float = 0.33, data_path: Union[str, None] = None): diff --git a/hpobench/benchmarks/ml_mmfb/svm_benchmark.py b/hpobench/benchmarks/ml_mmfb/svm_benchmark.py index a61515f5..fa8e324d 100644 --- a/hpobench/benchmarks/ml_mmfb/svm_benchmark.py +++ b/hpobench/benchmarks/ml_mmfb/svm_benchmark.py @@ -10,7 +10,7 @@ class SVMBaseBenchmark(MLBenchmark): def __init__(self, - task_id: Union[int, None] = None, + task_id: int, rng: Union[np.random.RandomState, int, None] = None, valid_size: float = 0.33, data_path: Union[str, None] = None): diff --git a/hpobench/benchmarks/ml_mmfb/xgboost_benchmark.py b/hpobench/benchmarks/ml_mmfb/xgboost_benchmark.py index ca395f92..d77b0938 100644 --- a/hpobench/benchmarks/ml_mmfb/xgboost_benchmark.py +++ b/hpobench/benchmarks/ml_mmfb/xgboost_benchmark.py @@ -10,7 +10,7 @@ class XGBoostBaseBenchmark(MLBenchmark): def __init__(self, - task_id: Union[int, None] = None, + task_id: int, rng: Union[np.random.RandomState, int, None] = None, valid_size: 
float = 0.33, data_path: Union[str, None] = None): diff --git a/hpobench/dependencies/ml/ml_benchmark_template.py b/hpobench/dependencies/ml/ml_benchmark_template.py index 41be449a..59e348d6 100644 --- a/hpobench/dependencies/ml/ml_benchmark_template.py +++ b/hpobench/dependencies/ml/ml_benchmark_template.py @@ -31,7 +31,7 @@ class MLBenchmark(AbstractBenchmark): def __init__( self, - task_id: Union[int, None] = None, + task_id: int, rng: Union[np.random.RandomState, int, None] = None, valid_size: float = 0.33, data_path: Union[str, Path, None] = None, From a5c7d6200a517a1bf1f11325b3300df082cecaaa Mon Sep 17 00:00:00 2001 From: PhMueller Date: Mon, 16 Aug 2021 22:33:13 +0200 Subject: [PATCH 058/147] Adapt Interfaces --- .../benchmarks/ml_mmfb/histgb_benchmark.py | 8 ++--- hpobench/benchmarks/ml_mmfb/lr_benchmark.py | 8 ++--- hpobench/benchmarks/ml_mmfb/nn_benchmark.py | 8 ++--- hpobench/benchmarks/ml_mmfb/rf_benchmark.py | 8 ++--- hpobench/benchmarks/ml_mmfb/svm_benchmark.py | 8 ++--- .../benchmarks/ml_mmfb/tabular_benchmark.py | 12 +++---- .../benchmarks/ml_mmfb/xgboost_benchmark.py | 17 +++++++--- .../dependencies/ml/ml_benchmark_template.py | 31 ++++++++++++------- 8 files changed, 59 insertions(+), 41 deletions(-) diff --git a/hpobench/benchmarks/ml_mmfb/histgb_benchmark.py b/hpobench/benchmarks/ml_mmfb/histgb_benchmark.py index 442476ed..50b4fe5b 100644 --- a/hpobench/benchmarks/ml_mmfb/histgb_benchmark.py +++ b/hpobench/benchmarks/ml_mmfb/histgb_benchmark.py @@ -1,11 +1,11 @@ -from typing import Union, Tuple +from typing import Union, Tuple, Dict import ConfigSpace as CS import numpy as np from ConfigSpace.hyperparameters import Hyperparameter -from sklearn.ensemble import HistGradientBoostingClassifier # https://scikit-learn.org/stable/modules/generated/sklearn.ensemble.HistGradientBoostingClassifier.html from sklearn.experimental import enable_hist_gradient_boosting # noqa +from sklearn.ensemble import HistGradientBoostingClassifier from hpobench.dependencies.ml.ml_benchmark_template import MLBenchmark @@ -77,12 +77,12 @@ def _get_fidelity_choices(ntrees_choice: str, subsample_choice: str) -> Tuple[Hy subsample = fidelity2[subsample_choice] return ntrees, subsample - def init_model(self, config, fidelity=None, rng=None): + def init_model(self, config: Dict, fidelity: Dict = None, rng: Union[int, np.random.RandomState, None] = None): """ Function that returns the model initialized based on the configuration and fidelity """ rng = self.rng if rng is None else rng model = HistGradientBoostingClassifier( - **config.get_dictionary(), + **config, max_iter=fidelity['n_estimators'], # a fidelity being used during initialization early_stopping=False, random_state=rng diff --git a/hpobench/benchmarks/ml_mmfb/lr_benchmark.py b/hpobench/benchmarks/ml_mmfb/lr_benchmark.py index ff85629d..944e77c4 100644 --- a/hpobench/benchmarks/ml_mmfb/lr_benchmark.py +++ b/hpobench/benchmarks/ml_mmfb/lr_benchmark.py @@ -1,4 +1,4 @@ -from typing import Union, Tuple +from typing import Union, Tuple, Dict import ConfigSpace as CS import numpy as np @@ -19,7 +19,7 @@ def __init__(self, self.cache_size = 500 # TODO: Do we need this? 
@staticmethod - def get_configuration_space(seed=None): + def get_configuration_space(seed: Union[int, None] = None) -> CS.ConfigurationSpace: """Parameter space to be optimized --- contains the hyperparameters """ cs = CS.ConfigurationSpace(seed=seed) @@ -68,12 +68,12 @@ def _get_fidelity_choices(iter_choice: str, subsample_choice: str) -> Tuple[Hype subsample = fidelity2[subsample_choice] return iter, subsample - def init_model(self, config, fidelity=None, rng=None): + def init_model(self, config: Dict, fidelity: Dict = None, rng: Union[int, np.random.RandomState, None] = None): # initializing model rng = self.rng if rng is None else rng # https://scikit-learn.org/stable/modules/sgd.html model = SGDClassifier( - **config.get_dictionary(), + **config, loss="log", # performs Logistic Regression max_iter=fidelity["iter"], learning_rate="adaptive", diff --git a/hpobench/benchmarks/ml_mmfb/nn_benchmark.py b/hpobench/benchmarks/ml_mmfb/nn_benchmark.py index 6c3344f9..4826f7fe 100644 --- a/hpobench/benchmarks/ml_mmfb/nn_benchmark.py +++ b/hpobench/benchmarks/ml_mmfb/nn_benchmark.py @@ -1,5 +1,5 @@ from copy import deepcopy -from typing import Union, Tuple +from typing import Union, Tuple, Dict import ConfigSpace as CS import numpy as np @@ -18,7 +18,7 @@ def __init__(self, super(NNBaseBenchmark, self).__init__(task_id, rng, valid_size, data_path) @staticmethod - def get_configuration_space(seed=None): + def get_configuration_space(seed: Union[int, None] = None) -> CS.ConfigurationSpace: """Parameter space to be optimized --- contains the hyperparameters """ cs = CS.ConfigurationSpace(seed=seed) @@ -76,11 +76,11 @@ def _get_fidelity_choices(iter_choice: str, subsample_choice: str) -> Tuple[Hype subsample = fidelity2[subsample_choice] return iter, subsample - def init_model(self, config, fidelity=None, rng=None): + def init_model(self, config: Dict, fidelity: Dict = None, rng: Union[int, np.random.RandomState, None] = None): """ Function that returns the model initialized based on the configuration and fidelity """ rng = self.rng if rng is None else rng - config = deepcopy(config.get_dictionary()) + config = deepcopy(config) depth = config["depth"] width = config["width"] config.pop("depth") diff --git a/hpobench/benchmarks/ml_mmfb/rf_benchmark.py b/hpobench/benchmarks/ml_mmfb/rf_benchmark.py index fca50eb7..b17f74d1 100644 --- a/hpobench/benchmarks/ml_mmfb/rf_benchmark.py +++ b/hpobench/benchmarks/ml_mmfb/rf_benchmark.py @@ -1,5 +1,5 @@ from copy import deepcopy -from typing import Union, Tuple +from typing import Union, Tuple, Dict import ConfigSpace as CS import numpy as np @@ -18,7 +18,7 @@ def __init__(self, super(RandomForestBaseBenchmark, self).__init__(task_id, rng, valid_size, data_path) @staticmethod - def get_configuration_space(seed=None): + def get_configuration_space(seed: Union[int, None] = None) -> CS.ConfigurationSpace: """Parameter space to be optimized --- contains the hyperparameters """ cs = CS.ConfigurationSpace(seed=seed) @@ -77,11 +77,11 @@ def _get_fidelity_choices(n_estimators_choice: str, subsample_choice: str) -> Tu subsample = fidelity2[subsample_choice] return n_estimators, subsample - def init_model(self, config, fidelity=None, rng=None): + def init_model(self, config: Dict, fidelity: Dict = None, rng: Union[int, np.random.RandomState, None] = None): """ Function that returns the model initialized based on the configuration and fidelity """ rng = self.rng if rng is None else rng - config = deepcopy(config.get_dictionary()) + config = deepcopy(config) n_features = 
self.train_X.shape[1] config["max_features"] = int(np.rint(np.power(n_features, config["max_features"]))) model = RandomForestClassifier( diff --git a/hpobench/benchmarks/ml_mmfb/svm_benchmark.py b/hpobench/benchmarks/ml_mmfb/svm_benchmark.py index fa8e324d..bc439fed 100644 --- a/hpobench/benchmarks/ml_mmfb/svm_benchmark.py +++ b/hpobench/benchmarks/ml_mmfb/svm_benchmark.py @@ -1,4 +1,4 @@ -from typing import Union +from typing import Union, Dict import ConfigSpace as CS import numpy as np @@ -19,7 +19,7 @@ def __init__(self, self.cache_size = 200 @staticmethod - def get_configuration_space(seed=None): + def get_configuration_space(seed: Union[int, None] = None) -> CS.ConfigurationSpace: """Parameter space to be optimized --- contains the hyperparameters """ cs = CS.ConfigurationSpace(seed=seed) @@ -62,10 +62,10 @@ def _get_fidelity_choices(subsample_choice: str) -> Hyperparameter: return subsample - def init_model(self, config, fidelity=None, rng=None): + def init_model(self, config: Dict, fidelity: Dict = None, rng: Union[int, np.random.RandomState, None] = None): # initializing model rng = self.rng if rng is None else rng - config = config.get_dictionary() + config = config model = SVC( **config, random_state=rng, diff --git a/hpobench/benchmarks/ml_mmfb/tabular_benchmark.py b/hpobench/benchmarks/ml_mmfb/tabular_benchmark.py index dd07ec02..9c8e0739 100644 --- a/hpobench/benchmarks/ml_mmfb/tabular_benchmark.py +++ b/hpobench/benchmarks/ml_mmfb/tabular_benchmark.py @@ -67,7 +67,7 @@ def get_meta_information(self) -> Dict: 'model': self.model } - def _preprocess_configspace(self, config_space): + def _preprocess_configspace(self, config_space: CS.ConfigurationSpace) -> CS.ConfigurationSpace: """ Converts floats to np.float32 """ for hp in config_space.get_hyperparameters(): hp.sequence = tuple(np.array(hp.sequence).astype(np.float32)) @@ -83,7 +83,7 @@ def _total_number_of_configurations(self, space: str = "hyperparameters") -> int count *= len(hp.sequence) return count - def _seeds_used(self): + def _seeds_used(self) -> List: return self.table.seed.unique().tolist() def sample_hyperparamer(self, n: int = 1) -> Union[CS.Configuration, List]: @@ -105,7 +105,7 @@ def get_max_fidelity(self) -> Dict: max_fidelity[hp.name] = np.sort(hp.sequence)[-1] return max_fidelity - def get_fidelity_range(self): + def get_fidelity_range(self) -> List: fidelities = [] for hp in self.fidelity_space.get_hyperparameters(): if not isinstance(hp, CS.Constant) and len(hp.sequence) > 1: @@ -126,11 +126,11 @@ def _search_dataframe(self, row_dict, df): def _objective( self, - config: CS.Configuration, - fidelity: CS.Configuration, + config: Dict, + fidelity: Dict, seed: Union[int, None] = None, metric: Union[str, None] = "acc", - evaluation: Union[str] = "" + evaluation: Union[str, None] = "" ) -> Dict: metric_str = ', '.join(list(metrics.keys)) diff --git a/hpobench/benchmarks/ml_mmfb/xgboost_benchmark.py b/hpobench/benchmarks/ml_mmfb/xgboost_benchmark.py index d77b0938..a5735b2c 100644 --- a/hpobench/benchmarks/ml_mmfb/xgboost_benchmark.py +++ b/hpobench/benchmarks/ml_mmfb/xgboost_benchmark.py @@ -1,4 +1,4 @@ -from typing import Union, Tuple +from typing import Union, Tuple, Dict import ConfigSpace as CS import numpy as np @@ -17,7 +17,7 @@ def __init__(self, super(XGBoostBaseBenchmark, self).__init__(task_id, rng, valid_size, data_path) @staticmethod - def get_configuration_space(seed=None): + def get_configuration_space(seed: Union[int, None] = None) -> CS.ConfigurationSpace: """Parameter space to be optimized 
--- contains the hyperparameters """ cs = CS.ConfigurationSpace(seed=seed) @@ -76,9 +76,18 @@ def _get_fidelity_choices(n_estimators_choice: str, subsample_choice: str) -> Tu subsample = fidelity2[subsample_choice] return n_estimators, subsample - def init_model(self, config, fidelity=None, rng=None): + def init_model(self, + config: Union[CS.Configuration, Dict], + fidelity: Union[CS.Configuration, Dict, None] = None, + rng: Union[int, np.random.RandomState, None] = None): """ Function that returns the model initialized based on the configuration and fidelity """ + if isinstance(config, CS.Configuration): + config = config.get_dictionary() + if isinstance(fidelity, CS.Configuration): + fidelity = fidelity.get_dictionary() + + # TODO: This seems to be wrong. (AND-condition) rng = rng if (rng is None and isinstance(rng, int)) else self.seed extra_args = dict( booster="gbtree", @@ -92,7 +101,7 @@ def init_model(self, config, fidelity=None, rng=None): extra_args.update({"num_class": self.n_classes}) model = xgb.XGBClassifier( - **config.get_dictionary(), + **config, **extra_args ) return model diff --git a/hpobench/dependencies/ml/ml_benchmark_template.py b/hpobench/dependencies/ml/ml_benchmark_template.py index 59e348d6..ff2ba55e 100644 --- a/hpobench/dependencies/ml/ml_benchmark_template.py +++ b/hpobench/dependencies/ml/ml_benchmark_template.py @@ -1,6 +1,6 @@ import time from pathlib import Path -from typing import Union, Dict +from typing import Union, Dict, Iterable import ConfigSpace as CS import numpy as np @@ -10,6 +10,7 @@ from hpobench.abstract_benchmark import AbstractBenchmark from hpobench.dependencies.ml.data_manager import OpenMLDataManager +from hpobench.util.rng_helper import get_rng metrics = dict( acc=accuracy_score, @@ -81,7 +82,7 @@ def __init__( self.configuration_space = self.get_configuration_space(self.seed) @staticmethod - def get_configuration_space(seed=None): + def get_configuration_space(seed: Union[int, None] = None) -> CS.ConfigurationSpace: """Parameter space to be optimized --- contains the hyperparameters """ raise NotImplementedError() @@ -112,32 +113,40 @@ def get_meta_information(self): 'task_id': self.task_id } - def init_model(self, config, fidelity=None, rng=None): + def init_model(self, config: Dict, fidelity: Dict = None, rng: Union[int, np.random.RandomState, None] = None): """ Function that returns the model initialized based on the configuration and fidelity """ raise NotImplementedError() - def get_config(self, size=None): + def get_config(self, size: Union[int, None] = None): """Samples configuration(s) from the (hyper) parameter space """ if size is None: # return only one config return self.configuration_space.sample_configuration() return [self.configuration_space.sample_configuration() for i in range(size)] - def get_fidelity(self, size=None): + def get_fidelity(self, size: Union[int, None] = None): """Samples candidate fidelities from the fidelity space """ if size is None: # return only one config return self.fidelity_space.sample_configuration() return [self.fidelity_space.sample_configuration() for i in range(size)] - def shuffle_data_idx(self, train_idx=None, rng=None): + def shuffle_data_idx(self, train_idx: Iterable = None, rng: Union[np.random.RandomState, None] = None) -> Iterable: rng = self.rng if rng is None else rng train_idx = self.train_idx if train_idx is None else train_idx rng.shuffle(train_idx) return train_idx - def _train_objective(self, config, fidelity, shuffle, rng, evaluation="valid"): + def _train_objective(self, 
config: Dict, + fidelity: Dict, + shuffle: bool, + rng: Union[np.random.RandomState, int, None] = None, + evaluation: Union[str, None] = "valid"): + + if rng is not None: + rng = get_rng(rng, self.rng) + # initializing model model = self.init_model(config, fidelity, rng) @@ -226,8 +235,8 @@ def objective_function(self, 'test_scores': test_scores, 'test_costs': test_score_cost, # storing as dictionary and not ConfigSpace saves tremendous memory - 'fidelity': fidelity.get_dictionary(), - 'config': configuration.get_dictionary() + 'fidelity': fidelity, + 'config': configuration, } return { @@ -269,8 +278,8 @@ def objective_function_test(self, 'test_scores': test_scores, 'test_costs': test_score_cost, # storing as dictionary and not ConfigSpace saves tremendous memory - 'fidelity': fidelity.get_dictionary(), - 'config': configuration.get_dictionary() + 'fidelity': fidelity, + 'config': configuration, } return { From c5f6979926cef5dc06998f36a42807708156c07b Mon Sep 17 00:00:00 2001 From: PhMueller Date: Mon, 16 Aug 2021 23:42:36 +0200 Subject: [PATCH 059/147] Fix minor errors. --- .../benchmarks/ml_mmfb/histgb_benchmark.py | 6 ++- .../benchmarks/ml_mmfb/tabular_benchmark.py | 49 ++++++++++++++----- .../dependencies/ml/ml_benchmark_template.py | 3 +- hpobench/util/data_manager.py | 15 +++--- 4 files changed, 50 insertions(+), 23 deletions(-) diff --git a/hpobench/benchmarks/ml_mmfb/histgb_benchmark.py b/hpobench/benchmarks/ml_mmfb/histgb_benchmark.py index 50b4fe5b..9507d81f 100644 --- a/hpobench/benchmarks/ml_mmfb/histgb_benchmark.py +++ b/hpobench/benchmarks/ml_mmfb/histgb_benchmark.py @@ -27,11 +27,13 @@ def get_configuration_space(seed: Union[int, None] = None) -> CS.ConfigurationSp CS.UniformIntegerHyperparameter( 'max_depth', lower=6, upper=30, default_value=6, log=True ), + # TODO: The parameter max_leaf_node is not accepted. Changed it from max_leaf_node to max_leaf_nodes CS.UniformIntegerHyperparameter( - 'max_leaf_node', lower=2, upper=64, default_value=32, log=True + 'max_leaf_nodes', lower=2, upper=64, default_value=32, log=True ), + # TODO: The parameter eta is not accepted. Do you mean learning_rate? 
Changed it from eta to learning_rate CS.UniformFloatHyperparameter( - 'eta', lower=2**-10, upper=1, default_value=0.1, log=True + 'learning_rate', lower=2**-10, upper=1, default_value=0.1, log=True ), CS.UniformFloatHyperparameter( 'l2_regularization', lower=2**-10, upper=2**10, default_value=0.1, log=True diff --git a/hpobench/benchmarks/ml_mmfb/tabular_benchmark.py b/hpobench/benchmarks/ml_mmfb/tabular_benchmark.py index 9c8e0739..0225b361 100644 --- a/hpobench/benchmarks/ml_mmfb/tabular_benchmark.py +++ b/hpobench/benchmarks/ml_mmfb/tabular_benchmark.py @@ -12,20 +12,25 @@ class BaseTabularBenchmark(AbstractBenchmark): - def __init__(self, model: str, task_id: int, data_dir: Union[Path, str, None] = None, + def __init__(self, + model: str, task_id: int, + data_dir: Union[Path, str, None] = None, rng: Union[int, np.random.RandomState, None] = None, **kwargs): - super(BaseTabularBenchmark, self).__init__(rng=rng, **kwargs) + assert model in ['lr', 'svm', 'xgb'], f'Parameter `model` has to be one of [lr, svm, xgb] but was {model}' self.task_id = task_id self.model = model - self.table, self.metadata = TabularDataManager(model, task_id, data_dir) + self.dm = TabularDataManager(model, task_id, data_dir) + self.table, self.metadata = self.dm.load() self.exp_args = self.metadata["exp_args"] self.config_spaces = self.metadata["config_spaces"] self.global_minimums = self.metadata["global_min"] + super(BaseTabularBenchmark, self).__init__(rng=rng, **kwargs) + @AbstractBenchmark.check_parameters def objective_function(self, configuration: Union[CS.Configuration, Dict], @@ -113,16 +118,33 @@ def get_fidelity_range(self) -> List: return fidelities def _search_dataframe(self, row_dict, df): - # https://stackoverflow.com/a/46165056/8363967 - mask = np.array([True] * df.shape[0]) - for i, param in enumerate(df.drop("result", axis=1).columns): - mask *= df[param].values == row_dict[param] - idx = np.where(mask) - if len(idx) != 1: + query_stmt = self._build_query(row_dict) + result = df.query(query_stmt) + # TODO: What happens in this case? The objective function raises a TypeError. + if len(result) == 0: return None - idx = idx[0][0] - result = df.iloc[idx]["result"] - return result + return result.iloc[0].loc['result'] + + # TODO: This created an out-of-bounds error. The idx mask should have been 2d, but was 1d. + # # https://stackoverflow.com/a/46165056/8363967 + # mask = np.array([True] * df.shape[0]) + # for i, param in enumerate(df.drop("result", axis=1).columns): + # mask *= df[param].values == row_dict[param] + # idx = np.where(mask) + # if len(idx) != 1: + # return None + # idx = idx[0][0] + # result = df.iloc[idx]["result"] + # return result + + @staticmethod + def _build_query(row_dict: Dict) -> str: + query = '' + for i, (param_name, param_value) in enumerate(row_dict.items()): + if i != 0: + query += ' & ' + query += f'{param_name} == {param_value}' + return query def _objective( self, @@ -133,12 +155,13 @@ def _objective( evaluation: Union[str, None] = "" ) -> Dict: - metric_str = ', '.join(list(metrics.keys)) + metric_str = ', '.join(list(metrics.keys())) assert metric in list(metrics.keys()), f"metric not found among: {metric_str}" score_key = f"{evaluation}_scores" cost_key = f"{evaluation}_scores" key_path = dict() + # TODO: Dicts are unordered. This does not have to have an effect. 
for name in np.sort(self.configuration_space.get_hyperparameter_names()): key_path[str(name)] = config[str(name)] for name in np.sort(self.fidelity_space.get_hyperparameter_names()): diff --git a/hpobench/dependencies/ml/ml_benchmark_template.py b/hpobench/dependencies/ml/ml_benchmark_template.py index ff2ba55e..d256c081 100644 --- a/hpobench/dependencies/ml/ml_benchmark_template.py +++ b/hpobench/dependencies/ml/ml_benchmark_template.py @@ -138,7 +138,8 @@ def shuffle_data_idx(self, train_idx: Iterable = None, rng: Union[np.random.Rand rng.shuffle(train_idx) return train_idx - def _train_objective(self, config: Dict, + def _train_objective(self, + config: Dict, fidelity: Dict, shuffle: bool, rng: Union[np.random.RandomState, int, None] = None, diff --git a/hpobench/util/data_manager.py b/hpobench/util/data_manager.py index 9e6f8fb9..258f7d56 100644 --- a/hpobench/util/data_manager.py +++ b/hpobench/util/data_manager.py @@ -93,7 +93,7 @@ def _download_file_with_progressbar(self, data_url: str, data_file: Path): if chunk: _ = f.write(chunk) f.flush() - self.logger.info("Finished downloading") + self.logger.info(f"Finished downloading to {data_file}") @lockutils.synchronized('not_thread_process_safe', external=True, lock_path=f'{hpobench.config_file.cache_dir}/lock_unzip_file', delay=0.5) @@ -929,20 +929,21 @@ class TabularDataManager(DataManager): def __init__(self, model: str, task_id: [int, str], data_dir: [str, Path, None] = None): super(TabularDataManager, self).__init__() - assert model in ['lr', 'svm'] + assert model in ['lr', 'svm', 'xgb'], f'Model has to be one of [lr, svm, xgb] but was {model}' self.model = model self.task_id = str(task_id) - url_svm = 'https://figshare.com/s/5a0929ad9b2ccd8dda58' - url_lr = 'https://ndownloader.figshare.com/files/29027112?private_link=d644493a93dbab4b4ee1' + url_dict = dict(xgb='https://ndownloader.figshare.com/files/29113257?private_link=c817bed4e7efc6daee91', + svm='https://ndownloader.figshare.com/files/29102307?private_link=5a0929ad9b2ccd8dda58', + lr='https://ndownloader.figshare.com/files/29027112?private_link=d644493a93dbab4b4ee1') - self.url_to_use = url_svm if model == 'svm' else url_lr + self.url_to_use = url_dict.get(model) if data_dir is None: - data_dir = hpobench.config_file.data_dir / "TabularData" + data_dir = hpobench.config_file.data_dir - self._save_dir = Path(data_dir) + self._save_dir = Path(data_dir) / "TabularData" / self.model self.create_save_directory(self._save_dir) self.parquet_file = self._save_dir / self.task_id / f'{self.model}_{self.task_id}_data.parquet.gzip' From 48af58decdc329924ff6425f7193551cf3fa0d10 Mon Sep 17 00:00:00 2001 From: PhMueller Date: Mon, 16 Aug 2021 23:45:17 +0200 Subject: [PATCH 060/147] Fix minor errors. 
--- hpobench/benchmarks/ml_mmfb/entry_point.py | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/hpobench/benchmarks/ml_mmfb/entry_point.py b/hpobench/benchmarks/ml_mmfb/entry_point.py index 4ec917a6..1b380a51 100644 --- a/hpobench/benchmarks/ml_mmfb/entry_point.py +++ b/hpobench/benchmarks/ml_mmfb/entry_point.py @@ -10,3 +10,15 @@ from hpobench.benchmarks.ml_mmfb.tabular_benchmark import TabularBenchmark, OriginalTabularBenchmark from hpobench.benchmarks.ml_mmfb.xgboost_benchmark import XGBoostSearchSpace0Benchmark, XGBoostSearchSpace1Benchmark,\ XGBoostSearchSpace2Benchmark, XGBoostSearchSpace3Benchmark + + +__all__ = [HistGBSearchSpace0Benchmark, HistGBSearchSpace1Benchmark, HistGBSearchSpace2Benchmark, + HistGBSearchSpace3Benchmark, + LRSearchSpace0Benchmark, LRSearchSpace1Benchmark, LRSearchSpace2Benchmark, LRSearchSpace3Benchmark, + NNSearchSpace0Benchmark, NNSearchSpace1Benchmark, NNSearchSpace2Benchmark, NNSearchSpace3Benchmark, + RandomForestSearchSpace0Benchmark, RandomForestSearchSpace1Benchmark, RandomForestSearchSpace2Benchmark, + RandomForestSearchSpace3Benchmark, + SVMSearchSpace0Benchmark, SVMSearchSpace1Benchmark, + TabularBenchmark, OriginalTabularBenchmark, + XGBoostSearchSpace0Benchmark, XGBoostSearchSpace1Benchmark, XGBoostSearchSpace2Benchmark, + XGBoostSearchSpace3Benchmark] From cf24488d0aabe0bac71eecd71b010b65f3b76491 Mon Sep 17 00:00:00 2001 From: PhMueller Date: Tue, 17 Aug 2021 00:05:18 +0200 Subject: [PATCH 061/147] Pylint --- hpobench/benchmarks/ml_mmfb/tabular_benchmark.py | 2 ++ hpobench/dependencies/ml/data_manager.py | 1 + hpobench/util/data_manager.py | 2 ++ 3 files changed, 5 insertions(+) diff --git a/hpobench/benchmarks/ml_mmfb/tabular_benchmark.py b/hpobench/benchmarks/ml_mmfb/tabular_benchmark.py index 0225b361..907b5e51 100644 --- a/hpobench/benchmarks/ml_mmfb/tabular_benchmark.py +++ b/hpobench/benchmarks/ml_mmfb/tabular_benchmark.py @@ -31,6 +31,7 @@ def __init__(self, super(BaseTabularBenchmark, self).__init__(rng=rng, **kwargs) + # pylint: disable=arguments-differ @AbstractBenchmark.check_parameters def objective_function(self, configuration: Union[CS.Configuration, Dict], @@ -43,6 +44,7 @@ def objective_function(self, result = self._objective(configuration, fidelity, seed, metric, evaluation="val") return result + # pylint: disable=arguments-differ @AbstractBenchmark.check_parameters def objective_function_test(self, configuration: Union[CS.Configuration, Dict], diff --git a/hpobench/dependencies/ml/data_manager.py b/hpobench/dependencies/ml/data_manager.py index 55210933..84d8b587 100644 --- a/hpobench/dependencies/ml/data_manager.py +++ b/hpobench/dependencies/ml/data_manager.py @@ -53,6 +53,7 @@ def __init__(self, task_id: int, super(OpenMLDataManager, self).__init__() + # pylint: disable=arguments-differ @lockutils.synchronized('not_thread_process_safe', external=True, lock_path=f'{config_file.cache_dir}/openml_dm_lock', delay=0.2) def load(self, valid_size=None, verbose=False): diff --git a/hpobench/util/data_manager.py b/hpobench/util/data_manager.py index 258f7d56..d390218e 100644 --- a/hpobench/util/data_manager.py +++ b/hpobench/util/data_manager.py @@ -646,6 +646,7 @@ def _check_availability_and_download(self): f.flush() self.logger.info("Finished downloading") + # pylint: disable=arguments-differ @lockutils.synchronized('not_thread_process_safe', external=True, lock_path=f'{hpobench.config_file.cache_dir}/lock_surrogates_unzip_data', delay=0.5) def _unzip_data(self): @@ -949,6 +950,7 @@ def __init__(self, model: str, 
task_id: [int, str], data_dir: [str, Path, None] self.parquet_file = self._save_dir / self.task_id / f'{self.model}_{self.task_id}_data.parquet.gzip' self.metadata_file = self._save_dir / self.task_id / f'{self.model}_{self.task_id}_metadata.json' + # pylint: disable=arguments-differ def load(self): # Can we directly load the files? if self.parquet_file.exists() and self.metadata_file.exists(): From 528dde18d7ff8b211c8ede1b4f2e2f0e67212713 Mon Sep 17 00:00:00 2001 From: PhMueller Date: Tue, 17 Aug 2021 16:34:05 +0200 Subject: [PATCH 062/147] Init Model can handle now Configurations --- hpobench/benchmarks/ml_mmfb/histgb_benchmark.py | 10 +++++++++- hpobench/benchmarks/ml_mmfb/lr_benchmark.py | 10 +++++++++- hpobench/benchmarks/ml_mmfb/nn_benchmark.py | 10 +++++++++- hpobench/benchmarks/ml_mmfb/rf_benchmark.py | 9 ++++++++- hpobench/benchmarks/ml_mmfb/svm_benchmark.py | 7 +++++-- hpobench/dependencies/ml/ml_benchmark_template.py | 4 +++- 6 files changed, 43 insertions(+), 7 deletions(-) diff --git a/hpobench/benchmarks/ml_mmfb/histgb_benchmark.py b/hpobench/benchmarks/ml_mmfb/histgb_benchmark.py index 9507d81f..dd36694c 100644 --- a/hpobench/benchmarks/ml_mmfb/histgb_benchmark.py +++ b/hpobench/benchmarks/ml_mmfb/histgb_benchmark.py @@ -79,10 +79,18 @@ def _get_fidelity_choices(ntrees_choice: str, subsample_choice: str) -> Tuple[Hy subsample = fidelity2[subsample_choice] return ntrees, subsample - def init_model(self, config: Dict, fidelity: Dict = None, rng: Union[int, np.random.RandomState, None] = None): + def init_model(self, config: Union[CS.Configuration, Dict], + fidelity: Union[CS.Configuration, Dict, None] = None, + rng: Union[int, np.random.RandomState, None] = None): """ Function that returns the model initialized based on the configuration and fidelity """ rng = self.rng if rng is None else rng + + if isinstance(config, CS.Configuration): + config = config.get_dictionary() + if isinstance(fidelity, CS.Configuration): + fidelity = fidelity.get_dictionary() + model = HistGradientBoostingClassifier( **config, max_iter=fidelity['n_estimators'], # a fidelity being used during initialization diff --git a/hpobench/benchmarks/ml_mmfb/lr_benchmark.py b/hpobench/benchmarks/ml_mmfb/lr_benchmark.py index 944e77c4..5b9d054b 100644 --- a/hpobench/benchmarks/ml_mmfb/lr_benchmark.py +++ b/hpobench/benchmarks/ml_mmfb/lr_benchmark.py @@ -68,9 +68,17 @@ def _get_fidelity_choices(iter_choice: str, subsample_choice: str) -> Tuple[Hype subsample = fidelity2[subsample_choice] return iter, subsample - def init_model(self, config: Dict, fidelity: Dict = None, rng: Union[int, np.random.RandomState, None] = None): + def init_model(self, config: Union[CS.Configuration, Dict], + fidelity: Union[CS.Configuration, Dict, None] = None, + rng: Union[int, np.random.RandomState, None] = None): # initializing model rng = self.rng if rng is None else rng + + if isinstance(config, CS.Configuration): + config = config.get_dictionary() + if isinstance(fidelity, CS.Configuration): + fidelity = fidelity.get_dictionary() + # https://scikit-learn.org/stable/modules/sgd.html model = SGDClassifier( **config, diff --git a/hpobench/benchmarks/ml_mmfb/nn_benchmark.py b/hpobench/benchmarks/ml_mmfb/nn_benchmark.py index 4826f7fe..601efe23 100644 --- a/hpobench/benchmarks/ml_mmfb/nn_benchmark.py +++ b/hpobench/benchmarks/ml_mmfb/nn_benchmark.py @@ -76,10 +76,18 @@ def _get_fidelity_choices(iter_choice: str, subsample_choice: str) -> Tuple[Hype subsample = fidelity2[subsample_choice] return iter, subsample - def init_model(self, 
config: Dict, fidelity: Dict = None, rng: Union[int, np.random.RandomState, None] = None): + def init_model(self, config: Union[CS.Configuration, Dict], + fidelity: Union[CS.Configuration, Dict, None] = None, + rng: Union[int, np.random.RandomState, None] = None): """ Function that returns the model initialized based on the configuration and fidelity """ rng = self.rng if rng is None else rng + + if isinstance(config, CS.Configuration): + config = config.get_dictionary() + if isinstance(fidelity, CS.Configuration): + fidelity = fidelity.get_dictionary() + config = deepcopy(config) depth = config["depth"] width = config["width"] diff --git a/hpobench/benchmarks/ml_mmfb/rf_benchmark.py b/hpobench/benchmarks/ml_mmfb/rf_benchmark.py index b17f74d1..838d956d 100644 --- a/hpobench/benchmarks/ml_mmfb/rf_benchmark.py +++ b/hpobench/benchmarks/ml_mmfb/rf_benchmark.py @@ -77,10 +77,17 @@ def _get_fidelity_choices(n_estimators_choice: str, subsample_choice: str) -> Tu subsample = fidelity2[subsample_choice] return n_estimators, subsample - def init_model(self, config: Dict, fidelity: Dict = None, rng: Union[int, np.random.RandomState, None] = None): + def init_model(self, config: Union[CS.Configuration, Dict], + fidelity: Union[CS.Configuration, Dict, None] = None, + rng: Union[int, np.random.RandomState, None] = None): """ Function that returns the model initialized based on the configuration and fidelity """ rng = self.rng if rng is None else rng + if isinstance(config, CS.Configuration): + config = config.get_dictionary() + if isinstance(fidelity, CS.Configuration): + fidelity = fidelity.get_dictionary() + config = deepcopy(config) n_features = self.train_X.shape[1] config["max_features"] = int(np.rint(np.power(n_features, config["max_features"]))) diff --git a/hpobench/benchmarks/ml_mmfb/svm_benchmark.py b/hpobench/benchmarks/ml_mmfb/svm_benchmark.py index bc439fed..0ae25e18 100644 --- a/hpobench/benchmarks/ml_mmfb/svm_benchmark.py +++ b/hpobench/benchmarks/ml_mmfb/svm_benchmark.py @@ -62,10 +62,13 @@ def _get_fidelity_choices(subsample_choice: str) -> Hyperparameter: return subsample - def init_model(self, config: Dict, fidelity: Dict = None, rng: Union[int, np.random.RandomState, None] = None): + def init_model(self, config: Union[CS.Configuration, Dict], + fidelity: Union[CS.Configuration, Dict, None] = None, + rng: Union[int, np.random.RandomState, None] = None): # initializing model rng = self.rng if rng is None else rng - config = config + if isinstance(config, CS.Configuration): + config = config.get_dictionary() model = SVC( **config, random_state=rng, diff --git a/hpobench/dependencies/ml/ml_benchmark_template.py b/hpobench/dependencies/ml/ml_benchmark_template.py index d256c081..8460b113 100644 --- a/hpobench/dependencies/ml/ml_benchmark_template.py +++ b/hpobench/dependencies/ml/ml_benchmark_template.py @@ -113,7 +113,9 @@ def get_meta_information(self): 'task_id': self.task_id } - def init_model(self, config: Dict, fidelity: Dict = None, rng: Union[int, np.random.RandomState, None] = None): + def init_model(self, config: Union[CS.Configuration, Dict], + fidelity: Union[CS.Configuration, Dict, None] = None, + rng: Union[int, np.random.RandomState, None] = None): """ Function that returns the model initialized based on the configuration and fidelity """ raise NotImplementedError() From 6bdf5c019c6904e8960416b87e6e95e9dbaa9ab1 Mon Sep 17 00:00:00 2001 From: PhMueller Date: Tue, 17 Aug 2021 17:22:32 +0200 Subject: [PATCH 063/147] PR Requests: Rename Classes --- 
hpobench/benchmarks/ml_mmfb/entry_point.py | 33 +++++------- .../benchmarks/ml_mmfb/histgb_benchmark.py | 47 ++++------------ hpobench/benchmarks/ml_mmfb/lr_benchmark.py | 45 ++++++---------- hpobench/benchmarks/ml_mmfb/nn_benchmark.py | 54 +++++-------------- hpobench/benchmarks/ml_mmfb/rf_benchmark.py | 53 +++++------------- hpobench/benchmarks/ml_mmfb/svm_benchmark.py | 33 ++++-------- .../benchmarks/ml_mmfb/xgboost_benchmark.py | 53 +++++------------- .../benchmarks/ml_mmfb/histgb_benchmark.py | 29 ++++------ .../benchmarks/ml_mmfb/lr_benchmark.py | 29 ++++------ .../benchmarks/ml_mmfb/nn_benchmark.py | 29 ++++------ .../benchmarks/ml_mmfb/rf_benchmark.py | 29 ++++------ .../benchmarks/ml_mmfb/svm_benchmark.py | 22 +++++--- .../benchmarks/ml_mmfb/xgboost_benchmark.py | 21 ++++---- 13 files changed, 153 insertions(+), 324 deletions(-) diff --git a/hpobench/benchmarks/ml_mmfb/entry_point.py b/hpobench/benchmarks/ml_mmfb/entry_point.py index 1b380a51..0114acaa 100644 --- a/hpobench/benchmarks/ml_mmfb/entry_point.py +++ b/hpobench/benchmarks/ml_mmfb/entry_point.py @@ -1,24 +1,17 @@ -from hpobench.benchmarks.ml_mmfb.histgb_benchmark import HistGBSearchSpace0Benchmark, HistGBSearchSpace1Benchmark, \ - HistGBSearchSpace2Benchmark, HistGBSearchSpace3Benchmark -from hpobench.benchmarks.ml_mmfb.lr_benchmark import LRSearchSpace0Benchmark, LRSearchSpace1Benchmark, \ - LRSearchSpace2Benchmark, LRSearchSpace3Benchmark -from hpobench.benchmarks.ml_mmfb.nn_benchmark import NNSearchSpace0Benchmark, NNSearchSpace1Benchmark, \ - NNSearchSpace2Benchmark, NNSearchSpace3Benchmark -from hpobench.benchmarks.ml_mmfb.rf_benchmark import RandomForestSearchSpace0Benchmark, \ - RandomForestSearchSpace1Benchmark, RandomForestSearchSpace2Benchmark, RandomForestSearchSpace3Benchmark -from hpobench.benchmarks.ml_mmfb.svm_benchmark import SVMSearchSpace0Benchmark, SVMSearchSpace1Benchmark +from hpobench.benchmarks.ml_mmfb.histgb_benchmark import HistGBBenchmark, HistGBBenchmarkBB, HistGBBenchmarkMF +from hpobench.benchmarks.ml_mmfb.lr_benchmark import LRBenchmark, LRBenchmarkBB, LRBenchmarkMF +from hpobench.benchmarks.ml_mmfb.nn_benchmark import NNBenchmark, NNBenchmarkBB, NNBenchmarkMF +from hpobench.benchmarks.ml_mmfb.rf_benchmark import RandomForestBenchmark, RandomForestBenchmarkBB, \ + RandomForestBenchmarkMF +from hpobench.benchmarks.ml_mmfb.svm_benchmark import SVMBenchmark, SVMBenchmarkBB, SVMBenchmarkMF from hpobench.benchmarks.ml_mmfb.tabular_benchmark import TabularBenchmark, OriginalTabularBenchmark -from hpobench.benchmarks.ml_mmfb.xgboost_benchmark import XGBoostSearchSpace0Benchmark, XGBoostSearchSpace1Benchmark,\ - XGBoostSearchSpace2Benchmark, XGBoostSearchSpace3Benchmark +from hpobench.benchmarks.ml_mmfb.xgboost_benchmark import XGBoostBenchmark, XGBoostBenchmarkBB, XGBoostBenchmarkMF -__all__ = [HistGBSearchSpace0Benchmark, HistGBSearchSpace1Benchmark, HistGBSearchSpace2Benchmark, - HistGBSearchSpace3Benchmark, - LRSearchSpace0Benchmark, LRSearchSpace1Benchmark, LRSearchSpace2Benchmark, LRSearchSpace3Benchmark, - NNSearchSpace0Benchmark, NNSearchSpace1Benchmark, NNSearchSpace2Benchmark, NNSearchSpace3Benchmark, - RandomForestSearchSpace0Benchmark, RandomForestSearchSpace1Benchmark, RandomForestSearchSpace2Benchmark, - RandomForestSearchSpace3Benchmark, - SVMSearchSpace0Benchmark, SVMSearchSpace1Benchmark, +__all__ = [HistGBBenchmark, HistGBBenchmarkBB, HistGBBenchmarkMF, + LRBenchmark, LRBenchmarkBB, LRBenchmarkMF, + NNBenchmark, NNBenchmarkBB, NNBenchmarkMF, + RandomForestBenchmark, 
RandomForestBenchmarkBB, RandomForestBenchmarkMF, + SVMBenchmark, SVMBenchmarkBB, SVMBenchmarkMF, TabularBenchmark, OriginalTabularBenchmark, - XGBoostSearchSpace0Benchmark, XGBoostSearchSpace1Benchmark, XGBoostSearchSpace2Benchmark, - XGBoostSearchSpace3Benchmark] + XGBoostBenchmark, XGBoostBenchmarkBB, XGBoostBenchmarkMF] diff --git a/hpobench/benchmarks/ml_mmfb/histgb_benchmark.py b/hpobench/benchmarks/ml_mmfb/histgb_benchmark.py index dd36694c..5d164503 100644 --- a/hpobench/benchmarks/ml_mmfb/histgb_benchmark.py +++ b/hpobench/benchmarks/ml_mmfb/histgb_benchmark.py @@ -27,11 +27,9 @@ def get_configuration_space(seed: Union[int, None] = None) -> CS.ConfigurationSp CS.UniformIntegerHyperparameter( 'max_depth', lower=6, upper=30, default_value=6, log=True ), - # TODO: The parameter max_leaf_node is not accepted. Changed it from max_leaf_node to max_leaf_nodes CS.UniformIntegerHyperparameter( 'max_leaf_nodes', lower=2, upper=64, default_value=32, log=True ), - # TODO: The parameter eta is not accepted. Do you mean learning_rate? Changed it from eta to learning_rate CS.UniformFloatHyperparameter( 'learning_rate', lower=2**-10, upper=1, default_value=0.1, log=True ), @@ -43,18 +41,12 @@ def get_configuration_space(seed: Union[int, None] = None) -> CS.ConfigurationSp @staticmethod def get_fidelity_space(seed: Union[int, None] = None) -> CS.ConfigurationSpace: - """Fidelity space available --- specifies the fidelity dimensions - - If fidelity_choice is 0 - Fidelity space is the maximal fidelity, akin to a black-box function - If fidelity_choice is 1 - Fidelity space is a single fidelity, in this case the number of trees (n_estimators) - If fidelity_choice is 2 - Fidelity space is a single fidelity, in this case the fraction of dataset (subsample) - If fidelity_choice is >2 - Fidelity space is multi-multi fidelity, all possible fidelities - """ - raise NotImplementedError() + fidelity_space = CS.ConfigurationSpace(seed=seed) + fidelity_space.add_hyperparameters( + # gray-box setting (multi-multi-fidelity) - ntrees + data subsample + HistGBBenchmark._get_fidelity_choices(ntrees_choice='variable', subsample_choice='variable') + ) + return fidelity_space @staticmethod def _get_fidelity_choices(ntrees_choice: str, subsample_choice: str) -> Tuple[Hyperparameter, Hyperparameter]: @@ -100,7 +92,7 @@ def init_model(self, config: Union[CS.Configuration, Dict], return model -class HistGBSearchSpace0Benchmark(HistGBBenchmark): +class HistGBBenchmarkBB(HistGBBenchmark): def get_fidelity_space(self, seed: Union[int, None] = None) -> CS.ConfigurationSpace: fidelity_space = CS.ConfigurationSpace(seed=seed) fidelity_space.add_hyperparameters( @@ -110,7 +102,7 @@ def get_fidelity_space(self, seed: Union[int, None] = None) -> CS.ConfigurationS return fidelity_space -class HistGBSearchSpace1Benchmark(HistGBBenchmark): +class HistGBBenchmarkMF(HistGBBenchmark): def get_fidelity_space(self, seed: Union[int, None] = None) -> CS.ConfigurationSpace: fidelity_space = CS.ConfigurationSpace(seed=seed) fidelity_space.add_hyperparameters( @@ -120,25 +112,4 @@ def get_fidelity_space(self, seed: Union[int, None] = None) -> CS.ConfigurationS return fidelity_space -class HistGBSearchSpace2Benchmark(HistGBBenchmark): - def get_fidelity_space(self, seed: Union[int, None] = None) -> CS.ConfigurationSpace: - fidelity_space = CS.ConfigurationSpace(seed=seed) - fidelity_space.add_hyperparameters( - # gray-box setting (multi-fidelity) - subsample - HistGBBenchmark._get_fidelity_choices(ntrees_choice='fixed', 
subsample_choice='variable') - ) - return fidelity_space - - -class HistGBSearchSpace3Benchmark(HistGBBenchmark): - def get_fidelity_space(self, seed: Union[int, None] = None) -> CS.ConfigurationSpace: - fidelity_space = CS.ConfigurationSpace(seed=seed) - fidelity_space.add_hyperparameters( - # gray-box setting (multi-multi-fidelity) - ntrees + data subsample - HistGBBenchmark._get_fidelity_choices(ntrees_choice='variable', subsample_choice='variable') - ) - return fidelity_space - - -__all__ = [HistGBSearchSpace0Benchmark, HistGBSearchSpace1Benchmark, - HistGBSearchSpace2Benchmark, HistGBSearchSpace3Benchmark] +__all__ = [HistGBBenchmark, HistGBBenchmarkBB, HistGBBenchmarkMF] diff --git a/hpobench/benchmarks/ml_mmfb/lr_benchmark.py b/hpobench/benchmarks/ml_mmfb/lr_benchmark.py index 5b9d054b..0154e623 100644 --- a/hpobench/benchmarks/ml_mmfb/lr_benchmark.py +++ b/hpobench/benchmarks/ml_mmfb/lr_benchmark.py @@ -8,15 +8,15 @@ from hpobench.dependencies.ml.ml_benchmark_template import MLBenchmark -class LRBaseBenchmark(MLBenchmark): +class LRBenchmark(MLBenchmark): def __init__(self, task_id: int, rng: Union[np.random.RandomState, int, None] = None, valid_size: float = 0.33, data_path: Union[str, None] = None): - super(LRBaseBenchmark, self).__init__(task_id, rng, valid_size, data_path) - self.cache_size = 500 # TODO: Do we need this? + super(LRBenchmark, self).__init__(task_id, rng, valid_size, data_path) + self.cache_size = 500 @staticmethod def get_configuration_space(seed: Union[int, None] = None) -> CS.ConfigurationSpace: @@ -34,7 +34,12 @@ def get_configuration_space(seed: Union[int, None] = None) -> CS.ConfigurationSp return cs def get_fidelity_space(self, seed: Union[int, None] = None) -> CS.ConfigurationSpace: - raise NotImplementedError() + fidelity_space = CS.ConfigurationSpace(seed=seed) + fidelity_space.add_hyperparameters( + # gray-box setting (multi-multi-fidelity) - iterations + data subsample + LRBenchmark._get_fidelity_choices(iter_choice='variable', subsample_choice='variable') + ) + return fidelity_space @staticmethod def _get_fidelity_choices(iter_choice: str, subsample_choice: str) -> Tuple[Hyperparameter, Hyperparameter]: @@ -87,49 +92,29 @@ def init_model(self, config: Union[CS.Configuration, Dict], learning_rate="adaptive", tol=None, random_state=rng, + ) return model -class LRSearchSpace0Benchmark(LRBaseBenchmark): +class LRBenchmarkBB(LRBenchmark): def get_fidelity_space(self, seed: Union[int, None] = None) -> CS.ConfigurationSpace: fidelity_space = CS.ConfigurationSpace(seed=seed) fidelity_space.add_hyperparameters( # black-box setting (full fidelity) - LRBaseBenchmark._get_fidelity_choices(iter_choice='fixed', subsample_choice='fixed') + LRBenchmark._get_fidelity_choices(iter_choice='fixed', subsample_choice='fixed') ) return fidelity_space -class LRSearchSpace1Benchmark(LRBaseBenchmark): +class LRBenchmarkMF(LRBenchmark): def get_fidelity_space(self, seed: Union[int, None] = None) -> CS.ConfigurationSpace: fidelity_space = CS.ConfigurationSpace(seed=seed) fidelity_space.add_hyperparameters( # gray-box setting (multi-fidelity) - iterations - LRBaseBenchmark._get_fidelity_choices(iter_choice='variable', subsample_choice='fixed') - ) - return fidelity_space - - -class LRSearchSpace2Benchmark(LRBaseBenchmark): - def get_fidelity_space(self, seed: Union[int, None] = None) -> CS.ConfigurationSpace: - fidelity_space = CS.ConfigurationSpace(seed=seed) - fidelity_space.add_hyperparameters( - # gray-box setting (multi-fidelity) - data subsample - 
LRBaseBenchmark._get_fidelity_choices(iter_choice='fixed', subsample_choice='variable') - ) - return fidelity_space - - -class LRSearchSpace3Benchmark(LRBaseBenchmark): - def get_fidelity_space(self, seed: Union[int, None] = None) -> CS.ConfigurationSpace: - fidelity_space = CS.ConfigurationSpace(seed=seed) - fidelity_space.add_hyperparameters( - # gray-box setting (multi-multi-fidelity) - iterations + data subsample - LRBaseBenchmark._get_fidelity_choices(iter_choice='variable', subsample_choice='variable') + LRBenchmark._get_fidelity_choices(iter_choice='variable', subsample_choice='fixed') ) return fidelity_space -__all__ = [LRSearchSpace0Benchmark, LRSearchSpace1Benchmark, - LRSearchSpace2Benchmark, LRSearchSpace3Benchmark] +__all__ = [LRBenchmark, LRBenchmarkBB, LRBenchmarkMF] diff --git a/hpobench/benchmarks/ml_mmfb/nn_benchmark.py b/hpobench/benchmarks/ml_mmfb/nn_benchmark.py index 601efe23..ca3afa7c 100644 --- a/hpobench/benchmarks/ml_mmfb/nn_benchmark.py +++ b/hpobench/benchmarks/ml_mmfb/nn_benchmark.py @@ -9,13 +9,13 @@ from hpobench.dependencies.ml.ml_benchmark_template import MLBenchmark -class NNBaseBenchmark(MLBenchmark): +class NNBenchmark(MLBenchmark): def __init__(self, task_id: int, rng: Union[np.random.RandomState, int, None] = None, valid_size: float = 0.33, data_path: Union[str, None] = None): - super(NNBaseBenchmark, self).__init__(task_id, rng, valid_size, data_path) + super(NNBenchmark, self).__init__(task_id, rng, valid_size, data_path) @staticmethod def get_configuration_space(seed: Union[int, None] = None) -> CS.ConfigurationSpace: @@ -44,18 +44,13 @@ def get_configuration_space(seed: Union[int, None] = None) -> CS.ConfigurationSp @staticmethod def get_fidelity_space(seed: Union[int, None] = None) -> CS.ConfigurationSpace: - """Fidelity space available --- specifies the fidelity dimensions - - If fidelity_choice is 0 - Fidelity space is the maximal fidelity, akin to a black-box function - If fidelity_choice is 1 - Fidelity space is a single fidelity, in this case the number of epochs (max_iter) - If fidelity_choice is 2 - Fidelity space is a single fidelity, in this case the fraction of dataset (subsample) - If fidelity_choice is >2 - Fidelity space is multi-multi fidelity, all possible fidelities - """ - raise NotImplementedError() + + fidelity_space = CS.ConfigurationSpace(seed=seed) + fidelity_space.add_hyperparameters( + # gray-box setting (multi-multi-fidelity) - iterations + data subsample + NNBenchmark._get_fidelity_choices(iter_choice='variable', subsample_choice='variable') + ) + return fidelity_space @staticmethod def _get_fidelity_choices(iter_choice: str, subsample_choice: str) -> Tuple[Hyperparameter, Hyperparameter]: @@ -105,45 +100,24 @@ def init_model(self, config: Union[CS.Configuration, Dict], return model -class NNSearchSpace0Benchmark(NNBaseBenchmark): +class NNBenchmarkBB(NNBenchmark): def get_fidelity_space(self, seed: Union[int, None] = None) -> CS.ConfigurationSpace: fidelity_space = CS.ConfigurationSpace(seed=seed) fidelity_space.add_hyperparameters( # black-box setting (full fidelity) - NNSearchSpace0Benchmark._get_fidelity_choices(iter_choice='fixed', subsample_choice='fixed') + NNBenchmarkBB._get_fidelity_choices(iter_choice='fixed', subsample_choice='fixed') ) return fidelity_space -class NNSearchSpace1Benchmark(NNBaseBenchmark): +class NNBenchmarkMF(NNBenchmark): def get_fidelity_space(self, seed: Union[int, None] = None) -> CS.ConfigurationSpace: fidelity_space = CS.ConfigurationSpace(seed=seed) fidelity_space.add_hyperparameters( # 
gray-box setting (multi-fidelity) - iterations - NNSearchSpace1Benchmark._get_fidelity_choices(iter_choice='variable', subsample_choice='fixed') - ) - return fidelity_space - - -class NNSearchSpace2Benchmark(NNBaseBenchmark): - def get_fidelity_space(self, seed: Union[int, None] = None) -> CS.ConfigurationSpace: - fidelity_space = CS.ConfigurationSpace(seed=seed) - fidelity_space.add_hyperparameters( - # gray-box setting (multi-fidelity) - subsample - NNSearchSpace2Benchmark._get_fidelity_choices(iter_choice='fixed', subsample_choice='variable') - ) - return fidelity_space - - -class NNSearchSpace3Benchmark(NNBaseBenchmark): - def get_fidelity_space(self, seed: Union[int, None] = None) -> CS.ConfigurationSpace: - fidelity_space = CS.ConfigurationSpace(seed=seed) - fidelity_space.add_hyperparameters( - # gray-box setting (multi-multi-fidelity) - iterations + data subsample - NNSearchSpace3Benchmark._get_fidelity_choices(iter_choice='variable', subsample_choice='variable') + NNBenchmarkMF._get_fidelity_choices(iter_choice='variable', subsample_choice='fixed') ) return fidelity_space -__all__ = [NNSearchSpace0Benchmark, NNSearchSpace1Benchmark, - NNSearchSpace2Benchmark, NNSearchSpace3Benchmark] +__all__ = [NNBenchmark, NNBenchmarkBB, NNBenchmarkMF] diff --git a/hpobench/benchmarks/ml_mmfb/rf_benchmark.py b/hpobench/benchmarks/ml_mmfb/rf_benchmark.py index 838d956d..8b6a64d8 100644 --- a/hpobench/benchmarks/ml_mmfb/rf_benchmark.py +++ b/hpobench/benchmarks/ml_mmfb/rf_benchmark.py @@ -9,13 +9,13 @@ from hpobench.dependencies.ml.ml_benchmark_template import MLBenchmark -class RandomForestBaseBenchmark(MLBenchmark): +class RandomForestBenchmark(MLBenchmark): def __init__(self, task_id: int, rng: Union[np.random.RandomState, int, None] = None, valid_size: float = 0.33, data_path: Union[str, None] = None): - super(RandomForestBaseBenchmark, self).__init__(task_id, rng, valid_size, data_path) + super(RandomForestBenchmark, self).__init__(task_id, rng, valid_size, data_path) @staticmethod def get_configuration_space(seed: Union[int, None] = None) -> CS.ConfigurationSpace: @@ -41,18 +41,12 @@ def get_configuration_space(seed: Union[int, None] = None) -> CS.ConfigurationSp @staticmethod def get_fidelity_space(seed: Union[int, None] = None) -> CS.ConfigurationSpace: - """Fidelity space available --- specifies the fidelity dimensions - - If fidelity_choice is 0 - Fidelity space is the maximal fidelity, akin to a black-box function - If fidelity_choice is 1 - Fidelity space is a single fidelity, in this case the number of trees (n_estimators) - If fidelity_choice is 2 - Fidelity space is a single fidelity, in this case the fraction of dataset (subsample) - If fidelity_choice is >2 - Fidelity space is multi-multi fidelity, all possible fidelities - """ - raise NotImplementedError() + fidelity_space = CS.ConfigurationSpace(seed=seed) + fidelity_space.add_hyperparameters( + # gray-box setting (multi-multi-fidelity) - ntrees + data subsample + RandomForestBenchmark._get_fidelity_choices(n_estimators_choice='variable', subsample_choice='variable') + ) + return fidelity_space @staticmethod def _get_fidelity_choices(n_estimators_choice: str, subsample_choice: str) -> Tuple[Hyperparameter, Hyperparameter]: @@ -100,45 +94,24 @@ def init_model(self, config: Union[CS.Configuration, Dict], return model -class RandomForestSearchSpace0Benchmark(RandomForestBaseBenchmark): +class RandomForestBenchmarkBB(RandomForestBenchmark): def get_fidelity_space(self, seed: Union[int, None] = None) -> CS.ConfigurationSpace: 
fidelity_space = CS.ConfigurationSpace(seed=seed) fidelity_space.add_hyperparameters( # black-box setting (full fidelity) - RandomForestBaseBenchmark._get_fidelity_choices(n_estimators_choice='fixed', subsample_choice='fixed') + RandomForestBenchmark._get_fidelity_choices(n_estimators_choice='fixed', subsample_choice='fixed') ) return fidelity_space -class RandomForestSearchSpace1Benchmark(RandomForestBaseBenchmark): +class RandomForestBenchmarkMF(RandomForestBenchmark): def get_fidelity_space(self, seed: Union[int, None] = None) -> CS.ConfigurationSpace: fidelity_space = CS.ConfigurationSpace(seed=seed) fidelity_space.add_hyperparameters( # gray-box setting (multi-fidelity) - ntrees - RandomForestBaseBenchmark._get_fidelity_choices(n_estimators_choice='variable', subsample_choice='fixed') - ) - return fidelity_space - - -class RandomForestSearchSpace2Benchmark(RandomForestBaseBenchmark): - def get_fidelity_space(self, seed: Union[int, None] = None) -> CS.ConfigurationSpace: - fidelity_space = CS.ConfigurationSpace(seed=seed) - fidelity_space.add_hyperparameters( - # gray-box setting (multi-fidelity) - data subsample - RandomForestBaseBenchmark._get_fidelity_choices(n_estimators_choice='fixed', subsample_choice='variable') - ) - return fidelity_space - - -class RandomForestSearchSpace3Benchmark(RandomForestBaseBenchmark): - def get_fidelity_space(self, seed: Union[int, None] = None) -> CS.ConfigurationSpace: - fidelity_space = CS.ConfigurationSpace(seed=seed) - fidelity_space.add_hyperparameters( - # gray-box setting (multi-multi-fidelity) - ntrees + data subsample - RandomForestBaseBenchmark._get_fidelity_choices(n_estimators_choice='variable', subsample_choice='variable') + RandomForestBenchmark._get_fidelity_choices(n_estimators_choice='variable', subsample_choice='fixed') ) return fidelity_space -__all__ = [RandomForestSearchSpace0Benchmark, RandomForestSearchSpace1Benchmark, - RandomForestSearchSpace2Benchmark, RandomForestSearchSpace3Benchmark] +__all__ = [RandomForestBenchmark, RandomForestBenchmarkBB, RandomForestBenchmarkMF] diff --git a/hpobench/benchmarks/ml_mmfb/svm_benchmark.py b/hpobench/benchmarks/ml_mmfb/svm_benchmark.py index 0ae25e18..fa129cce 100644 --- a/hpobench/benchmarks/ml_mmfb/svm_benchmark.py +++ b/hpobench/benchmarks/ml_mmfb/svm_benchmark.py @@ -8,13 +8,13 @@ from hpobench.dependencies.ml.ml_benchmark_template import MLBenchmark -class SVMBaseBenchmark(MLBenchmark): +class SVMBenchmark(MLBenchmark): def __init__(self, task_id: int, rng: Union[np.random.RandomState, int, None] = None, valid_size: float = 0.33, data_path: Union[str, None] = None): - super(SVMBaseBenchmark, self).__init__(task_id, rng, valid_size, data_path) + super(SVMBenchmark, self).__init__(task_id, rng, valid_size, data_path) self.cache_size = 200 @@ -36,16 +36,11 @@ def get_configuration_space(seed: Union[int, None] = None) -> CS.ConfigurationSp @staticmethod def get_fidelity_space(seed: Union[int, None] = None) -> CS.ConfigurationSpace: - """Fidelity space available --- specifies the fidelity dimensions - - For SVM, only a single fidelity exists, i.e., subsample fraction. 
- if fidelity_choice == 0 - uses the entire data (subsample=1), reflecting the black-box setup - else - parameterize the fraction of data to subsample - - """ - raise NotImplementedError() + fidelity_space = CS.ConfigurationSpace(seed=seed) + fidelity_space.add_hyperparameter( + SVMBenchmark._get_fidelity_choices(subsample_choice='variable') + ) + return fidelity_space @staticmethod def _get_fidelity_choices(subsample_choice: str) -> Hyperparameter: @@ -77,21 +72,15 @@ def init_model(self, config: Union[CS.Configuration, Dict], return model -class SVMSearchSpace0Benchmark(SVMBaseBenchmark): +class SVMBenchmarkBB(SVMBenchmark): def get_fidelity_space(self, seed: Union[int, None] = None) -> CS.ConfigurationSpace: fidelity_space = CS.ConfigurationSpace(seed=seed) fidelity_space.add_hyperparameter( # uses the entire data (subsample=1), reflecting the black-box setup - SVMBaseBenchmark._get_fidelity_choices(subsample_choice='fixed') + SVMBenchmark._get_fidelity_choices(subsample_choice='fixed') ) return fidelity_space -class SVMSearchSpace1Benchmark(SVMBaseBenchmark): - def get_fidelity_space(self, seed: Union[int, None] = None) -> CS.ConfigurationSpace: - fidelity_space = CS.ConfigurationSpace(seed=seed) - fidelity_space.add_hyperparameter( - # parameterize the fraction of data to subsample - SVMBaseBenchmark._get_fidelity_choices(subsample_choice='fixed') - ) - return fidelity_space +# To keep the parity of the the overall design +SVMBenchmarkMF = SVMBenchmark diff --git a/hpobench/benchmarks/ml_mmfb/xgboost_benchmark.py b/hpobench/benchmarks/ml_mmfb/xgboost_benchmark.py index a5735b2c..b22827de 100644 --- a/hpobench/benchmarks/ml_mmfb/xgboost_benchmark.py +++ b/hpobench/benchmarks/ml_mmfb/xgboost_benchmark.py @@ -8,13 +8,13 @@ from hpobench.dependencies.ml.ml_benchmark_template import MLBenchmark -class XGBoostBaseBenchmark(MLBenchmark): +class XGBoostBenchmark(MLBenchmark): def __init__(self, task_id: int, rng: Union[np.random.RandomState, int, None] = None, valid_size: float = 0.33, data_path: Union[str, None] = None): - super(XGBoostBaseBenchmark, self).__init__(task_id, rng, valid_size, data_path) + super(XGBoostBenchmark, self).__init__(task_id, rng, valid_size, data_path) @staticmethod def get_configuration_space(seed: Union[int, None] = None) -> CS.ConfigurationSpace: @@ -40,18 +40,12 @@ def get_configuration_space(seed: Union[int, None] = None) -> CS.ConfigurationSp @staticmethod def get_fidelity_space(seed: Union[int, None] = None) -> CS.ConfigurationSpace: - """Fidelity space available --- specifies the fidelity dimensions - - If fidelity_choice is 0 - Fidelity space is the maximal fidelity, akin to a black-box function - If fidelity_choice is 1 - Fidelity space is a single fidelity, in this case the number of trees (n_estimators) - If fidelity_choice is 2 - Fidelity space is a single fidelity, in this case the fraction of dataset (subsample) - If fidelity_choice is >2 - Fidelity space is multi-multi fidelity, all possible fidelities - """ - raise NotImplementedError() + fidelity_space = CS.ConfigurationSpace(seed=seed) + fidelity_space.add_hyperparameters( + # gray-box setting (multi-multi-fidelity) - ntrees + data subsample + XGBoostBenchmark._get_fidelity_choices(n_estimators_choice='variable', subsample_choice='variable') + ) + return fidelity_space @staticmethod def _get_fidelity_choices(n_estimators_choice: str, subsample_choice: str) -> Tuple[Hyperparameter, Hyperparameter]: @@ -107,45 +101,24 @@ def init_model(self, return model -class 
XGBoostSearchSpace0Benchmark(XGBoostBaseBenchmark): +class XGBoostBenchmarkBB(XGBoostBenchmark): def get_fidelity_space(self, seed: Union[int, None] = None) -> CS.ConfigurationSpace: fidelity_space = CS.ConfigurationSpace(seed=seed) fidelity_space.add_hyperparameters( # black-box setting (full fidelity) - XGBoostBaseBenchmark._get_fidelity_choices(n_estimators_choice='fixed', subsample_choice='fixed') + XGBoostBenchmark._get_fidelity_choices(n_estimators_choice='fixed', subsample_choice='fixed') ) return fidelity_space -class XGBoostSearchSpace1Benchmark(XGBoostBaseBenchmark): +class XGBoostBenchmarkMF(XGBoostBenchmark): def get_fidelity_space(self, seed: Union[int, None] = None) -> CS.ConfigurationSpace: fidelity_space = CS.ConfigurationSpace(seed=seed) fidelity_space.add_hyperparameters( # gray-box setting (multi-fidelity) - ntrees - XGBoostBaseBenchmark._get_fidelity_choices(n_estimators_choice='variable', subsample_choice='fixed') - ) - return fidelity_space - - -class XGBoostSearchSpace2Benchmark(XGBoostBaseBenchmark): - def get_fidelity_space(self, seed: Union[int, None] = None) -> CS.ConfigurationSpace: - fidelity_space = CS.ConfigurationSpace(seed=seed) - fidelity_space.add_hyperparameters( - # gray-box setting (multi-fidelity) - data subsample - XGBoostBaseBenchmark._get_fidelity_choices(n_estimators_choice='fixed', subsample_choice='variable') - ) - return fidelity_space - - -class XGBoostSearchSpace3Benchmark(XGBoostBaseBenchmark): - def get_fidelity_space(self, seed: Union[int, None] = None) -> CS.ConfigurationSpace: - fidelity_space = CS.ConfigurationSpace(seed=seed) - fidelity_space.add_hyperparameters( - # gray-box setting (multi-multi-fidelity) - ntrees + data subsample - XGBoostBaseBenchmark._get_fidelity_choices(n_estimators_choice='variable', subsample_choice='variable') + XGBoostBenchmark._get_fidelity_choices(n_estimators_choice='variable', subsample_choice='fixed') ) return fidelity_space -__all__ = [XGBoostSearchSpace0Benchmark, XGBoostSearchSpace1Benchmark, - XGBoostSearchSpace2Benchmark, XGBoostSearchSpace3Benchmark] +__all__ = [XGBoostBenchmarkBB, XGBoostBenchmarkMF, XGBoostBenchmark] diff --git a/hpobench/container/benchmarks/ml_mmfb/histgb_benchmark.py b/hpobench/container/benchmarks/ml_mmfb/histgb_benchmark.py index 77ed4bbb..47886eb1 100644 --- a/hpobench/container/benchmarks/ml_mmfb/histgb_benchmark.py +++ b/hpobench/container/benchmarks/ml_mmfb/histgb_benchmark.py @@ -6,37 +6,28 @@ from hpobench.container.client_abstract_benchmark import AbstractBenchmarkClient -class HistGBSearchSpace0Benchmark(AbstractBenchmarkClient): +class HistGBBenchmark(AbstractBenchmarkClient): def __init__(self, **kwargs): - kwargs['benchmark_name'] = kwargs.get('benchmark_name', 'HistGBSearchSpace0Benchmark') + kwargs['benchmark_name'] = kwargs.get('benchmark_name', 'HistGBBenchmark') kwargs['container_name'] = kwargs.get('container_name', 'ml_mmfb') kwargs['latest'] = kwargs.get('container_tag', '0.0.1') - super(HistGBSearchSpace0Benchmark, self).__init__(**kwargs) + super(HistGBBenchmark, self).__init__(**kwargs) -class HistGBSearchSpace1Benchmark(AbstractBenchmarkClient): +class HistGBBenchmarkBB(AbstractBenchmarkClient): def __init__(self, **kwargs): - kwargs['benchmark_name'] = kwargs.get('benchmark_name', 'HistGBSearchSpace1Benchmark') + kwargs['benchmark_name'] = kwargs.get('benchmark_name', 'HistGBBenchmarkBB') kwargs['container_name'] = kwargs.get('container_name', 'ml_mmfb') kwargs['latest'] = kwargs.get('container_tag', '0.0.1') - super(HistGBSearchSpace1Benchmark, 
self).__init__(**kwargs) + super(HistGBBenchmarkBB, self).__init__(**kwargs) -class HistGBSearchSpace2Benchmark(AbstractBenchmarkClient): +class HistGBBenchmarkMF(AbstractBenchmarkClient): def __init__(self, **kwargs): - kwargs['benchmark_name'] = kwargs.get('benchmark_name', 'HistGBSearchSpace2Benchmark') + kwargs['benchmark_name'] = kwargs.get('benchmark_name', 'HistGBBenchmarkMF') kwargs['container_name'] = kwargs.get('container_name', 'ml_mmfb') kwargs['latest'] = kwargs.get('container_tag', '0.0.1') - super(HistGBSearchSpace2Benchmark, self).__init__(**kwargs) + super(HistGBBenchmarkMF, self).__init__(**kwargs) -class HistGBSearchSpace3Benchmark(AbstractBenchmarkClient): - def __init__(self, **kwargs): - kwargs['benchmark_name'] = kwargs.get('benchmark_name', 'HistGBSearchSpace3Benchmark') - kwargs['container_name'] = kwargs.get('container_name', 'ml_mmfb') - kwargs['latest'] = kwargs.get('container_tag', '0.0.1') - super(HistGBSearchSpace3Benchmark, self).__init__(**kwargs) - - -__all__ = [HistGBSearchSpace0Benchmark, HistGBSearchSpace1Benchmark, - HistGBSearchSpace2Benchmark, HistGBSearchSpace3Benchmark] +__all__ = [HistGBBenchmark, HistGBBenchmarkBB, HistGBBenchmarkMF] diff --git a/hpobench/container/benchmarks/ml_mmfb/lr_benchmark.py b/hpobench/container/benchmarks/ml_mmfb/lr_benchmark.py index fd1b4015..74092e71 100644 --- a/hpobench/container/benchmarks/ml_mmfb/lr_benchmark.py +++ b/hpobench/container/benchmarks/ml_mmfb/lr_benchmark.py @@ -6,37 +6,28 @@ from hpobench.container.client_abstract_benchmark import AbstractBenchmarkClient -class LRSearchSpace0Benchmark(AbstractBenchmarkClient): +class LRBenchmark(AbstractBenchmarkClient): def __init__(self, **kwargs): - kwargs['benchmark_name'] = kwargs.get('benchmark_name', 'LRSearchSpace0Benchmark') + kwargs['benchmark_name'] = kwargs.get('benchmark_name', 'LRBenchmark') kwargs['container_name'] = kwargs.get('container_name', 'ml_mmfb') kwargs['latest'] = kwargs.get('container_tag', '0.0.1') - super(LRSearchSpace0Benchmark, self).__init__(**kwargs) + super(LRBenchmark, self).__init__(**kwargs) -class LRSearchSpace1Benchmark(AbstractBenchmarkClient): +class LRBenchmarkBB(AbstractBenchmarkClient): def __init__(self, **kwargs): - kwargs['benchmark_name'] = kwargs.get('benchmark_name', 'LRSearchSpace1Benchmark') + kwargs['benchmark_name'] = kwargs.get('benchmark_name', 'LRBenchmarkBB') kwargs['container_name'] = kwargs.get('container_name', 'ml_mmfb') kwargs['latest'] = kwargs.get('container_tag', '0.0.1') - super(LRSearchSpace1Benchmark, self).__init__(**kwargs) + super(LRBenchmarkBB, self).__init__(**kwargs) -class LRSearchSpace2Benchmark(AbstractBenchmarkClient): +class LRBenchmarkMF(AbstractBenchmarkClient): def __init__(self, **kwargs): - kwargs['benchmark_name'] = kwargs.get('benchmark_name', 'LRSearchSpace2Benchmark') + kwargs['benchmark_name'] = kwargs.get('benchmark_name', 'LRBenchmarkMF') kwargs['container_name'] = kwargs.get('container_name', 'ml_mmfb') kwargs['latest'] = kwargs.get('container_tag', '0.0.1') - super(LRSearchSpace2Benchmark, self).__init__(**kwargs) + super(LRBenchmarkMF, self).__init__(**kwargs) -class LRSearchSpace3Benchmark(AbstractBenchmarkClient): - def __init__(self, **kwargs): - kwargs['benchmark_name'] = kwargs.get('benchmark_name', 'LRSearchSpace3Benchmark') - kwargs['container_name'] = kwargs.get('container_name', 'ml_mmfb') - kwargs['latest'] = kwargs.get('container_tag', '0.0.1') - super(LRSearchSpace3Benchmark, self).__init__(**kwargs) - - -__all__ = [LRSearchSpace0Benchmark, 
LRSearchSpace1Benchmark, - LRSearchSpace2Benchmark, LRSearchSpace3Benchmark] +__all__ = [LRBenchmark, LRBenchmarkBB, LRBenchmarkMF] diff --git a/hpobench/container/benchmarks/ml_mmfb/nn_benchmark.py b/hpobench/container/benchmarks/ml_mmfb/nn_benchmark.py index 818fb606..8a444c11 100644 --- a/hpobench/container/benchmarks/ml_mmfb/nn_benchmark.py +++ b/hpobench/container/benchmarks/ml_mmfb/nn_benchmark.py @@ -6,37 +6,28 @@ from hpobench.container.client_abstract_benchmark import AbstractBenchmarkClient -class NNSearchSpace0Benchmark(AbstractBenchmarkClient): +class NNBenchmark(AbstractBenchmarkClient): def __init__(self, **kwargs): - kwargs['benchmark_name'] = kwargs.get('benchmark_name', 'NNSearchSpace0Benchmark') + kwargs['benchmark_name'] = kwargs.get('benchmark_name', 'NNBenchmark') kwargs['container_name'] = kwargs.get('container_name', 'ml_mmfb') kwargs['latest'] = kwargs.get('container_tag', '0.0.1') - super(NNSearchSpace0Benchmark, self).__init__(**kwargs) + super(NNBenchmark, self).__init__(**kwargs) -class NNSearchSpace1Benchmark(AbstractBenchmarkClient): +class NNBenchmarkBB(AbstractBenchmarkClient): def __init__(self, **kwargs): - kwargs['benchmark_name'] = kwargs.get('benchmark_name', 'NNSearchSpace1Benchmark') + kwargs['benchmark_name'] = kwargs.get('benchmark_name', 'NNBenchmarkBB') kwargs['container_name'] = kwargs.get('container_name', 'ml_mmfb') kwargs['latest'] = kwargs.get('container_tag', '0.0.1') - super(NNSearchSpace1Benchmark, self).__init__(**kwargs) + super(NNBenchmarkBB, self).__init__(**kwargs) -class NNSearchSpace2Benchmark(AbstractBenchmarkClient): +class NNBenchmarkMF(AbstractBenchmarkClient): def __init__(self, **kwargs): - kwargs['benchmark_name'] = kwargs.get('benchmark_name', 'NNSearchSpace2Benchmark') + kwargs['benchmark_name'] = kwargs.get('benchmark_name', 'NNBenchmarkMF') kwargs['container_name'] = kwargs.get('container_name', 'ml_mmfb') kwargs['latest'] = kwargs.get('container_tag', '0.0.1') - super(NNSearchSpace2Benchmark, self).__init__(**kwargs) + super(NNBenchmarkMF, self).__init__(**kwargs) -class NNSearchSpace3Benchmark(AbstractBenchmarkClient): - def __init__(self, **kwargs): - kwargs['benchmark_name'] = kwargs.get('benchmark_name', 'NNSearchSpace3Benchmark') - kwargs['container_name'] = kwargs.get('container_name', 'ml_mmfb') - kwargs['latest'] = kwargs.get('container_tag', '0.0.1') - super(NNSearchSpace3Benchmark, self).__init__(**kwargs) - - -__all__ = [NNSearchSpace0Benchmark, NNSearchSpace1Benchmark, - NNSearchSpace2Benchmark, NNSearchSpace3Benchmark] +__all__ = [NNBenchmark, NNBenchmarkBB, NNBenchmarkMF] diff --git a/hpobench/container/benchmarks/ml_mmfb/rf_benchmark.py b/hpobench/container/benchmarks/ml_mmfb/rf_benchmark.py index 3c7ced83..4f59f6a0 100644 --- a/hpobench/container/benchmarks/ml_mmfb/rf_benchmark.py +++ b/hpobench/container/benchmarks/ml_mmfb/rf_benchmark.py @@ -6,37 +6,28 @@ from hpobench.container.client_abstract_benchmark import AbstractBenchmarkClient -class RandomForestSearchSpace0Benchmark(AbstractBenchmarkClient): +class RandomForestBenchmark(AbstractBenchmarkClient): def __init__(self, **kwargs): - kwargs['benchmark_name'] = kwargs.get('benchmark_name', 'RandomForestSearchSpace0Benchmark') + kwargs['benchmark_name'] = kwargs.get('benchmark_name', 'RandomForestBenchmark') kwargs['container_name'] = kwargs.get('container_name', 'ml_mmfb') kwargs['latest'] = kwargs.get('container_tag', '0.0.1') - super(RandomForestSearchSpace0Benchmark, self).__init__(**kwargs) + super(RandomForestBenchmark, self).__init__(**kwargs) 
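All of the container clients above share the same kwargs pattern, so a single usage sketch covers them. This is a rough sketch only: it assumes a built ml_mmfb container and the usual HPOBench client interface, and the task id is purely illustrative.

    from hpobench.container.benchmarks.ml_mmfb.rf_benchmark import RandomForestBenchmark

    # Resolves the containerized benchmark and forwards the remaining kwargs to it.
    benchmark = RandomForestBenchmark(task_id=10101)  # task id is illustrative
    config = benchmark.get_configuration_space(seed=1).sample_configuration()
    fidelity = benchmark.get_fidelity_space(seed=1).sample_configuration()
    result = benchmark.objective_function(configuration=config, fidelity=fidelity, rng=1)
    print(result['function_value'], result['cost'])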
-class RandomForestSearchSpace1Benchmark(AbstractBenchmarkClient): +class RandomForestBenchmarkBB(AbstractBenchmarkClient): def __init__(self, **kwargs): - kwargs['benchmark_name'] = kwargs.get('benchmark_name', 'RandomForestSearchSpace1Benchmark') + kwargs['benchmark_name'] = kwargs.get('benchmark_name', 'RandomForestBenchmarkBB') kwargs['container_name'] = kwargs.get('container_name', 'ml_mmfb') kwargs['latest'] = kwargs.get('container_tag', '0.0.1') - super(RandomForestSearchSpace1Benchmark, self).__init__(**kwargs) + super(RandomForestBenchmarkBB, self).__init__(**kwargs) -class RandomForestSearchSpace2Benchmark(AbstractBenchmarkClient): +class RandomForestBenchmarkMF(AbstractBenchmarkClient): def __init__(self, **kwargs): - kwargs['benchmark_name'] = kwargs.get('benchmark_name', 'RandomForestSearchSpace2Benchmark') + kwargs['benchmark_name'] = kwargs.get('benchmark_name', 'RandomForestBenchmarkMF') kwargs['container_name'] = kwargs.get('container_name', 'ml_mmfb') kwargs['latest'] = kwargs.get('container_tag', '0.0.1') - super(RandomForestSearchSpace2Benchmark, self).__init__(**kwargs) + super(RandomForestBenchmarkMF, self).__init__(**kwargs) -class RandomForestSearchSpace3Benchmark(AbstractBenchmarkClient): - def __init__(self, **kwargs): - kwargs['benchmark_name'] = kwargs.get('benchmark_name', 'RandomForestSearchSpace3Benchmark') - kwargs['container_name'] = kwargs.get('container_name', 'ml_mmfb') - kwargs['latest'] = kwargs.get('container_tag', '0.0.1') - super(RandomForestSearchSpace3Benchmark, self).__init__(**kwargs) - - -__all__ = [RandomForestSearchSpace0Benchmark, RandomForestSearchSpace1Benchmark, - RandomForestSearchSpace2Benchmark, RandomForestSearchSpace3Benchmark] +__all__ = [RandomForestBenchmark, RandomForestBenchmarkBB, RandomForestBenchmarkMF] diff --git a/hpobench/container/benchmarks/ml_mmfb/svm_benchmark.py b/hpobench/container/benchmarks/ml_mmfb/svm_benchmark.py index b2c46e75..328b26f3 100644 --- a/hpobench/container/benchmarks/ml_mmfb/svm_benchmark.py +++ b/hpobench/container/benchmarks/ml_mmfb/svm_benchmark.py @@ -6,20 +6,28 @@ from hpobench.container.client_abstract_benchmark import AbstractBenchmarkClient -class SVMSearchSpace0Benchmark(AbstractBenchmarkClient): +class SVMBenchmark(AbstractBenchmarkClient): def __init__(self, **kwargs): - kwargs['benchmark_name'] = kwargs.get('benchmark_name', 'SVMSearchSpace0Benchmark') + kwargs['benchmark_name'] = kwargs.get('benchmark_name', 'SVMBenchmark') kwargs['container_name'] = kwargs.get('container_name', 'ml_mmfb') kwargs['latest'] = kwargs.get('container_tag', '0.0.1') - super(SVMSearchSpace0Benchmark, self).__init__(**kwargs) + super(SVMBenchmark, self).__init__(**kwargs) -class SVMSearchSpace1Benchmark(AbstractBenchmarkClient): +class SVMBenchmarkMF(AbstractBenchmarkClient): def __init__(self, **kwargs): - kwargs['benchmark_name'] = kwargs.get('benchmark_name', 'SVMSearchSpace1Benchmark') + kwargs['benchmark_name'] = kwargs.get('benchmark_name', 'SVMBenchmarkMF') kwargs['container_name'] = kwargs.get('container_name', 'ml_mmfb') kwargs['latest'] = kwargs.get('container_tag', '0.0.1') - super(SVMSearchSpace1Benchmark, self).__init__(**kwargs) + super(SVMBenchmarkMF, self).__init__(**kwargs) -__all__ = [SVMSearchSpace0Benchmark, SVMSearchSpace1Benchmark] +class SVMBenchmarkBB(AbstractBenchmarkClient): + def __init__(self, **kwargs): + kwargs['benchmark_name'] = kwargs.get('benchmark_name', 'SVMBenchmarkBB') + kwargs['container_name'] = kwargs.get('container_name', 'ml_mmfb') + kwargs['latest'] = 
kwargs.get('container_tag', '0.0.1') + super(SVMBenchmarkBB, self).__init__(**kwargs) + + +__all__ = [SVMBenchmark, SVMBenchmarkMF, SVMBenchmarkBB] diff --git a/hpobench/container/benchmarks/ml_mmfb/xgboost_benchmark.py b/hpobench/container/benchmarks/ml_mmfb/xgboost_benchmark.py index 72438d37..1f09ead9 100644 --- a/hpobench/container/benchmarks/ml_mmfb/xgboost_benchmark.py +++ b/hpobench/container/benchmarks/ml_mmfb/xgboost_benchmark.py @@ -6,28 +6,28 @@ from hpobench.container.client_abstract_benchmark import AbstractBenchmarkClient -class XGBoostSearchSpace0Benchmark(AbstractBenchmarkClient): +class XGBoostBenchmark(AbstractBenchmarkClient): def __init__(self, **kwargs): - kwargs['benchmark_name'] = kwargs.get('benchmark_name', 'XGBoostSearchSpace0Benchmark') + kwargs['benchmark_name'] = kwargs.get('benchmark_name', 'XGBoostBenchmark') kwargs['container_name'] = kwargs.get('container_name', 'ml_mmfb') kwargs['latest'] = kwargs.get('container_tag', '0.0.1') - super(XGBoostSearchSpace0Benchmark, self).__init__(**kwargs) + super(XGBoostBenchmark, self).__init__(**kwargs) -class XGBoostSearchSpace1Benchmark(AbstractBenchmarkClient): +class XGBoostBenchmarkBB(AbstractBenchmarkClient): def __init__(self, **kwargs): - kwargs['benchmark_name'] = kwargs.get('benchmark_name', 'XGBoostSearchSpace1Benchmark') + kwargs['benchmark_name'] = kwargs.get('benchmark_name', 'XGBoostBenchmarkBB') kwargs['container_name'] = kwargs.get('container_name', 'ml_mmfb') kwargs['latest'] = kwargs.get('container_tag', '0.0.1') - super(XGBoostSearchSpace1Benchmark, self).__init__(**kwargs) + super(XGBoostBenchmarkBB, self).__init__(**kwargs) -class XGBoostSearchSpace2Benchmark(AbstractBenchmarkClient): +class XGBoostBenchmarkMF(AbstractBenchmarkClient): def __init__(self, **kwargs): - kwargs['benchmark_name'] = kwargs.get('benchmark_name', 'XGBoostSearchSpace2Benchmark') + kwargs['benchmark_name'] = kwargs.get('benchmark_name', 'XGBoostBenchmarkMF') kwargs['container_name'] = kwargs.get('container_name', 'ml_mmfb') kwargs['latest'] = kwargs.get('container_tag', '0.0.1') - super(XGBoostSearchSpace2Benchmark, self).__init__(**kwargs) + super(XGBoostBenchmarkMF, self).__init__(**kwargs) class XGBoostSearchSpace3Benchmark(AbstractBenchmarkClient): @@ -38,5 +38,4 @@ def __init__(self, **kwargs): super(XGBoostSearchSpace3Benchmark, self).__init__(**kwargs) -__all__ = [XGBoostSearchSpace0Benchmark, XGBoostSearchSpace1Benchmark, - XGBoostSearchSpace2Benchmark, XGBoostSearchSpace3Benchmark] +__all__ = [XGBoostBenchmark, XGBoostBenchmarkBB, XGBoostBenchmarkMF] \ No newline at end of file From b8b30a535448c87f70b2931f41950f7dd0662977 Mon Sep 17 00:00:00 2001 From: PhMueller Date: Tue, 17 Aug 2021 17:41:33 +0200 Subject: [PATCH 064/147] PR Requests: Move dependencies to correct directory --- hpobench/benchmarks/ml_mmfb/histgb_benchmark.py | 2 +- hpobench/benchmarks/ml_mmfb/lr_benchmark.py | 2 +- hpobench/benchmarks/ml_mmfb/nn_benchmark.py | 2 +- hpobench/benchmarks/ml_mmfb/rf_benchmark.py | 2 +- hpobench/benchmarks/ml_mmfb/svm_benchmark.py | 2 +- hpobench/benchmarks/ml_mmfb/tabular_benchmark.py | 2 +- hpobench/benchmarks/ml_mmfb/xgboost_benchmark.py | 4 ++-- hpobench/dependencies/{ml => ml_mmfb}/__init__.py | 0 hpobench/dependencies/{ml => ml_mmfb}/data_manager.py | 0 .../dependencies/{ml => ml_mmfb}/ml_benchmark_template.py | 2 +- 10 files changed, 9 insertions(+), 9 deletions(-) rename hpobench/dependencies/{ml => ml_mmfb}/__init__.py (100%) rename hpobench/dependencies/{ml => ml_mmfb}/data_manager.py (100%) rename 
hpobench/dependencies/{ml => ml_mmfb}/ml_benchmark_template.py (99%) diff --git a/hpobench/benchmarks/ml_mmfb/histgb_benchmark.py b/hpobench/benchmarks/ml_mmfb/histgb_benchmark.py index 5d164503..7a697129 100644 --- a/hpobench/benchmarks/ml_mmfb/histgb_benchmark.py +++ b/hpobench/benchmarks/ml_mmfb/histgb_benchmark.py @@ -7,7 +7,7 @@ from sklearn.experimental import enable_hist_gradient_boosting # noqa from sklearn.ensemble import HistGradientBoostingClassifier -from hpobench.dependencies.ml.ml_benchmark_template import MLBenchmark +from hpobench.dependencies.ml_mmfb.ml_benchmark_template import MLBenchmark class HistGBBenchmark(MLBenchmark): diff --git a/hpobench/benchmarks/ml_mmfb/lr_benchmark.py b/hpobench/benchmarks/ml_mmfb/lr_benchmark.py index 0154e623..a8ef771f 100644 --- a/hpobench/benchmarks/ml_mmfb/lr_benchmark.py +++ b/hpobench/benchmarks/ml_mmfb/lr_benchmark.py @@ -5,7 +5,7 @@ from ConfigSpace.hyperparameters import Hyperparameter from sklearn.linear_model import SGDClassifier -from hpobench.dependencies.ml.ml_benchmark_template import MLBenchmark +from hpobench.dependencies.ml_mmfb.ml_benchmark_template import MLBenchmark class LRBenchmark(MLBenchmark): diff --git a/hpobench/benchmarks/ml_mmfb/nn_benchmark.py b/hpobench/benchmarks/ml_mmfb/nn_benchmark.py index ca3afa7c..c8341e8a 100644 --- a/hpobench/benchmarks/ml_mmfb/nn_benchmark.py +++ b/hpobench/benchmarks/ml_mmfb/nn_benchmark.py @@ -6,7 +6,7 @@ from ConfigSpace.hyperparameters import Hyperparameter from sklearn.neural_network import MLPClassifier -from hpobench.dependencies.ml.ml_benchmark_template import MLBenchmark +from hpobench.dependencies.ml_mmfb.ml_benchmark_template import MLBenchmark class NNBenchmark(MLBenchmark): diff --git a/hpobench/benchmarks/ml_mmfb/rf_benchmark.py b/hpobench/benchmarks/ml_mmfb/rf_benchmark.py index 8b6a64d8..781c7ed4 100644 --- a/hpobench/benchmarks/ml_mmfb/rf_benchmark.py +++ b/hpobench/benchmarks/ml_mmfb/rf_benchmark.py @@ -6,7 +6,7 @@ from ConfigSpace.hyperparameters import Hyperparameter from sklearn.ensemble import RandomForestClassifier -from hpobench.dependencies.ml.ml_benchmark_template import MLBenchmark +from hpobench.dependencies.ml_mmfb.ml_benchmark_template import MLBenchmark class RandomForestBenchmark(MLBenchmark): diff --git a/hpobench/benchmarks/ml_mmfb/svm_benchmark.py b/hpobench/benchmarks/ml_mmfb/svm_benchmark.py index fa129cce..6dccf605 100644 --- a/hpobench/benchmarks/ml_mmfb/svm_benchmark.py +++ b/hpobench/benchmarks/ml_mmfb/svm_benchmark.py @@ -5,7 +5,7 @@ from ConfigSpace.hyperparameters import Hyperparameter from sklearn.svm import SVC -from hpobench.dependencies.ml.ml_benchmark_template import MLBenchmark +from hpobench.dependencies.ml_mmfb.ml_benchmark_template import MLBenchmark class SVMBenchmark(MLBenchmark): diff --git a/hpobench/benchmarks/ml_mmfb/tabular_benchmark.py b/hpobench/benchmarks/ml_mmfb/tabular_benchmark.py index 907b5e51..f572d0dc 100644 --- a/hpobench/benchmarks/ml_mmfb/tabular_benchmark.py +++ b/hpobench/benchmarks/ml_mmfb/tabular_benchmark.py @@ -6,7 +6,7 @@ from ConfigSpace.read_and_write import json as json_cs from hpobench.abstract_benchmark import AbstractBenchmark -from hpobench.dependencies.ml.ml_benchmark_template import metrics +from hpobench.dependencies.ml_mmfb.ml_benchmark_template import metrics from hpobench.util.data_manager import TabularDataManager diff --git a/hpobench/benchmarks/ml_mmfb/xgboost_benchmark.py b/hpobench/benchmarks/ml_mmfb/xgboost_benchmark.py index b22827de..57218598 100644 --- 
a/hpobench/benchmarks/ml_mmfb/xgboost_benchmark.py +++ b/hpobench/benchmarks/ml_mmfb/xgboost_benchmark.py @@ -5,7 +5,7 @@ import xgboost as xgb from ConfigSpace.hyperparameters import Hyperparameter -from hpobench.dependencies.ml.ml_benchmark_template import MLBenchmark +from hpobench.dependencies.ml_mmfb.ml_benchmark_template import MLBenchmark class XGBoostBenchmark(MLBenchmark): @@ -54,7 +54,7 @@ def _get_fidelity_choices(n_estimators_choice: str, subsample_choice: str) -> Tu assert subsample_choice in ['fixed', 'variable'] fidelity1 = dict( - fixed=CS.Constant('n_estimators', value=100), # TODO: Should this be 1000 or 100? + fixed=CS.Constant('n_estimators', value=1000), variable=CS.UniformIntegerHyperparameter( 'n_estimators', lower=50, upper=2000, default_value=1000, log=False ) diff --git a/hpobench/dependencies/ml/__init__.py b/hpobench/dependencies/ml_mmfb/__init__.py similarity index 100% rename from hpobench/dependencies/ml/__init__.py rename to hpobench/dependencies/ml_mmfb/__init__.py diff --git a/hpobench/dependencies/ml/data_manager.py b/hpobench/dependencies/ml_mmfb/data_manager.py similarity index 100% rename from hpobench/dependencies/ml/data_manager.py rename to hpobench/dependencies/ml_mmfb/data_manager.py diff --git a/hpobench/dependencies/ml/ml_benchmark_template.py b/hpobench/dependencies/ml_mmfb/ml_benchmark_template.py similarity index 99% rename from hpobench/dependencies/ml/ml_benchmark_template.py rename to hpobench/dependencies/ml_mmfb/ml_benchmark_template.py index 8460b113..b67078e7 100644 --- a/hpobench/dependencies/ml/ml_benchmark_template.py +++ b/hpobench/dependencies/ml_mmfb/ml_benchmark_template.py @@ -9,7 +9,7 @@ precision_score, f1_score from hpobench.abstract_benchmark import AbstractBenchmark -from hpobench.dependencies.ml.data_manager import OpenMLDataManager +from hpobench.dependencies.ml_mmfb.data_manager import OpenMLDataManager from hpobench.util.rng_helper import get_rng metrics = dict( From 875c594924fe6059500bec970076e61e1ddebefb Mon Sep 17 00:00:00 2001 From: PhMueller Date: Tue, 17 Aug 2021 17:42:16 +0200 Subject: [PATCH 065/147] PR Requests: Tabular Benchmarks - Remove unnecessary class definition --- .../benchmarks/ml_mmfb/tabular_benchmark.py | 68 ++++++------------- 1 file changed, 20 insertions(+), 48 deletions(-) diff --git a/hpobench/benchmarks/ml_mmfb/tabular_benchmark.py b/hpobench/benchmarks/ml_mmfb/tabular_benchmark.py index f572d0dc..2f346624 100644 --- a/hpobench/benchmarks/ml_mmfb/tabular_benchmark.py +++ b/hpobench/benchmarks/ml_mmfb/tabular_benchmark.py @@ -10,7 +10,7 @@ from hpobench.util.data_manager import TabularDataManager -class BaseTabularBenchmark(AbstractBenchmark): +class TabularBenchmark(AbstractBenchmark): def __init__(self, model: str, task_id: int, @@ -29,7 +29,10 @@ def __init__(self, self.config_spaces = self.metadata["config_spaces"] self.global_minimums = self.metadata["global_min"] - super(BaseTabularBenchmark, self).__init__(rng=rng, **kwargs) + self.original_cs = json_cs.read(self.config_spaces['x']) + self.original_fs = json_cs.read(self.config_spaces['z']) + + super(TabularBenchmark, self).__init__(rng=rng, **kwargs) # pylint: disable=arguments-differ @AbstractBenchmark.check_parameters @@ -59,19 +62,26 @@ def objective_function_test(self, # pylint: disable=arguments-differ def get_configuration_space(self, seed: Union[int, None] = None) -> CS.ConfigurationSpace: - raise NotImplementedError + cs = json_cs.read(self.config_spaces['x_discrete']) + cs = self._preprocess_configspace(cs) + cs.seed(seed) + 
return cs # pylint: disable=arguments-differ def get_fidelity_space(self, seed: Union[int, None] = None) -> CS.ConfigurationSpace: - raise NotImplementedError + cs = json_cs.read(self.config_spaces['z_discrete']) + cs.seed(seed=seed) + return cs # pylint: disable=arguments-differ def get_meta_information(self) -> Dict: """ Returns the meta information for the benchmark """ - return {'name': 'BaseTabularBenchmark', + return {'name': 'TabularBenchmark', 'references': [], 'task_id': self.task_id, - 'model': self.model + 'model': self.model, + 'original_configuration_space': self.original_cs, + 'original_fidelity_space': self.original_fs, } def _preprocess_configspace(self, config_space: CS.ConfigurationSpace) -> CS.ConfigurationSpace: @@ -93,7 +103,7 @@ def _total_number_of_configurations(self, space: str = "hyperparameters") -> int def _seeds_used(self) -> List: return self.table.seed.unique().tolist() - def sample_hyperparamer(self, n: int = 1) -> Union[CS.Configuration, List]: + def sample_hyperparameter(self, n: int = 1) -> Union[CS.Configuration, List]: return self.configuration_space.sample_configuration(n) def sample_fidelity(self, n: int = 1) -> Union[CS.Configuration, List]: @@ -163,10 +173,9 @@ def _objective( cost_key = f"{evaluation}_scores" key_path = dict() - # TODO: Dicts are unordered. This does not have to have an effect. - for name in np.sort(self.configuration_space.get_hyperparameter_names()): + for name in self.configuration_space.get_hyperparameter_names(): key_path[str(name)] = config[str(name)] - for name in np.sort(self.fidelity_space.get_hyperparameter_names()): + for name in self.fidelity_space.get_hyperparameter_names(): key_path[str(name)] = fidelity[str(name)] if seed is not None: @@ -190,41 +199,4 @@ def _objective( return result -class TabularBenchmark(BaseTabularBenchmark): - def __init__(self, model: str, task_id: int, data_dir: Union[Path, str, None] = None, - rng: Union[int, np.random.RandomState, None] = None, **kwargs): - super(TabularBenchmark, self).__init__(model, task_id, data_dir, rng, **kwargs) - - # pylint: disable=arguments-differ - def get_configuration_space(self, seed: Union[int, None] = None) -> CS.ConfigurationSpace: - cs = json_cs.read(self.config_spaces['x_discrete']) - cs = self._preprocess_configspace(cs) - cs.seed(seed) - return cs - - # pylint: disable=arguments-differ - def get_fidelity_space(self, seed: Union[int, None] = None) -> CS.ConfigurationSpace: - cs = json_cs.read(self.config_spaces['z_discrete']) - cs.seed(seed=seed) - return cs - - -class OriginalTabularBenchmark(BaseTabularBenchmark): - def __init__(self, model: str, task_id: int, data_dir: Union[Path, str, None] = None, - rng: Union[int, np.random.RandomState, None] = None, **kwargs): - super(OriginalTabularBenchmark, self).__init__(model, task_id, data_dir, rng, **kwargs) - - # pylint: disable=arguments-differ - def get_configuration_space(self, seed: Union[int, None] = None) -> CS.ConfigurationSpace: - cs = json_cs.read(self.config_spaces['x']) - cs.seed(seed) - return cs - - # pylint: disable=arguments-differ - def get_fidelity_space(self, seed: Union[int, None] = None) -> CS.ConfigurationSpace: - cs = json_cs.read(self.config_spaces['z']) - cs.seed(seed=seed) - return cs - - -__all__ = [TabularBenchmark, OriginalTabularBenchmark] +__all__ = [TabularBenchmark] From 8891e33a5312a1c07d3ae5a9f57c5f5a874798c6 Mon Sep 17 00:00:00 2001 From: PhMueller Date: Tue, 17 Aug 2021 17:42:34 +0200 Subject: [PATCH 066/147] PR Requests: Minor improvments --- 
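With the two subclasses folded away, the tabular benchmark above serves its discrete spaces directly and carries the original spaces in the meta information. A rough usage sketch, with an illustrative model string and task id, assuming the pre-computed result tables can be fetched through TabularDataManager:

    from hpobench.benchmarks.ml_mmfb.tabular_benchmark import TabularBenchmark

    benchmark = TabularBenchmark(model='rf', task_id=10101)  # both values illustrative
    config = benchmark.get_configuration_space(seed=1).sample_configuration()
    fidelity = benchmark.get_fidelity_space(seed=1).sample_configuration()
    result = benchmark.objective_function(configuration=config, fidelity=fidelity)
    print(result)
    print(benchmark.get_meta_information()['original_configuration_space'])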
hpobench/benchmarks/ml_mmfb/rf_benchmark.py | 2 +- hpobench/benchmarks/ml_mmfb/xgboost_benchmark.py | 3 +-- 2 files changed, 2 insertions(+), 3 deletions(-) diff --git a/hpobench/benchmarks/ml_mmfb/rf_benchmark.py b/hpobench/benchmarks/ml_mmfb/rf_benchmark.py index 781c7ed4..f0e70086 100644 --- a/hpobench/benchmarks/ml_mmfb/rf_benchmark.py +++ b/hpobench/benchmarks/ml_mmfb/rf_benchmark.py @@ -55,7 +55,7 @@ def _get_fidelity_choices(n_estimators_choice: str, subsample_choice: str) -> Tu assert subsample_choice in ['fixed', 'variable'] fidelity1 = dict( - fixed=CS.Constant('n_estimators', value=100), # TODO: is the default value here 100 or 512? + fixed=CS.Constant('n_estimators', value=512), variable=CS.UniformIntegerHyperparameter( 'n_estimators', lower=16, upper=512, default_value=512, log=False ) diff --git a/hpobench/benchmarks/ml_mmfb/xgboost_benchmark.py b/hpobench/benchmarks/ml_mmfb/xgboost_benchmark.py index 57218598..5a997241 100644 --- a/hpobench/benchmarks/ml_mmfb/xgboost_benchmark.py +++ b/hpobench/benchmarks/ml_mmfb/xgboost_benchmark.py @@ -81,8 +81,7 @@ def init_model(self, if isinstance(fidelity, CS.Configuration): fidelity = fidelity.get_dictionary() - # TODO: This seems to be wrong. (AND-condition) - rng = rng if (rng is None and isinstance(rng, int)) else self.seed + rng = rng if (rng is None or isinstance(rng, int)) else self.seed extra_args = dict( booster="gbtree", n_estimators=fidelity['n_estimators'], From 75f345dac76b0de77578797189e512d9283243c5 Mon Sep 17 00:00:00 2001 From: PhMueller Date: Tue, 17 Aug 2021 17:44:34 +0200 Subject: [PATCH 067/147] PR Requests: Update upper bounds of the fidelities --- hpobench/benchmarks/ml_mmfb/nn_benchmark.py | 2 +- hpobench/benchmarks/ml_mmfb/xgboost_benchmark.py | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/hpobench/benchmarks/ml_mmfb/nn_benchmark.py b/hpobench/benchmarks/ml_mmfb/nn_benchmark.py index c8341e8a..7da663bf 100644 --- a/hpobench/benchmarks/ml_mmfb/nn_benchmark.py +++ b/hpobench/benchmarks/ml_mmfb/nn_benchmark.py @@ -56,7 +56,7 @@ def get_fidelity_space(seed: Union[int, None] = None) -> CS.ConfigurationSpace: def _get_fidelity_choices(iter_choice: str, subsample_choice: str) -> Tuple[Hyperparameter, Hyperparameter]: fidelity1 = dict( - fixed=CS.Constant('iter', value=100), + fixed=CS.Constant('iter', value=243), variable=CS.UniformIntegerHyperparameter( 'iter', lower=3, upper=243, default_value=243, log=False ) diff --git a/hpobench/benchmarks/ml_mmfb/xgboost_benchmark.py b/hpobench/benchmarks/ml_mmfb/xgboost_benchmark.py index 5a997241..17e7c165 100644 --- a/hpobench/benchmarks/ml_mmfb/xgboost_benchmark.py +++ b/hpobench/benchmarks/ml_mmfb/xgboost_benchmark.py @@ -54,9 +54,9 @@ def _get_fidelity_choices(n_estimators_choice: str, subsample_choice: str) -> Tu assert subsample_choice in ['fixed', 'variable'] fidelity1 = dict( - fixed=CS.Constant('n_estimators', value=1000), + fixed=CS.Constant('n_estimators', value=2000), variable=CS.UniformIntegerHyperparameter( - 'n_estimators', lower=50, upper=2000, default_value=1000, log=False + 'n_estimators', lower=50, upper=2000, default_value=2000, log=False ) ) fidelity2 = dict( From 8c2ab6cf130d98aad629336cae3bb2b2640300e0 Mon Sep 17 00:00:00 2001 From: PhMueller Date: Tue, 17 Aug 2021 17:51:01 +0200 Subject: [PATCH 068/147] PR Requests: Remove OriginalTabBenchmarks --- .../container/benchmarks/ml_mmfb/tabular_benchmark.py | 10 +--------- 1 file changed, 1 insertion(+), 9 deletions(-) diff --git 
a/hpobench/container/benchmarks/ml_mmfb/tabular_benchmark.py b/hpobench/container/benchmarks/ml_mmfb/tabular_benchmark.py index f4a855d5..54b2763f 100644 --- a/hpobench/container/benchmarks/ml_mmfb/tabular_benchmark.py +++ b/hpobench/container/benchmarks/ml_mmfb/tabular_benchmark.py @@ -14,12 +14,4 @@ def __init__(self, **kwargs): super(TabularBenchmark, self).__init__(**kwargs) -class OriginalTabularBenchmark(AbstractBenchmarkClient): - def __init__(self, **kwargs): - kwargs['benchmark_name'] = kwargs.get('benchmark_name', 'OriginalTabularBenchmark') - kwargs['container_name'] = kwargs.get('container_name', 'ml_tabular_benchmarks') - kwargs['latest'] = kwargs.get('container_tag', '0.0.1') - super(OriginalTabularBenchmark, self).__init__(**kwargs) - - -__all__ = [TabularBenchmark, OriginalTabularBenchmark] +__all__ = [TabularBenchmark] From e24d53736a5b26af4258e8379f528bef2ba61338 Mon Sep 17 00:00:00 2001 From: PhMueller Date: Tue, 17 Aug 2021 18:06:02 +0200 Subject: [PATCH 069/147] PR Requests: Revert the query function --- .../benchmarks/ml_mmfb/tabular_benchmark.py | 36 +++++-------------- 1 file changed, 9 insertions(+), 27 deletions(-) diff --git a/hpobench/benchmarks/ml_mmfb/tabular_benchmark.py b/hpobench/benchmarks/ml_mmfb/tabular_benchmark.py index 2f346624..e2b16645 100644 --- a/hpobench/benchmarks/ml_mmfb/tabular_benchmark.py +++ b/hpobench/benchmarks/ml_mmfb/tabular_benchmark.py @@ -130,33 +130,15 @@ def get_fidelity_range(self) -> List: return fidelities def _search_dataframe(self, row_dict, df): - query_stmt = self._build_query(row_dict) - result = df.query(query_stmt) - # TODO: What happens in this case? The objective function raises a TypeError. - if len(result) == 0: - return None - return result.iloc[0].loc['result'] - - # TODO: This created an out-of-bounds error. The idx mask should have been 2d, but was 1d. - # # https://stackoverflow.com/a/46165056/8363967 - # mask = np.array([True] * df.shape[0]) - # for i, param in enumerate(df.drop("result", axis=1).columns): - # mask *= df[param].values == row_dict[param] - # idx = np.where(mask) - # if len(idx) != 1: - # return None - # idx = idx[0][0] - # result = df.iloc[idx]["result"] - # return result - - @staticmethod - def _build_query(row_dict: Dict) -> str: - query = '' - for i, (param_name, param_value) in enumerate(row_dict.items()): - if i != 0: - query += ' & ' - query += f'{param_name} == {param_value}' - return query + # https://stackoverflow.com/a/46165056/8363967 + mask = np.array([True] * df.shape[0]) + for i, param in enumerate(df.drop("result", axis=1).columns): + mask *= df[param].values == row_dict[param] + idx = np.where(mask) + assert len(idx) == 1, f'The query has resulted into mulitple matches. This should not happen.' 
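The restored lookup narrows a cumulative boolean mask over the whole table and then, in the added lines that follow, unpacks the single matching row. The same idea as a self-contained pandas sketch, using a toy frame and illustrative column names:

    import numpy as np
    import pandas as pd

    table = pd.DataFrame({
        'max_depth': [2, 4],
        'subsample': [0.5, 1.0],
        'result': [{'acc': 0.71}, {'acc': 0.83}],
    })
    query = {'max_depth': 4, 'subsample': 1.0}

    mask = np.array([True] * table.shape[0])
    for column in table.drop('result', axis=1).columns:
        mask *= table[column].values == query[column]  # keep only rows that also match this column
    (rows,) = np.where(mask)
    print(table.iloc[rows[0]]['result'])  # -> {'acc': 0.83}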
+ idx = idx[0][0] + result = df.iloc[idx]["result"] + return result def _objective( self, From 3c4f37582bccee2ba6ecd549f6c3ea201655ab73 Mon Sep 17 00:00:00 2001 From: PhMueller Date: Tue, 17 Aug 2021 18:09:01 +0200 Subject: [PATCH 070/147] PR Requests: Minor improvements --- hpobench/benchmarks/ml_mmfb/__init__.py | 17 +++++++++++++++++ hpobench/benchmarks/ml_mmfb/entry_point.py | 17 ----------------- hpobench/benchmarks/ml_mmfb/histgb_benchmark.py | 2 +- hpobench/benchmarks/ml_mmfb/lr_benchmark.py | 2 +- hpobench/benchmarks/ml_mmfb/nn_benchmark.py | 2 +- hpobench/benchmarks/ml_mmfb/rf_benchmark.py | 2 +- hpobench/benchmarks/ml_mmfb/svm_benchmark.py | 2 ++ .../benchmarks/ml_mmfb/tabular_benchmark.py | 2 +- .../benchmarks/ml_mmfb/xgboost_benchmark.py | 2 +- .../benchmarks/ml_mmfb/xgboost_benchmark.py | 2 +- 10 files changed, 26 insertions(+), 24 deletions(-) delete mode 100644 hpobench/benchmarks/ml_mmfb/entry_point.py diff --git a/hpobench/benchmarks/ml_mmfb/__init__.py b/hpobench/benchmarks/ml_mmfb/__init__.py index e69de29b..0d13c728 100644 --- a/hpobench/benchmarks/ml_mmfb/__init__.py +++ b/hpobench/benchmarks/ml_mmfb/__init__.py @@ -0,0 +1,17 @@ +from hpobench.benchmarks.ml_mmfb.histgb_benchmark import HistGBBenchmark, HistGBBenchmarkBB, HistGBBenchmarkMF +from hpobench.benchmarks.ml_mmfb.lr_benchmark import LRBenchmark, LRBenchmarkBB, LRBenchmarkMF +from hpobench.benchmarks.ml_mmfb.nn_benchmark import NNBenchmark, NNBenchmarkBB, NNBenchmarkMF +from hpobench.benchmarks.ml_mmfb.rf_benchmark import RandomForestBenchmark, RandomForestBenchmarkBB, \ + RandomForestBenchmarkMF +from hpobench.benchmarks.ml_mmfb.svm_benchmark import SVMBenchmark, SVMBenchmarkBB, SVMBenchmarkMF +from hpobench.benchmarks.ml_mmfb.tabular_benchmark import TabularBenchmark +from hpobench.benchmarks.ml_mmfb.xgboost_benchmark import XGBoostBenchmark, XGBoostBenchmarkBB, XGBoostBenchmarkMF + + +__all__ = ['HistGBBenchmark', 'HistGBBenchmarkBB', 'HistGBBenchmarkMF', + 'LRBenchmark', 'LRBenchmarkBB', 'LRBenchmarkMF', + 'NNBenchmark', 'NNBenchmarkBB', 'NNBenchmarkMF', + 'RandomForestBenchmark', 'RandomForestBenchmarkBB', 'RandomForestBenchmarkMF', + 'SVMBenchmark', 'SVMBenchmarkBB', 'SVMBenchmarkMF', + 'TabularBenchmark', + 'XGBoostBenchmark', 'XGBoostBenchmarkBB', 'XGBoostBenchmarkMF'] diff --git a/hpobench/benchmarks/ml_mmfb/entry_point.py b/hpobench/benchmarks/ml_mmfb/entry_point.py deleted file mode 100644 index 0114acaa..00000000 --- a/hpobench/benchmarks/ml_mmfb/entry_point.py +++ /dev/null @@ -1,17 +0,0 @@ -from hpobench.benchmarks.ml_mmfb.histgb_benchmark import HistGBBenchmark, HistGBBenchmarkBB, HistGBBenchmarkMF -from hpobench.benchmarks.ml_mmfb.lr_benchmark import LRBenchmark, LRBenchmarkBB, LRBenchmarkMF -from hpobench.benchmarks.ml_mmfb.nn_benchmark import NNBenchmark, NNBenchmarkBB, NNBenchmarkMF -from hpobench.benchmarks.ml_mmfb.rf_benchmark import RandomForestBenchmark, RandomForestBenchmarkBB, \ - RandomForestBenchmarkMF -from hpobench.benchmarks.ml_mmfb.svm_benchmark import SVMBenchmark, SVMBenchmarkBB, SVMBenchmarkMF -from hpobench.benchmarks.ml_mmfb.tabular_benchmark import TabularBenchmark, OriginalTabularBenchmark -from hpobench.benchmarks.ml_mmfb.xgboost_benchmark import XGBoostBenchmark, XGBoostBenchmarkBB, XGBoostBenchmarkMF - - -__all__ = [HistGBBenchmark, HistGBBenchmarkBB, HistGBBenchmarkMF, - LRBenchmark, LRBenchmarkBB, LRBenchmarkMF, - NNBenchmark, NNBenchmarkBB, NNBenchmarkMF, - RandomForestBenchmark, RandomForestBenchmarkBB, RandomForestBenchmarkMF, - SVMBenchmark, SVMBenchmarkBB, 
SVMBenchmarkMF, - TabularBenchmark, OriginalTabularBenchmark, - XGBoostBenchmark, XGBoostBenchmarkBB, XGBoostBenchmarkMF] diff --git a/hpobench/benchmarks/ml_mmfb/histgb_benchmark.py b/hpobench/benchmarks/ml_mmfb/histgb_benchmark.py index 7a697129..4947d022 100644 --- a/hpobench/benchmarks/ml_mmfb/histgb_benchmark.py +++ b/hpobench/benchmarks/ml_mmfb/histgb_benchmark.py @@ -112,4 +112,4 @@ def get_fidelity_space(self, seed: Union[int, None] = None) -> CS.ConfigurationS return fidelity_space -__all__ = [HistGBBenchmark, HistGBBenchmarkBB, HistGBBenchmarkMF] +__all__ = ['HistGBBenchmark', 'HistGBBenchmarkBB', 'HistGBBenchmarkMF'] diff --git a/hpobench/benchmarks/ml_mmfb/lr_benchmark.py b/hpobench/benchmarks/ml_mmfb/lr_benchmark.py index a8ef771f..32a21be9 100644 --- a/hpobench/benchmarks/ml_mmfb/lr_benchmark.py +++ b/hpobench/benchmarks/ml_mmfb/lr_benchmark.py @@ -117,4 +117,4 @@ def get_fidelity_space(self, seed: Union[int, None] = None) -> CS.ConfigurationS return fidelity_space -__all__ = [LRBenchmark, LRBenchmarkBB, LRBenchmarkMF] +__all__ = ['LRBenchmark', 'LRBenchmarkBB', 'LRBenchmarkMF'] diff --git a/hpobench/benchmarks/ml_mmfb/nn_benchmark.py b/hpobench/benchmarks/ml_mmfb/nn_benchmark.py index 7da663bf..8179731c 100644 --- a/hpobench/benchmarks/ml_mmfb/nn_benchmark.py +++ b/hpobench/benchmarks/ml_mmfb/nn_benchmark.py @@ -120,4 +120,4 @@ def get_fidelity_space(self, seed: Union[int, None] = None) -> CS.ConfigurationS return fidelity_space -__all__ = [NNBenchmark, NNBenchmarkBB, NNBenchmarkMF] +__all__ = ['NNBenchmark', 'NNBenchmarkBB', 'NNBenchmarkMF'] diff --git a/hpobench/benchmarks/ml_mmfb/rf_benchmark.py b/hpobench/benchmarks/ml_mmfb/rf_benchmark.py index f0e70086..788ee64f 100644 --- a/hpobench/benchmarks/ml_mmfb/rf_benchmark.py +++ b/hpobench/benchmarks/ml_mmfb/rf_benchmark.py @@ -114,4 +114,4 @@ def get_fidelity_space(self, seed: Union[int, None] = None) -> CS.ConfigurationS return fidelity_space -__all__ = [RandomForestBenchmark, RandomForestBenchmarkBB, RandomForestBenchmarkMF] +__all__ = ['RandomForestBenchmark', 'RandomForestBenchmarkBB', 'RandomForestBenchmarkMF'] diff --git a/hpobench/benchmarks/ml_mmfb/svm_benchmark.py b/hpobench/benchmarks/ml_mmfb/svm_benchmark.py index 6dccf605..b3bf7568 100644 --- a/hpobench/benchmarks/ml_mmfb/svm_benchmark.py +++ b/hpobench/benchmarks/ml_mmfb/svm_benchmark.py @@ -84,3 +84,5 @@ def get_fidelity_space(self, seed: Union[int, None] = None) -> CS.ConfigurationS # To keep the parity of the the overall design SVMBenchmarkMF = SVMBenchmark + +__all__ = ['SVMBenchmark', 'SVMBenchmarkMF', 'SVMBenchmarkBB'] \ No newline at end of file diff --git a/hpobench/benchmarks/ml_mmfb/tabular_benchmark.py b/hpobench/benchmarks/ml_mmfb/tabular_benchmark.py index e2b16645..bf43bf1b 100644 --- a/hpobench/benchmarks/ml_mmfb/tabular_benchmark.py +++ b/hpobench/benchmarks/ml_mmfb/tabular_benchmark.py @@ -181,4 +181,4 @@ def _objective( return result -__all__ = [TabularBenchmark] +__all__ = ['TabularBenchmark'] diff --git a/hpobench/benchmarks/ml_mmfb/xgboost_benchmark.py b/hpobench/benchmarks/ml_mmfb/xgboost_benchmark.py index 17e7c165..d975857e 100644 --- a/hpobench/benchmarks/ml_mmfb/xgboost_benchmark.py +++ b/hpobench/benchmarks/ml_mmfb/xgboost_benchmark.py @@ -120,4 +120,4 @@ def get_fidelity_space(self, seed: Union[int, None] = None) -> CS.ConfigurationS return fidelity_space -__all__ = [XGBoostBenchmarkBB, XGBoostBenchmarkMF, XGBoostBenchmark] +__all__ = ['XGBoostBenchmarkBB', 'XGBoostBenchmarkMF', 'XGBoostBenchmark'] diff --git 
a/hpobench/container/benchmarks/ml_mmfb/xgboost_benchmark.py b/hpobench/container/benchmarks/ml_mmfb/xgboost_benchmark.py index 1f09ead9..547ce945 100644 --- a/hpobench/container/benchmarks/ml_mmfb/xgboost_benchmark.py +++ b/hpobench/container/benchmarks/ml_mmfb/xgboost_benchmark.py @@ -38,4 +38,4 @@ def __init__(self, **kwargs): super(XGBoostSearchSpace3Benchmark, self).__init__(**kwargs) -__all__ = [XGBoostBenchmark, XGBoostBenchmarkBB, XGBoostBenchmarkMF] \ No newline at end of file +__all__ = [XGBoostBenchmark, XGBoostBenchmarkBB, XGBoostBenchmarkMF] From 6fc7f576a6b9f935f40c9486620eb9649e2fb7af Mon Sep 17 00:00:00 2001 From: PhMueller Date: Tue, 17 Aug 2021 18:10:22 +0200 Subject: [PATCH 071/147] Pycodestyle --- hpobench/benchmarks/ml_mmfb/svm_benchmark.py | 2 +- hpobench/benchmarks/ml_mmfb/tabular_benchmark.py | 3 ++- 2 files changed, 3 insertions(+), 2 deletions(-) diff --git a/hpobench/benchmarks/ml_mmfb/svm_benchmark.py b/hpobench/benchmarks/ml_mmfb/svm_benchmark.py index b3bf7568..b0bd7f65 100644 --- a/hpobench/benchmarks/ml_mmfb/svm_benchmark.py +++ b/hpobench/benchmarks/ml_mmfb/svm_benchmark.py @@ -85,4 +85,4 @@ def get_fidelity_space(self, seed: Union[int, None] = None) -> CS.ConfigurationS # To keep the parity of the the overall design SVMBenchmarkMF = SVMBenchmark -__all__ = ['SVMBenchmark', 'SVMBenchmarkMF', 'SVMBenchmarkBB'] \ No newline at end of file +__all__ = ['SVMBenchmark', 'SVMBenchmarkMF', 'SVMBenchmarkBB'] diff --git a/hpobench/benchmarks/ml_mmfb/tabular_benchmark.py b/hpobench/benchmarks/ml_mmfb/tabular_benchmark.py index bf43bf1b..9e6ec026 100644 --- a/hpobench/benchmarks/ml_mmfb/tabular_benchmark.py +++ b/hpobench/benchmarks/ml_mmfb/tabular_benchmark.py @@ -135,7 +135,8 @@ def _search_dataframe(self, row_dict, df): for i, param in enumerate(df.drop("result", axis=1).columns): mask *= df[param].values == row_dict[param] idx = np.where(mask) - assert len(idx) == 1, f'The query has resulted into mulitple matches. This should not happen.' + assert len(idx) == 1, 'The query has resulted into mulitple matches. This should not happen. 
' \ + f'The Query was {row_dict}' idx = idx[0][0] result = df.iloc[idx]["result"] return result From 0430c68c4dd2560af01444704bdc46bb214afd19 Mon Sep 17 00:00:00 2001 From: PhMueller Date: Tue, 17 Aug 2021 19:48:48 +0200 Subject: [PATCH 072/147] Add missing requirements --- extra_requirements/ml_mfbb.json | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/extra_requirements/ml_mfbb.json b/extra_requirements/ml_mfbb.json index 68b4a557..db67ecd0 100644 --- a/extra_requirements/ml_mfbb.json +++ b/extra_requirements/ml_mfbb.json @@ -1,4 +1,4 @@ { - "ml_tabular_benchmarks": ["pandas==1.2.4"], - "ml_mfbb": ["pandas==1.2.4","sklearn==0.24.2"] + "ml_tabular_benchmarks": ["pandas==1.2.4","openml==0.12.2"], + "ml_mfbb": ["pandas==1.2.4","scikit-learn==0.24.2","openml==0.12.2","xgboost==1.3.1"] } \ No newline at end of file From 3eb3a2d587778e5c06d030dd3d84eea4683baeea Mon Sep 17 00:00:00 2001 From: PhMueller Date: Tue, 17 Aug 2021 19:49:35 +0200 Subject: [PATCH 073/147] Minor Improvements - cast return values to float - improve the __all__ vars --- .../container/benchmarks/ml_mmfb/__init__.py | 19 +++++++++++++++++++ .../benchmarks/ml_mmfb/histgb_benchmark.py | 2 +- .../benchmarks/ml_mmfb/lr_benchmark.py | 2 +- .../benchmarks/ml_mmfb/nn_benchmark.py | 2 +- .../benchmarks/ml_mmfb/rf_benchmark.py | 2 +- .../benchmarks/ml_mmfb/svm_benchmark.py | 2 +- .../benchmarks/ml_mmfb/tabular_benchmark.py | 2 +- .../benchmarks/ml_mmfb/xgboost_benchmark.py | 2 +- hpobench/dependencies/ml_mmfb/data_manager.py | 2 -- .../ml_mmfb/ml_benchmark_template.py | 4 ++-- 10 files changed, 28 insertions(+), 11 deletions(-) diff --git a/hpobench/container/benchmarks/ml_mmfb/__init__.py b/hpobench/container/benchmarks/ml_mmfb/__init__.py index e69de29b..5f5cada5 100644 --- a/hpobench/container/benchmarks/ml_mmfb/__init__.py +++ b/hpobench/container/benchmarks/ml_mmfb/__init__.py @@ -0,0 +1,19 @@ +from hpobench.container.benchmarks.ml_mmfb.histgb_benchmark import HistGBBenchmarkMF, HistGBBenchmarkBB, HistGBBenchmark +from hpobench.container.benchmarks.ml_mmfb.histgb_benchmark import HistGBBenchmark, HistGBBenchmarkBB, HistGBBenchmarkMF +from hpobench.container.benchmarks.ml_mmfb.lr_benchmark import LRBenchmark, LRBenchmarkBB, LRBenchmarkMF +from hpobench.container.benchmarks.ml_mmfb.nn_benchmark import NNBenchmark, NNBenchmarkBB, NNBenchmarkMF +from hpobench.container.benchmarks.ml_mmfb.rf_benchmark import RandomForestBenchmark, RandomForestBenchmarkBB, \ + RandomForestBenchmarkMF +from hpobench.container.benchmarks.ml_mmfb.svm_benchmark import SVMBenchmark, SVMBenchmarkBB, SVMBenchmarkMF +from hpobench.container.benchmarks.ml_mmfb.tabular_benchmark import TabularBenchmark +from hpobench.container.benchmarks.ml_mmfb.xgboost_benchmark import XGBoostBenchmark, XGBoostBenchmarkBB, \ + XGBoostBenchmarkMF + + +__all__ = ['HistGBBenchmark', 'HistGBBenchmarkBB', 'HistGBBenchmarkMF', + 'LRBenchmark', 'LRBenchmarkBB', 'LRBenchmarkMF', + 'NNBenchmark', 'NNBenchmarkBB', 'NNBenchmarkMF', + 'RandomForestBenchmark', 'RandomForestBenchmarkBB', 'RandomForestBenchmarkMF', + 'SVMBenchmark', 'SVMBenchmarkBB', 'SVMBenchmarkMF', + 'TabularBenchmark', + 'XGBoostBenchmark', 'XGBoostBenchmarkBB', 'XGBoostBenchmarkMF'] diff --git a/hpobench/container/benchmarks/ml_mmfb/histgb_benchmark.py b/hpobench/container/benchmarks/ml_mmfb/histgb_benchmark.py index 47886eb1..dc7af088 100644 --- a/hpobench/container/benchmarks/ml_mmfb/histgb_benchmark.py +++ b/hpobench/container/benchmarks/ml_mmfb/histgb_benchmark.py @@ -30,4 +30,4 @@ def 
__init__(self, **kwargs): super(HistGBBenchmarkMF, self).__init__(**kwargs) -__all__ = [HistGBBenchmark, HistGBBenchmarkBB, HistGBBenchmarkMF] +__all__ = ['HistGBBenchmark', 'HistGBBenchmarkBB', 'HistGBBenchmarkMF'] diff --git a/hpobench/container/benchmarks/ml_mmfb/lr_benchmark.py b/hpobench/container/benchmarks/ml_mmfb/lr_benchmark.py index 74092e71..979cda3e 100644 --- a/hpobench/container/benchmarks/ml_mmfb/lr_benchmark.py +++ b/hpobench/container/benchmarks/ml_mmfb/lr_benchmark.py @@ -30,4 +30,4 @@ def __init__(self, **kwargs): super(LRBenchmarkMF, self).__init__(**kwargs) -__all__ = [LRBenchmark, LRBenchmarkBB, LRBenchmarkMF] +__all__ = ['LRBenchmark', 'LRBenchmarkBB', 'LRBenchmarkMF'] diff --git a/hpobench/container/benchmarks/ml_mmfb/nn_benchmark.py b/hpobench/container/benchmarks/ml_mmfb/nn_benchmark.py index 8a444c11..04955e82 100644 --- a/hpobench/container/benchmarks/ml_mmfb/nn_benchmark.py +++ b/hpobench/container/benchmarks/ml_mmfb/nn_benchmark.py @@ -30,4 +30,4 @@ def __init__(self, **kwargs): super(NNBenchmarkMF, self).__init__(**kwargs) -__all__ = [NNBenchmark, NNBenchmarkBB, NNBenchmarkMF] +__all__ = ['NNBenchmark', 'NNBenchmarkBB', 'NNBenchmarkMF'] diff --git a/hpobench/container/benchmarks/ml_mmfb/rf_benchmark.py b/hpobench/container/benchmarks/ml_mmfb/rf_benchmark.py index 4f59f6a0..a414349d 100644 --- a/hpobench/container/benchmarks/ml_mmfb/rf_benchmark.py +++ b/hpobench/container/benchmarks/ml_mmfb/rf_benchmark.py @@ -30,4 +30,4 @@ def __init__(self, **kwargs): super(RandomForestBenchmarkMF, self).__init__(**kwargs) -__all__ = [RandomForestBenchmark, RandomForestBenchmarkBB, RandomForestBenchmarkMF] +__all__ = ['RandomForestBenchmark', 'RandomForestBenchmarkBB', 'RandomForestBenchmarkMF'] diff --git a/hpobench/container/benchmarks/ml_mmfb/svm_benchmark.py b/hpobench/container/benchmarks/ml_mmfb/svm_benchmark.py index 328b26f3..7547a81a 100644 --- a/hpobench/container/benchmarks/ml_mmfb/svm_benchmark.py +++ b/hpobench/container/benchmarks/ml_mmfb/svm_benchmark.py @@ -30,4 +30,4 @@ def __init__(self, **kwargs): super(SVMBenchmarkBB, self).__init__(**kwargs) -__all__ = [SVMBenchmark, SVMBenchmarkMF, SVMBenchmarkBB] +__all__ = ['SVMBenchmark', 'SVMBenchmarkMF', 'SVMBenchmarkBB'] diff --git a/hpobench/container/benchmarks/ml_mmfb/tabular_benchmark.py b/hpobench/container/benchmarks/ml_mmfb/tabular_benchmark.py index 54b2763f..6d19953b 100644 --- a/hpobench/container/benchmarks/ml_mmfb/tabular_benchmark.py +++ b/hpobench/container/benchmarks/ml_mmfb/tabular_benchmark.py @@ -14,4 +14,4 @@ def __init__(self, **kwargs): super(TabularBenchmark, self).__init__(**kwargs) -__all__ = [TabularBenchmark] +__all__ = ['TabularBenchmark'] diff --git a/hpobench/container/benchmarks/ml_mmfb/xgboost_benchmark.py b/hpobench/container/benchmarks/ml_mmfb/xgboost_benchmark.py index 547ce945..c82ea606 100644 --- a/hpobench/container/benchmarks/ml_mmfb/xgboost_benchmark.py +++ b/hpobench/container/benchmarks/ml_mmfb/xgboost_benchmark.py @@ -38,4 +38,4 @@ def __init__(self, **kwargs): super(XGBoostSearchSpace3Benchmark, self).__init__(**kwargs) -__all__ = [XGBoostBenchmark, XGBoostBenchmarkBB, XGBoostBenchmarkMF] +__all__ = ['XGBoostBenchmark', 'XGBoostBenchmarkBB', 'XGBoostBenchmarkMF'] diff --git a/hpobench/dependencies/ml_mmfb/data_manager.py b/hpobench/dependencies/ml_mmfb/data_manager.py index 84d8b587..526c6756 100644 --- a/hpobench/dependencies/ml_mmfb/data_manager.py +++ b/hpobench/dependencies/ml_mmfb/data_manager.py @@ -14,8 +14,6 @@ from oslo_concurrency import lockutils from 
hpobench.util.data_manager import DataManager - - from hpobench import config_file diff --git a/hpobench/dependencies/ml_mmfb/ml_benchmark_template.py b/hpobench/dependencies/ml_mmfb/ml_benchmark_template.py index b67078e7..3af13965 100644 --- a/hpobench/dependencies/ml_mmfb/ml_benchmark_template.py +++ b/hpobench/dependencies/ml_mmfb/ml_benchmark_template.py @@ -286,7 +286,7 @@ def objective_function_test(self, } return { - 'function_value': info['test_loss'], - 'cost': model_fit_time + info['test_costs']['acc'], + 'function_value': float(info['test_loss']), + 'cost': float(model_fit_time + info['test_costs']['acc']), 'info': info } From fa691f7b72b80605f214fcd0834ffd0350761f80 Mon Sep 17 00:00:00 2001 From: PhMueller Date: Tue, 17 Aug 2021 19:52:31 +0200 Subject: [PATCH 074/147] ADD container recipes --- .../recipes/ml_mmfb/Singularity.ml_mmfb | 25 +++++++++++++++++++ .../ml_mmfb/Singularity.ml_tabular_benchmark | 25 +++++++++++++++++++ 2 files changed, 50 insertions(+) create mode 100644 hpobench/container/recipes/ml_mmfb/Singularity.ml_mmfb create mode 100644 hpobench/container/recipes/ml_mmfb/Singularity.ml_tabular_benchmark diff --git a/hpobench/container/recipes/ml_mmfb/Singularity.ml_mmfb b/hpobench/container/recipes/ml_mmfb/Singularity.ml_mmfb new file mode 100644 index 00000000..49f9a894 --- /dev/null +++ b/hpobench/container/recipes/ml_mmfb/Singularity.ml_mmfb @@ -0,0 +1,25 @@ +Bootstrap: docker +From: python:3.8-slim + +%labels +MAINTAINER muelleph@cs.uni-freiburg.de +VERSION v0.0.1 + +%post + apt update -y \ + && apt install build-essential git -y \ + + cd /home \ + && git clone https://github.com/automl/HPOBench.git \ + && cd HPOBench \ + && git checkout development \ + && pip install ".[ml_mfbb]" \ + && cd / \ + && mkdir /var/lib/hpobench/ \ + && chmod -R 777 /var/lib/hpobench/ \ + && pip cache purge \ + && rm -rf /var/lib/apt/lists/* + + +%runscript + python -s /home/HPOBench/hpobench/container/server_abstract_benchmark.py ml_mmfb $@ diff --git a/hpobench/container/recipes/ml_mmfb/Singularity.ml_tabular_benchmark b/hpobench/container/recipes/ml_mmfb/Singularity.ml_tabular_benchmark new file mode 100644 index 00000000..d128211a --- /dev/null +++ b/hpobench/container/recipes/ml_mmfb/Singularity.ml_tabular_benchmark @@ -0,0 +1,25 @@ +Bootstrap: docker +From: python:3.8-slim + +%labels +MAINTAINER muelleph@cs.uni-freiburg.de +VERSION v0.0.1 + +%post + apt update -y \ + && apt install build-essential git -y \ + + cd /home \ + && git clone https://github.com/automl/HPOBench.git \ + && cd HPOBench \ + && git checkout development \ + && pip install ".[ml_tabular_benchmarks]" \ + && cd / \ + && mkdir /var/lib/hpobench/ \ + && chmod -R 777 /var/lib/hpobench/ \ + && pip cache purge \ + && rm -rf /var/lib/apt/lists/* + + +%runscript + python -s /home/HPOBench/hpobench/container/server_abstract_benchmark.py ml_mmfb $@ From f64917e7c52f3ea22dcfbe4f2582da991dcb8edd Mon Sep 17 00:00:00 2001 From: PhMueller Date: Thu, 19 Aug 2021 14:44:29 +0200 Subject: [PATCH 075/147] PR: Fix path in tabular data loader --- .gitignore | 1 + hpobench/util/data_manager.py | 4 ++-- 2 files changed, 3 insertions(+), 2 deletions(-) diff --git a/.gitignore b/.gitignore index 37101fcb..5e77c268 100644 --- a/.gitignore +++ b/.gitignore @@ -136,3 +136,4 @@ experiments/ # Vagrant .vagrant Vagrantfile +/hpobench/container/recipes_local/ diff --git a/hpobench/util/data_manager.py b/hpobench/util/data_manager.py index d390218e..780bb06c 100644 --- a/hpobench/util/data_manager.py +++ b/hpobench/util/data_manager.py @@ 
-942,9 +942,9 @@ def __init__(self, model: str, task_id: [int, str], data_dir: [str, Path, None] self.url_to_use = url_dict.get(model) if data_dir is None: - data_dir = hpobench.config_file.data_dir + data_dir = hpobench.config_file.data_dir / "TabularData" - self._save_dir = Path(data_dir) / "TabularData" / self.model + self._save_dir = Path(data_dir) / self.model self.create_save_directory(self._save_dir) self.parquet_file = self._save_dir / self.task_id / f'{self.model}_{self.task_id}_data.parquet.gzip' From b95d2a5b703185ad2077eaef4040113828757829 Mon Sep 17 00:00:00 2001 From: PhMueller Date: Thu, 19 Aug 2021 14:44:48 +0200 Subject: [PATCH 076/147] PR: Remove casting configspace to np.floats --- hpobench/benchmarks/ml_mmfb/tabular_benchmark.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/hpobench/benchmarks/ml_mmfb/tabular_benchmark.py b/hpobench/benchmarks/ml_mmfb/tabular_benchmark.py index 9e6ec026..50af2457 100644 --- a/hpobench/benchmarks/ml_mmfb/tabular_benchmark.py +++ b/hpobench/benchmarks/ml_mmfb/tabular_benchmark.py @@ -63,7 +63,7 @@ def objective_function_test(self, # pylint: disable=arguments-differ def get_configuration_space(self, seed: Union[int, None] = None) -> CS.ConfigurationSpace: cs = json_cs.read(self.config_spaces['x_discrete']) - cs = self._preprocess_configspace(cs) + # cs = self._preprocess_configspace(cs) cs.seed(seed) return cs @@ -178,7 +178,7 @@ def _objective( info[seed] = res["info"] key_path.pop("seed") loss = np.mean(loss) - result = dict(function_value=loss, cost=costs, info=info) + result = dict(function_value=float(loss), cost=costs, info=info) return result From d7d7a2d49b73a1656dc93639ad972c296669d80e Mon Sep 17 00:00:00 2001 From: PhMueller Date: Thu, 19 Aug 2021 14:54:23 +0200 Subject: [PATCH 077/147] PR: Move everything back from ml_mmfb/ to ml/ --- hpobench/benchmarks/{ml_mmfb => ml}/README.md | 0 hpobench/benchmarks/ml/__init__.py | 21 ++++++++++ .../{ml_mmfb => ml}/histgb_benchmark.py | 2 +- .../{ml_mmfb => ml}/lr_benchmark.py | 2 +- .../{ml_mmfb => ml}/nn_benchmark.py | 2 +- .../{ml_mmfb => ml}/rf_benchmark.py | 2 +- .../{ml_mmfb => ml}/svm_benchmark.py | 2 +- .../{ml_mmfb => ml}/tabular_benchmark.py | 2 +- .../{ml_mmfb => ml}/xgboost_benchmark.py | 2 +- hpobench/benchmarks/ml_mmfb/__init__.py | 14 +++---- hpobench/container/benchmarks/ml/__init__.py | 17 ++++++++ .../{ml_mmfb => ml}/histgb_benchmark.py | 0 .../{ml_mmfb => ml}/lr_benchmark.py | 0 .../{ml_mmfb => ml}/nn_benchmark.py | 0 .../{ml_mmfb => ml}/rf_benchmark.py | 0 .../container/benchmarks/ml/svm_benchmark.py | 34 +++++++++++---- .../benchmarks/ml/svm_benchmark_old.py | 15 +++++++ .../{ml_mmfb => ml}/tabular_benchmark.py | 0 .../benchmarks/ml/xgboost_benchmark.py | 41 +++++++++++++------ .../benchmarks/ml/xgboost_benchmark_old.py | 24 +++++++++++ .../container/benchmarks/ml_mmfb/__init__.py | 19 --------- .../benchmarks/ml_mmfb/svm_benchmark.py | 33 --------------- .../benchmarks/ml_mmfb/xgboost_benchmark.py | 41 ------------------- .../{ml_mmfb => ml}/Singularity.ml_mmfb | 2 +- .../Singularity.ml_tabular_benchmark | 2 +- .../dependencies/{ml_mmfb => ml}/__init__.py | 0 .../{ml_mmfb => ml}/data_manager.py | 0 .../{ml_mmfb => ml}/ml_benchmark_template.py | 2 +- 28 files changed, 149 insertions(+), 130 deletions(-) rename hpobench/benchmarks/{ml_mmfb => ml}/README.md (100%) rename hpobench/benchmarks/{ml_mmfb => ml}/histgb_benchmark.py (98%) rename hpobench/benchmarks/{ml_mmfb => ml}/lr_benchmark.py (98%) rename hpobench/benchmarks/{ml_mmfb => 
ml}/nn_benchmark.py (98%) rename hpobench/benchmarks/{ml_mmfb => ml}/rf_benchmark.py (98%) rename hpobench/benchmarks/{ml_mmfb => ml}/svm_benchmark.py (97%) rename hpobench/benchmarks/{ml_mmfb => ml}/tabular_benchmark.py (99%) rename hpobench/benchmarks/{ml_mmfb => ml}/xgboost_benchmark.py (98%) rename hpobench/container/benchmarks/{ml_mmfb => ml}/histgb_benchmark.py (100%) rename hpobench/container/benchmarks/{ml_mmfb => ml}/lr_benchmark.py (100%) rename hpobench/container/benchmarks/{ml_mmfb => ml}/nn_benchmark.py (100%) rename hpobench/container/benchmarks/{ml_mmfb => ml}/rf_benchmark.py (100%) create mode 100644 hpobench/container/benchmarks/ml/svm_benchmark_old.py rename hpobench/container/benchmarks/{ml_mmfb => ml}/tabular_benchmark.py (100%) create mode 100644 hpobench/container/benchmarks/ml/xgboost_benchmark_old.py delete mode 100644 hpobench/container/benchmarks/ml_mmfb/__init__.py delete mode 100644 hpobench/container/benchmarks/ml_mmfb/svm_benchmark.py delete mode 100644 hpobench/container/benchmarks/ml_mmfb/xgboost_benchmark.py rename hpobench/container/recipes/{ml_mmfb => ml}/Singularity.ml_mmfb (95%) rename hpobench/container/recipes/{ml_mmfb => ml}/Singularity.ml_tabular_benchmark (96%) rename hpobench/dependencies/{ml_mmfb => ml}/__init__.py (100%) rename hpobench/dependencies/{ml_mmfb => ml}/data_manager.py (100%) rename hpobench/dependencies/{ml_mmfb => ml}/ml_benchmark_template.py (99%) diff --git a/hpobench/benchmarks/ml_mmfb/README.md b/hpobench/benchmarks/ml/README.md similarity index 100% rename from hpobench/benchmarks/ml_mmfb/README.md rename to hpobench/benchmarks/ml/README.md diff --git a/hpobench/benchmarks/ml/__init__.py b/hpobench/benchmarks/ml/__init__.py index e69de29b..966d2ed8 100644 --- a/hpobench/benchmarks/ml/__init__.py +++ b/hpobench/benchmarks/ml/__init__.py @@ -0,0 +1,21 @@ +from hpobench.benchmarks.ml.histgb_benchmark import HistGBBenchmark, HistGBBenchmarkBB, HistGBBenchmarkMF +from hpobench.benchmarks.ml.lr_benchmark import LRBenchmark, LRBenchmarkBB, LRBenchmarkMF +from hpobench.benchmarks.ml.nn_benchmark import NNBenchmark, NNBenchmarkBB, NNBenchmarkMF +from hpobench.benchmarks.ml.rf_benchmark import RandomForestBenchmark, RandomForestBenchmarkBB, \ + RandomForestBenchmarkMF +from hpobench.benchmarks.ml.svm_benchmark import SVMBenchmark, SVMBenchmarkBB, SVMBenchmarkMF +from hpobench.benchmarks.ml.tabular_benchmark import TabularBenchmark +from hpobench.benchmarks.ml.xgboost_benchmark import XGBoostBenchmark, XGBoostBenchmarkBB, XGBoostBenchmarkMF +from hpobench.benchmarks.ml.pybnn import BNNOnToyFunction, BNNOnProteinStructure, BNNOnProteinStructure, \ + BNNOnYearPrediction + + +__all__ = ['HistGBBenchmark', 'HistGBBenchmarkBB', 'HistGBBenchmarkMF', + 'LRBenchmark', 'LRBenchmarkBB', 'LRBenchmarkMF', + 'NNBenchmark', 'NNBenchmarkBB', 'NNBenchmarkMF', + 'RandomForestBenchmark', 'RandomForestBenchmarkBB', 'RandomForestBenchmarkMF', + 'SVMBenchmark', 'SVMBenchmarkBB', 'SVMBenchmarkMF', + 'TabularBenchmark', + 'XGBoostBenchmark', 'XGBoostBenchmarkBB', 'XGBoostBenchmarkMF', + 'BNNOnToyFunction', 'BNNOnProteinStructure', 'BNNOnProteinStructure', 'BNNOnYearPrediction' + ] diff --git a/hpobench/benchmarks/ml_mmfb/histgb_benchmark.py b/hpobench/benchmarks/ml/histgb_benchmark.py similarity index 98% rename from hpobench/benchmarks/ml_mmfb/histgb_benchmark.py rename to hpobench/benchmarks/ml/histgb_benchmark.py index 4947d022..f08882e8 100644 --- a/hpobench/benchmarks/ml_mmfb/histgb_benchmark.py +++ b/hpobench/benchmarks/ml/histgb_benchmark.py @@ -7,7 
+7,7 @@ from sklearn.experimental import enable_hist_gradient_boosting # noqa from sklearn.ensemble import HistGradientBoostingClassifier -from hpobench.dependencies.ml_mmfb.ml_benchmark_template import MLBenchmark +from hpobench.dependencies.ml.ml_benchmark_template import MLBenchmark class HistGBBenchmark(MLBenchmark): diff --git a/hpobench/benchmarks/ml_mmfb/lr_benchmark.py b/hpobench/benchmarks/ml/lr_benchmark.py similarity index 98% rename from hpobench/benchmarks/ml_mmfb/lr_benchmark.py rename to hpobench/benchmarks/ml/lr_benchmark.py index 32a21be9..e99170d0 100644 --- a/hpobench/benchmarks/ml_mmfb/lr_benchmark.py +++ b/hpobench/benchmarks/ml/lr_benchmark.py @@ -5,7 +5,7 @@ from ConfigSpace.hyperparameters import Hyperparameter from sklearn.linear_model import SGDClassifier -from hpobench.dependencies.ml_mmfb.ml_benchmark_template import MLBenchmark +from hpobench.dependencies.ml.ml_benchmark_template import MLBenchmark class LRBenchmark(MLBenchmark): diff --git a/hpobench/benchmarks/ml_mmfb/nn_benchmark.py b/hpobench/benchmarks/ml/nn_benchmark.py similarity index 98% rename from hpobench/benchmarks/ml_mmfb/nn_benchmark.py rename to hpobench/benchmarks/ml/nn_benchmark.py index 8179731c..5c3d54fb 100644 --- a/hpobench/benchmarks/ml_mmfb/nn_benchmark.py +++ b/hpobench/benchmarks/ml/nn_benchmark.py @@ -6,7 +6,7 @@ from ConfigSpace.hyperparameters import Hyperparameter from sklearn.neural_network import MLPClassifier -from hpobench.dependencies.ml_mmfb.ml_benchmark_template import MLBenchmark +from hpobench.dependencies.ml.ml_benchmark_template import MLBenchmark class NNBenchmark(MLBenchmark): diff --git a/hpobench/benchmarks/ml_mmfb/rf_benchmark.py b/hpobench/benchmarks/ml/rf_benchmark.py similarity index 98% rename from hpobench/benchmarks/ml_mmfb/rf_benchmark.py rename to hpobench/benchmarks/ml/rf_benchmark.py index 788ee64f..b6424a6e 100644 --- a/hpobench/benchmarks/ml_mmfb/rf_benchmark.py +++ b/hpobench/benchmarks/ml/rf_benchmark.py @@ -6,7 +6,7 @@ from ConfigSpace.hyperparameters import Hyperparameter from sklearn.ensemble import RandomForestClassifier -from hpobench.dependencies.ml_mmfb.ml_benchmark_template import MLBenchmark +from hpobench.dependencies.ml.ml_benchmark_template import MLBenchmark class RandomForestBenchmark(MLBenchmark): diff --git a/hpobench/benchmarks/ml_mmfb/svm_benchmark.py b/hpobench/benchmarks/ml/svm_benchmark.py similarity index 97% rename from hpobench/benchmarks/ml_mmfb/svm_benchmark.py rename to hpobench/benchmarks/ml/svm_benchmark.py index b0bd7f65..582a3342 100644 --- a/hpobench/benchmarks/ml_mmfb/svm_benchmark.py +++ b/hpobench/benchmarks/ml/svm_benchmark.py @@ -5,7 +5,7 @@ from ConfigSpace.hyperparameters import Hyperparameter from sklearn.svm import SVC -from hpobench.dependencies.ml_mmfb.ml_benchmark_template import MLBenchmark +from hpobench.dependencies.ml.ml_benchmark_template import MLBenchmark class SVMBenchmark(MLBenchmark): diff --git a/hpobench/benchmarks/ml_mmfb/tabular_benchmark.py b/hpobench/benchmarks/ml/tabular_benchmark.py similarity index 99% rename from hpobench/benchmarks/ml_mmfb/tabular_benchmark.py rename to hpobench/benchmarks/ml/tabular_benchmark.py index 50af2457..166f8841 100644 --- a/hpobench/benchmarks/ml_mmfb/tabular_benchmark.py +++ b/hpobench/benchmarks/ml/tabular_benchmark.py @@ -6,7 +6,7 @@ from ConfigSpace.read_and_write import json as json_cs from hpobench.abstract_benchmark import AbstractBenchmark -from hpobench.dependencies.ml_mmfb.ml_benchmark_template import metrics +from 
hpobench.dependencies.ml.ml_benchmark_template import metrics from hpobench.util.data_manager import TabularDataManager diff --git a/hpobench/benchmarks/ml_mmfb/xgboost_benchmark.py b/hpobench/benchmarks/ml/xgboost_benchmark.py similarity index 98% rename from hpobench/benchmarks/ml_mmfb/xgboost_benchmark.py rename to hpobench/benchmarks/ml/xgboost_benchmark.py index d975857e..4a2dd2f4 100644 --- a/hpobench/benchmarks/ml_mmfb/xgboost_benchmark.py +++ b/hpobench/benchmarks/ml/xgboost_benchmark.py @@ -5,7 +5,7 @@ import xgboost as xgb from ConfigSpace.hyperparameters import Hyperparameter -from hpobench.dependencies.ml_mmfb.ml_benchmark_template import MLBenchmark +from hpobench.dependencies.ml.ml_benchmark_template import MLBenchmark class XGBoostBenchmark(MLBenchmark): diff --git a/hpobench/benchmarks/ml_mmfb/__init__.py b/hpobench/benchmarks/ml_mmfb/__init__.py index 0d13c728..b826bd24 100644 --- a/hpobench/benchmarks/ml_mmfb/__init__.py +++ b/hpobench/benchmarks/ml_mmfb/__init__.py @@ -1,11 +1,11 @@ -from hpobench.benchmarks.ml_mmfb.histgb_benchmark import HistGBBenchmark, HistGBBenchmarkBB, HistGBBenchmarkMF -from hpobench.benchmarks.ml_mmfb.lr_benchmark import LRBenchmark, LRBenchmarkBB, LRBenchmarkMF -from hpobench.benchmarks.ml_mmfb.nn_benchmark import NNBenchmark, NNBenchmarkBB, NNBenchmarkMF -from hpobench.benchmarks.ml_mmfb.rf_benchmark import RandomForestBenchmark, RandomForestBenchmarkBB, \ +from hpobench.benchmarks.ml.histgb_benchmark import HistGBBenchmark, HistGBBenchmarkBB, HistGBBenchmarkMF +from hpobench.benchmarks.ml.lr_benchmark import LRBenchmark, LRBenchmarkBB, LRBenchmarkMF +from hpobench.benchmarks.ml.nn_benchmark import NNBenchmark, NNBenchmarkBB, NNBenchmarkMF +from hpobench.benchmarks.ml.rf_benchmark import RandomForestBenchmark, RandomForestBenchmarkBB, \ RandomForestBenchmarkMF -from hpobench.benchmarks.ml_mmfb.svm_benchmark import SVMBenchmark, SVMBenchmarkBB, SVMBenchmarkMF -from hpobench.benchmarks.ml_mmfb.tabular_benchmark import TabularBenchmark -from hpobench.benchmarks.ml_mmfb.xgboost_benchmark import XGBoostBenchmark, XGBoostBenchmarkBB, XGBoostBenchmarkMF +from hpobench.benchmarks.ml.svm_benchmark import SVMBenchmark, SVMBenchmarkBB, SVMBenchmarkMF +from hpobench.benchmarks.ml.tabular_benchmark import TabularBenchmark +from hpobench.benchmarks.ml.xgboost_benchmark import XGBoostBenchmark, XGBoostBenchmarkBB, XGBoostBenchmarkMF __all__ = ['HistGBBenchmark', 'HistGBBenchmarkBB', 'HistGBBenchmarkMF', diff --git a/hpobench/container/benchmarks/ml/__init__.py b/hpobench/container/benchmarks/ml/__init__.py index e69de29b..ed2ce40f 100644 --- a/hpobench/container/benchmarks/ml/__init__.py +++ b/hpobench/container/benchmarks/ml/__init__.py @@ -0,0 +1,17 @@ +from hpobench.container.benchmarks.ml.histgb_benchmark import HistGBBenchmark, HistGBBenchmarkBB, HistGBBenchmarkMF +from hpobench.container.benchmarks.ml.lr_benchmark import LRBenchmark, LRBenchmarkBB, LRBenchmarkMF +from hpobench.container.benchmarks.ml.nn_benchmark import NNBenchmark, NNBenchmarkBB, NNBenchmarkMF +from hpobench.container.benchmarks.ml.rf_benchmark import RandomForestBenchmark, RandomForestBenchmarkBB, \ + RandomForestBenchmarkMF +from hpobench.container.benchmarks.ml.svm_benchmark import SVMBenchmark, SVMBenchmarkBB, SVMBenchmarkMF +from hpobench.container.benchmarks.ml.tabular_benchmark import TabularBenchmark +from hpobench.container.benchmarks.ml.xgboost_benchmark import XGBoostBenchmark, XGBoostBenchmarkBB, XGBoostBenchmarkMF + + +__all__ = ['HistGBBenchmark', 'HistGBBenchmarkBB', 
'HistGBBenchmarkMF', + 'LRBenchmark', 'LRBenchmarkBB', 'LRBenchmarkMF', + 'NNBenchmark', 'NNBenchmarkBB', 'NNBenchmarkMF', + 'RandomForestBenchmark', 'RandomForestBenchmarkBB', 'RandomForestBenchmarkMF', + 'SVMBenchmark', 'SVMBenchmarkBB', 'SVMBenchmarkMF', + 'TabularBenchmark', + 'XGBoostBenchmark', 'XGBoostBenchmarkBB', 'XGBoostBenchmarkMF'] diff --git a/hpobench/container/benchmarks/ml_mmfb/histgb_benchmark.py b/hpobench/container/benchmarks/ml/histgb_benchmark.py similarity index 100% rename from hpobench/container/benchmarks/ml_mmfb/histgb_benchmark.py rename to hpobench/container/benchmarks/ml/histgb_benchmark.py diff --git a/hpobench/container/benchmarks/ml_mmfb/lr_benchmark.py b/hpobench/container/benchmarks/ml/lr_benchmark.py similarity index 100% rename from hpobench/container/benchmarks/ml_mmfb/lr_benchmark.py rename to hpobench/container/benchmarks/ml/lr_benchmark.py diff --git a/hpobench/container/benchmarks/ml_mmfb/nn_benchmark.py b/hpobench/container/benchmarks/ml/nn_benchmark.py similarity index 100% rename from hpobench/container/benchmarks/ml_mmfb/nn_benchmark.py rename to hpobench/container/benchmarks/ml/nn_benchmark.py diff --git a/hpobench/container/benchmarks/ml_mmfb/rf_benchmark.py b/hpobench/container/benchmarks/ml/rf_benchmark.py similarity index 100% rename from hpobench/container/benchmarks/ml_mmfb/rf_benchmark.py rename to hpobench/container/benchmarks/ml/rf_benchmark.py diff --git a/hpobench/container/benchmarks/ml/svm_benchmark.py b/hpobench/container/benchmarks/ml/svm_benchmark.py index 4955f057..7547a81a 100644 --- a/hpobench/container/benchmarks/ml/svm_benchmark.py +++ b/hpobench/container/benchmarks/ml/svm_benchmark.py @@ -1,15 +1,33 @@ #!/usr/bin/python3 # -*- coding: utf-8 -*- -""" Benchmark for the XGBoost Benchmark from hpobench/benchmarks/ml/xgboost_benchmark """ +""" Benchmark for the SVM Benchmarks from hpobench/benchmarks/ml_mmfb/svm_benchmark.py """ from hpobench.container.client_abstract_benchmark import AbstractBenchmarkClient -class SupportVectorMachine(AbstractBenchmarkClient): - def __init__(self, task_id: int, **kwargs): - kwargs['task_id'] = task_id - kwargs['benchmark_name'] = kwargs.get('benchmark_name', 'SupportVectorMachine') - kwargs['container_name'] = kwargs.get('container_name', 'svm_benchmark') - kwargs['latest'] = kwargs.get('container_tag', '0.0.3') - super(SupportVectorMachine, self).__init__(**kwargs) +class SVMBenchmark(AbstractBenchmarkClient): + def __init__(self, **kwargs): + kwargs['benchmark_name'] = kwargs.get('benchmark_name', 'SVMBenchmark') + kwargs['container_name'] = kwargs.get('container_name', 'ml_mmfb') + kwargs['latest'] = kwargs.get('container_tag', '0.0.1') + super(SVMBenchmark, self).__init__(**kwargs) + + +class SVMBenchmarkMF(AbstractBenchmarkClient): + def __init__(self, **kwargs): + kwargs['benchmark_name'] = kwargs.get('benchmark_name', 'SVMBenchmarkMF') + kwargs['container_name'] = kwargs.get('container_name', 'ml_mmfb') + kwargs['latest'] = kwargs.get('container_tag', '0.0.1') + super(SVMBenchmarkMF, self).__init__(**kwargs) + + +class SVMBenchmarkBB(AbstractBenchmarkClient): + def __init__(self, **kwargs): + kwargs['benchmark_name'] = kwargs.get('benchmark_name', 'SVMBenchmarkBB') + kwargs['container_name'] = kwargs.get('container_name', 'ml_mmfb') + kwargs['latest'] = kwargs.get('container_tag', '0.0.1') + super(SVMBenchmarkBB, self).__init__(**kwargs) + + +__all__ = ['SVMBenchmark', 'SVMBenchmarkMF', 'SVMBenchmarkBB'] diff --git a/hpobench/container/benchmarks/ml/svm_benchmark_old.py 
b/hpobench/container/benchmarks/ml/svm_benchmark_old.py new file mode 100644 index 00000000..4955f057 --- /dev/null +++ b/hpobench/container/benchmarks/ml/svm_benchmark_old.py @@ -0,0 +1,15 @@ +#!/usr/bin/python3 +# -*- coding: utf-8 -*- + +""" Benchmark for the XGBoost Benchmark from hpobench/benchmarks/ml/xgboost_benchmark """ + +from hpobench.container.client_abstract_benchmark import AbstractBenchmarkClient + + +class SupportVectorMachine(AbstractBenchmarkClient): + def __init__(self, task_id: int, **kwargs): + kwargs['task_id'] = task_id + kwargs['benchmark_name'] = kwargs.get('benchmark_name', 'SupportVectorMachine') + kwargs['container_name'] = kwargs.get('container_name', 'svm_benchmark') + kwargs['latest'] = kwargs.get('container_tag', '0.0.3') + super(SupportVectorMachine, self).__init__(**kwargs) diff --git a/hpobench/container/benchmarks/ml_mmfb/tabular_benchmark.py b/hpobench/container/benchmarks/ml/tabular_benchmark.py similarity index 100% rename from hpobench/container/benchmarks/ml_mmfb/tabular_benchmark.py rename to hpobench/container/benchmarks/ml/tabular_benchmark.py diff --git a/hpobench/container/benchmarks/ml/xgboost_benchmark.py b/hpobench/container/benchmarks/ml/xgboost_benchmark.py index df475748..c82ea606 100644 --- a/hpobench/container/benchmarks/ml/xgboost_benchmark.py +++ b/hpobench/container/benchmarks/ml/xgboost_benchmark.py @@ -1,24 +1,41 @@ #!/usr/bin/python3 # -*- coding: utf-8 -*- -""" Benchmark for the XGBoost Benchmark from hpobench/benchmarks/ml/xgboost_benchmark """ +""" Benchmark for the XGB Benchmarks from hpobench/benchmarks/ml_mmfb/xgboost_benchmark.py """ from hpobench.container.client_abstract_benchmark import AbstractBenchmarkClient class XGBoostBenchmark(AbstractBenchmarkClient): - def __init__(self, task_id: int, **kwargs): - kwargs['task_id'] = task_id + def __init__(self, **kwargs): kwargs['benchmark_name'] = kwargs.get('benchmark_name', 'XGBoostBenchmark') - kwargs['container_name'] = kwargs.get('container_name', 'xgboost_benchmark') - kwargs['latest'] = kwargs.get('container_tag', '0.0.3') + kwargs['container_name'] = kwargs.get('container_name', 'ml_mmfb') + kwargs['latest'] = kwargs.get('container_tag', '0.0.1') super(XGBoostBenchmark, self).__init__(**kwargs) -class XGBoostExtendedBenchmark(AbstractBenchmarkClient): - def __init__(self, task_id: int, **kwargs): - kwargs['task_id'] = task_id - kwargs['benchmark_name'] = kwargs.get('benchmark_name', 'XGBoostExtendedBenchmark') - kwargs['container_name'] = kwargs.get('container_name', 'xgboost_benchmark') - kwargs['latest'] = kwargs.get('container_tag', '0.0.3') - super(XGBoostExtendedBenchmark, self).__init__(**kwargs) +class XGBoostBenchmarkBB(AbstractBenchmarkClient): + def __init__(self, **kwargs): + kwargs['benchmark_name'] = kwargs.get('benchmark_name', 'XGBoostBenchmarkBB') + kwargs['container_name'] = kwargs.get('container_name', 'ml_mmfb') + kwargs['latest'] = kwargs.get('container_tag', '0.0.1') + super(XGBoostBenchmarkBB, self).__init__(**kwargs) + + +class XGBoostBenchmarkMF(AbstractBenchmarkClient): + def __init__(self, **kwargs): + kwargs['benchmark_name'] = kwargs.get('benchmark_name', 'XGBoostBenchmarkMF') + kwargs['container_name'] = kwargs.get('container_name', 'ml_mmfb') + kwargs['latest'] = kwargs.get('container_tag', '0.0.1') + super(XGBoostBenchmarkMF, self).__init__(**kwargs) + + +class XGBoostSearchSpace3Benchmark(AbstractBenchmarkClient): + def __init__(self, **kwargs): + kwargs['benchmark_name'] = kwargs.get('benchmark_name', 'XGBoostSearchSpace3Benchmark') + 
kwargs['container_name'] = kwargs.get('container_name', 'ml_mmfb') + kwargs['latest'] = kwargs.get('container_tag', '0.0.1') + super(XGBoostSearchSpace3Benchmark, self).__init__(**kwargs) + + +__all__ = ['XGBoostBenchmark', 'XGBoostBenchmarkBB', 'XGBoostBenchmarkMF'] diff --git a/hpobench/container/benchmarks/ml/xgboost_benchmark_old.py b/hpobench/container/benchmarks/ml/xgboost_benchmark_old.py new file mode 100644 index 00000000..df475748 --- /dev/null +++ b/hpobench/container/benchmarks/ml/xgboost_benchmark_old.py @@ -0,0 +1,24 @@ +#!/usr/bin/python3 +# -*- coding: utf-8 -*- + +""" Benchmark for the XGBoost Benchmark from hpobench/benchmarks/ml/xgboost_benchmark """ + +from hpobench.container.client_abstract_benchmark import AbstractBenchmarkClient + + +class XGBoostBenchmark(AbstractBenchmarkClient): + def __init__(self, task_id: int, **kwargs): + kwargs['task_id'] = task_id + kwargs['benchmark_name'] = kwargs.get('benchmark_name', 'XGBoostBenchmark') + kwargs['container_name'] = kwargs.get('container_name', 'xgboost_benchmark') + kwargs['latest'] = kwargs.get('container_tag', '0.0.3') + super(XGBoostBenchmark, self).__init__(**kwargs) + + +class XGBoostExtendedBenchmark(AbstractBenchmarkClient): + def __init__(self, task_id: int, **kwargs): + kwargs['task_id'] = task_id + kwargs['benchmark_name'] = kwargs.get('benchmark_name', 'XGBoostExtendedBenchmark') + kwargs['container_name'] = kwargs.get('container_name', 'xgboost_benchmark') + kwargs['latest'] = kwargs.get('container_tag', '0.0.3') + super(XGBoostExtendedBenchmark, self).__init__(**kwargs) diff --git a/hpobench/container/benchmarks/ml_mmfb/__init__.py b/hpobench/container/benchmarks/ml_mmfb/__init__.py deleted file mode 100644 index 5f5cada5..00000000 --- a/hpobench/container/benchmarks/ml_mmfb/__init__.py +++ /dev/null @@ -1,19 +0,0 @@ -from hpobench.container.benchmarks.ml_mmfb.histgb_benchmark import HistGBBenchmarkMF, HistGBBenchmarkBB, HistGBBenchmark -from hpobench.container.benchmarks.ml_mmfb.histgb_benchmark import HistGBBenchmark, HistGBBenchmarkBB, HistGBBenchmarkMF -from hpobench.container.benchmarks.ml_mmfb.lr_benchmark import LRBenchmark, LRBenchmarkBB, LRBenchmarkMF -from hpobench.container.benchmarks.ml_mmfb.nn_benchmark import NNBenchmark, NNBenchmarkBB, NNBenchmarkMF -from hpobench.container.benchmarks.ml_mmfb.rf_benchmark import RandomForestBenchmark, RandomForestBenchmarkBB, \ - RandomForestBenchmarkMF -from hpobench.container.benchmarks.ml_mmfb.svm_benchmark import SVMBenchmark, SVMBenchmarkBB, SVMBenchmarkMF -from hpobench.container.benchmarks.ml_mmfb.tabular_benchmark import TabularBenchmark -from hpobench.container.benchmarks.ml_mmfb.xgboost_benchmark import XGBoostBenchmark, XGBoostBenchmarkBB, \ - XGBoostBenchmarkMF - - -__all__ = ['HistGBBenchmark', 'HistGBBenchmarkBB', 'HistGBBenchmarkMF', - 'LRBenchmark', 'LRBenchmarkBB', 'LRBenchmarkMF', - 'NNBenchmark', 'NNBenchmarkBB', 'NNBenchmarkMF', - 'RandomForestBenchmark', 'RandomForestBenchmarkBB', 'RandomForestBenchmarkMF', - 'SVMBenchmark', 'SVMBenchmarkBB', 'SVMBenchmarkMF', - 'TabularBenchmark', - 'XGBoostBenchmark', 'XGBoostBenchmarkBB', 'XGBoostBenchmarkMF'] diff --git a/hpobench/container/benchmarks/ml_mmfb/svm_benchmark.py b/hpobench/container/benchmarks/ml_mmfb/svm_benchmark.py deleted file mode 100644 index 7547a81a..00000000 --- a/hpobench/container/benchmarks/ml_mmfb/svm_benchmark.py +++ /dev/null @@ -1,33 +0,0 @@ -#!/usr/bin/python3 -# -*- coding: utf-8 -*- - -""" Benchmark for the SVM Benchmarks from 
hpobench/benchmarks/ml_mmfb/svm_benchmark.py """ - -from hpobench.container.client_abstract_benchmark import AbstractBenchmarkClient - - -class SVMBenchmark(AbstractBenchmarkClient): - def __init__(self, **kwargs): - kwargs['benchmark_name'] = kwargs.get('benchmark_name', 'SVMBenchmark') - kwargs['container_name'] = kwargs.get('container_name', 'ml_mmfb') - kwargs['latest'] = kwargs.get('container_tag', '0.0.1') - super(SVMBenchmark, self).__init__(**kwargs) - - -class SVMBenchmarkMF(AbstractBenchmarkClient): - def __init__(self, **kwargs): - kwargs['benchmark_name'] = kwargs.get('benchmark_name', 'SVMBenchmarkMF') - kwargs['container_name'] = kwargs.get('container_name', 'ml_mmfb') - kwargs['latest'] = kwargs.get('container_tag', '0.0.1') - super(SVMBenchmarkMF, self).__init__(**kwargs) - - -class SVMBenchmarkBB(AbstractBenchmarkClient): - def __init__(self, **kwargs): - kwargs['benchmark_name'] = kwargs.get('benchmark_name', 'SVMBenchmarkBB') - kwargs['container_name'] = kwargs.get('container_name', 'ml_mmfb') - kwargs['latest'] = kwargs.get('container_tag', '0.0.1') - super(SVMBenchmarkBB, self).__init__(**kwargs) - - -__all__ = ['SVMBenchmark', 'SVMBenchmarkMF', 'SVMBenchmarkBB'] diff --git a/hpobench/container/benchmarks/ml_mmfb/xgboost_benchmark.py b/hpobench/container/benchmarks/ml_mmfb/xgboost_benchmark.py deleted file mode 100644 index c82ea606..00000000 --- a/hpobench/container/benchmarks/ml_mmfb/xgboost_benchmark.py +++ /dev/null @@ -1,41 +0,0 @@ -#!/usr/bin/python3 -# -*- coding: utf-8 -*- - -""" Benchmark for the XGB Benchmarks from hpobench/benchmarks/ml_mmfb/xgboost_benchmark.py """ - -from hpobench.container.client_abstract_benchmark import AbstractBenchmarkClient - - -class XGBoostBenchmark(AbstractBenchmarkClient): - def __init__(self, **kwargs): - kwargs['benchmark_name'] = kwargs.get('benchmark_name', 'XGBoostBenchmark') - kwargs['container_name'] = kwargs.get('container_name', 'ml_mmfb') - kwargs['latest'] = kwargs.get('container_tag', '0.0.1') - super(XGBoostBenchmark, self).__init__(**kwargs) - - -class XGBoostBenchmarkBB(AbstractBenchmarkClient): - def __init__(self, **kwargs): - kwargs['benchmark_name'] = kwargs.get('benchmark_name', 'XGBoostBenchmarkBB') - kwargs['container_name'] = kwargs.get('container_name', 'ml_mmfb') - kwargs['latest'] = kwargs.get('container_tag', '0.0.1') - super(XGBoostBenchmarkBB, self).__init__(**kwargs) - - -class XGBoostBenchmarkMF(AbstractBenchmarkClient): - def __init__(self, **kwargs): - kwargs['benchmark_name'] = kwargs.get('benchmark_name', 'XGBoostBenchmarkMF') - kwargs['container_name'] = kwargs.get('container_name', 'ml_mmfb') - kwargs['latest'] = kwargs.get('container_tag', '0.0.1') - super(XGBoostBenchmarkMF, self).__init__(**kwargs) - - -class XGBoostSearchSpace3Benchmark(AbstractBenchmarkClient): - def __init__(self, **kwargs): - kwargs['benchmark_name'] = kwargs.get('benchmark_name', 'XGBoostSearchSpace3Benchmark') - kwargs['container_name'] = kwargs.get('container_name', 'ml_mmfb') - kwargs['latest'] = kwargs.get('container_tag', '0.0.1') - super(XGBoostSearchSpace3Benchmark, self).__init__(**kwargs) - - -__all__ = ['XGBoostBenchmark', 'XGBoostBenchmarkBB', 'XGBoostBenchmarkMF'] diff --git a/hpobench/container/recipes/ml_mmfb/Singularity.ml_mmfb b/hpobench/container/recipes/ml/Singularity.ml_mmfb similarity index 95% rename from hpobench/container/recipes/ml_mmfb/Singularity.ml_mmfb rename to hpobench/container/recipes/ml/Singularity.ml_mmfb index 49f9a894..cd8b3e6e 100644 --- 
a/hpobench/container/recipes/ml_mmfb/Singularity.ml_mmfb +++ b/hpobench/container/recipes/ml/Singularity.ml_mmfb @@ -22,4 +22,4 @@ VERSION v0.0.1 %runscript - python -s /home/HPOBench/hpobench/container/server_abstract_benchmark.py ml_mmfb $@ + python -s /home/HPOBench/hpobench/container/server_abstract_benchmark.py ml $@ diff --git a/hpobench/container/recipes/ml_mmfb/Singularity.ml_tabular_benchmark b/hpobench/container/recipes/ml/Singularity.ml_tabular_benchmark similarity index 96% rename from hpobench/container/recipes/ml_mmfb/Singularity.ml_tabular_benchmark rename to hpobench/container/recipes/ml/Singularity.ml_tabular_benchmark index d128211a..16f92de8 100644 --- a/hpobench/container/recipes/ml_mmfb/Singularity.ml_tabular_benchmark +++ b/hpobench/container/recipes/ml/Singularity.ml_tabular_benchmark @@ -22,4 +22,4 @@ VERSION v0.0.1 %runscript - python -s /home/HPOBench/hpobench/container/server_abstract_benchmark.py ml_mmfb $@ + python -s /home/HPOBench/hpobench/container/server_abstract_benchmark.py ml $@ diff --git a/hpobench/dependencies/ml_mmfb/__init__.py b/hpobench/dependencies/ml/__init__.py similarity index 100% rename from hpobench/dependencies/ml_mmfb/__init__.py rename to hpobench/dependencies/ml/__init__.py diff --git a/hpobench/dependencies/ml_mmfb/data_manager.py b/hpobench/dependencies/ml/data_manager.py similarity index 100% rename from hpobench/dependencies/ml_mmfb/data_manager.py rename to hpobench/dependencies/ml/data_manager.py diff --git a/hpobench/dependencies/ml_mmfb/ml_benchmark_template.py b/hpobench/dependencies/ml/ml_benchmark_template.py similarity index 99% rename from hpobench/dependencies/ml_mmfb/ml_benchmark_template.py rename to hpobench/dependencies/ml/ml_benchmark_template.py index 3af13965..3c6fcdaf 100644 --- a/hpobench/dependencies/ml_mmfb/ml_benchmark_template.py +++ b/hpobench/dependencies/ml/ml_benchmark_template.py @@ -9,7 +9,7 @@ precision_score, f1_score from hpobench.abstract_benchmark import AbstractBenchmark -from hpobench.dependencies.ml_mmfb.data_manager import OpenMLDataManager +from hpobench.dependencies.ml.data_manager import OpenMLDataManager from hpobench.util.rng_helper import get_rng metrics = dict( From be641f8bbf81a8394263255e32de72daac17d62b Mon Sep 17 00:00:00 2001 From: PhMueller Date: Thu, 19 Aug 2021 14:55:45 +0200 Subject: [PATCH 078/147] PR: Remove pybnn from the init. This would cause an error since importing stuff for the tab benchmarks would require the pybnn stuff. 
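A guarded import, which a later patch in this series applies to the xgboost benchmark as well, keeps hpobench.benchmarks.ml importable when an optional dependency such as pybnn or xgboost is not installed. The sketch below only illustrates that pattern with module names taken from this series; the actual __init__.py imports more benchmark classes and keeps a static __all__.

# Illustrative sketch of a guarded optional import for the ml package __init__.
from hpobench.benchmarks.ml.tabular_benchmark import TabularBenchmark

__all__ = ['TabularBenchmark']

try:
    # xgboost is only needed for the raw XGBoost benchmarks; a missing package
    # should not break importing the tabular benchmarks.
    from hpobench.benchmarks.ml.xgboost_benchmark import XGBoostBenchmark
    __all__.append('XGBoostBenchmark')
except ImportError:
    pass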
--- hpobench/benchmarks/ml/__init__.py | 3 --- 1 file changed, 3 deletions(-) diff --git a/hpobench/benchmarks/ml/__init__.py b/hpobench/benchmarks/ml/__init__.py index 966d2ed8..b44482c4 100644 --- a/hpobench/benchmarks/ml/__init__.py +++ b/hpobench/benchmarks/ml/__init__.py @@ -6,8 +6,6 @@ from hpobench.benchmarks.ml.svm_benchmark import SVMBenchmark, SVMBenchmarkBB, SVMBenchmarkMF from hpobench.benchmarks.ml.tabular_benchmark import TabularBenchmark from hpobench.benchmarks.ml.xgboost_benchmark import XGBoostBenchmark, XGBoostBenchmarkBB, XGBoostBenchmarkMF -from hpobench.benchmarks.ml.pybnn import BNNOnToyFunction, BNNOnProteinStructure, BNNOnProteinStructure, \ - BNNOnYearPrediction __all__ = ['HistGBBenchmark', 'HistGBBenchmarkBB', 'HistGBBenchmarkMF', @@ -17,5 +15,4 @@ 'SVMBenchmark', 'SVMBenchmarkBB', 'SVMBenchmarkMF', 'TabularBenchmark', 'XGBoostBenchmark', 'XGBoostBenchmarkBB', 'XGBoostBenchmarkMF', - 'BNNOnToyFunction', 'BNNOnProteinStructure', 'BNNOnProteinStructure', 'BNNOnYearPrediction' ] From 7bc25bcaa153d0e8a2b97de3f58ecc043140b891 Mon Sep 17 00:00:00 2001 From: PhMueller Date: Thu, 19 Aug 2021 14:56:49 +0200 Subject: [PATCH 079/147] PR: Cleanup --- hpobench/benchmarks/ml_mmfb/__init__.py | 17 ----------------- 1 file changed, 17 deletions(-) delete mode 100644 hpobench/benchmarks/ml_mmfb/__init__.py diff --git a/hpobench/benchmarks/ml_mmfb/__init__.py b/hpobench/benchmarks/ml_mmfb/__init__.py deleted file mode 100644 index b826bd24..00000000 --- a/hpobench/benchmarks/ml_mmfb/__init__.py +++ /dev/null @@ -1,17 +0,0 @@ -from hpobench.benchmarks.ml.histgb_benchmark import HistGBBenchmark, HistGBBenchmarkBB, HistGBBenchmarkMF -from hpobench.benchmarks.ml.lr_benchmark import LRBenchmark, LRBenchmarkBB, LRBenchmarkMF -from hpobench.benchmarks.ml.nn_benchmark import NNBenchmark, NNBenchmarkBB, NNBenchmarkMF -from hpobench.benchmarks.ml.rf_benchmark import RandomForestBenchmark, RandomForestBenchmarkBB, \ - RandomForestBenchmarkMF -from hpobench.benchmarks.ml.svm_benchmark import SVMBenchmark, SVMBenchmarkBB, SVMBenchmarkMF -from hpobench.benchmarks.ml.tabular_benchmark import TabularBenchmark -from hpobench.benchmarks.ml.xgboost_benchmark import XGBoostBenchmark, XGBoostBenchmarkBB, XGBoostBenchmarkMF - - -__all__ = ['HistGBBenchmark', 'HistGBBenchmarkBB', 'HistGBBenchmarkMF', - 'LRBenchmark', 'LRBenchmarkBB', 'LRBenchmarkMF', - 'NNBenchmark', 'NNBenchmarkBB', 'NNBenchmarkMF', - 'RandomForestBenchmark', 'RandomForestBenchmarkBB', 'RandomForestBenchmarkMF', - 'SVMBenchmark', 'SVMBenchmarkBB', 'SVMBenchmarkMF', - 'TabularBenchmark', - 'XGBoostBenchmark', 'XGBoostBenchmarkBB', 'XGBoostBenchmarkMF'] From b0d9b7f4bf626c3d5f0fa70a2c0c0ef10247bd01 Mon Sep 17 00:00:00 2001 From: PhMueller Date: Thu, 19 Aug 2021 15:24:19 +0200 Subject: [PATCH 080/147] PR: Fix Tests --- tests/test_check_configuration.py | 1 - tests/test_server.py | 2 +- tests/test_svm.py | 2 +- 3 files changed, 2 insertions(+), 3 deletions(-) diff --git a/tests/test_check_configuration.py b/tests/test_check_configuration.py index 382e9810..8d3db58f 100644 --- a/tests/test_check_configuration.py +++ b/tests/test_check_configuration.py @@ -3,7 +3,6 @@ import numpy as np import pytest -import ConfigSpace as CS from ConfigSpace import ConfigurationSpace, Configuration, \ UniformFloatHyperparameter, UniformIntegerHyperparameter, \ diff --git a/tests/test_server.py b/tests/test_server.py index d175c09a..d78cb0cc 100644 --- a/tests/test_server.py +++ b/tests/test_server.py @@ -24,7 +24,7 @@ def test_debug_container(): 
set_log_level(True) - from hpobench.container.benchmarks.ml.xgboost_benchmark import XGBoostBenchmark as Benchmark + from hpobench.container.benchmarks.ml.xgboost_benchmark_old import XGBoostBenchmark as Benchmark from hpobench.util.openml_data_manager import get_openmlcc18_taskids task_id = get_openmlcc18_taskids()[0] diff --git a/tests/test_svm.py b/tests/test_svm.py index a7a31307..c3acf007 100644 --- a/tests/test_svm.py +++ b/tests/test_svm.py @@ -1,6 +1,6 @@ import pytest -from hpobench.container.benchmarks.ml.svm_benchmark import SupportVectorMachine +from hpobench.container.benchmarks.ml.svm_benchmark_old import SupportVectorMachine from hpobench.util.openml_data_manager import get_openmlcc18_taskids task_ids = get_openmlcc18_taskids() From 59bd905370b4cec599c9fbcf6df2a40e8c1c490b Mon Sep 17 00:00:00 2001 From: neeratyoy Date: Thu, 19 Aug 2021 21:44:53 +0200 Subject: [PATCH 081/147] Adding public URLs for tabular benchmark --- hpobench/util/data_manager.py | 14 ++++++++++---- 1 file changed, 10 insertions(+), 4 deletions(-) diff --git a/hpobench/util/data_manager.py b/hpobench/util/data_manager.py index d390218e..d5f7e48f 100644 --- a/hpobench/util/data_manager.py +++ b/hpobench/util/data_manager.py @@ -41,6 +41,14 @@ import hpobench +tabular_multi_fidelity_urls = dict( + xgb="https://figshare.com/articles/dataset/XGBoost/15155919", + svm="https://figshare.com/articles/dataset/SupportVectorMachine/15098280", + lr="https://figshare.com/articles/dataset/LogisticRegression/15098283", + rf="https://figshare.com/articles/dataset/RandomForest/15173517", + nn="https://figshare.com/articles/dataset/NeuralNetwork/15156915" +) + class DataManager(abc.ABC, metaclass=abc.ABCMeta): """ Base Class for loading and managing the data. @@ -930,14 +938,12 @@ class TabularDataManager(DataManager): def __init__(self, model: str, task_id: [int, str], data_dir: [str, Path, None] = None): super(TabularDataManager, self).__init__() - assert model in ['lr', 'svm', 'xgb'], f'Model has to be one of [lr, svm, xgb] but was {model}' + assert model in tabular_multi_fidelity_urls.keys(), f'Model has to be one of [lr, svm, xgb] but was {model}' self.model = model self.task_id = str(task_id) - url_dict = dict(xgb='https://ndownloader.figshare.com/files/29113257?private_link=c817bed4e7efc6daee91', - svm='https://ndownloader.figshare.com/files/29102307?private_link=5a0929ad9b2ccd8dda58', - lr='https://ndownloader.figshare.com/files/29027112?private_link=d644493a93dbab4b4ee1') + url_dict = tabular_multi_fidelity_urls self.url_to_use = url_dict.get(model) From f576fb36b4c8eb40a02653d2c60bea8c6c06da26 Mon Sep 17 00:00:00 2001 From: neeratyoy Date: Thu, 19 Aug 2021 22:27:42 +0200 Subject: [PATCH 082/147] Adding more models --- hpobench/benchmarks/ml/tabular_benchmark.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/hpobench/benchmarks/ml/tabular_benchmark.py b/hpobench/benchmarks/ml/tabular_benchmark.py index 166f8841..20dcf312 100644 --- a/hpobench/benchmarks/ml/tabular_benchmark.py +++ b/hpobench/benchmarks/ml/tabular_benchmark.py @@ -17,7 +17,7 @@ def __init__(self, data_dir: Union[Path, str, None] = None, rng: Union[int, np.random.RandomState, None] = None, **kwargs): - assert model in ['lr', 'svm', 'xgb'], f'Parameter `model` has to be one of [lr, svm, xgb] but was {model}' + assert model in ['lr', 'svm', 'xgb', 'rf', 'nn'], f'Parameter `model` has to be one of [lr, svm, xgb] but was {model}' self.task_id = task_id self.model = model From 63f517782f86c9379dacb1cd6031098f1db2cc7d Mon Sep 17 00:00:00 
2001 From: neeratyoy Date: Fri, 20 Aug 2021 19:26:53 +0200 Subject: [PATCH 083/147] Updating figshare URLs with new public ones --- hpobench/util/data_manager.py | 13 +++++++------ 1 file changed, 7 insertions(+), 6 deletions(-) diff --git a/hpobench/util/data_manager.py b/hpobench/util/data_manager.py index 603c2334..0ee6d0aa 100644 --- a/hpobench/util/data_manager.py +++ b/hpobench/util/data_manager.py @@ -42,11 +42,11 @@ tabular_multi_fidelity_urls = dict( - xgb="https://figshare.com/articles/dataset/XGBoost/15155919", - svm="https://figshare.com/articles/dataset/SupportVectorMachine/15098280", - lr="https://figshare.com/articles/dataset/LogisticRegression/15098283", - rf="https://figshare.com/articles/dataset/RandomForest/15173517", - nn="https://figshare.com/articles/dataset/NeuralNetwork/15156915" + xgb="https://ndownloader.figshare.com/files/29469231", + svm="https://ndownloader.figshare.com/files/29471790", + lr="https://ndownloader.figshare.com/files/29470119", + rf="https://ndownloader.figshare.com/files/29466012", + nn="https://ndownloader.figshare.com/files/29467902" ) class DataManager(abc.ABC, metaclass=abc.ABCMeta): @@ -938,7 +938,8 @@ class TabularDataManager(DataManager): def __init__(self, model: str, task_id: [int, str], data_dir: [str, Path, None] = None): super(TabularDataManager, self).__init__() - assert model in tabular_multi_fidelity_urls.keys(), f'Model has to be one of [lr, svm, xgb] but was {model}' + assert model in tabular_multi_fidelity_urls.keys(), \ + f'Model has to be one of {list(tabular_multi_fidelity_urls.keys())} but was {model}' self.model = model self.task_id = str(task_id) From 53358314583e454ca0b47ca980bf68365640c4be Mon Sep 17 00:00:00 2001 From: PhMueller Date: Fri, 20 Aug 2021 21:58:11 +0200 Subject: [PATCH 084/147] PR Fix URLs and dependencies --- extra_requirements/ml_mfbb.json | 4 ++-- hpobench/benchmarks/ml/__init__.py | 6 +++++- hpobench/benchmarks/ml/tabular_benchmark.py | 4 ++-- hpobench/util/data_manager.py | 15 ++++++++++----- 4 files changed, 19 insertions(+), 10 deletions(-) diff --git a/extra_requirements/ml_mfbb.json b/extra_requirements/ml_mfbb.json index db67ecd0..87de2c63 100644 --- a/extra_requirements/ml_mfbb.json +++ b/extra_requirements/ml_mfbb.json @@ -1,4 +1,4 @@ { - "ml_tabular_benchmarks": ["pandas==1.2.4","openml==0.12.2"], - "ml_mfbb": ["pandas==1.2.4","scikit-learn==0.24.2","openml==0.12.2","xgboost==1.3.1"] + "ml_tabular_benchmarks": ["tqdm","pandas==1.2.4","scikit-learn==0.24.2","openml==0.12.2","xgboost==1.3.1"], + "ml_mfbb": ["tqdm","pandas==1.2.4","scikit-learn==0.24.2","openml==0.12.2","xgboost==1.3.1"] } \ No newline at end of file diff --git a/hpobench/benchmarks/ml/__init__.py b/hpobench/benchmarks/ml/__init__.py index b44482c4..64e399cd 100644 --- a/hpobench/benchmarks/ml/__init__.py +++ b/hpobench/benchmarks/ml/__init__.py @@ -5,7 +5,11 @@ RandomForestBenchmarkMF from hpobench.benchmarks.ml.svm_benchmark import SVMBenchmark, SVMBenchmarkBB, SVMBenchmarkMF from hpobench.benchmarks.ml.tabular_benchmark import TabularBenchmark -from hpobench.benchmarks.ml.xgboost_benchmark import XGBoostBenchmark, XGBoostBenchmarkBB, XGBoostBenchmarkMF + +try: + from hpobench.benchmarks.ml.xgboost_benchmark import XGBoostBenchmark, XGBoostBenchmarkBB, XGBoostBenchmarkMF +except ImportError: + pass __all__ = ['HistGBBenchmark', 'HistGBBenchmarkBB', 'HistGBBenchmarkMF', diff --git a/hpobench/benchmarks/ml/tabular_benchmark.py b/hpobench/benchmarks/ml/tabular_benchmark.py index 166f8841..fa5421b2 100644 --- 
a/hpobench/benchmarks/ml/tabular_benchmark.py +++ b/hpobench/benchmarks/ml/tabular_benchmark.py @@ -16,8 +16,8 @@ def __init__(self, model: str, task_id: int, data_dir: Union[Path, str, None] = None, rng: Union[int, np.random.RandomState, None] = None, **kwargs): - - assert model in ['lr', 'svm', 'xgb'], f'Parameter `model` has to be one of [lr, svm, xgb] but was {model}' + models = ['lr', 'svm', 'xgb', 'rf', 'nn'] + assert model in models, f'Parameter `model` has to be one of {models} but was {model}' self.task_id = task_id self.model = model diff --git a/hpobench/util/data_manager.py b/hpobench/util/data_manager.py index 780bb06c..cf3200fd 100644 --- a/hpobench/util/data_manager.py +++ b/hpobench/util/data_manager.py @@ -930,15 +930,20 @@ class TabularDataManager(DataManager): def __init__(self, model: str, task_id: [int, str], data_dir: [str, Path, None] = None): super(TabularDataManager, self).__init__() - assert model in ['lr', 'svm', 'xgb'], f'Model has to be one of [lr, svm, xgb] but was {model}' + url_dict = dict( + xgb="https://ndownloader.figshare.com/files/29469231", + svm="https://ndownloader.figshare.com/files/29471790", + lr="https://ndownloader.figshare.com/files/29470119", + rf="https://ndownloader.figshare.com/files/29466012", + nn="https://ndownloader.figshare.com/files/29467902" + ) + + assert model in url_dict.keys(), \ + f'Model has to be one of {list(url_dict.keys())} but was {model}' self.model = model self.task_id = str(task_id) - url_dict = dict(xgb='https://ndownloader.figshare.com/files/29113257?private_link=c817bed4e7efc6daee91', - svm='https://ndownloader.figshare.com/files/29102307?private_link=5a0929ad9b2ccd8dda58', - lr='https://ndownloader.figshare.com/files/29027112?private_link=d644493a93dbab4b4ee1') - self.url_to_use = url_dict.get(model) if data_dir is None: From cf9b4ef61b24569b2e3ed76fd35800ead9f65232 Mon Sep 17 00:00:00 2001 From: neeratyoy Date: Sun, 22 Aug 2021 01:59:20 +0200 Subject: [PATCH 085/147] Updating URL for SVM data --- hpobench/util/data_manager.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/hpobench/util/data_manager.py b/hpobench/util/data_manager.py index 0ee6d0aa..f36bc234 100644 --- a/hpobench/util/data_manager.py +++ b/hpobench/util/data_manager.py @@ -43,7 +43,7 @@ tabular_multi_fidelity_urls = dict( xgb="https://ndownloader.figshare.com/files/29469231", - svm="https://ndownloader.figshare.com/files/29471790", + svm="https://ndownloader.figshare.com/files/30300531", lr="https://ndownloader.figshare.com/files/29470119", rf="https://ndownloader.figshare.com/files/29466012", nn="https://ndownloader.figshare.com/files/29467902" From ed7d23ed76c44a45b71354a33710350d86097f6a Mon Sep 17 00:00:00 2001 From: neeratyoy Date: Mon, 23 Aug 2021 02:08:35 +0200 Subject: [PATCH 086/147] Updating Tabular bench URLs --- hpobench/util/data_manager.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/hpobench/util/data_manager.py b/hpobench/util/data_manager.py index f36bc234..eafa00ba 100644 --- a/hpobench/util/data_manager.py +++ b/hpobench/util/data_manager.py @@ -42,11 +42,11 @@ tabular_multi_fidelity_urls = dict( - xgb="https://ndownloader.figshare.com/files/29469231", - svm="https://ndownloader.figshare.com/files/30300531", - lr="https://ndownloader.figshare.com/files/29470119", - rf="https://ndownloader.figshare.com/files/29466012", - nn="https://ndownloader.figshare.com/files/29467902" + xgb="https://ndownloader.figshare.com/files/30378972", + 
svm="https://ndownloader.figshare.com/files/30379359", + lr="https://ndownloader.figshare.com/files/30379038", + rf="https://ndownloader.figshare.com/files/30378930", + nn="https://ndownloader.figshare.com/files/30379005" ) class DataManager(abc.ABC, metaclass=abc.ABCMeta): From 9181bbb3608e7159edbfc16a28e1e18da2ae45a5 Mon Sep 17 00:00:00 2001 From: PhMueller Date: Wed, 25 Aug 2021 15:47:35 +0200 Subject: [PATCH 087/147] PR Fix URLs and dependencies --- hpobench/util/data_manager.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/hpobench/util/data_manager.py b/hpobench/util/data_manager.py index cf3200fd..9712ff8e 100644 --- a/hpobench/util/data_manager.py +++ b/hpobench/util/data_manager.py @@ -931,11 +931,11 @@ def __init__(self, model: str, task_id: [int, str], data_dir: [str, Path, None] super(TabularDataManager, self).__init__() url_dict = dict( - xgb="https://ndownloader.figshare.com/files/29469231", - svm="https://ndownloader.figshare.com/files/29471790", - lr="https://ndownloader.figshare.com/files/29470119", - rf="https://ndownloader.figshare.com/files/29466012", - nn="https://ndownloader.figshare.com/files/29467902" + xgb="https://ndownloader.figshare.com/files/30378972", + svm="https://ndownloader.figshare.com/files/30379359", + lr="https://ndownloader.figshare.com/files/30379038", + rf="https://ndownloader.figshare.com/files/30378930", + nn="https://ndownloader.figshare.com/files/30379005" ) assert model in url_dict.keys(), \ From 451ff08bd56e105ec8a633443d2314e895048494 Mon Sep 17 00:00:00 2001 From: PhMueller Date: Wed, 25 Aug 2021 16:52:30 +0200 Subject: [PATCH 088/147] PR Fix URLs and dependencies --- ci_scripts/install.sh | 2 +- extra_requirements/tests.json | 3 ++- tests/test_whitebox.py | 2 +- 3 files changed, 4 insertions(+), 3 deletions(-) diff --git a/ci_scripts/install.sh b/ci_scripts/install.sh index 9ed0f5b6..d2799899 100644 --- a/ci_scripts/install.sh +++ b/ci_scripts/install.sh @@ -4,7 +4,7 @@ install_packages="" if [[ "$RUN_TESTS" == "true" ]]; then echo "Install tools for testing" - install_packages="${install_packages}xgboost,pytest,test_paramnet," + install_packages="${install_packages}xgboost,pytest,test_paramnet,test_tabular_datamanager" pip install codecov # The param net benchmark does not work with a scikit-learn version != 0.23.2. 
(See notes in the benchmark) diff --git a/extra_requirements/tests.json b/extra_requirements/tests.json index fff27ee1..0b8deb77 100644 --- a/extra_requirements/tests.json +++ b/extra_requirements/tests.json @@ -1,5 +1,6 @@ { "codestyle": ["pycodestyle","flake8","pylint"], "pytest": ["pytest>=4.6","pytest-cov"], - "test_paramnet": ["tqdm", "scikit-learn==0.23.2"] + "test_paramnet": ["tqdm", "scikit-learn==0.23.2"], + "test_tabular_datamanager": ["pyarrow"] } \ No newline at end of file diff --git a/tests/test_whitebox.py b/tests/test_whitebox.py index c3f5e0ff..bff5a77e 100644 --- a/tests/test_whitebox.py +++ b/tests/test_whitebox.py @@ -39,7 +39,7 @@ def test_whitebox_without_container_xgb(): @pytest.mark.skipif(skip_container_test, reason="Requires singularity and flask") def test_whitebox_with_container(): - from hpobench.container.benchmarks.ml.xgboost_benchmark import XGBoostBenchmark as Benchmark + from hpobench.container.benchmarks.ml.xgboost_benchmark_old import XGBoostBenchmark as Benchmark b = Benchmark(container_name='xgboost_benchmark', task_id=167199, rng=0) From 310b11e5c938638d81dfd8ffea65d94405e27e19 Mon Sep 17 00:00:00 2001 From: Neeratyoy Mallik Date: Thu, 26 Aug 2021 14:28:55 +0200 Subject: [PATCH 089/147] Updating RF benchmark URL --- hpobench/util/data_manager.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/hpobench/util/data_manager.py b/hpobench/util/data_manager.py index eafa00ba..f08ffb90 100644 --- a/hpobench/util/data_manager.py +++ b/hpobench/util/data_manager.py @@ -45,7 +45,7 @@ xgb="https://ndownloader.figshare.com/files/30378972", svm="https://ndownloader.figshare.com/files/30379359", lr="https://ndownloader.figshare.com/files/30379038", - rf="https://ndownloader.figshare.com/files/30378930", + rf="https://ndownloader.figshare.com/files/30469089", nn="https://ndownloader.figshare.com/files/30379005" ) From f01286b56d7066614c212dbff13ad7ce12bd4432 Mon Sep 17 00:00:00 2001 From: Neeratyoy Mallik Date: Thu, 26 Aug 2021 15:30:23 +0200 Subject: [PATCH 090/147] Updating XGB URL --- hpobench/util/data_manager.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/hpobench/util/data_manager.py b/hpobench/util/data_manager.py index f08ffb90..1a36025f 100644 --- a/hpobench/util/data_manager.py +++ b/hpobench/util/data_manager.py @@ -42,7 +42,7 @@ tabular_multi_fidelity_urls = dict( - xgb="https://ndownloader.figshare.com/files/30378972", + xgb="https://ndownloader.figshare.com/files/30469920", svm="https://ndownloader.figshare.com/files/30379359", lr="https://ndownloader.figshare.com/files/30379038", rf="https://ndownloader.figshare.com/files/30469089", From 12b72b1d4d7be78798dda4e4a07f7f5c83463841 Mon Sep 17 00:00:00 2001 From: PhMueller Date: Fri, 27 Aug 2021 19:39:29 +0200 Subject: [PATCH 091/147] PR Fix tests --- ci_scripts/install.sh | 2 +- extra_requirements/tests.json | 2 +- tests/test_whitebox.py | 4 ++-- 3 files changed, 4 insertions(+), 4 deletions(-) diff --git a/ci_scripts/install.sh b/ci_scripts/install.sh index d2799899..03f3f185 100644 --- a/ci_scripts/install.sh +++ b/ci_scripts/install.sh @@ -4,7 +4,7 @@ install_packages="" if [[ "$RUN_TESTS" == "true" ]]; then echo "Install tools for testing" - install_packages="${install_packages}xgboost,pytest,test_paramnet,test_tabular_datamanager" + install_packages="${install_packages}xgboost,pytest,test_paramnet,test_tabular_datamanager," pip install codecov # The param net benchmark does not work with a scikit-learn version != 0.23.2. 
(See notes in the benchmark) diff --git a/extra_requirements/tests.json b/extra_requirements/tests.json index 0b8deb77..6c27be97 100644 --- a/extra_requirements/tests.json +++ b/extra_requirements/tests.json @@ -2,5 +2,5 @@ "codestyle": ["pycodestyle","flake8","pylint"], "pytest": ["pytest>=4.6","pytest-cov"], "test_paramnet": ["tqdm", "scikit-learn==0.23.2"], - "test_tabular_datamanager": ["pyarrow"] + "test_tabular_datamanager": ["pyarrow", "fastparquet"] } \ No newline at end of file diff --git a/tests/test_whitebox.py b/tests/test_whitebox.py index bff5a77e..35a9a940 100644 --- a/tests/test_whitebox.py +++ b/tests/test_whitebox.py @@ -32,8 +32,8 @@ def test_whitebox_without_container_xgb(): result_dict = b.objective_function_test(configuration, fidelity=dict(n_estimators=n_estimator), rng=0) test_loss = result_dict['function_value'] - assert np.isclose(train_loss, 0.0223, atol=0.001) - assert np.isclose(valid_loss, 0.4234, atol=0.001) + assert np.isclose(train_loss, 0.02678, atol=0.001) + assert np.isclose(valid_loss, 0.49549, atol=0.001) assert np.isclose(test_loss, 0.43636, atol=0.001) From 41aa96bf657be97693d9f31290d586957b84c749 Mon Sep 17 00:00:00 2001 From: PhMueller Date: Fri, 27 Aug 2021 19:40:49 +0200 Subject: [PATCH 092/147] New Urls --- hpobench/util/data_manager.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/hpobench/util/data_manager.py b/hpobench/util/data_manager.py index 9712ff8e..a2e33121 100644 --- a/hpobench/util/data_manager.py +++ b/hpobench/util/data_manager.py @@ -931,10 +931,10 @@ def __init__(self, model: str, task_id: [int, str], data_dir: [str, Path, None] super(TabularDataManager, self).__init__() url_dict = dict( - xgb="https://ndownloader.figshare.com/files/30378972", + xgb="https://ndownloader.figshare.com/files/30469920", svm="https://ndownloader.figshare.com/files/30379359", lr="https://ndownloader.figshare.com/files/30379038", - rf="https://ndownloader.figshare.com/files/30378930", + rf="https://ndownloader.figshare.com/files/30469089", nn="https://ndownloader.figshare.com/files/30379005" ) From c23e3545c1931fa9bd17d0d16b81b6db15de0937 Mon Sep 17 00:00:00 2001 From: PhMueller Date: Mon, 30 Aug 2021 17:40:52 +0200 Subject: [PATCH 093/147] Trigger Rebuild. --- ci_scripts/install.sh | 1 - 1 file changed, 1 deletion(-) diff --git a/ci_scripts/install.sh b/ci_scripts/install.sh index 03f3f185..20652052 100644 --- a/ci_scripts/install.sh +++ b/ci_scripts/install.sh @@ -65,7 +65,6 @@ if [[ "$USE_SINGULARITY" == "true" ]]; then sudo make -C builddir install cd .. - install_packages="${install_packages}singularity," else echo "Skip installing Singularity" fi From 1fa684c3e0a86b46c176a44db7ab84427b3603db Mon Sep 17 00:00:00 2001 From: PhMueller Date: Mon, 30 Aug 2021 18:15:40 +0200 Subject: [PATCH 094/147] Fix Dataloader Assertion --- ci_scripts/install.sh | 2 +- tests/test_data_manager.py | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/ci_scripts/install.sh b/ci_scripts/install.sh index 20652052..097d00d0 100644 --- a/ci_scripts/install.sh +++ b/ci_scripts/install.sh @@ -63,7 +63,7 @@ if [[ "$USE_SINGULARITY" == "true" ]]; then ./mconfig && \ make -C builddir && \ sudo make -C builddir install - + install_packages="${install_packages}placeholder," cd .. 
else echo "Skip installing Singularity" diff --git a/tests/test_data_manager.py b/tests/test_data_manager.py index fd57b627..7e32ce84 100644 --- a/tests/test_data_manager.py +++ b/tests/test_data_manager.py @@ -108,7 +108,7 @@ def test_tabular_datamanager(): table, meta_data = dm.load() - assert (hpobench.config_file.data_dir / "TabularData" / str(3) / f'lr_3_data.parquet.gzip').exists() - assert (hpobench.config_file.data_dir / "TabularData" / str(3) / f'lr_3_metadata.json').exists() + assert (hpobench.config_file.data_dir / "TabularData" / 'lr' / str(3) / f'lr_3_data.parquet.gzip').exists() + assert (hpobench.config_file.data_dir / "TabularData" / 'lr' / str(3) / f'lr_3_metadata.json').exists() table_2, meta_data_2 = dm.load() From bfb38766f3e818dfd8640d9818dceb2103c07425 Mon Sep 17 00:00:00 2001 From: neeratyoy Date: Fri, 1 Oct 2021 18:38:41 +0200 Subject: [PATCH 095/147] Redesigning reporting of val-test evaluations on query type --- .../dependencies/ml/ml_benchmark_template.py | 43 +++++++++---------- 1 file changed, 21 insertions(+), 22 deletions(-) diff --git a/hpobench/dependencies/ml/ml_benchmark_template.py b/hpobench/dependencies/ml/ml_benchmark_template.py index 3c6fcdaf..191f8999 100644 --- a/hpobench/dependencies/ml/ml_benchmark_template.py +++ b/hpobench/dependencies/ml/ml_benchmark_template.py @@ -134,7 +134,9 @@ def get_fidelity(self, size: Union[int, None] = None): return self.fidelity_space.sample_configuration() return [self.fidelity_space.sample_configuration() for i in range(size)] - def shuffle_data_idx(self, train_idx: Iterable = None, rng: Union[np.random.RandomState, None] = None) -> Iterable: + def shuffle_data_idx( + self, train_idx: Iterable = None, rng: Union[np.random.RandomState, None] = None + ) -> Iterable: rng = self.rng if rng is None else rng train_idx = self.train_idx if train_idx is None else train_idx rng.shuffle(train_idx) @@ -154,14 +156,16 @@ def _train_objective(self, model = self.init_model(config, fidelity, rng) # preparing data - if eval == "valid": + if evaluation == "valid": train_X = self.train_X train_y = self.train_y train_idx = self.train_idx - else: + elif evaluation == "test": train_X = np.vstack((self.train_X, self.valid_X)) train_y = pd.concat((self.train_y, self.valid_y)) train_idx = np.arange(len(train_X)) + else: + raise ValueError("{} not in ['valid', 'test']".format(evaluation)) # shuffling data if shuffle: @@ -190,10 +194,9 @@ def _train_objective(self, for k, v in self.scorers.items(): scores[k] = 0.0 score_cost[k] = 0.0 - if evaluation == "test": - _start = time.time() - scores[k] = v(model, train_X, train_y) - score_cost[k] = time.time() - _start + _start = time.time() + scores[k] = v(model, train_X, train_y) + score_cost[k] = time.time() - _start train_loss = 1 - scores["acc"] return model, model_fit_time, train_loss, scores, score_cost @@ -208,7 +211,7 @@ def objective_function(self, """Function that evaluates a 'config' on a 'fidelity' on the validation set """ model, model_fit_time, train_loss, train_scores, train_score_cost = self._train_objective( - configuration, fidelity, shuffle, rng, evaluation="val" + configuration, fidelity, shuffle, rng, evaluation="valid" ) val_scores = dict() val_score_cost = dict() @@ -218,33 +221,29 @@ def objective_function(self, val_score_cost[k] = time.time() - _start val_loss = 1 - val_scores["acc"] - test_scores = dict() - test_score_cost = dict() - for k, v in self.scorers.items(): - _start = time.time() - test_scores[k] = v(model, self.test_X, self.test_y) - test_score_cost[k] = 
time.time() - _start - test_loss = 1 - test_scores["acc"] + fidelity = fidelity.get_dictionary() if isinstance(fidelity, CS.Configuration) else fidelity + configuration = configuration.get_dictionary() \ + if isinstance(configuration, CS.Configuration) else configuration info = { 'train_loss': train_loss, 'val_loss': val_loss, - 'test_loss': test_loss, + 'test_loss': None, 'model_cost': model_fit_time, 'train_scores': train_scores, 'train_costs': train_score_cost, 'val_scores': val_scores, 'val_costs': val_score_cost, - 'test_scores': test_scores, - 'test_costs': test_score_cost, + 'test_scores': None, + 'test_costs': None, # storing as dictionary and not ConfigSpace saves tremendous memory 'fidelity': fidelity, 'config': configuration, } return { - 'function_value': info['val_loss'], - 'cost': model_fit_time + info['val_costs']['acc'], + 'function_value': float(info['val_loss']), + 'cost': float(model_fit_time + info['val_costs']['acc']), 'info': info } @@ -276,8 +275,8 @@ def objective_function_test(self, 'model_cost': model_fit_time, 'train_scores': train_scores, 'train_costs': train_score_cost, - 'val_scores': dict(), - 'val_costs': dict(), + 'val_scores': None, + 'val_costs': None, 'test_scores': test_scores, 'test_costs': test_score_cost, # storing as dictionary and not ConfigSpace saves tremendous memory From 6394521080fa51d7df952129972905398f6a26fe Mon Sep 17 00:00:00 2001 From: neeratyoy Date: Thu, 7 Oct 2021 01:18:42 +0200 Subject: [PATCH 096/147] inference cost key fix --- hpobench/benchmarks/ml/tabular_benchmark.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/hpobench/benchmarks/ml/tabular_benchmark.py b/hpobench/benchmarks/ml/tabular_benchmark.py index 72e5fb31..c5525bf5 100644 --- a/hpobench/benchmarks/ml/tabular_benchmark.py +++ b/hpobench/benchmarks/ml/tabular_benchmark.py @@ -163,7 +163,7 @@ def _objective( metric_str = ', '.join(list(metrics.keys())) assert metric in list(metrics.keys()), f"metric not found among: {metric_str}" score_key = f"{evaluation}_scores" - cost_key = f"{evaluation}_scores" + cost_key = f"{evaluation}_costs" key_path = dict() for name in self.configuration_space.get_hyperparameter_names(): From 460ae8c04e12e718f761194c9fa1b5b144a7020e Mon Sep 17 00:00:00 2001 From: neeratyoy Date: Mon, 18 Oct 2021 23:23:54 +0200 Subject: [PATCH 097/147] Basic redesign of data collected on raw objective --- hpobench/benchmarks/ml/lr_benchmark.py | 1 - .../dependencies/ml/ml_benchmark_template.py | 55 +++++++++++++++---- 2 files changed, 43 insertions(+), 13 deletions(-) diff --git a/hpobench/benchmarks/ml/lr_benchmark.py b/hpobench/benchmarks/ml/lr_benchmark.py index 8c317111..dea69889 100644 --- a/hpobench/benchmarks/ml/lr_benchmark.py +++ b/hpobench/benchmarks/ml/lr_benchmark.py @@ -103,7 +103,6 @@ def init_model(self, config: Union[CS.Configuration, Dict], learning_rate="adaptive", tol=None, random_state=rng, - ) return model diff --git a/hpobench/dependencies/ml/ml_benchmark_template.py b/hpobench/dependencies/ml/ml_benchmark_template.py index 191f8999..9974ce3f 100644 --- a/hpobench/dependencies/ml/ml_benchmark_template.py +++ b/hpobench/dependencies/ml/ml_benchmark_template.py @@ -49,9 +49,8 @@ def __init__( self.task_id = task_id self.valid_size = valid_size - self.scorers = dict() - for k, v in metrics.items(): - self.scorers[k] = make_scorer(v, **metrics_kwargs[k]) + self.scorers = metrics + self.scorer_args = metrics_kwargs if data_path is None: from hpobench import config_file @@ -188,6 +187,10 @@ def _train_objective(self, start = 
time.time() model.fit(train_X[train_idx], train_y.iloc[train_idx]) model_fit_time = time.time() - start + # model inference + start = time.time() + pred_train = model.predict(train_X) + inference_time = time.time() - start # computing statistics on training data scores = dict() score_cost = dict() @@ -195,8 +198,8 @@ def _train_objective(self, scores[k] = 0.0 score_cost[k] = 0.0 _start = time.time() - scores[k] = v(model, train_X, train_y) - score_cost[k] = time.time() - _start + scores[k] = v(train_y, pred_train, **self.scorer_args[k]) + score_cost[k] = time.time() - _start + inference_time train_loss = 1 - scores["acc"] return model, model_fit_time, train_loss, scores, score_cost @@ -213,14 +216,35 @@ def objective_function(self, model, model_fit_time, train_loss, train_scores, train_score_cost = self._train_objective( configuration, fidelity, shuffle, rng, evaluation="valid" ) + + # model inference on validation set + start = time.time() + pred_val = model.predict(self.valid_X) + val_inference_time = time.time() - start val_scores = dict() val_score_cost = dict() for k, v in self.scorers.items(): + val_scores[k] = 0.0 + val_score_cost[k] = 0.0 _start = time.time() - val_scores[k] = v(model, self.valid_X, self.valid_y) - val_score_cost[k] = time.time() - _start + val_scores[k] = v(self.valid_y, pred_val, **self.scorer_args[k]) + val_score_cost[k] = time.time() - _start + val_inference_time val_loss = 1 - val_scores["acc"] + # model inference on test set + start = time.time() + pred_test = model.predict(self.test_X) + test_inference_time = time.time() - start + test_scores = dict() + test_score_cost = dict() + for k, v in self.scorers.items(): + test_scores[k] = 0.0 + test_score_cost[k] = 0.0 + _start = time.time() + test_scores[k] = v(self.test_y, pred_test, **self.scorer_args[k]) + test_score_cost[k] = time.time() - _start + test_inference_time + test_loss = 1 - test_scores["acc"] + fidelity = fidelity.get_dictionary() if isinstance(fidelity, CS.Configuration) else fidelity configuration = configuration.get_dictionary() \ if isinstance(configuration, CS.Configuration) else configuration @@ -228,14 +252,14 @@ def objective_function(self, info = { 'train_loss': train_loss, 'val_loss': val_loss, - 'test_loss': None, + 'test_loss': test_loss, 'model_cost': model_fit_time, 'train_scores': train_scores, 'train_costs': train_score_cost, 'val_scores': val_scores, 'val_costs': val_score_cost, - 'test_scores': None, - 'test_costs': None, + 'test_scores': test_scores, + 'test_costs': test_score_cost, # storing as dictionary and not ConfigSpace saves tremendous memory 'fidelity': fidelity, 'config': configuration, @@ -260,12 +284,19 @@ def objective_function_test(self, model, model_fit_time, train_loss, train_scores, train_score_cost = self._train_objective( configuration, fidelity, shuffle, rng, evaluation="test" ) + + # model inference on test set + start = time.time() + pred_test = model.predict(self.test_X) + test_inference_time = time.time() - start test_scores = dict() test_score_cost = dict() for k, v in self.scorers.items(): + test_scores[k] = 0.0 + test_score_cost[k] = 0.0 _start = time.time() - test_scores[k] = v(model, self.test_X, self.test_y) - test_score_cost[k] = time.time() - _start + test_scores[k] = v(self.test_y, pred_test, **self.scorer_args[k]) + test_score_cost[k] = time.time() - _start + test_inference_time test_loss = 1 - test_scores["acc"] info = { From fe41f41e8bf4b9d66711daaf8fbccd375881e835 Mon Sep 17 00:00:00 2001 From: Katharina Eggensperger Date: Mon, 25 Oct 2021 
15:11:49 +0200 Subject: [PATCH 098/147] Update __version__.py (#137) * Update __version__.py * Update changelog.md --- changelog.md | 2 ++ hpobench/__version__.py | 2 +- 2 files changed, 3 insertions(+), 1 deletion(-) diff --git a/changelog.md b/changelog.md index 18b3b9fd..fd54dafa 100644 --- a/changelog.md +++ b/changelog.md @@ -1,3 +1,5 @@ +# 0.0.11 + # 0.0.10 * Cartpole Benchmark Version 0.0.4: Fix: Pass the hp `entropy_regularization` to the PPO Agent. diff --git a/hpobench/__version__.py b/hpobench/__version__.py index 6820f36a..7f116511 100644 --- a/hpobench/__version__.py +++ b/hpobench/__version__.py @@ -1 +1 @@ -__version__ = '0.0.10' +__version__ = '0.0.11dev' From cdd222fc59be618dd775f3351d433db05a9ea9ba Mon Sep 17 00:00:00 2001 From: neeratyoy Date: Wed, 10 Nov 2021 17:15:41 +0530 Subject: [PATCH 099/147] Restructuring first iteration --- hpobench/benchmarks/ml/lr_benchmark.py | 4 +- .../dependencies/ml/ml_benchmark_template.py | 110 +++++++++++------- 2 files changed, 69 insertions(+), 45 deletions(-) diff --git a/hpobench/benchmarks/ml/lr_benchmark.py b/hpobench/benchmarks/ml/lr_benchmark.py index dea69889..0108c44a 100644 --- a/hpobench/benchmarks/ml/lr_benchmark.py +++ b/hpobench/benchmarks/ml/lr_benchmark.py @@ -22,11 +22,11 @@ class LRBenchmark(MLBenchmark): def __init__(self, task_id: int, - rng: Union[np.random.RandomState, int, None] = None, valid_size: float = 0.33, + rng: Union[np.random.RandomState, int, None] = None, data_path: Union[str, None] = None): - super(LRBenchmark, self).__init__(task_id, rng, valid_size, data_path) + super(LRBenchmark, self).__init__(task_id, valid_size, rng, data_path) self.cache_size = 500 @staticmethod diff --git a/hpobench/dependencies/ml/ml_benchmark_template.py b/hpobench/dependencies/ml/ml_benchmark_template.py index 9974ce3f..669651c9 100644 --- a/hpobench/dependencies/ml/ml_benchmark_template.py +++ b/hpobench/dependencies/ml/ml_benchmark_template.py @@ -33,17 +33,33 @@ class MLBenchmark(AbstractBenchmark): def __init__( self, task_id: int, - rng: Union[np.random.RandomState, int, None] = None, valid_size: float = 0.33, + rng: Union[np.random.RandomState, int, None] = None, data_path: Union[str, Path, None] = None, global_seed: int = 1 ): + """ Base template for the ML multi-fidelity benchmarks. + + Parameters + ---------- + task_id : int + A valid OpenML Task ID. + valid_size : float + The fraction of training set to be used as validation split. + rng : np.random.RandomState, int (optional) + The random seed that will be passed to the ML model if not explicitly passed. + data_path : str, Path (optional) + The path from where the training-validation-testing splits may be loaded. + global_seed : int + The fixed global seed that is used for creating validation splits if not available. 
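For orientation, a minimal sketch of how these constructor arguments are passed to one of the concrete subclasses; it assumes a working hpobench installation with the ML extra dependencies and network access to OpenML, and the task id is only an example:

# hedged sketch: task id 10101 and the choice of subclass are arbitrary examples
from hpobench.benchmarks.ml.lr_benchmark import LRBenchmark

# valid_size is the fraction of the training split held out for validation,
# rng seeds the ML model; data_path and global_seed keep their documented defaults
benchmark = LRBenchmark(task_id=10101, valid_size=0.33, rng=1)
print(benchmark.get_meta_information())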
+ """ super(MLBenchmark, self).__init__(rng=rng) if isinstance(rng, int): self.seed = rng else: self.seed = self.rng.randint(1, 10**6) + self.rng = get_rng(self.seed) self.global_seed = global_seed # used for fixed training-validation splits @@ -58,7 +74,7 @@ def __init__( self.data_path = Path(data_path) - dm = OpenMLDataManager(task_id, valid_size, data_path, global_seed) + dm = OpenMLDataManager(self.task_id, self.valid_size, self.data_path, self.global_seed) dm.load() # Data variables @@ -89,32 +105,27 @@ def get_configuration_space(seed: Union[int, None] = None) -> CS.ConfigurationSp @staticmethod def get_fidelity_space(seed: Union[int, None] = None) -> CS.ConfigurationSpace: """Fidelity space available --- specifies the fidelity dimensions - - If fidelity_choice is 0 - Fidelity space is the maximal fidelity, akin to a black-box function - If fidelity_choice is 1 - Fidelity space is a single fidelity, in this case the number of trees (n_estimators) - If fidelity_choice is 2 - Fidelity space is a single fidelity, in this case the fraction of dataset (subsample) - If fidelity_choice is >2 - Fidelity space is multi-multi fidelity, all possible fidelities """ raise NotImplementedError() def get_meta_information(self): - """ Returns the meta information for the benchmark """ + """ Returns the meta information for the benchmark + """ return { 'name': 'Support Vector Machine', 'shape of train data': self.train_X.shape, 'shape of test data': self.test_X.shape, 'shape of valid data': self.valid_X.shape, - 'initial random seed': self.seed, + 'initial random seed': self.rng, 'task_id': self.task_id } - def init_model(self, config: Union[CS.Configuration, Dict], - fidelity: Union[CS.Configuration, Dict, None] = None, - rng: Union[int, np.random.RandomState, None] = None): + def init_model( + self, + config: Union[CS.Configuration, Dict], + fidelity: Union[CS.Configuration, Dict, None] = None, + rng: Union[int, np.random.RandomState, None] = None + ): """ Function that returns the model initialized based on the configuration and fidelity """ raise NotImplementedError() @@ -141,13 +152,15 @@ def shuffle_data_idx( rng.shuffle(train_idx) return train_idx - def _train_objective(self, - config: Dict, - fidelity: Dict, - shuffle: bool, - rng: Union[np.random.RandomState, int, None] = None, - evaluation: Union[str, None] = "valid"): - + def _train_objective( + self, + config: Dict, + fidelity: Dict, + shuffle: bool, + rng: Union[np.random.RandomState, int, None] = None, + evaluation: Union[str, None] = "valid", + record_stats: bool = False + ): if rng is not None: rng = get_rng(rng, self.rng) @@ -158,13 +171,12 @@ def _train_objective(self, if evaluation == "valid": train_X = self.train_X train_y = self.train_y - train_idx = self.train_idx elif evaluation == "test": train_X = np.vstack((self.train_X, self.valid_X)) train_y = pd.concat((self.train_y, self.valid_y)) - train_idx = np.arange(len(train_X)) else: raise ValueError("{} not in ['valid', 'test']".format(evaluation)) + train_idx = np.arange(len(train_X)) if self.train_idx is None else self.train_idx # shuffling data if shuffle: @@ -188,9 +200,12 @@ def _train_objective(self, model.fit(train_X[train_idx], train_y.iloc[train_idx]) model_fit_time = time.time() - start # model inference - start = time.time() - pred_train = model.predict(train_X) - inference_time = time.time() - start + inference_time = 0.0 + # can optionally not record evaluation metrics on training set to save compute + if record_stats: + start = time.time() + pred_train = 
model.predict(train_X) + inference_time = time.time() - start # computing statistics on training data scores = dict() score_cost = dict() @@ -198,23 +213,28 @@ def _train_objective(self, scores[k] = 0.0 score_cost[k] = 0.0 _start = time.time() - scores[k] = v(train_y, pred_train, **self.scorer_args[k]) + if record_stats: + scores[k] = v(train_y, pred_train, **self.scorer_args[k]) score_cost[k] = time.time() - _start + inference_time train_loss = 1 - scores["acc"] return model, model_fit_time, train_loss, scores, score_cost # pylint: disable=arguments-differ @AbstractBenchmark.check_parameters - def objective_function(self, - configuration: Union[CS.Configuration, Dict], - fidelity: Union[CS.Configuration, Dict, None] = None, - shuffle: bool = False, - rng: Union[np.random.RandomState, int, None] = None, - **kwargs) -> Dict: + def objective_function( + self, + configuration: Union[CS.Configuration, Dict], + fidelity: Union[CS.Configuration, Dict, None] = None, + shuffle: bool = False, + rng: Union[np.random.RandomState, int, None] = None, + record_train: bool = False, + **kwargs + ) -> Dict: """Function that evaluates a 'config' on a 'fidelity' on the validation set """ + # obtaining model and training statistics model, model_fit_time, train_loss, train_scores, train_score_cost = self._train_objective( - configuration, fidelity, shuffle, rng, evaluation="valid" + configuration, fidelity, shuffle, rng, evaluation="valid", record_stats=record_train ) # model inference on validation set @@ -273,16 +293,20 @@ def objective_function(self, # pylint: disable=arguments-differ @AbstractBenchmark.check_parameters - def objective_function_test(self, - configuration: Union[CS.Configuration, Dict], - fidelity: Union[CS.Configuration, Dict, None] = None, - shuffle: bool = False, - rng: Union[np.random.RandomState, int, None] = None, - **kwargs) -> Dict: + def objective_function_test( + self, + configuration: Union[CS.Configuration, Dict], + fidelity: Union[CS.Configuration, Dict, None] = None, + shuffle: bool = False, + rng: Union[np.random.RandomState, int, None] = None, + record_train: bool = False, + **kwargs + ) -> Dict: """Function that evaluates a 'config' on a 'fidelity' on the test set """ + # obtaining model and training statistics model, model_fit_time, train_loss, train_scores, train_score_cost = self._train_objective( - configuration, fidelity, shuffle, rng, evaluation="test" + configuration, fidelity, shuffle, rng, evaluation="test", record_stats=record_train ) # model inference on test set From e1bb210cbd1e6b6a00694504eab8f0caea9401ac Mon Sep 17 00:00:00 2001 From: neeratyoy Date: Thu, 11 Nov 2021 14:36:27 +0530 Subject: [PATCH 100/147] Updating ML benches version with revisions --- hpobench/benchmarks/ml/lr_benchmark.py | 43 ++++++++-------- hpobench/benchmarks/ml/nn_benchmark.py | 39 ++++++++++----- hpobench/benchmarks/ml/rf_benchmark.py | 37 +++++++++----- hpobench/benchmarks/ml/svm_benchmark.py | 38 ++++++++------ hpobench/benchmarks/ml/xgboost_benchmark.py | 43 ++++++++++------ .../dependencies/ml/ml_benchmark_template.py | 50 ++++++++++++++++++- 6 files changed, 173 insertions(+), 77 deletions(-) diff --git a/hpobench/benchmarks/ml/lr_benchmark.py b/hpobench/benchmarks/ml/lr_benchmark.py index 0108c44a..588e3d6c 100644 --- a/hpobench/benchmarks/ml/lr_benchmark.py +++ b/hpobench/benchmarks/ml/lr_benchmark.py @@ -4,6 +4,8 @@ 0.0.1: * First implementation of the LR Benchmarks. +0.0.2: +* Restructuring for consistency and to match ML Benchmark Template updates. 
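The restructured objective_function above reports validation and test statistics from a single call; a hedged sketch of reading the returned dictionary (key names follow the info dict constructed in the diff above; benchmark and task choice are arbitrary, and the import assumes xgboost is installed since the package guards it with a try/except):

# hedged sketch, not part of the patch: mirrors the usage shown in the module's __main__ block
from hpobench.benchmarks.ml import XGBoostBenchmarkMF

benchmark = XGBoostBenchmarkMF(task_id=10101)
config = benchmark.configuration_space.sample_configuration()
fidelity = benchmark.fidelity_space.sample_configuration()
result = benchmark.objective_function(config, fidelity, rng=123)

# 'function_value' is the validation loss; 'cost' is the fit time plus the
# time spent scoring validation accuracy; 'info' carries the full breakdown
info = result['info']
print(result['function_value'], result['cost'])
print(info['train_loss'], info['val_loss'], info['test_loss'])
print(info['model_cost'], info['val_costs']['acc'])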
""" @@ -16,18 +18,20 @@ from hpobench.dependencies.ml.ml_benchmark_template import MLBenchmark -__version__ = '0.0.1' +__version__ = '0.0.2' class LRBenchmark(MLBenchmark): - def __init__(self, - task_id: int, - valid_size: float = 0.33, - rng: Union[np.random.RandomState, int, None] = None, - data_path: Union[str, None] = None): - + """ Multi-multi-fidelity Logisitic Regression Benchmark + """ + def __init__( + self, + task_id: int, + valid_size: float = 0.33, + rng: Union[np.random.RandomState, int, None] = None, + data_path: Union[str, None] = None + ): super(LRBenchmark, self).__init__(task_id, valid_size, rng, data_path) - self.cache_size = 500 @staticmethod def get_configuration_space(seed: Union[int, None] = None) -> CS.ConfigurationSpace: @@ -44,6 +48,7 @@ def get_configuration_space(seed: Union[int, None] = None) -> CS.ConfigurationSp ]) return cs + @staticmethod def get_fidelity_space(self, seed: Union[int, None] = None) -> CS.ConfigurationSpace: fidelity_space = CS.ConfigurationSpace(seed=seed) fidelity_space.add_hyperparameters( @@ -55,15 +60,7 @@ def get_fidelity_space(self, seed: Union[int, None] = None) -> CS.ConfigurationS @staticmethod def _get_fidelity_choices(iter_choice: str, subsample_choice: str) -> Tuple[Hyperparameter, Hyperparameter]: """Fidelity space available --- specifies the fidelity dimensions - - For SVM, only a single fidelity exists, i.e., subsample fraction. - if fidelity_choice == 0 - uses the entire data (subsample=1), reflecting the black-box setup - else - parameterizes the fraction of data to subsample - """ - assert iter_choice in ['fixed', 'variable'] assert subsample_choice in ['fixed', 'variable'] @@ -79,14 +76,16 @@ def _get_fidelity_choices(iter_choice: str, subsample_choice: str) -> Tuple[Hype 'subsample', lower=0.1, upper=1.0, default_value=1.0, log=False ) ) - iter = fidelity1[iter_choice] subsample = fidelity2[subsample_choice] return iter, subsample - def init_model(self, config: Union[CS.Configuration, Dict], - fidelity: Union[CS.Configuration, Dict, None] = None, - rng: Union[int, np.random.RandomState, None] = None): + def init_model( + self, + config: Union[CS.Configuration, Dict], + fidelity: Union[CS.Configuration, Dict, None] = None, + rng: Union[int, np.random.RandomState, None] = None + ): # initializing model rng = self.rng if rng is None else rng @@ -108,6 +107,8 @@ def init_model(self, config: Union[CS.Configuration, Dict], class LRBenchmarkBB(LRBenchmark): + """ Black-box version of the LRBenchmark + """ def get_fidelity_space(self, seed: Union[int, None] = None) -> CS.ConfigurationSpace: fidelity_space = CS.ConfigurationSpace(seed=seed) fidelity_space.add_hyperparameters( @@ -118,6 +119,8 @@ def get_fidelity_space(self, seed: Union[int, None] = None) -> CS.ConfigurationS class LRBenchmarkMF(LRBenchmark): + """ Multi-fidelity version of the LRBenchmark + """ def get_fidelity_space(self, seed: Union[int, None] = None) -> CS.ConfigurationSpace: fidelity_space = CS.ConfigurationSpace(seed=seed) fidelity_space.add_hyperparameters( diff --git a/hpobench/benchmarks/ml/nn_benchmark.py b/hpobench/benchmarks/ml/nn_benchmark.py index 06634661..9692509a 100644 --- a/hpobench/benchmarks/ml/nn_benchmark.py +++ b/hpobench/benchmarks/ml/nn_benchmark.py @@ -4,6 +4,8 @@ 0.0.1: * First implementation of the NN Benchmarks. +0.0.2: +* Restructuring for consistency and to match ML Benchmark Template updates. 
""" from copy import deepcopy @@ -16,16 +18,20 @@ from hpobench.dependencies.ml.ml_benchmark_template import MLBenchmark -__version__ = '0.0.1' +__version__ = '0.0.2' class NNBenchmark(MLBenchmark): - def __init__(self, - task_id: int, - rng: Union[np.random.RandomState, int, None] = None, - valid_size: float = 0.33, - data_path: Union[str, None] = None): - super(NNBenchmark, self).__init__(task_id, rng, valid_size, data_path) + """ Multi-multi-fidelity Multi-Layer Perceptron Benchmark + """ + def __init__( + self, + task_id: int, + valid_size: float = 0.33, + rng: Union[np.random.RandomState, int, None] = None, + data_path: Union[str, None] = None + ): + super(NNBenchmark, self).__init__(task_id, valid_size, rng, data_path) @staticmethod def get_configuration_space(seed: Union[int, None] = None) -> CS.ConfigurationSpace: @@ -64,7 +70,8 @@ def get_fidelity_space(seed: Union[int, None] = None) -> CS.ConfigurationSpace: @staticmethod def _get_fidelity_choices(iter_choice: str, subsample_choice: str) -> Tuple[Hyperparameter, Hyperparameter]: - + """Fidelity space available --- specifies the fidelity dimensions + """ fidelity1 = dict( fixed=CS.Constant('iter', value=243), variable=CS.UniformIntegerHyperparameter( @@ -81,11 +88,13 @@ def _get_fidelity_choices(iter_choice: str, subsample_choice: str) -> Tuple[Hype subsample = fidelity2[subsample_choice] return iter, subsample - def init_model(self, config: Union[CS.Configuration, Dict], - fidelity: Union[CS.Configuration, Dict, None] = None, - rng: Union[int, np.random.RandomState, None] = None): - """ Function that returns the model initialized based on the configuration and fidelity - """ + def init_model( + self, + config: Union[CS.Configuration, Dict], + fidelity: Union[CS.Configuration, Dict, None] = None, + rng: Union[int, np.random.RandomState, None] = None + ): + # initializing model rng = self.rng if rng is None else rng if isinstance(config, CS.Configuration): @@ -111,6 +120,8 @@ def init_model(self, config: Union[CS.Configuration, Dict], class NNBenchmarkBB(NNBenchmark): + """ Black-box version of the NNBenchmark + """ def get_fidelity_space(self, seed: Union[int, None] = None) -> CS.ConfigurationSpace: fidelity_space = CS.ConfigurationSpace(seed=seed) fidelity_space.add_hyperparameters( @@ -121,6 +132,8 @@ def get_fidelity_space(self, seed: Union[int, None] = None) -> CS.ConfigurationS class NNBenchmarkMF(NNBenchmark): + """ Multi-fidelity version of the NNBenchmark + """ def get_fidelity_space(self, seed: Union[int, None] = None) -> CS.ConfigurationSpace: fidelity_space = CS.ConfigurationSpace(seed=seed) fidelity_space.add_hyperparameters( diff --git a/hpobench/benchmarks/ml/rf_benchmark.py b/hpobench/benchmarks/ml/rf_benchmark.py index 596f03b6..284ff8dd 100644 --- a/hpobench/benchmarks/ml/rf_benchmark.py +++ b/hpobench/benchmarks/ml/rf_benchmark.py @@ -4,6 +4,8 @@ 0.0.1: * First implementation of the RF Benchmarks. +0.0.2: +* Restructuring for consistency and to match ML Benchmark Template updates. 
""" from copy import deepcopy @@ -16,16 +18,20 @@ from hpobench.dependencies.ml.ml_benchmark_template import MLBenchmark -__version__ = '0.0.1' +__version__ = '0.0.2' class RandomForestBenchmark(MLBenchmark): - def __init__(self, - task_id: int, - rng: Union[np.random.RandomState, int, None] = None, - valid_size: float = 0.33, - data_path: Union[str, None] = None): - super(RandomForestBenchmark, self).__init__(task_id, rng, valid_size, data_path) + """ Multi-multi-fidelity Random Forest Benchmark + """ + def __init__( + self, + task_id: int, + valid_size: float = 0.33, + rng: Union[np.random.RandomState, int, None] = None, + data_path: Union[str, None] = None + ): + super(RandomForestBenchmark, self).__init__(task_id, valid_size, rng, data_path) @staticmethod def get_configuration_space(seed: Union[int, None] = None) -> CS.ConfigurationSpace: @@ -70,7 +76,6 @@ def _get_fidelity_choices(n_estimators_choice: str, subsample_choice: str) -> Tu 'n_estimators', lower=16, upper=512, default_value=512, log=False ) ) - fidelity2 = dict( fixed=CS.Constant('subsample', value=1), variable=CS.UniformFloatHyperparameter( @@ -81,11 +86,13 @@ def _get_fidelity_choices(n_estimators_choice: str, subsample_choice: str) -> Tu subsample = fidelity2[subsample_choice] return n_estimators, subsample - def init_model(self, config: Union[CS.Configuration, Dict], - fidelity: Union[CS.Configuration, Dict, None] = None, - rng: Union[int, np.random.RandomState, None] = None): - """ Function that returns the model initialized based on the configuration and fidelity - """ + def init_model( + self, + config: Union[CS.Configuration, Dict], + fidelity: Union[CS.Configuration, Dict, None] = None, + rng: Union[int, np.random.RandomState, None] = None + ): + # initializing model rng = self.rng if rng is None else rng if isinstance(config, CS.Configuration): config = config.get_dictionary() @@ -105,6 +112,8 @@ def init_model(self, config: Union[CS.Configuration, Dict], class RandomForestBenchmarkBB(RandomForestBenchmark): + """ Black-box version of the RandomForestBenchmark + """ def get_fidelity_space(self, seed: Union[int, None] = None) -> CS.ConfigurationSpace: fidelity_space = CS.ConfigurationSpace(seed=seed) fidelity_space.add_hyperparameters( @@ -115,6 +124,8 @@ def get_fidelity_space(self, seed: Union[int, None] = None) -> CS.ConfigurationS class RandomForestBenchmarkMF(RandomForestBenchmark): + """ Multi-fidelity version of the RandomForestBenchmark + """ def get_fidelity_space(self, seed: Union[int, None] = None) -> CS.ConfigurationSpace: fidelity_space = CS.ConfigurationSpace(seed=seed) fidelity_space.add_hyperparameters( diff --git a/hpobench/benchmarks/ml/svm_benchmark.py b/hpobench/benchmarks/ml/svm_benchmark.py index 9462442f..bc791c9b 100644 --- a/hpobench/benchmarks/ml/svm_benchmark.py +++ b/hpobench/benchmarks/ml/svm_benchmark.py @@ -4,6 +4,8 @@ 0.0.1: * First implementation of the new SVM Benchmarks. +0.0.2: +* Restructuring for consistency and to match ML Benchmark Template updates. 
""" from typing import Union, Dict @@ -15,18 +17,21 @@ from hpobench.dependencies.ml.ml_benchmark_template import MLBenchmark -__version__ = '0.0.1' +__version__ = '0.0.2' class SVMBenchmark(MLBenchmark): - def __init__(self, - task_id: int, - rng: Union[np.random.RandomState, int, None] = None, - valid_size: float = 0.33, - data_path: Union[str, None] = None): - super(SVMBenchmark, self).__init__(task_id, rng, valid_size, data_path) - - self.cache_size = 200 + """ Multi-multi-fidelity SVM Benchmark + """ + def __init__( + self, + task_id: int, + valid_size: float = 0.33, + rng: Union[np.random.RandomState, int, None] = None, + data_path: Union[str, None] = None + ): + super(SVMBenchmark, self).__init__(task_id, valid_size, rng, data_path) + self.cache_size = 1024 # in MB @staticmethod def get_configuration_space(seed: Union[int, None] = None) -> CS.ConfigurationSpace: @@ -54,7 +59,8 @@ def get_fidelity_space(seed: Union[int, None] = None) -> CS.ConfigurationSpace: @staticmethod def _get_fidelity_choices(subsample_choice: str) -> Hyperparameter: - + """Fidelity space available --- specifies the fidelity dimensions + """ assert subsample_choice in ['fixed', 'variable'] fidelity = dict( @@ -64,12 +70,14 @@ def _get_fidelity_choices(subsample_choice: str) -> Hyperparameter: ) ) subsample = fidelity[subsample_choice] - return subsample - def init_model(self, config: Union[CS.Configuration, Dict], - fidelity: Union[CS.Configuration, Dict, None] = None, - rng: Union[int, np.random.RandomState, None] = None): + def init_model( + self, + config: Union[CS.Configuration, Dict], + fidelity: Union[CS.Configuration, Dict, None] = None, + rng: Union[int, np.random.RandomState, None] = None + ): # initializing model rng = self.rng if rng is None else rng if isinstance(config, CS.Configuration): @@ -83,6 +91,8 @@ def init_model(self, config: Union[CS.Configuration, Dict], class SVMBenchmarkBB(SVMBenchmark): + """ Black-box version of the SVMBenchmark + """ def get_fidelity_space(self, seed: Union[int, None] = None) -> CS.ConfigurationSpace: fidelity_space = CS.ConfigurationSpace(seed=seed) fidelity_space.add_hyperparameter( diff --git a/hpobench/benchmarks/ml/xgboost_benchmark.py b/hpobench/benchmarks/ml/xgboost_benchmark.py index ae554628..629a7fb6 100644 --- a/hpobench/benchmarks/ml/xgboost_benchmark.py +++ b/hpobench/benchmarks/ml/xgboost_benchmark.py @@ -4,6 +4,8 @@ 0.0.1: * First implementation of the new XGB Benchmarks. +0.0.2: +* Restructuring for consistency and to match ML Benchmark Template updates. 
""" from typing import Union, Tuple, Dict @@ -14,16 +16,20 @@ from hpobench.dependencies.ml.ml_benchmark_template import MLBenchmark -__version__ = '0.0.1' +__version__ = '0.0.2' class XGBoostBenchmark(MLBenchmark): - def __init__(self, - task_id: int, - rng: Union[np.random.RandomState, int, None] = None, - valid_size: float = 0.33, - data_path: Union[str, None] = None): - super(XGBoostBenchmark, self).__init__(task_id, rng, valid_size, data_path) + """ Multi-multi-fidelity XGBoost Benchmark + """ + def __init__( + self, + task_id: int, + valid_size: float = 0.33, + rng: Union[np.random.RandomState, int, None] = None, + data_path: Union[str, None] = None + ): + super(XGBoostBenchmark, self).__init__(task_id, valid_size, rng, data_path) @staticmethod def get_configuration_space(seed: Union[int, None] = None) -> CS.ConfigurationSpace: @@ -74,23 +80,24 @@ def _get_fidelity_choices(n_estimators_choice: str, subsample_choice: str) -> Tu 'subsample', lower=0.1, upper=1, default_value=1, log=False ) ) - n_estimators = fidelity1[n_estimators_choice] subsample = fidelity2[subsample_choice] return n_estimators, subsample - def init_model(self, - config: Union[CS.Configuration, Dict], - fidelity: Union[CS.Configuration, Dict, None] = None, - rng: Union[int, np.random.RandomState, None] = None): - """ Function that returns the model initialized based on the configuration and fidelity - """ + def init_model( + self, + config: Union[CS.Configuration, Dict], + fidelity: Union[CS.Configuration, Dict, None] = None, + rng: Union[int, np.random.RandomState, None] = None + ): + # initializing model + # rng = rng if (rng is None or isinstance(rng, int)) else self.seed + rng = rng if isinstance(rng, int) else self.seed + if isinstance(config, CS.Configuration): config = config.get_dictionary() if isinstance(fidelity, CS.Configuration): fidelity = fidelity.get_dictionary() - - rng = rng if (rng is None or isinstance(rng, int)) else self.seed extra_args = dict( booster="gbtree", n_estimators=fidelity['n_estimators'], @@ -110,6 +117,8 @@ def init_model(self, class XGBoostBenchmarkBB(XGBoostBenchmark): + """ Black-box version of the XGBoostBenchmark + """ def get_fidelity_space(self, seed: Union[int, None] = None) -> CS.ConfigurationSpace: fidelity_space = CS.ConfigurationSpace(seed=seed) fidelity_space.add_hyperparameters( @@ -120,6 +129,8 @@ def get_fidelity_space(self, seed: Union[int, None] = None) -> CS.ConfigurationS class XGBoostBenchmarkMF(XGBoostBenchmark): + """ Multi-fidelity version of the XGBoostBenchmark + """ def get_fidelity_space(self, seed: Union[int, None] = None) -> CS.ConfigurationSpace: fidelity_space = CS.ConfigurationSpace(seed=seed) fidelity_space.add_hyperparameters( diff --git a/hpobench/dependencies/ml/ml_benchmark_template.py b/hpobench/dependencies/ml/ml_benchmark_template.py index 669651c9..d6758f83 100644 --- a/hpobench/dependencies/ml/ml_benchmark_template.py +++ b/hpobench/dependencies/ml/ml_benchmark_template.py @@ -181,7 +181,10 @@ def _train_objective( # shuffling data if shuffle: train_idx = self.shuffle_data_idx(train_idx, rng) - train_X = train_X.iloc[train_idx] + if isinstance(train_idx, np.ndarray): + train_X = train_X[train_idx] + else: + train_X = train_X.iloc[train_idx] train_y = train_y.iloc[train_idx] # subsample here: @@ -231,6 +234,23 @@ def objective_function( **kwargs ) -> Dict: """Function that evaluates a 'config' on a 'fidelity' on the validation set + + The ML model is trained on the training split, and evaluated on the valid and test splits. 
+ + Parameters + ---------- + configuration : CS.Configuration, Dict + The hyperparameter configuration. + fidelity : CS.Configuration, Dict + The fidelity configuration. + shuffle : bool (optional) + If True, shuffles the training split before fitting the ML model. + rng : np.random.RandomState, int (optional) + The random seed passed to the ML model and if applicable, used for shuffling the data + and subsampling the dataset fraction. + record_train : bool (optional) + If True, records the evaluation metrics of the trained ML model on the training set. + This is set to False by default to reduce overall compute time. """ # obtaining model and training statistics model, model_fit_time, train_loss, train_scores, train_score_cost = self._train_objective( @@ -303,6 +323,23 @@ def objective_function_test( **kwargs ) -> Dict: """Function that evaluates a 'config' on a 'fidelity' on the test set + + The ML model is trained on the training+valid split, and evaluated on the test split. + + Parameters + ---------- + configuration : CS.Configuration, Dict + The hyperparameter configuration. + fidelity : CS.Configuration, Dict + The fidelity configuration. + shuffle : bool (optional) + If True, shuffles the training split before fitting the ML model. + rng : np.random.RandomState, int (optional) + The random seed passed to the ML model and if applicable, used for shuffling the data + and subsampling the dataset fraction. + record_train : bool (optional) + If True, records the evaluation metrics of the trained ML model on the training set. + This is set to False by default to reduce overall compute time. """ # obtaining model and training statistics model, model_fit_time, train_loss, train_scores, train_score_cost = self._train_objective( @@ -344,3 +381,14 @@ def objective_function_test( 'cost': float(model_fit_time + info['test_costs']['acc']), 'info': info } + + +if __name__ == "__main__": + from hpobench.benchmarks.ml import XGBoostBenchmarkMF + benchmark = XGBoostBenchmarkMF(task_id=10101) + config = benchmark.configuration_space.sample_configuration() + print(config) + fidelity = benchmark.fidelity_space.sample_configuration() + print(fidelity) + res = benchmark.objective_function(config, fidelity, shuffle=True, record_train=True, rng=123) + print(res) From 815da3447c4168df467b2b45d05fd64fe8f81930 Mon Sep 17 00:00:00 2001 From: neeratyoy Date: Thu, 11 Nov 2021 14:49:57 +0530 Subject: [PATCH 101/147] Minor update --- hpobench/benchmarks/ml/svm_benchmark_old.py | 354 -------------- hpobench/benchmarks/ml/tabular_benchmark.py | 4 +- .../benchmarks/ml/xgboost_benchmark_old.py | 430 ------------------ 3 files changed, 3 insertions(+), 785 deletions(-) delete mode 100644 hpobench/benchmarks/ml/svm_benchmark_old.py delete mode 100644 hpobench/benchmarks/ml/xgboost_benchmark_old.py diff --git a/hpobench/benchmarks/ml/svm_benchmark_old.py b/hpobench/benchmarks/ml/svm_benchmark_old.py deleted file mode 100644 index 9aad5e44..00000000 --- a/hpobench/benchmarks/ml/svm_benchmark_old.py +++ /dev/null @@ -1,354 +0,0 @@ -""" - -Changelog: -========== -0.0.3 -* New container release due to a general change in the communication between container and HPOBench. 
- Works with HPOBench >= v0.0.8 - -0.0.2: -* Standardize the structure of the meta information - -0.0.1: -* First implementation - -""" - -import logging -import time -from typing import Union, Tuple, Dict, List - -import ConfigSpace as CS -import numpy as np -from scipy import sparse -from sklearn import pipeline -from sklearn import svm -from sklearn.compose import ColumnTransformer -from sklearn.impute import SimpleImputer -from sklearn.metrics import accuracy_score, make_scorer -from sklearn.preprocessing import OneHotEncoder, MinMaxScaler - -import hpobench.util.rng_helper as rng_helper -from hpobench.abstract_benchmark import AbstractBenchmark -from hpobench.util.openml_data_manager import OpenMLHoldoutDataManager - -__version__ = '0.0.3' - -logger = logging.getLogger('SVMBenchmark') - - -class SupportVectorMachine(AbstractBenchmark): - """ - Hyperparameter optimization task to optimize the regularization - parameter C and the kernel parameter gamma of a support vector machine. - Both hyperparameters are optimized on a log scale in [-10, 10]. - The X_test data set is only used for a final offline evaluation of - a configuration. For that the validation and training data is - concatenated to form the whole training data set. - """ - - def __init__(self, task_id: Union[int, None] = None, - rng: Union[np.random.RandomState, int, None] = None): - """ - Parameters - ---------- - task_id : int, None - rng : np.random.RandomState, int, None - """ - super(SupportVectorMachine, self).__init__(rng=rng) - - self.task_id = task_id - self.cache_size = 200 # Cache for the SVC in MB - self.accuracy_scorer = make_scorer(accuracy_score) - - self.x_train, self.y_train, self.x_valid, self.y_valid, self.x_test, self.y_test, variable_types = \ - self.get_data() - self.categorical_data = np.array([var_type == 'categorical' for var_type in variable_types]) - - # Sort data (Categorical + numerical) so that categorical and continous are not mixed. - categorical_idx = np.argwhere(self.categorical_data) - continuous_idx = np.argwhere(~self.categorical_data) - sorting = np.concatenate([categorical_idx, continuous_idx]).squeeze() - self.categorical_data = self.categorical_data[sorting] - self.x_train = self.x_train[:, sorting] - self.x_valid = self.x_valid[:, sorting] - self.x_test = self.x_test[:, sorting] - - nan_columns = np.all(np.isnan(self.x_train), axis=0) - self.categorical_data = self.categorical_data[~nan_columns] - self.x_train, self.x_valid, self.x_test, self.categories = \ - OpenMLHoldoutDataManager.replace_nans_in_cat_columns(self.x_train, self.x_valid, self.x_test, - is_categorical=self.categorical_data) - - self.train_idx = self.rng.choice(a=np.arange(len(self.x_train)), - size=len(self.x_train), - replace=False) - - # Similar to [Fast Bayesian Optimization of Machine Learning Hyperparameters on Large Datasets] - # (https://arxiv.org/pdf/1605.07079.pdf), - # use 10 time the number of classes as lower bound for the dataset fraction - n_classes = np.unique(self.y_train).shape[0] - self.lower_bound_train_size = (10 * n_classes) / self.x_train.shape[0] - - def get_data(self) -> Tuple[np.ndarray, np.ndarray, np.ndarray, np.ndarray, np.ndarray, np.ndarray, List]: - """ Loads the data given a task or another source. """ - - assert self.task_id is not None, NotImplementedError('No task-id given. 
Please either specify a task-id or ' - 'overwrite the get_data method.') - - data_manager = OpenMLHoldoutDataManager(openml_task_id=self.task_id, rng=self.rng) - x_train, y_train, x_val, y_val, x_test, y_test = data_manager.load() - - return x_train, y_train, x_val, y_val, x_test, y_test, data_manager.variable_types - - def shuffle_data(self, rng=None): - """ Reshuffle the training data. If 'rng' is None, the training idx are shuffled according to the - class-random-state""" - random_state = rng_helper.get_rng(rng, self.rng) - random_state.shuffle(self.train_idx) - - # pylint: disable=arguments-differ - @AbstractBenchmark.check_parameters - def objective_function(self, configuration: Union[CS.Configuration, Dict], - fidelity: Union[CS.Configuration, Dict, None] = None, - shuffle: bool = False, - rng: Union[np.random.RandomState, int, None] = None, **kwargs) -> Dict: - """ - Trains a SVM model given a hyperparameter configuration and - evaluates the model on the validation set. - - Parameters - ---------- - configuration : Dict, CS.Configuration - Configuration for the SVM model - fidelity: Dict, None - Fidelity parameters for the SVM model, check get_fidelity_space(). Uses default (max) value if None. - shuffle : bool - If ``True``, shuffle the training idx. If no parameter ``rng`` is given, use the class random state. - Defaults to ``False``. - rng : np.random.RandomState, int, None, - Random seed for benchmark. By default the class level random seed. - - To prevent overfitting on a single seed, it is possible to pass a - parameter ``rng`` as 'int' or 'np.random.RandomState' to this function. - If this parameter is not given, the default random state is used. - kwargs - - Returns - ------- - Dict - - function_value : validation loss - cost : time to train and evaluate the model - info : Dict - train_loss : training loss - fidelity : used fidelities in this evaluation - """ - start_time = time.time() - - self.rng = rng_helper.get_rng(rng=rng, self_rng=self.rng) - - if shuffle: - self.shuffle_data(self.rng) - - # Split of dataset subset - if self.lower_bound_train_size > fidelity['dataset_fraction']: - train_size = self.lower_bound_train_size - logger.warning(f'The given data set fraction is lower than the lower bound (10 * number of classes.) 
' - f'Increase the fidelity from {fidelity["dataset_fraction"]:.8f} to ' - f'{self.lower_bound_train_size:.8f}') - else: - train_size = fidelity['dataset_fraction'] - - train_size = int(train_size * len(self.train_idx)) - train_idx = self.train_idx[:train_size] - - # Transform hyperparameters to linear scale - hp_c = np.exp(float(configuration['C'])) - hp_gamma = np.exp(float(configuration['gamma'])) - - # Train support vector machine - model = self.get_pipeline(hp_c, hp_gamma) - model.fit(self.x_train[train_idx], self.y_train[train_idx]) - - # Compute validation error - train_loss = 1 - self.accuracy_scorer(model, self.x_train[train_idx], self.y_train[train_idx]) - val_loss = 1 - self.accuracy_scorer(model, self.x_valid, self.y_valid) - - cost = time.time() - start_time - - return {'function_value': float(val_loss), - "cost": cost, - 'info': {'train_loss': float(train_loss), - 'fidelity': fidelity}} - - # pylint: disable=arguments-differ - @AbstractBenchmark.check_parameters - def objective_function_test(self, configuration: Union[CS.Configuration, Dict], - fidelity: Union[CS.Configuration, Dict, None] = None, - shuffle: bool = False, - rng: Union[np.random.RandomState, int, None] = None, **kwargs) -> Dict: - """ - Trains a SVM model with a given configuration on both the X_train - and validation data set and evaluates the model on the X_test data set. - - Parameters - ---------- - configuration : Dict, CS.Configuration - Configuration for the SVM Model - fidelity: Dict, None - Fidelity parameters, check get_fidelity_space(). Uses default (max) value if None. - shuffle : bool - If ``True``, shuffle the training idx. If no parameter ``rng`` is given, use the class random state. - Defaults to ``False``. - rng : np.random.RandomState, int, None, - Random seed for benchmark. By default the class level random seed. - To prevent overfitting on a single seed, it is possible to pass a - parameter ``rng`` as 'int' or 'np.random.RandomState' to this function. - If this parameter is not given, the default random state is used. 
- kwargs - - Returns - ------- - Dict - - function_value : X_test loss - cost : time to X_train and evaluate the model - info : Dict - train_valid_loss: Loss on the train+valid data set - fidelity : used fidelities in this evaluation - """ - assert np.isclose(fidelity['dataset_fraction'], 1), \ - f'Data set fraction must be 1 but was {fidelity["dataset_fraction"]}' - - self.rng = rng_helper.get_rng(rng=rng, self_rng=self.rng) - - if shuffle: - self.shuffle_data(self.rng) - - start_time = time.time() - - # Concatenate training and validation dataset - if isinstance(self.x_train, sparse.csr.csr_matrix) or isinstance(self.x_valid, sparse.csr.csr_matrix): - data = sparse.vstack((self.x_train, self.x_valid)) - else: - data = np.concatenate((self.x_train, self.x_valid)) - targets = np.concatenate((self.y_train, self.y_valid)) - - # Transform hyperparameters to linear scale - hp_c = np.exp(float(configuration['C'])) - hp_gamma = np.exp(float(configuration['gamma'])) - - model = self.get_pipeline(hp_c, hp_gamma) - model.fit(data, targets) - - # Compute validation error - train_valid_loss = 1 - self.accuracy_scorer(model, data, targets) - - # Compute test error - test_loss = 1 - self.accuracy_scorer(model, self.x_test, self.y_test) - - cost = time.time() - start_time - - return {'function_value': float(test_loss), - "cost": cost, - 'info': {'train_valid_loss': float(train_valid_loss), - 'fidelity': fidelity}} - - def get_pipeline(self, C: float, gamma: float) -> pipeline.Pipeline: - """ Create the scikit-learn (training-)pipeline """ - - model = pipeline.Pipeline([ - ('preprocess_impute', - ColumnTransformer([ - ("categorical", "passthrough", self.categorical_data), - ("continuous", SimpleImputer(strategy="mean"), ~self.categorical_data)])), - ('preprocess_one_hot', - ColumnTransformer([ - ("categorical", OneHotEncoder(categories=self.categories, sparse=False), self.categorical_data), - ("continuous", MinMaxScaler(feature_range=(0, 1)), ~self.categorical_data)])), - ('svm', - svm.SVC(gamma=gamma, C=C, random_state=self.rng, cache_size=self.cache_size)) - ]) - return model - - @staticmethod - def get_configuration_space(seed: Union[int, None] = None) -> CS.ConfigurationSpace: - """ - Creates a ConfigSpace.ConfigurationSpace containing all parameters for - the SVM Model - - For a detailed explanation of the hyperparameters: - https://scikit-learn.org/stable/modules/generated/sklearn.svm.SVC.html - - Parameters - ---------- - seed : int, None - Fixing the seed for the ConfigSpace.ConfigurationSpace - - Returns - ------- - ConfigSpace.ConfigurationSpace - """ - - seed = seed if seed is not None else np.random.randint(1, 100000) - cs = CS.ConfigurationSpace(seed=seed) - - cs.add_hyperparameters([ - CS.UniformFloatHyperparameter('C', lower=-10., upper=10., default_value=0., log=False), - CS.UniformFloatHyperparameter('gamma', lower=-10., upper=10., default_value=1., log=False), - ]) - # cs.generate_all_continuous_from_bounds(SupportVectorMachine.get_meta_information()['bounds']) - return cs - - @staticmethod - def get_fidelity_space(seed: Union[int, None] = None) -> CS.ConfigurationSpace: - """ - Creates a ConfigSpace.ConfigurationSpace containing all fidelity parameters for - the SupportVector Benchmark - - Fidelities - ---------- - dataset_fraction: float - [0.1, 1] - fraction of training data set to use - - Parameters - ---------- - seed : int, None - Fixing the seed for the ConfigSpace.ConfigurationSpace - - Returns - ------- - ConfigSpace.ConfigurationSpace - """ - seed = seed if seed is not None 
else np.random.randint(1, 100000) - fidel_space = CS.ConfigurationSpace(seed=seed) - - fidel_space.add_hyperparameters([ - CS.UniformFloatHyperparameter("dataset_fraction", lower=0.0, upper=1.0, default_value=1.0, log=False), - ]) - return fidel_space - - def get_meta_information(self): - """ Returns the meta information for the benchmark """ - return {'name': 'Support Vector Machine', - 'references': ["@InProceedings{pmlr-v54-klein17a", - "author = {Aaron Klein and Stefan Falkner and Simon Bartels and Philipp Hennig and " - "Frank Hutter}, " - "title = {{Fast Bayesian Optimization of Machine Learning Hyperparameters on " - "Large Datasets}}" - "pages = {528--536}, year = {2017}," - "editor = {Aarti Singh and Jerry Zhu}," - "volume = {54}," - "series = {Proceedings of Machine Learning Research}," - "address = {Fort Lauderdale, FL, USA}," - "month = {20--22 Apr}," - "publisher = {PMLR}," - "pdf = {http://proceedings.mlr.press/v54/klein17a/klein17a.pdf}, " - "url = {http://proceedings.mlr.press/v54/klein17a.html}, " - ], - 'code': 'https://github.com/automl/HPOlib1.5/blob/container/hpolib/benchmarks/ml/svm_benchmark.py', - 'shape of train data': self.x_train.shape, - 'shape of test data': self.x_test.shape, - 'shape of valid data': self.x_valid.shape, - 'initial random seed': self.rng, - 'task_id': self.task_id - } diff --git a/hpobench/benchmarks/ml/tabular_benchmark.py b/hpobench/benchmarks/ml/tabular_benchmark.py index c5525bf5..3012f3dd 100644 --- a/hpobench/benchmarks/ml/tabular_benchmark.py +++ b/hpobench/benchmarks/ml/tabular_benchmark.py @@ -4,6 +4,8 @@ 0.0.1: * First implementation of the Tabular Benchmark. +0.0.2: +* Restructuring for consistency and to match ML Benchmark Template updates. """ from pathlib import Path @@ -17,7 +19,7 @@ from hpobench.dependencies.ml.ml_benchmark_template import metrics from hpobench.util.data_manager import TabularDataManager -__version__ = '0.0.1' +__version__ = '0.0.2' class TabularBenchmark(AbstractBenchmark): diff --git a/hpobench/benchmarks/ml/xgboost_benchmark_old.py b/hpobench/benchmarks/ml/xgboost_benchmark_old.py deleted file mode 100644 index f8730f52..00000000 --- a/hpobench/benchmarks/ml/xgboost_benchmark_old.py +++ /dev/null @@ -1,430 +0,0 @@ -""" - -Changelog: -========== -0.0.3 -* New container release due to a general change in the communication between container and HPOBench. - Works with HPOBench >= v0.0.8 - -0.0.2: -* Change the search space definiton to match the paper: (https://arxiv.org/pdf/1802.09596.pdf) - eta: [1e-5, 1] (def: 0.3) -> [2**-10, 1] (def: 0.3) - min_child_weight: [0,05, 10] (def: 1) -> [1, 2**7] (def: 1) - colsample_bytree: [0,05, 1] (def: 1) -> [0.01, 1] (def: 1) - colsample_bylevel: [0,05, 1] (def: 1) -> [0.01, 1] (def: 1) - reg_lambda: [1e-5, 2] (def: 1) -> [2**-10, 2**10] (def: 1) - reg_alpha: [1e-5, 2] (def: 1e-5) -> [2**-10, 2**10] (def: 1) - max_depth: - -> [1, 15] (def: 6) - subsample_per_it: - -> [0.01, 1] (def: 1) - [booster: - -> [gbtree, gblinear, dart] (def: gbtree)] *) - - *) This parameter is only in the XGBoostExtendedBenchmark. Not in the XGBoostBenchmark class. - -* Increase the fidelity `n_estimators` - n_estimators [2, 128] (def: 128) -> [1, 256] (def: 256) - -* Add class to optimize also the used booster method: (gbtree, gblinear or dart) - We have introduced a new class, which adds the used booster as parameter to the configuration space. To read more - about booster, please take a look in the official XGBoost-documentation (https://xgboost.readthedocs.io/en/latest). 
- - -0.0.1: -* First implementation of a XGBoost Benchmark. - - -""" - -import logging -import time -from typing import Union, Tuple, Dict, List - -import ConfigSpace as CS -import numpy as np -import xgboost as xgb -from sklearn import pipeline -from sklearn.compose import ColumnTransformer -from sklearn.impute import SimpleImputer -from sklearn.metrics import accuracy_score, make_scorer -from sklearn.preprocessing import OneHotEncoder - -import hpobench.util.rng_helper as rng_helper -from hpobench.abstract_benchmark import AbstractBenchmark -from hpobench.util.openml_data_manager import OpenMLHoldoutDataManager - -__version__ = '0.0.3' - -logger = logging.getLogger('XGBBenchmark') - - -class XGBoostBenchmark(AbstractBenchmark): - - def __init__(self, task_id: Union[int, None] = None, n_threads: int = 1, - rng: Union[np.random.RandomState, int, None] = None): - """ - - Parameters - ---------- - task_id : int, None - n_threads : int, None - rng : np.random.RandomState, int, None - """ - - super(XGBoostBenchmark, self).__init__(rng=rng) - self.n_threads = n_threads - self.task_id = task_id - self.accuracy_scorer = make_scorer(accuracy_score) - - self.x_train, self.y_train, self.x_valid, self.y_valid, self.x_test, self.y_test, variable_types = \ - self.get_data() - self.categorical_data = np.array([var_type == 'categorical' for var_type in variable_types]) - - # XGB needs sorted data. Data should be (Categorical + numerical) not mixed. - categorical_idx = np.argwhere(self.categorical_data) - continuous_idx = np.argwhere(~self.categorical_data) - sorting = np.concatenate([categorical_idx, continuous_idx]).squeeze() - self.categorical_data = self.categorical_data[sorting] - self.x_train = self.x_train[:, sorting] - self.x_valid = self.x_valid[:, sorting] - self.x_test = self.x_test[:, sorting] - - nan_columns = np.all(np.isnan(self.x_train), axis=0) - self.categorical_data = self.categorical_data[~nan_columns] - - self.x_train, self.x_valid, self.x_test, self.categories = \ - OpenMLHoldoutDataManager.replace_nans_in_cat_columns(self.x_train, self.x_valid, self.x_test, - is_categorical=self.categorical_data) - - # Determine the number of categories in the labels. - # In case of binary classification ``self.num_class`` has to be 1 for xgboost. - self.num_class = len(np.unique(np.concatenate([self.y_train, self.y_test, self.y_valid]))) - self.num_class = 1 if self.num_class == 2 else self.num_class - - self.train_idx = self.rng.choice(a=np.arange(len(self.x_train)), - size=len(self.x_train), - replace=False) - - # Similar to [Fast Bayesian Optimization of Machine Learning Hyperparameters on Large Datasets] - # (https://arxiv.org/pdf/1605.07079.pdf), - # use 10 time the number of classes as lower bound for the dataset fraction - n_classes = np.unique(self.y_train).shape[0] - self.lower_bound_train_size = (10 * n_classes) / self.x_train.shape[0] - - def get_data(self) -> Tuple[np.ndarray, np.ndarray, np.ndarray, np.ndarray, np.ndarray, np.ndarray, List]: - """ Loads the data given a task or another source. """ - - assert self.task_id is not None, NotImplementedError('No task-id given. Please either specify a task-id or ' - 'overwrite the get_data method.') - - data_manager = OpenMLHoldoutDataManager(openml_task_id=self.task_id, rng=self.rng) - x_train, y_train, x_val, y_val, x_test, y_test = data_manager.load() - - return x_train, y_train, x_val, y_val, x_test, y_test, data_manager.variable_types - - def shuffle_data(self, rng=None): - """ Reshuffle the training data. 
If 'rng' is None, the training idx are shuffled according to the - class-random-state""" - random_state = rng_helper.get_rng(rng, self.rng) - random_state.shuffle(self.train_idx) - - # pylint: disable=arguments-differ - @AbstractBenchmark.check_parameters - def objective_function(self, configuration: Union[CS.Configuration, Dict], - fidelity: Union[CS.Configuration, Dict, None] = None, - shuffle: bool = False, - rng: Union[np.random.RandomState, int, None] = None, **kwargs) -> Dict: - """ - Trains a XGBoost model given a hyperparameter configuration and - evaluates the model on the validation set. - - Parameters - ---------- - configuration : Dict, CS.Configuration - Configuration for the XGBoost model - fidelity: Dict, None - Fidelity parameters for the XGBoost model, check get_fidelity_space(). Uses default (max) value if None. - shuffle : bool - If ``True``, shuffle the training idx. If no parameter ``rng`` is given, use the class random state. - Defaults to ``False``. - rng : np.random.RandomState, int, None, - Random seed for benchmark. By default the class level random seed. - - To prevent overfitting on a single seed, it is possible to pass a - parameter ``rng`` as 'int' or 'np.random.RandomState' to this function. - If this parameter is not given, the default random state is used. - kwargs - - Returns - ------- - Dict - - function_value : validation loss - cost : time to train and evaluate the model - info : Dict - train_loss : trainings loss - fidelity : used fidelities in this evaluation - """ - self.rng = rng_helper.get_rng(rng=rng, self_rng=self.rng) - - if shuffle: - self.shuffle_data(self.rng) - - start = time.time() - - if self.lower_bound_train_size > fidelity['dataset_fraction']: - train_data_fraction = self.lower_bound_train_size - logger.warning(f'The given data set fraction is lower than the lower bound (10 * number of classes.) ' - f'Increase the fidelity from {fidelity["dataset_fraction"]:.8f} to ' - f'{self.lower_bound_train_size:.8f}') - else: - train_data_fraction = fidelity['dataset_fraction'] - - train_idx = self.train_idx[:int(len(self.train_idx) * train_data_fraction)] - - model = self._get_pipeline(n_estimators=fidelity["n_estimators"], **configuration) - model.fit(X=self.x_train[train_idx], y=self.y_train[train_idx]) - - train_loss = 1 - self.accuracy_scorer(model, self.x_train[train_idx], self.y_train[train_idx]) - val_loss = 1 - self.accuracy_scorer(model, self.x_valid, self.y_valid) - cost = time.time() - start - - return {'function_value': float(val_loss), - 'cost': cost, - 'info': {'train_loss': float(train_loss), - 'fidelity': fidelity} - } - - # pylint: disable=arguments-differ - @AbstractBenchmark.check_parameters - def objective_function_test(self, configuration: Union[CS.Configuration, Dict], - fidelity: Union[CS.Configuration, Dict, None] = None, - shuffle: bool = False, - rng: Union[np.random.RandomState, int, None] = None, **kwargs) -> Dict: - """ - Trains a XGBoost model with a given configuration on both the train - and validation data set and evaluates the model on the test data set. - - Parameters - ---------- - configuration : Dict, CS.Configuration - Configuration for the XGBoost Model - fidelity: Dict, None - Fidelity parameters, check get_fidelity_space(). Uses default (max) value if None. - shuffle : bool - If ``True``, shuffle the training idx. If no parameter ``rng`` is given, use the class random state. - Defaults to ``False``. - rng : np.random.RandomState, int, None, - Random seed for benchmark. 
By default the class level random seed. - To prevent overfitting on a single seed, it is possible to pass a - parameter ``rng`` as 'int' or 'np.random.RandomState' to this function. - If this parameter is not given, the default random state is used. - kwargs - - Returns - ------- - Dict - - function_value : test loss - cost : time to train and evaluate the model - info : Dict - fidelity : used fidelities in this evaluation - """ - default_dataset_fraction = self.get_fidelity_space().get_hyperparameter('dataset_fraction').default_value - if fidelity['dataset_fraction'] != default_dataset_fraction: - raise NotImplementedError(f'Test error can not be computed for dataset_fraction <= ' - f'{default_dataset_fraction}') - - self.rng = rng_helper.get_rng(rng=rng, self_rng=self.rng) - - if shuffle: - self.shuffle_data(self.rng) - - start = time.time() - - # Impute potential nan values with the feature- - data = np.concatenate((self.x_train, self.x_valid)) - targets = np.concatenate((self.y_train, self.y_valid)) - - model = self._get_pipeline(n_estimators=fidelity['n_estimators'], **configuration) - model.fit(X=data, y=targets) - - test_loss = 1 - self.accuracy_scorer(model, self.x_test, self.y_test) - cost = time.time() - start - - return {'function_value': float(test_loss), - 'cost': cost, - 'info': {'fidelity': fidelity}} - - @staticmethod - def get_configuration_space(seed: Union[int, None] = None) -> CS.ConfigurationSpace: - """ - Creates a ConfigSpace.ConfigurationSpace containing all parameters for - the XGBoost Model - - Parameters - ---------- - seed : int, None - Fixing the seed for the ConfigSpace.ConfigurationSpace - - Returns - ------- - ConfigSpace.ConfigurationSpace - """ - seed = seed if seed is not None else np.random.randint(1, 100000) - cs = CS.ConfigurationSpace(seed=seed) - - cs.add_hyperparameters([ - CS.UniformFloatHyperparameter('eta', lower=2**-10, upper=1., default_value=0.3, log=True), - CS.UniformIntegerHyperparameter('max_depth', lower=1, upper=15, default_value=6, log=False), - CS.UniformFloatHyperparameter('min_child_weight', lower=1., upper=2**7., default_value=1., log=True), - CS.UniformFloatHyperparameter('colsample_bytree', lower=0.01, upper=1., default_value=1.), - CS.UniformFloatHyperparameter('colsample_bylevel', lower=0.01, upper=1., default_value=1.), - CS.UniformFloatHyperparameter('reg_lambda', lower=2**-10, upper=2**10, default_value=1, log=True), - CS.UniformFloatHyperparameter('reg_alpha', lower=2**-10, upper=2**10, default_value=1, log=True), - CS.UniformFloatHyperparameter('subsample_per_it', lower=0.1, upper=1, default_value=1, log=False) - ]) - - return cs - - @staticmethod - def get_fidelity_space(seed: Union[int, None] = None) -> CS.ConfigurationSpace: - """ - Creates a ConfigSpace.ConfigurationSpace containing all fidelity parameters for - the XGBoost Benchmark - - Parameters - ---------- - seed : int, None - Fixing the seed for the ConfigSpace.ConfigurationSpace - - Returns - ------- - ConfigSpace.ConfigurationSpace - """ - seed = seed if seed is not None else np.random.randint(1, 100000) - fidel_space = CS.ConfigurationSpace(seed=seed) - - fidel_space.add_hyperparameters([ - CS.UniformFloatHyperparameter("dataset_fraction", lower=0.0, upper=1.0, default_value=1.0, log=False), - CS.UniformIntegerHyperparameter("n_estimators", lower=1, upper=256, default_value=256, log=False) - ]) - - return fidel_space - - def get_meta_information(self) -> Dict: - """ Returns the meta information for the benchmark """ - return {'name': 'XGBoost', - 'references': 
['@article{probst2019tunability,' - 'title={Tunability: Importance of hyperparameters of machine learning algorithms.},' - 'author={Probst, Philipp and Boulesteix, Anne-Laure and Bischl, Bernd},' - 'journal={J. Mach. Learn. Res.},' - 'volume={20},' - 'number={53},' - 'pages={1--32},' - 'year={2019}' - '}'], - 'code': 'https://github.com/automl/HPOlib1.5/blob/development/hpolib/benchmarks/ml/' - 'xgboost_benchmark_old.py', - 'shape of train data': self.x_train.shape, - 'shape of test data': self.x_test.shape, - 'shape of valid data': self.x_valid.shape, - 'initial random seed': self.rng, - 'task_id': self.task_id - } - - def _get_pipeline(self, max_depth: int, eta: float, min_child_weight: int, - colsample_bytree: float, colsample_bylevel: float, reg_lambda: int, reg_alpha: int, - n_estimators: int, subsample_per_it: float) \ - -> pipeline.Pipeline: - """ Create the scikit-learn (training-)pipeline """ - objective = 'binary:logistic' if self.num_class <= 2 else 'multi:softmax' - - clf = pipeline.Pipeline([ - ('preprocess_impute', - ColumnTransformer([ - ("categorical", "passthrough", self.categorical_data), - ("continuous", SimpleImputer(strategy="mean"), ~self.categorical_data)])), - ('preprocess_one_hot', - ColumnTransformer([ - ("categorical", OneHotEncoder(categories=self.categories, sparse=False), self.categorical_data), - ("continuous", "passthrough", ~self.categorical_data)])), - ('xgb', - xgb.XGBClassifier( - max_depth=max_depth, - learning_rate=eta, - min_child_weight=min_child_weight, - colsample_bytree=colsample_bytree, - colsample_bylevel=colsample_bylevel, - reg_alpha=reg_alpha, - reg_lambda=reg_lambda, - n_estimators=n_estimators, - objective=objective, - n_jobs=self.n_threads, - random_state=self.rng.randint(1, 100000), - num_class=self.num_class, - subsample=subsample_per_it)) - ]) - return clf - - -class XGBoostExtendedBenchmark(XGBoostBenchmark): - """ - Similar to XGBoostBenchmark but enables also the optimization of the used booster. - """ - - def __init__(self, task_id: Union[int, None] = None, n_threads: int = 1, - rng: Union[np.random.RandomState, int, None] = None): - super(XGBoostExtendedBenchmark, self).__init__(task_id=task_id, n_threads=n_threads, rng=rng) - - @staticmethod - def get_configuration_space(seed: Union[int, None] = None) -> CS.ConfigurationSpace: - cs = XGBoostBenchmark.get_configuration_space(seed) - hp_booster = CS.CategoricalHyperparameter('booster', choices=['gbtree', 'gblinear', 'dart'], - default_value='gbtree') - cs.add_hyperparameter(hp_booster) - - # XGBoost with 'gblinear' can not use some - # parameters. Exclude them from the configuration space by introducing a condition. - hps = ['colsample_bylevel', 'colsample_bytree', 'max_depth', 'min_child_weight', 'subsample_per_it'] - - # The NotEqualsCondition means: "Make parameter X active if hp_booster is not equal to gblinear." 
- conditions = [CS.NotEqualsCondition(cs.get_hyperparameter(hp), hp_booster, 'gblinear') for hp in hps] - cs.add_conditions(conditions) - return cs - - # noinspection PyMethodOverriding - # pylint: disable=arguments-differ - def _get_pipeline(self, n_estimators: int, booster: str, reg_lambda: int, reg_alpha: int, eta: float, - min_child_weight: int = None, max_depth: int = None, colsample_bytree: float = None, - colsample_bylevel: float = None, subsample_per_it: float = None) \ - -> pipeline.Pipeline: - """ Create the scikit-learn (training-)pipeline """ - objective = 'binary:logistic' if self.num_class <= 2 else 'multi:softmax' - - configuration = dict(booster=booster, - max_depth=max_depth, - learning_rate=eta, - min_child_weight=min_child_weight, - colsample_bytree=colsample_bytree, - colsample_bylevel=colsample_bylevel, - reg_alpha=reg_alpha, - reg_lambda=reg_lambda, - n_estimators=n_estimators, - objective=objective, - n_jobs=self.n_threads, - random_state=self.rng.randint(1, 100000), - num_class=self.num_class, - subsample=subsample_per_it) - - configuration = {k: v for k, v in configuration.items() if v is not None} - - clf = pipeline.Pipeline([ - ('preprocess_impute', - ColumnTransformer([ - ("categorical", "passthrough", self.categorical_data), - ("continuous", SimpleImputer(strategy="mean"), ~self.categorical_data)])), - ('preprocess_one_hot', - ColumnTransformer([ - ("categorical", OneHotEncoder(categories=self.categories, sparse=False), self.categorical_data), - ("continuous", "passthrough", ~self.categorical_data)])), - ('xgb', - xgb.XGBClassifier(**configuration)) - ]) - return clf From c5889758a7700ca84dae3185008c2c0a7d914779 Mon Sep 17 00:00:00 2001 From: Katharina Eggensperger Date: Fri, 12 Nov 2021 10:35:47 +0100 Subject: [PATCH 102/147] Update README.md --- README.md | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/README.md b/README.md index 5ef43638..b74b1a00 100644 --- a/README.md +++ b/README.md @@ -2,9 +2,7 @@ HPOBench is a library for providing benchmarks for (multi-fidelity) hyperparameter optimization and with a focus on reproducibility. -Further info: - * list of [benchmarks](https://github.com/automl/HPOBench/wiki/Available-Containerized-Benchmarks) - * [howto](https://github.com/automl/HPOBench/wiki/How-to-add-a-new-benchmark-step-by-step) contribute benchmarks +For further info on [existing benchmarks](https://github.com/automl/HPOBench/wiki/Available-Containerized-Benchmarks) and [howto](https://github.com/automl/HPOBench/wiki/How-to-add-a-new-benchmark-step-by-step) contribute new benchmarks, see the [wiki](https://github.com/automl/HPOBench/wiki). 
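As an aside to the booster condition removed above: the same conditional-search-space pattern can be reproduced with a few lines of plain ConfigSpace code. The sketch below is illustrative only (the seed is arbitrary and only two of the child hyperparameters are shown); the bounds and the 'gblinear' condition follow the deleted XGBoostExtendedBenchmark.

    import ConfigSpace as CS

    cs = CS.ConfigurationSpace(seed=1)
    booster = CS.CategoricalHyperparameter(
        'booster', choices=['gbtree', 'gblinear', 'dart'], default_value='gbtree')
    max_depth = CS.UniformIntegerHyperparameter(
        'max_depth', lower=1, upper=15, default_value=6, log=False)
    colsample_bytree = CS.UniformFloatHyperparameter(
        'colsample_bytree', lower=0.01, upper=1., default_value=1.)
    cs.add_hyperparameters([booster, max_depth, colsample_bytree])

    # make the tree-specific parameters active only when booster != 'gblinear'
    cs.add_conditions([
        CS.NotEqualsCondition(max_depth, booster, 'gblinear'),
        CS.NotEqualsCondition(colsample_bytree, booster, 'gblinear'),
    ])

    for config in cs.sample_configuration(5):
        print(config.get_dictionary())

Sampled configurations in which booster == 'gblinear' simply omit the conditional hyperparameters, which is why the deleted _get_pipeline filtered out None-valued entries before building the classifier.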
## Status From 462549cc6b561ef93d9844e4eabf50bf6ba0c194 Mon Sep 17 00:00:00 2001 From: neeratyoy Date: Mon, 15 Nov 2021 14:11:58 +0530 Subject: [PATCH 103/147] Adding model size info LR SVM NN --- hpobench/benchmarks/ml/lr_benchmark.py | 19 +++++++++++++++++++ hpobench/benchmarks/ml/nn_benchmark.py | 19 +++++++++++++++++++ hpobench/benchmarks/ml/svm_benchmark.py | 15 +++++++++++++++ .../dependencies/ml/ml_benchmark_template.py | 9 +++++++++ 4 files changed, 62 insertions(+) diff --git a/hpobench/benchmarks/ml/lr_benchmark.py b/hpobench/benchmarks/ml/lr_benchmark.py index 588e3d6c..abaaaf43 100644 --- a/hpobench/benchmarks/ml/lr_benchmark.py +++ b/hpobench/benchmarks/ml/lr_benchmark.py @@ -105,6 +105,25 @@ def init_model( ) return model + def get_model_size(self, model: SGDClassifier) -> float: + """ Returns the dimensionality as a proxy for the number of model parameters + + Logistic Regression models have a fixed number of parameters given a dataset. Model size is + being approximated as the number of beta parameters required as the model support plus the + intercept. This depends on the dataset and not on the trained model. + + Parameters + ---------- + model : SGDClassifier + Trained LR model. This parameter is required to maintain function signature. + + Returns + ------- + float + """ + ndims = self.train_X.shape[1] + return ndims + 1 + class LRBenchmarkBB(LRBenchmark): """ Black-box version of the LRBenchmark diff --git a/hpobench/benchmarks/ml/nn_benchmark.py b/hpobench/benchmarks/ml/nn_benchmark.py index 9692509a..05e8e2c9 100644 --- a/hpobench/benchmarks/ml/nn_benchmark.py +++ b/hpobench/benchmarks/ml/nn_benchmark.py @@ -118,6 +118,25 @@ def init_model( ) return model + def get_model_size(self, model: MLPClassifier) -> float: + """ Returns the number of trained parameters in the MLP model + + Parameters + ---------- + model : MLPClassifier + Trained MLP model. + + Returns + ------- + float + """ + nparams = 1 + for layer in model.coefs_: + nparams += layer.shape[0] * layer.shape[1] + for layer in model.intercepts_: + nparams += layer.shape[0] + return nparams + class NNBenchmarkBB(NNBenchmark): """ Black-box version of the NNBenchmark diff --git a/hpobench/benchmarks/ml/svm_benchmark.py b/hpobench/benchmarks/ml/svm_benchmark.py index bc791c9b..96f3b6c9 100644 --- a/hpobench/benchmarks/ml/svm_benchmark.py +++ b/hpobench/benchmarks/ml/svm_benchmark.py @@ -89,6 +89,21 @@ def init_model( ) return model + def get_model_size(self, model: SVC) -> float: + """ Returns the number of support vectors in the SVM model + + Parameters + ---------- + model : SVC + Trained SVM model. 
+ + Returns + ------- + float + """ + nsupport = model.support_.shape[0] + return nsupport + class SVMBenchmarkBB(SVMBenchmark): """ Black-box version of the SVMBenchmark diff --git a/hpobench/dependencies/ml/ml_benchmark_template.py b/hpobench/dependencies/ml/ml_benchmark_template.py index d6758f83..ce4c3c03 100644 --- a/hpobench/dependencies/ml/ml_benchmark_template.py +++ b/hpobench/dependencies/ml/ml_benchmark_template.py @@ -120,6 +120,11 @@ def get_meta_information(self): 'task_id': self.task_id } + def get_model_size(self, model): + """ Returns a custom model size specific to the ML model, if applicable + """ + raise NotImplementedError + def init_model( self, config: Union[CS.Configuration, Dict], @@ -256,6 +261,7 @@ def objective_function( model, model_fit_time, train_loss, train_scores, train_score_cost = self._train_objective( configuration, fidelity, shuffle, rng, evaluation="valid", record_stats=record_train ) + model_size = self.get_model_size(model) # model inference on validation set start = time.time() @@ -294,6 +300,7 @@ def objective_function( 'val_loss': val_loss, 'test_loss': test_loss, 'model_cost': model_fit_time, + 'model_size': model_size, 'train_scores': train_scores, 'train_costs': train_score_cost, 'val_scores': val_scores, @@ -345,6 +352,7 @@ def objective_function_test( model, model_fit_time, train_loss, train_scores, train_score_cost = self._train_objective( configuration, fidelity, shuffle, rng, evaluation="test", record_stats=record_train ) + model_size = self.get_model_size(model) # model inference on test set start = time.time() @@ -365,6 +373,7 @@ def objective_function_test( 'val_loss': None, 'test_loss': test_loss, 'model_cost': model_fit_time, + 'model_size': model_size, 'train_scores': train_scores, 'train_costs': train_score_cost, 'val_scores': None, From e6e77a3cf17787a843704dec6aaa5331ce43ac35 Mon Sep 17 00:00:00 2001 From: neeratyoy Date: Mon, 15 Nov 2021 15:00:15 +0530 Subject: [PATCH 104/147] Adding model size metric for RF, XGB --- hpobench/benchmarks/ml/nn_benchmark.py | 2 +- hpobench/benchmarks/ml/rf_benchmark.py | 18 ++++++++++++++++++ hpobench/benchmarks/ml/xgboost_benchmark.py | 16 +++++++++++++++- 3 files changed, 34 insertions(+), 2 deletions(-) diff --git a/hpobench/benchmarks/ml/nn_benchmark.py b/hpobench/benchmarks/ml/nn_benchmark.py index 05e8e2c9..f1149c7b 100644 --- a/hpobench/benchmarks/ml/nn_benchmark.py +++ b/hpobench/benchmarks/ml/nn_benchmark.py @@ -119,7 +119,7 @@ def init_model( return model def get_model_size(self, model: MLPClassifier) -> float: - """ Returns the number of trained parameters in the MLP model + """ Returns the total number of trained parameters in the MLP model Parameters ---------- diff --git a/hpobench/benchmarks/ml/rf_benchmark.py b/hpobench/benchmarks/ml/rf_benchmark.py index 284ff8dd..07a18b7c 100644 --- a/hpobench/benchmarks/ml/rf_benchmark.py +++ b/hpobench/benchmarks/ml/rf_benchmark.py @@ -110,6 +110,24 @@ def init_model( ) return model + def get_model_size(self, model: RandomForestClassifier) -> float: + """ Returns the total number of decision nodes in the entire Random Forest model + + Parameters + ---------- + model : RandomForestClassifier + Trained RF model. 
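The model-size proxies added in these patches can be reproduced outside HPOBench with plain scikit-learn. The sketch below uses toy data and arbitrary layer sizes; it computes the MLP weight/bias count and the SVC support-vector count the same way the new get_model_size methods do, and these are the values that now surface under 'model_size' in the info dict returned by objective_function.

    from sklearn.datasets import make_classification
    from sklearn.neural_network import MLPClassifier
    from sklearn.svm import SVC

    X, y = make_classification(n_samples=300, n_features=10, random_state=0)

    # MLP proxy: total number of trained weights and biases
    mlp = MLPClassifier(hidden_layer_sizes=(16, 16), max_iter=300, random_state=0).fit(X, y)
    mlp_size = sum(w.shape[0] * w.shape[1] for w in mlp.coefs_) + \
        sum(b.shape[0] for b in mlp.intercepts_)

    # SVM proxy: number of support vectors of the fitted SVC
    svm = SVC(random_state=0).fit(X, y)
    svm_size = svm.support_.shape[0]

    print(mlp_size, svm_size)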
+ + Returns + ------- + float + """ + nodes = 0 + for tree in model.estimators_: + # total number of nodes in the tree (internal + leaf) + nodes += tree.tree_.node_count + return nodes + class RandomForestBenchmarkBB(RandomForestBenchmark): """ Black-box version of the RandomForestBenchmark diff --git a/hpobench/benchmarks/ml/xgboost_benchmark.py b/hpobench/benchmarks/ml/xgboost_benchmark.py index 629a7fb6..57b57a84 100644 --- a/hpobench/benchmarks/ml/xgboost_benchmark.py +++ b/hpobench/benchmarks/ml/xgboost_benchmark.py @@ -91,7 +91,6 @@ def init_model( rng: Union[int, np.random.RandomState, None] = None ): # initializing model - # rng = rng if (rng is None or isinstance(rng, int)) else self.seed rng = rng if isinstance(rng, int) else self.seed if isinstance(config, CS.Configuration): @@ -115,6 +114,21 @@ def init_model( ) return model + def get_model_size(self, model: xgb.XGBClassifier) -> float: + """ Returns the total number of decision nodes in the sequence of Gradient Boosted trees + + Parameters + ---------- + model : xgb.XGBClassifier + Trained XGB model. + + Returns + ------- + float + """ + nodes = model.get_booster().trees_to_dataframe().shape[0] + return nodes + class XGBoostBenchmarkBB(XGBoostBenchmark): """ Black-box version of the XGBoostBenchmark From b79bc2d83b850ecf29613b040c2518dfcd400098 Mon Sep 17 00:00:00 2001 From: neeratyoy Date: Mon, 15 Nov 2021 15:03:39 +0530 Subject: [PATCH 105/147] Enforcing minor PEP constraints --- hpobench/benchmarks/ml/lr_benchmark.py | 4 +++- hpobench/benchmarks/ml/nn_benchmark.py | 4 +++- hpobench/benchmarks/ml/rf_benchmark.py | 16 ++++++++++++---- hpobench/benchmarks/ml/tabular_benchmark.py | 4 ++-- hpobench/benchmarks/ml/xgboost_benchmark.py | 16 ++++++++++++---- 5 files changed, 32 insertions(+), 12 deletions(-) diff --git a/hpobench/benchmarks/ml/lr_benchmark.py b/hpobench/benchmarks/ml/lr_benchmark.py index abaaaf43..d456835b 100644 --- a/hpobench/benchmarks/ml/lr_benchmark.py +++ b/hpobench/benchmarks/ml/lr_benchmark.py @@ -58,7 +58,9 @@ def get_fidelity_space(self, seed: Union[int, None] = None) -> CS.ConfigurationS return fidelity_space @staticmethod - def _get_fidelity_choices(iter_choice: str, subsample_choice: str) -> Tuple[Hyperparameter, Hyperparameter]: + def _get_fidelity_choices( + iter_choice: str, subsample_choice: str + ) -> Tuple[Hyperparameter, Hyperparameter]: """Fidelity space available --- specifies the fidelity dimensions """ assert iter_choice in ['fixed', 'variable'] diff --git a/hpobench/benchmarks/ml/nn_benchmark.py b/hpobench/benchmarks/ml/nn_benchmark.py index f1149c7b..4cc57309 100644 --- a/hpobench/benchmarks/ml/nn_benchmark.py +++ b/hpobench/benchmarks/ml/nn_benchmark.py @@ -69,7 +69,9 @@ def get_fidelity_space(seed: Union[int, None] = None) -> CS.ConfigurationSpace: return fidelity_space @staticmethod - def _get_fidelity_choices(iter_choice: str, subsample_choice: str) -> Tuple[Hyperparameter, Hyperparameter]: + def _get_fidelity_choices( + iter_choice: str, subsample_choice: str + ) -> Tuple[Hyperparameter, Hyperparameter]: """Fidelity space available --- specifies the fidelity dimensions """ fidelity1 = dict( diff --git a/hpobench/benchmarks/ml/rf_benchmark.py b/hpobench/benchmarks/ml/rf_benchmark.py index 07a18b7c..f0d096c2 100644 --- a/hpobench/benchmarks/ml/rf_benchmark.py +++ b/hpobench/benchmarks/ml/rf_benchmark.py @@ -60,12 +60,16 @@ def get_fidelity_space(seed: Union[int, None] = None) -> CS.ConfigurationSpace: fidelity_space = CS.ConfigurationSpace(seed=seed) fidelity_space.add_hyperparameters( 
# gray-box setting (multi-multi-fidelity) - ntrees + data subsample - RandomForestBenchmark._get_fidelity_choices(n_estimators_choice='variable', subsample_choice='variable') + RandomForestBenchmark._get_fidelity_choices( + n_estimators_choice='variable', subsample_choice='variable' + ) ) return fidelity_space @staticmethod - def _get_fidelity_choices(n_estimators_choice: str, subsample_choice: str) -> Tuple[Hyperparameter, Hyperparameter]: + def _get_fidelity_choices( + n_estimators_choice: str, subsample_choice: str + ) -> Tuple[Hyperparameter, Hyperparameter]: assert n_estimators_choice in ['fixed', 'variable'] assert subsample_choice in ['fixed', 'variable'] @@ -136,7 +140,9 @@ def get_fidelity_space(self, seed: Union[int, None] = None) -> CS.ConfigurationS fidelity_space = CS.ConfigurationSpace(seed=seed) fidelity_space.add_hyperparameters( # black-box setting (full fidelity) - RandomForestBenchmark._get_fidelity_choices(n_estimators_choice='fixed', subsample_choice='fixed') + RandomForestBenchmark._get_fidelity_choices( + n_estimators_choice='fixed', subsample_choice='fixed' + ) ) return fidelity_space @@ -148,7 +154,9 @@ def get_fidelity_space(self, seed: Union[int, None] = None) -> CS.ConfigurationS fidelity_space = CS.ConfigurationSpace(seed=seed) fidelity_space.add_hyperparameters( # gray-box setting (multi-fidelity) - ntrees - RandomForestBenchmark._get_fidelity_choices(n_estimators_choice='variable', subsample_choice='fixed') + RandomForestBenchmark._get_fidelity_choices( + n_estimators_choice='variable', subsample_choice='fixed' + ) ) return fidelity_space diff --git a/hpobench/benchmarks/ml/tabular_benchmark.py b/hpobench/benchmarks/ml/tabular_benchmark.py index 3012f3dd..28945940 100644 --- a/hpobench/benchmarks/ml/tabular_benchmark.py +++ b/hpobench/benchmarks/ml/tabular_benchmark.py @@ -147,8 +147,8 @@ def _search_dataframe(self, row_dict, df): for i, param in enumerate(df.drop("result", axis=1).columns): mask *= df[param].values == row_dict[param] idx = np.where(mask) - assert len(idx) == 1, 'The query has resulted into mulitple matches. This should not happen. ' \ - f'The Query was {row_dict}' + assert len(idx) == 1, 'The query has resulted into mulitple matches. ' \ + 'This should not happen. 
The Query was {row_dict}' idx = idx[0][0] result = df.iloc[idx]["result"] return result diff --git a/hpobench/benchmarks/ml/xgboost_benchmark.py b/hpobench/benchmarks/ml/xgboost_benchmark.py index 57b57a84..9e4811e5 100644 --- a/hpobench/benchmarks/ml/xgboost_benchmark.py +++ b/hpobench/benchmarks/ml/xgboost_benchmark.py @@ -58,12 +58,16 @@ def get_fidelity_space(seed: Union[int, None] = None) -> CS.ConfigurationSpace: fidelity_space = CS.ConfigurationSpace(seed=seed) fidelity_space.add_hyperparameters( # gray-box setting (multi-multi-fidelity) - ntrees + data subsample - XGBoostBenchmark._get_fidelity_choices(n_estimators_choice='variable', subsample_choice='variable') + XGBoostBenchmark._get_fidelity_choices( + n_estimators_choice='variable', subsample_choice='variable' + ) ) return fidelity_space @staticmethod - def _get_fidelity_choices(n_estimators_choice: str, subsample_choice: str) -> Tuple[Hyperparameter, Hyperparameter]: + def _get_fidelity_choices( + n_estimators_choice: str, subsample_choice: str + ) -> Tuple[Hyperparameter, Hyperparameter]: assert n_estimators_choice in ['fixed', 'variable'] assert subsample_choice in ['fixed', 'variable'] @@ -137,7 +141,9 @@ def get_fidelity_space(self, seed: Union[int, None] = None) -> CS.ConfigurationS fidelity_space = CS.ConfigurationSpace(seed=seed) fidelity_space.add_hyperparameters( # black-box setting (full fidelity) - XGBoostBenchmark._get_fidelity_choices(n_estimators_choice='fixed', subsample_choice='fixed') + XGBoostBenchmark._get_fidelity_choices( + n_estimators_choice='fixed', subsample_choice='fixed' + ) ) return fidelity_space @@ -149,7 +155,9 @@ def get_fidelity_space(self, seed: Union[int, None] = None) -> CS.ConfigurationS fidelity_space = CS.ConfigurationSpace(seed=seed) fidelity_space.add_hyperparameters( # gray-box setting (multi-fidelity) - ntrees - XGBoostBenchmark._get_fidelity_choices(n_estimators_choice='variable', subsample_choice='fixed') + XGBoostBenchmark._get_fidelity_choices( + n_estimators_choice='variable', subsample_choice='fixed' + ) ) return fidelity_space From 2f9793187b01464966bbefa3f80bfdd61737dbfd Mon Sep 17 00:00:00 2001 From: neeratyoy Date: Tue, 16 Nov 2021 13:22:57 +0530 Subject: [PATCH 106/147] Minor fixes to MLP and LR --- hpobench/benchmarks/ml/lr_benchmark.py | 6 ++++-- hpobench/benchmarks/ml/nn_benchmark.py | 2 +- 2 files changed, 5 insertions(+), 3 deletions(-) diff --git a/hpobench/benchmarks/ml/lr_benchmark.py b/hpobench/benchmarks/ml/lr_benchmark.py index d456835b..db97c7a9 100644 --- a/hpobench/benchmarks/ml/lr_benchmark.py +++ b/hpobench/benchmarks/ml/lr_benchmark.py @@ -107,7 +107,7 @@ def init_model( ) return model - def get_model_size(self, model: SGDClassifier) -> float: + def get_model_size(self, model: SGDClassifier = None) -> float: """ Returns the dimensionality as a proxy for the number of model parameters Logistic Regression models have a fixed number of parameters given a dataset. 
Model size is @@ -124,7 +124,9 @@ def get_model_size(self, model: SGDClassifier) -> float: float """ ndims = self.train_X.shape[1] - return ndims + 1 + # accounting for the intercept + ndims += 1 + return ndims class LRBenchmarkBB(LRBenchmark): diff --git a/hpobench/benchmarks/ml/nn_benchmark.py b/hpobench/benchmarks/ml/nn_benchmark.py index 4cc57309..1e311bf0 100644 --- a/hpobench/benchmarks/ml/nn_benchmark.py +++ b/hpobench/benchmarks/ml/nn_benchmark.py @@ -132,7 +132,7 @@ def get_model_size(self, model: MLPClassifier) -> float: ------- float """ - nparams = 1 + nparams = 0 for layer in model.coefs_: nparams += layer.shape[0] * layer.shape[1] for layer in model.intercepts_: From aeebf8f80664574961d6efad4dad6f2a56c4c328 Mon Sep 17 00:00:00 2001 From: neeratyoy Date: Sun, 19 Dec 2021 17:32:13 +0530 Subject: [PATCH 107/147] Recording LCs for LR, RF, MLP --- hpobench/benchmarks/ml/lr_benchmark.py | 201 +++++++++++++++++ hpobench/benchmarks/ml/nn_benchmark.py | 202 ++++++++++++++++++ hpobench/benchmarks/ml/rf_benchmark.py | 201 +++++++++++++++++ .../dependencies/ml/ml_benchmark_template.py | 20 +- 4 files changed, 620 insertions(+), 4 deletions(-) diff --git a/hpobench/benchmarks/ml/lr_benchmark.py b/hpobench/benchmarks/ml/lr_benchmark.py index db97c7a9..236ae99d 100644 --- a/hpobench/benchmarks/ml/lr_benchmark.py +++ b/hpobench/benchmarks/ml/lr_benchmark.py @@ -9,13 +9,16 @@ """ +import time from typing import Union, Tuple, Dict import ConfigSpace as CS import numpy as np +import pandas as pd from ConfigSpace.hyperparameters import Hyperparameter from sklearn.linear_model import SGDClassifier +from hpobench.util.rng_helper import get_rng from hpobench.dependencies.ml.ml_benchmark_template import MLBenchmark __version__ = '0.0.2' @@ -128,6 +131,204 @@ def get_model_size(self, model: SGDClassifier = None) -> float: ndims += 1 return ndims + def _train_objective( + self, + config: Dict, + fidelity: Dict, + shuffle: bool, + rng: Union[np.random.RandomState, int, None] = None, + evaluation: Union[str, None] = "valid", + record_stats: bool = False, + get_learning_curve: bool = False, + **kwargs + ): + if rng is not None: + rng = get_rng(rng, self.rng) + + # initializing model + model = self.init_model(config, fidelity, rng) + + # preparing data + if evaluation == "valid": + train_X = self.train_X + train_y = self.train_y + elif evaluation == "test": + train_X = np.vstack((self.train_X, self.valid_X)) + train_y = pd.concat((self.train_y, self.valid_y)) + else: + raise ValueError("{} not in ['valid', 'test']".format(evaluation)) + train_idx = np.arange(len(train_X)) if self.train_idx is None else self.train_idx + + # shuffling data + if shuffle: + train_idx = self.shuffle_data_idx(train_idx, rng) + if isinstance(train_idx, np.ndarray): + train_X = train_X[train_idx] + else: + train_X = train_X.iloc[train_idx] + train_y = train_y.iloc[train_idx] + + # subsample here: + # application of the other fidelity to the dataset that the model interfaces + if self.lower_bound_train_size is None: + self.lower_bound_train_size = (10 * self.n_classes) / self.train_X.shape[0] + self.lower_bound_train_size = np.max((1 / 512, self.lower_bound_train_size)) + subsample = np.max((fidelity['subsample'], self.lower_bound_train_size)) + train_idx = self.rng.choice( + np.arange(len(train_X)), size=int( + subsample * len(train_X) + ) + ) + # fitting the model with subsampled data + if get_learning_curve: + model.warm_start = True + lc_time = 0.0 + model_fit_time = 0.0 + learning_curves = dict(train=[], valid=[], test=[]) + 
for i in range(model.max_iter): + start = time.time() + model.partial_fit( + train_X[train_idx], train_y.iloc[train_idx], np.unique(train_y.iloc[train_idx]) + ) + model_fit_time += time.time() - start + lc_start = time.time() + if record_stats: + train_pred = model.predict(train_X) + train_loss = 1 - self.scorers['acc']( + train_y, train_pred, **self.scorer_args['acc'] + ) + learning_curves['train'].append(train_loss) + val_pred = model.predict(self.valid_X) + val_loss = 1 - self.scorers['acc']( + self.valid_y, val_pred, **self.scorer_args['acc'] + ) + learning_curves['valid'].append(val_loss) + test_pred = model.predict(self.test_X) + test_loss = 1 - self.scorers['acc']( + self.test_y, test_pred, **self.scorer_args['acc'] + ) + learning_curves['test'].append(test_loss) + lc_time += time.time() - lc_start + else: + learning_curves = None + start = time.time() + model.fit(train_X[train_idx], train_y.iloc[train_idx]) + model_fit_time = time.time() - start + # model inference + inference_time = 0.0 + # can optionally not record evaluation metrics on training set to save compute + if record_stats: + start = time.time() + pred_train = model.predict(train_X) + inference_time = time.time() - start + # computing statistics on training data + scores = dict() + score_cost = dict() + for k, v in self.scorers.items(): + scores[k] = 0.0 + score_cost[k] = 0.0 + _start = time.time() + if record_stats: + scores[k] = v(train_y, pred_train, **self.scorer_args[k]) + score_cost[k] = time.time() - _start + inference_time + train_loss = 1 - scores["acc"] + return model, model_fit_time, train_loss, scores, score_cost, learning_curves + + def objective_function( + self, + configuration: Union[CS.Configuration, Dict], + fidelity: Union[CS.Configuration, Dict, None] = None, + shuffle: bool = False, + rng: Union[np.random.RandomState, int, None] = None, + record_train: bool = False, + get_learning_curve: bool = False, + **kwargs + ) -> Dict: + """Function that evaluates a 'config' on a 'fidelity' on the validation set + + The ML model is trained on the training split, and evaluated on the valid and test splits. + + Parameters + ---------- + configuration : CS.Configuration, Dict + The hyperparameter configuration. + fidelity : CS.Configuration, Dict + The fidelity configuration. + shuffle : bool (optional) + If True, shuffles the training split before fitting the ML model. + rng : np.random.RandomState, int (optional) + The random seed passed to the ML model and if applicable, used for shuffling the data + and subsampling the dataset fraction. + record_train : bool (optional) + If True, records the evaluation metrics of the trained ML model on the training set. + This is set to False by default to reduce overall compute time. + get_learning_curve : bool (optional) + If True, records the learning curve using partial_fit or warm starting, if applicable. 
+ """ + # obtaining model and training statistics + model, model_fit_time, train_loss, train_scores, train_score_cost, lcs = \ + self._train_objective( + configuration, fidelity, shuffle, rng, evaluation="valid", + record_stats=record_train, get_learning_curve=get_learning_curve + ) + model_size = self.get_model_size(model) + + # model inference on validation set + start = time.time() + pred_val = model.predict(self.valid_X) + val_inference_time = time.time() - start + val_scores = dict() + val_score_cost = dict() + for k, v in self.scorers.items(): + val_scores[k] = 0.0 + val_score_cost[k] = 0.0 + _start = time.time() + val_scores[k] = v(self.valid_y, pred_val, **self.scorer_args[k]) + val_score_cost[k] = time.time() - _start + val_inference_time + val_loss = 1 - val_scores["acc"] + + # model inference on test set + start = time.time() + pred_test = model.predict(self.test_X) + test_inference_time = time.time() - start + test_scores = dict() + test_score_cost = dict() + for k, v in self.scorers.items(): + test_scores[k] = 0.0 + test_score_cost[k] = 0.0 + _start = time.time() + test_scores[k] = v(self.test_y, pred_test, **self.scorer_args[k]) + test_score_cost[k] = time.time() - _start + test_inference_time + test_loss = 1 - test_scores["acc"] + + fidelity = fidelity.get_dictionary() if isinstance(fidelity, CS.Configuration) else fidelity + configuration = configuration.get_dictionary() \ + if isinstance(configuration, CS.Configuration) else configuration + + info = { + 'train_loss': train_loss, + 'val_loss': val_loss, + 'test_loss': test_loss, + 'model_cost': model_fit_time, + 'model_size': model_size, + 'train_scores': train_scores, + 'train_costs': train_score_cost, + 'val_scores': val_scores, + 'val_costs': val_score_cost, + 'test_scores': test_scores, + 'test_costs': test_score_cost, + 'learning_curves': lcs, + # storing as dictionary and not ConfigSpace saves tremendous memory + 'fidelity': fidelity, + 'config': configuration, + } + + return { + 'function_value': float(info['val_loss']), + 'cost': float(model_fit_time + info['val_costs']['acc']), + 'info': info + } + class LRBenchmarkBB(LRBenchmark): """ Black-box version of the LRBenchmark diff --git a/hpobench/benchmarks/ml/nn_benchmark.py b/hpobench/benchmarks/ml/nn_benchmark.py index 1e311bf0..3855e498 100644 --- a/hpobench/benchmarks/ml/nn_benchmark.py +++ b/hpobench/benchmarks/ml/nn_benchmark.py @@ -8,14 +8,17 @@ * Restructuring for consistency and to match ML Benchmark Template updates. 
""" +import time from copy import deepcopy from typing import Union, Tuple, Dict import ConfigSpace as CS import numpy as np +import pandas as pd from ConfigSpace.hyperparameters import Hyperparameter from sklearn.neural_network import MLPClassifier +from hpobench.util.rng_helper import get_rng from hpobench.dependencies.ml.ml_benchmark_template import MLBenchmark __version__ = '0.0.2' @@ -110,6 +113,7 @@ def init_model( config.pop("depth") config.pop("width") hidden_layers = [width] * depth + # TODO: check for iteration length and edit n_iter_no_change maybe model = MLPClassifier( **config, hidden_layer_sizes=hidden_layers, @@ -139,6 +143,204 @@ def get_model_size(self, model: MLPClassifier) -> float: nparams += layer.shape[0] return nparams + def _train_objective( + self, + config: Dict, + fidelity: Dict, + shuffle: bool, + rng: Union[np.random.RandomState, int, None] = None, + evaluation: Union[str, None] = "valid", + record_stats: bool = False, + get_learning_curve: bool = False, + **kwargs + ): + if rng is not None: + rng = get_rng(rng, self.rng) + + # initializing model + model = self.init_model(config, fidelity, rng) + + # preparing data + if evaluation == "valid": + train_X = self.train_X + train_y = self.train_y + elif evaluation == "test": + train_X = np.vstack((self.train_X, self.valid_X)) + train_y = pd.concat((self.train_y, self.valid_y)) + else: + raise ValueError("{} not in ['valid', 'test']".format(evaluation)) + train_idx = np.arange(len(train_X)) if self.train_idx is None else self.train_idx + + # shuffling data + if shuffle: + train_idx = self.shuffle_data_idx(train_idx, rng) + if isinstance(train_idx, np.ndarray): + train_X = train_X[train_idx] + else: + train_X = train_X.iloc[train_idx] + train_y = train_y.iloc[train_idx] + + # subsample here: + # application of the other fidelity to the dataset that the model interfaces + if self.lower_bound_train_size is None: + self.lower_bound_train_size = (10 * self.n_classes) / self.train_X.shape[0] + self.lower_bound_train_size = np.max((1 / 512, self.lower_bound_train_size)) + subsample = np.max((fidelity['subsample'], self.lower_bound_train_size)) + train_idx = self.rng.choice( + np.arange(len(train_X)), size=int( + subsample * len(train_X) + ) + ) + # fitting the model with subsampled data + if get_learning_curve: + model.warm_start = True + lc_time = 0.0 + model_fit_time = 0.0 + learning_curves = dict(train=[], valid=[], test=[]) + for i in range(model.max_iter): + start = time.time() + model.partial_fit( + train_X[train_idx], train_y.iloc[train_idx], np.unique(train_y.iloc[train_idx]) + ) + model_fit_time += time.time() - start + lc_start = time.time() + if record_stats: + train_pred = model.predict(train_X) + train_loss = 1 - self.scorers['acc']( + train_y, train_pred, **self.scorer_args['acc'] + ) + learning_curves['train'].append(train_loss) + val_pred = model.predict(self.valid_X) + val_loss = 1 - self.scorers['acc']( + self.valid_y, val_pred, **self.scorer_args['acc'] + ) + learning_curves['valid'].append(val_loss) + test_pred = model.predict(self.test_X) + test_loss = 1 - self.scorers['acc']( + self.test_y, test_pred, **self.scorer_args['acc'] + ) + learning_curves['test'].append(test_loss) + lc_time += time.time() - lc_start + else: + learning_curves = None + start = time.time() + model.fit(train_X[train_idx], train_y.iloc[train_idx]) + model_fit_time = time.time() - start + # model inference + inference_time = 0.0 + # can optionally not record evaluation metrics on training set to save compute + if record_stats: + 
start = time.time() + pred_train = model.predict(train_X) + inference_time = time.time() - start + # computing statistics on training data + scores = dict() + score_cost = dict() + for k, v in self.scorers.items(): + scores[k] = 0.0 + score_cost[k] = 0.0 + _start = time.time() + if record_stats: + scores[k] = v(train_y, pred_train, **self.scorer_args[k]) + score_cost[k] = time.time() - _start + inference_time + train_loss = 1 - scores["acc"] + return model, model_fit_time, train_loss, scores, score_cost, learning_curves + + def objective_function( + self, + configuration: Union[CS.Configuration, Dict], + fidelity: Union[CS.Configuration, Dict, None] = None, + shuffle: bool = False, + rng: Union[np.random.RandomState, int, None] = None, + record_train: bool = False, + get_learning_curve: bool = False, + **kwargs + ) -> Dict: + """Function that evaluates a 'config' on a 'fidelity' on the validation set + + The ML model is trained on the training split, and evaluated on the valid and test splits. + + Parameters + ---------- + configuration : CS.Configuration, Dict + The hyperparameter configuration. + fidelity : CS.Configuration, Dict + The fidelity configuration. + shuffle : bool (optional) + If True, shuffles the training split before fitting the ML model. + rng : np.random.RandomState, int (optional) + The random seed passed to the ML model and if applicable, used for shuffling the data + and subsampling the dataset fraction. + record_train : bool (optional) + If True, records the evaluation metrics of the trained ML model on the training set. + This is set to False by default to reduce overall compute time. + get_learning_curve : bool (optional) + If True, records the learning curve using partial_fit or warm starting, if applicable. + """ + # obtaining model and training statistics + model, model_fit_time, train_loss, train_scores, train_score_cost, lcs = \ + self._train_objective( + configuration, fidelity, shuffle, rng, evaluation="valid", + record_stats=record_train, get_learning_curve=get_learning_curve + ) + model_size = self.get_model_size(model) + + # model inference on validation set + start = time.time() + pred_val = model.predict(self.valid_X) + val_inference_time = time.time() - start + val_scores = dict() + val_score_cost = dict() + for k, v in self.scorers.items(): + val_scores[k] = 0.0 + val_score_cost[k] = 0.0 + _start = time.time() + val_scores[k] = v(self.valid_y, pred_val, **self.scorer_args[k]) + val_score_cost[k] = time.time() - _start + val_inference_time + val_loss = 1 - val_scores["acc"] + + # model inference on test set + start = time.time() + pred_test = model.predict(self.test_X) + test_inference_time = time.time() - start + test_scores = dict() + test_score_cost = dict() + for k, v in self.scorers.items(): + test_scores[k] = 0.0 + test_score_cost[k] = 0.0 + _start = time.time() + test_scores[k] = v(self.test_y, pred_test, **self.scorer_args[k]) + test_score_cost[k] = time.time() - _start + test_inference_time + test_loss = 1 - test_scores["acc"] + + fidelity = fidelity.get_dictionary() if isinstance(fidelity, CS.Configuration) else fidelity + configuration = configuration.get_dictionary() \ + if isinstance(configuration, CS.Configuration) else configuration + + info = { + 'train_loss': train_loss, + 'val_loss': val_loss, + 'test_loss': test_loss, + 'model_cost': model_fit_time, + 'model_size': model_size, + 'train_scores': train_scores, + 'train_costs': train_score_cost, + 'val_scores': val_scores, + 'val_costs': val_score_cost, + 'test_scores': test_scores, + 
'test_costs': test_score_cost, + 'learning_curves': lcs, + # storing as dictionary and not ConfigSpace saves tremendous memory + 'fidelity': fidelity, + 'config': configuration, + } + + return { + 'function_value': float(info['val_loss']), + 'cost': float(model_fit_time + info['val_costs']['acc']), + 'info': info + } + class NNBenchmarkBB(NNBenchmark): """ Black-box version of the NNBenchmark diff --git a/hpobench/benchmarks/ml/rf_benchmark.py b/hpobench/benchmarks/ml/rf_benchmark.py index f0d096c2..4215fd23 100644 --- a/hpobench/benchmarks/ml/rf_benchmark.py +++ b/hpobench/benchmarks/ml/rf_benchmark.py @@ -8,14 +8,17 @@ * Restructuring for consistency and to match ML Benchmark Template updates. """ +import time from copy import deepcopy from typing import Union, Tuple, Dict import ConfigSpace as CS import numpy as np +import pandas as pd from ConfigSpace.hyperparameters import Hyperparameter from sklearn.ensemble import RandomForestClassifier +from hpobench.util.rng_helper import get_rng from hpobench.dependencies.ml.ml_benchmark_template import MLBenchmark __version__ = '0.0.2' @@ -132,6 +135,204 @@ def get_model_size(self, model: RandomForestClassifier) -> float: nodes += tree.tree_.node_count return nodes + def _train_objective( + self, + config: Dict, + fidelity: Dict, + shuffle: bool, + rng: Union[np.random.RandomState, int, None] = None, + evaluation: Union[str, None] = "valid", + record_stats: bool = False, + get_learning_curve: bool = False, + **kwargs + ): + if rng is not None: + rng = get_rng(rng, self.rng) + + # initializing model + model = self.init_model(config, fidelity, rng) + + # preparing data + if evaluation == "valid": + train_X = self.train_X + train_y = self.train_y + elif evaluation == "test": + train_X = np.vstack((self.train_X, self.valid_X)) + train_y = pd.concat((self.train_y, self.valid_y)) + else: + raise ValueError("{} not in ['valid', 'test']".format(evaluation)) + train_idx = np.arange(len(train_X)) if self.train_idx is None else self.train_idx + + # shuffling data + if shuffle: + train_idx = self.shuffle_data_idx(train_idx, rng) + if isinstance(train_idx, np.ndarray): + train_X = train_X[train_idx] + else: + train_X = train_X.iloc[train_idx] + train_y = train_y.iloc[train_idx] + + # subsample here: + # application of the other fidelity to the dataset that the model interfaces + if self.lower_bound_train_size is None: + self.lower_bound_train_size = (10 * self.n_classes) / self.train_X.shape[0] + self.lower_bound_train_size = np.max((1 / 512, self.lower_bound_train_size)) + subsample = np.max((fidelity['subsample'], self.lower_bound_train_size)) + train_idx = self.rng.choice( + np.arange(len(train_X)), size=int( + subsample * len(train_X) + ) + ) + # fitting the model with subsampled data + if get_learning_curve: + model.warm_start = True + model.n_estimators = 0 + lc_time = 0.0 + model_fit_time = 0.0 + learning_curves = dict(train=[], valid=[], test=[]) + for i in range(fidelity['n_estimators']): + model.n_estimators += 1 + start = time.time() + model.fit(train_X[train_idx], train_y.iloc[train_idx]) + model_fit_time += time.time() - start + lc_start = time.time() + if record_stats: + train_pred = model.predict(train_X) + train_loss = 1 - self.scorers['acc']( + train_y, train_pred, **self.scorer_args['acc'] + ) + learning_curves['train'].append(train_loss) + val_pred = model.predict(self.valid_X) + val_loss = 1 - self.scorers['acc']( + self.valid_y, val_pred, **self.scorer_args['acc'] + ) + learning_curves['valid'].append(val_loss) + test_pred = 
model.predict(self.test_X) + test_loss = 1 - self.scorers['acc']( + self.test_y, test_pred, **self.scorer_args['acc'] + ) + learning_curves['test'].append(test_loss) + lc_time += time.time() - lc_start + else: + learning_curves = None + start = time.time() + model.fit(train_X[train_idx], train_y.iloc[train_idx]) + model_fit_time = time.time() - start + # model inference + inference_time = 0.0 + # can optionally not record evaluation metrics on training set to save compute + if record_stats: + start = time.time() + pred_train = model.predict(train_X) + inference_time = time.time() - start + # computing statistics on training data + scores = dict() + score_cost = dict() + for k, v in self.scorers.items(): + scores[k] = 0.0 + score_cost[k] = 0.0 + _start = time.time() + if record_stats: + scores[k] = v(train_y, pred_train, **self.scorer_args[k]) + score_cost[k] = time.time() - _start + inference_time + train_loss = 1 - scores["acc"] + return model, model_fit_time, train_loss, scores, score_cost, learning_curves + + def objective_function( + self, + configuration: Union[CS.Configuration, Dict], + fidelity: Union[CS.Configuration, Dict, None] = None, + shuffle: bool = False, + rng: Union[np.random.RandomState, int, None] = None, + record_train: bool = False, + get_learning_curve: bool = False, + **kwargs + ) -> Dict: + """Function that evaluates a 'config' on a 'fidelity' on the validation set + + The ML model is trained on the training split, and evaluated on the valid and test splits. + + Parameters + ---------- + configuration : CS.Configuration, Dict + The hyperparameter configuration. + fidelity : CS.Configuration, Dict + The fidelity configuration. + shuffle : bool (optional) + If True, shuffles the training split before fitting the ML model. + rng : np.random.RandomState, int (optional) + The random seed passed to the ML model and if applicable, used for shuffling the data + and subsampling the dataset fraction. + record_train : bool (optional) + If True, records the evaluation metrics of the trained ML model on the training set. + This is set to False by default to reduce overall compute time. + get_learning_curve : bool (optional) + If True, records the learning curve using partial_fit or warm starting, if applicable. 
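The warm-start loop added above grows the random forest one tree at a time so that validation and test learning curves can be recorded per n_estimators step, while fit time and curve-evaluation time are accumulated separately. The following is a minimal, self-contained sketch of that pattern with toy data, not the benchmark code itself; the dataset, split names, and the fixed budget of 10 trees are illustrative assumptions.

import time
from sklearn.datasets import make_classification
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split

# toy data standing in for an OpenML task split
X, y = make_classification(n_samples=500, n_features=10, random_state=0)
train_X, valid_X, train_y, valid_y = train_test_split(X, y, test_size=0.33, random_state=0)

# warm_start=True makes every fit() call train only the newly requested trees
model = RandomForestClassifier(warm_start=True, n_estimators=0, random_state=0)
valid_curve, fit_time, lc_time = [], 0.0, 0.0
for _ in range(10):                          # stands in for fidelity['n_estimators']
    model.n_estimators += 1                  # request one more tree
    start = time.time()
    model.fit(train_X, train_y)              # fits only the additional tree
    fit_time += time.time() - start
    lc_start = time.time()
    valid_curve.append(1 - accuracy_score(valid_y, model.predict(valid_X)))
    lc_time += time.time() - lc_start        # curve-evaluation overhead kept separate

print(valid_curve)                           # one validation loss per ensemble size
print(fit_time, lc_time)                     # analogous to model_fit_time and lc_time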
+ """ + # obtaining model and training statistics + model, model_fit_time, train_loss, train_scores, train_score_cost, lcs = \ + self._train_objective( + configuration, fidelity, shuffle, rng, evaluation="valid", + record_stats=record_train, get_learning_curve=get_learning_curve + ) + model_size = self.get_model_size(model) + + # model inference on validation set + start = time.time() + pred_val = model.predict(self.valid_X) + val_inference_time = time.time() - start + val_scores = dict() + val_score_cost = dict() + for k, v in self.scorers.items(): + val_scores[k] = 0.0 + val_score_cost[k] = 0.0 + _start = time.time() + val_scores[k] = v(self.valid_y, pred_val, **self.scorer_args[k]) + val_score_cost[k] = time.time() - _start + val_inference_time + val_loss = 1 - val_scores["acc"] + + # model inference on test set + start = time.time() + pred_test = model.predict(self.test_X) + test_inference_time = time.time() - start + test_scores = dict() + test_score_cost = dict() + for k, v in self.scorers.items(): + test_scores[k] = 0.0 + test_score_cost[k] = 0.0 + _start = time.time() + test_scores[k] = v(self.test_y, pred_test, **self.scorer_args[k]) + test_score_cost[k] = time.time() - _start + test_inference_time + test_loss = 1 - test_scores["acc"] + + fidelity = fidelity.get_dictionary() if isinstance(fidelity, CS.Configuration) else fidelity + configuration = configuration.get_dictionary() \ + if isinstance(configuration, CS.Configuration) else configuration + + info = { + 'train_loss': train_loss, + 'val_loss': val_loss, + 'test_loss': test_loss, + 'model_cost': model_fit_time, + 'model_size': model_size, + 'train_scores': train_scores, + 'train_costs': train_score_cost, + 'val_scores': val_scores, + 'val_costs': val_score_cost, + 'test_scores': test_scores, + 'test_costs': test_score_cost, + 'learning_curves': lcs, + # storing as dictionary and not ConfigSpace saves tremendous memory + 'fidelity': fidelity, + 'config': configuration, + } + + return { + 'function_value': float(info['val_loss']), + 'cost': float(model_fit_time + info['val_costs']['acc']), + 'info': info + } + class RandomForestBenchmarkBB(RandomForestBenchmark): """ Black-box version of the RandomForestBenchmark diff --git a/hpobench/dependencies/ml/ml_benchmark_template.py b/hpobench/dependencies/ml/ml_benchmark_template.py index ce4c3c03..ba79ac8e 100644 --- a/hpobench/dependencies/ml/ml_benchmark_template.py +++ b/hpobench/dependencies/ml/ml_benchmark_template.py @@ -164,7 +164,9 @@ def _train_objective( shuffle: bool, rng: Union[np.random.RandomState, int, None] = None, evaluation: Union[str, None] = "valid", - record_stats: bool = False + record_stats: bool = False, + get_learning_curve: bool = False, + **kwargs ): if rng is not None: rng = get_rng(rng, self.rng) @@ -236,6 +238,7 @@ def objective_function( shuffle: bool = False, rng: Union[np.random.RandomState, int, None] = None, record_train: bool = False, + get_learning_curve: bool = False, **kwargs ) -> Dict: """Function that evaluates a 'config' on a 'fidelity' on the validation set @@ -256,6 +259,8 @@ def objective_function( record_train : bool (optional) If True, records the evaluation metrics of the trained ML model on the training set. This is set to False by default to reduce overall compute time. + get_learning_curve : bool (optional) + If True, records the learning curve using partial_fit or warm starting, if applicable. 
""" # obtaining model and training statistics model, model_fit_time, train_loss, train_scores, train_score_cost = self._train_objective( @@ -327,6 +332,7 @@ def objective_function_test( shuffle: bool = False, rng: Union[np.random.RandomState, int, None] = None, record_train: bool = False, + get_learning_curve: bool = False, **kwargs ) -> Dict: """Function that evaluates a 'config' on a 'fidelity' on the test set @@ -347,6 +353,8 @@ def objective_function_test( record_train : bool (optional) If True, records the evaluation metrics of the trained ML model on the training set. This is set to False by default to reduce overall compute time. + get_learning_curve : bool (optional) + If True, records the learning curve using partial_fit or warm starting, if applicable. """ # obtaining model and training statistics model, model_fit_time, train_loss, train_scores, train_score_cost = self._train_objective( @@ -393,11 +401,15 @@ def objective_function_test( if __name__ == "__main__": - from hpobench.benchmarks.ml import XGBoostBenchmarkMF - benchmark = XGBoostBenchmarkMF(task_id=10101) + from hpobench.benchmarks.ml import RandomForestBenchmarkMF + benchmark = RandomForestBenchmarkMF(task_id=10101) config = benchmark.configuration_space.sample_configuration() print(config) fidelity = benchmark.fidelity_space.sample_configuration() print(fidelity) - res = benchmark.objective_function(config, fidelity, shuffle=True, record_train=True, rng=123) + start = time.time() + res = benchmark.objective_function( + config, fidelity, record_train=True, rng=123, get_learning_curve=True + ) print(res) + print(time.time() - start) From 48a88ddb7e504944765bccb959b3bd7523092ae9 Mon Sep 17 00:00:00 2001 From: neeratyoy Date: Mon, 3 Jan 2022 14:09:24 +0530 Subject: [PATCH 108/147] Updating LR benchmark for LC collection --- hpobench/benchmarks/ml/lr_benchmark.py | 126 +++++++++++++++++- .../dependencies/ml/ml_benchmark_template.py | 34 +++-- 2 files changed, 149 insertions(+), 11 deletions(-) diff --git a/hpobench/benchmarks/ml/lr_benchmark.py b/hpobench/benchmarks/ml/lr_benchmark.py index 236ae99d..7b6d1783 100644 --- a/hpobench/benchmarks/ml/lr_benchmark.py +++ b/hpobench/benchmarks/ml/lr_benchmark.py @@ -142,6 +142,34 @@ def _train_objective( get_learning_curve: bool = False, **kwargs ): + """Function that instantiates a 'config' on a 'fidelity' and trains it + + The ML model is instantiated and trained on the training split. Optionally, the model is + evaluated on the training set. Optionally, the learning curves are collected. + + Parameters + ---------- + config : CS.Configuration, Dict + The hyperparameter configuration. + fidelity : CS.Configuration, Dict + The fidelity configuration. + shuffle : bool (optional) + If True, shuffles the training split before fitting the ML model. + rng : np.random.RandomState, int (optional) + The random seed passed to the ML model and if applicable, used for shuffling the data + and subsampling the dataset fraction. + evaluation : str (optional) + If "valid", the ML model is trained on the training set alone. + If "test", the ML model is trained on the training + validation sets. + record_stats : bool (optional) + If True, records the evaluation metrics of the trained ML model on the training set. + This is set to False by default to reduce overall compute time. + get_learning_curve : bool (optional) + If True, records the learning curve using partial_fit or warm starting, if applicable. + This is set to False by default to reduce overall compute time. 
+ Enabling True, implies that the for each iteration, the model will be evaluated on both + the validation and test sets, optionally on the training set also. + """ if rng is not None: rng = get_rng(rng, self.rng) @@ -170,9 +198,11 @@ def _train_objective( # subsample here: # application of the other fidelity to the dataset that the model interfaces + # carried over from previous HPOBench code that borrowed from FABOLAS' SVM + lower_bound_lim = 1.0 / 512.0 if self.lower_bound_train_size is None: self.lower_bound_train_size = (10 * self.n_classes) / self.train_X.shape[0] - self.lower_bound_train_size = np.max((1 / 512, self.lower_bound_train_size)) + self.lower_bound_train_size = np.max((lower_bound_lim, self.lower_bound_train_size)) subsample = np.max((fidelity['subsample'], self.lower_bound_train_size)) train_idx = self.rng.choice( np.arange(len(train_X)), size=int( @@ -180,6 +210,8 @@ def _train_objective( ) ) # fitting the model with subsampled data + learning_curves = None + lc_time = None if get_learning_curve: model.warm_start = True lc_time = 0.0 @@ -190,6 +222,7 @@ def _train_objective( model.partial_fit( train_X[train_idx], train_y.iloc[train_idx], np.unique(train_y.iloc[train_idx]) ) + # adding all partial fit times model_fit_time += time.time() - start lc_start = time.time() if record_stats: @@ -208,8 +241,10 @@ def _train_objective( self.test_y, test_pred, **self.scorer_args['acc'] ) learning_curves['test'].append(test_loss) + # sums the time taken to evaluate and collect data for the learning curves lc_time += time.time() - lc_start else: + # default training as per the base benchmark template learning_curves = None start = time.time() model.fit(train_X[train_idx], train_y.iloc[train_idx]) @@ -232,7 +267,7 @@ def _train_objective( scores[k] = v(train_y, pred_train, **self.scorer_args[k]) score_cost[k] = time.time() - _start + inference_time train_loss = 1 - scores["acc"] - return model, model_fit_time, train_loss, scores, score_cost, learning_curves + return model, model_fit_time, train_loss, scores, score_cost, learning_curves, lc_time def objective_function( self, @@ -264,9 +299,10 @@ def objective_function( This is set to False by default to reduce overall compute time. get_learning_curve : bool (optional) If True, records the learning curve using partial_fit or warm starting, if applicable. + This is set to False by default to reduce overall compute time. 
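For the LR benchmark the curve is collected with repeated partial_fit calls rather than warm-started refits, as in the loop above. A minimal sketch of that idea, assuming an estimator that implements partial_fit; sklearn's SGDClassifier and toy data are used here only as stand-ins, and the budget of 20 passes is an illustrative assumption.

import numpy as np
from sklearn.datasets import make_classification
from sklearn.linear_model import SGDClassifier
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split

X, y = make_classification(n_samples=500, random_state=0)
train_X, valid_X, train_y, valid_y = train_test_split(X, y, test_size=0.33, random_state=0)
classes = np.unique(train_y)                 # partial_fit needs the class list on the first call

model = SGDClassifier(random_state=0)
valid_curve = []
for _ in range(20):                          # stands in for the iteration fidelity
    model.partial_fit(train_X, train_y, classes)   # one pass over the training data
    valid_curve.append(1 - accuracy_score(valid_y, model.predict(valid_X)))

print(valid_curve)                           # validation loss after each pass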
""" # obtaining model and training statistics - model, model_fit_time, train_loss, train_scores, train_score_cost, lcs = \ + model, model_fit_time, train_loss, train_scores, train_score_cost, lcs, lc_time = \ self._train_objective( configuration, fidelity, shuffle, rng, evaluation="valid", record_stats=record_train, get_learning_curve=get_learning_curve @@ -318,6 +354,7 @@ def objective_function( 'test_scores': test_scores, 'test_costs': test_score_cost, 'learning_curves': lcs, + 'learning_curves_cost': lc_time, # storing as dictionary and not ConfigSpace saves tremendous memory 'fidelity': fidelity, 'config': configuration, @@ -329,6 +366,89 @@ def objective_function( 'info': info } + def objective_function_test( + self, + configuration: Union[CS.Configuration, Dict], + fidelity: Union[CS.Configuration, Dict, None] = None, + shuffle: bool = False, + rng: Union[np.random.RandomState, int, None] = None, + record_train: bool = False, + get_learning_curve: bool = False, + **kwargs + ) -> Dict: + """Function that evaluates a 'config' on a 'fidelity' on the test set + + The ML model is trained on the training+valid split, and evaluated on the test split. + + Parameters + ---------- + configuration : CS.Configuration, Dict + The hyperparameter configuration. + fidelity : CS.Configuration, Dict + The fidelity configuration. + shuffle : bool (optional) + If True, shuffles the training split before fitting the ML model. + rng : np.random.RandomState, int (optional) + The random seed passed to the ML model and if applicable, used for shuffling the data + and subsampling the dataset fraction. + record_train : bool (optional) + If True, records the evaluation metrics of the trained ML model on the training set. + This is set to False by default to reduce overall compute time. + get_learning_curve : bool (optional) + If True, records the learning curve using partial_fit or warm starting, if applicable. + This is set to False by default to reduce overall compute time. 
+ """ + # obtaining model and training statistics + model, model_fit_time, train_loss, train_scores, train_score_cost, lcs, lc_time = \ + self._train_objective( + configuration, fidelity, shuffle, rng, evaluation="test", + record_stats=record_train, get_learning_curve=get_learning_curve + ) + model_size = self.get_model_size(model) + + # model inference on test set + start = time.time() + pred_test = model.predict(self.test_X) + test_inference_time = time.time() - start + test_scores = dict() + test_score_cost = dict() + for k, v in self.scorers.items(): + test_scores[k] = 0.0 + test_score_cost[k] = 0.0 + _start = time.time() + test_scores[k] = v(self.test_y, pred_test, **self.scorer_args[k]) + test_score_cost[k] = time.time() - _start + test_inference_time + test_loss = 1 - test_scores["acc"] + + fidelity = fidelity.get_dictionary() if isinstance(fidelity, CS.Configuration) else fidelity + configuration = configuration.get_dictionary() \ + if isinstance(configuration, CS.Configuration) else configuration + + info = { + 'train_loss': train_loss, + 'val_loss': None, + 'test_loss': test_loss, + 'model_cost': model_fit_time, + 'model_size': model_size, + 'train_scores': train_scores, + 'train_costs': train_score_cost, + 'val_scores': None, + 'val_costs': None, + 'test_scores': test_scores, + 'test_costs': test_score_cost, + 'learning_curves': lcs, + 'learning_curves_cost': lc_time, + # storing as dictionary and not ConfigSpace saves tremendous memory + 'fidelity': fidelity, + 'config': configuration, + } + + return { + 'function_value': float(info['test_loss']), + 'cost': float(model_fit_time + info['test_costs']['acc']), + 'info': info + } + class LRBenchmarkBB(LRBenchmark): """ Black-box version of the LRBenchmark diff --git a/hpobench/dependencies/ml/ml_benchmark_template.py b/hpobench/dependencies/ml/ml_benchmark_template.py index ba79ac8e..0fd1fcc5 100644 --- a/hpobench/dependencies/ml/ml_benchmark_template.py +++ b/hpobench/dependencies/ml/ml_benchmark_template.py @@ -165,9 +165,31 @@ def _train_objective( rng: Union[np.random.RandomState, int, None] = None, evaluation: Union[str, None] = "valid", record_stats: bool = False, - get_learning_curve: bool = False, **kwargs ): + """Function that instantiates a 'config' on a 'fidelity' and trains it + + The ML model is instantiated and trained on the training split. Optionally, the model is + evaluated on the training set. + + Parameters + ---------- + config : CS.Configuration, Dict + The hyperparameter configuration. + fidelity : CS.Configuration, Dict + The fidelity configuration. + shuffle : bool (optional) + If True, shuffles the training split before fitting the ML model. + rng : np.random.RandomState, int (optional) + The random seed passed to the ML model and if applicable, used for shuffling the data + and subsampling the dataset fraction. + evaluation : str (optional) + If "valid", the ML model is trained on the training set alone. + If "test", the ML model is trained on the training + validation sets. + record_stats : bool (optional) + If True, records the evaluation metrics of the trained ML model on the training set. + This is set to False by default to reduce overall compute time. 
+ """ if rng is not None: rng = get_rng(rng, self.rng) @@ -196,9 +218,11 @@ def _train_objective( # subsample here: # application of the other fidelity to the dataset that the model interfaces + # carried over from previous HPOBench code that borrowed from FABOLAS' SVM + lower_bound_lim = 1.0 / 512.0 if self.lower_bound_train_size is None: self.lower_bound_train_size = (10 * self.n_classes) / self.train_X.shape[0] - self.lower_bound_train_size = np.max((1 / 512, self.lower_bound_train_size)) + self.lower_bound_train_size = np.max((lower_bound_lim, self.lower_bound_train_size)) subsample = np.max((fidelity['subsample'], self.lower_bound_train_size)) train_idx = self.rng.choice( np.arange(len(train_X)), size=int( @@ -238,7 +262,6 @@ def objective_function( shuffle: bool = False, rng: Union[np.random.RandomState, int, None] = None, record_train: bool = False, - get_learning_curve: bool = False, **kwargs ) -> Dict: """Function that evaluates a 'config' on a 'fidelity' on the validation set @@ -259,8 +282,6 @@ def objective_function( record_train : bool (optional) If True, records the evaluation metrics of the trained ML model on the training set. This is set to False by default to reduce overall compute time. - get_learning_curve : bool (optional) - If True, records the learning curve using partial_fit or warm starting, if applicable. """ # obtaining model and training statistics model, model_fit_time, train_loss, train_scores, train_score_cost = self._train_objective( @@ -332,7 +353,6 @@ def objective_function_test( shuffle: bool = False, rng: Union[np.random.RandomState, int, None] = None, record_train: bool = False, - get_learning_curve: bool = False, **kwargs ) -> Dict: """Function that evaluates a 'config' on a 'fidelity' on the test set @@ -353,8 +373,6 @@ def objective_function_test( record_train : bool (optional) If True, records the evaluation metrics of the trained ML model on the training set. This is set to False by default to reduce overall compute time. - get_learning_curve : bool (optional) - If True, records the learning curve using partial_fit or warm starting, if applicable. 
""" # obtaining model and training statistics model, model_fit_time, train_loss, train_scores, train_score_cost = self._train_objective( From e4c699f1836c02ad7ce79774e3428483bc9d8eb5 Mon Sep 17 00:00:00 2001 From: neeratyoy Date: Mon, 3 Jan 2022 15:17:16 +0530 Subject: [PATCH 109/147] Updating RF bench with LC collection --- hpobench/benchmarks/ml/lr_benchmark.py | 3 +- hpobench/benchmarks/ml/rf_benchmark.py | 122 ++++++++++++++++++++++++- 2 files changed, 120 insertions(+), 5 deletions(-) diff --git a/hpobench/benchmarks/ml/lr_benchmark.py b/hpobench/benchmarks/ml/lr_benchmark.py index 7b6d1783..86db4195 100644 --- a/hpobench/benchmarks/ml/lr_benchmark.py +++ b/hpobench/benchmarks/ml/lr_benchmark.py @@ -210,8 +210,6 @@ def _train_objective( ) ) # fitting the model with subsampled data - learning_curves = None - lc_time = None if get_learning_curve: model.warm_start = True lc_time = 0.0 @@ -246,6 +244,7 @@ def _train_objective( else: # default training as per the base benchmark template learning_curves = None + lc_time = None start = time.time() model.fit(train_X[train_idx], train_y.iloc[train_idx]) model_fit_time = time.time() - start diff --git a/hpobench/benchmarks/ml/rf_benchmark.py b/hpobench/benchmarks/ml/rf_benchmark.py index 4215fd23..ab945c3c 100644 --- a/hpobench/benchmarks/ml/rf_benchmark.py +++ b/hpobench/benchmarks/ml/rf_benchmark.py @@ -146,6 +146,34 @@ def _train_objective( get_learning_curve: bool = False, **kwargs ): + """Function that instantiates a 'config' on a 'fidelity' and trains it + + The ML model is instantiated and trained on the training split. Optionally, the model is + evaluated on the training set. Optionally, the learning curves are collected. + + Parameters + ---------- + config : CS.Configuration, Dict + The hyperparameter configuration. + fidelity : CS.Configuration, Dict + The fidelity configuration. + shuffle : bool (optional) + If True, shuffles the training split before fitting the ML model. + rng : np.random.RandomState, int (optional) + The random seed passed to the ML model and if applicable, used for shuffling the data + and subsampling the dataset fraction. + evaluation : str (optional) + If "valid", the ML model is trained on the training set alone. + If "test", the ML model is trained on the training + validation sets. + record_stats : bool (optional) + If True, records the evaluation metrics of the trained ML model on the training set. + This is set to False by default to reduce overall compute time. + get_learning_curve : bool (optional) + If True, records the learning curve using partial_fit or warm starting, if applicable. + This is set to False by default to reduce overall compute time. + Enabling True, implies that the for each iteration, the model will be evaluated on both + the validation and test sets, optionally on the training set also. 
+ """ if rng is not None: rng = get_rng(rng, self.rng) @@ -174,9 +202,11 @@ def _train_objective( # subsample here: # application of the other fidelity to the dataset that the model interfaces + # carried over from previous HPOBench code that borrowed from FABOLAS' SVM + lower_bound_lim = 1.0 / 512.0 if self.lower_bound_train_size is None: self.lower_bound_train_size = (10 * self.n_classes) / self.train_X.shape[0] - self.lower_bound_train_size = np.max((1 / 512, self.lower_bound_train_size)) + self.lower_bound_train_size = np.max((lower_bound_lim, self.lower_bound_train_size)) subsample = np.max((fidelity['subsample'], self.lower_bound_train_size)) train_idx = self.rng.choice( np.arange(len(train_X)), size=int( @@ -215,6 +245,7 @@ def _train_objective( lc_time += time.time() - lc_start else: learning_curves = None + lc_time = None start = time.time() model.fit(train_X[train_idx], train_y.iloc[train_idx]) model_fit_time = time.time() - start @@ -236,7 +267,7 @@ def _train_objective( scores[k] = v(train_y, pred_train, **self.scorer_args[k]) score_cost[k] = time.time() - _start + inference_time train_loss = 1 - scores["acc"] - return model, model_fit_time, train_loss, scores, score_cost, learning_curves + return model, model_fit_time, train_loss, scores, score_cost, learning_curves, lc_time def objective_function( self, @@ -268,9 +299,10 @@ def objective_function( This is set to False by default to reduce overall compute time. get_learning_curve : bool (optional) If True, records the learning curve using partial_fit or warm starting, if applicable. + This is set to False by default to reduce overall compute time. """ # obtaining model and training statistics - model, model_fit_time, train_loss, train_scores, train_score_cost, lcs = \ + model, model_fit_time, train_loss, train_scores, train_score_cost, lcs, lc_time = \ self._train_objective( configuration, fidelity, shuffle, rng, evaluation="valid", record_stats=record_train, get_learning_curve=get_learning_curve @@ -322,6 +354,7 @@ def objective_function( 'test_scores': test_scores, 'test_costs': test_score_cost, 'learning_curves': lcs, + 'learning_curves_cost': lc_time, # storing as dictionary and not ConfigSpace saves tremendous memory 'fidelity': fidelity, 'config': configuration, @@ -333,6 +366,89 @@ def objective_function( 'info': info } + def objective_function_test( + self, + configuration: Union[CS.Configuration, Dict], + fidelity: Union[CS.Configuration, Dict, None] = None, + shuffle: bool = False, + rng: Union[np.random.RandomState, int, None] = None, + record_train: bool = False, + get_learning_curve: bool = False, + **kwargs + ) -> Dict: + """Function that evaluates a 'config' on a 'fidelity' on the test set + + The ML model is trained on the training+valid split, and evaluated on the test split. + + Parameters + ---------- + configuration : CS.Configuration, Dict + The hyperparameter configuration. + fidelity : CS.Configuration, Dict + The fidelity configuration. + shuffle : bool (optional) + If True, shuffles the training split before fitting the ML model. + rng : np.random.RandomState, int (optional) + The random seed passed to the ML model and if applicable, used for shuffling the data + and subsampling the dataset fraction. + record_train : bool (optional) + If True, records the evaluation metrics of the trained ML model on the training set. + This is set to False by default to reduce overall compute time. 
+ get_learning_curve : bool (optional) + If True, records the learning curve using partial_fit or warm starting, if applicable. + This is set to False by default to reduce overall compute time. + """ + # obtaining model and training statistics + model, model_fit_time, train_loss, train_scores, train_score_cost, lcs, lc_time = \ + self._train_objective( + configuration, fidelity, shuffle, rng, evaluation="test", + record_stats=record_train, get_learning_curve=get_learning_curve + ) + model_size = self.get_model_size(model) + + # model inference on test set + start = time.time() + pred_test = model.predict(self.test_X) + test_inference_time = time.time() - start + test_scores = dict() + test_score_cost = dict() + for k, v in self.scorers.items(): + test_scores[k] = 0.0 + test_score_cost[k] = 0.0 + _start = time.time() + test_scores[k] = v(self.test_y, pred_test, **self.scorer_args[k]) + test_score_cost[k] = time.time() - _start + test_inference_time + test_loss = 1 - test_scores["acc"] + + fidelity = fidelity.get_dictionary() if isinstance(fidelity, CS.Configuration) else fidelity + configuration = configuration.get_dictionary() \ + if isinstance(configuration, CS.Configuration) else configuration + + info = { + 'train_loss': train_loss, + 'val_loss': None, + 'test_loss': test_loss, + 'model_cost': model_fit_time, + 'model_size': model_size, + 'train_scores': train_scores, + 'train_costs': train_score_cost, + 'val_scores': None, + 'val_costs': None, + 'test_scores': test_scores, + 'test_costs': test_score_cost, + 'learning_curves': lcs, + 'learning_curves_cost': lc_time, + # storing as dictionary and not ConfigSpace saves tremendous memory + 'fidelity': fidelity, + 'config': configuration, + } + + return { + 'function_value': float(info['test_loss']), + 'cost': float(model_fit_time + info['test_costs']['acc']), + 'info': info + } + class RandomForestBenchmarkBB(RandomForestBenchmark): """ Black-box version of the RandomForestBenchmark From a8f551f83512e3ab2dffd60323aeff41ff9c415a Mon Sep 17 00:00:00 2001 From: neeratyoy Date: Mon, 3 Jan 2022 15:19:38 +0530 Subject: [PATCH 110/147] Updating MLP bench with LC collection --- hpobench/benchmarks/ml/nn_benchmark.py | 117 ++++++++++++++++++++++++- 1 file changed, 115 insertions(+), 2 deletions(-) diff --git a/hpobench/benchmarks/ml/nn_benchmark.py b/hpobench/benchmarks/ml/nn_benchmark.py index 3855e498..12721c26 100644 --- a/hpobench/benchmarks/ml/nn_benchmark.py +++ b/hpobench/benchmarks/ml/nn_benchmark.py @@ -154,6 +154,34 @@ def _train_objective( get_learning_curve: bool = False, **kwargs ): + """Function that instantiates a 'config' on a 'fidelity' and trains it + + The ML model is instantiated and trained on the training split. Optionally, the model is + evaluated on the training set. Optionally, the learning curves are collected. + + Parameters + ---------- + config : CS.Configuration, Dict + The hyperparameter configuration. + fidelity : CS.Configuration, Dict + The fidelity configuration. + shuffle : bool (optional) + If True, shuffles the training split before fitting the ML model. + rng : np.random.RandomState, int (optional) + The random seed passed to the ML model and if applicable, used for shuffling the data + and subsampling the dataset fraction. + evaluation : str (optional) + If "valid", the ML model is trained on the training set alone. + If "test", the ML model is trained on the training + validation sets. + record_stats : bool (optional) + If True, records the evaluation metrics of the trained ML model on the training set. 
+ This is set to False by default to reduce overall compute time. + get_learning_curve : bool (optional) + If True, records the learning curve using partial_fit or warm starting, if applicable. + This is set to False by default to reduce overall compute time. + Enabling True, implies that the for each iteration, the model will be evaluated on both + the validation and test sets, optionally on the training set also. + """ if rng is not None: rng = get_rng(rng, self.rng) @@ -223,6 +251,7 @@ def _train_objective( lc_time += time.time() - lc_start else: learning_curves = None + lc_time = None start = time.time() model.fit(train_X[train_idx], train_y.iloc[train_idx]) model_fit_time = time.time() - start @@ -244,7 +273,7 @@ def _train_objective( scores[k] = v(train_y, pred_train, **self.scorer_args[k]) score_cost[k] = time.time() - _start + inference_time train_loss = 1 - scores["acc"] - return model, model_fit_time, train_loss, scores, score_cost, learning_curves + return model, model_fit_time, train_loss, scores, score_cost, learning_curves, lc_time def objective_function( self, @@ -278,7 +307,7 @@ def objective_function( If True, records the learning curve using partial_fit or warm starting, if applicable. """ # obtaining model and training statistics - model, model_fit_time, train_loss, train_scores, train_score_cost, lcs = \ + model, model_fit_time, train_loss, train_scores, train_score_cost, lcs, lc_time = \ self._train_objective( configuration, fidelity, shuffle, rng, evaluation="valid", record_stats=record_train, get_learning_curve=get_learning_curve @@ -330,6 +359,7 @@ def objective_function( 'test_scores': test_scores, 'test_costs': test_score_cost, 'learning_curves': lcs, + 'learning_curves_cost': lc_time, # storing as dictionary and not ConfigSpace saves tremendous memory 'fidelity': fidelity, 'config': configuration, @@ -341,6 +371,89 @@ def objective_function( 'info': info } + def objective_function_test( + self, + configuration: Union[CS.Configuration, Dict], + fidelity: Union[CS.Configuration, Dict, None] = None, + shuffle: bool = False, + rng: Union[np.random.RandomState, int, None] = None, + record_train: bool = False, + get_learning_curve: bool = False, + **kwargs + ) -> Dict: + """Function that evaluates a 'config' on a 'fidelity' on the test set + + The ML model is trained on the training+valid split, and evaluated on the test split. + + Parameters + ---------- + configuration : CS.Configuration, Dict + The hyperparameter configuration. + fidelity : CS.Configuration, Dict + The fidelity configuration. + shuffle : bool (optional) + If True, shuffles the training split before fitting the ML model. + rng : np.random.RandomState, int (optional) + The random seed passed to the ML model and if applicable, used for shuffling the data + and subsampling the dataset fraction. + record_train : bool (optional) + If True, records the evaluation metrics of the trained ML model on the training set. + This is set to False by default to reduce overall compute time. + get_learning_curve : bool (optional) + If True, records the learning curve using partial_fit or warm starting, if applicable. + This is set to False by default to reduce overall compute time. 
+ """ + # obtaining model and training statistics + model, model_fit_time, train_loss, train_scores, train_score_cost, lcs, lc_time = \ + self._train_objective( + configuration, fidelity, shuffle, rng, evaluation="test", + record_stats=record_train, get_learning_curve=get_learning_curve + ) + model_size = self.get_model_size(model) + + # model inference on test set + start = time.time() + pred_test = model.predict(self.test_X) + test_inference_time = time.time() - start + test_scores = dict() + test_score_cost = dict() + for k, v in self.scorers.items(): + test_scores[k] = 0.0 + test_score_cost[k] = 0.0 + _start = time.time() + test_scores[k] = v(self.test_y, pred_test, **self.scorer_args[k]) + test_score_cost[k] = time.time() - _start + test_inference_time + test_loss = 1 - test_scores["acc"] + + fidelity = fidelity.get_dictionary() if isinstance(fidelity, CS.Configuration) else fidelity + configuration = configuration.get_dictionary() \ + if isinstance(configuration, CS.Configuration) else configuration + + info = { + 'train_loss': train_loss, + 'val_loss': None, + 'test_loss': test_loss, + 'model_cost': model_fit_time, + 'model_size': model_size, + 'train_scores': train_scores, + 'train_costs': train_score_cost, + 'val_scores': None, + 'val_costs': None, + 'test_scores': test_scores, + 'test_costs': test_score_cost, + 'learning_curves': lcs, + 'learning_curves_cost': lc_time, + # storing as dictionary and not ConfigSpace saves tremendous memory + 'fidelity': fidelity, + 'config': configuration, + } + + return { + 'function_value': float(info['test_loss']), + 'cost': float(model_fit_time + info['test_costs']['acc']), + 'info': info + } + class NNBenchmarkBB(NNBenchmark): """ Black-box version of the NNBenchmark From 0a47c598be613821afe9ddeb5a3f908b69bc8c85 Mon Sep 17 00:00:00 2001 From: neeratyoy Date: Tue, 4 Jan 2022 15:50:08 +0530 Subject: [PATCH 111/147] Adding minor check --- hpobench/dependencies/ml/ml_benchmark_template.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/hpobench/dependencies/ml/ml_benchmark_template.py b/hpobench/dependencies/ml/ml_benchmark_template.py index ce4c3c03..6dfce195 100644 --- a/hpobench/dependencies/ml/ml_benchmark_template.py +++ b/hpobench/dependencies/ml/ml_benchmark_template.py @@ -368,6 +368,10 @@ def objective_function_test( test_score_cost[k] = time.time() - _start + test_inference_time test_loss = 1 - test_scores["acc"] + fidelity = fidelity.get_dictionary() if isinstance(fidelity, CS.Configuration) else fidelity + configuration = configuration.get_dictionary() \ + if isinstance(configuration, CS.Configuration) else configuration + info = { 'train_loss': train_loss, 'val_loss': None, From 20546a7600d1d472751bc487911be64b2fb9640b Mon Sep 17 00:00:00 2001 From: neeratyoy Date: Wed, 5 Jan 2022 15:41:05 +0530 Subject: [PATCH 112/147] Cleaning up seed usage in ML classes --- hpobench/benchmarks/ml/xgboost_benchmark.py | 8 ++++++-- hpobench/dependencies/ml/ml_benchmark_template.py | 10 ---------- 2 files changed, 6 insertions(+), 12 deletions(-) diff --git a/hpobench/benchmarks/ml/xgboost_benchmark.py b/hpobench/benchmarks/ml/xgboost_benchmark.py index 9e4811e5..4f95e4a2 100644 --- a/hpobench/benchmarks/ml/xgboost_benchmark.py +++ b/hpobench/benchmarks/ml/xgboost_benchmark.py @@ -14,6 +14,7 @@ import xgboost as xgb from ConfigSpace.hyperparameters import Hyperparameter +from hpobench.util.rng_helper import get_rng from hpobench.dependencies.ml.ml_benchmark_template import MLBenchmark __version__ = '0.0.2' @@ -95,7 +96,10 @@ def init_model( rng: 
Union[int, np.random.RandomState, None] = None ): # initializing model - rng = rng if isinstance(rng, int) else self.seed + rng = self.rng if rng is None else get_rng(rng) + # xgb.XGBClassifier when trainied using the scikit-learn API of `fit`, requires + # random_state to be an integer and doesn't accept a RandomState + seed = rng.randint(1, 10**6) if isinstance(config, CS.Configuration): config = config.get_dictionary() @@ -105,7 +109,7 @@ def init_model( booster="gbtree", n_estimators=fidelity['n_estimators'], objective="binary:logistic", - random_state=rng, + random_state=seed, subsample=1 ) if self.n_classes > 2: diff --git a/hpobench/dependencies/ml/ml_benchmark_template.py b/hpobench/dependencies/ml/ml_benchmark_template.py index 6dfce195..0b9bc402 100644 --- a/hpobench/dependencies/ml/ml_benchmark_template.py +++ b/hpobench/dependencies/ml/ml_benchmark_template.py @@ -55,12 +55,6 @@ def __init__( """ super(MLBenchmark, self).__init__(rng=rng) - if isinstance(rng, int): - self.seed = rng - else: - self.seed = self.rng.randint(1, 10**6) - self.rng = get_rng(self.seed) - self.global_seed = global_seed # used for fixed training-validation splits self.task_id = task_id @@ -92,10 +86,6 @@ def __init__( self.lower_bound_train_size = dm.lower_bound_train_size self.n_classes = dm.n_classes - # Observation and fidelity spaces - self.fidelity_space = self.get_fidelity_space(self.seed) - self.configuration_space = self.get_configuration_space(self.seed) - @staticmethod def get_configuration_space(seed: Union[int, None] = None) -> CS.ConfigurationSpace: """Parameter space to be optimized --- contains the hyperparameters From bfc42cc0d51024ab0d9a08a06dd2c839e09df2c7 Mon Sep 17 00:00:00 2001 From: neeratyoy Date: Fri, 7 Jan 2022 16:02:39 +0530 Subject: [PATCH 113/147] Cleaning LC collection code for ML benchmarks --- hpobench/benchmarks/ml/lr_benchmark.py | 181 +----------------- hpobench/benchmarks/ml/nn_benchmark.py | 180 +---------------- hpobench/benchmarks/ml/rf_benchmark.py | 181 +----------------- .../dependencies/ml/ml_benchmark_template.py | 45 ++++- 4 files changed, 41 insertions(+), 546 deletions(-) diff --git a/hpobench/benchmarks/ml/lr_benchmark.py b/hpobench/benchmarks/ml/lr_benchmark.py index 86db4195..cd250510 100644 --- a/hpobench/benchmarks/ml/lr_benchmark.py +++ b/hpobench/benchmarks/ml/lr_benchmark.py @@ -211,6 +211,7 @@ def _train_objective( ) # fitting the model with subsampled data if get_learning_curve: + # IMPORTANT to allow partial_fit model.warm_start = True lc_time = 0.0 model_fit_time = 0.0 @@ -268,186 +269,6 @@ def _train_objective( train_loss = 1 - scores["acc"] return model, model_fit_time, train_loss, scores, score_cost, learning_curves, lc_time - def objective_function( - self, - configuration: Union[CS.Configuration, Dict], - fidelity: Union[CS.Configuration, Dict, None] = None, - shuffle: bool = False, - rng: Union[np.random.RandomState, int, None] = None, - record_train: bool = False, - get_learning_curve: bool = False, - **kwargs - ) -> Dict: - """Function that evaluates a 'config' on a 'fidelity' on the validation set - - The ML model is trained on the training split, and evaluated on the valid and test splits. - - Parameters - ---------- - configuration : CS.Configuration, Dict - The hyperparameter configuration. - fidelity : CS.Configuration, Dict - The fidelity configuration. - shuffle : bool (optional) - If True, shuffles the training split before fitting the ML model. 
- rng : np.random.RandomState, int (optional) - The random seed passed to the ML model and if applicable, used for shuffling the data - and subsampling the dataset fraction. - record_train : bool (optional) - If True, records the evaluation metrics of the trained ML model on the training set. - This is set to False by default to reduce overall compute time. - get_learning_curve : bool (optional) - If True, records the learning curve using partial_fit or warm starting, if applicable. - This is set to False by default to reduce overall compute time. - """ - # obtaining model and training statistics - model, model_fit_time, train_loss, train_scores, train_score_cost, lcs, lc_time = \ - self._train_objective( - configuration, fidelity, shuffle, rng, evaluation="valid", - record_stats=record_train, get_learning_curve=get_learning_curve - ) - model_size = self.get_model_size(model) - - # model inference on validation set - start = time.time() - pred_val = model.predict(self.valid_X) - val_inference_time = time.time() - start - val_scores = dict() - val_score_cost = dict() - for k, v in self.scorers.items(): - val_scores[k] = 0.0 - val_score_cost[k] = 0.0 - _start = time.time() - val_scores[k] = v(self.valid_y, pred_val, **self.scorer_args[k]) - val_score_cost[k] = time.time() - _start + val_inference_time - val_loss = 1 - val_scores["acc"] - - # model inference on test set - start = time.time() - pred_test = model.predict(self.test_X) - test_inference_time = time.time() - start - test_scores = dict() - test_score_cost = dict() - for k, v in self.scorers.items(): - test_scores[k] = 0.0 - test_score_cost[k] = 0.0 - _start = time.time() - test_scores[k] = v(self.test_y, pred_test, **self.scorer_args[k]) - test_score_cost[k] = time.time() - _start + test_inference_time - test_loss = 1 - test_scores["acc"] - - fidelity = fidelity.get_dictionary() if isinstance(fidelity, CS.Configuration) else fidelity - configuration = configuration.get_dictionary() \ - if isinstance(configuration, CS.Configuration) else configuration - - info = { - 'train_loss': train_loss, - 'val_loss': val_loss, - 'test_loss': test_loss, - 'model_cost': model_fit_time, - 'model_size': model_size, - 'train_scores': train_scores, - 'train_costs': train_score_cost, - 'val_scores': val_scores, - 'val_costs': val_score_cost, - 'test_scores': test_scores, - 'test_costs': test_score_cost, - 'learning_curves': lcs, - 'learning_curves_cost': lc_time, - # storing as dictionary and not ConfigSpace saves tremendous memory - 'fidelity': fidelity, - 'config': configuration, - } - - return { - 'function_value': float(info['val_loss']), - 'cost': float(model_fit_time + info['val_costs']['acc']), - 'info': info - } - - def objective_function_test( - self, - configuration: Union[CS.Configuration, Dict], - fidelity: Union[CS.Configuration, Dict, None] = None, - shuffle: bool = False, - rng: Union[np.random.RandomState, int, None] = None, - record_train: bool = False, - get_learning_curve: bool = False, - **kwargs - ) -> Dict: - """Function that evaluates a 'config' on a 'fidelity' on the test set - - The ML model is trained on the training+valid split, and evaluated on the test split. - - Parameters - ---------- - configuration : CS.Configuration, Dict - The hyperparameter configuration. - fidelity : CS.Configuration, Dict - The fidelity configuration. - shuffle : bool (optional) - If True, shuffles the training split before fitting the ML model. 
- rng : np.random.RandomState, int (optional) - The random seed passed to the ML model and if applicable, used for shuffling the data - and subsampling the dataset fraction. - record_train : bool (optional) - If True, records the evaluation metrics of the trained ML model on the training set. - This is set to False by default to reduce overall compute time. - get_learning_curve : bool (optional) - If True, records the learning curve using partial_fit or warm starting, if applicable. - This is set to False by default to reduce overall compute time. - """ - # obtaining model and training statistics - model, model_fit_time, train_loss, train_scores, train_score_cost, lcs, lc_time = \ - self._train_objective( - configuration, fidelity, shuffle, rng, evaluation="test", - record_stats=record_train, get_learning_curve=get_learning_curve - ) - model_size = self.get_model_size(model) - - # model inference on test set - start = time.time() - pred_test = model.predict(self.test_X) - test_inference_time = time.time() - start - test_scores = dict() - test_score_cost = dict() - for k, v in self.scorers.items(): - test_scores[k] = 0.0 - test_score_cost[k] = 0.0 - _start = time.time() - test_scores[k] = v(self.test_y, pred_test, **self.scorer_args[k]) - test_score_cost[k] = time.time() - _start + test_inference_time - test_loss = 1 - test_scores["acc"] - - fidelity = fidelity.get_dictionary() if isinstance(fidelity, CS.Configuration) else fidelity - configuration = configuration.get_dictionary() \ - if isinstance(configuration, CS.Configuration) else configuration - - info = { - 'train_loss': train_loss, - 'val_loss': None, - 'test_loss': test_loss, - 'model_cost': model_fit_time, - 'model_size': model_size, - 'train_scores': train_scores, - 'train_costs': train_score_cost, - 'val_scores': None, - 'val_costs': None, - 'test_scores': test_scores, - 'test_costs': test_score_cost, - 'learning_curves': lcs, - 'learning_curves_cost': lc_time, - # storing as dictionary and not ConfigSpace saves tremendous memory - 'fidelity': fidelity, - 'config': configuration, - } - - return { - 'function_value': float(info['test_loss']), - 'cost': float(model_fit_time + info['test_costs']['acc']), - 'info': info - } - class LRBenchmarkBB(LRBenchmark): """ Black-box version of the LRBenchmark diff --git a/hpobench/benchmarks/ml/nn_benchmark.py b/hpobench/benchmarks/ml/nn_benchmark.py index 12721c26..bdd731e1 100644 --- a/hpobench/benchmarks/ml/nn_benchmark.py +++ b/hpobench/benchmarks/ml/nn_benchmark.py @@ -221,6 +221,7 @@ def _train_objective( ) # fitting the model with subsampled data if get_learning_curve: + # IMPORTANT to allow partial_fit model.warm_start = True lc_time = 0.0 model_fit_time = 0.0 @@ -275,185 +276,6 @@ def _train_objective( train_loss = 1 - scores["acc"] return model, model_fit_time, train_loss, scores, score_cost, learning_curves, lc_time - def objective_function( - self, - configuration: Union[CS.Configuration, Dict], - fidelity: Union[CS.Configuration, Dict, None] = None, - shuffle: bool = False, - rng: Union[np.random.RandomState, int, None] = None, - record_train: bool = False, - get_learning_curve: bool = False, - **kwargs - ) -> Dict: - """Function that evaluates a 'config' on a 'fidelity' on the validation set - - The ML model is trained on the training split, and evaluated on the valid and test splits. - - Parameters - ---------- - configuration : CS.Configuration, Dict - The hyperparameter configuration. - fidelity : CS.Configuration, Dict - The fidelity configuration. 
- shuffle : bool (optional) - If True, shuffles the training split before fitting the ML model. - rng : np.random.RandomState, int (optional) - The random seed passed to the ML model and if applicable, used for shuffling the data - and subsampling the dataset fraction. - record_train : bool (optional) - If True, records the evaluation metrics of the trained ML model on the training set. - This is set to False by default to reduce overall compute time. - get_learning_curve : bool (optional) - If True, records the learning curve using partial_fit or warm starting, if applicable. - """ - # obtaining model and training statistics - model, model_fit_time, train_loss, train_scores, train_score_cost, lcs, lc_time = \ - self._train_objective( - configuration, fidelity, shuffle, rng, evaluation="valid", - record_stats=record_train, get_learning_curve=get_learning_curve - ) - model_size = self.get_model_size(model) - - # model inference on validation set - start = time.time() - pred_val = model.predict(self.valid_X) - val_inference_time = time.time() - start - val_scores = dict() - val_score_cost = dict() - for k, v in self.scorers.items(): - val_scores[k] = 0.0 - val_score_cost[k] = 0.0 - _start = time.time() - val_scores[k] = v(self.valid_y, pred_val, **self.scorer_args[k]) - val_score_cost[k] = time.time() - _start + val_inference_time - val_loss = 1 - val_scores["acc"] - - # model inference on test set - start = time.time() - pred_test = model.predict(self.test_X) - test_inference_time = time.time() - start - test_scores = dict() - test_score_cost = dict() - for k, v in self.scorers.items(): - test_scores[k] = 0.0 - test_score_cost[k] = 0.0 - _start = time.time() - test_scores[k] = v(self.test_y, pred_test, **self.scorer_args[k]) - test_score_cost[k] = time.time() - _start + test_inference_time - test_loss = 1 - test_scores["acc"] - - fidelity = fidelity.get_dictionary() if isinstance(fidelity, CS.Configuration) else fidelity - configuration = configuration.get_dictionary() \ - if isinstance(configuration, CS.Configuration) else configuration - - info = { - 'train_loss': train_loss, - 'val_loss': val_loss, - 'test_loss': test_loss, - 'model_cost': model_fit_time, - 'model_size': model_size, - 'train_scores': train_scores, - 'train_costs': train_score_cost, - 'val_scores': val_scores, - 'val_costs': val_score_cost, - 'test_scores': test_scores, - 'test_costs': test_score_cost, - 'learning_curves': lcs, - 'learning_curves_cost': lc_time, - # storing as dictionary and not ConfigSpace saves tremendous memory - 'fidelity': fidelity, - 'config': configuration, - } - - return { - 'function_value': float(info['val_loss']), - 'cost': float(model_fit_time + info['val_costs']['acc']), - 'info': info - } - - def objective_function_test( - self, - configuration: Union[CS.Configuration, Dict], - fidelity: Union[CS.Configuration, Dict, None] = None, - shuffle: bool = False, - rng: Union[np.random.RandomState, int, None] = None, - record_train: bool = False, - get_learning_curve: bool = False, - **kwargs - ) -> Dict: - """Function that evaluates a 'config' on a 'fidelity' on the test set - - The ML model is trained on the training+valid split, and evaluated on the test split. - - Parameters - ---------- - configuration : CS.Configuration, Dict - The hyperparameter configuration. - fidelity : CS.Configuration, Dict - The fidelity configuration. - shuffle : bool (optional) - If True, shuffles the training split before fitting the ML model. 
- rng : np.random.RandomState, int (optional) - The random seed passed to the ML model and if applicable, used for shuffling the data - and subsampling the dataset fraction. - record_train : bool (optional) - If True, records the evaluation metrics of the trained ML model on the training set. - This is set to False by default to reduce overall compute time. - get_learning_curve : bool (optional) - If True, records the learning curve using partial_fit or warm starting, if applicable. - This is set to False by default to reduce overall compute time. - """ - # obtaining model and training statistics - model, model_fit_time, train_loss, train_scores, train_score_cost, lcs, lc_time = \ - self._train_objective( - configuration, fidelity, shuffle, rng, evaluation="test", - record_stats=record_train, get_learning_curve=get_learning_curve - ) - model_size = self.get_model_size(model) - - # model inference on test set - start = time.time() - pred_test = model.predict(self.test_X) - test_inference_time = time.time() - start - test_scores = dict() - test_score_cost = dict() - for k, v in self.scorers.items(): - test_scores[k] = 0.0 - test_score_cost[k] = 0.0 - _start = time.time() - test_scores[k] = v(self.test_y, pred_test, **self.scorer_args[k]) - test_score_cost[k] = time.time() - _start + test_inference_time - test_loss = 1 - test_scores["acc"] - - fidelity = fidelity.get_dictionary() if isinstance(fidelity, CS.Configuration) else fidelity - configuration = configuration.get_dictionary() \ - if isinstance(configuration, CS.Configuration) else configuration - - info = { - 'train_loss': train_loss, - 'val_loss': None, - 'test_loss': test_loss, - 'model_cost': model_fit_time, - 'model_size': model_size, - 'train_scores': train_scores, - 'train_costs': train_score_cost, - 'val_scores': None, - 'val_costs': None, - 'test_scores': test_scores, - 'test_costs': test_score_cost, - 'learning_curves': lcs, - 'learning_curves_cost': lc_time, - # storing as dictionary and not ConfigSpace saves tremendous memory - 'fidelity': fidelity, - 'config': configuration, - } - - return { - 'function_value': float(info['test_loss']), - 'cost': float(model_fit_time + info['test_costs']['acc']), - 'info': info - } - class NNBenchmarkBB(NNBenchmark): """ Black-box version of the NNBenchmark diff --git a/hpobench/benchmarks/ml/rf_benchmark.py b/hpobench/benchmarks/ml/rf_benchmark.py index ab945c3c..6ed97e5f 100644 --- a/hpobench/benchmarks/ml/rf_benchmark.py +++ b/hpobench/benchmarks/ml/rf_benchmark.py @@ -215,6 +215,7 @@ def _train_objective( ) # fitting the model with subsampled data if get_learning_curve: + # IMPORTANT to allow partial_fit model.warm_start = True model.n_estimators = 0 lc_time = 0.0 @@ -269,186 +270,6 @@ def _train_objective( train_loss = 1 - scores["acc"] return model, model_fit_time, train_loss, scores, score_cost, learning_curves, lc_time - def objective_function( - self, - configuration: Union[CS.Configuration, Dict], - fidelity: Union[CS.Configuration, Dict, None] = None, - shuffle: bool = False, - rng: Union[np.random.RandomState, int, None] = None, - record_train: bool = False, - get_learning_curve: bool = False, - **kwargs - ) -> Dict: - """Function that evaluates a 'config' on a 'fidelity' on the validation set - - The ML model is trained on the training split, and evaluated on the valid and test splits. - - Parameters - ---------- - configuration : CS.Configuration, Dict - The hyperparameter configuration. - fidelity : CS.Configuration, Dict - The fidelity configuration. 
- shuffle : bool (optional) - If True, shuffles the training split before fitting the ML model. - rng : np.random.RandomState, int (optional) - The random seed passed to the ML model and if applicable, used for shuffling the data - and subsampling the dataset fraction. - record_train : bool (optional) - If True, records the evaluation metrics of the trained ML model on the training set. - This is set to False by default to reduce overall compute time. - get_learning_curve : bool (optional) - If True, records the learning curve using partial_fit or warm starting, if applicable. - This is set to False by default to reduce overall compute time. - """ - # obtaining model and training statistics - model, model_fit_time, train_loss, train_scores, train_score_cost, lcs, lc_time = \ - self._train_objective( - configuration, fidelity, shuffle, rng, evaluation="valid", - record_stats=record_train, get_learning_curve=get_learning_curve - ) - model_size = self.get_model_size(model) - - # model inference on validation set - start = time.time() - pred_val = model.predict(self.valid_X) - val_inference_time = time.time() - start - val_scores = dict() - val_score_cost = dict() - for k, v in self.scorers.items(): - val_scores[k] = 0.0 - val_score_cost[k] = 0.0 - _start = time.time() - val_scores[k] = v(self.valid_y, pred_val, **self.scorer_args[k]) - val_score_cost[k] = time.time() - _start + val_inference_time - val_loss = 1 - val_scores["acc"] - - # model inference on test set - start = time.time() - pred_test = model.predict(self.test_X) - test_inference_time = time.time() - start - test_scores = dict() - test_score_cost = dict() - for k, v in self.scorers.items(): - test_scores[k] = 0.0 - test_score_cost[k] = 0.0 - _start = time.time() - test_scores[k] = v(self.test_y, pred_test, **self.scorer_args[k]) - test_score_cost[k] = time.time() - _start + test_inference_time - test_loss = 1 - test_scores["acc"] - - fidelity = fidelity.get_dictionary() if isinstance(fidelity, CS.Configuration) else fidelity - configuration = configuration.get_dictionary() \ - if isinstance(configuration, CS.Configuration) else configuration - - info = { - 'train_loss': train_loss, - 'val_loss': val_loss, - 'test_loss': test_loss, - 'model_cost': model_fit_time, - 'model_size': model_size, - 'train_scores': train_scores, - 'train_costs': train_score_cost, - 'val_scores': val_scores, - 'val_costs': val_score_cost, - 'test_scores': test_scores, - 'test_costs': test_score_cost, - 'learning_curves': lcs, - 'learning_curves_cost': lc_time, - # storing as dictionary and not ConfigSpace saves tremendous memory - 'fidelity': fidelity, - 'config': configuration, - } - - return { - 'function_value': float(info['val_loss']), - 'cost': float(model_fit_time + info['val_costs']['acc']), - 'info': info - } - - def objective_function_test( - self, - configuration: Union[CS.Configuration, Dict], - fidelity: Union[CS.Configuration, Dict, None] = None, - shuffle: bool = False, - rng: Union[np.random.RandomState, int, None] = None, - record_train: bool = False, - get_learning_curve: bool = False, - **kwargs - ) -> Dict: - """Function that evaluates a 'config' on a 'fidelity' on the test set - - The ML model is trained on the training+valid split, and evaluated on the test split. - - Parameters - ---------- - configuration : CS.Configuration, Dict - The hyperparameter configuration. - fidelity : CS.Configuration, Dict - The fidelity configuration. - shuffle : bool (optional) - If True, shuffles the training split before fitting the ML model. 
- rng : np.random.RandomState, int (optional) - The random seed passed to the ML model and if applicable, used for shuffling the data - and subsampling the dataset fraction. - record_train : bool (optional) - If True, records the evaluation metrics of the trained ML model on the training set. - This is set to False by default to reduce overall compute time. - get_learning_curve : bool (optional) - If True, records the learning curve using partial_fit or warm starting, if applicable. - This is set to False by default to reduce overall compute time. - """ - # obtaining model and training statistics - model, model_fit_time, train_loss, train_scores, train_score_cost, lcs, lc_time = \ - self._train_objective( - configuration, fidelity, shuffle, rng, evaluation="test", - record_stats=record_train, get_learning_curve=get_learning_curve - ) - model_size = self.get_model_size(model) - - # model inference on test set - start = time.time() - pred_test = model.predict(self.test_X) - test_inference_time = time.time() - start - test_scores = dict() - test_score_cost = dict() - for k, v in self.scorers.items(): - test_scores[k] = 0.0 - test_score_cost[k] = 0.0 - _start = time.time() - test_scores[k] = v(self.test_y, pred_test, **self.scorer_args[k]) - test_score_cost[k] = time.time() - _start + test_inference_time - test_loss = 1 - test_scores["acc"] - - fidelity = fidelity.get_dictionary() if isinstance(fidelity, CS.Configuration) else fidelity - configuration = configuration.get_dictionary() \ - if isinstance(configuration, CS.Configuration) else configuration - - info = { - 'train_loss': train_loss, - 'val_loss': None, - 'test_loss': test_loss, - 'model_cost': model_fit_time, - 'model_size': model_size, - 'train_scores': train_scores, - 'train_costs': train_score_cost, - 'val_scores': None, - 'val_costs': None, - 'test_scores': test_scores, - 'test_costs': test_score_cost, - 'learning_curves': lcs, - 'learning_curves_cost': lc_time, - # storing as dictionary and not ConfigSpace saves tremendous memory - 'fidelity': fidelity, - 'config': configuration, - } - - return { - 'function_value': float(info['test_loss']), - 'cost': float(model_fit_time + info['test_costs']['acc']), - 'info': info - } - class RandomForestBenchmarkBB(RandomForestBenchmark): """ Black-box version of the RandomForestBenchmark diff --git a/hpobench/dependencies/ml/ml_benchmark_template.py b/hpobench/dependencies/ml/ml_benchmark_template.py index 0fd1fcc5..9a75f4cd 100644 --- a/hpobench/dependencies/ml/ml_benchmark_template.py +++ b/hpobench/dependencies/ml/ml_benchmark_template.py @@ -165,6 +165,7 @@ def _train_objective( rng: Union[np.random.RandomState, int, None] = None, evaluation: Union[str, None] = "valid", record_stats: bool = False, + get_learning_curve: bool = False, **kwargs ): """Function that instantiates a 'config' on a 'fidelity' and trains it @@ -189,7 +190,17 @@ def _train_objective( record_stats : bool (optional) If True, records the evaluation metrics of the trained ML model on the training set. This is set to False by default to reduce overall compute time. + get_learning_curve : bool (optional) + If True, records the learning curve using partial_fit or warm starting, if applicable. + This is set to False by default to reduce overall compute time. 
""" + if get_learning_curve: + raise NotImplementedError( + "Need to implement partial or intermediate training to record Learning curves" + ) + learning_curves = None + lc_time = None + if rng is not None: rng = get_rng(rng, self.rng) @@ -251,7 +262,7 @@ def _train_objective( scores[k] = v(train_y, pred_train, **self.scorer_args[k]) score_cost[k] = time.time() - _start + inference_time train_loss = 1 - scores["acc"] - return model, model_fit_time, train_loss, scores, score_cost + return model, model_fit_time, train_loss, scores, score_cost, learning_curves, lc_time # pylint: disable=arguments-differ @AbstractBenchmark.check_parameters @@ -262,6 +273,7 @@ def objective_function( shuffle: bool = False, rng: Union[np.random.RandomState, int, None] = None, record_train: bool = False, + get_learning_curve: bool = False, **kwargs ) -> Dict: """Function that evaluates a 'config' on a 'fidelity' on the validation set @@ -282,11 +294,16 @@ def objective_function( record_train : bool (optional) If True, records the evaluation metrics of the trained ML model on the training set. This is set to False by default to reduce overall compute time. + get_learning_curve : bool (optional) + If True, records the learning curve using partial_fit or warm starting, if applicable. + This is set to False by default to reduce overall compute time. """ # obtaining model and training statistics - model, model_fit_time, train_loss, train_scores, train_score_cost = self._train_objective( - configuration, fidelity, shuffle, rng, evaluation="valid", record_stats=record_train - ) + model, model_fit_time, train_loss, train_scores, train_score_cost, lcs, lc_time = \ + self._train_objective( + configuration, fidelity, shuffle, rng, evaluation="valid", + record_stats=record_train, get_learning_curve=get_learning_curve + ) model_size = self.get_model_size(model) # model inference on validation set @@ -333,6 +350,8 @@ def objective_function( 'val_costs': val_score_cost, 'test_scores': test_scores, 'test_costs': test_score_cost, + 'learning_curves': lcs, + 'learning_curves_cost': lc_time, # storing as dictionary and not ConfigSpace saves tremendous memory 'fidelity': fidelity, 'config': configuration, @@ -353,6 +372,7 @@ def objective_function_test( shuffle: bool = False, rng: Union[np.random.RandomState, int, None] = None, record_train: bool = False, + get_learning_curve: bool = False, **kwargs ) -> Dict: """Function that evaluates a 'config' on a 'fidelity' on the test set @@ -373,11 +393,16 @@ def objective_function_test( record_train : bool (optional) If True, records the evaluation metrics of the trained ML model on the training set. This is set to False by default to reduce overall compute time. + get_learning_curve : bool (optional) + If True, records the learning curve using partial_fit or warm starting, if applicable. + This is set to False by default to reduce overall compute time. 
""" # obtaining model and training statistics - model, model_fit_time, train_loss, train_scores, train_score_cost = self._train_objective( - configuration, fidelity, shuffle, rng, evaluation="test", record_stats=record_train - ) + model, model_fit_time, train_loss, train_scores, train_score_cost, lcs, lc_time = \ + self._train_objective( + configuration, fidelity, shuffle, rng, evaluation="test", + record_stats=record_train, get_learning_curve=get_learning_curve + ) model_size = self.get_model_size(model) # model inference on test set @@ -394,6 +419,10 @@ def objective_function_test( test_score_cost[k] = time.time() - _start + test_inference_time test_loss = 1 - test_scores["acc"] + fidelity = fidelity.get_dictionary() if isinstance(fidelity, CS.Configuration) else fidelity + configuration = configuration.get_dictionary() \ + if isinstance(configuration, CS.Configuration) else configuration + info = { 'train_loss': train_loss, 'val_loss': None, @@ -406,6 +435,8 @@ def objective_function_test( 'val_costs': None, 'test_scores': test_scores, 'test_costs': test_score_cost, + 'learning_curves': lcs, + 'learning_curves_cost': lc_time, # storing as dictionary and not ConfigSpace saves tremendous memory 'fidelity': fidelity, 'config': configuration, From 9734da1d54efb6163d4b3cc5823dc019b4e4e602 Mon Sep 17 00:00:00 2001 From: neeratyoy Date: Fri, 7 Jan 2022 16:35:58 +0530 Subject: [PATCH 114/147] Making LCs branch primary for experiments --- hpobench/benchmarks/ml/lr_benchmark.py | 5 +++-- hpobench/benchmarks/ml/nn_benchmark.py | 4 +++- hpobench/benchmarks/ml/rf_benchmark.py | 4 +++- hpobench/benchmarks/ml/svm_benchmark.py | 4 +++- hpobench/benchmarks/ml/xgboost_benchmark.py | 5 ++++- 5 files changed, 16 insertions(+), 6 deletions(-) diff --git a/hpobench/benchmarks/ml/lr_benchmark.py b/hpobench/benchmarks/ml/lr_benchmark.py index cd250510..a054aee0 100644 --- a/hpobench/benchmarks/ml/lr_benchmark.py +++ b/hpobench/benchmarks/ml/lr_benchmark.py @@ -6,9 +6,10 @@ * First implementation of the LR Benchmarks. 0.0.2: * Restructuring for consistency and to match ML Benchmark Template updates. +0.0.3: +* Adding Learning Curve support. """ - import time from typing import Union, Tuple, Dict @@ -21,7 +22,7 @@ from hpobench.util.rng_helper import get_rng from hpobench.dependencies.ml.ml_benchmark_template import MLBenchmark -__version__ = '0.0.2' +__version__ = '0.0.3' class LRBenchmark(MLBenchmark): diff --git a/hpobench/benchmarks/ml/nn_benchmark.py b/hpobench/benchmarks/ml/nn_benchmark.py index bdd731e1..59f68b8c 100644 --- a/hpobench/benchmarks/ml/nn_benchmark.py +++ b/hpobench/benchmarks/ml/nn_benchmark.py @@ -6,6 +6,8 @@ * First implementation of the NN Benchmarks. 0.0.2: * Restructuring for consistency and to match ML Benchmark Template updates. +0.0.3: +* Adding Learning Curve support. """ import time @@ -21,7 +23,7 @@ from hpobench.util.rng_helper import get_rng from hpobench.dependencies.ml.ml_benchmark_template import MLBenchmark -__version__ = '0.0.2' +__version__ = '0.0.3' class NNBenchmark(MLBenchmark): diff --git a/hpobench/benchmarks/ml/rf_benchmark.py b/hpobench/benchmarks/ml/rf_benchmark.py index 6ed97e5f..a5b01624 100644 --- a/hpobench/benchmarks/ml/rf_benchmark.py +++ b/hpobench/benchmarks/ml/rf_benchmark.py @@ -6,6 +6,8 @@ * First implementation of the RF Benchmarks. 0.0.2: * Restructuring for consistency and to match ML Benchmark Template updates. +0.0.3: +* Adding Learning Curve support. 
""" import time @@ -21,7 +23,7 @@ from hpobench.util.rng_helper import get_rng from hpobench.dependencies.ml.ml_benchmark_template import MLBenchmark -__version__ = '0.0.2' +__version__ = '0.0.3' class RandomForestBenchmark(MLBenchmark): diff --git a/hpobench/benchmarks/ml/svm_benchmark.py b/hpobench/benchmarks/ml/svm_benchmark.py index 96f3b6c9..dcc56587 100644 --- a/hpobench/benchmarks/ml/svm_benchmark.py +++ b/hpobench/benchmarks/ml/svm_benchmark.py @@ -6,6 +6,8 @@ * First implementation of the new SVM Benchmarks. 0.0.2: * Restructuring for consistency and to match ML Benchmark Template updates. +0.0.3: +* Adding Learning Curve support. """ from typing import Union, Dict @@ -17,7 +19,7 @@ from hpobench.dependencies.ml.ml_benchmark_template import MLBenchmark -__version__ = '0.0.2' +__version__ = '0.0.3' class SVMBenchmark(MLBenchmark): diff --git a/hpobench/benchmarks/ml/xgboost_benchmark.py b/hpobench/benchmarks/ml/xgboost_benchmark.py index 4f95e4a2..d4f287ed 100644 --- a/hpobench/benchmarks/ml/xgboost_benchmark.py +++ b/hpobench/benchmarks/ml/xgboost_benchmark.py @@ -6,7 +6,10 @@ * First implementation of the new XGB Benchmarks. 0.0.2: * Restructuring for consistency and to match ML Benchmark Template updates. +0.0.3: +* Adding Learning Curve support. """ + from typing import Union, Tuple, Dict import ConfigSpace as CS @@ -17,7 +20,7 @@ from hpobench.util.rng_helper import get_rng from hpobench.dependencies.ml.ml_benchmark_template import MLBenchmark -__version__ = '0.0.2' +__version__ = '0.0.3' class XGBoostBenchmark(MLBenchmark): From 0b4dc1f6ecb9a21b4d88667f5ed5d88767e6f464 Mon Sep 17 00:00:00 2001 From: neeratyoy Date: Sun, 9 Jan 2022 13:41:49 +0530 Subject: [PATCH 115/147] Adding option to record LC every k iterations --- .../dependencies/ml/ml_benchmark_template.py | 32 ++++++++++++++++--- 1 file changed, 28 insertions(+), 4 deletions(-) diff --git a/hpobench/dependencies/ml/ml_benchmark_template.py b/hpobench/dependencies/ml/ml_benchmark_template.py index c6ccf797..359a5eee 100644 --- a/hpobench/dependencies/ml/ml_benchmark_template.py +++ b/hpobench/dependencies/ml/ml_benchmark_template.py @@ -147,6 +147,19 @@ def shuffle_data_idx( rng.shuffle(train_idx) return train_idx + def _get_lc_spacing(self, max_iter, k): + """ Creates an integer sequence to record Learning Curves for every k iteration. + + Designed to include the maximum iteration. A k-spaced iteration sequence may not include + the endpoint implicitly. + """ + assert k > 0, "Spacing needs to be at >=1" + spacing = np.arange(0, max_iter + 1, step=k).tolist() + spacing = spacing[1:] # eliminating 0 + if spacing[-1] != max_iter: + spacing.append(max_iter) + return spacing + def _train_objective( self, config: Dict, @@ -156,6 +169,7 @@ def _train_objective( evaluation: Union[str, None] = "valid", record_stats: bool = False, get_learning_curve: bool = False, + lc_every_k: int = 1, **kwargs ): """Function that instantiates a 'config' on a 'fidelity' and trains it @@ -183,6 +197,8 @@ def _train_objective( get_learning_curve : bool (optional) If True, records the learning curve using partial_fit or warm starting, if applicable. This is set to False by default to reduce overall compute time. + lc_every_k : int (optional) + If True, records the learning curve after every k iterations. 
""" if get_learning_curve: raise NotImplementedError( @@ -264,6 +280,7 @@ def objective_function( rng: Union[np.random.RandomState, int, None] = None, record_train: bool = False, get_learning_curve: bool = False, + lc_every_k: int = 1, **kwargs ) -> Dict: """Function that evaluates a 'config' on a 'fidelity' on the validation set @@ -287,12 +304,15 @@ def objective_function( get_learning_curve : bool (optional) If True, records the learning curve using partial_fit or warm starting, if applicable. This is set to False by default to reduce overall compute time. + lc_every_k : int (optional) + If True, records the learning curve after every k iterations. """ # obtaining model and training statistics model, model_fit_time, train_loss, train_scores, train_score_cost, lcs, lc_time = \ self._train_objective( - configuration, fidelity, shuffle, rng, evaluation="valid", - record_stats=record_train, get_learning_curve=get_learning_curve + configuration, fidelity, shuffle, rng, + evaluation="valid", record_stats=record_train, + get_learning_curve=get_learning_curve, lc_every_k=lc_every_k ) model_size = self.get_model_size(model) @@ -363,6 +383,7 @@ def objective_function_test( rng: Union[np.random.RandomState, int, None] = None, record_train: bool = False, get_learning_curve: bool = False, + lc_every_k: int = 1, **kwargs ) -> Dict: """Function that evaluates a 'config' on a 'fidelity' on the test set @@ -386,12 +407,15 @@ def objective_function_test( get_learning_curve : bool (optional) If True, records the learning curve using partial_fit or warm starting, if applicable. This is set to False by default to reduce overall compute time. + lc_every_k : int (optional) + If True, records the learning curve after every k iterations. """ # obtaining model and training statistics model, model_fit_time, train_loss, train_scores, train_score_cost, lcs, lc_time = \ self._train_objective( - configuration, fidelity, shuffle, rng, evaluation="test", - record_stats=record_train, get_learning_curve=get_learning_curve + configuration, fidelity, shuffle, rng, + evaluation="test", record_stats=record_train, + get_learning_curve=get_learning_curve, lc_every_k=lc_every_k ) model_size = self.get_model_size(model) From 46addaaff4b8c35fbd217ee1beb4dbe96f0a6f16 Mon Sep 17 00:00:00 2001 From: neeratyoy Date: Sun, 9 Jan 2022 14:22:32 +0530 Subject: [PATCH 116/147] Updating LR to collect LC every k iterations --- hpobench/benchmarks/ml/lr_benchmark.py | 19 +++++++++++++++---- .../dependencies/ml/ml_benchmark_template.py | 2 ++ 2 files changed, 17 insertions(+), 4 deletions(-) diff --git a/hpobench/benchmarks/ml/lr_benchmark.py b/hpobench/benchmarks/ml/lr_benchmark.py index a054aee0..6bd7c214 100644 --- a/hpobench/benchmarks/ml/lr_benchmark.py +++ b/hpobench/benchmarks/ml/lr_benchmark.py @@ -141,6 +141,7 @@ def _train_objective( evaluation: Union[str, None] = "valid", record_stats: bool = False, get_learning_curve: bool = False, + lc_every_k: int = 1, **kwargs ): """Function that instantiates a 'config' on a 'fidelity' and trains it @@ -170,6 +171,8 @@ def _train_objective( This is set to False by default to reduce overall compute time. Enabling True, implies that the for each iteration, the model will be evaluated on both the validation and test sets, optionally on the training set also. + lc_every_k : int (optional) + If True, records the learning curve after every k iterations. 
""" if rng is not None: rng = get_rng(rng, self.rng) @@ -217,13 +220,21 @@ def _train_objective( lc_time = 0.0 model_fit_time = 0.0 learning_curves = dict(train=[], valid=[], test=[]) - for i in range(model.max_iter): + lc_spacings = self._get_lc_spacing(model.max_iter, lc_every_k) + iter_start = 0 + for i in range(len(lc_spacings)): + iter_end = lc_spacings[i] start = time.time() - model.partial_fit( - train_X[train_idx], train_y.iloc[train_idx], np.unique(train_y.iloc[train_idx]) - ) + # trains model for k steps + for j in range(iter_end - iter_start): + model.partial_fit( + train_X[train_idx], + train_y.iloc[train_idx], + np.unique(train_y.iloc[train_idx]) + ) # adding all partial fit times model_fit_time += time.time() - start + iter_start = iter_end lc_start = time.time() if record_stats: train_pred = model.predict(train_X) diff --git a/hpobench/dependencies/ml/ml_benchmark_template.py b/hpobench/dependencies/ml/ml_benchmark_template.py index 359a5eee..b8ac116c 100644 --- a/hpobench/dependencies/ml/ml_benchmark_template.py +++ b/hpobench/dependencies/ml/ml_benchmark_template.py @@ -362,6 +362,7 @@ def objective_function( 'test_costs': test_score_cost, 'learning_curves': lcs, 'learning_curves_cost': lc_time, + 'learning_curves_spacing': lc_every_k, # storing as dictionary and not ConfigSpace saves tremendous memory 'fidelity': fidelity, 'config': configuration, @@ -451,6 +452,7 @@ def objective_function_test( 'test_costs': test_score_cost, 'learning_curves': lcs, 'learning_curves_cost': lc_time, + 'learning_curves_spacing': lc_every_k, # storing as dictionary and not ConfigSpace saves tremendous memory 'fidelity': fidelity, 'config': configuration, From e2033df41b27c22bf56cb87bfbdb6be5e243e2fd Mon Sep 17 00:00:00 2001 From: neeratyoy Date: Sun, 9 Jan 2022 15:48:37 +0530 Subject: [PATCH 117/147] Updating RF to collect LC every k iterations --- hpobench/benchmarks/ml/rf_benchmark.py | 14 +++++++++++--- hpobench/dependencies/ml/ml_benchmark_template.py | 1 + 2 files changed, 12 insertions(+), 3 deletions(-) diff --git a/hpobench/benchmarks/ml/rf_benchmark.py b/hpobench/benchmarks/ml/rf_benchmark.py index a5b01624..251f64c5 100644 --- a/hpobench/benchmarks/ml/rf_benchmark.py +++ b/hpobench/benchmarks/ml/rf_benchmark.py @@ -146,6 +146,7 @@ def _train_objective( evaluation: Union[str, None] = "valid", record_stats: bool = False, get_learning_curve: bool = False, + lc_every_k: int = 1, **kwargs ): """Function that instantiates a 'config' on a 'fidelity' and trains it @@ -175,6 +176,8 @@ def _train_objective( This is set to False by default to reduce overall compute time. Enabling True, implies that the for each iteration, the model will be evaluated on both the validation and test sets, optionally on the training set also. + lc_every_k : int (optional) + If True, records the learning curve after every k iterations. 
""" if rng is not None: rng = get_rng(rng, self.rng) @@ -217,15 +220,20 @@ def _train_objective( ) # fitting the model with subsampled data if get_learning_curve: - # IMPORTANT to allow partial_fit + lc_spacings = self._get_lc_spacing(model.n_estimators, lc_every_k) + # IMPORTANT to allow refitting with more estimators model.warm_start = True model.n_estimators = 0 lc_time = 0.0 model_fit_time = 0.0 learning_curves = dict(train=[], valid=[], test=[]) - for i in range(fidelity['n_estimators']): - model.n_estimators += 1 + iter_start = 0 + # for i in range(fidelity['n_estimators']): + for i in range(len(lc_spacings)): + iter_end = lc_spacings[i] start = time.time() + # adds k new estimators to the model for training + model.n_estimators += iter_end - iter_start model.fit(train_X[train_idx], train_y.iloc[train_idx]) model_fit_time += time.time() - start lc_start = time.time() diff --git a/hpobench/dependencies/ml/ml_benchmark_template.py b/hpobench/dependencies/ml/ml_benchmark_template.py index b8ac116c..396b18ad 100644 --- a/hpobench/dependencies/ml/ml_benchmark_template.py +++ b/hpobench/dependencies/ml/ml_benchmark_template.py @@ -154,6 +154,7 @@ def _get_lc_spacing(self, max_iter, k): the endpoint implicitly. """ assert k > 0, "Spacing needs to be at >=1" + assert k < max_iter, "Spacing should be in {1, 2, ..., max_iter-1}" spacing = np.arange(0, max_iter + 1, step=k).tolist() spacing = spacing[1:] # eliminating 0 if spacing[-1] != max_iter: From 67a3a32ed6e1e5a83753bb28375b8e5128daaf25 Mon Sep 17 00:00:00 2001 From: neeratyoy Date: Sun, 6 Feb 2022 20:29:55 +0100 Subject: [PATCH 118/147] Adding LC collection option to MLP --- hpobench/benchmarks/ml/nn_benchmark.py | 19 +++++++++++++++---- 1 file changed, 15 insertions(+), 4 deletions(-) diff --git a/hpobench/benchmarks/ml/nn_benchmark.py b/hpobench/benchmarks/ml/nn_benchmark.py index 59f68b8c..49723fd8 100644 --- a/hpobench/benchmarks/ml/nn_benchmark.py +++ b/hpobench/benchmarks/ml/nn_benchmark.py @@ -154,6 +154,7 @@ def _train_objective( evaluation: Union[str, None] = "valid", record_stats: bool = False, get_learning_curve: bool = False, + lc_every_k: int = 1, **kwargs ): """Function that instantiates a 'config' on a 'fidelity' and trains it @@ -183,6 +184,8 @@ def _train_objective( This is set to False by default to reduce overall compute time. Enabling True, implies that the for each iteration, the model will be evaluated on both the validation and test sets, optionally on the training set also. + lc_every_k : int (optional) + If True, records the learning curve after every k iterations. 
""" if rng is not None: rng = get_rng(rng, self.rng) @@ -228,11 +231,19 @@ def _train_objective( lc_time = 0.0 model_fit_time = 0.0 learning_curves = dict(train=[], valid=[], test=[]) - for i in range(model.max_iter): + lc_spacings = self._get_lc_spacing(model.max_iter, lc_every_k) + iter_start = 0 + for i in range(len(lc_spacings)): + iter_end = lc_spacings[i] start = time.time() - model.partial_fit( - train_X[train_idx], train_y.iloc[train_idx], np.unique(train_y.iloc[train_idx]) - ) + # trains model for k steps + for j in range(iter_end - iter_start): + model.partial_fit( + train_X[train_idx], + train_y.iloc[train_idx], + np.unique(train_y.iloc[train_idx]) + ) + # adding all partial fit times model_fit_time += time.time() - start lc_start = time.time() if record_stats: From c47aa145b72cb7b83fb6e799157be12a11c40b99 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Philipp=20M=C3=BCller?= Date: Tue, 3 May 2022 11:50:03 +0200 Subject: [PATCH 119/147] Implement AbstractMultiObjectiveBenchmark. It basically does nothing. It just points out to the user that this is a MO benchmark. I am not sure if we really need that. --- hpobench/abstract_benchmark.py | 70 ++++++++++++++++++++++++++++++++++ 1 file changed, 70 insertions(+) diff --git a/hpobench/abstract_benchmark.py b/hpobench/abstract_benchmark.py index c9db4216..c454923e 100644 --- a/hpobench/abstract_benchmark.py +++ b/hpobench/abstract_benchmark.py @@ -251,3 +251,73 @@ def get_meta_information() -> Dict: """ raise NotImplementedError() + + +class AbstractMultiObjectiveBenchmark(AbstractBenchmark): + """ + Abstract Benchmark class for multi-objective benchmarks. + The only purpose of this class is to point out to users that this benchmark returns multiple + objective function values. + + When writing a benchmark, please make sure to inherit from the correct abstract class. + """ + @abc.abstractmethod + def objective_function(self, configuration: Union[ConfigSpace.Configuration, Dict], + fidelity: Union[Dict, ConfigSpace.Configuration, None] = None, + rng: Union[np.random.RandomState, int, None] = None, + **kwargs) -> Dict: + """ + Objective function. + + Override this function to provide your multi-objective benchmark function. This + function will be called by one of the evaluate functions. For + flexibility, you have to return a dictionary with the only mandatory + key being `function_values`, the objective function values for the + `configuration` which was passed. By convention, all benchmarks are + minimization problems. + + `function_value` is a dictionary that contains all available criteria. + + Parameters + ---------- + configuration : Dict + fidelity: Dict, None + Fidelity parameters, check get_fidelity_space(). Uses default (max) value if None. + rng : np.random.RandomState, int, None + It might be useful to pass a `rng` argument to the function call to + bypass the default "seed" generator. Only using the default random + state (`self.rng`) could lead to an overfitting towards the + `self.rng`'s seed. + + Returns + ------- + Dict + Must contain at least the key `function_value` and `cost`. + Note that `function_value` should be a Dict here. 
+ """ + NotImplementedError() + + @abc.abstractmethod + def objective_function_test(self, configuration: Union[ConfigSpace.Configuration, Dict], + fidelity: Union[Dict, ConfigSpace.Configuration, None] = None, + rng: Union[np.random.RandomState, int, None] = None, + **kwargs) -> Dict: + """ + If there is a different objective function for offline testing, e.g + testing a machine learning on a hold extra test set instead + on a validation set override this function here. + + Parameters + ---------- + configuration : Dict + fidelity: Dict, None + Fidelity parameters, check get_fidelity_space(). Uses default (max) value if None. + rng : np.random.RandomState, int, None + see :py:func:`~HPOBench.abstract_benchmark.objective_function` + + Returns + ------- + Dict + Must contain at least the key `function_value` and `cost`. + """ + NotImplementedError() From 8ba5db2e6659536459f32b1d942631a5c7bbae1e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Philipp=20M=C3=BCller?= Date: Wed, 4 May 2022 16:30:25 +0200 Subject: [PATCH 120/147] ADD check for the correct return values --- hpobench/abstract_benchmark.py | 35 ++++++++++++++++++++++++++++++++-- 1 file changed, 33 insertions(+), 2 deletions(-) diff --git a/hpobench/abstract_benchmark.py b/hpobench/abstract_benchmark.py index c454923e..38662c55 100644 --- a/hpobench/abstract_benchmark.py +++ b/hpobench/abstract_benchmark.py @@ -1,7 +1,7 @@ """ Base-class of all benchmarks """ import abc -from typing import Union, Dict +from typing import Union, Dict, List, Tuple import functools import logging @@ -124,7 +124,9 @@ def wrapper(self, configuration: Union[ConfigSpace.Configuration, Dict], fidelity = AbstractBenchmark._check_and_cast_fidelity(fidelity, self.fidelity_space, **kwargs) # All benchmarks should work on dictionaries. Cast the both objects to dictionaries. - return wrapped_function(self, configuration.get_dictionary(), fidelity.get_dictionary(), **kwargs) + return_values = wrapped_function(self, configuration.get_dictionary(), fidelity.get_dictionary(), **kwargs) + + return_values = AbstractBenchmark._check_return_values(return_values) return wrapper @staticmethod @@ -204,6 +206,16 @@ def _check_and_cast_fidelity(fidelity: Union[dict, ConfigSpace.Configuration, No fidelity_space.check_configuration(fidelity) return fidelity + @staticmethod + def _check_return_values(return_values: Dict) -> Dict: + """ + The return values should contain the fields `function_value` and `cost`. + """ + assert 'function_value' in return_values.keys() + assert 'cost' in return_values.keys() + + return return_values + def __call__(self, configuration: Dict, **kwargs) -> float: """ Provides interface to use, e.g., SciPy optimizers """ return self.objective_function(configuration, **kwargs)['function_value'] @@ -321,3 +333,22 @@ def objective_function_test(self, configuration: Union[ConfigSpace.Configuration Must contain at least the key `function_value` and `cost`. """ NotImplementedError() + + @staticmethod + def _check_return_values(return_values: Dict) -> Dict: + """ + The return values should contain the fields `function_value` and `cost`. + The field `function_value` has to be a collection of multiple objective targets. + """ + return_values = AbstractBenchmark._check_return_values(return_values) + assert isinstance(return_values['function_value'], (List, Dict, Tuple)), \ + 'Every MO benchmark has to return multiple objectives.' 
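A quick illustration of what this check accepts and rejects when called directly (in normal use it runs automatically inside the check_parameters wrapper):

```python
from hpobench.abstract_benchmark import AbstractBenchmark

# Well-formed single-objective result: passes and is handed back unchanged.
AbstractBenchmark._check_return_values({'function_value': 0.25, 'cost': 1.7})

# A result missing 'cost' (or 'function_value') trips the assertion.
try:
    AbstractBenchmark._check_return_values({'function_value': 0.25})
except AssertionError:
    print('rejected: a benchmark result needs both function_value and cost')
```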
+ return return_values + + @staticmethod + @abc.abstractmethod + def get_objective_names(): + """ + Return the names of supported targets + """ + NotImplementedError() From ef6b977a96e92aadda3a163b3aa3b13743d14160 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Philipp=20M=C3=BCller?= Date: Thu, 5 May 2022 09:20:34 +0200 Subject: [PATCH 121/147] ADD check for return values + Test cases --- hpobench/abstract_benchmark.py | 6 +++- tests/test_abstract_benchmark.py | 5 ++- tests/test_check_configuration.py | 55 +++++++++++++++++++++++-------- 3 files changed, 51 insertions(+), 15 deletions(-) diff --git a/hpobench/abstract_benchmark.py b/hpobench/abstract_benchmark.py index 38662c55..86098d0d 100644 --- a/hpobench/abstract_benchmark.py +++ b/hpobench/abstract_benchmark.py @@ -126,7 +126,11 @@ def wrapper(self, configuration: Union[ConfigSpace.Configuration, Dict], # All benchmarks should work on dictionaries. Cast the both objects to dictionaries. return_values = wrapped_function(self, configuration.get_dictionary(), fidelity.get_dictionary(), **kwargs) - return_values = AbstractBenchmark._check_return_values(return_values) + # Make sure that every benchmark returns a well-shaped return object. + # Every benchmark have to have the fields 'function_value' and 'cost'. + # Multi-Objective benchmarks have to return collections of values for the 'function_value' field. + return_values = type(self)._check_return_values(return_values) + return return_values return wrapper @staticmethod diff --git a/tests/test_abstract_benchmark.py b/tests/test_abstract_benchmark.py index 22a26790..1f61c969 100644 --- a/tests/test_abstract_benchmark.py +++ b/tests/test_abstract_benchmark.py @@ -1,6 +1,6 @@ import pytest -from hpobench.abstract_benchmark import AbstractBenchmark +from hpobench.abstract_benchmark import AbstractBenchmark, AbstractMultiObjectiveBenchmark with pytest.raises(NotImplementedError): AbstractBenchmark.get_configuration_space() @@ -10,3 +10,6 @@ with pytest.raises(NotImplementedError): AbstractBenchmark.get_meta_information() + +with pytest.raises(NotImplementedError): + AbstractMultiObjectiveBenchmark.get_objective_names() diff --git a/tests/test_check_configuration.py b/tests/test_check_configuration.py index 8d3db58f..09322025 100644 --- a/tests/test_check_configuration.py +++ b/tests/test_check_configuration.py @@ -32,20 +32,23 @@ def get_fidelity_space(self): _check_and_cast_configuration = AbstractBenchmark._check_and_cast_configuration _check_and_cast_fidelity = AbstractBenchmark._check_and_cast_fidelity + _check_return_values = AbstractBenchmark._check_return_values self.foo = Dummy() def test_config_decorator(self): @AbstractBenchmark.check_parameters def tmp(_, configuration: Dict, fidelity: Dict, **kwargs): - return configuration + return {'function_value': 0, 'cost': 0, 'info': {'config': configuration}} ret = tmp(self=self.foo, configuration=self.foo.configuration_space.sample_configuration()) self.assertIsInstance(ret, Dict) + self.assertIsInstance(ret['info'], Dict) + self.assertIsInstance(ret['info']['config'], Dict) @AbstractBenchmark.check_parameters def tmp(_, configuration: Dict, fidelity: Dict, **kwargs): - return configuration + return {'function_value': 0, 'cost': 0, 'info': {'config': configuration}} tmp(self=self.foo, configuration={"flt": 0.2, "cat": 1, "itg": 1}) tmp(self=self.foo, configuration=self.foo.configuration_space.sample_configuration()) @@ -57,23 +60,27 @@ def tmp(_, configuration: Dict, fidelity: Dict, **kwargs): def test_fidel_decorator(self): 
@AbstractBenchmark.check_parameters def tmp(_, configuration: Dict, fidelity: Dict, **kwargs): - return configuration, fidelity, kwargs + return {'function_value': 0, 'cost': 0, 'info': {'config': configuration, 'fidel': fidelity, 'kwargs': kwargs}} + # return configuration, fidelity, kwargs sample_fidel = dict(self.foo.get_fidelity_space().get_default_configuration()) - _, ret, _ = tmp(self=self.foo, - configuration=self.foo.configuration_space.sample_configuration(), - fidelity=sample_fidel) + result_dict = tmp(self=self.foo, + configuration=self.foo.configuration_space.sample_configuration(), + fidelity=sample_fidel) + ret = result_dict['info']['fidel'] self.assertEqual(ret, sample_fidel) less_fidel = {"f_cat": 1} - _, ret, _ = tmp(self=self.foo, - configuration=self.foo.configuration_space.sample_configuration(), - fidelity=less_fidel) + result_dict = tmp(self=self.foo, + configuration=self.foo.configuration_space.sample_configuration(), + fidelity=less_fidel) + ret = result_dict['info']['fidel'] self.assertEqual(ret, sample_fidel) - _, ret, _ = tmp(self=self.foo, - configuration=self.foo.configuration_space.sample_configuration()) + result_dict = tmp(self=self.foo, + configuration=self.foo.configuration_space.sample_configuration()) + ret = result_dict['info']['fidel'] self.assertEqual(ret, sample_fidel) with pytest.raises(ValueError): @@ -87,6 +94,7 @@ def tmp(_, configuration: Dict, fidelity: Dict, **kwargs): "configuration": self.foo.configuration_space.sample_configuration(), "fidelity": [0.1]}) + class TestCheckUnittest2(unittest.TestCase): def setUp(self): @@ -100,6 +108,7 @@ class Dummy(): _check_and_cast_configuration = AbstractBenchmark._check_and_cast_configuration _check_and_cast_fidelity = AbstractBenchmark._check_and_cast_fidelity + _check_return_values = AbstractBenchmark._check_return_values fidelity_space = ConfigurationSpace(seed=1) fidelity_space.add_hyperparameter(UniformFloatHyperparameter('fidelity1', lower=0., upper=1., default_value=1.)) @@ -108,11 +117,14 @@ class Dummy(): def test_config_decorator(self): @AbstractBenchmark.check_parameters def tmp(_, configuration: Union[Dict, np.ndarray], fidelity: Dict, **kwargs): - return configuration, fidelity + return {'function_value': 0, 'cost': 0, + 'info': {'config': configuration, 'fidel': fidelity, 'kwargs': kwargs}} hps = dict(hp1=0.25, hp2=1.25, hp3=2.25) configuration = Configuration(self.foo.configuration_space, hps) - config, fidel = tmp(self=self.foo, configuration=configuration, fidelity=None) + + return_dict = tmp(self=self.foo, configuration=configuration, fidelity=None) + config, fidel = return_dict['info']['config'], return_dict['info']['fidel'] assert isinstance(config, Dict) assert isinstance(fidel, Dict) @@ -153,3 +165,20 @@ def test_remove_inactive_parameter(): # Remove inactive: - case: config is dict transformed = AbstractBenchmark._check_and_cast_configuration(not_allowed, configuration_space) assert transformed.get_dictionary() == {'hp1': 0, 'hp3': 5} + + +def test_check_return_values(): + return_values = {'function_value': 0, 'cost': 0} + AbstractBenchmark._check_return_values(return_values) + + with pytest.raises(AssertionError): + AbstractBenchmark._check_return_values({'function_value': 0}) + + with pytest.raises(AssertionError): + AbstractBenchmark._check_return_values({'cost': 0}) + + +def test_check_return_values_mo(): + return_values = {'function_value': {'obj1': 0, 'obj2': 0}, 'cost': 0} + from hpobench.abstract_benchmark import AbstractMultiObjectiveBenchmark + 
AbstractMultiObjectiveBenchmark._check_return_values(return_values) From 3e88c74213d710f1e9834e93b966297a01454ca3 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Philipp=20M=C3=BCller?= Date: Thu, 5 May 2022 14:48:14 +0200 Subject: [PATCH 122/147] UPDATE changelog.md --- changelog.md | 2 ++ 1 file changed, 2 insertions(+) diff --git a/changelog.md b/changelog.md index fd54dafa..298903ac 100644 --- a/changelog.md +++ b/changelog.md @@ -1,4 +1,6 @@ # 0.0.11 + * Drop Support for 3.6: + Although most of the functionality should still work, we drop the official support for 3.6. # 0.0.10 * Cartpole Benchmark Version 0.0.4: From 77941e8bdf4bb4ea89eab8b47f55d635df7cf047 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Philipp=20M=C3=BCller?= Date: Thu, 5 May 2022 14:53:16 +0200 Subject: [PATCH 123/147] Fix Tests: Random Seeds differ across python versions (#144) * Fix test cases for tabular_benchmarks, nasbench201 and the outlier detection benchmarks. Some test cases have failed when using python3.6. This error occured because of different configsspace versions on the host. * Remove python 3.6 from the supported python versions. We remove python 3.6 from the test cases. It should still work, but we do not longer test specifically for that version. --- .github/workflows/run_tests.yml | 4 ---- changelog.md | 2 ++ hpobench/benchmarks/od/od_ae.py | 1 + .../dependencies/ml/ml_benchmark_template.py | 1 + .../dependencies/od/traditional_benchmark.py | 1 + tests/test_nasbench_201.py | 9 +++---- tests/test_od.py | 24 +++++++++++-------- tests/test_tabular_benchmarks.py | 2 +- 8 files changed, 25 insertions(+), 19 deletions(-) diff --git a/.github/workflows/run_tests.yml b/.github/workflows/run_tests.yml index 322812b2..3c22a210 100644 --- a/.github/workflows/run_tests.yml +++ b/.github/workflows/run_tests.yml @@ -11,10 +11,6 @@ jobs: strategy: matrix: include: - - python-version: 3.6 - DISPLAY_NAME: "Singularity Tests" - RUN_TESTS: true - USE_SINGULARITY: true - python-version: 3.7 DISPLAY_NAME: "Singularity Tests + CODECOV" RUN_TESTS: true diff --git a/changelog.md b/changelog.md index fd54dafa..298903ac 100644 --- a/changelog.md +++ b/changelog.md @@ -1,4 +1,6 @@ # 0.0.11 + * Drop Support for 3.6: + Although most of the functionality should still work, we drop the official support for 3.6. 
# 0.0.10 * Cartpole Benchmark Version 0.0.4: diff --git a/hpobench/benchmarks/od/od_ae.py b/hpobench/benchmarks/od/od_ae.py index e3beca47..af80b106 100644 --- a/hpobench/benchmarks/od/od_ae.py +++ b/hpobench/benchmarks/od/od_ae.py @@ -407,6 +407,7 @@ def get_fidelity_space(seed: Union[int, None] = None) -> CS.ConfigurationSpace: return fidel_space + # pylint: disable=arguments-differ def get_meta_information(self): """ Returns the meta information for the benchmark """ X_train, _ = self.datamanager.dataset.get_train_data() diff --git a/hpobench/dependencies/ml/ml_benchmark_template.py b/hpobench/dependencies/ml/ml_benchmark_template.py index 3c6fcdaf..7cef515f 100644 --- a/hpobench/dependencies/ml/ml_benchmark_template.py +++ b/hpobench/dependencies/ml/ml_benchmark_template.py @@ -102,6 +102,7 @@ def get_fidelity_space(seed: Union[int, None] = None) -> CS.ConfigurationSpace: """ raise NotImplementedError() + # pylint: disable=arguments-differ def get_meta_information(self): """ Returns the meta information for the benchmark """ return { diff --git a/hpobench/dependencies/od/traditional_benchmark.py b/hpobench/dependencies/od/traditional_benchmark.py index 68cef2e5..1d82dfe6 100644 --- a/hpobench/dependencies/od/traditional_benchmark.py +++ b/hpobench/dependencies/od/traditional_benchmark.py @@ -214,6 +214,7 @@ def get_fidelity_space(seed: Union[int, None] = None) -> CS.ConfigurationSpace: fidel_space = CS.ConfigurationSpace(seed=seed) return fidel_space + # pylint: disable=arguments-differ def get_meta_information(self): """ Returns the meta information for the benchmark """ X_train, y_train = self.datamanager.dataset.get_train_data() diff --git a/tests/test_nasbench_201.py b/tests/test_nasbench_201.py index 925ac911..22c24b34 100644 --- a/tests/test_nasbench_201.py +++ b/tests/test_nasbench_201.py @@ -84,10 +84,11 @@ def test_nasbench201_config(): func = Cifar10ValidNasBench201Benchmark.config_to_structure_func(4) struct = func(c) - assert struct.__repr__() == '_Structure(4 nodes with |avg_pool_3x3~0|+|none~0|nor_conv_3x3~1|+' \ - '|nor_conv_3x3~0|nor_conv_3x3~1|skip_connect~2|)' + assert struct.__repr__() == '_Structure(4 nodes with |nor_conv_1x1~0|+|nor_conv_3x3~0|nor_conv_3x3~1|+' \ + '|nor_conv_1x1~0|nor_conv_1x1~1|nor_conv_3x3~2|)' assert len(struct) == 4 - assert struct[0] == (('avg_pool_3x3', 0),) + assert struct[0] == (('nor_conv_1x1', 0),) struct_str = struct.tostr() - assert struct_str == '|avg_pool_3x3~0|+|none~0|nor_conv_3x3~1|+|nor_conv_3x3~0|nor_conv_3x3~1|skip_connect~2|' + assert struct_str == '|nor_conv_1x1~0|+|nor_conv_3x3~0|nor_conv_3x3~1|+' \ + '|nor_conv_1x1~0|nor_conv_1x1~1|nor_conv_3x3~2|' diff --git a/tests/test_od.py b/tests/test_od.py index f6ca038f..c6e2b36a 100644 --- a/tests/test_od.py +++ b/tests/test_od.py @@ -14,17 +14,16 @@ def test_ocsvm(): def test_kde(): - from hpobench.container.benchmarks.od.od_benchmarks import ODKernelDensityEstimation + from hpobench.container.benchmarks.od.od_benchmarks import ODKernelDensityEstimation seed = 6 benchmark = ODKernelDensityEstimation("cardio", rng=seed) config = benchmark.get_configuration_space(seed=seed).sample_configuration() - result = benchmark.objective_function_test(configuration=config, rng=seed) - print(config['kernel'], config['bandwidth'], result['function_value']) + assert config is not None - assert config['kernel'] == "exponential" - assert config['bandwidth'] == pytest.approx(15.2274, abs=0.001) - assert result['function_value'] == pytest.approx(0.14409, abs=0.0001) + test_config = {'bandwidth': 
15.227439996058147, 'kernel': 'tophat', 'scaler': 'Standard'} + result = benchmark.objective_function_test(configuration=test_config, rng=seed) + assert result['function_value'] == pytest.approx(0.8675, abs=0.0001) def test_ae(): @@ -33,8 +32,13 @@ def test_ae(): benchmark = ODAutoencoder("cardio", rng=seed) config = benchmark.get_configuration_space(seed=seed).sample_configuration() - result = benchmark.objective_function(configuration=config, rng=seed) - print(config['dropout_rate'], result['function_value']) + assert config is not None + + test_config = {'activation': 'tanh', 'batch_normalization': True, + 'batch_size': 424, 'beta1': 0.8562127972330622, 'beta2': 0.9107549023256032, + 'dropout': False, 'lr': 0.0013160410886450579, 'num_latent_units': 5, + 'num_layers': 1, 'scaler': 'MinMax', 'skip_connection': True, + 'weight_decay': 0.07358821063486902, 'num_units_layer_1': 16} - assert config['dropout_rate'] == pytest.approx(0.69512, abs=0.00001) - assert result['function_value'] == pytest.approx(0.2833, abs=0.0001) + result = benchmark.objective_function(configuration=test_config, rng=seed) + assert result['function_value'] == pytest.approx(0.81378, abs=0.001) diff --git a/tests/test_tabular_benchmarks.py b/tests/test_tabular_benchmarks.py index 59d8dd45..573a2822 100644 --- a/tests/test_tabular_benchmarks.py +++ b/tests/test_tabular_benchmarks.py @@ -134,7 +134,7 @@ def test_parkinson_benchmark(self): benchmark.objective_function_test(default_config, fidelity=dict(budget=1, )) result = benchmark.objective_function_test(configuration=default_config, fidelity=dict(budget=100)) - assert pytest.approx(0.15010187, result['function_value'], abs=0.001) + assert result['function_value'] == pytest.approx(0.15010187, abs=0.001) runtime = 62.7268 assert result['cost'] == pytest.approx(runtime, abs=0.0001) From e001e5a6e5fb6fbe866d8d6feed86cf06e64ba47 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Philipp=20M=C3=BCller?= Date: Tue, 3 May 2022 11:50:03 +0200 Subject: [PATCH 124/147] Implement AbstractMultiObjectiveBenchmark. It basically does nothing. It just points out to the user that this is a MO benchmark. I am not sure if we really need that. --- hpobench/abstract_benchmark.py | 70 ++++++++++++++++++++++++++++++++++ 1 file changed, 70 insertions(+) diff --git a/hpobench/abstract_benchmark.py b/hpobench/abstract_benchmark.py index c9db4216..c454923e 100644 --- a/hpobench/abstract_benchmark.py +++ b/hpobench/abstract_benchmark.py @@ -251,3 +251,73 @@ def get_meta_information() -> Dict: """ raise NotImplementedError() + + +class AbstractMultiObjectiveBenchmark(AbstractBenchmark): + """ + Abstract Benchmark class for multi-objective benchmarks. + The only purpose of this class is to point out to users that this benchmark returns multiple + objective function values. + + When writing a benchmark, please make sure to inherit from the correct abstract class. + """ + @abc.abstractmethod + def objective_function(self, configuration: Union[ConfigSpace.Configuration, Dict], + fidelity: Union[Dict, ConfigSpace.Configuration, None] = None, + rng: Union[np.random.RandomState, int, None] = None, + **kwargs) -> Dict: + """ + Objective function. + + Override this function to provide your multi-objective benchmark function. This + function will be called by one of the evaluate functions. For + flexibility, you have to return a dictionary with the only mandatory + key being `function_values`, the objective function values for the + `configuration` which was passed. 
By convention, all benchmarks are + minimization problems. + + `function_value` is a dictionary that contains all available criteria. + + Parameters + ---------- + configuration : Dict + fidelity: Dict, None + Fidelity parameters, check get_fidelity_space(). Uses default (max) value if None. + rng : np.random.RandomState, int, None + It might be useful to pass a `rng` argument to the function call to + bypass the default "seed" generator. Only using the default random + state (`self.rng`) could lead to an overfitting towards the + `self.rng`'s seed. + + Returns + ------- + Dict + Must contain at least the key `function_value` and `cost`. + Note that `function_value` should be a Dict here. + """ + NotImplementedError() + + @abc.abstractmethod + def objective_function_test(self, configuration: Union[ConfigSpace.Configuration, Dict], + fidelity: Union[Dict, ConfigSpace.Configuration, None] = None, + rng: Union[np.random.RandomState, int, None] = None, + **kwargs) -> Dict: + """ + If there is a different objective function for offline testing, e.g + testing a machine learning on a hold extra test set instead + on a validation set override this function here. + + Parameters + ---------- + configuration : Dict + fidelity: Dict, None + Fidelity parameters, check get_fidelity_space(). Uses default (max) value if None. + rng : np.random.RandomState, int, None + see :py:func:`~HPOBench.abstract_benchmark.objective_function` + + Returns + ------- + Dict + Must contain at least the key `function_value` and `cost`. + """ + NotImplementedError() From 0d09c04100f5ced84255bdcdf80582d15c8688a9 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Philipp=20M=C3=BCller?= Date: Wed, 4 May 2022 16:30:25 +0200 Subject: [PATCH 125/147] ADD check for the correct return values --- hpobench/abstract_benchmark.py | 35 ++++++++++++++++++++++++++++++++-- 1 file changed, 33 insertions(+), 2 deletions(-) diff --git a/hpobench/abstract_benchmark.py b/hpobench/abstract_benchmark.py index c454923e..38662c55 100644 --- a/hpobench/abstract_benchmark.py +++ b/hpobench/abstract_benchmark.py @@ -1,7 +1,7 @@ """ Base-class of all benchmarks """ import abc -from typing import Union, Dict +from typing import Union, Dict, List, Tuple import functools import logging @@ -124,7 +124,9 @@ def wrapper(self, configuration: Union[ConfigSpace.Configuration, Dict], fidelity = AbstractBenchmark._check_and_cast_fidelity(fidelity, self.fidelity_space, **kwargs) # All benchmarks should work on dictionaries. Cast the both objects to dictionaries. - return wrapped_function(self, configuration.get_dictionary(), fidelity.get_dictionary(), **kwargs) + return_values = wrapped_function(self, configuration.get_dictionary(), fidelity.get_dictionary(), **kwargs) + + return_values = AbstractBenchmark._check_return_values(return_values) return wrapper @staticmethod @@ -204,6 +206,16 @@ def _check_and_cast_fidelity(fidelity: Union[dict, ConfigSpace.Configuration, No fidelity_space.check_configuration(fidelity) return fidelity + @staticmethod + def _check_return_values(return_values: Dict) -> Dict: + """ + The return values should contain the fields `function_value` and `cost`. 
+ """ + assert 'function_value' in return_values.keys() + assert 'cost' in return_values.keys() + + return return_values + def __call__(self, configuration: Dict, **kwargs) -> float: """ Provides interface to use, e.g., SciPy optimizers """ return self.objective_function(configuration, **kwargs)['function_value'] @@ -321,3 +333,22 @@ def objective_function_test(self, configuration: Union[ConfigSpace.Configuration Must contain at least the key `function_value` and `cost`. """ NotImplementedError() + + @staticmethod + def _check_return_values(return_values: Dict) -> Dict: + """ + The return values should contain the fields `function_value` and `cost`. + The field `function_value` has to be a collection of multiple objective targets. + """ + return_values = AbstractBenchmark._check_return_values(return_values) + assert isinstance(return_values['function_value'], (List, Dict, Tuple)), \ + 'Every MO benchmark has to return multiple objectives.' + return return_values + + @staticmethod + @abc.abstractmethod + def get_objective_names(): + """ + Return the names of supported targets + """ + NotImplementedError() From 9ae08aab180e79b0b81b6b7b339f68abd0eb35c6 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Philipp=20M=C3=BCller?= Date: Thu, 5 May 2022 09:20:34 +0200 Subject: [PATCH 126/147] ADD check for return values + Test cases --- hpobench/abstract_benchmark.py | 6 +++- tests/test_abstract_benchmark.py | 5 ++- tests/test_check_configuration.py | 55 +++++++++++++++++++++++-------- 3 files changed, 51 insertions(+), 15 deletions(-) diff --git a/hpobench/abstract_benchmark.py b/hpobench/abstract_benchmark.py index 38662c55..86098d0d 100644 --- a/hpobench/abstract_benchmark.py +++ b/hpobench/abstract_benchmark.py @@ -126,7 +126,11 @@ def wrapper(self, configuration: Union[ConfigSpace.Configuration, Dict], # All benchmarks should work on dictionaries. Cast the both objects to dictionaries. return_values = wrapped_function(self, configuration.get_dictionary(), fidelity.get_dictionary(), **kwargs) - return_values = AbstractBenchmark._check_return_values(return_values) + # Make sure that every benchmark returns a well-shaped return object. + # Every benchmark have to have the fields 'function_value' and 'cost'. + # Multi-Objective benchmarks have to return collections of values for the 'function_value' field. 
+ return_values = type(self)._check_return_values(return_values) + return return_values return wrapper @staticmethod diff --git a/tests/test_abstract_benchmark.py b/tests/test_abstract_benchmark.py index 22a26790..1f61c969 100644 --- a/tests/test_abstract_benchmark.py +++ b/tests/test_abstract_benchmark.py @@ -1,6 +1,6 @@ import pytest -from hpobench.abstract_benchmark import AbstractBenchmark +from hpobench.abstract_benchmark import AbstractBenchmark, AbstractMultiObjectiveBenchmark with pytest.raises(NotImplementedError): AbstractBenchmark.get_configuration_space() @@ -10,3 +10,6 @@ with pytest.raises(NotImplementedError): AbstractBenchmark.get_meta_information() + +with pytest.raises(NotImplementedError): + AbstractMultiObjectiveBenchmark.get_objective_names() diff --git a/tests/test_check_configuration.py b/tests/test_check_configuration.py index 8d3db58f..09322025 100644 --- a/tests/test_check_configuration.py +++ b/tests/test_check_configuration.py @@ -32,20 +32,23 @@ def get_fidelity_space(self): _check_and_cast_configuration = AbstractBenchmark._check_and_cast_configuration _check_and_cast_fidelity = AbstractBenchmark._check_and_cast_fidelity + _check_return_values = AbstractBenchmark._check_return_values self.foo = Dummy() def test_config_decorator(self): @AbstractBenchmark.check_parameters def tmp(_, configuration: Dict, fidelity: Dict, **kwargs): - return configuration + return {'function_value': 0, 'cost': 0, 'info': {'config': configuration}} ret = tmp(self=self.foo, configuration=self.foo.configuration_space.sample_configuration()) self.assertIsInstance(ret, Dict) + self.assertIsInstance(ret['info'], Dict) + self.assertIsInstance(ret['info']['config'], Dict) @AbstractBenchmark.check_parameters def tmp(_, configuration: Dict, fidelity: Dict, **kwargs): - return configuration + return {'function_value': 0, 'cost': 0, 'info': {'config': configuration}} tmp(self=self.foo, configuration={"flt": 0.2, "cat": 1, "itg": 1}) tmp(self=self.foo, configuration=self.foo.configuration_space.sample_configuration()) @@ -57,23 +60,27 @@ def tmp(_, configuration: Dict, fidelity: Dict, **kwargs): def test_fidel_decorator(self): @AbstractBenchmark.check_parameters def tmp(_, configuration: Dict, fidelity: Dict, **kwargs): - return configuration, fidelity, kwargs + return {'function_value': 0, 'cost': 0, 'info': {'config': configuration, 'fidel': fidelity, 'kwargs': kwargs}} + # return configuration, fidelity, kwargs sample_fidel = dict(self.foo.get_fidelity_space().get_default_configuration()) - _, ret, _ = tmp(self=self.foo, - configuration=self.foo.configuration_space.sample_configuration(), - fidelity=sample_fidel) + result_dict = tmp(self=self.foo, + configuration=self.foo.configuration_space.sample_configuration(), + fidelity=sample_fidel) + ret = result_dict['info']['fidel'] self.assertEqual(ret, sample_fidel) less_fidel = {"f_cat": 1} - _, ret, _ = tmp(self=self.foo, - configuration=self.foo.configuration_space.sample_configuration(), - fidelity=less_fidel) + result_dict = tmp(self=self.foo, + configuration=self.foo.configuration_space.sample_configuration(), + fidelity=less_fidel) + ret = result_dict['info']['fidel'] self.assertEqual(ret, sample_fidel) - _, ret, _ = tmp(self=self.foo, - configuration=self.foo.configuration_space.sample_configuration()) + result_dict = tmp(self=self.foo, + configuration=self.foo.configuration_space.sample_configuration()) + ret = result_dict['info']['fidel'] self.assertEqual(ret, sample_fidel) with pytest.raises(ValueError): @@ -87,6 +94,7 @@ def tmp(_, 
configuration: Dict, fidelity: Dict, **kwargs): "configuration": self.foo.configuration_space.sample_configuration(), "fidelity": [0.1]}) + class TestCheckUnittest2(unittest.TestCase): def setUp(self): @@ -100,6 +108,7 @@ class Dummy(): _check_and_cast_configuration = AbstractBenchmark._check_and_cast_configuration _check_and_cast_fidelity = AbstractBenchmark._check_and_cast_fidelity + _check_return_values = AbstractBenchmark._check_return_values fidelity_space = ConfigurationSpace(seed=1) fidelity_space.add_hyperparameter(UniformFloatHyperparameter('fidelity1', lower=0., upper=1., default_value=1.)) @@ -108,11 +117,14 @@ class Dummy(): def test_config_decorator(self): @AbstractBenchmark.check_parameters def tmp(_, configuration: Union[Dict, np.ndarray], fidelity: Dict, **kwargs): - return configuration, fidelity + return {'function_value': 0, 'cost': 0, + 'info': {'config': configuration, 'fidel': fidelity, 'kwargs': kwargs}} hps = dict(hp1=0.25, hp2=1.25, hp3=2.25) configuration = Configuration(self.foo.configuration_space, hps) - config, fidel = tmp(self=self.foo, configuration=configuration, fidelity=None) + + return_dict = tmp(self=self.foo, configuration=configuration, fidelity=None) + config, fidel = return_dict['info']['config'], return_dict['info']['fidel'] assert isinstance(config, Dict) assert isinstance(fidel, Dict) @@ -153,3 +165,20 @@ def test_remove_inactive_parameter(): # Remove inactive: - case: config is dict transformed = AbstractBenchmark._check_and_cast_configuration(not_allowed, configuration_space) assert transformed.get_dictionary() == {'hp1': 0, 'hp3': 5} + + +def test_check_return_values(): + return_values = {'function_value': 0, 'cost': 0} + AbstractBenchmark._check_return_values(return_values) + + with pytest.raises(AssertionError): + AbstractBenchmark._check_return_values({'function_value': 0}) + + with pytest.raises(AssertionError): + AbstractBenchmark._check_return_values({'cost': 0}) + + +def test_check_return_values_mo(): + return_values = {'function_value': {'obj1': 0, 'obj2': 0}, 'cost': 0} + from hpobench.abstract_benchmark import AbstractMultiObjectiveBenchmark + AbstractMultiObjectiveBenchmark._check_return_values(return_values) From c5f0494da7b4043f388d1133f71d55971a4f04f5 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Philipp=20M=C3=BCller?= Date: Thu, 5 May 2022 15:07:34 +0200 Subject: [PATCH 127/147] Fix Abstract Benchmark tests --- hpobench/abstract_benchmark.py | 6 +++--- tests/test_abstract_benchmark.py | 18 ++++++++++-------- 2 files changed, 13 insertions(+), 11 deletions(-) diff --git a/hpobench/abstract_benchmark.py b/hpobench/abstract_benchmark.py index 86098d0d..57e837c5 100644 --- a/hpobench/abstract_benchmark.py +++ b/hpobench/abstract_benchmark.py @@ -311,7 +311,7 @@ def objective_function(self, configuration: Union[ConfigSpace.Configuration, Dic Must contain at least the key `function_value` and `cost`. Note that `function_value` should be a Dict here. """ - NotImplementedError() + raise NotImplementedError() @abc.abstractmethod def objective_function_test(self, configuration: Union[ConfigSpace.Configuration, Dict], @@ -336,7 +336,7 @@ def objective_function_test(self, configuration: Union[ConfigSpace.Configuration Dict Must contain at least the key `function_value` and `cost`. 
""" - NotImplementedError() + raise NotImplementedError() @staticmethod def _check_return_values(return_values: Dict) -> Dict: @@ -355,4 +355,4 @@ def get_objective_names(): """ Return the names of supported targets """ - NotImplementedError() + raise NotImplementedError() diff --git a/tests/test_abstract_benchmark.py b/tests/test_abstract_benchmark.py index 1f61c969..5c98e613 100644 --- a/tests/test_abstract_benchmark.py +++ b/tests/test_abstract_benchmark.py @@ -2,14 +2,16 @@ from hpobench.abstract_benchmark import AbstractBenchmark, AbstractMultiObjectiveBenchmark -with pytest.raises(NotImplementedError): - AbstractBenchmark.get_configuration_space() -with pytest.raises(NotImplementedError): - AbstractBenchmark.get_fidelity_space() +def test_abstract_benchmark(): + with pytest.raises(NotImplementedError): + AbstractBenchmark.get_configuration_space() -with pytest.raises(NotImplementedError): - AbstractBenchmark.get_meta_information() + with pytest.raises(NotImplementedError): + AbstractBenchmark.get_fidelity_space() -with pytest.raises(NotImplementedError): - AbstractMultiObjectiveBenchmark.get_objective_names() + with pytest.raises(NotImplementedError): + AbstractBenchmark.get_meta_information() + + with pytest.raises(NotImplementedError): + AbstractMultiObjectiveBenchmark.get_objective_names() From 34ccb6675ba683db2dfda2f89212e68a00bd42e2 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Philipp=20M=C3=BCller?= Date: Thu, 5 May 2022 16:04:59 +0200 Subject: [PATCH 128/147] UPDATE changelog.md --- changelog.md | 3 +++ 1 file changed, 3 insertions(+) diff --git a/changelog.md b/changelog.md index 298903ac..818e978b 100644 --- a/changelog.md +++ b/changelog.md @@ -1,6 +1,9 @@ # 0.0.11 * Drop Support for 3.6: Although most of the functionality should still work, we drop the official support for 3.6. + * Add an interface for Multi-Objective Benchmarks. + * Add a check for the return values of the objective_functions + The returned dictionary of the objective functions have to fulfill now some criteria. 
# 0.0.10 * Cartpole Benchmark Version 0.0.4: From 9dde3976911017e52e2ccc4a5ebc0235ec7e487d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Philipp=20M=C3=BCller?= Date: Fri, 6 May 2022 07:07:23 +0200 Subject: [PATCH 129/147] MO Benchmarks: API enhancement (#146) * UPDATE Benchmark API to work with Multi-Objective Benchmarks --- hpobench/container/client_abstract_benchmark.py | 9 +++++++-- hpobench/container/server_abstract_benchmark.py | 8 ++++++++ 2 files changed, 15 insertions(+), 2 deletions(-) diff --git a/hpobench/container/client_abstract_benchmark.py b/hpobench/container/client_abstract_benchmark.py index 4a9eb96c..6bbc3489 100644 --- a/hpobench/container/client_abstract_benchmark.py +++ b/hpobench/container/client_abstract_benchmark.py @@ -22,8 +22,7 @@ import subprocess import time from pathlib import Path -from typing import Optional -from typing import Union, Dict +from typing import Optional, Union, Dict, List, Tuple from uuid import uuid1 import ConfigSpace as CS @@ -512,3 +511,9 @@ def __del__(self): def _id_generator() -> str: """ Helper function: Creates unique socket ids for the benchmark server """ return str(uuid1()) + + +class AbstractMOBenchmarkClient(AbstractBenchmarkClient): + def get_objective_names(self) -> Union[Tuple, List, Dict]: + json_str = self.benchmark.get_objective_names() + return json.loads(json_str, cls=BenchmarkDecoder) diff --git a/hpobench/container/server_abstract_benchmark.py b/hpobench/container/server_abstract_benchmark.py index cad09792..de828300 100644 --- a/hpobench/container/server_abstract_benchmark.py +++ b/hpobench/container/server_abstract_benchmark.py @@ -104,6 +104,14 @@ def get_meta_information(self): logger.debug('Server: get_meta_info called') return json.dumps(self.benchmark.get_meta_information(), indent=None, cls=BenchmarkEncoder) + def get_objective_names(self): + logger.debug('Server: get_objective_names called') + if hasattr(self.benchmark, 'get_objective_names'): + return json.dumps(self.benchmark.get_objective_names(), indent=None, cls=BenchmarkEncoder) + else: + logger.warning('Server: This is not a MO Benchmark. 
The `get_objective_names` function is not implemented.') + return '' + @Pyro4.oneway # in case call returns much later than daemon.shutdown def shutdown(self): logger.debug('Server: Shutting down...') From 812bdd03b4972cfd323634b3465f0ee1ce85c056 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Philipp=20M=C3=BCller?= Date: Mon, 23 May 2022 18:22:30 +0200 Subject: [PATCH 130/147] Update Github Actions (#151) * Update Github Actions Workflow and drop support for singularity < 3.7 --- .../workflows/run_singularity_versions.yml | 28 ++++++++++------ .github/workflows/run_tests.yml | 15 +++++++++ README.md | 11 +++---- ci_scripts/install.sh | 32 ++----------------- ci_scripts/install_singularity.sh | 32 +++++++++++++------ requirements.txt | 1 - 6 files changed, 64 insertions(+), 55 deletions(-) diff --git a/.github/workflows/run_singularity_versions.yml b/.github/workflows/run_singularity_versions.yml index fe576a30..c7862636 100644 --- a/.github/workflows/run_singularity_versions.yml +++ b/.github/workflows/run_singularity_versions.yml @@ -1,6 +1,16 @@ name: Test Support for different Singularity Versions -on: [push] +on: + pull_request: + types: [ready_for_review] + + pull_request_review: + types: [submitted] + + push: + branches: + - 'main' + - 'development' jobs: Tests: @@ -10,25 +20,25 @@ jobs: matrix: include: - python-version: 3.7 - DISPLAY_NAME: "Singularity Container Examples with S3.5" + DISPLAY_NAME: "Singularity Container Examples with S3.7" RUN_CONTAINER_EXAMPLES: true USE_SINGULARITY: false - SINGULARITY_VERSION: "3.5" + SINGULARITY_VERSION: "3.7" - python-version: 3.7 - DISPLAY_NAME: "Singularity Container Examples with S3.6" + DISPLAY_NAME: "Singularity Container Examples with S3.8" RUN_CONTAINER_EXAMPLES: true USE_SINGULARITY: false - SINGULARITY_VERSION: "3.6" + SINGULARITY_VERSION: "3.8" - python-version: 3.7 - DISPLAY_NAME: "Singularity Container Examples with S3.7" + DISPLAY_NAME: "Singularity Container Examples with S3.9" RUN_CONTAINER_EXAMPLES: true USE_SINGULARITY: false - SINGULARITY_VERSION: "3.7" + SINGULARITY_VERSION: "3.9" - python-version: 3.7 - DISPLAY_NAME: "Singularity Container Examples with S3.8" + DISPLAY_NAME: "Singularity Container Examples with S3.10" RUN_CONTAINER_EXAMPLES: true USE_SINGULARITY: false - SINGULARITY_VERSION: "3.8" + SINGULARITY_VERSION: "3.10" fail-fast: false diff --git a/.github/workflows/run_tests.yml b/.github/workflows/run_tests.yml index 3c22a210..4fecec7d 100644 --- a/.github/workflows/run_tests.yml +++ b/.github/workflows/run_tests.yml @@ -15,26 +15,36 @@ jobs: DISPLAY_NAME: "Singularity Tests + CODECOV" RUN_TESTS: true USE_SINGULARITY: true + SINGULARITY_VERSION: "3.8" RUN_CODECOV: true + - python-version: 3.7 DISPLAY_NAME: "Codestyle" RUN_CODESTYLE: true + USE_SINGULARITY: false + - python-version: 3.7 DISPLAY_NAME: "Singularity Container Examples" RUN_CONTAINER_EXAMPLES: true USE_SINGULARITY: true + SINGULARITY_VERSION: "3.8" + - python-version: 3.7 DISPLAY_NAME: "Local Examples" RUN_LOCAL_EXAMPLES: true USE_SINGULARITY: false + - python-version: 3.8 DISPLAY_NAME: "Singularity Tests" RUN_TESTS: true USE_SINGULARITY: true + SINGULARITY_VERSION: "3.8" + - python-version: 3.9 DISPLAY_NAME: "Singularity Tests" RUN_TESTS: true USE_SINGULARITY: true + SINGULARITY_VERSION: "3.8" fail-fast: false name: Tests ${{ matrix.python-version }} ${{ matrix.DISPLAY_NAME }} @@ -42,6 +52,7 @@ jobs: env: RUN_TESTS: ${{ matrix.RUN_TESTS }} USE_SINGULARITY: ${{ matrix.USE_SINGULARITY }} + SINGULARITY_VERSION: ${{ matrix.SINGULARITY_VERSION }} RUN_CODECOV: 
${{ matrix.RUN_CODECOV }} RUN_CODESTYLE: ${{ matrix.RUN_CODESTYLE }} RUN_CONTAINER_EXAMPLES: ${{ matrix.RUN_CONTAINER_EXAMPLES }} @@ -58,6 +69,10 @@ jobs: uses: actions/setup-go@v2 with: go-version: '1.14.15' # The Go version to download (if necessary) and use. + - name: Set up Singularity + if: matrix.USE_SINGULARITY == true + run: | + chmod +x ci_scripts/install_singularity.sh && source ./ci_scripts/install_singularity.sh - name: Install dependencies run: | python -m pip install --upgrade pip diff --git a/README.md b/README.md index b74b1a00..ec0a442e 100644 --- a/README.md +++ b/README.md @@ -54,14 +54,14 @@ cd HPOBench pip install . ``` -**Note:** This does not install *singularity (version 3.6)*. Please follow the steps described here: [user-guide](https://sylabs.io/guides/3.6/user-guide/quick_start.html#quick-installation-steps). +**Note:** This does not install *singularity (version 3.8)*. Please follow the steps described here: [user-guide](https://sylabs.io/guides/3.8/user-guide/quick_start.html#quick-installation-steps). If you run into problems, using the most recent singularity version might help: [here](https://singularity.hpcng.org/admin-docs/master/installation.html) ## Containerized Benchmarks -We provide all benchmarks as containerized versions to (i) isolate their dependencies and (ii) keep them reproducible. Our containerized benchmarks do not rely on external dependencies and thus do not change over time. For this, we rely on [Singularity (version 3.6)](https://sylabs.io/guides/3.6/user-guide/) and for now upload all containers to a [gitlab registry](https://gitlab.tf.uni-freiburg.de/muelleph/hpobench-registry/container_registry) +We provide all benchmarks as containerized versions to (i) isolate their dependencies and (ii) keep them reproducible. Our containerized benchmarks do not rely on external dependencies and thus do not change over time. For this, we rely on [Singularity (version 3.8)](https://sylabs.io/guides/3.8/user-guide/) and for now upload all containers to a [gitlab registry](https://gitlab.tf.uni-freiburg.de/muelleph/hpobench-registry/container_registry) -The only other requirements are: [ConfigSpace](https://github.com/automl/ConfigSpace), *scipy* and *numpy* +The only other requirements are: [ConfigSpace](https://github.com/automl/ConfigSpace), *numpy*, *oslo* and *Pyro4* ### Run a Benchmark Locally @@ -139,10 +139,9 @@ If you use a benchmark in your experiments, please specify the version number of the used container to ensure reproducibility. When starting an experiment, HPOBench writes automatically these two version numbers to the log. ### Troubleshooting and Further Notes - - **Singularity throws an 'Invalid Image format' exception** - Use a singularity version > 3. For users of the Meta-Cluster in Freiburg, you have to set the following path: - ```export PATH=/usr/local/kislurm/singularity-3.5/bin/:$PATH``` + Use a singularity version >= 3.8. If you have multiple singularity installations, you have to add the correct singularity version to your $PATH, e.g. + ```export PATH=/usr/local/kislurm/singularity-3.8/bin/:$PATH``` - **A Benchmark fails with `SystemError: Could not start an instance of the benchmark. 
Retried 5 times` but the container can be started locally with `singularity instance start test`** diff --git a/ci_scripts/install.sh b/ci_scripts/install.sh index b68a1b88..2d229f74 100644 --- a/ci_scripts/install.sh +++ b/ci_scripts/install.sh @@ -40,35 +40,9 @@ else echo "Skip installing packages for local examples" fi -if [[ "$USE_SINGULARITY" == "true" ]]; then - echo "Install Singularity" - - sudo apt-get update && sudo apt-get install -y \ - build-essential \ - libssl-dev \ - uuid-dev \ - libgpgme11-dev \ - squashfs-tools \ - libseccomp-dev \ - wget \ - pkg-config \ - git \ - cryptsetup - - export VERSION=3.5.3 && # adjust this as necessary \ - wget https://github.com/sylabs/singularity/archive/refs/tags/v${VERSION}.tar.gz && \ - tar -xzf v${VERSION}.tar.gz && \ - cd singularity-${VERSION} - - ./mconfig && \ - make -C builddir && \ - sudo make -C builddir install - - cd .. - install_packages="${install_packages}placeholder," -else - echo "Skip installing Singularity" -fi +# We add a placeholder / No-OP operator. When running the container examples, we don't install any +# additional packages. That causes an error, since `pip install .[]` does not work. +install_packages="${install_packages}NOP," # remove the trailing comma install_packages="$(echo ${install_packages} | sed 's/,*\r*$//')" diff --git a/ci_scripts/install_singularity.sh b/ci_scripts/install_singularity.sh index 292df85b..9a89e4a3 100644 --- a/ci_scripts/install_singularity.sh +++ b/ci_scripts/install_singularity.sh @@ -1,6 +1,6 @@ #!/usr/bin/env sh -echo "Install Singularity" +echo "Inside Singularity Installation Script" sudo apt-get update && sudo apt-get install -y \ build-essential \ @@ -14,21 +14,33 @@ sudo apt-get update && sudo apt-get install -y \ git \ cryptsetup -if [[ "$SINGULARITY_VERSION" == "3.5" ]]; then - export VERSION=3.5.3 -elif [[ "$SINGULARITY_VERSION" == "3.6" ]]; then - export VERSION=3.6.4 -elif [[ "$SINGULARITY_VERSION" == "3.7" ]]; then +if [[ "$SINGULARITY_VERSION" == "3.7" ]]; then export VERSION=3.7.3 + export FILENAME=singularity-"${VERSION}" + export EXTRACTED_FILENAME=singularity + elif [[ "$SINGULARITY_VERSION" == "3.8" ]]; then - export VERSION=3.8.0 + export VERSION=3.8.4 + export FILENAME=singularity-ce-"${VERSION}" + export EXTRACTED_FILENAME=singularity-ce-"${VERSION}" + +elif [[ "$SINGULARITY_VERSION" == "3.9" ]]; then + export VERSION=3.9.3 + export FILENAME=singularity-ce-"${VERSION}" + export EXTRACTED_FILENAME=singularity-ce-"${VERSION}" + +elif [[ "$SINGULARITY_VERSION" == "3.10" ]]; then + export VERSION=3.10.0 + export FILENAME=singularity-ce-"${VERSION}" + export EXTRACTED_FILENAME=singularity-ce-"${VERSION}" + else echo "Skip installing Singularity" fi -wget https://github.com/sylabs/singularity/archive/refs/tags/v${VERSION}.tar.gz && \ -tar -xzf v${VERSION}.tar.gz && \ -cd singularity-${VERSION} && \ +wget https://github.com/sylabs/singularity/releases/download/v"${VERSION}"/"${FILENAME}".tar.gz && \ +tar -xzf "${FILENAME}".tar.gz && \ +cd "${EXTRACTED_FILENAME}" && \ ./mconfig && \ make -C builddir && \ sudo make -C builddir install diff --git a/requirements.txt b/requirements.txt index 73ae9818..aad54f85 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,4 +1,3 @@ -scipy>=1.4.1 numpy>=1.18.1 ConfigSpace>=0.4.12 Pyro4==4.80 From 5d3d75ec1da14bb31dc0caffc0f47df1efab3e02 Mon Sep 17 00:00:00 2001 From: Florian Date: Mon, 30 May 2022 13:20:13 +0100 Subject: [PATCH 131/147] Add YAHPO Benchmark (#142) * Add yahpo_gym w help from phmueller Co-authored-by: PhMueller --- 
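As a quick orientation, a minimal usage sketch of the benchmark added by this patch; it mirrors the values used in tests/test_yahpo.py further below and assumes the containerized setup described in the README:

    from hpobench.container.benchmarks.surrogates.yahpo_gym import YAHPOGymBenchmark

    # Scenario, instance and objective are taken from the test added in this patch.
    benchmark = YAHPOGymBenchmark(scenario="lcbench", instance="167152", objective="val_accuracy")
    config = benchmark.get_configuration_space(seed=0).sample_configuration()
    fidelity = benchmark.get_fidelity_space(seed=0).sample_configuration()
    result = benchmark.objective_function(configuration=config, fidelity=fidelity)
    print(result['function_value'], result['cost'])   # scalar objective and surrogate runtime cost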
extra_requirements/yahpo_gym.json | 3 + hpobench/benchmarks/surrogates/yahpo_gym.py | 193 ++++++++++++++++++ .../benchmarks/surrogates/yahpo_gym.py | 20 ++ .../surrogates/Singularity.YAHPOGymBenchmark | 39 ++++ tests/test_yahpo.py | 77 +++++++ 5 files changed, 332 insertions(+) create mode 100644 extra_requirements/yahpo_gym.json create mode 100644 hpobench/benchmarks/surrogates/yahpo_gym.py create mode 100644 hpobench/container/benchmarks/surrogates/yahpo_gym.py create mode 100644 hpobench/container/recipes/surrogates/Singularity.YAHPOGymBenchmark create mode 100644 tests/test_yahpo.py diff --git a/extra_requirements/yahpo_gym.json b/extra_requirements/yahpo_gym.json new file mode 100644 index 00000000..77bea14d --- /dev/null +++ b/extra_requirements/yahpo_gym.json @@ -0,0 +1,3 @@ +{ + "yahpo_gym": ["yahpo_gym@git+https://github.com/pfistfl/yahpo_gym#egg=yahpo_gym&subdirectory=yahpo_gym"] +} diff --git a/hpobench/benchmarks/surrogates/yahpo_gym.py b/hpobench/benchmarks/surrogates/yahpo_gym.py new file mode 100644 index 00000000..19522700 --- /dev/null +++ b/hpobench/benchmarks/surrogates/yahpo_gym.py @@ -0,0 +1,193 @@ +""" +How to use this benchmark: +-------------------------- + +We recommend using the containerized version of this benchmark. +If you want to use this benchmark locally (without running it via the corresponding container), +you need to perform the following steps. + +Prerequisites: +============== +Conda environment in which the HPOBench is installed (pip install .). Activate your environment. +``` +conda activate +``` + +1. Clone from github: +===================== +``` +git clone HPOBench +``` + +2. Clone and install +==================== +``` +cd /path/to/HPOBench +pip install .[yahpo_gym] + +``` + +Changelog: +========== +0.0.1: +* First implementation +""" +import os +import logging +from typing import Union, Dict, List + +import ConfigSpace as CS +import numpy as np + +from yahpo_gym.benchmark_set import BenchmarkSet +from hpobench.abstract_benchmark import AbstractMultiObjectiveBenchmark, AbstractBenchmark + +__version__ = '0.0.1' + +logger = logging.getLogger('YAHPOGym') + + +class YAHPOGymMOBenchmark(AbstractMultiObjectiveBenchmark): + + def __init__(self, scenario: str, instance: str, + rng: Union[np.random.RandomState, int, None] = None): + """ + For a list of available scenarios and instances see + 'https://slds-lmu.github.io/yahpo_gym/scenarios.html' + Parameters + ---------- + scenario : str + Name for the surrogate data. Must be one of ["lcbench", "fcnet", "nb301", "rbv2_svm", + "rbv2_ranger", "rbv2_rpart", "rbv2_glmnet", "rbv2_aknn", "rbv2_xgboost", "rbv2_super"] + instance : str + A valid instance for the scenario. See `self.benchset.instances`. + rng : np.random.RandomState, int, None + """ + + # When in the containerized version, redirect to the data inside the container. 
+ if 'YAHPO_CONTAINER' in os.environ: + from yahpo_gym.local_config import LocalConfiguration + local_config = LocalConfiguration() + local_config.init_config(data_path='/home/data/yahpo_data') + + self.scenario = scenario + self.instance = instance + self.benchset = BenchmarkSet(scenario, active_session=True) + self.benchset.set_instance(instance) + + logger.info(f'Start Benchmark for scenario {scenario} and instance {instance}') + super(YAHPOGymMOBenchmark, self).__init__(rng=rng) + + # pylint: disable=arguments-differ + def get_configuration_space(self, seed: Union[int, None] = None) -> CS.ConfigurationSpace: + return self.benchset.get_opt_space(drop_fidelity_params=True, seed=seed) + + # pylint: disable=arguments-differ + def get_fidelity_space(self, seed: Union[int, None] = None) -> CS.ConfigurationSpace: + return self.benchset.get_fidelity_space(seed=seed) + + @AbstractMultiObjectiveBenchmark.check_parameters + def objective_function(self, configuration: Union[CS.Configuration, Dict], + fidelity: Union[CS.Configuration, Dict, None] = None, + rng: Union[np.random.RandomState, int, None] = None, **kwargs) -> Dict: + + # No batch predicts, so we can grab the first item + out = self.benchset.objective_function({**configuration, **fidelity})[0] + # Convert to float for serialization + out = {k: float(v) for k, v in out.items()} + + # Get runtime name + cost = out[self.benchset.config.runtime_name] + + return {'function_value': out, + "cost": cost, + 'info': {'fidelity': fidelity}} + + @AbstractMultiObjectiveBenchmark.check_parameters + def objective_function_test(self, configuration: Union[CS.Configuration, Dict], + fidelity: Union[CS.Configuration, Dict, None] = None, + rng: Union[np.random.RandomState, int, None] = None, **kwargs) \ + -> Dict: + return self.objective_function(configuration, fidelity=fidelity, rng=rng) + + # pylint: disable=arguments-differ + def get_objective_names(self) -> List[str]: + return self.benchset.config.y_names + + @staticmethod + def get_meta_information(): + """ Returns the meta information for the benchmark """ + return {'name': 'YAHPO Gym', + 'references': ['@misc{pfisterer2021yahpo,', + 'title={YAHPO Gym -- Design Criteria and a new Multifidelity ' + ' Benchmark for Hyperparameter Optimization},', + 'author = {Florian Pfisterer and Lennart Schneider and' + ' Julia Moosbauer and Martin Binder' + ' and Bernd Bischl},', + 'eprint={2109.03670},', + 'archivePrefix={arXiv},', + 'year = {2021}}'], + 'code': 'https://github.com/pfistfl/yahpo_gym/yahpo_gym'} + + +class YAHPOGymBenchmark(AbstractBenchmark): + + def __init__(self, scenario: str, instance: str, objective: str = None, + rng: Union[np.random.RandomState, int, None] = None): + """ + For a list of available scenarios and instances see + 'https://slds-lmu.github.io/yahpo_gym/scenarios.html' + Parameters + ---------- + scenario : str + Name for the surrogate data. Must be one of ["lcbench", "fcnet", "nb301", "rbv2_svm", + "rbv2_ranger", "rbv2_rpart", "rbv2_glmnet", "rbv2_aknn", "rbv2_xgboost", "rbv2_super"] + instance : str + A valid instance for the scenario. See `self.benchset.instances`. + objective : str + Name of the (single-crit) objective. See `self.benchset.config.y_names`. + Initialized to None, picks the first element in y_names. 
+ rng : np.random.RandomState, int, None + """ + + self.backbone = YAHPOGymMOBenchmark(scenario=scenario, instance=instance, rng=rng) + self.objective = objective + + super(YAHPOGymBenchmark, self).__init__(rng=rng) + + @AbstractBenchmark.check_parameters + def objective_function(self, configuration: Union[CS.Configuration, Dict], + fidelity: Union[Dict, CS.Configuration, None] = None, + rng: Union[np.random.RandomState, int, None] = None, **kwargs) -> Dict: + + mo_results = self.backbone.objective_function(configuration=configuration, + fidelity=fidelity, + **kwargs) + + # If not objective is set, we just grab the first returned entry. + if self.objective is None: + self.objective = self.backbone.benchset.config.y_names[0] + + obj_value = mo_results['function_value'][self.objective] + + return {'function_value': obj_value, + "cost": mo_results['cost'], + 'info': {'fidelity': fidelity, 'objectives': mo_results['function_value']}} + + @AbstractBenchmark.check_parameters + def objective_function_test(self, configuration: Union[CS.Configuration, Dict], + fidelity: Union[Dict, CS.Configuration, None] = None, + rng: Union[np.random.RandomState, int, None] = None, **kwargs) -> Dict: + return self.objective_function(configuration, fidelity=fidelity, rng=rng) + + # pylint: disable=arguments-differ + def get_configuration_space(self, seed: Union[int, None] = None) -> CS.ConfigurationSpace: + return self.backbone.get_configuration_space(seed=seed) + + # pylint: disable=arguments-differ + def get_fidelity_space(self, seed: Union[int, None] = None) -> CS.ConfigurationSpace: + return self.backbone.get_fidelity_space(seed=seed) + + @staticmethod + def get_meta_information() -> Dict: + return YAHPOGymMOBenchmark.get_meta_information() diff --git a/hpobench/container/benchmarks/surrogates/yahpo_gym.py b/hpobench/container/benchmarks/surrogates/yahpo_gym.py new file mode 100644 index 00000000..9774975d --- /dev/null +++ b/hpobench/container/benchmarks/surrogates/yahpo_gym.py @@ -0,0 +1,20 @@ +#!/usr/bin/python3 +# -*- coding: utf-8 -*- + +from hpobench.container.client_abstract_benchmark import AbstractBenchmarkClient, AbstractMOBenchmarkClient + + +class YAHPOGymBenchmark(AbstractBenchmarkClient): + def __init__(self, **kwargs): + kwargs['benchmark_name'] = kwargs.get('benchmark_name', 'YAHPOGymBenchmark') + kwargs['container_name'] = kwargs.get('container_name', 'yahpo_gym') + kwargs['latest'] = kwargs.get('container_tag', '0.0.1') + super(YAHPOGymBenchmark, self).__init__(**kwargs) + + +class YAHPOGymMOBenchmark(AbstractMOBenchmarkClient): + def __init__(self, **kwargs): + kwargs['benchmark_name'] = kwargs.get('benchmark_name', 'YAHPOGymMOBenchmark') + kwargs['container_name'] = kwargs.get('container_name', 'yahpo_gym') + kwargs['latest'] = kwargs.get('container_tag', '0.0.1') + super(YAHPOGymMOBenchmark, self).__init__(**kwargs) diff --git a/hpobench/container/recipes/surrogates/Singularity.YAHPOGymBenchmark b/hpobench/container/recipes/surrogates/Singularity.YAHPOGymBenchmark new file mode 100644 index 00000000..66ee63b1 --- /dev/null +++ b/hpobench/container/recipes/surrogates/Singularity.YAHPOGymBenchmark @@ -0,0 +1,39 @@ +Bootstrap: docker +From: python:3.7-slim + +%labels +MAINTAINER pfistererf@googlemail.com +VERSION v0.0.1 + +%help + This is a template for a Singularity recipe + +%environment + YAHPO_CONTAINER=1 + export YAHPO_CONTAINER + +%post + apt update -y + apt install build-essential git wget -y + + /usr/local/bin/python -m pip install --upgrade pip + + cd /home \ + && mkdir data && cd 
data \ + && git clone --depth 1 -b main https://github.com/pfistfl/yahpo_data.git\ + + cd /home \ + && git clone https://github.com/pfistfl/HPOBench.git \ + && cd HPOBench \ + && echo "Please never push a recipe that checks out any other branch than development or master" \ + && git checkout master \ + && pip install .[yahpo_gym] \ + && echo "Please don't touch the following lines" \ + && cd / \ + && mkdir /var/lib/hpobench/ \ + && chmod -R 777 /var/lib/hpobench/ \ + && rm -rf /var/lib/apt/lists/* \ + && pip cache purge \ + +%runscript + python -s /home/HPOBench/hpobench/container/server_abstract_benchmark.py surrogates.yahpo_gym $@ diff --git a/tests/test_yahpo.py b/tests/test_yahpo.py new file mode 100644 index 00000000..97a7d06d --- /dev/null +++ b/tests/test_yahpo.py @@ -0,0 +1,77 @@ +import sys +from typing import Dict, List + +import pytest + +from hpobench.container.benchmarks.surrogates.yahpo_gym import YAHPOGymBenchmark, YAHPOGymMOBenchmark + + +def test_yahpo_init(): + b = YAHPOGymBenchmark(scenario="lcbench", instance="167152", objective="val_accuracy") + + fs = b.get_fidelity_space(seed=0) + fidelity = fs.sample_configuration().get_dictionary() + assert isinstance(fidelity, Dict) + + cs = b.get_configuration_space(seed=0) + config = cs.sample_configuration().get_dictionary() + + # Some tests are dependent on the python version. + if sys.version.startswith('3.9'): + assert fidelity['epoch'] == pytest.approx(29, abs=0.001) + assert config['OpenML_task_id'] == "167152" + assert config['num_layers'] == pytest.approx(4, abs=0.001) + assert config['max_units'] == pytest.approx(289, abs=0.0001) + assert config['weight_decay'] == pytest.approx(0.04376, abs=0.001) + assert config['learning_rate'] == pytest.approx(0.01398, abs=0.0001) + assert config['batch_size'] == pytest.approx(106, abs=0.001) + + constant_fidelity = {'epoch': 29} + constant_config = { + 'OpenML_task_id': '167152', 'batch_size': 106, 'learning_rate': 0.013981961408994055, + 'max_dropout': 0.6027633760716439, 'max_units': 289, 'momentum': 0.47705277141162516, + 'num_layers': 4, 'weight_decay': 0.04376434525415663 + } + + result = b.objective_function(configuration=constant_config, fidelity=constant_fidelity) + assert result['function_value'] == pytest.approx(61.297, abs=0.1) + assert result['cost'] == pytest.approx(119.4965, abs=0.1) + assert isinstance(result['info'], Dict) + + +def test_yahpo_mo(): + b = YAHPOGymMOBenchmark(scenario="lcbench", instance="167152") + + fs = b.get_fidelity_space(seed=0) + fidelity = fs.sample_configuration().get_dictionary() + assert isinstance(fidelity, Dict) + + cs = b.get_configuration_space(seed=0) + config = cs.sample_configuration().get_dictionary() + + # Some tests are dependent on the python version. 
+ if sys.version.startswith('3.9'): + assert fidelity['epoch'] == pytest.approx(29, abs=0.001) + assert config['OpenML_task_id'] == "167152" + assert config['num_layers'] == pytest.approx(4, abs=0.001) + assert config['max_units'] == pytest.approx(289, abs=0.0001) + assert config['weight_decay'] == pytest.approx(0.04376, abs=0.001) + assert config['learning_rate'] == pytest.approx(0.01398, abs=0.0001) + assert config['batch_size'] == pytest.approx(106, abs=0.001) + + constant_fidelity = {'epoch': 29} + constant_config = { + 'OpenML_task_id': '167152', 'batch_size': 106, 'learning_rate': 0.013981961408994055, + 'max_dropout': 0.6027633760716439, 'max_units': 289, 'momentum': 0.47705277141162516, + 'num_layers': 4, 'weight_decay': 0.04376434525415663 + } + + result = b.objective_function(configuration=constant_config, fidelity=constant_fidelity) + assert isinstance(result['function_value'], Dict) + assert result['function_value']['val_accuracy'] == pytest.approx(61.2971, abs=0.0001) + assert result['cost'] == pytest.approx(119.4965, abs=0.0001) + + names = b.get_objective_names() + assert isinstance(names, List) + assert len(names) == 6 + assert names[2] == 'val_cross_entropy' From 1544e1f62b620628323818da41eee84e5a9dce21 Mon Sep 17 00:00:00 2001 From: neeratyoy Date: Mon, 30 May 2022 15:09:00 +0200 Subject: [PATCH 132/147] Updating tabular benchmark data for ML benchmarks --- hpobench/util/data_manager.py | 10 +++++----- requirements.txt | 6 +++++- 2 files changed, 10 insertions(+), 6 deletions(-) diff --git a/hpobench/util/data_manager.py b/hpobench/util/data_manager.py index 479244f9..09b37445 100644 --- a/hpobench/util/data_manager.py +++ b/hpobench/util/data_manager.py @@ -43,11 +43,11 @@ tabular_multi_fidelity_urls = dict( - xgb="https://figshare.com/ndownloader/files/30469920", - svm="https://figshare.com/ndownloader/files/30379359", - lr="https://figshare.com/ndownloader/files/30379038", - rf="https://figshare.com/ndownloader/files/30469089", - nn="https://figshare.com/ndownloader/files/30379005" + xgb="https://figshare.com/ndownloader/files/35414756", + svm="https://figshare.com/ndownloader/files/35414447", + lr="https://figshare.com/ndownloader/files/35412425", + rf="https://figshare.com/ndownloader/files/35414801", + nn="https://figshare.com/ndownloader/files/35414996" ) class DataManager(abc.ABC, metaclass=abc.ABCMeta): diff --git a/requirements.txt b/requirements.txt index 73ae9818..cdd06606 100644 --- a/requirements.txt +++ b/requirements.txt @@ -2,4 +2,8 @@ scipy>=1.4.1 numpy>=1.18.1 ConfigSpace>=0.4.12 Pyro4==4.80 -oslo.concurrency>=4.2.0 \ No newline at end of file +oslo.concurrency>=4.2.0 +pandas>=1.3.5 +scikit-learn>=0.24.1 +openml>=0.12.2 +tqdm>=4.64.0 \ No newline at end of file From 225a82bab8f617efe8e181c416a99e206b872bdd Mon Sep 17 00:00:00 2001 From: neeratyoy Date: Mon, 30 May 2022 15:14:34 +0200 Subject: [PATCH 133/147] version update for tabular benchmark --- hpobench/benchmarks/ml/tabular_benchmark.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/hpobench/benchmarks/ml/tabular_benchmark.py b/hpobench/benchmarks/ml/tabular_benchmark.py index 28945940..342766b4 100644 --- a/hpobench/benchmarks/ml/tabular_benchmark.py +++ b/hpobench/benchmarks/ml/tabular_benchmark.py @@ -6,6 +6,8 @@ * First implementation of the Tabular Benchmark. 0.0.2: * Restructuring for consistency and to match ML Benchmark Template updates. +0.0.3: +* Adding Learning Curve support. 
""" from pathlib import Path @@ -19,7 +21,7 @@ from hpobench.dependencies.ml.ml_benchmark_template import metrics from hpobench.util.data_manager import TabularDataManager -__version__ = '0.0.2' +__version__ = '0.0.3' class TabularBenchmark(AbstractBenchmark): From ac9547a8dc53f006c79147ef56b363d0ffb9cde1 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Philipp=20M=C3=BCller?= Date: Mon, 30 May 2022 16:40:27 +0200 Subject: [PATCH 134/147] ADD Multi-Objective Nasbench201 (v0.0.6) (#152) Update the Nasbench201 benchmark to support Multi-Objective queries. If you want to use the *single objective* Nasbench201 benchmark, you can query the SO version of this benchmark. Although, we have not changed the benchmark logic, you can also use the container v0.0.5 in your experiments to reproduce results from the old version of this benchmark. --- hpobench/benchmarks/nas/nasbench_201.py | 398 ++++++++++++++++-- .../container/benchmarks/nas/nasbench_201.py | 29 +- tests/test_nasbench_201.py | 92 ++-- 3 files changed, 446 insertions(+), 73 deletions(-) diff --git a/hpobench/benchmarks/nas/nasbench_201.py b/hpobench/benchmarks/nas/nasbench_201.py index 17bac321..0c2324c2 100644 --- a/hpobench/benchmarks/nas/nasbench_201.py +++ b/hpobench/benchmarks/nas/nasbench_201.py @@ -27,6 +27,10 @@ Changelog: ========== +0.0.6 +* Add the multiobjective version of this benchmark by returning flops, model size, latency and missclassification rate +* Integrate #138: Improve the docstrings about the seeds. + 0.0.5 * Add for each benchmark a new one with a different fidelity space. The new fidelity space corresponds to the fidelity space in the DEHB paper. @@ -54,16 +58,18 @@ import numpy as np import hpobench.util.rng_helper as rng_helper -from hpobench.abstract_benchmark import AbstractBenchmark +from hpobench.abstract_benchmark import AbstractBenchmark, AbstractMultiObjectiveBenchmark + from hpobench.util.data_manager import NASBench_201Data -__version__ = '0.0.5' + +__version__ = '0.0.6' MAX_NODES = 4 logger = logging.getLogger('NASBENCH201') -class NasBench201BaseBenchmark(AbstractBenchmark): +class NasBench201BaseMOBenchmark(AbstractMultiObjectiveBenchmark): def __init__(self, dataset: str, rng: Union[np.random.RandomState, int, None] = None, **kwargs): """ @@ -129,6 +135,8 @@ def __init__(self, dataset: str, - In the original data, the training splits are always marked with the key 'train' but they use different identifiers to refer to the available evaluation splits. We report them also in the table below. - We exclude the data set cifar10 from this benchmark. + - In NasBench201, not all architectures have values for the three seeds. To increase robustness, we have patched + missing values with the values from an available seed. Some further remarks: - cifar10-valid is trained on the train split and tested on the validation split. @@ -145,13 +153,13 @@ def __init__(self, dataset: str, Random seed for the benchmark's random state. 
""" # noqa: E501 - super(NasBench201BaseBenchmark, self).__init__(rng=rng) + super(NasBench201BaseMOBenchmark, self).__init__(rng=rng) data_manager = NASBench_201Data(dataset=dataset) self.dataset = dataset self.data = data_manager.load() - self.config_to_structure = NasBench201BaseBenchmark.config_to_structure_func(max_nodes=MAX_NODES) + self.config_to_structure = NasBench201BaseMOBenchmark.config_to_structure_func(max_nodes=MAX_NODES) def dataset_mapping(self, dataset): mapping = {'cifar10-valid': ('x-valid', 'ori-test'), @@ -160,7 +168,7 @@ def dataset_mapping(self, dataset): return mapping[dataset] # pylint: disable=arguments-differ - @AbstractBenchmark.check_parameters + @AbstractMultiObjectiveBenchmark.check_parameters def objective_function(self, configuration: Union[CS.Configuration, Dict], fidelity: Union[Dict, CS.Configuration, None] = None, rng: Union[np.random.RandomState, int, None] = None, @@ -205,7 +213,15 @@ def objective_function(self, configuration: Union[CS.Configuration, Dict], Returns ------- Dict - - function_value : training precision + function_value : Dict + misclassification_rate : float + 1 - validation accuracy + num_flops : float + Number of floating point operations in M + model_size : float + Model size in MB + latency : float + Time to evaluate a configuration in seconds cost : time to train the network info : Dict train_precision : float @@ -264,22 +280,38 @@ def objective_function(self, configuration: Union[CS.Configuration, Dict], test_times = [np.sum((self.data[seed][structure_str]['eval_times'][f'{test_key}@{199}']) for e in range(1, epoch + 1)) for seed in data_seed] - return {'function_value': float(100 - np.mean(valid_accuracies)), - 'cost': float(np.sum(valid_times) + np.sum(train_times)), - 'info': {'train_precision': float(100 - np.mean(train_accuracies)), - 'train_losses': float(np.mean(train_losses)), - 'train_cost': float(np.sum(train_times)), - 'valid_precision': float(100 - np.mean(valid_accuracies)), - 'valid_losses': float(np.mean(valid_losses)), - 'valid_cost': float(np.sum(valid_times) + np.sum(train_times)), - 'test_precision': float(100 - np.mean(test_accuracies)), - 'test_losses': float(np.mean(test_losses)), - 'test_cost': float(np.sum(train_times)) + float(np.sum(test_times)), - 'fidelity': fidelity - } - } - - @AbstractBenchmark.check_parameters + # Number of floating point operations in million + num_flops = [self.data[seed][structure_str]['flop'] for seed in data_seed] + + # Number of trainable model parameters in MB + model_size = [self.data[seed][structure_str]['params'] for seed in data_seed] + + # Time to evaluate in seconds + latency = [self.data[seed][structure_str]['latency'] for seed in data_seed] + + return { + 'function_value': { + 'misclassification_rate': float(100 - np.mean(valid_accuracies)), + 'num_flops': float(np.mean(num_flops)), + 'model_size': float(np.mean(model_size)), + 'latency': float(np.mean(latency)), + }, + 'cost': float(np.sum(valid_times) + np.sum(train_times)), + 'info': { + 'train_precision': float(100 - np.mean(train_accuracies)), + 'train_losses': float(np.mean(train_losses)), + 'train_cost': float(np.sum(train_times)), + 'valid_precision': float(100 - np.mean(valid_accuracies)), + 'valid_losses': float(np.mean(valid_losses)), + 'valid_cost': float(np.sum(valid_times) + np.sum(train_times)), + 'test_precision': float(100 - np.mean(test_accuracies)), + 'test_losses': float(np.mean(test_losses)), + 'test_cost': float(np.sum(train_times)) + float(np.sum(test_times)), + 'fidelity': fidelity + } + } + 
+ @AbstractMultiObjectiveBenchmark.check_parameters def objective_function_test(self, configuration: Union[CS.Configuration, Dict], fidelity: Union[Dict, None] = None, rng: Union[np.random.RandomState, int, None] = None, @@ -294,10 +326,9 @@ def objective_function_test(self, configuration: Union[CS.Configuration, Dict], ---------- configuration fidelity: Dict, None - epoch: int - Values: [1, 200] + epoch: int - Values: [200] Number of epochs an architecture was trained. - Note: the number of epoch is 1 indexed. (Results after the first epoch: epoch = 1) - + Note: We only have test performance on the last epoch. Fidelity parameters, check get_fidelity_space(). Uses default (max) value if None. rng : np.random.RandomState, int, None Random seed to use in the benchmark. @@ -311,7 +342,15 @@ def objective_function_test(self, configuration: Union[CS.Configuration, Dict], Returns ------- Dict - - function_value : evaluation precision + function_value : Dict + misclassification_rate : float + 1 - test accuracy + num_flops : float + Number of floating point operations in M + model_size : float + Model size in MB + latency : float + Time to evaluate a configuration in seconds cost : time to the network + time to validate info : Dict train_precision @@ -327,10 +366,19 @@ def objective_function_test(self, configuration: Union[CS.Configuration, Dict], # to test and the corresponding time cost assert fidelity['epoch'] == 200, 'Only test data for the 200. epoch is available. ' + if 'data_seed' in kwargs: + all_seeds_available = all([seed in kwargs['data_seed'] for seed in (777, 888, 999)]) + if not all_seeds_available: + logger.warning('You have not specified all available seeds for the ' + '`objective_function_test`. However, we are going to ignore them, ' + ' because we report test values only as mean across all seeds.' 
+ f' Your given seeds: {kwargs["seed"]}') + del kwargs['data_seed'] + result = self.objective_function(configuration=configuration, fidelity=fidelity, data_seed=(777, 888, 999), rng=rng, **kwargs) - result['function_value'] = result['info']['test_precision'] + result['function_value']['misclassification_rate'] = result['info']['test_precision'] result['cost'] = result['info']['test_cost'] return result @@ -349,7 +397,7 @@ def config_to_structure(config): op_name = config[node_str] x_list.append((op_name, j)) genotypes.append(tuple(x_list)) - return NasBench201BaseBenchmark._Structure(genotypes) + return NasBench201BaseMOBenchmark._Structure(genotypes) return config_to_structure @staticmethod @@ -387,7 +435,7 @@ def get_configuration_space(seed: Union[int, None] = None) -> CS.ConfigurationSp seed = seed if seed is not None else np.random.randint(1, 100000) cs = CS.ConfigurationSpace(seed=seed) - search_space = NasBench201BaseBenchmark.get_search_spaces('cell', 'nas-bench-201') + search_space = NasBench201BaseMOBenchmark.get_search_spaces('cell', 'nas-bench-201') hps = [CS.CategoricalHyperparameter(f'{i}<-{j}', search_space) for i in range(1, MAX_NODES) for j in range(i)] cs.add_hyperparameters(hps) return cs @@ -420,6 +468,10 @@ def get_fidelity_space(seed: Union[int, None] = None) -> CS.ConfigurationSpace: return fidel_space + @staticmethod + def get_objective_names() -> List[str]: + return ['misclassification_rate', 'num_flops', 'model_size', 'latency'] + @staticmethod def get_meta_information() -> Dict: """ Returns the meta information for the benchmark """ @@ -471,25 +523,296 @@ def __getitem__(self, index): return self.nodes[index] -class Cifar10ValidNasBench201Benchmark(NasBench201BaseBenchmark): +class Cifar10ValidNasBench201MOBenchmark(NasBench201BaseMOBenchmark): + + def __init__(self, rng: Union[np.random.RandomState, int, None] = None, **kwargs): + super(Cifar10ValidNasBench201MOBenchmark, self).__init__(dataset='cifar10-valid', rng=rng, **kwargs) + + +class Cifar100NasBench201MOBenchmark(NasBench201BaseMOBenchmark): + + def __init__(self, rng: Union[np.random.RandomState, int, None] = None, **kwargs): + super(Cifar100NasBench201MOBenchmark, self).__init__(dataset='cifar100', rng=rng, **kwargs) + + +class ImageNetNasBench201MOBenchmark(NasBench201BaseMOBenchmark): + + def __init__(self, rng: Union[np.random.RandomState, int, None] = None, **kwargs): + super(ImageNetNasBench201MOBenchmark, self).__init__(dataset='ImageNet16-120', rng=rng, **kwargs) + + +class NasBench201SOBenchmark(AbstractBenchmark): + def __init__(self, dataset: str, + rng: Union[np.random.RandomState, int, None] = None, **kwargs): + """ + Benchmark interface to the NASBench201 Benchmarks. The NASBench201 contains + results for architectures on 4 different data sets. + + We have split the "api" file from NASBench201 in separate files per data set. + The original "api" file contains all data sets, but loading this single file took too much RAM. + + We recommend to not call this base class directly but using the correct subclass below. + + The parameter ``dataset`` indicates which data set was used for training. + + For each data set the metrics + 'train_acc1es', 'train_losses', 'train_times', 'eval_acc1es', 'eval_times', 'eval_losses' are available. + However, the data sets report them on different data splits (train, train + valid, test, valid or test+valid). + + We summarize all information about the data sets in the following tables. 
+ + Datastet Metric Avail.Epochs Explanation returned by HPOBENCH + ---------------------------------------------------------------------------------------- + cifar10-valid train [0-199] training set + cifar10-valid x-valid [0-199] validation set objective function + cifar10-valid x-test + cifar10-valid ori-test 199 test set objective function test + + cifar100 train [0-199] training set + cifar100 x-valid 199 validation set + cifar100 x-test 199 test set objective function test + cifar100 ori-test [0-199] validation + test set objective function + + ImageNet16-120 train [0-199] training set + ImageNet16-120 x-valid 199 validation set + ImageNet16-120 x-test 199 test set objective function test + ImageNet16-120 ori-test [0-199] validation + test set objective function + + + We have also extracted the incumbents per split. We report the incumbent accuracy and loss performance + i) by taking the maximum value across all seeds and configurations + ii) averaged across the three available seeds + + i) The best possible incumbents (NO AVG!) ii) The "average" incumbent + Datastet Metric (Index of Arch, Accuracy) (Index, Loss) (Index of Arch, Accuracy) (Index, Loss) + ---------------------------------------------------------------------------------------------------------------------------------------------------------- + cifar10-valid train (258, 100.0) (2778, 0.001179278278425336) (10154, 100) (2778, 0.0013082386429297428) + cifar10-valid x-valid (6111, 91.71999999023437) (14443, 0.3837750501537323) (6111, 91.60666665039064) (3888, 0.3894046771335602) + cifar10-valid x-test + cifar10-valid ori-test (14174, 91.65) (3385, 0.3850496160507202) (1459, 91.52333333333333) (3385, 0.3995230517864227) + + cifar100 train (9930, 99.948) (9930, 0.012630240231156348) (9930, 99.93733333333334) (9930, 0.012843489621082942) + cifar100 x-valid (13714, 73.71999998779297) (13934, 1.1490126512527465) (9930, 73.4933333577474) (7361, 1.1600867895126343) + cifar100 x-test (1459, 74.28000004882813) (15383, 1.1427113876342774) (9930, 73.51333332112631) (7337, 1.1747569534301758) + cifar100 ori-test (9930, 73.88) (13706, 1.1610547459602356) (9930, 73.50333333333333) (7361, 1.1696554500579834) + + ImageNet16-120 train (9930, 73.2524719841793) (9930, 0.9490517352046979) (9930, 73.22918040138735) (9930, 0.9524298415108582) + ImageNet16-120 x-valid (13778, 47.39999985758463) (10721, 2.0826991437276203) (10676, 46.73333327229818) (10721, 2.0915397168795264) + ImageNet16-120 x-test (857, 48.03333317057292) (12887, 2.0940088628133138) (857, 47.31111100599501) (11882, 2.106453532218933) + ImageNet16-120 ori-test (857, 47.083333353678384) (11882, 2.0950548852284747) (857, 46.8444444647895) (11882, 2.1028235816955565) + + + Note: + - The parameter epoch is 0 indexed! + - In the original data, the training splits are always marked with the key 'train' but they use different + identifiers to refer to the available evaluation splits. We report them also in the table below. + - We exclude the data set cifar10 from this benchmark. + - In NasBench201, not all architectures have values for the three seeds. To increase robustness, we have patched + missing values with the values from an available seed. + + Some further remarks: + - cifar10-valid is trained on the train split and tested on the validation split. + - The train metrics are dictionaries with epochs (e.g. 0, 1, 2) as key and the metric as value. + The evaluation metrics, however, have as key the identifiers, e.g. ori-test@0, with 0 indicating the epoch. 
+ Also, each data set reports values for all 200 epochs for a metric on the specified split + and a single value on the 200th epoch for the other splits. + + Parameters + ---------- + dataset : str + One of cifar10-valid, cifar10, cifar100, ImageNet16-120. + rng : np.random.RandomState, int, None + Random seed for the benchmark's random state. + """ # noqa: E501 + + super(NasBench201SOBenchmark, self).__init__(rng=rng, **kwargs) + self.mo_benchmark = NasBench201BaseMOBenchmark(rng=rng, dataset=dataset, **kwargs) + + # pylint: disable=arguments-differ + @AbstractBenchmark.check_parameters + def objective_function(self, configuration: Union[CS.Configuration, Dict], + fidelity: Union[Dict, CS.Configuration, None] = None, + rng: Union[np.random.RandomState, int, None] = None, + data_seed: Union[List, Tuple, int, None] = (777, 888, 999), + **kwargs) -> Dict: + """ + Objective function for the NASBench201 benchmark. + This functions sends a query to NASBench201 and evaluates the configuration. + As already explained in the class definition, different data sets are trained on different splits. + + The table above gives a detailed summary over the available splits, epochs, and which identifier are used per + dataset. + + Parameters + ---------- + configuration + fidelity: Dict, None + epoch: int - Values: [1, 200] + Number of epochs an architecture was trained. + Note: the number of epoch is 1 indexed! (Results after the first epoch: epoch = 1) + + Fidelity parameters, check get_fidelity_space(). Uses default (max) value if None. + rng : np.random.RandomState, int, None + Random seed to use in the benchmark. + + To prevent overfitting on a single seed, it is possible to pass a + parameter ``rng`` as 'int' or 'np.random.RandomState' to this function. + If this parameter is not given, the default random state is used. + data_seed : List, Tuple, None, int + The nasbench_201 benchmark include for each run 3 different seeds: 777, 888, 999. + The user can specify which seed to use. If more than one seed is given, the results are averaged + across the seeds but then the training time is the sum of the costs per seed. + When this value is explicitly set to `None`, the function will chose randomly one out of [777, 888, 999]. + + Note: + For some architectures (configurations) no run was available. We've set missing values to an + available value from another seed. Therefore, it is possible that run results are exactly the same for + different seeds. + + kwargs + + Returns + ------- + Dict - + function_value : training precision + cost : time to train the network + info : Dict + train_precision : float + train_losses : float + train_cost : float + Time needed to train the network for 'epoch' many epochs. If more than one seed is given, + this field is the sum of the training time per network + eval_precision : float + eval_losses : float + eval_cost : float + Time needed to train the network for 'epoch many epochs plus the time to evaluate the network on the + evaluation split. 
If more than one seed is given, this field is the sum of the eval cost per network + fidelity : Dict + used fidelities in this evaluation + """ + results = self.mo_benchmark.objective_function( + configuration=configuration, fidelity=fidelity, rng=rng, data_seed=data_seed, **kwargs + ) + + results['function_value'] = results['function_value']['misclassification_rate'] + return results + + @AbstractBenchmark.check_parameters + def objective_function_test(self, configuration: Union[CS.Configuration, Dict], + fidelity: Union[Dict, None] = None, + rng: Union[np.random.RandomState, int, None] = None, + **kwargs) -> Dict: + """ + Get the validated results from the NASBench201. Runs a given configuration on the largest budget (here: 200). + The test function uses all data set seeds (777, 888, 999). + + See also :py:meth:`~hpobench.benchmarks.nas.nasbench_201.objective_function` + + Parameters + ---------- + configuration + fidelity: Dict, None + epoch: int - Values: [1, 200] + Number of epochs an architecture was trained. + Note: the number of epoch is 1 indexed. (Results after the first epoch: epoch = 1) + + Fidelity parameters, check get_fidelity_space(). Uses default (max) value if None. + rng : np.random.RandomState, int, None + Random seed to use in the benchmark. + + To prevent overfitting on a single seed, it is possible to pass a + parameter ``rng`` as 'int' or 'np.random.RandomState' to this function. + If this parameter is not given, the default random state is used. + + kwargs + + Returns + ------- + Dict - + function_value : evaluation precision + cost : time to the network + time to validate + info : Dict + train_precision + train_losses + train_cost + eval_precision + eval_losses + eval_cost + fidelity : used fidelities in this evaluation + """ + + results = self.mo_benchmark.objective_function_test( + configuration=configuration, fidelity=fidelity, rng=rng, **kwargs + ) + + results['function_value'] = results['function_value']['misclassification_rate'] + return results + + @staticmethod + def get_configuration_space(seed: Union[int, None] = None) -> CS.ConfigurationSpace: + """ + Return the CS representation of the search space. + From https://github.com/D-X-Y/AutoDL-Projects/blob/master/exps/algos/BOHB.py + Author: https://github.com/D-X-Y [Xuanyi.Dong@student.uts.edu.au] + + Parameters + ---------- + seed : int, None + Random seed for the configuration space. + + Returns + ------- + CS.ConfigurationSpace - + Containing the benchmark's hyperparameter + """ + return NasBench201BaseMOBenchmark.get_configuration_space(seed=seed) + + @staticmethod + def get_fidelity_space(seed: Union[int, None] = None) -> CS.ConfigurationSpace: + """ + Creates a ConfigSpace.ConfigurationSpace containing all fidelity parameters for + the NAS Benchmark 201. + + Fidelities: + - epoch: int + The loss / accuracy at `epoch`. Can be from 0 to 199. 
+ + Parameters + ---------- + seed : int, None + Fixing the seed for the ConfigSpace.ConfigurationSpace + + Returns + ------- + ConfigSpace.ConfigurationSpace + """ + return NasBench201BaseMOBenchmark.get_fidelity_space(seed=seed) + + @staticmethod + def get_meta_information() -> Dict: + """ Returns the meta information for the benchmark """ + return NasBench201BaseMOBenchmark.get_meta_information() + + +class Cifar10ValidNasBench201Benchmark(NasBench201SOBenchmark): def __init__(self, rng: Union[np.random.RandomState, int, None] = None, **kwargs): super(Cifar10ValidNasBench201Benchmark, self).__init__(dataset='cifar10-valid', rng=rng, **kwargs) -class Cifar100NasBench201Benchmark(NasBench201BaseBenchmark): +class Cifar100NasBench201Benchmark(NasBench201SOBenchmark): def __init__(self, rng: Union[np.random.RandomState, int, None] = None, **kwargs): super(Cifar100NasBench201Benchmark, self).__init__(dataset='cifar100', rng=rng, **kwargs) -class ImageNetNasBench201Benchmark(NasBench201BaseBenchmark): +class ImageNetNasBench201Benchmark(NasBench201SOBenchmark): def __init__(self, rng: Union[np.random.RandomState, int, None] = None, **kwargs): super(ImageNetNasBench201Benchmark, self).__init__(dataset='ImageNet16-120', rng=rng, **kwargs) -class _NasBench201BaseBenchmarkOriginal(NasBench201BaseBenchmark): +class _NasBench201BaseBenchmarkOriginal(NasBench201SOBenchmark): @staticmethod def get_fidelity_space(seed: Union[int, None] = None) -> CS.ConfigurationSpace: @@ -528,7 +851,7 @@ def get_fidelity_space(seed: Union[int, None] = None) -> CS.ConfigurationSpace: @staticmethod def get_meta_information() -> Dict: """ Returns the meta information for the benchmark """ - meta_information = NasBench201BaseBenchmark.get_meta_information() + meta_information = NasBench201SOBenchmark.get_meta_information() meta_information['note'] = \ 'This version of the benchmark implements the fidelity space defined in the DEHB paper.' 
\ 'See [DEHB](https://github.com/automl/DEHB/tree/937dd5cf48e79f6d587ea2ff408cb5ad9a8dce46/dehb/examples)' @@ -558,4 +881,7 @@ def __init__(self, rng: Union[np.random.RandomState, int, None] = None, **kwargs "ImageNetNasBench201Benchmark", "Cifar10ValidNasBench201BenchmarkOriginal", "Cifar100NasBench201BenchmarkOriginal", - "ImageNetNasBench201BenchmarkOriginal"] + "ImageNetNasBench201BenchmarkOriginal", + "Cifar10ValidNasBench201MOBenchmark", + "Cifar100NasBench201MOBenchmark", + "ImageNetNasBench201MOBenchmark"] diff --git a/hpobench/container/benchmarks/nas/nasbench_201.py b/hpobench/container/benchmarks/nas/nasbench_201.py index 5eb9c68f..2a948c6b 100644 --- a/hpobench/container/benchmarks/nas/nasbench_201.py +++ b/hpobench/container/benchmarks/nas/nasbench_201.py @@ -54,9 +54,36 @@ def __init__(self, **kwargs): super(ImageNetNasBench201BenchmarkOriginal, self).__init__(**kwargs) +class Cifar10ValidNasBench201MOBenchmark(AbstractBenchmarkClient): + def __init__(self, **kwargs): + kwargs['benchmark_name'] = kwargs.get('benchmark_name', 'Cifar10ValidNasBench201MOBenchmark') + kwargs['container_name'] = kwargs.get('container_name', 'nasbench_201') + kwargs['latest'] = kwargs.get('container_tag', '0.0.6') + super(Cifar10ValidNasBench201MOBenchmark, self).__init__(**kwargs) + + +class Cifar100NasBench201MOBenchmark(AbstractBenchmarkClient): + def __init__(self, **kwargs): + kwargs['benchmark_name'] = kwargs.get('benchmark_name', 'Cifar100NasBench201MOBenchmark') + kwargs['container_name'] = kwargs.get('container_name', 'nasbench_201') + kwargs['latest'] = kwargs.get('container_tag', '0.0.6') + super(Cifar100NasBench201MOBenchmark, self).__init__(**kwargs) + + +class ImageNetNasBench201MOBenchmark(AbstractBenchmarkClient): + def __init__(self, **kwargs): + kwargs['benchmark_name'] = kwargs.get('benchmark_name', 'ImageNetNasBench201MOBenchmark') + kwargs['container_name'] = kwargs.get('container_name', 'nasbench_201') + kwargs['latest'] = kwargs.get('container_tag', '0.0.6') + super(ImageNetNasBench201MOBenchmark, self).__init__(**kwargs) + + __all__ = ["Cifar10ValidNasBench201Benchmark", "Cifar100NasBench201Benchmark", "ImageNetNasBench201Benchmark", "Cifar10ValidNasBench201BenchmarkOriginal", "Cifar100NasBench201BenchmarkOriginal", - "ImageNetNasBench201BenchmarkOriginal"] + "ImageNetNasBench201BenchmarkOriginal", + "Cifar10ValidNasBench201MOBenchmark", + "Cifar100NasBench201MOBenchmark", + "ImageNetNasBench201MOBenchmark"] diff --git a/tests/test_nasbench_201.py b/tests/test_nasbench_201.py index 22c24b34..70e46de9 100644 --- a/tests/test_nasbench_201.py +++ b/tests/test_nasbench_201.py @@ -1,11 +1,11 @@ import logging logging.basicConfig(level=logging.DEBUG) - import pytest -from hpobench.benchmarks.nas.nasbench_201 import ImageNetNasBench201Benchmark, Cifar100NasBench201Benchmark, \ +from hpobench.container.benchmarks.nas.nasbench_201 import ImageNetNasBench201Benchmark, Cifar100NasBench201Benchmark, \ Cifar10ValidNasBench201Benchmark - +from hpobench.benchmarks.nas.nasbench_201 import \ + Cifar10ValidNasBench201MOBenchmark as LocalCifar10ValidNasBench201MOBenchmark from hpobench.util.container_utils import disable_container_debug, enable_container_debug skip_message = 'We currently skip this test because it takes too much time.' 
@@ -23,67 +23,87 @@ def test_nasbench201_cifar10valid(enable_debug): b = Cifar10ValidNasBench201Benchmark(rng=0) - cs = b.get_configuration_space(seed=0) - config = cs.sample_configuration() - fidelity = {'epoch': 199} - - result = b.objective_function(configuration=config, fidelity=fidelity, data_seed=(777, 888, 999)) - - assert result['function_value'] == pytest.approx(0.411, abs=0.1) - assert result['cost'] == pytest.approx(6650.88, abs=0.1) - assert result['info']['train_precision'] == result['function_value'] - assert result['info']['train_cost'] == result['cost'] - - result = b.objective_function_test(configuration=config, fidelity=fidelity, data_seed=(777, 888, 999)) - - with pytest.raises(AssertionError): + cs_1 = b.get_configuration_space(seed=0) + config_1 = cs_1.sample_configuration() + cs_2 = b.get_configuration_space(seed=0) + config_2 = cs_2.sample_configuration() + assert config_1 == config_2 + + config = { + '1<-0': 'nor_conv_1x1', + '2<-0': 'nor_conv_3x3', + '2<-1': 'nor_conv_3x3', + '3<-0': 'nor_conv_1x1', + '3<-1': 'nor_conv_1x1', + '3<-2': 'nor_conv_3x3' + } + result = b.objective_function(configuration=config, fidelity={'epoch': 199}, data_seed=(777, 888, 999)) + assert result['function_value'] == pytest.approx(9.78, abs=0.1) + assert result['cost'] == pytest.approx(11973.20, abs=0.1) + assert result['info']['valid_precision'] == result['function_value'] + assert result['info']['valid_cost'] == result['cost'] + + result = b.objective_function_test(configuration=config, fidelity={'epoch': 200}) + assert result['function_value'] == pytest.approx(9.70, abs=0.1) + assert result['cost'] == pytest.approx(10426.33, abs=0.2) + assert result['info']['test_precision'] == result['function_value'] + assert result['info']['test_cost'] == result['cost'] + + with pytest.raises(ValueError): result = b.objective_function_test(configuration=config, fidelity={'epoch': 10}) + @pytest.mark.skip(reason=skip_message) def test_nasbench201_cifar100(enable_debug): b = Cifar100NasBench201Benchmark(rng=0) - cs = b.get_configuration_space(seed=0) - config = cs.sample_configuration() + config = {'1<-0': 'nor_conv_1x1', + '2<-0': 'nor_conv_3x3', + '2<-1': 'nor_conv_3x3', + '3<-0': 'nor_conv_1x1', + '3<-1': 'nor_conv_1x1', + '3<-2': 'nor_conv_3x3'} fidelity = {'epoch': 199} result = b.objective_function(configuration=config, fidelity=fidelity, data_seed=(777, 888, 999)) - assert result is not None - assert result['function_value'] == pytest.approx(7.8259, abs=0.1) - assert result['cost'] == pytest.approx(13301.76, abs=0.1) - assert result['info']['train_precision'] == result['function_value'] - assert result['info']['train_cost'] == result['cost'] + assert result['function_value'] == pytest.approx(29.5233, abs=0.1) + assert result['cost'] == pytest.approx(19681.70, abs=0.1) + assert result['info']['valid_precision'] == result['function_value'] + assert result['info']['valid_cost'] == result['cost'] @pytest.mark.skip(reason=skip_message) def test_nasbench201_Image(enable_debug): b = ImageNetNasBench201Benchmark(rng=0) - - cs = b.get_configuration_space(seed=0) - config = cs.sample_configuration() + config = {'1<-0': 'nor_conv_1x1', + '2<-0': 'nor_conv_3x3', + '2<-1': 'nor_conv_3x3', + '3<-0': 'nor_conv_1x1', + '3<-1': 'nor_conv_1x1', + '3<-2': 'nor_conv_3x3'} fidelity = {'epoch': 199} result = b.objective_function(configuration=config, fidelity=fidelity, data_seed=(777, 888, 999)) - assert result is not None - assert result['function_value'] == pytest.approx(62.858, abs=0.1) - assert result['cost'] == 
pytest.approx(40357.56, abs=0.1) - assert result['info']['train_precision'] == result['function_value'] - assert result['info']['train_cost'] == result['cost'] + assert result['function_value'] == pytest.approx(55.2167, abs=0.1) + assert result['cost'] == pytest.approx(57119.22, abs=0.1) + assert result['info']['valid_precision'] == result['function_value'] + assert result['info']['valid_cost'] == result['cost'] def test_nasbench201_fidelity_space(): - fs = Cifar10ValidNasBench201Benchmark.get_fidelity_space() + fs = LocalCifar10ValidNasBench201MOBenchmark.get_fidelity_space() assert len(fs.get_hyperparameters()) == 1 def test_nasbench201_config(): - cs = Cifar10ValidNasBench201Benchmark.get_configuration_space(seed=0) + + cs = LocalCifar10ValidNasBench201MOBenchmark.get_configuration_space(seed=0) c = cs.sample_configuration() - func = Cifar10ValidNasBench201Benchmark.config_to_structure_func(4) - struct = func(c) + func = LocalCifar10ValidNasBench201MOBenchmark.config_to_structure_func(4) + struct = func(c) assert struct.__repr__() == '_Structure(4 nodes with |nor_conv_1x1~0|+|nor_conv_3x3~0|nor_conv_3x3~1|+' \ '|nor_conv_1x1~0|nor_conv_1x1~1|nor_conv_3x3~2|)' assert len(struct) == 4 From 3f08eb2dec42ed44917f21f106748718a4ee70a2 Mon Sep 17 00:00:00 2001 From: ayushi-3536 <77584036+ayushi-3536@users.noreply.github.com> Date: Tue, 31 May 2022 12:42:31 +0200 Subject: [PATCH 135/147] Benchmark: Fair Adult from MO-ASHA (#148) We add the benchmark from the MO-ASHA paper by Schmucker et al. It is a MO benchmark, training an MLP on the Adult data set. --- extra_requirements/multi_objective.json | 3 + hpobench/benchmarks/mo/__init__.py | 0 hpobench/benchmarks/mo/adult_benchmark.py | 445 ++++++++++++++++++ hpobench/container/benchmarks/mo/__init__.py | 0 .../benchmarks/mo/adult_benchmark.py | 12 + .../recipes/mo/Singularity.AdultBenchmark | 25 + hpobench/dependencies/mo/__init__.py | 0 hpobench/dependencies/mo/fairness_metrics.py | 110 +++++ hpobench/dependencies/mo/scalar.py | 36 ++ hpobench/util/data_manager.py | 159 +++++++ tests/test_adult.py | 37 ++ 11 files changed, 827 insertions(+) create mode 100644 extra_requirements/multi_objective.json create mode 100644 hpobench/benchmarks/mo/__init__.py create mode 100644 hpobench/benchmarks/mo/adult_benchmark.py create mode 100644 hpobench/container/benchmarks/mo/__init__.py create mode 100644 hpobench/container/benchmarks/mo/adult_benchmark.py create mode 100644 hpobench/container/recipes/mo/Singularity.AdultBenchmark create mode 100644 hpobench/dependencies/mo/__init__.py create mode 100644 hpobench/dependencies/mo/fairness_metrics.py create mode 100644 hpobench/dependencies/mo/scalar.py create mode 100644 tests/test_adult.py diff --git a/extra_requirements/multi_objective.json b/extra_requirements/multi_objective.json new file mode 100644 index 00000000..146c06a7 --- /dev/null +++ b/extra_requirements/multi_objective.json @@ -0,0 +1,3 @@ +{ + "mo_adult": ["pandas==1.2.4","scikit-learn==0.24.2","tqdm>=3.1.4"] +} \ No newline at end of file diff --git a/hpobench/benchmarks/mo/__init__.py b/hpobench/benchmarks/mo/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/hpobench/benchmarks/mo/adult_benchmark.py b/hpobench/benchmarks/mo/adult_benchmark.py new file mode 100644 index 00000000..a12e8a70 --- /dev/null +++ b/hpobench/benchmarks/mo/adult_benchmark.py @@ -0,0 +1,445 @@ +""" +Changelog: +========== + +0.0.1: +* First implementation of the Multi-Objective Fair Adult Benchmark. 
+""" +import logging +import time +from typing import Union, Dict, List, Any, Tuple + +import ConfigSpace as CS +import numpy as np +from ConfigSpace.conditions import GreaterThanCondition +from sklearn.metrics import accuracy_score +from sklearn.neural_network import MLPClassifier + +import hpobench.util.rng_helper as rng_helper +from hpobench.abstract_benchmark import AbstractMultiObjectiveBenchmark +from hpobench.dependencies.mo.fairness_metrics import fairness_risk, STATISTICAL_DISPARITY, UNEQUALIZED_ODDS, \ + UNEQUAL_OPPORTUNITY +from hpobench.dependencies.mo.scalar import get_fitted_scaler +from hpobench.util.data_manager import AdultDataManager + +__version__ = '0.0.1' + +logger = logging.getLogger('ADULT_FAIR') + + +class AdultBenchmark(AbstractMultiObjectiveBenchmark): + def __init__(self, + rng: Union[np.random.RandomState, int, None] = None, **kwargs): + """ + Multi-objective fairness HPO task. Optimize the HP of a NN on the adult data set. + + Parameters + ---------- + rng : np.random.RandomState, int, None + Random seed for the benchmark's random state. + """ + super(AdultBenchmark, self).__init__(rng=rng, **kwargs) + + data_manager = AdultDataManager() + self.X_train, self.y_train, self.X_valid, self.y_valid, self.X_test, self.y_test = data_manager.load() + self.output_class = np.unique(self.y_train) + self.feature_names = data_manager.feature_names + self.sensitive_feature = data_manager.sensitive_names + + @staticmethod + def get_configuration_space(seed: Union[int, None] = None) -> CS.ConfigurationSpace: + """ + Creates a ConfigSpace.ConfigurationSpace containing all parameters for the MLP. + + Parameters + ---------- + seed : int, None + Fixing the seed for the ConfigSpace.ConfigurationSpace + + Returns + ------- + ConfigSpace.ConfigurationSpace + """ + cs = CS.ConfigurationSpace(seed=seed) + + cs.add_hyperparameters([ + CS.UniformIntegerHyperparameter('n_fc_layers', default_value=3, lower=1, upper=4, log=False), + CS.UniformIntegerHyperparameter('fc_layer_0', default_value=16, lower=2, upper=32, log=True), + CS.UniformIntegerHyperparameter('fc_layer_1', default_value=16, lower=2, upper=32, log=True), + CS.UniformIntegerHyperparameter('fc_layer_2', default_value=16, lower=2, upper=32, log=True), + CS.UniformIntegerHyperparameter('fc_layer_3', default_value=16, lower=2, upper=32, log=True), + CS.UniformFloatHyperparameter('alpha', lower=10**-5, upper=10**-1, default_value=10**-2, log=True), + CS.UniformFloatHyperparameter('learning_rate_init', lower=10**-5, upper=1, default_value=10**-3, log=True), + CS.UniformFloatHyperparameter('beta_1', lower=10**-3, upper=0.99, default_value=10**-3, log=True), + CS.UniformFloatHyperparameter('beta_2', lower=10**-3, upper=0.99, default_value=10**-3, log=True), + CS.UniformFloatHyperparameter('tol', lower=10**-5, upper=10**-2, default_value=10**-3, log=True), + ]) + + cs.add_conditions([ + # Add the fc_layer_1 (2nd layer) if we allow more than 1 `n_fc_layers`, and so on... + GreaterThanCondition(cs.get_hyperparameter('fc_layer_1'), cs.get_hyperparameter('n_fc_layers'), 1), + GreaterThanCondition(cs.get_hyperparameter('fc_layer_2'), cs.get_hyperparameter('n_fc_layers'), 2), + GreaterThanCondition(cs.get_hyperparameter('fc_layer_3'), cs.get_hyperparameter('n_fc_layers'), 3), + ]) + return cs + + @staticmethod + def get_fidelity_space(seed: Union[int, None] = None) -> CS.ConfigurationSpace: + """ + Creates a ConfigSpace.ConfigurationSpace containing all fidelity parameters. 
+ + Fidelities + ---------- + budget: int - Values: [1, 200] + Number of epochs an architecture was trained. + Note: the number of epoch is 1 indexed! (Results after the first epoch: epoch = 1) + + Parameters + ---------- + seed : int, None + Fixing the seed for the ConfigSpace.ConfigurationSpace + + Returns + ------- + ConfigSpace.ConfigurationSpace + """ + fidelity_space = CS.ConfigurationSpace(seed=seed) + fidelity_space.add_hyperparameter( + CS.UniformIntegerHyperparameter( + 'budget', lower=1, upper=200, default_value=200, log=False + ) + ) + return fidelity_space + + @staticmethod + def get_meta_information() -> Dict: + """ Returns the meta information for the benchmark """ + return { + 'name': 'Multi-objective Asynchronous Successive Halving', + 'references': + ['@article{schmucker2021multi,' + 'title={Multi-objective Asynchronous Successive Halving},' + 'author={Schmucker, Robin and Donini, Michele and Zafar, Muhammad Bilal and Salinas,' + ' David and Archambeau, C{\'e}dric},' + 'journal={arXiv preprint arXiv:2106.12639},' + 'year={2021}']} + + @staticmethod + def get_objective_names() -> List[str]: + """Get a list of objectives evaluated in the objective_function. """ + return ['accuracy', 'DSP', 'DEO', 'DFP'] + + @AbstractMultiObjectiveBenchmark.check_parameters + def objective_function(self, configuration: Union[CS.Configuration, Dict], + fidelity: Union[Dict, CS.Configuration, None] = None, + rng: Union[np.random.RandomState, int, None] = None, + shuffle: bool = False, + **kwargs) -> Dict: + """ + Objective function for the multi-objective adult benchmark. + + We train a NN and evaluate its performance using fairness metrics. + This function returns the performance on the validation set. + However, we report also train and test performance. + + Parameters + ---------- + configuration: Dict, CS.Configuration + Configuration for the MLP model. + fidelity: Dict, None + budget: int - Values: [1, 200] + Number of epochs an architecture was trained. + Note: the number of epoch is 1 indexed! (Results after the first epoch: epoch = 1) + Fidelity parameters, check get_fidelity_space(). Uses default (max) value if None. + rng : np.random.RandomState, int, None + Random seed to use in the benchmark. + To prevent overfitting on a single seed, it is possible to pass a + parameter ``rng`` as 'int' or 'np.random.RandomState' to this function. + If this parameter is not given, the default random state is used. + shuffle: bool, None + If ``True``, shuffle the training idx. If no parameter ``rng`` is given, use the class random state. + Defaults to ``False``. + kwargs + + Returns + ------- + Dict - + function_value : Dict - validation metrics after training on train + accuracy: float + DSO: float + DEO: float + DFP: float + cost : time to train the network + info : Dict + train_accuracy : float + valid_accuracy : float + test_accuracy : float + training_cost : float - time to train the network. 
see `training_cost` + total_cost : float - elapsed time for the entire obj_func call, + eval_train_cost : float - time to compute metrics on training split + eval_valid_cost : float - time to compute metrics on validation split + eval_test_cost : float - time to compute metrics on test split + train_DSO : float + train_DEO : float + train_DFP : float + valid_DSO : float + valid_DEO : float + valid_DFP : float + test_DSO : float + test_DEO : float + test_DFP : float + fidelity : int + """ + self.rng = rng_helper.get_rng(rng=rng, self_rng=self.rng) + if shuffle: + self._shuffle_data(rng=self.rng, shuffle_valid=False) + + ts_start = time.time() + + budget = fidelity['budget'] + logger.debug(f"budget for evaluation of config:{budget}") + logger.debug(f"config for evaluation:{configuration}") + + sensitive_rows_train = self.X_train[:, self.feature_names.index(self.sensitive_feature)] + sensitive_rows_val = self.X_valid[:, self.feature_names.index(self.sensitive_feature)] + sensitive_rows_test = self.X_test[:, self.feature_names.index(self.sensitive_feature)] + + X_train, X_valid, X_test = self.X_train.copy(), self.X_valid.copy(), self.X_test.copy() + + # Normalize data + scaler = get_fitted_scaler(X_train, "Standard") + if scaler is not None: + X_train = scaler(X_train) + X_valid = scaler(X_valid) + X_test = scaler(X_test) + + # Create model. The parameters fc_layer_1-3 might not be included in the search space. + hidden = [configuration['fc_layer_0'], + configuration.get('fc_layer_1', None), + configuration.get('fc_layer_2', None), + configuration.get('fc_layer_3', None)][:configuration['n_fc_layers']] + + for item in ['fc_layer_0', 'fc_layer_1', 'fc_layer_2', 'fc_layer_3', 'n_fc_layers']: + if item in configuration: + configuration.pop(item) + + # We deviate here from the original implementation. They have called `budget`-times mlp.partial_fit(). + # We call `.fit()` due to efficiency aspects. 
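+        # For reference, a rough sketch of that per-epoch variant (illustrative only, not part
+        # of this benchmark; `partial_fit` requires the class labels on its first call):
+        #
+        #     mlp = MLPClassifier(**configuration, hidden_layer_sizes=hidden,
+        #                         shuffle=shuffle, random_state=self.rng)
+        #     for _ in range(budget):
+        #         mlp.partial_fit(X_train, self.y_train, classes=self.output_class)
+        #
+        # Fitting once with `max_iter=budget` (below) approximates this loop at lower cost.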
+ mlp = MLPClassifier(**configuration, hidden_layer_sizes=hidden, shuffle=shuffle, + random_state=self.rng, max_iter=budget) + + mlp.fit(X_train, self.y_train) + training_cost = time.time() - ts_start + + train_accuracy, train_statistical_disparity, train_unequal_opportunity, train_unequalized_odds, \ + eval_train_runtime = \ + AdultBenchmark._compute_metrics_on_split(X_train, self.y_train, sensitive_rows_train, mlp) + + val_accuracy, val_statistical_disparity, val_unequal_opportunity, val_unequalized_odds, eval_valid_runtime = \ + AdultBenchmark._compute_metrics_on_split(X_valid, self.y_valid, sensitive_rows_val, mlp) + + test_accuracy, test_statistical_disparity, test_unequal_opportunity, test_unequalized_odds, eval_test_runtime =\ + AdultBenchmark._compute_metrics_on_split(X_test, self.y_test, sensitive_rows_test, mlp) + + logger.debug(f"config: {configuration}, val_acc: {val_accuracy}, test_score: {test_accuracy}, " + f"train score: {train_accuracy}, dsp: {val_statistical_disparity}, " + f"deo :{val_unequal_opportunity}, dfp :{val_unequalized_odds}") + + elapsed_time = time.time() - ts_start + + return {'function_value': {'accuracy': float(val_accuracy), + 'DSO': float(val_statistical_disparity), + 'DEO': float(val_unequal_opportunity), + 'DFP': float(val_unequalized_odds) + }, + 'cost': training_cost, + 'info': {'train_accuracy': float(train_accuracy), + 'valid_accuracy': float(val_accuracy), + 'test_accuracy': float(test_accuracy), + 'training_cost': training_cost, + 'total_cost': elapsed_time, + 'eval_train_cost': eval_train_runtime, + 'eval_valid_cost': eval_valid_runtime, + 'eval_test_cost': eval_test_runtime, + 'train_DSO': float(train_statistical_disparity), + 'train_DEO': float(train_unequal_opportunity), + 'train_DFP': float(train_unequalized_odds), + 'valid_DSO': float(val_statistical_disparity), + 'valid_DEO': float(val_unequal_opportunity), + 'valid_DFP': float(val_unequalized_odds), + 'test_DSO': float(test_statistical_disparity), + 'test_DEO': float(test_unequal_opportunity), + 'test_DFP': float(test_unequalized_odds), + 'fidelity': budget + } + } + + @AbstractMultiObjectiveBenchmark.check_parameters + def objective_function_test(self, configuration: Union[CS.Configuration, Dict], + fidelity: Union[Dict, CS.Configuration, None] = None, + rng: Union[np.random.RandomState, int, None] = None, + shuffle: Union[bool, None] = False, + **kwargs) -> Dict: + """ + Objective function for the multi-objective adult benchmark. + + We train a NN and evaluate its performance using fairness metrics. + This function returns the performance on the test set. + + Parameters + ---------- + configuration: Dict, CS.Configuration + Configuration for the MLP model. + Use default configuration if None. + fidelity: Dict, CS.Configuration, None + epoch: int - Values: [1, 200] + Number of epochs an architecture was trained. + Note: the number of epoch is 1 indexed! (Results after the first epoch: epoch = 1) + Fidelity parameters, check get_fidelity_space(). Uses default (max) value if None. + rng : np.random.RandomState, int, None + Random seed to use in the benchmark. + To prevent overfitting on a single seed, it is possible to pass a + parameter ``rng`` as 'int' or 'np.random.RandomState' to this function. + If this parameter is not given, the default random state is used. + shuffle: bool, None + If ``True``, shuffle the training idx. If no parameter ``rng`` is given, use the class random state. + Defaults to ``False``. 
+ kwargs + + Returns + ------- + Dict - + function_value : Dict - test metrics reported after training on (train+valid) + accuracy: float + DSO: float + DEO: float + DFP: float + cost : float - time to train the network. see `training_cost` + info : Dict + train_accuracy : float + test_accuracy : float + training_cost : float + total_cost : float - elapsed time for the entire obj_func_test call, + eval_train_cost : float - time to compute metrics on training split + eval_test_cost : float - time to compute metrics on test split + train_DSO : float + train_DEO : float + train_DFP : float + test_DSO : float + test_DEO : float + test_DFP : float + fidelity : int + """ + self.rng = rng_helper.get_rng(rng=rng, self_rng=self.rng) + + if shuffle: + self._shuffle_data(self.rng, shuffle_valid=True) + + ts_start = time.time() + + budget = fidelity['budget'] + + X_train, X_valid, X_test = self.X_train.copy(), self.X_valid.copy(), self.X_test.copy() + X_train = np.vstack((X_train, X_valid)) + y_train = np.vstack((self.y_train[:, np.newaxis], self.y_valid[:, np.newaxis])).ravel() + + sensitive_rows_train = X_train[:, self.feature_names.index(self.sensitive_feature)] + sensitive_rows_test = X_test[:, self.feature_names.index(self.sensitive_feature)] + + # Normalize data + scaler = get_fitted_scaler(X_train, "Standard") + if scaler is not None: + X_train = scaler(X_train) + X_test = scaler(X_test) + + # Create model. The parameters fc_layer_1-3 might not be included in the search space. + hidden = [configuration['fc_layer_0'], + configuration.get('fc_layer_1', None), + configuration.get('fc_layer_2', None), + configuration.get('fc_layer_3', None)][:configuration['n_fc_layers']] + + for item in ['fc_layer_0', 'fc_layer_1', 'fc_layer_2', 'fc_layer_3', 'n_fc_layers']: + if item in configuration: + configuration.pop(item) + + # We deviate here from the original implementation. They have called `budget`-times mlp.partial_fit(). + # We call `.fit()` due to efficiency aspects. 
+ mlp = MLPClassifier(**configuration, hidden_layer_sizes=hidden, shuffle=shuffle, + random_state=rng, max_iter=budget) + mlp.fit(X_train, y_train) + training_cost = time.time() - ts_start + + train_accuracy, train_statistical_disparity, train_unequal_opportunity, train_unequalized_odds, \ + eval_train_runtime = \ + AdultBenchmark._compute_metrics_on_split(X_train, y_train, sensitive_rows_train, mlp) + + test_accuracy, test_statistical_disparity, test_unequal_opportunity, test_unequalized_odds, eval_test_runtime =\ + AdultBenchmark._compute_metrics_on_split(X_test, self.y_test, sensitive_rows_test, mlp) + + elapsed_time = time.time() - ts_start + + logger.debug(f"config:{configuration}, test_score: {test_accuracy}, train score:{train_accuracy}," + f"dsp:{test_statistical_disparity}, deo :{test_unequal_opportunity}, dfp :{test_unequalized_odds}") + + return {'function_value': {'accuracy': float(test_accuracy), + 'DSO': float(test_statistical_disparity), + 'DEO': float(test_unequal_opportunity), + 'DFP': float(test_unequalized_odds) + }, + 'cost': training_cost, + 'info': {'train_accuracy': float(train_accuracy), + 'test_accuracy': float(test_accuracy), + 'training_cost': training_cost, + 'total_cost': elapsed_time, + 'eval_train_cost': eval_train_runtime, + 'eval_test_cost': eval_test_runtime, + 'train_DSO': float(train_statistical_disparity), + 'train_DEO': float(train_unequal_opportunity), + 'train_DFP': float(train_unequalized_odds), + 'test_DSO': float(test_statistical_disparity), + 'test_DEO': float(test_unequal_opportunity), + 'test_DFP': float(test_unequalized_odds), + 'fidelity': budget + } + } + + @staticmethod + def _compute_metrics_on_split( + x_split: np.ndarray, y_split: np.ndarray, sensitive_rows: Any, mlp: Any + ) -> Tuple: + + start = time.time() + _y_pred = mlp.predict(x_split) + accuracy = accuracy_score(y_split, _y_pred) + statistical_disparity = fairness_risk(x_split, y_split, sensitive_rows, mlp, STATISTICAL_DISPARITY) + unequal_opportunity = fairness_risk(x_split, y_split, sensitive_rows, mlp, UNEQUAL_OPPORTUNITY) + unequalized_odds = fairness_risk(x_split, y_split, sensitive_rows, mlp, UNEQUALIZED_ODDS) + runtime = time.time() - start + return accuracy, statistical_disparity, unequal_opportunity, unequalized_odds, runtime + + def _shuffle_data(self, rng=None, shuffle_valid=False) -> None: + """ + Reshuffle the training data. + + Parameters + ---------- + rng + If 'rng' is None, the training idx are shuffled according to the class-random-state + shuffle_valid: bool, None + If true, shuffle the validation data. Defaults to False. 
+ """ + random_state = rng_helper.get_rng(rng, self.rng) + + train_idx = np.arange(len(self.X_train)) + random_state.shuffle(train_idx) + self.X_train = self.X_train[train_idx] + self.y_train = self.y_train[train_idx] + + if shuffle_valid: + valid_idx = np.arange(len(self.X_valid)) + random_state.shuffle(valid_idx) + self.X_valid = self.X_valid[valid_idx] + self.y_valid = self.y_valid[valid_idx] + + +__all__ = ['AdultBenchmark'] diff --git a/hpobench/container/benchmarks/mo/__init__.py b/hpobench/container/benchmarks/mo/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/hpobench/container/benchmarks/mo/adult_benchmark.py b/hpobench/container/benchmarks/mo/adult_benchmark.py new file mode 100644 index 00000000..dbdcaf4d --- /dev/null +++ b/hpobench/container/benchmarks/mo/adult_benchmark.py @@ -0,0 +1,12 @@ +""" Benchmark for the Multi-Objective Adult Benchmark from hpobench/benchmarks/mo/adult_benchmark.py +""" + +from hpobench.container.client_abstract_benchmark import AbstractBenchmarkClient + + +class AdultBenchmark(AbstractBenchmarkClient): + def __init__(self, **kwargs): + kwargs['benchmark_name'] = kwargs.get('benchmark_name', 'AdultBenchmark') + kwargs['container_name'] = kwargs.get('container_name', 'fair_adult') + kwargs['latest'] = kwargs.get('container_tag', '0.0.1') + super(AdultBenchmark, self).__init__(**kwargs) diff --git a/hpobench/container/recipes/mo/Singularity.AdultBenchmark b/hpobench/container/recipes/mo/Singularity.AdultBenchmark new file mode 100644 index 00000000..d373caa2 --- /dev/null +++ b/hpobench/container/recipes/mo/Singularity.AdultBenchmark @@ -0,0 +1,25 @@ +Bootstrap: docker +From: python:3.7-slim + +%labels +MAINTAINER sharmaa@informatik.uni-freiburg.de +VERSION v0.0.1 + +%post + apt update -y + apt install build-essential git wget -y + + cd /home \ + && git clone https://github.com/automl/HPOBench.git \ + && cd HPOBench \ + && git checkout master \ + && pip install .[mo_adult] \ + && cd / \ + && mkdir /var/lib/hpobench/ \ + && chmod -R 777 /var/lib/hpobench/ \ + && rm -rf /var/lib/apt/lists/* \ + && pip cache purge + + +%runscript + python -s /home/HPOBench/hpobench/container/server_abstract_benchmark.py mo.adult_benchmark $@ \ No newline at end of file diff --git a/hpobench/dependencies/mo/__init__.py b/hpobench/dependencies/mo/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/hpobench/dependencies/mo/fairness_metrics.py b/hpobench/dependencies/mo/fairness_metrics.py new file mode 100644 index 00000000..7776fbd9 --- /dev/null +++ b/hpobench/dependencies/mo/fairness_metrics.py @@ -0,0 +1,110 @@ +""" +This file contains functionality to compute various fairness related risk scores. 
+""" + +import numpy as np + +STATISTICAL_DISPARITY = 'statistical_disparity' # P(1 | group A) - P(1 | group B) +UNEQUAL_OPPORTUNITY = 'unequal_opportunity' # P(1 | group A, 0) - P(1 | group B, 0) +UNEQUALIZED_ODDS = 'unequalized_odds' # P(1 | group A, 1) - P(1 | group B, 1) + +TPR0 = 'tpr0' +TPR1 = 'tpr1' +TPR_DIF = 'tpr_dif' +TPR_MIN = 'tpr_min' + +FAIRNESS_METRICS = [STATISTICAL_DISPARITY, UNEQUAL_OPPORTUNITY, UNEQUALIZED_ODDS, TPR0, TPR1, TPR_DIF, TPR_MIN] + +PRED_THRESHOLD = 0.5 + + +def fairness_risk(x, y, sensitive_rows, model, unfairness_metric): + """ + Returns the fairness_risk based on the definition of the unfairness_metric, currently supporting: + statistical_disparity: P(positive prediction | group A) = P(positive prediction | group B) + + Parameters + ---------- + x: np.ndarray + inputs + y: np.ndarray + labels in {0, 1} such that 0 is a "positive" label, 1 "negative" + sensitive_rows: np.ndarray + binary array indicating which rows correspond to the protected group + model: + trained sklearn model + unfairness_metric: str + string with unfairness condition + + Returns + ------- + float + """ + predicted_probs = model.predict_proba(x) + if unfairness_metric == STATISTICAL_DISPARITY: + prob_not_recidive_a = np.mean( + predicted_probs[sensitive_rows == 1][:, 0]) + prob_not_recidive_b = np.mean( + predicted_probs[sensitive_rows == 0][:, 0]) + return abs(prob_not_recidive_a - prob_not_recidive_b) + elif unfairness_metric == UNEQUAL_OPPORTUNITY: + prob_not_recidive_a = np.mean( + predicted_probs[sensitive_rows == 1 & (y == 0)][:, 0]) + prob_not_recidive_b = np.mean( + predicted_probs[sensitive_rows == 0 & (y == 0)][:, 0]) + return abs(prob_not_recidive_a - prob_not_recidive_b) + elif unfairness_metric == UNEQUALIZED_ODDS: + prob_not_recidive_a = np.mean( + predicted_probs[sensitive_rows == 1 & (y == 1)][:, 0]) + prob_not_recidive_b = np.mean( + predicted_probs[sensitive_rows == 0 & (y == 1)][:, 0]) + return abs(prob_not_recidive_a - prob_not_recidive_b) + elif unfairness_metric == TPR0: + y_pred = np.where(predicted_probs[:, 0] >= PRED_THRESHOLD, 1, 0) + y0 = y[sensitive_rows == 0] + y0_pred = y_pred[sensitive_rows == 0] + tp0 = np.sum((y0_pred == 1) & (y0 == 1)) + fn0 = np.sum((y0_pred == 0) & (y0 == 1)) + tpr0 = tp0 / (tp0 + fn0) + return tpr0 + elif unfairness_metric == TPR1: + y_pred = np.where(predicted_probs[:, 0] >= PRED_THRESHOLD, 1, 0) + y1 = y[sensitive_rows == 1] + y1_pred = y_pred[sensitive_rows == 1] + tp1 = np.sum((y1_pred == 1) & (y1 == 1)) + fn1 = np.sum((y1_pred == 0) & (y1 == 1)) + tpr1 = tp1 / (tp1 + fn1) + return tpr1 + elif unfairness_metric == TPR_DIF: + y_pred = np.where(predicted_probs[:, 0] >= PRED_THRESHOLD, 1, 0) + y0 = y[sensitive_rows == 0] + y0_pred = y_pred[sensitive_rows == 0] + tp0 = np.sum((y0_pred == 1) & (y0 == 1)) + fn0 = np.sum((y0_pred == 0) & (y0 == 1)) + tpr0 = tp0 / (tp0 + fn0) + + y1 = y[sensitive_rows == 1] + y1_pred = y_pred[sensitive_rows == 1] + tp1 = np.sum((y1_pred == 1) & (y1 == 1)) + fn1 = np.sum((y1_pred == 0) & (y1 == 1)) + tpr1 = tp1 / (tp1 + fn1) + return abs(tpr0 - tpr1) + elif unfairness_metric == TPR_MIN: + y_pred = np.where(predicted_probs[:, 0] >= PRED_THRESHOLD, 1, 0) + y0 = y[sensitive_rows == 0] + y0_pred = y_pred[sensitive_rows == 0] + tp0 = np.sum((y0_pred == 1) & (y0 == 1)) + fn0 = np.sum((y0_pred == 0) & (y0 == 1)) + tpr0 = tp0 / (tp0 + fn0) + + y1 = y[sensitive_rows == 1] + y1_pred = y_pred[sensitive_rows == 1] + tp1 = np.sum((y1_pred == 1) & (y1 == 1)) + fn1 = np.sum((y1_pred == 0) & (y1 == 1)) + tpr1 = tp1 / 
(tp1 + fn1) + return min(tpr0, tpr1) + else: + raise ValueError( + f'{unfairness_metric} is not a valid unfairness condition. ' + f'Please specify one among ({STATISTICAL_DISPARITY}, {UNEQUAL_OPPORTUNITY}, {UNEQUALIZED_ODDS})' + ) diff --git a/hpobench/dependencies/mo/scalar.py b/hpobench/dependencies/mo/scalar.py new file mode 100644 index 00000000..3f434fde --- /dev/null +++ b/hpobench/dependencies/mo/scalar.py @@ -0,0 +1,36 @@ +import numpy as np +from typing import Union + +try: + from sklearn.preprocessing import MinMaxScaler, StandardScaler +except ImportError: + print("scikit-learn not installed") + + +def get_fitted_scaler(x_train: np.ndarray, name: Union[None, str] = None): + """ + Instantiates a scaler by a given name and fits the scaler with x_train. + Parameters + ---------- + x_train: np.ndarray + Train data + + name: str, None + Name of the scaling method. Defaults to no scaling. + + Returns + ------- + + """ + + if name == "MinMax": + scaler = MinMaxScaler(feature_range=(0, 1), copy=True) + elif name == "Standard": + scaler = StandardScaler(copy=True) + elif name is None or name == "None": + return None + else: + raise NotImplementedError() + + scaler.fit(x_train) + return lambda x: scaler.transform(x) diff --git a/hpobench/util/data_manager.py b/hpobench/util/data_manager.py index a2e33121..00d9568d 100644 --- a/hpobench/util/data_manager.py +++ b/hpobench/util/data_manager.py @@ -926,6 +926,165 @@ def _load(self) -> Tuple[np.ndarray, np.ndarray, np.ndarray, np.ndarray, np.ndar return X_trn, y_trn, X_val, y_val, X_tst, y_tst +class AdultDataManager(HoldoutDataManager): + + def __init__(self): + super(AdultDataManager, self).__init__() + self.logger.debug('AdultDataManager: Starting to load data') + self.urls = {"data": "http://archive.ics.uci.edu/ml/machine-learning-databases/adult/adult.data", + "test_data": "https://archive.ics.uci.edu/ml/machine-learning-databases/adult/adult.test"} + + self.feature_names = ['age', 'fnlwgt', 'education-num', 'marital-status', 'relationship', 'race', + 'sex', 'capital-gain', 'capital-loss', 'hours-per-week', 'country', + 'employment_type'] + self.sensitive_names = 'sex' + + self._save_dir = hpobench.config_file.data_dir / "adult" + + self._data_extract_path = self._save_dir / "processed_data" + + self.create_save_directory(self._data_extract_path) + + def load(self): + """ + Loads Adult Fair Datasets from data directory as defined in hpobenchrc.data_directory. + Downloads data if necessary. + + Returns + ------- + X_train: np.ndarray + y_train: np.ndarray + X_val: np.ndarray + y_val: np.ndarray + X_test: np.ndarray + y_test: np.ndarray + """ + + t = time() + self._download() + X_trn, y_trn, X_val, y_val, X_tst, y_tst = self._load() + self.logger.info(f'AdultDataManager: Data successfully loaded after {time() - t:.2f}') + + return X_trn, y_trn, X_val, y_val, X_tst, y_tst + + def _download(self): + + if not (self._save_dir / "adult.data").exists(): + self._download_file_with_progressbar(self.urls["data"], self._save_dir / "adult.data") + + if not (self._save_dir / "adult.test").exists(): + self._download_file_with_progressbar(self.urls["test_data"], self._save_dir / "adult.test") + + def _load(self) -> Tuple[np.ndarray, np.ndarray, np.ndarray, np.ndarray, np.ndarray, np.ndarray]: + """ + Load the data from file and split it into train, test and validation split. 
+ + Returns + ------- + X_train: np.ndarray + y_train: np.ndarray + X_val: np.ndarray + y_val: np.ndarray + X_test: np.ndarray + y_test: np.ndarray + """ + processed_files = ['x_train', 'x_valid', 'x_test', 'y_train', 'y_valid', 'y_test'] + file_is_missing = not all([(self._data_extract_path / f'{file}.npy').exists() for file in processed_files]) + + if file_is_missing: + columns = ["age", "workclass", "fnlwgt", "education", "education-num", "marital-status", + "occupation", "relationship", "race", "sex", "capital-gain", "capital-loss", + "hours-per-week", "country", "salary"] + train_data = pd.read_csv(self._save_dir / 'adult.data', names=columns, sep=',', na_values='?') + test_data = pd.read_csv(self._save_dir / 'adult.test', names=columns, sep=',', skiprows=1, na_values='?') + + X, y = self._process_adult_data(train_data) + X_test, y_test = self._process_adult_data(test_data) + + n_trn = int(X.shape[0] * 0.7) + # Creation of Train and Test dataset + X_train, y_train = X[:n_trn], y[:n_trn] + X_valid, y_valid = X[n_trn:], y[n_trn:] + + np.save(self._data_extract_path / 'x_train.npy', X_train) + np.save(self._data_extract_path / 'x_valid.npy', X_valid) + np.save(self._data_extract_path / 'x_test.npy', X_test) + + np.save(self._data_extract_path / 'y_train.npy', y_train) + np.save(self._data_extract_path / 'y_valid.npy', y_valid) + np.save(self._data_extract_path / 'y_test.npy', y_test) + + else: + X_train = np.load(self._data_extract_path / 'x_train.npy') + X_valid = np.load(self._data_extract_path / 'x_valid.npy') + X_test = np.load(self._data_extract_path / 'x_test.npy') + + y_train = np.load(self._data_extract_path / 'y_train.npy') + y_valid = np.load(self._data_extract_path / 'y_valid.npy') + y_test = np.load(self._data_extract_path / 'y_test.npy') + + return X_train, y_train, X_valid, y_valid, X_test, y_test + + def _process_adult_data(self, df) -> Tuple[np.ndarray, np.ndarray]: + # mapping all categories of marital status to Single(1) or Couple(0) + df['marital-status'] = df['marital-status'].replace( + [' Divorced', ' Married-spouse-absent', ' Never-married', ' Separated', ' Widowed'], 'Single') + df['marital-status'] = df['marital-status'].replace([' Married-AF-spouse', ' Married-civ-spouse'], 'Couple') + df['marital-status'] = df['marital-status'].map({'Couple': 0, 'Single': 1}) + + # mapping race + race_map = {' White': 0, ' Amer-Indian-Eskimo': 1, ' Asian-Pac-Islander': 2, ' Black': 3, ' Other': 4} + df['race'] = df['race'].map(race_map) + + # categorizing all work classes into 4 major categories + def get_workclass(x): + if x['workclass'] == ' Federal-gov' or x['workclass'] == ' Local-gov' or x['workclass'] == ' State-gov': + return 'govt' + elif x['workclass'] == ' Private': + return 'private' + elif x['workclass'] == ' Self-emp-inc' or x['workclass'] == ' Self-emp-not-inc': + return 'self_employed' + else: + return 'without_pay' + + df['employment_type'] = df.apply(get_workclass, axis=1) + employment_map = {'govt': 0, 'private': 1, 'self_employed': 2, 'without_pay': 3} + df['employment_type'] = df['employment_type'].map(employment_map) + + # mapping relationship map + rel_map = {' Unmarried': 0, ' Wife': 1, ' Husband': 2, ' Not-in-family': 3, ' Own-child': 4, + ' Other-relative': 5} + df['relationship'] = df['relationship'].map(rel_map) + + # maping capital gain/loss to binary values + df.loc[(df['capital-gain'] > 0), 'capital-gain'] = 1 + df.loc[(df['capital-gain'] == 0, 'capital-gain')] = 0 + df.loc[(df['capital-loss'] > 0), 'capital-loss'] = 1 + df.loc[(df['capital-loss'] 
== 0, 'capital-loss')] = 0 + + # defining salary map + salary_map = {' <=50K': 1, ' >50K': 0, ' <=50K.': 1, ' >50K.': 0, } + df['salary'] = df['salary'].map(salary_map).astype(int) + + df['sex'] = df['sex'].map({' Male': 1, ' Female': 0}).astype(int) + + # replacing all missing values with np.nan + df['country'] = df['country'].replace(' ?', np.nan) + df['workclass'] = df['workclass'].replace(' ?', np.nan) + df['occupation'] = df['occupation'].replace(' ?', np.nan) + + # categorizing countries into "Non-US" and "US" + df.loc[df['country'] != ' United-States', 'country'] = 'Non-US' + df.loc[df['country'] == ' United-States', 'country'] = 'US' + df['country'] = df['country'].map({'US': 1, 'Non-US': 0}).astype(int) + + df.drop(labels=['workclass', 'education', 'occupation'], axis=1, inplace=True) + X = df.drop(['salary'], axis=1) + y = df['salary'] + + return X.to_numpy(), y.to_numpy() + + class TabularDataManager(DataManager): def __init__(self, model: str, task_id: [int, str], data_dir: [str, Path, None] = None): super(TabularDataManager, self).__init__() diff --git a/tests/test_adult.py b/tests/test_adult.py new file mode 100644 index 00000000..d7a030b7 --- /dev/null +++ b/tests/test_adult.py @@ -0,0 +1,37 @@ +import logging +import pytest + +logging.basicConfig(level=logging.DEBUG) + + +def test_adult_benchmark(): + from hpobench.container.benchmarks.mo.adult_benchmark import AdultBenchmark + + # Check Seeding + benchmark = AdultBenchmark(rng=0) + cs = benchmark.get_configuration_space(seed=0) + cfg_1 = cs.sample_configuration() + + cs = benchmark.get_configuration_space(seed=0) + cfg_2 = cs.sample_configuration() + + assert cfg_1 == cfg_2 + + test_config = { + 'alpha': 0.00046568046379195655, 'beta_1': 0.14382335124614148, 'beta_2': 0.0010007892350251595, + 'fc_layer_0': 4, 'fc_layer_1': 2, 'fc_layer_2': 2, 'fc_layer_3': 3,'n_fc_layers': 4, + 'learning_rate_init': 0.0005343227125594117, + 'tol': 0.0004134759007834719 + } + + result_1 = benchmark.objective_function(test_config, rng=1, fidelity={'budget': 3}) + result_2 = benchmark.objective_function(test_config, rng=1, fidelity={'budget': 3}) + + assert result_1['info']['valid_accuracy'] == pytest.approx(0.7539, rel=0.001) + assert result_1['info']['valid_accuracy'] == result_1['function_value']['accuracy'] + assert result_1['info']['train_accuracy'] == pytest.approx(0.76145, rel=0.001) + assert result_1['info']['train_accuracy'] == result_2['info']['train_accuracy'] + + result_1 = benchmark.objective_function_test(test_config, rng=1, fidelity={'budget': 3}) + assert result_1['function_value']['accuracy'] == pytest.approx(0.76377, rel=0.001) + assert result_1['function_value']['accuracy'] == result_1['info']['test_accuracy'] From 4c4f1d93b446b96dc01c16d1448d05c6132e6440 Mon Sep 17 00:00:00 2001 From: ayushi-3536 <77584036+ayushi-3536@users.noreply.github.com> Date: Wed, 1 Jun 2022 17:04:26 +0200 Subject: [PATCH 136/147] Multi Objective CNN benchmark: Flowers and Fashion (#147) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Added mo cnn benchmarks from bag of baseline paper We deviate from the original benchmark in two points: * we return as cost only the training time instead of the total elapsed time * we return as objective for minimization instead of `-100 * accuracy` now `1 - accuracy` to achieve better output scalings. 
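A minimal sketch of what this means for consumers of the returned dict (illustrative only; it assumes the containerized `FlowerCNNBenchmark` interface exercised in `tests/test_mo_cnn.py` and a working Singularity setup):

    from hpobench.container.benchmarks.mo.cnn_benchmark import FlowerCNNBenchmark

    b = FlowerCNNBenchmark(rng=0)
    config = b.get_configuration_space(seed=0).sample_configuration()
    res = b.objective_function(config, fidelity={'budget': 3})

    # the minimization target is 1 - validation accuracy (not -100 * accuracy)
    assert abs(res['function_value']['negative_accuracy'] - (1 - res['info']['valid_accuracy'])) < 1e-6
    # 'cost' is the pure training time; the total elapsed time is still reported under 'info'
    assert res['cost'] == res['info']['training_cost']
    assert res['info']['total_time'] >= res['cost']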
Co-authored-by: ayushi-3536 Co-authored-by: Philipp Müller --- extra_requirements/mo_cnn.json | 7 + hpobench/benchmarks/mo/cnn_benchmark.py | 575 ++++++++++++++++++ .../container/benchmarks/mo/cnn_benchmark.py | 22 + hpobench/container/benchmarks/od/__init__.py | 0 .../recipes/mo/Singularity.CNNBenchmark | 26 + hpobench/util/data_manager.py | 88 ++- tests/test_mo_cnn.py | 48 ++ 7 files changed, 765 insertions(+), 1 deletion(-) create mode 100644 extra_requirements/mo_cnn.json create mode 100644 hpobench/benchmarks/mo/cnn_benchmark.py create mode 100644 hpobench/container/benchmarks/mo/cnn_benchmark.py create mode 100644 hpobench/container/benchmarks/od/__init__.py create mode 100644 hpobench/container/recipes/mo/Singularity.CNNBenchmark create mode 100644 tests/test_mo_cnn.py diff --git a/extra_requirements/mo_cnn.json b/extra_requirements/mo_cnn.json new file mode 100644 index 00000000..35914e3e --- /dev/null +++ b/extra_requirements/mo_cnn.json @@ -0,0 +1,7 @@ +{ + "mo_cnn": [ + "tqdm>=3.0.0", + "torch==1.9.0", + "pandas==1.2.4" + ] +} diff --git a/hpobench/benchmarks/mo/cnn_benchmark.py b/hpobench/benchmarks/mo/cnn_benchmark.py new file mode 100644 index 00000000..d8bfd939 --- /dev/null +++ b/hpobench/benchmarks/mo/cnn_benchmark.py @@ -0,0 +1,575 @@ +""" +Changelog: +========== + +0.0.1: +* First implementation of the Multi-Objective CNN Benchmark. +""" +import logging +import random +import time +from typing import Union, Dict, List, Tuple, Any + +import ConfigSpace as CS +import numpy as np +import torch +import torch.nn as nn +import tqdm +from ConfigSpace.conditions import GreaterThanCondition +from torch.utils.data import TensorDataset, DataLoader + +import hpobench.util.rng_helper as rng_helper +from hpobench.abstract_benchmark import AbstractMultiObjectiveBenchmark +from hpobench.util.data_manager import CNNDataManager + +__version__ = '0.0.1' + +logger = logging.getLogger('MO_CNN') + + +class AccuracyTop1: + + def __init__(self): + self.reset() + + self.sum = 0 + self.cnt = 0 + + def reset(self): + self.sum = 0 + self.cnt = 0 + + def __call__(self, y_true: torch.Tensor, y_pred: torch.Tensor) -> float: + self.sum += y_pred.topk(1)[1].eq(y_true.argmax(-1).reshape(-1, 1).expand(-1, 1)).float().sum().to('cpu').numpy() + self.cnt += y_pred.size(0) + return self.sum / self.cnt + + +class Net(nn.Module): + """ + The model to optimize + """ + + def __init__(self, config: Dict, input_shape: Tuple = (3, 28, 28), + num_classes: Union[int, None] = 10): + super(Net, self).__init__() + inp_ch = input_shape[0] + layers = [] + for i in range(config['n_conv_layers']): + out_ch = config['conv_layer_{}'.format(i)] + ks = config['kernel_size'] + layers.append(nn.Conv2d(inp_ch, out_ch, kernel_size=ks, padding=(ks - 1) // 2)) + layers.append(nn.ReLU()) + if config['batch_norm']: + layers.append(nn.BatchNorm2d(out_ch)) + layers.append(nn.MaxPool2d(kernel_size=2, stride=2)) + inp_ch = out_ch + + self.conv_layers = nn.Sequential(*layers) + self.pooling = nn.AdaptiveAvgPool2d(1) if config['global_avg_pooling'] else nn.Identity() + self.output_size = num_classes + + self.fc_layers = nn.ModuleList() + + inp_n = self._get_conv_output(input_shape) + + layers = [nn.Flatten()] + for i in range(config['n_fc_layers']): + out_n = config['fc_layer_{}'.format(i)] + + layers.append(nn.Linear(inp_n, out_n)) + layers.append(nn.ReLU()) + + inp_n = out_n + + layers.append(nn.Linear(inp_n, num_classes)) + self.fc_layers = nn.Sequential(*layers) + + # generate input sample and forward to get shape + def 
_get_conv_output(self, shape: Tuple) -> int: + bs = 1 + input = torch.autograd.Variable(torch.rand(bs, *shape)) + output_feat = self.conv_layers(input) + output_feat = self.pooling(output_feat) + n_size = output_feat.data.view(bs, -1).size(1) + return n_size + + def forward(self, x: torch.Tensor) -> torch.Tensor: + x = self.conv_layers(x) + x = self.pooling(x) + x = self.fc_layers(x) + return x + + def train_fn(self, optimizer: torch.optim.Optimizer, criterion: Any, loader: DataLoader, device: torch.device): + """ + Training method + + Parameters + ---------- + optimizer + optimization algorithm + criterion + loss function + loader + data loader for either training or testing set + device + Either CPU or GPU + Returns + ------- + accuracy on the data + """ + accuracy = AccuracyTop1() + self.train() + + acc = 0 + for images, labels in loader: + images = images.to(device) + labels = labels.to(device) + + optimizer.zero_grad() + logits = self(images) + + loss = criterion(logits, labels.argmax(-1)) + loss.backward() + optimizer.step() + + acc = accuracy(labels, logits) + + return acc + + def eval_fn(self, loader: DataLoader, device: torch.device): + """ + Evaluation method + + Parameters + ---------- + loader: + data loader for either training or testing set + device: + torch device + + Returns + ------- + accuracy on the data + """ + accuracy = AccuracyTop1() + self.eval() + + acc = 0 + with torch.no_grad(): # no gradient needed + for images, labels in loader: + images = images.to(device) + labels = labels.to(device) + + outputs = self(images) + acc = accuracy(labels, outputs) + + return acc + + +class CNNBenchmark(AbstractMultiObjectiveBenchmark): + def __init__(self, dataset: str, + rng: Union[np.random.RandomState, int, None] = None, **kwargs): + """ + Parameters + ---------- + dataset : str + One of fashion, flower. + rng : np.random.RandomState, int, None + Random seed for the benchmark's random state. + """ + + super(CNNBenchmark, self).__init__(rng=rng) + allowed_datasets = ["fashion", "flower"] + assert dataset in allowed_datasets, f'Requested data set is not supported. Must be one of ' \ + f'{", ".join(allowed_datasets)}, but was {dataset}' + logger.info(f'Start Benchmark on dataset {dataset}') + + self.dataset = dataset + self.__seed_everything() + + # Dataset loading + data_manager = CNNDataManager(dataset=self.dataset) + self.X_train, self.y_train, self.X_valid, self.y_valid, self.X_test, self.y_test = data_manager.load() + + self.output_classes = self.y_train.shape[1] + self.input_shape = self.X_train.shape[1:4] + + @staticmethod + def get_configuration_space(seed: Union[int, None] = None) -> CS.ConfigurationSpace: + """ + Creates a ConfigSpace.ConfigurationSpace containing all parameters for + the CNN model. 
+ + Parameters + ---------- + seed : int, None + Fixing the seed for the ConfigSpace.ConfigurationSpace + + Returns + ------- + ConfigSpace.ConfigurationSpace + """ + cs = CS.ConfigurationSpace(seed=seed) + cs.add_hyperparameters([ + CS.UniformIntegerHyperparameter('n_conv_layers', default_value=3, lower=1, upper=3, log=False), + CS.UniformIntegerHyperparameter('n_fc_layers', default_value=3, lower=1, upper=3, log=False), + CS.UniformIntegerHyperparameter('conv_layer_0', default_value=128, lower=16, upper=1024, log=True), + CS.UniformIntegerHyperparameter('conv_layer_1', default_value=128, lower=16, upper=1024, log=True), + CS.UniformIntegerHyperparameter('conv_layer_2', default_value=128, lower=16, upper=1024, log=True), + CS.UniformIntegerHyperparameter('fc_layer_0', default_value=32, lower=2, upper=512, log=True), + CS.UniformIntegerHyperparameter('fc_layer_1', default_value=32, lower=2, upper=512, log=True), + CS.UniformIntegerHyperparameter('fc_layer_2', default_value=32, lower=2, upper=512, log=True), + + CS.UniformIntegerHyperparameter('batch_size', lower=1, upper=512, default_value=128, log=True), + CS.UniformFloatHyperparameter('learning_rate_init', lower=10**-5, upper=1, default_value=10**-3, log=True), + CS.CategoricalHyperparameter('batch_norm', default_value=False, choices=[False, True]), + CS.CategoricalHyperparameter('global_avg_pooling', default_value=True, choices=[False, True]), + CS.CategoricalHyperparameter('kernel_size', default_value=5, choices=[7, 5, 3]) + ]) + + cs.add_conditions([ + # Add the conv_layer_1 (2nd layer) if we allow more than 1 (>1) `n_conv_layers`, and so on... + GreaterThanCondition(cs.get_hyperparameter('conv_layer_1'), cs.get_hyperparameter('n_conv_layers'), 1), + GreaterThanCondition(cs.get_hyperparameter('conv_layer_2'), cs.get_hyperparameter('n_conv_layers'), 2), + GreaterThanCondition(cs.get_hyperparameter('fc_layer_1'), cs.get_hyperparameter('n_fc_layers'), 1), + GreaterThanCondition(cs.get_hyperparameter('fc_layer_2'), cs.get_hyperparameter('n_fc_layers'), 2), + ]) + + return cs + + @staticmethod + def get_fidelity_space(seed: Union[int, None] = None) -> CS.ConfigurationSpace: + """ + Creates a ConfigSpace.ConfigurationSpace containing all fidelity parameters + + Fidelities + ---------- + budget: int - [1, 25] + Number of epochs to train + + Parameters + ---------- + seed : int, None + Fixing the seed for the ConfigSpace.ConfigurationSpace + + Returns + ------- + ConfigSpace.ConfigurationSpace + """ + fidelity_space = CS.ConfigurationSpace(seed=seed) + fidelity_space.add_hyperparameters([ + CS.UniformIntegerHyperparameter('budget', lower=1, upper=25, default_value=25, log=False) + ]) + return fidelity_space + + @staticmethod + def get_meta_information() -> Dict: + """ Returns the meta information for the benchmark """ + return { + 'name': 'Bag of baselines for multi-objective joint neural architecture search and ' + 'hyperparameter optimization', + 'references': ['@article{guerrero2021bag,' + 'title = {Bag of baselines for multi - objective joint neural architecture search and ' + 'hyperparameter optimization},' + 'author = {Guerrero-Viu, Julia and Hauns, Sven and Izquierdo, Sergio and Miotto, ' + 'Guilherme and Schrodi, Simon and Biedenkapp, Andre and Elsken, Thomas and Deng, ' + 'Difan and Lindauer, Marius and Hutter, Frank},},' + 'journal = {arXiv preprint arXiv:2105.01015},' + 'year = {2021}}', + ], + 'code': 'https://github.com/automl/multi-obj-baselines', + } + + @staticmethod + def get_objective_names() -> List[str]: + """Get the names 
of the objectives reported in the objective function.""" + return ['accuracy', 'model_size'] + + def init_model(self, config: Union[CS.Configuration, Dict]) -> Net: + """ + Function that returns the model initialized based on the configuration and fidelity + """ + if isinstance(config, CS.Configuration): + config = config.get_dictionary() + return Net(config, self.input_shape, self.output_classes) + + def __seed_everything(self): + """Helperfunction: Make the benchmark deterministic by setting the correct seeds""" + seed = self.rng.randint(0, 100000) + logger.debug(f'Generate seed: {seed}') + random.seed(seed) + np.random.seed(seed) + torch.manual_seed(seed) + torch.cuda.manual_seed_all(seed) + torch.backends.cudnn.deterministic = True + + def _shuffle_data(self, rng=None, shuffle_valid=False) -> None: + """ + Reshuffle the training data. + + Parameters + ---------- + rng + If 'rng' is None, the training idx are shuffled according to the class-random-state + shuffle_valid: bool, None + If true, shuffle the validation data. Defaults to False. + """ + random_state = rng_helper.get_rng(rng, self.rng) + + train_idx = np.arange(len(self.X_train)) + random_state.shuffle(train_idx) + self.X_train = self.X_train[train_idx] + self.y_train = self.y_train[train_idx] + + if shuffle_valid: + valid_idx = np.arange(len(self.X_valid)) + random_state.shuffle(valid_idx) + self.X_valid = self.X_valid[valid_idx] + self.y_valid = self.y_valid[valid_idx] + + @AbstractMultiObjectiveBenchmark.check_parameters + def objective_function(self, configuration: Union[CS.Configuration, Dict], + fidelity: Union[Dict, CS.Configuration, None] = None, + rng: Union[np.random.RandomState, int, None] = None, + shuffle: bool = False, + **kwargs) -> Dict: + """ + Train a CNN on either the flower or the fashion data set and return the performance on the validation + data split. + + Parameters + ---------- + configuration : Dict, CS.Configuration + Configuration for the CNN Model + fidelity: Dict, CS.Configuration, None + epoch: int - Values: [1, 50] + Number of epochs an architecture was trained. + Note: the number of epoch is 1 indexed! (Results after the first epoch: epoch = 1) + Fidelity parameters, check get_fidelity_space(). Uses default (max) value if None. + rng : np.random.RandomState, int, None + Random seed to use in the benchmark. + To prevent overfitting on a single seed, it is possible to pass a + parameter ``rng`` as 'int' or 'np.random.RandomState' to this function. + If this parameter is not given, the default random state is used. + shuffle: bool, None + If ``True``, shuffle the training idx. If no parameter ``rng`` is given, use the class random state. + Defaults to ``False``. 
+ kwargs + + Returns + ------- + Dict - + function_value : Dict + negative_accuracy: float + 1 - validation accuracy + log_model_size: float + log10 of the number of parameters + cost : time to train the network + info : Dict + train_accuracy : float, + training_cost : float, + valid_accuracy : float, + valid_cost : float, + test_accuracy : float, + test_cost : float, + model_size : int, + fidelity : Dict + used fidelities in this evaluation + """ + self.rng = rng_helper.get_rng(rng=rng, self_rng=self.rng) + self.__seed_everything() + + if shuffle: + self._shuffle_data(rng=self.rng, shuffle_valid=False) + + time_in = time.time() + + device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu') + logger.info(f'We use the device: {device}') + + # initializing model + model = self.init_model(configuration).to(device) + epochs = fidelity['budget'] + + optimizer = torch.optim.Adam(model.parameters(), lr=configuration['learning_rate_init']) + criterion = torch.nn.CrossEntropyLoss() + + ds_train = TensorDataset(self.X_train, self.y_train) + ds_train = DataLoader(ds_train, batch_size=configuration['batch_size'], shuffle=True) + + ds_val = TensorDataset(self.X_valid, self.y_valid) + ds_val = DataLoader(ds_val, batch_size=configuration['batch_size'], shuffle=True) + + ds_test = TensorDataset(self.X_test, self.y_test) + ds_test = DataLoader(ds_test, batch_size=configuration['batch_size'], shuffle=True) + + start = time.time() + t = tqdm.tqdm(total=epochs) + + train_accuracy = 0 + for epoch in range(epochs): + train_accuracy = model.train_fn(optimizer, criterion, ds_train, device).item() + t.set_postfix(train_accuracy=train_accuracy) + t.update() + training_runtime = time.time() - start + + num_params = np.sum([p.numel() for p in model.parameters()]).item() + start = time.time() + val_accuracy = model.eval_fn(ds_val, device).item() + eval_valid_runtime = time.time() - start + start = time.time() + test_accuracy = model.eval_fn(ds_test, device).item() + eval_test_runtime = time.time() - start + + t.set_postfix( + train_acc=train_accuracy, + val_acc=val_accuracy, + tst_acc=test_accuracy, + len=np.log10(num_params), + train_runtime=training_runtime, + eval_valid_runtime=eval_valid_runtime, + eval_test_runtime=eval_test_runtime, + ) + t.close() + + elapsed_time = time.time() - time_in + + return {'function_value': {'negative_accuracy': 1 - val_accuracy, + 'log_model_size': float(np.log10(num_params))}, + 'cost': float(training_runtime), + 'info': {'train_accuracy': train_accuracy, + 'training_cost': training_runtime, + 'valid_accuracy': val_accuracy, + 'valid_cost': eval_valid_runtime, + 'test_accuracy': test_accuracy, + 'test_cost': eval_test_runtime, + 'total_time': elapsed_time, + 'model_size': num_params, + 'fidelity': fidelity} + } + + @AbstractMultiObjectiveBenchmark.check_parameters + def objective_function_test(self, configuration: Union[CS.Configuration, Dict], + fidelity: Union[Dict, CS.Configuration, None] = None, + rng: Union[np.random.RandomState, int, None] = None, + shuffle: bool = False, + **kwargs) -> Dict: + """ + Train a CNN on both the train adn validation split of either the flower or the fashion data set and + get the test results. + Parameters + ---------- + configuration : Dict, CS.Configuration + Configuration for the CNN Model + fidelity: Dict, CS.Configuration, None + epoch: int - Values: [1, 50] + Number of epochs an architecture was trained. + rng : np.random.RandomState, int, None + Random seed to use in the benchmark. 
+ To prevent overfitting on a single seed, it is possible to pass a + parameter ``rng`` as 'int' or 'np.random.RandomState' to this function. + If this parameter is not given, the default random state is used. + shuffle: bool, None + If ``True``, shuffle the training idx. If no parameter ``rng`` is given, use the class random state. + Defaults to ``False``. + kwargs + + Returns + ------- + Dict - + function_value : Dict + negative_accuracy: float + 1 - test accuracy + log_model_size: float + log10 of the number of parameters + cost : time to train the network + info : Dict + train_accuracy : float, + training_cost : float, + test_accuracy : float, + test_cost : float, + model_size : int, + fidelity : Dict + used fidelities in this evaluation + """ + + time_in = time.time() + + self.rng = rng_helper.get_rng(rng=rng, self_rng=self.rng) + self.__seed_everything() + + if shuffle: + self._shuffle_data(rng=self.rng, shuffle_valid=False) + + train_X = torch.vstack((self.X_train, self.X_valid)) + y_train = torch.cat((self.y_train, self.y_valid)) + + device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu') + # initializing model + model = self.init_model(configuration).to(device) + epochs = fidelity['budget'] + + optimizer = torch.optim.Adam(model.parameters(), lr=configuration['learning_rate_init']) + criterion = torch.nn.CrossEntropyLoss() + + ds_train = TensorDataset(train_X, y_train) + ds_train = DataLoader(ds_train, batch_size=configuration['batch_size'], shuffle=True) + + ds_test = TensorDataset(self.X_test, self.y_test) + ds_test = DataLoader(ds_test, batch_size=configuration['batch_size'], shuffle=True) + + start = time.time() + t = tqdm.tqdm(total=epochs) + + train_accuracy = 0 + for epoch in range(epochs): + train_accuracy = model.train_fn(optimizer, criterion, ds_train, device).item() + t.set_postfix(train_accuracy=train_accuracy) + t.update() + training_runtime = time.time() - start + + num_params = np.sum([p.numel() for p in model.parameters()]) + start = time.time() + test_accuracy = model.eval_fn(ds_test, device).item() + eval_test_runtime = time.time() - start + + t.set_postfix( + train_acc=train_accuracy, + tst_acc=test_accuracy, + len=np.log10(num_params), + eval_train_runtime=training_runtime, + eval_test_runtime=eval_test_runtime, + + ) + t.close() + + elapsed_time = time.time() - time_in + + return {'function_value': {'negative_accuracy': 1 - test_accuracy, + 'log_model_size': float(np.log10(num_params))}, + 'cost': training_runtime, + 'info': {'train_accuracy': train_accuracy, + 'training_cost': training_runtime, + 'test_accuracy': test_accuracy, + 'test_cost': eval_test_runtime, + 'total_time': elapsed_time, + 'model_size': num_params, + 'fidelity': fidelity} + } + + +class FashionCNNBenchmark(CNNBenchmark): + + def __init__(self, rng: Union[np.random.RandomState, int, None] = None, **kwargs): + super(FashionCNNBenchmark, self).__init__(dataset='fashion', rng=rng, **kwargs) + + +class FlowerCNNBenchmark(CNNBenchmark): + + def __init__(self, rng: Union[np.random.RandomState, int, None] = None, **kwargs): + super(FlowerCNNBenchmark, self).__init__(dataset='flower', rng=rng, **kwargs) + + +__all__ = ["FashionCNNBenchmark", + "FlowerCNNBenchmark"] diff --git a/hpobench/container/benchmarks/mo/cnn_benchmark.py b/hpobench/container/benchmarks/mo/cnn_benchmark.py new file mode 100644 index 00000000..c9a1d009 --- /dev/null +++ b/hpobench/container/benchmarks/mo/cnn_benchmark.py @@ -0,0 +1,22 @@ +""" Benchmark for the Multi-Objective CNN Benchmark from 
hpobench/benchmarks/mo/cnn_benchmark.py +""" + +from hpobench.container.client_abstract_benchmark import AbstractMOBenchmarkClient + + +class FlowerCNNBenchmark(AbstractMOBenchmarkClient): + def __init__(self, **kwargs): + kwargs['benchmark_name'] = kwargs.get('benchmark_name', 'FlowerCNNBenchmark') + kwargs['container_name'] = kwargs.get('container_name', 'mo_cnn') + kwargs['latest'] = kwargs.get('container_tag', '0.0.1') + kwargs['gpu'] = kwargs.get('gpu', True) + super(FlowerCNNBenchmark, self).__init__(**kwargs) + + +class FashionCNNBenchmark(AbstractMOBenchmarkClient): + def __init__(self, **kwargs): + kwargs['benchmark_name'] = kwargs.get('benchmark_name', 'FashionCNNBenchmark') + kwargs['container_name'] = kwargs.get('container_name', 'mo_cnn') + kwargs['latest'] = kwargs.get('container_tag', '0.0.1') + kwargs['gpu'] = kwargs.get('gpu', True) + super(FashionCNNBenchmark, self).__init__(**kwargs) diff --git a/hpobench/container/benchmarks/od/__init__.py b/hpobench/container/benchmarks/od/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/hpobench/container/recipes/mo/Singularity.CNNBenchmark b/hpobench/container/recipes/mo/Singularity.CNNBenchmark new file mode 100644 index 00000000..c9870968 --- /dev/null +++ b/hpobench/container/recipes/mo/Singularity.CNNBenchmark @@ -0,0 +1,26 @@ +Bootstrap: docker +From: python:3.7-slim + +%labels +MAINTAINER sharmaa@informatik.uni-freiburg.de +VERSION v0.0.1 + +%post + apt update -y + apt install build-essential git wget -y + + cd /home \ + && cd /home \ + && git clone https://github.com/automl/HPOBench.git \ + && cd HPOBench \ + && git checkout master \ + && pip install .[mo_cnn] \ + && cd / \ + && mkdir /var/lib/hpobench/ \ + && chmod -R 777 /var/lib/hpobench/ \ + && rm -rf /var/lib/apt/lists/* \ + && pip cache purge + + +%runscript + python -s /home/HPOBench/hpobench/container/server_abstract_benchmark.py mo.cnn_benchmark $@ \ No newline at end of file diff --git a/hpobench/util/data_manager.py b/hpobench/util/data_manager.py index 00d9568d..c72305e1 100644 --- a/hpobench/util/data_manager.py +++ b/hpobench/util/data_manager.py @@ -37,7 +37,6 @@ except ImportError: print("pandas is not installed, can't download datasets for the ml.tabular_benchmarks (not needed for containers)") - import hpobench @@ -845,6 +844,93 @@ def _load(self) -> Tuple[np.ndarray, np.ndarray, np.ndarray, np.ndarray, np.ndar X_train, y_train = data[:n_train, 1:], data[:n_train, 0] X_val, y_val = data[n_train:n_train + n_val, 1:], data[n_train:n_train + n_val, 0] X_test, y_test = data[n_train + n_val:, 1:], data[n_train + n_val:, 0] + return X_train, y_train, X_val, y_val, X_test, y_test + + +class CNNDataManager(HoldoutDataManager): + + def __init__(self, dataset: str): + + super(CNNDataManager, self).__init__() + self.logger.debug('CNNDataManager: Starting to load data') + + allowed_datasets = ["fashion", "flower"] + assert dataset in allowed_datasets, f'Requested data set is not supported. Must be one of ' \ + f'{", ".join(allowed_datasets)}, but was {dataset}' + + self.url_source = f'https://github.com/ayushi-3536/DatasetHost/blob/main/{dataset}.tar.gz?raw=true' + self.dataset = dataset + self.save_dir = hpobench.config_file.data_dir / "CNN" / f'{dataset}' + self.compressed_data = self.save_dir / f'{dataset}.tar.gz' + self.create_save_directory(self.save_dir) + + def load(self): + """ + Loads CNN Benchmark from data directory as defined in hpobenchrc.data_directory. + Downloads data if necessary. 
+ + Returns + ------- + X_train: np.ndarray + y_train: np.ndarray + X_val: np.ndarray + y_val: np.ndarray + X_test: np.ndarray + y_test: np.ndarray + """ + + t = time() + self._download() + X_trn, y_trn, X_val, y_val, X_tst, y_tst = self._load() + self.logger.info(f'CNNDataManager: Data successfully loaded after {time() - t:.2f}') + + return X_trn, y_trn, X_val, y_val, X_tst, y_tst + + def _download(self): + + # Check if data is already downloaded. + # Use a file lock to ensure that no two processes try to download the same files at the same time. + if self.compressed_data.exists(): + self.logger.debug('CNNDataManager: Data already downloaded') + else: + + self.logger.info(f'CNNDataManager: Start downloading data from {self.url_source} ' + f'to {self.save_dir}') + self._download_file_with_progressbar(data_url=self.url_source, data_file=self.compressed_data) + self._untar_data(compressed_file=self.compressed_data, save_dir=self.save_dir) + + def _load(self) -> Tuple[np.ndarray, np.ndarray, np.ndarray, np.ndarray, np.ndarray, np.ndarray]: + """ + Load the data from file and split it into train, test and validation split. + + Returns + ------- + X_train: np.ndarray + y_train: np.ndarray + X_val: np.ndarray + y_val: np.ndarray + X_test: np.ndarray + y_test: np.ndarray + """ + + data_extract_path = self.save_dir / "data" + X_train = np.load(data_extract_path / 'x_train.npy') + y_train = np.load(data_extract_path / 'y_train.npy') + + X_val = np.load(data_extract_path / 'x_val.npy') + y_val = np.load(data_extract_path / 'y_val.npy') + + # Read Test datasets + X_test = np.load(data_extract_path / 'x_test.npy') + y_test = np.load(data_extract_path / 'y_test.npy') + + def __cast_x_y(x, y) -> Tuple: + import torch + return torch.tensor(x).float().permute(0, 3, 1, 2), torch.tensor(y).long() + + X_train, y_train = __cast_x_y(X_train, y_train) + X_val, y_val = __cast_x_y(X_val, y_val) + X_test, y_test = __cast_x_y(X_test, y_test) return X_train, y_train, X_val, y_val, X_test, y_test diff --git a/tests/test_mo_cnn.py b/tests/test_mo_cnn.py new file mode 100644 index 00000000..308c59ad --- /dev/null +++ b/tests/test_mo_cnn.py @@ -0,0 +1,48 @@ +import pytest + + +def test_mo_cnn_seeding(): + from hpobench.container.benchmarks.mo.cnn_benchmark import FlowerCNNBenchmark + b1 = FlowerCNNBenchmark(rng=0) + b2 = FlowerCNNBenchmark(rng=0) + test_config = { + 'batch_norm': True, 'batch_size': 71, 'conv_layer_0': 194, 'conv_layer_1': 152, + 'conv_layer_2': 92, 'fc_layer_0': 65, 'fc_layer_1': 19, 'fc_layer_2': 273, + 'global_avg_pooling': True, 'kernel_size': 5, 'learning_rate_init': 0.09091283280651452, + 'n_conv_layers': 2, 'n_fc_layers': 2 + } + + result_1 = b1.objective_function(test_config, rng=1, fidelity={'budget': 3}) + result_2 = b2.objective_function(test_config, rng=1, fidelity={'budget': 3}) + for metric in result_1['function_value'].keys(): + assert result_1['function_value'][metric] == pytest.approx(result_2['function_value'][metric], abs=0.001) + + +def test_mo_cnn_benchmark(): + from hpobench.container.benchmarks.mo.cnn_benchmark import FlowerCNNBenchmark + + # Check Seeding + benchmark = FlowerCNNBenchmark(rng=0) + cs = benchmark.get_configuration_space(seed=0) + cfg_1 = cs.sample_configuration() + + cs = benchmark.get_configuration_space(seed=0) + cfg_2 = cs.sample_configuration() + + assert cfg_1 == cfg_2 + + test_config = { + 'batch_norm': True, 'batch_size': 71, 'conv_layer_0': 194, 'conv_layer_1': 152, + 'conv_layer_2': 92, 'fc_layer_0': 65, 'fc_layer_1': 19, 'fc_layer_2': 273, + 
'global_avg_pooling': True, 'kernel_size': 5, 'learning_rate_init': 0.09091283280651452, + 'n_conv_layers': 2, 'n_fc_layers': 2 + } + + result_1 = benchmark.objective_function(test_config, rng=1, fidelity={'budget': 3}) + result_2 = benchmark.objective_function(test_config, rng=1, fidelity={'budget': 3}) + print(f'MO CNN: Valid Accuracy = {result_1["info"]["valid_accuracy"]}') + print(f'MO CNN: Train Accuracy = {result_1["info"]["train_accuracy"]}') + # assert result_1['info']['train_accuracy'] == pytest.approx(0.1044, rel=0.001) + # assert result_1['info']['valid_accuracy'] == pytest.approx(0.1029, rel=0.001) + assert result_1['info']['valid_accuracy'] == pytest.approx(1 - result_1['function_value']['negative_accuracy'], abs=0.001) + assert result_1['info']['train_accuracy'] == result_2['info']['train_accuracy'] From fd25d57e846bf3e39b313820a9ad39558f52bd2d Mon Sep 17 00:00:00 2001 From: neeratyoy Date: Thu, 2 Jun 2022 16:24:50 +0200 Subject: [PATCH 137/147] Updating container information for ML benches --- .../container/benchmarks/ml/lr_benchmark.py | 16 ++++++++----- .../container/benchmarks/ml/nn_benchmark.py | 16 ++++++++----- .../container/benchmarks/ml/rf_benchmark.py | 16 ++++++++----- .../container/benchmarks/ml/svm_benchmark.py | 16 ++++++++----- .../benchmarks/ml/tabular_benchmark.py | 8 +++++-- .../benchmarks/ml/xgboost_benchmark.py | 24 ++++++++++++------- requirements.txt | 2 +- 7 files changed, 62 insertions(+), 36 deletions(-) diff --git a/hpobench/container/benchmarks/ml/lr_benchmark.py b/hpobench/container/benchmarks/ml/lr_benchmark.py index 979cda3e..2f40118f 100644 --- a/hpobench/container/benchmarks/ml/lr_benchmark.py +++ b/hpobench/container/benchmarks/ml/lr_benchmark.py @@ -6,27 +6,31 @@ from hpobench.container.client_abstract_benchmark import AbstractBenchmarkClient +container_name = "ml_mfbb" +container_version = "0.0.3" + + class LRBenchmark(AbstractBenchmarkClient): def __init__(self, **kwargs): kwargs['benchmark_name'] = kwargs.get('benchmark_name', 'LRBenchmark') - kwargs['container_name'] = kwargs.get('container_name', 'ml_mmfb') - kwargs['latest'] = kwargs.get('container_tag', '0.0.1') + kwargs['container_name'] = kwargs.get('container_name', container_name) + kwargs['latest'] = kwargs.get('container_tag', container_version) super(LRBenchmark, self).__init__(**kwargs) class LRBenchmarkBB(AbstractBenchmarkClient): def __init__(self, **kwargs): kwargs['benchmark_name'] = kwargs.get('benchmark_name', 'LRBenchmarkBB') - kwargs['container_name'] = kwargs.get('container_name', 'ml_mmfb') - kwargs['latest'] = kwargs.get('container_tag', '0.0.1') + kwargs['container_name'] = kwargs.get('container_name', container_name) + kwargs['latest'] = kwargs.get('container_tag', container_version) super(LRBenchmarkBB, self).__init__(**kwargs) class LRBenchmarkMF(AbstractBenchmarkClient): def __init__(self, **kwargs): kwargs['benchmark_name'] = kwargs.get('benchmark_name', 'LRBenchmarkMF') - kwargs['container_name'] = kwargs.get('container_name', 'ml_mmfb') - kwargs['latest'] = kwargs.get('container_tag', '0.0.1') + kwargs['container_name'] = kwargs.get('container_name', container_name) + kwargs['latest'] = kwargs.get('container_tag', container_version) super(LRBenchmarkMF, self).__init__(**kwargs) diff --git a/hpobench/container/benchmarks/ml/nn_benchmark.py b/hpobench/container/benchmarks/ml/nn_benchmark.py index 04955e82..bc042ee9 100644 --- a/hpobench/container/benchmarks/ml/nn_benchmark.py +++ b/hpobench/container/benchmarks/ml/nn_benchmark.py @@ -6,27 +6,31 @@ from 
hpobench.container.client_abstract_benchmark import AbstractBenchmarkClient +container_name = "ml_mfbb" +container_version = "0.0.3" + + class NNBenchmark(AbstractBenchmarkClient): def __init__(self, **kwargs): kwargs['benchmark_name'] = kwargs.get('benchmark_name', 'NNBenchmark') - kwargs['container_name'] = kwargs.get('container_name', 'ml_mmfb') - kwargs['latest'] = kwargs.get('container_tag', '0.0.1') + kwargs['container_name'] = kwargs.get('container_name', container_name) + kwargs['latest'] = kwargs.get('container_tag', container_version) super(NNBenchmark, self).__init__(**kwargs) class NNBenchmarkBB(AbstractBenchmarkClient): def __init__(self, **kwargs): kwargs['benchmark_name'] = kwargs.get('benchmark_name', 'NNBenchmarkBB') - kwargs['container_name'] = kwargs.get('container_name', 'ml_mmfb') - kwargs['latest'] = kwargs.get('container_tag', '0.0.1') + kwargs['container_name'] = kwargs.get('container_name', container_name) + kwargs['latest'] = kwargs.get('container_tag', container_version) super(NNBenchmarkBB, self).__init__(**kwargs) class NNBenchmarkMF(AbstractBenchmarkClient): def __init__(self, **kwargs): kwargs['benchmark_name'] = kwargs.get('benchmark_name', 'NNBenchmarkMF') - kwargs['container_name'] = kwargs.get('container_name', 'ml_mmfb') - kwargs['latest'] = kwargs.get('container_tag', '0.0.1') + kwargs['container_name'] = kwargs.get('container_name', container_name) + kwargs['latest'] = kwargs.get('container_tag', container_version) super(NNBenchmarkMF, self).__init__(**kwargs) diff --git a/hpobench/container/benchmarks/ml/rf_benchmark.py b/hpobench/container/benchmarks/ml/rf_benchmark.py index a414349d..dc366ca9 100644 --- a/hpobench/container/benchmarks/ml/rf_benchmark.py +++ b/hpobench/container/benchmarks/ml/rf_benchmark.py @@ -6,27 +6,31 @@ from hpobench.container.client_abstract_benchmark import AbstractBenchmarkClient +container_name = "ml_mfbb" +container_version = "0.0.3" + + class RandomForestBenchmark(AbstractBenchmarkClient): def __init__(self, **kwargs): kwargs['benchmark_name'] = kwargs.get('benchmark_name', 'RandomForestBenchmark') - kwargs['container_name'] = kwargs.get('container_name', 'ml_mmfb') - kwargs['latest'] = kwargs.get('container_tag', '0.0.1') + kwargs['container_name'] = kwargs.get('container_name', container_name) + kwargs['latest'] = kwargs.get('container_tag', container_version) super(RandomForestBenchmark, self).__init__(**kwargs) class RandomForestBenchmarkBB(AbstractBenchmarkClient): def __init__(self, **kwargs): kwargs['benchmark_name'] = kwargs.get('benchmark_name', 'RandomForestBenchmarkBB') - kwargs['container_name'] = kwargs.get('container_name', 'ml_mmfb') - kwargs['latest'] = kwargs.get('container_tag', '0.0.1') + kwargs['container_name'] = kwargs.get('container_name', container_name) + kwargs['latest'] = kwargs.get('container_tag', container_version) super(RandomForestBenchmarkBB, self).__init__(**kwargs) class RandomForestBenchmarkMF(AbstractBenchmarkClient): def __init__(self, **kwargs): kwargs['benchmark_name'] = kwargs.get('benchmark_name', 'RandomForestBenchmarkMF') - kwargs['container_name'] = kwargs.get('container_name', 'ml_mmfb') - kwargs['latest'] = kwargs.get('container_tag', '0.0.1') + kwargs['container_name'] = kwargs.get('container_name', container_name) + kwargs['latest'] = kwargs.get('container_tag', container_version) super(RandomForestBenchmarkMF, self).__init__(**kwargs) diff --git a/hpobench/container/benchmarks/ml/svm_benchmark.py b/hpobench/container/benchmarks/ml/svm_benchmark.py index 7547a81a..b6251a07 
100644 --- a/hpobench/container/benchmarks/ml/svm_benchmark.py +++ b/hpobench/container/benchmarks/ml/svm_benchmark.py @@ -6,27 +6,31 @@ from hpobench.container.client_abstract_benchmark import AbstractBenchmarkClient +container_name = "ml_mfbb" +container_version = "0.0.3" + + class SVMBenchmark(AbstractBenchmarkClient): def __init__(self, **kwargs): kwargs['benchmark_name'] = kwargs.get('benchmark_name', 'SVMBenchmark') - kwargs['container_name'] = kwargs.get('container_name', 'ml_mmfb') - kwargs['latest'] = kwargs.get('container_tag', '0.0.1') + kwargs['container_name'] = kwargs.get('container_name', container_name) + kwargs['latest'] = kwargs.get('container_tag', container_version) super(SVMBenchmark, self).__init__(**kwargs) class SVMBenchmarkMF(AbstractBenchmarkClient): def __init__(self, **kwargs): kwargs['benchmark_name'] = kwargs.get('benchmark_name', 'SVMBenchmarkMF') - kwargs['container_name'] = kwargs.get('container_name', 'ml_mmfb') - kwargs['latest'] = kwargs.get('container_tag', '0.0.1') + kwargs['container_name'] = kwargs.get('container_name', container_name) + kwargs['latest'] = kwargs.get('container_tag', container_version) super(SVMBenchmarkMF, self).__init__(**kwargs) class SVMBenchmarkBB(AbstractBenchmarkClient): def __init__(self, **kwargs): kwargs['benchmark_name'] = kwargs.get('benchmark_name', 'SVMBenchmarkBB') - kwargs['container_name'] = kwargs.get('container_name', 'ml_mmfb') - kwargs['latest'] = kwargs.get('container_tag', '0.0.1') + kwargs['container_name'] = kwargs.get('container_name', container_name) + kwargs['latest'] = kwargs.get('container_tag', container_version) super(SVMBenchmarkBB, self).__init__(**kwargs) diff --git a/hpobench/container/benchmarks/ml/tabular_benchmark.py b/hpobench/container/benchmarks/ml/tabular_benchmark.py index 6d19953b..185e2b46 100644 --- a/hpobench/container/benchmarks/ml/tabular_benchmark.py +++ b/hpobench/container/benchmarks/ml/tabular_benchmark.py @@ -6,11 +6,15 @@ from hpobench.container.client_abstract_benchmark import AbstractBenchmarkClient +container_name = "ml_tabular_benchmarks" +container_version = "0.0.3" + + class TabularBenchmark(AbstractBenchmarkClient): def __init__(self, **kwargs): kwargs['benchmark_name'] = kwargs.get('benchmark_name', 'TabularBenchmark') - kwargs['container_name'] = kwargs.get('container_name', 'ml_tabular_benchmarks') - kwargs['latest'] = kwargs.get('container_tag', '0.0.1') + kwargs['container_name'] = kwargs.get('container_name', container_name) + kwargs['latest'] = kwargs.get('container_tag', container_version) super(TabularBenchmark, self).__init__(**kwargs) diff --git a/hpobench/container/benchmarks/ml/xgboost_benchmark.py b/hpobench/container/benchmarks/ml/xgboost_benchmark.py index c82ea606..45d87611 100644 --- a/hpobench/container/benchmarks/ml/xgboost_benchmark.py +++ b/hpobench/container/benchmarks/ml/xgboost_benchmark.py @@ -6,36 +6,42 @@ from hpobench.container.client_abstract_benchmark import AbstractBenchmarkClient +container_name = "ml_mfbb" +container_version = "0.0.3" + + class XGBoostBenchmark(AbstractBenchmarkClient): def __init__(self, **kwargs): kwargs['benchmark_name'] = kwargs.get('benchmark_name', 'XGBoostBenchmark') - kwargs['container_name'] = kwargs.get('container_name', 'ml_mmfb') - kwargs['latest'] = kwargs.get('container_tag', '0.0.1') + kwargs['container_name'] = kwargs.get('container_name', container_name) + kwargs['latest'] = kwargs.get('container_tag', container_version) super(XGBoostBenchmark, self).__init__(**kwargs) class 
XGBoostBenchmarkBB(AbstractBenchmarkClient): def __init__(self, **kwargs): kwargs['benchmark_name'] = kwargs.get('benchmark_name', 'XGBoostBenchmarkBB') - kwargs['container_name'] = kwargs.get('container_name', 'ml_mmfb') - kwargs['latest'] = kwargs.get('container_tag', '0.0.1') + kwargs['container_name'] = kwargs.get('container_name', container_name) + kwargs['latest'] = kwargs.get('container_tag', container_version) super(XGBoostBenchmarkBB, self).__init__(**kwargs) class XGBoostBenchmarkMF(AbstractBenchmarkClient): def __init__(self, **kwargs): kwargs['benchmark_name'] = kwargs.get('benchmark_name', 'XGBoostBenchmarkMF') - kwargs['container_name'] = kwargs.get('container_name', 'ml_mmfb') - kwargs['latest'] = kwargs.get('container_tag', '0.0.1') + kwargs['container_name'] = kwargs.get('container_name', container_name) + kwargs['latest'] = kwargs.get('container_tag', container_version) super(XGBoostBenchmarkMF, self).__init__(**kwargs) class XGBoostSearchSpace3Benchmark(AbstractBenchmarkClient): def __init__(self, **kwargs): kwargs['benchmark_name'] = kwargs.get('benchmark_name', 'XGBoostSearchSpace3Benchmark') - kwargs['container_name'] = kwargs.get('container_name', 'ml_mmfb') - kwargs['latest'] = kwargs.get('container_tag', '0.0.1') + kwargs['container_name'] = kwargs.get('container_name', container_name) + kwargs['latest'] = kwargs.get('container_tag', container_version) super(XGBoostSearchSpace3Benchmark, self).__init__(**kwargs) -__all__ = ['XGBoostBenchmark', 'XGBoostBenchmarkBB', 'XGBoostBenchmarkMF'] +__all__ = [ + 'XGBoostBenchmark', 'XGBoostBenchmarkBB', 'XGBoostBenchmarkMF', 'XGBoostSearchSpace3Benchmark' +] diff --git a/requirements.txt b/requirements.txt index cdd06606..46e2ffa4 100644 --- a/requirements.txt +++ b/requirements.txt @@ -3,7 +3,7 @@ numpy>=1.18.1 ConfigSpace>=0.4.12 Pyro4==4.80 oslo.concurrency>=4.2.0 -pandas>=1.3.5 +pandas>=1.2.4 scikit-learn>=0.24.1 openml>=0.12.2 tqdm>=4.64.0 \ No newline at end of file From 0ad578fdad2d4df226c0b0d7c369055d57b1cfda Mon Sep 17 00:00:00 2001 From: neeratyoy Date: Fri, 3 Jun 2022 21:12:38 +0200 Subject: [PATCH 138/147] Bug fix for containers and fidelity space --- hpobench/benchmarks/ml/__init__.py | 33 +++++++++++-------- hpobench/benchmarks/ml/lr_benchmark.py | 8 +++-- hpobench/benchmarks/ml/nn_benchmark.py | 6 ++-- hpobench/benchmarks/ml/rf_benchmark.py | 6 ++-- hpobench/benchmarks/ml/svm_benchmark.py | 3 +- hpobench/benchmarks/ml/xgboost_benchmark.py | 6 ++-- .../container/benchmarks/ml/lr_benchmark.py | 2 +- .../container/benchmarks/ml/nn_benchmark.py | 2 +- .../container/benchmarks/ml/rf_benchmark.py | 2 +- .../container/benchmarks/ml/svm_benchmark.py | 2 +- .../benchmarks/ml/svm_benchmark_old.py | 15 --------- .../benchmarks/ml/xgboost_benchmark.py | 2 +- .../benchmarks/ml/xgboost_benchmark_old.py | 24 -------------- .../dependencies/ml/ml_benchmark_template.py | 15 --------- 14 files changed, 44 insertions(+), 82 deletions(-) delete mode 100644 hpobench/container/benchmarks/ml/svm_benchmark_old.py delete mode 100644 hpobench/container/benchmarks/ml/xgboost_benchmark_old.py diff --git a/hpobench/benchmarks/ml/__init__.py b/hpobench/benchmarks/ml/__init__.py index 64e399cd..68d70459 100644 --- a/hpobench/benchmarks/ml/__init__.py +++ b/hpobench/benchmarks/ml/__init__.py @@ -1,4 +1,3 @@ -from hpobench.benchmarks.ml.histgb_benchmark import HistGBBenchmark, HistGBBenchmarkBB, HistGBBenchmarkMF from hpobench.benchmarks.ml.lr_benchmark import LRBenchmark, LRBenchmarkBB, LRBenchmarkMF from hpobench.benchmarks.ml.nn_benchmark 
import NNBenchmark, NNBenchmarkBB, NNBenchmarkMF from hpobench.benchmarks.ml.rf_benchmark import RandomForestBenchmark, RandomForestBenchmarkBB, \ @@ -6,17 +5,25 @@ from hpobench.benchmarks.ml.svm_benchmark import SVMBenchmark, SVMBenchmarkBB, SVMBenchmarkMF from hpobench.benchmarks.ml.tabular_benchmark import TabularBenchmark + try: + # `xgboost` is from https://xgboost.readthedocs.io/en/latest/install.html#conda + # and not part of the scikit-learn bundle and not a strict requirement for running HPOBench + # for other spaces and also for tabular benchmarks from hpobench.benchmarks.ml.xgboost_benchmark import XGBoostBenchmark, XGBoostBenchmarkBB, XGBoostBenchmarkMF -except ImportError: - pass - - -__all__ = ['HistGBBenchmark', 'HistGBBenchmarkBB', 'HistGBBenchmarkMF', - 'LRBenchmark', 'LRBenchmarkBB', 'LRBenchmarkMF', - 'NNBenchmark', 'NNBenchmarkBB', 'NNBenchmarkMF', - 'RandomForestBenchmark', 'RandomForestBenchmarkBB', 'RandomForestBenchmarkMF', - 'SVMBenchmark', 'SVMBenchmarkBB', 'SVMBenchmarkMF', - 'TabularBenchmark', - 'XGBoostBenchmark', 'XGBoostBenchmarkBB', 'XGBoostBenchmarkMF', - ] + __all__ = [ + 'LRBenchmark', 'LRBenchmarkBB', 'LRBenchmarkMF', + 'NNBenchmark', 'NNBenchmarkBB', 'NNBenchmarkMF', + 'RandomForestBenchmark', 'RandomForestBenchmarkBB', 'RandomForestBenchmarkMF', + 'SVMBenchmark', 'SVMBenchmarkBB', 'SVMBenchmarkMF', + 'XGBoostBenchmark', 'XGBoostBenchmarkBB', 'XGBoostBenchmarkMF', + 'TabularBenchmark', + ] +except (ImportError, AttributeError) as e: + __all__ = [ + 'LRBenchmark', 'LRBenchmarkBB', 'LRBenchmarkMF', + 'NNBenchmark', 'NNBenchmarkBB', 'NNBenchmarkMF', + 'RandomForestBenchmark', 'RandomForestBenchmarkBB', 'RandomForestBenchmarkMF', + 'SVMBenchmark', 'SVMBenchmarkBB', 'SVMBenchmarkMF', + 'TabularBenchmark', + ] diff --git a/hpobench/benchmarks/ml/lr_benchmark.py b/hpobench/benchmarks/ml/lr_benchmark.py index 6bd7c214..aa7aa162 100644 --- a/hpobench/benchmarks/ml/lr_benchmark.py +++ b/hpobench/benchmarks/ml/lr_benchmark.py @@ -53,7 +53,7 @@ def get_configuration_space(seed: Union[int, None] = None) -> CS.ConfigurationSp return cs @staticmethod - def get_fidelity_space(self, seed: Union[int, None] = None) -> CS.ConfigurationSpace: + def get_fidelity_space(seed: Union[int, None] = None) -> CS.ConfigurationSpace: fidelity_space = CS.ConfigurationSpace(seed=seed) fidelity_space.add_hyperparameters( # gray-box setting (multi-multi-fidelity) - iterations + data subsample @@ -285,7 +285,8 @@ def _train_objective( class LRBenchmarkBB(LRBenchmark): """ Black-box version of the LRBenchmark """ - def get_fidelity_space(self, seed: Union[int, None] = None) -> CS.ConfigurationSpace: + @staticmethod + def get_fidelity_space(seed: Union[int, None] = None) -> CS.ConfigurationSpace: fidelity_space = CS.ConfigurationSpace(seed=seed) fidelity_space.add_hyperparameters( # black-box setting (full fidelity) @@ -297,7 +298,8 @@ def get_fidelity_space(self, seed: Union[int, None] = None) -> CS.ConfigurationS class LRBenchmarkMF(LRBenchmark): """ Multi-fidelity version of the LRBenchmark """ - def get_fidelity_space(self, seed: Union[int, None] = None) -> CS.ConfigurationSpace: + @staticmethod + def get_fidelity_space(seed: Union[int, None] = None) -> CS.ConfigurationSpace: fidelity_space = CS.ConfigurationSpace(seed=seed) fidelity_space.add_hyperparameters( # gray-box setting (multi-fidelity) - iterations diff --git a/hpobench/benchmarks/ml/nn_benchmark.py b/hpobench/benchmarks/ml/nn_benchmark.py index 49723fd8..4263278f 100644 --- a/hpobench/benchmarks/ml/nn_benchmark.py +++ 
b/hpobench/benchmarks/ml/nn_benchmark.py @@ -293,7 +293,8 @@ def _train_objective( class NNBenchmarkBB(NNBenchmark): """ Black-box version of the NNBenchmark """ - def get_fidelity_space(self, seed: Union[int, None] = None) -> CS.ConfigurationSpace: + @staticmethod + def get_fidelity_space(seed: Union[int, None] = None) -> CS.ConfigurationSpace: fidelity_space = CS.ConfigurationSpace(seed=seed) fidelity_space.add_hyperparameters( # black-box setting (full fidelity) @@ -305,7 +306,8 @@ def get_fidelity_space(self, seed: Union[int, None] = None) -> CS.ConfigurationS class NNBenchmarkMF(NNBenchmark): """ Multi-fidelity version of the NNBenchmark """ - def get_fidelity_space(self, seed: Union[int, None] = None) -> CS.ConfigurationSpace: + @staticmethod + def get_fidelity_space(seed: Union[int, None] = None) -> CS.ConfigurationSpace: fidelity_space = CS.ConfigurationSpace(seed=seed) fidelity_space.add_hyperparameters( # gray-box setting (multi-fidelity) - iterations diff --git a/hpobench/benchmarks/ml/rf_benchmark.py b/hpobench/benchmarks/ml/rf_benchmark.py index 251f64c5..b6874788 100644 --- a/hpobench/benchmarks/ml/rf_benchmark.py +++ b/hpobench/benchmarks/ml/rf_benchmark.py @@ -284,7 +284,8 @@ def _train_objective( class RandomForestBenchmarkBB(RandomForestBenchmark): """ Black-box version of the RandomForestBenchmark """ - def get_fidelity_space(self, seed: Union[int, None] = None) -> CS.ConfigurationSpace: + @staticmethod + def get_fidelity_space(seed: Union[int, None] = None) -> CS.ConfigurationSpace: fidelity_space = CS.ConfigurationSpace(seed=seed) fidelity_space.add_hyperparameters( # black-box setting (full fidelity) @@ -298,7 +299,8 @@ def get_fidelity_space(self, seed: Union[int, None] = None) -> CS.ConfigurationS class RandomForestBenchmarkMF(RandomForestBenchmark): """ Multi-fidelity version of the RandomForestBenchmark """ - def get_fidelity_space(self, seed: Union[int, None] = None) -> CS.ConfigurationSpace: + @staticmethod + def get_fidelity_space(seed: Union[int, None] = None) -> CS.ConfigurationSpace: fidelity_space = CS.ConfigurationSpace(seed=seed) fidelity_space.add_hyperparameters( # gray-box setting (multi-fidelity) - ntrees diff --git a/hpobench/benchmarks/ml/svm_benchmark.py b/hpobench/benchmarks/ml/svm_benchmark.py index dcc56587..c7b6a816 100644 --- a/hpobench/benchmarks/ml/svm_benchmark.py +++ b/hpobench/benchmarks/ml/svm_benchmark.py @@ -110,7 +110,8 @@ def get_model_size(self, model: SVC) -> float: class SVMBenchmarkBB(SVMBenchmark): """ Black-box version of the SVMBenchmark """ - def get_fidelity_space(self, seed: Union[int, None] = None) -> CS.ConfigurationSpace: + @staticmethod + def get_fidelity_space(seed: Union[int, None] = None) -> CS.ConfigurationSpace: fidelity_space = CS.ConfigurationSpace(seed=seed) fidelity_space.add_hyperparameter( # uses the entire data (subsample=1), reflecting the black-box setup diff --git a/hpobench/benchmarks/ml/xgboost_benchmark.py b/hpobench/benchmarks/ml/xgboost_benchmark.py index d4f287ed..234c2cee 100644 --- a/hpobench/benchmarks/ml/xgboost_benchmark.py +++ b/hpobench/benchmarks/ml/xgboost_benchmark.py @@ -144,7 +144,8 @@ def get_model_size(self, model: xgb.XGBClassifier) -> float: class XGBoostBenchmarkBB(XGBoostBenchmark): """ Black-box version of the XGBoostBenchmark """ - def get_fidelity_space(self, seed: Union[int, None] = None) -> CS.ConfigurationSpace: + @staticmethod + def get_fidelity_space(seed: Union[int, None] = None) -> CS.ConfigurationSpace: fidelity_space = CS.ConfigurationSpace(seed=seed) 
fidelity_space.add_hyperparameters( # black-box setting (full fidelity) @@ -158,7 +159,8 @@ def get_fidelity_space(self, seed: Union[int, None] = None) -> CS.ConfigurationS class XGBoostBenchmarkMF(XGBoostBenchmark): """ Multi-fidelity version of the XGBoostBenchmark """ - def get_fidelity_space(self, seed: Union[int, None] = None) -> CS.ConfigurationSpace: + @staticmethod + def get_fidelity_space(seed: Union[int, None] = None) -> CS.ConfigurationSpace: fidelity_space = CS.ConfigurationSpace(seed=seed) fidelity_space.add_hyperparameters( # gray-box setting (multi-fidelity) - ntrees diff --git a/hpobench/container/benchmarks/ml/lr_benchmark.py b/hpobench/container/benchmarks/ml/lr_benchmark.py index 2f40118f..b06955fa 100644 --- a/hpobench/container/benchmarks/ml/lr_benchmark.py +++ b/hpobench/container/benchmarks/ml/lr_benchmark.py @@ -6,7 +6,7 @@ from hpobench.container.client_abstract_benchmark import AbstractBenchmarkClient -container_name = "ml_mfbb" +container_name = "ml_mmfb" container_version = "0.0.3" diff --git a/hpobench/container/benchmarks/ml/nn_benchmark.py b/hpobench/container/benchmarks/ml/nn_benchmark.py index bc042ee9..3cae8748 100644 --- a/hpobench/container/benchmarks/ml/nn_benchmark.py +++ b/hpobench/container/benchmarks/ml/nn_benchmark.py @@ -6,7 +6,7 @@ from hpobench.container.client_abstract_benchmark import AbstractBenchmarkClient -container_name = "ml_mfbb" +container_name = "ml_mmfb" container_version = "0.0.3" diff --git a/hpobench/container/benchmarks/ml/rf_benchmark.py b/hpobench/container/benchmarks/ml/rf_benchmark.py index dc366ca9..72b11d67 100644 --- a/hpobench/container/benchmarks/ml/rf_benchmark.py +++ b/hpobench/container/benchmarks/ml/rf_benchmark.py @@ -6,7 +6,7 @@ from hpobench.container.client_abstract_benchmark import AbstractBenchmarkClient -container_name = "ml_mfbb" +container_name = "ml_mmfb" container_version = "0.0.3" diff --git a/hpobench/container/benchmarks/ml/svm_benchmark.py b/hpobench/container/benchmarks/ml/svm_benchmark.py index b6251a07..473e76bb 100644 --- a/hpobench/container/benchmarks/ml/svm_benchmark.py +++ b/hpobench/container/benchmarks/ml/svm_benchmark.py @@ -6,7 +6,7 @@ from hpobench.container.client_abstract_benchmark import AbstractBenchmarkClient -container_name = "ml_mfbb" +container_name = "ml_mmfb" container_version = "0.0.3" diff --git a/hpobench/container/benchmarks/ml/svm_benchmark_old.py b/hpobench/container/benchmarks/ml/svm_benchmark_old.py deleted file mode 100644 index 4955f057..00000000 --- a/hpobench/container/benchmarks/ml/svm_benchmark_old.py +++ /dev/null @@ -1,15 +0,0 @@ -#!/usr/bin/python3 -# -*- coding: utf-8 -*- - -""" Benchmark for the XGBoost Benchmark from hpobench/benchmarks/ml/xgboost_benchmark """ - -from hpobench.container.client_abstract_benchmark import AbstractBenchmarkClient - - -class SupportVectorMachine(AbstractBenchmarkClient): - def __init__(self, task_id: int, **kwargs): - kwargs['task_id'] = task_id - kwargs['benchmark_name'] = kwargs.get('benchmark_name', 'SupportVectorMachine') - kwargs['container_name'] = kwargs.get('container_name', 'svm_benchmark') - kwargs['latest'] = kwargs.get('container_tag', '0.0.3') - super(SupportVectorMachine, self).__init__(**kwargs) diff --git a/hpobench/container/benchmarks/ml/xgboost_benchmark.py b/hpobench/container/benchmarks/ml/xgboost_benchmark.py index 45d87611..c99e7602 100644 --- a/hpobench/container/benchmarks/ml/xgboost_benchmark.py +++ b/hpobench/container/benchmarks/ml/xgboost_benchmark.py @@ -6,7 +6,7 @@ from 
hpobench.container.client_abstract_benchmark import AbstractBenchmarkClient -container_name = "ml_mfbb" +container_name = "ml_mmfb" container_version = "0.0.3" diff --git a/hpobench/container/benchmarks/ml/xgboost_benchmark_old.py b/hpobench/container/benchmarks/ml/xgboost_benchmark_old.py deleted file mode 100644 index df475748..00000000 --- a/hpobench/container/benchmarks/ml/xgboost_benchmark_old.py +++ /dev/null @@ -1,24 +0,0 @@ -#!/usr/bin/python3 -# -*- coding: utf-8 -*- - -""" Benchmark for the XGBoost Benchmark from hpobench/benchmarks/ml/xgboost_benchmark """ - -from hpobench.container.client_abstract_benchmark import AbstractBenchmarkClient - - -class XGBoostBenchmark(AbstractBenchmarkClient): - def __init__(self, task_id: int, **kwargs): - kwargs['task_id'] = task_id - kwargs['benchmark_name'] = kwargs.get('benchmark_name', 'XGBoostBenchmark') - kwargs['container_name'] = kwargs.get('container_name', 'xgboost_benchmark') - kwargs['latest'] = kwargs.get('container_tag', '0.0.3') - super(XGBoostBenchmark, self).__init__(**kwargs) - - -class XGBoostExtendedBenchmark(AbstractBenchmarkClient): - def __init__(self, task_id: int, **kwargs): - kwargs['task_id'] = task_id - kwargs['benchmark_name'] = kwargs.get('benchmark_name', 'XGBoostExtendedBenchmark') - kwargs['container_name'] = kwargs.get('container_name', 'xgboost_benchmark') - kwargs['latest'] = kwargs.get('container_tag', '0.0.3') - super(XGBoostExtendedBenchmark, self).__init__(**kwargs) diff --git a/hpobench/dependencies/ml/ml_benchmark_template.py b/hpobench/dependencies/ml/ml_benchmark_template.py index 396b18ad..94aadcf8 100644 --- a/hpobench/dependencies/ml/ml_benchmark_template.py +++ b/hpobench/dependencies/ml/ml_benchmark_template.py @@ -464,18 +464,3 @@ def objective_function_test( 'cost': float(model_fit_time + info['test_costs']['acc']), 'info': info } - - -if __name__ == "__main__": - from hpobench.benchmarks.ml import RandomForestBenchmarkMF - benchmark = RandomForestBenchmarkMF(task_id=10101) - config = benchmark.configuration_space.sample_configuration() - print(config) - fidelity = benchmark.fidelity_space.sample_configuration() - print(fidelity) - start = time.time() - res = benchmark.objective_function( - config, fidelity, record_train=True, rng=123, get_learning_curve=True - ) - print(res) - print(time.time() - start) From dfa203455b35ce6ee7dd176083e4f10caa8598f7 Mon Sep 17 00:00:00 2001 From: neeratyoy Date: Wed, 31 Aug 2022 14:59:07 +0200 Subject: [PATCH 139/147] Updating container version used for experiments --- hpobench/container/benchmarks/ml/lr_benchmark.py | 2 +- hpobench/container/benchmarks/ml/nn_benchmark.py | 2 +- hpobench/container/benchmarks/ml/rf_benchmark.py | 2 +- hpobench/container/benchmarks/ml/svm_benchmark.py | 2 +- hpobench/container/benchmarks/ml/tabular_benchmark.py | 2 +- hpobench/container/benchmarks/ml/xgboost_benchmark.py | 2 +- 6 files changed, 6 insertions(+), 6 deletions(-) diff --git a/hpobench/container/benchmarks/ml/lr_benchmark.py b/hpobench/container/benchmarks/ml/lr_benchmark.py index b06955fa..61b80a13 100644 --- a/hpobench/container/benchmarks/ml/lr_benchmark.py +++ b/hpobench/container/benchmarks/ml/lr_benchmark.py @@ -7,7 +7,7 @@ container_name = "ml_mmfb" -container_version = "0.0.3" +container_version = "0.0.4" class LRBenchmark(AbstractBenchmarkClient): diff --git a/hpobench/container/benchmarks/ml/nn_benchmark.py b/hpobench/container/benchmarks/ml/nn_benchmark.py index 3cae8748..d4b0f52a 100644 --- a/hpobench/container/benchmarks/ml/nn_benchmark.py +++ 
b/hpobench/container/benchmarks/ml/nn_benchmark.py @@ -7,7 +7,7 @@ container_name = "ml_mmfb" -container_version = "0.0.3" +container_version = "0.0.4" class NNBenchmark(AbstractBenchmarkClient): diff --git a/hpobench/container/benchmarks/ml/rf_benchmark.py b/hpobench/container/benchmarks/ml/rf_benchmark.py index 72b11d67..13e9bb47 100644 --- a/hpobench/container/benchmarks/ml/rf_benchmark.py +++ b/hpobench/container/benchmarks/ml/rf_benchmark.py @@ -7,7 +7,7 @@ container_name = "ml_mmfb" -container_version = "0.0.3" +container_version = "0.0.4" class RandomForestBenchmark(AbstractBenchmarkClient): diff --git a/hpobench/container/benchmarks/ml/svm_benchmark.py b/hpobench/container/benchmarks/ml/svm_benchmark.py index 473e76bb..7a20f40b 100644 --- a/hpobench/container/benchmarks/ml/svm_benchmark.py +++ b/hpobench/container/benchmarks/ml/svm_benchmark.py @@ -7,7 +7,7 @@ container_name = "ml_mmfb" -container_version = "0.0.3" +container_version = "0.0.4" class SVMBenchmark(AbstractBenchmarkClient): diff --git a/hpobench/container/benchmarks/ml/tabular_benchmark.py b/hpobench/container/benchmarks/ml/tabular_benchmark.py index 185e2b46..5c8a22ef 100644 --- a/hpobench/container/benchmarks/ml/tabular_benchmark.py +++ b/hpobench/container/benchmarks/ml/tabular_benchmark.py @@ -7,7 +7,7 @@ container_name = "ml_tabular_benchmarks" -container_version = "0.0.3" +container_version = "0.0.4" class TabularBenchmark(AbstractBenchmarkClient): diff --git a/hpobench/container/benchmarks/ml/xgboost_benchmark.py b/hpobench/container/benchmarks/ml/xgboost_benchmark.py index c99e7602..726d6f45 100644 --- a/hpobench/container/benchmarks/ml/xgboost_benchmark.py +++ b/hpobench/container/benchmarks/ml/xgboost_benchmark.py @@ -7,7 +7,7 @@ container_name = "ml_mmfb" -container_version = "0.0.3" +container_version = "0.0.4" class XGBoostBenchmark(AbstractBenchmarkClient): From 5889a04348dfac59cfe5528cae37b8cf4759bedd Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Philipp=20M=C3=BCller?= Date: Wed, 28 Sep 2022 11:13:58 +0200 Subject: [PATCH 140/147] HPOBench with python >= 3.10 (#162) Make installable for Python 3.10.6 * Add 3.10 to test pipeline * Remove upper Python version limit * Change python-versions to strings * Surround action python-version with quotation marks * Install paramnet dependencies only for Python <= 3.9 * Disable local paramnet test for Python > 3.9 Co-authored-by: Dominik Woiwode --- .github/workflows/run_tests.yml | 22 ++++++++++++++-------- README.md | 16 ++++++++++++++++ ci_scripts/install.sh | 19 +++++++++++++------ extra_requirements/tests.json | 2 +- setup.py | 2 +- tests/test_paramnet.py | 10 ++++++---- 6 files changed, 51 insertions(+), 20 deletions(-) diff --git a/.github/workflows/run_tests.yml b/.github/workflows/run_tests.yml index 4fecec7d..bfacc9d1 100644 --- a/.github/workflows/run_tests.yml +++ b/.github/workflows/run_tests.yml @@ -11,36 +11,42 @@ jobs: strategy: matrix: include: - - python-version: 3.7 + - python-version: "3.7" DISPLAY_NAME: "Singularity Tests + CODECOV" RUN_TESTS: true USE_SINGULARITY: true SINGULARITY_VERSION: "3.8" RUN_CODECOV: true - - python-version: 3.7 + - python-version: "3.7" DISPLAY_NAME: "Codestyle" RUN_CODESTYLE: true USE_SINGULARITY: false - - python-version: 3.7 + - python-version: "3.7" DISPLAY_NAME: "Singularity Container Examples" RUN_CONTAINER_EXAMPLES: true USE_SINGULARITY: true SINGULARITY_VERSION: "3.8" - - python-version: 3.7 + - python-version: "3.7" DISPLAY_NAME: "Local Examples" RUN_LOCAL_EXAMPLES: true USE_SINGULARITY: false - - python-version: 
3.8 + - python-version: "3.8" DISPLAY_NAME: "Singularity Tests" RUN_TESTS: true USE_SINGULARITY: true SINGULARITY_VERSION: "3.8" - - python-version: 3.9 + - python-version: "3.9" + DISPLAY_NAME: "Singularity Tests" + RUN_TESTS: true + USE_SINGULARITY: true + SINGULARITY_VERSION: "3.8" + + - python-version: "3.10" DISPLAY_NAME: "Singularity Tests" RUN_TESTS: true USE_SINGULARITY: true @@ -63,7 +69,7 @@ jobs: - name: Set up Python ${{ matrix.python-version }} uses: actions/setup-python@v2 with: - python-version: ${{ matrix.python-version }} + python-version: "${{ matrix.python-version }}" - name: Set up Go for Singularity if: matrix.USE_SINGULARITY == true uses: actions/setup-go@v2 @@ -78,4 +84,4 @@ jobs: python -m pip install --upgrade pip chmod +x ci_scripts/install.sh && source ./ci_scripts/install.sh - name: Run Tests - run: chmod +x ci_scripts/script.sh && source ./ci_scripts/script.sh \ No newline at end of file + run: chmod +x ci_scripts/script.sh && source ./ci_scripts/script.sh diff --git a/README.md b/README.md index ec0a442e..96dd406d 100644 --- a/README.md +++ b/README.md @@ -149,3 +149,19 @@ See whether in `~/.singularity/instances/sing/$HOSTNAME/*/` there is a file that **Note:** If you are looking for a different or older version of our benchmarking library, you might be looking for [HPOlib1.5](https://github.com/automl/HPOlib1.5) + +## Reference + +If you use HPOBench, please cite the following paper: + +```bibtex +@inproceedings{ + eggensperger2021hpobench, + title={{HPOB}ench: A Collection of Reproducible Multi-Fidelity Benchmark Problems for {HPO}}, + author={Katharina Eggensperger and Philipp M{\"u}ller and Neeratyoy Mallik and Matthias Feurer and Rene Sass and Aaron Klein and Noor Awad and Marius Lindauer and Frank Hutter}, + booktitle={Thirty-fifth Conference on Neural Information Processing Systems Datasets and Benchmarks Track (Round 2)}, + year={2021}, + url={https://openreview.net/forum?id=1k4rJYEwda-} +} +``` + diff --git a/ci_scripts/install.sh b/ci_scripts/install.sh index 2d229f74..fb62b5d2 100644 --- a/ci_scripts/install.sh +++ b/ci_scripts/install.sh @@ -4,14 +4,21 @@ install_packages="" if [[ "$RUN_TESTS" == "true" ]]; then echo "Install tools for testing" - install_packages="${install_packages}xgboost,pytest,test_paramnet,test_tabular_datamanager," + install_packages="${install_packages}xgboost,pytest,test_tabular_datamanager," pip install codecov - # The param net benchmark does not work with a scikit-learn version != 0.23.2. (See notes in the benchmark) - # To make sure that no newer version is installed, we install it before the other requirements. - # Since we are not using a "--upgrade" option later on, pip skips to install another scikit-learn version. - echo "Install the right scikit-learn function for the param net tests." - pip install --upgrade scikit-learn==0.23.2 + PYVERSION=$(python -V 2>&1 | sed 's/.* \([0-9]\).\([0-9]*\).*/\1\2/') + if [[ "${PYVERSION}" != "310" ]]; then + # The param net benchmark does not work with a scikit-learn version != 0.23.2. (See notes in the benchmark) + # To make sure that no newer version is installed, we install it before the other requirements. + # Since we are not using a "--upgrade" option later on, pip skips to install another scikit-learn version. + echo "Install the right scikit-learn function for the param net tests." + pip install --upgrade scikit-learn==0.23.2 + install_packages="${install_packages}test_paramnet," + else + echo "Skip installing the extra paramnet tests." 
+ fi + else echo "Skip installing tools for testing" fi diff --git a/extra_requirements/tests.json b/extra_requirements/tests.json index 6c27be97..b25d6755 100644 --- a/extra_requirements/tests.json +++ b/extra_requirements/tests.json @@ -2,5 +2,5 @@ "codestyle": ["pycodestyle","flake8","pylint"], "pytest": ["pytest>=4.6","pytest-cov"], "test_paramnet": ["tqdm", "scikit-learn==0.23.2"], - "test_tabular_datamanager": ["pyarrow", "fastparquet"] + "test_tabular_datamanager": ["tqdm","pyarrow", "fastparquet"] } \ No newline at end of file diff --git a/setup.py b/setup.py index 4c53ecb0..ef1f292c 100644 --- a/setup.py +++ b/setup.py @@ -36,7 +36,7 @@ def read_file(file_name): version=read_file('hpobench/__version__.py').split()[-1].strip('\''), packages=setuptools.find_packages(exclude=['*.tests', '*.tests.*', 'tests.*', 'tests'],), - python_requires='>=3.6, <=3.10', + python_requires='>=3.6', install_requires=read_file('./requirements.txt').split('\n'), extras_require=get_extra_requirements(), test_suite='pytest', diff --git a/tests/test_paramnet.py b/tests/test_paramnet.py index 52d55f94..076f4b38 100644 --- a/tests/test_paramnet.py +++ b/tests/test_paramnet.py @@ -1,11 +1,13 @@ import pytest +import sys -# import logging -# logging.basicConfig(level=logging.DEBUG) -# from hpobench.util.container_utils import enable_container_debug -# enable_container_debug() +MSG = 'Skip this test for new (>3.9) python versions. ' \ + 'The paramnet benchmarks require an specific old scikit learn version. This version however does not work under ' \ + 'python 3.10. Therefore we skip this test. The containerized version does still work under 3.10.' + +@pytest.mark.skipif(sys.version_info > (3, 9), reason=MSG) def test_load_data(): from hpobench.util.data_manager import ParamNetDataManager From 45b1eb0771ffd0eed228598fafeae052f5788844 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Philipp=20M=C3=BCller?= Date: Wed, 28 Sep 2022 15:07:15 +0200 Subject: [PATCH 141/147] Speed up CI testing (#163) * Speed up tests by disabling slow tests. --- hpobench/util/test_utils.py | 24 ++++++++++++++++++++++++ tests/test_data_manager.py | 11 +++++------ tests/test_mo_cnn.py | 2 ++ tests/test_nasbench_201.py | 11 ++++------- tests/test_pybnn.py | 11 +++++++++-- tests/test_utils.py | 12 ++++++++++++ tests/test_whitebox.py | 1 + 7 files changed, 57 insertions(+), 15 deletions(-) create mode 100644 hpobench/util/test_utils.py diff --git a/hpobench/util/test_utils.py b/hpobench/util/test_utils.py new file mode 100644 index 00000000..b2683135 --- /dev/null +++ b/hpobench/util/test_utils.py @@ -0,0 +1,24 @@ +import os + +CONST_RUN_ALL_TESTS_ENV_VAR = 'HPOBENCH_RUN_EXPENSIVE_TESTS' +DEFAULT_SKIP_MSG = 'Skip this test due to time limitations' + + +def check_run_all_tests(): + """ Helper function: Check if all tests should run. """ + return os.environ.get(CONST_RUN_ALL_TESTS_ENV_VAR, 'false').lower() == 'true' + + +def enable_all_tests(): + """ + Some tests are quite expensive. We control if all runs should be executed by this + environment variable. + """ + os.environ[CONST_RUN_ALL_TESTS_ENV_VAR] = 'true' + + +def disable_all_tests(): + """ + This function disables the evaluation of all test functions. 
+ """ + os.environ[CONST_RUN_ALL_TESTS_ENV_VAR] = 'false' diff --git a/tests/test_data_manager.py b/tests/test_data_manager.py index 7e32ce84..cee56ccc 100644 --- a/tests/test_data_manager.py +++ b/tests/test_data_manager.py @@ -1,14 +1,13 @@ import shutil -from multiprocessing import Pool - import pytest +from multiprocessing import Pool import hpobench from hpobench.util.data_manager import NASBench_201Data, YearPredictionMSDData, ProteinStructureData, BostonHousingData -skip_message = 'We currently skip this test because it takes too much time.' +from hpobench.util.test_utils import DEFAULT_SKIP_MSG, check_run_all_tests -@pytest.mark.skip(reason=skip_message) +@pytest.mark.skipif(not check_run_all_tests(), reason=DEFAULT_SKIP_MSG) def test_nasbench_201_load_thread_safe(): shutil.rmtree(hpobench.config_file.data_dir / "nasbench_201", ignore_errors=True) function = lambda: NASBench_201Data(dataset='cifar100').load() @@ -16,7 +15,7 @@ def test_nasbench_201_load_thread_safe(): pool.map(function, []) -@pytest.mark.skip(reason=skip_message) +@pytest.mark.skipif(not check_run_all_tests(), reason=DEFAULT_SKIP_MSG) def test_nasbench_201_init(): data_manager = NASBench_201Data(dataset='cifar100') @@ -30,7 +29,7 @@ def test_nasbench_201_init(): assert data_manager._save_dir.exists() -@pytest.mark.skip(reason=skip_message) +@pytest.mark.skipif(not check_run_all_tests(), reason=DEFAULT_SKIP_MSG) def test_nasbench_201_load(): shutil.rmtree(hpobench.config_file.data_dir / "nasbench_201", ignore_errors=True) diff --git a/tests/test_mo_cnn.py b/tests/test_mo_cnn.py index 308c59ad..f721dfc3 100644 --- a/tests/test_mo_cnn.py +++ b/tests/test_mo_cnn.py @@ -1,6 +1,8 @@ import pytest +from hpobench.util.test_utils import DEFAULT_SKIP_MSG, check_run_all_tests +@pytest.mark.skipif(not check_run_all_tests(), reason=DEFAULT_SKIP_MSG) def test_mo_cnn_seeding(): from hpobench.container.benchmarks.mo.cnn_benchmark import FlowerCNNBenchmark b1 = FlowerCNNBenchmark(rng=0) diff --git a/tests/test_nasbench_201.py b/tests/test_nasbench_201.py index 70e46de9..42efbfea 100644 --- a/tests/test_nasbench_201.py +++ b/tests/test_nasbench_201.py @@ -1,5 +1,3 @@ -import logging -logging.basicConfig(level=logging.DEBUG) import pytest from hpobench.container.benchmarks.nas.nasbench_201 import ImageNetNasBench201Benchmark, Cifar100NasBench201Benchmark, \ @@ -7,8 +5,7 @@ from hpobench.benchmarks.nas.nasbench_201 import \ Cifar10ValidNasBench201MOBenchmark as LocalCifar10ValidNasBench201MOBenchmark from hpobench.util.container_utils import disable_container_debug, enable_container_debug - -skip_message = 'We currently skip this test because it takes too much time.' 
+from hpobench.util.test_utils import DEFAULT_SKIP_MSG, check_run_all_tests @pytest.fixture(scope='module') @@ -18,7 +15,7 @@ def enable_debug(): disable_container_debug() -@pytest.mark.skip(reason=skip_message) +@pytest.mark.skipif(not check_run_all_tests(), reason=DEFAULT_SKIP_MSG) def test_nasbench201_cifar10valid(enable_debug): b = Cifar10ValidNasBench201Benchmark(rng=0) @@ -53,7 +50,7 @@ def test_nasbench201_cifar10valid(enable_debug): result = b.objective_function_test(configuration=config, fidelity={'epoch': 10}) -@pytest.mark.skip(reason=skip_message) +@pytest.mark.skipif(not check_run_all_tests(), reason=DEFAULT_SKIP_MSG) def test_nasbench201_cifar100(enable_debug): b = Cifar100NasBench201Benchmark(rng=0) @@ -73,7 +70,7 @@ def test_nasbench201_cifar100(enable_debug): assert result['info']['valid_cost'] == result['cost'] -@pytest.mark.skip(reason=skip_message) +@pytest.mark.skipif(not check_run_all_tests(), reason=DEFAULT_SKIP_MSG) def test_nasbench201_Image(enable_debug): b = ImageNetNasBench201Benchmark(rng=0) config = {'1<-0': 'nor_conv_1x1', diff --git a/tests/test_pybnn.py b/tests/test_pybnn.py index 0e749457..f1c6b5fc 100644 --- a/tests/test_pybnn.py +++ b/tests/test_pybnn.py @@ -1,14 +1,19 @@ +import sys import pytest from hpobench.container.benchmarks.ml.pybnn import BNNOnToyFunction, BNNOnBostonHousing, BNNOnProteinStructure, \ BNNOnYearPrediction -import logging -logging.basicConfig(level=logging.DEBUG) from hpobench.util.container_utils import enable_container_debug +from hpobench.util.test_utils import check_run_all_tests, DEFAULT_SKIP_MSG + enable_container_debug() +MSG = 'Skip this test for new (>3.9) python versions. ' \ + 'The paramnet benchmarks require an specific old scikit learn version. This version however does not work under ' \ + 'python 3.10. Therefore we skip this test. The containerized version does still work under 3.10.' 
+@pytest.mark.skipif(sys.version_info > (3, 9), reason=MSG) def test_bnn_init(): benchmark = BNNOnToyFunction(rng=1) @@ -58,6 +63,7 @@ def test_bnn_boston_housing(): assert test_result['info']['fidelity']['budget'] == 1000 +@pytest.mark.skipif(not check_run_all_tests(), reason=DEFAULT_SKIP_MSG) def test_bnn_protein(): benchmark = BNNOnProteinStructure(rng=1) test_result = simple_call(benchmark) @@ -66,6 +72,7 @@ def test_bnn_protein(): assert test_result['info']['fidelity']['budget'] == 1000 +@pytest.mark.skipif(not check_run_all_tests(), reason=DEFAULT_SKIP_MSG) def test_year_pred(): benchmark = BNNOnYearPrediction(rng=1) test_result = simple_call(benchmark) diff --git a/tests/test_utils.py b/tests/test_utils.py index 9bc5ff3b..e570dbd7 100644 --- a/tests/test_utils.py +++ b/tests/test_utils.py @@ -105,3 +105,15 @@ def test_debug_level(): disable_container_debug() assert os.environ['HPOBENCH_DEBUG'] == 'false' + + +def test_test_utils(): + from hpobench.util.test_utils import DEFAULT_SKIP_MSG, enable_all_tests, disable_all_tests, check_run_all_tests + + assert isinstance(DEFAULT_SKIP_MSG, str) + + enable_all_tests() + assert check_run_all_tests() + + disable_all_tests() + assert not check_run_all_tests() \ No newline at end of file diff --git a/tests/test_whitebox.py b/tests/test_whitebox.py index 35a9a940..585f9867 100644 --- a/tests/test_whitebox.py +++ b/tests/test_whitebox.py @@ -63,6 +63,7 @@ def test_whitebox_with_container(): assert np.isclose(test_loss, 0.43636, atol=0.001) +@pytest.mark.skipif(skip_container_test, reason="Requires singularity and flask") def test_cartpole(): from hpobench.container.benchmarks.rl.cartpole import CartpoleReduced as Benchmark b = Benchmark(container_name='cartpole', From 3b2fa7b363090cac165bef2e73fbb7de730ee7f0 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Philipp=20M=C3=BCller?= Date: Thu, 15 Dec 2022 11:24:46 +0100 Subject: [PATCH 142/147] Refactor PR (#164) * Refactor: Rename AbstractBenchmark Class --- .github/workflows/run_tests.yml | 2 +- hpobench/abstract_benchmark.py | 105 ++++++------------ hpobench/benchmarks/rl/cartpole.py | 3 +- .../surrogates/paramnet_benchmark.py | 2 +- .../container/client_abstract_benchmark.py | 6 +- ...mark => Singularity.ml_tabular_benchmarks} | 0 hpobench/dependencies/ml/data_manager.py | 18 +-- hpobench/dependencies/mo/scalar.py | 3 +- hpobench/util/clean_up_script.py | 5 +- hpobench/util/container_utils.py | 8 +- 10 files changed, 57 insertions(+), 95 deletions(-) rename hpobench/container/recipes/ml/{Singularity.ml_tabular_benchmark => Singularity.ml_tabular_benchmarks} (100%) diff --git a/.github/workflows/run_tests.yml b/.github/workflows/run_tests.yml index bfacc9d1..fefd4308 100644 --- a/.github/workflows/run_tests.yml +++ b/.github/workflows/run_tests.yml @@ -45,7 +45,7 @@ jobs: RUN_TESTS: true USE_SINGULARITY: true SINGULARITY_VERSION: "3.8" - + - python-version: "3.10" DISPLAY_NAME: "Singularity Tests" RUN_TESTS: true diff --git a/hpobench/abstract_benchmark.py b/hpobench/abstract_benchmark.py index 57e837c5..6a2942af 100644 --- a/hpobench/abstract_benchmark.py +++ b/hpobench/abstract_benchmark.py @@ -1,20 +1,20 @@ """ Base-class of all benchmarks """ import abc -from typing import Union, Dict, List, Tuple import functools - import logging +from typing import Union, Dict, List, Tuple + import ConfigSpace import numpy as np - from ConfigSpace.util import deactivate_inactive_hyperparameters + from hpobench.util import rng_helper logger = logging.getLogger('AbstractBenchmark') -class AbstractBenchmark(abc.ABC, 
metaclass=abc.ABCMeta): +class _BaseAbstractBenchmark(abc.ABC, metaclass=abc.ABCMeta): def __init__(self, rng: Union[int, np.random.RandomState, None] = None, **kwargs): """ @@ -34,7 +34,7 @@ def __init__(self, rng: Union[int, np.random.RandomState, None] = None, **kwargs np.random.RandomState with seed `rng` is created. If type is None, create a new random state. """ - + super(_BaseAbstractBenchmark, self).__init__(**kwargs) self.rng = rng_helper.get_rng(rng=rng) self.configuration_space = self.get_configuration_space(self.rng.randint(0, 10000)) self.fidelity_space = self.get_fidelity_space(self.rng.randint(0, 10000)) @@ -210,20 +210,14 @@ def _check_and_cast_fidelity(fidelity: Union[dict, ConfigSpace.Configuration, No fidelity_space.check_configuration(fidelity) return fidelity - @staticmethod - def _check_return_values(return_values: Dict) -> Dict: - """ - The return values should contain the fields `function_value` and `cost`. - """ - assert 'function_value' in return_values.keys() - assert 'cost' in return_values.keys() - - return return_values - def __call__(self, configuration: Dict, **kwargs) -> float: """ Provides interface to use, e.g., SciPy optimizers """ return self.objective_function(configuration, **kwargs)['function_value'] + @staticmethod + def _check_return_values(return_values: Dict) -> Dict: + raise NotImplementedError() + @staticmethod @abc.abstractmethod def get_configuration_space(seed: Union[int, None] = None) -> ConfigSpace.ConfigurationSpace: @@ -269,74 +263,39 @@ def get_meta_information() -> Dict: raise NotImplementedError() -class AbstractMultiObjectiveBenchmark(AbstractBenchmark): +class AbstractSingleObjectiveBenchmark(_BaseAbstractBenchmark): """ - Abstract Benchmark class for multi-objective benchmarks. - The only purpose of this class is to point out to users that this benchmark returns multiple - objective function values. + Abstract Benchmark class for single-objective benchmarks. + This corresponds to the old AbstractBenchmark class. + + The only purpose of this class is to point out to users that this benchmark returns only a single + objective function value. When writing a benchmark, please make sure to inherit from the correct abstract class. """ - @abc.abstractmethod - def objective_function(self, configuration: Union[ConfigSpace.Configuration, Dict], - fidelity: Union[Dict, ConfigSpace.Configuration, None] = None, - rng: Union[np.random.RandomState, int, None] = None, - **kwargs) -> Dict: - """ - Objective function. - - Override this function to provide your multi-objective benchmark function. This - function will be called by one of the evaluate functions. For - flexibility, you have to return a dictionary with the only mandatory - key being `function_values`, the objective function values for the - `configuration` which was passed. By convention, all benchmarks are - minimization problems. - `function_value` is a dictionary that contains all available criteria. + @staticmethod + def _check_return_values(return_values: Dict) -> Dict: + """ + The return values should contain the fields `function_value` and `cost`. + """ + assert 'function_value' in return_values.keys() + assert 'cost' in return_values.keys() + return return_values - Parameters - ---------- - configuration : Dict - fidelity: Dict, None - Fidelity parameters, check get_fidelity_space(). Uses default (max) value if None. - rng : np.random.RandomState, int, None - It might be useful to pass a `rng` argument to the function call to - bypass the default "seed" generator. 
Only using the default random - state (`self.rng`) could lead to an overfitting towards the - `self.rng`'s seed. - Returns - ------- - Dict - Must contain at least the key `function_value` and `cost`. - Note that `function_value` should be a Dict here. - """ - raise NotImplementedError() +# Ensure compatibility with older versions of the HPOBench +AbstractBenchmark = AbstractSingleObjectiveBenchmark - @abc.abstractmethod - def objective_function_test(self, configuration: Union[ConfigSpace.Configuration, Dict], - fidelity: Union[Dict, ConfigSpace.Configuration, None] = None, - rng: Union[np.random.RandomState, int, None] = None, - **kwargs) -> Dict: - """ - If there is a different objective function for offline testing, e.g - testing a machine learning on a hold extra test set instead - on a validation set override this function here. - Parameters - ---------- - configuration : Dict - fidelity: Dict, None - Fidelity parameters, check get_fidelity_space(). Uses default (max) value if None. - rng : np.random.RandomState, int, None - see :py:func:`~HPOBench.abstract_benchmark.objective_function` +class AbstractMultiObjectiveBenchmark(_BaseAbstractBenchmark): + """ + Abstract Benchmark class for multi-objective benchmarks. + The only purpose of this class is to point out to users that this benchmark returns multiple + objective function values. - Returns - ------- - Dict - Must contain at least the key `function_value` and `cost`. - """ - raise NotImplementedError() + When writing a benchmark, please make sure to inherit from the correct abstract class. + """ @staticmethod def _check_return_values(return_values: Dict) -> Dict: diff --git a/hpobench/benchmarks/rl/cartpole.py b/hpobench/benchmarks/rl/cartpole.py index 3bcaeab4..ea9ef053 100644 --- a/hpobench/benchmarks/rl/cartpole.py +++ b/hpobench/benchmarks/rl/cartpole.py @@ -20,12 +20,13 @@ """ import logging +import os import time from typing import Union, Dict import ConfigSpace as CS import numpy as np -import os + os.environ["TF_CPP_MIN_LOG_LEVEL"] = "3" import tensorflow as tf # noqa: E402 diff --git a/hpobench/benchmarks/surrogates/paramnet_benchmark.py b/hpobench/benchmarks/surrogates/paramnet_benchmark.py index 2e809b7b..35c7f80d 100644 --- a/hpobench/benchmarks/surrogates/paramnet_benchmark.py +++ b/hpobench/benchmarks/surrogates/paramnet_benchmark.py @@ -61,8 +61,8 @@ 0.0.1: * First implementation """ -import warnings import logging +import warnings from typing import Union, Dict import ConfigSpace as CS diff --git a/hpobench/container/client_abstract_benchmark.py b/hpobench/container/client_abstract_benchmark.py index 6bbc3489..d2963c00 100644 --- a/hpobench/container/client_abstract_benchmark.py +++ b/hpobench/container/client_abstract_benchmark.py @@ -14,12 +14,12 @@ The name of the container (``container_name``) is defined either in its belonging container-benchmark definition. (hpobench/container// or via ``container_name``. 
""" -import os import abc -import sys import json import logging +import os import subprocess +import sys import time from pathlib import Path from typing import Optional, Union, Dict, List, Tuple @@ -27,8 +27,8 @@ import ConfigSpace as CS import Pyro4 -import Pyro4.util import Pyro4.errors +import Pyro4.util import numpy as np from ConfigSpace.read_and_write import json as csjson from oslo_concurrency import lockutils diff --git a/hpobench/container/recipes/ml/Singularity.ml_tabular_benchmark b/hpobench/container/recipes/ml/Singularity.ml_tabular_benchmarks similarity index 100% rename from hpobench/container/recipes/ml/Singularity.ml_tabular_benchmark rename to hpobench/container/recipes/ml/Singularity.ml_tabular_benchmarks diff --git a/hpobench/dependencies/ml/data_manager.py b/hpobench/dependencies/ml/data_manager.py index 526c6756..ebc48c95 100644 --- a/hpobench/dependencies/ml/data_manager.py +++ b/hpobench/dependencies/ml/data_manager.py @@ -1,20 +1,20 @@ -import openml -import numpy as np -import pandas as pd -from typing import Union from pathlib import Path +from typing import Union +import numpy as np +import openml +import pandas as pd +from oslo_concurrency import lockutils +from sklearn.compose import ColumnTransformer from sklearn.impute import SimpleImputer +from sklearn.model_selection import train_test_split from sklearn.pipeline import make_pipeline -from sklearn.utils import check_random_state -from sklearn.compose import ColumnTransformer from sklearn.preprocessing import OneHotEncoder from sklearn.preprocessing import StandardScaler -from sklearn.model_selection import train_test_split -from oslo_concurrency import lockutils +from sklearn.utils import check_random_state -from hpobench.util.data_manager import DataManager from hpobench import config_file +from hpobench.util.data_manager import DataManager class OpenMLDataManager(DataManager): diff --git a/hpobench/dependencies/mo/scalar.py b/hpobench/dependencies/mo/scalar.py index 3f434fde..185c2730 100644 --- a/hpobench/dependencies/mo/scalar.py +++ b/hpobench/dependencies/mo/scalar.py @@ -1,6 +1,7 @@ -import numpy as np from typing import Union +import numpy as np + try: from sklearn.preprocessing import MinMaxScaler, StandardScaler except ImportError: diff --git a/hpobench/util/clean_up_script.py b/hpobench/util/clean_up_script.py index 5fe9fd0c..771ab80f 100644 --- a/hpobench/util/clean_up_script.py +++ b/hpobench/util/clean_up_script.py @@ -1,7 +1,8 @@ +import logging +import shutil + from hpobench import config_file -import shutil -import logging logger = logging.getLogger('Clean-up') logger.setLevel(logging.INFO) diff --git a/hpobench/util/container_utils.py b/hpobench/util/container_utils.py index 7fee19e9..bb7221c3 100644 --- a/hpobench/util/container_utils.py +++ b/hpobench/util/container_utils.py @@ -1,11 +1,11 @@ -import os +import enum import importlib import json -import numpy as np -import enum - +import os from typing import Any, Union +import numpy as np + from hpobench.util.rng_helper import serialize_random_state, deserialize_random_state From ebff93569866985307409ca26667c59e0515426b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Philipp=20M=C3=BCller?= Date: Thu, 15 Dec 2022 12:34:58 +0100 Subject: [PATCH 143/147] Nas201 - v0.0.6 (#165) * Update 201 version 0.0.6 - scale the misclassification rate form 0 to 100 -> 0 and 1 - Update test cases - link to the correct container version - Fix Nas201 Test Cases - remove a Github action trigger. It started all test cases twice. 
- Fix GithubActions: Install correct xgboost version for 3.10 --- .github/workflows/run_tests.yml | 2 +- ci_scripts/install.sh | 18 +- extra_requirements/xgboost.json | 3 +- hpobench/benchmarks/nas/nasbench_201.py | 664 ++++++++---------- .../container/benchmarks/nas/nasbench_201.py | 12 +- tests/test_nasbench_201.py | 18 +- 6 files changed, 309 insertions(+), 408 deletions(-) diff --git a/.github/workflows/run_tests.yml b/.github/workflows/run_tests.yml index fefd4308..3d52b250 100644 --- a/.github/workflows/run_tests.yml +++ b/.github/workflows/run_tests.yml @@ -2,7 +2,7 @@ name: Test Pull Requests -on: [push, pull_request] +on: [push] jobs: Tests: diff --git a/ci_scripts/install.sh b/ci_scripts/install.sh index fb62b5d2..d361600d 100644 --- a/ci_scripts/install.sh +++ b/ci_scripts/install.sh @@ -4,7 +4,7 @@ install_packages="" if [[ "$RUN_TESTS" == "true" ]]; then echo "Install tools for testing" - install_packages="${install_packages}xgboost,pytest,test_tabular_datamanager," + install_packages="${install_packages}pytest,test_tabular_datamanager," pip install codecov PYVERSION=$(python -V 2>&1 | sed 's/.* \([0-9]\).\([0-9]*\).*/\1\2/') @@ -14,9 +14,12 @@ if [[ "$RUN_TESTS" == "true" ]]; then # Since we are not using a "--upgrade" option later on, pip skips to install another scikit-learn version. echo "Install the right scikit-learn function for the param net tests." pip install --upgrade scikit-learn==0.23.2 - install_packages="${install_packages}test_paramnet," + install_packages="${install_packages}xgboost,test_paramnet," else echo "Skip installing the extra paramnet tests." + # For 3.10, we need a different pandas version - this comes as a requirement for the old xgboost benchmark. + # building pandas<=1.5.0 does not work with 3.10 anymore. -> install a different version. + install_packages="${install_packages}xgboost_310," fi else @@ -42,7 +45,16 @@ if [[ "$RUN_LOCAL_EXAMPLES" == "true" ]]; then echo "Install packages for local examples" echo "Install swig" sudo apt-get update && sudo apt-get install -y build-essential swig - install_packages="${install_packages}xgboost," + + PYVERSION=$(python -V 2>&1 | sed 's/.* \([0-9]\).\([0-9]*\).*/\1\2/') + if [[ "${PYVERSION}" != "310" ]]; then + # For 3.10, we need a different pandas version - this comes as a requirement for the old xgboost benchmark. + # building pandas<=1.5.0 does not work with 3.10 anymore. -> install a different version. + install_packages="${install_packages}xgboost," + else + install_packages="${install_packages}xgboost_310," + fi + else echo "Skip installing packages for local examples" fi diff --git a/extra_requirements/xgboost.json b/extra_requirements/xgboost.json index 2789d2ef..eefc920c 100644 --- a/extra_requirements/xgboost.json +++ b/extra_requirements/xgboost.json @@ -1,3 +1,4 @@ { - "xgboost": ["xgboost==0.90","pandas>=1.0.0,<1.1.5","openml==0.10.2","scikit-learn>=0.18.1"] + "xgboost": ["xgboost==0.90","pandas>=1.0.0,<1.1.5","openml==0.10.2","scikit-learn>=0.18.1"], + "xgboost_310": ["xgboost","pandas","openml==0.10.2","scikit-learn>=0.18.1"] } \ No newline at end of file diff --git a/hpobench/benchmarks/nas/nasbench_201.py b/hpobench/benchmarks/nas/nasbench_201.py index 0c2324c2..1ca0beb3 100644 --- a/hpobench/benchmarks/nas/nasbench_201.py +++ b/hpobench/benchmarks/nas/nasbench_201.py @@ -30,6 +30,8 @@ 0.0.6 * Add the multiobjective version of this benchmark by returning flops, model size, latency and missclassification rate * Integrate #138: Improve the docstrings about the seeds. 
+* Scale the returned misclassification rate from range [0, 100] to [0, 1]. +* Improve naming in the result object ("*_precision" -> "*_misclassification_rate") 0.0.5 * Add for each benchmark a new one with a different fidelity space. @@ -51,25 +53,23 @@ * First implementation """ import logging -from typing import Union, Dict, List, Text, Tuple from copy import deepcopy +from typing import Union, Dict, List, Text, Tuple import ConfigSpace as CS import numpy as np import hpobench.util.rng_helper as rng_helper -from hpobench.abstract_benchmark import AbstractBenchmark, AbstractMultiObjectiveBenchmark - +from hpobench.abstract_benchmark import AbstractSingleObjectiveBenchmark, AbstractMultiObjectiveBenchmark from hpobench.util.data_manager import NASBench_201Data - __version__ = '0.0.6' MAX_NODES = 4 logger = logging.getLogger('NASBENCH201') -class NasBench201BaseMOBenchmark(AbstractMultiObjectiveBenchmark): +class _NasBench201BaseBenchmark: def __init__(self, dataset: str, rng: Union[np.random.RandomState, int, None] = None, **kwargs): """ @@ -153,13 +153,12 @@ def __init__(self, dataset: str, Random seed for the benchmark's random state. """ # noqa: E501 - super(NasBench201BaseMOBenchmark, self).__init__(rng=rng) - data_manager = NASBench_201Data(dataset=dataset) self.dataset = dataset self.data = data_manager.load() - self.config_to_structure = NasBench201BaseMOBenchmark.config_to_structure_func(max_nodes=MAX_NODES) + self.config_to_structure = _NasBench201BaseMOBenchmark.config_to_structure_func(max_nodes=MAX_NODES) + super(_NasBench201BaseBenchmark, self).__init__(rng=rng, **kwargs) def dataset_mapping(self, dataset): mapping = {'cifar10-valid': ('x-valid', 'ori-test'), @@ -167,76 +166,115 @@ def dataset_mapping(self, dataset): 'cifar100': ('ori-test', 'x-test')} return mapping[dataset] - # pylint: disable=arguments-differ - @AbstractMultiObjectiveBenchmark.check_parameters - def objective_function(self, configuration: Union[CS.Configuration, Dict], - fidelity: Union[Dict, CS.Configuration, None] = None, - rng: Union[np.random.RandomState, int, None] = None, - data_seed: Union[List, Tuple, int, None] = (777, 888, 999), - **kwargs) -> Dict: + @staticmethod + def config_to_structure_func(max_nodes: int): + """ + From https://github.com/D-X-Y/AutoDL-Projects/blob/master/exps/algos/BOHB.py + Author: https://github.com/D-X-Y [Xuanyi.Dong@student.uts.edu.au] """ - Objective function for the NASBench201 benchmark. - This functions sends a query to NASBench201 and evaluates the configuration. - As already explained in the class definition, different data sets are trained on different splits. - The table above gives a detailed summary over the available splits, epochs, and which identifier are used per - dataset. + def config_to_structure(config): + genotypes = [] + for i in range(1, max_nodes): + x_list = [] + for j in range(i): + node_str = f'{i}<-{j}' + op_name = config[node_str] + x_list.append((op_name, j)) + genotypes.append(tuple(x_list)) + return _NasBench201BaseMOBenchmark._Structure(genotypes) + + return config_to_structure + + @staticmethod + def get_configuration_space(seed: Union[int, None] = None) -> CS.ConfigurationSpace: + """ + Return the CS representation of the search space. + From https://github.com/D-X-Y/AutoDL-Projects/blob/master/exps/algos/BOHB.py + Author: https://github.com/D-X-Y [Xuanyi.Dong@student.uts.edu.au] Parameters ---------- - configuration - fidelity: Dict, None - epoch: int - Values: [1, 200] - Number of epochs an architecture was trained. 
- Note: the number of epoch is 1 indexed! (Results after the first epoch: epoch = 1) + seed : int, None + Random seed for the configuration space. - Fidelity parameters, check get_fidelity_space(). Uses default (max) value if None. - rng : np.random.RandomState, int, None - Random seed to use in the benchmark. + Returns + ------- + CS.ConfigurationSpace - + Containing the benchmark's hyperparameter + """ + seed = seed if seed is not None else np.random.randint(1, 100000) + cs = CS.ConfigurationSpace(seed=seed) - To prevent overfitting on a single seed, it is possible to pass a - parameter ``rng`` as 'int' or 'np.random.RandomState' to this function. - If this parameter is not given, the default random state is used. - data_seed : List, Tuple, None, int - The nasbench_201 benchmark include for each run 3 different seeds: 777, 888, 999. - The user can specify which seed to use. If more than one seed is given, the results are averaged - across the seeds but then the training time is the sum of the costs per seed. - When this value is explicitly set to `None`, the function will chose randomly one out of [777, 888, 999]. + search_space = _NasBench201BaseMOBenchmark.get_search_spaces('cell', 'nas-bench-201') + hps = [CS.CategoricalHyperparameter(f'{i}<-{j}', search_space) for i in range(1, MAX_NODES) for j in range(i)] + cs.add_hyperparameters(hps) + return cs - Note: - For some architectures (configurations) no run was available. We've set missing values to an - available value from another seed. Therefore, it is possible that run results are exactly the same for - different seeds. + @staticmethod + def get_fidelity_space(seed: Union[int, None] = None) -> CS.ConfigurationSpace: + """ + Creates a ConfigSpace.ConfigurationSpace containing all fidelity parameters for + the NAS Benchmark 201. - kwargs + Fidelities + ---------- + epoch: int + The loss / accuracy at `epoch`. Can be from 0 to 199. + + Parameters + ---------- + seed : int, None + Fixing the seed for the ConfigSpace.ConfigurationSpace Returns ------- - Dict - - function_value : Dict - misclassification_rate : float - 1 - validation accuracy - num_flops : float - Number of floating point operations in M - model_size : float - Model size in MB - latency : float - Time to evaluate a configuration in seconds - cost : time to train the network - info : Dict - train_precision : float - train_losses : float - train_cost : float - Time needed to train the network for 'epoch' many epochs. If more than one seed is given, - this field is the sum of the training time per network - eval_precision : float - eval_losses : float - eval_cost : float - Time needed to train the network for 'epoch many epochs plus the time to evaluate the network on the - evaluation split. 
If more than one seed is given, this field is the sum of the eval cost per network - fidelity : Dict - used fidelities in this evaluation + ConfigSpace.ConfigurationSpace + """ + seed = seed if seed is not None else np.random.randint(1, 100000) + fidel_space = CS.ConfigurationSpace(seed=seed) + + fidel_space.add_hyperparameters([ + CS.UniformIntegerHyperparameter('epoch', lower=1, upper=200, default_value=200) + ]) + return fidel_space + + @staticmethod + def get_meta_information() -> Dict: + """ Returns the meta information for the benchmark """ + return {'name': 'NAS-Bench-201: Extending the Scope of Reproducible Neural Architecture Search', + 'references': ['@article{dong2020bench,' + 'title = {Nas-bench-201: Extending the scope of reproducible neural ' + ' architecture search},' + 'author = {Dong, Xuanyi and Yang, Yi},' + 'journal = {arXiv preprint arXiv:2001.00326},' + 'year = {2020}}', + 'https://openreview.net/forum?id=HJxyZkBKDr', + ], + 'code': 'https://github.com/D-X-Y/AutoDL-Projects', + } + + @staticmethod + def get_search_spaces(xtype: str, name: str) -> List[Text]: + """ obtain the search space, i.e., a dict mapping the operation name into a python-function for this op + From https://github.com/D-X-Y/AutoDL-Projects/blob/master/lib/models/__init__.py + Author: https://github.com/D-X-Y [Xuanyi.Dong@student.uts.edu.au] """ + # pylint: disable=no-else-return + if xtype == 'cell': + NAS_BENCH_201 = ['none', 'skip_connect', 'nor_conv_1x1', 'nor_conv_3x3', 'avg_pool_3x3'] + SearchSpaceNames = {'nas-bench-201': NAS_BENCH_201} + assert name in SearchSpaceNames, 'invalid name [{:}] in {:}'.format(name, SearchSpaceNames.keys()) + return SearchSpaceNames[name] + else: + raise ValueError('invalid search-space type is {:}'.format(xtype)) + + def _mo_objective_function(self, configuration: Union[CS.Configuration, Dict], + fidelity: Union[Dict, CS.Configuration, None] = None, + rng: Union[np.random.RandomState, int, None] = None, + data_seed: Union[List, Tuple, int, None] = (777, 888, 999), + **kwargs) -> Dict: + self.rng = rng_helper.get_rng(rng) if isinstance(data_seed, (List, Tuple)): @@ -245,7 +283,7 @@ def objective_function(self, configuration: Union[CS.Configuration, Dict], logger.debug('There are some values more than once in the run_index. We remove the redundant entries.') data_seed = tuple(set(data_seed)) elif isinstance(data_seed, int): - data_seed = (data_seed, ) + data_seed = (data_seed,) elif data_seed is None: logger.debug('The data seed is explicitly set to None! A random seed will be selected.') data_seed = tuple(self.rng.choice((777, 888, 999), size=1)) @@ -254,7 +292,7 @@ def objective_function(self, configuration: Union[CS.Configuration, Dict], raise ValueError(f'data seed has unknown data type {type(data_seed)}, ' f'but should be tuple or int (777,888,999)') - assert len(set(data_seed) - {777, 888, 999}) == 0,\ + assert len(set(data_seed) - {777, 888, 999}) == 0, \ f'data seed can only contain the elements 777, 888, 999, but was {data_seed}' structure = self.config_to_structure(configuration) @@ -291,44 +329,112 @@ def objective_function(self, configuration: Union[CS.Configuration, Dict], return { 'function_value': { - 'misclassification_rate': float(100 - np.mean(valid_accuracies)), + # The original benchmark returned the accuracy with range [0, 100]. + # We cast it to a minimization problem with range [0-1] to have a more standardized return value. 
+ 'misclassification_rate': 0.01 * float(100 - np.mean(valid_accuracies)), 'num_flops': float(np.mean(num_flops)), 'model_size': float(np.mean(model_size)), 'latency': float(np.mean(latency)), }, 'cost': float(np.sum(valid_times) + np.sum(train_times)), 'info': { - 'train_precision': float(100 - np.mean(train_accuracies)), + 'train_misclassification_rate': 0.01 * float(100 - np.mean(train_accuracies)), 'train_losses': float(np.mean(train_losses)), 'train_cost': float(np.sum(train_times)), - 'valid_precision': float(100 - np.mean(valid_accuracies)), + 'valid_misclassification_rate': 0.01 * float(100 - np.mean(valid_accuracies)), 'valid_losses': float(np.mean(valid_losses)), 'valid_cost': float(np.sum(valid_times) + np.sum(train_times)), - 'test_precision': float(100 - np.mean(test_accuracies)), + 'test_misclassification_rate': 0.01 * float(100 - np.mean(test_accuracies)), 'test_losses': float(np.mean(test_losses)), 'test_cost': float(np.sum(train_times)) + float(np.sum(test_times)), 'fidelity': fidelity } } + def _mo_objective_function_test(self, configuration: Union[CS.Configuration, Dict], + fidelity: Union[Dict, CS.Configuration, None] = None, + rng: Union[np.random.RandomState, int, None] = None, + **kwargs) -> Dict: + # The result dict should contain already all necessary information -> Just swap the function value from valid + # to test and the corresponding time cost + assert fidelity['epoch'] == 200, 'Only test data for the 200. epoch is available. ' + + if 'data_seed' in kwargs: + all_seeds_available = all([seed in kwargs['data_seed'] for seed in (777, 888, 999)]) + if not all_seeds_available: + logger.warning('You have not specified all available seeds for the ' + '`objective_function_test`. However, we are going to ignore them, ' + ' because we report test values only as mean across all seeds.' 
+ f' Your given seeds: {kwargs["seed"]}') + del kwargs['data_seed'] + + result = self._mo_objective_function(configuration=configuration, fidelity=fidelity, + data_seed=(777, 888, 999), + rng=rng, **kwargs) + result['function_value']['misclassification_rate'] = result['info']['test_misclassification_rate'] + result['cost'] = result['info']['test_cost'] + return result + + class _Structure: + def __init__(self, genotype): + assert isinstance(genotype, (list, tuple)), 'invalid class of genotype : {:}'.format(type(genotype)) + self.node_num = len(genotype) + 1 + self.nodes = [] + self.node_N = [] + for idx, node_info in enumerate(genotype): + assert isinstance(node_info, (list, tuple)), 'invalid class of node_info : {:}'.format(type(node_info)) + assert len(node_info) >= 1, 'invalid length : {:}'.format(len(node_info)) + for node_in in node_info: + assert isinstance(node_in, (list, tuple)), 'invalid class of in-node : {:}'.format(type(node_in)) + assert len(node_in) == 2 and node_in[1] <= idx, 'invalid in-node : {:}'.format(node_in) + self.node_N.append(len(node_info)) + self.nodes.append(tuple(deepcopy(node_info))) + + def tostr(self): + """ Helper function: Create a string representation of the configuration """ + strings = [] + for node_info in self.nodes: + string = '|'.join([x[0] + '~{:}'.format(x[1]) for x in node_info]) + string = '|{:}|'.format(string) + strings.append(string) + return '+'.join(strings) + + def __repr__(self): + return ( + '{name}({node_num} nodes with {node_info})'.format(name=self.__class__.__name__, node_info=self.tostr(), + **self.__dict__)) + + def __len__(self): + return len(self.nodes) + 1 + + def __getitem__(self, index): + return self.nodes[index] + + +class _NasBench201BaseMOBenchmark(_NasBench201BaseBenchmark, AbstractMultiObjectiveBenchmark): + # pylint: disable=arguments-differ @AbstractMultiObjectiveBenchmark.check_parameters - def objective_function_test(self, configuration: Union[CS.Configuration, Dict], - fidelity: Union[Dict, None] = None, - rng: Union[np.random.RandomState, int, None] = None, - **kwargs) -> Dict: + def objective_function(self, configuration: Union[CS.Configuration, Dict], + fidelity: Union[Dict, CS.Configuration, None] = None, + rng: Union[np.random.RandomState, int, None] = None, + data_seed: Union[List, Tuple, int, None] = (777, 888, 999), + **kwargs) -> Dict: """ - Get the validated results from the NASBench201. Runs a given configuration on the largest budget (here: 200). - The test function uses all data set seeds (777, 888, 999). + Objective function for the NASBench201 benchmark. + This functions sends a query to NASBench201 and evaluates the configuration. + As already explained in the class definition, different data sets are trained on different splits. - See also :py:meth:`~hpobench.benchmarks.nas.nasbench_201.objective_function` + The table above gives a detailed summary over the available splits, epochs, and which identifier are used per + dataset. Parameters ---------- configuration fidelity: Dict, None - epoch: int - Values: [200] + epoch: int - Values: [1, 200] Number of epochs an architecture was trained. - Note: We only have test performance on the last epoch. + Note: the number of epoch is 1 indexed! (Results after the first epoch: epoch = 1) + Fidelity parameters, check get_fidelity_space(). Uses default (max) value if None. rng : np.random.RandomState, int, None Random seed to use in the benchmark. 
@@ -336,6 +442,16 @@ def objective_function_test(self, configuration: Union[CS.Configuration, Dict], To prevent overfitting on a single seed, it is possible to pass a parameter ``rng`` as 'int' or 'np.random.RandomState' to this function. If this parameter is not given, the default random state is used. + data_seed : List, Tuple, None, int + The nasbench_201 benchmark include for each run 3 different seeds: 777, 888, 999. + The user can specify which seed to use. If more than one seed is given, the results are averaged + across the seeds but then the training time is the sum of the costs per seed. + When this value is explicitly set to `None`, the function will chose randomly one out of [777, 888, 999]. + + Note: + For some architectures (configurations) no run was available. We've set missing values to an + available value from another seed. Therefore, it is possible that run results are exactly the same for + different seeds. kwargs @@ -344,292 +460,110 @@ def objective_function_test(self, configuration: Union[CS.Configuration, Dict], Dict - function_value : Dict misclassification_rate : float - 1 - test accuracy + 1 - validation accuracy num_flops : float Number of floating point operations in M model_size : float Model size in MB latency : float Time to evaluate a configuration in seconds - cost : time to the network + time to validate + cost : time to train the network info : Dict - train_precision - train_losses - train_cost - eval_precision - eval_losses - eval_cost - fidelity : used fidelities in this evaluation + train_misclassification_rate : float + train_losses : float + train_cost : float + Time needed to train the network for 'epoch' many epochs. If more than one seed is given, + this field is the sum of the training time per network + eval_misclassification_rate : float + eval_losses : float + eval_cost : float + Time needed to train the network for 'epoch many epochs plus the time to evaluate the network on the + evaluation split. If more than one seed is given, this field is the sum of the eval cost per network + fidelity : Dict + used fidelities in this evaluation """ + return self._mo_objective_function(configuration=configuration, fidelity=fidelity, rng=rng, data_seed=data_seed, + **kwargs) - # The result dict should contain already all necessary information -> Just swap the function value from valid - # to test and the corresponding time cost - assert fidelity['epoch'] == 200, 'Only test data for the 200. epoch is available. ' - - if 'data_seed' in kwargs: - all_seeds_available = all([seed in kwargs['data_seed'] for seed in (777, 888, 999)]) - if not all_seeds_available: - logger.warning('You have not specified all available seeds for the ' - '`objective_function_test`. However, we are going to ignore them, ' - ' because we report test values only as mean across all seeds.' 
- f' Your given seeds: {kwargs["seed"]}') - del kwargs['data_seed'] - - result = self.objective_function(configuration=configuration, fidelity=fidelity, - data_seed=(777, 888, 999), - rng=rng, **kwargs) - result['function_value']['misclassification_rate'] = result['info']['test_precision'] - result['cost'] = result['info']['test_cost'] - return result - - @staticmethod - def config_to_structure_func(max_nodes: int): - """ - From https://github.com/D-X-Y/AutoDL-Projects/blob/master/exps/algos/BOHB.py - Author: https://github.com/D-X-Y [Xuanyi.Dong@student.uts.edu.au] - """ - def config_to_structure(config): - genotypes = [] - for i in range(1, max_nodes): - x_list = [] - for j in range(i): - node_str = f'{i}<-{j}' - op_name = config[node_str] - x_list.append((op_name, j)) - genotypes.append(tuple(x_list)) - return NasBench201BaseMOBenchmark._Structure(genotypes) - return config_to_structure - - @staticmethod - def get_search_spaces(xtype: str, name: str) -> List[Text]: - """ obtain the search space, i.e., a dict mapping the operation name into a python-function for this op - From https://github.com/D-X-Y/AutoDL-Projects/blob/master/lib/models/__init__.py - Author: https://github.com/D-X-Y [Xuanyi.Dong@student.uts.edu.au] + @AbstractMultiObjectiveBenchmark.check_parameters + def objective_function_test(self, configuration: Union[CS.Configuration, Dict], + fidelity: Union[Dict, None] = None, + rng: Union[np.random.RandomState, int, None] = None, + **kwargs) -> Dict: """ - # pylint: disable=no-else-return - if xtype == 'cell': - NAS_BENCH_201 = ['none', 'skip_connect', 'nor_conv_1x1', 'nor_conv_3x3', 'avg_pool_3x3'] - SearchSpaceNames = {'nas-bench-201': NAS_BENCH_201} - assert name in SearchSpaceNames, 'invalid name [{:}] in {:}'.format(name, SearchSpaceNames.keys()) - return SearchSpaceNames[name] - else: - raise ValueError('invalid search-space type is {:}'.format(xtype)) + Get the validated results from the NASBench201. Runs a given configuration on the largest budget (here: 200). + The test function uses all data set seeds (777, 888, 999). - @staticmethod - def get_configuration_space(seed: Union[int, None] = None) -> CS.ConfigurationSpace: - """ - Return the CS representation of the search space. - From https://github.com/D-X-Y/AutoDL-Projects/blob/master/exps/algos/BOHB.py - Author: https://github.com/D-X-Y [Xuanyi.Dong@student.uts.edu.au] + See also :py:meth:`~hpobench.benchmarks.nas.nasbench_201.objective_function` Parameters ---------- - seed : int, None - Random seed for the configuration space. - - Returns - ------- - CS.ConfigurationSpace - - Containing the benchmark's hyperparameter - """ - seed = seed if seed is not None else np.random.randint(1, 100000) - cs = CS.ConfigurationSpace(seed=seed) - - search_space = NasBench201BaseMOBenchmark.get_search_spaces('cell', 'nas-bench-201') - hps = [CS.CategoricalHyperparameter(f'{i}<-{j}', search_space) for i in range(1, MAX_NODES) for j in range(i)] - cs.add_hyperparameters(hps) - return cs - - @staticmethod - def get_fidelity_space(seed: Union[int, None] = None) -> CS.ConfigurationSpace: - """ - Creates a ConfigSpace.ConfigurationSpace containing all fidelity parameters for - the NAS Benchmark 201. + configuration + fidelity: Dict, None + epoch: int - Values: [200] + Number of epochs an architecture was trained. + Note: We only have test performance on the last epoch. + Fidelity parameters, check get_fidelity_space(). Uses default (max) value if None. + rng : np.random.RandomState, int, None + Random seed to use in the benchmark. 
- Fidelities: - - epoch: int - The loss / accuracy at `epoch`. Can be from 0 to 199. + To prevent overfitting on a single seed, it is possible to pass a + parameter ``rng`` as 'int' or 'np.random.RandomState' to this function. + If this parameter is not given, the default random state is used. - Parameters - ---------- - seed : int, None - Fixing the seed for the ConfigSpace.ConfigurationSpace + kwargs Returns ------- - ConfigSpace.ConfigurationSpace + Dict - + function_value : Dict + misclassification_rate : float + 1 - test accuracy + num_flops : float + Number of floating point operations in M + model_size : float + Model size in MB + latency : float + Time to evaluate a configuration in seconds + cost : time to the network + time to validate + info : Dict + train_misclassification_rate + train_losses + train_cost + eval_misclassification_rate + eval_losses + eval_cost + fidelity : used fidelities in this evaluation """ - seed = seed if seed is not None else np.random.randint(1, 100000) - fidel_space = CS.ConfigurationSpace(seed=seed) - - fidel_space.add_hyperparameters([ - CS.UniformIntegerHyperparameter('epoch', lower=1, upper=200, default_value=200) - ]) - - return fidel_space + return self._mo_objective_function_test(configuration=configuration, fidelity=fidelity, rng=rng, **kwargs) @staticmethod def get_objective_names() -> List[str]: return ['misclassification_rate', 'num_flops', 'model_size', 'latency'] - @staticmethod - def get_meta_information() -> Dict: - """ Returns the meta information for the benchmark """ - return {'name': 'NAS-Bench-201: Extending the Scope of Reproducible Neural Architecture Search', - 'references': ['@article{dong2020bench,' - 'title = {Nas-bench-201: Extending the scope of reproducible neural ' - ' architecture search},' - 'author = {Dong, Xuanyi and Yang, Yi},' - 'journal = {arXiv preprint arXiv:2001.00326},' - 'year = {2020}}', - 'https://openreview.net/forum?id=HJxyZkBKDr', - ], - 'code': 'https://github.com/D-X-Y/AutoDL-Projects', - } - class _Structure: - def __init__(self, genotype): - assert isinstance(genotype, (list, tuple)), 'invalid class of genotype : {:}'.format(type(genotype)) - self.node_num = len(genotype) + 1 - self.nodes = [] - self.node_N = [] - for idx, node_info in enumerate(genotype): - assert isinstance(node_info, (list, tuple)), 'invalid class of node_info : {:}'.format(type(node_info)) - assert len(node_info) >= 1, 'invalid length : {:}'.format(len(node_info)) - for node_in in node_info: - assert isinstance(node_in, (list, tuple)), 'invalid class of in-node : {:}'.format(type(node_in)) - assert len(node_in) == 2 and node_in[1] <= idx, 'invalid in-node : {:}'.format(node_in) - self.node_N.append(len(node_info)) - self.nodes.append(tuple(deepcopy(node_info))) - - def tostr(self): - """ Helper function: Create a string representation of the configuration """ - strings = [] - for node_info in self.nodes: - string = '|'.join([x[0] + '~{:}'.format(x[1]) for x in node_info]) - string = '|{:}|'.format(string) - strings.append(string) - return '+'.join(strings) - - def __repr__(self): - return ( - '{name}({node_num} nodes with {node_info})'.format(name=self.__class__.__name__, node_info=self.tostr(), - **self.__dict__)) - - def __len__(self): - return len(self.nodes) + 1 - - def __getitem__(self, index): - return self.nodes[index] - - -class Cifar10ValidNasBench201MOBenchmark(NasBench201BaseMOBenchmark): +class Cifar10ValidNasBench201MOBenchmark(_NasBench201BaseMOBenchmark): def __init__(self, rng: Union[np.random.RandomState, int, 
None] = None, **kwargs): super(Cifar10ValidNasBench201MOBenchmark, self).__init__(dataset='cifar10-valid', rng=rng, **kwargs) -class Cifar100NasBench201MOBenchmark(NasBench201BaseMOBenchmark): +class Cifar100NasBench201MOBenchmark(_NasBench201BaseMOBenchmark): def __init__(self, rng: Union[np.random.RandomState, int, None] = None, **kwargs): super(Cifar100NasBench201MOBenchmark, self).__init__(dataset='cifar100', rng=rng, **kwargs) -class ImageNetNasBench201MOBenchmark(NasBench201BaseMOBenchmark): +class ImageNetNasBench201MOBenchmark(_NasBench201BaseMOBenchmark): def __init__(self, rng: Union[np.random.RandomState, int, None] = None, **kwargs): super(ImageNetNasBench201MOBenchmark, self).__init__(dataset='ImageNet16-120', rng=rng, **kwargs) -class NasBench201SOBenchmark(AbstractBenchmark): - def __init__(self, dataset: str, - rng: Union[np.random.RandomState, int, None] = None, **kwargs): - """ - Benchmark interface to the NASBench201 Benchmarks. The NASBench201 contains - results for architectures on 4 different data sets. - - We have split the "api" file from NASBench201 in separate files per data set. - The original "api" file contains all data sets, but loading this single file took too much RAM. - - We recommend to not call this base class directly but using the correct subclass below. - - The parameter ``dataset`` indicates which data set was used for training. - - For each data set the metrics - 'train_acc1es', 'train_losses', 'train_times', 'eval_acc1es', 'eval_times', 'eval_losses' are available. - However, the data sets report them on different data splits (train, train + valid, test, valid or test+valid). - - We summarize all information about the data sets in the following tables. - - Datastet Metric Avail.Epochs Explanation returned by HPOBENCH - ---------------------------------------------------------------------------------------- - cifar10-valid train [0-199] training set - cifar10-valid x-valid [0-199] validation set objective function - cifar10-valid x-test - cifar10-valid ori-test 199 test set objective function test - - cifar100 train [0-199] training set - cifar100 x-valid 199 validation set - cifar100 x-test 199 test set objective function test - cifar100 ori-test [0-199] validation + test set objective function - - ImageNet16-120 train [0-199] training set - ImageNet16-120 x-valid 199 validation set - ImageNet16-120 x-test 199 test set objective function test - ImageNet16-120 ori-test [0-199] validation + test set objective function - - - We have also extracted the incumbents per split. We report the incumbent accuracy and loss performance - i) by taking the maximum value across all seeds and configurations - ii) averaged across the three available seeds - - i) The best possible incumbents (NO AVG!) 
ii) The "average" incumbent - Datastet Metric (Index of Arch, Accuracy) (Index, Loss) (Index of Arch, Accuracy) (Index, Loss) - ---------------------------------------------------------------------------------------------------------------------------------------------------------- - cifar10-valid train (258, 100.0) (2778, 0.001179278278425336) (10154, 100) (2778, 0.0013082386429297428) - cifar10-valid x-valid (6111, 91.71999999023437) (14443, 0.3837750501537323) (6111, 91.60666665039064) (3888, 0.3894046771335602) - cifar10-valid x-test - cifar10-valid ori-test (14174, 91.65) (3385, 0.3850496160507202) (1459, 91.52333333333333) (3385, 0.3995230517864227) - - cifar100 train (9930, 99.948) (9930, 0.012630240231156348) (9930, 99.93733333333334) (9930, 0.012843489621082942) - cifar100 x-valid (13714, 73.71999998779297) (13934, 1.1490126512527465) (9930, 73.4933333577474) (7361, 1.1600867895126343) - cifar100 x-test (1459, 74.28000004882813) (15383, 1.1427113876342774) (9930, 73.51333332112631) (7337, 1.1747569534301758) - cifar100 ori-test (9930, 73.88) (13706, 1.1610547459602356) (9930, 73.50333333333333) (7361, 1.1696554500579834) - - ImageNet16-120 train (9930, 73.2524719841793) (9930, 0.9490517352046979) (9930, 73.22918040138735) (9930, 0.9524298415108582) - ImageNet16-120 x-valid (13778, 47.39999985758463) (10721, 2.0826991437276203) (10676, 46.73333327229818) (10721, 2.0915397168795264) - ImageNet16-120 x-test (857, 48.03333317057292) (12887, 2.0940088628133138) (857, 47.31111100599501) (11882, 2.106453532218933) - ImageNet16-120 ori-test (857, 47.083333353678384) (11882, 2.0950548852284747) (857, 46.8444444647895) (11882, 2.1028235816955565) - - - Note: - - The parameter epoch is 0 indexed! - - In the original data, the training splits are always marked with the key 'train' but they use different - identifiers to refer to the available evaluation splits. We report them also in the table below. - - We exclude the data set cifar10 from this benchmark. - - In NasBench201, not all architectures have values for the three seeds. To increase robustness, we have patched - missing values with the values from an available seed. - - Some further remarks: - - cifar10-valid is trained on the train split and tested on the validation split. - - The train metrics are dictionaries with epochs (e.g. 0, 1, 2) as key and the metric as value. - The evaluation metrics, however, have as key the identifiers, e.g. ori-test@0, with 0 indicating the epoch. - Also, each data set reports values for all 200 epochs for a metric on the specified split - and a single value on the 200th epoch for the other splits. - - Parameters - ---------- - dataset : str - One of cifar10-valid, cifar10, cifar100, ImageNet16-120. - rng : np.random.RandomState, int, None - Random seed for the benchmark's random state. 
- """ # noqa: E501 - - super(NasBench201SOBenchmark, self).__init__(rng=rng, **kwargs) - self.mo_benchmark = NasBench201BaseMOBenchmark(rng=rng, dataset=dataset, **kwargs) +class _NasBench201SOBenchmark(_NasBench201BaseBenchmark, AbstractSingleObjectiveBenchmark): # pylint: disable=arguments-differ - @AbstractBenchmark.check_parameters + @AbstractSingleObjectiveBenchmark.check_parameters def objective_function(self, configuration: Union[CS.Configuration, Dict], fidelity: Union[Dict, CS.Configuration, None] = None, rng: Union[np.random.RandomState, int, None] = None, @@ -674,15 +608,15 @@ def objective_function(self, configuration: Union[CS.Configuration, Dict], Returns ------- Dict - - function_value : training precision + function_value : training misclassification_rate cost : time to train the network info : Dict - train_precision : float + train_misclassification_rate : float train_losses : float train_cost : float Time needed to train the network for 'epoch' many epochs. If more than one seed is given, this field is the sum of the training time per network - eval_precision : float + eval_misclassification_rate : float eval_losses : float eval_cost : float Time needed to train the network for 'epoch many epochs plus the time to evaluate the network on the @@ -690,14 +624,13 @@ def objective_function(self, configuration: Union[CS.Configuration, Dict], fidelity : Dict used fidelities in this evaluation """ - results = self.mo_benchmark.objective_function( + results = self._mo_objective_function( configuration=configuration, fidelity=fidelity, rng=rng, data_seed=data_seed, **kwargs ) - results['function_value'] = results['function_value']['misclassification_rate'] return results - @AbstractBenchmark.check_parameters + @AbstractSingleObjectiveBenchmark.check_parameters def objective_function_test(self, configuration: Union[CS.Configuration, Dict], fidelity: Union[Dict, None] = None, rng: Union[np.random.RandomState, int, None] = None, @@ -729,90 +662,44 @@ def objective_function_test(self, configuration: Union[CS.Configuration, Dict], Returns ------- Dict - - function_value : evaluation precision + function_value : evaluation misclassification_rate cost : time to the network + time to validate info : Dict - train_precision + train_misclassification_rate train_losses train_cost - eval_precision + eval_misclassification_rate eval_losses eval_cost fidelity : used fidelities in this evaluation """ - results = self.mo_benchmark.objective_function_test( + results = self._mo_objective_function_test( configuration=configuration, fidelity=fidelity, rng=rng, **kwargs ) - results['function_value'] = results['function_value']['misclassification_rate'] return results - @staticmethod - def get_configuration_space(seed: Union[int, None] = None) -> CS.ConfigurationSpace: - """ - Return the CS representation of the search space. - From https://github.com/D-X-Y/AutoDL-Projects/blob/master/exps/algos/BOHB.py - Author: https://github.com/D-X-Y [Xuanyi.Dong@student.uts.edu.au] - - Parameters - ---------- - seed : int, None - Random seed for the configuration space. - - Returns - ------- - CS.ConfigurationSpace - - Containing the benchmark's hyperparameter - """ - return NasBench201BaseMOBenchmark.get_configuration_space(seed=seed) - - @staticmethod - def get_fidelity_space(seed: Union[int, None] = None) -> CS.ConfigurationSpace: - """ - Creates a ConfigSpace.ConfigurationSpace containing all fidelity parameters for - the NAS Benchmark 201. - - Fidelities: - - epoch: int - The loss / accuracy at `epoch`. 
Can be from 0 to 199. - - Parameters - ---------- - seed : int, None - Fixing the seed for the ConfigSpace.ConfigurationSpace - - Returns - ------- - ConfigSpace.ConfigurationSpace - """ - return NasBench201BaseMOBenchmark.get_fidelity_space(seed=seed) - - @staticmethod - def get_meta_information() -> Dict: - """ Returns the meta information for the benchmark """ - return NasBench201BaseMOBenchmark.get_meta_information() - -class Cifar10ValidNasBench201Benchmark(NasBench201SOBenchmark): +class Cifar10ValidNasBench201Benchmark(_NasBench201SOBenchmark): def __init__(self, rng: Union[np.random.RandomState, int, None] = None, **kwargs): super(Cifar10ValidNasBench201Benchmark, self).__init__(dataset='cifar10-valid', rng=rng, **kwargs) -class Cifar100NasBench201Benchmark(NasBench201SOBenchmark): +class Cifar100NasBench201Benchmark(_NasBench201SOBenchmark): def __init__(self, rng: Union[np.random.RandomState, int, None] = None, **kwargs): super(Cifar100NasBench201Benchmark, self).__init__(dataset='cifar100', rng=rng, **kwargs) -class ImageNetNasBench201Benchmark(NasBench201SOBenchmark): +class ImageNetNasBench201Benchmark(_NasBench201SOBenchmark): def __init__(self, rng: Union[np.random.RandomState, int, None] = None, **kwargs): super(ImageNetNasBench201Benchmark, self).__init__(dataset='ImageNet16-120', rng=rng, **kwargs) -class _NasBench201BaseBenchmarkOriginal(NasBench201SOBenchmark): +class _NasBench201SOBenchmarkOriginal(_NasBench201SOBenchmark): @staticmethod def get_fidelity_space(seed: Union[int, None] = None) -> CS.ConfigurationSpace: @@ -825,7 +712,8 @@ def get_fidelity_space(seed: Union[int, None] = None) -> CS.ConfigurationSpace: experiments from DEHB [DEHB](https://github.com/automl/DEHB/tree/937dd5cf48e79f6d587ea2ff408cb5ad9a8dce46/dehb/examples) - Fidelities: + Fidelities + ---------- epoch: int The loss / accuracy at `epoch`. @@ -851,26 +739,26 @@ def get_fidelity_space(seed: Union[int, None] = None) -> CS.ConfigurationSpace: @staticmethod def get_meta_information() -> Dict: """ Returns the meta information for the benchmark """ - meta_information = NasBench201SOBenchmark.get_meta_information() + meta_information = _NasBench201SOBenchmark.get_meta_information() meta_information['note'] = \ 'This version of the benchmark implements the fidelity space defined in the DEHB paper.' 
\ 'See [DEHB](https://github.com/automl/DEHB/tree/937dd5cf48e79f6d587ea2ff408cb5ad9a8dce46/dehb/examples)' return meta_information -class Cifar10ValidNasBench201BenchmarkOriginal(_NasBench201BaseBenchmarkOriginal): +class Cifar10ValidNasBench201BenchmarkOriginal(_NasBench201SOBenchmarkOriginal): def __init__(self, rng: Union[np.random.RandomState, int, None] = None, **kwargs): super(Cifar10ValidNasBench201BenchmarkOriginal, self).__init__(dataset='cifar10-valid', rng=rng, **kwargs) -class Cifar100NasBench201BenchmarkOriginal(_NasBench201BaseBenchmarkOriginal): +class Cifar100NasBench201BenchmarkOriginal(_NasBench201SOBenchmarkOriginal): def __init__(self, rng: Union[np.random.RandomState, int, None] = None, **kwargs): super(Cifar100NasBench201BenchmarkOriginal, self).__init__(dataset='cifar100', rng=rng, **kwargs) -class ImageNetNasBench201BenchmarkOriginal(_NasBench201BaseBenchmarkOriginal): +class ImageNetNasBench201BenchmarkOriginal(_NasBench201SOBenchmarkOriginal): def __init__(self, rng: Union[np.random.RandomState, int, None] = None, **kwargs): super(ImageNetNasBench201BenchmarkOriginal, self).__init__(dataset='ImageNet16-120', rng=rng, **kwargs) diff --git a/hpobench/container/benchmarks/nas/nasbench_201.py b/hpobench/container/benchmarks/nas/nasbench_201.py index 2a948c6b..83b6f488 100644 --- a/hpobench/container/benchmarks/nas/nasbench_201.py +++ b/hpobench/container/benchmarks/nas/nasbench_201.py @@ -10,7 +10,7 @@ class Cifar10ValidNasBench201Benchmark(AbstractBenchmarkClient): def __init__(self, **kwargs): kwargs['benchmark_name'] = kwargs.get('benchmark_name', 'Cifar10ValidNasBench201Benchmark') kwargs['container_name'] = kwargs.get('container_name', 'nasbench_201') - kwargs['latest'] = kwargs.get('container_tag', '0.0.5') + kwargs['latest'] = kwargs.get('container_tag', '0.0.6') super(Cifar10ValidNasBench201Benchmark, self).__init__(**kwargs) @@ -18,7 +18,7 @@ class Cifar100NasBench201Benchmark(AbstractBenchmarkClient): def __init__(self, **kwargs): kwargs['benchmark_name'] = kwargs.get('benchmark_name', 'Cifar100NasBench201Benchmark') kwargs['container_name'] = kwargs.get('container_name', 'nasbench_201') - kwargs['latest'] = kwargs.get('container_tag', '0.0.5') + kwargs['latest'] = kwargs.get('container_tag', '0.0.6') super(Cifar100NasBench201Benchmark, self).__init__(**kwargs) @@ -26,7 +26,7 @@ class ImageNetNasBench201Benchmark(AbstractBenchmarkClient): def __init__(self, **kwargs): kwargs['benchmark_name'] = kwargs.get('benchmark_name', 'ImageNetNasBench201Benchmark') kwargs['container_name'] = kwargs.get('container_name', 'nasbench_201') - kwargs['latest'] = kwargs.get('container_tag', '0.0.5') + kwargs['latest'] = kwargs.get('container_tag', '0.0.6') super(ImageNetNasBench201Benchmark, self).__init__(**kwargs) @@ -34,7 +34,7 @@ class Cifar10ValidNasBench201BenchmarkOriginal(AbstractBenchmarkClient): def __init__(self, **kwargs): kwargs['benchmark_name'] = kwargs.get('benchmark_name', 'Cifar10ValidNasBench201BenchmarkOriginal') kwargs['container_name'] = kwargs.get('container_name', 'nasbench_201') - kwargs['latest'] = kwargs.get('container_tag', '0.0.5') + kwargs['latest'] = kwargs.get('container_tag', '0.0.6') super(Cifar10ValidNasBench201BenchmarkOriginal, self).__init__(**kwargs) @@ -42,7 +42,7 @@ class Cifar100NasBench201BenchmarkOriginal(AbstractBenchmarkClient): def __init__(self, **kwargs): kwargs['benchmark_name'] = kwargs.get('benchmark_name', 'Cifar100NasBench201BenchmarkOriginal') kwargs['container_name'] = kwargs.get('container_name', 'nasbench_201') - 
kwargs['latest'] = kwargs.get('container_tag', '0.0.5') + kwargs['latest'] = kwargs.get('container_tag', '0.0.6') super(Cifar100NasBench201BenchmarkOriginal, self).__init__(**kwargs) @@ -50,7 +50,7 @@ class ImageNetNasBench201BenchmarkOriginal(AbstractBenchmarkClient): def __init__(self, **kwargs): kwargs['benchmark_name'] = kwargs.get('benchmark_name', 'ImageNetNasBench201BenchmarkOriginal') kwargs['container_name'] = kwargs.get('container_name', 'nasbench_201') - kwargs['latest'] = kwargs.get('container_tag', '0.0.5') + kwargs['latest'] = kwargs.get('container_tag', '0.0.6') super(ImageNetNasBench201BenchmarkOriginal, self).__init__(**kwargs) diff --git a/tests/test_nasbench_201.py b/tests/test_nasbench_201.py index 42efbfea..29ef18ec 100644 --- a/tests/test_nasbench_201.py +++ b/tests/test_nasbench_201.py @@ -35,18 +35,18 @@ def test_nasbench201_cifar10valid(enable_debug): '3<-2': 'nor_conv_3x3' } result = b.objective_function(configuration=config, fidelity={'epoch': 199}, data_seed=(777, 888, 999)) - assert result['function_value'] == pytest.approx(9.78, abs=0.1) + assert result['function_value'] == pytest.approx(0.0978, abs=0.1) assert result['cost'] == pytest.approx(11973.20, abs=0.1) - assert result['info']['valid_precision'] == result['function_value'] + assert result['info']['valid_misclassification_rate'] == result['function_value'] assert result['info']['valid_cost'] == result['cost'] result = b.objective_function_test(configuration=config, fidelity={'epoch': 200}) - assert result['function_value'] == pytest.approx(9.70, abs=0.1) + assert result['function_value'] == pytest.approx(0.0970, abs=0.1) assert result['cost'] == pytest.approx(10426.33, abs=0.2) - assert result['info']['test_precision'] == result['function_value'] + assert result['info']['test_misclassification_rate'] == result['function_value'] assert result['info']['test_cost'] == result['cost'] - with pytest.raises(ValueError): + with pytest.raises(AssertionError): result = b.objective_function_test(configuration=config, fidelity={'epoch': 10}) @@ -64,9 +64,9 @@ def test_nasbench201_cifar100(enable_debug): result = b.objective_function(configuration=config, fidelity=fidelity, data_seed=(777, 888, 999)) assert result is not None - assert result['function_value'] == pytest.approx(29.5233, abs=0.1) + assert result['function_value'] == pytest.approx(0.295233, abs=0.1) assert result['cost'] == pytest.approx(19681.70, abs=0.1) - assert result['info']['valid_precision'] == result['function_value'] + assert result['info']['valid_misclassification_rate'] == result['function_value'] assert result['info']['valid_cost'] == result['cost'] @@ -83,9 +83,9 @@ def test_nasbench201_Image(enable_debug): result = b.objective_function(configuration=config, fidelity=fidelity, data_seed=(777, 888, 999)) assert result is not None - assert result['function_value'] == pytest.approx(55.2167, abs=0.1) + assert result['function_value'] == pytest.approx(0.552167, abs=0.1) assert result['cost'] == pytest.approx(57119.22, abs=0.1) - assert result['info']['valid_precision'] == result['function_value'] + assert result['info']['valid_misclassification_rate'] == result['function_value'] assert result['info']['valid_cost'] == result['cost'] From 4414e3d60efc5ad98b2f8ba0044490cd55f2870d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Philipp=20M=C3=BCller?= Date: Thu, 15 Dec 2022 13:05:28 +0100 Subject: [PATCH 144/147] Nas101 + Nas1shot1 v0.0.5 - Multi Objective (#166) * Nas101 v.0.0.5 * Nas1shot1 v.0.0.5 * Test Cases for 101 --- 
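Note (below the fold, not part of the applied diff): a minimal usage sketch of the multi-objective NAS-Bench-101 interface introduced in this patch. The class name `NASCifar10AMOBenchmark` is a placeholder for whichever multi-objective subclass the diff exports; the fidelity values, result keys, and objective ranges are taken from the docstrings added further down.

# Illustrative sketch only -- the exported MO class names are defined later in this patch.
from hpobench.benchmarks.nas.nasbench_101 import NASCifar10AMOBenchmark  # hypothetical name

benchmark = NASCifar10AMOBenchmark(rng=1)  # loads/downloads nasbench_full.tfrecord on first use
config = benchmark.get_configuration_space(seed=1).sample_configuration()

# 'budget' is the number of training epochs; valid values are 4, 12, 36 and 108.
result = benchmark.objective_function(configuration=config,
                                      fidelity={'budget': 108},
                                      run_index=(0, 1, 2))

# Both objectives are minimized:
#   misclassification_rate in [0, 1], trainable_parameters in [0, 10**8]
print(result['function_value']['misclassification_rate'])
print(result['function_value']['trainable_parameters'])
print(result['cost'])  # training time summed over the queried run indices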
extra_requirements/nasbench_1shot1.json | 2 +- hpobench/benchmarks/nas/nasbench_101.py | 465 +++++++++++++----- hpobench/benchmarks/nas/nasbench_1shot1.py | 328 +++++++++--- hpobench/benchmarks/nas/tabular_benchmarks.py | 1 - .../container/benchmarks/nas/nasbench_101.py | 32 +- .../benchmarks/nas/nasbench_1shot1.py | 32 +- tests/test_nasbench_101.py | 82 +++ 7 files changed, 755 insertions(+), 187 deletions(-) create mode 100644 tests/test_nasbench_101.py diff --git a/extra_requirements/nasbench_1shot1.json b/extra_requirements/nasbench_1shot1.json index 7523d0f2..b008c789 100644 --- a/extra_requirements/nasbench_1shot1.json +++ b/extra_requirements/nasbench_1shot1.json @@ -1,3 +1,3 @@ { - "nasbench_1shot1": ["tensorflow==1.15.0","matplotlib","seaborn", "networkx", "tqdm"] + "nasbench_1shot1": ["protobuf==3.20.1", "tensorflow==1.15.0", "matplotlib", "seaborn", "networkx", "tqdm"] } \ No newline at end of file diff --git a/hpobench/benchmarks/nas/nasbench_101.py b/hpobench/benchmarks/nas/nasbench_101.py index f7ee1b20..c0f80737 100644 --- a/hpobench/benchmarks/nas/nasbench_101.py +++ b/hpobench/benchmarks/nas/nasbench_101.py @@ -42,6 +42,11 @@ Changelog: ========== +0.0.5 +* ADD Multi Objective version. Introduce objectives: + - misclassification_rate (0, 1) - lower is better + - trainable_parameters (0, 10**8) - lower is better + 0.0.4 * New container release due to a general change in the communication between container and HPOBench. Works with HPOBench >= v0.0.8 @@ -61,23 +66,22 @@ """ import logging - from pathlib import Path -from typing import Union, Dict, Any, Tuple, List +from typing import Union, Dict, Any, Tuple, List, Type import ConfigSpace as CS import numpy as np -from tabular_benchmarks.nas_cifar10 import NASCifar10 from nasbench import api from nasbench.api import OutOfDomainError from nasbench.lib import graph_util +from tabular_benchmarks.nas_cifar10 import NASCifar10 -from hpobench import config_file import hpobench.util.rng_helper as rng_helper -from hpobench.abstract_benchmark import AbstractBenchmark +from hpobench import config_file +from hpobench.abstract_benchmark import AbstractBenchmark, AbstractMultiObjectiveBenchmark from hpobench.util.data_manager import NASBench_101DataManager -__version__ = '0.0.4' +__version__ = '0.0.5' logger = logging.getLogger('NasBench101') MAX_EDGES = 9 @@ -85,17 +89,19 @@ DEFAULT_API_FILE = config_file.data_dir / "nasbench_101" -class NASCifar10BaseBenchmark(AbstractBenchmark): - def __init__(self, benchmark: NASCifar10, +class _NAS101BaseBenchmark: + def __init__(self, + benchmark_type: Type[NASCifar10], data_path: Union[Path, str, None] = None, - rng: Union[np.random.RandomState, int, None] = None, **kwargs): + rng: Union[np.random.RandomState, int, None] = None, + **kwargs): """ Baseclass for the tabular benchmarks https://github.com/automl/nas_benchmarks/tree/master/tabular_benchmarks. Please install the benchmark first. Place the data under ``data_path``. Parameters ---------- - benchmark : NASCifar10 + benchmark_type : Type[NASCifar10] Type of the benchmark to use. Don't call this class directly. Instantiate via subclasses (see below). data_path : str, Path, None Path to the folder, which contains the downloaded file nasbench_full.tfrecord. 
@@ -103,21 +109,76 @@ def __init__(self, benchmark: NASCifar10, Random seed for the benchmarks """ - super(NASCifar10BaseBenchmark, self).__init__(rng=rng) - - self.benchmark = benchmark + data_path = self._try_download_api_file(data_path) self.data_path = data_path + self.rng = rng + self.benchmark: NASCifar10 = benchmark_type(data_dir=str(data_path), multi_fidelity=True) + super(_NAS101BaseBenchmark, self).__init__(rng=rng, **kwargs) def _query_benchmark(self, config: Dict, run_index: int, budget: int = 108) -> Dict: raise NotImplementedError - # pylint: disable=arguments-differ - @AbstractBenchmark.check_parameters - def objective_function(self, configuration: Union[CS.Configuration, Dict], - fidelity: Union[CS.Configuration, Dict, None] = None, - run_index: Union[int, Tuple, None] = (0, 1, 2), - rng: Union[np.random.RandomState, int, None] = None, - **kwargs) -> Dict: + @staticmethod + def get_configuration_space(seed: Union[int, None] = None) -> CS.ConfigurationSpace: + raise NotImplementedError + + @staticmethod + def get_meta_information() -> Dict: + """ Returns the meta information for the benchmark """ + return {'name': 'Tabular Benchmarks for Hyperparameter Optimization and Neural Architecture Search', + 'references': ['@article{klein2019tabular,' + 'title = {Tabular benchmarks for joint architecture and hyperparameter optimization},' + 'author = {Klein, Aaron and Hutter, Frank},' + 'journal = {arXiv preprint arXiv:1905.04970},' + 'year = {2019}}', + 'https://arxiv.org/abs/1905.04970', + ], + 'code': 'https://github.com/automl/nas_benchmarks', + } + + @staticmethod + def _get_configuration_space(benchmark: Any, seed: Union[int, None] = None) -> CS.ConfigurationSpace: + """ Helper function to pass a seed to the configuration space """ + seed = seed if seed is not None else np.random.randint(1, 100000) + cs = benchmark.get_configuration_space() + cs.seed(seed) + return cs + + @staticmethod + def get_fidelity_space(seed: Union[int, None] = None) -> CS.ConfigurationSpace: + """ + Creates a ConfigSpace.ConfigurationSpace containing all fidelity parameters for + the NAS Benchmark 101. + + Parameters + ---------- + seed : int, None + Fixing the seed for the ConfigSpace.ConfigurationSpace + + Returns + ------- + ConfigSpace.ConfigurationSpace + """ + seed = seed if seed is not None else np.random.randint(1, 100000) + fidel_space = CS.ConfigurationSpace(seed=seed) + + fidel_space.add_hyperparameters([ + CS.OrdinalHyperparameter('budget', sequence=[4, 12, 36, 108], default_value=108) + ]) + + return fidel_space + + @staticmethod + def _try_download_api_file(save_to: Union[Path, str, None]): + data_manager = NASBench_101DataManager(save_to) + data_manager.download() + return data_manager.save_dir + + def _mo_objective_function(self, configuration: Union[CS.Configuration, Dict], + fidelity: Union[CS.Configuration, Dict, None] = None, + run_index: Union[int, Tuple, None] = (0, 1, 2), + rng: Union[np.random.RandomState, int, None] = None, + **kwargs) -> Dict: """ Query the NAS-benchmark using a given configuration and a epoch (=budget). 
@@ -144,7 +205,12 @@ def objective_function(self, configuration: Union[CS.Configuration, Dict], Returns ------- Dict - - function_value : validation error + function_value : + misclassification_rate: float [0,1] (lower is better) + 1-accuracy on validation set + trainable_parameters: int [0, 10**8] (lower is better) + Number of trainable parameters in the network + cost : runtime info : Dict fidelity : used fidelities in this evaluation @@ -176,6 +242,7 @@ def objective_function(self, configuration: Union[CS.Configuration, Dict], test_accuracies = [] training_times = [] additional = {} + failure = False for run_id in run_index: data = self._query_benchmark(config=configuration, budget=fidelity['budget'], run_index=run_id) @@ -186,25 +253,31 @@ def objective_function(self, configuration: Union[CS.Configuration, Dict], training_times.append(data['training_time']) # Since those information are the same for all run ids, just store one of them. - additional = {'trainable_parameters': data['trainable_parameters'], + # Also, if the configuration is invalid, set the number of parameters to its upper limit. + trainable_parameters = data['trainable_parameters'] + failure = trainable_parameters == 0 + trainable_parameters = 10**8 if trainable_parameters == 0 else trainable_parameters + + additional = {'trainable_parameters': trainable_parameters, 'module_operations': data['module_operations']} - return {'function_value': float(1 - np.mean(valid_accuracies)), + return {'function_value': {'misclassification_rate': float(1 - np.mean(valid_accuracies)), + 'trainable_parameters': additional['trainable_parameters']}, 'cost': float(np.sum(training_times)), 'info': {'fidelity': fidelity, 'train_accuracies': train_accuracies, 'valid_accuracies': valid_accuracies, 'test_accuracies': test_accuracies, 'training_times': training_times, + 'failure': 1 if failure else 0, 'data': additional } } - @AbstractBenchmark.check_parameters - def objective_function_test(self, configuration: Union[Dict, CS.Configuration], - fidelity: Union[CS.Configuration, Dict, None] = None, - rng: Union[np.random.RandomState, int, None] = None, - **kwargs) -> Dict: + def _mo_objective_function_test(self, configuration: Union[Dict, CS.Configuration], + fidelity: Union[CS.Configuration, Dict, None] = None, + rng: Union[np.random.RandomState, int, None] = None, + **kwargs) -> Dict: """ Validate a configuration on the maximum available budget. 
@@ -222,83 +295,29 @@ def objective_function_test(self, configuration: Union[Dict, CS.Configuration], Returns ------- Dict - - function_value : test error + function_value : + misclassification_rate: float [0,1] (lower is better) + 1-accuracy on test set + trainable_parameters: int [0, 10**8] (lower is better) + Number of trainable parameters in the network + cost : runtime info : Dict fidelity : used fidelities in this evaluation """ - result = self.objective_function(configuration=configuration, fidelity=fidelity, run_index=(0, 1, 2), rng=rng) - result['function_value'] = float(1 - np.mean(result['info']['test_accuracies'])) + result = self._mo_objective_function( + configuration=configuration, fidelity=fidelity, run_index=(0, 1, 2), rng=rng + ) + result['function_value']['misclassification_rate'] = float(1 - np.mean(result['info']['test_accuracies'])) return result - @staticmethod - def get_configuration_space(seed: Union[int, None] = None) -> CS.ConfigurationSpace: - raise NotImplementedError - - @staticmethod - def get_meta_information() -> Dict: - """ Returns the meta information for the benchmark """ - return {'name': 'Tabular Benchmarks for Hyperparameter Optimization and Neural Architecture Search', - 'references': ['@article{klein2019tabular,' - 'title = {Tabular benchmarks for joint architecture and hyperparameter optimization},' - 'author = {Klein, Aaron and Hutter, Frank},' - 'journal = {arXiv preprint arXiv:1905.04970},' - 'year = {2019}}', - 'https://arxiv.org/abs/1905.04970', - ], - 'code': 'https://github.com/automl/nas_benchmarks', - } - - @staticmethod - def _get_configuration_space(benchmark: Any, seed: Union[int, None] = None) -> CS.ConfigurationSpace: - """ Helper function to pass a seed to the configuration space """ - seed = seed if seed is not None else np.random.randint(1, 100000) - cs = benchmark.get_configuration_space() - cs.seed(seed) - return cs - - @staticmethod - def get_fidelity_space(seed: Union[int, None] = None) -> CS.ConfigurationSpace: - """ - Creates a ConfigSpace.ConfigurationSpace containing all fidelity parameters for - the NAS Benchmark 101. 
- - Parameters - ---------- - seed : int, None - Fixing the seed for the ConfigSpace.ConfigurationSpace - - Returns - ------- - ConfigSpace.ConfigurationSpace - """ - seed = seed if seed is not None else np.random.randint(1, 100000) - fidel_space = CS.ConfigurationSpace(seed=seed) - - fidel_space.add_hyperparameters([ - CS.OrdinalHyperparameter('budget', sequence=[4, 12, 36, 108], default_value=108) - ]) - - return fidel_space - - @staticmethod - def _try_download_api_file(save_to: Union[Path, str, None]): - data_manager = NASBench_101DataManager(save_to) - data_manager.download() - return data_manager.save_dir - - -class NASCifar10ABenchmark(NASCifar10BaseBenchmark): - def __init__(self, data_path: Union[Path, str, None] = None, - rng: Union[np.random.RandomState, int, None] = None, **kwargs): - - data_path = self._try_download_api_file(data_path) +class _QueryA(_NAS101BaseBenchmark): + def __init__(self, **kwargs): from tabular_benchmarks.nas_cifar10 import NASCifar10A - benchmark = NASCifar10A(data_dir=str(data_path), multi_fidelity=True) - super(NASCifar10ABenchmark, self).__init__(benchmark=benchmark, data_path=data_path, rng=rng, **kwargs) + super(_QueryA, self).__init__(benchmark_type=NASCifar10A) @staticmethod def get_configuration_space(seed: Union[int, None] = None) -> CS.ConfigurationSpace: @@ -315,7 +334,7 @@ def get_configuration_space(seed: Union[int, None] = None) -> CS.ConfigurationSp """ from tabular_benchmarks.nas_cifar10 import NASCifar10A - return NASCifar10BBenchmark._get_configuration_space(NASCifar10A, seed) + return _NAS101BaseBenchmark._get_configuration_space(NASCifar10A, seed) def _query_benchmark(self, config: Dict, run_index: int, budget: int = 108) -> Dict: """ @@ -372,15 +391,10 @@ def _query_benchmark(self, config: Dict, run_index: int, budget: int = 108) -> D return data -class NASCifar10BBenchmark(NASCifar10BaseBenchmark): - def __init__(self, data_path: Union[Path, str, None] = None, - rng: Union[np.random.RandomState, int, None] = None, **kwargs): - - data_path = self._try_download_api_file(data_path) - +class _QueryB(_NAS101BaseBenchmark): + def __init__(self, **kwargs): from tabular_benchmarks.nas_cifar10 import NASCifar10B - benchmark = NASCifar10B(data_dir=str(data_path), multi_fidelity=True) - super(NASCifar10BBenchmark, self).__init__(benchmark=benchmark, data_path=data_path, rng=rng, **kwargs) + super(_QueryB, self).__init__(benchmark_type=NASCifar10B, **kwargs) @staticmethod def get_configuration_space(seed: Union[int, None] = None) -> CS.ConfigurationSpace: @@ -397,9 +411,10 @@ def get_configuration_space(seed: Union[int, None] = None) -> CS.ConfigurationSp """ from tabular_benchmarks.nas_cifar10 import NASCifar10B - return NASCifar10BBenchmark._get_configuration_space(NASCifar10B, seed) + return _NAS101BaseBenchmark._get_configuration_space(NASCifar10B, seed) def _query_benchmark(self, config: Dict, run_index: int, budget: int = 108) -> Dict: + """ Copied from the 'objective_function' from nas_cifar10.py We adapted the file in such a way, that the complete result is returned. The original implementation returns @@ -408,6 +423,8 @@ def _query_benchmark(self, config: Dict, run_index: int, budget: int = 108) -> D Parameters ---------- config : Dict + run_index : int + Specifies the seed to use. Can be one of 0, 1, 2. budget : int The number of epochs. Must be one of: 4 12 36 108. Otherwise a accuracy of 0 is returned. 
@@ -415,6 +432,7 @@ def _query_benchmark(self, config: Dict, run_index: int, budget: int = 108) -> D ------- Dict """ + failure = {"test_accuracy": 0, "train_accuracy": 0, "validation_accuracy": 0, "training_time": 0, "info": "failure", "trainable_parameters": 0, "module_operations": 0} @@ -439,6 +457,7 @@ def _query_benchmark(self, config: Dict, run_index: int, budget: int = 108) -> D labeling = [config["op_node_%d" % i] for i in range(5)] labeling = ['input'] + list(labeling) + ['output'] model_spec = api.ModelSpec(matrix, labeling) + try: data = modified_query(self.benchmark, run_index=run_index, model_spec=model_spec, epochs=budget) except api.OutOfDomainError: @@ -453,15 +472,10 @@ def _query_benchmark(self, config: Dict, run_index: int, budget: int = 108) -> D return data -class NASCifar10CBenchmark(NASCifar10BaseBenchmark): - def __init__(self, data_path: Union[Path, str, None] = None, - rng: Union[np.random.RandomState, int, None] = None, **kwargs): - - data_path = self._try_download_api_file(data_path) - +class _QueryC(_NAS101BaseBenchmark): + def __init__(self, **kwargs): from tabular_benchmarks.nas_cifar10 import NASCifar10C - benchmark = NASCifar10C(data_dir=str(data_path), multi_fidelity=True) - super(NASCifar10CBenchmark, self).__init__(benchmark=benchmark, data_path=data_path, rng=rng, **kwargs) + super(_QueryC, self).__init__(benchmark_type=NASCifar10C, **kwargs) @staticmethod def get_configuration_space(seed: Union[int, None] = None) -> CS.ConfigurationSpace: @@ -478,7 +492,7 @@ def get_configuration_space(seed: Union[int, None] = None) -> CS.ConfigurationSp """ from tabular_benchmarks.nas_cifar10 import NASCifar10C - return NASCifar10BBenchmark._get_configuration_space(NASCifar10C, seed) + return _NAS101BaseBenchmark._get_configuration_space(NASCifar10C, seed) def _query_benchmark(self, config: Dict, run_index: int, budget: int = 108) -> Dict: """ @@ -538,6 +552,221 @@ def _query_benchmark(self, config: Dict, run_index: int, budget: int = 108) -> D return data +class _NASCifar10BaseMOBenchmark(_NAS101BaseBenchmark, AbstractMultiObjectiveBenchmark): + + # pylint: disable=arguments-differ + @AbstractMultiObjectiveBenchmark.check_parameters + def objective_function(self, configuration: Union[CS.Configuration, Dict], + fidelity: Union[CS.Configuration, Dict, None] = None, + run_index: Union[int, Tuple, None] = (0, 1, 2), + rng: Union[np.random.RandomState, int, None] = None, + **kwargs) -> Dict: + """ + Query the NAS-benchmark using a given configuration and a epoch (=budget). + + Parameters + ---------- + configuration : Dict, CS.Configuration + fidelity: Dict, None + Fidelity parameters, check get_fidelity_space(). Uses default (max) value if None. + run_index : int, Tuple, None + The nas benchmark has for each configuration-budget-pair results from 3 different runs. + - If multiple `run_id`s are given as Tuple, the benchmark returns the mean over the given runs. + - By default (no parameter is specified) all runs are used. A specific run can be chosen by setting the + `run_id` to a value from [0, 3]. While the performance is averaged across the `run_index`, the costs are + the sum of the runtime per `run_index`. + - When this value is explicitly set to `None`, the function will use a random seed. + rng : np.random.RandomState, int, None + Random seed to use in the benchmark. + + To prevent overfitting on a single seed, it is possible to pass a + parameter ``rng`` as 'int' or 'np.random.RandomState' to this function. 
+ If this parameter is not given, the default random state is used. + kwargs + + Returns + ------- + Dict - + function_value : + misclassification_rate: float [0,1] (lower is better) + 1-accuracy on validation set + trainable_parameters: int [0, 10**8] (lower is better) + Number of trainable parameters in the network + + cost : runtime + info : Dict + fidelity : used fidelities in this evaluation + """ + return self._mo_objective_function( + configuration=configuration, fidelity=fidelity, run_index=run_index, rng=rng, **kwargs + ) + + @AbstractMultiObjectiveBenchmark.check_parameters + def objective_function_test(self, configuration: Union[Dict, CS.Configuration], + fidelity: Union[CS.Configuration, Dict, None] = None, + rng: Union[np.random.RandomState, int, None] = None, + **kwargs) -> Dict: + """ + Validate a configuration on the maximum available budget. + + Parameters + ---------- + configuration : Dict, CS.Configuration + fidelity: Dict, None + Fidelity parameters, check get_fidelity_space(). Uses default (max) value if None. + rng : np.random.RandomState, int, None + Random seed to use in the benchmark. To prevent overfitting on a single seed, it is + possible to pass a parameter ``rng`` as 'int' or 'np.random.RandomState' to this + function. If this parameter is not given, the default random state is used. + kwargs + + Returns + ------- + Dict - + function_value : + misclassification_rate: float [0,1] (lower is better) + 1-accuracy on test set + trainable_parameters: int [0, 10**8] (lower is better) + Number of trainable parameters in the network + + cost : runtime + info : Dict + fidelity : used fidelities in this evaluation + """ + + return self._mo_objective_function_test( + configuration=configuration, fidelity=fidelity, rng=rng, **kwargs + ) + + @staticmethod + def get_objective_names() -> List[str]: + return ['misclassification_rate', 'trainable_parameters'] + + +class _NASCifar10BaseSOBenchmark(_NAS101BaseBenchmark, AbstractBenchmark): + + # pylint: disable=arguments-differ + @AbstractBenchmark.check_parameters + def objective_function(self, configuration: Union[CS.Configuration, Dict], + fidelity: Union[CS.Configuration, Dict, None] = None, + run_index: Union[int, Tuple, None] = (0, 1, 2), + rng: Union[np.random.RandomState, int, None] = None, + **kwargs) -> Dict: + """ + Query the NAS-benchmark using a given configuration and a epoch (=budget). + + Parameters + ---------- + configuration : Dict, CS.Configuration + fidelity: Dict, None + Fidelity parameters, check get_fidelity_space(). Uses default (max) value if None. + run_index : int, Tuple, None + The nas benchmark has for each configuration-budget-pair results from 3 different runs. + - If multiple `run_id`s are given as Tuple, the benchmark returns the mean over the given runs. + - By default (no parameter is specified) all runs are used. A specific run can be chosen by setting the + `run_id` to a value from [0, 3]. While the performance is averaged across the `run_index`, the costs are + the sum of the runtime per `run_index`. + - When this value is explicitly set to `None`, the function will use a random seed. + rng : np.random.RandomState, int, None + Random seed to use in the benchmark. + + To prevent overfitting on a single seed, it is possible to pass a + parameter ``rng`` as 'int' or 'np.random.RandomState' to this function. + If this parameter is not given, the default random state is used. 
+ kwargs + + Returns + ------- + Dict - + function_value : + misclassification_rate: float [0,1] (lower is better) + 1-accuracy on validation set + cost : runtime + info : Dict + fidelity : used fidelities in this evaluation + """ + result_dict = self._mo_objective_function( + configuration=configuration, fidelity=fidelity, run_index=run_index, rng=rng, **kwargs + ) + + # swap function_value dict to value + result_dict['function_value'] = result_dict['function_value']['misclassification_rate'] + return result_dict + + @AbstractBenchmark.check_parameters + def objective_function_test(self, configuration: Union[Dict, CS.Configuration], + fidelity: Union[CS.Configuration, Dict, None] = None, + rng: Union[np.random.RandomState, int, None] = None, + **kwargs) -> Dict: + """ + Validate a configuration on the maximum available budget. + + Parameters + ---------- + configuration : Dict, CS.Configuration + fidelity: Dict, None + Fidelity parameters, check get_fidelity_space(). Uses default (max) value if None. + rng : np.random.RandomState, int, None + Random seed to use in the benchmark. To prevent overfitting on a single seed, it is + possible to pass a parameter ``rng`` as 'int' or 'np.random.RandomState' to this + function. If this parameter is not given, the default random state is used. + kwargs + + Returns + ------- + Dict - + function_value : + equals misclassification_rate: float [0,1] (lower is better) + 1-accuracy on test set + cost : runtime + info : Dict + fidelity : used fidelities in this evaluation + """ + result_dict = self._mo_objective_function_test( + configuration=configuration, fidelity=fidelity, rng=rng, **kwargs + ) + + # swap function_value dict to value + result_dict['function_value'] = result_dict['function_value']['misclassification_rate'] + return result_dict + + +class NASCifar10ABenchmark(_QueryA, _NASCifar10BaseSOBenchmark): + def __init__(self, data_path: Union[Path, str, None] = None, + rng: Union[np.random.RandomState, int, None] = None, **kwargs): + super(NASCifar10ABenchmark, self).__init__(data_path=data_path, rng=rng, **kwargs) + + +class NASCifar10AMOBenchmark(_QueryA, _NASCifar10BaseMOBenchmark): + def __init__(self, data_path: Union[Path, str, None] = None, + rng: Union[np.random.RandomState, int, None] = None, **kwargs): + super(NASCifar10AMOBenchmark, self).__init__(data_path=data_path, rng=rng, **kwargs) + + +class NASCifar10BBenchmark(_QueryB, _NASCifar10BaseSOBenchmark): + def __init__(self, data_path: Union[Path, str, None] = None, + rng: Union[np.random.RandomState, int, None] = None, **kwargs): + super(NASCifar10BBenchmark, self).__init__(data_path=data_path, rng=rng, **kwargs) + + +class NASCifar10BMOBenchmark(_QueryB, _NASCifar10BaseMOBenchmark): + def __init__(self, data_path: Union[Path, str, None] = None, + rng: Union[np.random.RandomState, int, None] = None, **kwargs): + super(NASCifar10BMOBenchmark, self).__init__(data_path=data_path, rng=rng, **kwargs) + + +class NASCifar10CBenchmark(_QueryC, _NASCifar10BaseSOBenchmark): + def __init__(self, data_path: Union[Path, str, None] = None, + rng: Union[np.random.RandomState, int, None] = None, **kwargs): + super(NASCifar10CBenchmark, self).__init__(data_path=data_path, rng=rng, **kwargs) + + +class NASCifar10CMOBenchmark(_QueryC, _NASCifar10BaseMOBenchmark): + def __init__(self, data_path: Union[Path, str, None] = None, + rng: Union[np.random.RandomState, int, None] = None, **kwargs): + super(NASCifar10CMOBenchmark, self).__init__(data_path=data_path, rng=rng, **kwargs) + + def 
modified_query(benchmark, model_spec, run_index: int, epochs=108, stop_halfway=False): """ NOTE: @@ -607,3 +836,11 @@ def modified_query(benchmark, model_spec, run_index: int, epochs=108, stop_halfw benchmark.dataset.total_epochs_spent += epochs return data + + +__all__ = ["NASCifar10ABenchmark", + "NASCifar10AMOBenchmark", + "NASCifar10BBenchmark", + "NASCifar10BMOBenchmark", + "NASCifar10CBenchmark", + "NASCifar10CMOBenchmark"] diff --git a/hpobench/benchmarks/nas/nasbench_1shot1.py b/hpobench/benchmarks/nas/nasbench_1shot1.py index 4d8231a0..5d94631e 100644 --- a/hpobench/benchmarks/nas/nasbench_1shot1.py +++ b/hpobench/benchmarks/nas/nasbench_1shot1.py @@ -34,7 +34,7 @@ pip install .[nasbench_1shot1] pip install git+https://github.com/google-research/nasbench.git@master -git clone https://github.com/automl/nasbench-1shot1/tree/master/nasbench_analysis/ +git clone https://github.com/automl/nasbench-1shot1 3. Environment setup ==================== @@ -46,6 +46,9 @@ Changelog: ========== +0.0.5 +* Add MO Version + 0.0.4 * New container release due to a general change in the communication between container and HPOBench. Works with HPOBench >= v0.0.8 @@ -62,34 +65,33 @@ """ import logging - +from ast import literal_eval from pathlib import Path from typing import Union, Dict, Any, Tuple, List -from ast import literal_eval import ConfigSpace as CS import numpy as np from nasbench import api from nasbench.api import OutOfDomainError - -from hpobench.abstract_benchmark import AbstractBenchmark -from hpobench.util.data_manager import NASBench_101DataManager -from hpobench.util import rng_helper - from nasbench_analysis.search_spaces.search_space_1 import SearchSpace1 # noqa from nasbench_analysis.search_spaces.search_space_2 import SearchSpace2 # noqa from nasbench_analysis.search_spaces.search_space_3 import SearchSpace3 # noqa from nasbench_analysis.utils import INPUT, OUTPUT, CONV1X1, CONV3X3, MAXPOOL3X3 # noqa -__version__ = '0.0.4' +from hpobench.abstract_benchmark import AbstractSingleObjectiveBenchmark, AbstractMultiObjectiveBenchmark +from hpobench.util import rng_helper +from hpobench.util.data_manager import NASBench_101DataManager + +__version__ = '0.0.5' logger = logging.getLogger('NasBench1shot1') -class NASBench1shot1BaseBenchmark(AbstractBenchmark): +class _NASBench1shot1BaseBenchmark: + def __init__(self, data_path: Union[Path, str, None] = None, rng: Union[np.random.RandomState, int, None] = None): """ - Baseclass for the nasbench 1shot1 benchmarks. + Baseclass for the all nasbench 1shot1 benchmarks. Please install the benchmark first. Place the data under ``data_path``. 
Parameters @@ -99,18 +101,18 @@ def __init__(self, data_path: Union[Path, str, None] = None, rng : np.random.RandomState, int, None Random seed for the benchmarks """ - super(NASBench1shot1BaseBenchmark, self).__init__(rng=rng) + data_manager = NASBench_101DataManager(data_path) self.api = data_manager.load() self.search_space = None - - # pylint: disable=arguments-differ - @AbstractBenchmark.check_parameters - def objective_function(self, configuration: Union[CS.Configuration, Dict], - fidelity: Union[CS.Configuration, Dict, None] = None, - run_index: Union[int, Tuple, List, None] = (0, 1, 2), - rng: Union[np.random.RandomState, int, None] = None, - **kwargs) -> Dict: + self.rng = rng + super(_NASBench1shot1BaseBenchmark, self).__init__(rng=rng) + + def _mo_objective_function(self, configuration: Union[CS.Configuration, Dict], + fidelity: Union[CS.Configuration, Dict, None] = None, + run_index: Union[int, Tuple, List, None] = (0, 1, 2), + rng: Union[np.random.RandomState, int, None] = None, + **kwargs) -> Dict: """ Query the NAS1shot1-benchmark using a given configuration and an epoch (=budget). Only data for the budgets 4, 12, 36, 108 are available. @@ -171,7 +173,8 @@ def objective_function(self, configuration: Union[CS.Configuration, Dict], 'module_operations': data['module_operations']} failure = failure or ('info' in data and data['info'] == 'failure') - return {'function_value': float(1 - np.mean(valid_accuracies)), + return {'function_value': {'misclassification_rate': float(1 - np.mean(valid_accuracies)), + 'trainable_parameters': additional['trainable_parameters']}, 'cost': float(np.sum(training_times)), 'info': {'fidelity': fidelity, 'train_accuracies': train_accuracies, @@ -179,50 +182,24 @@ def objective_function(self, configuration: Union[CS.Configuration, Dict], 'test_accuracies': test_accuracies, 'training_times': training_times, 'data': additional, - 'failure': 'False' if not failure else 'True' + 'failure': 0 if not failure else 1 } } - @AbstractBenchmark.check_parameters - def objective_function_test(self, configuration: Union[Dict, CS.Configuration], - fidelity: Union[CS.Configuration, Dict, None] = None, - rng: Union[np.random.RandomState, int, None] = None, - **kwargs) -> Dict: - """ - Validate a configuration on the maximum available budget (108) and on all three seeds. - - Parameters - ---------- - configuration : Dict, CS.Configuration - fidelity: Dict, None - Fidelity parameters, check get_fidelity_space(). Uses default (max) value if None. - rng : np.random.RandomState, int, None - Random seed to use in the benchmark. To prevent overfitting on a single seed, it is - possible to pass a parameter ``rng`` as 'int' or 'np.random.RandomState' to this - function. If this parameter is not given, the default random state is used. - kwargs + def _mo_objective_function_test(self, configuration: Union[CS.Configuration, Dict], + fidelity: Union[CS.Configuration, Dict, None] = None, + rng: Union[np.random.RandomState, int, None] = None, + **kwargs) -> Dict: - Returns - ------- - Dict - - function_value : test error on largest fidelity. - cost : runtime - info : Dict - train_accuracies - test_accuracies - valid_accuracies - training_times - fidelity : used fidelities in this evaluation - data : additional data such as trainable parameters and used operations - """ assert fidelity['budget'] == 108, 'Only test data for the 108th epoch is available.' 
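Because `_mo_objective_function` now returns one value per objective, the single-objective and multi-objective wrappers introduced further below differ mainly in the shape of `function_value`. A rough sketch of that difference, assuming the containerized 1shot1 benchmarks (the sampled configuration is a placeholder):

```python
# Hedged sketch: SO wrappers return a float, MO wrappers a dict of objectives.
from hpobench.container.benchmarks.nas.nasbench_1shot1 import (
    NASBench1shot1SearchSpace1Benchmark,    # single-objective wrapper
    NASBench1shot1SearchSpace1MOBenchmark,  # multi-objective wrapper
)

so_bench = NASBench1shot1SearchSpace1Benchmark(rng=0)
mo_bench = NASBench1shot1SearchSpace1MOBenchmark(rng=0)
config = so_bench.get_configuration_space(seed=0).sample_configuration()

so_res = so_bench.objective_function(configuration=config, fidelity={'budget': 108})
mo_res = mo_bench.objective_function(configuration=config, fidelity={'budget': 108})

print(so_res['function_value'])                             # float: misclassification rate
print(mo_res['function_value']['misclassification_rate'])   # same objective ...
print(mo_res['function_value']['trainable_parameters'])     # ... plus the second one
```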
- result = self.objective_function(configuration=configuration, fidelity=fidelity, run_index=(0, 1, 2), rng=rng) - result['function_value'] = float(1 - np.mean(result['info']['test_accuracies'])) + result = self._mo_objective_function(configuration=configuration, fidelity=fidelity, + run_index=(0, 1, 2), rng=rng, **kwargs) + result['function_value']['misclassification_rate'] = float(1 - np.mean(result['info']['test_accuracies'])) return result @staticmethod def get_configuration_space(seed: Union[int, None] = None) -> CS.ConfigurationSpace: - raise NotImplementedError + raise NotImplementedError() @staticmethod def get_fidelity_space(seed: Union[int, None] = None) -> CS.ConfigurationSpace: @@ -264,7 +241,6 @@ def get_meta_information() -> Dict: } def _check_run_index(self, run_index): - if isinstance(run_index, int): assert 0 <= run_index <= 2, f'run_index must be in [0, 2], not {run_index}' run_index = (run_index, ) @@ -426,7 +402,223 @@ def _get_configuration_space(search_space: Any, seed: Union[int, None] = None) - return cs -class NASBench1shot1SearchSpace1Benchmark(NASBench1shot1BaseBenchmark): +class NASBench1shot1BaseMOBenchmark(_NASBench1shot1BaseBenchmark, AbstractMultiObjectiveBenchmark): + + # pylint: disable=arguments-differ + @AbstractMultiObjectiveBenchmark.check_parameters + def objective_function(self, configuration: Union[CS.Configuration, Dict], + fidelity: Union[CS.Configuration, Dict, None] = None, + run_index: Union[int, Tuple, List, None] = (0, 1, 2), + rng: Union[np.random.RandomState, int, None] = None, + **kwargs) -> Dict: + """ + Query the NAS1shot1-benchmark using a given configuration and an epoch (=budget). + Only data for the budgets 4, 12, 36, 108 are available. + + Parameters + ---------- + configuration : Dict, CS.Configuration + fidelity: Dict, None + Fidelity parameters, check get_fidelity_space(). Uses default (max) value if None. + run_index : int, Tuple, None + The nas benchmark has for each configuration-budget-pair results from 3 different runs. + - If multiple `run_id`s are given as Tuple/List, the benchmark returns the mean over the given runs. + - By default (no parameter is specified) all runs are used. A specific run can be chosen by setting the + `run_id` to a value from [0, 3]. While the performance is averaged across the `run_index`, the costs are + the sum of the runtime per `run_index`. + - When this value is explicitly set to `None`, the function will use a random seed. + rng : np.random.RandomState, int, None + Random seed to use in the benchmark. + + To prevent overfitting on a single seed, it is possible to pass a + parameter ``rng`` as 'int' or 'np.random.RandomState' to this function. + If this parameter is not given, the default random state is used. + kwargs + + Returns + ------- + Dict - + function_value : validation error + cost : runtime + info : Dict + train_accuracies + test_accuracies + valid_accuracies + training_times + fidelity : used fidelities in this evaluation + data : additional data such as trainable parameters and used operations + """ + return self._mo_objective_function(configuration=configuration, fidelity=fidelity, + run_index=run_index, rng=rng, **kwargs) + + @AbstractMultiObjectiveBenchmark.check_parameters + def objective_function_test(self, configuration: Union[Dict, CS.Configuration], + fidelity: Union[CS.Configuration, Dict, None] = None, + rng: Union[np.random.RandomState, int, None] = None, + **kwargs) -> Dict: + """ + Validate a configuration on the maximum available budget (108) and on all three seeds. 
+ + Parameters + ---------- + configuration : Dict, CS.Configuration + fidelity: Dict, None + Fidelity parameters, check get_fidelity_space(). Uses default (max) value if None. + rng : np.random.RandomState, int, None + Random seed to use in the benchmark. To prevent overfitting on a single seed, it is + possible to pass a parameter ``rng`` as 'int' or 'np.random.RandomState' to this + function. If this parameter is not given, the default random state is used. + kwargs + + Returns + ------- + Dict - + function_value : test error on largest fidelity. + cost : runtime + info : Dict + train_accuracies + test_accuracies + valid_accuracies + training_times + fidelity : used fidelities in this evaluation + data : additional data such as trainable parameters and used operations + """ + return self._mo_objective_function_test(configuration=configuration, fidelity=fidelity, + rng=rng, **kwargs) + + @staticmethod + def get_objective_names() -> List[str]: + return ['misclassification_rate', 'trainable_parameters'] + + +class NASBench1shot1BaseSOBenchmark(_NASBench1shot1BaseBenchmark, AbstractSingleObjectiveBenchmark): + + # pylint: disable=arguments-differ + @AbstractSingleObjectiveBenchmark.check_parameters + def objective_function(self, configuration: Union[CS.Configuration, Dict], + fidelity: Union[CS.Configuration, Dict, None] = None, + run_index: Union[int, Tuple, List, None] = (0, 1, 2), + rng: Union[np.random.RandomState, int, None] = None, + **kwargs) -> Dict: + """ + Query the NAS1shot1-benchmark using a given configuration and an epoch (=budget). + Only data for the budgets 4, 12, 36, 108 are available. + + Parameters + ---------- + configuration : Dict, CS.Configuration + fidelity: Dict, None + Fidelity parameters, check get_fidelity_space(). Uses default (max) value if None. + run_index : int, Tuple, None + The nas benchmark has for each configuration-budget-pair results from 3 different runs. + - If multiple `run_id`s are given as Tuple/List, the benchmark returns the mean over the given runs. + - By default (no parameter is specified) all runs are used. A specific run can be chosen by setting the + `run_id` to a value from [0, 3]. While the performance is averaged across the `run_index`, the costs are + the sum of the runtime per `run_index`. + - When this value is explicitly set to `None`, the function will use a random seed. + rng : np.random.RandomState, int, None + Random seed to use in the benchmark. + + To prevent overfitting on a single seed, it is possible to pass a + parameter ``rng`` as 'int' or 'np.random.RandomState' to this function. + If this parameter is not given, the default random state is used. 
+ kwargs + + Returns + ------- + Dict - + function_value : validation error + cost : runtime + info : Dict + train_accuracies + test_accuracies + valid_accuracies + training_times + fidelity : used fidelities in this evaluation + data : additional data such as trainable parameters and used operations + """ + result = self._mo_objective_function( + configuration=configuration, fidelity=fidelity, rng=rng, run_index=run_index, **kwargs + ) + result['info'].update(result['function_value']) + result['function_value'] = result['function_value']['misclassification_rate'] + return result + + @AbstractSingleObjectiveBenchmark.check_parameters + def objective_function_test(self, configuration: Union[Dict, CS.Configuration], + fidelity: Union[CS.Configuration, Dict, None] = None, + rng: Union[np.random.RandomState, int, None] = None, + **kwargs) -> Dict: + """ + Validate a configuration on the maximum available budget (108) and on all three seeds. + + Parameters + ---------- + configuration : Dict, CS.Configuration + fidelity: Dict, None + Fidelity parameters, check get_fidelity_space(). Uses default (max) value if None. + rng : np.random.RandomState, int, None + Random seed to use in the benchmark. To prevent overfitting on a single seed, it is + possible to pass a parameter ``rng`` as 'int' or 'np.random.RandomState' to this + function. If this parameter is not given, the default random state is used. + kwargs + + Returns + ------- + Dict - + function_value : test error on largest fidelity. + cost : runtime + info : Dict + train_accuracies + test_accuracies + valid_accuracies + training_times + fidelity : used fidelities in this evaluation + data : additional data such as trainable parameters and used operations + """ + + result = self._mo_objective_function_test( + configuration=configuration, fidelity=fidelity, rng=rng, **kwargs + ) + result['info'].update(result['function_value']) + result['function_value'] = result['function_value']['misclassification_rate'] + return result + + +class NASBench1shot1SearchSpace1MOBenchmark(NASBench1shot1BaseMOBenchmark): + def __init__(self, data_path: Union[Path, str, None] = None, + rng: Union[np.random.RandomState, int, None] = None): + super(NASBench1shot1SearchSpace1MOBenchmark, self).__init__(data_path=data_path, rng=rng) + self.search_space = SearchSpace1() + + @staticmethod + def get_configuration_space(seed: Union[int, None] = None) -> CS.ConfigurationSpace: + return _NASBench1shot1BaseBenchmark._get_configuration_space(SearchSpace1(), seed) + + +class NASBench1shot1SearchSpace2MOBenchmark(NASBench1shot1BaseMOBenchmark): + def __init__(self, data_path: Union[Path, str, None] = None, + rng: Union[np.random.RandomState, int, None] = None): + super(NASBench1shot1SearchSpace2MOBenchmark, self).__init__(data_path=data_path, rng=rng) + self.search_space = SearchSpace2() + + @staticmethod + def get_configuration_space(seed: Union[int, None] = None) -> CS.ConfigurationSpace: + return _NASBench1shot1BaseBenchmark._get_configuration_space(SearchSpace2(), seed) + + +class NASBench1shot1SearchSpace3MOBenchmark(NASBench1shot1BaseMOBenchmark): + def __init__(self, data_path: Union[Path, str, None] = None, + rng: Union[np.random.RandomState, int, None] = None): + super(NASBench1shot1SearchSpace3MOBenchmark, self).__init__(data_path=data_path, rng=rng) + self.search_space = SearchSpace3() + + @staticmethod + def get_configuration_space(seed: Union[int, None] = None) -> CS.ConfigurationSpace: + return _NASBench1shot1BaseBenchmark._get_configuration_space(SearchSpace3(), 
seed) + + +class NASBench1shot1SearchSpace1Benchmark(NASBench1shot1BaseSOBenchmark): def __init__(self, data_path: Union[Path, str, None] = None, rng: Union[np.random.RandomState, int, None] = None): super(NASBench1shot1SearchSpace1Benchmark, self).__init__(data_path=data_path, rng=rng) @@ -434,10 +626,10 @@ def __init__(self, data_path: Union[Path, str, None] = None, @staticmethod def get_configuration_space(seed: Union[int, None] = None) -> CS.ConfigurationSpace: - return NASBench1shot1BaseBenchmark._get_configuration_space(SearchSpace1(), seed) + return _NASBench1shot1BaseBenchmark._get_configuration_space(SearchSpace1(), seed) -class NASBench1shot1SearchSpace2Benchmark(NASBench1shot1BaseBenchmark): +class NASBench1shot1SearchSpace2Benchmark(NASBench1shot1BaseSOBenchmark): def __init__(self, data_path: Union[Path, str, None] = None, rng: Union[np.random.RandomState, int, None] = None): super(NASBench1shot1SearchSpace2Benchmark, self).__init__(data_path=data_path, rng=rng) @@ -445,10 +637,10 @@ def __init__(self, data_path: Union[Path, str, None] = None, @staticmethod def get_configuration_space(seed: Union[int, None] = None) -> CS.ConfigurationSpace: - return NASBench1shot1BaseBenchmark._get_configuration_space(SearchSpace2(), seed) + return _NASBench1shot1BaseBenchmark._get_configuration_space(SearchSpace2(), seed) -class NASBench1shot1SearchSpace3Benchmark(NASBench1shot1BaseBenchmark): +class NASBench1shot1SearchSpace3Benchmark(NASBench1shot1BaseSOBenchmark): def __init__(self, data_path: Union[Path, str, None] = None, rng: Union[np.random.RandomState, int, None] = None): super(NASBench1shot1SearchSpace3Benchmark, self).__init__(data_path=data_path, rng=rng) @@ -456,4 +648,14 @@ def __init__(self, data_path: Union[Path, str, None] = None, @staticmethod def get_configuration_space(seed: Union[int, None] = None) -> CS.ConfigurationSpace: - return NASBench1shot1BaseBenchmark._get_configuration_space(SearchSpace3(), seed) + return _NASBench1shot1BaseBenchmark._get_configuration_space(SearchSpace3(), seed) + + +__all__ = [ + "NASBench1shot1SearchSpace1Benchmark", + "NASBench1shot1SearchSpace2Benchmark", + "NASBench1shot1SearchSpace3Benchmark", + "NASBench1shot1SearchSpace1MOBenchmark", + "NASBench1shot1SearchSpace3MOBenchmark", + "NASBench1shot1SearchSpace3MOBenchmark", +] diff --git a/hpobench/benchmarks/nas/tabular_benchmarks.py b/hpobench/benchmarks/nas/tabular_benchmarks.py index fd7404a0..5db34f2f 100644 --- a/hpobench/benchmarks/nas/tabular_benchmarks.py +++ b/hpobench/benchmarks/nas/tabular_benchmarks.py @@ -50,7 +50,6 @@ * First implementation """ import logging - from pathlib import Path from typing import Union, Dict, Tuple, List diff --git a/hpobench/container/benchmarks/nas/nasbench_101.py b/hpobench/container/benchmarks/nas/nasbench_101.py index 7984d786..a47e96a2 100644 --- a/hpobench/container/benchmarks/nas/nasbench_101.py +++ b/hpobench/container/benchmarks/nas/nasbench_101.py @@ -3,14 +3,14 @@ """ Benchmark for the Tabular Benchmark from hpobench/benchmarks/nas/nasbench_101.py """ -from hpobench.container.client_abstract_benchmark import AbstractBenchmarkClient +from hpobench.container.client_abstract_benchmark import AbstractBenchmarkClient, AbstractMOBenchmarkClient class NASCifar10ABenchmark(AbstractBenchmarkClient): def __init__(self, **kwargs): kwargs['benchmark_name'] = kwargs.get('benchmark_name', 'NASCifar10ABenchmark') kwargs['container_name'] = kwargs.get('container_name', 'nasbench_101') - kwargs['latest'] = kwargs.get('container_tag', '0.0.4') + 
kwargs['latest'] = kwargs.get('container_tag', '0.0.5') super(NASCifar10ABenchmark, self).__init__(**kwargs) @@ -18,7 +18,7 @@ class NASCifar10BBenchmark(AbstractBenchmarkClient): def __init__(self, **kwargs): kwargs['benchmark_name'] = kwargs.get('benchmark_name', 'NASCifar10BBenchmark') kwargs['container_name'] = kwargs.get('container_name', 'nasbench_101') - kwargs['latest'] = kwargs.get('container_tag', '0.0.4') + kwargs['latest'] = kwargs.get('container_tag', '0.0.5') super(NASCifar10BBenchmark, self).__init__(**kwargs) @@ -26,5 +26,29 @@ class NASCifar10CBenchmark(AbstractBenchmarkClient): def __init__(self, **kwargs): kwargs['benchmark_name'] = kwargs.get('benchmark_name', 'NASCifar10CBenchmark') kwargs['container_name'] = kwargs.get('container_name', 'nasbench_101') - kwargs['latest'] = kwargs.get('container_tag', '0.0.4') + kwargs['latest'] = kwargs.get('container_tag', '0.0.5') super(NASCifar10CBenchmark, self).__init__(**kwargs) + + +class NASCifar10AMOBenchmark(AbstractMOBenchmarkClient): + def __init__(self, **kwargs): + kwargs['benchmark_name'] = kwargs.get('benchmark_name', 'NASCifar10AMOBenchmark') + kwargs['container_name'] = kwargs.get('container_name', 'nasbench_101') + kwargs['latest'] = kwargs.get('container_tag', '0.0.5') + super(NASCifar10AMOBenchmark, self).__init__(**kwargs) + + +class NASCifar10BMOBenchmark(AbstractMOBenchmarkClient): + def __init__(self, **kwargs): + kwargs['benchmark_name'] = kwargs.get('benchmark_name', 'NASCifar10BMOBenchmark') + kwargs['container_name'] = kwargs.get('container_name', 'nasbench_101') + kwargs['latest'] = kwargs.get('container_tag', '0.0.5') + super(NASCifar10BMOBenchmark, self).__init__(**kwargs) + + +class NASCifar10CMOBenchmark(AbstractMOBenchmarkClient): + def __init__(self, **kwargs): + kwargs['benchmark_name'] = kwargs.get('benchmark_name', 'NASCifar10CMOBenchmark') + kwargs['container_name'] = kwargs.get('container_name', 'nasbench_101') + kwargs['latest'] = kwargs.get('container_tag', '0.0.5') + super(NASCifar10CMOBenchmark, self).__init__(**kwargs) diff --git a/hpobench/container/benchmarks/nas/nasbench_1shot1.py b/hpobench/container/benchmarks/nas/nasbench_1shot1.py index a88dcf9a..bef0bf16 100644 --- a/hpobench/container/benchmarks/nas/nasbench_1shot1.py +++ b/hpobench/container/benchmarks/nas/nasbench_1shot1.py @@ -3,14 +3,14 @@ """ Benchmark for the nasbench 1shot1 benchmarks from hpobench/benchmarks/nas/nasbench_1shot1.py """ -from hpobench.container.client_abstract_benchmark import AbstractBenchmarkClient +from hpobench.container.client_abstract_benchmark import AbstractBenchmarkClient, AbstractMOBenchmarkClient class NASBench1shot1SearchSpace1Benchmark(AbstractBenchmarkClient): def __init__(self, **kwargs): kwargs['benchmark_name'] = kwargs.get('benchmark_name', 'NASBench1shot1SearchSpace1Benchmark') kwargs['container_name'] = kwargs.get('container_name', 'nasbench_1shot1') - kwargs['latest'] = kwargs.get('container_tag', '0.0.4') + kwargs['latest'] = kwargs.get('container_tag', '0.0.5') super(NASBench1shot1SearchSpace1Benchmark, self).__init__(**kwargs) @@ -18,7 +18,7 @@ class NASBench1shot1SearchSpace2Benchmark(AbstractBenchmarkClient): def __init__(self, **kwargs): kwargs['benchmark_name'] = kwargs.get('benchmark_name', 'NASBench1shot1SearchSpace2Benchmark') kwargs['container_name'] = kwargs.get('container_name', 'nasbench_1shot1') - kwargs['latest'] = kwargs.get('container_tag', '0.0.4') + kwargs['latest'] = kwargs.get('container_tag', '0.0.5') super(NASBench1shot1SearchSpace2Benchmark, 
self).__init__(**kwargs) @@ -26,5 +26,29 @@ class NASBench1shot1SearchSpace3Benchmark(AbstractBenchmarkClient): def __init__(self, **kwargs): kwargs['benchmark_name'] = kwargs.get('benchmark_name', 'NASBench1shot1SearchSpace3Benchmark') kwargs['container_name'] = kwargs.get('container_name', 'nasbench_1shot1') - kwargs['latest'] = kwargs.get('container_tag', '0.0.4') + kwargs['latest'] = kwargs.get('container_tag', '0.0.5') super(NASBench1shot1SearchSpace3Benchmark, self).__init__(**kwargs) + + +class NASBench1shot1SearchSpace1MOBenchmark(AbstractMOBenchmarkClient): + def __init__(self, **kwargs): + kwargs['benchmark_name'] = kwargs.get('benchmark_name', 'NASBench1shot1SearchSpace1MOBenchmark') + kwargs['container_name'] = kwargs.get('container_name', 'nasbench_1shot1') + kwargs['latest'] = kwargs.get('container_tag', '0.0.5') + super(NASBench1shot1SearchSpace1MOBenchmark, self).__init__(**kwargs) + + +class NASBench1shot1SearchSpace2MOBenchmark(AbstractMOBenchmarkClient): + def __init__(self, **kwargs): + kwargs['benchmark_name'] = kwargs.get('benchmark_name', 'NASBench1shot1SearchSpace2MOBenchmark') + kwargs['container_name'] = kwargs.get('container_name', 'nasbench_1shot1') + kwargs['latest'] = kwargs.get('container_tag', '0.0.5') + super(NASBench1shot1SearchSpace2MOBenchmark, self).__init__(**kwargs) + + +class NASBench1shot1SearchSpace3MOBenchmark(AbstractMOBenchmarkClient): + def __init__(self, **kwargs): + kwargs['benchmark_name'] = kwargs.get('benchmark_name', 'NASBench1shot1SearchSpace3MOBenchmark') + kwargs['container_name'] = kwargs.get('container_name', 'nasbench_1shot1') + kwargs['latest'] = kwargs.get('container_tag', '0.0.5') + super(NASBench1shot1SearchSpace3MOBenchmark, self).__init__(**kwargs) diff --git a/tests/test_nasbench_101.py b/tests/test_nasbench_101.py new file mode 100644 index 00000000..67ac7f65 --- /dev/null +++ b/tests/test_nasbench_101.py @@ -0,0 +1,82 @@ +import pytest +import numpy as np + +from hpobench.container.benchmarks.nas.nasbench_101 import ( + NASCifar10ABenchmark, NASCifar10BBenchmark, NASCifar10CBenchmark, + NASCifar10AMOBenchmark, NASCifar10BMOBenchmark, NASCifar10CMOBenchmark, +) + +from hpobench.util.container_utils import disable_container_debug, enable_container_debug +from hpobench.util.test_utils import DEFAULT_SKIP_MSG, check_run_all_tests + +# from hpobench.util.test_utils import enable_all_tests +# enable_all_tests() + + +@pytest.fixture(scope='module') +def enable_debug(): + enable_container_debug() + yield + disable_container_debug() + + +@pytest.mark.skipif(not check_run_all_tests(), reason=DEFAULT_SKIP_MSG) +def test_nasbench101_A_SO(enable_debug): + + b = NASCifar10ABenchmark(rng=0) + cs_1 = b.get_configuration_space(seed=0) + config_1 = cs_1.sample_configuration() + cs_2 = b.get_configuration_space(seed=0) + config_2 = cs_2.sample_configuration() + assert config_1 == config_2 + + assert len(b.get_fidelity_space()) == 1 + + config = { + 'edge_0': 0, 'edge_1': 0, 'edge_10': 0, 'edge_11': 1, 'edge_12': 1, 'edge_13': 0, 'edge_14': 1, 'edge_15': 0, + 'edge_16': 0, 'edge_17': 1, 'edge_18': 1, 'edge_19': 0, 'edge_2': 0, 'edge_20': 1, 'edge_3': 0, 'edge_4': 0, + 'edge_5': 1, 'edge_6': 1, 'edge_7': 0, 'edge_8': 0, 'edge_9': 0, 'op_node_0': 'maxpool3x3', + 'op_node_1': 'conv1x1-bn-relu', 'op_node_2': 'conv3x3-bn-relu', 'op_node_3': 'conv3x3-bn-relu', + 'op_node_4': 'conv3x3-bn-relu' + } + + result = b.objective_function(configuration=config, fidelity={'budget': 108}, run_index=(0, 1, 2)) + assert result['function_value'] == 
pytest.approx(0.1659655372301737, abs=0.1) + assert result['cost'] == pytest.approx(853.5010070800781, abs=0.1) + assert 1 - np.mean(result['info']['valid_accuracies']) == result['function_value'] + + with pytest.raises(AssertionError): + result = b.objective_function_test(configuration=config, fidelity={'epoch': 109}) + + +@pytest.mark.skipif(not check_run_all_tests(), reason=DEFAULT_SKIP_MSG) +def test_nasbench101_C_MO(enable_debug): + b = NASCifar10CMOBenchmark(rng=0) + cs_1 = b.get_configuration_space(seed=0) + config_1 = cs_1.sample_configuration() + cs_2 = b.get_configuration_space(seed=0) + config_2 = cs_2.sample_configuration() + assert config_1 == config_2 + + assert len(b.get_fidelity_space()) == 1 + + config = { + 'edge_0': 0.9446689170495839, 'edge_1': 0.1289262976548533, 'edge_10': 0.09710127579306127, + 'edge_11': 0.09394051075844168, 'edge_12': 0.5722519057908734, 'edge_13': 0.30157481667454933, + 'edge_14': 0.9194826137446735, 'edge_15': 0.3599780644783639, 'edge_16': 0.589909976354571, + 'edge_17': 0.4536968445560453, 'edge_18': 0.21550767711355845, 'edge_19': 0.18327983621407862, + 'edge_2': 0.5864101661863267, 'edge_20': 0.47837030703998806, 'edge_3': 0.05342718178682526, + 'edge_4': 0.6956254456388572, 'edge_5': 0.3068100995451961, 'edge_6': 0.399025321703102, + 'edge_7': 0.15941446344895593, 'edge_8': 0.23274412927905685, 'edge_9': 0.0653042071517802, 'num_edges': 9, + 'op_node_0': 'conv1x1-bn-relu', 'op_node_1': 'maxpool3x3', 'op_node_2': 'conv1x1-bn-relu', + 'op_node_3': 'maxpool3x3', 'op_node_4': 'maxpool3x3' + } + + result = b.objective_function(configuration=config, fidelity={'budget': 108}, run_index=(0, 1, 2)) + assert result['function_value']['misclassification_rate'] == pytest.approx(0.11985842386881507, abs=0.1) + assert result['function_value']['trainable_parameters'] == 1115277 + assert result['cost'] == pytest.approx(3175.9591064453125, abs=0.1) + assert 1 - np.mean(result['info']['valid_accuracies']) == result['function_value']['misclassification_rate'] + + with pytest.raises(AssertionError): + result = b.objective_function_test(configuration=config, fidelity={'epoch': 109}) From 47f8b5b08b565c65ac95fa2cd8fc28fad71b8c50 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Philipp=20M=C3=BCller?= Date: Thu, 15 Dec 2022 14:34:11 +0100 Subject: [PATCH 145/147] MO CNN and MO Adult v0.0.2 (#167) Update to 0.0.2 --- hpobench/benchmarks/mo/adult_benchmark.py | 12 +++++++----- hpobench/benchmarks/mo/cnn_benchmark.py | 15 +++++++++------ .../container/benchmarks/mo/adult_benchmark.py | 2 +- hpobench/container/benchmarks/mo/cnn_benchmark.py | 4 ++-- tests/test_adult.py | 6 +++--- tests/test_mo_cnn.py | 1 + 6 files changed, 23 insertions(+), 17 deletions(-) diff --git a/hpobench/benchmarks/mo/adult_benchmark.py b/hpobench/benchmarks/mo/adult_benchmark.py index a12e8a70..30631cae 100644 --- a/hpobench/benchmarks/mo/adult_benchmark.py +++ b/hpobench/benchmarks/mo/adult_benchmark.py @@ -1,6 +1,8 @@ """ Changelog: ========== +0.0.2: +* Change the objective value from accuracy to misclassification rate. (1 - accuracy) 0.0.1: * First implementation of the Multi-Objective Fair Adult Benchmark. @@ -127,7 +129,7 @@ def get_meta_information() -> Dict: @staticmethod def get_objective_names() -> List[str]: """Get a list of objectives evaluated in the objective_function. 
""" - return ['accuracy', 'DSP', 'DEO', 'DFP'] + return ['misclassification_rate', 'DSP', 'DEO', 'DFP'] @AbstractMultiObjectiveBenchmark.check_parameters def objective_function(self, configuration: Union[CS.Configuration, Dict], @@ -165,7 +167,7 @@ def objective_function(self, configuration: Union[CS.Configuration, Dict], ------- Dict - function_value : Dict - validation metrics after training on train - accuracy: float + misclassification_rate: float: 1 - validation accuracy DSO: float DEO: float DFP: float @@ -247,7 +249,7 @@ def objective_function(self, configuration: Union[CS.Configuration, Dict], elapsed_time = time.time() - ts_start - return {'function_value': {'accuracy': float(val_accuracy), + return {'function_value': {'misclassification_rate': 1 - float(val_accuracy), 'DSO': float(val_statistical_disparity), 'DEO': float(val_unequal_opportunity), 'DFP': float(val_unequalized_odds) @@ -310,7 +312,7 @@ def objective_function_test(self, configuration: Union[CS.Configuration, Dict], ------- Dict - function_value : Dict - test metrics reported after training on (train+valid) - accuracy: float + misclassification_rate: float: 1 - test accuracy DSO: float DEO: float DFP: float @@ -381,7 +383,7 @@ def objective_function_test(self, configuration: Union[CS.Configuration, Dict], logger.debug(f"config:{configuration}, test_score: {test_accuracy}, train score:{train_accuracy}," f"dsp:{test_statistical_disparity}, deo :{test_unequal_opportunity}, dfp :{test_unequalized_odds}") - return {'function_value': {'accuracy': float(test_accuracy), + return {'function_value': {'misclassification_rate': 1 - float(test_accuracy), 'DSO': float(test_statistical_disparity), 'DEO': float(test_unequal_opportunity), 'DFP': float(test_unequalized_odds) diff --git a/hpobench/benchmarks/mo/cnn_benchmark.py b/hpobench/benchmarks/mo/cnn_benchmark.py index d8bfd939..516b459a 100644 --- a/hpobench/benchmarks/mo/cnn_benchmark.py +++ b/hpobench/benchmarks/mo/cnn_benchmark.py @@ -1,6 +1,9 @@ """ Changelog: ========== +0.0.2: +* Rename the returned function value + 'negative_accuracy' -> 'misclassification_rate' 0.0.1: * First implementation of the Multi-Objective CNN Benchmark. 
@@ -22,7 +25,7 @@ from hpobench.abstract_benchmark import AbstractMultiObjectiveBenchmark from hpobench.util.data_manager import CNNDataManager -__version__ = '0.0.1' +__version__ = '0.0.2' logger = logging.getLogger('MO_CNN') @@ -284,7 +287,7 @@ def get_meta_information() -> Dict: @staticmethod def get_objective_names() -> List[str]: """Get the names of the objectives reported in the objective function.""" - return ['accuracy', 'model_size'] + return ['misclassification_rate', 'model_size'] def init_model(self, config: Union[CS.Configuration, Dict]) -> Net: """ @@ -361,7 +364,7 @@ def objective_function(self, configuration: Union[CS.Configuration, Dict], ------- Dict - function_value : Dict - negative_accuracy: float + misclassification_rate: float 1 - validation accuracy log_model_size: float log10 of the number of parameters @@ -435,7 +438,7 @@ def objective_function(self, configuration: Union[CS.Configuration, Dict], elapsed_time = time.time() - time_in - return {'function_value': {'negative_accuracy': 1 - val_accuracy, + return {'function_value': {'misclassification_rate': 1 - val_accuracy, 'log_model_size': float(np.log10(num_params))}, 'cost': float(training_runtime), 'info': {'train_accuracy': train_accuracy, @@ -479,7 +482,7 @@ def objective_function_test(self, configuration: Union[CS.Configuration, Dict], ------- Dict - function_value : Dict - negative_accuracy: float + misclassification_rate: float 1 - test accuracy log_model_size: float log10 of the number of parameters @@ -546,7 +549,7 @@ def objective_function_test(self, configuration: Union[CS.Configuration, Dict], elapsed_time = time.time() - time_in - return {'function_value': {'negative_accuracy': 1 - test_accuracy, + return {'function_value': {'misclassification_rate': 1 - test_accuracy, 'log_model_size': float(np.log10(num_params))}, 'cost': training_runtime, 'info': {'train_accuracy': train_accuracy, diff --git a/hpobench/container/benchmarks/mo/adult_benchmark.py b/hpobench/container/benchmarks/mo/adult_benchmark.py index dbdcaf4d..34baf1b9 100644 --- a/hpobench/container/benchmarks/mo/adult_benchmark.py +++ b/hpobench/container/benchmarks/mo/adult_benchmark.py @@ -8,5 +8,5 @@ class AdultBenchmark(AbstractBenchmarkClient): def __init__(self, **kwargs): kwargs['benchmark_name'] = kwargs.get('benchmark_name', 'AdultBenchmark') kwargs['container_name'] = kwargs.get('container_name', 'fair_adult') - kwargs['latest'] = kwargs.get('container_tag', '0.0.1') + kwargs['latest'] = kwargs.get('container_tag', '0.0.2') super(AdultBenchmark, self).__init__(**kwargs) diff --git a/hpobench/container/benchmarks/mo/cnn_benchmark.py b/hpobench/container/benchmarks/mo/cnn_benchmark.py index c9a1d009..9e5cfe6f 100644 --- a/hpobench/container/benchmarks/mo/cnn_benchmark.py +++ b/hpobench/container/benchmarks/mo/cnn_benchmark.py @@ -8,7 +8,7 @@ class FlowerCNNBenchmark(AbstractMOBenchmarkClient): def __init__(self, **kwargs): kwargs['benchmark_name'] = kwargs.get('benchmark_name', 'FlowerCNNBenchmark') kwargs['container_name'] = kwargs.get('container_name', 'mo_cnn') - kwargs['latest'] = kwargs.get('container_tag', '0.0.1') + kwargs['latest'] = kwargs.get('container_tag', '0.0.2') kwargs['gpu'] = kwargs.get('gpu', True) super(FlowerCNNBenchmark, self).__init__(**kwargs) @@ -17,6 +17,6 @@ class FashionCNNBenchmark(AbstractMOBenchmarkClient): def __init__(self, **kwargs): kwargs['benchmark_name'] = kwargs.get('benchmark_name', 'FashionCNNBenchmark') kwargs['container_name'] = kwargs.get('container_name', 'mo_cnn') - kwargs['latest'] = 
kwargs.get('container_tag', '0.0.1') + kwargs['latest'] = kwargs.get('container_tag', '0.0.2') kwargs['gpu'] = kwargs.get('gpu', True) super(FashionCNNBenchmark, self).__init__(**kwargs) diff --git a/tests/test_adult.py b/tests/test_adult.py index d7a030b7..b52c37ed 100644 --- a/tests/test_adult.py +++ b/tests/test_adult.py @@ -28,10 +28,10 @@ def test_adult_benchmark(): result_2 = benchmark.objective_function(test_config, rng=1, fidelity={'budget': 3}) assert result_1['info']['valid_accuracy'] == pytest.approx(0.7539, rel=0.001) - assert result_1['info']['valid_accuracy'] == result_1['function_value']['accuracy'] + assert 1 - result_1['info']['valid_accuracy'] == result_1['function_value']['misclassification_rate'] assert result_1['info']['train_accuracy'] == pytest.approx(0.76145, rel=0.001) assert result_1['info']['train_accuracy'] == result_2['info']['train_accuracy'] result_1 = benchmark.objective_function_test(test_config, rng=1, fidelity={'budget': 3}) - assert result_1['function_value']['accuracy'] == pytest.approx(0.76377, rel=0.001) - assert result_1['function_value']['accuracy'] == result_1['info']['test_accuracy'] + assert 1 - result_1['function_value']['misclassification_rate'] == pytest.approx(0.76377, rel=0.001) + assert 1 - result_1['function_value']['misclassification_rate'] == result_1['info']['test_accuracy'] diff --git a/tests/test_mo_cnn.py b/tests/test_mo_cnn.py index f721dfc3..cded9444 100644 --- a/tests/test_mo_cnn.py +++ b/tests/test_mo_cnn.py @@ -20,6 +20,7 @@ def test_mo_cnn_seeding(): assert result_1['function_value'][metric] == pytest.approx(result_2['function_value'][metric], abs=0.001) +@pytest.mark.skipif(not check_run_all_tests(), reason=DEFAULT_SKIP_MSG) def test_mo_cnn_benchmark(): from hpobench.container.benchmarks.mo.cnn_benchmark import FlowerCNNBenchmark From 46f719d83174cfabb3b257baded6ea0299a04cec Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Philipp=20M=C3=BCller?= Date: Wed, 22 Feb 2023 09:22:36 +0100 Subject: [PATCH 146/147] Towards compatibility with windows os. (#170) --- hpobench/config.py | 11 ++++++++++- 1 file changed, 10 insertions(+), 1 deletion(-) diff --git a/hpobench/config.py b/hpobench/config.py index 9d7964e0..cd46c6e5 100644 --- a/hpobench/config.py +++ b/hpobench/config.py @@ -64,7 +64,16 @@ def __init__(self): # Options for the singularity container self.socket_dir = Path(self.socket_dir).expanduser().absolute() - self.container_dir = self.cache_dir / f'hpobench-{os.getuid()}' + + # os.getuid is only for posix os. Make it compatible with windows + # https://stackoverflow.com/questions/842059/is-there-a-portable-way-to-get-the-current-username-in-python + if os.name == 'nt': + import getpass + user_name = getpass.getuser() + else: + user_name = os.getuid() + + self.container_dir = self.cache_dir / f'hpobench-{user_name}' self.container_source = 'oras://gitlab.tf.uni-freiburg.de:5050/muelleph/hpobench-registry' self.pyro_connect_max_wait = 400 From 236e54262fd11a288b8b501f5e1da923e421574c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Philipp=20M=C3=BCller?= Date: Wed, 22 Feb 2023 09:57:07 +0100 Subject: [PATCH 147/147] Raw YAHPO Benchmarks (#153) * Yahpo Raw: You can run fair/iml/rbv2 experiments from yahpo. The installation is quite complicated and described in the recipe. * Introduce YAHPO Surrogate V0.0.2 It also goes hand in hand with an update for the yahpo surrogate benchmarks. 
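To show what the new raw benchmarks look like from the Python side, here is a hedged usage sketch of the local (non-containerized) class added in this patch. The scenario/instance pair is a placeholder (valid instances are listed in `benchmark.benchset.instances`), and the R stack described in the module docstring below has to be installed first.

```python
# Hedged sketch of the raw YAHPO benchmark; 'iaml_ranger' / '40981' are placeholders.
from hpobench.benchmarks.ml.yahpo_benchmark import YAHPOGymMORawBenchmark

benchmark = YAHPOGymMORawBenchmark(scenario='iaml_ranger', instance='40981', rng=1)
config = benchmark.get_configuration_space(seed=1).sample_configuration()
fidelity = benchmark.get_fidelity_space(seed=1).sample_configuration()

result = benchmark.objective_function(configuration=config, fidelity=fidelity)
print(benchmark.get_objective_names())  # objectives defined by the YAHPO scenario
print(result['function_value'])         # dict with one value per objective
print(result['cost'])                   # training time reported by the R evaluation
```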
--------- Co-authored-by: ayushi-3536 <77584036+ayushi-3536@users.noreply.github.com> Co-authored-by: ayushi-3536 Co-authored-by: Dominik Woiwode --- extra_requirements/yahpo_gym.json | 3 +- hpobench/benchmarks/ml/__init__.py | 22 -- hpobench/benchmarks/ml/rbv2_benchmark.py | 0 hpobench/benchmarks/ml/yahpo_benchmark.py | 317 ++++++++++++++++++ hpobench/benchmarks/surrogates/yahpo_gym.py | 195 +++++++---- hpobench/container/benchmarks/ml/__init__.py | 5 +- .../benchmarks/ml/yahpo_benchmark.py | 21 ++ .../benchmarks/surrogates/yahpo_gym.py | 4 +- .../recipes/ml/Singularity.YahpoRawBenchmark | 82 +++++ .../recipes/ml/Singularity.rbv2Benchmark | 0 .../surrogates/Singularity.YAHPOGymBenchmark | 6 +- hpobench/util/data_manager.py | 40 +++ tests/test_yahpo_raw.py | 12 + 13 files changed, 621 insertions(+), 86 deletions(-) create mode 100644 hpobench/benchmarks/ml/rbv2_benchmark.py create mode 100644 hpobench/benchmarks/ml/yahpo_benchmark.py create mode 100644 hpobench/container/benchmarks/ml/yahpo_benchmark.py create mode 100644 hpobench/container/recipes/ml/Singularity.YahpoRawBenchmark create mode 100644 hpobench/container/recipes/ml/Singularity.rbv2Benchmark create mode 100644 tests/test_yahpo_raw.py diff --git a/extra_requirements/yahpo_gym.json b/extra_requirements/yahpo_gym.json index 77bea14d..10f4e390 100644 --- a/extra_requirements/yahpo_gym.json +++ b/extra_requirements/yahpo_gym.json @@ -1,3 +1,4 @@ { - "yahpo_gym": ["yahpo_gym@git+https://github.com/pfistfl/yahpo_gym#egg=yahpo_gym&subdirectory=yahpo_gym"] + "yahpo_gym": ["yahpo_gym@git+https://github.com/pfistfl/yahpo_gym#egg=yahpo_gym&subdirectory=yahpo_gym"], + "yahpo_gym_raw": ["yahpo_gym@git+https://github.com/pfistfl/yahpo_gym#egg=yahpo_gym&subdirectory=yahpo_gym", "rpy2>=3.5.0", "openml==0.10.2", "gitpython>=3.1"] } diff --git a/hpobench/benchmarks/ml/__init__.py b/hpobench/benchmarks/ml/__init__.py index 64e399cd..e69de29b 100644 --- a/hpobench/benchmarks/ml/__init__.py +++ b/hpobench/benchmarks/ml/__init__.py @@ -1,22 +0,0 @@ -from hpobench.benchmarks.ml.histgb_benchmark import HistGBBenchmark, HistGBBenchmarkBB, HistGBBenchmarkMF -from hpobench.benchmarks.ml.lr_benchmark import LRBenchmark, LRBenchmarkBB, LRBenchmarkMF -from hpobench.benchmarks.ml.nn_benchmark import NNBenchmark, NNBenchmarkBB, NNBenchmarkMF -from hpobench.benchmarks.ml.rf_benchmark import RandomForestBenchmark, RandomForestBenchmarkBB, \ - RandomForestBenchmarkMF -from hpobench.benchmarks.ml.svm_benchmark import SVMBenchmark, SVMBenchmarkBB, SVMBenchmarkMF -from hpobench.benchmarks.ml.tabular_benchmark import TabularBenchmark - -try: - from hpobench.benchmarks.ml.xgboost_benchmark import XGBoostBenchmark, XGBoostBenchmarkBB, XGBoostBenchmarkMF -except ImportError: - pass - - -__all__ = ['HistGBBenchmark', 'HistGBBenchmarkBB', 'HistGBBenchmarkMF', - 'LRBenchmark', 'LRBenchmarkBB', 'LRBenchmarkMF', - 'NNBenchmark', 'NNBenchmarkBB', 'NNBenchmarkMF', - 'RandomForestBenchmark', 'RandomForestBenchmarkBB', 'RandomForestBenchmarkMF', - 'SVMBenchmark', 'SVMBenchmarkBB', 'SVMBenchmarkMF', - 'TabularBenchmark', - 'XGBoostBenchmark', 'XGBoostBenchmarkBB', 'XGBoostBenchmarkMF', - ] diff --git a/hpobench/benchmarks/ml/rbv2_benchmark.py b/hpobench/benchmarks/ml/rbv2_benchmark.py new file mode 100644 index 00000000..e69de29b diff --git a/hpobench/benchmarks/ml/yahpo_benchmark.py b/hpobench/benchmarks/ml/yahpo_benchmark.py new file mode 100644 index 00000000..d06d23fc --- /dev/null +++ b/hpobench/benchmarks/ml/yahpo_benchmark.py @@ -0,0 +1,317 @@ +""" +How to use this 
benchmark: +-------------------------- + +We recommend using the containerized version of this benchmark. +If you want to use this benchmark locally (without running it via the corresponding container), +you need to perform the following steps. + +Prerequisites: 1) Install Conda +=============================== +Conda environment in which the HPOBench is installed (pip install .). Activate your environment. +``` +conda activate +``` + +Prerequisites: 2) Install R +=========================== + +Install R (4.0.5 - IMPORTANT!) and the required dependencies: # works also with higher R versions(?) + +``` bash +Rscript -e 'install.packages("remotes", repos = "http://cran.r-project.org")' + +# Install OpenML dependencies +Rscript -e 'install.packages("curl", repos = "http://cran.r-project.org")' \ +&& Rscript -e 'install.packages("httr", repos = "http://cran.r-project.org")' \ +&& Rscript -e 'install.packages("farff", repos = "http://cran.r-project.org")' \ +&& Rscript -e 'install.packages("OpenML", repos = "http://cran.r-project.org")' \ + +# Install rbv2 dependencies +Rscript -e 'remotes::install_version("BBmisc", version = "1.11", upgrade = "never", repos = "http://cran.r-project.org")' \ +&& Rscript -e 'remotes::install_version("glmnet", version = "2.0-16", upgrade = "never", repos = "http://cran.r-project.o")' \ +&& Rscript -e 'remotes::install_version("rpart", version = "4.1-13", upgrade = "never", repos = "http://cran.r-project.org")' \ +&& Rscript -e 'remotes::install_version("e1071", version = "1.7-0.1", upgrade = "never", repos = "http://cran.r-project.org")' \ +&& Rscript -e 'remotes::install_version("xgboost", version = "0.82.1", upgrade = "never", repos = "http://cran.r-project.org")' \ +&& Rscript -e 'remotes::install_version("ranger", version = "0.11.2", upgrade = "never", repos = "http://cran.r-project.org")' \ +&& Rscript -e 'remotes::install_version("RcppHNSW", version = "0.1.0", upgrade = "never", repos = "http://cran.r-project.org")' \ +&& Rscript -e 'remotes::install_version("mlr", version = "2.14", upgrade = "never", repos = "http://cran.r-project.org")' \ +&& Rscript -e 'remotes::install_github("mlr-org/mlr3misc", upgrade = "never", repos = "http://cran.r-project.org")' \ +&& Rscript -e 'remotes::install_version("mlrCPO", version = "0.3.6", upgrade = "never", repos = "http://cran.r-projt.org")' \ +&& Rscript -e 'remotes::install_github("pfistfl/rbv2", upgrade = "never")' \ +&& Rscript -e 'remotes::install_version("testthat", version = "3.1.4", upgrade = "never", repos = "http://cran.r-project.org")' \ +&& Rscript -e 'remotes::install_github("sumny/iaml", upgrade = "never")' +``` +Prerequisites: 3) Install rpy2 +============================== +Installing the connector between R and python might be a little bit tricky. +Official installation guide: https://rpy2.github.io/doc/latest/html/introduction.html + +We received in some cases the error: "/opt/R/4.0.5/lib/R/library/methods/libs/methods.so: undefined symbol". +To solve this error, we had to execute the following command: +``` +export LD_LIBRARY_PATH=$(python -m rpy2.situation LD_LIBRARY_PATH):${LD_LIBRARY_PATH} +``` + +1. Download data: +================= +Normally, the data will be downloaded automatically. + +If you want to download the data on your own, you can download the data with the following command: + +``` bash +git clone --depth 1 -b main https://github.com/pfistfl/yahpo_data.git +``` + +Later, you have to give yahpo the link to the data. 
+ +```python +from yahpo_gym import local_config +local_config.init_config() +local_config.set_data_path("path-to-data") +``` + +The data consist of surrogates for different data sets. Each surrogate is a compressed ONNX neural network. + + +2. Install HPOBench: +==================== +``` +git clone HPOBench +cd /path/to/HPOBench +pip install .[yahpo_gym_raw] +``` + +Changelog: +========== +0.0.1: +* First implementation +""" # noqa: E501 + +import logging +from pathlib import Path +from typing import Union, Dict, List + +import pandas as pd +import ConfigSpace as CS +import numpy as np +import rpy2.robjects as robjects +from rpy2.robjects.packages import importr +from yahpo_gym.benchmark_set import BenchmarkSet + +from hpobench.abstract_benchmark import AbstractBenchmark, AbstractMultiObjectiveBenchmark + +__version__ = '0.0.1' + +logger = logging.getLogger('YAHPO-Raw') + + +class YAHPOGymMORawBenchmark(AbstractMultiObjectiveBenchmark): + def __init__(self, scenario: str, instance: str, + rng: Union[np.random.RandomState, int, None] = None, + data_dir: Union[Path, str, None] = None): + """ + Parameters + ---------- + scenario : str + Name for the learner. Must be one of [ + "rbv2_ranger", "rbv2_rpart", "rbv2_glmnet", "rbv2_xgboost", "rbv2_svm", "rbv2_aknn", "rbv2_super", + "iaml_ranger", "iaml_rpart", "iaml_glmnet", "iaml_xgboost" + ] + instance : str + A valid instance for the scenario. See `self.benchset.instances`. + https://slds-lmu.github.io/yahpo_gym/scenarios.html#instances + rng : np.random.RandomState, int, None + """ + + assert scenario.startswith('rbv2_') or scenario.startswith('iaml_'), \ + 'Currently, we only support the experiments with rbv2_ and iaml from yahpo. ' \ + f'The scenario has to start with either rbv2_ or iaml_, but was {scenario}' + + from hpobench.util.data_manager import YAHPODataManager + self.data_manager = YAHPODataManager(data_dir=data_dir) + self.data_manager.load() + + self.scenario = scenario + self.instance = instance + self.benchset = BenchmarkSet(scenario, active_session=True) + self.benchset.set_instance(instance) + + logger.info(f'Start Benchmark for scenario {scenario} and instance {instance}') + super(YAHPOGymMORawBenchmark, self).__init__(rng=rng) + + # pylint: disable=arguments-differ + def get_configuration_space(self, seed: Union[int, None] = None) -> CS.ConfigurationSpace: + return self.benchset.get_opt_space(drop_fidelity_params=True, seed=seed) + + # pylint: disable=arguments-differ + def get_fidelity_space(self, seed: Union[int, None] = None) -> CS.ConfigurationSpace: + return self.benchset.get_fidelity_space(seed=seed) + + @AbstractMultiObjectiveBenchmark.check_parameters + def objective_function(self, configuration: Union[CS.Configuration, Dict], + fidelity: Union[CS.Configuration, Dict, None] = None, + rng: Union[np.random.RandomState, int, None] = None, **kwargs) -> Dict: + + # Cast python dict to R list: + parameters = {**configuration, **fidelity} + r_list = YAHPOGymMORawBenchmark._cast_dict_to_rlist(parameters) + + # Call the random bot evaluation method + if self.scenario.startswith('rbv2_'): + + # Establish a connection to the R package + rbv2pkg = importr('rbv2') + + learner = self.scenario.replace('rbv2_', 'classif.') + r_out = rbv2pkg.eval_config( + learner=learner, task_id=int(configuration['task_id']), configuration=r_list + ) + # Extract the run data frame via replications and cast the R list (result) back to a python dictionary + result_r_df = r_out[0][0][0][4] + result_dict = 
YAHPOGymMORawBenchmark._cast_to_dict(result_r_df) + result_df = pd.DataFrame(result_dict) + result = result_df.mean(axis=0) + result = result.to_dict() + time_cols = [col for col in result_df.columns if 'time' in col] + times = {col: result_df.loc[:, col].sum() for col in time_cols} + result.update(times) + + elif self.scenario.startswith('iaml_'): + + iaml = importr('iaml') + out = iaml.eval_yahpo(scenario=robjects.StrVector([self.scenario]), configuration=r_list) + result = YAHPOGymMORawBenchmark._cast_to_dict(out) + + elif self.scenario.startswith('fair_'): + + fair_pkg = importr('fair') + out = fair_pkg.eval_yahpo(scenario=robjects.StrVector([self.scenario]), configuration=r_list) + result = YAHPOGymMORawBenchmark._cast_to_dict(out) + + else: + raise NotImplementedError() + + objectives = {target: value for target, value in result.items() if target in self.benchset.config.y_names} + additional = {target: value for target, value in result.items() if target not in self.benchset.config.y_names} + + return { + 'function_value': objectives, + 'cost': result['timetrain'], + 'info': {'fidelity': fidelity, 'additional_info': additional} + } + + @AbstractMultiObjectiveBenchmark.check_parameters + def objective_function_test(self, configuration: Union[CS.Configuration, Dict], + fidelity: Union[CS.Configuration, Dict, None] = None, + rng: Union[np.random.RandomState, int, None] = None, **kwargs) -> Dict: + return self.objective_function(configuration, fidelity=fidelity, rng=rng) + + @staticmethod + def get_meta_information(): + """ Returns the meta information for the benchmark """ + return {'name': 'YAHPO Gym', + 'references': ['@misc{pfisterer2021yahpo,', + 'title={YAHPO Gym -- Design Criteria and a new Multifidelity Benchmark ' + ' for Hyperparameter Optimization},', + 'author={Florian Pfisterer and Lennart Schneider and Julia Moosbauer ' + ' and Martin Binder and Bernd Bischl},', + 'eprint={2109.03670},', + 'archivePrefix={arXiv},', + 'year={2021}}'], + 'code': ['https://github.com/pfistfl/yahpo_gym/yahpo_gym', + 'https://github.com/pfistfl/rbv2/', + 'https://github.com/sumny/iaml', + 'https://github.com/sumny/fair'] + } + + # pylint: disable=arguments-differ + def get_objective_names(self) -> List[str]: + return self.benchset.config.y_names + + @staticmethod + def _cast_dict_to_rlist(py_dict): + """ Convert a python dictionary to a RPy2 ListVector""" + pairs = [f'{key} = {value}' if not isinstance(value, str) else f'{key} = \"{value}\"' + for key, value in py_dict.items()] + pairs = ",".join(pairs) + str_list = f"list({pairs})" + r_list = robjects.r(str_list) + return r_list + + @staticmethod + def _cast_to_dict(r_list_object) -> Dict: + """ + Convert an RPy2 ListVector to a Python dict. + Source: https://ogeek.cn/qa/?qa=815151/ + """ + result = {} + for i, name in enumerate(r_list_object.names): + if isinstance(r_list_object[i], robjects.ListVector): + result[name] = YAHPOGymMORawBenchmark._cast_to_dict(r_list_object[i]) + elif len(r_list_object[i]) == 1: + result[name] = r_list_object[i][0] + else: + result[name] = r_list_object[i] + return result + + +class YAHPOGymRawBenchmark(AbstractBenchmark): + def __init__(self, scenario: str, instance: str, objective: str = None, + rng: Union[np.random.RandomState, int, None] = None): + """ + Parameters + ---------- + scenario : str + Name for the surrogate data. 
Must be one of ["lcbench", "fcnet", "nb301", "rbv2_svm", + "rbv2_ranger", "rbv2_rpart", "rbv2_glmnet", "rbv2_aknn", "rbv2_xgboost", "rbv2_super"] + instance : str + A valid instance for the scenario. See `self.benchset.instances`. + https://slds-lmu.github.io/yahpo_gym/scenarios.html#instances + objective : str + Name of the (single-crit) objective. See `self.benchset.config.y_names`. + Initialized to None, picks the first element in y_names. + rng : np.random.RandomState, int, None + """ + self.backbone = YAHPOGymMORawBenchmark(scenario=scenario, instance=instance, rng=rng) + self.objective = objective + super(YAHPOGymRawBenchmark, self).__init__(rng=rng) + + @AbstractBenchmark.check_parameters + def objective_function(self, configuration: Union[CS.Configuration, Dict], + fidelity: Union[Dict, CS.Configuration, None] = None, + rng: Union[np.random.RandomState, int, None] = None, **kwargs) -> Dict: + + mo_results = self.backbone.objective_function(configuration=configuration, + fidelity=fidelity, + **kwargs) + + # If not objective is set, we just grab the first returned entry. + if self.objective is None: + self.objective = self.backbone.benchset.config.y_names[0] + + obj_value = mo_results['function_value'][self.objective] + + return {'function_value': obj_value, + "cost": mo_results['cost'], + 'info': {'fidelity': fidelity, + 'additional_info': mo_results['info']['additional_info'], + 'objectives': mo_results['function_value']}} + + @AbstractBenchmark.check_parameters + def objective_function_test(self, configuration: Union[CS.Configuration, Dict], + fidelity: Union[Dict, CS.Configuration, None] = None, + rng: Union[np.random.RandomState, int, None] = None, **kwargs) -> Dict: + return self.objective_function(configuration, fidelity=fidelity, rng=rng) + + # pylint: disable=arguments-differ + def get_configuration_space(self, seed: Union[int, None] = None) -> CS.ConfigurationSpace: + return self.backbone.get_configuration_space(seed=seed) + + # pylint: disable=arguments-differ + def get_fidelity_space(self, seed: Union[int, None] = None) -> CS.ConfigurationSpace: + return self.backbone.get_fidelity_space(seed=seed) + + @staticmethod + def get_meta_information() -> Dict: + return YAHPOGymMORawBenchmark.get_meta_information() diff --git a/hpobench/benchmarks/surrogates/yahpo_gym.py b/hpobench/benchmarks/surrogates/yahpo_gym.py index 19522700..ad552acd 100644 --- a/hpobench/benchmarks/surrogates/yahpo_gym.py +++ b/hpobench/benchmarks/surrogates/yahpo_gym.py @@ -29,54 +29,104 @@ Changelog: ========== +0.0.2: + +* Add support for multi-objective benchmarks +* Add support for fairness benchmarks and interpretability benchmarks. +For these new benchmarks (fairness and interpretability), we recommend the following benchmarks and objectives: +For the entire list of available benchmarks, please take a look in the yahpo benchmark documentation. 
+ +Benchmark Name | Scenario | Objectives +--------------------|---------------|-------------- +fair_fgrrm | 7592 | mmce, feo + | 14965 | mmce, feo +--------------------|---------------|-------------- +fair_rpart | 317599 | mmce, ffomr + | 7592 | mmce, feo +--------------------|---------------|-------------- +fair_ranger | 317599 | mmce, fpredp + | 14965 | mmce, fpredp +--------------------|---------------|-------------- +fair_xgboost | 317599 | mmce, ffomr + | 7592 | mmce, ffnr +--------------------|---------------|-------------- +fair_super | 14965 | mmce, feo + | 317599 | mmce, ffnr +--------------------|---------------|-------------- + + +Benchmark Name | Scenario | Objectives +--------------------|---------------|-------------- +iaml_glmnet | 1489 | mmce, nf + | 40981 | mmce, nf +--------------------|---------------|-------------- +iaml_rpart | 1489 | mmce, nf + | 41146 | mmce, nf +--------------------|---------------|-------------- +iaml_ranger | 40981 | mmce, nf + | 41146 | mmce, nf +--------------------|---------------|-------------- +iaml_xgboost | 40981 | mmce, nf + | 41146 | mmce, nf +--------------------|---------------|-------------- +iaml_super | 40981 | mmce, nf + | 41146 | mmce, nf +--------------------|---------------|-------------- + 0.0.1: * First implementation """ -import os import logging +from pathlib import Path from typing import Union, Dict, List import ConfigSpace as CS import numpy as np - from yahpo_gym.benchmark_set import BenchmarkSet -from hpobench.abstract_benchmark import AbstractMultiObjectiveBenchmark, AbstractBenchmark -__version__ = '0.0.1' +from hpobench.abstract_benchmark import AbstractMultiObjectiveBenchmark, AbstractSingleObjectiveBenchmark +from hpobench.util.data_manager import YAHPODataManager -logger = logging.getLogger('YAHPOGym') +__version__ = '0.0.2' +logger = logging.getLogger('YAHPOGym') -class YAHPOGymMOBenchmark(AbstractMultiObjectiveBenchmark): +class YAHPOGymBaseBenchmark: def __init__(self, scenario: str, instance: str, + data_dir: Union[Path, str, None] = None, + multi_thread: bool = True, rng: Union[np.random.RandomState, int, None] = None): """ - For a list of available scenarios and instances see - 'https://slds-lmu.github.io/yahpo_gym/scenarios.html' + Base Benchmark for all single and multi objective yahpo surrogate benchmarks. Parameters ---------- scenario : str - Name for the surrogate data. Must be one of ["lcbench", "fcnet", "nb301", "rbv2_svm", - "rbv2_ranger", "rbv2_rpart", "rbv2_glmnet", "rbv2_aknn", "rbv2_xgboost", "rbv2_super"] + Name for the surrogate data. Must be one of + ["lcbench", "fcnet", "nb301", "rbv2_svm", + "rbv2_ranger", "rbv2_rpart", "rbv2_glmnet", "rbv2_aknn", "rbv2_xgboost", "rbv2_super", + "fair_ranger", "fair_rpart", "fair_fgrrm", "fair_xgboost", "fair_super", + "iaml_ranger", "iaml_rpart", "iaml_glmnet", "iaml_xgboost", "iaml_super"] instance : str A valid instance for the scenario. See `self.benchset.instances`. + data_dir: Optional, str, Path + Directory, where the yahpo data is stored. + Download automatically from https://github.com/slds-lmu/yahpo_data/tree/fair + multi_thread: bool + Flag to run ONNX runtime with a single thread. Might be important on compute clusters. + Defaults to True rng : np.random.RandomState, int, None """ - - # When in the containerized version, redirect to the data inside the container. 
- if 'YAHPO_CONTAINER' in os.environ: - from yahpo_gym.local_config import LocalConfiguration - local_config = LocalConfiguration() - local_config.init_config(data_path='/home/data/yahpo_data') + self.data_manager = YAHPODataManager(data_dir=data_dir) + self.data_manager.load() self.scenario = scenario self.instance = instance - self.benchset = BenchmarkSet(scenario, active_session=True) + self.benchset = BenchmarkSet(scenario, active_session=True, multithread=multi_thread) self.benchset.set_instance(instance) logger.info(f'Start Benchmark for scenario {scenario} and instance {instance}') - super(YAHPOGymMOBenchmark, self).__init__(rng=rng) + super(YAHPOGymBaseBenchmark, self).__init__(rng=rng) # pylint: disable=arguments-differ def get_configuration_space(self, seed: Union[int, None] = None) -> CS.ConfigurationSpace: @@ -86,8 +136,7 @@ def get_configuration_space(self, seed: Union[int, None] = None) -> CS.Configura def get_fidelity_space(self, seed: Union[int, None] = None) -> CS.ConfigurationSpace: return self.benchset.get_fidelity_space(seed=seed) - @AbstractMultiObjectiveBenchmark.check_parameters - def objective_function(self, configuration: Union[CS.Configuration, Dict], + def _mo_objective_function(self, configuration: Union[CS.Configuration, Dict], fidelity: Union[CS.Configuration, Dict, None] = None, rng: Union[np.random.RandomState, int, None] = None, **kwargs) -> Dict: @@ -103,17 +152,6 @@ def objective_function(self, configuration: Union[CS.Configuration, Dict], "cost": cost, 'info': {'fidelity': fidelity}} - @AbstractMultiObjectiveBenchmark.check_parameters - def objective_function_test(self, configuration: Union[CS.Configuration, Dict], - fidelity: Union[CS.Configuration, Dict, None] = None, - rng: Union[np.random.RandomState, int, None] = None, **kwargs) \ - -> Dict: - return self.objective_function(configuration, fidelity=fidelity, rng=rng) - - # pylint: disable=arguments-differ - def get_objective_names(self) -> List[str]: - return self.benchset.config.y_names - @staticmethod def get_meta_information(): """ Returns the meta information for the benchmark """ @@ -130,9 +168,11 @@ def get_meta_information(): 'code': 'https://github.com/pfistfl/yahpo_gym/yahpo_gym'} -class YAHPOGymBenchmark(AbstractBenchmark): +class YAHPOGymMOBenchmark(YAHPOGymBaseBenchmark, AbstractMultiObjectiveBenchmark): def __init__(self, scenario: str, instance: str, objective: str = None, + data_dir: Union[Path, str, None] = None, + multi_thread: bool = True, rng: Union[np.random.RandomState, int, None] = None): """ For a list of available scenarios and instances see @@ -140,33 +180,88 @@ def __init__(self, scenario: str, instance: str, objective: str = None, Parameters ---------- scenario : str - Name for the surrogate data. Must be one of ["lcbench", "fcnet", "nb301", "rbv2_svm", - "rbv2_ranger", "rbv2_rpart", "rbv2_glmnet", "rbv2_aknn", "rbv2_xgboost", "rbv2_super"] + Name for the surrogate data. Must be one of + ["lcbench", "fcnet", "nb301", "rbv2_svm", + "rbv2_ranger", "rbv2_rpart", "rbv2_glmnet", "rbv2_aknn", "rbv2_xgboost", "rbv2_super", + "fair_ranger", "fair_rpart", "fair_fgrrm", "fair_xgboost", "fair_super", + "iaml_ranger", "iaml_rpart", "iaml_glmnet", "iaml_xgboost", "iaml_super"] + instance : str + A valid instance for the scenario. See `self.benchset.instances`. + data_dir: Optional, str, Path + Directory, where the yahpo data is stored. + Download automatically from https://github.com/slds-lmu/yahpo_data/tree/fair + multi_thread: bool + Flag to run ONNX runtime with a single thread. 
Might be important on compute clusters. + Defaults to True + rng : np.random.RandomState, int, None + """ + self.objective = objective + super(YAHPOGymMOBenchmark, self).__init__(scenario=scenario, instance=instance, rng=rng, data_dir=data_dir, multi_thread=multi_thread) + + @AbstractMultiObjectiveBenchmark.check_parameters + def objective_function(self, configuration: Union[CS.Configuration, Dict], + fidelity: Union[CS.Configuration, Dict, None] = None, + rng: Union[np.random.RandomState, int, None] = None, **kwargs) -> Dict: + + return self._mo_objective_function(configuration, fidelity, rng, **kwargs) + + @AbstractMultiObjectiveBenchmark.check_parameters + def objective_function_test(self, configuration: Union[CS.Configuration, Dict], + fidelity: Union[CS.Configuration, Dict, None] = None, + rng: Union[np.random.RandomState, int, None] = None, **kwargs) \ + -> Dict: + return self.objective_function(configuration, fidelity=fidelity, rng=rng) + + # pylint: disable=arguments-differ + def get_objective_names(self) -> List[str]: + return self.benchset.config.y_names + + +class YAHPOGymBenchmark(YAHPOGymBaseBenchmark, AbstractSingleObjectiveBenchmark): + + def __init__(self, scenario: str, instance: str, objective: str = None, + data_dir: Union[Path, str, None] = None, + multi_thread: bool = True, + rng: Union[np.random.RandomState, int, None] = None): + """ + For a list of available scenarios and instances see + 'https://slds-lmu.github.io/yahpo_gym/scenarios.html' + Parameters + ---------- + scenario : str + Name for the surrogate data. Must be one of + ["lcbench", "fcnet", "nb301", "rbv2_svm", + "rbv2_ranger", "rbv2_rpart", "rbv2_glmnet", "rbv2_aknn", "rbv2_xgboost", "rbv2_super", + "fair_ranger", "fair_rpart", "fair_fgrrm", "fair_xgboost", "fair_super", + "iaml_ranger", "iaml_rpart", "iaml_glmnet", "iaml_xgboost", "iaml_super"] instance : str A valid instance for the scenario. See `self.benchset.instances`. objective : str Name of the (single-crit) objective. See `self.benchset.config.y_names`. Initialized to None, picks the first element in y_names. + data_dir: Optional, str, Path + Directory, where the yahpo data is stored. + Download automatically from https://github.com/slds-lmu/yahpo_data/tree/fair + multi_thread: bool + Flag to run ONNX runtime with a single thread. Might be important on compute clusters. + Defaults to True rng : np.random.RandomState, int, None """ - - self.backbone = YAHPOGymMOBenchmark(scenario=scenario, instance=instance, rng=rng) self.objective = objective + super(YAHPOGymBenchmark, self).__init__(scenario=scenario, instance=instance, rng=rng, data_dir=data_dir, multi_thread=multi_thread) - super(YAHPOGymBenchmark, self).__init__(rng=rng) - - @AbstractBenchmark.check_parameters + @AbstractSingleObjectiveBenchmark.check_parameters def objective_function(self, configuration: Union[CS.Configuration, Dict], fidelity: Union[Dict, CS.Configuration, None] = None, rng: Union[np.random.RandomState, int, None] = None, **kwargs) -> Dict: - mo_results = self.backbone.objective_function(configuration=configuration, - fidelity=fidelity, - **kwargs) + mo_results = self._mo_objective_function(configuration=configuration, + fidelity=fidelity, + **kwargs) # If not objective is set, we just grab the first returned entry. 
if self.objective is None: - self.objective = self.backbone.benchset.config.y_names[0] + self.objective = self.benchset.config.y_names[0] obj_value = mo_results['function_value'][self.objective] @@ -174,20 +269,8 @@ def objective_function(self, configuration: Union[CS.Configuration, Dict], "cost": mo_results['cost'], 'info': {'fidelity': fidelity, 'objectives': mo_results['function_value']}} - @AbstractBenchmark.check_parameters + @AbstractSingleObjectiveBenchmark.check_parameters def objective_function_test(self, configuration: Union[CS.Configuration, Dict], fidelity: Union[Dict, CS.Configuration, None] = None, rng: Union[np.random.RandomState, int, None] = None, **kwargs) -> Dict: return self.objective_function(configuration, fidelity=fidelity, rng=rng) - - # pylint: disable=arguments-differ - def get_configuration_space(self, seed: Union[int, None] = None) -> CS.ConfigurationSpace: - return self.backbone.get_configuration_space(seed=seed) - - # pylint: disable=arguments-differ - def get_fidelity_space(self, seed: Union[int, None] = None) -> CS.ConfigurationSpace: - return self.backbone.get_fidelity_space(seed=seed) - - @staticmethod - def get_meta_information() -> Dict: - return YAHPOGymMOBenchmark.get_meta_information() diff --git a/hpobench/container/benchmarks/ml/__init__.py b/hpobench/container/benchmarks/ml/__init__.py index ed2ce40f..f342f5f8 100644 --- a/hpobench/container/benchmarks/ml/__init__.py +++ b/hpobench/container/benchmarks/ml/__init__.py @@ -6,7 +6,7 @@ from hpobench.container.benchmarks.ml.svm_benchmark import SVMBenchmark, SVMBenchmarkBB, SVMBenchmarkMF from hpobench.container.benchmarks.ml.tabular_benchmark import TabularBenchmark from hpobench.container.benchmarks.ml.xgboost_benchmark import XGBoostBenchmark, XGBoostBenchmarkBB, XGBoostBenchmarkMF - +from hpobench.container.benchmarks.ml.yahpo_benchmark import YAHPOGymRawBenchmark, YAHPOGymMORawBenchmark __all__ = ['HistGBBenchmark', 'HistGBBenchmarkBB', 'HistGBBenchmarkMF', 'LRBenchmark', 'LRBenchmarkBB', 'LRBenchmarkMF', @@ -14,4 +14,5 @@ 'RandomForestBenchmark', 'RandomForestBenchmarkBB', 'RandomForestBenchmarkMF', 'SVMBenchmark', 'SVMBenchmarkBB', 'SVMBenchmarkMF', 'TabularBenchmark', - 'XGBoostBenchmark', 'XGBoostBenchmarkBB', 'XGBoostBenchmarkMF'] + 'XGBoostBenchmark', 'XGBoostBenchmarkBB', 'XGBoostBenchmarkMF', + 'YAHPOGymRawBenchmark', 'YAHPOGymMORawBenchmark'] diff --git a/hpobench/container/benchmarks/ml/yahpo_benchmark.py b/hpobench/container/benchmarks/ml/yahpo_benchmark.py new file mode 100644 index 00000000..e4d9cf0c --- /dev/null +++ b/hpobench/container/benchmarks/ml/yahpo_benchmark.py @@ -0,0 +1,21 @@ +#!/usr/bin/python3 +# -*- coding: utf-8 -*- + +from hpobench.container.client_abstract_benchmark import AbstractMOBenchmarkClient, \ + AbstractBenchmarkClient + + +class YAHPOGymMORawBenchmark(AbstractMOBenchmarkClient): + def __init__(self, **kwargs): + kwargs['benchmark_name'] = kwargs.get('benchmark_name', 'YAHPOGymMORawBenchmark') + kwargs['container_name'] = kwargs.get('container_name', 'yahpo_raw') + kwargs['latest'] = kwargs.get('container_tag', '0.0.1') + super(YAHPOGymMORawBenchmark, self).__init__(**kwargs) + + +class YAHPOGymRawBenchmark(AbstractBenchmarkClient): + def __init__(self, **kwargs): + kwargs['benchmark_name'] = kwargs.get('benchmark_name', 'YAHPOGymRawBenchmark') + kwargs['container_name'] = kwargs.get('container_name', 'yahpo_raw') + kwargs['latest'] = kwargs.get('container_tag', '0.0.1') + super(YAHPOGymRawBenchmark, self).__init__(**kwargs) diff --git 
a/hpobench/container/benchmarks/surrogates/yahpo_gym.py b/hpobench/container/benchmarks/surrogates/yahpo_gym.py index 9774975d..64cee463 100644 --- a/hpobench/container/benchmarks/surrogates/yahpo_gym.py +++ b/hpobench/container/benchmarks/surrogates/yahpo_gym.py @@ -8,7 +8,7 @@ class YAHPOGymBenchmark(AbstractBenchmarkClient): def __init__(self, **kwargs): kwargs['benchmark_name'] = kwargs.get('benchmark_name', 'YAHPOGymBenchmark') kwargs['container_name'] = kwargs.get('container_name', 'yahpo_gym') - kwargs['latest'] = kwargs.get('container_tag', '0.0.1') + kwargs['latest'] = kwargs.get('container_tag', '0.0.2') super(YAHPOGymBenchmark, self).__init__(**kwargs) @@ -16,5 +16,5 @@ class YAHPOGymMOBenchmark(AbstractMOBenchmarkClient): def __init__(self, **kwargs): kwargs['benchmark_name'] = kwargs.get('benchmark_name', 'YAHPOGymMOBenchmark') kwargs['container_name'] = kwargs.get('container_name', 'yahpo_gym') - kwargs['latest'] = kwargs.get('container_tag', '0.0.1') + kwargs['latest'] = kwargs.get('container_tag', '0.0.2') super(YAHPOGymMOBenchmark, self).__init__(**kwargs) diff --git a/hpobench/container/recipes/ml/Singularity.YahpoRawBenchmark b/hpobench/container/recipes/ml/Singularity.YahpoRawBenchmark new file mode 100644 index 00000000..e79dab4b --- /dev/null +++ b/hpobench/container/recipes/ml/Singularity.YahpoRawBenchmark @@ -0,0 +1,82 @@ +Bootstrap: docker +From: rpy2/rpy2:latest + + +%labels +MAINTAINER pfistererf@googlemail.com +VERSION v0.0.1 + +%help + This is the recipe for the Raw YAHPO Benchmarks. + + +%post + cd /home + + ####################### INSTALL THE R + BASE DEPENDENCIES ################# + FILE="libssl1.1_1.1.1f-1ubuntu2_amd64.deb" + wget http://archive.ubuntu.com/ubuntu/pool/main/o/openssl/${FILE} + sudo dpkg -i ${FILE} + + FILE="libssl-dev_1.1.1f-1ubuntu2_amd64.deb" + wget http://archive.ubuntu.com/ubuntu/pool/main/o/openssl/${FILE} + sudo dpkg -i ${FILE} + + sudo apt-get install openssl + sudo apt-get install libcurl4-openssl-dev git + + # Instal R-Packages + cd /home \ + && Rscript -e 'install.packages("remotes", repos = "http://cran.r-project.org")' + + # Install OpenML dependencies + Rscript -e 'install.packages("curl", repos = "http://cran.r-project.org")' \ + && Rscript -e 'install.packages("openssl", repos = "http://cran.r-project.org")' \ + && Rscript -e 'install.packages("httr", repos = "http://cran.r-project.org")' \ + && Rscript -e 'install.packages("farff", repos = "http://cran.r-project.org")' \ + && Rscript -e 'install.packages("OpenML", repos = "http://cran.r-project.org")' + + # Install rbv2 dependencies + Rscript -e 'remotes::install_version("BBmisc", version = "1.11", upgrade = "never", repos = "http://cran.r-project.org")' \ + && Rscript -e 'remotes::install_version("rpart", version = "4.1-13", upgrade = "never", repos = "http://cran.r-project.org")' \ + && Rscript -e 'remotes::install_version("e1071", version = "1.7-0.1", upgrade = "never", repos = "http://cran.r-project.org")' \ + && Rscript -e 'remotes::install_version("xgboost", version = "0.82.1", upgrade = "never", repos = "http://cran.r-project.org")' \ + && Rscript -e 'remotes::install_version("ranger", version = "0.11.2", upgrade = "never", repos = "http://cran.r-project.org")' \ + && Rscript -e 'remotes::install_version("RcppHNSW", version = "0.1.0", upgrade = "never", repos = "http://cran.r-project.org")' \ + && Rscript -e 'remotes::install_version("mlr", version = "2.14", upgrade = "never", repos = "http://cran.r-project.org")' \ + && Rscript -e 
'remotes::install_github("mlr-org/mlr3misc", upgrade = "never", repos = "http://cran.r-project.org")' \ + && Rscript -e 'remotes::install_version("mlrCPO", version = "0.3.6", upgrade = "never", repos = "http://cran.r-project.org")' \ + && Rscript -e 'remotes::install_version("testthat", version = "3.1.4", upgrade = "never", repos = "http://cran.r-project.org")' \ + && Rscript -e 'remotes::install_version("glmnet", version = "4.1-3", upgrade = "never", repos = "http://cran.r-project.org")' + # ################################ BASE DEPENDENCIES ################################ + + Rscript -e 'remotes::install_github("pfistfl/rbv2", upgrade = "never", dependencies = True)' \ + && Rscript -e 'remotes::install_github("sumny/iaml", upgrade = "never", dependencies = True)' \ + && Rscript -e 'remotes::install_github("sumny/fair", upgrade = "never", dependencies = True)' + + cd /home \ + && mkdir data && cd data \ + && git clone --depth 1 -b main https://github.com/pfistfl/yahpo_data.git \ + + # Upgrade pip + python3 -m pip install --upgrade pip + + # Install HPOBench + cd /home \ + && git clone https://github.com/automl/HPOBench.git \ + && cd HPOBench \ + && git checkout development \ + && echo "Please never push a recipe that checks out any other branch than development or master" \ + && pip uninstall -y rpy2 \ + && pip install .[yahpo_gym_raw] + # && git checkout development \ + + # Clean Up. + echo "Please don't touch the following lines" \ + && mkdir /var/lib/hpobench/ \ + && chmod -R 777 /var/lib/hpobench/ \ + && rm -rf /var/lib/apt/lists/* \ + && pip cache purge \ + +%runscript + python3 -s /home/HPOBench/hpobench/container/server_abstract_benchmark.py ml.yahpo_benchmark $@ \ No newline at end of file diff --git a/hpobench/container/recipes/ml/Singularity.rbv2Benchmark b/hpobench/container/recipes/ml/Singularity.rbv2Benchmark new file mode 100644 index 00000000..e69de29b diff --git a/hpobench/container/recipes/surrogates/Singularity.YAHPOGymBenchmark b/hpobench/container/recipes/surrogates/Singularity.YAHPOGymBenchmark index 66ee63b1..98914ed1 100644 --- a/hpobench/container/recipes/surrogates/Singularity.YAHPOGymBenchmark +++ b/hpobench/container/recipes/surrogates/Singularity.YAHPOGymBenchmark @@ -3,7 +3,7 @@ From: python:3.7-slim %labels MAINTAINER pfistererf@googlemail.com -VERSION v0.0.1 +VERSION v0.0.2 %help This is a template for a Singularity recipe @@ -20,10 +20,10 @@ VERSION v0.0.1 cd /home \ && mkdir data && cd data \ - && git clone --depth 1 -b main https://github.com/pfistfl/yahpo_data.git\ + && git clone --depth 1 -b fair https://github.com/slds-lmu/yahpo_data.git cd /home \ - && git clone https://github.com/pfistfl/HPOBench.git \ + && git clone https://github.com/automl/HPOBench.git \ && cd HPOBench \ && echo "Please never push a recipe that checks out any other branch than development or master" \ && git checkout master \ diff --git a/hpobench/util/data_manager.py b/hpobench/util/data_manager.py index c72305e1..c9b058dc 100644 --- a/hpobench/util/data_manager.py +++ b/hpobench/util/data_manager.py @@ -15,6 +15,7 @@ import gzip import json import logging +import os import pickle import tarfile from io import BytesIO @@ -1225,3 +1226,42 @@ def _load_json(path): with open(path, "r") as f: data = json.load(f) return data + + +class YAHPODataManager(DataManager): + def __init__(self, data_dir: Union[Path, str, None]): + super(YAHPODataManager, self).__init__() + + if data_dir is None: + data_dir = hpobench.config_file.data_dir / "yahpo_data" + self.data_dir = Path(data_dir) + 
self.logger.info(f'Read data from data directory: {data_dir}') + + @lockutils.synchronized('not_thread_process_safe', external=True, + lock_path=f'{hpobench.config_file.cache_dir}/lock_yahpo_raw', delay=0.5) + def _try_download(self): + """Clone the data repository.""" + if not self.data_dir.exists(): + self.logger.info( + 'Try to download data from https://github.com/slds-lmu/yahpo_data/tree/fair' + ) + # Create the data directory if not existing + self.create_save_directory(self.data_dir.parent) + + import git + git.Repo.clone_from(url='https://github.com/slds-lmu/yahpo_data.git', + to_path=str(self.data_dir), + branch='fair', + multi_options=['--depth 1']) + self.logger.info(f'Successfully cloned data from repo to {self.data_dir}') + + def load(self): + from yahpo_gym.local_config import LocalConfiguration + local_config = LocalConfiguration() + + # When in the containerized version, redirect to the data inside the container. + if 'YAHPO_CONTAINER' in os.environ: + local_config.init_config(data_path='/home/data/yahpo_data') + else: + self._try_download() + local_config.init_config(data_path=str(self.data_dir)) diff --git a/tests/test_yahpo_raw.py b/tests/test_yahpo_raw.py new file mode 100644 index 00000000..65694603 --- /dev/null +++ b/tests/test_yahpo_raw.py @@ -0,0 +1,12 @@ +from hpobench.container.benchmarks.ml.yahpo_benchmark import YAHPOGymMORawBenchmark + + +def test_mo_benchmark(): + + b = YAHPOGymMORawBenchmark(scenario="iaml_xgboost", instance="40981",) + cfg = b.get_configuration_space().get_default_configuration() + b.objective_function(cfg) + + +if __name__ == '__main__': + test_mo_benchmark() \ No newline at end of file
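
For reference, a minimal usage sketch for the new single-objective wrapper is shown below. It is illustrative only and not part of the diff: it assumes the `yahpo_raw` container (or a local installation with the R dependencies above) and the surrogate data are available, that the container client forwards its keyword arguments to the benchmark as in the test above, and it picks `mmce`, one of the objectives listed for `iaml_xgboost` in the changelog tables.

```python
from hpobench.container.benchmarks.ml.yahpo_benchmark import YAHPOGymRawBenchmark

# Single-objective view on the raw multi-objective benchmark.
# scenario/instance mirror test_mo_benchmark above; 'mmce' is one of the
# objectives listed for iaml_xgboost in the changelog tables.
benchmark = YAHPOGymRawBenchmark(scenario="iaml_xgboost", instance="40981", objective="mmce")

# Evaluate the default configuration.
config = benchmark.get_configuration_space(seed=0).get_default_configuration()
result = benchmark.objective_function(config)

print(result['function_value'])        # scalar value of the selected objective
print(result['cost'])                  # cost ('timetrain' reported by the raw evaluation)
print(result['info']['objectives'])    # full multi-objective result for comparison
```

If `objective` is omitted, the wrapper falls back to the first entry in `benchset.config.y_names`, as implemented in `YAHPOGymRawBenchmark.objective_function`.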