Add BoTorch kernel preset, which uses dimensions-scaled prior #483

Status: Draft (40 commits to be merged into base: main)

Commits
09bfc9f
Add BoTorch kernel preset, which uses dimensions-scaled prior
Hrovatin Feb 11, 2025
45c6c49
pre-commit fixes
Hrovatin Feb 12, 2025
ba35bc2
add to changelog
Hrovatin Feb 12, 2025
40c1364
Explicitly close plot objects
Scienfitz Feb 12, 2025
8ff30ed
Extract experimental input validation utility
Scienfitz Jan 3, 2025
c757cdb
Fix simulation with empty initial data
Scienfitz Jan 3, 2025
596ac2f
Expand basic input output tests
Scienfitz Jan 3, 2025
61b1ddc
Add test for invalid pending_experiments
Scienfitz Jan 3, 2025
6d61972
Add pending_experiments validation
Scienfitz Jan 3, 2025
3169539
Fix docstring
Scienfitz Jan 6, 2025
91b3356
Add utility for creating fake input
Scienfitz Jan 6, 2025
0cd53dc
Add fixture for fake measurements
Scienfitz Jan 6, 2025
40ffdd5
Update type hints
Scienfitz Jan 6, 2025
7988f95
Improve text
Scienfitz Jan 6, 2025
1f13d46
Add note
Scienfitz Jan 10, 2025
a4adf50
Add validation everywhere
Scienfitz Feb 7, 2025
a5fcfd1
Avoid duplicated validation
Scienfitz Feb 7, 2025
8ba6ff3
Make wrapper private
Scienfitz Feb 10, 2025
266f6d5
Rework test parameterization
Scienfitz Feb 14, 2025
8ce2784
Improve code
Scienfitz Feb 14, 2025
04bdd99
Remove input validation in meta recommenders
Scienfitz Feb 14, 2025
7ba16b5
Include input validation in BayesianRecommender
Scienfitz Feb 14, 2025
a0e29ed
Reorder docstring
Scienfitz Feb 17, 2025
4ed31a0
Change logger to warnings.warn
Scienfitz Feb 17, 2025
6eda84d
Amend docstring
Scienfitz Feb 17, 2025
5f6f8d1
Remove obsolete parameter
Scienfitz Feb 17, 2025
ac7b287
Remove non-negativity restriction from beta
AdrianSosic Feb 17, 2025
f79cbfb
Rewrite docstring
AdrianSosic Feb 18, 2025
ad39268
Fix validator
Scienfitz Feb 18, 2025
b3d8bdc
Remove obsolete test
Scienfitz Feb 18, 2025
d83e6a2
Add direct arylation benchmark for TL with temperature as a task
Hrovatin Feb 19, 2025
013fec0
Update changelog
Hrovatin Feb 19, 2025
bc038a5
remove random seed that was set in the paper as it is redundant with …
Hrovatin Feb 19, 2025
90f0b49
Benchmark for transfer learning on arylhalides with dissimilar susbst…
Hrovatin Feb 20, 2025
15b35dc
Transfer learning benchmark with inverted Hartmann functions as tasks
Hrovatin Feb 20, 2025
0faec32
Add non-transfer learning campaign and transfer learning campaign wit…
Hrovatin Feb 20, 2025
e6905f9
Transfer learning benchmark with noisy Michalewicz functions as tasks
Hrovatin Feb 21, 2025
77b7d5c
Transfer learning benchmark with noisy Easom functions as tasks.
Hrovatin Feb 21, 2025
ef173ef
add to changelog
Hrovatin Feb 12, 2025
588d73c
Add a few botorch kernel preset benchmarks and adapt scripts for a te…
Hrovatin Feb 21, 2025
4 changes: 4 additions & 0 deletions CHANGELOG.md
@@ -9,6 +9,9 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
- `BCUT2D` encoding for `SubstanceParameter`
- Stored benchmarking results now include the Python environment and version
- `qPSTD` acquisition function
- BoTorch kernel presets.
- Additional benchmarks
- BoTorch kernel presets.

### Changed
- Acquisition function indicator `is_mc` has been removed in favor of new indicators
@@ -20,6 +23,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
## [0.12.2] - 2025-01-31
### Changed
- More robust settings for the GP fitting
- The `beta` parameter of `UCB` and `qUCB` can now also take negative values

## [0.12.1] - 2025-01-29
### Changed
28 changes: 12 additions & 16 deletions baybe/acquisition/acqfs.py
@@ -8,16 +8,14 @@
from attr.converters import optional as optional_c
from attr.validators import optional as optional_v
from attrs import define, field, fields
from attrs.validators import ge, gt, instance_of, le
from attrs.validators import gt, instance_of, le
from typing_extensions import override

from baybe.acquisition.base import AcquisitionFunction
from baybe.searchspace import SearchSpace
from baybe.utils.basic import classproperty
from baybe.utils.sampling_algorithms import (
DiscreteSamplingMethod,
sample_numerical_df,
)
from baybe.utils.sampling_algorithms import DiscreteSamplingMethod, sample_numerical_df
from baybe.utils.validation import finite_float


########################################################################################
@@ -264,12 +262,15 @@ class UpperConfidenceBound(AcquisitionFunction):

abbreviation: ClassVar[str] = "UCB"

beta: float = field(converter=float, validator=ge(0.0), default=0.2)
beta: float = field(converter=float, validator=finite_float, default=0.2)
"""Trade-off parameter for mean and variance.

A value of zero makes the acquisition mechanism consider the posterior predictive
mean only, resulting in pure exploitation. Higher values shift the focus more and
more toward exploration.
* ``beta > 0``: Rewards uncertainty, takes more risk.
Limit ``inf``: Pure exploration
* ``beta < 0``: Punishes uncertainty, takes less risk.
Limit ``-inf``: Pure exploitation
* ``beta = 0``: Discards knowledge about uncertainty, i.e. neither rewards nor
punishes it, is risk-neutral.
"""


@@ -279,13 +280,8 @@ class qUpperConfidenceBound(AcquisitionFunction):

abbreviation: ClassVar[str] = "qUCB"

beta: float = field(converter=float, validator=ge(0.0), default=0.2)
"""Trade-off parameter for mean and variance.

A value of zero makes the acquisition mechanism consider the posterior predictive
mean only, resulting in pure exploitation. Higher values shift the focus more and
more toward exploration.
"""
beta: float = field(converter=float, validator=finite_float, default=0.2)
"""See :paramref:`UpperConfidenceBound.beta`."""


@define(frozen=True)
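For illustration, the effect of the relaxed `beta` range can be seen in a small standalone sketch. It assumes the common `mean + beta * std` form of the UCB score and uses made-up candidate statistics; it is not BayBE code.

```python
# Conceptual sketch of the UCB trade-off, assuming the score "mean + beta * std".
# The candidate statistics are invented purely for illustration.

candidates = {
    "safe": {"mean": 1.0, "std": 0.1},       # well-explored region
    "uncertain": {"mean": 0.9, "std": 1.0},  # poorly-explored region
}


def ucb(mean: float, std: float, beta: float) -> float:
    """UCB score: positive beta rewards uncertainty, negative beta penalizes it."""
    return mean + beta * std


for beta in (2.0, 0.0, -2.0):
    ranked = sorted(candidates, key=lambda c: ucb(**candidates[c], beta=beta), reverse=True)
    print(f"beta={beta:+.1f} -> preferred candidate: {ranked[0]}")

# beta=+2.0 prefers "uncertain" (exploration), beta=0.0 ranks by mean only,
# and beta=-2.0 prefers "safe" (risk-averse exploitation).
```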
58 changes: 16 additions & 42 deletions baybe/campaign.py
@@ -42,8 +42,9 @@
)
from baybe.utils.basic import UNSPECIFIED, UnspecifiedType, is_all_instance
from baybe.utils.boolean import eq_dataframe
from baybe.utils.dataframe import filter_df, fuzzy_row_match
from baybe.utils.dataframe import _ValidatedDataFrame, filter_df, fuzzy_row_match
from baybe.utils.plotting import to_string
from baybe.utils.validation import validate_parameter_input, validate_target_input

if TYPE_CHECKING:
from botorch.posteriors import Posterior
@@ -264,48 +265,25 @@ def add_measurements(
Each addition of data is considered a new batch. Added results are checked for
validity. Categorical values need to have an exact match. For numerical values,
a campaign flag determines if values that lie outside a specified tolerance
are accepted.
Note that this modifies the provided data in-place.
are accepted. Possible validation exceptions are documented in
:func:`baybe.utils.validation.validate_target_input` and
:func:`baybe.utils.validation.validate_parameter_input`.

Args:
data: The data to be added (with filled values for targets). Preferably
created via :func:`baybe.campaign.Campaign.recommend`.
numerical_measurements_must_be_within_tolerance: Flag indicating if
numerical parameters need to be within their tolerances.

Raises:
ValueError: If one of the targets has missing values or NaNs in the provided
dataframe.
TypeError: If the target has non-numeric entries in the provided dataframe.
"""
# Invalidate recommendation cache first (in case of uncaught exceptions below)
self._cached_recommendation = pd.DataFrame()

# Check if all targets have valid values
for target in self.targets:
if data[target.name].isna().any():
raise ValueError(
f"The target '{target.name}' has missing values or NaNs in the "
f"provided dataframe. Missing target values are not supported."
)
if data[target.name].dtype.kind not in "iufb":
raise TypeError(
f"The target '{target.name}' has non-numeric entries in the "
f"provided dataframe. Non-numeric target values are not supported."
)

# Check if all targets have valid values
for param in self.parameters:
if data[param.name].isna().any():
raise ValueError(
f"The parameter '{param.name}' has missing values or NaNs in the "
f"provided dataframe. Missing parameter values are not supported."
)
if param.is_numerical and (data[param.name].dtype.kind not in "iufb"):
raise TypeError(
f"The numerical parameter '{param.name}' has non-numeric entries in"
f" the provided dataframe."
)
# Validate target and parameter input values
validate_target_input(data, self.targets)
validate_parameter_input(
data, self.parameters, numerical_measurements_must_be_within_tolerance
)
data.__class__ = _ValidatedDataFrame

# Read in measurements and add them to the database
self.n_batches_done += 1
@@ -320,20 +298,14 @@
# Update metadata
if self.searchspace.type in (SearchSpaceType.DISCRETE, SearchSpaceType.HYBRID):
idxs_matched = fuzzy_row_match(
self.searchspace.discrete.exp_rep,
data,
self.parameters,
numerical_measurements_must_be_within_tolerance,
self.searchspace.discrete.exp_rep, data, self.parameters
)
self._searchspace_metadata.loc[idxs_matched, _MEASURED] = True

# Telemetry
telemetry_record_value(TELEM_LABELS["COUNT_ADD_RESULTS"], 1)
telemetry_record_recommended_measurement_percentage(
self._cached_recommendation,
data,
self.parameters,
numerical_measurements_must_be_within_tolerance,
self._cached_recommendation, data, self.parameters
)

def toggle_discrete_candidates( # noqa: DOC501
@@ -423,8 +395,10 @@ def recommend(
)

# Invalidate cached recommendation if pending experiments are provided
if (pending_experiments is not None) and (len(pending_experiments) > 0):
if (pending_experiments is not None) and not pending_experiments.empty:
self._cached_recommendation = pd.DataFrame()
validate_parameter_input(pending_experiments, self.parameters)
pending_experiments.__class__ = _ValidatedDataFrame

# If there are cached recommendations and the batch size of those is equal to
# the previously requested one, we just return those
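The refactor above replaces the inline checks with shared validation utilities and a marker class. Below is a minimal sketch of that validate-once pattern, reconstructed from the removed inline checks; the real implementations live in `baybe.utils.validation` and `baybe.utils.dataframe` and may differ in detail.

```python
# Sketch of the "validate once, then mark" pattern used in the diff.
# The class and function bodies below are assumptions modeled on the removed
# inline checks, not the actual BayBE implementations.
import pandas as pd


class _ValidatedDataFrame(pd.DataFrame):
    """Marker subclass signaling that a dataframe has already been validated."""


def validate_target_input(data: pd.DataFrame, targets) -> None:
    """Raise if any target column contains missing or non-numeric values."""
    for target in targets:
        if data[target.name].isna().any():
            raise ValueError(f"Target '{target.name}' has missing values or NaNs.")
        if data[target.name].dtype.kind not in "iufb":
            raise TypeError(f"Target '{target.name}' has non-numeric entries.")


def add_measurements(data: pd.DataFrame, targets) -> None:
    """Campaign-side entry point: validate, then re-tag the instance in-place."""
    validate_target_input(data, targets)
    data.__class__ = _ValidatedDataFrame  # downstream code can now skip re-validation


def recommend(measurements: pd.DataFrame, targets) -> None:
    """Recommender-side entry point: validate only if not already marked."""
    if not isinstance(measurements, _ValidatedDataFrame):
        validate_target_input(measurements, targets)
        measurements.__class__ = _ValidatedDataFrame
```

Reassigning `__class__` keeps the original object and its data intact while letting later `isinstance` checks detect that validation already happened, which is why the docstring now notes that the provided data is modified in-place.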
7 changes: 7 additions & 0 deletions baybe/kernels/basic.py
@@ -6,6 +6,7 @@
from attrs.converters import optional as optional_c
from attrs.validators import ge, gt, in_, instance_of
from attrs.validators import optional as optional_v
from gpytorch.constraints import Interval
from typing_extensions import override

from baybe.kernels.base import BasicKernel
@@ -180,6 +181,12 @@ class RBFKernel(BasicKernel):
)
"""An optional initial value for the kernel lengthscale."""

# TODO replace with baybe constraint if possible
lengthscale_constraint: Interval | None = field(
default=None, validator=optional_v(instance_of(Interval))
)
"""An optional prior on the kernel lengthscale constraint."""


@define(frozen=True)
class RFFKernel(BasicKernel):
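A brief usage sketch for the new `lengthscale_constraint` field. The bound values are arbitrary and only illustrate passing a GPyTorch constraint; this snippet is not taken from the PR's tests.

```python
# Hypothetical usage of the new field; the numeric bounds are illustrative only.
from gpytorch.constraints import GreaterThan, Interval

from baybe.kernels.basic import RBFKernel
from baybe.priors.basic import GammaPrior

# Lower-bounded lengthscale, mirroring the BoTorch preset further below:
kernel_lb = RBFKernel(lengthscale_constraint=GreaterThan(2.5e-2))

# Two-sided constraint combined with a lengthscale prior:
kernel_box = RBFKernel(
    lengthscale_prior=GammaPrior(3.0, 6.0),
    lengthscale_constraint=Interval(1e-2, 1e2),
)
```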
21 changes: 21 additions & 0 deletions baybe/recommenders/pure/base.py
@@ -15,6 +15,8 @@
from baybe.searchspace.continuous import SubspaceContinuous
from baybe.searchspace.core import SearchSpaceType
from baybe.searchspace.discrete import SubspaceDiscrete
from baybe.utils.dataframe import _ValidatedDataFrame
from baybe.utils.validation import validate_parameter_input, validate_target_input

_DEPRECATION_ERROR_MESSAGE = (
"The attribute '{}' is no longer available for recommenders. "
@@ -96,6 +98,25 @@ def recommend(
measurements: pd.DataFrame | None = None,
pending_experiments: pd.DataFrame | None = None,
) -> pd.DataFrame:
# Validation
if (
measurements is not None
and not isinstance(measurements, _ValidatedDataFrame)
and not measurements.empty
and objective is not None
and searchspace is not None
):
validate_target_input(measurements, objective.targets)
validate_parameter_input(measurements, searchspace.parameters)
measurements.__class__ = _ValidatedDataFrame
if (
pending_experiments is not None
and not isinstance(pending_experiments, _ValidatedDataFrame)
and searchspace is not None
):
validate_parameter_input(pending_experiments, searchspace.parameters)
pending_experiments.__class__ = _ValidatedDataFrame

if searchspace.type is SearchSpaceType.CONTINUOUS:
return self._recommend_continuous(
subspace_continuous=searchspace.continuous, batch_size=batch_size
14 changes: 13 additions & 1 deletion baybe/recommenders/pure/bayesian/base.py
@@ -17,6 +17,8 @@
from baybe.searchspace import SearchSpace
from baybe.surrogates import CustomONNXSurrogate, GaussianProcessSurrogate
from baybe.surrogates.base import IndependentGaussianSurrogate, SurrogateProtocol
from baybe.utils.dataframe import _ValidatedDataFrame
from baybe.utils.validation import validate_parameter_input, validate_target_input


@define
@@ -104,11 +106,21 @@ def recommend(
f"that an objective is specified."
)

if (measurements is None) or (len(measurements) == 0):
# Experimental input validation
if (measurements is None) or measurements.empty:
raise NotImplementedError(
f"Recommenders of type '{BayesianRecommender.__name__}' do not support "
f"empty training data."
)
if not isinstance(measurements, _ValidatedDataFrame):
validate_target_input(measurements, objective.targets)
validate_parameter_input(measurements, searchspace.parameters)
measurements.__class__ = _ValidatedDataFrame
if pending_experiments is not None and not isinstance(
pending_experiments, _ValidatedDataFrame
):
validate_parameter_input(pending_experiments, searchspace.parameters)
pending_experiments.__class__ = _ValidatedDataFrame

if (
isinstance(self._surrogate_model, IndependentGaussianSurrogate)
2 changes: 1 addition & 1 deletion baybe/recommenders/pure/nonpredictive/base.py
@@ -35,7 +35,7 @@ def recommend(
f"experiments from the candidate set, adjust the search space "
f"accordingly."
)
if (measurements is not None) and (len(measurements) != 0):
if (measurements is not None) and not measurements.empty:
warnings.warn(
f"'{self.recommend.__name__}' was called with a non-empty "
f"set of measurements but '{self.__class__.__name__}' does not "
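Since commit 4ed31a0 switches this message from a logger call to `warnings.warn`, callers can now filter or escalate it with the standard warnings machinery. A small sketch of that difference follows; the function name is hypothetical.

```python
# Sketch: a warning, unlike a log record, can be silenced or escalated by the caller.
import warnings


def recommend_sketch() -> None:
    # Hypothetical stand-in for a nonpredictive recommender receiving measurements
    warnings.warn("Measurements were provided but will be ignored.", UserWarning)


with warnings.catch_warnings():
    warnings.simplefilter("ignore", UserWarning)  # silence in production scripts
    recommend_sketch()

with warnings.catch_warnings():
    warnings.simplefilter("error", UserWarning)   # escalate to an exception in tests
    try:
        recommend_sketch()
    except UserWarning as exc:
        print(f"caught: {exc}")
```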
2 changes: 1 addition & 1 deletion baybe/simulation/core.py
@@ -118,7 +118,7 @@ def simulate_experiment(
campaign = deepcopy(campaign)

# Add the initial data
if initial_data is not None:
if (initial_data is not None) and not initial_data.empty:
campaign.add_measurements(initial_data)

# For impute_mode 'ignore', do not recommend space entries that are not
2 changes: 2 additions & 0 deletions baybe/surrogates/gaussian_process/presets/__init__.py
@@ -1,5 +1,6 @@
"""Gaussian process surrogate presets."""

from baybe.surrogates.gaussian_process.presets.botorch import BotorchKernelFactory
from baybe.surrogates.gaussian_process.presets.core import (
GaussianProcessPreset,
make_gp_from_preset,
@@ -10,6 +11,7 @@
__all__ = [
"DefaultKernelFactory",
"EDBOKernelFactory",
"BotorchKernelFactory",
"make_gp_from_preset",
"GaussianProcessPreset",
]
56 changes: 56 additions & 0 deletions baybe/surrogates/gaussian_process/presets/botorch.py
@@ -0,0 +1,56 @@
"""Presets adapted from BoTorch."""

from __future__ import annotations

from math import log, sqrt
from typing import TYPE_CHECKING

from attrs import define
from gpytorch.constraints import GreaterThan
from typing_extensions import override

from baybe.kernels.basic import RBFKernel
from baybe.parameters import TaskParameter
from baybe.priors.basic import LogNormalPrior
from baybe.searchspace import SearchSpace
from baybe.surrogates.gaussian_process.kernel_factory import KernelFactory

if TYPE_CHECKING:
from torch import Tensor

from baybe.kernels.base import Kernel


@define
class BotorchKernelFactory(KernelFactory):
"""A kernel factory for Gaussian process surrogates adapted from BoTorch.

References:
* https://github.com/pytorch/botorch/blob/a018a5ffbcbface6229d6c39f7ac6ef9baf5765e/botorch/models/multitask.py#L220
* https://github.com/pytorch/botorch/blob/a018a5ffbcbface6229d6c39f7ac6ef9baf5765e/botorch/models/utils/gpytorch_modules.py#L100

"""

@override
def __call__(
self, searchspace: SearchSpace, train_x: Tensor, train_y: Tensor
) -> Kernel:
ard_num_dims = train_x.shape[-1] - len(
[
param
for param in searchspace.discrete.parameters
if isinstance(param, TaskParameter)
]
)
lengthscale_prior = LogNormalPrior(
loc=sqrt(2) + log(ard_num_dims) * 0.5, scale=sqrt(3)
)

return RBFKernel(
lengthscale_prior=lengthscale_prior,
lengthscale_constraint=GreaterThan(
2.5e-2,
transform=None,
initial_value=lengthscale_prior.to_gpytorch().mode,
),
)
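The "dimensions-scaled" aspect of the preset can be made concrete with a short numeric sketch: the log-normal location `sqrt(2) + 0.5 * log(d)` places the prior median lengthscale at `exp(sqrt(2)) * sqrt(d)`, so the preferred lengthscale grows with the square root of the number of non-task dimensions (the `scale=sqrt(3)` term only widens the spread around that median).

```python
# Median lengthscale implied by the prior LogNormal(sqrt(2) + 0.5 * log(d), sqrt(3)).
# The median of a log-normal distribution is exp(loc), independent of the scale.
from math import exp, log, sqrt

for d in (1, 2, 5, 10, 50, 100):
    loc = sqrt(2) + 0.5 * log(d)
    median_lengthscale = exp(loc)  # equals exp(sqrt(2)) * sqrt(d)
    print(f"d={d:>3}: median lengthscale ~ {median_lengthscale:.2f}")

# d=1 -> ~4.11, d=10 -> ~13.00, d=100 -> ~41.13: longer lengthscales
# (smoother functions) are favored as the input dimension grows.
```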
23 changes: 4 additions & 19 deletions baybe/telemetry.py
@@ -214,7 +214,6 @@ def telemetry_record_recommended_measurement_percentage(
cached_recommendation: pd.DataFrame,
measurements: pd.DataFrame,
parameters: Sequence[Parameter],
numerical_measurements_must_be_within_tolerance: bool,
) -> None:
"""Submit the percentage of added measurements.

@@ -232,31 +231,17 @@
measurements: The measurements which are supposed to be checked against cached
recommendations.
parameters: The list of parameters spanning the entire search space.
numerical_measurements_must_be_within_tolerance: If ``True``, numerical
parameter entries are matched with the reference elements only if there is
a match within the parameter tolerance. If ``False``, the closest match
is considered, irrespective of the distance.
"""
if is_enabled():
if len(cached_recommendation) > 0:
if cached_recommendation.empty:
_submit_scalar_value(TELEM_LABELS["NAKED_INITIAL_MEASUREMENTS"], 1)
else:
recommended_measurements_percentage = (
len(
fuzzy_row_match(
cached_recommendation,
measurements,
parameters,
numerical_measurements_must_be_within_tolerance,
)
)
len(fuzzy_row_match(cached_recommendation, measurements, parameters))
/ len(cached_recommendation)
* 100.0
)
_submit_scalar_value(
TELEM_LABELS["RECOMMENDED_MEASUREMENTS_PERCENTAGE"],
recommended_measurements_percentage,
)
else:
_submit_scalar_value(
TELEM_LABELS["NAKED_INITIAL_MEASUREMENTS"],
1,
)