diff --git a/.gitignore b/.gitignore
index f91d209dc..febccef61 100644
--- a/.gitignore
+++ b/.gitignore
@@ -162,3 +162,5 @@ notebook_test_stats.csv
 **/*.quarto_ipynb
 
 **/.jupyter_cache
+
+scripts/*
diff --git a/CHANGELOG.md b/CHANGELOG.md
index cd2151ae2..34eced53b 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -18,6 +18,8 @@ and this project adheres to [Pragmatic Versioning](https://github.com/experiment
 - Configurable `noise_constraint` support for GP-based surrogates (`SingleTaskGP`, `MixedSingleTaskGP`, `TanimotoGP`, `MultiTaskGP`, and `RobustSingleTaskGP`) and corresponding linear/polynomial wrappers.
 - Optional `initial_value` field on the `GreaterThan`, `LessThan`, and `Positive` prior constraint data models (already present on `Interval`), letting users opt-in to a warm-start of the constrained gpytorch parameter at construction time.
 - Generalized NChooseK constraint support in DoE: `min_count > 0` is now supported, non-zero lower bounds (`lb > 0`) are allowed for NChooseK features, overlapping NChooseK constraints (shared features) are handled via incremental pairwise merge with consistency filtering, and `nchoosek_constraints_as_bounds` generates deactivation patterns for all activity levels `k ∈ [min_count, max_count]`.
+- `PairwiseGPSurrogate`, a Gaussian process surrogate that learns a latent utility function from pairwise preference/comparison data, wrapping BoTorch's `PairwiseGP`. The pairwise likelihood is selectable via `likelihood="probit"` (default) or `"logit"`.
+- `SmoothedBoxPrior` prior, and a concrete instantiable `Interval` prior constraint.
 - Aggregation of duplicated experiments in the `cross_validate` method of trainable surrogates to avoid data leakage, controlled via the `aggregate` boolean flag, default `False`.
 
 ### Changed
diff --git a/_quarto.yml b/_quarto.yml
index 3a3478c4c..16fd1c03b 100644
--- a/_quarto.yml
+++ b/_quarto.yml
@@ -222,6 +222,7 @@ website:
             - docs/tutorials/advanced_examples/transfer_learning_bo.qmd
             - docs/tutorials/advanced_examples/octane_number.qmd
             - docs/tutorials/advanced_examples/llm_molecular.qmd
+            - docs/tutorials/advanced_examples/pairwise_gp.qmd
         - section: "Benchmarks"
           contents:
             - docs/tutorials/benchmarks/index.qmd
diff --git a/bofire/data_models/priors/api.py b/bofire/data_models/priors/api.py
index 7bdec0ffd..2c0db0772 100644
--- a/bofire/data_models/priors/api.py
+++ b/bofire/data_models/priors/api.py
@@ -23,6 +23,7 @@
     NormalPrior,
 )
 from bofire.data_models.priors.prior import Prior
+from bofire.data_models.priors.smoothedbox import SmoothedBoxPrior
 from bofire.data_models.unions import tagged_union
 
 
@@ -32,11 +33,13 @@
     LKJPrior,
     LogNormalPrior,
     DimensionalityScaledLogNormalPrior,
+    SmoothedBoxPrior,
 ]
 
 AnyPrior = tagged_union(*_PRIOR_TYPES)
 
 _PRIOR_CONSTRAINT_TYPES: list[type] = [
+    Interval,
     NonTransformedInterval,
     LogTransformedInterval,
     Positive,
@@ -49,7 +52,9 @@
 
 # these are priors that are generally applicable
 # and do not depend on problem specific extra parameters
-AnyGeneralPrior = tagged_union(GammaPrior, NormalPrior, LKJPrior, LogNormalPrior)
+AnyGeneralPrior = tagged_union(
+    GammaPrior, NormalPrior, LKJPrior, LogNormalPrior, SmoothedBoxPrior
+)
 
 # default priors of interest
 # botorch defaults
@@ -72,7 +77,7 @@
     sd_prior=GammaPrior(concentration=2.0, rate=0.15),
 )
 
-# prior for RobustSingleTaskGPSurrogate
+# priors for RobustSingleTaskGPSurrogate
 ROBUSTGP_LENGTHSCALE_CONSTRAINT = partial(
     NonTransformedInterval,
     lower_bound=0.05,
@@ -87,6 +92,34 @@
     initial_value=0.1,
 )
 
+
+# Priors for PairwiseGPSurrogate based on botorch defaults
+PAIRWISEGP_LENGTHSCALE_PRIOR = partial(
+    GammaPrior,
+    concentration=2.4,
+    rate=2.7,
+)
+
+PAIRWISEGP_LENGTHSCALE_CONSTRAINT = partial(
+    GreaterThan,
+    lower_bound=1e-4,
+    initial_value=0.5185,  # mode of the lengthscale GammaPrior(2.4, 2.7)
+)
+
+PAIRWISEGP_OUTPUTSCALE_PRIOR = partial(
+    SmoothedBoxPrior,
+    lower_bound=0.01,
+    upper_bound=100,
+    sigma=0.01,
+)
+
+PAIRWISEGP_OUTPUTSCALE_CONSTRAINT = partial(
+    Interval,
+    lower_bound=5e-3,
+    upper_bound=200,
+    initial_value=1,
+)
+
 # Hvarfner priors
 HVARFNER_NOISE_PRIOR = partial(LogNormalPrior, loc=-4, scale=1)
 HVARFNER_LENGTHSCALE_PRIOR = DimensionalityScaledLogNormalPrior
diff --git a/bofire/data_models/priors/interval.py b/bofire/data_models/priors/interval.py
index 7fafc8699..3295b24fb 100644
--- a/bofire/data_models/priors/interval.py
+++ b/bofire/data_models/priors/interval.py
@@ -1,4 +1,4 @@
-from typing import Any, Literal, Optional
+from typing import Literal, Optional
 
 from pydantic import PositiveFloat, model_validator
 
@@ -6,7 +6,7 @@
 
 
 class Interval(PriorConstraint):
-    """Abstract Interval class.
+    """Interval constraint on a GP hyperparameter.
 
     It is used to define interval constraints on GP hyperparameters.
 
@@ -19,7 +19,7 @@ class Interval(PriorConstraint):
             the raw parameter at its default (no warm-start).
     """
 
-    type: Any
+    type: Literal["Interval"] = "Interval"
     lower_bound: PositiveFloat
     upper_bound: PositiveFloat
     initial_value: Optional[PositiveFloat] = None
diff --git a/bofire/data_models/priors/smoothedbox.py b/bofire/data_models/priors/smoothedbox.py
new file mode 100644
index 000000000..34189f58e
--- /dev/null
+++ b/bofire/data_models/priors/smoothedbox.py
@@ -0,0 +1,37 @@
+from typing import Literal
+
+from pydantic import PositiveFloat, model_validator
+
+from bofire.data_models.priors.prior import Prior
+
+
+class SmoothedBoxPrior(Prior):
+    """A smoothed approximation of a uniform prior.
+
+    .. math::
+
+    \begin{equation*}
+        B = {x: a_i <= x_i <= b_i}
+        d(x, B) = min_{x' in B} |x - x'|
+        pdf(x) \\sim exp(- d(x, B)**2 / sqrt(2 * sigma^2))
+    \\end{equation*}
+
+    Attributes:
+        lower_bound: lower bound of the uniform prior
+        upper_bound: upper bound of the uniform prior
+        sigma: related to pdf(x)
+
+    """
+
+    type: Literal["SmoothedBoxPrior"] = "SmoothedBoxPrior"
+    lower_bound: float
+    upper_bound: float
+    sigma: PositiveFloat = 0.01
+
+    @model_validator(mode="after")
+    def validate_bounds(self):
+        if self.lower_bound >= self.upper_bound:
+            raise ValueError(
+                "The lower bound must be less than the upper bound for an interval."
+            )
+        return self
diff --git a/bofire/data_models/surrogates/api.py b/bofire/data_models/surrogates/api.py
index 062c1b18a..892d5b8fb 100644
--- a/bofire/data_models/surrogates/api.py
+++ b/bofire/data_models/surrogates/api.py
@@ -31,6 +31,7 @@
     MultiTaskGPHyperconfig,
     MultiTaskGPSurrogate,
 )
+from bofire.data_models.surrogates.pairwise_gp import PairwiseGPSurrogate
 from bofire.data_models.surrogates.polynomial import PolynomialSurrogate
 from bofire.data_models.surrogates.random_forest import RandomForestSurrogate
 from bofire.data_models.surrogates.robust_single_task_gp import (
@@ -72,6 +73,7 @@
     PiecewiseLinearGPSurrogate,
     AdditiveMapSaasSingleTaskGPSurrogate,
     EnsembleMapSaasSingleTaskGPSurrogate,
+    PairwiseGPSurrogate,
 )
 
 AnyTrainableSurrogate = tagged_union(
diff --git a/bofire/data_models/surrogates/botorch.py b/bofire/data_models/surrogates/botorch.py
index e36d99cc4..2f0e129f8 100644
--- a/bofire/data_models/surrogates/botorch.py
+++ b/bofire/data_models/surrogates/botorch.py
@@ -1,7 +1,8 @@
 from typing import Type
 
-from pydantic import Field, field_validator
+from pydantic import Field, field_validator, model_validator
 
+from bofire.data_models.domain.api import EngineeredFeatures
 from bofire.data_models.domain.features import Inputs
 from bofire.data_models.enum import CategoricalEncodingEnum
 from bofire.data_models.features.api import (
@@ -37,6 +38,9 @@ class BotorchSurrogate(Surrogate):
     categorical_encodings: InputTransformSpecs = Field(
         default_factory=dict, validate_default=True
     )
+    engineered_features: EngineeredFeatures = Field(
+        default_factory=lambda: EngineeredFeatures()
+    )
 
     @field_validator("input_preprocessing_specs")
     @classmethod
@@ -99,3 +103,8 @@ def validate_categorical_encodings(cls, v, info):
         v = cls._generate_default_categorical_encodings(inputs, v)
         inputs._validate_transform_specs(v)
         return v
+
+    @model_validator(mode="after")
+    def validate_engineered_features(self):
+        self.engineered_features.validate_inputs(self.inputs)
+        return self
diff --git a/bofire/data_models/surrogates/deterministic.py b/bofire/data_models/surrogates/deterministic.py
index 31cee404e..02abb8b03 100644
--- a/bofire/data_models/surrogates/deterministic.py
+++ b/bofire/data_models/surrogates/deterministic.py
@@ -2,7 +2,6 @@
 
 from pydantic import Field, field_validator, model_validator
 
-from bofire.data_models.domain.api import EngineeredFeatures
 from bofire.data_models.features.api import (
     AnyOutput,
     CategoricalInput,
@@ -67,9 +66,6 @@ def validate_mapping(self):
 
 class LinearDeterministicSurrogate(BotorchSurrogate):
     type: Literal["LinearDeterministicSurrogate"] = "LinearDeterministicSurrogate"
-    engineered_features: EngineeredFeatures = Field(
-        default_factory=lambda: EngineeredFeatures()
-    )
     coefficients: Annotated[Dict[str, float], Field(min_length=1)]
     intercept: float
 
@@ -93,7 +89,7 @@ def validate_input_types(self):
 
     @field_validator("engineered_features")
     @classmethod
-    def validate_engineered_features(cls, engineered_features, info):
+    def validate_linear_engineered_features(cls, engineered_features, info):
         for feat in engineered_features.get():
             if feat.n_transformed_inputs != 1:
                 raise ValueError(
diff --git a/bofire/data_models/surrogates/pairwise_gp.py b/bofire/data_models/surrogates/pairwise_gp.py
new file mode 100644
index 000000000..3ba3c2a66
--- /dev/null
+++ b/bofire/data_models/surrogates/pairwise_gp.py
@@ -0,0 +1,68 @@
+from typing import Literal, Type
+
+from pydantic import Field, model_validator
+
+from bofire.data_models.features.api import AnyOutput, ContinuousOutput
+from bofire.data_models.kernels.api import AnyKernel, RBFKernel, ScaleKernel
+from bofire.data_models.priors.api import (
+    PAIRWISEGP_LENGTHSCALE_CONSTRAINT,
+    PAIRWISEGP_LENGTHSCALE_PRIOR,
+    PAIRWISEGP_OUTPUTSCALE_CONSTRAINT,
+    PAIRWISEGP_OUTPUTSCALE_PRIOR,
+)
+from bofire.data_models.surrogates.botorch import BotorchSurrogate
+from bofire.data_models.surrogates.scaler import AnyScaler, Normalize
+from bofire.data_models.surrogates.trainable import TrainableSurrogate
+
+
+class PairwiseGPSurrogate(BotorchSurrogate, TrainableSurrogate):
+    """Pairwise Gaussian Process surrogate built on top of BoTorch's PairwiseGP.
+
+    Fits a latent utility function from binary winner/loser pair labels. The
+    `preferences` DataFrame references rows of the standard BoFire `experiments`
+    DataFrame by `labcode`; the single output feature represents the latent
+    utility inferred from those comparisons.
+
+    Attributes:
+        likelihood: The pairwise likelihood linking latent-utility differences
+            to preference probabilities -- ``"probit"`` (Gaussian comparison
+            noise, BoTorch's default) or ``"logit"`` (logistic noise, i.e. the
+            Bradley-Terry model).
+    """
+
+    type: Literal["PairwiseGPSurrogate"] = "PairwiseGPSurrogate"
+
+    kernel: AnyKernel = Field(
+        default_factory=lambda: ScaleKernel(
+            base_kernel=RBFKernel(
+                ard=True,
+                lengthscale_prior=PAIRWISEGP_LENGTHSCALE_PRIOR(),
+                lengthscale_constraint=PAIRWISEGP_LENGTHSCALE_CONSTRAINT(),
+            ),
+            outputscale_prior=PAIRWISEGP_OUTPUTSCALE_PRIOR(),
+            outputscale_constraint=PAIRWISEGP_OUTPUTSCALE_CONSTRAINT(),
+        )
+    )
+    scaler: AnyScaler = Field(default_factory=Normalize)
+    likelihood: Literal["probit", "logit"] = "probit"
+
+    @classmethod
+    def is_output_implemented(cls, my_type: Type[AnyOutput]) -> bool:
+        return isinstance(my_type, type(ContinuousOutput))
+
+    @model_validator(mode="after")
+    def validate_single_output(self):
+        if len(self.outputs) != 1:
+            raise ValueError(
+                "PairwiseGPSurrogate supports exactly one output (the latent utility)."
+            )
+        return self
+
+    @model_validator(mode="after")
+    def validate_scalekernel(self):
+        if not isinstance(self.kernel, ScaleKernel):
+            raise ValueError(
+                "PairwiseGPSurrogate.kernel must be a ScaleKernel "
+                "(BoTorch's PairwiseGP requires the covariance module to be a ScaleKernel)."
+            )
+        return self
diff --git a/bofire/data_models/surrogates/trainable.py b/bofire/data_models/surrogates/trainable.py
index 631b0ddcd..1fd722beb 100644
--- a/bofire/data_models/surrogates/trainable.py
+++ b/bofire/data_models/surrogates/trainable.py
@@ -1,10 +1,10 @@
 from typing import Annotated, Any, Literal, Optional
 
 import pandas as pd
-from pydantic import Field, field_validator, model_validator
+from pydantic import Field, field_validator
 
 from bofire.data_models.base import BaseModel
-from bofire.data_models.domain.api import Domain, EngineeredFeatures, Inputs, Outputs
+from bofire.data_models.domain.api import Domain, Inputs, Outputs
 from bofire.data_models.enum import RegressionMetricsEnum, UQRegressionMetricsEnum
 from bofire.data_models.features.api import ContinuousOutput
 from bofire.data_models.objectives.api import MaximizeObjective, MinimizeObjective
@@ -76,16 +76,6 @@ def _update_hyperparameters(surrogate_data, hyperparameters: pd.Series):
 
 class TrainableSurrogate(BaseModel):
     hyperconfig: Optional[Hyperconfig] = None
-    engineered_features: EngineeredFeatures = Field(
-        default_factory=lambda: EngineeredFeatures()
-    )
-
-    @model_validator(mode="after")
-    def validate_aggregations(self):
-        self.engineered_features.validate_inputs(
-            self.inputs  # ty: ignore[unresolved-attribute]
-        )
-        return self
 
     def update_hyperparameters(self, hyperparameters: pd.Series):
         if self.hyperconfig is not None:
diff --git a/bofire/priors/mapper.py b/bofire/priors/mapper.py
index f0b2a5eb9..d2f3d8e91 100644
--- a/bofire/priors/mapper.py
+++ b/bofire/priors/mapper.py
@@ -103,6 +103,25 @@ def map_DimensionalityScaledLogNormalPrior(
     )
 
 
+def map_SmoothedBoxPrior(
+    data_model: data_models.SmoothedBoxPrior,
+    **kwargs,
+) -> gpytorch.priors.smoothed_box_prior.SmoothedBoxPrior:
+    return gpytorch.priors.smoothed_box_prior.SmoothedBoxPrior(
+        a=data_model.lower_bound, b=data_model.upper_bound, sigma=data_model.sigma
+    )
+
+
+def map_Interval(
+    data_model: data_models.Interval,
+) -> gpytorch.constraints.Interval:
+    return gpytorch.constraints.Interval(
+        lower_bound=data_model.lower_bound,
+        upper_bound=data_model.upper_bound,
+        initial_value=data_model.initial_value,
+    )
+
+
 def map_NonTransformedInterval(
     data_model: data_models.NonTransformedInterval,
 ) -> NonTransformedInterval:
@@ -155,6 +174,8 @@ def map_LessThan(
     data_models.LKJPrior: map_LKJPrior,
     data_models.LogNormalPrior: map_LogNormalPrior,
     data_models.DimensionalityScaledLogNormalPrior: map_DimensionalityScaledLogNormalPrior,
+    data_models.SmoothedBoxPrior: map_SmoothedBoxPrior,
+    data_models.Interval: map_Interval,
     data_models.NonTransformedInterval: map_NonTransformedInterval,
     data_models.LogTransformedInterval: map_LogTransformedInterval,
     data_models.Positive: map_Positive,
diff --git a/bofire/surrogates/api.py b/bofire/surrogates/api.py
index dd3b149df..100c2f2f0 100644
--- a/bofire/surrogates/api.py
+++ b/bofire/surrogates/api.py
@@ -12,6 +12,8 @@
     RegressionMLPEnsemble,
 )
 from bofire.surrogates.multi_task_gp import MultiTaskGPSurrogate
+from bofire.surrogates.pairwise_gp import PairwiseGPSurrogate
+from bofire.surrogates.pairwise_trainable import PairwiseTrainableSurrogate
 from bofire.surrogates.random_forest import RandomForestSurrogate
 from bofire.surrogates.shape import PiecewiseLinearGPSurrogate
 from bofire.surrogates.single_task_gp import SingleTaskGPSurrogate
diff --git a/bofire/surrogates/botorch.py b/bofire/surrogates/botorch.py
index 83dc18187..28bf250b8 100644
--- a/bofire/surrogates/botorch.py
+++ b/bofire/surrogates/botorch.py
@@ -40,6 +40,7 @@ def __init__(
         self.categorical_encodings: InputTransformSpecs = (
             data_model.categorical_encodings
         )
+        self.engineered_features = data_model.engineered_features
         super().__init__(data_model=data_model, **kwargs)
 
     def _predict(self, transformed_X: pd.DataFrame):
@@ -101,24 +102,6 @@ def loads(self, data: str):
         buffer = io.BytesIO(base64.b64decode(data.encode()))
         self.model = torch.load(buffer, weights_only=False)
 
-
-class TrainableBotorchSurrogate(BotorchSurrogate, TrainableSurrogate):
-    def __init__(
-        self,
-        data_model: TrainableDataModel,
-        input_transform: Optional[Union[InputTransform, None]] = None,
-        **kwargs,
-    ):
-        self.scaler = data_model.scaler
-        self.output_scaler = data_model.output_scaler
-        self.engineered_features = data_model.engineered_features
-        self._input_transform: Union[InputTransform, None] = input_transform
-        super().__init__(data_model=data_model, **kwargs)
-
-    @property
-    def re_init_kwargs(self) -> dict:
-        return {"input_transform": self._input_transform}
-
     def get_feature_indices(
         self,
         feature_keys: List[str],
@@ -126,6 +109,10 @@ def get_feature_indices(
         """Returns the indices of the specified features (both original and engineered)
         after applying all input transforms.
 
+        Used as the ``features_to_idx_mapper`` when mapping feature-specific
+        kernels, so a kernel restricted to a subset of feature keys can resolve
+        them to tensor column indices.
+
         Args:
             feature_keys: The feature keys to get the indices for.
 
@@ -160,6 +147,23 @@ def get_feature_indices(
         )
         return indices
 
+
+class TrainableBotorchSurrogate(BotorchSurrogate, TrainableSurrogate):
+    def __init__(
+        self,
+        data_model: TrainableDataModel,
+        input_transform: Optional[Union[InputTransform, None]] = None,
+        **kwargs,
+    ):
+        self.scaler = data_model.scaler
+        self.output_scaler = data_model.output_scaler
+        self._input_transform: Union[InputTransform, None] = input_transform
+        super().__init__(data_model=data_model, **kwargs)
+
+    @property
+    def re_init_kwargs(self) -> dict:
+        return {"input_transform": self._input_transform}
+
     def _fit(self, X: pd.DataFrame, Y: pd.DataFrame, **kwargs):
         if self._input_transform is None:
             self._input_transform = get_input_transform(
diff --git a/bofire/surrogates/deterministic.py b/bofire/surrogates/deterministic.py
index 3871534c2..217134279 100644
--- a/bofire/surrogates/deterministic.py
+++ b/bofire/surrogates/deterministic.py
@@ -22,7 +22,6 @@ def __init__(
     ):
         self.intercept = data_model.intercept
         self.coefficients = data_model.coefficients
-        self.engineered_features = data_model.engineered_features
         super().__init__(data_model=data_model, **kwargs)
         self.model = AffineDeterministicModel(
             b=data_model.intercept,
diff --git a/bofire/surrogates/mapper.py b/bofire/surrogates/mapper.py
index 8270b070e..895f4bd74 100644
--- a/bofire/surrogates/mapper.py
+++ b/bofire/surrogates/mapper.py
@@ -18,6 +18,7 @@
 )
 from bofire.surrogates.mlp import ClassificationMLPEnsemble, RegressionMLPEnsemble
 from bofire.surrogates.multi_task_gp import MultiTaskGPSurrogate
+from bofire.surrogates.pairwise_gp import PairwiseGPSurrogate
 from bofire.surrogates.random_forest import RandomForestSurrogate
 from bofire.surrogates.robust_single_task_gp import RobustSingleTaskGPSurrogate
 from bofire.surrogates.shape import PiecewiseLinearGPSurrogate
@@ -99,6 +100,7 @@ def map_MixedSingleTaskGPSurrogate(
     data_models.CategoricalDeterministicSurrogate: CategoricalDeterministicSurrogate,
     data_models.AdditiveMapSaasSingleTaskGPSurrogate: AdditiveMapSaasSingleTaskGPSurrogate,
     data_models.EnsembleMapSaasSingleTaskGPSurrogate: EnsembleMapSaasSingleTaskGPSurrogate,
+    data_models.PairwiseGPSurrogate: PairwiseGPSurrogate,
 }
 
 
diff --git a/bofire/surrogates/pairwise_gp.py b/bofire/surrogates/pairwise_gp.py
new file mode 100644
index 000000000..fd18a992d
--- /dev/null
+++ b/bofire/surrogates/pairwise_gp.py
@@ -0,0 +1,73 @@
+from typing import Dict, Optional
+
+import botorch
+import torch
+from botorch.fit import fit_gpytorch_mll
+from botorch.models.likelihoods.pairwise import (
+    PairwiseLogitLikelihood,
+    PairwiseProbitLikelihood,
+)
+from botorch.models.pairwise_gp import PairwiseLaplaceMarginalLogLikelihood
+from botorch.models.transforms.input import InputTransform
+
+import bofire.kernels.api as kernels
+from bofire.data_models.surrogates.api import PairwiseGPSurrogate as DataModel
+from bofire.surrogates.botorch import BotorchSurrogate
+from bofire.surrogates.pairwise_trainable import PairwiseTrainableSurrogate
+
+
+# maps the serializable likelihood name to the BoTorch PairwiseLikelihood class
+PAIRWISE_LIKELIHOODS = {
+    "probit": PairwiseProbitLikelihood,
+    "logit": PairwiseLogitLikelihood,
+}
+
+
+class PairwiseGPSurrogate(BotorchSurrogate, PairwiseTrainableSurrogate):
+    def __init__(
+        self,
+        data_model: DataModel,
+        **kwargs,
+    ):
+        self.kernel = data_model.kernel
+        self.scaler = data_model.scaler
+        self.likelihood = data_model.likelihood
+        super().__init__(data_model=data_model, **kwargs)
+
+    model: Optional[botorch.models.PairwiseGP] = None
+    training_specs: Dict = {}
+
+    def _fit_pairwise(
+        self,
+        datapoints: torch.Tensor,
+        comparisons: torch.Tensor,
+        input_transform: Optional[InputTransform] = None,
+        **kwargs,
+    ):
+        if input_transform is not None:
+            n_dim = input_transform(datapoints).shape[-1]
+        else:
+            n_dim = datapoints.shape[-1]
+
+        self.model = botorch.models.PairwiseGP(
+            datapoints=datapoints,
+            comparisons=comparisons,
+            likelihood=PAIRWISE_LIKELIHOODS[self.likelihood](),
+            covar_module=kernels.map(
+                self.kernel,
+                batch_shape=torch.Size(),
+                active_dims=list(range(n_dim)),
+                features_to_idx_mapper=self.get_feature_indices,
+            ),
+            input_transform=input_transform,
+        )
+
+        mll = PairwiseLaplaceMarginalLogLikelihood(
+            likelihood=self.model.likelihood,
+            model=self.model,
+        )
+        fit_gpytorch_mll(mll, options=self.training_specs, max_attempts=50)
+
+    # `_predict` is inherited from BotorchSurrogate: it calls
+    # `posterior(X, observation_noise=True)`, and PairwiseGP ignores
+    # `observation_noise` (verified in scripts/pairwise_gp_checks.py).
diff --git a/bofire/surrogates/pairwise_trainable.py b/bofire/surrogates/pairwise_trainable.py
new file mode 100644
index 000000000..801cbc2d1
--- /dev/null
+++ b/bofire/surrogates/pairwise_trainable.py
@@ -0,0 +1,170 @@
+import warnings
+from abc import ABC, abstractmethod
+from typing import Callable, Dict, Optional
+
+import numpy as np
+import pandas as pd
+import torch
+from botorch.models.transforms.input import InputTransform
+
+from bofire.data_models.domain.api import EngineeredFeatures
+from bofire.data_models.domain.features import Inputs, Outputs
+from bofire.data_models.surrogates.scaler import AnyScaler
+from bofire.data_models.types import InputTransformSpecs
+from bofire.surrogates.utils import get_input_transform
+from bofire.utils.torch_tools import tkwargs
+
+
+class PairwiseTrainableSurrogate(ABC):
+    """Mixin for surrogates that train on pairwise preference data.
+
+    Structurally parallel to :class:`bofire.surrogates.trainable.TrainableSurrogate`
+    but with a different fit signature: ``fit(experiments, preferences)`` instead
+    of ``fit(experiments)``. The ``preferences`` DataFrame carries the label
+    signal; ``experiments`` provides the candidate designs referenced by
+    ``labcode``.
+    """
+
+    # These attributes are provided by Surrogate / BotorchSurrogate via
+    # multiple inheritance.
+    inputs: Inputs
+    outputs: Outputs
+    predict: Callable[..., pd.DataFrame]
+    input_preprocessing_specs: InputTransformSpecs
+    categorical_encodings: InputTransformSpecs
+    scaler: AnyScaler
+    engineered_features: EngineeredFeatures
+
+    PREFERENCE_COLUMNS = ("labcode_A", "labcode_B", "preference")
+
+    def fit(
+        self,
+        experiments: pd.DataFrame,
+        preferences: pd.DataFrame,
+        options: Optional[Dict] = None,
+    ):
+        """Fit the pairwise surrogate to preference data.
+
+        Args:
+            experiments: DataFrame with input columns plus a ``labcode`` column.
+                Output columns are ignored if present.
+            preferences: DataFrame with exactly the columns ``labcode_A``,
+                ``labcode_B``, ``preference``. ``preference`` is a float in
+                ``[-1, 1]``; sign determines the winner (``>0`` = A wins,
+                ``<0`` = B wins), magnitude is currently discarded, ``==0``
+                rows are dropped as ties.
+            options: Additional keyword arguments forwarded to ``_fit_pairwise``.
+        """
+        # validate experiment inputs (skip outputs validation: pairwise GP has
+        # no observed Y values in the experiments DataFrame).
+        experiments = self.inputs.validate_experiments(experiments, strict=False)
+
+        if "labcode" not in experiments.columns:
+            raise ValueError(
+                "PairwiseGPSurrogate requires a 'labcode' column on experiments "
+                "to reference preferences."
+            )
+
+        if experiments["labcode"].duplicated().any():
+            dups = (
+                experiments.loc[
+                    experiments["labcode"].duplicated(keep=False), "labcode"
+                ]
+                .unique()
+                .tolist()
+            )
+            raise ValueError(
+                f"Duplicate labcodes in experiments: {sorted(dups)}. "
+                "PairwiseGPSurrogate requires unique labcodes."
+            )
+
+        expected_cols = set(self.PREFERENCE_COLUMNS)
+        missing = expected_cols - set(preferences.columns)
+        if missing:
+            raise ValueError(
+                f"`preferences` is missing required columns: {sorted(missing)}. "
+                f"Expected at least {sorted(expected_cols)}."
+            )
+
+        valid_labcodes = set(experiments["labcode"].tolist())
+        ref_labcodes = set(preferences["labcode_A"].tolist()) | set(
+            preferences["labcode_B"].tolist()
+        )
+        unknown = ref_labcodes - valid_labcodes
+        if unknown:
+            raise ValueError(
+                "`preferences` references labcodes not present in experiments: "
+                f"{sorted(unknown)}."
+            )
+
+        # sign conversion: drop ties (preference == 0)
+        pref_values = preferences["preference"].astype(float)
+        tie_mask = pref_values == 0.0
+        n_ties = int(tie_mask.sum())
+        if n_ties > 0:
+            warnings.warn(
+                f"Dropping {n_ties} pair(s) with preference == 0 (ties).",
+                stacklevel=2,
+            )
+        preferences = preferences.loc[~tie_mask].reset_index(drop=True)
+
+        if len(preferences) == 0:
+            raise ValueError("No valid pairs remain after dropping ties.")
+
+        # build idx_map: labcode -> position in datapoints tensor
+        idx_map = {
+            labcode: i for i, labcode in enumerate(experiments["labcode"].tolist())
+        }
+
+        # winner/loser indices from sign of preference
+        pref_signs = preferences["preference"].astype(float).to_numpy()
+        labcode_A = preferences["labcode_A"].to_numpy()
+        labcode_B = preferences["labcode_B"].to_numpy()
+        winners = np.where(pref_signs > 0, labcode_A, labcode_B)
+        losers = np.where(pref_signs > 0, labcode_B, labcode_A)
+        winner_idx = np.array([idx_map[w] for w in winners], dtype=np.int64)
+        loser_idx = np.array([idx_map[loser] for loser in losers], dtype=np.int64)
+        comparisons = torch.from_numpy(np.stack([winner_idx, loser_idx], axis=1)).to(
+            dtype=torch.long
+        )
+
+        # datapoints tensor (float64), pre-transformed via BoFire's
+        # categorical preprocessing; BoTorch's input_transform is applied
+        # internally by the model.
+        X = experiments[self.inputs.get_keys()]
+        transformed_X = self.inputs.transform(X, self.input_preprocessing_specs)
+        datapoints = torch.from_numpy(transformed_X.values).to(**tkwargs)
+
+        input_transform = get_input_transform(
+            inputs=self.inputs,
+            engineered_features=self.engineered_features,
+            scaler_type=self.scaler,
+            categorical_encodings=self.categorical_encodings,
+            X=X,
+        )
+
+        options = options or {}
+        self._fit_pairwise(
+            datapoints=datapoints,
+            comparisons=comparisons,
+            input_transform=input_transform,
+            **options,
+        )
+
+    @abstractmethod
+    def _fit_pairwise(
+        self,
+        datapoints: torch.Tensor,
+        comparisons: torch.Tensor,
+        input_transform: Optional[InputTransform] = None,
+        **kwargs,
+    ):
+        """Fit the underlying pairwise model.
+
+        Args:
+            datapoints: Unique candidate features, shape ``(n, d)``, float64.
+            comparisons: Long tensor of shape ``(m, 2)`` where each row is
+                ``[winner_idx, loser_idx]`` into ``datapoints``.
+            input_transform: Optional BoTorch input transform to attach to
+                the underlying model.
+        """
diff --git a/docs/tutorials/advanced_examples/pairwise_gp.qmd b/docs/tutorials/advanced_examples/pairwise_gp.qmd
new file mode 100644
index 000000000..b8a32bc00
--- /dev/null
+++ b/docs/tutorials/advanced_examples/pairwise_gp.qmd
@@ -0,0 +1,191 @@
+---
+title: Preference Learning with the Pairwise GP
+jupyter: python3
+---
+
+Not every objective can be read off a number line. Sometimes the only signal
+available is a *comparison*: a taste panel says cake A is nicer than cake B, a
+chemist judges one crystallisation cleaner than another, a user clicks one
+layout over another. The quantity actually being compared — a latent *utility* —
+is never observed directly.
+
+BoFire's `PairwiseGPSurrogate` learns that latent utility from pairwise
+comparison data. It wraps BoTorch's `PairwiseGP`, a Gaussian process whose
+likelihood models the probability that one design is preferred over another.
+
+This tutorial fits a `PairwiseGPSurrogate` to synthetic preference data and
+checks that it recovers the hidden utility.
+
+## Imports
+
+```{python}
+import warnings
+
+import numpy as np
+import pandas as pd
+from matplotlib import pyplot as plt
+from scipy.stats import kendalltau
+
+import bofire.surrogates.api as surrogates
+from bofire.data_models.domain.api import Inputs, Outputs
+from bofire.data_models.features.api import ContinuousInput, ContinuousOutput
+from bofire.data_models.surrogates.api import PairwiseGPSurrogate
+
+warnings.filterwarnings("ignore")
+rng = np.random.default_rng(42)
+```
+
+## A synthetic preference problem
+
+We work on a two-dimensional input space. The *latent utility* — the thing an
+expert implicitly scores when they pick a winner — is a smooth bump that peaks
+near `(0.7, 0.3)`. In a real preference experiment this function is unknown; we
+define it here only so we can check what the surrogate learned.
+
+```{python}
+def latent_utility(X: np.ndarray) -> np.ndarray:
+    """Ground-truth utility. Unobserved in a real preference experiment."""
+    peak = np.array([0.7, 0.3])
+    return -np.sum((X - peak) ** 2, axis=-1)
+```
+
+We draw a handful of candidate designs and give each a unique `labcode` — the
+identifier BoFire uses to link a design to the comparisons it appears in.
+
+```{python}
+n_candidates = 40
+X = rng.random((n_candidates, 2))
+labcodes = [f"design_{i:02d}" for i in range(n_candidates)]
+
+experiments = pd.DataFrame(X, columns=["x_1", "x_2"])
+experiments["labcode"] = labcodes
+experiments.head()
+```
+
+Now we simulate an expert comparing random pairs of designs. The expert prefers
+whichever design has the higher latent utility, but their judgement is noisy, so
+they occasionally pick the worse one.
+
+```{python}
+def make_comparisons(n_comparisons: int) -> pd.DataFrame:
+    rows = []
+    for _ in range(n_comparisons):
+        i, j = rng.choice(n_candidates, size=2, replace=False)
+        # noisy utilities -> the expert occasionally prefers the worse design
+        u_i = latent_utility(X[i]) + rng.normal(scale=0.05)
+        u_j = latent_utility(X[j]) + rng.normal(scale=0.05)
+        winner, loser = (i, j) if u_i > u_j else (j, i)
+        # record the winner in slot A or B at random: the *sign* of
+        # `preference` carries the label, not which slot the winner sits in
+        if rng.random() < 0.5:
+            rows.append((labcodes[winner], labcodes[loser], 1.0))  # A preferred
+        else:
+            rows.append((labcodes[loser], labcodes[winner], -1.0))  # B preferred
+    return pd.DataFrame(rows, columns=["labcode_A", "labcode_B", "preference"])
+
+
+preferences = make_comparisons(n_comparisons=120)
+preferences.head()
+```
+
+## Setting up the surrogate
+
+A `PairwiseGPSurrogate` needs an input space and exactly one output — the latent
+utility it will infer.
+
+```{python}
+inputs = Inputs(
+    features=[
+        ContinuousInput(key="x_1", bounds=(0.0, 1.0)),
+        ContinuousInput(key="x_2", bounds=(0.0, 1.0)),
+    ]
+)
+outputs = Outputs(features=[ContinuousOutput(key="utility")])
+
+surrogate_data = PairwiseGPSurrogate(inputs=inputs, outputs=outputs)
+surrogate = surrogates.map(surrogate_data)
+```
+
+Unlike a standard surrogate, `fit` takes *two* DataFrames — the designs and the
+comparisons between them:
+
+```{python}
+surrogate.fit(experiments, preferences)
+surrogate.is_fitted
+```
+
+## Inspecting the learned utility
+
+`predict` returns the posterior mean (`utility_pred`) and standard deviation
+(`utility_sd`) of the latent utility on new designs. Pairwise data pins down the
+*ranking* of designs, not the absolute utility values — the GP recovers utility
+only up to an arbitrary scale and offset — so we score it with the Kendall-Tau
+rank correlation against the true utility.
+
+```{python}
+test_X = rng.random((500, 2))
+test_df = pd.DataFrame(test_X, columns=["x_1", "x_2"])
+predictions = surrogate.predict(test_df)
+
+true_utility = latent_utility(test_X)
+tau = kendalltau(predictions["utility_pred"], true_utility).correlation
+print(f"Kendall-Tau rank correlation vs. true utility: {tau:.3f}")
+```
+
+A correlation close to 1 means the surrogate ranks unseen designs almost exactly
+as the latent utility would. We can see this directly by plotting the learned
+posterior mean next to the ground truth.
+
+```{python}
+fig, axes = plt.subplots(1, 2, figsize=(11, 4.5))
+
+grid = np.linspace(0, 1, 60)
+gx, gy = np.meshgrid(grid, grid)
+grid_df = pd.DataFrame({"x_1": gx.ravel(), "x_2": gy.ravel()})
+
+true_grid = latent_utility(grid_df.to_numpy()).reshape(gx.shape)
+pred_grid = surrogate.predict(grid_df)["utility_pred"].to_numpy().reshape(gx.shape)
+
+for ax, surface, title in [
+    (axes[0], true_grid, "True latent utility"),
+    (axes[1], pred_grid, "Learned posterior mean"),
+]:
+    contour = ax.contourf(gx, gy, surface, levels=20, cmap="viridis")
+    ax.scatter(
+        X[:, 0], X[:, 1], c="white", edgecolors="black", s=25,
+        label="compared designs",
+    )
+    ax.set(xlabel="x_1", ylabel="x_2", title=title)
+    fig.colorbar(contour, ax=ax)
+axes[0].legend(loc="upper left")
+plt.tight_layout()
+plt.show()
+```
+
+Both surfaces peak in the same region: from comparisons alone, the surrogate has
+located where utility is highest. The absolute contour values differ — pairwise
+learning identifies the latent utility only up to a monotone transform — but the
+*ranking*, which is what matters for optimization, is recovered.
+
+## The preference encoding
+
+The `preferences` DataFrame must have exactly these three columns:
+
+| column | meaning |
+|---|---|
+| `labcode_A` | first design in the comparison (must appear in `experiments`) |
+| `labcode_B` | second design in the comparison (must appear in `experiments`) |
+| `preference` | sign marks the winner: `> 0` → A preferred, `< 0` → B preferred |
+
+Only the **sign** of `preference` is used — its magnitude is currently ignored.
+A `preference` of exactly `0` denotes a tie; tied rows are dropped with a
+warning. Because the slot (A vs. B) is just bookkeeping, swapping `labcode_A`
+with `labcode_B` and flipping the sign of `preference` describes the very same
+comparison and yields the same fit.
+
+## Where to go next
+
+`PairwiseGPSurrogate` is the modelling building block for preference-based
+Bayesian optimization: combine it with an acquisition function such as BoTorch's
+*Expected Utility of the Best Option* (EUBO) to decide which pair of designs to
+put in front of the expert next.
diff --git a/tests/bofire/data_models/specs/kernels.py b/tests/bofire/data_models/specs/kernels.py
index d28cf7cbb..d404a1aa7 100644
--- a/tests/bofire/data_models/specs/kernels.py
+++ b/tests/bofire/data_models/specs/kernels.py
@@ -5,6 +5,7 @@
     GammaPrior,
     LogNormalPrior,
     NonTransformedInterval,
+    SmoothedBoxPrior,
 )
 from tests.bofire.data_models.specs.prior_constraints import specs as prior_constraints
 from tests.bofire.data_models.specs.priors import specs as priors
@@ -283,6 +284,14 @@
         .model_dump(),
     },
 )
+specs.add_valid(
+    kernels.ScaleKernel,
+    lambda: {
+        "base_kernel": specs.valid(kernels.RBFKernel).obj().model_dump(),
+        "outputscale_prior": priors.valid(SmoothedBoxPrior).obj().model_dump(),
+        "outputscale_constraint": None,
+    },
+)
 specs.add_valid(
     kernels.AdditiveKernel,
     lambda: {
diff --git a/tests/bofire/data_models/specs/prior_constraints.py b/tests/bofire/data_models/specs/prior_constraints.py
index c8dc2c3e6..5ca63e7d9 100644
--- a/tests/bofire/data_models/specs/prior_constraints.py
+++ b/tests/bofire/data_models/specs/prior_constraints.py
@@ -89,3 +89,43 @@
     error=ValueError,
     message="The initial value must be less than or equal to the upper bound.",
 )
+
+specs.add_valid(
+    priors.Interval,
+    lambda: {
+        "lower_bound": 0.005,
+        "upper_bound": 200.0,
+        "initial_value": 1.0,
+    },
+)
+
+specs.add_valid(
+    priors.Interval,
+    lambda: {
+        "lower_bound": 0.005,
+        "upper_bound": 200.0,
+        "initial_value": None,
+    },
+)
+
+specs.add_invalid(
+    priors.Interval,
+    lambda: {
+        "lower_bound": 2.0,
+        "upper_bound": 1.0,
+        "initial_value": 1.5,
+    },
+    error=ValueError,
+    message="The lower bound must be less than the upper bound for an interval.",
+)
+
+specs.add_invalid(
+    priors.Interval,
+    lambda: {
+        "lower_bound": 1.0,
+        "upper_bound": 10.0,
+        "initial_value": 11.0,
+    },
+    error=ValueError,
+    message="The initial value must be within the bounds of the interval.",
+)
diff --git a/tests/bofire/data_models/specs/priors.py b/tests/bofire/data_models/specs/priors.py
index b65088bda..8989497a5 100644
--- a/tests/bofire/data_models/specs/priors.py
+++ b/tests/bofire/data_models/specs/priors.py
@@ -105,3 +105,31 @@
         "scale_scaling": random.random(),
     },
 )
+
+
+specs.add_valid(
+    priors.SmoothedBoxPrior,
+    lambda: {
+        "lower_bound": 0.01,
+        "upper_bound": 100.0,
+        "sigma": 0.01,
+    },
+)
+
+specs.add_invalid(
+    priors.SmoothedBoxPrior,
+    lambda: {"lower_bound": 4.0, "upper_bound": 1.0, "sigma": 0.01},
+    error=ValueError,
+    message="The lower bound must be less than the upper bound for an interval.",
+)
+
+for sigma in [-1.0, 0.0]:
+    specs.add_invalid(
+        priors.SmoothedBoxPrior,
+        lambda sigma=sigma: {
+            "lower_bound": 0.01,
+            "upper_bound": 100.0,
+            "sigma": sigma,
+        },
+        error=ValidationError,
+    )
diff --git a/tests/bofire/data_models/specs/surrogates.py b/tests/bofire/data_models/specs/surrogates.py
index 97443acdf..66491967d 100644
--- a/tests/bofire/data_models/specs/surrogates.py
+++ b/tests/bofire/data_models/specs/surrogates.py
@@ -15,12 +15,17 @@
     HammingDistanceKernel,
     InfiniteWidthBNNKernel,
     MaternKernel,
+    RBFKernel,
     ScaleKernel,
     TanimotoKernel,
     WassersteinKernel,
 )
 from bofire.data_models.molfeatures.api import Fingerprints
 from bofire.data_models.priors.api import (
+    PAIRWISEGP_LENGTHSCALE_CONSTRAINT,
+    PAIRWISEGP_LENGTHSCALE_PRIOR,
+    PAIRWISEGP_OUTPUTSCALE_CONSTRAINT,
+    PAIRWISEGP_OUTPUTSCALE_PRIOR,
     ROBUSTGP_LENGTHSCALE_CONSTRAINT,
     ROBUSTGP_OUTPUTSCALE_CONSTRAINT,
     THREESIX_LENGTHSCALE_PRIOR,
@@ -731,6 +736,7 @@
         ).model_dump(),
         "input_preprocessing_specs": {"x_cat": CategoricalEncodingEnum.ORDINAL},
         "categorical_encodings": {"x_cat": CategoricalEncodingEnum.ORDINAL},
+        "engineered_features": EngineeredFeatures().model_dump(),
         "mapping": {"a": 0.1, "b": 0.2, "c": 1.0},
         "dump": None,
     },
@@ -1180,3 +1186,97 @@
     error=ValueError,
     message="Feature keys do not match input keys.",
 )
+
+specs.add_valid(
+    models.PairwiseGPSurrogate,
+    lambda: {
+        "inputs": Inputs(
+            features=[
+                ContinuousInput(key="a", bounds=[0, 1]),
+                ContinuousInput(key="b", bounds=[0, 1]),
+            ],
+        ).model_dump(),
+        "outputs": Outputs(
+            features=[features.valid(ContinuousOutput).obj()],
+        ).model_dump(),
+        "kernel": ScaleKernel(
+            base_kernel=RBFKernel(
+                ard=True,
+                lengthscale_prior=PAIRWISEGP_LENGTHSCALE_PRIOR(),
+                lengthscale_constraint=PAIRWISEGP_LENGTHSCALE_CONSTRAINT(),
+            ),
+            outputscale_prior=PAIRWISEGP_OUTPUTSCALE_PRIOR(),
+            outputscale_constraint=PAIRWISEGP_OUTPUTSCALE_CONSTRAINT(),
+        ).model_dump(),
+        "scaler": Normalize().model_dump(),
+        "likelihood": "probit",
+        "engineered_features": EngineeredFeatures().model_dump(),
+        "hyperconfig": None,
+        "input_preprocessing_specs": {},
+        "categorical_encodings": {},
+        "dump": None,
+    },
+)
+
+specs.add_valid(
+    models.PairwiseGPSurrogate,
+    lambda: {
+        "inputs": Inputs(
+            features=[
+                ContinuousInput(key="a", bounds=[0, 1]),
+                ContinuousInput(key="b", bounds=[0, 1]),
+            ],
+        ).model_dump(),
+        "outputs": Outputs(
+            features=[features.valid(ContinuousOutput).obj()],
+        ).model_dump(),
+        "kernel": ScaleKernel(
+            base_kernel=RBFKernel(
+                ard=True,
+                lengthscale_prior=PAIRWISEGP_LENGTHSCALE_PRIOR(),
+                lengthscale_constraint=PAIRWISEGP_LENGTHSCALE_CONSTRAINT(),
+            ),
+            outputscale_prior=PAIRWISEGP_OUTPUTSCALE_PRIOR(),
+            outputscale_constraint=PAIRWISEGP_OUTPUTSCALE_CONSTRAINT(),
+        ).model_dump(),
+        "scaler": Normalize().model_dump(),
+        "likelihood": "logit",
+        "engineered_features": EngineeredFeatures().model_dump(),
+        "hyperconfig": None,
+        "input_preprocessing_specs": {},
+        "categorical_encodings": {},
+        "dump": None,
+    },
+)
+
+specs.add_invalid(
+    models.PairwiseGPSurrogate,
+    lambda: {
+        "inputs": Inputs(
+            features=[ContinuousInput(key="a", bounds=[0, 1])],
+        ).model_dump(),
+        "outputs": Outputs(
+            features=[
+                ContinuousOutput(key="y1"),
+                ContinuousOutput(key="y2"),
+            ],
+        ).model_dump(),
+    },
+    error=ValueError,
+    message="PairwiseGPSurrogate supports exactly one output",
+)
+
+specs.add_invalid(
+    models.PairwiseGPSurrogate,
+    lambda: {
+        "inputs": Inputs(
+            features=[ContinuousInput(key="a", bounds=[0, 1])],
+        ).model_dump(),
+        "outputs": Outputs(
+            features=[features.valid(ContinuousOutput).obj()],
+        ).model_dump(),
+        "kernel": RBFKernel(ard=True).model_dump(),
+    },
+    error=ValueError,
+    message="PairwiseGPSurrogate.kernel must be a ScaleKernel",
+)
diff --git a/tests/bofire/surrogates/test_pairwise_gp.py b/tests/bofire/surrogates/test_pairwise_gp.py
new file mode 100644
index 000000000..bcce2b02d
--- /dev/null
+++ b/tests/bofire/surrogates/test_pairwise_gp.py
@@ -0,0 +1,301 @@
+import numpy as np
+import pandas as pd
+import pytest
+from botorch.models.likelihoods.pairwise import (
+    PairwiseLogitLikelihood,
+    PairwiseProbitLikelihood,
+)
+from botorch.models.pairwise_gp import PairwiseGP
+from pandas.testing import assert_frame_equal
+from scipy.stats import kendalltau
+
+import bofire.surrogates.api as surrogates
+from bofire.data_models.domain.api import Inputs, Outputs
+from bofire.data_models.features.api import (
+    CategoricalInput,
+    ContinuousInput,
+    ContinuousOutput,
+)
+from bofire.data_models.kernels.api import (
+    AdditiveKernel,
+    HammingDistanceKernel,
+    RBFKernel,
+    ScaleKernel,
+)
+from bofire.data_models.surrogates.api import PairwiseGPSurrogate
+
+
+DIM = 3
+
+
+def _utility(X: np.ndarray) -> np.ndarray:
+    """Latent ground-truth utility: weighted sum of the inputs."""
+    return X @ np.sqrt(np.arange(1, X.shape[1] + 1))
+
+
+def _make_domain():
+    inputs = Inputs(
+        features=[
+            ContinuousInput(key=f"x_{i + 1}", bounds=(0.0, 1.0)) for i in range(DIM)
+        ]
+    )
+    outputs = Outputs(features=[ContinuousOutput(key="utility")])
+    return inputs, outputs
+
+
+def _make_data(n_points: int = 30, n_comparisons: int = 80, seed: int = 0):
+    """Build (experiments, preferences) plus the true utility for scoring.
+
+    Each comparison places the winner in slot A or B at random, so the *sign*
+    of `preference` (not its slot) carries the label.
+    """
+    rng = np.random.default_rng(seed)
+    X = rng.random((n_points, DIM))
+    utility = _utility(X)
+    labcodes = [f"cand_{i}" for i in range(n_points)]
+    experiments = pd.DataFrame(X, columns=[f"x_{i + 1}" for i in range(DIM)])
+    experiments["labcode"] = labcodes
+
+    rows = []
+    for _ in range(n_comparisons):
+        i, j = rng.choice(n_points, size=2, replace=False)
+        winner, loser = (i, j) if utility[i] > utility[j] else (j, i)
+        if rng.random() < 0.5:
+            rows.append((labcodes[winner], labcodes[loser], 1.0))  # A preferred
+        else:
+            rows.append((labcodes[loser], labcodes[winner], -1.0))  # B preferred
+    preferences = pd.DataFrame(rows, columns=["labcode_A", "labcode_B", "preference"])
+    return experiments, preferences, utility
+
+
+def test_pairwise_gp_fit_and_predict():
+    inputs, outputs = _make_domain()
+    experiments, preferences, utility = _make_data()
+
+    surrogate = surrogates.map(PairwiseGPSurrogate(inputs=inputs, outputs=outputs))
+
+    # predicting before fitting raises
+    with pytest.raises(ValueError, match="not fitted"):
+        surrogate.predict(experiments[[f"x_{i + 1}" for i in range(DIM)]])
+
+    surrogate.fit(experiments, preferences)
+    assert surrogate.is_fitted
+    assert isinstance(surrogate.model, PairwiseGP)
+
+    preds = surrogate.predict(experiments[[f"x_{i + 1}" for i in range(DIM)]])
+    assert list(preds.columns) == ["utility_pred", "utility_sd"]
+    assert len(preds) == len(experiments)
+    assert preds["utility_sd"].ge(0).all()
+    assert np.isfinite(preds.to_numpy()).all()
+
+    # the surrogate should recover the latent utility ranking
+    corr = kendalltau(preds["utility_pred"].to_numpy(), utility).correlation
+    assert corr > 0.8
+
+
+def test_pairwise_gp_sign_convention_is_consistent():
+    """Flipping every A/B slot and every preference sign must not change the fit."""
+    inputs, outputs = _make_domain()
+    experiments, preferences, _ = _make_data()
+
+    flipped = pd.DataFrame(
+        {
+            "labcode_A": preferences["labcode_B"],
+            "labcode_B": preferences["labcode_A"],
+            "preference": -preferences["preference"],
+        }
+    )
+
+    test_X = experiments[[f"x_{i + 1}" for i in range(DIM)]]
+
+    surrogate = surrogates.map(PairwiseGPSurrogate(inputs=inputs, outputs=outputs))
+    surrogate.fit(experiments, preferences)
+    preds = surrogate.predict(test_X)
+
+    surrogate_flipped = surrogates.map(
+        PairwiseGPSurrogate(inputs=inputs, outputs=outputs)
+    )
+    surrogate_flipped.fit(experiments, flipped)
+    preds_flipped = surrogate_flipped.predict(test_X)
+
+    assert_frame_equal(preds, preds_flipped)
+
+
+def test_pairwise_gp_dump_and_load():
+    inputs, outputs = _make_domain()
+    experiments, preferences, _ = _make_data()
+    test_X = experiments[[f"x_{i + 1}" for i in range(DIM)]]
+
+    surrogate = surrogates.map(PairwiseGPSurrogate(inputs=inputs, outputs=outputs))
+    surrogate.fit(experiments, preferences)
+    preds = surrogate.predict(test_X)
+    dump = surrogate.dumps()
+
+    reloaded = surrogates.map(PairwiseGPSurrogate(inputs=inputs, outputs=outputs))
+    reloaded.loads(dump)
+    assert reloaded.is_fitted
+    assert_frame_equal(preds, reloaded.predict(test_X))
+
+
+def test_pairwise_gp_requires_labcode_column():
+    inputs, outputs = _make_domain()
+    experiments, preferences, _ = _make_data()
+    surrogate = surrogates.map(PairwiseGPSurrogate(inputs=inputs, outputs=outputs))
+    with pytest.raises(ValueError, match="labcode"):
+        surrogate.fit(experiments.drop(columns=["labcode"]), preferences)
+
+
+def test_pairwise_gp_rejects_duplicate_labcodes():
+    inputs, outputs = _make_domain()
+    experiments, preferences, _ = _make_data()
+    experiments.loc[1, "labcode"] = experiments.loc[0, "labcode"]
+    surrogate = surrogates.map(PairwiseGPSurrogate(inputs=inputs, outputs=outputs))
+    with pytest.raises(ValueError, match="Duplicate labcodes"):
+        surrogate.fit(experiments, preferences)
+
+
+def test_pairwise_gp_rejects_missing_preference_columns():
+    inputs, outputs = _make_domain()
+    experiments, preferences, _ = _make_data()
+    surrogate = surrogates.map(PairwiseGPSurrogate(inputs=inputs, outputs=outputs))
+    with pytest.raises(ValueError, match="missing required columns"):
+        surrogate.fit(experiments, preferences.drop(columns=["preference"]))
+
+
+def test_pairwise_gp_rejects_unknown_labcode():
+    inputs, outputs = _make_domain()
+    experiments, preferences, _ = _make_data()
+    preferences.loc[0, "labcode_A"] = "does_not_exist"
+    surrogate = surrogates.map(PairwiseGPSurrogate(inputs=inputs, outputs=outputs))
+    with pytest.raises(ValueError, match="not present in experiments"):
+        surrogate.fit(experiments, preferences)
+
+
+def test_pairwise_gp_drops_ties_with_warning():
+    inputs, outputs = _make_domain()
+    experiments, preferences, _ = _make_data()
+    preferences.loc[0, "preference"] = 0.0
+    surrogate = surrogates.map(PairwiseGPSurrogate(inputs=inputs, outputs=outputs))
+    with pytest.warns(UserWarning, match="ties"):
+        surrogate.fit(experiments, preferences)
+    assert surrogate.is_fitted
+
+
+def test_pairwise_gp_all_ties_raises():
+    inputs, outputs = _make_domain()
+    experiments, preferences, _ = _make_data()
+    preferences["preference"] = 0.0
+    surrogate = surrogates.map(PairwiseGPSurrogate(inputs=inputs, outputs=outputs))
+    with pytest.warns(UserWarning, match="ties"):
+        with pytest.raises(ValueError, match="No valid pairs"):
+            surrogate.fit(experiments, preferences)
+
+
+def test_pairwise_gp_data_model_validation():
+    inputs, _ = _make_domain()
+
+    # exactly one output is required
+    with pytest.raises(ValueError, match="exactly one output"):
+        PairwiseGPSurrogate(
+            inputs=inputs,
+            outputs=Outputs(
+                features=[ContinuousOutput(key="y1"), ContinuousOutput(key="y2")]
+            ),
+        )
+
+    # the kernel must be a ScaleKernel
+    with pytest.raises(ValueError, match="must be a ScaleKernel"):
+        PairwiseGPSurrogate(
+            inputs=inputs,
+            outputs=Outputs(features=[ContinuousOutput(key="utility")]),
+            kernel=RBFKernel(ard=True),
+        )
+
+
+@pytest.mark.parametrize(
+    "likelihood, expected_cls",
+    [
+        ("probit", PairwiseProbitLikelihood),
+        ("logit", PairwiseLogitLikelihood),
+    ],
+)
+def test_pairwise_gp_likelihood(likelihood, expected_cls):
+    """Both pairwise likelihoods fit and map to the right BoTorch class."""
+    inputs, outputs = _make_domain()
+    experiments, preferences, utility = _make_data()
+
+    surrogate = surrogates.map(
+        PairwiseGPSurrogate(inputs=inputs, outputs=outputs, likelihood=likelihood)
+    )
+    surrogate.fit(experiments, preferences)
+
+    assert isinstance(surrogate.model.likelihood, expected_cls)
+
+    preds = surrogate.predict(experiments[[f"x_{i + 1}" for i in range(DIM)]])
+    corr = kendalltau(preds["utility_pred"].to_numpy(), utility).correlation
+    assert corr > 0.8
+
+
+def test_pairwise_gp_likelihood_default_is_probit():
+    inputs, outputs = _make_domain()
+    assert PairwiseGPSurrogate(inputs=inputs, outputs=outputs).likelihood == "probit"
+
+
+def test_pairwise_gp_feature_specific_kernels():
+    """A kernel built from feature-specific sub-kernels over a mixed
+    continuous/categorical space fits and predicts.
+
+    Exercises the `features_to_idx_mapper` wiring: each sub-kernel is restricted
+    to a subset of feature keys, which must resolve to the right columns of the
+    (categorically encoded) datapoints tensor.
+    """
+    rng = np.random.default_rng(0)
+    n = 30
+    cats = ["a", "b", "c"]
+    cat_offset = {"a": 0.0, "b": 0.6, "c": 1.2}
+    cat_vals = rng.choice(cats, size=n)
+    X = rng.random((n, 2))
+    utility = X @ np.array([1.0, 2.0]) + np.array([cat_offset[c] for c in cat_vals])
+    labcodes = [f"cand_{i}" for i in range(n)]
+
+    experiments = pd.DataFrame(X, columns=["x_1", "x_2"])
+    experiments["cat"] = cat_vals
+    experiments["labcode"] = labcodes
+
+    rows = []
+    for _ in range(80):
+        i, j = rng.choice(n, size=2, replace=False)
+        winner, loser = (i, j) if utility[i] > utility[j] else (j, i)
+        rows.append((labcodes[winner], labcodes[loser], 1.0))
+    preferences = pd.DataFrame(rows, columns=["labcode_A", "labcode_B", "preference"])
+
+    inputs = Inputs(
+        features=[
+            ContinuousInput(key="x_1", bounds=(0.0, 1.0)),
+            ContinuousInput(key="x_2", bounds=(0.0, 1.0)),
+            CategoricalInput(key="cat", categories=cats),
+        ]
+    )
+    outputs = Outputs(features=[ContinuousOutput(key="utility")])
+    # an RBF restricted to the continuous features + a Hamming kernel on the
+    # categorical -- a kernel that is *not* the same over all inputs
+    kernel = ScaleKernel(
+        base_kernel=AdditiveKernel(
+            kernels=[
+                RBFKernel(ard=True, features=["x_1", "x_2"]),
+                HammingDistanceKernel(features=["cat"]),
+            ]
+        )
+    )
+
+    surrogate = surrogates.map(
+        PairwiseGPSurrogate(inputs=inputs, outputs=outputs, kernel=kernel)
+    )
+    surrogate.fit(experiments, preferences)
+    assert surrogate.is_fitted
+
+    preds = surrogate.predict(experiments[["x_1", "x_2", "cat"]])
+    assert list(preds.columns) == ["utility_pred", "utility_sd"]
+    assert np.isfinite(preds.to_numpy()).all()
+    corr = kendalltau(preds["utility_pred"].to_numpy(), utility).correlation
+    assert corr > 0.6