
Commit a2f2ef5

Merge progress from pathwise-test-coverage branch
1 parent 0321b49 commit a2f2ef5

40 files changed: +5105 / -1824 lines

botorch/acquisition/knowledge_gradient.py

Lines changed: 1 addition & 1 deletion
@@ -223,7 +223,7 @@ def evaluate(self, X: Tensor, bounds: Tensor, **kwargs: Any) -> Tensor:
            kwargs: Additional keyword arguments. This includes the options for
                optimization of the inner problem, i.e. `num_restarts`, `raw_samples`,
                an `options` dictionary to be passed on to the optimization helpers, and
-                a `scipy_options` dictionary to be passed to `scipy.optimize.minimize`.
+                a `scipy_options` dictionary to be passed to `scipy.minimize`.

        Returns:
            A Tensor of shape `b`. For t-batch b, the q-KG value of the design
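
For context on the kwargs this hunk documents, here is a minimal sketch of calling `qKnowledgeGradient.evaluate` and forwarding a `scipy_options` dictionary to the inner-problem solver. The toy data, model, and option values are illustrative assumptions, not part of this commit.

import torch

from botorch.acquisition import qKnowledgeGradient
from botorch.models import SingleTaskGP

# Toy training data (assumed for illustration only).
train_X = torch.rand(10, 2, dtype=torch.double)
train_Y = train_X.sin().sum(dim=-1, keepdim=True)
model = SingleTaskGP(train_X, train_Y)

qkg = qKnowledgeGradient(model, num_fantasies=16)
bounds = torch.tensor([[0.0, 0.0], [1.0, 1.0]], dtype=torch.double)

# `evaluate` solves the inner optimization problem; the extra kwargs below are
# the ones named in the docstring above, with `scipy_options` forwarded to the
# scipy solver (the values here are arbitrary).
X = torch.rand(3, 1, 2, dtype=torch.double)  # t-batch of 3 single-point designs
kg_values = qkg.evaluate(
    X,
    bounds=bounds,
    num_restarts=4,
    raw_samples=64,
    scipy_options={"maxiter": 50},
)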

botorch/generation/gen.py

Lines changed: 4 additions & 8 deletions
@@ -56,10 +56,6 @@ def gen_candidates_scipy(

    Optimizes an acquisition function starting from a set of initial candidates
    using `scipy.optimize.minimize` via a numpy converter.
-    We use SLSQP, if constraints are present, and LBFGS-B otherwise.
-    As `scipy.optimize.minimize` does not support optimizating a batch of problems, we
-    treat optimizing a set of candidates as a single optimization problem by
-    summing together their acquisition values.

    Args:
        initial_conditions: Starting points for optimization, with shape

@@ -86,7 +82,7 @@ def gen_candidates_scipy(
            `optimize_acqf()`. The constraints will later be passed to the scipy
            solver.
        options: Options used to control the optimization including "method"
-            and "maxiter". Select method for `scipy.optimize.minimize` using the
+            and "maxiter". Select method for `scipy.minimize` using the
            "method" key. By default uses L-BFGS-B for box-constrained problems
            and SLSQP if inequality or equality constraints are present. If
            `with_grad=False`, then we use a two-point finite difference estimate

@@ -447,13 +443,13 @@ def _process_scipy_result(res: OptimizeResult, options: dict[str, Any]) -> None:
            or "Iteration limit reached" in res.message
        ):
            logger.info(
-                "`scipy.optimize.minimize` exited by reaching the iteration limit of "
+                "`scipy.minimize` exited by reaching the iteration limit of "
                f"`maxiter: {options.get('maxiter')}`."
            )
        elif "EVALUATIONS EXCEEDS LIMIT" in res.message:
            logger.info(
-                "`scipy.optimize.minimize` exited by reaching the function evaluation "
-                f"limit of `maxfun: {options.get('maxfun')}`."
+                "`scipy.minimize` exited by reaching the function evaluation limit of "
+                f"`maxfun: {options.get('maxfun')}`."
            )
        elif "Optimization timed out after" in res.message:
            logger.info(res.message)
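
As a reference for the `options` keys discussed in this docstring ("method", "maxiter"), a minimal sketch of calling `gen_candidates_scipy` directly; the toy model, acquisition function, and option values are assumptions for illustration only.

import torch

from botorch.acquisition import ExpectedImprovement
from botorch.generation.gen import gen_candidates_scipy
from botorch.models import SingleTaskGP

train_X = torch.rand(8, 2, dtype=torch.double)
train_Y = train_X.sum(dim=-1, keepdim=True)
model = SingleTaskGP(train_X, train_Y)
ei = ExpectedImprovement(model, best_f=train_Y.max())

# Batch of 5 restart points, q=1, d=2.
ics = torch.rand(5, 1, 2, dtype=torch.double)
candidates, acq_vals = gen_candidates_scipy(
    initial_conditions=ics,
    acquisition_function=ei,
    lower_bounds=torch.zeros(2, dtype=torch.double),
    upper_bounds=torch.ones(2, dtype=torch.double),
    # "method" and "maxiter" are routed to the scipy solver, as documented above.
    options={"method": "L-BFGS-B", "maxiter": 100},
)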

botorch/models/fully_bayesian_multitask.py

Lines changed: 41 additions & 57 deletions
@@ -18,14 +18,12 @@
    reshape_and_detach,
    SaasPyroModel,
)
-from botorch.models.gpytorch import BatchedMultiOutputGPyTorchModel
from botorch.models.multitask import MultiTaskGP
from botorch.models.transforms.input import InputTransform
from botorch.models.transforms.outcome import OutcomeTransform
from botorch.posteriors.fully_bayesian import GaussianMixturePosterior, MCMC_DIM
-from gpytorch.distributions import MultivariateNormal
+from gpytorch.distributions.multivariate_normal import MultivariateNormal
from gpytorch.kernels import MaternKernel
-from gpytorch.kernels.index_kernel import IndexKernel
from gpytorch.kernels.kernel import Kernel
from gpytorch.likelihoods.likelihood import Likelihood
from gpytorch.means.mean import Mean

@@ -134,7 +132,7 @@ def sample_task_lengthscale(

    def load_mcmc_samples(
        self, mcmc_samples: dict[str, Tensor]
-    ) -> tuple[Mean, Kernel, Likelihood, Kernel]:
+    ) -> tuple[Mean, Kernel, Likelihood, Kernel, Parameter]:
        r"""Load the MCMC samples into the mean_module, covar_module, and likelihood."""
        tkwargs = {"device": self.train_X.device, "dtype": self.train_X.dtype}
        num_mcmc_samples = len(mcmc_samples["mean"])

@@ -144,32 +142,27 @@ def load_mcmc_samples(
            mcmc_samples=mcmc_samples
        )

-        latent_covar_module = MaternKernel(
+        task_covar_module = MaternKernel(
            nu=2.5,
            ard_num_dims=self.task_rank,
            batch_shape=batch_shape,
        ).to(**tkwargs)
-        latent_covar_module.lengthscale = reshape_and_detach(
-            target=latent_covar_module.lengthscale,
+        task_covar_module.lengthscale = reshape_and_detach(
+            target=task_covar_module.lengthscale,
            new_value=mcmc_samples["task_lengthscale"],
        )
-        latent_features = mcmc_samples["latent_features"]
-        task_covar = latent_covar_module(latent_features)
-        task_covar_module = IndexKernel(
-            num_tasks=self.num_tasks,
-            rank=self.task_rank,
-            batch_shape=latent_features.shape[:-2],
+        latent_features = Parameter(
+            torch.rand(
+                batch_shape + torch.Size([self.num_tasks, self.task_rank]),
+                requires_grad=True,
+                **tkwargs,
+            )
        )
-        task_covar_module.covar_factor = Parameter(
-            task_covar.cholesky().to_dense().detach()
+        latent_features = reshape_and_detach(
+            target=latent_features,
+            new_value=mcmc_samples["latent_features"],
        )
-
-        # NOTE: 'var' is implicitly assumed to be zero from the sampling procedure in
-        # the FBMTGP model but not in the regular MTGP. I dont how if the var parameter
-        # affects predictions in practice, but setting it to zero is consistent with the
-        # previous implementation.
-        task_covar_module.var = torch.zeros_like(task_covar_module.var)
-        return mean_module, covar_module, likelihood, task_covar_module
+        return mean_module, covar_module, likelihood, task_covar_module, latent_features


class SaasFullyBayesianMultiTaskGP(MultiTaskGP):

@@ -368,6 +361,7 @@ def load_mcmc_samples(self, mcmc_samples: dict[str, Tensor]) -> None:
            self.covar_module,
            self.likelihood,
            self.task_covar_module,
+            self.latent_features,
        ) = self.pyro_model.load_mcmc_samples(mcmc_samples=mcmc_samples)

    def posterior(

@@ -397,7 +391,30 @@ def posterior(

    def forward(self, X: Tensor) -> MultivariateNormal:
        self._check_if_fitted()
-        return super().forward(X)
+        x_basic, task_idcs = self._split_inputs(X)
+
+        mean_x = self.mean_module(x_basic)
+        covar_x = self.covar_module(x_basic)
+
+        tsub_idcs = task_idcs.squeeze(-1)
+        if tsub_idcs.ndim > 1:
+            tsub_idcs = tsub_idcs.squeeze(-2)
+        latent_features = self.latent_features[:, tsub_idcs, :]
+
+        if X.ndim > 3:
+            # batch eval mode
+            # for X (batch_shape x num_samples x q x d), task_idcs[:,i,:,] are the same
+            # reshape X to (batch_shape x num_samples x q x d)
+            latent_features = latent_features.permute(
+                [-i for i in range(X.ndim - 1, 2, -1)]
+                + [0]
+                + [-i for i in range(2, 0, -1)]
+            )
+
+        # Combine the two in an ICM fashion
+        covar_i = self.task_covar_module(latent_features)
+        covar = covar_x.mul(covar_i)
+        return MultivariateNormal(mean_x, covar)

    def load_state_dict(self, state_dict: Mapping[str, Any], strict: bool = True):
        r"""Custom logic for loading the state dict.

@@ -439,40 +456,7 @@ def load_state_dict(self, state_dict: Mapping[str, Any], strict: bool = True):
            self.covar_module,
            self.likelihood,
            self.task_covar_module,
+            self.latent_features,
        ) = self.pyro_model.load_mcmc_samples(mcmc_samples=mcmc_samples)
        # Load the actual samples from the state dict
        super().load_state_dict(state_dict=state_dict, strict=strict)
-
-    def condition_on_observations(
-        self, X: Tensor, Y: Tensor, **kwargs: Any
-    ) -> BatchedMultiOutputGPyTorchModel:
-        """Conditions on additional observations for a Fully Bayesian model (either
-        identical across models or unique per-model).
-
-        Args:
-            X: A `batch_shape x num_samples x d`-dim Tensor, where `d` is
-                the dimension of the feature space and `batch_shape` is the number of
-                sampled models.
-            Y: A `batch_shape x num_samples x 1`-dim Tensor, where `d` is
-                the dimension of the feature space and `batch_shape` is the number of
-                sampled models.
-
-        Returns:
-            BatchedMultiOutputGPyTorchModel: A fully bayesian model conditioned on
-                given observations. The returned model has `batch_shape` copies of the
-                training data in case of identical observations (and `batch_shape`
-                training datasets otherwise).
-        """
-        if X.ndim == 2 and Y.ndim == 2:
-            # To avoid an error in GPyTorch when inferring the batch dimension, we add
-            # the explicit batch shape here. The result is that the conditioned model
-            # will have 'batch_shape' copies of the training data.
-            X = X.repeat(self.batch_shape + (1, 1))
-            Y = Y.repeat(self.batch_shape + (1, 1))
-
-        elif X.ndim < Y.ndim:
-            # We need to duplicate the training data to enable correct batch
-            # size inference in gpytorch.
-            X = X.repeat(*(Y.shape[:-2] + (1, 1)))
-
-        return super().condition_on_observations(X, Y, **kwargs)
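
The changes above replace the `IndexKernel`-based task covariance with a `MaternKernel` over per-sample `latent_features`, which the new `forward` combines with the data kernel in ICM fashion. A minimal usage sketch follows; the synthetic data, MCMC settings, and shapes are assumptions for illustration, not part of this commit.

import torch

from botorch.fit import fit_fully_bayesian_model_nuts
from botorch.models.fully_bayesian_multitask import SaasFullyBayesianMultiTaskGP

# Two tasks encoded in the last column of train_X (task_feature=-1).
X = torch.rand(20, 2, dtype=torch.double)
tasks = torch.randint(0, 2, (20, 1), dtype=torch.double)
train_X = torch.cat([X, tasks], dim=-1)
train_Y = X.sum(dim=-1, keepdim=True) + 0.1 * tasks

model = SaasFullyBayesianMultiTaskGP(train_X, train_Y, task_feature=-1)
fit_fully_bayesian_model_nuts(
    model, warmup_steps=32, num_samples=32, thinning=4, disable_progbar=True
)

# After fitting, `load_mcmc_samples` has populated `task_covar_module` and the
# per-sample `latent_features` used by the new `forward` to build the ICM covariance.
posterior = model.posterior(torch.rand(4, 2, dtype=torch.double))
print(posterior.mean.shape)  # includes an MCMC batch dimension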

botorch/models/gpytorch.py

Lines changed: 2 additions & 0 deletions
@@ -816,6 +816,7 @@ def _apply_noise(
        self,
        X: Tensor,
        mvn: MultivariateNormal,
+        num_outputs: int,
        observation_noise: bool | Tensor,
    ) -> MultivariateNormal:
        """Adds the observation noise to the posterior.

@@ -947,6 +948,7 @@ def posterior(
        mvn = self._apply_noise(
            X=X_full,
            mvn=mvn,
+            num_outputs=num_outputs,
            observation_noise=observation_noise,
        )
        # If single-output, return the posterior of a single-output model

botorch/optim/core.py

Lines changed: 2 additions & 2 deletions
@@ -78,8 +78,8 @@ def scipy_minimize(
        bounds: A dictionary mapping parameter names to lower and upper bounds.
        callback: A callable taking `parameters` and an OptimizationResult as arguments.
        x0: An optional initialization vector passed to scipy.optimize.minimize.
-        method: Solver type, passed along to scipy.optimize.minimize.
-        options: Dictionary of solver options, passed along to scipy.optimize.minimize.
+        method: Solver type, passed along to scipy.minimize.
+        options: Dictionary of solver options, passed along to scipy.minimize.
        timeout_sec: Timeout in seconds to wait before aborting the optimization loop
            if not converged (will return the best found solution thus far).
8585

botorch/optim/fit.py

Lines changed: 2 additions & 2 deletions
@@ -69,8 +69,8 @@ def fit_gpytorch_mll_scipy(
            Responsible for setting the `grad` attributes of `parameters`. If no closure
            is provided, one will be obtained by calling `get_loss_closure_with_grads`.
        closure_kwargs: Keyword arguments passed to `closure`.
-        method: Solver type, passed along to scipy.optimize.minimize.
-        options: Dictionary of solver options, passed along to scipy.optimize.minimize.
+        method: Solver type, passed along to scipy.minimize.
+        options: Dictionary of solver options, passed along to scipy.minimize.
        callback: Optional callback taking `parameters` and an OptimizationResult as its
            sole arguments.
        timeout_sec: Timeout in seconds after which to terminate the fitting loop
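
A minimal sketch of `fit_gpytorch_mll_scipy` with the `method` and `options` arguments referenced in this docstring; the model, data, and option values are illustrative assumptions.

import torch

from botorch.models import SingleTaskGP
from botorch.optim.fit import fit_gpytorch_mll_scipy
from gpytorch.mlls import ExactMarginalLogLikelihood

train_X = torch.rand(12, 3, dtype=torch.double)
train_Y = train_X.sin().sum(dim=-1, keepdim=True)
model = SingleTaskGP(train_X, train_Y)
mll = ExactMarginalLogLikelihood(model.likelihood, model)

# `method` selects the scipy solver; `options` is the scipy options dict.
result = fit_gpytorch_mll_scipy(mll, method="L-BFGS-B", options={"maxiter": 200})
print(result)  # OptimizationResult with the final loss and status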

botorch/optim/optimize.py

Lines changed: 6 additions & 32 deletions
@@ -528,29 +528,7 @@ def optimize_acqf(
    retry_on_optimization_warning: bool = True,
    **ic_gen_kwargs: Any,
) -> tuple[Tensor, Tensor]:
-    r"""Optimize the acquisition function for a single or multiple joint candidates.
-
-    A high-level description (missing exceptions for special setups):
-
-    This function optimizes the acquisition function `acq_function` in two steps:
-
-    i) It will sample `raw_samples` random points using Sobol sampling in the bounds
-    `bounds` and pass on the "best" `num_restarts` many.
-    The default way to find these "best" is via `gen_batch_initial_conditions`
-    (deviating for some acq functions, see `get_ic_generator`),
-    which by default performs Boltzmann sampling on the acquisition function value
-    (The behavior of step (i) can be further controlled by specifying `ic_generator`
-    or `batch_initial_conditions`.)
-
-    ii) A batch of the `num_restarts` points (or joint sets of points)
-    with the highest acquisition values in the previous step are then further
-    optimized. This is by default done by LBFGS-B optimization, if no constraints are
-    present, and SLSQP, if constraints are present (can be changed to
-    other optmizers via `gen_candidates`).
-
-    While the optimization procedure runs on CPU by default for this function,
-    the acq_function can be implemented on GPU and simply move the inputs
-    to GPU internally.
+    r"""Generate a set of candidates via multi-start optimization.

    Args:
        acq_function: An AcquisitionFunction.

@@ -559,13 +537,10 @@ def optimize_acqf(
            +inf, respectively).
        q: The number of candidates.
        num_restarts: The number of starting points for multistart acquisition
-            function optimization. Even though the name suggests this happens
-            sequentually, it is done in parallel (using batched evaluations)
-            for up to `options.batch_limit` candidates (by default completely parallel).
+            function optimization.
        raw_samples: The number of samples for initialization. This is required
            if `batch_initial_conditions` is not specified.
-        options: Options for both optimization, passed to `gen_candidates`,
-            and initialization, passed to the `ic_generator` via the `options` kwarg.
+        options: Options for candidate generation.
        inequality_constraints: A list of tuples (indices, coefficients, rhs),
            with each tuple encoding an inequality constraint of the form
            `\sum_i (X[indices[i]] * coefficients[i]) >= rhs`. `indices` and

@@ -611,11 +586,10 @@ def optimize_acqf(
            acquisition values) given a tensor of initial conditions and an
            acquisition function. Other common inputs include lower and upper bounds
            and a dictionary of options, but refer to the documentation of specific
-            generation functions (e.g., botorch.optim.optimize.gen_candidates_scipy
-            and botorch.generation.gen.gen_candidates_torch) for method-specific
-            inputs. Default: `gen_candidates_scipy`
+            generation functions (e.g gen_candidates_scipy and gen_candidates_torch)
+            for method-specific inputs. Default: `gen_candidates_scipy`
        sequential: If False, uses joint optimization, otherwise uses sequential
-            optimization for optimizing multiple joint candidates (q > 1).
+            optimization.
        ic_generator: Function for generating initial conditions. Not needed when
            `batch_initial_conditions` are provided. Defaults to
            `gen_one_shot_kg_initial_conditions` for `qKnowledgeGradient` acquisition
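
Since the shortened docstring drops the step-by-step description, a minimal end-to-end sketch of the call pattern may help: `raw_samples` Sobol points are screened down to `num_restarts` starting points, which `gen_candidates_scipy` then refines (L-BFGS-B here, since no constraints are passed). The toy model and the particular values are assumptions for illustration.

import torch

from botorch.acquisition import qExpectedImprovement
from botorch.fit import fit_gpytorch_mll
from botorch.models import SingleTaskGP
from botorch.optim import optimize_acqf
from gpytorch.mlls import ExactMarginalLogLikelihood

train_X = torch.rand(10, 2, dtype=torch.double)
train_Y = 1 - (train_X - 0.5).pow(2).sum(dim=-1, keepdim=True)
model = SingleTaskGP(train_X, train_Y)
fit_gpytorch_mll(ExactMarginalLogLikelihood(model.likelihood, model))

acqf = qExpectedImprovement(model, best_f=train_Y.max())
bounds = torch.stack([torch.zeros(2), torch.ones(2)]).to(torch.double)

# Multi-start optimization of q=2 joint candidates.
candidates, acq_value = optimize_acqf(
    acq_function=acqf,
    bounds=bounds,
    q=2,
    num_restarts=10,
    raw_samples=128,
    options={"maxiter": 200},
)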
