Pathwise Thompson sampling for ensemble models (#2877)

SebastianAment · facebook-github-bot · commit b03429945485 · 2025-06-12T08:00:05.000-07:00
Summary: Pull Request resolved: #2877 This commit adds support for pathwise Thompson sampling for ensemble models, including fully Bayesian SAAS models. Differential Revision: D75990595
diff --git a/botorch/acquisition/thompson_sampling.py b/botorch/acquisition/thompson_sampling.py
@@ -9,7 +9,7 @@
 from botorch.acquisition.objective import PosteriorTransform
 from botorch.models.model import Model
 from botorch.sampling.pathwise.posterior_samplers import get_matheron_path_model
-from botorch.utils.transforms import t_batch_mode_transform
+from botorch.utils.transforms import is_ensemble, t_batch_mode_transform
 from torch import Tensor
 
 
@@ -42,45 +42,91 @@ def __init__(
                 a PosteriorTransform that transforms the multi-output posterior into a
                 single-output posterior is required.
         """
-        if model._is_fully_bayesian:
-            raise NotImplementedError(
-                "PathwiseThompsonSampling is not supported for fully Bayesian models",
-            )
 
         super().__init__(model=model)
         self.batch_size: int | None = None
 
-    def redraw(self) -> None:
+    def redraw(self, batch_size: int) -> None:
+        sample_shape = (batch_size,)
         self.samples = get_matheron_path_model(
-            model=self.model, sample_shape=torch.Size([self.batch_size])
+            model=self.model, sample_shape=torch.Size(sample_shape)
         )
+        if is_ensemble(self.model):
+            # the ensembling dimension is assumed to be part of the batch shape
+            # could add a dedicated property to keep track of the ensembling dimension
+            # i.e. generalizing num_mcmc_samples in AbstractFullyBayesianSingleTaskGP
+            model_batch_shape = self.model.batch_shape
+            if len(model_batch_shape) > 1:
+                raise NotImplementedError(
+                    "Ensemble models with more than one ensemble dimension are not "
+                    "yet supported."
+                )
+            num_ensemble = model_batch_shape[0]
+            self.ensemble_indices = torch.randint(
+                0,
+                num_ensemble,
+                (*sample_shape, 1, self.model.num_outputs),
+            )
 
     @t_batch_mode_transform()
     def forward(self, X: Tensor) -> Tensor:
         r"""Evaluate the pathwise posterior sample draws on the candidate set X.
 
         Args:
-            X: A `(b1 x ... bk) x 1 x d`-dim batched tensor of `d`-dim design points.
+            X: A `batch_shape x q x d`-dim batched tensor of `d`-dim design points.
 
         Returns:
-            A `(b1 x ... bk) x [num_models for fully bayesian]`-dim tensor of
-            evaluations on the posterior sample draws.
+            A `batch_shape [x m]`-dim tensor of evaluations on the posterior sample
+            draws, where `m` is the number of outputs of the model.
         """
         batch_size = X.shape[-2]
         q_dim = -2
-
         # batch_shape x q x 1 x d
         X = X.unsqueeze(-2)
         if self.batch_size is None:
             self.batch_size = batch_size
-            self.redraw()
+            self.redraw(batch_size=batch_size)
         elif self.batch_size != batch_size:
             raise ValueError(
                 BATCH_SIZE_CHANGE_ERROR.format(self.batch_size, batch_size)
             )
-
-        # posterior_values.shape post-squeeze:
+        # batch_shape x q [x num_ensembles] x 1 x m
+        posterior_values = self.samples(X)
+        # batch_shape x q [x num_ensembles] x m
+        posterior_values = posterior_values.squeeze(-2)
         # batch_shape x q x m
-        posterior_values = self.samples(X).squeeze(-2)
-        # sum over batch dim and squeeze num_objectives dim (-1)
-        return posterior_values.sum(q_dim).squeeze(-1)
+        posterior_values = self.select_from_ensemble_models(values=posterior_values)
+        # NOTE: can leverage batched L-BFGS computation instead of summing in the future
+        # sum over batch dim and squeeze num_objectives dim (-1): batch_shape [x m]
+        acqf_vals = posterior_values.sum(q_dim).squeeze(-1)
+        return acqf_vals
+
+    def select_from_ensemble_models(self, values: Tensor):
+        """Subselecting a value associated with a single sample in the ensemble for each
+        element of samples that is not associated with an ensemble dimension. NOTE: uses
+        `self.model` and `is_ensemble` to determine whether or not an ensembling
+        dimension is present.
+
+        Args:
+            values: A `batch_shape x num_draws x q [x num_ensemble] x m`-dim Tensor.
+
+        Returns:
+            A `batch_shape x num_draws x q x m`-dim Tensor where each element was
+            chosen independently at random from the ensemble dimension.
+        """
+        if not is_ensemble(self.model):
+            return values
+
+        ensemble_dim = -2
+        # `ensemble_indices` are fixed so that the acquisition function becomes
+        # deterministic for the same input and can be optimized with LBFGS.
+        # ensemble indices have shape num_paths x 1 x m
+        self.ensemble_indices = self.ensemble_indices.to(device=values.device)
+        index = self.ensemble_indices
+        input_batch_shape = values.shape[:-3]
+        index = index.expand(*input_batch_shape, *index.shape)
+        # samples is batch_shape x q x num_ensemble x m
+        values_wo_ensemble = torch.gather(values, dim=ensemble_dim, index=index)
+        return values_wo_ensemble.squeeze(
+            ensemble_dim
+        )  # removing the ensemble dimension
diff --git a/botorch/sampling/pathwise/paths.py b/botorch/sampling/pathwise/paths.py
@@ -147,6 +147,7 @@ def __init__(
         bias_module: Module | None = None,
         input_transform: TInputTransform | None = None,
         output_transform: TOutputTransform | None = None,
+        is_ensemble: bool = False,
     ):
         r"""Initializes a GeneralizedLinearPath instance.
 
@@ -161,6 +162,7 @@ def __init__(
             bias_module: An optional module used to define additive offsets.
             input_transform: An optional input transform for the module.
             output_transform: An optional output transform for the module.
+            is_ensemble: Whether the associated model is an ensemble model or not.
         """
         super().__init__()
         self.feature_map = feature_map
@@ -170,8 +172,13 @@ def __init__(
         self.bias_module = bias_module
         self.input_transform = input_transform
         self.output_transform = output_transform
+        self.is_ensemble = is_ensemble
 
     def forward(self, x: Tensor, **kwargs) -> Tensor:
+        if self.is_ensemble:
+            # assuming that the ensembling dimension is added after (n, d), but
+            # before the other batch dimensions, starting from the left.
+            x = x.unsqueeze(-3)
         feat = self.feature_map(x, **kwargs)
         out = (feat @ self.weight.unsqueeze(-1)).squeeze(-1)
         return out if self.bias_module is None else out + self.bias_module(x)
diff --git a/botorch/sampling/pathwise/prior_samplers.py b/botorch/sampling/pathwise/prior_samplers.py
@@ -24,6 +24,7 @@
 )
 from botorch.utils.dispatcher import Dispatcher
 from botorch.utils.sampling import draw_sobol_normal_samples
+from botorch.utils.transforms import is_ensemble
 from gpytorch.kernels import Kernel
 from gpytorch.models import ApproximateGP, ExactGP, GP
 from gpytorch.variational import _VariationalStrategy
@@ -61,6 +62,7 @@ def _draw_kernel_feature_paths_fallback(
     input_transform: TInputTransform | None = None,
     output_transform: TOutputTransform | None = None,
     weight_generator: Callable[[Size], Tensor] | None = None,
+    is_ensemble: bool = False,
 ) -> GeneralizedLinearPath:
     # Generate a kernel feature map
     feature_map = map_generator(
@@ -89,6 +91,7 @@ def _draw_kernel_feature_paths_fallback(
         bias_module=mean_module,
         input_transform=input_transform,
         output_transform=output_transform,
+        is_ensemble=is_ensemble,
     )
 
 
@@ -103,6 +106,7 @@ def _draw_kernel_feature_paths_ExactGP(
         covar_module=model.covar_module,
         input_transform=get_input_transform(model),
         output_transform=get_output_transform(model),
+        is_ensemble=is_ensemble(model),
         **kwargs,
     )
 
@@ -150,5 +154,6 @@ def _draw_kernel_feature_paths_ApproximateGP_fallback(
         num_inputs=num_inputs,
         mean_module=model.mean_module,
         covar_module=model.covar_module,
+        is_ensemble=is_ensemble(model),
         **kwargs,
     )
diff --git a/botorch/sampling/pathwise/update_strategies.py b/botorch/sampling/pathwise/update_strategies.py
@@ -13,6 +13,7 @@
 from typing import Any
 
 import torch
+
 from botorch.models.approximate_gp import ApproximateGPyTorchModel
 from botorch.models.transforms.input import InputTransform
 from botorch.sampling.pathwise.features import KernelEvaluationMap
@@ -24,6 +25,7 @@
     TInputTransform,
 )
 from botorch.utils.dispatcher import Dispatcher
+from botorch.utils.transforms import is_ensemble
 from botorch.utils.types import DEFAULT
 from gpytorch.kernels.kernel import Kernel
 from gpytorch.likelihoods import _GaussianLikelihoodBase, Likelihood
@@ -79,6 +81,7 @@ def _gaussian_update_exact(
     noise_covariance: Tensor | LinearOperator | None = None,
     scale_tril: Tensor | LinearOperator | None = None,
     input_transform: TInputTransform | None = None,
+    is_ensemble: bool = False,
 ) -> GeneralizedLinearPath:
     # Prepare Cholesky factor of `Cov(y, y)` and noise sample values as needed
     if isinstance(noise_covariance, (NoneType, ZeroLinearOperator)):
@@ -103,7 +106,9 @@ def _gaussian_update_exact(
         points=points,
         input_transform=input_transform,
     )
-    return GeneralizedLinearPath(feature_map=feature_map, weight=weight.squeeze(-1))
+    return GeneralizedLinearPath(
+        feature_map=feature_map, weight=weight.squeeze(-1), is_ensemble=is_ensemble
+    )
 
 
 @GaussianUpdate.register(ExactGP, _GaussianLikelihoodBase)
@@ -134,6 +139,7 @@ def _gaussian_update_ExactGP(
         noise_covariance=noise_covariance,
         scale_tril=scale_tril,
         input_transform=get_input_transform(model),
+        is_ensemble=is_ensemble(model),
     )
 
 
@@ -194,4 +200,5 @@ def _gaussian_update_ApproximateGP_VariationalStrategy(
         sample_values=sample_values,
         scale_tril=L,
         input_transform=input_transform,
+        is_ensemble=is_ensemble(model),
     )
diff --git a/botorch/utils/test_helpers.py b/botorch/utils/test_helpers.py
@@ -39,19 +39,6 @@
 from torch.nn.functional import pad
 
 
-def _get_mcmc_samples(num_samples: int, dim: int, infer_noise: bool, **tkwargs):
-    mcmc_samples = {
-        "lengthscale": 1 + torch.rand(num_samples, 1, dim, **tkwargs),
-        "outputscale": 1 + torch.rand(num_samples, **tkwargs),
-        "mean": torch.randn(num_samples, **tkwargs),
-    }
-    if infer_noise:
-        mcmc_samples["noise"] = torch.rand(num_samples, 1, **tkwargs)
-    mcmc_samples["lengthscale"] = mcmc_samples["lengthscale"]
-
-    return mcmc_samples
-
-
 def get_model(
     train_X: Tensor,
     train_Y: Tensor,
@@ -93,8 +80,8 @@ def get_fully_bayesian_model(
     train_X: Tensor,
     train_Y: Tensor,
     num_models: int,
-    standardize_model: bool,
-    infer_noise: bool,
+    standardize_model: bool = False,
+    infer_noise: bool = True,
     **tkwargs: Any,
 ) -> SaasFullyBayesianSingleTaskGP:
     num_objectives = train_Y.shape[-1]
@@ -122,6 +109,20 @@ def get_fully_bayesian_model(
     return model
 
 
+def _get_mcmc_samples(
+    num_samples: int, dim: int, infer_noise: bool, **tkwargs
+) -> dict[str, Tensor]:
+    mcmc_samples = {
+        "lengthscale": 1 + torch.rand(num_samples, 1, dim, **tkwargs),
+        "outputscale": 1 + torch.rand(num_samples, **tkwargs),
+        "mean": torch.randn(num_samples, **tkwargs),
+    }
+    if infer_noise:
+        mcmc_samples["noise"] = torch.rand(num_samples, 1, **tkwargs)
+
+    return mcmc_samples
+
+
 def get_fully_bayesian_model_list(
     train_X: Tensor,
     train_Y: Tensor,
diff --git a/test/acquisition/test_thompson_sampling.py b/test/acquisition/test_thompson_sampling.py
diff --git a/test/sampling/pathwise/test_posterior_samplers.py b/test/sampling/pathwise/test_posterior_samplers.py