Update the remaining models to use new default covar & likelihood modules

saitcakmak · facebook-github-bot · commit f45f5baf76dc · 2024-09-04T11:34:07.000-07:00
Summary: Updates the default covariance and likelihood modules of the remaining BoTorch models. See #2451 for details on the new defaults. For models that use a composite kernel, such as the multi-fidelity, multi-task, and contextual models, this change only affects the base kernel. The following do not use the new modules: fully Bayesian models, Pairwise GP, and the fidelity kernels of multi-fidelity (MF) models. Differential Revision: D62196414
diff --git a/botorch/models/approximate_gp.py b/botorch/models/approximate_gp.py
@@ -40,8 +40,8 @@
 from botorch.models.transforms.outcome import OutcomeTransform
 from botorch.models.utils import validate_input_scaling
 from botorch.models.utils.gpytorch_modules import (
-    get_gaussian_likelihood_with_gamma_prior,
-    get_matern_kernel_with_gamma_prior,
+    get_covar_module_with_dim_scaled_prior,
+    get_gaussian_likelihood_with_lognormal_prior,
 )
 from botorch.models.utils.inducing_point_allocators import (
     GreedyVarianceReduction,
@@ -193,7 +193,7 @@ def __init__(
                 this does not have to be all of the training inputs).
             train_Y: Not used.
             num_outputs: Number of output responses per input.
-            covar_module: Kernel function. If omitted, uses a `MaternKernel`.
+            covar_module: Kernel function. If omitted, uses an `RBFKernel`.
             mean_module: Mean of GP model. If omitted, uses a `ConstantMean`.
             variational_distribution: Type of variational distribution to use
                 (default: CholeskyVariationalDistribution), the properties of the
@@ -217,14 +217,13 @@ def __init__(
         self._aug_batch_shape = aug_batch_shape
 
         if covar_module is None:
-            covar_module = get_matern_kernel_with_gamma_prior(
+            covar_module = get_covar_module_with_dim_scaled_prior(
                 ard_num_dims=train_X.shape[-1],
                 batch_shape=self._aug_batch_shape,
             ).to(train_X)
             self._subset_batch_dict = {
                 "mean_module.constant": -2,
-                "covar_module.raw_outputscale": -1,
-                "covar_module.base_kernel.raw_lengthscale": -3,
+                "covar_module.raw_lengthscale": -3,
             }
 
         if inducing_point_allocator is None:
@@ -343,7 +342,7 @@ def __init__(
                 either a `GaussianLikelihood` (if `num_outputs=1`) or a
                 `MultitaskGaussianLikelihood`(if `num_outputs>1`).
             num_outputs: Number of output responses per input (default: 1).
-            covar_module: Kernel function. If omitted, uses a `MaternKernel`.
+            covar_module: Kernel function. If omitted, uses an `RBFKernel`.
             mean_module: Mean of GP model. If omitted, uses a `ConstantMean`.
             variational_distribution: Type of variational distribution to use
                 (default: CholeskyVariationalDistribution), the properties of the
@@ -378,7 +377,7 @@ def __init__(
 
         if likelihood is None:
             if num_outputs == 1:
-                likelihood = get_gaussian_likelihood_with_gamma_prior(
+                likelihood = get_gaussian_likelihood_with_lognormal_prior(
                     batch_shape=self._aug_batch_shape
                 )
             else:
diff --git a/botorch/models/contextual_multioutput.py b/botorch/models/contextual_multioutput.py
@@ -64,7 +64,7 @@ def __init__(
                 is common across all tasks.
             mean_module: The mean function to be used. Defaults to `ConstantMean`.
             covar_module: The module for computing the covariance matrix between
-                the non-task features. Defaults to `MaternKernel`.
+                the non-task features. Defaults to `RBFKernel`.
             likelihood: A likelihood. The default is selected based on `train_Yvar`.
                 If `train_Yvar` is None, a standard `GaussianLikelihood` with inferred
                 noise level is used. Otherwise, a FixedNoiseGaussianLikelihood is used.
diff --git a/botorch/models/gp_regression.py b/botorch/models/gp_regression.py
@@ -149,7 +149,7 @@ def __init__(
                 is None, and a `FixedNoiseGaussianLikelihood` with the given
                 noise observations if `train_Yvar` is not None.
             covar_module: The module computing the covariance (Kernel) matrix.
-                If omitted, use a `MaternKernel`.
+                If omitted, uses an `RBFKernel`.
             mean_module: The mean function to be used. If omitted, use a
                 `ConstantMean`.
             outcome_transform: An outcome transform that is applied to the
diff --git a/botorch/models/gp_regression_fidelity.py b/botorch/models/gp_regression_fidelity.py
@@ -26,7 +26,6 @@
 from __future__ import annotations
 
 import warnings
-
 from typing import Any, Optional, Union
 
 import torch
@@ -39,9 +38,9 @@
 )
 from botorch.models.transforms.input import InputTransform
 from botorch.models.transforms.outcome import OutcomeTransform
+from botorch.models.utils.gpytorch_modules import get_covar_module_with_dim_scaled_prior
 from botorch.utils.datasets import SupervisedDataset
 from gpytorch.kernels.kernel import ProductKernel
-from gpytorch.kernels.rbf_kernel import RBFKernel
 from gpytorch.kernels.scale_kernel import ScaleKernel
 from gpytorch.likelihoods.likelihood import Likelihood
 from gpytorch.priors.torch_priors import GammaPrior
@@ -273,10 +272,9 @@ def _setup_multifidelity_covar_module(
             non_active_dims.add(iteration_fidelity)
         active_dimsX = sorted(set(range(dim)) - non_active_dims)
         kernels.append(
-            RBFKernel(
+            get_covar_module_with_dim_scaled_prior(
                 ard_num_dims=len(active_dimsX),
                 batch_shape=aug_batch_shape,
-                lengthscale_prior=GammaPrior(3.0, 6.0),
                 active_dims=active_dimsX,
             )
         )
diff --git a/botorch/models/gp_regression_mixed.py b/botorch/models/gp_regression_mixed.py
@@ -13,15 +13,13 @@
 from botorch.models.kernels.categorical import CategoricalKernel
 from botorch.models.transforms.input import InputTransform
 from botorch.models.transforms.outcome import OutcomeTransform
+from botorch.models.utils.gpytorch_modules import get_covar_module_with_dim_scaled_prior
 from botorch.utils.datasets import SupervisedDataset
 from botorch.utils.transforms import normalize_indices
 from gpytorch.constraints import GreaterThan
 from gpytorch.kernels.kernel import Kernel
-from gpytorch.kernels.matern_kernel import MaternKernel
 from gpytorch.kernels.scale_kernel import ScaleKernel
-from gpytorch.likelihoods.gaussian_likelihood import GaussianLikelihood
 from gpytorch.likelihoods.likelihood import Likelihood
-from gpytorch.priors import GammaPrior
 from torch import Tensor
 
 
@@ -82,7 +80,7 @@ def __init__(
             cont_kernel_factory: A method that accepts  `batch_shape`, `ard_num_dims`,
                 and `active_dims` arguments and returns an instantiated GPyTorch
                 `Kernel` object to be used as the base kernel for the continuous
-                dimensions. If omitted, this model uses a Matern-2.5 kernel as
+                dimensions. If omitted, this model uses an `RBFKernel` as
                 the kernel for the ordinal parameters.
             likelihood: A likelihood. If omitted, use a standard
                 GaussianLikelihood with inferred noise level.
@@ -105,30 +103,7 @@ def __init__(
         _, aug_batch_shape = self.get_batch_dimensions(train_X=train_X, train_Y=train_Y)
 
         if cont_kernel_factory is None:
-
-            def cont_kernel_factory(
-                batch_shape: torch.Size,
-                ard_num_dims: int,
-                active_dims: list[int],
-            ) -> MaternKernel:
-                return MaternKernel(
-                    nu=2.5,
-                    batch_shape=batch_shape,
-                    ard_num_dims=ard_num_dims,
-                    active_dims=active_dims,
-                    lengthscale_constraint=GreaterThan(1e-04),
-                )
-
-        if likelihood is None and train_Yvar is None:
-            # This Gamma prior is quite close to the Horseshoe prior
-            min_noise = 1e-5 if train_X.dtype == torch.float else 1e-6
-            likelihood = GaussianLikelihood(
-                batch_shape=aug_batch_shape,
-                noise_constraint=GreaterThan(
-                    min_noise, transform=None, initial_value=1e-3
-                ),
-                noise_prior=GammaPrior(0.9, 10.0),
-            )
+            cont_kernel_factory = get_covar_module_with_dim_scaled_prior
 
         d = train_X.shape[-1]
         cat_dims = normalize_indices(indices=cat_dims, d=d)
diff --git a/botorch/models/higher_order_gp.py b/botorch/models/higher_order_gp.py
@@ -27,18 +27,19 @@
 from botorch.models.utils import gpt_posterior_settings
 from botorch.models.utils.assorted import fantasize as fantasize_flag
 from botorch.models.utils.gpytorch_modules import (
-    get_gaussian_likelihood_with_gamma_prior,
+    get_covar_module_with_dim_scaled_prior,
+    get_gaussian_likelihood_with_lognormal_prior,
 )
 from botorch.posteriors import (
     GPyTorchPosterior,
     HigherOrderGPPosterior,
     TransformedPosterior,
 )
 from gpytorch.distributions import MultivariateNormal
-from gpytorch.kernels import Kernel, MaternKernel
+from gpytorch.kernels import Kernel
 from gpytorch.likelihoods import Likelihood
 from gpytorch.models import ExactGP
-from gpytorch.priors.torch_priors import GammaPrior, MultivariateNormalPrior
+from gpytorch.priors.torch_priors import MultivariateNormalPrior
 from gpytorch.settings import fast_pred_var, skip_posterior_variances
 from linear_operator.operators import (
     BatchRepeatLinearOperator,
@@ -232,7 +233,7 @@ def __init__(
         self._input_batch_shape = batch_shape
 
         if likelihood is None:
-            likelihood = get_gaussian_likelihood_with_gamma_prior(
+            likelihood = get_gaussian_likelihood_with_lognormal_prior(
                 batch_shape=self._aug_batch_shape
             )
         else:
@@ -249,9 +250,7 @@ def __init__(
         else:
             self.covar_modules = ModuleList(
                 [
-                    MaternKernel(
-                        nu=2.5,
-                        lengthscale_prior=GammaPrior(3.0, 6.0),
+                    get_covar_module_with_dim_scaled_prior(
                         batch_shape=self._aug_batch_shape,
                         ard_num_dims=1 if dim > 0 else train_X.shape[-1],
                     )
diff --git a/botorch/models/kernels/contextual_lcea.py b/botorch/models/kernels/contextual_lcea.py
@@ -7,9 +7,9 @@
 from typing import Any, Optional
 
 import torch
+from botorch.models.utils.gpytorch_modules import get_covar_module_with_dim_scaled_prior
 from gpytorch.constraints import Positive
 from gpytorch.kernels.kernel import Kernel
-from gpytorch.kernels.matern_kernel import MaternKernel
 from gpytorch.priors.torch_priors import GammaPrior
 from linear_operator.operators import DiagLinearOperator
 from linear_operator.operators.dense_linear_operator import DenseLinearOperator
@@ -158,18 +158,14 @@ def __init__(
         if train_embedding:
             self._set_emb_layers()
         # task covariance matrix
-        self.task_covar_module = MaternKernel(
-            nu=2.5,
+        self.task_covar_module = get_covar_module_with_dim_scaled_prior(
             ard_num_dims=self.n_embs,
             batch_shape=batch_shape,
-            lengthscale_prior=GammaPrior(3.0, 6.0),
         )
         # base kernel
-        self.base_kernel = MaternKernel(
-            nu=2.5,
+        self.base_kernel = get_covar_module_with_dim_scaled_prior(
             ard_num_dims=self.num_param,
             batch_shape=batch_shape,
-            lengthscale_prior=GammaPrior(3.0, 6.0),
         )
         # outputscales for each context (note this is like sqrt of outputscale)
         self.context_weight = None
diff --git a/botorch/models/kernels/contextual_sac.py b/botorch/models/kernels/contextual_sac.py
@@ -7,8 +7,8 @@
 from typing import Any, Optional
 
 import torch
+from botorch.models.utils.gpytorch_modules import get_covar_module_with_dim_scaled_prior
 from gpytorch.kernels.kernel import Kernel
-from gpytorch.kernels.matern_kernel import MaternKernel
 from gpytorch.kernels.scale_kernel import ScaleKernel
 from gpytorch.priors.torch_priors import GammaPrior
 from linear_operator.operators.sum_linear_operator import SumLinearOperator
@@ -36,7 +36,7 @@ class SACKernel(Kernel):
     where
     * :math: M is the number of partitions of parameter space. Each partition contains
     same number of parameters d. Each kernel `k_i` acts only on d parameters of ith
-    partition i.e. `\mathbf{x}_(i)`. Each kernel `k_i` is a scaled Matern kernel
+    partition i.e. `\mathbf{x}_(i)`. Each kernel `k_i` is a scaled RBF kernel
     with same lengthscales but different outputscales.
     """
 
@@ -72,11 +72,9 @@ def __init__(
             for context, active_params in self.decomposition.items()
         }
 
-        self.base_kernel = MaternKernel(
-            nu=2.5,
+        self.base_kernel = get_covar_module_with_dim_scaled_prior(
             ard_num_dims=num_param,
             batch_shape=batch_shape,
-            lengthscale_prior=GammaPrior(3.0, 6.0),
         )
 
         self.kernel_dict = {}  # scaled kernel for each parameter space partition
diff --git a/botorch/models/multitask.py b/botorch/models/multitask.py
@@ -40,7 +40,8 @@
 from botorch.models.transforms.input import InputTransform
 from botorch.models.transforms.outcome import OutcomeTransform
 from botorch.models.utils.gpytorch_modules import (
-    get_matern_kernel_with_gamma_prior,
+    get_covar_module_with_dim_scaled_prior,
+    get_gaussian_likelihood_with_lognormal_prior,
     MIN_INFERRED_NOISE_LEVEL,
 )
 from botorch.posteriors.multitask import MultitaskGPPosterior
@@ -51,12 +52,8 @@
 )
 from gpytorch.distributions.multivariate_normal import MultivariateNormal
 from gpytorch.kernels.index_kernel import IndexKernel
-from gpytorch.kernels.matern_kernel import MaternKernel
 from gpytorch.kernels.multitask_kernel import MultitaskKernel
-from gpytorch.likelihoods.gaussian_likelihood import (
-    FixedNoiseGaussianLikelihood,
-    GaussianLikelihood,
-)
+from gpytorch.likelihoods.gaussian_likelihood import FixedNoiseGaussianLikelihood
 from gpytorch.likelihoods.likelihood import Likelihood
 from gpytorch.likelihoods.multitask_gaussian_likelihood import (
     MultitaskGaussianLikelihood,
@@ -167,7 +164,7 @@ def __init__(
                 Note that the inferred noise is common across all tasks.
             mean_module: The mean function to be used. Defaults to `ConstantMean`.
             covar_module: The module for computing the covariance matrix between
-                the non-task features. Defaults to `MaternKernel`.
+                the non-task features. Defaults to `RBFKernel`.
             likelihood: A likelihood. The default is selected based on `train_Yvar`.
                 If `train_Yvar` is None, a standard `GaussianLikelihood` with inferred
                 noise level is used. Otherwise, a FixedNoiseGaussianLikelihood is used.
@@ -233,7 +230,7 @@ def __init__(
         # TODO (T41270962): Support task-specific noise levels in likelihood
         if likelihood is None:
             if train_Yvar is None:
-                likelihood = GaussianLikelihood(noise_prior=GammaPrior(1.1, 0.05))
+                likelihood = get_gaussian_likelihood_with_lognormal_prior()
             else:
                 likelihood = FixedNoiseGaussianLikelihood(noise=train_Yvar.squeeze(-1))
 
@@ -247,7 +244,7 @@ def __init__(
         )
         self.mean_module = mean_module or ConstantMean()
         if covar_module is None:
-            self.covar_module = get_matern_kernel_with_gamma_prior(
+            self.covar_module = get_covar_module_with_dim_scaled_prior(
                 ard_num_dims=self.num_non_task_features
             )
         else:
@@ -442,7 +439,7 @@ def __init__(
                 `MultitaskGaussianLikelihood` with a `GammaPrior(1.1, 0.05)`
                 noise prior.
             data_covar_module: The module computing the covariance (Kernel) matrix
-                in data space. If omitted, use a `MaternKernel`.
+                in data space. If omitted, uses an `RBFKernel`.
             task_covar_prior : A Prior on the task covariance matrix. Must operate
                 on p.s.d. matrices. A common prior for this is the `LKJ` prior. If
                 omitted, uses `LKJCovariancePrior` with `eta` parameter as specified
@@ -500,10 +497,8 @@ def __init__(
             base_means=ConstantMean(batch_shape=batch_shape), num_tasks=num_tasks
         )
         if data_covar_module is None:
-            data_covar_module = MaternKernel(
-                nu=2.5,
+            data_covar_module = get_covar_module_with_dim_scaled_prior(
                 ard_num_dims=ard_num_dims,
-                lengthscale_prior=GammaPrior(3.0, 6.0),
                 batch_shape=batch_shape,
             )
         else:
diff --git a/botorch/models/utils/gpytorch_modules.py b/botorch/models/utils/gpytorch_modules.py
@@ -18,7 +18,7 @@
 """
 
 from math import log, sqrt
-from typing import Optional, Union
+from typing import Optional, Sequence, Union
 
 import torch
 from gpytorch.constraints.constraints import GreaterThan
@@ -101,7 +101,8 @@ def get_covar_module_with_dim_scaled_prior(
     ard_num_dims: int,
     batch_shape: Optional[torch.Size] = None,
     use_rbf_kernel: bool = True,
-) -> Union[MaternKernel, RBFKernel, ScaleKernel]:
+    active_dims: Optional[Sequence[int]] = None,
+) -> Union[MaternKernel, RBFKernel]:
     """Returns an RBF or Matern kernel with priors
     from  [Hvarfner2024vanilla]_.
 
@@ -123,5 +124,7 @@ def get_covar_module_with_dim_scaled_prior(
         lengthscale_constraint=GreaterThan(
             2.5e-2, transform=None, initial_value=lengthscale_prior.mode
         ),
+        # pyre-ignore[6] GPyTorch type is unnecessarily restrictive.
+        active_dims=active_dims,
     )
     return base_kernel
diff --git a/docs/models.md b/docs/models.md
@@ -121,10 +121,10 @@ instead.
   a fully Bayesian multi-task GP using an ICM kernel. The data kernel uses the
   SAAS prior to model high-dimensional parameter spaces.
 
-All of the above models use Matérn 5/2 kernels with Automatic Relevance
-Discovery (ARD), and have reasonable priors on hyperparameters that make them
-work well in settings where the **input features are normalized to the unit
-cube** and the **observations are standardized** (zero mean, unit variance).
+The above models (except the fully Bayesian ones) use RBF kernels with Automatic
+Relevance Discovery (ARD), and have reasonable priors on hyperparameters that
+make them work well when the **input features are normalized to the unit
+cube** and the **observations are standardized** (zero mean, unit variance).
 
 ## Other useful models
 
@@ -182,6 +182,6 @@ model. If you wish to use gradient-based optimization algorithms, the model
 should allow back-propagating gradients through the samples to the model input.
 
 If you happen to implement a model that would be useful for other researchers as
-well (and involves more than just swapping out the Matérn kernel for an RBF
+well (and involves more than just swapping out the RBF kernel for a Matérn
 kernel), please consider [contributing](getting_started#contributing) this model
 to BoTorch.
diff --git a/test/models/kernels/test_contextual.py b/test/models/kernels/test_contextual.py
diff --git a/test/models/test_approximate_gp.py b/test/models/test_approximate_gp.py
diff --git a/test/models/test_gp_regression_mixed.py b/test/models/test_gp_regression_mixed.py
diff --git a/test/models/test_gpytorch.py b/test/models/test_gpytorch.py
diff --git a/test/models/test_multitask.py b/test/models/test_multitask.py