Use Standardize outcome transform by default in more models (#2532)
Summary:
Pull Request resolved: #2532

Makes the models whose priors were updated in #2507 use the `Standardize` outcome transform by default, mimicking #2458

Also removes some deprecated functionality in the process: the `data_fidelity` argument to `SingleTaskMultiFidelityGP`, as well as the `FixedNoiseMultiFidelityGP` and `FixedNoiseLCEMGP` models.
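
As a rough usage sketch of the new default (not code from this diff; the tensor shapes and fidelity column index are illustrative, and the `outcome_transform` attribute check assumes the usual `SingleTaskGP` behavior of storing the transform on the model):

import torch
from botorch.models import SingleTaskMultiFidelityGP
from botorch.models.transforms.outcome import Standardize

train_X = torch.rand(20, 3)  # column 2 is the downsampling fidelity, in [0, 1]
train_Y = torch.rand(20, 1)

# With this change, train_Y is standardized internally by default.
model = SingleTaskMultiFidelityGP(train_X, train_Y, data_fidelities=[2])
assert isinstance(model.outcome_transform, Standardize)

# Passing `outcome_transform=None` opts out of the default.
model_raw = SingleTaskMultiFidelityGP(
    train_X, train_Y, data_fidelities=[2], outcome_transform=None
)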

Reviewed By: saitcakmak, esantorella

Differential Revision: D62552307
Balandat authored and facebook-github-bot committed Sep 14, 2024
1 parent c895a8d commit 7f29dda
Showing 16 changed files with 342 additions and 309 deletions.
40 changes: 38 additions & 2 deletions botorch/models/approximate_gp.py
@@ -35,6 +35,7 @@
from typing import Optional, TypeVar, Union

import torch
from botorch.exceptions.warnings import UserInputWarning
from botorch.models.gpytorch import GPyTorchModel
from botorch.models.transforms.input import InputTransform
from botorch.models.transforms.outcome import OutcomeTransform
@@ -70,6 +71,14 @@


TApproxModel = TypeVar("TApproxModel", bound="ApproximateGPyTorchModel")
TRANSFORM_WARNING = (
"Using an {ttype} transform with `SingleTaskVariationalGP`. If this "
"model is trained in minibatches, a {ttype} transform with learnable "
"parameters would update its parameters for each minibatch, which is "
"undesirable. If you do intend to train in minibatches, we recommend "
"you not use a {ttype} transform and instead pre-transform your whole "
"data set before fitting the model."
)


class ApproximateGPyTorchModel(GPyTorchModel):
@@ -325,9 +334,9 @@ def __init__(
variational_distribution: Optional[_VariationalDistribution] = None,
variational_strategy: type[_VariationalStrategy] = VariationalStrategy,
inducing_points: Optional[Union[Tensor, int]] = None,
outcome_transform: Optional[OutcomeTransform] = None,
input_transform: Optional[InputTransform] = None,
inducing_point_allocator: Optional[InducingPointAllocator] = None,
outcome_transform: OutcomeTransform | None = None,
input_transform: InputTransform | None = None,
) -> None:
r"""
Args:
@@ -338,6 +347,8 @@ def __init__(
either a `GaussianLikelihood` (if `num_outputs=1`) or a
`MultitaskGaussianLikelihood`(if `num_outputs>1`).
num_outputs: Number of output responses per input (default: 1).
learn_inducing_points: If True, the inducing point locations are learned
jointly with the other model parameters.
covar_module: Kernel function. If omitted, uses an `RBFKernel`.
mean_module: Mean of GP model. If omitted, uses a `ConstantMean`.
variational_distribution: Type of variational distribution to use
@@ -351,13 +362,32 @@ def __init__(
inducing_point_allocator: The `InducingPointAllocator` used to
initialize the inducing point locations. If omitted,
uses `GreedyVarianceReduction`.
outcome_transform: An outcome transform that is applied to the training
data during instantiation and to the posterior during inference.
NOTE: If this model is trained in minibatches, an outcome transform
with learnable parameters (such as `Standardize`) would update its
parameters for each minibatch, which is undesirable. If you do intend
to train in minibatches, we recommend you not use an outcome transform
and instead pre-transform your whole data set before fitting the model.
input_transform: An input transform that is applied in the model's
forward pass.
NOTE: If this model is trained in minibatches, an input transform
with learnable parameters (such as `Normalize`) would update its
parameters for each minibatch, which is undesirable. If you do intend
to train in minibatches, we recommend you not use an input transform
and instead pre-transform your whole data set before fitting the model.
"""
with torch.no_grad():
transformed_X = self.transform_inputs(
X=train_X, input_transform=input_transform
)
if train_Y is not None:
if outcome_transform is not None:
warnings.warn(
TRANSFORM_WARNING.format(ttype="outcome"),
UserInputWarning,
stacklevel=3,
)
train_Y, _ = outcome_transform(train_Y)
self._validate_tensor_args(X=transformed_X, Y=train_Y)
validate_input_scaling(train_X=transformed_X, train_Y=train_Y)
@@ -388,6 +418,7 @@ def __init__(
"being further optimized during the model fit. If so "
"then set `learn_inducing_points` to False.",
UserWarning,
stacklevel=3,
)

if inducing_point_allocator is None:
@@ -412,6 +443,11 @@
if outcome_transform is not None:
self.outcome_transform = outcome_transform
if input_transform is not None:
warnings.warn(
TRANSFORM_WARNING.format(ttype="input"),
UserInputWarning,
stacklevel=3,
)
self.input_transform = input_transform

# for model fitting utilities
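
The warning added above recommends pre-transforming the whole data set when `SingleTaskVariationalGP` is trained in minibatches. A minimal sketch of that workflow, assuming simple min-max scaling of the inputs and z-scoring of the targets (shapes and the inducing-point count are made up for illustration):

import torch
from botorch.models import SingleTaskVariationalGP

train_X = torch.rand(1000, 4)
train_Y = torch.randn(1000, 1)

# Transform the full data set once up front, rather than attaching learnable
# transforms whose parameters would shift with every minibatch.
X_min = train_X.min(dim=0).values
X_range = train_X.max(dim=0).values - X_min
X_scaled = (train_X - X_min) / X_range
Y_scaled = (train_Y - train_Y.mean(dim=0)) / train_Y.std(dim=0)

# No outcome or input transform is passed, so no UserInputWarning is raised and
# nothing about the scaling changes across minibatches.
model = SingleTaskVariationalGP(train_X=X_scaled, train_Y=Y_scaled, inducing_points=50)
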
73 changes: 8 additions & 65 deletions botorch/models/contextual_multioutput.py
@@ -13,14 +13,14 @@
Advances in Neural Information Processing Systems 33, NeurIPS 2020.
"""

import warnings
from typing import Any, Optional, Union

import torch
from botorch.models.multitask import MultiTaskGP
from botorch.models.transforms.input import InputTransform
from botorch.models.transforms.outcome import OutcomeTransform
from botorch.utils.datasets import MultiTaskDataset, SupervisedDataset
from botorch.utils.types import _DefaultType, DEFAULT
from gpytorch.constraints import Interval
from gpytorch.kernels.rbf_kernel import RBFKernel
from gpytorch.likelihoods.likelihood import Likelihood
@@ -51,8 +51,8 @@ def __init__(
embs_dim_list: Optional[list[int]] = None,
output_tasks: Optional[list[int]] = None,
all_tasks: Optional[list[int]] = None,
outcome_transform: OutcomeTransform | _DefaultType | None = DEFAULT,
input_transform: Optional[InputTransform] = None,
outcome_transform: Optional[OutcomeTransform] = None,
) -> None:
r"""
Args:
@@ -85,12 +85,14 @@ def __init__(
training data. Note that when a task is not observed, the corresponding
task covariance will heavily depend on random initialization and may
behave unexpectedly.
input_transform: An input transform that is applied in the model's
forward pass.
outcome_transform: An outcome transform that is applied to the
training data during instantiation and to the posterior during
inference (that is, the `Posterior` obtained by calling
`.posterior` on the model will be on the original scale).
`.posterior` on the model will be on the original scale). We use a
`Standardize` transform if no `outcome_transform` is specified.
Pass down `None` to use no outcome transform.
input_transform: An input transform that is applied in the model's
forward pass.
"""
super().__init__(
train_X=train_X,
@@ -102,8 +104,8 @@
likelihood=likelihood,
output_tasks=output_tasks,
all_tasks=all_tasks,
input_transform=input_transform,
outcome_transform=outcome_transform,
input_transform=input_transform,
)
self.device = train_X.device
if all_tasks is None:
@@ -247,62 +249,3 @@ def construct_inputs(
if embs_dim_list is not None:
base_inputs["embs_dim_list"] = embs_dim_list
return base_inputs


class FixedNoiseLCEMGP(LCEMGP):
r"""The Multi-Task GP the latent context embedding multioutput
(LCE-M) kernel, with known observation noise.
DEPRECATED: Please use `LCEMGP` with `train_Yvar` instead.
Will be removed in a future release (~v0.11).
"""

def __init__(
self,
train_X: Tensor,
train_Y: Tensor,
train_Yvar: Tensor,
task_feature: int,
context_cat_feature: Optional[Tensor] = None,
context_emb_feature: Optional[Tensor] = None,
embs_dim_list: Optional[list[int]] = None,
output_tasks: Optional[list[int]] = None,
) -> None:
r"""
Args:
train_X: (n x d) X training data.
train_Y: (n x 1) Y training data.
train_Yvar: (n x 1) Observed variances of each training Y.
task_feature: Column index of train_X to get context indices.
context_cat_feature: (n_contexts x k) one-hot encoded context
features. Rows are ordered by context indices, where k is the
number of categorical variables. If None, task indices will
be used and k = 1.
context_emb_feature: (n_contexts x m) pre-given continuous
embedding features. Rows are ordered by context indices.
embs_dim_list: Embedding dimension for each categorical variable.
The length equals to k. If None, the embedding dimension is set to
1 for each categorical variable.
output_tasks: A list of task indices for which to compute model
outputs for. If omitted, return outputs for all task indices.
"""
warnings.warn(
"`FixedNoiseLCEMGP` has been deprecated and will be removed in a "
"future release. Please use the `LCEMGP` model instead. "
"When `train_Yvar` is specified, `LCEMGP` behaves the same "
"as the `FixedNoiseLCEMGP`.",
DeprecationWarning,
stacklevel=2,
)

super().__init__(
train_X=train_X,
train_Y=train_Y,
task_feature=task_feature,
train_Yvar=train_Yvar,
context_cat_feature=context_cat_feature,
context_emb_feature=context_emb_feature,
embs_dim_list=embs_dim_list,
output_tasks=output_tasks,
)
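
With `FixedNoiseLCEMGP` removed above, fixed-noise use cases go through `LCEMGP` with `train_Yvar`, and outcomes are now standardized by default as well. A rough sketch of the replacement call (the toy data, task column index, and noise level are illustrative only):

import torch
from botorch.models.contextual_multioutput import LCEMGP

# Toy multi-task data; column 3 of train_X holds the task index.
X = torch.rand(30, 3)
task = (torch.arange(30) % 2).unsqueeze(-1).to(X)
train_X = torch.cat([X, task], dim=-1)
train_Y = torch.rand(30, 1)
train_Yvar = torch.full_like(train_Y, 1e-2)

# Previously: FixedNoiseLCEMGP(train_X, train_Y, train_Yvar, task_feature=3)
model = LCEMGP(
    train_X=train_X,
    train_Y=train_Y,
    task_feature=3,
    train_Yvar=train_Yvar,  # known observation noise
)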
8 changes: 7 additions & 1 deletion botorch/models/fully_bayesian_multitask.py
@@ -167,7 +167,7 @@ class SaasFullyBayesianMultiTaskGP(MultiTaskGP):
This model assumes that the inputs have been normalized to [0, 1]^d and that the
output has been stratified standardized to have zero mean and unit variance for
each task.The SAAS model [Eriksson2021saasbo]_ with a Matern-5/2 is used as data
each task. The SAAS model [Eriksson2021saasbo]_ with a Matern-5/2 is used as data
kernel by default.
You are expected to use `fit_fully_bayesian_model_nuts` to fit this model as it
@@ -243,6 +243,7 @@ def __init__(
X=train_X, input_transform=input_transform
)
if outcome_transform is not None:
outcome_transform.train() # Ensure we learn parameters here on init
train_Y, train_Yvar = outcome_transform(train_Y, train_Yvar)
if train_Yvar is not None: # Clamp after transforming
train_Yvar = train_Yvar.clamp(MIN_INFERRED_NOISE_LEVEL)
@@ -254,6 +255,11 @@
task_feature=task_feature,
output_tasks=output_tasks,
rank=rank,
# We already transformed the data above, this avoids applying the
# default `Standardize` transform twice. As outcome_transform is
# set on `self` below, it will be applied to the posterior in the
# `posterior` method of `MultiTaskGP`.
outcome_transform=None,
)
if all_tasks is not None and self._expected_task_values != set(all_tasks):
raise NotImplementedError(
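
For context on the hunk above: the outcome transform is applied to the training data once at construction, and the transform stored on the model maps the posterior back to the original scale. A rough usage sketch (the data, the task column index, and the deliberately tiny NUTS settings are illustrative, not part of this diff):

import torch
from botorch.fit import fit_fully_bayesian_model_nuts
from botorch.models.fully_bayesian_multitask import SaasFullyBayesianMultiTaskGP
from botorch.models.transforms.outcome import Standardize

X = torch.rand(20, 3)
task = (torch.arange(20) % 2).unsqueeze(-1).to(X)
train_X = torch.cat([X, task], dim=-1)  # column 3 is the task index
train_Y = torch.rand(20, 1)

model = SaasFullyBayesianMultiTaskGP(
    train_X=train_X,
    train_Y=train_Y,
    task_feature=3,
    outcome_transform=Standardize(m=1),  # applied to train_Y once, here
)
fit_fully_bayesian_model_nuts(
    model, warmup_steps=32, num_samples=16, thinning=4, disable_progbar=True
)
# The posterior is mapped back to the original scale of train_Y; the test
# inputs exclude the task column.
posterior = model.posterior(torch.rand(5, 3))
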
80 changes: 16 additions & 64 deletions botorch/models/gp_regression_fidelity.py
@@ -25,8 +25,7 @@

from __future__ import annotations

import warnings
from typing import Any, Optional, Union
from typing import Any, Sequence

import torch
from botorch.exceptions.errors import UnsupportedError
@@ -40,6 +39,7 @@
from botorch.models.transforms.outcome import OutcomeTransform
from botorch.models.utils.gpytorch_modules import get_covar_module_with_dim_scaled_prior
from botorch.utils.datasets import SupervisedDataset
from botorch.utils.types import _DefaultType, DEFAULT
from gpytorch.kernels.kernel import ProductKernel
from gpytorch.kernels.scale_kernel import ScaleKernel
from gpytorch.likelihoods.likelihood import Likelihood
@@ -66,15 +66,14 @@ def __init__(
self,
train_X: Tensor,
train_Y: Tensor,
train_Yvar: Optional[Tensor] = None,
iteration_fidelity: Optional[int] = None,
data_fidelities: Optional[Union[list[int], tuple[int]]] = None,
data_fidelity: Optional[int] = None,
train_Yvar: Tensor | None = None,
iteration_fidelity: int | None = None,
data_fidelities: Sequence[int] | None = None,
linear_truncated: bool = True,
nu: float = 2.5,
likelihood: Optional[Likelihood] = None,
outcome_transform: Optional[OutcomeTransform] = None,
input_transform: Optional[InputTransform] = None,
likelihood: Likelihood | None = None,
outcome_transform: OutcomeTransform | _DefaultType | None = DEFAULT,
input_transform: InputTransform | None = None,
) -> None:
r"""
Args:
@@ -89,33 +88,21 @@ def __init__(
data_fidelities: The column indices for the downsampling fidelity parameter.
If a list/tuple of indices is provided, a kernel will be constructed for
each index (optional).
data_fidelity: The column index for the downsampling fidelity parameter
(optional). Deprecated in favor of `data_fidelities`.
linear_truncated: If True, use a `LinearTruncatedFidelityKernel` instead
of the default kernel.
nu: The smoothness parameter for the Matern kernel: either 1/2, 3/2, or
5/2. Only used when `linear_truncated=True`.
likelihood: A likelihood. If omitted, use a standard GaussianLikelihood
with inferred noise level.
outcome_transform: An outcome transform that is applied to the
training data during instantiation and to the posterior during
inference (that is, the `Posterior` obtained by calling
`.posterior` on the model will be on the original scale).
training data during instantiation and to the posterior during
inference (that is, the `Posterior` obtained by calling
`.posterior` on the model will be on the original scale). We use a
`Standardize` transform if no `outcome_transform` is specified.
Pass down `None` to use no outcome transform.
input_transform: An input transform that is applied in the model's
forward pass.
"""
if data_fidelity is not None:
warnings.warn(
"The `data_fidelity` argument is deprecated and will be removed in "
"a future release. Please use `data_fidelities` instead.",
DeprecationWarning,
)
if data_fidelities is not None:
raise ValueError(
"Cannot specify both `data_fidelity` and `data_fidelities`."
)
data_fidelities = [data_fidelity]

self._init_args = {
"iteration_fidelity": iteration_fidelity,
"data_fidelities": data_fidelities,
@@ -179,47 +166,11 @@ def construct_inputs(
return inputs


class FixedNoiseMultiFidelityGP(SingleTaskMultiFidelityGP):
def __init__(
self,
train_X: Tensor,
train_Y: Tensor,
train_Yvar: Tensor,
iteration_fidelity: Optional[int] = None,
data_fidelities: Optional[Union[list[int], tuple[int]]] = None,
data_fidelity: Optional[int] = None,
linear_truncated: bool = True,
nu: float = 2.5,
outcome_transform: Optional[OutcomeTransform] = None,
input_transform: Optional[InputTransform] = None,
) -> None:
r"""DEPRECATED: Use `SingleTaskMultiFidelityGP` instead.
Will be removed in a future release (~v0.11).
"""
warnings.warn(
"`FixedNoiseMultiFidelityGP` has been deprecated. "
"Use `SingleTaskMultiFidelityGP` with `train_Yvar` instead.",
DeprecationWarning,
)
super().__init__(
train_X=train_X,
train_Y=train_Y,
train_Yvar=train_Yvar,
iteration_fidelity=iteration_fidelity,
data_fidelities=data_fidelities,
data_fidelity=data_fidelity,
linear_truncated=linear_truncated,
nu=nu,
outcome_transform=outcome_transform,
input_transform=input_transform,
)


def _setup_multifidelity_covar_module(
dim: int,
aug_batch_shape: torch.Size,
iteration_fidelity: Optional[int],
data_fidelities: Optional[list[int]],
iteration_fidelity: int | None,
data_fidelities: Sequence[int] | None,
linear_truncated: bool,
nu: float,
) -> tuple[ScaleKernel, dict]:
@@ -246,6 +197,7 @@
if iteration_fidelity is not None and iteration_fidelity < 0:
iteration_fidelity = dim + iteration_fidelity
if data_fidelities is not None:
data_fidelities = list(data_fidelities)
for i in range(len(data_fidelities)):
if data_fidelities[i] < 0:
data_fidelities[i] = dim + data_fidelities[i]
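
Migration for the removals above: `FixedNoiseMultiFidelityGP` is replaced by `SingleTaskMultiFidelityGP` with `train_Yvar`, and the singular `data_fidelity` argument by the plural `data_fidelities`. A hedged before/after sketch (shapes, fidelity column, and noise level are illustrative):

import torch
from botorch.models import SingleTaskMultiFidelityGP

train_X = torch.rand(20, 3)  # column 2 is the downsampling fidelity
train_Y = torch.rand(20, 1)
train_Yvar = torch.full_like(train_Y, 1e-3)

# Previously:
#   FixedNoiseMultiFidelityGP(train_X, train_Y, train_Yvar, data_fidelity=2)
# Now: pass train_Yvar directly and use the plural argument.
model = SingleTaskMultiFidelityGP(
    train_X, train_Y, train_Yvar=train_Yvar, data_fidelities=[2]
)
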
(Diffs for the remaining 12 changed files are not shown.)
