diff --git a/botorch/models/approximate_gp.py b/botorch/models/approximate_gp.py index 5bd4beecb6..df55632dc3 100644 --- a/botorch/models/approximate_gp.py +++ b/botorch/models/approximate_gp.py @@ -35,6 +35,7 @@ from typing import Optional, TypeVar, Union import torch +from botorch.exceptions.warnings import UserInputWarning from botorch.models.gpytorch import GPyTorchModel from botorch.models.transforms.input import InputTransform from botorch.models.transforms.outcome import OutcomeTransform @@ -70,6 +71,14 @@ TApproxModel = TypeVar("TApproxModel", bound="ApproximateGPyTorchModel") +TRANSFORM_WARNING = ( + "Using an {ttype} transform with `SingleTaskVariationalGP`. If this " + "model is trained in minibatches, a {ttype} transform with learnable " + "parameters would update its parameters for each minibatch, which is " + "undesirable. If you do intend to train in minibatches, we recommend " + "you not use a {ttype} transform and instead pre-transform your whole " + "data set before fitting the model." +) class ApproximateGPyTorchModel(GPyTorchModel): @@ -325,9 +334,9 @@ def __init__( variational_distribution: Optional[_VariationalDistribution] = None, variational_strategy: type[_VariationalStrategy] = VariationalStrategy, inducing_points: Optional[Union[Tensor, int]] = None, - outcome_transform: Optional[OutcomeTransform] = None, - input_transform: Optional[InputTransform] = None, inducing_point_allocator: Optional[InducingPointAllocator] = None, + outcome_transform: OutcomeTransform | None = None, + input_transform: InputTransform | None = None, ) -> None: r""" Args: @@ -338,6 +347,8 @@ def __init__( either a `GaussianLikelihood` (if `num_outputs=1`) or a `MultitaskGaussianLikelihood`(if `num_outputs>1`). num_outputs: Number of output responses per input (default: 1). + learn_inducing_points: If True, the inducing point locations are learned + jointly with the other model parameters. covar_module: Kernel function. If omitted, uses an `RBFKernel`. mean_module: Mean of GP model. If omitted, uses a `ConstantMean`. variational_distribution: Type of variational distribution to use @@ -351,6 +362,20 @@ def __init__( inducing_point_allocator: The `InducingPointAllocator` used to initialize the inducing point locations. If omitted, uses `GreedyVarianceReduction`. + outcome_transform: An outcome transform that is applied to the training + data during instantiation and to the posterior during inference. + NOTE: If this model is trained in minibatches, an outcome transform + with learnable parameters (such as `Standardize`) would update its + parameters for each minibatch, which is undesirable. If you do intend + to train in minibatches, we recommend you not use an outcome transform + and instead pre-transform your whole data set before fitting the model. + input_transform: An input transform that is applied in the model's + forward pass. + NOTE: If this model is trained in minibatches, an input transform + with learnable parameters (such as `Normalize`) would update its + parameters for each minibatch, which is undesirable. If you do intend + to train in minibatches, we recommend you not use an input transform + and instead pre-transform your whole data set before fitting the model. 
""" with torch.no_grad(): transformed_X = self.transform_inputs( @@ -358,6 +383,11 @@ def __init__( ) if train_Y is not None: if outcome_transform is not None: + warnings.warn( + TRANSFORM_WARNING.format(ttype="outcome"), + UserInputWarning, + stacklevel=3, + ) train_Y, _ = outcome_transform(train_Y) self._validate_tensor_args(X=transformed_X, Y=train_Y) validate_input_scaling(train_X=transformed_X, train_Y=train_Y) @@ -388,6 +418,7 @@ def __init__( "being further optimized during the model fit. If so " "then set `learn_inducing_points` to False.", UserWarning, + stacklevel=3, ) if inducing_point_allocator is None: @@ -412,6 +443,11 @@ def __init__( if outcome_transform is not None: self.outcome_transform = outcome_transform if input_transform is not None: + warnings.warn( + TRANSFORM_WARNING.format(ttype="input"), + UserInputWarning, + stacklevel=3, + ) self.input_transform = input_transform # for model fitting utilities diff --git a/botorch/models/contextual_multioutput.py b/botorch/models/contextual_multioutput.py index dc954a4df3..e303315d7b 100644 --- a/botorch/models/contextual_multioutput.py +++ b/botorch/models/contextual_multioutput.py @@ -13,7 +13,6 @@ Advances in Neural Information Processing Systems 33, NeurIPS 2020. """ -import warnings from typing import Any, Optional, Union import torch @@ -21,6 +20,7 @@ from botorch.models.transforms.input import InputTransform from botorch.models.transforms.outcome import OutcomeTransform from botorch.utils.datasets import MultiTaskDataset, SupervisedDataset +from botorch.utils.types import _DefaultType, DEFAULT from gpytorch.constraints import Interval from gpytorch.kernels.rbf_kernel import RBFKernel from gpytorch.likelihoods.likelihood import Likelihood @@ -51,8 +51,8 @@ def __init__( embs_dim_list: Optional[list[int]] = None, output_tasks: Optional[list[int]] = None, all_tasks: Optional[list[int]] = None, + outcome_transform: OutcomeTransform | _DefaultType | None = DEFAULT, input_transform: Optional[InputTransform] = None, - outcome_transform: Optional[OutcomeTransform] = None, ) -> None: r""" Args: @@ -85,12 +85,14 @@ def __init__( training data. Note that when a task is not observed, the corresponding task covariance will heavily depend on random initialization and may behave unexpectedly. - input_transform: An input transform that is applied in the model's - forward pass. outcome_transform: An outcome transform that is applied to the training data during instantiation and to the posterior during inference (that is, the `Posterior` obtained by calling - `.posterior` on the model will be on the original scale). + `.posterior` on the model will be on the original scale). We use a + `Standardize` transform if no `outcome_transform` is specified. + Pass down `None` to use no outcome transform. + input_transform: An input transform that is applied in the model's + forward pass. """ super().__init__( train_X=train_X, @@ -102,8 +104,8 @@ def __init__( likelihood=likelihood, output_tasks=output_tasks, all_tasks=all_tasks, - input_transform=input_transform, outcome_transform=outcome_transform, + input_transform=input_transform, ) self.device = train_X.device if all_tasks is None: @@ -247,62 +249,3 @@ def construct_inputs( if embs_dim_list is not None: base_inputs["embs_dim_list"] = embs_dim_list return base_inputs - - -class FixedNoiseLCEMGP(LCEMGP): - r"""The Multi-Task GP the latent context embedding multioutput - (LCE-M) kernel, with known observation noise. - - DEPRECATED: Please use `LCEMGP` with `train_Yvar` instead. 
- Will be removed in a future release (~v0.11). - """ - - def __init__( - self, - train_X: Tensor, - train_Y: Tensor, - train_Yvar: Tensor, - task_feature: int, - context_cat_feature: Optional[Tensor] = None, - context_emb_feature: Optional[Tensor] = None, - embs_dim_list: Optional[list[int]] = None, - output_tasks: Optional[list[int]] = None, - ) -> None: - r""" - Args: - train_X: (n x d) X training data. - train_Y: (n x 1) Y training data. - train_Yvar: (n x 1) Observed variances of each training Y. - task_feature: Column index of train_X to get context indices. - context_cat_feature: (n_contexts x k) one-hot encoded context - features. Rows are ordered by context indices, where k is the - number of categorical variables. If None, task indices will - be used and k = 1. - context_emb_feature: (n_contexts x m) pre-given continuous - embedding features. Rows are ordered by context indices. - embs_dim_list: Embedding dimension for each categorical variable. - The length equals to k. If None, the embedding dimension is set to - 1 for each categorical variable. - output_tasks: A list of task indices for which to compute model - outputs for. If omitted, return outputs for all task indices. - - """ - warnings.warn( - "`FixedNoiseLCEMGP` has been deprecated and will be removed in a " - "future release. Please use the `LCEMGP` model instead. " - "When `train_Yvar` is specified, `LCEMGP` behaves the same " - "as the `FixedNoiseLCEMGP`.", - DeprecationWarning, - stacklevel=2, - ) - - super().__init__( - train_X=train_X, - train_Y=train_Y, - task_feature=task_feature, - train_Yvar=train_Yvar, - context_cat_feature=context_cat_feature, - context_emb_feature=context_emb_feature, - embs_dim_list=embs_dim_list, - output_tasks=output_tasks, - ) diff --git a/botorch/models/fully_bayesian_multitask.py b/botorch/models/fully_bayesian_multitask.py index 94c3f30d94..44a74f5e89 100644 --- a/botorch/models/fully_bayesian_multitask.py +++ b/botorch/models/fully_bayesian_multitask.py @@ -167,7 +167,7 @@ class SaasFullyBayesianMultiTaskGP(MultiTaskGP): This model assumes that the inputs have been normalized to [0, 1]^d and that the output has been stratified standardized to have zero mean and unit variance for - each task.The SAAS model [Eriksson2021saasbo]_ with a Matern-5/2 is used as data + each task. The SAAS model [Eriksson2021saasbo]_ with a Matern-5/2 is used as data kernel by default. You are expected to use `fit_fully_bayesian_model_nuts` to fit this model as it @@ -243,6 +243,7 @@ def __init__( X=train_X, input_transform=input_transform ) if outcome_transform is not None: + outcome_transform.train() # Ensure we learn parameters here on init train_Y, train_Yvar = outcome_transform(train_Y, train_Yvar) if train_Yvar is not None: # Clamp after transforming train_Yvar = train_Yvar.clamp(MIN_INFERRED_NOISE_LEVEL) @@ -254,6 +255,11 @@ def __init__( task_feature=task_feature, output_tasks=output_tasks, rank=rank, + # We already transformed the data above, this avoids applying the + # default `Standardize` transform twice. As outcome_transform is + # set on `self` below, it will be applied to the posterior in the + # `posterior` method of `MultiTaskGP`. 
+ outcome_transform=None, ) if all_tasks is not None and self._expected_task_values != set(all_tasks): raise NotImplementedError( diff --git a/botorch/models/gp_regression_fidelity.py b/botorch/models/gp_regression_fidelity.py index 99f7f02acc..29532b86ff 100644 --- a/botorch/models/gp_regression_fidelity.py +++ b/botorch/models/gp_regression_fidelity.py @@ -25,8 +25,7 @@ from __future__ import annotations -import warnings -from typing import Any, Optional, Union +from typing import Any, Sequence import torch from botorch.exceptions.errors import UnsupportedError @@ -40,6 +39,7 @@ from botorch.models.transforms.outcome import OutcomeTransform from botorch.models.utils.gpytorch_modules import get_covar_module_with_dim_scaled_prior from botorch.utils.datasets import SupervisedDataset +from botorch.utils.types import _DefaultType, DEFAULT from gpytorch.kernels.kernel import ProductKernel from gpytorch.kernels.scale_kernel import ScaleKernel from gpytorch.likelihoods.likelihood import Likelihood @@ -66,15 +66,14 @@ def __init__( self, train_X: Tensor, train_Y: Tensor, - train_Yvar: Optional[Tensor] = None, - iteration_fidelity: Optional[int] = None, - data_fidelities: Optional[Union[list[int], tuple[int]]] = None, - data_fidelity: Optional[int] = None, + train_Yvar: Tensor | None = None, + iteration_fidelity: int | None = None, + data_fidelities: Sequence[int] | None = None, linear_truncated: bool = True, nu: float = 2.5, - likelihood: Optional[Likelihood] = None, - outcome_transform: Optional[OutcomeTransform] = None, - input_transform: Optional[InputTransform] = None, + likelihood: Likelihood | None = None, + outcome_transform: OutcomeTransform | _DefaultType | None = DEFAULT, + input_transform: InputTransform | None = None, ) -> None: r""" Args: @@ -89,8 +88,6 @@ def __init__( data_fidelities: The column indices for the downsampling fidelity parameter. If a list/tuple of indices is provided, a kernel will be constructed for each index (optional). - data_fidelity: The column index for the downsampling fidelity parameter - (optional). Deprecated in favor of `data_fidelities`. linear_truncated: If True, use a `LinearTruncatedFidelityKernel` instead of the default kernel. nu: The smoothness parameter for the Matern kernel: either 1/2, 3/2, or @@ -98,24 +95,14 @@ def __init__( likelihood: A likelihood. If omitted, use a standard GaussianLikelihood with inferred noise level. outcome_transform: An outcome transform that is applied to the - training data during instantiation and to the posterior during - inference (that is, the `Posterior` obtained by calling - `.posterior` on the model will be on the original scale). + training data during instantiation and to the posterior during + inference (that is, the `Posterior` obtained by calling + `.posterior` on the model will be on the original scale). We use a + `Standardize` transform if no `outcome_transform` is specified. + Pass down `None` to use no outcome transform. input_transform: An input transform that is applied in the model's forward pass. """ - if data_fidelity is not None: - warnings.warn( - "The `data_fidelity` argument is deprecated and will be removed in " - "a future release. Please use `data_fidelities` instead.", - DeprecationWarning, - ) - if data_fidelities is not None: - raise ValueError( - "Cannot specify both `data_fidelity` and `data_fidelities`." 
- ) - data_fidelities = [data_fidelity] - self._init_args = { "iteration_fidelity": iteration_fidelity, "data_fidelities": data_fidelities, @@ -179,47 +166,11 @@ def construct_inputs( return inputs -class FixedNoiseMultiFidelityGP(SingleTaskMultiFidelityGP): - def __init__( - self, - train_X: Tensor, - train_Y: Tensor, - train_Yvar: Tensor, - iteration_fidelity: Optional[int] = None, - data_fidelities: Optional[Union[list[int], tuple[int]]] = None, - data_fidelity: Optional[int] = None, - linear_truncated: bool = True, - nu: float = 2.5, - outcome_transform: Optional[OutcomeTransform] = None, - input_transform: Optional[InputTransform] = None, - ) -> None: - r"""DEPRECATED: Use `SingleTaskMultiFidelityGP` instead. - Will be removed in a future release (~v0.11). - """ - warnings.warn( - "`FixedNoiseMultiFidelityGP` has been deprecated. " - "Use `SingleTaskMultiFidelityGP` with `train_Yvar` instead.", - DeprecationWarning, - ) - super().__init__( - train_X=train_X, - train_Y=train_Y, - train_Yvar=train_Yvar, - iteration_fidelity=iteration_fidelity, - data_fidelities=data_fidelities, - data_fidelity=data_fidelity, - linear_truncated=linear_truncated, - nu=nu, - outcome_transform=outcome_transform, - input_transform=input_transform, - ) - - def _setup_multifidelity_covar_module( dim: int, aug_batch_shape: torch.Size, - iteration_fidelity: Optional[int], - data_fidelities: Optional[list[int]], + iteration_fidelity: int | None, + data_fidelities: Sequence[int] | None, linear_truncated: bool, nu: float, ) -> tuple[ScaleKernel, dict]: @@ -246,6 +197,7 @@ def _setup_multifidelity_covar_module( if iteration_fidelity is not None and iteration_fidelity < 0: iteration_fidelity = dim + iteration_fidelity if data_fidelities is not None: + data_fidelities = list(data_fidelities) for i in range(len(data_fidelities)): if data_fidelities[i] < 0: data_fidelities[i] = dim + data_fidelities[i] diff --git a/botorch/models/gp_regression_mixed.py b/botorch/models/gp_regression_mixed.py index 5dc4697f04..30941d27e3 100644 --- a/botorch/models/gp_regression_mixed.py +++ b/botorch/models/gp_regression_mixed.py @@ -6,7 +6,7 @@ from __future__ import annotations -from typing import Any, Callable, Optional +from typing import Any, Callable, Optional, Union import torch from botorch.models.gp_regression import SingleTaskGP @@ -16,6 +16,7 @@ from botorch.models.utils.gpytorch_modules import get_covar_module_with_dim_scaled_prior from botorch.utils.datasets import SupervisedDataset from botorch.utils.transforms import normalize_indices +from botorch.utils.types import _DefaultType, DEFAULT from gpytorch.constraints import GreaterThan from gpytorch.kernels.kernel import Kernel from gpytorch.kernels.scale_kernel import ScaleKernel @@ -65,7 +66,7 @@ def __init__( Callable[[torch.Size, int, list[int]], Kernel] ] = None, likelihood: Optional[Likelihood] = None, - outcome_transform: Optional[OutcomeTransform] = None, # TODO + outcome_transform: Optional[Union[OutcomeTransform, _DefaultType]] = DEFAULT, input_transform: Optional[InputTransform] = None, # TODO ) -> None: r"""A single-task exact GP model supporting categorical parameters. @@ -87,7 +88,9 @@ def __init__( outcome_transform: An outcome transform that is applied to the training data during instantiation and to the posterior during inference (that is, the `Posterior` obtained by calling - `.posterior` on the model will be on the original scale). + `.posterior` on the model will be on the original scale). 
We use a + `Standardize` transform if no `outcome_transform` is specified. + Pass down `None` to use no outcome transform. input_transform: An input transform that is applied in the model's forward pass. Only input transforms are allowed which do not transform the categorical dimensions. If you want to use it diff --git a/botorch/models/multitask.py b/botorch/models/multitask.py index 807b4b55c9..092639f250 100644 --- a/botorch/models/multitask.py +++ b/botorch/models/multitask.py @@ -38,7 +38,7 @@ from botorch.models.gpytorch import GPyTorchModel, MultiTaskGPyTorchModel from botorch.models.model import FantasizeMixin from botorch.models.transforms.input import InputTransform -from botorch.models.transforms.outcome import OutcomeTransform +from botorch.models.transforms.outcome import OutcomeTransform, Standardize from botorch.models.utils.gpytorch_modules import ( get_covar_module_with_dim_scaled_prior, get_gaussian_likelihood_with_lognormal_prior, @@ -46,6 +46,7 @@ ) from botorch.posteriors.multitask import MultitaskGPPosterior from botorch.utils.datasets import MultiTaskDataset, SupervisedDataset +from botorch.utils.types import _DefaultType, DEFAULT from gpytorch.constraints import GreaterThan from gpytorch.distributions.multitask_multivariate_normal import ( MultitaskMultivariateNormal, @@ -65,7 +66,7 @@ from gpytorch.priors.lkj_prior import LKJCovariancePrior from gpytorch.priors.prior import Prior from gpytorch.priors.smoothed_box_prior import SmoothedBoxPrior -from gpytorch.priors.torch_priors import GammaPrior +from gpytorch.priors.torch_priors import GammaPrior, LogNormalPrior from gpytorch.settings import detach_test_caches from gpytorch.utils.errors import CachingError from gpytorch.utils.memoize import cached, pop_from_cache @@ -108,7 +109,7 @@ def get_task_value_remapping( # Create a tensor that maps task values to new task values. # The number of tasks should be small, so this should be quite efficient. mapper = torch.full( - (task_values.max().item() + 1,), + (int(task_values.max().item()) + 1,), float("nan"), dtype=dtype, device=task_values.device, @@ -122,11 +123,11 @@ class MultiTaskGP(ExactGP, MultiTaskGPyTorchModel, FantasizeMixin): kernel. See [Bonilla2007MTGP]_ and [Swersky2013MTBO]_ for a reference on the model and its use in Bayesian optimization. - The model can be single-output or multi-output, determined by the `output_tasks`. This model uses relatively strong priors on the base Kernel hyperparameters, which work best when covariates are normalized to the unit cube and outcomes are - standardized (zero mean, unit variance). + standardized (zero mean, unit variance) - this standardization should be applied in + a stratified fashion at the level of the tasks, rather than across all data points. If the `train_Yvar` is None, this model infers the noise level. If you have known observation noise, you can set `train_Yvar` to a tensor containing @@ -147,8 +148,8 @@ def __init__( output_tasks: Optional[list[int]] = None, rank: Optional[int] = None, all_tasks: Optional[list[int]] = None, + outcome_transform: Optional[Union[OutcomeTransform, _DefaultType]] = DEFAULT, input_transform: Optional[InputTransform] = None, - outcome_transform: Optional[OutcomeTransform] = None, ) -> None: r"""Multi-Task GP model using an ICM kernel. @@ -180,12 +181,15 @@ def __init__( training data. Note that when a task is not observed, the corresponding task covariance will heavily depend on random initialization and may behave unexpectedly. 
- input_transform: An input transform that is applied in the model's - forward pass. outcome_transform: An outcome transform that is applied to the training data during instantiation and to the posterior during inference (that is, the `Posterior` obtained by calling - `.posterior` on the model will be on the original scale). + `.posterior` on the model will be on the original scale). We use a + `Standardize` transform if no `outcome_transform` is specified. + Pass down `None` to use no outcome transform. NOTE: Standardization + should be applied in a stratified fashion, separately for each task. + input_transform: An input transform that is applied in the model's + forward pass. Example: >>> X1, X2 = torch.rand(10, 2), torch.rand(20, 2) @@ -214,6 +218,8 @@ def __init__( ) all_tasks = all_tasks or all_tasks_inferred self.num_tasks = len(all_tasks) + if outcome_transform == DEFAULT: + outcome_transform = Standardize(m=1, batch_shape=train_X.shape[:-2]) if outcome_transform is not None: train_Y, train_Yvar = outcome_transform(Y=train_Y, Yvar=train_Yvar) @@ -470,8 +476,7 @@ def __init__( if rank is None: rank = num_tasks if likelihood is None: - noise_prior = GammaPrior(1.1, 0.05) - noise_prior_mode = (noise_prior.concentration - 1) / noise_prior.rate + noise_prior = LogNormalPrior(loc=-4.0, scale=1.0) likelihood = MultitaskGaussianLikelihood( num_tasks=num_tasks, batch_shape=batch_shape, @@ -479,7 +484,7 @@ def __init__( noise_constraint=GreaterThan( MIN_INFERRED_NOISE_LEVEL, transform=None, - initial_value=noise_prior_mode, + initial_value=noise_prior.mode, ), rank=kwargs.get("likelihood_rank", 0), ) diff --git a/botorch/utils/test_helpers.py b/botorch/utils/test_helpers.py index e9977ae9db..6d99758c7b 100644 --- a/botorch/utils/test_helpers.py +++ b/botorch/utils/test_helpers.py @@ -19,13 +19,13 @@ from botorch.exceptions.errors import UnsupportedError from botorch.models import SingleTaskGP from botorch.models.fully_bayesian import SaasFullyBayesianSingleTaskGP -from botorch.models.gpytorch import GPyTorchModel +from botorch.models.gpytorch import BatchedMultiOutputGPyTorchModel, GPyTorchModel from botorch.models.model import FantasizeMixin, Model from botorch.models.model_list_gp_regression import ModelListGP from botorch.models.transforms.outcome import Standardize from botorch.models.utils import add_output_dim from botorch.models.utils.assorted import fantasize -from botorch.posteriors.posterior import Posterior +from botorch.posteriors.torch import TorchPosterior from botorch.utils.datasets import MultiTaskDataset, SupervisedDataset from gpytorch.distributions.multivariate_normal import MultivariateNormal from gpytorch.kernels import RBFKernel, ScaleKernel @@ -244,21 +244,58 @@ def gen_multi_task_dataset( return dataset, (train_X, train_Y, train_Yvar) -def get_pvar_expected(posterior: Posterior, model: Model, X: Tensor, m: int) -> Tensor: +def get_pvar_expected( + posterior: TorchPosterior, model: Model, X: Tensor, m: int +) -> Tensor: """Computes the expected variance of a posterior after adding the predictive noise from the likelihood. + + Args: + posterior: The posterior to compute the variance of. Must be a + `TorchPosterior` object. + model: The model that generated the posterior. If `m > 1`, this must be + a `BatchedMultiOutputGPyTorchModel`. + X: The test inputs. + m: The number of outputs. + + Returns: + The expected variance of the posterior after adding the observation + noise from the likelihood. 
""" X = model.transform_inputs(X) lh_kwargs = {} + odim = -1 # this is the output dimension index + + if m > 1: + if not isinstance(model, BatchedMultiOutputGPyTorchModel): + raise UnsupportedError( + "`get_pvar_expected` only supports `BatchedMultiOutputGPyTorchModel`s." + ) + # We need to add a batch dimension to the input to be compatible with the + # augmented batch shape of the model. This also changes the output dimension + # index. + X, odim = add_output_dim(X=X, original_batch_shape=model._input_batch_shape) + if isinstance(model.likelihood, FixedNoiseGaussianLikelihood): - lh_kwargs["noise"] = model.likelihood.noise.mean().expand(X.shape[:-1]) + noise = model.likelihood.noise.mean(dim=-1, keepdim=True) + broadcasted_shape = torch.broadcast_shapes(noise.shape, X.shape[:-1]) + lh_kwargs["noise"] = noise.expand(broadcasted_shape) + + pvar_exp = model.likelihood(model(X), X, **lh_kwargs).variance if m == 1: - return model.likelihood( - posterior.distribution, X, **lh_kwargs - ).variance.unsqueeze(-1) - X_, odi = add_output_dim(X=X, original_batch_shape=model._input_batch_shape) - pvar_exp = model.likelihood(model(X_), X_, **lh_kwargs).variance - return torch.stack([pvar_exp.select(dim=odi, index=i) for i in range(m)], dim=-1) + pvar_exp = pvar_exp.unsqueeze(-1) + pvar_exp = torch.stack( + [pvar_exp.select(dim=odim, index=i) for i in range(m)], dim=-1 + ) + + # If the model has an outcome transform, we need to untransform the + # variance according to that transform. + if hasattr(model, "outcome_transform"): + _, pvar_exp = model.outcome_transform.untransform( + Y=torch.zeros_like(pvar_exp), Yvar=pvar_exp + ) + + return pvar_exp class DummyNonScalarizingPosteriorTransform(PosteriorTransform): diff --git a/test/models/test_approximate_gp.py b/test/models/test_approximate_gp.py index ed7cba18cf..2a05e689cf 100644 --- a/test/models/test_approximate_gp.py +++ b/test/models/test_approximate_gp.py @@ -8,6 +8,7 @@ import warnings import torch +from botorch.exceptions.warnings import UserInputWarning from botorch.fit import fit_gpytorch_mll from botorch.models.approximate_gp import ( _SingleTaskVariationalGP, @@ -190,6 +191,26 @@ def test_initializations(self): else: self.assertFalse(hasattr(model, "outcome_transform")) + # test user warnings when using transforms + with self.assertWarnsRegex( + UserInputWarning, + "Using an input transform with `SingleTaskVariationalGP`", + ): + SingleTaskVariationalGP( + train_X=train_X, + train_Y=train_Y, + input_transform=Normalize(d=1), + ) + with self.assertWarnsRegex( + UserInputWarning, + "Using an outcome transform with `SingleTaskVariationalGP`", + ): + SingleTaskVariationalGP( + train_X=train_X, + train_Y=train_Y, + outcome_transform=Log(), + ) + # test default inducing point allocator self.assertIsInstance(model._inducing_point_allocator, GreedyVarianceReduction) diff --git a/test/models/test_contextual_multioutput.py b/test/models/test_contextual_multioutput.py index 4b94d2a6b0..040c08be91 100644 --- a/test/models/test_contextual_multioutput.py +++ b/test/models/test_contextual_multioutput.py @@ -7,7 +7,7 @@ import torch from botorch.fit import fit_gpytorch_mll -from botorch.models.contextual_multioutput import FixedNoiseLCEMGP, LCEMGP +from botorch.models.contextual_multioutput import LCEMGP from botorch.models.multitask import MultiTaskGP from botorch.posteriors import GPyTorchPosterior from botorch.utils.test_helpers import gen_multi_task_dataset @@ -102,26 +102,6 @@ def test_LCEMGP(self): ).to_dense() self.assertAllClose(previous_covar, 
model.task_covar_module(task_idcs)) - def test_FixedNoiseLCEMGP(self): - for dtype in (torch.float, torch.double): - _, (train_x, train_y, train_yvar) = gen_multi_task_dataset( - yvar=0.01, dtype=dtype, device=self.device - ) - - with self.assertWarnsRegex(DeprecationWarning, "FixedNoiseLCEMGP"): - model = FixedNoiseLCEMGP( - train_X=train_x, - train_Y=train_y, - train_Yvar=train_yvar, - task_feature=0, - ) - mll = ExactMarginalLogLikelihood(model.likelihood, model) - fit_gpytorch_mll(mll, optimizer_kwargs={"options": {"maxiter": 1}}) - self.assertIsInstance(model, FixedNoiseLCEMGP) - - test_x = train_x[:5] - self.assertIsInstance(model(test_x), MultivariateNormal) - def test_construct_inputs(self) -> None: for with_embedding_inputs, yvar, skip_task_features_in_datasets in zip( (True, False), (None, 0.01), (True, False), strict=True diff --git a/test/models/test_converter.py b/test/models/test_converter.py index ef6fad033b..db23e06479 100644 --- a/test/models/test_converter.py +++ b/test/models/test_converter.py @@ -211,8 +211,18 @@ def test_model_list_to_batched(self): batch_gp = model_list_to_batched(list_gp) self.assertIsInstance(batch_gp.likelihood, FixedNoiseGaussianLikelihood) # test SingleTaskMultiFidelityGP - gp1_ = SingleTaskMultiFidelityGP(train_X, train_Y1, iteration_fidelity=1) - gp2_ = SingleTaskMultiFidelityGP(train_X, train_Y2, iteration_fidelity=1) + gp1_ = SingleTaskMultiFidelityGP( + train_X, + train_Y1, + iteration_fidelity=1, + outcome_transform=None, + ) + gp2_ = SingleTaskMultiFidelityGP( + train_X, + train_Y2, + iteration_fidelity=1, + outcome_transform=None, + ) list_gp = ModelListGP(gp1_, gp2_) batch_gp = model_list_to_batched(list_gp) gp2_ = SingleTaskMultiFidelityGP(train_X, train_Y2, iteration_fidelity=2) @@ -372,7 +382,11 @@ def test_roundtrip(self): # SingleTaskMultiFidelityGP for lin_trunc in (False, True): batch_gp = SingleTaskMultiFidelityGP( - train_X, train_Y, iteration_fidelity=1, linear_truncated=lin_trunc + train_X=train_X, + train_Y=train_Y, + iteration_fidelity=1, + linear_truncated=lin_trunc, + outcome_transform=None, ) list_gp = batched_to_model_list(batch_gp) batch_gp_recov = model_list_to_batched(list_gp) @@ -429,7 +443,10 @@ def test_batched_multi_output_to_single_output(self): self.assertEqual(batched_so_model.num_outputs, 1) # test SingleTaskMultiFidelityGP batched_mo_model = SingleTaskMultiFidelityGP( - train_X, train_Y, iteration_fidelity=1 + train_X, + train_Y, + iteration_fidelity=1, + outcome_transform=None, ) batched_so_model = batched_multi_output_to_single_output(batched_mo_model) self.assertIsInstance(batched_so_model, SingleTaskMultiFidelityGP) @@ -478,5 +495,8 @@ def test_batched_multi_output_to_single_output(self): batched_mo_model = SingleTaskGP( train_X, train_Y, outcome_transform=Standardize(m=2) ) - with self.assertRaises(NotImplementedError): + with self.assertRaisesRegex( + NotImplementedError, + "Converting batched multi-output models with outcome transforms", + ): batched_multi_output_to_single_output(batched_mo_model) diff --git a/test/models/test_fully_bayesian_multitask.py b/test/models/test_fully_bayesian_multitask.py index 9cd9e33009..e1c924b62b 100644 --- a/test/models/test_fully_bayesian_multitask.py +++ b/test/models/test_fully_bayesian_multitask.py @@ -11,21 +11,24 @@ import torch from botorch import fit_fully_bayesian_model_nuts from botorch.acquisition.analytic import ( - ExpectedImprovement, + LogExpectedImprovement, PosteriorMean, ProbabilityOfImprovement, UpperConfidenceBound, ) +from botorch.acquisition.logei 
import ( + qLogExpectedImprovement, + qLogNoisyExpectedImprovement, +) from botorch.acquisition.monte_carlo import ( - qExpectedImprovement, - qNoisyExpectedImprovement, qProbabilityOfImprovement, qSimpleRegret, qUpperConfidenceBound, ) -from botorch.acquisition.multi_objective import ( - qExpectedHypervolumeImprovement, - qNoisyExpectedHypervolumeImprovement, + +from botorch.acquisition.multi_objective.logei import ( + qLogExpectedHypervolumeImprovement, + qLogNoisyExpectedHypervolumeImprovement, ) from botorch.models import ModelList, ModelListGP from botorch.models.deterministic import GenericDeterministicModel @@ -70,31 +73,38 @@ class TestFullyBayesianMultiTaskGP(BotorchTestCase): + def _get_data_and_model( self, task_rank: Optional[int] = None, output_tasks: Optional[list[int]] = None, infer_noise: bool = False, - **tkwargs + use_outcome_transform: bool = True, + **tkwargs, ): with torch.random.fork_rng(): torch.manual_seed(0) train_X = torch.rand(10, 4, **tkwargs) - task_indices = torch.cat( - [torch.zeros(5, 1, **tkwargs), torch.ones(5, 1, **tkwargs)], dim=0 - ) - self.num_tasks = 2 - train_X = torch.cat([train_X, task_indices], dim=1) - train_Y = torch.sin(train_X[:, :1]) - train_Yvar = 0.5 * torch.arange(10, **tkwargs).unsqueeze(-1) - model = SaasFullyBayesianMultiTaskGP( - train_X=train_X, - train_Y=train_Y, - train_Yvar=None if infer_noise else train_Yvar, - task_feature=4, - output_tasks=output_tasks, - rank=task_rank, - ) + task_indices = torch.cat( + [torch.zeros(5, 1, **tkwargs), torch.ones(5, 1, **tkwargs)], dim=0 + ) + self.num_tasks = 2 + train_X = torch.cat([train_X, task_indices], dim=1) + train_Y = torch.sin(train_X[:, :1]) + train_Yvar = 0.5 * torch.arange(10, **tkwargs).unsqueeze(-1) + model = SaasFullyBayesianMultiTaskGP( + train_X=train_X, + train_Y=train_Y, + train_Yvar=None if infer_noise else train_Yvar, + task_feature=4, + output_tasks=output_tasks, + rank=task_rank, + outcome_transform=( + Standardize(m=1, batch_shape=train_X.shape[:-2]) + if use_outcome_transform + else None + ), + ) return train_X, train_Y, train_Yvar, model def _get_unnormalized_data(self, **tkwargs): @@ -205,26 +215,37 @@ def test_fit_model( dtype: torch.dtype = torch.double, infer_noise: bool = False, task_rank: int = 1, + use_outcome_transform: bool = False, ): tkwargs = {"device": self.device, "dtype": dtype} train_X, train_Y, train_Yvar, model = self._get_data_and_model( - infer_noise=infer_noise, task_rank=task_rank, **tkwargs + infer_noise=infer_noise, + task_rank=task_rank, + use_outcome_transform=use_outcome_transform, + **tkwargs, ) n = train_X.shape[0] d = train_X.shape[1] - 1 + # Handle outcome transforms (if used) + train_Y_tf, train_Yvar_tf = train_Y, train_Yvar + if use_outcome_transform: + train_Y_tf, train_Yvar_tf = model.outcome_transform( + Y=train_Y, Yvar=train_Yvar + ) + # Test init self.assertIsNone(model.mean_module) self.assertIsNone(model.covar_module) self.assertIsNone(model.likelihood) self.assertIsInstance(model.pyro_model, MultitaskSaasPyroModel) self.assertAllClose(train_X, model.pyro_model.train_X) - self.assertAllClose(train_Y, model.pyro_model.train_Y) + self.assertAllClose(train_Y_tf, model.pyro_model.train_Y) if infer_noise: self.assertIsNone(model.pyro_model.train_Yvar) else: self.assertAllClose( - train_Yvar.clamp(MIN_INFERRED_NOISE_LEVEL), + train_Yvar_tf.clamp(MIN_INFERRED_NOISE_LEVEL), model.pyro_model.train_Yvar, ) @@ -345,14 +366,32 @@ def test_fit_model( # Check the keys in the state dict true_keys = EXPECTED_KEYS_NOISE if infer_noise else 
EXPECTED_KEYS + if use_outcome_transform: + true_keys = true_keys + [ + "outcome_transform.stdvs", + "outcome_transform._is_trained", + "outcome_transform._stdvs_sq", + "outcome_transform.means", + ] self.assertEqual(set(model.state_dict().keys()), set(true_keys)) # Check that we can load the state dict. state_dict = model.state_dict() _, _, _, m_new = self._get_data_and_model( - infer_noise=infer_noise, task_rank=task_rank, **tkwargs + infer_noise=infer_noise, + task_rank=task_rank, + use_outcome_transform=use_outcome_transform, + **tkwargs, ) - self.assertEqual(m_new.state_dict(), {}) + expected_state_dict = {} + if use_outcome_transform: + expected_state_dict.update( + { + "outcome_transform." + k: v + for k, v in model.outcome_transform.state_dict().items() + } + ) + self.assertEqual(m_new.state_dict(), expected_state_dict) m_new.load_state_dict(state_dict) self.assertEqual(model.state_dict().keys(), m_new.state_dict().keys()) for k in model.state_dict().keys(): @@ -377,12 +416,15 @@ def test_fit_model_float(self): def test_fit_model_infer_noise(self): self.test_fit_model(infer_noise=True, task_rank=2) + def test_fit_model_with_outcome_transform(self): + self.test_fit_model(use_outcome_transform=True) + def test_transforms(self, infer_noise: bool = False): tkwargs = {"device": self.device, "dtype": torch.double} train_X, train_Y, train_Yvar, test_X = self._get_unnormalized_data(**tkwargs) n, d = train_X.shape normalize_indices = torch.tensor( - list(range(train_X.shape[-1] - 1)), **{"device": self.device} + list(range(train_X.shape[-1] - 1)), device=self.device ) lb, ub = ( @@ -466,14 +508,14 @@ def test_acquisition_functions(self): posterior=mixed_list.posterior(test_X), sample_shape=torch.Size([2]) ) acquisition_functions = [ - ExpectedImprovement(model=model, best_f=train_Y.max()), + LogExpectedImprovement(model=model, best_f=train_Y.max()), ProbabilityOfImprovement(model=model, best_f=train_Y.max()), PosteriorMean(model=model), UpperConfidenceBound(model=model, beta=4), - qExpectedImprovement( + qLogExpectedImprovement( model=model, best_f=train_Y.max(), sampler=simple_sampler ), - qNoisyExpectedImprovement( + qLogNoisyExpectedImprovement( model=model, X_baseline=test_X, sampler=simple_sampler ), qProbabilityOfImprovement( @@ -481,13 +523,13 @@ def test_acquisition_functions(self): ), qSimpleRegret(model=model, sampler=simple_sampler), qUpperConfidenceBound(model=model, beta=4, sampler=simple_sampler), - qNoisyExpectedHypervolumeImprovement( + qLogNoisyExpectedHypervolumeImprovement( model=list_gp, X_baseline=test_X, ref_point=torch.zeros(2, **tkwargs), sampler=list_gp_sampler, ), - qExpectedHypervolumeImprovement( + qLogExpectedHypervolumeImprovement( model=list_gp, ref_point=torch.zeros(2, **tkwargs), sampler=list_gp_sampler, @@ -496,13 +538,13 @@ def test_acquisition_functions(self): ), ), # qEHVI/qNEHVI with mixed models - qNoisyExpectedHypervolumeImprovement( + qLogNoisyExpectedHypervolumeImprovement( model=mixed_list, X_baseline=test_X, ref_point=torch.zeros(2, **tkwargs), sampler=mixed_list_sampler, ), - qExpectedHypervolumeImprovement( + qLogExpectedHypervolumeImprovement( model=mixed_list, ref_point=torch.zeros(2, **tkwargs), sampler=mixed_list_sampler, @@ -522,14 +564,22 @@ def test_acquisition_functions(self): self.assertEqual(acqf(test_X).shape, torch.Size(batch_shape)) def test_load_samples(self): - for task_rank, dtype in itertools.product([1, 2], [torch.float, torch.double]): + for task_rank, dtype, use_outcome_transform in itertools.product( + [1, 2], [torch.float, 
torch.double], (False, True) + ): tkwargs = {"device": self.device, "dtype": dtype} train_X, train_Y, train_Yvar, model = self._get_data_and_model( - task_rank=task_rank, **tkwargs + task_rank=task_rank, + use_outcome_transform=use_outcome_transform, + **tkwargs, ) + d = train_X.shape[1] - 1 mcmc_samples = self._get_mcmc_samples( - num_samples=3, dim=d, task_rank=task_rank, **tkwargs + num_samples=3, + dim=d, + task_rank=task_rank, + **tkwargs, ) model.load_mcmc_samples(mcmc_samples) @@ -551,10 +601,24 @@ def test_load_samples(self): mcmc_samples["mean"], ) ) + + # Handle outcome transforms (if used) + train_Y_tf, train_Yvar_tf = train_Y, train_Yvar + if use_outcome_transform: + train_Y_tf, train_Yvar_tf = model.outcome_transform( + Y=train_Y, Yvar=train_Yvar + ) + + self.assertTrue( + torch.allclose( + model.pyro_model.train_Y, + train_Y_tf, + ) + ) self.assertTrue( torch.allclose( model.pyro_model.train_Yvar, - train_Yvar.clamp(MIN_INFERRED_NOISE_LEVEL), + train_Yvar_tf.clamp(MIN_INFERRED_NOISE_LEVEL), ) ) self.assertTrue( diff --git a/test/models/test_gp_regression.py b/test/models/test_gp_regression.py index e2e3979f05..c26cd98f5b 100644 --- a/test/models/test_gp_regression.py +++ b/test/models/test_gp_regression.py @@ -5,6 +5,7 @@ # LICENSE file in the root directory of this source tree. import itertools +import math import warnings import torch @@ -115,9 +116,7 @@ def test_gp(self, double_only: bool = False): # test param sizes params = dict(model.named_parameters()) for p in params: - self.assertEqual( - params[p].numel(), m * torch.tensor(batch_shape).prod().item() - ) + self.assertEqual(params[p].numel(), m * math.prod(batch_shape)) # test posterior # test non batch evaluation @@ -133,18 +132,9 @@ def test_gp(self, double_only: bool = False): self.assertIsInstance(posterior_pred, GPyTorchPosterior) self.assertEqual(posterior_pred.mean.shape, expected_shape) self.assertEqual(posterior_pred.variance.shape, expected_shape) - if use_octf: - # ensure un-transformation is applied - tmp_tf = model.outcome_transform - del model.outcome_transform - pp_tf = model.posterior(X, observation_noise=True) - model.outcome_transform = tmp_tf - expected_var = tmp_tf.untransform_posterior(pp_tf).variance - self.assertAllClose(posterior_pred.variance, expected_var) - else: - pvar = posterior_pred.variance - pvar_exp = get_pvar_expected(posterior, model, X, m) - self.assertAllClose(pvar, pvar_exp, rtol=1e-4, atol=1e-5) + pvar = posterior_pred.variance + pvar_exp = get_pvar_expected(posterior=posterior, model=model, X=X, m=m) + self.assertAllClose(pvar, pvar_exp, rtol=1e-4, atol=1e-5) # Tensor valued observation noise. 
obs_noise = torch.rand(X.shape, **tkwargs) @@ -167,18 +157,9 @@ def test_gp(self, double_only: bool = False): posterior_pred = model.posterior(X, observation_noise=True) self.assertIsInstance(posterior_pred, GPyTorchPosterior) self.assertEqual(posterior_pred.mean.shape, expected_shape) - if use_octf: - # ensure un-transformation is applied - tmp_tf = model.outcome_transform - del model.outcome_transform - pp_tf = model.posterior(X, observation_noise=True) - model.outcome_transform = tmp_tf - expected_var = tmp_tf.untransform_posterior(pp_tf).variance - self.assertAllClose(posterior_pred.variance, expected_var) - else: - pvar = posterior_pred.variance - pvar_exp = get_pvar_expected(posterior, model, X, m) - self.assertAllClose(pvar, pvar_exp, rtol=1e-4, atol=1e-5) + pvar = posterior_pred.variance + pvar_exp = get_pvar_expected(posterior=posterior, model=model, X=X, m=m) + self.assertAllClose(pvar, pvar_exp, rtol=1e-4, atol=1e-5) # test batch evaluation with broadcasting for input_batch_shape in ([], [3], [1]): @@ -186,11 +167,10 @@ def test_gp(self, double_only: bool = False): if input_batch_shape == [3] and len(batch_shape) > 0: msg = ( - "Shape mismatch: objects cannot be broadcast to a" - " single shape" + "Shape mismatch: objects cannot be broadcast to a single shape" if m == 1 - else "The trailing batch dimensions of X must match" - " the trailing batch dimensions of the training inputs." + else "The trailing batch dimensions of X must match " + "the trailing batch dimensions of the training inputs." ) with self.assertRaisesRegex(RuntimeError, msg): model.posterior(X, observation_noise=True) diff --git a/test/models/test_gp_regression_fidelity.py b/test/models/test_gp_regression_fidelity.py index c44c403757..a297ef174b 100644 --- a/test/models/test_gp_regression_fidelity.py +++ b/test/models/test_gp_regression_fidelity.py @@ -11,10 +11,7 @@ from botorch.exceptions.errors import UnsupportedError from botorch.exceptions.warnings import OptimizationWarning from botorch.fit import fit_gpytorch_mll -from botorch.models.gp_regression_fidelity import ( - FixedNoiseMultiFidelityGP, - SingleTaskMultiFidelityGP, -) +from botorch.models.gp_regression_fidelity import SingleTaskMultiFidelityGP from botorch.models.transforms import Normalize, Standardize from botorch.posteriors import GPyTorchPosterior from botorch.sampling import SobolQMCNormalSampler @@ -45,6 +42,7 @@ def _get_random_data_with_fidelity( class TestSingleTaskMultiFidelityGP(BotorchTestCase): FIDELITY_TEST_PAIRS = ( + # (iteration_fidelity, data_fidelities) (None, [1]), (1, None), (None, [-1]), @@ -80,13 +78,10 @@ def _get_model_and_data( "train_Y": train_Y, "iteration_fidelity": iteration_fidelity, "linear_truncated": lin_truncated, + "outcome_transform": outcome_transform, + "input_transform": input_transform, } ) - - if outcome_transform is not None: - model_kwargs["outcome_transform"] = outcome_transform - if input_transform is not None: - model_kwargs["input_transform"] = input_transform model = SingleTaskMultiFidelityGP(**model_kwargs) return model, model_kwargs @@ -105,14 +100,6 @@ def test_init_error(self) -> None: SingleTaskMultiFidelityGP( train_X, train_Y, linear_truncated=lin_truncated, data_fidelities=[] ) - with self.assertRaises(ValueError): - SingleTaskMultiFidelityGP( - train_X, train_Y, data_fidelities=[1], data_fidelity=2 - ) - with self.assertWarnsRegex(DeprecationWarning, "data_fidelity"): - SingleTaskMultiFidelityGP( - train_X, train_Y, data_fidelity=1, linear_truncated=False - ) def test_gp(self) -> None: for 
iteration_fidelity, data_fidelities in self.FIDELITY_TEST_PAIRS: @@ -299,6 +286,7 @@ def test_condition_on_observations(self): "iteration_fidelity", "data_fidelities", "linear_truncated", + "outcome_transform", "input_transform", ): model_kwargs_non_batch[k] = v @@ -441,8 +429,7 @@ def test_construct_inputs(self): self.assertTrue(kwargs["train_Y"].equal(data_dict["train_Y"])) -class TestFixedNoiseMultiFidelityGP(TestSingleTaskMultiFidelityGP): - model_class = FixedNoiseMultiFidelityGP +class TestFixedNoiseSingleTaskMultiFidelityGP(TestSingleTaskMultiFidelityGP): def _get_model_and_data( self, @@ -471,17 +458,11 @@ def _get_model_and_data( "train_Yvar": train_Yvar, "iteration_fidelity": iteration_fidelity, "linear_truncated": lin_truncated, + "outcome_transform": outcome_transform, + "input_transform": input_transform, } ) - if outcome_transform is not None: - model_kwargs["outcome_transform"] = outcome_transform - if input_transform is not None: - model_kwargs["input_transform"] = input_transform - if self.model_class is FixedNoiseMultiFidelityGP: - with self.assertWarnsRegex(DeprecationWarning, "SingleTaskMultiFidelityGP"): - model = FixedNoiseMultiFidelityGP(**model_kwargs) - else: - model = self.model_class(**model_kwargs) + model = SingleTaskMultiFidelityGP(**model_kwargs) return model, model_kwargs def test_init_error(self): @@ -490,17 +471,9 @@ def test_init_error(self): train_Yvar = torch.full_like(train_Y, 0.01) for lin_truncated in (True, False): with self.assertRaises(UnsupportedError): - FixedNoiseMultiFidelityGP( + SingleTaskMultiFidelityGP( train_X, train_Y, train_Yvar, linear_truncated=lin_truncated ) - with self.assertRaises(ValueError): - FixedNoiseMultiFidelityGP( - train_X, train_Y, train_Yvar, data_fidelities=[1], data_fidelity=2 - ) - with self.assertWarnsRegex(DeprecationWarning, "data_fidelity"): - FixedNoiseMultiFidelityGP( - train_X, train_Y, train_Yvar, data_fidelity=1, linear_truncated=False - ) def test_fixed_noise_likelihood(self): for iteration_fidelity, data_fidelities in self.FIDELITY_TEST_PAIRS: @@ -571,8 +544,3 @@ def test_construct_inputs(self): self.assertEqual(data_dict.get("data_fidelities", None), [1]) self.assertTrue(kwargs["train_X"].equal(data_dict["train_X"])) self.assertTrue(kwargs["train_Y"].equal(data_dict["train_Y"])) - - -class TestFixedNoiseSingleTaskMultiFidelityGP(TestFixedNoiseMultiFidelityGP): - # Test SingleTaskMultiFidelityGP with observed noise. - model_class = SingleTaskMultiFidelityGP diff --git a/test/models/test_gp_regression_mixed.py b/test/models/test_gp_regression_mixed.py index ee2534622a..9429b47091 100644 --- a/test/models/test_gp_regression_mixed.py +++ b/test/models/test_gp_regression_mixed.py @@ -34,12 +34,16 @@ class TestMixedSingleTaskGP(BotorchTestCase): def test_gp(self): d = 3 bounds = torch.tensor([[-1.0] * d, [1.0] * d]) - for batch_shape, m, ncat, dtype, observed_noise in ( - (torch.Size(), 1, 0, torch.float, False), - (torch.Size(), 2, 1, torch.double, True), - (torch.Size([2]), 2, 3, torch.double, False), + for batch_shape, m, ncat, dtype, observed_noise, use_octf in ( + (torch.Size(), 1, 0, torch.float, False, False), + (torch.Size(), 2, 1, torch.double, True, True), + (torch.Size([2]), 2, 3, torch.double, False, True), ): tkwargs = {"device": self.device, "dtype": dtype} + # The model by default uses a `Standardize` outcome transform, so + # to test without that transform we need to explicitly pass in `None`. 
+ outcome_transform_kwargs = {} if use_octf else {"outcome_transform": None} + train_X, train_Y = _get_random_data( batch_shape=batch_shape, m=m, d=d, **tkwargs ) @@ -70,6 +74,7 @@ def test_gp(self): train_Y=train_Y, cat_dims=cat_dims, train_Yvar=train_Yvar, + **outcome_transform_kwargs, ) self.assertEqual(model._ignore_X_dims_scaling_check, cat_dims) mll = ExactMarginalLogLikelihood(model.likelihood, model).to(**tkwargs) @@ -118,7 +123,7 @@ def test_gp(self): self.assertEqual(posterior_pred.mean.shape, expected_shape) self.assertEqual(posterior_pred.variance.shape, expected_shape) pvar = posterior_pred.variance - pvar_exp = get_pvar_expected(posterior, model, X, m) + pvar_exp = get_pvar_expected(posterior=posterior, model=model, X=X, m=m) self.assertAllClose(pvar, pvar_exp, rtol=1e-4, atol=1e-5) # test batch evaluation @@ -132,7 +137,7 @@ def test_gp(self): self.assertIsInstance(posterior_pred, GPyTorchPosterior) self.assertEqual(posterior_pred.mean.shape, expected_shape) pvar = posterior_pred.variance - pvar_exp = get_pvar_expected(posterior, model, X, m) + pvar_exp = get_pvar_expected(posterior=posterior, model=model, X=X, m=m) self.assertAllClose(pvar, pvar_exp, rtol=1e-4, atol=1e-5) # test that model converter throws an exception diff --git a/test/models/test_model_list_gp_regression.py b/test/models/test_model_list_gp_regression.py index 2c901e47c7..f27080dafb 100644 --- a/test/models/test_model_list_gp_regression.py +++ b/test/models/test_model_list_gp_regression.py @@ -292,8 +292,11 @@ def test_ModelListGP_single(self): self.assertIsInstance(posterior, GPyTorchPosterior) self.assertIsInstance(posterior.distribution, MultivariateNormal) - def test_ModelListGP_multi_task(self): + def test_ModelListGP_multi_task(self, use_outcome_transform: bool = False): tkwargs = {"device": self.device, "dtype": torch.float} + outcome_transform_kwargs = ( + {} if use_outcome_transform else {"outcome_transform": None} + ) train_x_raw, train_y = _get_random_data( batch_shape=torch.Size(), m=1, n=10, **tkwargs ) @@ -306,6 +309,7 @@ def test_ModelListGP_multi_task(self): train_Y=train_y, task_feature=-1, output_tasks=[0], + **outcome_transform_kwargs, ) # Wrap a single single-output MTGP. model_list_gp = ModelListGP(model) @@ -326,6 +330,7 @@ def test_ModelListGP_multi_task(self): train_X=train_x, train_Y=train_y, task_feature=-1, + **outcome_transform_kwargs, ) model_list_gp = ModelListGP(model2) self.assertEqual(model_list_gp.num_outputs, 2) @@ -360,9 +365,7 @@ def test_ModelListGP_multi_task(self): self.assertEqual(len(subset_model.models), 2) # Test condition on observations model_s1 = SingleTaskGP( - train_X=train_x_raw, - train_Y=train_y, - outcome_transform=None, + train_X=train_x_raw, train_Y=train_y, **outcome_transform_kwargs ) model_list_gp = ModelListGP(model_s1, model2, deepcopy(model_s1)) model_list_gp.posterior(train_x_raw) @@ -375,11 +378,21 @@ def test_ModelListGP_multi_task(self): self.assertIsInstance(cm, ModelListGP) self.assertEqual(cm.num_outputs, 4) self.assertEqual(len(cm.models), 3) + # TODO: Figure out why the outcome transform changes the input shape... 
+ exp_shape_stgp = ( + torch.Size([1, 15, 1]) if use_outcome_transform else torch.Size([15, 1]) + ) + exp_shape_mtgp = ( + torch.Size([1, 20, 2]) if use_outcome_transform else torch.Size([20, 2]) + ) for i in [0, 2]: self.assertIsInstance(cm.models[i], SingleTaskGP) - self.assertEqual(cm.models[i].train_inputs[0].shape, torch.Size([15, 1])) + self.assertEqual(cm.models[i].train_inputs[0].shape, exp_shape_stgp) self.assertIsInstance(cm.models[1], MultiTaskGP) - self.assertEqual(cm.models[1].train_inputs[0].shape, torch.Size([20, 2])) + self.assertEqual(cm.models[1].train_inputs[0].shape, exp_shape_mtgp) + + def test_ModelListGP_multi_task_outcome_transform(self): + self.test_ModelListGP_multi_task(use_outcome_transform=True) def test_transform_revert_train_inputs(self): tkwargs = {"device": self.device, "dtype": torch.float} diff --git a/test/utils/test_transforms.py b/test/utils/test_transforms.py index 8b95ecd971..07bdbc2791 100644 --- a/test/utils/test_transforms.py +++ b/test/utils/test_transforms.py @@ -340,7 +340,7 @@ def test_is_fully_bayesian(self): SingleTaskGP(train_X=X, train_Y=Y), MultiTaskGP(train_X=X, train_Y=Y, task_feature=-1), HigherOrderGP(train_X=X, train_Y=Y), - SingleTaskMultiFidelityGP(train_X=X, train_Y=Y, data_fidelity=3), + SingleTaskMultiFidelityGP(train_X=X, train_Y=Y, data_fidelities=[3]), MixedSingleTaskGP(train_X=X, train_Y=Y, cat_dims=[1]), PairwiseGP(datapoints=X, comparisons=None), ) @@ -382,7 +382,7 @@ def test_is_ensemble(self): SingleTaskGP(train_X=X, train_Y=Y), MultiTaskGP(train_X=X, train_Y=Y, task_feature=-1), HigherOrderGP(train_X=X, train_Y=Y), - SingleTaskMultiFidelityGP(train_X=X, train_Y=Y, data_fidelity=3), + SingleTaskMultiFidelityGP(train_X=X, train_Y=Y, data_fidelities=[3]), MixedSingleTaskGP(train_X=X, train_Y=Y, cat_dims=[1]), PairwiseGP(datapoints=X, comparisons=None), )