diff --git a/botorch/acquisition/multi_objective/monte_carlo.py b/botorch/acquisition/multi_objective/monte_carlo.py
index f96e34771e..df45143446 100644
--- a/botorch/acquisition/multi_objective/monte_carlo.py
+++ b/botorch/acquisition/multi_objective/monte_carlo.py
@@ -42,7 +42,7 @@
 from botorch.exceptions.errors import UnsupportedError
 from botorch.exceptions.warnings import BotorchWarning
 from botorch.models.model import Model
-from botorch.models.transforms.input import InputPerturbation
+from botorch.models.transforms.input_augmentation import InputPerturbation
 from botorch.posteriors import DeterministicPosterior
 from botorch.posteriors.posterior import Posterior
 from botorch.sampling.samplers import MCSampler, SobolQMCNormalSampler
diff --git a/botorch/models/gp_regression.py b/botorch/models/gp_regression.py
index a637feb4c6..4c9060aa70 100644
--- a/botorch/models/gp_regression.py
+++ b/botorch/models/gp_regression.py
@@ -16,8 +16,9 @@
 from botorch import settings
 from botorch.models.gpytorch import BatchedMultiOutputGPyTorchModel
 from botorch.models.transforms.input import InputTransform
+from botorch.models.transforms.input_augmentation import InputAugmentationTransform
 from botorch.models.transforms.outcome import Log, OutcomeTransform
-from botorch.models.utils import fantasize as fantasize_flag, validate_input_scaling
+from botorch.models.utils import validate_input_scaling
 from botorch.sampling.samplers import MCSampler
 from botorch.utils.containers import TrainingData
 from gpytorch.constraints.constraints import GreaterThan
@@ -70,6 +71,7 @@ def __init__(
         mean_module: Optional[Mean] = None,
         outcome_transform: Optional[OutcomeTransform] = None,
         input_transform: Optional[InputTransform] = None,
+        input_augmentation_transform: Optional[InputAugmentationTransform] = None,
     ) -> None:
         r"""A single-task exact GP model.
 
@@ -88,6 +90,8 @@ def __init__(
                 `.posterior` on the model will be on the original scale).
             input_transform: An input transform that is applied in the model's
                 forward pass.
+            input_augmentation_transform: An input augmentation transform that is
+                applied in the `posterior` call.
 
         Example:
             >>> train_X = torch.rand(20, 2)
@@ -148,11 +152,11 @@ def __init__(
             self.outcome_transform = outcome_transform
         if input_transform is not None:
             self.input_transform = input_transform
+        if input_augmentation_transform is not None:
+            self.input_augmentation_transform = input_augmentation_transform
         self.to(train_X)
 
-    def forward(self, x: Tensor) -> MultivariateNormal:
-        if self.training:
-            x = self.transform_inputs(x)
+    def _forward(self, x: Tensor) -> MultivariateNormal:
         mean_x = self.mean_module(x)
         covar_x = self.covar_module(x)
         return MultivariateNormal(mean_x, covar_x)
@@ -191,6 +195,7 @@ def __init__(
         mean_module: Optional[Mean] = None,
         outcome_transform: Optional[OutcomeTransform] = None,
         input_transform: Optional[InputTransform] = None,
+        input_augmentation_transform: Optional[InputAugmentationTransform] = None,
         **kwargs: Any,
     ) -> None:
         r"""A single-task exact GP model using fixed noise levels.
@@ -210,6 +215,8 @@ def __init__(
                 `.posterior` on the model will be on the original scale).
             input_transform: An input transfrom that is applied in the model's
                 forward pass.
+            input_augmentation_transform: An input augmentation transform that is
+                applied in the `posterior` call.
 
         Example:
             >>> train_X = torch.rand(20, 2)
@@ -262,7 +269,8 @@ def __init__(
             self.input_transform = input_transform
         if outcome_transform is not None:
             self.outcome_transform = outcome_transform
-
+        if input_augmentation_transform is not None:
+            self.input_augmentation_transform = input_augmentation_transform
         self.to(train_X)
 
     def fantasize(
@@ -298,24 +306,19 @@ def fantasize(
             The constructed fantasy model.
         """
         propagate_grads = kwargs.pop("propagate_grads", False)
-        with fantasize_flag():
-            with settings.propagate_grads(propagate_grads):
-                post_X = self.posterior(
-                    X, observation_noise=observation_noise, **kwargs
-                )
-            Y_fantasized = sampler(post_X)  # num_fantasies x batch_shape x n' x m
-            # Use the mean of the previous noise values (TODO: be smarter here).
-            # noise should be batch_shape x q x m when X is batch_shape x q x d, and
-            # Y_fantasized is num_fantasies x batch_shape x q x m.
-            noise_shape = Y_fantasized.shape[1:]
-            noise = self.likelihood.noise.mean().expand(noise_shape)
-            return self.condition_on_observations(
-                X=self.transform_inputs(X), Y=Y_fantasized, noise=noise
-            )
+        with settings.propagate_grads(propagate_grads):
+            post_X = self._posterior(X, observation_noise=observation_noise, **kwargs)
+        Y_fantasized = sampler(post_X)  # num_fantasies x batch_shape x n' x m
+        # Use the mean of the previous noise values (TODO: be smarter here).
+        # noise should be batch_shape x q x m when X is batch_shape x q x d, and
+        # Y_fantasized is num_fantasies x batch_shape x q x m.
+        noise_shape = Y_fantasized.shape[1:]
+        noise = self.likelihood.noise.mean().expand(noise_shape)
+        return self.condition_on_observations(
+            X=self.transform_inputs(X), Y=Y_fantasized, noise=noise
+        )
 
-    def forward(self, x: Tensor) -> MultivariateNormal:
-        if self.training:
-            x = self.transform_inputs(x)
+    def _forward(self, x: Tensor) -> MultivariateNormal:
         mean_x = self.mean_module(x)
         covar_x = self.covar_module(x)
         return MultivariateNormal(mean_x, covar_x)
@@ -370,6 +373,7 @@ def __init__(
         train_Yvar: Tensor,
         outcome_transform: Optional[OutcomeTransform] = None,
         input_transform: Optional[InputTransform] = None,
+        input_augmentation_transform: Optional[InputAugmentationTransform] = None,
     ) -> None:
         r"""A single-task exact GP model using a heteroskedastic noise model.
 
@@ -386,6 +390,8 @@ def __init__(
                 variances, which will happen after this transform is applied.
             input_transform: An input transfrom that is applied in the model's
                 forward pass.
+            input_augmentation_transform: An input augmentation transform that is
+                applied in the `posterior` call.
 
         Example:
             >>> train_X = torch.rand(20, 2)
@@ -419,6 +425,7 @@ def __init__(
             train_Y=train_Y,
             likelihood=likelihood,
             input_transform=input_transform,
+            input_augmentation_transform=input_augmentation_transform,
         )
         self.register_added_loss_term("noise_added_loss")
         self.update_added_loss_term(
diff --git a/botorch/models/gpytorch.py b/botorch/models/gpytorch.py
index e9577b05c5..1191612758 100644
--- a/botorch/models/gpytorch.py
+++ b/botorch/models/gpytorch.py
@@ -117,11 +117,10 @@ def num_outputs(self) -> int:
         r"""The number of outputs of the model."""
         return self._num_outputs
 
-    def posterior(
+    def _posterior(
         self,
         X: Tensor,
         observation_noise: Union[bool, Tensor] = False,
-        posterior_transform: Optional[PosteriorTransform] = None,
         **kwargs: Any,
     ) -> GPyTorchPosterior:
         r"""Computes the posterior over model outputs at the provided points.
@@ -133,7 +132,6 @@ def posterior(
             observation_noise: If True, add the observation noise from the
                 likelihood to the posterior. If a Tensor, use it directly as the
                 observation noise (must be of shape `(batch_shape) x q`).
-            posterior_transform: An optional PosteriorTransform.
 
         Returns:
             A `GPyTorchPosterior` object, representing a batch of `b` joint
@@ -141,9 +139,6 @@ def posterior(
             specified.
         """
         self.eval()  # make sure model is in eval mode
-        # input transforms are applied at `posterior` in `eval` mode, and at
-        # `model.forward()` at the training time
-        X = self.transform_inputs(X)
         with gpt_posterior_settings():
             mvn = self(X)
             if observation_noise is not False:
@@ -158,8 +153,6 @@ def posterior(
         posterior = GPyTorchPosterior(mvn=mvn)
         if hasattr(self, "outcome_transform"):
             posterior = self.outcome_transform.untransform_posterior(posterior)
-        if posterior_transform is not None:
-            return posterior_transform(posterior)
         return posterior
 
     def condition_on_observations(self, X: Tensor, Y: Tensor, **kwargs: Any) -> Model:
@@ -301,12 +294,11 @@ def _transform_tensor_args(
             )
         return X, Y.squeeze(-1), None if Yvar is None else Yvar.squeeze(-1)
 
-    def posterior(
+    def _posterior(
         self,
         X: Tensor,
         output_indices: Optional[List[int]] = None,
         observation_noise: Union[bool, Tensor] = False,
-        posterior_transform: Optional[PosteriorTransform] = None,
         **kwargs: Any,
     ) -> GPyTorchPosterior:
         r"""Computes the posterior over model outputs at the provided points.
@@ -323,7 +315,6 @@ def posterior(
             observation_noise: If True, add the observation noise from the
                 likelihood to the posterior. If a Tensor, use it directly as the
                 observation noise (must be of shape `(batch_shape) x q x m`).
-            posterior_transform: An optional PosteriorTransform.
 
         Returns:
             A `GPyTorchPosterior` object, representing `batch_shape` joint
@@ -331,9 +322,6 @@ def posterior(
             `output_indices` each. Includes observation noise if specified.
         """
         self.eval()  # make sure model is in eval mode
-        # input transforms are applied at `posterior` in `eval` mode, and at
-        # `model.forward()` at the training time
-        X = self.transform_inputs(X)
         with gpt_posterior_settings():
             # insert a dimension for the output dimension
             if self._num_outputs > 1:
@@ -369,8 +357,6 @@ def posterior(
         posterior = GPyTorchPosterior(mvn=mvn)
         if hasattr(self, "outcome_transform"):
             posterior = self.outcome_transform.untransform_posterior(posterior)
-        if posterior_transform is not None:
-            return posterior_transform(posterior)
         return posterior
 
     def condition_on_observations(
@@ -549,6 +535,8 @@ def posterior(
             by `output_indices` each. Includes measurement noise if
             `observation_noise` is specified.
         """
+        # TODO: Decide whether this override should become `_posterior` like the
+        # other models or needs special handling; left untouched for now.
         self.eval()  # make sure model is in eval mode
         # input transforms are applied at `posterior` in `eval` mode, and at
         # `model.forward()` at the training time
@@ -622,12 +610,11 @@ class MultiTaskGPyTorchModel(GPyTorchModel, ABC):
     "long-format" multi-task GP in the style of `MultiTaskGP`.
     """
 
-    def posterior(
+    def _posterior(
         self,
         X: Tensor,
         output_indices: Optional[List[int]] = None,
         observation_noise: Union[bool, Tensor] = False,
-        posterior_transform: Optional[PosteriorTransform] = None,
         **kwargs: Any,
     ) -> GPyTorchPosterior:
         r"""Computes the posterior over model outputs at the provided points.
@@ -644,7 +631,6 @@ def posterior(
             observation_noise: If True, add observation noise from the respective
                 likelihoods. If a Tensor, specifies the observation noise levels
                 to add.
-            posterior_transform: An optional PosteriorTransform.
 
         Returns:
             A `GPyTorchPosterior` object, representing `batch_shape` joint
@@ -663,9 +649,6 @@ def posterior(
         X_full = _make_X_full(X=X, output_indices=output_indices, tf=self._task_feature)
 
         self.eval()  # make sure model is in eval mode
-        # input transforms are applied at `posterior` in `eval` mode, and at
-        # `model.forward()` at the training time
-        X_full = self.transform_inputs(X_full)
         with gpt_posterior_settings():
             mvn = self(X_full)
             if observation_noise is not False:
@@ -685,6 +668,4 @@ def posterior(
             posterior = GPyTorchPosterior(mvn=mtmvn)
         if hasattr(self, "outcome_transform"):
             posterior = self.outcome_transform.untransform_posterior(posterior)
-        if posterior_transform is not None:
-            return posterior_transform(posterior)
         return posterior
diff --git a/botorch/models/model.py b/botorch/models/model.py
index cf27ec4864..748c513930 100644
--- a/botorch/models/model.py
+++ b/botorch/models/model.py
@@ -10,7 +10,6 @@
 
 from __future__ import annotations
 
-import warnings
 from abc import ABC, abstractmethod
 from collections import defaultdict
 from copy import deepcopy
@@ -19,32 +18,48 @@
 import numpy as np
 import torch
 from botorch import settings
-from botorch.models.utils import fantasize as fantasize_flag
 from botorch.posteriors import Posterior, PosteriorList
 from botorch.posteriors.fully_bayesian import FullyBayesianPosteriorList
 from botorch.sampling.samplers import MCSampler
 from botorch.utils.containers import TrainingData
 from botorch.utils.transforms import is_fully_bayesian
+from gpytorch.distributions import MultivariateNormal
 from torch import Tensor
 from torch.nn import Module, ModuleList
 
 
 class Model(Module, ABC):
-    r"""Abstract base class for BoTorch models.
+    r"""Abstract base class for BoTorch models."""
 
-    Args:
-        _has_transformed_inputs: A boolean denoting whether `train_inputs` are currently
-            stored as transformed or not.
-        _original_train_inputs: A Tensor storing the original train inputs for use in
-            `_revert_to_original_inputs`. Note that this is necessary since
-            transform / untransform cycle introduces numerical errors which lead
-            to upstream errors during training.
-    """
+    def forward(self, x: Tensor) -> MultivariateNormal:
+        r"""Transforms the inputs and computes the prior over model outputs
+        at the provided points.
 
-    _has_transformed_inputs: bool = False
-    _original_train_inputs: Optional[Tensor] = None
+        Args:
+            x: A `b x q x d`-dim Tensor of inputs, where `d` is the dimension of
+                the feature space, `q` is the number of points considered jointly,
+                and `b` is the batch dimension.
+
+        Returns:
+            A MultivariateNormal object denoting the prior distribution.
+        """
+        x = self.transform_inputs(x)
+        return self._forward(x)
 
     @abstractmethod
+    def _forward(self, x: Tensor) -> MultivariateNormal:
+        r"""Computes the prior over model outputs at the provided points.
+
+        Args:
+            x: A `b x q x d`-dim Tensor of inputs as handed over by `forward`,
+                i.e. after `transform_inputs`; `d` is the feature dimension,
+                `q` the number of joint points, and `b` the batch dimension.
+
+        Returns:
+            A MultivariateNormal object denoting the prior distribution.
+        """
+        pass  # pragma: no cover
+
     def posterior(
         self,
         X: Tensor,
@@ -53,11 +68,8 @@ def posterior(
         posterior_transform: Optional[Callable[[Posterior], Posterior]] = None,
         **kwargs: Any,
     ) -> Posterior:
-        r"""Computes the posterior over model outputs at the provided points.
-
-        Note: The input transforms should be applied here using
-            `self.transform_inputs(X)` after the `self.eval()` call and before
-            any `model.forward` or `model.likelihood` calls.
+        r"""Augments the inputs, if needed, and computes the posterior over model
+        outputs at the provided points.
 
         Args:
             X: A `b x q x d`-dim Tensor, where `d` is the dimension of the
@@ -71,6 +83,42 @@ def posterior(
             observation_noise: If True, add observation noise to the posterior.
             posterior_transform: An optional PosteriorTransform.
 
+        Returns:
+            A `Posterior` object, representing a batch of `b` joint distributions
+            over `q` points and `m` outputs each.
+        """
+        X = self.augment_inputs(X)
+        posterior = self._posterior(
+            X=X,
+            output_indices=output_indices,
+            observation_noise=observation_noise,
+            **kwargs,  # unpack; `kwargs=kwargs` would pass one dict-valued keyword
+        )
+        if posterior_transform is not None:
+            posterior = posterior_transform(posterior)
+        return posterior
+
+    @abstractmethod
+    def _posterior(
+        self,
+        X: Tensor,
+        output_indices: Optional[List[int]] = None,
+        observation_noise: bool = False,
+        **kwargs: Any,
+    ) -> Posterior:
+        r"""Computes the posterior over model outputs at the provided points.
+
+        Args:
+            X: A `b x q x d`-dim Tensor, where `d` is the dimension of the
+                feature space, `q` is the number of points considered jointly,
+                and `b` is the batch dimension.
+            output_indices: A list of indices, corresponding to the outputs over
+                which to compute the posterior (if the model is multi-output).
+                Can be used to speed up computation if only a subset of the
+                model's outputs are required for optimization. If omitted,
+                computes the posterior over all model outputs.
+            observation_noise: If True, add observation noise to the posterior.
+
         Returns:
             A `Posterior` object, representing a batch of `b` joint distributions
             over `q` points and `m` outputs each.
@@ -161,13 +209,12 @@ def fantasize(
             The constructed fantasy model.
         """
         propagate_grads = kwargs.pop("propagate_grads", False)
-        with fantasize_flag():
-            with settings.propagate_grads(propagate_grads):
-                post_X = self.posterior(X, observation_noise=observation_noise)
-            Y_fantasized = sampler(post_X)  # num_fantasies x batch_shape x n' x m
-            return self.condition_on_observations(
-                X=self.transform_inputs(X), Y=Y_fantasized, **kwargs
-            )
+        with settings.propagate_grads(propagate_grads):
+            post_X = self._posterior(X, observation_noise=observation_noise)
+        Y_fantasized = sampler(post_X)  # num_fantasies x batch_shape x n' x m
+        return self.condition_on_observations(
+            X=self.transform_inputs(X), Y=Y_fantasized, **kwargs
+        )
 
     @classmethod
     def construct_inputs(
@@ -186,11 +233,11 @@ def transform_inputs(
         r"""Transform inputs.
 
         Args:
-            X: A tensor of inputs
+            X: A `b x q x d`-dim tensor of inputs.
             input_transform: A Module that performs the input transformation.
 
         Returns:
-            A tensor of transformed inputs
+            A `b x q x d`-dim tensor of transformed inputs.
         """
         if input_transform is not None:
             input_transform.to(X)
@@ -200,49 +247,22 @@ def transform_inputs(
         except AttributeError:
             return X
 
-    def _set_transformed_inputs(self) -> None:
-        r"""Update training inputs with transformed inputs."""
-        if hasattr(self, "input_transform") and not self._has_transformed_inputs:
-            if hasattr(self, "train_inputs"):
-                self._original_train_inputs = self.train_inputs[0]
-                with torch.no_grad():
-                    X_tf = self.input_transform.preprocess_transform(
-                        self.train_inputs[0]
-                    )
-                self.set_train_data(X_tf, strict=False)
-                self._has_transformed_inputs = True
-            else:
-                warnings.warn(
-                    "Could not update `train_inputs` with transformed inputs "
-                    f"since {self.__class__.__name__} does not have a `train_inputs` "
-                    "attribute. Make sure that the `input_transform` is applied to "
-                    "both the train inputs and test inputs.",
-                    RuntimeWarning,
-                )
-
-    def _revert_to_original_inputs(self) -> None:
-        r"""Revert training inputs back to original."""
-        if hasattr(self, "input_transform") and self._has_transformed_inputs:
-            self.set_train_data(self._original_train_inputs, strict=False)
-            self._has_transformed_inputs = False
-
-    def eval(self) -> Model:
-        r"""Puts the model in `eval` mode and sets the transformed inputs."""
-        self._set_transformed_inputs()
-        return super().eval()
-
-    def train(self, mode: bool = True) -> Model:
-        r"""Puts the model in `train` mode and reverts to the original inputs.
+    def augment_inputs(
+        self,
+        X: Tensor,
+    ) -> Tensor:
+        r"""Applies the input augmentation transform, if any.
 
         Args:
-            mode: A boolean denoting whether to put in `train` or `eval` mode.
-                If `False`, model is put in `eval` mode.
+            X: A `b x q x d`-dim tensor of inputs.
+
+        Returns:
+            A `b x q' x d'`-dim tensor of augmented inputs.
         """
-        if mode:
-            self._revert_to_original_inputs()
-        else:
-            self._set_transformed_inputs()
-        return super().train(mode=mode)
+        augmentation = getattr(self, "input_augmentation_transform", None)
+        if augmentation is None:
+            return X  # no augmentation transform registered on this model
+        return augmentation(X)
 
 
 class ModelList(Model):
diff --git a/botorch/models/model_list_gp_regression.py b/botorch/models/model_list_gp_regression.py
index c7d40700fb..d37d81fcea 100644
--- a/botorch/models/model_list_gp_regression.py
+++ b/botorch/models/model_list_gp_regression.py
@@ -102,13 +102,3 @@ def subset_output(self, idcs: List[int]) -> ModelListGP:
             The current model, subset to the specified output indices.
         """
         return self.__class__(*[deepcopy(self.models[i]) for i in idcs])
-
-    def _set_transformed_inputs(self) -> None:
-        r"""Update training inputs with transformed inputs."""
-        for m in self.models:
-            m._set_transformed_inputs()
-
-    def _revert_to_original_inputs(self) -> None:
-        r"""Revert training inputs back to original."""
-        for m in self.models:
-            m._revert_to_original_inputs()
diff --git a/botorch/models/transforms/input.py b/botorch/models/transforms/input.py
index 40a10bc30c..97bfcd7b59 100644
--- a/botorch/models/transforms/input.py
+++ b/botorch/models/transforms/input.py
@@ -18,12 +18,11 @@
 
 from abc import ABC, abstractmethod
 from collections import OrderedDict
-from typing import Callable, List, Optional, Union
+from typing import List, Optional, Union
 
 import torch
 from botorch.exceptions.errors import BotorchTensorDimensionError
 from botorch.models.transforms.utils import expand_and_copy_tensor
-from botorch.models.utils import fantasize
 from botorch.utils.rounding import approximate_round
 from gpytorch import Module as GPyTorchModule
 from gpytorch.constraints import GreaterThan
@@ -39,20 +38,8 @@ class InputTransform(ABC):
     Note: Input transforms must inherit from `torch.nn.Module`. This
         is deferred to the subclasses to avoid any potential conflict
         between `gpytorch.module.Module` and `torch.nn.Module` in `Warp`.
-
-    Properties:
-        transform_on_train: A boolean indicating whether to apply the
-            transform in train() mode.
-        transform_on_eval: A boolean indicating whether to apply the
-            transform in eval() mode.
-        transform_on_fantasize: A boolean indicating whether to apply
-            the transform when called from within a `fantasize` call.
     """
 
-    transform_on_eval: bool
-    transform_on_train: bool
-    transform_on_fantasize: bool
-
     def forward(self, X: Tensor) -> Tensor:
         r"""Transform the inputs to a model.
 
@@ -60,15 +47,9 @@ def forward(self, X: Tensor) -> Tensor:
             X: A `batch_shape x n x d`-dim tensor of inputs.
 
         Returns:
-            A `batch_shape x n' x d`-dim tensor of transformed inputs.
+            A `batch_shape x n x d`-dim tensor of transformed inputs.
         """
-        if self.training:
-            if self.transform_on_train:
-                return self.transform(X)
-        elif self.transform_on_eval:
-            if fantasize.off() or self.transform_on_fantasize:
-                return self.transform(X)
-        return X
+        return self.transform(X)
 
     @abstractmethod
     def transform(self, X: Tensor) -> Tensor:
@@ -110,44 +91,11 @@ def equals(self, other: InputTransform) -> bool:
             A boolean indicating if the other transform is equivalent.
         """
         other_state_dict = other.state_dict()
-        return (
-            type(self) == type(other)
-            and (self.transform_on_train == other.transform_on_train)
-            and (self.transform_on_eval == other.transform_on_eval)
-            and (self.transform_on_fantasize == other.transform_on_fantasize)
-            and all(
-                torch.allclose(v, other_state_dict[k].to(v))
-                for k, v in self.state_dict().items()
-            )
+        return type(self) == type(other) and all(
+            torch.allclose(v, other_state_dict[k].to(v))
+            for k, v in self.state_dict().items()
         )
 
-    def preprocess_transform(self, X: Tensor) -> Tensor:
-        r"""Apply transforms for preprocessing inputs.
-
-        The main use cases for this method are 1) to preprocess training data
-        before calling `set_train_data` and 2) preprocess `X_baseline` for noisy
-        acquisition functions so that `X_baseline` is "preprocessed" with the
-        same transformations as the cached training inputs.
-
-        Args:
-            X: A `batch_shape x n x d`-dim tensor of inputs.
-
-        Returns:
-            A `batch_shape x n x d`-dim tensor of (transformed) inputs.
-        """
-        if self.transform_on_train:
-            # We need to disable learning of bounds here.
-            # See why: https://github.com/pytorch/botorch/issues/1078.
-            if hasattr(self, "learn_bounds"):
-                learn_bounds = self.learn_bounds
-                self.learn_bounds = False
-                result = self.transform(X)
-                self.learn_bounds = learn_bounds
-                return result
-            else:
-                return self.transform(X)
-        return X
-
 
 class ChainedInputTransform(InputTransform, ModuleDict):
     r"""An input transform representing the chaining of individual transforms."""
@@ -171,13 +119,6 @@ def __init__(self, **transforms: InputTransform) -> None:
 
         """
         super().__init__(OrderedDict(transforms))
-        self.transform_on_train = False
-        self.transform_on_eval = False
-        self.transform_on_fantasize = False
-        for tf in transforms.values():
-            self.transform_on_train |= tf.transform_on_train
-            self.transform_on_eval |= tf.transform_on_eval
-            self.transform_on_fantasize |= tf.transform_on_fantasize
 
     def transform(self, X: Tensor) -> Tensor:
         r"""Transform the inputs to a model.
@@ -222,24 +163,6 @@ def equals(self, other: InputTransform) -> bool:
             t1 == t2 for t1, t2 in zip(self.values(), other.values())
         )
 
-    def preprocess_transform(self, X: Tensor) -> Tensor:
-        r"""Apply transforms for preprocessing inputs.
-
-        The main use cases for this method are 1) to preprocess training data
-        before calling `set_train_data` and 2) preprocess `X_baseline` for noisy
-        acquisition functions so that `X_baseline` is "preprocessed" with the
-        same transformations as the cached training inputs.
-
-        Args:
-            X: A `batch_shape x n x d`-dim tensor of inputs.
-
-        Returns:
-            A `batch_shape x n x d`-dim tensor of (transformed) inputs.
-        """
-        for tf in self.values():
-            X = tf.preprocess_transform(X)
-        return X
-
 
 class ReversibleInputTransform(InputTransform, ABC):
     r"""An abstract class for a reversible input transform.
@@ -323,9 +246,6 @@ def __init__(
         indices: Optional[List[int]] = None,
         bounds: Optional[Tensor] = None,
         batch_shape: torch.Size = torch.Size(),  # noqa: B008
-        transform_on_train: bool = True,
-        transform_on_eval: bool = True,
-        transform_on_fantasize: bool = True,
         reverse: bool = False,
         min_range: float = 1e-8,
     ) -> None:
@@ -340,12 +260,6 @@ def __init__(
             batch_shape: The batch shape of the inputs (asssuming input tensors
                 of shape `batch_shape x n x d`). If provided, perform individual
                 normalization per batch, otherwise uses a single normalization.
-            transform_on_train: A boolean indicating whether to apply the
-                transforms in train() mode. Default: True.
-            transform_on_eval: A boolean indicating whether to apply the
-                transform in eval() mode. Default: True.
-            transform_on_fantasize: A boolean indicating whether to apply the
-                transform when called from within a `fantasize` call. Default: True.
             reverse: A boolean indicating whether the forward pass should untransform
                 the inputs.
             min_range: Amount of noise to add to the range to ensure no division by
@@ -378,9 +292,6 @@ def __init__(
         self.register_buffer("mins", mins)
         self.register_buffer("ranges", ranges)
         self._d = d
-        self.transform_on_train = transform_on_train
-        self.transform_on_eval = transform_on_eval
-        self.transform_on_fantasize = transform_on_fantasize
         self.reverse = reverse
         self.batch_shape = batch_shape
         self.min_range = min_range
@@ -480,9 +391,6 @@ def __init__(
         d: int,
         indices: Optional[List[int]] = None,
         batch_shape: torch.Size = torch.Size(),  # noqa: B008
-        transform_on_train: bool = True,
-        transform_on_eval: bool = True,
-        transform_on_fantasize: bool = True,
         reverse: bool = False,
         min_std: float = 1e-8,
     ) -> None:
@@ -495,10 +403,6 @@ def __init__(
             batch_shape: The batch shape of the inputs (asssuming input tensors
                 of shape `batch_shape x n x d`). If provided, perform individual
                 normalization per batch, otherwise uses a single normalization.
-            transform_on_train: A boolean indicating whether to apply the
-                transforms in train() mode. Default: True
-            transform_on_eval: A boolean indicating whether to apply the
-                transform in eval() mode. Default: True
             reverse: A boolean indicating whether the forward pass should untransform
                 the inputs.
             min_std: Amount of noise to add to the standard deviation to ensure no
@@ -519,9 +423,6 @@ def __init__(
         self.register_buffer("means", torch.zeros(*batch_shape, 1, d))
         self.register_buffer("stds", torch.ones(*batch_shape, 1, d))
         self._d = d
-        self.transform_on_train = transform_on_train
-        self.transform_on_eval = transform_on_eval
-        self.transform_on_fantasize = transform_on_fantasize
         self.batch_shape = batch_shape
         self.min_std = min_std
         self.reverse = reverse
@@ -639,9 +540,6 @@ class Round(InputTransform, Module):
     def __init__(
         self,
         indices: List[int],
-        transform_on_train: bool = True,
-        transform_on_eval: bool = True,
-        transform_on_fantasize: bool = True,
         approximate: bool = True,
         tau: float = 1e-3,
     ) -> None:
@@ -649,20 +547,11 @@ def __init__(
 
         Args:
             indices: The indices of the integer inputs.
-            transform_on_train: A boolean indicating whether to apply the
-                transforms in train() mode. Default: True.
-            transform_on_eval: A boolean indicating whether to apply the
-                transform in eval() mode. Default: True.
-            transform_on_fantasize: A boolean indicating whether to apply the
-                transform when called from within a `fantasize` call. Default: True.
             approximate: A boolean indicating whether approximate or exact
                 rounding should be used. Default: approximate.
             tau: The temperature parameter for approximate rounding.
         """
         super().__init__()
-        self.transform_on_train = transform_on_train
-        self.transform_on_eval = transform_on_eval
-        self.transform_on_fantasize = transform_on_fantasize
         self.register_buffer("indices", torch.tensor(indices, dtype=torch.long))
         self.approximate = approximate
         self.tau = tau
@@ -707,29 +596,17 @@ class Log10(ReversibleInputTransform, Module):
     def __init__(
         self,
         indices: List[int],
-        transform_on_train: bool = True,
-        transform_on_eval: bool = True,
-        transform_on_fantasize: bool = True,
         reverse: bool = False,
     ) -> None:
         r"""Initialize transform.
 
         Args:
             indices: The indices of the inputs to log transform.
-            transform_on_train: A boolean indicating whether to apply the
-                transforms in train() mode. Default: True.
-            transform_on_eval: A boolean indicating whether to apply the
-                transform in eval() mode. Default: True.
-            transform_on_fantasize: A boolean indicating whether to apply the
-                transform when called from within a `fantasize` call. Default: True.
             reverse: A boolean indicating whether the forward pass should untransform
                 the inputs.
         """
         super().__init__()
         self.register_buffer("indices", torch.tensor(indices, dtype=torch.long))
-        self.transform_on_train = transform_on_train
-        self.transform_on_eval = transform_on_eval
-        self.transform_on_fantasize = transform_on_fantasize
         self.reverse = reverse
 
     def _transform(self, X: Tensor) -> Tensor:
@@ -780,9 +657,6 @@ class Warp(ReversibleInputTransform, GPyTorchModule):
     def __init__(
         self,
         indices: List[int],
-        transform_on_train: bool = True,
-        transform_on_eval: bool = True,
-        transform_on_fantasize: bool = True,
         reverse: bool = False,
         eps: float = 1e-7,
         concentration1_prior: Optional[Prior] = None,
@@ -793,12 +667,6 @@ def __init__(
 
         Args:
             indices: The indices of the inputs to warp.
-            transform_on_train: A boolean indicating whether to apply the
-                transforms in train() mode. Default: True.
-            transform_on_eval: A boolean indicating whether to apply the
-                transform in eval() mode. Default: True.
-            transform_on_fantasize: A boolean indicating whether to apply the
-                transform when called from within a `fantasize` call. Default: True.
             reverse: A boolean indicating whether the forward pass should untransform
                 the inputs.
             eps: A small value used to clip values to be in the interval (0, 1).
@@ -810,9 +678,6 @@ def __init__(
         """
         super().__init__()
         self.register_buffer("indices", torch.tensor(indices, dtype=torch.long))
-        self.transform_on_train = transform_on_train
-        self.transform_on_eval = transform_on_eval
-        self.transform_on_fantasize = transform_on_fantasize
         self.reverse = reverse
         self.batch_shape = batch_shape or torch.Size([])
         self._X_min = eps
@@ -913,105 +778,6 @@ def _untransform(self, X: Tensor) -> Tensor:
         return X_tf
 
 
-class AppendFeatures(InputTransform, Module):
-    r"""A transform that appends the input with a given set of features.
-
-    As an example, this can be used with `RiskMeasureMCObjective` to optimize risk
-    measures as described in [Cakmak2020risk]_. A tutorial notebook implementing the
-    rhoKG acqusition function introduced in [Cakmak2020risk]_ can be found at
-    https://botorch.org/tutorials/risk_averse_bo_with_environmental_variables.
-
-    The steps for using this to obtain samples of a risk measure are as follows:
-
-    -   Train a model on `(x, w)` inputs and the corresponding observations;
-
-    -   Pass in an instance of `AppendFeatures` with the `feature_set` denoting the
-        samples of `W` as the `input_transform` to the trained model;
-
-    -   Call `posterior(...).rsample(...)` on the model with `x` inputs only to
-        get the joint posterior samples over `(x, w)`s, where the `w`s come
-        from the `feature_set`;
-
-    -   Pass these posterior samples through the `RiskMeasureMCObjective` of choice to
-        get the samples of the risk measure.
-
-    Note: The samples of the risk measure obtained this way are in general biased
-    since the `feature_set` does not fully represent the distribution of the
-    environmental variable.
-
-    Example:
-        >>> # We consider 1D `x` and 1D `w`, with `W` having a
-        >>> # uniform distribution over [0, 1]
-        >>> model = SingleTaskGP(
-        ...     train_X=torch.rand(10, 2),
-        ...     train_Y=torch.randn(10, 1),
-        ...     input_transform=AppendFeatures(feature_set=torch.rand(10, 1))
-        ... )
-        >>> mll = ExactMarginalLogLikelihood(model.likelihood, model)
-        >>> fit_gpytorch_model(mll)
-        >>> test_x = torch.rand(3, 1)
-        >>> # `posterior_samples` is a `10 x 30 x 1`-dim tensor
-        >>> posterior_samples = model.posterior(test_x).rsamples(torch.size([10]))
-        >>> risk_measure = VaR(alpha=0.8, n_w=10)
-        >>> # `risk_measure_samples` is a `10 x 3`-dim tensor of samples of the
-        >>> # risk measure VaR
-        >>> risk_measure_samples = risk_measure(posterior_samples)
-    """
-
-    def __init__(
-        self,
-        feature_set: Tensor,
-        transform_on_train: bool = False,
-        transform_on_eval: bool = True,
-        transform_on_fantasize: bool = False,
-    ) -> None:
-        r"""Append `feature_set` to each input.
-
-        Args:
-            feature_set: An `n_f x d_f`-dim tensor denoting the features to be
-                appended to the inputs.
-            transform_on_train: A boolean indicating whether to apply the
-                transforms in train() mode. Default: False.
-            transform_on_eval: A boolean indicating whether to apply the
-                transform in eval() mode. Default: True.
-            transform_on_fantasize: A boolean indicating whether to apply the
-                transform when called from within a `fantasize` call. Default: False.
-        """
-        super().__init__()
-        if feature_set.dim() != 2:
-            raise ValueError("`feature_set` must be an `n_f x d_f`-dim tensor!")
-        self.register_buffer("feature_set", feature_set)
-        self.transform_on_train = transform_on_train
-        self.transform_on_eval = transform_on_eval
-        self.transform_on_fantasize = transform_on_fantasize
-
-    def transform(self, X: Tensor) -> Tensor:
-        r"""Transform the inputs by appending `feature_set` to each input.
-
-        For each `1 x d`-dim element in the input tensor, this will produce
-        an `n_f x (d + d_f)`-dim tensor with `feature_set` appended as the last `d_f`
-        dimensions. For a generic `batch_shape x q x d`-dim `X`, this translates to a
-        `batch_shape x (q * n_f) x (d + d_f)`-dim output, where the values corresponding
-        to `X[..., i, :]` are found in `output[..., i * n_f: (i + 1) * n_f, :]`.
-
-        Note: Adding the `feature_set` on the `q-batch` dimension is necessary to avoid
-        introducing additional bias by evaluating the inputs on independent GP
-        sample paths.
-
-        Args:
-            X: A `batch_shape x q x d`-dim tensor of inputs.
-
-        Returns:
-            A `batch_shape x (q * n_f) x (d + d_f)`-dim tensor of appended inputs.
-        """
-        expanded_X = X.unsqueeze(dim=-2).expand(
-            *X.shape[:-1], self.feature_set.shape[0], -1
-        )
-        expanded_features = self.feature_set.expand(*expanded_X.shape[:-1], -1)
-        appended_X = torch.cat([expanded_X, expanded_features], dim=-1)
-        return appended_X.view(*X.shape[:-2], -1, appended_X.shape[-1])
-
-
 class FilterFeatures(InputTransform, Module):
 
     r"""A transform that filters the input with a given set of features indices.
@@ -1025,21 +791,12 @@ class FilterFeatures(InputTransform, Module):
     def __init__(
         self,
         feature_indices: Tensor,
-        transform_on_train: bool = True,
-        transform_on_eval: bool = True,
-        transform_on_fantasize: bool = True,
     ) -> None:
         r"""Filter features from a model.
 
         Args:
             feature_set: An one-dim tensor denoting the indices of the features to be
                 kept and fed to the model.
-            transform_on_train: A boolean indicating whether to apply the
-                transforms in train() mode. Default: True.
-            transform_on_eval: A boolean indicating whether to apply the
-                transform in eval() mode. Default: True.
-            transform_on_fantasize: A boolean indicating whether to apply the
-                transform when called from within a `fantasize` call. Default: True.
         """
         super().__init__()
         if feature_indices.dim() != 1:
@@ -1052,9 +809,6 @@ def __init__(
             )
         if len(feature_indices.unique()) != len(feature_indices):
             raise ValueError("Elements of `feature_indices` tensor must be unique!")
-        self.transform_on_train = transform_on_train
-        self.transform_on_eval = transform_on_eval
-        self.transform_on_fantasize = transform_on_fantasize
         self.register_buffer("feature_indices", feature_indices)
 
     def transform(self, X: Tensor) -> Tensor:
@@ -1062,10 +816,10 @@ def transform(self, X: Tensor) -> Tensor:
         feature indices and filtering out the others.
 
         Args:
-            X: A `batch_shape x q x d`-dim tensor of inputs.
+            X: A `batch_shape x n x d`-dim tensor of inputs.
 
         Returns:
-            A `batch_shape x q x e`-dim tensor of filtered inputs,
+            A `batch_shape x n x e`-dim tensor of filtered inputs,
                 where `e` is the length of `feature_indices`.
         """
         return X[..., self.feature_indices]
@@ -1082,107 +836,3 @@ def equals(self, other: InputTransform) -> bool:
         if len(self.feature_indices) != len(other.feature_indices):
             return False
         return super().equals(other=other)
-
-
-class InputPerturbation(InputTransform, Module):
-    r"""A transform that adds the set of perturbations to the given input.
-
-    Similar to `AppendFeatures`, this can be used with `RiskMeasureMCObjective`
-    to optimize risk measures. See `AppendFeatures` for additional discussion
-    on optimizing risk measures.
-
-    A tutorial notebook using this with `qNoisyExpectedImprovement` can be found at
-    https://botorch.org/tutorials/risk_averse_bo_with_input_perturbations.
-    """
-
-    def __init__(
-        self,
-        perturbation_set: Union[Tensor, Callable[[Tensor], Tensor]],
-        bounds: Optional[Tensor] = None,
-        multiplicative: bool = False,
-        transform_on_train: bool = False,
-        transform_on_eval: bool = True,
-        transform_on_fantasize: bool = False,
-    ) -> None:
-        r"""Add `perturbation_set` to each input.
-
-        Args:
-            perturbation_set: An `n_p x d`-dim tensor denoting the perturbations
-                to be added to the inputs. Alternatively, this can be a callable that
-                returns `batch x n_p x d`-dim tensor of perturbations for input of
-                shape `batch x d`. This is useful for heteroscedastic perturbations.
-            bounds: A `2 x d`-dim tensor of lower and upper bounds for each
-                column of the input. If given, the perturbed inputs will be
-                clamped to these bounds.
-            multiplicative: A boolean indicating whether the input perturbations
-                are additive or multiplicative. If True, inputs will be multiplied
-                with the perturbations.
-            transform_on_train: A boolean indicating whether to apply the
-                transforms in train() mode. Default: False.
-            transform_on_eval: A boolean indicating whether to apply the
-                transform in eval() mode. Default: True.
-            transform_on_fantasize: A boolean indicating whether to apply the
-                transform when called from within a `fantasize` call. Default: False.
-        """
-        super().__init__()
-        if isinstance(perturbation_set, Tensor):
-            if perturbation_set.dim() != 2:
-                raise ValueError("`perturbation_set` must be an `n_p x d`-dim tensor!")
-            self.register_buffer("perturbation_set", perturbation_set)
-        else:
-            self.perturbation_set = perturbation_set
-        if bounds is not None:
-            if (
-                isinstance(perturbation_set, Tensor)
-                and bounds.shape[-1] != perturbation_set.shape[-1]
-            ):
-                raise ValueError(
-                    "`bounds` must have the same number of columns (last dimension) as "
-                    f"the `perturbation_set`! Got {bounds.shape[-1]} and "
-                    f"{perturbation_set.shape[-1]}."
-                )
-            self.register_buffer("bounds", bounds)
-        else:
-            self.bounds = None
-        self.multiplicative = multiplicative
-        self.transform_on_train = transform_on_train
-        self.transform_on_eval = transform_on_eval
-        self.transform_on_fantasize = transform_on_fantasize
-
-    def transform(self, X: Tensor) -> Tensor:
-        r"""Transform the inputs by adding `perturbation_set` to each input.
-
-        For each `1 x d`-dim element in the input tensor, this will produce
-        an `n_p x d`-dim tensor with the `perturbation_set` added to the input.
-        For a generic `batch_shape x q x d`-dim `X`, this translates to a
-        `batch_shape x (q * n_p) x d`-dim output, where the values corresponding
-        to `X[..., i, :]` are found in `output[..., i * n_w: (i + 1) * n_w, :]`.
-
-        Note: Adding the `perturbation_set` on the `q-batch` dimension is necessary
-        to avoid introducing additional bias by evaluating the inputs on independent
-        GP sample paths.
-
-        Args:
-            X: A `batch_shape x q x d`-dim tensor of inputs.
-
-        Returns:
-            A `batch_shape x (q * n_p) x d`-dim tensor of perturbed inputs.
-        """
-        if isinstance(self.perturbation_set, Tensor):
-            perturbations = self.perturbation_set
-        else:
-            perturbations = self.perturbation_set(X)
-        expanded_X = X.unsqueeze(dim=-2).expand(
-            *X.shape[:-1], perturbations.shape[-2], -1
-        )
-        expanded_perturbations = perturbations.expand(*expanded_X.shape[:-1], -1)
-        if self.multiplicative:
-            perturbed_inputs = expanded_X * expanded_perturbations
-        else:
-            perturbed_inputs = expanded_X + expanded_perturbations
-        perturbed_inputs = perturbed_inputs.reshape(*X.shape[:-2], -1, X.shape[-1])
-        if self.bounds is not None:
-            perturbed_inputs = torch.maximum(
-                torch.minimum(perturbed_inputs, self.bounds[1]), self.bounds[0]
-            )
-        return perturbed_inputs
diff --git a/botorch/models/transforms/input_augmentation.py b/botorch/models/transforms/input_augmentation.py
new file mode 100644
index 0000000000..67d2a7f0d5
--- /dev/null
+++ b/botorch/models/transforms/input_augmentation.py
@@ -0,0 +1,239 @@
+#!/usr/bin/env python3
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+#
+# This source code is licensed under the MIT license found in the
+# LICENSE file in the root directory of this source tree.
+
+r"""
+Input Augmentation Transformations.
+
+These classes implement a variety of transformations for
+input parameters that are applied only to the test inputs
+at the `posterior` call.
+"""
+
+from __future__ import annotations
+
+from abc import ABC, abstractmethod
+from typing import Optional, Callable, Union
+
+import torch
+from torch import Tensor
+from torch.nn import Module
+
+
+class InputAugmentationTransform(Module, ABC):
+    r"""Abstract base class for input augmentation transforms."""
+
+    @abstractmethod
+    def forward(self, X: Tensor) -> Tensor:
+        r"""Transform the inputs to a model.
+
+        Args:
+            X: A `batch_shape x q x d`-dim tensor of inputs.
+
+        Returns:
+            A `batch_shape x q' x d'`-dim tensor of transformed inputs, where `q'`
+                is generally an integer multiple of `q` and `d' >= d`, both determined
+                by the transform arguments.
+        """
+        pass  # pragma: no cover
+
+    def equals(self, other: InputAugmentationTransform) -> bool:
+        r"""Check if another input augmentation transform is equivalent.
+
+        Note: The reason that a custom equals method is defined rather than
+        defining an __eq__ method is because defining an __eq__ method sets
+        the __hash__ method to None. Hashing modules is currently used in
+        pytorch. See https://github.com/pytorch/pytorch/issues/7733.
+
+        Args:
+            other: Another input augmentation transform.
+
+        Returns:
+            A boolean indicating if the other transform is equivalent.
+        """
+        other_state_dict = other.state_dict()
+        return type(self) == type(other) and all(
+            torch.allclose(v, other_state_dict[k].to(v))
+            for k, v in self.state_dict().items()
+        )
+
+
+class AppendFeatures(InputAugmentationTransform):
+    r"""A transform that appends the input with a given set of features.
+
+    As an example, this can be used with `RiskMeasureMCObjective` to optimize risk
+    measures as described in [Cakmak2020risk]_. A tutorial notebook implementing the
+    rhoKG acquisition function introduced in [Cakmak2020risk]_ can be found at
+    https://botorch.org/tutorials/risk_averse_bo_with_environmental_variables.
+
+    The steps for using this to obtain samples of a risk measure are as follows:
+
+    -   Train a model on `(x, w)` inputs and the corresponding observations;
+
+    -   Pass in an instance of `AppendFeatures` with the `feature_set` denoting the
+        samples of `W` as the `input_augmentation_transform` to the trained model;
+
+    -   Call `posterior(...).rsample(...)` on the model with `x` inputs only to
+        get the joint posterior samples over `(x, w)`s, where the `w`s come
+        from the `feature_set`;
+
+    -   Pass these posterior samples through the `RiskMeasureMCObjective` of choice to
+        get the samples of the risk measure.
+
+    Note: The samples of the risk measure obtained this way are in general biased
+    since the `feature_set` does not fully represent the distribution of the
+    environmental variable.
+
+    Example:
+        >>> # We consider 1D `x` and 1D `w`, with `W` having a
+        >>> # uniform distribution over [0, 1]
+        >>> model = SingleTaskGP(
+        ...     train_X=torch.rand(10, 2),
+        ...     train_Y=torch.randn(10, 1),
+        ...     input_augmentation_transform=AppendFeatures(feature_set=torch.rand(10, 1))
+        ... )
+        >>> mll = ExactMarginalLogLikelihood(model.likelihood, model)
+        >>> fit_gpytorch_model(mll)
+        >>> test_x = torch.rand(3, 1)
+        >>> # `posterior_samples` is a `10 x 30 x 1`-dim tensor
+        >>> posterior_samples = model.posterior(test_x).rsample(torch.Size([10]))
+        >>> risk_measure = VaR(alpha=0.8, n_w=10)
+        >>> # `risk_measure_samples` is a `10 x 3`-dim tensor of samples of the
+        >>> # risk measure VaR
+        >>> risk_measure_samples = risk_measure(posterior_samples)
+    """
+
+    def __init__(
+        self,
+        feature_set: Tensor,
+    ) -> None:
+        r"""Append `feature_set` to each input.
+
+        Args:
+            feature_set: An `n_f x d_f`-dim tensor denoting the features to be
+                appended to the inputs.
+        """
+        super().__init__()
+        if feature_set.dim() != 2:
+            raise ValueError("`feature_set` must be an `n_f x d_f`-dim tensor!")
+        self.register_buffer("feature_set", feature_set)
+
+    def forward(self, X: Tensor) -> Tensor:
+        r"""Transform the inputs by appending `feature_set` to each input.
+
+        For each `1 x d`-dim element in the input tensor, this will produce
+        an `n_f x (d + d_f)`-dim tensor with `feature_set` appended as the last `d_f`
+        dimensions. For a generic `batch_shape x q x d`-dim `X`, this translates to a
+        `batch_shape x (q * n_f) x (d + d_f)`-dim output, where the values corresponding
+        to `X[..., i, :]` are found in `output[..., i * n_f: (i + 1) * n_f, :]`.
+
+        Note: Adding the `feature_set` on the `q-batch` dimension is necessary to avoid
+        introducing additional bias by evaluating the inputs on independent GP
+        sample paths.
+
+        Args:
+            X: A `batch_shape x q x d`-dim tensor of inputs.
+
+        Returns:
+            A `batch_shape x (q * n_f) x (d + d_f)`-dim tensor of appended inputs.
+        """
+        expanded_X = X.unsqueeze(dim=-2).expand(
+            *X.shape[:-1], self.feature_set.shape[0], -1
+        )
+        expanded_features = self.feature_set.expand(*expanded_X.shape[:-1], -1)
+        appended_X = torch.cat([expanded_X, expanded_features], dim=-1)
+        return appended_X.view(*X.shape[:-2], -1, appended_X.shape[-1])
+
+
+class InputPerturbation(InputAugmentationTransform):
+    r"""A transform that adds the set of perturbations to the given input.
+
+    Similar to `AppendFeatures`, this can be used with `RiskMeasureMCObjective`
+    to optimize risk measures. See `AppendFeatures` for additional discussion
+    on optimizing risk measures.
+
+    A tutorial notebook using this with `qNoisyExpectedImprovement` can be found at
+    https://botorch.org/tutorials/risk_averse_bo_with_input_perturbations.
+    """
+
+    def __init__(
+        self,
+        perturbation_set: Union[Tensor, Callable[[Tensor], Tensor]],
+        bounds: Optional[Tensor] = None,
+        multiplicative: bool = False,
+    ) -> None:
+        r"""Add `perturbation_set` to each input.
+
+        Args:
+            perturbation_set: An `n_p x d`-dim tensor denoting the perturbations
+                to be added to the inputs. Alternatively, this can be a callable that
+                returns `batch x n_p x d`-dim tensor of perturbations for input of
+                shape `batch x d`. This is useful for heteroscedastic perturbations.
+            bounds: A `2 x d`-dim tensor of lower and upper bounds for each
+                column of the input. If given, the perturbed inputs will be
+                clamped to these bounds.
+            multiplicative: A boolean indicating whether the input perturbations
+                are additive or multiplicative. If True, inputs will be multiplied
+                with the perturbations.
+        """
+        super().__init__()
+        if isinstance(perturbation_set, Tensor):
+            if perturbation_set.dim() != 2:
+                raise ValueError("`perturbation_set` must be an `n_p x d`-dim tensor!")
+            self.register_buffer("perturbation_set", perturbation_set)
+        else:
+            self.perturbation_set = perturbation_set
+        if bounds is not None:
+            if (
+                isinstance(perturbation_set, Tensor)
+                and bounds.shape[-1] != perturbation_set.shape[-1]
+            ):
+                raise ValueError(
+                    "`bounds` must have the same number of columns (last dimension) as "
+                    f"the `perturbation_set`! Got {bounds.shape[-1]} and "
+                    f"{perturbation_set.shape[-1]}."
+                )
+            self.register_buffer("bounds", bounds)
+        else:
+            self.bounds = None
+        self.multiplicative = multiplicative
+
+    def forward(self, X: Tensor) -> Tensor:
+        r"""Transform the inputs by adding `perturbation_set` to each input.
+
+        For each `1 x d`-dim element in the input tensor, this will produce
+        an `n_p x d`-dim tensor with the `perturbation_set` added to the input.
+        For a generic `batch_shape x q x d`-dim `X`, this translates to a
+        `batch_shape x (q * n_p) x d`-dim output, where the values corresponding
+        to `X[..., i, :]` are found in `output[..., i * n_p: (i + 1) * n_p, :]`.
+
+        Note: Adding the `perturbation_set` on the `q-batch` dimension is necessary
+        to avoid introducing additional bias by evaluating the inputs on independent
+        GP sample paths.
+
+        Args:
+            X: A `batch_shape x q x d`-dim tensor of inputs.
+
+        Returns:
+            A `batch_shape x (q * n_p) x d`-dim tensor of perturbed inputs.
+        """
+        if isinstance(self.perturbation_set, Tensor):
+            perturbations = self.perturbation_set
+        else:
+            perturbations = self.perturbation_set(X)
+        expanded_X = X.unsqueeze(dim=-2).expand(
+            *X.shape[:-1], perturbations.shape[-2], -1
+        )
+        expanded_perturbations = perturbations.expand(*expanded_X.shape[:-1], -1)
+        if self.multiplicative:
+            perturbed_inputs = expanded_X * expanded_perturbations
+        else:
+            perturbed_inputs = expanded_X + expanded_perturbations
+        perturbed_inputs = perturbed_inputs.reshape(*X.shape[:-2], -1, X.shape[-1])
+        if self.bounds is not None:
+            perturbed_inputs = torch.maximum(
+                torch.minimum(perturbed_inputs, self.bounds[1]), self.bounds[0]
+            )
+        return perturbed_inputs
diff --git a/botorch/models/utils.py b/botorch/models/utils.py
index 42b10d0fda..ff4ad69760 100644
--- a/botorch/models/utils.py
+++ b/botorch/models/utils.py
@@ -17,7 +17,6 @@
 import torch
 from botorch import settings
 from botorch.exceptions import InputDataError, InputDataWarning
-from botorch.settings import _Flag
 from gpytorch import settings as gpt_settings
 from gpytorch.module import Module
 from gpytorch.utils.broadcasting import _mul_broadcast_shape
@@ -281,8 +280,3 @@ def gpt_posterior_settings():
             gpt_settings.detach_test_caches(settings.propagate_grads.off())
         )
         yield
-
-
-class fantasize(_Flag):
-    r"""A flag denoting whether we are currently in a `fantasize` context."""
-    _state: bool = False