From c292356566725bff603287662fa365a67a43eba3 Mon Sep 17 00:00:00 2001
From: Sait Cakmak <saitcakmak@fb.com>
Date: Mon, 14 Oct 2024 10:34:27 -0700
Subject: [PATCH] Update `initialize_q_batch` methods to return both candidates
 and the corresponding acquisition values (#2571)

Summary:
As titled. This avoids the need to re-compute the acquisition values after sub-selecting in cases where they are needed.


Differential Revision: D64333367

Pulled By: saitcakmak
---
 botorch/acquisition/multi_step_lookahead.py |   4 +-
 botorch/optim/initializers.py               | 100 ++--
 test/optim/test_initializers.py             |  82 +--
 tutorials/optimize_stochastic.ipynb         | 601 +++++++++++---------
 4 files changed, 421 insertions(+), 366 deletions(-)

diff --git a/botorch/acquisition/multi_step_lookahead.py b/botorch/acquisition/multi_step_lookahead.py
index 79208b8596..808b767cc9 100644
--- a/botorch/acquisition/multi_step_lookahead.py
+++ b/botorch/acquisition/multi_step_lookahead.py
@@ -656,8 +656,8 @@ def mixin_tree(T: Tensor, bounds: Tensor, alpha: float) -> Tensor:
     )
 
     with torch.no_grad():
-        Y_full = acq_function(X_full)
-    X_init = initialize_q_batch(X=X_full, Y=Y_full, n=num_restarts, eta=1.0)
+        acq_vals = acq_function(X_full)
+    X_init, _ = initialize_q_batch(X=X_full, acq_vals=acq_vals, n=num_restarts, eta=1.0)
     return X_init[:raw_samples]
 
 
diff --git a/botorch/optim/initializers.py b/botorch/optim/initializers.py
index e4431c3189..df0c85acf1 100644
--- a/botorch/optim/initializers.py
+++ b/botorch/optim/initializers.py
@@ -393,7 +393,8 @@ def gen_batch_initial_conditions(
                         ],
                         dim=0,
                     )
-            X_rnd = fix_features(X_rnd, fixed_features=fixed_features)
+            # Keep X on CPU for consistency & to limit GPU memory usage.
+            X_rnd = fix_features(X_rnd, fixed_features=fixed_features).cpu()
             if fixed_X_fantasies is not None:
                 if (d_f := fixed_X_fantasies.shape[-1]) != (d_r := X_rnd.shape[-1]):
                     raise BotorchTensorDimensionError(
@@ -415,16 +416,17 @@ def gen_batch_initial_conditions(
                     batch_limit = X_rnd.shape[0]
                 # Evaluate the acquisition function on `X_rnd` using `batch_limit`
                 # sized chunks.
-                Y_rnd = torch.cat(
+                acq_vals = torch.cat(
                     [
                         acq_function(x_.to(device=device)).cpu()
                         for x_ in X_rnd.split(split_size=batch_limit, dim=0)
                     ],
                     dim=0,
                 )
-            batch_initial_conditions = init_func(
-                X=X_rnd, Y=Y_rnd, n=num_restarts, **init_kwargs
-            ).to(device=device)
+            batch_initial_conditions, _ = init_func(
+                X=X_rnd, acq_vals=acq_vals, n=num_restarts, **init_kwargs
+            )
+            batch_initial_conditions = batch_initial_conditions.to(device=device)
             if not any(issubclass(w.category, BadInitialCandidatesWarning) for w in ws):
                 return batch_initial_conditions
             if factor < max_factor:
@@ -884,20 +886,24 @@ def gen_value_function_initial_conditions(
 
     # evaluate the raw samples
     with torch.no_grad():
-        Y_rnd = acq_function(X_rnd)
+        acq_vals = acq_function(X_rnd)
 
     # select the restart points using the heuristic
-    return initialize_q_batch(
-        X=X_rnd, Y=Y_rnd, n=num_restarts, eta=options.get("eta", 2.0)
+    X_init, _ = initialize_q_batch(
+        X=X_rnd, acq_vals=acq_vals, n=num_restarts, eta=options.get("eta", 2.0)
     )
+    return X_init
 
 
-def initialize_q_batch(X: Tensor, Y: Tensor, n: int, eta: float = 1.0) -> Tensor:
+def initialize_q_batch(
+    X: Tensor, acq_vals: Tensor, n: int, eta: float = 1.0
+) -> tuple[Tensor, Tensor]:
     r"""Heuristic for selecting initial conditions for candidate generation.
 
     This heuristic selects points from `X` (without replacement) with probability
-    proportional to `exp(eta * Z)`, where `Z = (Y - mean(Y)) / std(Y)` and `eta`
-    is a temperature parameter.
+    proportional to `exp(eta * Z)`, where
+    `Z = (acq_vals - mean(acq_vals)) / std(acq_vals)`
+    and `eta` is a temperature parameter.
 
     When using an acquisiton function that is non-negative and possibly zero
     over large areas of the feature space (e.g. qEI), you should use
@@ -907,22 +913,23 @@ def initialize_q_batch(X: Tensor, Y: Tensor, n: int, eta: float = 1.0) -> Tensor
         X: A `b x batch_shape x q x d` tensor of `b` - `batch_shape` samples of
             `q`-batches from a d`-dim feature space. Typically, these are generated
             using qMC sampling.
-        Y: A tensor of `b x batch_shape` outcomes associated with the samples.
+        acq_vals: A tensor of `b x batch_shape` outcomes associated with the samples.
             Typically, this is the value of the batch acquisition function to be
             maximized.
         n: The number of initial condition to be generated. Must be less than `b`.
         eta: Temperature parameter for weighting samples.
 
     Returns:
-        A `n x batch_shape x q x d` tensor of `n` - `batch_shape` `q`-batch initial
-        conditions, where each batch of `n x q x d` samples is selected independently.
+        - An `n x batch_shape x q x d` tensor of `n` - `batch_shape` `q`-batch initial
+          conditions, where each batch of `n x q x d` samples is selected independently.
+        - An `n x batch_shape` tensor of the corresponding acquisition values.
 
     Example:
         >>> # To get `n=10` starting points of q-batch size `q=3`
         >>> # for model with `d=6`:
         >>> qUCB = qUpperConfidenceBound(model, beta=0.1)
-        >>> Xrnd = torch.rand(500, 3, 6)
-        >>> Xinit = initialize_q_batch(Xrnd, qUCB(Xrnd), 10)
+        >>> X_rnd = torch.rand(500, 3, 6)
+        >>> X_init, acq_init = initialize_q_batch(X=X_rnd, acq_vals=qUCB(X_rnd), n=10)
     """
     n_samples = X.shape[0]
     batch_shape = X.shape[1:-2] or torch.Size()
@@ -932,9 +939,9 @@ def initialize_q_batch(X: Tensor, Y: Tensor, n: int, eta: float = 1.0) -> Tensor
             f"provided samples ({n_samples})"
         )
     elif n == n_samples:
-        return X
+        return X, acq_vals
 
-    Ystd = Y.std(dim=0)
+    Ystd = acq_vals.std(dim=0)
     if torch.any(Ystd == 0):
         warnings.warn(
             "All acquisition values for raw samples points are the same for "
@@ -942,10 +949,11 @@ def initialize_q_batch(X: Tensor, Y: Tensor, n: int, eta: float = 1.0) -> Tensor
             BadInitialCandidatesWarning,
             stacklevel=3,
         )
-        return X[torch.randperm(n=n_samples, device=X.device)][:n]
+        idcs = torch.randperm(n=n_samples, device=X.device)[:n]
+        return X[idcs], acq_vals[idcs]
 
-    max_val, max_idx = torch.max(Y, dim=0)
-    Z = (Y - Y.mean(dim=0)) / Ystd
+    max_val, max_idx = torch.max(acq_vals, dim=0)
+    Z = (acq_vals - acq_vals.mean(dim=0)) / Ystd
     etaZ = eta * Z
     weights = torch.exp(etaZ)
     while torch.isinf(weights).any():
@@ -961,28 +969,30 @@ def initialize_q_batch(X: Tensor, Y: Tensor, n: int, eta: float = 1.0) -> Tensor
     if max_idx not in idcs:
         idcs[-1] = max_idx
     if batch_shape == torch.Size():
-        return X[idcs]
+        return X[idcs], acq_vals[idcs]
     else:
-        return X.gather(
+        X_select = X.gather(
             dim=0, index=idcs.view(*idcs.shape, 1, 1).expand(n, *X.shape[1:])
         )
+        acq_select = acq_vals.gather(dim=0, index=idcs)
+        return X_select, acq_select
 
 
 def initialize_q_batch_nonneg(
-    X: Tensor, Y: Tensor, n: int, eta: float = 1.0, alpha: float = 1e-4
-) -> Tensor:
+    X: Tensor, acq_vals: Tensor, n: int, eta: float = 1.0, alpha: float = 1e-4
+) -> tuple[Tensor, Tensor]:
     r"""Heuristic for selecting initial conditions for non-neg. acquisition functions.
 
     This function is similar to `initialize_q_batch`, but designed specifically
     for acquisition functions that are non-negative and possibly zero over
     large areas of the feature space (e.g. qEI). All samples for which
-    `Y < alpha * max(Y)` will be ignored (assuming that `Y` contains at least
-    one positive value).
+    `acq_vals < alpha * max(acq_vals)` will be ignored (assuming that `acq_vals`
+    contains at least one positive value).
 
     Args:
         X: A `b x q x d` tensor of `b` samples of `q`-batches from a `d`-dim.
             feature space. Typically, these are generated using qMC.
-        Y: A tensor of `b` outcomes associated with the samples. Typically, this
+        acq_vals: A tensor of `b` outcomes associated with the samples. Typically, this
             is the value of the batch acquisition function to be maximized.
         n: The number of initial condition to be generated. Must be less than `b`.
         eta: Temperature parameter for weighting samples.
@@ -991,22 +1001,25 @@ def initialize_q_batch_nonneg(
             `Y < alpha * max(Y)` will be ignored.
 
     Returns:
-        A `n x q x d` tensor of `n` `q`-batch initial conditions.
+        - An `n x q x d` tensor of `n` `q`-batch initial conditions.
+        - An `n` tensor of the corresponding acquisition values.
 
     Example:
         >>> # To get `n=10` starting points of q-batch size `q=3`
         >>> # for model with `d=6`:
         >>> qEI = qExpectedImprovement(model, best_f=0.2)
-        >>> Xrnd = torch.rand(500, 3, 6)
-        >>> Xinit = initialize_q_batch(Xrnd, qEI(Xrnd), 10)
+        >>> X_rnd = torch.rand(500, 3, 6)
+        >>> X_init, acq_init = initialize_q_batch_nonneg(
+        ...     X=X_rnd, acq_vals=qEI(X_rnd), n=10
+        ... )
     """
     n_samples = X.shape[0]
     if n > n_samples:
         raise RuntimeError("n cannot be larger than the number of provided samples")
     elif n == n_samples:
-        return X
+        return X, acq_vals
 
-    max_val, max_idx = torch.max(Y, dim=0)
+    max_val, max_idx = torch.max(acq_vals, dim=0)
     if torch.any(max_val <= 0):
         warnings.warn(
             "All acquisition values for raw sampled points are nonpositive, so "
@@ -1014,31 +1027,34 @@ def initialize_q_batch_nonneg(
             BadInitialCandidatesWarning,
             stacklevel=3,
         )
-        return X[torch.randperm(n=n_samples, device=X.device)][:n]
+        idcs = torch.randperm(n=n_samples, device=X.device)[:n]
+        return X[idcs], acq_vals[idcs]
 
     # make sure there are at least `n` points with positive acquisition values
-    pos = Y > 0
+    pos = acq_vals > 0
     num_pos = pos.sum().item()
     if num_pos < n:
         # select all positive points and then fill remaining quota with randomly
         # selected points
         remaining_indices = (~pos).nonzero(as_tuple=False).view(-1)
-        rand_indices = torch.randperm(remaining_indices.shape[0], device=Y.device)
+        rand_indices = torch.randperm(
+            remaining_indices.shape[0], device=acq_vals.device
+        )
         sampled_remaining_indices = remaining_indices[rand_indices[: n - num_pos]]
         pos[sampled_remaining_indices] = 1
-        return X[pos]
+        return X[pos], acq_vals[pos]
     # select points within alpha of max_val, iteratively decreasing alpha by a
     # factor of 10 as necessary
-    alpha_pos = Y >= alpha * max_val
+    alpha_pos = acq_vals >= alpha * max_val
     while alpha_pos.sum() < n:
         alpha = 0.1 * alpha
-        alpha_pos = Y >= alpha * max_val
-    alpha_pos_idcs = torch.arange(len(Y), device=Y.device)[alpha_pos]
-    weights = torch.exp(eta * (Y[alpha_pos] / max_val - 1))
+        alpha_pos = acq_vals >= alpha * max_val
+    alpha_pos_idcs = torch.arange(len(acq_vals), device=acq_vals.device)[alpha_pos]
+    weights = torch.exp(eta * (acq_vals[alpha_pos] / max_val - 1))
     idcs = alpha_pos_idcs[torch.multinomial(weights, n)]
     if max_idx not in idcs:
         idcs[-1] = max_idx
-    return X[idcs]
+    return X[idcs], acq_vals[idcs]
 
 
 def sample_points_around_best(
diff --git a/test/optim/test_initializers.py b/test/optim/test_initializers.py
index 925e9cec7a..78793331ad 100644
--- a/test/optim/test_initializers.py
+++ b/test/optim/test_initializers.py
@@ -89,40 +89,42 @@ def test_initialize_q_batch_nonneg(self):
         for dtype in (torch.float, torch.double):
             # basic test
             X = torch.rand(5, 3, 4, device=self.device, dtype=dtype)
-            Y = torch.rand(5, device=self.device, dtype=dtype)
-            ics = initialize_q_batch_nonneg(X=X, Y=Y, n=2)
-            self.assertEqual(ics.shape, torch.Size([2, 3, 4]))
-            self.assertEqual(ics.device, X.device)
-            self.assertEqual(ics.dtype, X.dtype)
+            acq_vals = torch.rand(5, device=self.device, dtype=dtype)
+            ics_X, ics_acq_vals = initialize_q_batch_nonneg(X=X, acq_vals=acq_vals, n=2)
+            self.assertEqual(ics_X.shape, torch.Size([2, 3, 4]))
+            self.assertEqual(ics_X.device, X.device)
+            self.assertEqual(ics_X.dtype, X.dtype)
+            self.assertEqual(ics_acq_vals.shape, torch.Size([2]))
+            self.assertEqual(ics_acq_vals.device, acq_vals.device)
+            self.assertEqual(ics_acq_vals.dtype, acq_vals.dtype)
             # ensure nothing happens if we want all samples
-            ics = initialize_q_batch_nonneg(X=X, Y=Y, n=5)
-            self.assertTrue(torch.equal(X, ics))
+            ics_X, ics_acq_vals = initialize_q_batch_nonneg(X=X, acq_vals=acq_vals, n=5)
+            self.assertTrue(torch.equal(X, ics_X))
+            self.assertTrue(torch.equal(acq_vals, ics_acq_vals))
             # make sure things work with constant inputs
-            Y = torch.ones(5, device=self.device, dtype=dtype)
-            ics = initialize_q_batch_nonneg(X=X, Y=Y, n=2)
+            acq_vals = torch.ones(5, device=self.device, dtype=dtype)
+            ics, _ = initialize_q_batch_nonneg(X=X, acq_vals=acq_vals, n=2)
             self.assertEqual(ics.shape, torch.Size([2, 3, 4]))
             self.assertEqual(ics.device, X.device)
             self.assertEqual(ics.dtype, X.dtype)
             # ensure raises correct warning
-            Y = torch.zeros(5, device=self.device, dtype=dtype)
+            acq_vals = torch.zeros(5, device=self.device, dtype=dtype)
             with warnings.catch_warnings(record=True) as w, settings.debug(True):
-                ics = initialize_q_batch_nonneg(X=X, Y=Y, n=2)
-                self.assertEqual(len(w), 1)
-                self.assertTrue(issubclass(w[-1].category, BadInitialCandidatesWarning))
+                ics, _ = initialize_q_batch_nonneg(X=X, acq_vals=acq_vals, n=2)
+            self.assertEqual(len(w), 1)
+            self.assertTrue(issubclass(w[-1].category, BadInitialCandidatesWarning))
             self.assertEqual(ics.shape, torch.Size([2, 3, 4]))
             with self.assertRaises(RuntimeError):
-                initialize_q_batch_nonneg(X=X, Y=Y, n=10)
+                initialize_q_batch_nonneg(X=X, acq_vals=acq_vals, n=10)
             # test less than `n` positive acquisition values
-            Y = torch.arange(5, device=self.device, dtype=dtype) - 3
-            ics = initialize_q_batch_nonneg(X=X, Y=Y, n=2)
-            self.assertEqual(ics.shape, torch.Size([2, 3, 4]))
-            self.assertEqual(ics.device, X.device)
-            self.assertEqual(ics.dtype, X.dtype)
+            acq_vals = torch.arange(5, device=self.device, dtype=dtype) - 3
+            ics_X, ics_acq_vals = initialize_q_batch_nonneg(X=X, acq_vals=acq_vals, n=2)
+            self.assertEqual(ics_X.shape, torch.Size([2, 3, 4]))
             # check that we chose the point with the positive acquisition value
-            self.assertTrue(torch.equal(ics[0], X[-1]) or torch.equal(ics[1], X[-1]))
+            self.assertTrue((ics_acq_vals > 0).any())
             # test less than `n` alpha_pos values
-            Y = torch.arange(5, device=self.device, dtype=dtype)
-            ics = initialize_q_batch_nonneg(X=X, Y=Y, n=2, alpha=1.0)
+            acq_vals = torch.arange(5, device=self.device, dtype=dtype)
+            ics, _ = initialize_q_batch_nonneg(X=X, acq_vals=acq_vals, n=2, alpha=1.0)
             self.assertEqual(ics.shape, torch.Size([2, 3, 4]))
             self.assertEqual(ics.device, X.device)
             self.assertEqual(ics.dtype, X.dtype)
@@ -132,32 +134,36 @@ def test_initialize_q_batch(self):
             for batch_shape in (torch.Size(), [3, 2], (2,), torch.Size([2, 3, 4]), []):
                 # basic test
                 X = torch.rand(5, *batch_shape, 3, 4, device=self.device, dtype=dtype)
-                Y = torch.rand(5, *batch_shape, device=self.device, dtype=dtype)
-                ics = initialize_q_batch(X=X, Y=Y, n=2)
-                self.assertEqual(ics.shape, torch.Size([2, *batch_shape, 3, 4]))
-                self.assertEqual(ics.device, X.device)
-                self.assertEqual(ics.dtype, X.dtype)
+                acq_vals = torch.rand(5, *batch_shape, device=self.device, dtype=dtype)
+                ics_X, ics_acq_vals = initialize_q_batch(X=X, acq_vals=acq_vals, n=2)
+                self.assertEqual(ics_X.shape, torch.Size([2, *batch_shape, 3, 4]))
+                self.assertEqual(ics_X.device, X.device)
+                self.assertEqual(ics_X.dtype, X.dtype)
+                self.assertEqual(ics_acq_vals.shape, torch.Size([2, *batch_shape]))
+                self.assertEqual(ics_acq_vals.device, acq_vals.device)
+                self.assertEqual(ics_acq_vals.dtype, acq_vals.dtype)
                 # ensure nothing happens if we want all samples
-                ics = initialize_q_batch(X=X, Y=Y, n=5)
-                self.assertTrue(torch.equal(X, ics))
+                ics_X, ics_acq_vals = initialize_q_batch(X=X, acq_vals=acq_vals, n=5)
+                self.assertTrue(torch.equal(X, ics_X))
+                self.assertTrue(torch.equal(acq_vals, ics_acq_vals))
                 # ensure raises correct warning
-                Y = torch.zeros(5, device=self.device, dtype=dtype)
+                acq_vals = torch.zeros(5, device=self.device, dtype=dtype)
                 with warnings.catch_warnings(record=True) as w, settings.debug(True):
-                    ics = initialize_q_batch(X=X, Y=Y, n=2)
-                    self.assertEqual(len(w), 1)
-                    self.assertTrue(
-                        issubclass(w[-1].category, BadInitialCandidatesWarning)
-                    )
+                    ics, _ = initialize_q_batch(X=X, acq_vals=acq_vals, n=2)
+                self.assertEqual(len(w), 1)
+                self.assertTrue(issubclass(w[-1].category, BadInitialCandidatesWarning))
                 self.assertEqual(ics.shape, torch.Size([2, *batch_shape, 3, 4]))
                 with self.assertRaises(RuntimeError):
-                    initialize_q_batch(X=X, Y=Y, n=10)
+                    initialize_q_batch(X=X, acq_vals=acq_vals, n=10)
 
     def test_initialize_q_batch_largeZ(self):
         for dtype in (torch.float, torch.double):
             # testing large eta*Z
             X = torch.rand(5, 3, 4, device=self.device, dtype=dtype)
-            Y = torch.tensor([-1e12, 0, 0, 0, 1e12], device=self.device, dtype=dtype)
-            ics = initialize_q_batch(X=X, Y=Y, n=2, eta=100)
+            acq_vals = torch.tensor(
+                [-1e12, 0, 0, 0, 1e12], device=self.device, dtype=dtype
+            )
+            ics, _ = initialize_q_batch(X=X, acq_vals=acq_vals, n=2, eta=100)
             self.assertEqual(ics.shape[0], 2)
 
 
diff --git a/tutorials/optimize_stochastic.ipynb b/tutorials/optimize_stochastic.ipynb
index 34b50b1c3a..abf02f2958 100644
--- a/tutorials/optimize_stochastic.ipynb
+++ b/tutorials/optimize_stochastic.ipynb
@@ -1,300 +1,333 @@
 {
-  "cells": [
-    {
-      "cell_type": "markdown",
-      "metadata": {
-        "originalKey": "8cc0284e-16f0-48c1-8b74-37449dcd2fb4",
-        "showInput": false
-      },
-      "source": [
-        "## Optimize acquisition functions using torch.optim\n",
-        "\n",
-        "In this tutorial, we show how to use PyTorch's `optim` module for optimizing BoTorch MC acquisition functions. This is useful if the acquisition function is stochastic in nature (caused by re-sampling the base samples when using the reparameterization trick, or if the model posterior itself is stochastic).\n",
-        "\n",
-        "*Note:* A pre-packaged, more user-friendly version of the optimization loop we will develop below is contained in the `gen_candidates_torch` function in the `botorch.gen` module. This tutorial should be quite useful if you would like to implement custom optimizers beyond what is contained in `gen_candidates_torch`.\n",
-        "\n",
-        "As discussed in the [CMA-ES tutorial](./optimize_with_cmaes), for deterministic acquisition functions BoTorch uses quasi-second order methods (such as L-BFGS-B or SLSQP) by default, which provide superior convergence speed in this situation. "
-      ]
-    },
-    {
-      "cell_type": "markdown",
-      "metadata": {
-        "originalKey": "876ccfae-63ae-403e-85c7-7e279b6405ea",
-        "showInput": false
-      },
-      "source": [
-        "### Set up a toy model\n",
-        "\n",
-        "We'll fit a `SingleTaskGP` model on noisy observations of the function $f(x) = 1 - \\|x\\|_2$ in `d=5` dimensions on the hypercube $[-1, 1]^d$."
-      ]
-    },
-    {
-      "cell_type": "code",
-      "execution_count": 1,
-      "metadata": {
-        "collapsed": false,
-        "customOutput": null,
-        "executionStartTime": 1668651600271,
-        "executionStopTime": 1668651601948,
-        "originalKey": "ab41b75c-bd1f-45a3-a10d-760b93eaf9af",
-        "requestMsgId": "9fb7ecfc-4c8c-4e5e-9cfe-44f8f73a2d45"
-      },
-      "outputs": [
-        {
-          "name": "stderr",
-          "output_type": "stream",
-          "text": [
-            "I1116 182000.166 _utils_internal.py:179] NCCL_DEBUG env var is set to None\n"
-          ]
-        },
-        {
-          "name": "stderr",
-          "output_type": "stream",
-          "text": [
-            "I1116 182000.167 _utils_internal.py:188] NCCL_DEBUG is INFO from /etc/nccl.conf\n"
-          ]
-        }
-      ],
-      "source": [
-        "import torch\n",
-        "\n",
-        "from botorch.fit import fit_gpytorch_mll\n",
-        "from botorch.models import SingleTaskGP\n",
-        "from gpytorch.mlls import ExactMarginalLogLikelihood"
-      ]
-    },
-    {
-      "cell_type": "code",
-      "execution_count": 2,
-      "metadata": {
-        "collapsed": false,
-        "customOutput": null,
-        "executionStartTime": 1668651602257,
-        "executionStopTime": 1668651602610,
-        "originalKey": "fa81436e-7e13-4521-8542-ca813ae08884",
-        "requestMsgId": "7cae57f9-5eaf-4362-9fc6-63e8e200b63e"
-      },
-      "outputs": [],
-      "source": [
-        "d = 5\n",
-        "\n",
-        "bounds = torch.stack([-torch.ones(d), torch.ones(d)])\n",
-        "\n",
-        "train_X = bounds[0] + (bounds[1] - bounds[0]) * torch.rand(50, d)\n",
-        "train_Y = 1 - torch.linalg.norm(train_X, dim=-1, keepdim=True)\n",
-        "\n",
-        "model = SingleTaskGP(train_X, train_Y)\n",
-        "mll = ExactMarginalLogLikelihood(model.likelihood, model)\n",
-        "fit_gpytorch_mll(mll);"
-      ]
-    },
-    {
-      "cell_type": "markdown",
-      "metadata": {
-        "originalKey": "e2966b4e-bf48-4fa2-a2bd-96970b803026",
-        "showInput": false
-      },
-      "source": [
-        "### Define acquisition function\n",
-        "\n",
-        "We'll use `qExpectedImprovement` with a `StochasticSampler` that uses a small number of MC samples. This results in a stochastic acquisition function that one should not attempt to optimize with the quasi-second order methods that are used by default in BoTorch's `optimize_acqf` function."
-      ]
+ "cells": [
+  {
+   "cell_type": "markdown",
+   "metadata": {
+    "originalKey": "8cc0284e-16f0-48c1-8b74-37449dcd2fb4",
+    "showInput": false
+   },
+   "source": [
+    "## Optimize acquisition functions using torch.optim\n",
+    "\n",
+    "In this tutorial, we show how to use PyTorch's `optim` module for optimizing BoTorch MC acquisition functions. This is useful if the acquisition function is stochastic in nature (caused by re-sampling the base samples when using the reparameterization trick, or if the model posterior itself is stochastic).\n",
+    "\n",
+    "*Note:* A pre-packaged, more user-friendly version of the optimization loop we will develop below is contained in the `gen_candidates_torch` function in the `botorch.gen` module. This tutorial should be quite useful if you would like to implement custom optimizers beyond what is contained in `gen_candidates_torch`.\n",
+    "\n",
+    "As discussed in the [CMA-ES tutorial](./optimize_with_cmaes), for deterministic acquisition functions BoTorch uses quasi-second order methods (such as L-BFGS-B or SLSQP) by default, which provide superior convergence speed in this situation. "
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {
+    "originalKey": "876ccfae-63ae-403e-85c7-7e279b6405ea",
+    "showInput": false
+   },
+   "source": [
+    "### Set up a toy model\n",
+    "\n",
+    "We'll fit a `SingleTaskGP` model on noisy observations of the function $f(x) = 1 - \\|x\\|_2$ in `d=5` dimensions on the hypercube $[-1, 1]^d$."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 1,
+   "metadata": {
+    "collapsed": false,
+    "customOutput": null,
+    "executionStartTime": 1668651600271,
+    "executionStopTime": 1668651601948,
+    "jupyter": {
+     "outputs_hidden": false
     },
+    "originalKey": "ab41b75c-bd1f-45a3-a10d-760b93eaf9af",
+    "requestMsgId": "9fb7ecfc-4c8c-4e5e-9cfe-44f8f73a2d45"
+   },
+   "outputs": [
     {
-      "cell_type": "code",
-      "execution_count": 3,
-      "metadata": {
-        "collapsed": false,
-        "customOutput": null,
-        "executionStartTime": 1668651603261,
-        "executionStopTime": 1668651603264,
-        "originalKey": "bee44c18-723f-4eb3-a446-dc091b80305d",
-        "requestMsgId": "eda81892-99db-479a-9cad-a620c9c7fbf5"
-      },
-      "outputs": [],
-      "source": [
-        "from botorch.acquisition import qExpectedImprovement\n",
-        "from botorch.sampling.stochastic_samplers import StochasticSampler\n",
-        "\n",
-        "sampler = StochasticSampler(sample_shape=torch.Size([128]))\n",
-        "qEI = qExpectedImprovement(model, best_f=train_Y.max(), sampler=sampler)"
-      ]
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "I1116 182000.166 _utils_internal.py:179] NCCL_DEBUG env var is set to None\n"
+     ]
     },
     {
-      "cell_type": "markdown",
-      "metadata": {
-        "originalKey": "a952bcb8-745f-4164-8796-a85e7b9ee40c",
-        "showInput": false
-      },
-      "source": [
-        "### Optimizing the acquisition function\n",
-        "\n",
-        "We will perform optimization over `N=5` random initial `q`-batches with `q=2` in parallel. We use `N` random restarts because the acquisition function is non-convex and as a result we may get stuck in local minima."
-      ]
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "I1116 182000.167 _utils_internal.py:188] NCCL_DEBUG is INFO from /etc/nccl.conf\n"
+     ]
+    }
+   ],
+   "source": [
+    "import torch\n",
+    "\n",
+    "from botorch.fit import fit_gpytorch_mll\n",
+    "from botorch.models import SingleTaskGP\n",
+    "from gpytorch.mlls import ExactMarginalLogLikelihood"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 2,
+   "metadata": {
+    "collapsed": false,
+    "customOutput": null,
+    "executionStartTime": 1668651602257,
+    "executionStopTime": 1668651602610,
+    "jupyter": {
+     "outputs_hidden": false
     },
-    {
-      "cell_type": "code",
-      "execution_count": 4,
-      "metadata": {
-        "collapsed": false,
-        "customOutput": null,
-        "executionStartTime": 1668651603737,
-        "executionStopTime": 1668651603828,
-        "originalKey": "608b95ac-4445-41e6-ad5f-c64a77653a3e",
-        "requestMsgId": "5a4dfcc2-2643-46ce-abdf-7d170965aaee"
-      },
-      "outputs": [],
-      "source": [
-        "N = 5\n",
-        "q = 2"
-      ]
+    "originalKey": "fa81436e-7e13-4521-8542-ca813ae08884",
+    "requestMsgId": "7cae57f9-5eaf-4362-9fc6-63e8e200b63e"
+   },
+   "outputs": [],
+   "source": [
+    "d = 5\n",
+    "\n",
+    "bounds = torch.stack([-torch.ones(d), torch.ones(d)])\n",
+    "\n",
+    "train_X = bounds[0] + (bounds[1] - bounds[0]) * torch.rand(50, d)\n",
+    "train_Y = 1 - torch.linalg.norm(train_X, dim=-1, keepdim=True)\n",
+    "\n",
+    "model = SingleTaskGP(train_X, train_Y)\n",
+    "mll = ExactMarginalLogLikelihood(model.likelihood, model)\n",
+    "fit_gpytorch_mll(mll);"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {
+    "originalKey": "e2966b4e-bf48-4fa2-a2bd-96970b803026",
+    "showInput": false
+   },
+   "source": [
+    "### Define acquisition function\n",
+    "\n",
+    "We'll use `qExpectedImprovement` with a `StochasticSampler` that uses a small number of MC samples. This results in a stochastic acquisition function that one should not attempt to optimize with the quasi-second order methods that are used by default in BoTorch's `optimize_acqf` function."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 3,
+   "metadata": {
+    "collapsed": false,
+    "customOutput": null,
+    "executionStartTime": 1668651603261,
+    "executionStopTime": 1668651603264,
+    "jupyter": {
+     "outputs_hidden": false
     },
-    {
-      "cell_type": "markdown",
-      "metadata": {
-        "originalKey": "fbf016a1-1c6e-4c87-881b-be870163a306",
-        "showInput": false
-      },
-      "source": [
-        "#### Choosing initial conditions via a heuristic\n",
-        "\n",
-        "Using random initial conditions in conjunction with gradient-based optimizers can be problematic because qEI values and their corresponding gradients are often zero in large parts of the feature space. To mitigate this issue, BoTorch provides a heuristic for generating promising initial conditions (this dirty and not-so-little secret of Bayesian Optimization is actually very important for overall closed-loop performance).\n",
-        "\n",
-        "Given a set of `q`-batches $X'$ and associated acquisiton function values $Y'$, the `initialize_q_batch_nonneg` samples promising initial conditions $X$ (without replacement) from the multinomial distribution\n",
-        "\n",
-        "$$ \\mathbb{P}(X = X'_i) \\sim \\exp (\\eta \\tilde{Y}_i), \\qquad \\text{where} \\;\\; \\tilde{Y}_i = \\frac{Y'_i - \\mu(Y)}{\\sigma(Y)} \\;\\; \\text{if} \\;\\; Y'_i >0 $$\n",
-        "\n",
-        "and $\\mathbb{P}(X = X'_j) = 0$ for all $j$ such that $Y'_j = 0$. \n",
-        "\n",
-        "Fortunately, thanks to the high degree of parallelism in BoTorch, evaluating the acquisition function at a large number of randomly chosen points is quite cheap."
-      ]
+    "originalKey": "bee44c18-723f-4eb3-a446-dc091b80305d",
+    "requestMsgId": "eda81892-99db-479a-9cad-a620c9c7fbf5"
+   },
+   "outputs": [],
+   "source": [
+    "from botorch.acquisition import qExpectedImprovement\n",
+    "from botorch.sampling.stochastic_samplers import StochasticSampler\n",
+    "\n",
+    "sampler = StochasticSampler(sample_shape=torch.Size([128]))\n",
+    "qEI = qExpectedImprovement(model, best_f=train_Y.max(), sampler=sampler)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {
+    "originalKey": "a952bcb8-745f-4164-8796-a85e7b9ee40c",
+    "showInput": false
+   },
+   "source": [
+    "### Optimizing the acquisition function\n",
+    "\n",
+    "We will perform optimization over `N=5` random initial `q`-batches with `q=2` in parallel. We use `N` random restarts because the acquisition function is non-convex and as a result we may get stuck in local optima."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 4,
+   "metadata": {
+    "collapsed": false,
+    "customOutput": null,
+    "executionStartTime": 1668651603737,
+    "executionStopTime": 1668651603828,
+    "jupyter": {
+     "outputs_hidden": false
     },
-    {
-      "cell_type": "code",
-      "execution_count": 5,
-      "metadata": {
-        "collapsed": false,
-        "customOutput": null,
-        "executionStartTime": 1668651604094,
-        "executionStopTime": 1668651604159,
-        "originalKey": "60ec6384-1820-4ba3-a4ac-a75e3ac2a1e8",
-        "requestMsgId": "b38d8d63-745d-4297-b99b-f4b4c5a15eae"
-      },
-      "outputs": [],
-      "source": [
-        "from botorch.optim.initializers import initialize_q_batch_nonneg\n",
-        "\n",
-        "# generate a large number of random q-batches\n",
-        "Xraw = bounds[0] + (bounds[1] - bounds[0]) * torch.rand(100 * N, q, d)\n",
-        "Yraw = qEI(Xraw)  # evaluate the acquisition function on these q-batches\n",
-        "\n",
-        "# apply the heuristic for sampling promising initial conditions\n",
-        "X = initialize_q_batch_nonneg(Xraw, Yraw, N)\n",
-        "\n",
-        "# we'll want gradients for the input\n",
-        "X.requires_grad_(True);"
-      ]
+    "originalKey": "608b95ac-4445-41e6-ad5f-c64a77653a3e",
+    "requestMsgId": "5a4dfcc2-2643-46ce-abdf-7d170965aaee"
+   },
+   "outputs": [],
+   "source": [
+    "N = 5\n",
+    "q = 2"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {
+    "originalKey": "fbf016a1-1c6e-4c87-881b-be870163a306",
+    "showInput": false
+   },
+   "source": [
+    "#### Choosing initial conditions via a heuristic\n",
+    "\n",
+    "Using random initial conditions in conjunction with gradient-based optimizers can be problematic because qEI values and their corresponding gradients are often zero in large parts of the feature space. To mitigate this issue, BoTorch provides a heuristic for generating promising initial conditions (this dirty and not-so-little secret of Bayesian Optimization is actually very important for overall closed-loop performance).\n",
+    "\n",
+    "Given a set of `q`-batches $X'$ and associated acquisition function values $Y'$, the `initialize_q_batch_nonneg` heuristic samples promising initial conditions $X$ (without replacement) from the multinomial distribution\n",
+    "\n",
+    "$$ \\mathbb{P}(X = X'_i) \\sim \\exp (\\eta \\tilde{Y}_i), \\qquad \\text{where} \\;\\; \\tilde{Y}_i = \\frac{Y'_i - \\mu(Y')}{\\sigma(Y')} \\;\\; \\text{if} \\;\\; Y'_i >0 $$\n",
+    "\n",
+    "and $\\mathbb{P}(X = X'_j) = 0$ for all $j$ such that $Y'_j = 0$. \n",
+    "\n",
+    "Fortunately, thanks to the high degree of parallelism in BoTorch, evaluating the acquisition function at a large number of randomly chosen points is quite cheap."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 5,
+   "metadata": {
+    "collapsed": false,
+    "customOutput": null,
+    "executionStartTime": 1668651604094,
+    "executionStopTime": 1668651604159,
+    "jupyter": {
+     "outputs_hidden": false
     },
-    {
-      "cell_type": "markdown",
-      "metadata": {
-        "originalKey": "c48909f8-747b-42d7-88e0-dd97b4ae9d87",
-        "showInput": false
-      },
-      "source": [
-        "#### Optimizing the acquisition function\n",
-        "\n",
-        "If you have used PyTorch, the basic optimization loop should be quite familiar. However, it is important to note that there is a **key difference** here compared to training ML models: When training ML models, one typically computes the gradient of an empirical loss function w.r.t. the model's parameters, while here we take the gradient of the acquisition function w.r.t. to the candidate set.\n",
-        "\n",
-        "Thus, when setting the optimizer from `torch.optim`, we **do not** add the acquisition function's parameters as parameters to optimize (that would be quite bad!).\n",
-        "\n",
-        "In this example, we use a vanilla `Adam` optimizer with fixed learning rate for a fixed number of iterations in order to keep things simple. But you can get as fancy as you want with learning rate scheduling, early termination, etc.\n",
-        "\n",
-        "A couple of things to note:\n",
-        "1. Evaluating the acquisition function on the `N x q x d`-dim inputs means evaluating `N` `q`-batches in `t`-batch mode. The result of this is an `N`-dim tensor of acquisition function values, evaluated independently. To compute the gradient of the full input `X` via back-propagation, we can for convenience just compute the gradient of the sum of the losses. \n",
-        "2. `torch.optim` does not have good built in support for constraints (general constrained stochastic optimization is hard and still an open research area). Here we do something simple and project the value obtained after taking the gradient step to the feasible set - that is, we perform \"projected stochastic gradient descent\". Since the feasible set here is a hyperrectangle, this can be done by simple clamping. Another approach would be to transform the feasible interval for each dimension to the real line, e.g. by using a sigmoid function, and then optimizing in the unbounded transformed space. "
-      ]
+    "originalKey": "60ec6384-1820-4ba3-a4ac-a75e3ac2a1e8",
+    "requestMsgId": "b38d8d63-745d-4297-b99b-f4b4c5a15eae"
+   },
+   "outputs": [],
+   "source": [
+    "from botorch.optim.initializers import initialize_q_batch_nonneg\n",
+    "\n",
+    "# generate a large number of random q-batches\n",
+    "Xraw = bounds[0] + (bounds[1] - bounds[0]) * torch.rand(100 * N, q, d)\n",
+    "Yraw = qEI(Xraw)  # evaluate the acquisition function on these q-batches\n",
+    "\n",
+    "# apply the heuristic for sampling promising initial conditions\n",
+    "X, _ = initialize_q_batch_nonneg(Xraw, Yraw, N)\n",
+    "\n",
+    "# we'll want gradients for the input\n",
+    "X.requires_grad_(True);"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {
+    "originalKey": "c48909f8-747b-42d7-88e0-dd97b4ae9d87",
+    "showInput": false
+   },
+   "source": [
+    "#### Optimizing the acquisition function\n",
+    "\n",
+    "If you have used PyTorch, the basic optimization loop should be quite familiar. However, it is important to note that there is a **key difference** here compared to training ML models: When training ML models, one typically computes the gradient of an empirical loss function w.r.t. the model's parameters, while here we take the gradient of the acquisition function w.r.t. the candidate set.\n",
+    "\n",
+    "Thus, when setting the optimizer from `torch.optim`, we **do not** add the acquisition function's parameters as parameters to optimize (that would be quite bad!).\n",
+    "\n",
+    "In this example, we use a vanilla `Adam` optimizer with fixed learning rate for a fixed number of iterations in order to keep things simple. But you can get as fancy as you want with learning rate scheduling, early termination, etc.\n",
+    "\n",
+    "A couple of things to note:\n",
+    "1. Evaluating the acquisition function on the `N x q x d`-dim inputs means evaluating `N` `q`-batches in `t`-batch mode. The result of this is an `N`-dim tensor of acquisition function values, evaluated independently. To compute the gradient of the full input `X` via back-propagation, we can for convenience just compute the gradient of the sum of the losses. \n",
+    "2. `torch.optim` does not have good built in support for constraints (general constrained stochastic optimization is hard and still an open research area). Here we do something simple and project the value obtained after taking the gradient step to the feasible set - that is, we perform \"projected stochastic gradient descent\". Since the feasible set here is a hyperrectangle, this can be done by simple clamping. Another approach would be to transform the feasible interval for each dimension to the real line, e.g. by using a sigmoid function, and then optimizing in the unbounded transformed space. "
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 6,
+   "metadata": {
+    "collapsed": false,
+    "customOutput": null,
+    "executionStartTime": 1668651604492,
+    "executionStopTime": 1668651604767,
+    "jupyter": {
+     "outputs_hidden": false
     },
+    "originalKey": "5bc4484c-9f7e-478b-990a-4614e05238df",
+    "requestMsgId": "e4eae94a-20a2-49bc-8abd-2322681bf1fa"
+   },
+   "outputs": [
     {
-      "cell_type": "code",
-      "execution_count": 6,
-      "metadata": {
-        "collapsed": false,
-        "customOutput": null,
-        "executionStartTime": 1668651604492,
-        "executionStopTime": 1668651604767,
-        "originalKey": "5bc4484c-9f7e-478b-990a-4614e05238df",
-        "requestMsgId": "e4eae94a-20a2-49bc-8abd-2322681bf1fa"
-      },
-      "outputs": [
-        {
-          "name": "stdout",
-          "output_type": "stream",
-          "text": [
-            "Iteration  15/75 - Loss: -0.924\n",
-            "Iteration  30/75 - Loss: -1.281\n",
-            "Iteration  45/75 - Loss: -1.374\n",
-            "Iteration  60/75 - Loss: -1.363\n"
-          ]
-        },
-        {
-          "name": "stdout",
-          "output_type": "stream",
-          "text": [
-            "Iteration  75/75 - Loss: -1.361\n"
-          ]
-        }
-      ],
-      "source": [
-        "# set up the optimizer, make sure to only pass in the candidate set here\n",
-        "optimizer = torch.optim.Adam([X], lr=0.01)\n",
-        "X_traj = []  # we'll store the results\n",
-        "\n",
-        "# run a basic optimization loop\n",
-        "for i in range(75):\n",
-        "    optimizer.zero_grad()\n",
-        "    # this performs batch evaluation, so this is an N-dim tensor\n",
-        "    losses = -qEI(X)  # torch.optim minimizes\n",
-        "    loss = losses.sum()\n",
-        "\n",
-        "    loss.backward()  # perform backward pass\n",
-        "    optimizer.step()  # take a step\n",
-        "\n",
-        "    # clamp values to the feasible set\n",
-        "    for j, (lb, ub) in enumerate(zip(*bounds)):\n",
-        "        X.data[..., j].clamp_(lb, ub)  # need to do this on the data not X itself\n",
-        "\n",
-        "    # store the optimization trajecatory\n",
-        "    X_traj.append(X.detach().clone())\n",
-        "\n",
-        "    if (i + 1) % 15 == 0:\n",
-        "        print(f\"Iteration {i+1:>3}/75 - Loss: {loss.item():>4.3f}\")\n",
-        "\n",
-        "    # use your favorite convergence criterion here..."
-      ]
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Iteration  15/75 - Loss: -0.924\n",
+      "Iteration  30/75 - Loss: -1.281\n",
+      "Iteration  45/75 - Loss: -1.374\n",
+      "Iteration  60/75 - Loss: -1.363\n"
+     ]
     },
     {
-      "cell_type": "code",
-      "execution_count": 7,
-      "metadata": {
-        "collapsed": false,
-        "customOutput": null,
-        "executionStartTime": 1668651605000,
-        "executionStopTime": 1668651605005,
-        "originalKey": "76b4392a-d688-498a-9205-d95afbb0aca9",
-        "requestMsgId": "26a3b8ad-350d-4890-886b-7b38f7842e74"
-      },
-      "outputs": [],
-      "source": []
-    }
-  ],
-  "metadata": {
-    "kernelspec": {
-      "display_name": "python3",
-      "language": "python",
-      "name": "python3"
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Iteration  75/75 - Loss: -1.361\n"
+     ]
     }
+   ],
+   "source": [
+    "# set up the optimizer, make sure to only pass in the candidate set here\n",
+    "optimizer = torch.optim.Adam([X], lr=0.01)\n",
+    "X_traj = []  # we'll store the results\n",
+    "\n",
+    "# run a basic optimization loop\n",
+    "for i in range(75):\n",
+    "    optimizer.zero_grad()\n",
+    "    # this performs batch evaluation, so this is an N-dim tensor\n",
+    "    losses = -qEI(X)  # torch.optim minimizes\n",
+    "    loss = losses.sum()\n",
+    "\n",
+    "    loss.backward()  # perform backward pass\n",
+    "    optimizer.step()  # take a step\n",
+    "\n",
+    "    # clamp values to the feasible set\n",
+    "    for j, (lb, ub) in enumerate(zip(*bounds)):\n",
+    "        X.data[..., j].clamp_(lb, ub)  # need to do this on the data not X itself\n",
+    "\n",
+    "    # store the optimization trajectory\n",
+    "    X_traj.append(X.detach().clone())\n",
+    "\n",
+    "    if (i + 1) % 15 == 0:\n",
+    "        print(f\"Iteration {i+1:>3}/75 - Loss: {loss.item():>4.3f}\")\n",
+    "\n",
+    "    # use your favorite convergence criterion here..."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 7,
+   "metadata": {
+    "collapsed": false,
+    "customOutput": null,
+    "executionStartTime": 1668651605000,
+    "executionStopTime": 1668651605005,
+    "jupyter": {
+     "outputs_hidden": false
+    },
+    "originalKey": "76b4392a-d688-498a-9205-d95afbb0aca9",
+    "requestMsgId": "26a3b8ad-350d-4890-886b-7b38f7842e74"
+   },
+   "outputs": [],
+   "source": []
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "Python 3 (ipykernel)",
+   "language": "python",
+   "name": "python3"
   },
-  "nbformat": 4,
-  "nbformat_minor": 2
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.10.14"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 4
 }