diff --git a/tutorials/fobo.ipynb b/tutorials/fobo.ipynb new file mode 100644 index 0000000000..ba0c16e834 --- /dev/null +++ b/tutorials/fobo.ipynb @@ -0,0 +1,574 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "id": "ce096015-21cf-4974-9a07-c9c0d13a057b", + "metadata": { + "tags": [] + }, + "source": [ + "## BO with Derivatives\n", + "\n", + "This tutorial demonstrates how to use gradient information in Bayesian Optimization. Traditionally, Bayesian Optimization is a zeroth-order method that does not utilize gradient information because the black-box functions being optimized often do not provide gradients. However, there are cases where gradient information is available [1]. In such cases, derivative-enabled Bayesian Optimization can exploit the additional gradient information. We will call zeroth-order Bayesian Optimization ZOBO and first-order Bayesian Optimization FOBO.\n", + "\n", + "The key differences between FOBO and ZOBO are that FOBO uses a modified Gaussian Process which uses derivative information (dGP) and the acquisition functions need to be able to handle a dGP [2].\n", + "\n", + "In this notebook, we use the Noisy Expected Improvement (qNEI) and Knowledge Gradient (qKG) acquisition functions on a dGP which is trained on evaluations from the 3D Rosenbrock function [3].\n", + "\n", + "[1]: [Quentin Bertrand, et al. Implicit Differentiation for Fast Hyperparameter Selection in Non-Smooth Convex Learning. Journal of Machine Learning Research, 2022.](https://dl.acm.org/doi/pdf/10.5555/3586589.3586738) \n", + "[2]: [Jian Wu, et al. Bayesian Optimization with Gradients. 31st Conference on Neural Information Processing Systems, 2017.](https://proceedings.neurips.cc/paper/2017/file/64a08e5f1e6c39faeb90108c430eb120-Paper.pdf) \n", + "[3]: [Frazier, Peter I. 
\"A tutorial on Bayesian optimization.\" arXiv preprint arXiv:1807.02811 (2018).](https://arxiv.org/pdf/1807.02811.pdf%C2%A0)" + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "id": "ddb3ae78-4f7c-49d4-a5ed-48b8b717734d", + "metadata": {}, + "outputs": [], + "source": [ + "# Imports\n", + "import os\n", + "\n", + "import torch\n", + "import gpytorch\n", + "import numpy as np\n", + "from matplotlib import pyplot as plt\n", + "\n", + "from botorch import fit_gpytorch_mll\n", + "from botorch.optim import optimize_acqf\n", + "from botorch.acquisition import qKnowledgeGradient, qNoisyExpectedImprovement\n", + "from botorch.acquisition.objective import ScalarizedPosteriorTransform\n", + "from botorch.models.gpytorch import GPyTorchModel\n", + "from botorch.sampling.normal import SobolQMCNormalSampler\n", + "from botorch.models.model import FantasizeMixin\n", + "from botorch.models.gpytorch import GPyTorchModel\n", + "\n", + "from gpytorch.mlls.sum_marginal_log_likelihood import ExactMarginalLogLikelihood\n" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "id": "f3c38a9d-2c6e-4804-a43e-ab1a9ca1e576", + "metadata": {}, + "outputs": [], + "source": [ + "# Set default data type to avoid numerical issues from low precision\n", + "torch.set_default_dtype(torch.double)\n", + "\n", + "# Set seed for consistency and good results\n", + "seed = 3\n", + "torch.manual_seed(seed)\n", + "np.random.seed(seed)\n" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "id": "b0682efe-5d3d-4128-88fe-6783aec381bc", + "metadata": {}, + "outputs": [], + "source": [ + "# Used to test the notebook quickly\n", + "SMOKE_TEST = os.environ.get(\"SMOKE_TEST\")\n", + "\n", + "# Parameters\n", + "f_noise = 0.05 # Noise added to both function evaluations and their gradients\n", + "rs_init = 10 # Initialization points from random search\n", + "rb_dim = 3 # Rosenbrock dimensions\n", + "d_min = -2. 
# Rosenbrock function multiplied by -1 to make this a maximization problem
# Note: This only evaluates a single X at a time (not a vectorized function)
def Rosenbrock(X, noise=None):
    """Evaluate the negated generalized Rosenbrock function and its gradient.

    The noiseless objective is

        g(x) = -sum_{i=0}^{N-2} [ 100 * (x_{i+1} - x_i**2)**2 + (1 - x_i)**2 ]

    so that maximizing g is equivalent to minimizing the Rosenbrock function.
    Independent Gaussian noise is added to both the value and every gradient
    entry.

    Args:
        X: 1-D tensor of length N holding a single input point. It is neither
            modified nor marked as requiring gradients.
        noise: Standard deviation of the Gaussian noise added to the value and
            to each gradient entry. Defaults to the notebook-level ``f_noise``.

    Returns:
        A tuple ``(Y, grads)`` where ``Y`` is a 0-dim double tensor with the
        noisy value of g at ``X`` and ``grads`` is a double tensor of shape
        ``(N,)`` with the noisy gradient of g at ``X``. Both are detached from
        the autograd graph.
    """
    if noise is None:
        noise = f_noise

    # Differentiate with respect to a private double-precision copy. Calling
    # requires_grad_() on the caller's tensor would flip its flag and make
    # `.grad` accumulate (including previously added noise) across calls.
    x = X.detach().clone().to(torch.double).requires_grad_(True)

    # Evaluate the generalized N-dimensional Rosenbrock function (vectorized
    # over the N-1 summands) and negate it to obtain a maximization problem
    Y = -torch.sum(100 * (x[1:] - x[:-1] ** 2) ** 2 + (1 - x[:-1]) ** 2)

    # Gradient of the negated objective with respect to the input point
    (grads,) = torch.autograd.grad(Y, x)
    Y = Y.detach()

    # Add noise to gradient and evaluation (same draw order as before:
    # gradient noise first, then value noise)
    grads = grads + torch.normal(0, 1, grads.shape) * noise
    Y = Y + torch.normal(0, 1, Y.shape) * noise

    # Return function value and the gradient
    return Y, grads
\n" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "id": "a42af425-d89b-4398-9bae-fdee07a9121b", + "metadata": {}, + "outputs": [], + "source": [ + "# Define the dGP\n", + "class GPWithDerivatives(GPyTorchModel, gpytorch.models.ExactGP, FantasizeMixin):\n", + " def __init__(self, train_X, train_Y):\n", + " # Dimension of model\n", + " dim = train_X.shape[-1] \n", + " # Multi-dimensional likelihood since we're modeling a function and its gradient\n", + " likelihood = gpytorch.likelihoods.MultitaskGaussianLikelihood(num_tasks=1 + dim)\n", + " super().__init__(train_X, train_Y, likelihood)\n", + " # Gradient-enabled mean\n", + " self.mean_module = gpytorch.means.ConstantMeanGrad() \n", + " # Gradient-enabled kernel\n", + " self.base_kernel = gpytorch.kernels.RBFKernelGrad( \n", + " ard_num_dims=dim, # Separate lengthscale for each input dimension\n", + " )\n", + " # Adds lengthscale to the kernel\n", + " self.covar_module = gpytorch.kernels.ScaleKernel(self.base_kernel)\n", + " # Output dimension is 1 (function value) + dim (number of partial derivatives)\n", + " self._num_outputs = 1 + dim\n", + " # Used to extract function value and not gradients during optimization\n", + " self.scale_tensor = torch.tensor([1.0] + [0.0]*dim, dtype=torch.double)\n", + "\n", + " def forward(self, x):\n", + " mean_x = self.mean_module(x)\n", + " covar_x = self.covar_module(x)\n", + " return gpytorch.distributions.MultitaskMultivariateNormal(mean_x, covar_x)" + ] + }, + { + "cell_type": "markdown", + "id": "9b8024cb-34da-4c16-870e-5dcd9d1b7197", + "metadata": {}, + "source": [ + "## Random Search\n", + "\n", + "To initialize our GP we obtain initialization data using random search on the optimization domain $[-2,2]^3$." 
# Draw the random-search locations uniformly from [d_min, d_max]^rb_dim
domain_width = d_max - d_min
train_X = torch.rand((rs_init, rb_dim)) * domain_width + d_min

# Evaluate every location; each row of train_Y is [value, d/dx_1, ..., d/dx_d]
train_Y = torch.empty((rs_init, rb_dim + 1))
for row in range(rs_init):
    obj, deriv = Rosenbrock(train_X[row])
    train_Y[row] = torch.cat([obj.unsqueeze(0), deriv])
# Bounds of the search space in the original domain: [d_min, d_max]^d.
# They do not change between iterations, so build them once.
raw_bounds = torch.tensor([[d_min] * rb_dim, [d_max] * rb_dim])

for _ in range(bo_iters):
    # Standardize domain and range, this prevents numerical issues.
    # NOTE(review): every gradient column is standardized independently of the
    # value column and of X. Under the chain rule the gradient in the scaled
    # space would be grad * std_X / std_Y[0] (no mean shift) -- confirm that
    # the column-wise standardization is intended.
    mean_Y = train_Y.mean(dim=0)
    std_Y = train_Y.std(dim=0)
    scaled_train_Y = (train_Y - mean_Y) / std_Y

    mean_X = train_X.mean(dim=0)
    std_X = train_X.std(dim=0)
    scaled_train_X = (train_X - mean_X) / std_X

    # Initialize the dGP and fit it to the (standardized) training data
    dGP_model = GPWithDerivatives(scaled_train_X, scaled_train_Y)  # Define dGP model
    mll = ExactMarginalLogLikelihood(dGP_model.likelihood, dGP_model)  # Define MLL
    fit_gpytorch_mll(mll, max_attempts=20)

    # Extract only the function value from the multi-output GP, the dGP
    scal_transf = ScalarizedPosteriorTransform(weights=dGP_model.scale_tensor)

    # Create qNEI acquisition function. The baseline points must live in the
    # same (standardized) input space the model was trained on, so pass
    # scaled_train_X rather than the raw train_X.
    sampler = SobolQMCNormalSampler(sample_shape=torch.Size([mc_samples]))
    qNEI = qNoisyExpectedImprovement(
        model=dGP_model,
        X_baseline=scaled_train_X,
        sampler=sampler,
        posterior_transform=scal_transf,
    )

    # Rescale bounds into the standardized input space
    bounds = (raw_bounds - mean_X) / std_X

    # Get candidate point for objective
    candidates, _ = optimize_acqf(
        acq_function=qNEI,
        bounds=bounds,
        q=1,
        num_restarts=100,
        raw_samples=512,  # used for initialization heuristic
        options={"batch_limit": 1, "maxiter": 1000},
    )

    # Rescale candidate back to original domain
    candidate = (candidates[0] * std_X) + mean_X

    # Evaluate the objective and add it to the list of data for the model
    obj, deriv = Rosenbrock(candidate)
    new_Y = torch.cat([obj.unsqueeze(0), deriv])

    # Append evaluation to training data
    train_X = torch.vstack((train_X, candidate)).detach().clone()
    train_Y = torch.vstack((train_Y, new_Y)).detach().clone()
" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "image/png": "", + "text/plain": [ + "
" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "# Extract maximum value per iteration\n", + "maxima = torch.zeros(rs_init + bo_iters)\n", + "cur_max = train_Y[0][0]\n", + "\n", + "for i in range(rs_init + bo_iters):\n", + " cur_max = cur_max if cur_max > train_Y[i][0] else train_Y[i][0]\n", + " maxima[i] = cur_max\n", + "\n", + "# Get plotting values\n", + "plt_y = maxima.numpy()\n", + "plt_x = list(range(1,len(plt_y)+1))\n", + "\n", + "# Have the first x-value in the plot start at 1 to be consistent with above prints\n", + "plt_y = np.hstack([plt_y])\n", + "plt_x = np.hstack([plt_x])\n", + "\n", + "# Plot all values\n", + "plt.plot([plt_x[rs_init-1], plt_x[rs_init]], [plt_y[rs_init-1], plt_y[rs_init]],\\\n", + " linestyle='--', color='gray')\n", + "plt.plot(plt_x[0:rs_init], plt_y[0:rs_init], color='red', label=\"Init values\", marker='.')\n", + "plt.plot(plt_x[rs_init:], plt_y[rs_init:], color='black', label=\"BO values\", marker='.')\n", + "plt.xlabel(\"BO Iteration\")\n", + "plt.ylabel(\"Maximum value\")\n", + "plt.title(\"Best value found per BO iteration including initialization\")\n", + "plt.legend()\n", + "plt.show()\n", + "plt.close()\n", + "\n", + "plt.plot([plt_x[rs_init-1], plt_x[rs_init]], [plt_y[rs_init-1], plt_y[rs_init]],\\\n", + " linestyle='--', color='gray')\n", + "plt.plot(plt_x[rs_init-1], plt_y[rs_init-1], color='red', label=\"Init values\", marker='.')\n", + "plt.plot(plt_x[rs_init:], plt_y[rs_init:], color='black', label=\"BO values\", marker='.')\n", + "plt.xlabel(\"BO Iteration\")\n", + "plt.ylabel(\"Maximum value\")\n", + "plt.title(\"Best value found per BO iteration after initialization\")\n", + "plt.legend()\n", + "plt.show()\n", + "plt.close()\n" + ] + }, + { + "cell_type": "markdown", + "id": "9fd8e9fc-d42a-4215-800d-55c51decb920", + "metadata": {}, + "source": [ + "## qKG FOBO Loop" + ] + }, + { + "cell_type": "markdown", + "id": "8dc5e991-8508-4050-a1c3-d3da3b14b1cb", + "metadata": {}, + 
"source": [ + "Here we have the same FOBO loop, but with the qKG acquisition function.\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "8b034797-d2ae-4b73-ba59-d1f94fa88e2d", + "metadata": { + "collapsed": true, + "jupyter": { + "outputs_hidden": true + }, + "tags": [] + }, + "outputs": [], + "source": [ + "# Reset training data to the same as for qNEI\n", + "train_X = train_X[0:rs_init]\n", + "train_Y = train_Y[0:rs_init]\n", + "\n", + "for i in range(bo_iters):\n", + " # Standardize domain and range, this prevents numerical issues\n", + " mean_Y = train_Y.mean(dim=0)\n", + " std_Y = train_Y.std(dim=0)\n", + " unscaled_train_Y = train_Y\n", + " scaled_train_Y = (train_Y - mean_Y) / std_Y\n", + " \n", + " mean_X = train_X.mean(dim=0)\n", + " std_X = train_X.std(dim=0)\n", + " unscaled_train_X = train_X\n", + " scaled_train_X = (train_X - mean_X) / std_X\n", + "\n", + " # Initialize the dGP and fit it to the training data\n", + " dGP_model = GPWithDerivatives(scaled_train_X, scaled_train_Y) # Define dGP model\n", + " mll = ExactMarginalLogLikelihood(dGP_model.likelihood, dGP_model) # Define MLL\n", + " fit_gpytorch_mll(mll, max_attempts=20)\n", + "\n", + " # Extract only the function value from the multi-output GP, the dGP\n", + " scal_transf = ScalarizedPosteriorTransform(weights=dGP_model.scale_tensor)\n", + "\n", + " # Create the qKG acquisition function\n", + " qKG = qKnowledgeGradient(dGP_model,\\\n", + " posterior_transform=scal_transf,\\\n", + " num_fantasies=5)\n", + "\n", + " # Set bounds for optimization: [-2,2]^d\n", + " bounds = torch.vstack([torch.tensor([d_min]*rb_dim),\\\n", + " torch.tensor([d_max]*rb_dim)])\n", + "\n", + " # Rescale domain based on training data\n", + " bounds = (bounds - mean_X) / std_X\n", + "\n", + " # Get candidate point for objective\n", + " candidates, _ = optimize_acqf(\n", + " acq_function=qKG,\n", + " bounds=bounds,\n", + " q=1,\n", + " num_restarts=100,\n", + " raw_samples=512, # used for 
intialization heuristic\n", + " options={\"batch_limit\": 1, \"maxiter\": 1000},\n", + " )\n", + "\n", + " # Rescale candidate back to original domain\n", + " candidate = (candidates[0] * std_X) + mean_X\n", + " \n", + " # Evaluate the objective and add it to the list of data for the model\n", + " obj, deriv = Rosenbrock(candidate)\n", + " new_Y = torch.cat([obj.unsqueeze(0),deriv])\n", + " \n", + " # Append evaluation to training data\n", + " train_X = torch.vstack((train_X, candidate)).detach().clone()\n", + " train_Y = torch.vstack((train_Y, new_Y)).detach().clone()\n" + ] + }, + { + "cell_type": "markdown", + "id": "16d05516-6e89-43a1-8904-5acd950e9f00", + "metadata": {}, + "source": [ + "## Plot Results" + ] + }, + { + "cell_type": "markdown", + "id": "466e0aec-424b-4ac4-b908-256eb6eee790", + "metadata": {}, + "source": [ + "From plotting the results we see that derivative-enabled Bayesian Optimization is able to find better maxima values." + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "id": "c68f082f-92ac-4c9c-bcdc-8d428f0b6e44", + "metadata": {}, + "outputs": [ + { + "data": { + "image/png": "", + "text/plain": [ + "
" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "image/png": "", + "text/plain": [ + "
" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "# Extract maximum value per iteration\n", + "maxima = torch.zeros(rs_init + bo_iters)\n", + "cur_max = train_Y[0][0]\n", + "\n", + "for i in range(rs_init + bo_iters):\n", + " cur_max = cur_max if cur_max > train_Y[i][0] else train_Y[i][0]\n", + " maxima[i] = cur_max\n", + "\n", + "# Get plotting values\n", + "plt_y = maxima.numpy()\n", + "plt_x = list(range(1,len(plt_y)+1))\n", + "\n", + "# Have the first x-value in the plot start at 1 to be consistent with above prints\n", + "plt_y = np.hstack([plt_y])\n", + "plt_x = np.hstack([plt_x])\n", + "\n", + "# Plot all values\n", + "plt.plot([plt_x[rs_init-1], plt_x[rs_init]], [plt_y[rs_init-1], plt_y[rs_init]],\\\n", + " linestyle='--', color='gray')\n", + "plt.plot(plt_x[0:rs_init], plt_y[0:rs_init], color='red', label=\"Init values\", marker='.')\n", + "plt.plot(plt_x[rs_init:], plt_y[rs_init:], color='black', label=\"BO values\", marker='.')\n", + "plt.xlabel(\"BO Iteration\")\n", + "plt.ylabel(\"Maximum value\")\n", + "plt.title(\"Best value found per BO iteration including initialization\")\n", + "plt.legend()\n", + "plt.show()\n", + "plt.close()\n", + "\n", + "plt.plot([plt_x[rs_init-1], plt_x[rs_init]], [plt_y[rs_init-1], plt_y[rs_init]],\\\n", + " linestyle='--', color='gray')\n", + "plt.plot(plt_x[rs_init-1], plt_y[rs_init-1], color='red', label=\"Init values\", marker='.')\n", + "plt.plot(plt_x[rs_init:], plt_y[rs_init:], color='black', label=\"BO values\", marker='.')\n", + "plt.xlabel(\"BO Iteration\")\n", + "plt.ylabel(\"Maximum value\")\n", + "plt.title(\"Best value found per BO iteration after initialization\")\n", + "plt.legend()\n", + "plt.show()\n", + "plt.close()" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "botorch", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": 
"text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.11.5" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/website/tutorials.json b/website/tutorials.json index 03d73e065d..26532a0cd4 100644 --- a/website/tutorials.json +++ b/website/tutorials.json @@ -147,6 +147,10 @@ { "id": "composite_mtbo", "title": "Composite Bayesian Optimization with Multi-Task Gaussian Processes" + }, + { + "id": "fobo", + "title": "BO with Derivatives" } ] }