Updates to First Tutorial (#465)
akleeman authored Nov 8, 2023
1 parent a03239a commit bdc79fe
Showing 2 changed files with 166 additions and 41 deletions.
156 changes: 134 additions & 22 deletions tutorials/tutorial_1_one_dimension.ipynb
@@ -15,6 +15,7 @@
"import sys\n",
"import scipy\n",
"import pandas as pd\n",
"import traceback\n",
"\n",
"import numpy as np\n",
"import seaborn as sns\n",
@@ -304,7 +305,88 @@
"$$\n",
"A z + \\mu \\leftarrow \\mathcal{N}\\left(\\mu, \\Sigma\\right)\n",
"$$\n",
"One way to get a matrix $A$ such that $A A^T = \\Sigma$ is using the cholesky decomposition. There are python utilities to help with this."
"One way to get a matrix $A$ such that $A A^T = \\Sigma$ is using the cholesky decomposition. There are python utilities to help with this:\n",
"\n",
"- `np.linalg.cholesky(X)` - returns a lower triangular matrix $L$ such that $L L^T = X$. (Note that, `scipy.linalg.cholesky(X)` is an alternative, but it returns the upper triangular portion, $L.T$ unless you provide a `lower=True` argument.)\n",
"\n",
"Another tip, if you aren't already familiar with [numpy broadcasting rules](https://numpy.org/doc/stable/user/basics.broadcasting.html), it might be worth reading a bit about how it works. For example, if we have a matrix $A$ and a vector $b$"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "8a85b3f9",
"metadata": {},
"outputs": [],
"source": [
"A = np.ones(shape=(3, 2))\n",
"b = np.arange(3)\n",
"print(\"A: \\n\", A)\n",
"print(\"b: \\n\", b)"
]
},
{
"cell_type": "markdown",
"id": "d59b9d38",
"metadata": {},
"source": [
"It might be tempting to do `A + b`:"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "233401cb",
"metadata": {},
"outputs": [],
"source": [
"A + b"
]
},
{
"cell_type": "markdown",
"id": "741970a2",
"metadata": {},
"source": [
"But that fails, to make it work you can make `b` a column vector (ie, a `(3, 1)` matrix), and then add the two. There are a few ways to do that:"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "e7267760",
"metadata": {},
"outputs": [],
"source": [
"A + b.reshape((b.size, 1))"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "c9575386",
"metadata": {},
"outputs": [],
"source": [
"A + b[:, None]"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "ac5bfbb1",
"metadata": {},
"outputs": [],
"source": [
"A + b[:, np.newaxis]"
]
},
{
"cell_type": "markdown",
"id": "d09e3f40",
"metadata": {},
"source": [
"We should now be able to write a function which starts by sampling independent random normal variables, correlates them using the cholesky and adds a mean to end up drawing random samples from a mulitivariate normal distribution,"
]
},
{
@@ -320,14 +402,15 @@
" # this function should return one sample per column.\n",
" #\n",
" # Note that you could just use np.random.multivariate_normal but that's cheating!\n",
" #\n",
" white_noise = np.random.normal(size=(mean.size, size))\n",
" #\n",
" # YOUR CODE HERE\n",
" #\n",
" # n = \n",
" # A = \n",
" # cholesky =\n",
" # random_samples = \n",
" return random_samples\n",
"\n",
"\n",
"TEST_SAMPLE_FROM(sample_from)\n",
"\n",
"xs = np.linspace(0., 10., 21)\n",
@@ -357,7 +440,11 @@
"xs = np.linspace(0., 10., 101)\n",
"cov = squared_exponential(xs, xs)\n",
"\n",
"samps = sample_from(np.zeros(xs.size), cov, size=20) \n",
"try:\n",
" samps = sample_from(np.zeros(xs.size), cov, size=20)\n",
"except Exception as e:\n",
" print(traceback.format_exc())\n",
" print(e)\n",
"\n",
"### SPOILER: YOU SHOULD SEE A FAILURE ###"
]
@@ -389,7 +476,7 @@
"id": "751f5d2a",
"metadata": {},
"source": [
"The condition number is a representation of the differing scales of information captured in a matrix and 1e19 is a MASSIVE condition number. With a condition number that large, even slightly different methods for computing the condition number itself have different values! This is an example of floating point error. One of the strict requirements of a covariance function is that it produce covariance matrices which are positive definite (aka invertible), meaning all the eigen values need to be greater than zero. You can see that _technically_ the matrix we created _is_ positive definite (the smallest eigen value is 1e-18), but with a condition number that large floating point arithmetic errors can accumulate making it look like the matrix is not invertible. We'd say the matrix is not \"numerically positive definite\". This is unfortunately a relatively common problem, but thankfully, there's an easy band-aid: add some noise. By adding relatively small values to the diagonal of our covariance matrix we can resolve the issue:"
"The condition number is a representation of the differing scales of information captured in a matrix and 1e19 is a MASSIVE condition number. With a condition number that large, even slightly different methods for computing the condition number itself have different values! This is an example of floating point error. One of the strict requirements of a covariance function is that it produce covariance matrices which are positive definite (aka invertible), meaning all the eigen values need to be greater than zero. You can see that _technically_ the matrix we created _is_ positive definite (the smallest eigen value is greater than zero), but with a condition number that large floating point arithmetic errors can accumulate making it look like the matrix is not invertible. We'd say the matrix is not \"numerically positive definite\". Unfortunately this a relatively common problem, but thankfully, there's an easy band-aid: add some noise. By adding relatively small values to the diagonal of our covariance matrix we can resolve the issue:"
]
},
{
@@ -416,7 +503,21 @@
"id": "92349e9d",
"metadata": {},
"source": [
"Much better! Just adding `1e-12` to the diagonal made our matrix invertible. It still has a pretty large condition number, but we seem to be getting reasonable results from it now. The values we added to the diagonal are sometimes called a \"nugget\" which can be thought of as measurement noise. By adding a nugget you're acknowledging that nothing can be estimated perfectly. This diagonal addition puts a floor on the eigen values, notice that the minimum eigen value is (almost) exactly our nugget."
"Much better! Just adding `1e-12` to the diagonal made our matrix invertible. It still has a pretty large condition number, but we seem to be getting reasonable results from it now. The values we added to the diagonal are sometimes called a \"nugget\" which can be thought of as measurement noise. By adding a nugget you're acknowledging that nothing can be estimated perfectly.\n",
"\n",
"This diagonal addition puts a floor on the eigen values, notice that the minimum eigen value is (almost) exactly our nugget, this is not a coincidence. Take the eigen decomposition for example,\n",
"$$\n",
"A = Q \\Lambda Q^{-1}\n",
"$$\n",
"where $Q$ is a matrix holding the eigen vectors and $\\Lambda$ is a diagonal matrix with eigen values on the diagonal. Now add a nugget, $\\eta^2$,\n",
"$$\n",
"\\begin{align}\n",
"A + \\sigma^2 I &= Q \\Lambda Q^{-1} + \\eta^2 I \\\\\n",
"&= Q \\Lambda Q^{-1} + \\eta^2 Q Q^{-1} \\\\\n",
"&= Q \\left( \\Lambda + \\eta^2 I\\right) Q^{-1} \\\\\n",
"\\end{align}\n",
"$$\n",
"The eigen vectors, $Q$, are all the same, and the nugget we've added is directly added to each eigen value, so if the smallest eigen value of $A$ is $\\lambda_{min}$ then after adding a nugget the smallest eigen value will be $\\lambda_{min} + \\eta^2$"
]
},
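A quick numerical check of that claim, reusing `squared_exponential` and `xs` as defined in the earlier cells (a sketch, not part of the original notebook):

```python
cov = squared_exponential(xs, xs)
nugget = 1e-12
jittered = cov + nugget * np.eye(cov.shape[0])

print("condition number before:", np.linalg.cond(cov))
print("condition number after :", np.linalg.cond(jittered))
# the smallest eigen value should now sit (almost) exactly at the nugget
print("smallest eigen value after:", np.linalg.eigvalsh(jittered).min())
```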
{
@@ -446,11 +547,16 @@
"$$\n",
"notice that we're going to treat the mean as zero from now on. If you really want a non-zero mean you can keep all the math the same and just subtract the mean from all your measurements ahead of time, then add it to all predictions after. This mean zero assumption is _very_ common.\n",
"\n",
"One possible point of confusion, we use $\\Sigma_{yy}$ to represent the covariance between all the measurements, but to create the covariance you need to evaluate the covariance function at the locations $x$ that correspond to the measurements $y$. In otherwords, row $i$ and column $j$ of $\\Sigma_{yy}$ would be given by,\n",
"$$\n",
"\\left[\\Sigma_{yy}\\right]_{ij} = c(x_i, x_j)\n",
"$$\n",
"\n",
"Similarly we can build the prior for the function at all the locations we'd like to predict,\n",
"$$\n",
"\\mathbf{f}^* \\sim \\mathcal{N}\\left(0, \\Sigma_{**}\\right).\n",
"$$\n",
"Here we do not add measurement noise because we're interested in the value of the function itself, not the value of measurements of the function. We need to compute one more covariance matrix, $\\Sigma_{*y}$ (note that we don't need $\\Sigma_{y*}$ because $\\Sigma_{y*} = \\Sigma_{y*}^T$). $\\Sigma_{*y}$ captures the correlation between what we've observed and what we want to predict. Once we've constructed these matrices we can build an augmented distribution which describes both the measurements we made and what we want to predict,\n",
"Here we do not add measurement noise because we're interested in the value of the function itself, not the value of measurements of the function. We need to compute one more covariance matrix, $\\Sigma_{*y}$ (note that we don't need $\\Sigma_{y*}$ because $\\Sigma_{y*} = \\Sigma_{*y}^T$). $\\Sigma_{*y}$ captures the correlation between what we've observed and what we want to predict. Once we've constructed these matrices we can build an augmented distribution which describes both the measurements we made and what we want to predict,\n",
"$$\n",
"\\begin{bmatrix}\n",
"\\mathbf{y} \\\\\n",
@@ -482,6 +588,8 @@
"def fit_and_predict(cov_func, X, y, x_star, meas_noise):\n",
" # Using cov_func build the matrices\n",
" #\n",
" # Since we can't use greek letters in the code, we'll use S for \\Sigma\n",
" #\n",
" # S_yy = \n",
" # S_sy = \n",
" # S_ss =\n",
@@ -490,7 +598,7 @@
" #\n",
" # mean = [a column vector holding the mean]\n",
" # cov = [a square matrix holding the posterior covariance]\n",
" # return mean, cov\n",
" return mean, cov\n",
"\n",
"TEST_FIT_AND_PREDICT(fit_and_predict)\n",
"\n",
@@ -515,7 +623,7 @@
"outputs": [],
"source": [
"# note we need to add a nugget here to make sure the posterior covariance is numerically definite\n",
"samps = sample_from(pred_mean, pred_cov + 1e-16 * np.eye(pred_mean.size), size=50)\n",
"samps = sample_from(pred_mean, pred_cov + 1e-12 * np.eye(pred_mean.size), size=50)\n",
"for i in range(samps.shape[1]):\n",
" plt.plot(x_gridded, samps[:, i], color=\"steelblue\", alpha=0.5)\n",
"plot_truth()\n",
@@ -538,14 +646,6 @@
"metadata": {},
"outputs": [],
"source": [
"def plot_spread(xs, mean, variances):\n",
" sd = np.sqrt(variances)\n",
" plt.plot(xs, mean, lw=5, color='steelblue', label=\"prediction\")\n",
" plt.fill_between(xs, mean + 2*sd, mean - 2*sd,\n",
" color='steelblue', alpha=0.2, label=\"2 sigma\")\n",
" plt.fill_between(xs, mean + sd, mean - sd,\n",
" color='steelblue', alpha=0.5, label=\"sigma\")\n",
"\n",
"plot_spread(x_gridded, pred_mean, np.diag(pred_cov))\n",
" \n",
"plot_truth()\n",
@@ -611,7 +711,8 @@
" cov_func = partial(squared_exponential, ell=ell, sigma=sigma)\n",
" return -log_likelihood(cov_func, X, y, meas_noise=meas_noise)\n",
"\n",
"mle_params = scipy.optimize.minimize(compute_negative_log_likelihood, np.zeros(3), method=\"L-BFGS-B\")\n",
"mle_params = scipy.optimize.minimize(compute_negative_log_likelihood,\n",
" np.zeros(3), method=\"L-BFGS-B\")\n",
"mle_sigma, mle_ell, mle_meas_noise = np.exp(mle_params.x)\n",
"\n",
"print(f\"MLE PARAMS:\\n sigma : {mle_sigma}\\n ell: {mle_ell}\\n meas_noise: {mle_meas_noise}\")\n",
@@ -626,7 +727,7 @@
"source": [
"Still not perfect ... but the true function is about as smooth as the true function and now mostly within the uncertainty bounds. Notice that a lot of the measurements are outside of the bounds. That's OK! We explicitly asked for the posterior distribution of the unknown function _not_ the posterior distribution of measurements of the function. Subtle distinictions like that are important to pay attention to.\n",
"\n",
"Anothing thing worth noting, the $\\sigma_{se}$ that maximized likelihood is about $2$, but the posterior distribution has function values which are 3 to 4. It might be tempting to think the value of $2$ means the function will be within $\\left[-2, 2\\right]$, but it can be very common for the function estimates to exceed the sigma from the prior. Sometimes multiple times over. Here, for example, is the posterior with $\\sigma_{se} = 1$,"
"Another thing worth noting, the $\\sigma_{se}$ that maximized likelihood is about $2$ and it might be tempting to think the value of $2$ means the function will mostly be within $\\left[-2, 2\\right]$, but it can be very common for the function estimates to exceed the sigma from the prior. Sometimes multiple times over. Here, for example are the predictions with $\\sigma_{se} = 0.5$,"
]
},
{
Expand All @@ -636,15 +737,26 @@
"metadata": {},
"outputs": [],
"source": [
"plot_fit_and_predict(ell=2.5, sigma=1, meas_noise=0.4)"
"fit_sizes = [1, 5, 20, 100]\n",
"fig, axes = plt.subplots(1, len(fit_sizes), figsize=(36, 8))\n",
"cov_func = partial(squared_exponential, ell=mle_ell, sigma=0.5)\n",
"\n",
"for ax, n in zip(axes, fit_sizes):\n",
" X_sub = X[:n]\n",
" y_sub = y[:n]\n",
" \n",
" pred_mean, pred_cov = fit_and_predict(cov_func, X_sub, y_sub, x_gridded, meas_noise=mle_meas_noise)\n",
" ax.scatter(X_sub, y_sub, color=\"black\", s=50)\n",
" plot_spread(x_gridded, pred_mean, np.diag(pred_cov), ax=ax)\n",
" ax.set_ylim([-2, 4])\n"
]
},
{
"cell_type": "markdown",
"id": "a961ffe1",
"metadata": {},
"source": [
"According to the prior with $\\sigma_{se} = 1$, there's only a $0.3\\%$ chance of the function taking on a value of $3$, yet that prior actually results in a relatively good fit. The posterior even states there's a reasonable chance the true function approaches $4$. Takeaway: the hyper parameters describe the prior we place on a function, but ultimately it can be the data that drives the posterior (depending of course on measurement noise, quantity and other factors)."
"It still does a pretty good job and according to the prior, $\\sigma_{se} = 0.5$, there's only a $2 x 10^{-7}$ percent chance of the function taking on a value of $3$, yet we're seeing that happen. The point here is that the data can eventually override the prior. When we fit the model to a single data point the resulting predictions are very close to the prior, but ultimately the data drives the estimate. The prior is still very important, we saw some bad choices of parmeters earlier, but it's really the interaction of the prior and the data that matter."
]
},
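As a quick sanity check of that figure (assuming a mean-zero marginal with standard deviation $0.5$ and counting both tails):

```python
from scipy.stats import norm

# two-sided tail probability of |f(x)| exceeding 3 under N(0, 0.5**2), in percent
print(100 * 2 * norm.sf(3, loc=0.0, scale=0.5))   # roughly 2e-7
```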
{
51 changes: 32 additions & 19 deletions tutorials/tutorial_utils.py
@@ -4,6 +4,7 @@

from scipy.stats import ks_1samp, norm
from functools import partial
from inspect import signature, Parameter

EXAMPLE_SLOPE_VALUE = np.sqrt(2.0)
EXAMPLE_CONSTANT_VALUE = 3.14159
@@ -21,6 +22,7 @@

x_gridded = np.linspace(LOWEST, HIGHEST, 301)


def reshape_inputs(x):
if x.ndim == 1:
return x[:, None]
@@ -45,7 +47,7 @@ def sinc(xs):


def truth(xs):
return (EXAMPLE_SCALE_VALUE * sinc(xs - EXAMPLE_TRANSLATION_VALUE))
return EXAMPLE_SCALE_VALUE * sinc(xs - EXAMPLE_TRANSLATION_VALUE)


def generate_training_data(n=N):
Expand Down Expand Up @@ -131,30 +133,43 @@ def example_fit_and_predict(cov_func, X, y, x_star, meas_noise):


def sinc(xs):
return np.where(xs == 0, np.ones(xs.size), np.sin(xs) / xs)
non_zero = np.nonzero(xs)[0]
output = np.ones(xs.shape)
output[non_zero] = np.sin(xs[non_zero]) / xs[non_zero]
return output


def truth(xs):
return (EXAMPLE_SCALE_VALUE * sinc(xs - EXAMPLE_TRANSLATION_VALUE))
return EXAMPLE_SCALE_VALUE * sinc(xs - EXAMPLE_TRANSLATION_VALUE)


def plot_truth(xs):
plt.plot(xs, truth(xs),
lw=5,
color="firebrick", label="truth")
plt.plot(xs, truth(xs), lw=5, color="firebrick", label="truth")


def plot_measurements(xs, ys):
plt.scatter(xs, ys, s=50, color='black', label="measurements")
def plot_measurements(xs, ys, color="black", label="measurements"):
plt.scatter(xs, ys, s=50, color=color, label=label)


def plot_spread(xs, mean, variances):
def plot_spread(xs, mean, variances, ax=None):
if ax is None:
ax = plt.gca()
xs = np.reshape(xs, -1)
mean = np.reshape(mean, -1)
variances = np.reshape(variances, -1)
sd = np.sqrt(variances)
plt.plot(xs, mean, lw=5, color='steelblue', label="prediction")
plt.fill_between(xs, mean + 2*sd, mean - 2*sd,
color='steelblue', alpha=0.2, label="uncertainty")
plt.fill_between(xs, mean + sd, mean - sd,
color='steelblue', alpha=0.5, label="uncertainty")
ax.plot(xs, mean, lw=5, color="steelblue", label="prediction")
ax.fill_between(
xs,
mean + 2 * sd,
mean - 2 * sd,
color="steelblue",
alpha=0.2,
label="uncertainty",
)
ax.fill_between(
xs, mean + sd, mean - sd, color="steelblue", alpha=0.5, label="uncertainty"
)


def TEST_FIT_AND_PREDICT(f):
@@ -183,15 +198,13 @@ def TEST_FIT_AND_PREDICT(f):
f"Incorrect covariance [.\n Expected: f{expected_cov} \n Actual: f{actual_cov}"
)


def example_fit(cov_func, X, y, meas_noise):
K_yy = cov_func(X, X) + meas_noise * meas_noise * np.eye(y.size)
L = np.linalg.cholesky(K_yy)
v = scipy.linalg.cho_solve((L, True), y)

return {"train_locations": X,
"information": v,
"cholesky": L,
"cov_func": cov_func}

return {"train_locations": X, "information": v, "cholesky": L, "cov_func": cov_func}


def example_predict(fit_model, x_star):
