diff --git a/tutorials/W1D5_Microcircuits/W1D5_Tutorial1.ipynb b/tutorials/W1D5_Microcircuits/W1D5_Tutorial1.ipynb index d27eaab2b..1662611cc 100644 --- a/tutorials/W1D5_Microcircuits/W1D5_Tutorial1.ipynb +++ b/tutorials/W1D5_Microcircuits/W1D5_Tutorial1.ipynb @@ -386,7 +386,7 @@ " ax.legend(ncol = 2)\n", " remove_edges(ax)\n", " ax.set_xlim(left = 0, right = 100)\n", - " add_labels(ax, xlabel = '$\\\\tau$', ylabel = 'Count')\n", + " add_labels(ax, xlabel = 'Value', ylabel = 'Count')\n", " plt.show()\n", "\n", "def plot_temp_diff_separate_histograms(signal, lags, lags_list, tau = True):\n", @@ -656,46 +656,6 @@ " plt.show()" ] }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "cellView": "form", - "execution": {} - }, - "outputs": [], - "source": [ - "# @title Helper functions\n", - "\n", - "def normalize(mat):\n", - " \"\"\"\n", - " Normalize input matrix from 0 to 255 values (in RGB range).\n", - "\n", - " Inputs:\n", - " - mat (np.ndarray): data to normalize.\n", - "\n", - " Outpus:\n", - " - (np.ndarray): normalized data.\n", - " \"\"\"\n", - " mat_norm = (mat - np.percentile(mat, 10))/(np.percentile(mat, 90) - np.percentile(mat, 10))\n", - " mat_norm = mat_norm*255\n", - " mat_norm[mat_norm > 255] = 255\n", - " mat_norm[mat_norm < 0] = 0\n", - " return mat_norm\n", - "\n", - "def lists2list(xss):\n", - " \"\"\"\n", - " Flatten a list of lists into a single list.\n", - "\n", - " Inputs:\n", - " - xss (list): list of lists. The list of lists to be flattened.\n", - "\n", - " Outputs:\n", - " - (list): The flattened list.\n", - " \"\"\"\n", - " return [x for xs in xss for x in xs]" - ] - }, { "cell_type": "code", "execution_count": null, @@ -738,6 +698,78 @@ " fid.write(r.content) # Write the downloaded content to a file" ] }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "cellView": "form", + "execution": {} + }, + "outputs": [], + "source": [ + "# @title Helper functions\n", + "\n", + "def normalize(mat):\n", + " \"\"\"\n", + " Normalize input matrix from 0 to 255 values (in RGB range).\n", + "\n", + " Inputs:\n", + " - mat (np.ndarray): data to normalize.\n", + "\n", + " Outpus:\n", + " - (np.ndarray): normalized data.\n", + " \"\"\"\n", + " mat_norm = (mat - np.percentile(mat, 10))/(np.percentile(mat, 90) - np.percentile(mat, 10))\n", + " mat_norm = mat_norm*255\n", + " mat_norm[mat_norm > 255] = 255\n", + " mat_norm[mat_norm < 0] = 0\n", + " return mat_norm\n", + "\n", + "def lists2list(xss):\n", + " \"\"\"\n", + " Flatten a list of lists into a single list.\n", + "\n", + " Inputs:\n", + " - xss (list): list of lists. 
The list of lists to be flattened.\n", + "\n", + " Outputs:\n", + " - (list): The flattened list.\n", + " \"\"\"\n", + " return [x for xs in xss for x in xs]\n", + "\n", + "# exercise solutions for correct plots\n", + "\n", + "def ReLU(x, theta = 0):\n", + " \"\"\"\n", + " Calculates ReLU function for the given level of theta.\n", + "\n", + " Inputs:\n", + " - x (np.ndarray): input data.\n", + " - theta (float, default = 0): threshold parameter.\n", + "\n", + " Outputs:\n", + " - thres_x (np.ndarray): filtered values.\n", + " \"\"\"\n", + "\n", + " thres_x = np.maximum(x - theta, 0)\n", + "\n", + " return thres_x\n", + "\n", + "sig = np.load('sig.npy')\n", + "temporal_diff = np.abs(np.diff(sig))\n", + "\n", + "num_taus = 10\n", + "taus = np.linspace(1, 91, num_taus).astype(int)\n", + "taus_list = [np.abs(sig[tau:] - sig[:-tau]) for tau in taus]\n", + "\n", + "T_ar = np.arange(len(sig))\n", + "\n", + "freqs = np.linspace(0.001, 1, 100)\n", + "set_sigs = [np.sin(T_ar*f) for f in freqs]\n", + "\n", + "reg = OrthogonalMatchingPursuit(fit_intercept = True, n_nonzero_coefs = 10).fit(np.vstack(set_sigs).T, sig)" + ] + }, { "cell_type": "markdown", "metadata": { @@ -1055,6 +1087,11 @@ }, "outputs": [], "source": [ + "###################################################################\n", + "## Fill out the following then remove\n", + "raise NotImplementedError(\"Student exercise: complete `thres_x` array calculation as defined.\")\n", + "###################################################################\n", + "\n", "def ReLU(x, theta = 0):\n", " \"\"\"\n", " Calculates ReLU function for the given level of theta.\n", @@ -1066,10 +1103,6 @@ " Outputs:\n", " - thres_x (np.ndarray): filtered values.\n", " \"\"\"\n", - " ###################################################################\n", - " ## Fill out the following then remove\n", - " raise NotImplementedError(\"Student exercise: complete `thres_x` array calculation as defined.\")\n", - " ###################################################################\n", "\n", " thres_x = ...\n", "\n", @@ -1294,6 +1327,7 @@ "source": [ "\n", "Denote the pixel value at time $t$ by $pixel_t$. Mathematically, we define the (absolute) temporal differences as\n", + "\n", "$$\\Delta_t = |pixel_t - pixel_{t-1}|$$\n", "\n", "In code, define these absolute temporal differences to compute `temporal_diff` by applying `np.diff` on the signal `sig` and then applying `np.abs` to get absolute values." @@ -1403,6 +1437,7 @@ }, "source": [ "What happens if we look at differences at longer delays $\\tau>1$?\n", + "\n", "$$\\Delta_t(\\tau) = |pixel_t - pixel_{t-\\tau}|$$" ] }, @@ -1569,7 +1604,7 @@ " plot_edge = lists2list( [[e1 , e2] for e1 , e2 in zip(plot_e1, plot_e2)])\n", " ax.plot(plot_edge, np.repeat(filter, 2), alpha = 0.3, color = 'purple')\n", " ax.scatter(plot_edge_mean, filter, color = 'purple')\n", - " add_labels(ax,ylabel = 'filter value', title = 'box filter', xlabel = 'Frame')" + " add_labels(ax,ylabel = 'Filter Value', title = 'Box Filter', xlabel = 'Value')" ] }, { @@ -1903,7 +1938,9 @@ }, "source": [ "The $\\ell_0$ pseudo-norm is defined as the number of non-zero features in the signal. Particularly, let $h \\in \\mathbb{R}^{J}$ be a vector with $J$ \"latent activity\" features. 
Then:\n", + "\n", "$$\\|h\\|_0 = \\sum_{j = 1}^J \\mathbb{1}_{h_{j} \\neq 0}$$\n", + "\n", "Hence, the $\\|\\ell\\|_0$ pseudo-norm can be used to promote sparsity by adding it to a cost function to \"punish\" the number of non-zero features.\n", "\n", "Let's assume that we have a simple linear model where we want to capture the observations $y$ using the linear model $D$ (which we will later call dictionary). $D$'s features (columns) can have sparse weights denoted by $h$. This is known as a generative model, as it generates the sensory input. \n", @@ -1911,10 +1948,13 @@ "For instance, in the brain, $D$ can represent a basis of neuronal networks while $h$ can capture their sparse time-changing contributions to the overall brain activity (e.g. see the dLDS model in [3]). \n", "\n", "Hence, we are looking for the weights $h$ under the assumption that:\n", + "\n", "$$ y = Dh + \\epsilon$$\n", + "\n", "where $\\epsilon$ is an *i.i.d* Gaussian noise with zero mean and std of $\\sigma_\\epsilon$, i.e., $\\epsilon \\sim \\mathcal{N}(0, \\sigma_\\epsilon^2)$.\n", "\n", "To enforce that $h$ is sparse, we penalize the number of non-zero features with penalty $\\lambda$. We thus want to solve the following minimization problem:\n", + "\n", "$$\n", "\\hat{h} = \\arg \\min_x \\|y - Dh \\|_2^2 + \\lambda \\|h\\|_0\n", "$$\n", diff --git a/tutorials/W1D5_Microcircuits/W1D5_Tutorial2.ipynb b/tutorials/W1D5_Microcircuits/W1D5_Tutorial2.ipynb index 68e2771af..62976eeb1 100644 --- a/tutorials/W1D5_Microcircuits/W1D5_Tutorial2.ipynb +++ b/tutorials/W1D5_Microcircuits/W1D5_Tutorial2.ipynb @@ -484,7 +484,111 @@ " numerator = x*g\n", " # Calculate normalized x\n", " xnorm = numerator/denominator\n", - " return xnorm" + " return xnorm\n", + "\n", + "# Exercise solutions for correct plot output\n", + "\n", + "class ReLUNet(nn.Module):\n", + " \"\"\"\n", + " ReLUNet architecture\n", + " Structure is as follows:\n", + " y = Σi(ai * ReLU(θi - x))\n", + " \"\"\"\n", + " # Define the structure of your network\n", + " def __init__(self, n_units):\n", + " \"\"\"\n", + " Args:\n", + " n_units (int): Number of hidden units\n", + "\n", + " Returns:\n", + " Nothing\n", + " \"\"\"\n", + " super(ReLUNet, self).__init__()\n", + " # Create input thresholds\n", + " self.input_threshold_weights = nn.Parameter(torch.abs(torch.randn(n_units)))\n", + " self.non_linearity = nn.ReLU()\n", + " self.output_layer = nn.Linear(n_units, 1)\n", + " nn.init.xavier_normal_(self.output_layer.weight)\n", + "\n", + " def forward(self, x):\n", + " \"\"\"\n", + " Args:\n", + " x: torch.Tensor\n", + " Input tensor of size ([1])\n", + " \"\"\"\n", + " op = self.input_threshold_weights - x #prepare the input to be passed through ReLU\n", + " op = self.non_linearity(op) #apply ReLU\n", + " op = self.output_layer(op) #run through output layer\n", + " return op\n", + "\n", + " # Choose the most likely label predicted by the network\n", + " def predict(self, x):\n", + " \"\"\"\n", + " Args:\n", + " x: torch.Tensor\n", + " Input tensor of size ([1])\n", + " \"\"\"\n", + " output = self.forward(x)\n", + " return output\n", + "\n", + "non_linearities = {\n", + " 'ReLU': nn.ReLU(),\n", + " 'ReLU6': nn.ReLU6(),\n", + " 'SoftPlus': nn.Softplus(),\n", + " 'Sigmoid': nn.Sigmoid(),\n", + " 'Tanh': nn.Tanh()\n", + "}\n", + "\n", + "def HardTanh(x):\n", + " \"\"\"\n", + " Calculates `tanh` output for the given input data.\n", + "\n", + " Inputs:\n", + " - x (np.ndarray): input data.\n", + "\n", + " Outputs:\n", + " - output (np.ndarray): `tanh(x)`.\n", + " 
\"\"\"\n", + " min_val = -1\n", + " max_val = 1\n", + " output = np.copy(x)\n", + " output[output>max_val] = max_val\n", + " output[output= 1+leak_slope, \\\n", + " (ycopy - np.sign(ycopy))/leak_slope, \\\n", + " ycopy/(1+leak_slope)\n", + " )\n", + " return output" ] }, { @@ -742,6 +846,11 @@ }, "outputs": [], "source": [ + "###################################################################\n", + "## Fill out the following then remove\n", + "raise NotImplementedError(\"Student exercise: complete forward pass.\")\n", + "###################################################################\n", + "\n", "class ReLUNet(nn.Module):\n", " \"\"\"\n", " ReLUNet architecture\n", @@ -770,10 +879,6 @@ " x: torch.Tensor\n", " Input tensor of size ([1])\n", " \"\"\"\n", - " ###################################################################\n", - " ## Fill out the following then remove\n", - " raise NotImplementedError(\"Student exercise: complete forward pass.\")\n", - " ###################################################################\n", " op = ... - ... #prepare the input to be passed through ReLU\n", " op = self.non_linearity(...) #apply ReLU\n", " op = ... #run through output layer\n", @@ -2979,7 +3084,7 @@ }, "outputs": [], "source": [ - "# @markdown Visualize the functions\n", + "# @title Visualize the functions\n", "\n", "# with plt.xkcd():\n", "fig, ax = plt.subplots(1, 3, figsize=(12, 4))\n", @@ -3028,7 +3133,7 @@ }, "outputs": [], "source": [ - "# @markdown Data\n", + "# @title Data\n", "\n", "def normalize(x, sigma, p, g):\n", " \"\"\"\n", @@ -3110,7 +3215,7 @@ }, "outputs": [], "source": [ - "# @markdown Visualize input\n", + "# @title Visualize input\n", "with plt.xkcd():\n", " sns.kdeplot(ynorm_sec31[:, 0], color='r', label='$(s \\mathbf{x})_{norm}$')\n", " sns.kdeplot(x_sec31[:, 0], color='k', label='$\\mathbf{x}$')\n", @@ -3140,7 +3245,7 @@ }, "outputs": [], "source": [ - "# @markdown Visualize noisy transmitted signal\n", + "# @title Visualize noisy transmitted signal\n", "with plt.xkcd():\n", " plt.figure(figsize=(7.5, 7.5))\n", " sns.kdeplot(LeakyHardTanh(y_sec31, leak_slope)[:, 0], linestyle='--', color='b', label=r'LeakyHardTanh$(s \\mathbf{x})$')\n", @@ -3170,7 +3275,7 @@ }, "outputs": [], "source": [ - "# @markdown Visualize estimated information\n", + "# @title Visualize estimated information\n", "with plt.xkcd():\n", " sns.kdeplot(estimateNormalized_x[:, 0], color='r', label='$\\mathbf{\\hat{x}}_{norm}$')\n", " sns.kdeplot(x_sec31[:, 0], color='k', label='$\\mathbf{x}$')\n", @@ -3199,7 +3304,7 @@ }, "outputs": [], "source": [ - "# @markdown Plot correlation between estimated information and true information\n", + "# @title Plot correlation between estimated information and true information\n", "\n", "with plt.xkcd():\n", " fig, ax = plt.subplots(1, 2, figsize=(15, 5))\n", @@ -3286,7 +3391,7 @@ }, "outputs": [], "source": [ - "# @markdown Effect of scaling normalization ($g$)\n", + "# @title Effect of scaling normalization ($g$)\n", "norm_scales = np.arange(0.01, 5, 0.01)\n", "improvements = []\n", "\n", diff --git a/tutorials/W1D5_Microcircuits/W1D5_Tutorial3.ipynb b/tutorials/W1D5_Microcircuits/W1D5_Tutorial3.ipynb index 38d373d8f..33a927558 100644 --- a/tutorials/W1D5_Microcircuits/W1D5_Tutorial3.ipynb +++ b/tutorials/W1D5_Microcircuits/W1D5_Tutorial3.ipynb @@ -1304,7 +1304,7 @@ "execution": {} }, "source": [ - "Now we plot the weights as a function of $\\mathbf{z}$. 
Manipulate the sliders to control the two attention gains $z_k$, which we've pre-assigned to two specific sigmoidal pattern vectors $\\mathbf{q}_k$. (Feel free to change those functions and see how the sliders change what they attend.) Observe how these sliders affect which part of the input is amplified or attenuated." + "Now we plot the weights as a function of $\\mathbf{z}$. Manipulate the sliders to control the two attention gains $z_k$, which we've pre-assigned to two specific sigmoidal pattern vectors $\\mathbf{q}_k$. (Feel free to change those functions and see how the sliders change what they attend.) Observe how these sliders affect which part of the input is amplified or attenuated. The x-axis represents the input variable to the functions that define $\\mathbf{q}_1$ and $\\mathbf{q}_2$ (observe that these are non-linear of this input)." ] }, { diff --git a/tutorials/W1D5_Microcircuits/instructor/W1D5_Tutorial1.ipynb b/tutorials/W1D5_Microcircuits/instructor/W1D5_Tutorial1.ipynb index 84a06f3b0..de399df9e 100644 --- a/tutorials/W1D5_Microcircuits/instructor/W1D5_Tutorial1.ipynb +++ b/tutorials/W1D5_Microcircuits/instructor/W1D5_Tutorial1.ipynb @@ -386,7 +386,7 @@ " ax.legend(ncol = 2)\n", " remove_edges(ax)\n", " ax.set_xlim(left = 0, right = 100)\n", - " add_labels(ax, xlabel = '$\\\\tau$', ylabel = 'Count')\n", + " add_labels(ax, xlabel = 'Value', ylabel = 'Count')\n", " plt.show()\n", "\n", "def plot_temp_diff_separate_histograms(signal, lags, lags_list, tau = True):\n", @@ -656,46 +656,6 @@ " plt.show()" ] }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "cellView": "form", - "execution": {} - }, - "outputs": [], - "source": [ - "# @title Helper functions\n", - "\n", - "def normalize(mat):\n", - " \"\"\"\n", - " Normalize input matrix from 0 to 255 values (in RGB range).\n", - "\n", - " Inputs:\n", - " - mat (np.ndarray): data to normalize.\n", - "\n", - " Outpus:\n", - " - (np.ndarray): normalized data.\n", - " \"\"\"\n", - " mat_norm = (mat - np.percentile(mat, 10))/(np.percentile(mat, 90) - np.percentile(mat, 10))\n", - " mat_norm = mat_norm*255\n", - " mat_norm[mat_norm > 255] = 255\n", - " mat_norm[mat_norm < 0] = 0\n", - " return mat_norm\n", - "\n", - "def lists2list(xss):\n", - " \"\"\"\n", - " Flatten a list of lists into a single list.\n", - "\n", - " Inputs:\n", - " - xss (list): list of lists. The list of lists to be flattened.\n", - "\n", - " Outputs:\n", - " - (list): The flattened list.\n", - " \"\"\"\n", - " return [x for xs in xss for x in xs]" - ] - }, { "cell_type": "code", "execution_count": null, @@ -738,6 +698,78 @@ " fid.write(r.content) # Write the downloaded content to a file" ] }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "cellView": "form", + "execution": {} + }, + "outputs": [], + "source": [ + "# @title Helper functions\n", + "\n", + "def normalize(mat):\n", + " \"\"\"\n", + " Normalize input matrix from 0 to 255 values (in RGB range).\n", + "\n", + " Inputs:\n", + " - mat (np.ndarray): data to normalize.\n", + "\n", + " Outpus:\n", + " - (np.ndarray): normalized data.\n", + " \"\"\"\n", + " mat_norm = (mat - np.percentile(mat, 10))/(np.percentile(mat, 90) - np.percentile(mat, 10))\n", + " mat_norm = mat_norm*255\n", + " mat_norm[mat_norm > 255] = 255\n", + " mat_norm[mat_norm < 0] = 0\n", + " return mat_norm\n", + "\n", + "def lists2list(xss):\n", + " \"\"\"\n", + " Flatten a list of lists into a single list.\n", + "\n", + " Inputs:\n", + " - xss (list): list of lists. 
The list of lists to be flattened.\n", + "\n", + " Outputs:\n", + " - (list): The flattened list.\n", + " \"\"\"\n", + " return [x for xs in xss for x in xs]\n", + "\n", + "# exercise solutions for correct plots\n", + "\n", + "def ReLU(x, theta = 0):\n", + " \"\"\"\n", + " Calculates ReLU function for the given level of theta.\n", + "\n", + " Inputs:\n", + " - x (np.ndarray): input data.\n", + " - theta (float, default = 0): threshold parameter.\n", + "\n", + " Outputs:\n", + " - thres_x (np.ndarray): filtered values.\n", + " \"\"\"\n", + "\n", + " thres_x = np.maximum(x - theta, 0)\n", + "\n", + " return thres_x\n", + "\n", + "sig = np.load('sig.npy')\n", + "temporal_diff = np.abs(np.diff(sig))\n", + "\n", + "num_taus = 10\n", + "taus = np.linspace(1, 91, num_taus).astype(int)\n", + "taus_list = [np.abs(sig[tau:] - sig[:-tau]) for tau in taus]\n", + "\n", + "T_ar = np.arange(len(sig))\n", + "\n", + "freqs = np.linspace(0.001, 1, 100)\n", + "set_sigs = [np.sin(T_ar*f) for f in freqs]\n", + "\n", + "reg = OrthogonalMatchingPursuit(fit_intercept = True, n_nonzero_coefs = 10).fit(np.vstack(set_sigs).T, sig)" + ] + }, { "cell_type": "markdown", "metadata": { @@ -1055,6 +1087,11 @@ }, "source": [ "```python\n", + "###################################################################\n", + "## Fill out the following then remove\n", + "raise NotImplementedError(\"Student exercise: complete `thres_x` array calculation as defined.\")\n", + "###################################################################\n", + "\n", "def ReLU(x, theta = 0):\n", " \"\"\"\n", " Calculates ReLU function for the given level of theta.\n", @@ -1066,10 +1103,6 @@ " Outputs:\n", " - thres_x (np.ndarray): filtered values.\n", " \"\"\"\n", - " ###################################################################\n", - " ## Fill out the following then remove\n", - " raise NotImplementedError(\"Student exercise: complete `thres_x` array calculation as defined.\")\n", - " ###################################################################\n", "\n", " thres_x = ...\n", "\n", @@ -1296,6 +1329,7 @@ "source": [ "\n", "Denote the pixel value at time $t$ by $pixel_t$. Mathematically, we define the (absolute) temporal differences as\n", + "\n", "$$\\Delta_t = |pixel_t - pixel_{t-1}|$$\n", "\n", "In code, define these absolute temporal differences to compute `temporal_diff` by applying `np.diff` on the signal `sig` and then applying `np.abs` to get absolute values." @@ -1407,6 +1441,7 @@ }, "source": [ "What happens if we look at differences at longer delays $\\tau>1$?\n", + "\n", "$$\\Delta_t(\\tau) = |pixel_t - pixel_{t-\\tau}|$$" ] }, @@ -1575,7 +1610,7 @@ " plot_edge = lists2list( [[e1 , e2] for e1 , e2 in zip(plot_e1, plot_e2)])\n", " ax.plot(plot_edge, np.repeat(filter, 2), alpha = 0.3, color = 'purple')\n", " ax.scatter(plot_edge_mean, filter, color = 'purple')\n", - " add_labels(ax,ylabel = 'filter value', title = 'box filter', xlabel = 'Frame')" + " add_labels(ax,ylabel = 'Filter Value', title = 'Box Filter', xlabel = 'Value')" ] }, { @@ -1909,7 +1944,9 @@ }, "source": [ "The $\\ell_0$ pseudo-norm is defined as the number of non-zero features in the signal. Particularly, let $h \\in \\mathbb{R}^{J}$ be a vector with $J$ \"latent activity\" features. 
Then:\n", + "\n", "$$\\|h\\|_0 = \\sum_{j = 1}^J \\mathbb{1}_{h_{j} \\neq 0}$$\n", + "\n", "Hence, the $\\|\\ell\\|_0$ pseudo-norm can be used to promote sparsity by adding it to a cost function to \"punish\" the number of non-zero features.\n", "\n", "Let's assume that we have a simple linear model where we want to capture the observations $y$ using the linear model $D$ (which we will later call dictionary). $D$'s features (columns) can have sparse weights denoted by $h$. This is known as a generative model, as it generates the sensory input. \n", @@ -1917,10 +1954,13 @@ "For instance, in the brain, $D$ can represent a basis of neuronal networks while $h$ can capture their sparse time-changing contributions to the overall brain activity (e.g. see the dLDS model in [3]). \n", "\n", "Hence, we are looking for the weights $h$ under the assumption that:\n", + "\n", "$$ y = Dh + \\epsilon$$\n", + "\n", "where $\\epsilon$ is an *i.i.d* Gaussian noise with zero mean and std of $\\sigma_\\epsilon$, i.e., $\\epsilon \\sim \\mathcal{N}(0, \\sigma_\\epsilon^2)$.\n", "\n", "To enforce that $h$ is sparse, we penalize the number of non-zero features with penalty $\\lambda$. We thus want to solve the following minimization problem:\n", + "\n", "$$\n", "\\hat{h} = \\arg \\min_x \\|y - Dh \\|_2^2 + \\lambda \\|h\\|_0\n", "$$\n", diff --git a/tutorials/W1D5_Microcircuits/instructor/W1D5_Tutorial2.ipynb b/tutorials/W1D5_Microcircuits/instructor/W1D5_Tutorial2.ipynb index cbd53c669..7b805c10d 100644 --- a/tutorials/W1D5_Microcircuits/instructor/W1D5_Tutorial2.ipynb +++ b/tutorials/W1D5_Microcircuits/instructor/W1D5_Tutorial2.ipynb @@ -484,7 +484,111 @@ " numerator = x*g\n", " # Calculate normalized x\n", " xnorm = numerator/denominator\n", - " return xnorm" + " return xnorm\n", + "\n", + "# Exercise solutions for correct plot output\n", + "\n", + "class ReLUNet(nn.Module):\n", + " \"\"\"\n", + " ReLUNet architecture\n", + " Structure is as follows:\n", + " y = Σi(ai * ReLU(θi - x))\n", + " \"\"\"\n", + " # Define the structure of your network\n", + " def __init__(self, n_units):\n", + " \"\"\"\n", + " Args:\n", + " n_units (int): Number of hidden units\n", + "\n", + " Returns:\n", + " Nothing\n", + " \"\"\"\n", + " super(ReLUNet, self).__init__()\n", + " # Create input thresholds\n", + " self.input_threshold_weights = nn.Parameter(torch.abs(torch.randn(n_units)))\n", + " self.non_linearity = nn.ReLU()\n", + " self.output_layer = nn.Linear(n_units, 1)\n", + " nn.init.xavier_normal_(self.output_layer.weight)\n", + "\n", + " def forward(self, x):\n", + " \"\"\"\n", + " Args:\n", + " x: torch.Tensor\n", + " Input tensor of size ([1])\n", + " \"\"\"\n", + " op = self.input_threshold_weights - x #prepare the input to be passed through ReLU\n", + " op = self.non_linearity(op) #apply ReLU\n", + " op = self.output_layer(op) #run through output layer\n", + " return op\n", + "\n", + " # Choose the most likely label predicted by the network\n", + " def predict(self, x):\n", + " \"\"\"\n", + " Args:\n", + " x: torch.Tensor\n", + " Input tensor of size ([1])\n", + " \"\"\"\n", + " output = self.forward(x)\n", + " return output\n", + "\n", + "non_linearities = {\n", + " 'ReLU': nn.ReLU(),\n", + " 'ReLU6': nn.ReLU6(),\n", + " 'SoftPlus': nn.Softplus(),\n", + " 'Sigmoid': nn.Sigmoid(),\n", + " 'Tanh': nn.Tanh()\n", + "}\n", + "\n", + "def HardTanh(x):\n", + " \"\"\"\n", + " Calculates `tanh` output for the given input data.\n", + "\n", + " Inputs:\n", + " - x (np.ndarray): input data.\n", + "\n", + " Outputs:\n", + " - 
output (np.ndarray): `tanh(x)`.\n", + " \"\"\"\n", + " min_val = -1\n", + " max_val = 1\n", + " output = np.copy(x)\n", + " output[output>max_val] = max_val\n", + " output[output= 1+leak_slope, \\\n", + " (ycopy - np.sign(ycopy))/leak_slope, \\\n", + " ycopy/(1+leak_slope)\n", + " )\n", + " return output" ] }, { @@ -742,6 +846,11 @@ }, "source": [ "```python\n", + "###################################################################\n", + "## Fill out the following then remove\n", + "raise NotImplementedError(\"Student exercise: complete forward pass.\")\n", + "###################################################################\n", + "\n", "class ReLUNet(nn.Module):\n", " \"\"\"\n", " ReLUNet architecture\n", @@ -770,10 +879,6 @@ " x: torch.Tensor\n", " Input tensor of size ([1])\n", " \"\"\"\n", - " ###################################################################\n", - " ## Fill out the following then remove\n", - " raise NotImplementedError(\"Student exercise: complete forward pass.\")\n", - " ###################################################################\n", " op = ... - ... #prepare the input to be passed through ReLU\n", " op = self.non_linearity(...) #apply ReLU\n", " op = ... #run through output layer\n", @@ -2989,7 +3094,7 @@ }, "outputs": [], "source": [ - "# @markdown Visualize the functions\n", + "# @title Visualize the functions\n", "\n", "# with plt.xkcd():\n", "fig, ax = plt.subplots(1, 3, figsize=(12, 4))\n", @@ -3038,7 +3143,7 @@ }, "outputs": [], "source": [ - "# @markdown Data\n", + "# @title Data\n", "\n", "def normalize(x, sigma, p, g):\n", " \"\"\"\n", @@ -3120,7 +3225,7 @@ }, "outputs": [], "source": [ - "# @markdown Visualize input\n", + "# @title Visualize input\n", "with plt.xkcd():\n", " sns.kdeplot(ynorm_sec31[:, 0], color='r', label='$(s \\mathbf{x})_{norm}$')\n", " sns.kdeplot(x_sec31[:, 0], color='k', label='$\\mathbf{x}$')\n", @@ -3150,7 +3255,7 @@ }, "outputs": [], "source": [ - "# @markdown Visualize noisy transmitted signal\n", + "# @title Visualize noisy transmitted signal\n", "with plt.xkcd():\n", " plt.figure(figsize=(7.5, 7.5))\n", " sns.kdeplot(LeakyHardTanh(y_sec31, leak_slope)[:, 0], linestyle='--', color='b', label=r'LeakyHardTanh$(s \\mathbf{x})$')\n", @@ -3180,7 +3285,7 @@ }, "outputs": [], "source": [ - "# @markdown Visualize estimated information\n", + "# @title Visualize estimated information\n", "with plt.xkcd():\n", " sns.kdeplot(estimateNormalized_x[:, 0], color='r', label='$\\mathbf{\\hat{x}}_{norm}$')\n", " sns.kdeplot(x_sec31[:, 0], color='k', label='$\\mathbf{x}$')\n", @@ -3209,7 +3314,7 @@ }, "outputs": [], "source": [ - "# @markdown Plot correlation between estimated information and true information\n", + "# @title Plot correlation between estimated information and true information\n", "\n", "with plt.xkcd():\n", " fig, ax = plt.subplots(1, 2, figsize=(15, 5))\n", @@ -3296,7 +3401,7 @@ }, "outputs": [], "source": [ - "# @markdown Effect of scaling normalization ($g$)\n", + "# @title Effect of scaling normalization ($g$)\n", "norm_scales = np.arange(0.01, 5, 0.01)\n", "improvements = []\n", "\n", diff --git a/tutorials/W1D5_Microcircuits/instructor/W1D5_Tutorial3.ipynb b/tutorials/W1D5_Microcircuits/instructor/W1D5_Tutorial3.ipynb index dff45f2db..042923746 100644 --- a/tutorials/W1D5_Microcircuits/instructor/W1D5_Tutorial3.ipynb +++ b/tutorials/W1D5_Microcircuits/instructor/W1D5_Tutorial3.ipynb @@ -1306,7 +1306,7 @@ "execution": {} }, "source": [ - "Now we plot the weights as a function of $\\mathbf{z}$. 
Manipulate the sliders to control the two attention gains $z_k$, which we've pre-assigned to two specific sigmoidal pattern vectors $\\mathbf{q}_k$. (Feel free to change those functions and see how the sliders change what they attend.) Observe how these sliders affect which part of the input is amplified or attenuated." + "Now we plot the weights as a function of $\\mathbf{z}$. Manipulate the sliders to control the two attention gains $z_k$, which we've pre-assigned to two specific sigmoidal pattern vectors $\\mathbf{q}_k$. (Feel free to change those functions and see how the sliders change what they attend.) Observe how these sliders affect which part of the input is amplified or attenuated. The x-axis represents the input variable to the functions that define $\\mathbf{q}_1$ and $\\mathbf{q}_2$ (observe that these are non-linear of this input)." ] }, { diff --git a/tutorials/W1D5_Microcircuits/student/W1D5_Tutorial1.ipynb b/tutorials/W1D5_Microcircuits/student/W1D5_Tutorial1.ipynb index 21261af2b..49d2429ac 100644 --- a/tutorials/W1D5_Microcircuits/student/W1D5_Tutorial1.ipynb +++ b/tutorials/W1D5_Microcircuits/student/W1D5_Tutorial1.ipynb @@ -386,7 +386,7 @@ " ax.legend(ncol = 2)\n", " remove_edges(ax)\n", " ax.set_xlim(left = 0, right = 100)\n", - " add_labels(ax, xlabel = '$\\\\tau$', ylabel = 'Count')\n", + " add_labels(ax, xlabel = 'Value', ylabel = 'Count')\n", " plt.show()\n", "\n", "def plot_temp_diff_separate_histograms(signal, lags, lags_list, tau = True):\n", @@ -656,46 +656,6 @@ " plt.show()" ] }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "cellView": "form", - "execution": {} - }, - "outputs": [], - "source": [ - "# @title Helper functions\n", - "\n", - "def normalize(mat):\n", - " \"\"\"\n", - " Normalize input matrix from 0 to 255 values (in RGB range).\n", - "\n", - " Inputs:\n", - " - mat (np.ndarray): data to normalize.\n", - "\n", - " Outpus:\n", - " - (np.ndarray): normalized data.\n", - " \"\"\"\n", - " mat_norm = (mat - np.percentile(mat, 10))/(np.percentile(mat, 90) - np.percentile(mat, 10))\n", - " mat_norm = mat_norm*255\n", - " mat_norm[mat_norm > 255] = 255\n", - " mat_norm[mat_norm < 0] = 0\n", - " return mat_norm\n", - "\n", - "def lists2list(xss):\n", - " \"\"\"\n", - " Flatten a list of lists into a single list.\n", - "\n", - " Inputs:\n", - " - xss (list): list of lists. The list of lists to be flattened.\n", - "\n", - " Outputs:\n", - " - (list): The flattened list.\n", - " \"\"\"\n", - " return [x for xs in xss for x in xs]" - ] - }, { "cell_type": "code", "execution_count": null, @@ -738,6 +698,78 @@ " fid.write(r.content) # Write the downloaded content to a file" ] }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "cellView": "form", + "execution": {} + }, + "outputs": [], + "source": [ + "# @title Helper functions\n", + "\n", + "def normalize(mat):\n", + " \"\"\"\n", + " Normalize input matrix from 0 to 255 values (in RGB range).\n", + "\n", + " Inputs:\n", + " - mat (np.ndarray): data to normalize.\n", + "\n", + " Outpus:\n", + " - (np.ndarray): normalized data.\n", + " \"\"\"\n", + " mat_norm = (mat - np.percentile(mat, 10))/(np.percentile(mat, 90) - np.percentile(mat, 10))\n", + " mat_norm = mat_norm*255\n", + " mat_norm[mat_norm > 255] = 255\n", + " mat_norm[mat_norm < 0] = 0\n", + " return mat_norm\n", + "\n", + "def lists2list(xss):\n", + " \"\"\"\n", + " Flatten a list of lists into a single list.\n", + "\n", + " Inputs:\n", + " - xss (list): list of lists. 
The list of lists to be flattened.\n", + "\n", + " Outputs:\n", + " - (list): The flattened list.\n", + " \"\"\"\n", + " return [x for xs in xss for x in xs]\n", + "\n", + "# exercise solutions for correct plots\n", + "\n", + "def ReLU(x, theta = 0):\n", + " \"\"\"\n", + " Calculates ReLU function for the given level of theta.\n", + "\n", + " Inputs:\n", + " - x (np.ndarray): input data.\n", + " - theta (float, default = 0): threshold parameter.\n", + "\n", + " Outputs:\n", + " - thres_x (np.ndarray): filtered values.\n", + " \"\"\"\n", + "\n", + " thres_x = np.maximum(x - theta, 0)\n", + "\n", + " return thres_x\n", + "\n", + "sig = np.load('sig.npy')\n", + "temporal_diff = np.abs(np.diff(sig))\n", + "\n", + "num_taus = 10\n", + "taus = np.linspace(1, 91, num_taus).astype(int)\n", + "taus_list = [np.abs(sig[tau:] - sig[:-tau]) for tau in taus]\n", + "\n", + "T_ar = np.arange(len(sig))\n", + "\n", + "freqs = np.linspace(0.001, 1, 100)\n", + "set_sigs = [np.sin(T_ar*f) for f in freqs]\n", + "\n", + "reg = OrthogonalMatchingPursuit(fit_intercept = True, n_nonzero_coefs = 10).fit(np.vstack(set_sigs).T, sig)" + ] + }, { "cell_type": "markdown", "metadata": { @@ -1055,6 +1087,11 @@ }, "outputs": [], "source": [ + "###################################################################\n", + "## Fill out the following then remove\n", + "raise NotImplementedError(\"Student exercise: complete `thres_x` array calculation as defined.\")\n", + "###################################################################\n", + "\n", "def ReLU(x, theta = 0):\n", " \"\"\"\n", " Calculates ReLU function for the given level of theta.\n", @@ -1066,10 +1103,6 @@ " Outputs:\n", " - thres_x (np.ndarray): filtered values.\n", " \"\"\"\n", - " ###################################################################\n", - " ## Fill out the following then remove\n", - " raise NotImplementedError(\"Student exercise: complete `thres_x` array calculation as defined.\")\n", - " ###################################################################\n", "\n", " thres_x = ...\n", "\n", @@ -1279,6 +1312,7 @@ "source": [ "\n", "Denote the pixel value at time $t$ by $pixel_t$. Mathematically, we define the (absolute) temporal differences as\n", + "\n", "$$\\Delta_t = |pixel_t - pixel_{t-1}|$$\n", "\n", "In code, define these absolute temporal differences to compute `temporal_diff` by applying `np.diff` on the signal `sig` and then applying `np.abs` to get absolute values." @@ -1387,6 +1421,7 @@ }, "source": [ "What happens if we look at differences at longer delays $\\tau>1$?\n", + "\n", "$$\\Delta_t(\\tau) = |pixel_t - pixel_{t-\\tau}|$$" ] }, @@ -1546,7 +1581,7 @@ " plot_edge = lists2list( [[e1 , e2] for e1 , e2 in zip(plot_e1, plot_e2)])\n", " ax.plot(plot_edge, np.repeat(filter, 2), alpha = 0.3, color = 'purple')\n", " ax.scatter(plot_edge_mean, filter, color = 'purple')\n", - " add_labels(ax,ylabel = 'filter value', title = 'box filter', xlabel = 'Frame')" + " add_labels(ax,ylabel = 'Filter Value', title = 'Box Filter', xlabel = 'Value')" ] }, { @@ -1866,7 +1901,9 @@ }, "source": [ "The $\\ell_0$ pseudo-norm is defined as the number of non-zero features in the signal. Particularly, let $h \\in \\mathbb{R}^{J}$ be a vector with $J$ \"latent activity\" features. 
Then:\n", + "\n", "$$\\|h\\|_0 = \\sum_{j = 1}^J \\mathbb{1}_{h_{j} \\neq 0}$$\n", + "\n", "Hence, the $\\|\\ell\\|_0$ pseudo-norm can be used to promote sparsity by adding it to a cost function to \"punish\" the number of non-zero features.\n", "\n", "Let's assume that we have a simple linear model where we want to capture the observations $y$ using the linear model $D$ (which we will later call dictionary). $D$'s features (columns) can have sparse weights denoted by $h$. This is known as a generative model, as it generates the sensory input. \n", @@ -1874,10 +1911,13 @@ "For instance, in the brain, $D$ can represent a basis of neuronal networks while $h$ can capture their sparse time-changing contributions to the overall brain activity (e.g. see the dLDS model in [3]). \n", "\n", "Hence, we are looking for the weights $h$ under the assumption that:\n", + "\n", "$$ y = Dh + \\epsilon$$\n", + "\n", "where $\\epsilon$ is an *i.i.d* Gaussian noise with zero mean and std of $\\sigma_\\epsilon$, i.e., $\\epsilon \\sim \\mathcal{N}(0, \\sigma_\\epsilon^2)$.\n", "\n", "To enforce that $h$ is sparse, we penalize the number of non-zero features with penalty $\\lambda$. We thus want to solve the following minimization problem:\n", + "\n", "$$\n", "\\hat{h} = \\arg \\min_x \\|y - Dh \\|_2^2 + \\lambda \\|h\\|_0\n", "$$\n", diff --git a/tutorials/W1D5_Microcircuits/student/W1D5_Tutorial2.ipynb b/tutorials/W1D5_Microcircuits/student/W1D5_Tutorial2.ipynb index 86896eaf7..f7e3a073e 100644 --- a/tutorials/W1D5_Microcircuits/student/W1D5_Tutorial2.ipynb +++ b/tutorials/W1D5_Microcircuits/student/W1D5_Tutorial2.ipynb @@ -484,7 +484,111 @@ " numerator = x*g\n", " # Calculate normalized x\n", " xnorm = numerator/denominator\n", - " return xnorm" + " return xnorm\n", + "\n", + "# Exercise solutions for correct plot output\n", + "\n", + "class ReLUNet(nn.Module):\n", + " \"\"\"\n", + " ReLUNet architecture\n", + " Structure is as follows:\n", + " y = Σi(ai * ReLU(θi - x))\n", + " \"\"\"\n", + " # Define the structure of your network\n", + " def __init__(self, n_units):\n", + " \"\"\"\n", + " Args:\n", + " n_units (int): Number of hidden units\n", + "\n", + " Returns:\n", + " Nothing\n", + " \"\"\"\n", + " super(ReLUNet, self).__init__()\n", + " # Create input thresholds\n", + " self.input_threshold_weights = nn.Parameter(torch.abs(torch.randn(n_units)))\n", + " self.non_linearity = nn.ReLU()\n", + " self.output_layer = nn.Linear(n_units, 1)\n", + " nn.init.xavier_normal_(self.output_layer.weight)\n", + "\n", + " def forward(self, x):\n", + " \"\"\"\n", + " Args:\n", + " x: torch.Tensor\n", + " Input tensor of size ([1])\n", + " \"\"\"\n", + " op = self.input_threshold_weights - x #prepare the input to be passed through ReLU\n", + " op = self.non_linearity(op) #apply ReLU\n", + " op = self.output_layer(op) #run through output layer\n", + " return op\n", + "\n", + " # Choose the most likely label predicted by the network\n", + " def predict(self, x):\n", + " \"\"\"\n", + " Args:\n", + " x: torch.Tensor\n", + " Input tensor of size ([1])\n", + " \"\"\"\n", + " output = self.forward(x)\n", + " return output\n", + "\n", + "non_linearities = {\n", + " 'ReLU': nn.ReLU(),\n", + " 'ReLU6': nn.ReLU6(),\n", + " 'SoftPlus': nn.Softplus(),\n", + " 'Sigmoid': nn.Sigmoid(),\n", + " 'Tanh': nn.Tanh()\n", + "}\n", + "\n", + "def HardTanh(x):\n", + " \"\"\"\n", + " Calculates `tanh` output for the given input data.\n", + "\n", + " Inputs:\n", + " - x (np.ndarray): input data.\n", + "\n", + " Outputs:\n", + " - output 
(np.ndarray): `tanh(x)`.\n", + " \"\"\"\n", + " min_val = -1\n", + " max_val = 1\n", + " output = np.copy(x)\n", + " output[output>max_val] = max_val\n", + " output[output= 1+leak_slope, \\\n", + " (ycopy - np.sign(ycopy))/leak_slope, \\\n", + " ycopy/(1+leak_slope)\n", + " )\n", + " return output" ] }, { @@ -742,6 +846,11 @@ }, "outputs": [], "source": [ + "###################################################################\n", + "## Fill out the following then remove\n", + "raise NotImplementedError(\"Student exercise: complete forward pass.\")\n", + "###################################################################\n", + "\n", "class ReLUNet(nn.Module):\n", " \"\"\"\n", " ReLUNet architecture\n", @@ -770,10 +879,6 @@ " x: torch.Tensor\n", " Input tensor of size ([1])\n", " \"\"\"\n", - " ###################################################################\n", - " ## Fill out the following then remove\n", - " raise NotImplementedError(\"Student exercise: complete forward pass.\")\n", - " ###################################################################\n", " op = ... - ... #prepare the input to be passed through ReLU\n", " op = self.non_linearity(...) #apply ReLU\n", " op = ... #run through output layer\n", @@ -2835,7 +2940,7 @@ }, "outputs": [], "source": [ - "# @markdown Visualize the functions\n", + "# @title Visualize the functions\n", "\n", "# with plt.xkcd():\n", "fig, ax = plt.subplots(1, 3, figsize=(12, 4))\n", @@ -2884,7 +2989,7 @@ }, "outputs": [], "source": [ - "# @markdown Data\n", + "# @title Data\n", "\n", "def normalize(x, sigma, p, g):\n", " \"\"\"\n", @@ -2966,7 +3071,7 @@ }, "outputs": [], "source": [ - "# @markdown Visualize input\n", + "# @title Visualize input\n", "with plt.xkcd():\n", " sns.kdeplot(ynorm_sec31[:, 0], color='r', label='$(s \\mathbf{x})_{norm}$')\n", " sns.kdeplot(x_sec31[:, 0], color='k', label='$\\mathbf{x}$')\n", @@ -2996,7 +3101,7 @@ }, "outputs": [], "source": [ - "# @markdown Visualize noisy transmitted signal\n", + "# @title Visualize noisy transmitted signal\n", "with plt.xkcd():\n", " plt.figure(figsize=(7.5, 7.5))\n", " sns.kdeplot(LeakyHardTanh(y_sec31, leak_slope)[:, 0], linestyle='--', color='b', label=r'LeakyHardTanh$(s \\mathbf{x})$')\n", @@ -3026,7 +3131,7 @@ }, "outputs": [], "source": [ - "# @markdown Visualize estimated information\n", + "# @title Visualize estimated information\n", "with plt.xkcd():\n", " sns.kdeplot(estimateNormalized_x[:, 0], color='r', label='$\\mathbf{\\hat{x}}_{norm}$')\n", " sns.kdeplot(x_sec31[:, 0], color='k', label='$\\mathbf{x}$')\n", @@ -3055,7 +3160,7 @@ }, "outputs": [], "source": [ - "# @markdown Plot correlation between estimated information and true information\n", + "# @title Plot correlation between estimated information and true information\n", "\n", "with plt.xkcd():\n", " fig, ax = plt.subplots(1, 2, figsize=(15, 5))\n", @@ -3142,7 +3247,7 @@ }, "outputs": [], "source": [ - "# @markdown Effect of scaling normalization ($g$)\n", + "# @title Effect of scaling normalization ($g$)\n", "norm_scales = np.arange(0.01, 5, 0.01)\n", "improvements = []\n", "\n", diff --git a/tutorials/W1D5_Microcircuits/student/W1D5_Tutorial3.ipynb b/tutorials/W1D5_Microcircuits/student/W1D5_Tutorial3.ipynb index 120f4ec12..46a177de4 100644 --- a/tutorials/W1D5_Microcircuits/student/W1D5_Tutorial3.ipynb +++ b/tutorials/W1D5_Microcircuits/student/W1D5_Tutorial3.ipynb @@ -1284,7 +1284,7 @@ "execution": {} }, "source": [ - "Now we plot the weights as a function of $\\mathbf{z}$. 
Manipulate the sliders to control the two attention gains $z_k$, which we've pre-assigned to two specific sigmoidal pattern vectors $\mathbf{q}_k$. (Feel free to change those functions and see how the sliders change what they attend to.) Observe how these sliders affect which part of the input is amplified or attenuated. The x-axis represents the input variable to the functions that define $\mathbf{q}_1$ and $\mathbf{q}_2$ (observe that these are non-linear functions of this input)." ] }, {
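For reference, the helper cell added in Tutorial 1 ends by fitting the pixel signal with scikit-learn's `OrthogonalMatchingPursuit`, a greedy approximation to the $\ell_0$-penalized problem $\hat{h} = \arg \min_h \|y - Dh\|_2^2 + \lambda \|h\|_0$ discussed in that tutorial's markdown. A minimal, self-contained sketch of that fit follows; the synthetic `sig` here is an assumed stand-in for the tutorial's `sig.npy`, and the sinusoidal dictionary mirrors the `set_sigs` construction in the helper cell.

```python
import numpy as np
from sklearn.linear_model import OrthogonalMatchingPursuit

# Toy signal standing in for the tutorial's recorded pixel trace (assumption)
T = 500
t = np.arange(T)
rng = np.random.default_rng(0)
sig = np.sin(0.05 * t) + 0.5 * np.sin(0.21 * t) + 0.1 * rng.standard_normal(T)

# Dictionary D: columns are sinusoids on a frequency grid, as in the helper cell
freqs = np.linspace(0.001, 1, 100)
D = np.stack([np.sin(t * f) for f in freqs], axis=1)   # shape (T, 100)

# Greedy solve of  argmin_h ||y - D h||_2^2  subject to  ||h||_0 <= 10
omp = OrthogonalMatchingPursuit(n_nonzero_coefs=10, fit_intercept=True).fit(D, sig)
h_hat = omp.coef_                                       # sparse weights, at most 10 non-zero
print("non-zero weights:", np.count_nonzero(h_hat))
print("reconstruction R^2:", omp.score(D, sig))
```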