diff --git a/markov_chain_monte_carlo/markov_chain_monte_carlo.Rmd b/markov_chain_monte_carlo/markov_chain_monte_carlo.Rmd
index d78ea32..84bb14d 100644
--- a/markov_chain_monte_carlo/markov_chain_monte_carlo.Rmd
+++ b/markov_chain_monte_carlo/markov_chain_monte_carlo.Rmd
@@ -442,7 +442,7 @@ points(c(q0[1], q1[1], q2[1]), c(q0[2], q1[2], q2[2]), col=c_light, pch=16, cex=
 _Iterating_ Markov transitions,
 $$
 \begin{align*}
-\tilde{q}_{1} &\sim T(q_{2} \mid q_{0})
+\tilde{q}_{1} &\sim T(q_{1} \mid q_{0})
 \\
 \tilde{q}_{2} &\sim T(q_{2} \mid \tilde{q}_{1})
 \\
@@ -2859,7 +2859,7 @@ $$
 \pi(q) =
 \text{normal}(\varpi_{1}(q) ; 1, 1)
 \cdot
-\text{normal}(\varpi_{1}(q) ; -1, 1).
+\text{normal}(\varpi_{2}(q) ; -1, 1).
 $$
 One advantage of this example is that the component means and variances are
 given immediately by the locations and scales, which allows us to compare Markov
diff --git a/modeling_sparsity/modeling_sparsity.Rmd b/modeling_sparsity/modeling_sparsity.Rmd
index 431dd63..13ac2cb 100644
--- a/modeling_sparsity/modeling_sparsity.Rmd
+++ b/modeling_sparsity/modeling_sparsity.Rmd
@@ -391,7 +391,7 @@ hist(samples$tau, breaks=seq(0, 50, 0.5), main="",
      col=c_dark, border=c_dark_highlight, add=T)
 ```
 
-This balance, however, is still too much large for the parameters with small
+This balance, however, is still too large for the parameters with small
 true values and a bit too small for the parameters with large true values.
 Because the balance favors the larger scale the over-regularization isn't too
 bad.
@@ -1310,7 +1310,7 @@ for (k in 1:9) {
 }
 ```
 
-Unfortunately the inferred scale is too large small enough to narrow the
+Unfortunately the inferred scale is too large to narrow the
 marginal posterior distributions of the small parameters below $\sigma = 0.5$.
 
 ```{r}
diff --git a/modeling_sparsity/stan_programs/cauchy_narrow.stan b/modeling_sparsity/stan_programs/cauchy_narrow.stan
index ad9d5da..817985e 100644
--- a/modeling_sparsity/stan_programs/cauchy_narrow.stan
+++ b/modeling_sparsity/stan_programs/cauchy_narrow.stan
@@ -7,12 +7,12 @@ data {
 }
 
 parameters {
-  // Horseshoe parameters
+  // Cauchy parameters
   vector[K] theta;
 }
 
 model {
-  // Horseshoe prior model
+  // Cauchy prior model
   theta ~ cauchy(0, 0.1);
 
   // Observational model
diff --git a/modeling_sparsity/stan_programs/cauchy_wide.stan b/modeling_sparsity/stan_programs/cauchy_wide.stan
index d297ebe..027eb35 100644
--- a/modeling_sparsity/stan_programs/cauchy_wide.stan
+++ b/modeling_sparsity/stan_programs/cauchy_wide.stan
@@ -7,12 +7,12 @@ data {
 }
 
 parameters {
-  // Horseshoe parameters
+  // Cauchy parameters
   vector[K] theta;
 }
 
 model {
-  // Horseshoe prior model
+  // Cauchy prior model
   theta ~ cauchy(0, 10);
 
   // Observational model
diff --git a/modeling_sparsity/stan_programs/hier_cauchy_cp.stan b/modeling_sparsity/stan_programs/hier_cauchy_cp.stan
index 732bc2f..0540f5e 100644
--- a/modeling_sparsity/stan_programs/hier_cauchy_cp.stan
+++ b/modeling_sparsity/stan_programs/hier_cauchy_cp.stan
@@ -7,13 +7,13 @@ data {
 }
 
 parameters {
-  // Horseshoe parameters
+  // Cauchy parameters
   vector[K] theta;
   real tau;
 }
 
 model {
-  // Horseshoe prior model
+  // Cauchy prior model
   theta ~ cauchy(0, tau);
   tau ~ normal(0, 10);
 
diff --git a/modeling_sparsity/stan_programs/hier_laplace_cp.stan b/modeling_sparsity/stan_programs/hier_laplace_cp.stan
index b2fe0b5..19172e8 100644
--- a/modeling_sparsity/stan_programs/hier_laplace_cp.stan
+++ b/modeling_sparsity/stan_programs/hier_laplace_cp.stan
@@ -7,13 +7,13 @@ data {
 }
 
 parameters {
-  // Horseshoe parameters
+  // Laplace parameters
   vector[K] theta;
   real tau;
 }
 
 model {
-  // Horseshoe prior model
+  // Laplace prior model
   theta ~ double_exponential(0, tau);
   tau ~ normal(0, 10);
 
diff --git a/modeling_sparsity/stan_programs/laplace_narrow.stan b/modeling_sparsity/stan_programs/laplace_narrow.stan
index d6ab939..686034e 100644
--- a/modeling_sparsity/stan_programs/laplace_narrow.stan
+++ b/modeling_sparsity/stan_programs/laplace_narrow.stan
@@ -7,12 +7,12 @@ data {
 }
 
 parameters {
-  // Horseshoe parameters
+  // Laplace parameters
   vector[K] theta;
 }
 
 model {
-  // Horseshoe prior model
+  // Laplace prior model
   theta ~ double_exponential(0, 0.1);
 
   // Observational model
diff --git a/modeling_sparsity/stan_programs/laplace_wide.stan b/modeling_sparsity/stan_programs/laplace_wide.stan
index 07a6ecf..5967c8a 100644
--- a/modeling_sparsity/stan_programs/laplace_wide.stan
+++ b/modeling_sparsity/stan_programs/laplace_wide.stan
@@ -7,12 +7,12 @@ data {
 }
 
 parameters {
-  // Horseshoe parameters
+  // Laplace parameters
   vector[K] theta;
 }
 
 model {
-  // Horseshoe prior model
+  // Laplace prior model
   theta ~ double_exponential(0, 10);
 
   // Observational model
diff --git a/modeling_sparsity/stan_programs/normal_narrow.stan b/modeling_sparsity/stan_programs/normal_narrow.stan
index bd27f90..dc0226a 100644
--- a/modeling_sparsity/stan_programs/normal_narrow.stan
+++ b/modeling_sparsity/stan_programs/normal_narrow.stan
@@ -7,12 +7,12 @@ data {
 }
 
 parameters {
-  // Horseshoe parameters
+  // Location parameters
   vector[K] theta;
 }
 
 model {
-  // Horseshoe prior model
+  // Prior model
   theta ~ normal(0, 0.1);
 
   // Observational model
diff --git a/modeling_sparsity/stan_programs/normal_wide.stan b/modeling_sparsity/stan_programs/normal_wide.stan
index f7f2c2f..a7fae0a 100644
--- a/modeling_sparsity/stan_programs/normal_wide.stan
+++ b/modeling_sparsity/stan_programs/normal_wide.stan
@@ -7,12 +7,12 @@ data {
 }
 
 parameters {
-  // Horseshoe parameters
+  // Location parameters
   vector[K] theta;
 }
 
 model {
-  // Horseshoe prior model
+  // Prior model
   theta ~ normal(0, 10);
 
   // Observational model
diff --git a/principled_bayesian_workflow/principled_bayesian_workflow.Rmd b/principled_bayesian_workflow/principled_bayesian_workflow.Rmd
index b549ff4..e770eb2 100644
--- a/principled_bayesian_workflow/principled_bayesian_workflow.Rmd
+++ b/principled_bayesian_workflow/principled_bayesian_workflow.Rmd
@@ -1081,7 +1081,7 @@ Consequently any modification to the phenomena, environment, or experimental
 probe spanned by the model will in general invalidate a Bayesian calibration.
 For example even if the latent phenomena are the same, varying environments and
 experimental probes can lead to very different utilities. What is good enough
-to answer questions in one particular comtext may not be sufficient to answer
+to answer questions in one particular context may not be sufficient to answer
 questions in different contexts!
 
 Because at least some aspect of the phenomena, environment, and probe are unique
@@ -1825,7 +1825,7 @@ can construct a collection of powerful visualizations by projecting to subspaces
 of the observational space that isolating particular consequences of our
 retrodictions that highlight potential limitations. Fortunately we've already
 considered how to isolate the features of the observation space relevant to our
-scientific questions when we were motivating summary statistics we for prior
+scientific questions when we were motivating summary statistics for prior
 predictive checks!
 In other words we can reuse those summary statistics to construct _posterior
 retrodictive checks_ that visually compare the pushforwards of the posterior
 predictive distribution, $\pi_{t(Y) \mid Y}(t \mid \tilde{y})$,
@@ -1953,7 +1953,7 @@ knitr::include_graphics("figures/posterior_checks/posterior_retrodictive_heldout

-This might, for example, but due to an overly flexible model overfitting to
+This might, for example, be due to an overly flexible model overfitting to
 $\tilde{y}_{1}$. At the same time it could also be a consequence of
 $\tilde{y}_{1}$ manifesting misfit less clearly than $\tilde{y}_{2}$, or even
 $\tilde{y}_{2}$ being an unlikely tail event. Identifying which requires
@@ -2369,7 +2369,7 @@ anticipated.
 Importantly the outcomes of this step should be only informal, conceptual
 narratives of the measurement process. All we're trying to do is sit down with
 the domain experts, whether ourselves or our colleagues, and ask
-_"How is are data being generated?"_.
+_"How is our data being generated?"_.
 
 **Requirements:** Domain Expertise
diff --git a/probabilistic_computation/probabilistic_computation.Rmd b/probabilistic_computation/probabilistic_computation.Rmd
index 50e1bb8..1ddb1a4 100644
--- a/probabilistic_computation/probabilistic_computation.Rmd
+++ b/probabilistic_computation/probabilistic_computation.Rmd
@@ -1732,7 +1732,7 @@ richness of the variational family and the structure of the divergence
 function. Quantifying estimator errors in a general application is typically
 infeasible, and we once again have to be weary of fragility. Moreover that
 fragility can be amplified when the variational family is specified by a family of
-probability density functions...in a given parameterization. While the
+probability density functions in a given parameterization. While the
 variational construction is invariant, its implementation might not be.
 
 Variational methods are relatively new to statistics and both the theory and
@@ -2037,7 +2037,7 @@ rapidly disperses. In the best case this will only inflate the estimation error
 but in the worst case it can render $w(q) \, f(q)$ no longer square integrable
 and invalidating the importance sampling estimator entirely!
 
-In low-dimensional problems typical sets are board. Constructing a good
+In low-dimensional problems typical sets are broad. Constructing a good
 auxiliary probability distribution whose typical set strongly overlaps the
 typical set of the target distribution isn't trivial but it is often feasible.
 
diff --git a/probability_theory/probability_theory.Rmd b/probability_theory/probability_theory.Rmd
index 63dc957..f807eaa 100644
--- a/probability_theory/probability_theory.Rmd
+++ b/probability_theory/probability_theory.Rmd
@@ -51,7 +51,7 @@ confuse the reader, but rather a consequence of the fact that we cannot
 explicitly construct abstract probability distributions in any meaningful
 sense. Instead we must utilize problem-specific _representations_ of abstract
 probability distributions which means that concrete examples will have to wait
-until we introduce these representations in Section 3.
+until we introduce these representations in Section 4.
 
 # Setting A Foundation {#sec:foundation}
 
@@ -533,7 +533,7 @@ $$
 >
 \mathbb{P}_{\pi} [ \cup_{n = 1}^{N} \mathfrak{A}_{n} ],
 $$
-even when $\mathfrak{A}_{n} \cap \mathfrak{A}_{m} = 0, n \ne m$. We can also
+even when $\mathfrak{A}_{n} \cap \mathfrak{A}_{m} = \emptyset, n \ne m$. We can also
 combine a finite number of different non-constructible subsets to achieve
 _super-additivity_,
 $$
@@ -569,7 +569,7 @@ $$
 \sum_{n = 1}^{\infty} \mathbb{P}_{\pi} [ A_{n} ],
 $$
 $$
-A_{n} \cap A_{m} = 0, \, n \ne m.
+A_{n} \cap A_{m} = \emptyset, \, n \ne m.
 $$
 
 The more familiar rules of probability theory can all be derived from these
@@ -904,7 +904,7 @@ corresponding interval in the full real line.
![](figures/embeddings/interval/interval.png)

-If our target space is itself the real line then the identify function serves
+If our target space is itself the real line then the identity function serves
 as an appropriate embedding.

@@ -1523,7 +1523,7 @@ To demonstrate a probability density function consider the ubiquitous
 _Gaussian_ probability density functions which allocate probabilities across
 real line, $X = \mathbb{R}$,
 $$
-\pi(x \mid \mu, \sigma) = \frac{1}{\sqrt{2 \pi}}
+\pi(x \mid \mu, \sigma) = \frac{1}{\sqrt{2 \pi} \sigma}
 \exp \left( - \frac{1}{2} \left(\frac{x - \mu}{\sigma} \right)^{2} \right).
 $$
 Each Gaussian probability density function is specified by a location parameter,
@@ -1590,7 +1590,7 @@ plot_norm_probs(mu, sigma, -8, B1_min)
 plot_norm_probs(mu, sigma, B1_max, 8)
 ```
 
-We can compute it using the cumulative probability function,
+We can compute it using the cumulative distribution function,
 ```{r}
 (1 - pnorm(B1_max, mu, sigma)) + pnorm(B1_min, mu, sigma)
 ```
@@ -1653,7 +1653,7 @@ norm_prob(B_union_min, B_union_max, mu, sigma)
 
 ### Computing Expectations
 
-The real line has a unique embedding into the real line -- the identify
+The real line has a unique embedding into the real line -- the identity
 function -- so means and variances are well-defined for the Gaussian family
 of probability density functions. In line with their names, the mean of any
 member is given by the location parameter,
@@ -2105,7 +2105,7 @@ This is consistent with the exact computation,
 ```{r}
 poisson_prob(A1, l)
 ```
-And we an readily visualize how the Monte Carlo estimator converges to the exact
+And we can readily visualize how the Monte Carlo estimator converges to the exact
 value as the size of the sample increases. The bands here in red cover the
 Monte Carlo estimator plus/minus 1, 2, and 3 standard errors to demonstrate the
 variation expected from the Monte Carlo Central Limit Theorem.
@@ -2138,7 +2138,7 @@ plot_mc_evo <- function(iter, mc_stats, truth) {
 plot_mc_evo(iter, mc_stats, poisson_prob(A1, l))
 ```
 
-Now we can apply this machinery to any desired probabilist computation. The
+Now we can apply this machinery to any desired probabilistic computation. The
 probability of the complement of $A_{1}$?
 ```{r}
 pushforward_samples = sapply(stan_samples, function(x) 1 - indicator(x, A1))
diff --git a/stan_intro/stan_intro.Rmd b/stan_intro/stan_intro.Rmd
index 0e22a05..3d7c9ff 100644
--- a/stan_intro/stan_intro.Rmd
+++ b/stan_intro/stan_intro.Rmd
@@ -2590,7 +2590,7 @@ parameters {
 model {
   mu ~ normal(0, 1);
   log_sigma ~ ???;
-  y ~ normal(mu, log_sigma);
+  y ~ normal(mu, exp(log_sigma));
 }
 ```
 
diff --git a/variate_covariate_modeling/variate_covariate_modeling.Rmd b/variate_covariate_modeling/variate_covariate_modeling.Rmd
index 747f56f..909ec7b 100644
--- a/variate_covariate_modeling/variate_covariate_modeling.Rmd
+++ b/variate_covariate_modeling/variate_covariate_modeling.Rmd
@@ -136,7 +136,7 @@ knitr::include_graphics("figures/covariation/covariation.png")
 
 If we can learn this covariation from complete observations then we might be
-able apply it to predicting missing variates.
+able to apply it to predicting missing variates.
 
 Mathematically the covariation between $y$ and $x$ is captured in the
 conditional observational model
@@ -3239,7 +3239,7 @@ plot_pred_res_by_index(x2, reverse_conditional_samples$x2_pred,
 We can see the source of this discrepancy already in the behavior of the
 location function and the predictive distribution that concentrates around the
 location function. The configuration of the conditional variate model for the
-complete observation does not generalize to the incomplete observation becuase
+complete observation does not generalize to the incomplete observation because
 of the heterogeneity induced by the confounding parameters.
 
 ```{r}