diff --git a/markov_chain_monte_carlo/markov_chain_monte_carlo.Rmd b/markov_chain_monte_carlo/markov_chain_monte_carlo.Rmd
index d78ea32..84bb14d 100644
--- a/markov_chain_monte_carlo/markov_chain_monte_carlo.Rmd
+++ b/markov_chain_monte_carlo/markov_chain_monte_carlo.Rmd
@@ -442,7 +442,7 @@ points(c(q0[1], q1[1], q2[1]), c(q0[2], q1[2], q2[2]), col=c_light, pch=16, cex=
 _Iterating_ Markov transitions,
 $$
 \begin{align*}
-\tilde{q}_{1} &\sim T(q_{2} \mid q_{0})
+\tilde{q}_{1} &\sim T(q_{1} \mid q_{0})
 \\
 \tilde{q}_{2} &\sim T(q_{2} \mid \tilde{q}_{1})
 \\
@@ -2859,7 +2859,7 @@ $$
 \pi(q) =
 \text{normal}(\varpi_{1}(q) ; 1, 1)
 \cdot
-\text{normal}(\varpi_{1}(q) ; -1, 1).
+\text{normal}(\varpi_{2}(q) ; -1, 1).
 $$
 One advantage of this example is that the component means and variances are
 given immediately by the locations and scales, which allows us to compare Markov
diff --git a/modeling_sparsity/modeling_sparsity.Rmd b/modeling_sparsity/modeling_sparsity.Rmd
index 431dd63..13ac2cb 100644
--- a/modeling_sparsity/modeling_sparsity.Rmd
+++ b/modeling_sparsity/modeling_sparsity.Rmd
@@ -391,7 +391,7 @@ hist(samples$tau, breaks=seq(0, 50, 0.5), main="",
      col=c_dark, border=c_dark_highlight, add=T)
 ```
 
-This balance, however, is still too much large for the parameters with small
+This balance, however, is still too large for the parameters with small
 true values and a bit too small for the parameters with large true values.
 Because the balance favors the larger scale the over-regularization isn't too
 bad.
@@ -1310,7 +1310,7 @@ for (k in 1:9) {
 }
 ```
 
-Unfortunately the inferred scale is too large small enough to narrow the
+Unfortunately the inferred scale is too large to narrow the
 marginal posterior distributions of the small parameters below $\sigma = 0.5$.
 
 ```{r}
diff --git a/modeling_sparsity/stan_programs/cauchy_narrow.stan b/modeling_sparsity/stan_programs/cauchy_narrow.stan
index ad9d5da..817985e 100644
--- a/modeling_sparsity/stan_programs/cauchy_narrow.stan
+++ b/modeling_sparsity/stan_programs/cauchy_narrow.stan
@@ -7,12 +7,12 @@ data {
 }
 
 parameters {
-  // Horseshoe parameters
+  // Cauchy parameters
   vector[K] theta;
 }
 
 model {
-  // Horseshoe prior model
+  // Cauchy prior model
   theta ~ cauchy(0, 0.1);
 
   // Observational model
diff --git a/modeling_sparsity/stan_programs/cauchy_wide.stan b/modeling_sparsity/stan_programs/cauchy_wide.stan
index d297ebe..027eb35 100644
--- a/modeling_sparsity/stan_programs/cauchy_wide.stan
+++ b/modeling_sparsity/stan_programs/cauchy_wide.stan
@@ -7,12 +7,12 @@ data {
 }
 
 parameters {
-  // Horseshoe parameters
+  // Cauchy parameters
   vector[K] theta;
 }
 
 model {
-  // Horseshoe prior model
+  // Cauchy prior model
   theta ~ cauchy(0, 10);
 
   // Observational model
diff --git a/modeling_sparsity/stan_programs/hier_cauchy_cp.stan b/modeling_sparsity/stan_programs/hier_cauchy_cp.stan
index 732bc2f..0540f5e 100644
--- a/modeling_sparsity/stan_programs/hier_cauchy_cp.stan
+++ b/modeling_sparsity/stan_programs/hier_cauchy_cp.stan
@@ -7,13 +7,13 @@ data {
 }
 
 parameters {
-  // Horseshoe parameters
+  // Cauchy parameters
   vector[K] theta;
   real tau;
 }
 
 model {
-  // Horseshoe prior model
+  // Cauchy prior model
   theta ~ cauchy(0, tau);
   tau ~ normal(0, 10);
 
diff --git a/modeling_sparsity/stan_programs/hier_laplace_cp.stan b/modeling_sparsity/stan_programs/hier_laplace_cp.stan
index b2fe0b5..19172e8 100644
--- a/modeling_sparsity/stan_programs/hier_laplace_cp.stan
+++ b/modeling_sparsity/stan_programs/hier_laplace_cp.stan
@@ -7,13 +7,13 @@ data {
 }
 
 parameters {
-  // Horseshoe parameters
+  // Laplace parameters
   vector[K] theta;
   real tau;
 }
 
 model {
-  // Horseshoe prior model
+  // Laplace prior model
   theta ~ double_exponential(0, tau);
   tau ~ normal(0, 10);
 
diff --git a/modeling_sparsity/stan_programs/laplace_narrow.stan b/modeling_sparsity/stan_programs/laplace_narrow.stan
index d6ab939..686034e 100644
--- a/modeling_sparsity/stan_programs/laplace_narrow.stan
+++ b/modeling_sparsity/stan_programs/laplace_narrow.stan
@@ -7,12 +7,12 @@ data {
 }
 
 parameters {
-  // Horseshoe parameters
+  // Laplace parameters
   vector[K] theta;
 }
 
 model {
-  // Horseshoe prior model
+  // Laplace prior model
   theta ~ double_exponential(0, 0.1);
 
   // Observational model
diff --git a/modeling_sparsity/stan_programs/laplace_wide.stan b/modeling_sparsity/stan_programs/laplace_wide.stan
index 07a6ecf..5967c8a 100644
--- a/modeling_sparsity/stan_programs/laplace_wide.stan
+++ b/modeling_sparsity/stan_programs/laplace_wide.stan
@@ -7,12 +7,12 @@ data {
 }
 
 parameters {
-  // Horseshoe parameters
+  // Laplace parameters
   vector[K] theta;
 }
 
 model {
-  // Horseshoe prior model
+  // Laplace prior model
   theta ~ double_exponential(0, 10);
 
   // Observational model
diff --git a/modeling_sparsity/stan_programs/normal_narrow.stan b/modeling_sparsity/stan_programs/normal_narrow.stan
index bd27f90..dc0226a 100644
--- a/modeling_sparsity/stan_programs/normal_narrow.stan
+++ b/modeling_sparsity/stan_programs/normal_narrow.stan
@@ -7,12 +7,12 @@ data {
 }
 
 parameters {
-  // Horseshoe parameters
+  // Location parameters
   vector[K] theta;
 }
 
 model {
-  // Horseshoe prior model
+  // Prior model
   theta ~ normal(0, 0.1);
 
   // Observational model
diff --git a/modeling_sparsity/stan_programs/normal_wide.stan b/modeling_sparsity/stan_programs/normal_wide.stan
index f7f2c2f..a7fae0a 100644
--- a/modeling_sparsity/stan_programs/normal_wide.stan
+++ b/modeling_sparsity/stan_programs/normal_wide.stan
@@ -7,12 +7,12 @@ data {
 }
 
 parameters {
-  // Horseshoe parameters
+  // Location parameters
   vector[K] theta;
 }
 
 model {
-  // Horseshoe prior model
+  // Prior model
   theta ~ normal(0, 10);
 
   // Observational model
diff --git a/principled_bayesian_workflow/principled_bayesian_workflow.Rmd b/principled_bayesian_workflow/principled_bayesian_workflow.Rmd
index b549ff4..e770eb2 100644
--- a/principled_bayesian_workflow/principled_bayesian_workflow.Rmd
+++ b/principled_bayesian_workflow/principled_bayesian_workflow.Rmd
@@ -1081,7 +1081,7 @@ Consequently any modification to the phenomena, environment, or experimental
 probe spanned by the model will in general invalidate a Bayesian calibration.
 For example even if the latent phenomena are the same, varying environments and
 experimental probes can lead to very different utilities. What is good enough
-to answer questions in one particular comtext may not be sufficient to answer
+to answer questions in one particular context may not be sufficient to answer
 questions in different contexts!
 
 Because at least some aspect of the phenomena, environment, and probe are unique
@@ -1825,7 +1825,7 @@ can construct a collection of powerful visualizations by projecting to subspaces
 of the observational space that isolating particular consequences of our
 retrodictions that highlight potential limitations. Fortunately we've already
 considered how to isolate the features of the observation space relevant to our
-scientific questions when we were motivating summary statistics we for prior
+scientific questions when we were motivating summary statistics for prior
 predictive checks!
 In other words we can reuse those summary statistics to construct _posterior
 retrodictive checks_ that visually compare the pushforwards of the posterior
 predictive distribution, $\pi_{t(Y) \mid Y}(t \mid \tilde{y})$,
@@ -1953,7 +1953,7 @@ knitr::include_graphics("figures/posterior_checks/posterior_retrodictive_heldout

-This might, for example, but due to an overly flexible model overfitting to
+This might, for example, be due to an overly flexible model overfitting to
 $\tilde{y}_{1}$. At the same time it could also be a consequence of
 $\tilde{y}_{1}$ manifesting misfit less clearly than $\tilde{y}_{2}$, or even
 $\tilde{y}_{2}$ being an unlikely tail event. Identifying which requires
@@ -2369,7 +2369,7 @@ anticipated.
 Importantly the outcomes of this step should be only informal, conceptual
 narratives of the measurement process. All we're trying to do is sit down with
 the domain experts, whether ourselves or our colleagues, and ask
-_"How is are data being generated?"_.
+_"How is our data being generated?"_.
 
 **Requirements:** Domain Expertise
diff --git a/probabilistic_computation/probabilistic_computation.Rmd b/probabilistic_computation/probabilistic_computation.Rmd
index 50e1bb8..1ddb1a4 100644
--- a/probabilistic_computation/probabilistic_computation.Rmd
+++ b/probabilistic_computation/probabilistic_computation.Rmd
@@ -1732,7 +1732,7 @@ richness of the variational family and the structure of the divergence
 function. Quantifying estimator errors in a general application is typically
 infeasible, and we once again have to be weary of fragility. Moreover that
 fragility can be amplified when the variational family is specified by a family of
-probability density functions...in a given parameterization. While the
+probability density functions in a given parameterization. While the
 variational construction is invariant, its implementation might not be.
 
 Variational methods are relatively new to statistics and both the theory and
@@ -2037,7 +2037,7 @@ rapidly disperses. In the best case this will only inflate the estimation error
 but in the worst case it can render $w(q) \, f(q)$ no longer square integrable
 and invalidating the importance sampling estimator entirely!
 
-In low-dimensional problems typical sets are board. Constructing a good
+In low-dimensional problems typical sets are broad. Constructing a good
 auxiliary probability distribution whose typical set strongly overlaps the
 typical set of the target distribution isn't trivial but it is often feasible.
 
diff --git a/probability_theory/probability_theory.Rmd b/probability_theory/probability_theory.Rmd
index 63dc957..f807eaa 100644
--- a/probability_theory/probability_theory.Rmd
+++ b/probability_theory/probability_theory.Rmd
@@ -51,7 +51,7 @@ confuse the reader, but rather a consequence of the fact that we cannot
 explicitly construct abstract probability distributions in any meaningful
 sense. Instead we must utilize problem-specific _representations_ of abstract
 probability distributions which means that concrete examples will have to wait
-until we introduce these representations in Section 3.
+until we introduce these representations in Section 4.
 
 # Setting A Foundation {#sec:foundation}
 
@@ -533,7 +533,7 @@ $$
 >
 \mathbb{P}_{\pi} [ \cup_{n = 1}^{N} \mathfrak{A}_{n} ],
 $$
-even when $\mathfrak{A}_{n} \cap \mathfrak{A}_{m} = 0, n \ne m$. We can also
+even when $\mathfrak{A}_{n} \cap \mathfrak{A}_{m} = \emptyset, n \ne m$. We can also
 combine a finite number of different non-constructible subsets to achieve
 _super-additivity_,
 $$
@@ -569,7 +569,7 @@ $$
 \sum_{n = 1}^{\infty} \mathbb{P}_{\pi} [ A_{n} ],
 $$
 $$
-A_{n} \cap A_{m} = 0, \, n \ne m.
+A_{n} \cap A_{m} = \emptyset, \, n \ne m.
 $$
 
 The more familiar rules of probability theory can all be derived from these
@@ -904,7 +904,7 @@ corresponding interval in the full real line.
![](figures/embeddings/interval/interval.png)

-If our target space is itself the real line then the identify function serves
+If our target space is itself the real line then the identity function serves
 as an appropriate embedding.

@@ -1523,7 +1523,7 @@ To demonstrate a probability density function consider the ubiquitous
 _Gaussian_ probability density functions which allocate probabilities across
 real line, $X = \mathbb{R}$,
 $$
-\pi(x \mid \mu, \sigma) = \frac{1}{\sqrt{2 \pi}}
+\pi(x \mid \mu, \sigma) = \frac{1}{\sqrt{2 \pi} \sigma}
 \exp \left( - \frac{1}{2} \left(\frac{x - \mu}{\sigma} \right)^{2} \right).
 $$
 Each Gaussian probability density function is specified by a location parameter,
@@ -1590,7 +1590,7 @@ plot_norm_probs(mu, sigma, -8, B1_min)
 plot_norm_probs(mu, sigma, B1_max, 8)
 ```
 
-We can compute it using the cumulative probability function,
+We can compute it using the cumulative distribution function,
 ```{r}
 (1 - pnorm(B1_max, mu, sigma)) + pnorm(B1_min, mu, sigma)
 ```
@@ -1653,7 +1653,7 @@ norm_prob(B_union_min, B_union_max, mu, sigma)
 
 ### Computing Expectations
 
-The real line has a unique embedding into the real line -- the identify
+The real line has a unique embedding into the real line -- the identity
 function -- so means and variances are well-defined for the Gaussian family
 of probability density functions. In line with their names, the mean of any
 member is given by the location parameter,
@@ -2105,7 +2105,7 @@ This is consistent with the exact computation,
 ```{r}
 poisson_prob(A1, l)
 ```
-And we an readily visualize how the Monte Carlo estimator converges to the exact
+And we can readily visualize how the Monte Carlo estimator converges to the exact
 value as the size of the sample increases. The bands here in red cover the
 Monte Carlo estimator plus/minus 1, 2, and 3 standard errors to demonstrate the
 variation expected from the Monte Carlo Central Limit Theorem.
@@ -2138,7 +2138,7 @@ plot_mc_evo <- function(iter, mc_stats, truth) {
 plot_mc_evo(iter, mc_stats, poisson_prob(A1, l))
 ```
 
-Now we can apply this machinery to any desired probabilist computation. The
+Now we can apply this machinery to any desired probabilistic computation. The
 probability of the complement of $A_{1}$?
 ```{r}
 pushforward_samples = sapply(stan_samples, function(x) 1 - indicator(x, A1))
diff --git a/stan_intro/stan_intro.Rmd b/stan_intro/stan_intro.Rmd
index 0e22a05..3d7c9ff 100644
--- a/stan_intro/stan_intro.Rmd
+++ b/stan_intro/stan_intro.Rmd
@@ -2590,7 +2590,7 @@ parameters {
 model {
   mu ~ normal(0, 1);
   log_sigma ~ ???;
-  y ~ normal(mu, log_sigma);
+  y ~ normal(mu, exp(log_sigma));
 }
 ```
 
diff --git a/variate_covariate_modeling/variate_covariate_modeling.Rmd b/variate_covariate_modeling/variate_covariate_modeling.Rmd
index 747f56f..909ec7b 100644
--- a/variate_covariate_modeling/variate_covariate_modeling.Rmd
+++ b/variate_covariate_modeling/variate_covariate_modeling.Rmd
@@ -136,7 +136,7 @@ knitr::include_graphics("figures/covariation/covariation.png")
 
 If we can learn this covariation from complete observations then we might be
-able apply it to predicting missing variates.
+able to apply it to predicting missing variates.
 
 Mathematically the covariation between $y$ and $x$ is captured in the
 conditional observational model
@@ -3239,7 +3239,7 @@ plot_pred_res_by_index(x2, reverse_conditional_samples$x2_pred,
 We can see the source of this discrepancy already in the behavior of the
 location function and the predictive distribution that concentrates around the
 location function. The configuration of the conditional variate model for the
-complete observation does not generalize to the incomplete observation becuase
+complete observation does not generalize to the incomplete observation because
 of the heterogeneity induced by the confounding parameters.
 
 ```{r}