---
title: "Bayesian Psychometrics for Diagnostic Assessments: A Proof of Concept"
shorttitle: "Bayesian Modeling for DCMs"
subtitle: "Research Report #19-01"
program: "DLM"
date: "November 2019"
knit: "bookdown::render_book"
site: bookdown::bookdown_site
output: ratlas::techreport_pdf
bibliography: ["bib/refs.bib", "bib/packages.bib"]
biblio-style: apa
biblatexoptions:
- sortcites
csl: csl/apa.csl
link-citations: yes
lot: true
lof: true
subparagraph: yes
mainfont: Palatino LT Std
fontsize: 11pt
acknowledgements: >
  `r if (knitr::is_latex_output()) ratlas::inc("front-matter/preface.Rmd")`
---
```{r setup, include=FALSE}
needed_packages <- c("ratlas", "knitr", "english", "kableExtra", "tidyverse",
"rstan", "loo", "tidybayes",
"here", "glue", "fs")
load_packages <- function(x) {
if (!(x %in% installed.packages())) {
install.packages(x, repos = "https://cran.rstudio.com/")
}
suppressPackageStartupMessages(require(x, character.only = TRUE))
}
vapply(needed_packages, load_packages, logical(1))
extrafont::loadfonts(quiet = TRUE)
set_theme(font = "Palatino")
options(knitr.kable.NA = "")
options(knitr.table.format = "latex")
knitr::opts_chunk$set(cache = TRUE)
if (!dir_exists(here("data", "estimated-models"))) {
dir_create(here("data", "estimated-models"))
}
```
```{r functions, include = FALSE}
logit <- function(x) {
log(x / (1 - x))
}
inv_logit <- function(x) {
exp(x) / (1 + exp(x))
}
trunc_sample <- function(func, n, lb = -Inf, ub = Inf, ...) {
full_sample <- func(n = n, ...)
trunc_sample <- full_sample[between(full_sample, lb, ub)]
while(length(trunc_sample) < n) {
full_sample <- func(n = n, ...)
trunc_sample <- c(trunc_sample, full_sample[between(full_sample, lb, ub)])
}
sample(trunc_sample, size = n, replace = FALSE)
}
rep_data <- function(model, obs) {
draws <- model %>%
spread_draws(nu[c], pi[i,c]) %>%
ungroup()
tidy_draws <- full_join(
draws %>%
select(.chain:.draw, nu, c) %>%
distinct() %>%
spread(key = c, value = nu) %>%
rename(nm_nu = `1`, ms_nu = `2`),
draws %>%
select(.chain:.draw, pi, i, c) %>%
distinct() %>%
spread(key = c, value = pi) %>%
rename(nm_pi = `1`, ms_pi = `2`),
by = c(".chain", ".iteration", ".draw")
)
replicated_data <- tidy_draws %>%
group_nest(.chain, .iteration, .draw, .key = "params") %>%
mutate(data_rep = map(params, function(x, obs) {
master_probs <- obs %>%
left_join(x, by = c("item_id" = "i")) %>%
mutate(log_nm = (score * log(nm_pi)) + ((1 - score) * log(1 - nm_pi)),
log_ms = (score * log(ms_pi)) + ((1 - score) * log(1 - ms_pi))) %>%
group_by(stu_id, nm_nu, ms_nu) %>%
summarize(log_nm = sum(log_nm), log_ms = sum(log_ms)) %>%
ungroup() %>%
mutate(prob_nm = nm_nu * exp(log_nm),
prob_ms = ms_nu * exp(log_ms),
master_prob = prob_ms / (prob_nm + prob_ms)) %>%
select(stu_id, master_prob)
rand_strc_mastery <- distinct(obs, stu_id) %>%
mutate(ms_nu = unique(x$ms_nu),
rand = runif(n(), min = 0, max = 1),
master = rand <= ms_nu) %>%
select(stu_id, master)
ppmc_data <- obs %>%
select(-score) %>%
left_join(master_probs, by = "stu_id") %>%
left_join(select(x, i, ms_nu, nm_pi, ms_pi),
by = c("item_id" = "i")) %>%
# Mastery status for PPMC
## Option 1: Random from structural parameter
left_join(rand_strc_mastery, by = "stu_id") %>%
## Option 2: Determined by individual mastery probability
## mutate(master = master_prob > 0.5) %>%
# Correct score calculation
## Option 1:
mutate(prob_correct = case_when(master ~ ms_pi,
TRUE ~ nm_pi)) %>%
## Option 2: Mixture model (does not currently work with Option 1 above)
## mutate(prob_correct = (master_prob * ms_pi) +
## ((1 - master_prob) * nm_pi)) %>%
mutate(rand = runif(n = nrow(.), min = 0, max = 1),
score = case_when(rand <= prob_correct ~ 1L,
TRUE ~ 0L)) %>%
select(stu_id, master, item_id, score)
pvals <- list(
ppmc_data %>%
group_by(item_id) %>%
summarize(ppmc_pval = mean(score)) %>%
ungroup(),
obs %>%
group_by(item_id) %>%
summarize(obs_pval = mean(score)) %>%
ungroup(),
ppmc_data %>%
mutate(master = case_when(master ~ "ms", TRUE ~ "nm")) %>%
group_by(item_id, master) %>%
summarize(pval = mean(score)) %>%
pivot_wider(names_from = master, values_from = pval) %>%
rename_at(vars(ms, nm), ~paste0(., "_ppmc_pval")) %>%
ungroup(),
obs %>%
left_join(master_probs, by = "stu_id") %>%
mutate(master = case_when(master_prob > 0.5 ~ "ms", TRUE ~ "nm")) %>%
select(-master_prob) %>%
group_by(item_id, master) %>%
summarize(pval = mean(score)) %>%
pivot_wider(names_from = master, values_from = pval) %>%
rename_at(vars(ms, nm), ~paste0(., "_obs_pval")) %>%
ungroup()
) %>%
reduce(full_join, by = "item_id")
ppmc_data <- select(ppmc_data, -master)
ret_df <- tibble(
mastery_probs = list(master_probs),
ppmc_data = list(ppmc_data),
pvals = list(pvals)
)
return(ret_df)
}, obs = obs)) %>%
unnest(cols = c(data_rep))
return(replicated_data)
}
```
```{r ggplot2-extras, include = FALSE}
StatBin2 <- ggproto(
"StatBin2",
StatBin,
compute_group = function (data, scales, binwidth = NULL, bins = NULL,
center = NULL, boundary = NULL,
closed = c("right", "left"), pad = FALSE,
breaks = NULL, origin = NULL, right = NULL,
drop = NULL, width = NULL) {
if (!is.null(breaks)) {
if (!scales$x$is_discrete()) {
breaks <- scales$x$transform(breaks)
}
bins <- ggplot2:::bin_breaks(breaks, closed)
}
else if (!is.null(binwidth)) {
if (is.function(binwidth)) {
binwidth <- binwidth(data$x)
}
bins <- ggplot2:::bin_breaks_width(scales$x$dimension(), binwidth,
center = center, boundary = boundary,
closed = closed)
}
else {
bins <- ggplot2:::bin_breaks_bins(scales$x$dimension(), bins,
center = center, boundary = boundary,
closed = closed)
}
res <- ggplot2:::bin_vector(data$x, bins, weight = data$weight, pad = pad)
# drop 0-count bins completely before returning the dataframe
res <- res[res$x <= max(res[res$count > 0, "x"]) & res$x >= min(res[res$count > 0, "x"]), ]
res
})
```
# Executive Summary {-}
Diagnostic assessments measure the knowledge, skills, and understandings of students at a smaller and more actionable grain size than traditional scale-score assessments. Results of diagnostic assessments are reported as a mastery profile, indicating which knowledge, skills, and understandings the student has mastered and which ones may need more instruction. These mastery decisions are based on probabilities of mastery derived from diagnostic classification models (DCMs).
This report outlines a Bayesian framework for the estimation and evaluation of DCMs. Specifically, this report describes the following:
* a model definition that allows for various parameter equality constraints within a consistent conceptual framework
* the role of prior distributions in the model building process
* an estimation process utilizing the popular *Stan* programming language
* the assessment of estimation diagnostics, such as the $\widehat{R}$ statistic and effective sample size
* the evaluation of model fit using posterior predictive model checks
* model comparison using cross-validation approximations and model averaging
Findings illustrate the utility of the Bayesian framework for estimating and evaluating DCMs in applied settings. Specifically, the findings demonstrate how a variety of DCMs can be defined within the same conceptual framework. Additionally, using this framework, the evaluation of model fit is more straightforward, and results can be interpreted with intuitive graphics. Throughout, recommendations are made for specific implementation decisions for the estimation process and the assessment of model fit.
# Implications for the Field {-}
DCMs offer many benefits over traditional scale-score reporting methods. For example, DCMs can provide more actionable results through a fine-grained mastery profile [@bradshaw_sr; @clark_sr] and more reliable scores with a shorter test length [@tb_reli; @wang_reli]. However, despite a growing field of literature describing the benefits of DCM-based assessments, these models have not seen widespread use in applied or operational settings [@sessoms_2018]. One reason put forward for this gap between the theory and practice of DCMs is a lack of clarity in the applied research community about how these models should be estimated and evaluated [@ravand_2015; @ravand_2019; @rupp_2018]. This report attempts to bridge the gap between theory and practice by describing a Bayesian framework for estimating DCMs using the *Stan* programming language and evaluating model fit using posterior predictive model checks.
This framework, which is used in an applied setting for the Dynamic Learning Maps^®^ (DLM^®^) alternate assessment, provides a flexible method for defining different types of DCMs. Additionally, the model estimation process and model fit measures apply across these variations in model definition. That is, the same estimation and evaluation procedures can be applied to a wide range of DCMs. Thus, this report provides a practical guide for applied researchers seeking to integrate DCMs into their own work.
\newpage
# Purpose of the Report
Diagnostic classification models (DCMs) are able to provide fine-grained and actionable scores for a set of assessed skills or attributes [@rupp_dcm; @bradshaw_dcm]. However, because this class of models is relatively new to operational use, many psychometric properties require further investigation to support the use of the assessments. One key feature that is not well defined in the literature is how best to assess the model fit of DCMs [@chen_2013; @hu_2016; @rupp_dcm]. Most evaluations of model fit rely solely on measures of relative fit [@sen_2017], which are limited in that these indices are unable to evaluate the fit of the model to the data; rather, they can only make judgments relative to alternative comparison models. The other widely used method for evaluating model fit is limited-information fit indices [e.g., @liu_2016]. In general, these methods consist of univariate, bivariate, and trivariate item-level tests that rely on $\chi^2$ statistics known to be asymptotically incorrect [@maydeu_2006]. The $M_2$ statistic developed by @maydeu_2005 can correct for the distributional assumptions, but it is still based only on limited information (i.e., limited sets of items) and therefore may fail to capture higher-order characteristics of the data.
Due to these concerns, this document investigates a Bayesian framework for the estimation of this class of models. This approach allows model fit to be evaluated with alternative methods, namely posterior predictive model checking.
# Defining the Bayesian Model {#model-def}
The general form of DCMs is shown in equation \@ref(eq:dcm), which models the probability of respondent $j$ providing a given response pattern.
\begin{equation}
P(\text{X}_j = \text{x}_j) = \sum_{c=1}^C\nu_c\prod_{i=1}^{I}\pi_{ic}^{x_{ij}}(1 - \pi_{ic})^{1-x_{ij}}
(\#eq:dcm)
\end{equation}
In equation \@ref(eq:dcm), $\pi_{ic}$ is the probability of a respondent in class $c$ providing a correct response to item $i$, and $x_{ij}$ is the observed response (i.e., 0, 1) of respondent $j$ to item $i$. Thus, $\pi_{ic}^{x_{ij}}(1 - \pi_{ic})^{1-x_{ij}}$ represents the probability of a respondent in class $c$ providing the observed response to item $i$. These probabilities are then multiplied across all items, giving the probability of a respondent in class $c$ providing the observed response pattern. Finally, this probability is multiplied by $\nu_c$, which is the base rate probability that any given respondent belongs to class $c$. Thus, this product represents the probability that a given respondent is in class $c$ and provides the observed response pattern.
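To make equation \@ref(eq:dcm) concrete, the short sketch below computes the marginal probability of one hypothetical response pattern. The values of `pi_ic`, `nu`, and `x` are illustrative only and are not drawn from the models estimated later in this report.
```{r dcm-lik-sketch, eval = FALSE}
# Hypothetical values: three items, two classes (non-master, master)
pi_ic <- matrix(c(0.20, 0.85,   # item 1: P(correct | non-master), P(correct | master)
                  0.15, 0.80,   # item 2
                  0.25, 0.90),  # item 3
                ncol = 2, byrow = TRUE)
nu <- c(0.4, 0.6)               # base rate of membership in each class
x <- c(1, 0, 1)                 # observed responses for respondent j

# Probability of the observed pattern within each class, then marginalize over classes
class_lik <- apply(pi_ic, 2, function(p) prod(p^x * (1 - p)^(1 - x)))
sum(nu * class_lik)
```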
Although DCMs can be estimated with multiple attributes that have more than two latent categories [@bradshaw_dcm], for illustrative purposes, this paper limits the discussion to single-attribute DCMs with a binary latent trait. Thus, for each model, there are two potential mastery profiles for each respondent (i.e., master and non-master). Note, however, that the methods presented in this paper do generalize to models with multiple attributes and nonbinary latent categories.
Different types of DCMs are distinguished by how $\pi_{ic}$ is defined. For example, the log-linear cognitive diagnosis model [LCDM; @lcdm] defines $\pi_{ic}$ in a manner similar to a generalized linear model with a logit link function. Specifically, $\pi_{ic}$ is defined as seen in equation \@ref(eq:meas-lcdm), where $\alpha_c$ is a binary indicator of the mastery status for a respondent in class $c$.
\begin{equation}
\pi_{ic} = P(\text{X}_{ic}=1\ |\ \alpha_c) = \frac{\exp(\lambda_{i,0} + \lambda_{i,1,1}\alpha_c)}{1 + \exp(\lambda_{i,0} + \lambda_{i,1,1}\alpha_c)}
(\#eq:meas-lcdm)
\end{equation}
In the notation introduced by @rupp_dcm, the $\lambda$ subscripts follow the order of item, effect, then attribute. That is, the first subscript identifies the item for the parameter (noted as $i$). The second subscript denotes the type of effect. Because this discussion is limited to single-attribute models, there are only two types of effects: zero identifies an intercept, and one identifies a main effect. In models with multiple attributes, there may be additional effects for two-, three-, or *A*-way interactions. Finally, the last element of the subscript identifies the attribute or attributes. Again, as these are single-attribute models, this element is either nonexistent (for intercept terms, where no attribute is involved) or 1 (for all other effects). It is included here only for consistency with the notation in @rupp_dcm.
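As a brief illustration of equation \@ref(eq:meas-lcdm), the sketch below converts a hypothetical intercept and main effect to response probabilities for non-masters and masters using the `inv_logit()` helper defined above; the parameter values are assumed for illustration only.
```{r lcdm-prob-sketch, eval = FALSE}
lambda_i0  <- -1.5   # log-odds of a correct response for non-masters (alpha_c = 0)
lambda_i11 <-  3.0   # increase in log-odds for masters (alpha_c = 1)

inv_logit(lambda_i0)               # P(X = 1 | non-master), approximately 0.18
inv_logit(lambda_i0 + lambda_i11)  # P(X = 1 | master), approximately 0.82
```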
For additional flexibility, equation \@ref(eq:meas-lcdm) can be modified slightly in order to include both attribute- and item-level effects, similar to multilevel models.
\begin{equation}
\pi_{ic} = P(\text{X}_{ic}=1\ |\ \alpha_c) = \frac{\exp[\lambda_{0} + b_{i,0} + (\lambda_{1,1} + b_{i,1,1})\alpha_c]}{1 + \exp[\lambda_{0} + b_{i,0} + (\lambda_{1,1} + b_{i,1,1})\alpha_c]}
(\#eq:dlm-lcdm)
\end{equation}
Equation \@ref(eq:dlm-lcdm) shows the similarity to multilevel models. In this model, $\lambda_0$ and $\lambda_{1,1}$ represent the attribute-level intercept and main effect, respectively. These are akin to the average intercept and main effect for all items (the fixed effects in the multilevel model literature). In addition to the attribute-level parameters, there are also item-level intercepts ($b_{i,0}$) and main effects ($b_{i,1,1}$). These parameters represent the deviation from the attribute-level effect for each item. Thus, the full intercept for item one would be calculated as $\lambda_0 + b_{1,0}$. This is similar to the estimation of random intercepts and slopes for each item [@stroup_glmm]. The difference between the proposed model and multilevel models is the treatment of the variance of these item-level parameters. In multilevel models, the variance of these effects would be estimated. However, the variance of the item-level parameters can also be fixed to pre-specified values.
If the item-level parameters are constrained to be zero, then all items will have parameters equal to the attribute-level parameter (i.e., all of the $b_{i,0}$ and $b_{i,1,1}$ parameters would be zero). This is mathematically equivalent to what is referred to here as the *fungible* model. Alternatively, the item-level parameters can be allowed to vary freely with no constraints (i.e., a *non-fungible* model). Conceptually, these two models can be thought of as using a zero-variance prior (i.e., $\mathcal{N}(0,\ 0)$) or infinite-variance or flat prior (e.g., $\mathcal{N}(0, \infty)$), respectively. Finally, a non-flat prior can be placed on the item-level parameters, such that the parameters are not constrained to be zero but also not allowed to vary completely freely either.
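The sketch below, using assumed attribute-level effects and item-level deviations, shows how equation \@ref(eq:dlm-lcdm) builds item probabilities from the two levels and how fixing the deviations at zero reduces to the fungible model.
```{r item-dev-sketch, eval = FALSE}
lambda_0  <- -1.5                 # attribute-level intercept
lambda_11 <-  3.0                 # attribute-level main effect
b_i0      <- c(-0.4, 0.0,  0.3)   # item-level intercept deviations (three items)
b_i11     <- c( 0.2, 0.0, -0.5)   # item-level main effect deviations

# Item-specific probabilities for non-masters and masters
inv_logit(lambda_0 + b_i0)
inv_logit((lambda_0 + b_i0) + (lambda_11 + b_i11))

# Fungible model: all deviations fixed at zero, so every item shares the
# attribute-level probabilities
inv_logit(lambda_0)
inv_logit(lambda_0 + lambda_11)
```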
## Prior Specification for Attribute-Level Effects {#attr-priors}
In equation \@ref(eq:dlm-lcdm), there are two attribute-level effects that require prior specifications. The first attribute-level effect is $\lambda_{0}$, which represents the average intercept across all items. Thus, this parameter also represents the log-odds (due to the logit link function) of a non-master providing a correct response to an average item. For this parameter, a $\mathcal{N}(\mu = 0,\ \sigma=2)$ distribution was used as the prior. This prior distribution was chosen because 99% of the distribution encompasses the plausible values for this parameter. Specifically, the middle 99% of the distribution consists of the log-odds range -5.15 to 5.15, which covers nearly all of the probability scale when other parameters are equal to zero, as seen in Figure \@ref(fig:log-odds).
```{r log-odds, fig.cap = "Log-odds to probability conversion."}
tibble(x = seq(-5, 5, by = 0.01)) %>%
mutate(y = 1 / (1 + exp(-x))) %>%
ggplot(aes(x = x, y = y)) +
geom_line() +
scale_x_continuous(breaks = seq(-8, 8, by = 2)) +
labs(x = "Log-odds", y = "Probability") -> plot
plot %>%
ggsave2(fig_path(".png")) %>%
ggsave2(fig_path(".pdf"))
include_graphics(fig_path(".pdf"))
```
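The coverage claim above can be checked directly; the quantile calculation below is a small verification sketch rather than part of the estimation code.
```{r intercept-prior-check, eval = FALSE}
# Middle 99% of the N(0, 2) prior for lambda_0 ...
bounds <- qnorm(c(0.005, 0.995), mean = 0, sd = 2)
bounds             # approximately -5.15 and 5.15
inv_logit(bounds)  # ... which covers nearly the full probability scale
```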
The main effect parameters in the LCDM are constrained to be positive, thus ensuring monotonicity in the model [e.g., masters always have a higher probability of providing a correct response; @lcdm]. Thus, the attribute-level main effect, $\lambda_{1,1}$, uses a lognormal prior: $\text{Lognormal}(\mu = 0, \sigma = 1)$. Similar to the attribute-level intercept, this distribution was chosen because 99% of the distribution covers the range of plausible values. Specifically, the lower 99% of this distribution covers the log-odds range of 0 to 10.24. An upper limit of approximately 10 was desired, as a main effect of 10 would allow for an estimated probability of providing a correct response near 1.0 in the extreme case where the intercept was -5 (the lower tail of the attribute-level intercept prior distribution).
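Similarly, the stated upper limit for the attribute-level main effect prior can be verified with a one-line quantile check (again, a sketch rather than estimation code).
```{r maineffect-prior-check, eval = FALSE}
# 99th percentile of the Lognormal(0, 1) prior for lambda_{1,1}
qlnorm(0.99, meanlog = 0, sdlog = 1)   # approximately 10.24
```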
The distributions for these parameters are visualized in Figure \@ref(fig:attr-prior-dist).
```{r attr-prior-dist, fig.cap = "Prior distributions for attribute-level effects."}
bind_rows(
tibble(.variable = "lambda[0]", x = seq(-10, 10, by = 0.01)) %>%
mutate(y = dnorm(x, mean = 0, sd = 2)),
tibble(.variable = "lambda[list(1,1)]", x = seq(0, 10, by = 0.01)) %>%
mutate(y = dlnorm(x, meanlog = 0, sdlog = 1))
) %>%
ggplot(aes(x = x, y = y)) +
facet_wrap(~ .variable, nrow = 1, scales = "free", labeller = label_parsed) +
geom_line() +
labs(x = "Parameter Value", y = "Density") +
theme(strip.text = element_text(size = 20)) -> plot
plot %>%
ggsave2(fig_path(".png")) %>%
ggsave2(fig_path(".pdf"))
include_graphics(fig_path(".pdf"))
```
## Prior Specification for Item-Level Effects {#item-priors}
The prior distributions for the item-level effects, $b_{i,0}$ and $b_{i,1,1}$, are determined by the type of model that is being estimated. For this proof of concept, three models are considered: fungible, non-fungible, and partial equivalency.
In the fungible model, it is assumed that all items measuring the attribute have the same item parameters. That is, the item-level effects are equivalent to the attribute-level effect. Thus, the item-level deviations from the attribute-level effects are all equal to 0. Conceptually, this means using a $\mathcal{N}(\mu=0,\ \sigma=0)$ prior for all $b_{i,0}$ and $b_{i,1,1}$ terms. In practice, to increase computational efficiency, these terms are left out of the model, and only the attribute-level effects are estimated.
In contrast, the non-fungible model assumes that the item parameters are independent of one another. In other words, the parameters for one item do not dictate the parameters of other items. Conceptually, this means that the item-level deviations from the attribute-level effects are unconstrained, and thus an infinite uniform prior, $\mathcal{U}(-\infty,\ +\infty)$, would be used for all $b_{i,0}$ and $b_{i,1,1}$ terms. In practice, it is more efficient to directly estimate individual parameters for each item rather than attribute-level effects with unconstrained item-level deviations. Therefore, this model more closely resembles a true LCDM in equation \@ref(eq:meas-lcdm), with the $\lambda_{i,0}$ and $\lambda_{i,1,1}$ parameters using the prior distributions described for the [attribute-level priors](#attr-priors).
The partial equivalency model represents a compromise between the fungible and non-fungible models. In this model, item-level parameters are not entirely independent but are also not constrained to be equivalent. Instead, the item-level parameters are assumed to come from some distribution of deviations. The smaller the variance of the distribution, the more fungible the items are. Conversely, a large variance would correspond to less fungibility. Conceptually and in practice, this model is similar to multilevel models. The item-level deviations use a hierarchical normal prior, $\mathcal{N}(\mu=0,\ \sigma)$, where $\sigma$ is an estimated parameter in the model. The $\sigma$ parameter uses a half-Student's *t*-distribution with $df = 3$ (Figure \@ref(fig:sigma-prior)). This prior ensures that the variance is always positive and also allows for larger variances than a normal distribution would. However, the variances are also constrained to reasonable values (i.e., less than approximately 5).
```{r sigma-prior, fig.cap = "Prior distribution for hierarchical variance prior."}
ggplot(data = tibble(x = c(0, 6)), aes(x = x)) +
stat_function(fun = dt, n = 500, args = list(df = 3)) +
geom_segment(x = 0, y = 0, xend = 0, yend = dt(0, df = 3)) +
labs(x = "Parameter Value", y = "Density") -> plot
plot %>%
ggsave2(fig_path(".png")) %>%
ggsave2(fig_path(".pdf"))
include_graphics(fig_path(".pdf"))
```
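To make the hierarchical structure of the partial equivalency prior concrete, the sketch below first draws a standard deviation from a half-Student's *t* distribution with three degrees of freedom and then draws item-level deviations from a normal distribution with that standard deviation. The number of items and the seed are arbitrary choices for illustration.
```{r hier-prior-sketch, eval = FALSE}
set.seed(1234)
sigma_b0 <- abs(rt(1, df = 3))                  # half-t(3) draw for the hierarchical SD
b_i0     <- rnorm(10, mean = 0, sd = sigma_b0)  # item-level intercept deviations

# Smaller values of sigma_b0 pull the deviations toward zero (more fungible items);
# larger values allow the items to differ more (less fungible).
sigma_b0
b_i0
```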
## Prior Specification for Class-Level Parameters {#strc-priors}
The last parameter that requires a prior is the structural parameter in equation \@ref(eq:dcm), $\nu_c$. This parameter defines the base rate of inclusion for each class. As such, $\nu$ is constrained so that all elements sum to one (i.e., there are no non-class respondents). Because this discussion is limited to models with a single binary attribute, there are only two classes and therefore two elements of $\nu$. No assumptions are made about the base rate of mastery for attributes; therefore, a uniform Dirichlet prior, $\text{Dir}(1)$, was used for the prior distribution. As there are only two classes, this is equivalent to using a uniform Beta distribution, $\text{Beta}(\alpha=1,\ \beta=1)$, for $\nu_1$ and then calculating $\nu_2$ as $\nu_2 = 1 - \nu_1$.
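Because there are only two classes, the uniform Dirichlet prior can be written in terms of a uniform Beta distribution, as in the short illustrative sketch below.
```{r strc-prior-sketch, eval = FALSE}
# Draws from a Dir(1) prior via the equivalent Beta(1, 1) representation
nu_1 <- rbeta(5, shape1 = 1, shape2 = 1)
cbind(nu_1, nu_2 = 1 - nu_1)   # each row sums to one
```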
# The Bayesian Framework in Practice
In order to demonstrate the utility and benefits of using the Bayesian model definition and estimation process, a single simulated data set was generated. This data set was then used to walk through each step of the Bayesian model fit process, from model estimation to model evaluations and comparisons. All analyses were performed in *R* version `r getRversion()` [@R-base].
## Measures
To demonstrate the Bayesian framework in practice, the Dynamic Learning Maps^®^ (DLM^®^) Alternate Assessment System is used as an example of a diagnostic assessment where this framework is applicable. DLM assessments in English language arts (ELA), mathematics, and science are administered in 19 states to students with the most significant cognitive disabilities. As an example, the through-course assessment model, which features instructionally embedded assessments during the year, is used as a template.
In the instructionally embedded model, students cover the entire testing blueprint during each of two testing windows. The first testing window occurs during the fall, from September through December. The second window is open during the spring from February through May. During each window, students take one or more testlets, each consisting of three to nine items, for each alternate content standard (called an Essential Element [EE]) required for blueprint coverage. To ensure that each EE is accessible to all students, each EE is associated with multiple skills that represent the EE at varying levels of depth, breadth, and complexity (called linkage levels). There are five linkage levels for each EE in ELA and mathematics and three linkage levels for each EE in science. Due to the intended flexibility of the instructionally embedded testing model, students may or may not test on the same EE and linkage level multiple times within a testing window or across testing windows. Thus, the number of responses that can be used to estimate student mastery of a linkage level varies by student. For more details on the assignment of testlets, see Chapter 4 of @dlm_tech_1415_im.
For modeling and scoring the DLM assessments, the linkage level is the unit of analysis. That is, a latent class analysis [LCA; @bartholomew_lca] with two classes is estimated for each linkage level [see Chapter 5 of @dlm_tech_1516_im]. The latent class model currently employed for operational use represents an unconstrained version of the models defined in Section \@ref(model-def) [@lcdm; @rupp_dcm]. Specifically, whereas the main effects of equation \@ref(eq:dlm-lcdm) are constrained to be positive to ensure monotonicity in the model (Section \@ref(attr-priors)), the operational latent class model places no such constraint. When using the unconstrained latent class model, post hoc analysis is therefore needed to ensure the mastery classes are properly defined (i.e., the labels of master and non-master are applied to the correct classes).
Regardless of the choice between the LCA or DCM for estimation, the resulting score is the probability that the student has mastered the linkage level. This probability is often dichotomized into a mastery categorization [@bradshaw_2019]. For example, the DLM assessments use a mastery threshold of 0.8 [see Chapter 5 of @dlm_tech_1516_im]. That is, students with a mastery probability of 0.8 or higher are classified as masters, and students with a mastery probability of less than 0.8 are classified as non-masters. Thus, the scores used for reporting are a profile of mastery classification decisions for each linkage level. For further details on the scoring model for DLM assessments, see Chapter 5 of @dlm_tech_1516_im.
## Simulated Data
```{r example-data, include = FALSE, cache = TRUE}
set.seed(9416)
num_stu <- 1700
att_mastery <- 0.6
rt_pct <- 0.1
# simulate items
items <- tibble(testlet_id = 101:104,
window = rep(c("IE", "SP"), each = 2)) %>%
mutate(num_item = sample(3:5, n(), replace = TRUE)) %>%
uncount(weights = num_item, .id = "testlet_item_id") %>%
rowid_to_column(var = "item_id") %>%
mutate(attr_intercept = runif(1, -2.25, -1.00),
attr_maineffect = runif(1, 1.00, 4.50),
int_mean = case_when(testlet_id == 103L ~ 0.5, TRUE ~ -0.5),
mef_mean = case_when(testlet_id == 103L ~ -0.5, TRUE ~ 0.5),
item_intercept = map_dbl(int_mean, ~rnorm(1, mean = .x, sd = 0.85)),
item_maineffect = map2_dbl(mef_mean, attr_maineffect,
~trunc_sample(rnorm, n = 1, lb = -1 * .y,
mean = .x, sd = 0.85)),
intercept = attr_intercept + item_intercept,
maineffect = attr_maineffect + item_maineffect,
nm_prob = map_dbl(intercept, inv_logit),
ms_prob = map_dbl(intercept + maineffect, inv_logit)) %>%
select(-int_mean, -mef_mean) %>%
write_csv(here("data", "item-parameters.csv"))
all_response <- crossing(fall = c("101", "102", "101,102"),
sp = c("103", "104", "103,104")) %>%
mutate(testlet_id = glue("{fall},{sp}"),
testlet_id = as.character(testlet_id)) %>%
select(-fall, -sp) %>%
mutate(testlets = str_count(testlet_id, ",") + 1,
weight = case_when(testlets == 2 ~ ((1 - rt_pct)^2) * 0.25,
testlets == 3 ~ ((1 - rt_pct) * rt_pct) * 0.5,
testlets == 4 ~ (rt_pct * rt_pct))) %>%
filter(weight > 0) %>%
select(-testlets) %>%
sample_n(size = num_stu, replace = TRUE, weight = weight) %>%
select(-weight) %>%
rowid_to_column(var = "stu_id") %>%
mutate(mastery = sample(c(0L, 1L), n(), replace = TRUE,
prob = c(1 - att_mastery, att_mastery))) %>%
separate_rows(testlet_id, convert = TRUE) %>%
left_join(
items %>%
select(testlet_id, item_id, nm_prob, ms_prob),
by = "testlet_id"
) %>%
mutate(prob_correct = (mastery * ms_prob) + ((1 - mastery) * nm_prob),
rand = runif(n(), 0, 1),
score = case_when(rand <= prob_correct ~ 1L, TRUE ~ 0L)) %>%
write_rds(here("data", "all_response.rds"))
mastery <- distinct(all_response, stu_id, mastery) %>%
write_csv(here("data", "student-parameters.csv"))
# Format data for Stan
response_matrix <- all_response %>%
select(stu_id, item_id, score) %>%
arrange(stu_id, item_id)
ragged_array <- response_matrix %>%
rowid_to_column() %>%
group_by(stu_id) %>%
summarize(start = min(rowid), num = n())
stan_data = list(
I = nrow(items),
J = num_stu,
N = nrow(response_matrix),
ii = response_matrix$item_id,
jj = response_matrix$stu_id,
y = response_matrix$score,
s = ragged_array$start,
l = ragged_array$num
)
```
To illustrate the Bayesian methods for estimating and evaluating diagnostic models, a single data set was generated. Simulated data was chosen for two reasons. First, because the data is simulated, the expected results of the analysis are known. Thus, the results can be compared to the *a priori* expectations to confirm that the methods work as expected. Second, by using simulated data, it is possible to ensure that some models fit the example data and others do not. This means that when examining model fit, there will be examples of fitting and non-fitting models that can be compared. Although this is useful for illustrating the methods, it is important to remember that the data was generated to serve this purpose.
When simulating the example data set, the data was structured similarly to the DLM assessments. In this way, the structure of the simulated data matched what could reasonably be expected from an operational assessment scaled with a DCM. Specifically, items were grouped together into testlets, and testlets were assigned to either the fall or spring testing window. By assigning testlets to the testing windows, it was possible to simulate data with students testing on combinations of testlets consistent with observed data. In other words, the amount and structure of missing data (from testlets not assigned to a student) was comparable across the simulated and observed data. Additionally, following the DLM test design, all items were assumed to follow a simple Q-matrix structure, where all items measure a single attribute [@dlm_tech_1415_im]. Item parameters were simulated according to the partial equivalency model defined in equation \@ref(eq:dlm-lcdm). Thus, the partial equivalency and non-fungible models are expected to show adequate model fit, as these are the true model and a less-constrained model, respectively. Conversely, the fungible model should show poor fit, as the fungible model is more constrained than the partial equivalency model.^[The partial equivalency model was chosen in order to illustrate differences between fitting and non-fitting models and thus should not imply that this model best represents DLM data. See @dlm_tech_1516_im for more information on the operational model used for DLM assessments.]
The attribute-level intercept, $\lambda_0$, was drawn from a $\mathcal{U}(-2.25, -1.00)$ distribution, and the attribute-level main effect, $\lambda_{1,1}$, from a $\mathcal{U}(1.00, 4.50)$. The item-level deviations $b_{i,0}$ and $b_{i,1,1}$ were drawn from a $\mathcal{N}(\mu=0,\ \sigma = 1.0)$ distribution. This resulted in total item intercepts and main effects consistent with those reported for other measures that have been scaled with the LCDM [e.g., @dtmr; @hdcm; @ecpe]. The true parameter values for each testlet and item that were used to simulate the data can be seen in Table \@ref(tab:true-item-param).
```{r true-item-param}
items %>%
select(-testlet_item_id, -nm_prob, -ms_prob) %>%
mutate(window = case_when(window == "IE" ~ "Fall",
window == "SP" ~ "Spring")) %>%
mutate_if(is.double, ~sprintf("%0.2f", .)) %>%
select(window, testlet_id, item_id, everything()) %>%
kable(align = c("c", "c", "c", rep("r", 4), "c", "c"), booktabs = TRUE,
linesep = "", escape = FALSE, caption = "True Item Parameters",
col.names = c("Window", "Testlet", "Item",
"$\\pmb{\\lambda_0}$",
"$\\pmb{\\lambda_{1,1}}$", "$\\pmb{b_{i,0}}$",
"$\\pmb{b_{i,1,1}}$", "$\\pmb{\\lambda_0 + b_{i,0}}$",
"$\\pmb{\\lambda_{1,1} + b_{i,1,1}}$")) %>%
kable_styling(latex_options = "HOLD_position", position = "left") %>%
row_spec(0, bold = TRUE, align = "c") %>%
collapse_rows(columns = 1:2, latex_hline = "custom",
custom_latex_hline = 2, valign = "middle")
```
To mimic the DLM test structure, students were randomly assigned a combination of the simulated testlets. Following the test administration design for the instructionally embedded DLM testing model [for details see Chapter 4 of @dlm_tech_1415_im], students were assigned testlets from both the instructionally embedded and spring pools. During spring assessments, students were randomly assigned only one testlet. For instructionally embedded assessments, students had a `r str_extract(indefinite((1 - rt_pct) * 100), "\\w+")` `r (1 - rt_pct) * 100`% chance of taking only one testlet and `r str_extract(indefinite(rt_pct * 100), "\\w+")` `r rt_pct * 100`% chance of taking both testlets. This is consistent with the reported usage of the instructionally embedded assessment window [@ie_usage]. The resulting probabilities for each possible combination of assigned testlets can be seen in Table \@ref(tab:testlet-prob), along with the total number of students actually simulated to have that combination. In total, `r prettyNum(num_stu, big.mark = ",")` students were simulated, which is consistent with the total number of students that test on a single attribute in a given year from states participating in the instructionally embedded assessment model [see Chapter 7 of @dlm_tech_1415_im].
```{r testlet-prob}
crossing(fall = c("101", "102", "101,102"), sp = c("103", "104", "103,104")) %>%
mutate(testlet_id = glue("{fall},{sp}"),
testlet_id = as.character(testlet_id)) %>%
select(-fall, -sp) %>%
mutate(testlets = str_count(testlet_id, ",") + 1,
weight = case_when(testlets == 2 ~ ((1 - rt_pct)^2) * 0.25,
testlets == 3 ~ ((1 - rt_pct) * rt_pct) * 0.5,
testlets == 4 ~ (rt_pct * rt_pct))) %>%
filter(weight > 0) %>%
arrange(testlets, testlet_id) %>%
select(-testlets) %>%
left_join(
all_response %>%
select(stu_id, testlet_id) %>%
group_by(stu_id) %>%
summarize(testlet_id = paste(sort(unique(testlet_id)), collapse = ",")) %>%
count(testlet_id),
by = "testlet_id"
) %>%
mutate(testlet_id = str_replace_all(testlet_id, ",", ", "),
weight = sprintf("%0.3f", weight),
n = prettyNum(n, big.mark = ",")) %>%
kable(align = c("c", "c", "r"), booktabs = TRUE, linesep = "", escape = FALSE,
col.names = c("Testlet Combination", "Probability", "\\textit{n}"),
caption = "Number of Simulated Students Assigned to Each Testlet Combination") %>%
kable_styling(latex_options = "HOLD_position", position = "left") %>%
row_spec(0, bold = TRUE, align = "c")
```
## Model Estimation {#estimate}
```{r estimate-models, dependson = "example-data", cache = TRUE, include = FALSE}
chains <- 4
iter <- 2000
warmup <- 1000
set.seed(1992)
fung_init <- map(seq_len(chains), function(x, num) {
list(
mean_intercept = runif(1, -2.25, -1.00),
mean_maineffect = runif(1, 1.00, 4.50)
)
})
pteq_init <- map(seq_len(chains), function(x, num) {
list(
mean_intercept = runif(1, -2.25, -1.00),
mean_maineffect = runif(1, 1.00, 4.50),
intercept_dev = runif(num, -0.5, 0.5),
maineffect_dev = runif(num, -0.5, 0.5)
)
}, num = nrow(items))
nfng_init <- map(seq_len(chains), function(x, num) {
list(
intercept = runif(num, -2.25, -1.00),
maineffect = runif(num, 1.00, 4.50)
)
}, num = nrow(items))
if (file_exists(here("data", "estimated-models", "fung.rds"))) {
fung <- read_rds(here("data", "estimated-models", "fung.rds"))
} else {
fung <- stan(here("Stan", "lca_fungible.stan"), data = stan_data,
init = fung_init, chains = chains, iter = iter, warmup = warmup,
cores = chains, refresh = 0, seed = 924,
control = list(adapt_delta = 0.99, max_treedepth = 15))
write_rds(fung, here("data", "estimated-models", "fung.rds"), compress = "gz")
}
if (file_exists(here("data", "estimated-models", "pteq.rds"))) {
pteq <- read_rds(here("data", "estimated-models", "pteq.rds"))
} else {
pteq <- stan(here("Stan", "lca_parteqest.stan"), data = stan_data,
init = pteq_init, chains = chains, iter = iter, warmup = warmup,
cores = chains, refresh = 0, seed = 924,
control = list(adapt_delta = 0.99, max_treedepth = 15))
write_rds(pteq, here("data", "estimated-models", "pteq.rds"), compress = "gz")
}
if (file_exists(here("data", "estimated-models", "nfng.rds"))) {
nfng <- read_rds(here("data", "estimated-models", "nfng.rds"))
} else {
nfng <- stan(here("Stan", "lca_nonfungible.stan"), data = stan_data,
init = nfng_init, chains = chains, iter = iter, warmup = warmup,
cores = chains, refresh = 0, seed = 924,
control = list(adapt_delta = 0.99, max_treedepth = 15))
write_rds(nfng, here("data", "estimated-models", "nfng.rds"), compress = "gz")
}
```
```{r ppmc-samples, dependson = "estimate-models", include = FALSE}
# ppmc
if (file_exists(here("data", "estimated-models", "fung_ppmc.rds"))) {
fung_ppmc <- read_rds(here("data", "estimated-models", "fung_ppmc.rds"))
} else {
fung_ppmc <- rep_data(fung, obs = response_matrix) %>%
write_rds(here("data", "estimated-models", "fung_ppmc.rds"))
}
if (file_exists(here("data", "estimated-models", "pteq_ppmc.rds"))) {
pteq_ppmc <- read_rds(here("data", "estimated-models", "pteq_ppmc.rds"))
} else {
pteq_ppmc <- rep_data(pteq, obs = response_matrix) %>%
write_rds(here("data", "estimated-models", "pteq_ppmc.rds"))
}
if (file_exists(here("data", "estimated-models", "nfng_ppmc.rds"))) {
nfng_ppmc <- read_rds(here("data", "estimated-models", "nfng_ppmc.rds"))
} else {
nfng_ppmc <- rep_data(nfng, obs = response_matrix) %>%
write_rds(here("data", "estimated-models", "nfng_ppmc.rds"))
}
# compare
if (all(file_exists(here("data", "estimated-models",
glue("{c('fung_loo', 'fung_waic')}.rds"))))) {
fung_loo <- read_rds(here("data", "estimated-models", "fung_loo.rds"))
fung_waic <- read_rds(here("data", "estimated-models", "fung_waic.rds"))
} else {
fung_log_lik <- extract_log_lik(fung)
fung_loo <- loo(fung) %>%
write_rds(here("data", "estimated-models", "fung_loo.rds"))
fung_waic <- waic(fung_log_lik) %>%
write_rds(here("data", "estimated-models", "fung_waic.rds"))
}
if (all(file_exists(here("data", "estimated-models",
glue("{c('pteq_loo', 'pteq_waic')}.rds"))))) {
pteq_loo <- read_rds(here("data", "estimated-models", "pteq_loo.rds"))
pteq_waic <- read_rds(here("data", "estimated-models", "pteq_waic.rds"))
} else {
pteq_log_lik <- extract_log_lik(pteq)
pteq_loo <- loo(pteq) %>%
write_rds(here("data", "estimated-models", "pteq_loo.rds"))
pteq_waic <- waic(pteq_log_lik) %>%
write_rds(here("data", "estimated-models", "pteq_waic.rds"))
}
if (all(file_exists(here("data", "estimated-models",
glue("{c('nfng_loo', 'nfng_waic')}.rds"))))) {
nfng_loo <- read_rds(here("data", "estimated-models", "nfng_loo.rds"))
nfng_waic <- read_rds(here("data", "estimated-models", "nfng_waic.rds"))
} else {
nfng_log_lik <- extract_log_lik(nfng)
nfng_loo <- loo(nfng) %>%
write_rds(here("data", "estimated-models", "nfng_loo.rds"))
nfng_waic <- waic(nfng_log_lik) %>%
write_rds(here("data", "estimated-models", "nfng_waic.rds"))
}
```
The models are estimated in *R* version `r getRversion()` [@R-base] using the **rstan** package interface [@R-rstan] to *Stan* [@stan], which utilizes Markov chain Monte Carlo (MCMC) and the Hamiltonian Monte Carlo (HMC) algorithm to efficiently transition between draws of the posterior distribution [@hmc; @hmc_intro]. Specifically, *Stan* utilizes the No-U-Turn sampler [NUTS; @nuts] to dynamically choose a step size and leap trajectory for the HMC algorithm in order to ensure efficient estimation [@hmc_step]. A complete description of HMC with NUTS can be found in @nuts. For a less technical introduction to MCMC and HMC, see @sr_mcmc.
The *Stan* code for all models can be found in the [online repository for this report](https://github.com/atlas-aai/bayes-concept). The models were estimated with `r words(chains)` chains, each with `r prettyNum(iter, big.mark = ",")` iterations. The first `r prettyNum(warmup, big.mark = ",")` iterations of each chain were discarded for warm-up, leaving a total of `r prettyNum((iter - warmup) * chains, big.mark = ",")` retained iterations that made up the posterior distributions. There were also several settings specific to NUTS [@nuts] used by *Stan*. First, the adaptive threshold was set to 0.99 to avoid divergent transitions [@betancourt_diverge]. Second, the maximum tree depth, which determines how far the algorithm can go before making a U-turn [@betancourt_rstan], was set to 15. These are both more conservative than the values suggested by the @stan_user. The implications of these settings are discussed in the following sections, along with diagnostics to assess their impact.
After estimating the model but before the parameters can be analyzed and inferences can be made, the model is checked to ensure the estimation process completed in an appropriate manner. This diagnostic information is critical to MCMC estimation, as without proper estimation, no valid inferences can be made. Checks include evaluating convergence, efficiency of the sampler, and parameter recovery. Each check is described in detail below using the estimated partial equivalency model as an example, as this was the true data-generating model.
### Convergence {#converge}
A check of convergence evaluates whether the MCMC chain successfully found the high density area of the posterior distribution and stayed there. When multiple chains are estimated, this can be checked by verifying that each chain is drawing estimates from the same parameter space. For a single chain, this is checked by verifying that the parameter is sampled from roughly the same area at the beginning of the chain (after warm-up) as it is at the end of the chain. This is commonly assessed through trace plots. An example of a trace plot is shown in Figure \@ref(fig:exm-trace).
(ref:exm-trace-cap) Trace plot for the attribute-level intercept $\lambda_0$.
```{r exm-trace, dependson = "estimate-models", fig.cap = "(ref:exm-trace-cap)"}
gather_draws(pteq, mean_intercept) %>%
ggplot(aes(x = .iteration, y = .value, color = factor(.chain))) +
geom_line() +
labs(x = "Iteration", y = expression(lambda[0]), color = "Chain") -> plot
plot %>%
ggsave2(fig_path(".png")) %>%
ggsave2(fig_path(".pdf"))
include_graphics(fig_path(".pdf"))
```
Figure \@ref(fig:exm-trace) shows the trace plot for the attribute-level intercept, $\lambda_0$, and looks the way a trace plot is expected to look. The draws appear to be coming from a stable distribution (i.e., the plot is relatively horizontal with no large upward or downward swings), and all `r words(chains)` chains are mixing well (as evidenced by the overlap of the `r words(chains)` colors). However, visual inspection alone offers no empirical criterion for how poor a trace plot must be to conclude that convergence was not met. Additionally, when there are many parameters, it is impractical to look at each individual trace plot.
To address these shortcomings of evaluating trace plots directly, the $\widehat{R}$ statistic can be used to evaluate convergence [@rhat; @new_rhat]. The $\widehat{R}$ statistic is also known as the potential scale reduction [@bda3] and is a measure of how much variance there is between chains relative to the amount of variation within chains. @rhat_cut suggest that in order to conclude that the model has successfully converged, all $\widehat{R}$ values should be less than 1.1. These results can be summarized, as in Figure \@ref(fig:rhat), to demonstrate the $\widehat{R}$ values for the estimated parameters. In the estimated partial equivalency model, all values are below 1.1, indicating that the model converged.
(ref:rhat-cap) $\widehat{R}$ values for the estimated parameters in the partial equivalency model. Dotted line represents the suggested cutoff by @rhat_cut.
(ref:rhat-scap) $\widehat{R}$ values for the estimated parameters in the partial equivalency model.
```{r rhat, fig.cap = "(ref:rhat-cap)", fig.scap = "(ref:rhat-scap)"}
parms <- c("mean_intercept", "mean_maineffect", "intercept_dev",
"maineffect_dev", "intercept_sd", "maineffect_sd", "nu")
labls <- c(expression(lambda[0]), expression(lambda[list(1,1)]),
expression(italic(b)[list(i,0)]), expression(italic(b)[list(i,1,1)]),
expression(sigma[italic(b)[list(i,0)]]),
expression(sigma[italic(b)[list(i,1,1)]]), expression(nu[c]))
sims <- as.array(pteq)
apply(sims, MARGIN = 3, FUN = Rhat) %>%
enframe(name = "parameter", value = "Rhat") %>%
mutate(parameter = str_replace_all(parameter, "\\[.*\\]", "")) %>%
filter(parameter %in% c("nu", "mean_intercept", "mean_maineffect",
"intercept_dev", "maineffect_dev", "intercept_sd",
"maineffect_sd")) %>%
mutate(parameter = factor(parameter, levels = parms)) %>%
ggplot(aes(x = parameter, y = Rhat, color = parameter)) +
geom_jitter(size = 3, height = 0, width = 0.2, show.legend = FALSE) +
geom_hline(yintercept = 1.1, linetype = "dashed") +
scale_x_discrete(labels = labls) +
labs(x = NULL, y = expression(widehat(R))) -> plot
plot %>%
ggsave2(fig_path(".png")) %>%
ggsave2(fig_path(".pdf"))
include_graphics(fig_path(".pdf"))
```
### Efficiency
A second check of the MCMC estimation is the efficiency of the sampler, which verifies that the algorithm adequately sampled the full posterior distribution. There are several ways this can be examined. The first is by examining the effective sample size. This diagnostic takes into account the autocorrelation (or anticorrelation) within chains to determine the effective number of independent draws from the posterior. If the chain is slow moving, the draws will be highly autocorrelated, and the effective sample size will be well below the total number of retained iterations [@auto_corr]. Conversely, if the chain is moving quickly, it is possible for the draws to be better than independent, or anticorrelated [@anti_corr]. In this scenario, the effective sample size is actually larger than the total number of retained iterations.
There are two types of effective sample size that can be used to evaluate the efficiency of the location and scale of the posterior distributions. The sampling efficiency of the location (e.g., mean or median) can be assessed with the bulk effective sample size. Similarly, the scale can be assessed through tail effective sample size. This can be useful for diagnosing problems with mixing due to posterior samples having different scales across chains [@new_rhat]. For both measures, the @stan_best_practice recommend an effective sample size greater than or equal to the number of chains multiplied by 100.
The effective sample size for all parameters in the model can be summarized, as in Figure \@ref(fig:eff-size). Because the model was estimated with `r words(chains)` chains, the effective sample size should be above `r prettyNum(chains * 100, big.mark = ",")`. Figure \@ref(fig:eff-size) shows that all parameters in the estimated partial equivalency model have both a bulk and tail effective sample size above this threshold.
(ref:eff-size-cap) Effective sample size for each estimated parameter. Dotted line represents the suggested cutoff by @stan_best_practice. ESS = effective sample size.
(ref:eff-size-scap) Effective sample size for each estimated parameter.
```{r eff-size, fig.cap = "(ref:eff-size-cap)", fig.scap = "(ref:eff-size-scap)"}
bulk_ess <- apply(sims, MARGIN = 3, FUN = ess_bulk)
tail_ess <- apply(sims, MARGIN = 3, FUN = ess_tail)
ess <- list(
summary(pteq)$summary %>%
as_tibble(rownames = "parameter") %>%
select(parameter, n_eff),
enframe(apply(sims, MARGIN = 3, FUN = ess_bulk), "parameter", "bulk_ess"),
enframe(apply(sims, MARGIN = 3, FUN = ess_tail), "parameter", "tail_ess")
)
reduce(ess, full_join, by = "parameter") %>%
mutate(parameter = str_replace_all(parameter, "\\[.*\\]", "")) %>%
filter(parameter %in% c("nu", "mean_intercept", "mean_maineffect",
"intercept_dev", "maineffect_dev", "intercept_sd",
"maineffect_sd")) %>%
mutate(parameter = factor(parameter, levels = parms)) %>%
gather(key = "measure", value = "value", -parameter) %>%
mutate(measure = case_when(measure == "n_eff" ~ "ESS",
measure == "bulk_ess" ~ "Bulk ESS",
measure == "tail_ess" ~ "Tail ESS")) %>%
filter(measure != "ESS") %>%
ggplot(aes(x = parameter, y = value, color = parameter)) +
facet_wrap(~ measure, nrow = 1) +
geom_jitter(size = 3, height = 0, width = 0.2, show.legend = FALSE) +
geom_hline(yintercept = chains * 100, linetype = "dashed") +
expand_limits(y = c(0, (iter - warmup) * chains)) +
scale_x_discrete(labels = labls) +
scale_y_continuous(labels = scales::comma_format()) +
labs(x = NULL, y = "Effective Sample Size") -> plot
plot %>%
ggsave2(fig_path(".png")) %>%
ggsave2(fig_path(".pdf"))
include_graphics(fig_path(".pdf"))
```
There are also measures of efficiency that are exclusive to NUTS [@nuts]. For example, the Bayesian fraction of missing information gives an estimate of how well the sampler adapted and explored the posterior distribution. The Bayesian fraction of missing information generally ranges from zero to one, with zero and one representing poor and excellent estimation, respectively. It is calculated for each chain overall, rather than for each individual parameter [@bfmi].
The Bayesian fraction of missing information values for this example are shown in Table \@ref(tab:efficiency) and indicate that the sampler was able to adequately explore the posterior distributions. Additionally, Table \@ref(tab:efficiency) shows the mean acceptance rate for each chain. As expected, these values are very close to the 0.99 adaptive threshold that was specified during the [model estimation](#estimate). As mentioned previously, a target acceptance rate this high is needed to prevent divergent transitions.
The concern with setting the target acceptance rate this high is that for parameters with wider posteriors, the sampler will not be able to move fast enough. When using NUTS, at each iteration the sampler looks for a place to "U-turn" in a series of possible branches. If the sampler is terminating before the maximum possible tree depth (which was specified to be 15), then the algorithm is able to find good values for the next iteration of the chain despite the small steps enforced by the high target acceptance rate. Bumping up against the maximum allowed tree depth, or going beyond it, indicates that the step size is too small [@stan_user; @stan_warn]. Because the maximum tree depth values in Table \@ref(tab:efficiency) are all below the maximum specified, and the Bayesian fraction of missing information values are all close to one, there is strong evidence that in this model, the sampler was able to adequately sample the posteriors.
```{r efficiency}
sampler_params <- get_sampler_params(pteq, inc_warmup = FALSE)
upars <- suppressMessages(stan(here("Stan", "lca_parteqest.stan"),
data = stan_data, chains = 0)) %>%
get_num_upars()
E <- as.matrix(sapply(sampler_params, FUN = function(x) x[, "energy__"]))
EBFMI <- upars / apply(E, 2, var)
mean_accept <- sapply(sampler_params, function(x) mean(x[, "accept_stat__"]))
max_treedepth <- sapply(sampler_params, function(x) max(x[, "treedepth__"]))
tibble(chain = glue("{seq_len(chains)}"),
bfmi = EBFMI,
mean_accept = mean_accept,
max_treedepth = as.integer(max_treedepth)) %>%
mutate_if(is.double, ~ sprintf("%0.3f", .)) %>%
kable(align = "c", booktabs = TRUE, linesep = "",
caption = "Diagnostic Statistics for the No-U-Turn Sampler",
col.names = c("Chain", "BFMI", "Mean Acceptance Rate",
"Max Tree Depth")) %>%
kable_styling(latex_options = "HOLD_position", full_width = TRUE) %>%
row_spec(0, bold = TRUE, align = "c") %>%
  footnote(general = "BFMI = Bayesian fraction of missing information.",
footnote_as_chunk = TRUE)
```
### Parameter Recovery
In addition to having diagnostics to ensure that the model is estimated properly, it is also important to establish that the model as defined in the *Stan* code is able to accurately recover the true parameter values. Otherwise, a model may estimate well but be misspecified, leading to incorrect parameter estimates. Figure \@ref(fig:item-recover) shows the true (from Table \@ref(tab:true-item-param)) versus estimated item parameter values, indicating successful parameter recovery for the partial equivalency model with the simulated data.
```{r item-recover, fig.cap = "Parameter recovery from the example partial equivalency model with simulated data."}
recovery_sum <- summary(pteq)$summary %>%
as_tibble(rownames = "parameter") %>%
filter(str_detect(parameter, "(intercept|maineffect)\\[")) %>%
select(parameter, est = mean) %>%
separate(parameter, into = c("parameter", "item_id", NA), convert = TRUE) %>%
left_join(
items %>%
select(item_id, intercept, maineffect) %>%
gather(key = "parameter", value = "true", -item_id),
by = c("parameter", "item_id")
) %>%
mutate(
parameter = factor(parameter, levels = c("intercept", "maineffect"),
labels = c("lambda[0] + italic(b)[list(i,0)]",
"lambda[1,1] + italic(b)[list(i,1,1)]"))
)
facet_limits <- recovery_sum %>%
group_by(parameter) %>%
summarize(min = min(est, true), max = max(est, true)) %>%
pivot_longer(cols = c(min, max), names_to = "type", values_to = "est") %>%
mutate(true = est)
ggplot(recovery_sum, aes(x = true, y = est)) +
facet_wrap(~ parameter, nrow = 1, scales = "free",
labeller = label_parsed) +
geom_point(size = 3) +
geom_abline(intercept = 0, slope = 1, linetype = "dashed") +
geom_blank(data = facet_limits) +
labs(x = "True Value", y = "Estimated Value") -> plot
plot %>%
ggsave2(fig_path(".png")) %>%
ggsave2(fig_path(".pdf"))
include_graphics(fig_path(".pdf"))
```
```{r class-recover-calc}
# Threshold on the posterior median probability of mastery for classification
thresh <- 0.5
class_recovery <- mastery %>%
rename(true = mastery) %>%
left_join(
map_dfr(pteq_ppmc$mastery_probs, function(x) return(x), .id = ".draw") %>%
group_by(stu_id) %>%
summarize(prob = median(master_prob)) %>%
mutate(est = case_when(prob >= thresh ~ 1L, TRUE ~ 0L)) %>%
select(stu_id, est),
by = "stu_id"
) %>%
count(true, est)
correct_rate <- class_recovery %>%
mutate(correct = case_when(true == est ~ n, TRUE ~ 0L)) %>%
summarize(correct = sum(correct) / sum(n)) %>%
pull(correct)
```
It is also possible to examine the accuracy of the respondent classifications as a master or non-master. For this analysis, respondents were classified as masters if the median of the posterior distribution for the probability of mastery was greater than or equal to `r sprintf("%0.1f", thresh)`. This threshold places respondents in their most likely class; however, any threshold can be used in practice to facilitate stakeholder understanding of scores [@bradshaw_2019]. Respondent classification results for the partial equivalency model are summarized in Table \@ref(tab:class-recover). In total, `r sprintf("%0.0f", correct_rate * 100)`% of the simulated students were correctly classified as masters or non-masters. This is not surprising, given that the data were simulated from the partial equivalency model and all of the items are fairly discriminating (Table \@ref(tab:true-item-param)).
```{r class-recover}
class_recovery %>%
mutate(n = prettyNum(n, big.mark = ",")) %>%
kable(align = c("c", "c", "r"), booktabs = TRUE, linesep = "", escape = FALSE,
caption = "Respondent Classification Accuracy",
col.names = c("True Mastery", "Estimated Mastery", "\\textit{n}")) %>%
kable_styling(latex_options = "HOLD_position", position = "left") %>%
row_spec(0, bold = TRUE, align = "c")
```
## Evaluating Model Fit
```{r clear-mem, include = FALSE}
rm(fung, pteq, nfng, stan_data); gc()
```
Model fit can be assessed in both an absolute and a relative sense. Absolute fit evaluates whether the estimated model adequately reflects the observed data and is a prerequisite for the evaluation of relative fit. Relative fit compares two or more models that all show adequate absolute model fit [@chen_2013; @sen_2017]. In this document, methods are presented for assessing absolute and relative fit through posterior predictive model checking and information criteria, respectively. To demonstrate how these methods work in practice, both the fungible and non-fungible models were estimated on the same data used to estimate the partial equivalency model in the [previous section](#estimate). Thus, there are a total of three models for which to calculate posterior predictive checks and compare relative fit. As described previously, the partial equivalency model was the true data-generating model.
### Absolute Fit {#abs-fit}
Posterior predictive model checks are used to assess the absolute fit of a specific model to the observed data. Posterior predictive checks involve simulating replications of the data using draws from the posterior distributions and then comparing the replicated data sets back to the observed data [@bda3]. As explained in the [model estimation section](#estimate), a total of `r prettyNum((iter - warmup) * chains, big.mark = ",")` iterations were retained from the MCMC estimation. Thus, `r prettyNum((iter - warmup) * chains, big.mark = ",")` replicated data sets can be simulated, one for each iteration, using the values of the parameters at that iteration. The process for simulating a replicated data set for a single iteration is as follows:
1. Randomly assign the first respondent to the master or non-master class, with probability equal to the current value of the respondent's probability of attribute mastery.
2. For the first item the respondent takes, simulate a response using the current values of the item parameters and the mastery status that was simulated in step 1.
3. Repeat step 2 for all items the respondent tested on.
4. Repeat steps 1--3 for all respondents.
This process is repeated for each iteration in the chain. Because the replicated data sets are simulated from the current values of the parameters, these data sets represent what the data would be expected to look like *if the specified model were true*. Therefore, summaries of these data sets can then be used to look for systematic differences in the characteristics of the observed data and the replicated data sets, often through visualizations [@gelman_hill].
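As a concrete illustration of these steps, the sketch below simulates one replicated data set from a single posterior draw. The objects `draw_mastery` (one mastery probability per respondent) and `draw_items` (one intercept and main effect per item) are hypothetical stand-ins for the parameter values at a given iteration, so the chunk is not evaluated here; the replicated data sets summarized in the remainder of this section were generated from the retained draws of each estimated model.
```{r ppmc-rep-sketch, eval = FALSE}
# Minimal sketch: simulate one replicated data set from a single posterior
# draw. `draw_mastery` (stu_id, master_prob) and `draw_items` (item_id,
# intercept, maineffect) are hypothetical objects holding the parameter
# values at one iteration of the chain.
simulate_rep <- function(draw_mastery, draw_items, response_matrix) {
  response_matrix %>%
    select(stu_id, item_id) %>%
    left_join(draw_mastery, by = "stu_id") %>%
    left_join(draw_items, by = "item_id") %>%
    # Step 1: assign each respondent to the master (1) or non-master (0)
    # class with probability equal to their current probability of mastery
    group_by(stu_id) %>%
    mutate(class = rbinom(1, size = 1, prob = unique(master_prob))) %>%
    ungroup() %>%
    # Steps 2-4: simulate a response to every administered item using the
    # current item parameters and the simulated mastery status
    mutate(prob = plogis(intercept + maineffect * class),
           score = rbinom(n(), size = 1, prob = prob)) %>%
    select(stu_id, item_id, score)
}
```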
#### Model-Level Fit
```{r score-dist-calc, cache = TRUE, include = FALSE}
# Count the number of respondents at each raw score point in each replicated data set
calc_score_dist <- function(x) {
map_dfr(x$ppmc_data, function(x) {
x %>%
group_by(stu_id) %>%
summarize(raw_score = sum(score)) %>%
count(raw_score)
}, .id = ".draw")
}
score_dist <- list(fung_ppmc, pteq_ppmc, nfng_ppmc) %>%
set_names("Fungible", "Partial Equivalency", "Non-fungible") %>%
map(calc_score_dist) %>%
bind_rows(.id = "model") %>%
complete(model, .draw, raw_score, fill = list(n = 0)) %>%
mutate(model = factor(model, levels = c("Fungible", "Partial Equivalency",
"Non-fungible")))
score_summary <- score_dist %>%
group_by(model, raw_score) %>%
summarize(mean = mean(n),
median = median(n),
lb = quantile(n, probs = 0.025),
ub = quantile(n, probs = 0.975)) %>%
left_join(
response_matrix %>%
group_by(stu_id) %>%
summarize(raw_score = sum(score)) %>%
count(raw_score),
by = "raw_score"
) %>%
rename(obs = n) %>%
replace_na(list(obs = 0))
fung_text <- score_summary %>%
filter(model == "Fungible") %>%
mutate(under = obs > ub,
over = obs < lb) %>%
filter(over | under)
```
At the model level, posterior predictive checks can be calculated for the raw score distribution. This is accomplished by counting the number of respondents at each raw score point in each of the `r prettyNum((iter - warmup) * chains, big.mark = ",")` replicated data sets. This yields, for each raw score point, a distribution of the number of respondents that would be expected if the model were true, which can then be compared to the observed data. Figure \@ref(fig:score-dist) shows these expected distributions along with the number of observed students at each raw score in the simulated data.
```{r score-dist, fig.cap = "Posterior predictive model check for the raw score distribution."}
ggplot() +
facet_wrap(~ model, ncol = 1) +
geom_jitter(data = group_by(score_dist, model, raw_score) %>% sample_n(500),
aes(x = raw_score, y = n),
alpha = 0.2, height = 0, width = 0.3) +
geom_line(data = score_summary,
aes(x = raw_score, y = lb, color = "95% Credible Interval"),
linetype = "dashed", show.legend = FALSE) +
geom_line(data = score_summary,
aes(x = raw_score, y = ub, color = "95% Credible Interval"),
linetype = "dashed", show.legend = FALSE) +
geom_line(data = score_summary,
aes(x = raw_score, y = obs, color = "Observed"),
linetype = "solid") +
geom_point(data = score_summary,
aes(x = raw_score, y = obs, color = "Observed"),
size = 4, show.legend = FALSE) +
scale_x_continuous(breaks = seq(0, 20, 1)) +
scale_y_continuous(labels = scales::comma_format()) +
labs(x = "Raw Score", y = "Students", color = NULL) -> plot
plot %>%
ggsave2(fig_path(".png"), height = 8) %>%
ggsave2(fig_path(".pdf"), height = 8)
include_graphics(fig_path(".pdf"))
```
Figure \@ref(fig:score-dist) shows a bimodal distribution, which is a result of the mixture of raw score distributions for masters and non-masters. Additionally, very few students are expected to have a high raw score because relatively few students test on more than two testlets (Table \@ref(tab:testlet-prob)). Finally, the expected distribution for the fungible model shows some deviations from the observed scores. Specifically, the fungible model overestimates the number of students with a raw score of `r pull(filter(fung_text, over), raw_score) %>% english() %>% combine_words()` and underestimates the number of students with a raw score of `r pull(filter(fung_text, under), raw_score) %>% english() %>% combine_words()`.
Similar to the examination of trace plots [above](#converge) (Figure \@ref(fig:exm-trace)), this visualization alone is insufficient for determining if the amount of misfit in the distribution is significant. Rather, @beguin_2001 suggest a $\chi^2$ discrepancy measure can be calculated according to equation \@ref(eq:chisq).
\begin{equation}
\chi_{obs}^2=\sum_{s=0}^S\frac{[n_s-E(n_s)]^2}{E(n_s)}
(\#eq:chisq)
\end{equation}
In equation \@ref(eq:chisq), $s$ indexes the score points, $n_s$ is the observed number of respondents at score point $s$, and $E(n_s)$ is the expected number of respondents at score point $s$, calculated as the average count across all of the replicated data sets. As with the $\chi^2$ tests used to assess model fit when models are estimated with the expectation-maximization algorithm, the $\chi_{obs}^2$ statistic does not follow a theoretical $\chi^2$ distribution. With posterior predictive model checks, however, no distributional assumptions are required, because the reference distribution can be generated directly from the replicated data sets, similar to a parametric bootstrap. Using the same definition of $E(n_s)$ as above, a $\chi_{rep}^2$ value can be computed for each of the replicated data sets. The `r prettyNum((iter - warmup) * chains, big.mark = ",")` $\chi_{rep}^2$ values then make up the reference distribution against which $\chi_{obs}^2$ is compared. A posterior predictive *p*-value (*ppp*) can then be calculated as shown in equation \@ref(eq:ppp).
\begin{equation}
ppp=P(\chi_{rep}^2\geq\chi_{obs}^2\ |\ n_s)
(\#eq:ppp)
\end{equation}
Equation \@ref(eq:ppp) says that the posterior predictive *p*-value is the proportion of replicated data sets whose $\chi_{rep}^2$ value is greater than or equal to the $\chi_{obs}^2$ value from the observed data. Posterior predictive *p*-values close to zero indicate poor model fit (a cutoff of .05 could be used, for example), whereas values very close to one may indicate overfitting. The $\chi_{obs}^2$ values, summaries of the $\chi_{rep}^2$ distributions, and posterior predictive *p*-values for the fungible, partial equivalency, and non-fungible models are shown in Table \@ref(tab:chisq-stats) and visualized in Figure \@ref(fig:chisq-dist). As expected, given the distributions in Figure \@ref(fig:score-dist), the fungible model shows poor model fit, with a posterior predictive *p*-value of less than .05. In contrast, both the partial equivalency and non-fungible models show acceptable fit to the simulated data.
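In code, this amounts to a single comparison: with a vector of replicated discrepancy values and the observed value (the hypothetical objects `chisq_rep` and `chisq_obs` below), the posterior predictive *p*-value is the proportion of replicated values at or above the observed value. The same calculation is carried out for each model in the following chunk.
```{r ppp-sketch, eval = FALSE}
# Posterior predictive p-value: proportion of replicated chi-square values
# greater than or equal to the observed chi-square value. `chisq_rep` and
# `chisq_obs` are hypothetical stand-ins for one model's values.
ppp <- mean(chisq_rep >= chisq_obs)
```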
```{r chisq-calc, include = FALSE}
# Chi-square discrepancy for each replicated data set
ppmc_chisq <- score_dist %>%
left_join(select(score_summary, model, raw_score, exp = mean),
by = c("model", "raw_score")) %>%
mutate(exp = na_if(exp, 0)) %>%
replace_na(list(exp = 0.000001)) %>%
mutate(piece = ((n - exp) ^ 2) / exp) %>%
group_by(model, .draw) %>%
summarize(chisq = sum(piece))
# Chi-square discrepancy for the observed data and posterior predictive p-values
obs_chisq <- response_matrix %>%
group_by(stu_id) %>%
summarize(raw_score = sum(score)) %>%
count(raw_score) %>%
full_join(select(score_summary, model, raw_score, exp = mean),
by = "raw_score") %>%
mutate(exp = na_if(exp, 0)) %>%
replace_na(list(exp = 0.000001, n = 0)) %>%
mutate(piece = ((n - exp) ^ 2) / exp) %>%
group_by(model) %>%
summarize(obs_chisq = sum(piece)) %>%
left_join(ppmc_chisq, by = "model") %>%
group_by(model) %>%
summarize(ppp = sprintf("%0.3f", mean(chisq >= obs_chisq)),
obs_chisq = unique(obs_chisq),
rep_mean = mean(chisq),
rep_5 = quantile(chisq, probs = 0.05),
rep_95 = quantile(chisq, probs = 0.95)) %>%
mutate(sign = case_when(ppp == "1.000" ~ ">",
ppp == "0.000" ~ "<",
TRUE ~ "="),
ppp = case_when(ppp == "1.000" ~ "0.999",
ppp == "0.000" ~ "0.001",
TRUE ~ ppp),
lab = paste0(expression(italic(ppp)), '~"', sign, '"~', ppp))
```
(ref:chisq-stats-cap) $\chi_{obs}^2$ Values and Summaries of $\chi_{rep}^2$ Distributions
(ref:chisq-stat-foot) *ppp* = posterior predictive *p*-value.
```{r chisq-stats}
obs_chisq %>%
select(model, obs_chisq, rep_mean, rep_5, rep_95, ppp) %>%
mutate_if(is.double, ~ sprintf("%0.2f", .)) %>%
kable(align = c("l", "r", rep("c", 4)), booktabs = TRUE, linesep = "",
escape = FALSE, caption = "(ref:chisq-stats-cap)",
col.names = c("Model", "$\\pmb{\\chi_{obs}^2}$",
"$\\pmb{\\chi_{rep}^2}$ Mean",
"$\\pmb{\\chi_{rep}^2}$ 5\\%",
"$\\pmb{\\chi_{rep}^2}$ 95\\%",
"\\textit{ppp}")) %>%
kable_styling(latex_options = "HOLD_position", position = "left") %>%